Obsah přiložených dokumentů ve formátech ODF (OpenOffice), OpenXML (MS Office) a PDF se extrahuje do čistého textu a indexuje pro fulltextové vyhledávání.

closes #211
2015-02-23 22:21:25 +01:00
parent 860c7227cd
commit 1d2810f78d
23 changed files with 1776 additions and 45 deletions
@@ -349,6 +349,19 @@
 			<version>2.4</version>
 		</dependency>

+		<!-- Text extractors -->
+		<dependency>
+			<groupId>org.apache.odftoolkit</groupId>
+			<artifactId>simple-odf</artifactId>
+			<version>0.7-incubating</version>
+		</dependency>
+
+		<dependency>
+			<groupId>org.apache.poi</groupId>
+			<artifactId>poi-ooxml</artifactId>
+			<version>3.11</version>
+		</dependency>
+
 		<!-- Test -->
 		<dependency>
 			<groupId>junit</groupId>
@@ -11,8 +11,8 @@ import info.bukova.isspst.data.User;
 import info.bukova.isspst.reporting.Report;
 import info.bukova.isspst.reporting.ReportMapping;
 import info.bukova.isspst.reporting.ReportType;
-import info.bukova.isspst.services.FullTextService;
 import info.bukova.isspst.services.dbinfo.DbInfoService;
+import info.bukova.isspst.services.fulltext.FullTextService;
 import info.bukova.isspst.services.munits.MUnitService;
 import info.bukova.isspst.services.numberseries.NumberSeriesService;
 import info.bukova.isspst.services.requirement.RequirementTypeService;
@@ -20,19 +20,17 @@ import info.bukova.isspst.services.settings.GlobalSettingsService;
 import info.bukova.isspst.services.users.PermissionService;
 import info.bukova.isspst.services.users.RoleService;
 import info.bukova.isspst.services.users.UserService;
-
-import java.math.BigDecimal;
-import java.util.List;
-
-import javax.servlet.ServletContextEvent;
-import javax.servlet.ServletContextListener;
-
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.springframework.security.core.userdetails.UsernameNotFoundException;
 import org.springframework.web.context.WebApplicationContext;
 import org.springframework.web.context.support.WebApplicationContextUtils;

+import javax.servlet.ServletContextEvent;
+import javax.servlet.ServletContextListener;
+import java.math.BigDecimal;
+import java.util.List;
+
 public class AppInitListener implements ServletContextListener {


@@ -10,7 +10,7 @@ import info.bukova.isspst.data.TripBill;
 import info.bukova.isspst.data.TripRequirement;
 import info.bukova.isspst.reporting.Report;
 import info.bukova.isspst.reporting.ReportMapping;
-import info.bukova.isspst.services.FullTextService;
+import info.bukova.isspst.services.fulltext.FullTextService;
 import info.bukova.isspst.services.addressbook.AdbService;
 import info.bukova.isspst.services.buildings.BuildingService;
 import info.bukova.isspst.services.invoicing.InvoicingService;
@@ -0,0 +1,97 @@
+package info.bukova.isspst.data;
+
+import org.hibernate.annotations.Type;
+import org.hibernate.search.annotations.Analyze;
+import org.hibernate.search.annotations.Field;
+import org.hibernate.search.annotations.Index;
+import org.hibernate.search.annotations.Indexed;
+
+import javax.persistence.Column;
+import javax.persistence.Entity;
+import javax.persistence.GeneratedValue;
+import javax.persistence.Id;
+import javax.persistence.Table;
+
+/**
+ * Persistent holder of a stored file's extracted plain text together with
+ * its content type and its path in the filesystem.
+ *
+ * Mapped to the FILE_CONTENTS table; the plain text is declared as an
+ * analyzed Hibernate Search field.
+ *
+ * @author Pepa Rokos
+ */
+@Entity
+@Table(name = "FILE_CONTENTS")
+@Indexed
+public class FileContent {
+
+	@Id
+	@Column(name = "ID")
+	@GeneratedValue
+	private int id;
+
+	@Column(name = "CONTENT")
+	@Type(type = "text")
+	@Field(index = Index.YES, analyze = Analyze.YES)
+	private String plainText;
+
+	@Column(name = "CONTENT_TYPE")
+	private String contentType;
+
+	@Column(name = "PATH_IN_FILESYSTEM")
+	private String pathInFilesystem;
+
+	/** Null-safe string equality used by {@link #equals(Object)}. */
+	private static boolean sameText(String left, String right) {
+		return left == null ? right == null : left.equals(right);
+	}
+
+	/** Hash of a possibly-null string; null contributes 0. */
+	private static int textHash(String value) {
+		return value == null ? 0 : value.hashCode();
+	}
+
+	public int getId() {
+		return id;
+	}
+
+	public void setId(int id) {
+		this.id = id;
+	}
+
+	public String getPlainText() {
+		return plainText;
+	}
+
+	public void setPlainText(String content) {
+		this.plainText = content;
+	}
+
+	public String getContentType() {
+		return contentType;
+	}
+
+	public void setContentType(String contentType) {
+		this.contentType = contentType;
+	}
+
+	public String getPathInFilesystem() {
+		return pathInFilesystem;
+	}
+
+	public void setPathInFilesystem(String pathInFilesystem) {
+		this.pathInFilesystem = pathInFilesystem;
+	}
+
+	/**
+	 * Two instances are equal when id, plain text, content type and
+	 * filesystem path all match.
+	 */
+	@Override
+	public boolean equals(Object o) {
+		if (this == o) {
+			return true;
+		}
+		if (!(o instanceof FileContent)) {
+			return false;
+		}
+
+		FileContent other = (FileContent) o;
+
+		return id == other.id
+				&& sameText(plainText, other.plainText)
+				&& sameText(contentType, other.contentType)
+				&& sameText(pathInFilesystem, other.pathInFilesystem);
+	}
+
+	/** Combines the same four fields that {@link #equals(Object)} compares. */
+	@Override
+	public int hashCode() {
+		int hash = id;
+		hash = 31 * hash + textHash(plainText);
+		hash = 31 * hash + textHash(contentType);
+		hash = 31 * hash + textHash(pathInFilesystem);
+		return hash;
+	}
+}
@@ -1,13 +1,17 @@
 package info.bukova.isspst.data;

-import org.hibernate.annotations.Type;
 import org.hibernate.search.annotations.Analyze;
 import org.hibernate.search.annotations.Field;
 import org.hibernate.search.annotations.Index;
 import org.hibernate.search.annotations.Indexed;
+import org.hibernate.search.annotations.IndexedEmbedded;

+import javax.persistence.CascadeType;
 import javax.persistence.Column;
 import javax.persistence.Entity;
+import javax.persistence.FetchType;
+import javax.persistence.JoinColumn;
+import javax.persistence.ManyToOne;
 import javax.persistence.Table;

@Entity
@@ -16,23 +20,32 @@ import javax.persistence.Table;
 public class FileMetainfo extends BaseData {

 	@Column(name = "FILE_NAME")
+	@Field(index = Index.YES, analyze = Analyze.YES)
 	private String fileName;
-	@Column(name = "PATH_IN_FILESYSTEM")
-	private String pathInFilesystem;
+
 	@Column(name = "MODULE_ID")
 	private String moduleId;
+
 	@Column(name = "RECORD_ID")
 	private int recordId;
-	@Column(name = "CONTENT")
-	@Type(type = "text")
-	@Field(index = Index.YES, analyze = Analyze.YES)
-	private String content;
+
+	@ManyToOne(fetch = FetchType.EAGER, cascade = CascadeType.ALL)
+	@JoinColumn(name = "CONTENT_ID")
+	@IndexedEmbedded
+	private FileContent content;
+
 	@Column(name = "MD5")
 	private String md5;
+
 	@Column(name = "DESCRIPTION")
+	@Field(index = Index.YES, analyze = Analyze.YES)
 	private String description;
-	@Column(name = "CONTENT_TYPE")
-	private String contentType;
+
+	/**
+	 * Creates the embedded {@link FileContent} if none is set yet; does
+	 * nothing when one is already present.
+	 */
+	private void ensureContentExists() {
+		if (content != null) {
+			return;
+		}
+		content = new FileContent();
+	}

 	public String getFileName() {
 		return fileName;
@@ -43,11 +56,15 @@ public class FileMetainfo extends BaseData {
 	}

 	public String getPathInFilesystem() {
-		return pathInFilesystem;
+		if (content != null) {
+			return content.getPathInFilesystem();
+		}
+		return null;
 	}

 	public void setPathInFilesystem(String pathInFilesystem) {
-		this.pathInFilesystem = pathInFilesystem;
+		ensureContentExists();
+		content.setPathInFilesystem(pathInFilesystem);
 	}

 	public String getModuleId() {
@@ -66,11 +83,11 @@ public class FileMetainfo extends BaseData {
 		this.recordId = recordId;
 	}

-	public String getContent() {
+	public FileContent getContent() {
 		return content;
 	}

-	public void setContent(String content) {
+	public void setContent(FileContent content) {
 		this.content = content;
 	}

@@ -91,11 +108,16 @@ public class FileMetainfo extends BaseData {
 	}

 	public String getContentType() {
-		return contentType;
+		if (content != null) {
+			return content.getContentType();
+		}
+
+		return null;
 	}

 	public void setContentType(String contentType) {
-		this.contentType = contentType;
+		ensureContentExists();
+		content.setContentType(contentType);
 	}

 	@Override
@@ -105,16 +127,12 @@ public class FileMetainfo extends BaseData {

 		FileMetainfo that = (FileMetainfo) o;

-		if (getId() != 0 && getId() != that.getId()) return false;
 		if (recordId != that.recordId) return false;
 		if (content != null ? !content.equals(that.content) : that.content != null) return false;
-		if (contentType != null ? !contentType.equals(that.contentType) : that.contentType != null) return false;
 		if (description != null ? !description.equals(that.description) : that.description != null) return false;
 		if (fileName != null ? !fileName.equals(that.fileName) : that.fileName != null) return false;
 		if (md5 != null ? !md5.equals(that.md5) : that.md5 != null) return false;
 		if (moduleId != null ? !moduleId.equals(that.moduleId) : that.moduleId != null) return false;
-		if (pathInFilesystem != null ? !pathInFilesystem.equals(that.pathInFilesystem) : that.pathInFilesystem != null)
-			return false;

 		return true;
 	}
@@ -122,13 +140,11 @@ public class FileMetainfo extends BaseData {
 	@Override
 	public int hashCode() {
 		int result = fileName != null ? fileName.hashCode() : 0;
-		result = 31 * result + (pathInFilesystem != null ? pathInFilesystem.hashCode() : 0);
 		result = 31 * result + (moduleId != null ? moduleId.hashCode() : 0);
 		result = 31 * result + recordId;
 		result = 31 * result + (content != null ? content.hashCode() : 0);
 		result = 31 * result + (md5 != null ? md5.hashCode() : 0);
 		result = 31 * result + (description != null ? description.hashCode() : 0);
-		result = 31 * result + (contentType != null ? contentType.hashCode() : 0);
 		return result;
 	}
 }
@@ -0,0 +1,14 @@
+package info.bukova.isspst.services.fulltext;
+
+import java.io.ByteArrayInputStream;
+
+/**
+ * Base class for extractors: adapts byte-array input to the stream-based
+ * extraction method that subclasses implement.
+ *
+ * @author Pepa Rokos
+ */
+public abstract class AbstractExtractor implements Extractor {
+
+	/**
+	 * Wraps the given bytes in an in-memory stream and delegates to
+	 * {@link #extract(java.io.InputStream)}.
+	 *
+	 * @param data source data
+	 * @return plain text produced by the stream-based extraction
+	 */
+	@Override
+	public String extract(byte[] data) {
+		ByteArrayInputStream source = new ByteArrayInputStream(data);
+		return extract(source);
+	}
+
+}
@@ -0,0 +1,25 @@
+package info.bukova.isspst.services.fulltext;
+
+import org.apache.poi.POIXMLTextExtractor;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Common extraction logic for the POI-based (OOXML) extractors. Subclasses
+ * only supply the concrete POI text extractor for their document type.
+ *
+ * @author Pepa Rokos
+ */
+public abstract class AbstractOfficeExtractor extends AbstractExtractor {
+
+	/**
+	 * Builds the POI extractor for the stream and returns its text.
+	 *
+	 * @param is source stream of the document
+	 * @return extracted plain text
+	 * @throws ExtractorException wrapping any IOException raised while reading
+	 */
+	@Override
+	public String extract(InputStream is) throws ExtractorException {
+		try {
+			return createExtractor(is).getText();
+		} catch (IOException ioFailure) {
+			throw new ExtractorException(ioFailure);
+		}
+	}
+
+	/**
+	 * Creates the POI text extractor matching the concrete document format.
+	 *
+	 * @param is source stream of the document
+	 * @return POI text extractor over the parsed document
+	 * @throws IOException when the document cannot be read
+	 */
+	protected abstract POIXMLTextExtractor createExtractor(InputStream is) throws IOException;
+
+}
@@ -0,0 +1,19 @@
+package info.bukova.isspst.services.fulltext;
+
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Text extractor for OOXML spreadsheets, backed by POI's XSSF extractor.
+ *
+ * @author Pepa Rokos
+ */
+public class ExcelExtractor extends AbstractOfficeExtractor implements Extractor {
+
+	/** Parses the stream as an XSSF workbook and wraps it in a text extractor. */
+	@Override
+	protected POIXMLTextExtractor createExtractor(InputStream is) throws IOException {
+		XSSFWorkbook workbook = new XSSFWorkbook(is);
+		return new XSSFExcelExtractor(workbook);
+	}
+}
@@ -0,0 +1,30 @@
+package info.bukova.isspst.services.fulltext;
+
+import java.io.InputStream;
+
+/**
+ * Contract of a plain-text extractor for Office and PDF formats.
+ *
+ * @author Pepa Rokos
+ */
+public interface Extractor {
+
+	/**
+	 * Extracts text from the given byte array.
+	 *
+	 * @param data source data
+	 * @return plain text
+	 * @throws ExtractorException when extraction fails
+	 */
+	String extract(byte[] data) throws ExtractorException;
+
+	/**
+	 * Extracts text from the given input stream.
+	 *
+	 * @param is source input stream
+	 * @return plain text
+	 * @throws ExtractorException when extraction fails
+	 */
+	String extract(InputStream is) throws ExtractorException;
+
+}
@@ -0,0 +1,16 @@
+package info.bukova.isspst.services.fulltext;
+
+import info.bukova.isspst.services.IsspstException;
+
+/**
+ * Exception raised when text extraction fails.
+ *
+ * @author Pepa Rokos
+ */
+public class ExtractorException extends IsspstException {
+
+	/**
+	 * Wraps the underlying failure under a fixed message prefix.
+	 *
+	 * @param cause the original error raised during extraction
+	 */
+	public ExtractorException(Throwable cause) {
+		super("Extractor exception: ", cause);
+	}
+
+}
@@ -0,0 +1,42 @@
+package info.bukova.isspst.services.fulltext;
+
+/**
+ * Factory selecting the concrete {@link Extractor} for a content type.
+ *
+ * @author Pepa Rokos
+ */
+public class ExtractorFactory {
+
+	/**
+	 * Creates an extractor for the given content (MIME) type.
+	 *
+	 * @param contentType MIME type of the document; may be null
+	 * @return matching extractor, or null when the type is null or unsupported
+	 */
+	public static Extractor createExtractor(String contentType) {
+		// Metadata without content reports a null type; treat it as unsupported instead of failing.
+		if (contentType == null) {
+			return null;
+		}
+
+		if (contentType.equals("application/vnd.oasis.opendocument.text")
+				|| contentType.equals("application/vnd.oasis.opendocument.spreadsheet")
+				|| contentType.equals("application/vnd.oasis.opendocument.presentation")) {
+			return new OdfExtractor();
+		}
+
+		if (contentType.equals("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {
+			return new WordExtractor();
+		}
+
+		if (contentType.equals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) {
+			return new ExcelExtractor();
+		}
+
+		// ".presentation" is the regular .pptx type; ".slideshow" (.ppsx) was the only one matched before.
+		if (contentType.equals("application/vnd.openxmlformats-officedocument.presentationml.presentation")
+				|| contentType.equals("application/vnd.openxmlformats-officedocument.presentationml.slideshow")) {
+			return new PowerPointExtractor();
+		}
+
+		if (contentType.equals("application/pdf")) {
+			return new PdfExtractor();
+		}
+
+		return null;
+	}
+
+}
@@ -1,10 +1,10 @@
-package info.bukova.isspst.services;
-
-import java.util.List;
+package info.bukova.isspst.services.fulltext;

 import org.hibernate.search.annotations.Field;
 import org.hibernate.search.annotations.Indexed;

+import java.util.List;
+
 /**
 * @author Pepa Rokos
 * 
@@ -1,9 +1,10 @@
-package info.bukova.isspst.services;
+package info.bukova.isspst.services.fulltext;

 import info.bukova.isspst.ModuleUtils;
 import info.bukova.isspst.dao.QueryDao;
 import info.bukova.isspst.data.BaseData;
 import info.bukova.isspst.data.User;
+import info.bukova.isspst.services.ModuleNotActiveException;
 import info.bukova.isspst.sort.ReflectionTools;
 import org.apache.lucene.search.Query;
 import org.hibernate.Hibernate;
@@ -0,0 +1,24 @@
+package info.bukova.isspst.services.fulltext;
+
+import org.odftoolkit.simple.Document;
+import org.odftoolkit.simple.common.TextExtractor;
+
+import java.io.InputStream;
+
+/**
+ * Text extractor for ODF documents, backed by the ODF Toolkit Simple API.
+ *
+ * @author Pepa Rokos
+ */
+public class OdfExtractor extends AbstractExtractor implements Extractor {
+
+	/**
+	 * Loads the ODF document from the stream and returns the text of its content root.
+	 *
+	 * @param is source stream of the ODF document
+	 * @return extracted plain text
+	 * @throws ExtractorException wrapping any failure raised while loading or extracting
+	 */
+	@Override
+	public String extract(InputStream is) throws ExtractorException {
+		try {
+			Document odfDocument = Document.loadDocument(is);
+			try {
+				TextExtractor extractor = TextExtractor.newOdfTextExtractor(odfDocument.getContentRoot());
+
+				return extractor.getText();
+			} finally {
+				// Release the package and temporary data held by the loaded document.
+				odfDocument.close();
+			}
+		} catch (Exception e) {
+			throw new ExtractorException(e);
+		}
+	}
+}
@@ -0,0 +1,32 @@
+package info.bukova.isspst.services.fulltext;
+
+import com.lowagie.text.pdf.PdfReader;
+import com.lowagie.text.pdf.parser.PdfTextExtractor;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Text extractor for PDF documents, backed by the iText (com.lowagie) parser.
+ *
+ * @author Pepa Rokos
+ */
+public class PdfExtractor extends AbstractExtractor implements Extractor {
+
+	/**
+	 * Reads the PDF from the stream and concatenates the text of all pages,
+	 * separating consecutive pages with a newline.
+	 *
+	 * @param is source stream of the PDF document
+	 * @return extracted plain text
+	 * @throws ExtractorException wrapping any IOException raised while reading
+	 */
+	@Override
+	public String extract(InputStream is) throws ExtractorException {
+		PdfReader reader = null;
+		try {
+			reader = new PdfReader(is);
+			PdfTextExtractor extractor = new PdfTextExtractor(reader);
+			StringBuilder sb = new StringBuilder();
+			int pageCount = reader.getNumberOfPages();
+
+			for (int page = 1; page <= pageCount; page++) {
+				if (page > 1) {
+					// Keep the last word of one page from fusing with the first word of the next.
+					sb.append('\n');
+				}
+				sb.append(extractor.getTextFromPage(page));
+			}
+
+			return sb.toString();
+
+		} catch (IOException e) {
+			throw new ExtractorException(e);
+		} finally {
+			if (reader != null) {
+				reader.close();
+			}
+		}
+	}
+
+}
@@ -0,0 +1,20 @@
+package info.bukova.isspst.services.fulltext;
+
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xslf.usermodel.XMLSlideShow;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Text extractor for OOXML presentations, backed by POI's XSLF extractor.
+ *
+ * @author Pepa Rokos
+ */
+public class PowerPointExtractor extends AbstractOfficeExtractor implements Extractor {
+
+	/** Parses the stream as an XML slide show and wraps it in a text extractor. */
+	@Override
+	protected POIXMLTextExtractor createExtractor(InputStream is) throws IOException {
+		XMLSlideShow slideShow = new XMLSlideShow(is);
+		return new XSLFPowerPointExtractor(slideShow);
+	}
+
+}
@@ -0,0 +1,19 @@
+package info.bukova.isspst.services.fulltext;
+
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Text extractor for OOXML word-processing documents, backed by POI's XWPF extractor.
+ *
+ * @author Pepa Rokos
+ */
+public class WordExtractor extends AbstractOfficeExtractor implements Extractor {
+
+	/** Parses the stream as an XWPF document and wraps it in a text extractor. */
+	@Override
+	protected POIXMLTextExtractor createExtractor(InputStream is) throws IOException {
+		XWPFDocument document = new XWPFDocument(is);
+		return new XWPFWordExtractor(document);
+	}
+}
@@ -2,6 +2,8 @@ package info.bukova.isspst.storage;

 import info.bukova.isspst.dao.QueryDao;
 import info.bukova.isspst.data.FileMetainfo;
+import info.bukova.isspst.services.fulltext.Extractor;
+import info.bukova.isspst.services.fulltext.ExtractorFactory;
 import org.apache.commons.codec.binary.Hex;
 import org.hibernate.Query;
 import org.springframework.beans.factory.annotation.Autowired;
@@ -68,6 +70,15 @@ public class DocumentFileStorageImpl extends AbstractFileStorage<FileMetainfo> i
 		return fileName;
 	}

+	/**
+	 * Extracts plain text from the stream and stores it on the file's content,
+	 * when an extractor exists for the file's content type. The stream is
+	 * always closed before returning.
+	 *
+	 * @param is stream over the file data; consumed and closed by this method
+	 * @param fileID metadata whose content type selects the extractor and whose content receives the text
+	 */
+	private void extractContent(InputStream is, FileMetainfo fileID) {
+		try {
+			Extractor extractor = ExtractorFactory.createExtractor(fileID.getContentType());
+
+			if (extractor != null) {
+				fileID.getContent().setPlainText(extractor.extract(is));
+			}
+		} finally {
+			try {
+				is.close();
+			} catch (java.io.IOException ignored) {
+				// Extraction has already finished or failed; a close failure must not mask that outcome.
+			}
+		}
+	}
+
 	@Override
 	@Transactional
 	public void removeFile(FileMetainfo fileID) {
@@ -126,6 +137,7 @@ public class DocumentFileStorageImpl extends AbstractFileStorage<FileMetainfo> i

 		if (!checkForDuplicate(new ByteArrayInputStream(data), metaInfo)) {
 			saveFile(data, metaInfo);
+			extractContent(new ByteArrayInputStream(data), metaInfo);
 		}

 		return metaInfo;
@@ -140,6 +152,7 @@ public class DocumentFileStorageImpl extends AbstractFileStorage<FileMetainfo> i
 		try {
 			if (!checkForDuplicate(new FileInputStream(file), metaInfo)) {
 				saveFile(file, metaInfo);
+				extractContent(new FileInputStream(file), metaInfo);
 			}
 		} catch (FileNotFoundException e) {
 			//TODO: ošetřit
@@ -192,9 +205,8 @@ public class DocumentFileStorageImpl extends AbstractFileStorage<FileMetainfo> i

 		if (!found.isEmpty()) {
 			FileMetainfo foundInfo = found.get(0);
-			info.setPathInFilesystem(foundInfo.getPathInFilesystem());
 			info.setMd5(foundInfo.getMd5());
-			info.setContentType(foundInfo.getContentType());
+			info.setContent(foundInfo.getContent());

 			return true;
 		} else {
@@ -231,7 +231,7 @@ public class MimeTypes {
 			if (mimeTypes.size() == 0)
 			{
 				HashMap<String, String> tempMap = new HashMap<String, String>();
-				InputStream is = MimeTypes.class.getResourceAsStream("mime.types.properties");
+				InputStream is = MimeTypes.class.getResourceAsStream("/mime.types.properties");
 				try
 				{
 					Properties properties = new Properties();
@@ -248,6 +248,7 @@ public class MimeTypes {
 				}
 				catch (IOException e)
 				{
+					//ToDo: ošetřit
 					//Debug.error(e);
 				}
 				finally
@@ -258,6 +259,7 @@ public class MimeTypes {
 					}
 					catch (IOException e)
 					{
+						//ToDo: ošetřit
 						//Debug.error(e);
 					}
 				}
@@ -1,14 +1,13 @@
 package info.bukova.isspst.ui.search;

 import info.bukova.isspst.UrlResolverHolder;
-import info.bukova.isspst.services.FullTextService;
-
-import java.util.List;
-
+import info.bukova.isspst.services.fulltext.FullTextService;
 import org.zkoss.bind.annotation.Command;
 import org.zkoss.bind.annotation.NotifyChange;
 import org.zkoss.zk.ui.select.annotation.WireVariable;

+import java.util.List;
+
 public class SearchForm {
 	
 	@WireVariable
@@ -34,5 +34,6 @@
        <mapping class="info.bukova.isspst.data.Invoicing"></mapping>
        <mapping class="info.bukova.isspst.data.InvoicingItem"></mapping>
        <mapping class="info.bukova.isspst.data.FileMetainfo"></mapping>
+        <mapping class="info.bukova.isspst.data.FileContent"></mapping>
    </session-factory>
 </hibernate-configuration>
@@ -107,7 +107,7 @@
 				<entry key="#{T(info.bukova.isspst.data.Requirement)}" value-ref="reqEditEval"/>
 				<entry key="#{T(info.bukova.isspst.data.TripRequirement)}" value-ref="tripReqEditEval"/>
 				<entry key="#{T(info.bukova.isspst.data.User)}" value-ref="userEvaluator"/>
-				<entry key="#{T(info.bukova.isspst.services.FullTextService)}" value-ref="serviceEval"/>
+				<entry key="#{T(info.bukova.isspst.services.fulltext.FullTextService)}" value-ref="serviceEval"/>
 			</map>
 		</property>
 		<property name="specialEvaluators">
@@ -440,6 +440,6 @@
 		<property name="validator" ref="validator"/>
 	</bean>
 	
-	<bean id="fulltextService" class="info.bukova.isspst.services.FullTextServiceImpl"/>
+	<bean id="fulltextService" class="info.bukova.isspst.services.fulltext.FullTextServiceImpl"/>
 	
 </beans>