Obsah přiložených dokumentů ve formátech ODF - OpenOffice, openxml - MS Office a PDF se extrahuje do čistého textu a indexuje pro fulltextové vyhledávání.

closes #211
Verze_2.0
Josef Rokos 10 years ago
parent 860c7227cd
commit 1d2810f78d

@ -349,6 +349,19 @@
<version>2.4</version>
</dependency>
<!-- Text extractors -->
<dependency>
<groupId>org.apache.odftoolkit</groupId>
<artifactId>simple-odf</artifactId>
<version>0.7-incubating</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.11</version>
</dependency>
<!-- Test -->
<dependency>
<groupId>junit</groupId>

@ -11,8 +11,8 @@ import info.bukova.isspst.data.User;
import info.bukova.isspst.reporting.Report;
import info.bukova.isspst.reporting.ReportMapping;
import info.bukova.isspst.reporting.ReportType;
import info.bukova.isspst.services.FullTextService;
import info.bukova.isspst.services.dbinfo.DbInfoService;
import info.bukova.isspst.services.fulltext.FullTextService;
import info.bukova.isspst.services.munits.MUnitService;
import info.bukova.isspst.services.numberseries.NumberSeriesService;
import info.bukova.isspst.services.requirement.RequirementTypeService;
@ -20,19 +20,17 @@ import info.bukova.isspst.services.settings.GlobalSettingsService;
import info.bukova.isspst.services.users.PermissionService;
import info.bukova.isspst.services.users.RoleService;
import info.bukova.isspst.services.users.UserService;
import java.math.BigDecimal;
import java.util.List;
import javax.servlet.ServletContextEvent;
import javax.servlet.ServletContextListener;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.security.core.userdetails.UsernameNotFoundException;
import org.springframework.web.context.WebApplicationContext;
import org.springframework.web.context.support.WebApplicationContextUtils;
import javax.servlet.ServletContextEvent;
import javax.servlet.ServletContextListener;
import java.math.BigDecimal;
import java.util.List;
public class AppInitListener implements ServletContextListener {

@ -10,7 +10,7 @@ import info.bukova.isspst.data.TripBill;
import info.bukova.isspst.data.TripRequirement;
import info.bukova.isspst.reporting.Report;
import info.bukova.isspst.reporting.ReportMapping;
import info.bukova.isspst.services.FullTextService;
import info.bukova.isspst.services.fulltext.FullTextService;
import info.bukova.isspst.services.addressbook.AdbService;
import info.bukova.isspst.services.buildings.BuildingService;
import info.bukova.isspst.services.invoicing.InvoicingService;

@ -0,0 +1,97 @@
package info.bukova.isspst.data;
import org.hibernate.annotations.Type;
import org.hibernate.search.annotations.Analyze;
import org.hibernate.search.annotations.Field;
import org.hibernate.search.annotations.Index;
import org.hibernate.search.annotations.Indexed;
import javax.persistence.Column;
import javax.persistence.Entity;
import javax.persistence.GeneratedValue;
import javax.persistence.Id;
import javax.persistence.Table;
/**
 * Persistent entity mapped to the {@code FILE_CONTENTS} table. It keeps the
 * plain text of a file together with the file's content type and its path in
 * the filesystem. The plain text is exposed to Hibernate Search as an
 * analyzed, indexed field.
 *
 * @author Pepa Rokos
 */
@Entity
@Table(name = "FILE_CONTENTS")
@Indexed
public class FileContent {

    @Id
    @Column(name = "ID")
    @GeneratedValue
    private int id;

    @Column(name = "CONTENT")
    @Type(type = "text")
    @Field(index = Index.YES, analyze = Analyze.YES)
    private String plainText;

    @Column(name = "CONTENT_TYPE")
    private String contentType;

    @Column(name = "PATH_IN_FILESYSTEM")
    private String pathInFilesystem;

    /** Null-safe equality of two values; two nulls are considered equal. */
    private static boolean sameValue(Object left, Object right) {
        return left == null ? right == null : left.equals(right);
    }

    /** Null-safe hash code; {@code null} hashes to 0. */
    private static int hashOf(Object value) {
        return value == null ? 0 : value.hashCode();
    }

    /** @return the generated identifier */
    public int getId() {
        return id;
    }

    /** @param id the identifier to set */
    public void setId(int id) {
        this.id = id;
    }

    /** @return the indexed plain text, may be {@code null} */
    public String getPlainText() {
        return plainText;
    }

    /** @param content the plain text to store and index */
    public void setPlainText(String content) {
        this.plainText = content;
    }

    /** @return the content type, may be {@code null} */
    public String getContentType() {
        return contentType;
    }

    /** @param contentType the content type to set */
    public void setContentType(String contentType) {
        this.contentType = contentType;
    }

    /** @return the path of the file in the filesystem, may be {@code null} */
    public String getPathInFilesystem() {
        return pathInFilesystem;
    }

    /** @param pathInFilesystem the path of the file in the filesystem */
    public void setPathInFilesystem(String pathInFilesystem) {
        this.pathInFilesystem = pathInFilesystem;
    }

    /**
     * Two instances are equal when their id, plain text, content type and
     * filesystem path are all equal.
     */
    @Override
    public boolean equals(Object o) {
        if (o == this) {
            return true;
        }
        if (!(o instanceof FileContent)) {
            return false;
        }
        FileContent other = (FileContent) o;
        return id == other.id
                && sameValue(plainText, other.plainText)
                && sameValue(contentType, other.contentType)
                && sameValue(pathInFilesystem, other.pathInFilesystem);
    }

    /** Hash built from the same four fields that {@link #equals(Object)} compares. */
    @Override
    public int hashCode() {
        int hash = id;
        hash = 31 * hash + hashOf(plainText);
        hash = 31 * hash + hashOf(contentType);
        hash = 31 * hash + hashOf(pathInFilesystem);
        return hash;
    }
}

@ -1,13 +1,17 @@
package info.bukova.isspst.data;
import org.hibernate.annotations.Type;
import org.hibernate.search.annotations.Analyze;
import org.hibernate.search.annotations.Field;
import org.hibernate.search.annotations.Index;
import org.hibernate.search.annotations.Indexed;
import org.hibernate.search.annotations.IndexedEmbedded;
import javax.persistence.CascadeType;
import javax.persistence.Column;
import javax.persistence.Entity;
import javax.persistence.FetchType;
import javax.persistence.JoinColumn;
import javax.persistence.ManyToOne;
import javax.persistence.Table;
@Entity
@ -16,23 +20,32 @@ import javax.persistence.Table;
public class FileMetainfo extends BaseData {
@Column(name = "FILE_NAME")
@Field(index = Index.YES, analyze = Analyze.YES)
private String fileName;
@Column(name = "PATH_IN_FILESYSTEM")
private String pathInFilesystem;
@Column(name = "MODULE_ID")
private String moduleId;
@Column(name = "RECORD_ID")
private int recordId;
@Column(name = "CONTENT")
@Type(type = "text")
@Field(index = Index.YES, analyze = Analyze.YES)
private String content;
@ManyToOne(fetch = FetchType.EAGER, cascade = CascadeType.ALL)
@JoinColumn(name = "CONTENT_ID")
@IndexedEmbedded
private FileContent content;
@Column(name = "MD5")
private String md5;
@Column(name = "DESCRIPTION")
@Field(index = Index.YES, analyze = Analyze.YES)
private String description;
@Column(name = "CONTENT_TYPE")
private String contentType;
private void ensureContentExists() {
if (content == null) {
content = new FileContent();
}
}
public String getFileName() {
return fileName;
@ -43,11 +56,15 @@ public class FileMetainfo extends BaseData {
}
public String getPathInFilesystem() {
return pathInFilesystem;
if (content != null) {
return content.getPathInFilesystem();
}
return null;
}
public void setPathInFilesystem(String pathInFilesystem) {
this.pathInFilesystem = pathInFilesystem;
ensureContentExists();
content.setPathInFilesystem(pathInFilesystem);
}
public String getModuleId() {
@ -66,11 +83,11 @@ public class FileMetainfo extends BaseData {
this.recordId = recordId;
}
public String getContent() {
public FileContent getContent() {
return content;
}
public void setContent(String content) {
public void setContent(FileContent content) {
this.content = content;
}
@ -91,11 +108,16 @@ public class FileMetainfo extends BaseData {
}
public String getContentType() {
return contentType;
if (content != null) {
return content.getContentType();
}
return null;
}
public void setContentType(String contentType) {
this.contentType = contentType;
ensureContentExists();
content.setContentType(contentType);
}
@Override
@ -105,16 +127,12 @@ public class FileMetainfo extends BaseData {
FileMetainfo that = (FileMetainfo) o;
if (getId() != 0 && getId() != that.getId()) return false;
if (recordId != that.recordId) return false;
if (content != null ? !content.equals(that.content) : that.content != null) return false;
if (contentType != null ? !contentType.equals(that.contentType) : that.contentType != null) return false;
if (description != null ? !description.equals(that.description) : that.description != null) return false;
if (fileName != null ? !fileName.equals(that.fileName) : that.fileName != null) return false;
if (md5 != null ? !md5.equals(that.md5) : that.md5 != null) return false;
if (moduleId != null ? !moduleId.equals(that.moduleId) : that.moduleId != null) return false;
if (pathInFilesystem != null ? !pathInFilesystem.equals(that.pathInFilesystem) : that.pathInFilesystem != null)
return false;
return true;
}
@ -122,13 +140,11 @@ public class FileMetainfo extends BaseData {
@Override
public int hashCode() {
int result = fileName != null ? fileName.hashCode() : 0;
result = 31 * result + (pathInFilesystem != null ? pathInFilesystem.hashCode() : 0);
result = 31 * result + (moduleId != null ? moduleId.hashCode() : 0);
result = 31 * result + recordId;
result = 31 * result + (content != null ? content.hashCode() : 0);
result = 31 * result + (md5 != null ? md5.hashCode() : 0);
result = 31 * result + (description != null ? description.hashCode() : 0);
result = 31 * result + (contentType != null ? contentType.hashCode() : 0);
return result;
}
}

@ -0,0 +1,14 @@
package info.bukova.isspst.services.fulltext;
import java.io.ByteArrayInputStream;
/**
* @author Pepa Rokos
*/
public abstract class AbstractExtractor implements Extractor {
public String extract(byte[] data) {
return extract(new ByteArrayInputStream(data));
}
}

@ -0,0 +1,25 @@
package info.bukova.isspst.services.fulltext;
import org.apache.poi.POIXMLTextExtractor;
import java.io.IOException;
import java.io.InputStream;
/**
 * Base class for extractors built on Apache POI's {@link POIXMLTextExtractor}.
 * Subclasses supply the format-specific POI extractor; this class obtains the
 * text from it, releases it, and wraps I/O failures in
 * {@link ExtractorException}.
 *
 * @author Pepa Rokos
 */
public abstract class AbstractOfficeExtractor extends AbstractExtractor {

    /**
     * Extracts text from the document read from the given stream.
     * The stream itself is not closed here.
     *
     * @param is source stream with the document
     * @return text reported by the POI extractor
     * @throws ExtractorException when creating, reading or closing the POI extractor fails with an I/O error
     */
    @Override
    public String extract(InputStream is) throws ExtractorException {
        try {
            POIXMLTextExtractor extractor = createExtractor(is);
            try {
                return extractor.getText();
            } finally {
                // Release the document/package held by the POI extractor once the text is read.
                extractor.close();
            }
        } catch (IOException e) {
            throw new ExtractorException(e);
        }
    }

    /**
     * Creates the format-specific POI text extractor for the given stream.
     *
     * @param is source stream with the document
     * @return POI text extractor for the document
     * @throws IOException when the document cannot be read
     */
    protected abstract POIXMLTextExtractor createExtractor(InputStream is) throws IOException;
}

@ -0,0 +1,19 @@
package info.bukova.isspst.services.fulltext;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import java.io.IOException;
import java.io.InputStream;
/**
 * Extractor that reads the stream as an {@link XSSFWorkbook} and exposes its
 * text through POI's {@link XSSFExcelExtractor}.
 *
 * @author Pepa Rokos
 */
public class ExcelExtractor extends AbstractOfficeExtractor implements Extractor {

    /**
     * Opens the workbook from the stream and wraps it in a text extractor.
     *
     * @param is source stream with the workbook
     * @return POI text extractor for the workbook
     * @throws IOException when the workbook cannot be read
     */
    @Override
    protected POIXMLTextExtractor createExtractor(InputStream is) throws IOException {
        XSSFWorkbook workbook = new XSSFWorkbook(is);
        return new XSSFExcelExtractor(workbook);
    }
}

@ -0,0 +1,30 @@
package info.bukova.isspst.services.fulltext;
import java.io.InputStream;
/**
 * Interface of a plain-text extractor for Office and PDF formats.
 *
 * @author Pepa Rokos
 */
public interface Extractor {

    /**
     * Extracts text from the given byte array.
     *
     * @param data source data
     * @return plain text
     * @throws ExtractorException when the extraction fails
     */
    String extract(byte[] data) throws ExtractorException;

    /**
     * Extracts text from the given {@link InputStream}.
     *
     * @param is source input stream
     * @return plain text
     * @throws ExtractorException when the extraction fails
     */
    String extract(InputStream is) throws ExtractorException;
}

@ -0,0 +1,16 @@
package info.bukova.isspst.services.fulltext;
import info.bukova.isspst.services.IsspstException;
/**
* @author Pepa Rokos
*
* Výjimka extrakce textu
*/
public class ExtractorException extends IsspstException {
public ExtractorException(Throwable cause) {
super("Extractor exception: ", cause);
}
}

@ -0,0 +1,42 @@
package info.bukova.isspst.services.fulltext;
/**
 * Factory selecting the concrete {@link Extractor} for a content type.
 *
 * @author Pepa Rokos
 */
public class ExtractorFactory {

    /**
     * Creates an extractor matching the given content (MIME) type.
     *
     * @param contentType content type of the document, may be {@code null}
     * @return extractor for the type, or {@code null} when the type is
     *         {@code null} or not supported
     */
    public static Extractor createExtractor(String contentType) {
        // Callers may not know the type yet; treat that the same as "unsupported".
        if (contentType == null) {
            return null;
        }

        if ("application/vnd.oasis.opendocument.text".equals(contentType)
                || "application/vnd.oasis.opendocument.spreadsheet".equals(contentType)
                || "application/vnd.oasis.opendocument.presentation".equals(contentType)) {
            return new OdfExtractor();
        }

        if ("application/vnd.openxmlformats-officedocument.wordprocessingml.document".equals(contentType)) {
            return new WordExtractor();
        }

        if ("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet".equals(contentType)) {
            return new ExcelExtractor();
        }

        // ".slideshow" is the type of .ppsx files; regular .pptx presentations use ".presentation".
        if ("application/vnd.openxmlformats-officedocument.presentationml.slideshow".equals(contentType)
                || "application/vnd.openxmlformats-officedocument.presentationml.presentation".equals(contentType)) {
            return new PowerPointExtractor();
        }

        if ("application/pdf".equals(contentType)) {
            return new PdfExtractor();
        }

        return null;
    }
}

@ -1,10 +1,10 @@
package info.bukova.isspst.services;
import java.util.List;
package info.bukova.isspst.services.fulltext;
import org.hibernate.search.annotations.Field;
import org.hibernate.search.annotations.Indexed;
import java.util.List;
/**
* @author Pepa Rokos
*

@ -1,9 +1,10 @@
package info.bukova.isspst.services;
package info.bukova.isspst.services.fulltext;
import info.bukova.isspst.ModuleUtils;
import info.bukova.isspst.dao.QueryDao;
import info.bukova.isspst.data.BaseData;
import info.bukova.isspst.data.User;
import info.bukova.isspst.services.ModuleNotActiveException;
import info.bukova.isspst.sort.ReflectionTools;
import org.apache.lucene.search.Query;
import org.hibernate.Hibernate;

@ -0,0 +1,24 @@
package info.bukova.isspst.services.fulltext;
import org.odftoolkit.simple.Document;
import org.odftoolkit.simple.common.TextExtractor;
import java.io.InputStream;
/**
 * Extractor for ODF documents based on the ODF Toolkit Simple API.
 *
 * @author Pepa Rokos
 */
public class OdfExtractor extends AbstractExtractor implements Extractor {

    /**
     * Loads the ODF document from the stream and returns the text of its
     * content root. The stream itself is not closed here.
     *
     * @param is source stream with the ODF document
     * @return text of the document content
     * @throws ExtractorException wrapping any exception raised while loading or reading the document
     */
    @Override
    public String extract(InputStream is) throws ExtractorException {
        try {
            Document odfDocument = Document.loadDocument(is);
            try {
                TextExtractor extractor = TextExtractor.newOdfTextExtractor(odfDocument.getContentRoot());
                return extractor.getText();
            } finally {
                // Release the package and data held by the loaded document.
                odfDocument.close();
            }
        } catch (Exception e) {
            throw new ExtractorException(e);
        }
    }
}

@ -0,0 +1,32 @@
package info.bukova.isspst.services.fulltext;
import com.lowagie.text.pdf.PdfReader;
import com.lowagie.text.pdf.parser.PdfTextExtractor;
import java.io.IOException;
import java.io.InputStream;
/**
 * Extractor for PDF documents based on iText's {@link PdfReader} and
 * {@link PdfTextExtractor}.
 *
 * @author Pepa Rokos
 */
public class PdfExtractor extends AbstractExtractor implements Extractor {

    /**
     * Reads the PDF from the stream and returns the text of all its pages,
     * with a line break between consecutive pages. The stream itself is not
     * closed here.
     *
     * @param is source stream with the PDF document
     * @return text of all pages
     * @throws ExtractorException when reading the PDF fails with an I/O error
     */
    @Override
    public String extract(InputStream is) throws ExtractorException {
        try {
            PdfReader reader = new PdfReader(is);
            try {
                PdfTextExtractor extractor = new PdfTextExtractor(reader);
                int pageCount = reader.getNumberOfPages();
                StringBuilder sb = new StringBuilder();

                // iText page numbers are 1-based.
                for (int page = 1; page <= pageCount; page++) {
                    if (page > 1) {
                        // Keep the last word of one page and the first word of the
                        // next from being fused into a single token in the index.
                        sb.append('\n');
                    }
                    sb.append(extractor.getTextFromPage(page));
                }

                return sb.toString();
            } finally {
                // Release the resources held by the reader.
                reader.close();
            }
        } catch (IOException e) {
            throw new ExtractorException(e);
        }
    }
}

@ -0,0 +1,20 @@
package info.bukova.isspst.services.fulltext;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import java.io.IOException;
import java.io.InputStream;
/**
 * Extractor that reads the stream as an {@link XMLSlideShow} and exposes its
 * text through POI's {@link XSLFPowerPointExtractor}.
 *
 * @author Pepa Rokos
 */
public class PowerPointExtractor extends AbstractOfficeExtractor implements Extractor {

    /**
     * Opens the slide show from the stream and wraps it in a text extractor.
     *
     * @param is source stream with the presentation
     * @return POI text extractor for the presentation
     * @throws IOException when the presentation cannot be read
     */
    @Override
    protected POIXMLTextExtractor createExtractor(InputStream is) throws IOException {
        XMLSlideShow slideShow = new XMLSlideShow(is);
        return new XSLFPowerPointExtractor(slideShow);
    }
}

@ -0,0 +1,19 @@
package info.bukova.isspst.services.fulltext;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import java.io.IOException;
import java.io.InputStream;
/**
 * Extractor that reads the stream as an {@link XWPFDocument} and exposes its
 * text through POI's {@link XWPFWordExtractor}.
 *
 * @author Pepa Rokos
 */
public class WordExtractor extends AbstractOfficeExtractor implements Extractor {

    /**
     * Opens the document from the stream and wraps it in a text extractor.
     *
     * @param is source stream with the document
     * @return POI text extractor for the document
     * @throws IOException when the document cannot be read
     */
    @Override
    protected POIXMLTextExtractor createExtractor(InputStream is) throws IOException {
        XWPFDocument document = new XWPFDocument(is);
        return new XWPFWordExtractor(document);
    }
}

@ -2,6 +2,8 @@ package info.bukova.isspst.storage;
import info.bukova.isspst.dao.QueryDao;
import info.bukova.isspst.data.FileMetainfo;
import info.bukova.isspst.services.fulltext.Extractor;
import info.bukova.isspst.services.fulltext.ExtractorFactory;
import org.apache.commons.codec.binary.Hex;
import org.hibernate.Query;
import org.springframework.beans.factory.annotation.Autowired;
@ -68,6 +70,15 @@ public class DocumentFileStorageImpl extends AbstractFileStorage<FileMetainfo> i
return fileName;
}
/**
 * Extracts plain text from the stream and stores it in the content of the
 * given file metainfo. Nothing is stored when the content type is unknown
 * or no extractor exists for it. The stream is always closed before this
 * method returns.
 *
 * @param is     stream with the file data; closed by this method
 * @param fileID metainfo whose content receives the extracted text
 */
private void extractContent(InputStream is, FileMetainfo fileID) {
    try {
        String contentType = fileID.getContentType();
        if (contentType == null) {
            // Without a content type there is no way to pick an extractor.
            return;
        }

        Extractor extractor = ExtractorFactory.createExtractor(contentType);
        if (extractor != null) {
            fileID.getContent().setPlainText(extractor.extract(is));
        }
    } finally {
        try {
            is.close();
        } catch (java.io.IOException ignored) {
            // The stream was only read; a failure while closing it cannot
            // affect text that has already been extracted.
        }
    }
}
@Override
@Transactional
public void removeFile(FileMetainfo fileID) {
@ -126,6 +137,7 @@ public class DocumentFileStorageImpl extends AbstractFileStorage<FileMetainfo> i
if (!checkForDuplicate(new ByteArrayInputStream(data), metaInfo)) {
saveFile(data, metaInfo);
extractContent(new ByteArrayInputStream(data), metaInfo);
}
return metaInfo;
@ -140,6 +152,7 @@ public class DocumentFileStorageImpl extends AbstractFileStorage<FileMetainfo> i
try {
if (!checkForDuplicate(new FileInputStream(file), metaInfo)) {
saveFile(file, metaInfo);
extractContent(new FileInputStream(file), metaInfo);
}
} catch (FileNotFoundException e) {
//TODO: ošetřit
@ -192,9 +205,8 @@ public class DocumentFileStorageImpl extends AbstractFileStorage<FileMetainfo> i
if (!found.isEmpty()) {
FileMetainfo foundInfo = found.get(0);
info.setPathInFilesystem(foundInfo.getPathInFilesystem());
info.setMd5(foundInfo.getMd5());
info.setContentType(foundInfo.getContentType());
info.setContent(foundInfo.getContent());
return true;
} else {

@ -231,7 +231,7 @@ public class MimeTypes {
if (mimeTypes.size() == 0)
{
HashMap<String, String> tempMap = new HashMap<String, String>();
InputStream is = MimeTypes.class.getResourceAsStream("mime.types.properties");
InputStream is = MimeTypes.class.getResourceAsStream("/mime.types.properties");
try
{
Properties properties = new Properties();
@ -248,6 +248,7 @@ public class MimeTypes {
}
catch (IOException e)
{
//ToDo: ošetřit
//Debug.error(e);
}
finally
@ -258,6 +259,7 @@ public class MimeTypes {
}
catch (IOException e)
{
//ToDo: ošetřit
//Debug.error(e);
}
}

@ -1,14 +1,13 @@
package info.bukova.isspst.ui.search;
import info.bukova.isspst.UrlResolverHolder;
import info.bukova.isspst.services.FullTextService;
import java.util.List;
import info.bukova.isspst.services.fulltext.FullTextService;
import org.zkoss.bind.annotation.Command;
import org.zkoss.bind.annotation.NotifyChange;
import org.zkoss.zk.ui.select.annotation.WireVariable;
import java.util.List;
public class SearchForm {
@WireVariable

@ -34,5 +34,6 @@
<mapping class="info.bukova.isspst.data.Invoicing"></mapping>
<mapping class="info.bukova.isspst.data.InvoicingItem"></mapping>
<mapping class="info.bukova.isspst.data.FileMetainfo"></mapping>
<mapping class="info.bukova.isspst.data.FileContent"></mapping>
</session-factory>
</hibernate-configuration>

File diff suppressed because it is too large Load Diff

@ -107,7 +107,7 @@
<entry key="#{T(info.bukova.isspst.data.Requirement)}" value-ref="reqEditEval"/>
<entry key="#{T(info.bukova.isspst.data.TripRequirement)}" value-ref="tripReqEditEval"/>
<entry key="#{T(info.bukova.isspst.data.User)}" value-ref="userEvaluator"/>
<entry key="#{T(info.bukova.isspst.services.FullTextService)}" value-ref="serviceEval"/>
<entry key="#{T(info.bukova.isspst.services.fulltext.FullTextService)}" value-ref="serviceEval"/>
</map>
</property>
<property name="specialEvaluators">
@ -440,6 +440,6 @@
<property name="validator" ref="validator"/>
</bean>
<bean id="fulltextService" class="info.bukova.isspst.services.FullTextServiceImpl"/>
<bean id="fulltextService" class="info.bukova.isspst.services.fulltext.FullTextServiceImpl"/>
</beans>

Loading…
Cancel
Save