Obsah přiložených dokumentů ve formátech ODF - OpenOffice, openxml - MS Office a PDF se extrahuje do čistého textu a indexuje pro fulltextové vyhledávání.
closes #211
Verze_2.0
parent
860c7227cd
commit
1d2810f78d
@ -0,0 +1,97 @@
|
||||
package info.bukova.isspst.data;
|
||||
|
||||
import org.hibernate.annotations.Type;
|
||||
import org.hibernate.search.annotations.Analyze;
|
||||
import org.hibernate.search.annotations.Field;
|
||||
import org.hibernate.search.annotations.Index;
|
||||
import org.hibernate.search.annotations.Indexed;
|
||||
|
||||
import javax.persistence.Column;
|
||||
import javax.persistence.Entity;
|
||||
import javax.persistence.GeneratedValue;
|
||||
import javax.persistence.Id;
|
||||
import javax.persistence.Table;
|
||||
|
||||
/**
|
||||
* @author Pepa Rokos
|
||||
*/
|
||||
@Entity
|
||||
@Table(name = "FILE_CONTENTS")
|
||||
@Indexed
|
||||
public class FileContent {
|
||||
|
||||
@Id
|
||||
@Column(name = "ID")
|
||||
@GeneratedValue
|
||||
private int id;
|
||||
|
||||
@Column(name = "CONTENT")
|
||||
@Type(type = "text")
|
||||
@Field(index = Index.YES, analyze = Analyze.YES)
|
||||
private String plainText;
|
||||
|
||||
@Column(name = "CONTENT_TYPE")
|
||||
private String contentType;
|
||||
|
||||
@Column(name = "PATH_IN_FILESYSTEM")
|
||||
private String pathInFilesystem;
|
||||
|
||||
public int getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(int id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public String getPlainText() {
|
||||
return plainText;
|
||||
|
||||
}
|
||||
|
||||
public void setPlainText(String content) {
|
||||
this.plainText = content;
|
||||
}
|
||||
|
||||
public String getContentType() {
|
||||
return contentType;
|
||||
}
|
||||
|
||||
public void setContentType(String contentType) {
|
||||
this.contentType = contentType;
|
||||
}
|
||||
|
||||
public String getPathInFilesystem() {
|
||||
return pathInFilesystem;
|
||||
}
|
||||
|
||||
public void setPathInFilesystem(String pathInFilesystem) {
|
||||
this.pathInFilesystem = pathInFilesystem;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (!(o instanceof FileContent)) return false;
|
||||
|
||||
FileContent that = (FileContent) o;
|
||||
|
||||
if (id != that.id) return false;
|
||||
if (plainText != null ? !plainText.equals(that.plainText) : that.plainText != null) return false;
|
||||
if (contentType != null ? !contentType.equals(that.contentType) : that.contentType != null) return false;
|
||||
if (pathInFilesystem != null ? !pathInFilesystem.equals(that.pathInFilesystem) : that.pathInFilesystem != null)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
int result = id;
|
||||
result = 31 * result + (plainText != null ? plainText.hashCode() : 0);
|
||||
result = 31 * result + (contentType != null ? contentType.hashCode() : 0);
|
||||
result = 31 * result + (pathInFilesystem != null ? pathInFilesystem.hashCode() : 0);
|
||||
return result;
|
||||
}
|
||||
}
|
@ -0,0 +1,14 @@
|
||||
package info.bukova.isspst.services.fulltext;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
|
||||
/**
|
||||
* @author Pepa Rokos
|
||||
*/
|
||||
public abstract class AbstractExtractor implements Extractor {
|
||||
|
||||
public String extract(byte[] data) {
|
||||
return extract(new ByteArrayInputStream(data));
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,25 @@
|
||||
package info.bukova.isspst.services.fulltext;
|
||||
|
||||
import org.apache.poi.POIXMLTextExtractor;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
/**
|
||||
* @author Pepa Rokos
|
||||
*/
|
||||
public abstract class AbstractOfficeExtractor extends AbstractExtractor {
|
||||
|
||||
@Override
|
||||
public String extract(InputStream is) throws ExtractorException {
|
||||
try {
|
||||
POIXMLTextExtractor extractor = createExtractor(is);
|
||||
return extractor.getText();
|
||||
} catch (IOException e) {
|
||||
throw new ExtractorException(e);
|
||||
}
|
||||
}
|
||||
|
||||
protected abstract POIXMLTextExtractor createExtractor(InputStream is) throws IOException;
|
||||
|
||||
}
|
@ -0,0 +1,19 @@
|
||||
package info.bukova.isspst.services.fulltext;
|
||||
|
||||
import org.apache.poi.POIXMLTextExtractor;
|
||||
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
|
||||
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
/**
|
||||
* @author Pepa Rokos
|
||||
*/
|
||||
public class ExcelExtractor extends AbstractOfficeExtractor implements Extractor {
|
||||
|
||||
@Override
|
||||
protected POIXMLTextExtractor createExtractor(InputStream is) throws IOException {
|
||||
return new XSSFExcelExtractor(new XSSFWorkbook(is));
|
||||
}
|
||||
}
|
@ -0,0 +1,30 @@
|
||||
package info.bukova.isspst.services.fulltext;
|
||||
|
||||
import java.io.InputStream;
|
||||
|
||||
/**
|
||||
* @author Pepa Rokos
|
||||
*
|
||||
* Rozhraní extractoru čistého textu z formátů Office a PDF
|
||||
*/
|
||||
public interface Extractor {
|
||||
|
||||
/**
|
||||
* Extrahuje text z předaného pole bytů
|
||||
*
|
||||
* @param data zdrajová data
|
||||
* @return čistý text
|
||||
* @throws ExtractorException
|
||||
*/
|
||||
public String extract(byte[] data) throws ExtractorException;
|
||||
|
||||
/**
|
||||
* Extrahuje text z předaného InputStream objektu
|
||||
*
|
||||
* @param is zdrojový InputStream
|
||||
* @return čistý text
|
||||
* @throws ExtractorException
|
||||
*/
|
||||
public String extract(InputStream is) throws ExtractorException;
|
||||
|
||||
}
|
@ -0,0 +1,16 @@
|
||||
package info.bukova.isspst.services.fulltext;
|
||||
|
||||
import info.bukova.isspst.services.IsspstException;
|
||||
|
||||
/**
|
||||
* @author Pepa Rokos
|
||||
*
|
||||
* Výjimka extrakce textu
|
||||
*/
|
||||
public class ExtractorException extends IsspstException {
|
||||
|
||||
public ExtractorException(Throwable cause) {
|
||||
super("Extractor exception: ", cause);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,42 @@
|
||||
package info.bukova.isspst.services.fulltext;
|
||||
|
||||
/**
|
||||
* @author Pepa Rokos
|
||||
*
|
||||
* Factory pro konkrétní extractor
|
||||
*/
|
||||
public class ExtractorFactory {
|
||||
|
||||
/**
|
||||
* Vytvoří extractor podle předaného content typu
|
||||
*
|
||||
* @param contentType
|
||||
* @return Extractor
|
||||
*/
|
||||
public static Extractor createExtractor(String contentType) {
|
||||
if (contentType.equals("application/vnd.oasis.opendocument.text")
|
||||
|| contentType.equals("application/vnd.oasis.opendocument.spreadsheet")
|
||||
|| contentType.equals("application/vnd.oasis.opendocument.presentation")) {
|
||||
return new OdfExtractor();
|
||||
}
|
||||
|
||||
if (contentType.equals("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {
|
||||
return new WordExtractor();
|
||||
}
|
||||
|
||||
if (contentType.equals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) {
|
||||
return new ExcelExtractor();
|
||||
}
|
||||
|
||||
if (contentType.equals("application/vnd.openxmlformats-officedocument.presentationml.slideshow")) {
|
||||
return new PowerPointExtractor();
|
||||
}
|
||||
|
||||
if (contentType.equals("application/pdf")) {
|
||||
return new PdfExtractor();
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
}
|
@ -1,10 +1,10 @@
|
||||
package info.bukova.isspst.services;
|
||||
|
||||
import java.util.List;
|
||||
package info.bukova.isspst.services.fulltext;
|
||||
|
||||
import org.hibernate.search.annotations.Field;
|
||||
import org.hibernate.search.annotations.Indexed;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author Pepa Rokos
|
||||
*
|
@ -1,9 +1,10 @@
|
||||
package info.bukova.isspst.services;
|
||||
package info.bukova.isspst.services.fulltext;
|
||||
|
||||
import info.bukova.isspst.ModuleUtils;
|
||||
import info.bukova.isspst.dao.QueryDao;
|
||||
import info.bukova.isspst.data.BaseData;
|
||||
import info.bukova.isspst.data.User;
|
||||
import info.bukova.isspst.services.ModuleNotActiveException;
|
||||
import info.bukova.isspst.sort.ReflectionTools;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.hibernate.Hibernate;
|
@ -0,0 +1,24 @@
|
||||
package info.bukova.isspst.services.fulltext;
|
||||
|
||||
import org.odftoolkit.simple.Document;
|
||||
import org.odftoolkit.simple.common.TextExtractor;
|
||||
|
||||
import java.io.InputStream;
|
||||
|
||||
/**
|
||||
* @author Pepa Rokos
|
||||
*/
|
||||
public class OdfExtractor extends AbstractExtractor implements Extractor {
|
||||
|
||||
@Override
|
||||
public String extract(InputStream is) throws ExtractorException {
|
||||
try {
|
||||
Document odfDocument = Document.loadDocument(is);
|
||||
TextExtractor extractor = TextExtractor.newOdfTextExtractor(odfDocument.getContentRoot());
|
||||
|
||||
return extractor.getText();
|
||||
} catch (Exception e) {
|
||||
throw new ExtractorException(e);
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,32 @@
|
||||
package info.bukova.isspst.services.fulltext;
|
||||
|
||||
import com.lowagie.text.pdf.PdfReader;
|
||||
import com.lowagie.text.pdf.parser.PdfTextExtractor;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
/**
|
||||
* @author Pepa Rokos
|
||||
*/
|
||||
public class PdfExtractor extends AbstractExtractor implements Extractor {
|
||||
|
||||
@Override
|
||||
public String extract(InputStream is) throws ExtractorException {
|
||||
try {
|
||||
PdfReader reader = new PdfReader(is);
|
||||
PdfTextExtractor extractor = new PdfTextExtractor(reader);
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
for (int i = 1; i <= reader.getNumberOfPages(); i++) {
|
||||
sb.append(extractor.getTextFromPage(i));
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new ExtractorException(e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,20 @@
|
||||
package info.bukova.isspst.services.fulltext;
|
||||
|
||||
import org.apache.poi.POIXMLTextExtractor;
|
||||
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
|
||||
import org.apache.poi.xslf.usermodel.XMLSlideShow;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
/**
|
||||
* @author Pepa Rokos
|
||||
*/
|
||||
public class PowerPointExtractor extends AbstractOfficeExtractor implements Extractor {
|
||||
|
||||
@Override
|
||||
protected POIXMLTextExtractor createExtractor(InputStream is) throws IOException {
|
||||
return new XSLFPowerPointExtractor(new XMLSlideShow(is));
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,19 @@
|
||||
package info.bukova.isspst.services.fulltext;
|
||||
|
||||
import org.apache.poi.POIXMLTextExtractor;
|
||||
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFDocument;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
/**
|
||||
* @author Pepa Rokos
|
||||
*/
|
||||
public class WordExtractor extends AbstractOfficeExtractor implements Extractor {
|
||||
|
||||
@Override
|
||||
protected POIXMLTextExtractor createExtractor(InputStream is) throws IOException {
|
||||
return new XWPFWordExtractor(new XWPFDocument(is));
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue