/**
 *
 */
package org.dllearner.algorithms.isle.index.syntactic;

import java.io.IOException;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.dllearner.algorithms.isle.TextDocumentGenerator;
import org.dllearner.algorithms.isle.index.AnnotatedDocument;
import org.dllearner.algorithms.isle.index.AnnotatedTextDocument;
import org.dllearner.algorithms.isle.index.Index;
import org.dllearner.algorithms.isle.index.TextDocument;
import org.dllearner.algorithms.isle.index.Token;
import org.dllearner.algorithms.isle.textretrieval.AnnotationEntityTextRetriever;
import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever;
import org.semanticweb.owlapi.model.OWLEntity;
import org.semanticweb.owlapi.model.OWLOntology;

/**
 * {@link Index} implementation that answers document lookups by querying a
 * Lucene index. For an entity, the tokens returned by the configured
 * {@link AnnotationEntityTextRetriever} are each parsed into a query against
 * {@code searchField}, and every hit is converted into an
 * {@link AnnotatedTextDocument} without annotations.
 *
 * @author Lorenz Buehmann
 */
public class LuceneSyntacticIndex implements Index {

    private final IndexSearcher searcher;
    private final QueryParser parser;
    private final IndexReader indexReader;
    private final String searchField;

    AnnotationEntityTextRetriever textRetriever;

    /**
     * Creates an index on top of an already opened Lucene reader.
     *
     * @param ontology    ontology handed to the {@link RDFSLabelEntityTextRetriever}
     *                    that supplies the tokens to search for
     * @param indexReader reader over the Lucene index to query
     * @param searchField name of the field that is both the default query field
     *                    and the field whose stored value becomes the document text
     */
    public LuceneSyntacticIndex(OWLOntology ontology, IndexReader indexReader, String searchField) {
        this.indexReader = indexReader;
        this.searchField = searchField;
        searcher = new IndexSearcher(indexReader);
        StandardAnalyzer analyzer = new StandardAnalyzer();
        parser = new QueryParser(searchField, analyzer);

        textRetriever = new RDFSLabelEntityTextRetriever(ontology);
    }

    /**
     * Creates an index by opening a reader on the given Lucene directory.
     *
     * @param ontology    ontology handed to the text retriever
     * @param directory   Lucene directory containing the index
     * @param searchField name of the field to query and to read document text from
     * @throws Exception if the directory cannot be opened for reading
     */
    public LuceneSyntacticIndex(OWLOntology ontology, Directory directory, String searchField) throws Exception {
        this(ontology, DirectoryReader.open(directory), searchField);
    }

    /**
     * Creates an index by opening the Lucene index stored at a file system path.
     *
     * @param ontology       ontology handed to the text retriever
     * @param indexDirectory file system path of the Lucene index directory
     * @param searchField    name of the field to query and to read document text from
     * @throws Exception if the index cannot be opened for reading
     */
    public LuceneSyntacticIndex(OWLOntology ontology, String indexDirectory, String searchField) throws Exception {
        this(ontology, DirectoryReader.open(FSDirectory.open(Paths.get(indexDirectory))), searchField);
    }

    /**
     * Collects the documents that match any token of the text retrieved for
     * the given entity. Each token's raw form is searched on its own; the
     * weights attached to the retrieved text are not used.
     *
     * @param entity the entity whose relevant text is used as search input
     * @return the matching documents, each wrapped with an empty annotation
     *         set; empty if nothing matches
     */
    @Override
    public Set<AnnotatedDocument> getDocuments(OWLEntity entity) {
        Set<AnnotatedDocument> documents = new HashSet<>();

        Map<List<Token>, Double> relevantText = textRetriever.getRelevantText(entity);

        // Lucene rejects a hit limit of 0 with an IllegalArgumentException, which
        // is what numDocs() yields for an empty index; clamp to at least 1.
        int maxHits = Math.max(1, indexReader.numDocs());

        for (Entry<List<Token>, Double> entry : relevantText.entrySet()) {
            for (Token token : entry.getKey()) {
                String rawForm = token.getRawForm();
                if (rawForm == null || rawForm.trim().isEmpty()) {
                    // Nothing to search for; avoids an NPE / parse error in the parser.
                    continue;
                }
                try {
                    // Escape query syntax characters so the token is matched literally
                    // instead of being interpreted as (possibly malformed) query syntax.
                    Query query = parser.parse(QueryParser.escape(rawForm));
                    ScoreDoc[] hits = searcher.search(query, maxHits).scoreDocs;
                    for (ScoreDoc hit : hits) {
                        Document doc = searcher.doc(hit.doc);
                        documents.add(new AnnotatedTextDocument(
                                TextDocumentGenerator.getInstance().generateDocument(doc.get(searchField)),
                                Collections.emptySet()));
                    }
                } catch (ParseException | IOException e) {
                    // Best effort: a failure for one token must not abort the remaining tokens.
                    e.printStackTrace();
                }
            }
        }

        return documents;
    }

    /**
     * Returns the number of documents in the underlying index as reported by
     * {@link IndexReader#numDocs()}.
     *
     * @return the number of documents in the index
     */
    @Override
    public long getTotalNumberOfDocuments() {
        return indexReader.numDocs();
    }

    /**
     * Reads every document of the index and converts the stored value of the
     * search field into a {@link TextDocument}. Documents that cannot be read
     * are reported on standard error and skipped.
     *
     * @return the generated text documents
     */
    public Set<TextDocument> getAllDocuments() {
        int numDocs = indexReader.numDocs();
        Set<TextDocument> documents = new HashSet<>(numDocs);
        // NOTE(review): this walks doc ids 0..numDocs()-1, which presumably only
        // covers all live documents when the index has no deletions - confirm
        // whether maxDoc() plus a live-docs check is needed here.
        for (int docId = 0; docId < numDocs; docId++) {
            try {
                Document doc = indexReader.document(docId);
                String content = doc.get(searchField);
                documents.add(TextDocumentGenerator.getInstance().generateDocument(content));
            } catch (IOException e) {
                // Best effort: skip the unreadable document and continue with the rest.
                e.printStackTrace();
            }
        }
        return documents;
    }

    /**
     * Not implemented: always returns {@code 0}, regardless of the entity.
     *
     * @param entity ignored
     * @return always {@code 0}
     */
    @Override
    public long getNumberOfDocumentsFor(OWLEntity entity) {
        return 0;
    }

    /**
     * Not implemented: always returns {@code 0}, regardless of the entities.
     *
     * @param entities ignored
     * @return always {@code 0}
     */
    @Override
    public long getNumberOfDocumentsFor(OWLEntity... entities) {
        return 0;
    }

}