001/**
002 * 
003 */
004package org.dllearner.algorithms.isle.index.syntactic;
005
import java.io.IOException;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.dllearner.algorithms.isle.TextDocumentGenerator;
import org.dllearner.algorithms.isle.index.AnnotatedDocument;
import org.dllearner.algorithms.isle.index.AnnotatedTextDocument;
import org.dllearner.algorithms.isle.index.Index;
import org.dllearner.algorithms.isle.index.TextDocument;
import org.dllearner.algorithms.isle.index.Token;
import org.dllearner.algorithms.isle.textretrieval.AnnotationEntityTextRetriever;
import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever;
import org.semanticweb.owlapi.model.OWLEntity;
import org.semanticweb.owlapi.model.OWLOntology;
036
037/**
038 * @author Lorenz Buehmann
039 *
040 */
041public class LuceneSyntacticIndex implements Index {
042        
043        private IndexSearcher searcher;
044        private QueryParser parser;
045        private IndexReader indexReader;
046        private String searchField;
047        
048        AnnotationEntityTextRetriever textRetriever;
049
050        public LuceneSyntacticIndex(OWLOntology ontology, IndexReader indexReader, String searchField) {
051                this.indexReader = indexReader;
052                this.searchField = searchField;
053                searcher = new IndexSearcher(indexReader);
054                StandardAnalyzer analyzer = new StandardAnalyzer();
055                parser = new QueryParser(searchField, analyzer);
056                
057                textRetriever = new RDFSLabelEntityTextRetriever(ontology);
058        }
059        
060        public LuceneSyntacticIndex(OWLOntology ontology, Directory directory, String searchField) throws Exception {
061                this(ontology, DirectoryReader.open(directory), searchField);
062        }
063        
064        public LuceneSyntacticIndex(OWLOntology ontology, String indexDirectory, String searchField) throws Exception {
065                this(ontology, DirectoryReader.open(FSDirectory.open(Paths.get(indexDirectory))), searchField);
066        }
067
068        /* (non-Javadoc)
069         * @see org.dllearner.algorithms.isle.SyntacticIndex#getDocuments(java.lang.String)
070         */
071        @Override
072        public Set<AnnotatedDocument> getDocuments(OWLEntity entity) {
073                Set<AnnotatedDocument> documents = new HashSet<>();
074                
075                Map<List<Token>, Double> relevantText = textRetriever.getRelevantText(entity);
076                
077                for (Entry<List<Token>, Double> entry : relevantText.entrySet()) {
078                        List<Token> tokens = entry.getKey();
079                        for (Token token : tokens) {
080                                try {
081                                        Query query = parser.parse(token.getRawForm());
082                                        ScoreDoc[] result = searcher.search(query, indexReader.numDocs()).scoreDocs;
083                                        for (ScoreDoc aResult : result) {
084                                                Document doc = searcher.doc(aResult.doc);
085                                                documents.add(new AnnotatedTextDocument(
086                                                                TextDocumentGenerator.getInstance().generateDocument(doc.get(searchField)),
087                                                                Collections.EMPTY_SET));
088                                        }
089                                } catch (ParseException | IOException e) {
090                                        e.printStackTrace();
091                                }
092                        }
093                }
094                
095                return documents;
096        }
097
098        /* (non-Javadoc)
099         * @see org.dllearner.algorithms.isle.index.Index#getTotalNumberOfDocuments()
100         */
101        @Override
102        public long getTotalNumberOfDocuments() {
103                return indexReader.numDocs();
104        }
105        
106        public Set<TextDocument> getAllDocuments(){
107                Set<TextDocument> documents = new HashSet<>(indexReader.numDocs());
108                for (int i = 0; i < indexReader.numDocs(); i++) {
109                        try {
110                                Document doc = indexReader.document(i);
111                                String content = doc.get(searchField);
112                                documents.add(TextDocumentGenerator.getInstance().generateDocument(content));
113                        } catch (IOException e) {
114                                e.printStackTrace();
115                        }
116                }
117                return documents;
118        }
119
120        /* (non-Javadoc)
121         * @see org.dllearner.algorithms.isle.index.Index#getNumberOfDocumentsFor(org.dllearner.core.owl.Entity)
122         */
123        @Override
124        public long getNumberOfDocumentsFor(OWLEntity entity) {
125                return 0;
126        }
127
128        /* (non-Javadoc)
129         * @see org.dllearner.algorithms.isle.index.Index#getNumberOfDocumentsFor(org.dllearner.core.owl.Entity[])
130         */
131        @Override
132        public long getNumberOfDocumentsFor(OWLEntity... entities) {
133                return 0;
134        }
135
136}