001/**
002 * 
003 */
004package org.dllearner.algorithms.isle.index.syntactic;
005
006import org.apache.jena.graph.Triple;
007import org.apache.jena.riot.Lang;
008import org.apache.jena.riot.RDFDataMgr;
009import org.apache.lucene.analysis.Analyzer;
010import org.apache.lucene.analysis.standard.StandardAnalyzer;
011import org.apache.lucene.document.*;
012import org.apache.lucene.index.DirectoryReader;
013import org.apache.lucene.index.IndexReader;
014import org.apache.lucene.index.IndexWriter;
015import org.apache.lucene.index.IndexWriterConfig;
016import org.apache.lucene.index.IndexWriterConfig.OpenMode;
017import org.apache.lucene.queryparser.classic.QueryParser;
018import org.apache.lucene.search.IndexSearcher;
019import org.apache.lucene.search.Query;
020import org.apache.lucene.search.ScoreDoc;
021import org.apache.lucene.search.TopDocs;
022import org.apache.lucene.store.Directory;
023import org.apache.lucene.store.FSDirectory;
024
025import java.io.FileInputStream;
026import java.io.IOException;
027import java.io.InputStream;
028import java.nio.file.Paths;
029import java.util.HashSet;
030import java.util.Iterator;
031import java.util.Set;
032
033/**
 * Creates a Lucene index for the labels of classes and properties.
035 * @author Lorenz Buehmann
036 *
037 */
038public class NTriplesFileLuceneSyntacticIndexCreator {
039
040        public NTriplesFileLuceneSyntacticIndexCreator(InputStream nTriplesStream, String indexPath, String searchField) throws IOException {
041                //setup the index
042                Directory directory = FSDirectory.open(Paths.get(indexPath));
043                
044                //setup the index analyzer
045                Analyzer analyzer = new StandardAnalyzer();
046                IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
047                indexWriterConfig.setRAMBufferSizeMB(1024.0);
048                indexWriterConfig.setOpenMode(OpenMode.CREATE);
049                IndexWriter writer = new IndexWriter(directory, indexWriterConfig);
050                
051                System.out.println( "Creating index ..." );
052                
053                // setup the index fields, here two fields, for URI and text
054                FieldType stringType = new FieldType(StringField.TYPE_STORED);
055                stringType.setStoreTermVectors(false);
056                FieldType textType = new FieldType(TextField.TYPE_STORED);
057                textType.setStoreTermVectors(false);
058                
059                Set<Document> documents = new HashSet<>();
060                
061                Iterator<Triple> iterator = RDFDataMgr.createIteratorTriples(nTriplesStream, Lang.NTRIPLES, null);
062
063                Triple triple;
064                String text;
065                String uri;
066                Document doc;
067                int i = 0;
068                while(iterator.hasNext()){
069                        triple = iterator.next();
070                        
071                        uri = triple.getSubject().getURI();
072                        text = triple.getObject().getLiteralLexicalForm();
073                        
074                        doc = new Document();
075                        doc.add(new Field("uri", uri, stringType));
076                        doc.add(new Field(searchField, text, textType));
077                        
078                        writer.addDocument(doc);
079                        if(i++ % 10000 == 0){
080//                              writer.commit();
081                                System.out.println(i);
082                        }
083                        
084                }
085                
086                writer.commit();
087                writer.close();
088        }
089        
090        public static void main(String[] args) throws Exception {
091                String indexFile = "/home/me/Documents/short_abstracts_en.nt";
092//              indexFile = "/tmp/test.nt";
093                String indexPath = "/home/me/Documents/dbpedia/short_abstracts_index";
094//              indexPath = "/tmp/index";
095                String field = "text";
096                new NTriplesFileLuceneSyntacticIndexCreator(new FileInputStream(indexFile), indexPath, field);
097
098                Directory directory = FSDirectory.open(Paths.get(indexPath));
099                IndexReader reader = DirectoryReader.open(directory);
100                IndexSearcher searcher = new IndexSearcher(reader);
101                Analyzer analyzer = new StandardAnalyzer();
102
103                QueryParser parser = new QueryParser(field, analyzer);
104                Query query = parser.parse("film AND direction");
105                
106                TopDocs docs = searcher.search(query, 10);
107                ScoreDoc[] scoreDocs = docs.scoreDocs;
108
109                for (ScoreDoc scoreDoc : scoreDocs) {
110                        Document doc = searcher.doc(scoreDoc.doc);
111                        System.out.println(doc.get(field));
112
113                }
114        }
115        
116
117}