001/** 002 * 003 */ 004package org.dllearner.algorithms.isle.index.syntactic; 005 006import org.apache.jena.graph.Triple; 007import org.apache.jena.riot.Lang; 008import org.apache.jena.riot.RDFDataMgr; 009import org.apache.lucene.analysis.Analyzer; 010import org.apache.lucene.analysis.standard.StandardAnalyzer; 011import org.apache.lucene.document.*; 012import org.apache.lucene.index.DirectoryReader; 013import org.apache.lucene.index.IndexReader; 014import org.apache.lucene.index.IndexWriter; 015import org.apache.lucene.index.IndexWriterConfig; 016import org.apache.lucene.index.IndexWriterConfig.OpenMode; 017import org.apache.lucene.queryparser.classic.QueryParser; 018import org.apache.lucene.search.IndexSearcher; 019import org.apache.lucene.search.Query; 020import org.apache.lucene.search.ScoreDoc; 021import org.apache.lucene.search.TopDocs; 022import org.apache.lucene.store.Directory; 023import org.apache.lucene.store.FSDirectory; 024 025import java.io.FileInputStream; 026import java.io.IOException; 027import java.io.InputStream; 028import java.nio.file.Paths; 029import java.util.HashSet; 030import java.util.Iterator; 031import java.util.Set; 032 033/** 034 * Creates a Lucene Index for the labels if classes and properties. 035 * @author Lorenz Buehmann 036 * 037 */ 038public class NTriplesFileLuceneSyntacticIndexCreator { 039 040 public NTriplesFileLuceneSyntacticIndexCreator(InputStream nTriplesStream, String indexPath, String searchField) throws IOException { 041 //setup the index 042 Directory directory = FSDirectory.open(Paths.get(indexPath)); 043 044 //setup the index analyzer 045 Analyzer analyzer = new StandardAnalyzer(); 046 IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer); 047 indexWriterConfig.setRAMBufferSizeMB(1024.0); 048 indexWriterConfig.setOpenMode(OpenMode.CREATE); 049 IndexWriter writer = new IndexWriter(directory, indexWriterConfig); 050 051 System.out.println( "Creating index ..." ); 052 053 // setup the index fields, here two fields, for URI and text 054 FieldType stringType = new FieldType(StringField.TYPE_STORED); 055 stringType.setStoreTermVectors(false); 056 FieldType textType = new FieldType(TextField.TYPE_STORED); 057 textType.setStoreTermVectors(false); 058 059 Set<Document> documents = new HashSet<>(); 060 061 Iterator<Triple> iterator = RDFDataMgr.createIteratorTriples(nTriplesStream, Lang.NTRIPLES, null); 062 063 Triple triple; 064 String text; 065 String uri; 066 Document doc; 067 int i = 0; 068 while(iterator.hasNext()){ 069 triple = iterator.next(); 070 071 uri = triple.getSubject().getURI(); 072 text = triple.getObject().getLiteralLexicalForm(); 073 074 doc = new Document(); 075 doc.add(new Field("uri", uri, stringType)); 076 doc.add(new Field(searchField, text, textType)); 077 078 writer.addDocument(doc); 079 if(i++ % 10000 == 0){ 080// writer.commit(); 081 System.out.println(i); 082 } 083 084 } 085 086 writer.commit(); 087 writer.close(); 088 } 089 090 public static void main(String[] args) throws Exception { 091 String indexFile = "/home/me/Documents/short_abstracts_en.nt"; 092// indexFile = "/tmp/test.nt"; 093 String indexPath = "/home/me/Documents/dbpedia/short_abstracts_index"; 094// indexPath = "/tmp/index"; 095 String field = "text"; 096 new NTriplesFileLuceneSyntacticIndexCreator(new FileInputStream(indexFile), indexPath, field); 097 098 Directory directory = FSDirectory.open(Paths.get(indexPath)); 099 IndexReader reader = DirectoryReader.open(directory); 100 IndexSearcher searcher = new IndexSearcher(reader); 101 Analyzer analyzer = new StandardAnalyzer(); 102 103 QueryParser parser = new QueryParser(field, analyzer); 104 Query query = parser.parse("film AND direction"); 105 106 TopDocs docs = searcher.search(query, 10); 107 ScoreDoc[] scoreDocs = docs.scoreDocs; 108 109 for (ScoreDoc scoreDoc : scoreDocs) { 110 Document doc = searcher.doc(scoreDoc.doc); 111 System.out.println(doc.get(field)); 112 113 } 114 } 115 116 117}