/**
 *
 */
package org.dllearner.algorithms.isle.index.syntactic;

import com.google.common.base.Joiner;
import com.google.common.collect.Sets;
import org.apache.log4j.Logger;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.dllearner.algorithms.isle.TextDocumentGenerator;
import org.dllearner.algorithms.isle.index.AnnotatedDocument;
import org.dllearner.algorithms.isle.index.AnnotatedTextDocument;
import org.dllearner.algorithms.isle.index.Index;
import org.dllearner.algorithms.isle.index.Token;
import org.dllearner.algorithms.isle.textretrieval.AnnotationEntityTextRetriever;
import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever;
import org.semanticweb.owlapi.apibinding.OWLManager;
import org.semanticweb.owlapi.model.*;
import uk.ac.manchester.cs.owl.owlapi.OWLDataFactoryImpl;

import java.io.*;
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

/**
 * A syntactic {@link Index} backed by a remote Solr server.
 * <p>
 * For each OWL entity the relevant label text is obtained from an
 * {@link AnnotationEntityTextRetriever} (rdfs:label based) and turned into a
 * Solr boolean query over {@code searchField}; document-frequency counts for
 * single entities and entity pairs are cached in-memory and can be persisted
 * to / restored from an object-serialization file.
 * <p>
 * NOTE(review): the frequency cache is serialized with native Java
 * serialization — only load cache files from trusted sources.
 *
 * @author Lorenz Buehmann
 */
public class SolrSyntacticIndex implements Index {

	private static final Logger logger = Logger.getLogger(SolrSyntacticIndex.class);

	/** Number of worker threads used when pre-computing frequencies. */
	private static final int BUILD_INDEX_THREADS = 6;

	/** File the pre-computed frequency cache is written to by {@link #buildIndex(Collection)}. */
	private static final String CACHE_FILE = "entity_frequencies.obj";

	private SolrClient solr;
	private AnnotationEntityTextRetriever textRetriever;
	/** Solr field queried for the entity label text (e.g. "comment"). */
	private String searchField;
	/** Solr field holding the rdf:type URIs of a document. */
	private String typesField = "types";

	/** Lazily initialized in {@link #getTotalNumberOfDocuments()}; -1 = not yet fetched. */
	private long totalNumberOfDocuments = -1;

	/** Maps a set of entities to the number of documents matching all of them. */
	private Map<Set<OWLEntity>, Long> cache = Collections.synchronizedMap(new HashMap<>());
	private OWLOntology ontology;
	private OWLDataFactory df = new OWLDataFactoryImpl();

	/**
	 * @param ontology      ontology whose entities are indexed
	 * @param solrServerURL base URL of the Solr core
	 * @param searchField   Solr field containing the document text
	 */
	public SolrSyntacticIndex(OWLOntology ontology, String solrServerURL, String searchField) {
		this.ontology = ontology;
		this.searchField = searchField;
		solr = new HttpSolrClient.Builder(solrServerURL).build();
		textRetriever = new RDFSLabelEntityTextRetriever(ontology);
	}

	/**
	 * Restores the frequency cache from a file previously written by
	 * {@link #buildIndex(Collection)}.
	 *
	 * @param file serialized {@code Map<Set<OWLEntity>, Long>} cache file
	 * @throws IOException if the file cannot be read
	 */
	public void loadCache(File file) throws IOException {
		logger.info("Loading cache...");
		try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(file))) {
			// The file is written by buildIndex() with exactly this map type.
			@SuppressWarnings("unchecked")
			Map<Set<OWLEntity>, Long> deserialized = (Map<Set<OWLEntity>, Long>) ois.readObject();
			cache = Collections.synchronizedMap(deserialized);
		} catch (ClassNotFoundException e) {
			logger.error("Failed to deserialize cache file " + file, e);
		}
		logger.info("...done.");
	}

	/**
	 * Pre-computes document frequencies f(A) for every entity of the ontology
	 * and f(A,B) for every (class, entity) pair, in parallel, and serializes
	 * the result to {@value #CACHE_FILE}.
	 *
	 * @param classes the classes for which pairwise frequencies are computed
	 */
	public void buildIndex(Collection<OWLClass> classes) {
		logger.info("Building cache...");
		logger.info("#Classes: " + classes.size());

		ExecutorService executor = Executors.newFixedThreadPool(BUILD_INDEX_THREADS);

		final Set<OWLEntity> owlEntities = new TreeSet<>();
		owlEntities.addAll(ontology.getClassesInSignature());
		owlEntities.addAll(ontology.getDataPropertiesInSignature());
		owlEntities.addAll(ontology.getObjectPropertiesInSignature());
		owlEntities.addAll(classes);

		final Map<Set<OWLEntity>, Long> frequencyCache = Collections.synchronizedMap(new HashMap<>());

		// f(A) resp. f(B): single-entity frequencies
		for (final OWLEntity entity : owlEntities) {
			executor.submit(new Runnable() {

				@Override
				public void run() {
					Set<OWLEntity> entities = new HashSet<>();
					entities.add(entity);
					long f = getNumberOfDocumentsFor(entity);
					frequencyCache.put(entities, f);
				}
			});
		}
		// f(A,B): pairwise frequencies of each target class with every other entity
		for (final OWLClass cls : classes) {
			logger.info(cls);
			for (final OWLEntity entity : owlEntities) {
				if (!cls.equals(entity)) {
					executor.submit(new Runnable() {

						@Override
						public void run() {
							Set<OWLEntity> entities = new HashSet<>();
							entities.add(cls);
							entities.add(entity);
							long fAB = getNumberOfDocumentsFor(cls, entity);
							frequencyCache.put(entities, fAB);
						}
					});
				}
			}
		}
		executor.shutdown();
		try {
			executor.awaitTermination(10, TimeUnit.DAYS);
		} catch (InterruptedException e) {
			// preserve the interrupt status for callers
			Thread.currentThread().interrupt();
			logger.error("Interrupted while building the frequency cache.", e);
		}
		logger.info("Cache size: " + frequencyCache.size());
		// try-with-resources guarantees the stream is closed even on write failure
		try (ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(CACHE_FILE))) {
			oos.writeObject(frequencyCache);
		} catch (IOException e) {
			logger.error("Failed to write frequency cache to " + CACHE_FILE, e);
		}
	}

	/* (non-Javadoc)
	 * @see org.dllearner.algorithms.isle.index.Index#getDocuments(org.dllearner.core.owl.Entity)
	 */
	@Override
	public Set<AnnotatedDocument> getDocuments(OWLEntity entity) {
		Set<AnnotatedDocument> documents = new HashSet<>();

		Map<List<Token>, Double> relevantText = textRetriever.getRelevantText(entity);

		for (Entry<List<Token>, Double> entry : relevantText.entrySet()) {
			List<Token> tokens = entry.getKey();
			for (Token token : tokens) {
				SolrQuery query = new SolrQuery(searchField + ":" + token.getRawForm());
				query.setRows(Integer.MAX_VALUE);// can be very slow
				try {
					QueryResponse response = solr.query(query);
					SolrDocumentList list = response.getResults();
					for (SolrDocument doc : list) {
						documents.add(new AnnotatedTextDocument(
								TextDocumentGenerator.getInstance().generateDocument((String) doc.getFieldValue(searchField)),
								Collections.emptySet()));
					}
				} catch (SolrServerException | IOException e) {
					logger.error("Solr query failed for entity " + entity, e);
				}
			}
		}
		return documents;
	}

	/* (non-Javadoc)
	 * @see org.dllearner.algorithms.isle.index.Index#getTotalNumberOfDocuments()
	 */
	@Override
	public long getTotalNumberOfDocuments() {
		if (totalNumberOfDocuments == -1) {
			SolrQuery q = new SolrQuery("*:*");
			q.setRows(0); // only the hit count is needed, don't fetch documents
			try {
				totalNumberOfDocuments = solr.query(q).getResults().getNumFound();
			} catch (SolrServerException | IOException e) {
				logger.error("Failed to determine total number of documents.", e);
			}
		}
		return totalNumberOfDocuments;
	}

	/**
	 * Builds a Solr boolean sub-query matching any of the relevant label
	 * phrases of {@code entity}, e.g. {@code ("birth place" OR "place of birth")}.
	 */
	private String disjunctionFor(OWLEntity entity) {
		Map<String, Double> relevantText = textRetriever.getRelevantTextSimple(entity);

		Set<String> terms = new HashSet<>();
		for (Entry<String, Double> entry : relevantText.entrySet()) {
			terms.add(quotedString(entry.getKey()));
		}
		// Lucene/Solr boolean operators must be whitespace-separated
		return "(" + Joiner.on(" OR ").join(terms) + ")";
	}

	/* (non-Javadoc)
	 * @see org.dllearner.algorithms.isle.index.Index#getNumberOfDocumentsFor(org.dllearner.core.owl.Entity)
	 */
	@Override
	public synchronized long getNumberOfDocumentsFor(OWLEntity entity) {
		HashSet<OWLEntity> entitySet = Sets.newHashSet(entity);
		if (cache.containsKey(entitySet)) {
			return cache.get(entitySet);
		}
		SolrQuery query = new SolrQuery(searchField + ":" + disjunctionFor(entity));
		try {
			QueryResponse response = solr.query(query);
			SolrDocumentList list = response.getResults();
			cache.put(entitySet, list.getNumFound());
			return list.getNumFound();
		} catch (SolrServerException | IOException e) {
			logger.error("Solr frequency query failed for " + entity, e);
		}
		return -1; // sentinel: query failed
	}

	/* (non-Javadoc)
	 * @see org.dllearner.algorithms.isle.index.Index#getNumberOfDocumentsFor(org.dllearner.core.owl.Entity[])
	 */
	@Override
	public synchronized long getNumberOfDocumentsFor(OWLEntity... entities) {
		Set<OWLEntity> entitiesSet = Sets.newHashSet(entities);
		if (cache.containsKey(entitiesSet)) {
			return cache.get(entitiesSet);
		}

		// conjunction over the per-entity label disjunctions
		Set<String> queryStringParts = new HashSet<>();
		for (OWLEntity entity : entities) {
			queryStringParts.add(disjunctionFor(entity));
		}
		String queryStringConjuction = "(" + Joiner.on(" AND ").join(queryStringParts) + ")";

		SolrQuery query = new SolrQuery(searchField + ":" + queryStringConjuction);
		try {
			QueryResponse response = solr.query(query);
			SolrDocumentList list = response.getResults();
			cache.put(entitiesSet, list.getNumFound());
			return list.getNumFound();
		} catch (SolrServerException | IOException e) {
			logger.error("Solr frequency query failed for " + entitiesSet, e);
		}
		return -1; // sentinel: query failed
	}

	/**
	 * Counts the documents that match any relevant label phrase of
	 * {@code entity} AND are typed with {@code resourceClass}.
	 *
	 * @return the number of matching documents, or -1 if the query failed
	 */
	public long getNumberOfDocumentsForTyped(OWLClass resourceClass, OWLEntity entity) {
		Map<List<Token>, Double> relevantText = textRetriever.getRelevantText(entity);

		Set<String> terms = new HashSet<>();
		for (Entry<List<Token>, Double> entry : relevantText.entrySet()) {
			// re-assemble the token list into a single phrase
			StringBuilder phrase = new StringBuilder();
			for (Token token : entry.getKey()) {
				phrase.append(token.getRawForm()).append(' ');
			}
			terms.add(quotedString(phrase.toString()));
		}
		String queryString = "(" + Joiner.on(" OR ").join(terms) + ")";

		SolrQuery query = new SolrQuery(
				searchField + ":" + queryString + " AND " + typesField + ":" + quotedString(resourceClass.toStringID()));
		try {
			QueryResponse response = solr.query(query);
			SolrDocumentList list = response.getResults();
			return list.getNumFound();
		} catch (SolrServerException | IOException e) {
			logger.error("Solr typed frequency query failed for " + entity, e);
		}
		return -1; // sentinel: query failed
	}

	/** Wraps the trimmed string in double quotes for use as a Solr phrase query. */
	private String quotedString(String s) {
		return "\"" + s.trim() + "\"";
	}

	public static void main(String[] args) throws Exception {
		String solrServerURL = "http://solr.aksw.org/en_dbpedia_resources/";
		String searchField = "comment";
		OWLOntology ontology = OWLManager.createOWLOntologyManager().loadOntologyFromOntologyDocument(
				new File("src/test/resources/org/dllearner/algorithms/isle/dbpedia_3.9.owl"));
		SolrSyntacticIndex index = new SolrSyntacticIndex(ontology, solrServerURL, searchField);
		index.loadCache(new File("entity_frequencies.obj"));
		OWLDataFactory df = new OWLDataFactoryImpl();
		long n = index.getNumberOfDocumentsFor(df.getOWLClass(IRI.create("http://dbpedia.org/ontology/Comics")));
		System.out.println(n);
		n = index.getNumberOfDocumentsFor(
				df.getOWLClass(IRI.create("http://dbpedia.org/ontology/Comics")),
				df.getOWLObjectProperty(IRI.create("http://dbpedia.org/ontology/largestCity")));
		System.out.println(n);
	}

}