package org.dllearner.algorithms.isle.index.semantic;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;

import org.apache.log4j.Logger;
import org.dllearner.algorithms.isle.EntityCandidateGenerator;
import org.dllearner.algorithms.isle.TextDocumentGenerator;
import org.dllearner.algorithms.isle.index.AnnotatedDocument;
import org.dllearner.algorithms.isle.index.LinguisticAnnotator;
import org.dllearner.algorithms.isle.index.SemanticAnnotator;
import org.dllearner.algorithms.isle.index.SimpleEntityCandidatesTrie;
import org.dllearner.algorithms.isle.index.TextDocument;
import org.dllearner.algorithms.isle.index.TrieEntityCandidateGenerator;
import org.dllearner.algorithms.isle.index.TrieLinguisticAnnotator;
import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever;
import org.dllearner.algorithms.isle.wsd.StructureBasedWordSenseDisambiguation;
import org.dllearner.algorithms.isle.wsd.WindowBasedContextExtractor;
import org.dllearner.algorithms.isle.wsd.WordSenseDisambiguation;
import org.semanticweb.owlapi.model.OWLAnnotation;
import org.semanticweb.owlapi.model.OWLAnnotationProperty;
import org.semanticweb.owlapi.model.OWLEntity;
import org.semanticweb.owlapi.model.OWLLiteral;
import org.semanticweb.owlapi.model.OWLOntology;

import com.google.common.hash.HashCode;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;

/**
 * Generator for a semantic index, which resolves a given entity's URI to the set of documents containing
 * this entity, i.e., documents which contain words disambiguated to the given entity.
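 * <p>
 * A minimal usage sketch (the ontology location and the example document are illustrative), using the
 * trie-based overload {@link #generateIndex(Set, OWLOntology, boolean)}:
 * <pre>{@code
 * OWLOntology ontology = OWLManager.createOWLOntologyManager()
 *         .loadOntologyFromOntologyDocument(new File("ontology.owl"));
 * Set<String> documents = new HashSet<>();
 * documents.add("Some text mentioning entities described in the ontology.");
 * SemanticIndex index = SemanticIndexGenerator.generateIndex(documents, ontology, true);
 * }</pre>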
 *
 * @author Lorenz Buehmann
 * @author Daniel Fleischhacker
 */
public abstract class SemanticIndexGenerator {

    static HashFunction hf = Hashing.goodFastHash(128);
    private static final Logger logger = Logger.getLogger(SemanticIndexGenerator.class);
    private static boolean useCache = false;

    public static SemanticIndex generateIndex(Set<String> documents, OWLOntology ontology, WordSenseDisambiguation wordSenseDisambiguation,
            EntityCandidateGenerator entityCandidateGenerator, LinguisticAnnotator linguisticAnnotator) {
        SemanticAnnotator semanticAnnotator = new SemanticAnnotator(wordSenseDisambiguation, entityCandidateGenerator, linguisticAnnotator);
        return generateIndex(documents, ontology, semanticAnnotator);
    }

    public static SemanticIndex generateIndex(Set<String> documents, OWLOntology ontology, SemanticAnnotator semanticAnnotator) {
        SemanticIndex semanticIndex;
        // try to load serialized version
        HashCode hc = hf.newHasher().putInt(documents.hashCode()).putInt(ontology.hashCode()).hash();
        File file = new File(hc.toString() + ".ser");
        if (useCache && file.exists()) {
            try {
                logger.info("Loading semantic index from disk...");
                ObjectInputStream ois = new ObjectInputStream(new FileInputStream(file));
                semanticIndex = (SemanticIndex) ois.readObject();
                ois.close();
                logger.info("...done.");
            } catch (Exception e) {
                e.printStackTrace();
                semanticIndex = buildIndex(semanticAnnotator, documents);
            }
        } else {
            logger.info("Building semantic index...");
            semanticIndex = buildIndex(semanticAnnotator, documents);
            try {
                ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(file));
                oos.writeObject(semanticIndex);
                oos.close();
            } catch (IOException e1) {
                e1.printStackTrace();
            }
            logger.info("...done.");
        }
        return semanticIndex;
    }

    public static SemanticIndex generateIndex(Set<String> documents, OWLOntology ontology, boolean useWordNormalization) {
        SimpleEntityCandidatesTrie trie = new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology), ontology);
        trie.printTrie();

        TrieLinguisticAnnotator linguisticAnnotator = new TrieLinguisticAnnotator(trie);
        linguisticAnnotator.setNormalizeWords(useWordNormalization);

        SemanticAnnotator semanticAnnotator = new SemanticAnnotator(
                new StructureBasedWordSenseDisambiguation(new WindowBasedContextExtractor(), ontology),
                new TrieEntityCandidateGenerator(ontology, trie),
                linguisticAnnotator);
        return generateIndex(documents, ontology, semanticAnnotator);
    }
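
    /**
     * Generates the semantic index from the values of the given annotation property (e.g. rdfs:label):
     * the labels of all classes and properties in the ontology's signature are used as the documents to
     * be annotated. If a language is given, only literals with a matching language tag are considered.
     */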
    public static SemanticIndex generateIndex(OWLOntology ontology, OWLAnnotationProperty annotationProperty, String language, boolean useWordNormalization) {
        Set<OWLEntity> schemaEntities = new HashSet<>();
        schemaEntities.addAll(ontology.getClassesInSignature());
        schemaEntities.addAll(ontology.getObjectPropertiesInSignature());
        schemaEntities.addAll(ontology.getDataPropertiesInSignature());
        Set<String> documents = new HashSet<>();
        for (OWLEntity entity : schemaEntities) {
            String label = null;
            // collect the annotations asserted for this entity in the ontology
            Collection<OWLAnnotation> annotations = entity.getAnnotations(ontology);
            for (OWLAnnotation annotation : annotations) {
                if (annotation.getProperty().equals(annotationProperty)
                        && annotation.getValue() instanceof OWLLiteral) {
                    OWLLiteral val = (OWLLiteral) annotation.getValue();
                    if (language != null) {
                        if (val.hasLang(language)) {
                            label = val.getLiteral();
                        }
                    } else {
                        label = val.getLiteral();
                    }
                }
            }
            if (label != null) {
                documents.add(label);
            }
        }
        return generateIndex(documents, ontology, useWordNormalization);
    }

    /**
     * Precompute the whole index, i.e. iterate over all entities and compute all annotated documents.
     */
    private static SemanticIndex buildIndex(SemanticAnnotator semanticAnnotator, Set<String> documents) {
        logger.info("Creating semantic index...");
        SemanticIndex index = new SemanticIndex();
        for (String document : documents) {
            if (document.isEmpty()) {
                continue;
            }
            TextDocument textDocument = TextDocumentGenerator.getInstance().generateDocument(document);
            logger.debug("Processing document: " + textDocument);
            AnnotatedDocument annotatedDocument = semanticAnnotator.processDocument(textDocument);
            for (OWLEntity entity : annotatedDocument.getContainedEntities()) {
                Set<AnnotatedDocument> existingAnnotatedDocuments = index.get(entity);
                if (existingAnnotatedDocuments == null) {
                    existingAnnotatedDocuments = new HashSet<>();
                    index.put(entity, existingAnnotatedDocuments);
                }
                existingAnnotatedDocuments.add(annotatedDocument);
            }
            logger.debug("Annotated document: " + annotatedDocument);
        }
        int size = documents.size();
        index.setTotalNrOfDocuments(size);
        logger.info("...done.");
        return index;
    }
}