package org.dllearner.algorithms.isle.index.semantic;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;

import org.apache.log4j.Logger;
import org.dllearner.algorithms.isle.EntityCandidateGenerator;
import org.dllearner.algorithms.isle.TextDocumentGenerator;
import org.dllearner.algorithms.isle.index.AnnotatedDocument;
import org.dllearner.algorithms.isle.index.LinguisticAnnotator;
import org.dllearner.algorithms.isle.index.SemanticAnnotator;
import org.dllearner.algorithms.isle.index.SimpleEntityCandidatesTrie;
import org.dllearner.algorithms.isle.index.TextDocument;
import org.dllearner.algorithms.isle.index.TrieEntityCandidateGenerator;
import org.dllearner.algorithms.isle.index.TrieLinguisticAnnotator;
import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever;
import org.dllearner.algorithms.isle.wsd.StructureBasedWordSenseDisambiguation;
import org.dllearner.algorithms.isle.wsd.WindowBasedContextExtractor;
import org.dllearner.algorithms.isle.wsd.WordSenseDisambiguation;
import org.semanticweb.owlapi.model.OWLAnnotation;
import org.semanticweb.owlapi.model.OWLAnnotationProperty;
import org.semanticweb.owlapi.model.OWLEntity;
import org.semanticweb.owlapi.model.OWLLiteral;
import org.semanticweb.owlapi.model.OWLOntology;

import com.google.common.hash.HashCode;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;

/**
 * Generator for a semantic index, which resolves a given entity's URI to the set of documents containing
 * this entity, i.e., documents which contain words disambiguated to the given entity.
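 * <p>
 * A minimal usage sketch (the ontology location and the example document are illustrative), using the
 * trie-based overload {@link #generateIndex(Set, OWLOntology, boolean)}:
 * <pre>{@code
 * OWLOntology ontology = OWLManager.createOWLOntologyManager()
 *         .loadOntologyFromOntologyDocument(new File("ontology.owl"));
 * Set<String> documents = new HashSet<>();
 * documents.add("Some text mentioning entities described in the ontology.");
 * SemanticIndex index = SemanticIndexGenerator.generateIndex(documents, ontology, true);
 * }</pre>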
 *
 * @author Lorenz Buehmann
 * @author Daniel Fleischhacker
 */
public abstract class SemanticIndexGenerator {

    static HashFunction hf = Hashing.goodFastHash(128);
    private static final Logger logger = Logger.getLogger(SemanticIndexGenerator.class);
    private static boolean useCache = false;

    public static SemanticIndex generateIndex(Set<String> documents, OWLOntology ontology, WordSenseDisambiguation wordSenseDisambiguation,
            EntityCandidateGenerator entityCandidateGenerator, LinguisticAnnotator linguisticAnnotator) {
        SemanticAnnotator semanticAnnotator = new SemanticAnnotator(wordSenseDisambiguation, entityCandidateGenerator, linguisticAnnotator);
        return generateIndex(documents, ontology, semanticAnnotator);
    }

    public static SemanticIndex generateIndex(Set<String> documents, OWLOntology ontology, SemanticAnnotator semanticAnnotator) {
        SemanticIndex semanticIndex;
        // try to load serialized version
        HashCode hc = hf.newHasher().putInt(documents.hashCode()).putInt(ontology.hashCode()).hash();
        File file = new File(hc.toString() + ".ser");
        if (useCache && file.exists()) {
            try {
                logger.info("Loading semantic index from disk...");
                ObjectInputStream ois = new ObjectInputStream(new FileInputStream(file));
                semanticIndex = (SemanticIndex) ois.readObject();
                ois.close();
                logger.info("...done.");
            } catch (Exception e) {
                e.printStackTrace();
                semanticIndex = buildIndex(semanticAnnotator, documents);
            }
        } else {
            logger.info("Building semantic index...");
            semanticIndex = buildIndex(semanticAnnotator, documents);
            try {
                ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(file));
                oos.writeObject(semanticIndex);
                oos.close();
            } catch (IOException e1) {
                e1.printStackTrace();
            }
            logger.info("...done.");
        }
        return semanticIndex;
    }

    public static SemanticIndex generateIndex(Set<String> documents, OWLOntology ontology, boolean useWordNormalization) {
        SimpleEntityCandidatesTrie trie = new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology), ontology);
        trie.printTrie();

        TrieLinguisticAnnotator linguisticAnnotator = new TrieLinguisticAnnotator(trie);
        linguisticAnnotator.setNormalizeWords(useWordNormalization);

        SemanticAnnotator semanticAnnotator = new SemanticAnnotator(
                new StructureBasedWordSenseDisambiguation(new WindowBasedContextExtractor(), ontology),
                new TrieEntityCandidateGenerator(ontology, trie),
                linguisticAnnotator);
        return generateIndex(documents, ontology, semanticAnnotator);
    }
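
    /**
     * Generates the semantic index from the values of the given annotation property (e.g. rdfs:label):
     * the labels of all classes and properties in the ontology's signature are used as the documents to
     * be annotated. If a language is given, only literals with a matching language tag are considered.
     */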
    public static SemanticIndex generateIndex(OWLOntology ontology, OWLAnnotationProperty annotationProperty, String language, boolean useWordNormalization) {
        Set<OWLEntity> schemaEntities = new HashSet<>();
        schemaEntities.addAll(ontology.getClassesInSignature());
        schemaEntities.addAll(ontology.getObjectPropertiesInSignature());
        schemaEntities.addAll(ontology.getDataPropertiesInSignature());
        Set<String> documents = new HashSet<>();
        for (OWLEntity entity : schemaEntities) {
            String label = null;
            // collect the annotations asserted for this entity in the ontology
            Collection<OWLAnnotation> annotations = entity.getAnnotations(ontology);
            for (OWLAnnotation annotation : annotations) {
                if (annotation.getProperty().equals(annotationProperty)
                        && annotation.getValue() instanceof OWLLiteral) {
                    OWLLiteral val = (OWLLiteral) annotation.getValue();
                    if (language != null) {
                        if (val.hasLang(language)) {
                            label = val.getLiteral();
                        }
                    } else {
                        label = val.getLiteral();
                    }
                }
            }
            if (label != null) {
                documents.add(label);
            }
        }
        return generateIndex(documents, ontology, useWordNormalization);
    }

    /**
     * Precompute the whole index, i.e. iterate over all entities and compute all annotated documents.
     */
    private static SemanticIndex buildIndex(SemanticAnnotator semanticAnnotator, Set<String> documents) {
        logger.info("Creating semantic index...");
        SemanticIndex index = new SemanticIndex();
        for (String document : documents) {
            if (document.isEmpty()) {
                continue;
            }
            TextDocument textDocument = TextDocumentGenerator.getInstance().generateDocument(document);
            logger.debug("Processing document: " + textDocument);
            AnnotatedDocument annotatedDocument = semanticAnnotator.processDocument(textDocument);
            for (OWLEntity entity : annotatedDocument.getContainedEntities()) {
                Set<AnnotatedDocument> existingAnnotatedDocuments = index.get(entity);
                if (existingAnnotatedDocuments == null) {
                    existingAnnotatedDocuments = new HashSet<>();
                    index.put(entity, existingAnnotatedDocuments);
                }
                existingAnnotatedDocuments.add(annotatedDocument);
            }
            logger.debug("Annotated document: " + annotatedDocument);
        }
        int size = documents.size();
        index.setTotalNrOfDocuments(size);
        logger.info("...done.");
        return index;
    }
}