/**
 * Structure-based word sense disambiguation for the ISLE component.
 */
package org.dllearner.algorithms.isle.wsd;

import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.dllearner.algorithms.isle.StructuralEntityContext;
import org.dllearner.algorithms.isle.VSMCosineDocumentSimilarity;
import org.dllearner.algorithms.isle.index.Annotation;
import org.dllearner.algorithms.isle.index.EntityScorePair;
import org.dllearner.algorithms.isle.index.SemanticAnnotation;
import org.dllearner.algorithms.isle.index.Token;
import org.dllearner.algorithms.isle.textretrieval.AnnotationEntityTextRetriever;
import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever;
import org.semanticweb.owlapi.model.OWLEntity;
import org.semanticweb.owlapi.model.OWLOntology;

import com.google.common.base.Joiner;
import com.google.common.collect.Sets;

/**
 * Word sense disambiguation that selects, among a set of candidate entities,
 * the one whose ontology-derived context is most similar to the textual
 * context of an annotation.
 * <p>
 * Disambiguation runs in two steps:
 * <ol>
 * <li>candidates whose head token does not match any token of the annotation
 * are discarded;</li>
 * <li>the remaining candidates are ranked by the cosine similarity between the
 * context extracted for the annotation and the context obtained from
 * {@link StructuralEntityContext}.</li>
 * </ol>
 *
 * @author Lorenz Buehmann
 */
public class StructureBasedWordSenseDisambiguation extends WordSenseDisambiguation{

    private static final Logger LOGGER =
            Logger.getLogger(StructureBasedWordSenseDisambiguation.class.getName());

    private final ContextExtractor contextExtractor;
    private final AnnotationEntityTextRetriever textRetriever;

    /**
     * @param contextExtractor extracts the textual context of an annotation
     * @param ontology the ontology that is passed to the superclass and used
     *        to set up the entity text retriever
     */
    public StructureBasedWordSenseDisambiguation(ContextExtractor contextExtractor, OWLOntology ontology) {
        super(ontology);
        this.contextExtractor = contextExtractor;
        this.textRetriever = new RDFSLabelEntityTextRetriever(ontology);
    }

    /**
     * Picks the best matching entity for the given annotation.
     * <p>
     * NOTE: {@code candidateEntities} is filtered <em>in place</em>; candidates
     * whose head token does not match the annotation are removed from the set
     * that was passed in.
     *
     * @param annotation the annotation to disambiguate
     * @param candidateEntities the candidate entities; modified by this call
     * @return the annotation paired with the best-scoring entity, or
     *         {@code null} if no candidate survives the head-token filter or
     *         no candidate obtains a comparable score
     * @see org.dllearner.algorithms.isle.wsd.WordSenseDisambiguation#disambiguate(org.dllearner.algorithms.isle.index.Annotation, java.util.Set)
     */
    @Override
    public SemanticAnnotation disambiguate(Annotation annotation, Set<EntityScorePair> candidateEntities) {
        // filter out candidates for which the head noun does not match with the annotated token
        for (Iterator<EntityScorePair> iterator = candidateEntities.iterator(); iterator.hasNext();) {
            OWLEntity entity = iterator.next().getEntity();
            if (!headNounMatches(entity, annotation)) {
                iterator.remove();
            }
        }

        // diagnostic output only; guarded so that toString() is not evaluated needlessly
        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.fine(String.valueOf(annotation));
            for (EntityScorePair entityScorePair : candidateEntities) {
                LOGGER.fine(String.valueOf(entityScorePair));
            }
        }

        if (candidateEntities.isEmpty()) {
            return null;
        }

        // get the context of the annotated token
        List<String> tokenContext = contextExtractor.extractContext(annotation);

        // compare this context with the context of each entity candidate
        double maxScore = Double.NEGATIVE_INFINITY;
        OWLEntity bestEntity = null;
        for (EntityScorePair entityScorePair : candidateEntities) {
            OWLEntity entity = entityScorePair.getEntity();
            // get the context of the entity by analyzing the structure of the ontology
            Set<String> entityContext = StructuralEntityContext.getContextInNaturalLanguage(ontology, entity);
            // compute the VSM Cosine Similarity
            double score = computeScore(tokenContext, entityContext);
            // strict '>' keeps the first candidate on ties; it is also false for NaN
            if (score > maxScore) {
                maxScore = score;
                bestEntity = entity;
            }
        }

        // No score compared greater than NEGATIVE_INFINITY (e.g. every score was NaN):
        // report "not disambiguated" instead of an annotation wrapping a null entity.
        if (bestEntity == null) {
            return null;
        }
        return new SemanticAnnotation(annotation, bestEntity);
    }

    /**
     * Checks whether any head token in the text retrieved for the entity has
     * the same raw form as one of the tokens of the annotation.
     *
     * @param entity the candidate entity
     * @param annotation the annotation whose tokens are compared
     * @return {@code true} as soon as one matching head token is found
     */
    private boolean headNounMatches(OWLEntity entity, Annotation annotation) {
        Map<List<Token>, Double> relevantText = textRetriever.getRelevantText(entity);
        for (Entry<List<Token>, Double> entry : relevantText.entrySet()) {
            for (Token token : entry.getKey()) {
                if (!token.isHead()) {
                    continue;
                }
                for (Token annotatedToken : annotation.getTokens()) {
                    if (token.getRawForm().equals(annotatedToken.getRawForm())) {
                        return true;
                    }
                }
            }
        }
        return false;
    }

    /**
     * Compute the overlap between two collections of words, i.e. the number of
     * distinct words that occur in both.
     *
     * @param words1 the first collection of words
     * @param words2 the second collection of words
     * @return the size of the intersection of the two word sets
     */
    private double computeScoreSimple(Collection<String> words1, Collection<String> words2){
        return Sets.intersection(new HashSet<>(words1), new HashSet<>(words2)).size();
    }

    /**
     * Compute the cosine similarity of two word collections using
     * {@link VSMCosineDocumentSimilarity}. Each collection is joined into a
     * single space-separated string before it is compared.
     *
     * @param words1 the first collection of words
     * @param words2 the second collection of words
     * @return the similarity, or {@code 0} if the computation failed with an
     *         {@link IOException}
     */
    private double computeScore(Collection<String> words1, Collection<String> words2){
        try {
            return VSMCosineDocumentSimilarity.getCosineSimilarity(
                    Joiner.on(" ").join(words1), Joiner.on(" ").join(words2));
        } catch (IOException e) {
            // best effort: a failed comparison must not abort the disambiguation
            LOGGER.log(Level.WARNING, "Could not compute cosine similarity, using score 0", e);
            return 0d;
        }
    }

    /**
     * Ad-hoc experiment kept from the original source. {@link String#replace}
     * treats its first argument as a literal character sequence, not as a
     * regular expression, so the input is printed unchanged.
     */
    public static void main(String[] args) throws Exception {
        String s = "OWLEntity";
        System.out.println(s.replace("^(OWL)Entity", "OWLEntity"));
    }
}