Source code

001/**
002 * 
003 */
004package org.dllearner.algorithms.isle.wsd;
005
006import java.io.IOException;
007import java.util.Collection;
008import java.util.HashSet;
009import java.util.Iterator;
010import java.util.List;
011import java.util.Map;
012import java.util.Map.Entry;
013import java.util.Set;
014
015import org.dllearner.algorithms.isle.StructuralEntityContext;
016import org.dllearner.algorithms.isle.VSMCosineDocumentSimilarity;
017import org.dllearner.algorithms.isle.index.Annotation;
018import org.dllearner.algorithms.isle.index.EntityScorePair;
019import org.dllearner.algorithms.isle.index.SemanticAnnotation;
020import org.dllearner.algorithms.isle.index.Token;
021import org.dllearner.algorithms.isle.textretrieval.AnnotationEntityTextRetriever;
022import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever;
023import org.semanticweb.owlapi.model.OWLEntity;
024import org.semanticweb.owlapi.model.OWLOntology;
025
026import com.google.common.base.Joiner;
027import com.google.common.collect.Sets;
028
029/**
030 * @author Lorenz Buehmann
031 *
032 */
033public class StructureBasedWordSenseDisambiguation extends WordSenseDisambiguation{
034
035        private ContextExtractor contextExtractor;
036        private AnnotationEntityTextRetriever textRetriever;
037
038        /**
039         * @param ontology
040         */
041        public StructureBasedWordSenseDisambiguation(ContextExtractor contextExtractor, OWLOntology ontology) {
042                super(ontology);
043                this.contextExtractor = contextExtractor;
044                
045                textRetriever = new RDFSLabelEntityTextRetriever(ontology);
046        }
047
048        /* (non-Javadoc)
049         * @see org.dllearner.algorithms.isle.wsd.WordSenseDisambiguation#disambiguate(org.dllearner.algorithms.isle.index.Annotation, java.util.Set)
050         */
051        @Override
052        public SemanticAnnotation disambiguate(Annotation annotation, Set<EntityScorePair> candidateEntities) {
053                //filter out candidates for which the head noun does not match with the annotated token
054                for (Iterator<EntityScorePair> iterator = candidateEntities.iterator(); iterator.hasNext();) {
055                        EntityScorePair entityPair = iterator.next();
056                        OWLEntity entity = entityPair.getEntity();
057                        
058                        Map<List<Token>, Double> relevantText = textRetriever.getRelevantText(entity);
059                        
060                        boolean matched = false;
061                        
062                        for (Entry<List<Token>, Double> entry : relevantText.entrySet()) {
063                                List<Token> tokens = entry.getKey();
064                                
065                                
066                                for (Token token : tokens) {
067                                        if(token.isHead()){
068                                                for (Token annotatedToken : annotation.getTokens()) {
069                                                        if(token.getRawForm().equals(annotatedToken.getRawForm())){
070                                                                matched = true;
071                                                        }
072                                                }
073                                        }
074                                }
075                                
076                        }
077                        
078                        if(!matched){
079                                iterator.remove();
080                        }
081                }
082                
083                System.out.println(annotation);
084                for (EntityScorePair entityScorePair : candidateEntities) {
085                        System.out.println(entityScorePair);
086                }
087                
088                if(!candidateEntities.isEmpty()){
089                        //get the context of the annotated token
090                        List<String> tokenContext = contextExtractor.extractContext(annotation);
091                        
092                        //compare this context with the context of each entity candidate
093                        double maxScore = Double.NEGATIVE_INFINITY;
094                        OWLEntity bestEntity = null;
095                        for (EntityScorePair entityScorePair : candidateEntities) {
096                                OWLEntity entity = entityScorePair.getEntity();
097                //get the context of the entity by analyzing the structure of the ontology
098                Set<String> entityContext = StructuralEntityContext.getContextInNaturalLanguage(ontology, entity);
099                //compute the VSM Cosine Similarity
100                double score = computeScore(tokenContext, entityContext);
101                //set best entity
102                if (score > maxScore) {
103                    maxScore = score;
104                    bestEntity = entity;
105                }
106            }
107
108            return new SemanticAnnotation(annotation, bestEntity);
109                }
110                return null;
111        }
112        
113        /**
114         * Compute the overlap between 2 set of words
115         * @param words1
116         * @param words2
117         * @return
118         */
119        private double computeScoreSimple(Collection<String> words1, Collection<String> words2){
120                return Sets.intersection(new HashSet<>(words1), new HashSet<>(words2)).size();
121        }
122        
123        /**
124         * Compute the Cosine Similarity using as VSM.
125         * @param words1
126         * @param words2
127         */
128        private double computeScore(Collection<String> words1, Collection<String> words2){
129                double score = 0d;
130                try {
131                        score = VSMCosineDocumentSimilarity.getCosineSimilarity(Joiner.on(" ").join(words1), Joiner.on(" ").join(words2));
132                } catch (IOException e) {
133                        e.printStackTrace();
134                }
135                return score;
136        }
137        
138        public static void main(String[] args) throws Exception {
139                String s = "OWLEntity";
140                System.out.println(s.replace("^(OWL)Entity", "OWLEntity"));
141        }
142}