Source code

001package org.dllearner.algorithms.isle.index;
002
003import java.util.ArrayList;
004import java.util.HashMap;
005import java.util.List;
006import java.util.Set;
007
008import org.dllearner.algorithms.isle.EntityCandidateGenerator;
009import org.dllearner.algorithms.isle.StopWordFilter;
010import org.semanticweb.owlapi.model.OWLOntology;
011
012import com.google.common.collect.Lists;
013
014/**
015 * Generates candidates using a entity candidates prefix trie
016 * @author Andre Melo
017 *
018 */
019public class TrieEntityCandidateGenerator extends EntityCandidateGenerator{
020
021        final EntityCandidatesTrie candidatesTrie;
022        final StopWordFilter stopWordFilter = new StopWordFilter();
023        int window = 10;
024        
025        public TrieEntityCandidateGenerator(OWLOntology ontology, EntityCandidatesTrie candidatesTrie) {
026                super(ontology);
027                this.candidatesTrie = candidatesTrie;
028        }
029        
030        @Override
031        public Set<EntityScorePair> getCandidates(Annotation annotation) {
032        Set<EntityScorePair> candidateEntities = candidatesTrie.getCandidateEntities(annotation.getTokens());
033        System.out.println(annotation + " --> " + candidateEntities);
034        return candidateEntities;
035        }
036
037    /**
038     * Postprocess the annotations generated by annotate
039     * The objective is to merge annotations which are likely to belong to the same entity
040     * @param window : maximum distance between the annotations
041     * @return
042     */
043    public void postProcess(HashMap<Annotation,Set<EntityScorePair>> candidatesMap, int window, StopWordFilter stopWordFilter) {
044        Set<Annotation> annotations = candidatesMap.keySet();
045        List<Annotation> sortedAnnotations = new ArrayList<>(annotations);
046        //TODO refactoring
047        /**
048          
049        
050        // Sort annotations by offset in ascending order
051        Collections.sort(sortedAnnotations, new Comparator<Annotation>(){
052            public int compare(Annotation a1,Annotation a2){
053                return Integer.compare(a1.getOffset(), a2.getOffset());
054            }
055        });
056        
057        int windowStart = 0;
058        int windowEnd = 0;
059        for (int i=0; i<sortedAnnotations.size(); i++) {
060                
061                Annotation annotation_i = sortedAnnotations.get(i);
062                int begin_i = annotation_i.getOffset();
063                int end_i = begin_i + annotation_i.getLength()-1;
064                String token_i = annotation_i.getString();
065                Set<OWLEntity> candidates_i = getCandidates(annotation_i);
066                Set<OWLEntity> newCandidates_i = new HashSet<OWLEntity>();
067                
068                // Determine the annotations contained in the window
069                while ((sortedAnnotations.get(windowStart).getOffset()+sortedAnnotations.get(windowStart).getLength()-1)<(begin_i-window))
070                        windowStart++;
071                while (windowEnd<sortedAnnotations.size() && sortedAnnotations.get(windowEnd).getOffset()<(end_i+window))
072                        windowEnd++;
073                
074                // For every annotation in the window (defined by the number of characters between offsets)
075                for (int j=windowStart; j<sortedAnnotations.size() && j<windowEnd; j++) {
076                        if (j!=i) {
077                                Annotation annotation_j = sortedAnnotations.get(j);
078                                String token_j = annotation_j.getString();
079                                Set<OWLEntity> candidates_j = getCandidates(annotation_j);
080                                Set<OWLEntity> intersection = Sets.intersection(candidates_i, candidates_j);
081                                Set<OWLEntity> newCandidates_ij = new HashSet<OWLEntity>();
082                                for (OWLEntity commonEntity: intersection) {
083                                        if (!(stopWordFilter.isStopWord(token_i) && stopWordFilter.isStopWord(token_j))) {
084                                                if (!token_i.contains(token_j) && !token_j.contains(token_i)) {
085                                                        newCandidates_ij.add(commonEntity);
086                                                        //System.out.println("common("+token_i+","+token_j+")="+commonEntity);
087                                                }
088                                        }
089                                }
090                                if (!newCandidates_ij.isEmpty()) {
091                                        Annotation mergedAnnotation = mergeAnnotations(annotation_i,annotation_j);
092                                        // If there's no punctuation in the merged annotation
093                                        if (!Pattern.matches("\\p{Punct}", mergedAnnotation.getString())) {
094                                                candidatesMap.put(mergedAnnotation, newCandidates_ij);
095                                                candidatesMap.remove(annotation_i);
096                                                candidatesMap.remove(annotation_j);
097                                        }
098                                        
099                                        newCandidates_i.addAll(newCandidates_ij);
100                                }
101                        }
102                }
103                
104                // Deletes annotation if it's a stop word and doesn't have any matching annotation in the window
105                if (stopWordFilter.isStopWord(token_i)) {
106                        if (newCandidates_i.isEmpty())
107                                candidatesMap.remove(annotation_i);     
108                }
109        }
110        
111        
112         */
113    }
114
115        private Annotation mergeAnnotations(Annotation annotation_i, Annotation annotation_j) {
116                List<Token> tokens = Lists.newArrayList();
117                tokens.addAll(annotation_i.getTokens());
118                tokens.addAll(annotation_j.getTokens());
119                return new Annotation(annotation_i.getReferencedDocument(), tokens);
120        }
121
122        @Override
123        public HashMap<Annotation, Set<EntityScorePair>> getCandidatesMap(Set<Annotation> annotations) {
124                HashMap<Annotation, Set<EntityScorePair>> candidatesMap = new HashMap<>();
125                for (Annotation annotation: annotations) 
126                        candidatesMap.put(annotation, getCandidates(annotation));
127                
128                postProcess(candidatesMap, window, stopWordFilter);
129                
130                return candidatesMap;
131        }
132}