001package org.dllearner.algorithms.isle.index; 002 003import java.util.ArrayList; 004import java.util.HashMap; 005import java.util.List; 006import java.util.Set; 007 008import org.dllearner.algorithms.isle.EntityCandidateGenerator; 009import org.dllearner.algorithms.isle.StopWordFilter; 010import org.semanticweb.owlapi.model.OWLOntology; 011 012import com.google.common.collect.Lists; 013 014/** 015 * Generates candidates using a entity candidates prefix trie 016 * @author Andre Melo 017 * 018 */ 019public class TrieEntityCandidateGenerator extends EntityCandidateGenerator{ 020 021 final EntityCandidatesTrie candidatesTrie; 022 final StopWordFilter stopWordFilter = new StopWordFilter(); 023 int window = 10; 024 025 public TrieEntityCandidateGenerator(OWLOntology ontology, EntityCandidatesTrie candidatesTrie) { 026 super(ontology); 027 this.candidatesTrie = candidatesTrie; 028 } 029 030 @Override 031 public Set<EntityScorePair> getCandidates(Annotation annotation) { 032 Set<EntityScorePair> candidateEntities = candidatesTrie.getCandidateEntities(annotation.getTokens()); 033 System.out.println(annotation + " --> " + candidateEntities); 034 return candidateEntities; 035 } 036 037 /** 038 * Postprocess the annotations generated by annotate 039 * The objective is to merge annotations which are likely to belong to the same entity 040 * @param window : maximum distance between the annotations 041 * @return 042 */ 043 public void postProcess(HashMap<Annotation,Set<EntityScorePair>> candidatesMap, int window, StopWordFilter stopWordFilter) { 044 Set<Annotation> annotations = candidatesMap.keySet(); 045 List<Annotation> sortedAnnotations = new ArrayList<>(annotations); 046 //TODO refactoring 047 /** 048 049 050 // Sort annotations by offset in ascending order 051 Collections.sort(sortedAnnotations, new Comparator<Annotation>(){ 052 public int compare(Annotation a1,Annotation a2){ 053 return Integer.compare(a1.getOffset(), a2.getOffset()); 054 } 055 }); 056 057 int windowStart = 0; 058 int windowEnd = 0; 059 for (int i=0; i<sortedAnnotations.size(); i++) { 060 061 Annotation annotation_i = sortedAnnotations.get(i); 062 int begin_i = annotation_i.getOffset(); 063 int end_i = begin_i + annotation_i.getLength()-1; 064 String token_i = annotation_i.getString(); 065 Set<OWLEntity> candidates_i = getCandidates(annotation_i); 066 Set<OWLEntity> newCandidates_i = new HashSet<OWLEntity>(); 067 068 // Determine the annotations contained in the window 069 while ((sortedAnnotations.get(windowStart).getOffset()+sortedAnnotations.get(windowStart).getLength()-1)<(begin_i-window)) 070 windowStart++; 071 while (windowEnd<sortedAnnotations.size() && sortedAnnotations.get(windowEnd).getOffset()<(end_i+window)) 072 windowEnd++; 073 074 // For every annotation in the window (defined by the number of characters between offsets) 075 for (int j=windowStart; j<sortedAnnotations.size() && j<windowEnd; j++) { 076 if (j!=i) { 077 Annotation annotation_j = sortedAnnotations.get(j); 078 String token_j = annotation_j.getString(); 079 Set<OWLEntity> candidates_j = getCandidates(annotation_j); 080 Set<OWLEntity> intersection = Sets.intersection(candidates_i, candidates_j); 081 Set<OWLEntity> newCandidates_ij = new HashSet<OWLEntity>(); 082 for (OWLEntity commonEntity: intersection) { 083 if (!(stopWordFilter.isStopWord(token_i) && stopWordFilter.isStopWord(token_j))) { 084 if (!token_i.contains(token_j) && !token_j.contains(token_i)) { 085 newCandidates_ij.add(commonEntity); 086 //System.out.println("common("+token_i+","+token_j+")="+commonEntity); 087 } 088 } 089 } 090 if (!newCandidates_ij.isEmpty()) { 091 Annotation mergedAnnotation = mergeAnnotations(annotation_i,annotation_j); 092 // If there's no punctuation in the merged annotation 093 if (!Pattern.matches("\\p{Punct}", mergedAnnotation.getString())) { 094 candidatesMap.put(mergedAnnotation, newCandidates_ij); 095 candidatesMap.remove(annotation_i); 096 candidatesMap.remove(annotation_j); 097 } 098 099 newCandidates_i.addAll(newCandidates_ij); 100 } 101 } 102 } 103 104 // Deletes annotation if it's a stop word and doesn't have any matching annotation in the window 105 if (stopWordFilter.isStopWord(token_i)) { 106 if (newCandidates_i.isEmpty()) 107 candidatesMap.remove(annotation_i); 108 } 109 } 110 111 112 */ 113 } 114 115 private Annotation mergeAnnotations(Annotation annotation_i, Annotation annotation_j) { 116 List<Token> tokens = Lists.newArrayList(); 117 tokens.addAll(annotation_i.getTokens()); 118 tokens.addAll(annotation_j.getTokens()); 119 return new Annotation(annotation_i.getReferencedDocument(), tokens); 120 } 121 122 @Override 123 public HashMap<Annotation, Set<EntityScorePair>> getCandidatesMap(Set<Annotation> annotations) { 124 HashMap<Annotation, Set<EntityScorePair>> candidatesMap = new HashMap<>(); 125 for (Annotation annotation: annotations) 126 candidatesMap.put(annotation, getCandidates(annotation)); 127 128 postProcess(candidatesMap, window, stopWordFilter); 129 130 return candidatesMap; 131 } 132}