001package org.dllearner.algorithms.isle.index; 002 003import java.util.ArrayList; 004import java.util.List; 005import java.util.Map; 006import java.util.Map.Entry; 007import java.util.Set; 008 009import net.didion.jwnl.data.POS; 010 011import org.dllearner.algorithms.isle.WordNet; 012import org.dllearner.algorithms.isle.textretrieval.EntityTextRetriever; 013import org.semanticweb.owlapi.model.OWLEntity; 014import org.semanticweb.owlapi.model.OWLOntology; 015 016public class SimpleEntityCandidatesTrie implements EntityCandidatesTrie { 017 TokenTree tree; 018 EntityTextRetriever entityTextRetriever; 019 020// /** 021// * Initialize the trie with strings from the provided ontology using a no-op name generator, i.e., only the 022// * actual ontology strings are added and no expansion is done. 023// * 024// * @param entityTextRetriever the text retriever to use 025// * @param ontology the ontology to get strings from 026// */ 027// public SimpleEntityCandidatesTrie(EntityTextRetriever entityTextRetriever, OWLOntology ontology) { 028// this(entityTextRetriever, ontology, new DummyNameGenerator()); 029// } 030 031 /** 032 * Initialize the trie with strings from the provided ontology and use the given entity name generator 033 * for generating alternative words. 034 * 035 * @param entityTextRetriever the text retriever to use 036 * @param ontology the ontology to get strings from 037 */ 038 public SimpleEntityCandidatesTrie(EntityTextRetriever entityTextRetriever, OWLOntology ontology) { 039 this.entityTextRetriever = entityTextRetriever; 040 buildTrie(ontology); 041 } 042 043 public void buildTrie(OWLOntology ontology) { 044 this.tree = new TokenTree(); 045 Map<OWLEntity, Set<List<Token>>> entity2TokenSet = entityTextRetriever.getRelevantText(ontology); 046 047 048 for (Entry<OWLEntity, Set<List<Token>>> entry : entity2TokenSet.entrySet()) { 049 OWLEntity entity = entry.getKey(); 050 Set<List<Token>> tokenSet = entry.getValue(); 051 for (List<Token> tokens : tokenSet) { 052 addAlternativeFormsFromWordNet(tokens); 053 addEntry(tokens, entity); 054 addSubsequences(entity, tokens); 055 } 056 } 057 } 058 059 /** 060 * Adds the subsequences of a test 061 * @param entity 062 * @param tokens 063 */ 064 private void addSubsequences(OWLEntity entity, List<Token> tokens) { 065 tree.add(tokens, entity); 066 for (int size = 1; size < tokens.size(); size++) { 067 for (int start = 0; start < tokens.size() - size + 1; start++) { 068 ArrayList<Token> subsequence = new ArrayList<>(); 069 for (int i = 0; i < size; i++) { 070 subsequence.add(tokens.get(start + i)); 071 } 072 addEntry(subsequence, entity); 073 } 074 } 075 } 076 077 private void addAlternativeFormsFromWordNet(List<Token> tokens) { 078 for (Token t : tokens) { 079 POS wordnetPos = null; 080 String posTag = t.getPOSTag(); 081 if (posTag.startsWith("N")) {//nouns 082 wordnetPos = POS.NOUN; 083 } 084 else if (posTag.startsWith("V")) {//verbs 085 wordnetPos = POS.VERB; 086 } 087 else if (posTag.startsWith("J")) {//adjectives 088 wordnetPos = POS.ADJECTIVE; 089 } 090 else if (posTag.startsWith("R")) {//adverbs 091 wordnetPos = POS.ADVERB; 092 } 093 if (wordnetPos == null) { 094 continue; 095 } 096 //String[] synonyms = LinguisticUtil.getInstance().getSynonymsForWord(t.getRawForm(), wordnetPos); 097 Set<WordNet.LemmaScorePair> alternativeFormPairs = LinguisticUtil.getInstance() 098 .getScoredHyponyms(t.getRawForm(), wordnetPos); 099 100 for (WordNet.LemmaScorePair synonym : alternativeFormPairs) { 101 // ignore all multi word synonyms 102 if (synonym.getLemma().contains("_")) { 103 continue; 104 } 105 //t.addAlternativeForm(LinguisticUtil.getInstance().getNormalizedForm(synonym)); 106 t.addAlternativeForm(synonym.getLemma(), synonym.getScore()); 107 } 108 } 109 } 110 111 @Override 112 public void addEntry(List<Token> s, OWLEntity e) { 113 tree.add(s, e); 114 } 115 116 public void addEntry(List<Token> s, OWLEntity e, List<Token> originalTokens) { 117 tree.add(s, e, originalTokens); 118 } 119 120 @Override 121 public Set<EntityScorePair> getCandidateEntities(List<Token> tokens) { 122 Set<EntityScorePair> res = tree.getAllEntitiesScored(tokens); 123 return res; 124 } 125 126 @Override 127 public List<Token> getGeneratingStringForLongestMatch(List<Token> tokens) { 128 return tree.getOriginalTokensForLongestMatch(tokens); 129 } 130 131 @Override 132 public List<Token> getLongestMatchingText(List<Token> tokens) { 133 return tree.getLongestMatch(tokens); 134 } 135 136 public String toString() { 137 return tree.toString(); 138 } 139 140 public static void main(String[] args) { 141 String[] tokens = "this is a long and very complex text".split(" "); 142 143 List<String>[] wordnetTokens = (ArrayList<String>[]) new ArrayList[tokens.length]; 144 145 // generate list of lemmatized wordnet synonyms for each token 146 for (int i = 0; i < tokens.length; i++) { 147 wordnetTokens[i] = new ArrayList<>(); 148 wordnetTokens[i].add(LinguisticUtil.getInstance().getNormalizedForm(tokens[i])); 149 for (String w : LinguisticUtil.getInstance().getTopSynonymsForWord(tokens[i], 5)) { 150 System.out.println("Adding: " + LinguisticUtil.getInstance().getNormalizedForm(w)); 151 wordnetTokens[i].add(LinguisticUtil.getInstance().getNormalizedForm(w).replaceAll("_", " ")); 152 } 153 } 154 } 155 156 public void printTrie() { 157 System.out.println(this.toString()); 158 159 } 160}