001package org.dllearner.algorithms.isle.index; 002 003import com.google.common.base.Splitter; 004import com.google.common.collect.Lists; 005import org.semanticweb.owlapi.model.IRI; 006import org.semanticweb.owlapi.model.OWLDataFactory; 007import org.semanticweb.owlapi.model.OWLEntity; 008import uk.ac.manchester.cs.owl.owlapi.OWLDataFactoryImpl; 009 010import java.util.*; 011 012/** 013 * Tree for finding longest matching Token sequence 014 * 015 * @author Daniel Fleischhacker 016 */ 017public class TokenTree { 018 public static final double WORDNET_FACTOR = 0.3d; 019 public static final double ORIGINAL_FACTOR = 1.0d; 020 021 private LinkedHashMap<Token, TokenTree> children; 022 private Set<OWLEntity> entities; 023 private List<Token> originalTokens; 024 private boolean ignoreStopWords = true; 025 026 public TokenTree() { 027 this.children = new LinkedHashMap<>(); 028 this.entities = new HashSet<>(); 029 this.originalTokens = new ArrayList<>(); 030 } 031 032 /** 033 * If set to TRUE, stopwords like 'of, on' are ignored during creation and retrieval operations. 034 * 035 * @param ignoreStopWords the ignoreStopWords to set 036 */ 037 public void setIgnoreStopWords(boolean ignoreStopWords) { 038 this.ignoreStopWords = ignoreStopWords; 039 } 040 041 /** 042 * Adds all given entities to the end of the path resulting from the given tokens. 043 * 044 * @param tokens tokens to locate insertion point for entities 045 * @param entities entities to add 046 */ 047 public void add(List<Token> tokens, Set<OWLEntity> entities, List<Token> originalTokens) { 048 TokenTree curNode = this; 049 for (Token t : tokens) { 050 if (!ignoreStopWords || (ignoreStopWords && !t.isStopWord())) { 051 TokenTree nextNode = curNode.children.get(t); 052 if (nextNode == null) { 053 nextNode = new TokenTree(); 054 curNode.children.put(t, nextNode); 055 } 056 curNode = nextNode; 057 } 058 } 059 curNode.entities.addAll(entities); 060 curNode.originalTokens = new ArrayList<>(originalTokens); 061 } 062 063 public void add(List<Token> tokens, Set<OWLEntity> entities) { 064 add(tokens, entities, tokens); 065 } 066 067 /** 068 * Adds the given entity to the tree. 069 * 070 * @param tokens tokens to locate insertion point for entities 071 * @param entity entity to add 072 */ 073 public void add(List<Token> tokens, OWLEntity entity) { 074 add(tokens, Collections.singleton(entity)); 075 } 076 077 public void add(List<Token> tokens, OWLEntity entity, List<Token> originalTokens) { 078 add(tokens, Collections.singleton(entity), originalTokens); 079 } 080 081 /** 082 * Returns the set of entities located by the given list of tokens. This method does not consider alternative forms. 083 * 084 * @param tokens tokens to locate the information to get 085 * @return located set of entities or null if token sequence not contained in tree 086 */ 087 public Set<OWLEntity> get(List<Token> tokens) { 088 TokenTree curNode = this; 089 for (Token t : tokens) { 090 TokenTree nextNode = getNextTokenTree(curNode, t); 091 if (nextNode == null) { 092 return null; 093 } 094 curNode = nextNode; 095 } 096 return curNode.entities; 097 } 098 099 public Set<EntityScorePair> getAllEntitiesScored(List<Token> tokens) { 100 HashSet<EntityScorePair> resEntities = new HashSet<>(); 101 getAllEntitiesScoredRec(tokens, 0, this, resEntities, 1.0); 102 103 // only keep highest confidence for each entity 104 HashMap<OWLEntity, Double> entityScores = new HashMap<>(); 105 106 for (EntityScorePair p : resEntities) { 107 if (!entityScores.containsKey(p.getEntity())) { 108 entityScores.put(p.getEntity(), p.getScore()); 109 } 110 else { 111 entityScores.put(p.getEntity(), Math.max(p.getScore(), entityScores.get(p.getEntity()))); 112 } 113 } 114 115 TreeSet<EntityScorePair> result = new TreeSet<>(); 116 for (Map.Entry<OWLEntity, Double> e : entityScores.entrySet()) { 117 result.add(new EntityScorePair(e.getKey(), e.getValue())); 118 } 119 120 return result; 121 } 122 123 public void getAllEntitiesScoredRec(List<Token> tokens, int curPosition, TokenTree curTree, 124 HashSet<EntityScorePair> resEntities, Double curScore) { 125 126 if (curPosition == tokens.size()) { 127 for (OWLEntity e : curTree.entities) { 128 resEntities.add(new EntityScorePair(e, curScore)); 129 } 130 return; 131 } 132 Token currentTextToken = tokens.get(curPosition); 133 for (Map.Entry<Token, TokenTree> treeTokenEntry : curTree.children.entrySet()) { 134 if (currentTextToken.equals(treeTokenEntry.getKey())) { 135 getAllEntitiesScoredRec(tokens, curPosition + 1, treeTokenEntry.getValue(), resEntities, 136 curScore * ORIGINAL_FACTOR); 137 } 138 else { 139 for (Map.Entry<String, Double> treeAlternativeForm : treeTokenEntry.getKey().getScoredAlternativeForms() 140 .entrySet()) { 141 if (currentTextToken.getStemmedForm().equals(treeAlternativeForm.getKey())) { 142 getAllEntitiesScoredRec(tokens, curPosition + 1, treeTokenEntry.getValue(), resEntities, 143 curScore * ORIGINAL_FACTOR * treeAlternativeForm.getValue()); 144 } 145 } 146 for (Map.Entry<String, Double> textAlternativeForm : currentTextToken.getScoredAlternativeForms() 147 .entrySet()) { 148 if (treeTokenEntry.getKey().getStemmedForm().equals(textAlternativeForm.getKey())) { 149 getAllEntitiesScoredRec(tokens, curPosition + 1, treeTokenEntry.getValue(), resEntities, 150 curScore * ORIGINAL_FACTOR * textAlternativeForm.getValue()); 151 } 152 } 153 154 for (Map.Entry<String, Double> treeAlternativeForm : treeTokenEntry.getKey().getScoredAlternativeForms() 155 .entrySet()) { 156 for (Map.Entry<String, Double> textAlternativeForm : currentTextToken.getScoredAlternativeForms() 157 .entrySet()) { 158 if (treeAlternativeForm.getKey().equals(textAlternativeForm.getKey())) { 159 getAllEntitiesScoredRec(tokens, curPosition + 1, treeTokenEntry.getValue(), resEntities, 160 curScore * treeAlternativeForm.getValue() * textAlternativeForm.getValue()); 161 } 162 } 163 } 164 } 165 } 166 } 167 168 public Set<OWLEntity> getAllEntities(List<Token> tokens) { 169 HashSet<OWLEntity> resEntities = new HashSet<>(); 170 getAllEntitiesRec(tokens, 0, this, resEntities); 171 return resEntities; 172 } 173 174 public void getAllEntitiesRec(List<Token> tokens, int curPosition, TokenTree curTree, HashSet<OWLEntity> resEntities) { 175 if (curPosition == tokens.size()) { 176 resEntities.addAll(curTree.entities); 177 return; 178 } 179 Token t = tokens.get(curPosition); 180 for (Map.Entry<Token, TokenTree> entry : curTree.children.entrySet()) { 181 if (t.equalsWithAlternativeForms(entry.getKey())) { 182 getAllEntitiesRec(tokens, curPosition + 1, entry.getValue(), resEntities); 183 } 184 } 185 } 186 187 /** 188 * Returns the list of tokens which are the longest match with entities assigned in this tree. 189 * 190 * @param tokens list of tokens to check for longest match 191 * @return list of tokens being the longest match, sublist of {@code tokens} anchored at the first token 192 */ 193 public List<Token> getLongestMatch(List<Token> tokens) { 194 List<Token> fallbackTokenList = new ArrayList<>(); 195 TokenTree curNode = this; 196 197 for (Token t : tokens) { 198 TokenTree nextNode = getNextTokenTree(curNode, t); 199 if (nextNode == null) { 200 return fallbackTokenList; 201 } 202 curNode = nextNode; 203 fallbackTokenList.add(t); 204 } 205 return fallbackTokenList; 206 } 207 208 private TokenTree getNextTokenTree(TokenTree current, Token t) { 209 TokenTree next = current.children.get(t); 210 if (next != null) { 211 return next; 212 } 213 for (Map.Entry<Token, TokenTree> child : current.children.entrySet()) { 214 if (child.getKey().equalsWithAlternativeForms(t)) { 215 return child.getValue(); 216 } 217 } 218 return null; 219 } 220 221 /** 222 * Returns the set of entities assigned to the longest matching token subsequence of the given token sequence. 223 * 224 * @param tokens token sequence to search for longest match 225 * @return set of entities assigned to the longest matching token subsequence of the given token sequence 226 */ 227 public Set<OWLEntity> getEntitiesForLongestMatch(List<Token> tokens) { 228 TokenTree fallback = this.entities.isEmpty() ? null : this; 229 TokenTree curNode = this; 230 231 for (Token t : tokens) { 232 TokenTree nextNode = getNextTokenTree(curNode, t); 233 if (nextNode == null) { 234 return fallback == null ? null : fallback.entities; 235 } 236 curNode = nextNode; 237 if (!curNode.entities.isEmpty()) { 238 fallback = curNode; 239 } 240 } 241 242 return fallback == null ? Collections.<OWLEntity>emptySet() : fallback.entities; 243 } 244 245 /** 246 * Returns the original ontology tokens for the longest match 247 */ 248 public List<Token> getOriginalTokensForLongestMatch(List<Token> tokens) { 249 TokenTree fallback = this.entities.isEmpty() ? null : this; 250 TokenTree curNode = this; 251 252 for (Token t : tokens) { 253 TokenTree nextNode = getNextTokenTree(curNode, t); 254 if (nextNode == null) { 255 return fallback == null ? null : fallback.originalTokens; 256 } 257 curNode = nextNode; 258 if (!curNode.entities.isEmpty()) { 259 fallback = curNode; 260 } 261 } 262 263 return fallback == null ? Collections.<Token>emptyList() : fallback.originalTokens; 264 } 265 266 public static void main(String[] args) throws Exception { 267 List<Token> tokens1 = Lists.newLinkedList(); 268 for (String s : Splitter.on(" ").split("this is a token tree")) { 269 tokens1.add(new Token(s, s, s, false, false)); 270 } 271 272 List<Token> tokens2 = Lists.newLinkedList(); 273 for (String s : Splitter.on(" ").split("this is a tokenized tree")) { 274 tokens2.add(new Token(s, s, s, false, false)); 275 } 276 277 OWLDataFactory df = new OWLDataFactoryImpl(); 278 TokenTree tree = new TokenTree(); 279 tree.add(tokens1, df.getOWLClass(IRI.create("TokenTree"))); 280 tree.add(tokens2, df.getOWLClass(IRI.create("TokenizedTree"))); 281 System.out.println(tree); 282 283 System.out.println(tree.getEntitiesForLongestMatch(tokens1)); 284 System.out.println(tree.getLongestMatch(tokens1)); 285 286 List<Token> tokens3 = Lists.newLinkedList(); 287 for (String s : Splitter.on(" ").split("this is a very nice tokenized tree")) { 288 tokens3.add(new Token(s, s, s, false, false)); 289 } 290 System.out.println(tree.getLongestMatch(tokens3)); 291 } 292 293 294 public String toString() { 295 return "TokenTree\n" + toString(0); 296 } 297 298 public String toString(int indent) { 299 StringBuilder indentStringBuilder = new StringBuilder(); 300 for (int i = 0; i < indent; i++) { 301 indentStringBuilder.append(" "); 302 } 303 String indentString = indentStringBuilder.toString(); 304 StringBuilder sb = new StringBuilder(); 305 for (Map.Entry<Token, TokenTree> e : new TreeMap<>(children).entrySet()) { 306 sb.append(indentString).append(e.getKey().toString()); 307 sb.append("\n"); 308 sb.append(e.getValue().toString(indent + 1)); 309 } 310 return sb.toString(); 311 } 312 313 314}