package org.dllearner.algorithms.isle.index;

import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Annotates a document using a prefix trie.
 *
 * @author Andre Melo
 */
public class TrieLinguisticAnnotator implements LinguisticAnnotator {
    /** Trie that is queried for the longest token sequence matching a known entity candidate. */
    EntityCandidatesTrie candidatesTrie;

    /**
     * Normalization flag set through {@link #setNormalizeWords(boolean)}.
     * NOTE(review): this class only stores the flag; {@link #annotate(TextDocument)} does not read it.
     * Confirm whether normalization is applied elsewhere (e.g. inside the trie).
     */
    private boolean normalizeWords = true;

    /** If {@code true} (the default), stop-word tokens are never used as the start of a match. */
    private boolean ignoreStopWords = true;

    /**
     * Creates an annotator backed by the given trie.
     *
     * @param candidatesTrie the trie used to look up matching token sequences
     */
    public TrieLinguisticAnnotator(EntityCandidatesTrie candidatesTrie) {
        this.candidatesTrie = candidatesTrie;
    }

    /**
     * Generates annotations based on the trie's longest matching strings. Every token of the document that is
     * neither punctuation nor (unless disabled via {@link #setIgnoreStopWords(boolean)}) a stop word is used as a
     * start position; the token sequence beginning there is handed to the trie, and a non-empty match becomes one
     * annotation.
     * <p>
     * The original documentation states that the document's contents are normalized using a lemmatizer by default
     * and that this step can be disabled using {@link #setNormalizeWords(boolean)}.
     * NOTE(review): that flag is not consulted in this method - verify where normalization actually happens.
     *
     * @param document the document to get annotations for
     * @return the set of annotations for the given document; empty if nothing matched
     */
    @Override
    public Set<Annotation> annotate(TextDocument document) {
        Set<Annotation> annotations = new HashSet<>();

        for (Token token : document) {
            // Punctuation never starts a match; stop words only do when the caller opted in.
            if (token.isPunctuation() || (ignoreStopWords && token.isStopWord())) {
                continue;
            }

            // The boolean argument is passed through unchanged from the original code;
            // its meaning is defined by TextDocument#getTokensStartingAtToken.
            List<Token> matchedTokens =
                    candidatesTrie.getLongestMatchingText(document.getTokensStartingAtToken(token, true));

            if (matchedTokens != null && !matchedTokens.isEmpty()) {
                annotations.add(new Annotation(document, matchedTokens));
            }
        }
        return annotations;
    }

    /**
     * Sets whether the document's contents should be normalized or not.
     *
     * @param enabled if true normalizing is enabled, otherwise disabled
     */
    public void setNormalizeWords(boolean enabled) {
        normalizeWords = enabled;
    }

    /**
     * Sets whether stop words are skipped as start positions when searching for matches.
     *
     * @param enabled if true (the default) stop words never start a match, otherwise they are treated like
     *                any other non-punctuation token
     */
    public void setIgnoreStopWords(boolean enabled) {
        ignoreStopWords = enabled;
    }
}