Source code

001package org.dllearner.algorithms.isle.index;
002
003import java.util.HashSet;
004import java.util.List;
005import java.util.Set;
006
007/**
008 * Annotates a document using a prefix trie.
009 *
010 * @author Andre Melo
011 */
012public class TrieLinguisticAnnotator implements LinguisticAnnotator {
013    EntityCandidatesTrie candidatesTrie;
014    private boolean normalizeWords = true;
015    
016    private boolean ignoreStopWords = true;
017
018    public TrieLinguisticAnnotator(EntityCandidatesTrie candidatesTrie) {
019        this.candidatesTrie = candidatesTrie;
020    }
021
022    /**
023     * Generates annotation based on trie's longest matching strings. By default, the document's contents are
024     * normalized using a lemmatizer. The normalization step can be disabled using the
025     *
026     * @param document the document to get annotations for
027     * @return the set of annotation for the given document
028     */
029    @Override
030    public Set<Annotation> annotate(TextDocument document) {
031        Set<Annotation> annotations = new HashSet<>();
032        
033        List<Token> matchedTokens;
034        for (Token token : document) {
035                if(!(token.isPunctuation() ||token.isStopWord())){
036                        matchedTokens = candidatesTrie.getLongestMatchingText(document.getTokensStartingAtToken(token, true));
037                if(matchedTokens != null && !matchedTokens.isEmpty()){
038                        Annotation annotation = new Annotation(document, matchedTokens);
039                    annotations.add(annotation);
040                }
041                } 
042                }
043        return annotations;
044    }
045
046    /**
047     * Sets whether the document's contents should be normalized or not.
048     * @param enabled if true normalizing is enabled, otherwise disabled
049     */
050    public void setNormalizeWords(boolean enabled) {
051        normalizeWords = enabled;
052    }
053}