Source code

001/**
002 * 
003 */
004package org.dllearner.algorithms.isle;
005
import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.regex.Pattern;

import org.dllearner.algorithms.isle.index.Annotation;

import com.google.common.base.Charsets;
import com.google.common.io.Files;
016
017/**
018 * @author Lorenz Buehmann
019 *
020 */
021public class StopWordFilter {
022        
023        private Set<String> stopWords;
024        private static final String stopWordfile = "src/main/resources/stopwords.txt";
025        
026        public StopWordFilter() {
027                try {
028                        stopWords = new HashSet<>(Files.readLines(new File(stopWordfile), Charsets.UTF_8));
029                } catch (IOException e) {
030                        e.printStackTrace();
031                }
032        }
033        
034        public String removeStopWords(String input) {
035            for (String s : stopWords) {
036                        input = input.replaceAll("\\b" + s + "\\b", "");
037                }
038            return input;
039        }
040        
041        public void removeStopWords(Set<String> words) {
042            words.removeAll(stopWords);
043        }
044        
045        public void removeStopWordAnnotations(Set<Annotation> annotations) {
046                for (Iterator<Annotation> iter = annotations.iterator(); iter.hasNext();) {
047                        Annotation annotation = iter.next();
048                        String token = annotation.getTokens().get(0).getRawForm();
049                        if(stopWords.contains(token)){
050                                iter.remove();
051                        }
052                }
053        }
054        
055        public boolean isStopWord(String token) {
056                return stopWords.contains(token);
057        }
058
059}