/**
 *
 */
package org.dllearner.algorithms.isle;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;

import org.dllearner.algorithms.isle.index.Annotation;

import com.google.common.base.Charsets;
import com.google.common.io.Files;

/**
 * Removes stop words from strings, sets of words and sets of annotations.
 * <p>
 * The stop words are read once, at construction time, from a UTF-8 text file;
 * every line of that file becomes one stop word. All comparisons are exact
 * (case-sensitive) string matches.
 *
 * @author Lorenz Buehmann
 *
 */
public class StopWordFilter {

	/** Location of the stop word list, resolved against the current working directory. */
	private static final String STOP_WORD_FILE = "src/main/resources/stopwords.txt";

	/** The stop words; never {@code null}, empty if the list could not be read. */
	private final Set<String> stopWords;

	/**
	 * One pre-compiled whole-word pattern per stop word, in the iteration
	 * order of {@link #stopWords}, so that they are not recompiled on every
	 * call of {@link #removeStopWords(String)}.
	 */
	private final List<Pattern> stopWordPatterns;

	/**
	 * Loads the stop word list from {@value #STOP_WORD_FILE}.
	 * <p>
	 * If the file cannot be read, the error is reported on standard error and
	 * the filter falls back to an empty stop word set, i.e. it filters nothing
	 * instead of failing later with a {@link NullPointerException}.
	 */
	public StopWordFilter() {
		Set<String> loaded;
		try {
			loaded = new HashSet<>(Files.readLines(new File(STOP_WORD_FILE), Charsets.UTF_8));
		} catch (IOException e) {
			System.err.println("Could not read stop word file " + STOP_WORD_FILE
					+ "; continuing without stop words.");
			e.printStackTrace();
			loaded = new HashSet<>();
		}
		stopWords = loaded;

		stopWordPatterns = new ArrayList<>(stopWords.size());
		for (String stopWord : stopWords) {
			// Quote the word so that regex metacharacters in the list (e.g. "c++")
			// are matched literally instead of being interpreted as a pattern.
			stopWordPatterns.add(Pattern.compile("\\b" + Pattern.quote(stopWord) + "\\b"));
		}
	}

	/**
	 * Deletes every whole-word occurrence of each stop word from the given
	 * text. Only the words themselves are removed; surrounding whitespace is
	 * left untouched.
	 *
	 * @param input the text to filter, may be {@code null}
	 * @return the text without stop words, or {@code null} if {@code input}
	 *         was {@code null}
	 */
	public String removeStopWords(String input) {
		if (input == null) {
			return null;
		}
		String result = input;
		for (Pattern pattern : stopWordPatterns) {
			result = pattern.matcher(result).replaceAll("");
		}
		return result;
	}

	/**
	 * Removes all stop words from the given set, modifying it in place.
	 *
	 * @param words the set of words to filter
	 */
	public void removeStopWords(Set<String> words) {
		words.removeAll(stopWords);
	}

	/**
	 * Removes, in place, every annotation whose first token has a raw form
	 * that is a stop word. Only the first token of each annotation is
	 * inspected.
	 * <p>
	 * NOTE(review): relies on every annotation having at least one token;
	 * {@code getTokens().get(0)} is called without an emptiness check.
	 *
	 * @param annotations the annotations to filter
	 */
	public void removeStopWordAnnotations(Set<Annotation> annotations) {
		for (Iterator<Annotation> iter = annotations.iterator(); iter.hasNext();) {
			Annotation annotation = iter.next();
			String token = annotation.getTokens().get(0).getRawForm();
			if (stopWords.contains(token)) {
				iter.remove();
			}
		}
	}

	/**
	 * Checks whether the given token is contained in the stop word list.
	 *
	 * @param token the token to look up
	 * @return {@code true} if the token exactly matches a stop word
	 */
	public boolean isStopWord(String token) {
		return stopWords.contains(token);
	}

}