Source code

001package org.dllearner.algorithms.isle.index;
002
003import java.util.ArrayList;
004import java.util.Collections;
005import java.util.HashMap;
006import java.util.List;
007import java.util.Map;
008import java.util.Set;
009import java.util.TreeSet;
010
011import net.didion.jwnl.data.POS;
012
013import org.dllearner.algorithms.isle.WordNet;
014
015import edu.northwestern.at.utils.corpuslinguistics.lemmatizer.DefaultLemmatizer;
016import edu.northwestern.at.utils.corpuslinguistics.lemmatizer.Lemmatizer;
017
018/**
019 * Provides shortcuts to commonly used linguistic operations
020 * @author Daniel Fleischhacker
021 */
022public class LinguisticUtil {
023    private static LinguisticUtil instance;
024
025    private static final WordNet wn = new WordNet();
026    private static POS[] RELEVANT_POS = new POS[]{POS.NOUN, POS.VERB};
027    private static Lemmatizer lemmatizer;
028
029    public static LinguisticUtil getInstance() {
030        if (instance == null) {
031            instance = new LinguisticUtil();
032        }
033        return instance;
034    }
035
036    public LinguisticUtil() {
037        try {
038            lemmatizer = new DefaultLemmatizer();
039        }
040        catch (Exception e) {
041            e.printStackTrace();
042        }
043    }
044
045    public Set<WordNet.LemmaScorePair> getScoredHyponyms(String word, POS pos) {
046        List<WordNet.LemmaScorePair> pairs = wn.getHyponymsScored(pos, word);
047        HashMap<String, Double> lemmaScores = new HashMap<>();
048        for (WordNet.LemmaScorePair p : pairs) {
049            if (!lemmaScores.containsKey(p.getLemma())) {
050                lemmaScores.put(p.getLemma(), p.getScore());
051            }
052            else {
053                lemmaScores.put(p.getLemma(), Math.max(p.getScore(), lemmaScores.get(p.getLemma())));
054            }
055        }
056
057        TreeSet<WordNet.LemmaScorePair> scoredPairs = new TreeSet<>();
058        for (Map.Entry<String, Double> e : lemmaScores.entrySet()) {
059            scoredPairs.add(new WordNet.LemmaScorePair(e.getKey(), e.getValue()));
060        }
061
062        return scoredPairs;
063    }
064
065    /**
066     * Processes the given string and puts camelCased words into single words.
067     * @param camelCase    the word containing camelcase to split
068     * @return all words as camelcase contained in the given word
069     */
070    public String[] getWordsFromCamelCase(String camelCase) {
071        ArrayList<String> resultingWords = new ArrayList<>();
072        StringBuilder sb = new StringBuilder();
073        for (int i = 0; i < camelCase.length(); i++) {
074            // we just ignore characters not matching the defined pattern
075            char curChar = camelCase.charAt(i);
076            if (Character.isWhitespace(curChar)) {
077                sb.append(" ");
078                continue;
079            }
080            else if (!Character.isLetter(curChar)) {
081                continue;
082            }
083            if (Character.isUpperCase(curChar)) { // found a new upper case letter
084                resultingWords.add(sb.toString());
085                sb = new StringBuilder();
086                sb.append(Character.toLowerCase(curChar));
087            }
088            else { // lower case letter
089                sb.append(curChar);
090            }
091        }
092
093        if (sb.length() > 0) {
094            resultingWords.add(sb.toString());
095        }
096
097        return resultingWords.toArray(new String[resultingWords.size()]);
098    }
099
100    /**
101     * Split word into words it contains divided by underscores.
102     *
103     * @param underScored    word to split at underscores
104     * @return words contained in given word
105     */
106    public String[] getWordsFromUnderscored(String underScored) {
107        return underScored.split("_");
108    }
109
110    /**
111     * Returns an array of all synonyms for the given word. Only synonyms for the POS in {@link #RELEVANT_POS} are
112     * returned.
113     *
114     * @param word the word to retrieve synonyms for
115     * @return synonyms for the given word
116     */
117    public String[] getSynonymsForWord(String word) {
118        ArrayList<String> synonyms = new ArrayList<>();
119
120        for (POS pos : RELEVANT_POS) {
121            synonyms.addAll(wn.getAllSynonyms(pos, word));
122        }
123        return synonyms.toArray(new String[synonyms.size()]);
124    }
125
126    /**
127     * Iterates through the hypernym tree for the given word at the given POS and returns a list of all lemmas of the
128     * most frequent synsets visited during traversing the tree.
129     * @param word word to get hypernyms for
130     * @param pos POS to get hypernyms for
131     * @return list of all lemmas of all hypernyms for the given word
132     */
133    public String[] getAllHyponymsForWord(String word, POS pos) {
134        ArrayList<String> hyponyms = new ArrayList<>();
135
136        hyponyms.addAll(wn.getHyponyms(pos, word));
137
138        return hyponyms.toArray(new String[hyponyms.size()]);
139    }
140
141    /**
142     * Returns an array of all synonyms for the given word for the given POS.
143     *
144     * @param word the word to retrieve synonyms for
145     * @param pos  POS to retrieve synonyms for
146     * @return synonyms for the given word
147     */
148    public String[] getSynonymsForWord(String word, POS pos) {
149        ArrayList<String> synonyms = new ArrayList<>();
150
151        synonyms.addAll(wn.getAllSynonyms(pos, word));
152        return synonyms.toArray(new String[synonyms.size()]);
153    }
154
155    /**
156     * Returns an array of the lemmas of the top {@code n} synonyms for the given word. Only synonyms for the POS in
157     * {@link #RELEVANT_POS} are returned.
158     *
159     * @param word the word to retrieve synonyms for
160     * @param n the number of senses to get lemmas for
161     * @return synonyms for the given word
162     */
163    public String[] getTopSynonymsForWord(String word, int n) {
164        ArrayList<String> synonyms = new ArrayList<>();
165
166        for (POS pos : RELEVANT_POS) {
167            synonyms.addAll(wn.getTopSynonyms(pos, word, n));
168        }
169        return synonyms.toArray(new String[synonyms.size()]);
170    }
171
172    /**
173     * Returns the normalized form of the given word. If the word contains spaces, each part separated by spaces is
174     * normalized independently and joined afterwards. If there is an error normalizing the given word, the word itself
175     * is returned.
176     *
177     * @param word the word to get normalized form for
178     * @return normalized form of the word or the word itself on an error
179     */
180    public String getNormalizedForm(String word) {
181        StringBuilder res = new StringBuilder();
182
183        boolean first = true;
184
185        ArrayList<String> singleWords = new ArrayList<>();
186        Collections.addAll(singleWords, word.trim().split(" "));
187
188        for (String w : singleWords) {
189            try {
190                if (first) {
191                    first = false;
192                }
193                else {
194                    res.append(" ");
195                }
196                res.append(lemmatizeSingleWord(w));
197            }
198            catch (Exception e) {
199               throw new RuntimeException(e);
200            }
201        }
202        return res.toString();
203    }
204
205    private String lemmatizeSingleWord(String word) {
206        try {
207            if (lemmatizer == null) {
208                return word;
209            }
210            else {
211                return lemmatizer.lemmatize(word);
212            }
213        }
214        catch (NullPointerException e) {
215            return word;
216        }
217    }
218
219    public static void main(String[] args) {
220        System.out.println(LinguisticUtil.getInstance().getNormalizedForm("going"));
221        for (String s : LinguisticUtil.getInstance().getWordsFromCamelCase("thisIsAClassWith1Name123")) {
222            System.out.println(s);
223            for (String w : LinguisticUtil.getInstance().getSynonymsForWord(s)) {
224                System.out.println(" --> " + w);
225            }
226        }
227    }
228}