001package org.dllearner.algorithms.isle.index; 002 003import java.util.ArrayList; 004import java.util.Collections; 005import java.util.HashMap; 006import java.util.List; 007import java.util.Map; 008import java.util.Set; 009import java.util.TreeSet; 010 011import net.didion.jwnl.data.POS; 012 013import org.dllearner.algorithms.isle.WordNet; 014 015import edu.northwestern.at.utils.corpuslinguistics.lemmatizer.DefaultLemmatizer; 016import edu.northwestern.at.utils.corpuslinguistics.lemmatizer.Lemmatizer; 017 018/** 019 * Provides shortcuts to commonly used linguistic operations 020 * @author Daniel Fleischhacker 021 */ 022public class LinguisticUtil { 023 private static LinguisticUtil instance; 024 025 private static final WordNet wn = new WordNet(); 026 private static POS[] RELEVANT_POS = new POS[]{POS.NOUN, POS.VERB}; 027 private static Lemmatizer lemmatizer; 028 029 public static LinguisticUtil getInstance() { 030 if (instance == null) { 031 instance = new LinguisticUtil(); 032 } 033 return instance; 034 } 035 036 public LinguisticUtil() { 037 try { 038 lemmatizer = new DefaultLemmatizer(); 039 } 040 catch (Exception e) { 041 e.printStackTrace(); 042 } 043 } 044 045 public Set<WordNet.LemmaScorePair> getScoredHyponyms(String word, POS pos) { 046 List<WordNet.LemmaScorePair> pairs = wn.getHyponymsScored(pos, word); 047 HashMap<String, Double> lemmaScores = new HashMap<>(); 048 for (WordNet.LemmaScorePair p : pairs) { 049 if (!lemmaScores.containsKey(p.getLemma())) { 050 lemmaScores.put(p.getLemma(), p.getScore()); 051 } 052 else { 053 lemmaScores.put(p.getLemma(), Math.max(p.getScore(), lemmaScores.get(p.getLemma()))); 054 } 055 } 056 057 TreeSet<WordNet.LemmaScorePair> scoredPairs = new TreeSet<>(); 058 for (Map.Entry<String, Double> e : lemmaScores.entrySet()) { 059 scoredPairs.add(new WordNet.LemmaScorePair(e.getKey(), e.getValue())); 060 } 061 062 return scoredPairs; 063 } 064 065 /** 066 * Processes the given string and puts camelCased words into single words. 067 * @param camelCase the word containing camelcase to split 068 * @return all words as camelcase contained in the given word 069 */ 070 public String[] getWordsFromCamelCase(String camelCase) { 071 ArrayList<String> resultingWords = new ArrayList<>(); 072 StringBuilder sb = new StringBuilder(); 073 for (int i = 0; i < camelCase.length(); i++) { 074 // we just ignore characters not matching the defined pattern 075 char curChar = camelCase.charAt(i); 076 if (Character.isWhitespace(curChar)) { 077 sb.append(" "); 078 continue; 079 } 080 else if (!Character.isLetter(curChar)) { 081 continue; 082 } 083 if (Character.isUpperCase(curChar)) { // found a new upper case letter 084 resultingWords.add(sb.toString()); 085 sb = new StringBuilder(); 086 sb.append(Character.toLowerCase(curChar)); 087 } 088 else { // lower case letter 089 sb.append(curChar); 090 } 091 } 092 093 if (sb.length() > 0) { 094 resultingWords.add(sb.toString()); 095 } 096 097 return resultingWords.toArray(new String[resultingWords.size()]); 098 } 099 100 /** 101 * Split word into words it contains divided by underscores. 102 * 103 * @param underScored word to split at underscores 104 * @return words contained in given word 105 */ 106 public String[] getWordsFromUnderscored(String underScored) { 107 return underScored.split("_"); 108 } 109 110 /** 111 * Returns an array of all synonyms for the given word. Only synonyms for the POS in {@link #RELEVANT_POS} are 112 * returned. 113 * 114 * @param word the word to retrieve synonyms for 115 * @return synonyms for the given word 116 */ 117 public String[] getSynonymsForWord(String word) { 118 ArrayList<String> synonyms = new ArrayList<>(); 119 120 for (POS pos : RELEVANT_POS) { 121 synonyms.addAll(wn.getAllSynonyms(pos, word)); 122 } 123 return synonyms.toArray(new String[synonyms.size()]); 124 } 125 126 /** 127 * Iterates through the hypernym tree for the given word at the given POS and returns a list of all lemmas of the 128 * most frequent synsets visited during traversing the tree. 129 * @param word word to get hypernyms for 130 * @param pos POS to get hypernyms for 131 * @return list of all lemmas of all hypernyms for the given word 132 */ 133 public String[] getAllHyponymsForWord(String word, POS pos) { 134 ArrayList<String> hyponyms = new ArrayList<>(); 135 136 hyponyms.addAll(wn.getHyponyms(pos, word)); 137 138 return hyponyms.toArray(new String[hyponyms.size()]); 139 } 140 141 /** 142 * Returns an array of all synonyms for the given word for the given POS. 143 * 144 * @param word the word to retrieve synonyms for 145 * @param pos POS to retrieve synonyms for 146 * @return synonyms for the given word 147 */ 148 public String[] getSynonymsForWord(String word, POS pos) { 149 ArrayList<String> synonyms = new ArrayList<>(); 150 151 synonyms.addAll(wn.getAllSynonyms(pos, word)); 152 return synonyms.toArray(new String[synonyms.size()]); 153 } 154 155 /** 156 * Returns an array of the lemmas of the top {@code n} synonyms for the given word. Only synonyms for the POS in 157 * {@link #RELEVANT_POS} are returned. 158 * 159 * @param word the word to retrieve synonyms for 160 * @param n the number of senses to get lemmas for 161 * @return synonyms for the given word 162 */ 163 public String[] getTopSynonymsForWord(String word, int n) { 164 ArrayList<String> synonyms = new ArrayList<>(); 165 166 for (POS pos : RELEVANT_POS) { 167 synonyms.addAll(wn.getTopSynonyms(pos, word, n)); 168 } 169 return synonyms.toArray(new String[synonyms.size()]); 170 } 171 172 /** 173 * Returns the normalized form of the given word. If the word contains spaces, each part separated by spaces is 174 * normalized independently and joined afterwards. If there is an error normalizing the given word, the word itself 175 * is returned. 176 * 177 * @param word the word to get normalized form for 178 * @return normalized form of the word or the word itself on an error 179 */ 180 public String getNormalizedForm(String word) { 181 StringBuilder res = new StringBuilder(); 182 183 boolean first = true; 184 185 ArrayList<String> singleWords = new ArrayList<>(); 186 Collections.addAll(singleWords, word.trim().split(" ")); 187 188 for (String w : singleWords) { 189 try { 190 if (first) { 191 first = false; 192 } 193 else { 194 res.append(" "); 195 } 196 res.append(lemmatizeSingleWord(w)); 197 } 198 catch (Exception e) { 199 throw new RuntimeException(e); 200 } 201 } 202 return res.toString(); 203 } 204 205 private String lemmatizeSingleWord(String word) { 206 try { 207 if (lemmatizer == null) { 208 return word; 209 } 210 else { 211 return lemmatizer.lemmatize(word); 212 } 213 } 214 catch (NullPointerException e) { 215 return word; 216 } 217 } 218 219 public static void main(String[] args) { 220 System.out.println(LinguisticUtil.getInstance().getNormalizedForm("going")); 221 for (String s : LinguisticUtil.getInstance().getWordsFromCamelCase("thisIsAClassWith1Name123")) { 222 System.out.println(s); 223 for (String w : LinguisticUtil.getInstance().getSynonymsForWord(s)) { 224 System.out.println(" --> " + w); 225 } 226 } 227 } 228}