/**
 *
 */
package org.dllearner.algorithms.isle.index;

import java.io.Serializable;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import com.google.common.collect.ComparisonChain;

/**
 * A single token of a text. A token always carries its raw surface form and may additionally carry a stemmed
 * form, a POS tag, punctuation/stop word/head flags and a scored collection of alternative surface forms.
 * <p>
 * {@link #equals(Object)} and {@link #hashCode()} are based on the stemmed form and the word type of the POS tag,
 * whereas {@link #compareTo(Token)} orders by raw form and POS tag. The natural ordering is therefore not
 * consistent with {@code equals}.
 *
 * @author Lorenz Buehmann
 *
 */
public class Token implements Comparable<Token>, Serializable{

    private String rawForm;
    private String stemmedForm;
    private String posTag;
    private boolean isPunctuation;
    private boolean isStopWord;
    private boolean isHead;
    // For storing alternative forms of this token, e.g., generated by WordNet synonyms.
    // Initialised here so that every constructor yields a usable (empty) map; previously the single-argument
    // constructor left it null and all alternative-form accessors failed with a NullPointerException.
    // The declared type and modifiers are intentionally left untouched: the class declares no serialVersionUID,
    // so the default one is derived from the field signatures.
    private HashMap<String, Double> alternativeForms = new HashMap<>();


    /**
     * Creates a token that only knows its raw form. Stemmed form and POS tag remain {@code null} and all flags
     * {@code false} until set via the corresponding setters.
     *
     * @param rawForm the surface form as it occurs in the text
     */
    public Token(String rawForm) {
        this.rawForm = rawForm;
    }

    /**
     * Creates a fully annotated token without alternative forms.
     *
     * @param rawForm the surface form as it occurs in the text
     * @param stemmedForm the stemmed form
     * @param posTag the POS tag
     * @param isPunctuation whether the token is punctuation
     * @param isStopWord whether the token is a stop word
     */
    public Token(String rawForm, String stemmedForm, String posTag, boolean isPunctuation, boolean isStopWord) {
        this.rawForm = rawForm;
        this.stemmedForm = stemmedForm;
        this.posTag = posTag;
        this.isPunctuation = isPunctuation;
        this.isStopWord = isStopWord;
    }

    /**
     * @return the rawForm
     */
    public String getRawForm() {
        return rawForm;
    }

    /**
     * @return the stemmedForm
     */
    public String getStemmedForm() {
        return stemmedForm;
    }

    /**
     * @return the posTag
     */
    public String getPOSTag() {
        return posTag;
    }

    /**
     * Returns the unmodifiable set of alternative surface forms for this token. These alternative forms might be
     * generated by, e.g., WordNet synonym expansion.
     *
     * @return unmodifiable set of alternative surface forms for this token; empty if none were added
     */
    public Set<String> getAlternativeForms() {
        return Collections.unmodifiableSet(alternativeForms.keySet());
    }

    /**
     * Returns the map storing the scored alternative forms of this token.
     *
     * @return unmodifiable view mapping each alternative surface form to its score
     */
    public Map<String, Double> getScoredAlternativeForms() {
        return Collections.unmodifiableMap(alternativeForms);
    }

    /**
     * Adds a new surface form to the alternative forms of this token. Alternative forms are included in comparison of
     * two tokens when using the {@link #equalsWithAlternativeForms}. Adding a form that is already present
     * replaces its score.
     *
     * @param alternativeForm the alternative surface form
     * @param score the score associated with the alternative form
     */
    public void addAlternativeForm(String alternativeForm, Double score) {
        this.alternativeForms.put(alternativeForm, score);
    }

    /**
     * @return the isPunctuation
     */
    public boolean isPunctuation() {
        return isPunctuation;
    }

    /**
     * @return the isStopWord
     */
    public boolean isStopWord() {
        return isStopWord;
    }

    /**
     * @param stemmedForm the stemmedForm to set
     */
    public void setStemmedForm(String stemmedForm) {
        this.stemmedForm = stemmedForm;
    }

    /**
     * @param posTag the posTag to set
     */
    public void setPOSTag(String posTag) {
        this.posTag = posTag;
    }

    /**
     * @param isPunctuation the isPunctuation to set
     */
    public void setIsPunctuation(boolean isPunctuation) {
        this.isPunctuation = isPunctuation;
    }

    /**
     * @param isStopWord the isStopWord to set
     */
    public void setIsStopWord(boolean isStopWord) {
        this.isStopWord = isStopWord;
    }

    /**
     * @param isHead the token is the head of the containing sequence of tokens
     */
    public void setIsHead(boolean isHead) {
        this.isHead = isHead;
    }

    /**
     * @return the isHead
     */
    public boolean isHead() {
        return isHead;
    }

    /* (non-Javadoc)
     * @see java.lang.Object#toString()
     */
    @Override
    public String toString() {
        return "[Word: " + rawForm + " | Stemmed word: " + stemmedForm + " | POS tag: " + posTag + " | Alternatives: " + alternativeForms + "]";
    }

    /**
     * Compares the given token to this one including alternative forms. Two tokens are considered equal iff
     * their POS tags are the same and either their stemmed forms are equal or the stemmed form of one token is
     * among the alternative forms of the other. Alternative forms are not matched against each other.
     *
     * @param other token to compare this token to
     * @return true if tokens are equal considering alternative forms, otherwise false
     * @throws NullPointerException if {@code other} is {@code null}, or if the tokens are distinct and this
     *         token's POS tag or the other token's stemmed form has not been set
     */
    public boolean equalsWithAlternativeForms(Token other) {
        if (this == other) {
            return true;
        }

        if (!posTag.equals(other.posTag)) {
            return false;
        }

        return other.stemmedForm.equals(stemmedForm)
                || other.alternativeForms.containsKey(stemmedForm)
                || alternativeForms.containsKey(other.stemmedForm);
    }

    /**
     * Two tokens are equal iff {@code WordTypeComparator.sameWordType} accepts their POS tags and their stemmed
     * forms are equal. Raw form, flags and alternative forms are ignored.
     * <p>
     * NOTE(review): a token whose stemmed form has not been set cannot be compared to another token (the
     * stemmed form is dereferenced); confirm that callers always stem tokens before using them in hash-based
     * collections.
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }

        Token token = (Token) o;

        if (!WordTypeComparator.sameWordType(posTag, token.posTag)) {
            return false;
        }
        return stemmedForm.equals(token.stemmedForm);
    }

    /**
     * Hash code derived from the stemmed form and the word type hash of the POS tag, matching the fields used
     * by {@link #equals(Object)}. Requires the stemmed form to be set.
     */
    @Override
    public int hashCode() {
        int result = stemmedForm.hashCode();
        result = 31 * result + WordTypeComparator.hashCode(posTag);
        return result;
    }

    /* (non-Javadoc)
     * @see java.lang.Comparable#compareTo(java.lang.Object)
     */
    /**
     * Orders tokens by raw form first and POS tag second, both in their natural string order.
     * <p>
     * NOTE(review): this ordering uses different fields than {@link #equals(Object)} (stemmed form and word
     * type), so sorted collections and hash-based collections may disagree on which tokens are duplicates.
     */
    @Override
    public int compareTo(Token other) {
        return ComparisonChain.start()
                .compare(this.rawForm, other.rawForm)
                .compare(this.posTag, other.posTag)
                .result();
    }
}