Source code

001/**
002 * 
003 */
004package org.dllearner.algorithms.isle;
005
006import java.io.IOException;
007import java.nio.file.Files;
008import java.util.HashMap;
009import java.util.HashSet;
010import java.util.Map;
011import java.util.Set;
012
013import org.apache.commons.math3.linear.ArrayRealVector;
014import org.apache.commons.math3.linear.RealVector;
015import org.apache.lucene.analysis.Analyzer;
016import org.apache.lucene.analysis.core.SimpleAnalyzer;
017import org.apache.lucene.document.Document;
018import org.apache.lucene.document.Field;
019import org.apache.lucene.document.FieldType;
020import org.apache.lucene.index.DirectoryReader;
021import org.apache.lucene.index.IndexOptions;
022import org.apache.lucene.index.IndexReader;
023import org.apache.lucene.index.IndexWriter;
024import org.apache.lucene.index.IndexWriterConfig;
025import org.apache.lucene.index.Term;
026import org.apache.lucene.index.Terms;
027import org.apache.lucene.index.TermsEnum;
028import org.apache.lucene.store.Directory;
029import org.apache.lucene.store.MMapDirectory;
030import org.apache.lucene.util.BytesRef;
031
032/**
033 * Imagine an N-dimensional space where N is the number of unique words in a pair of texts. Each of the two texts 
034 * can be treated like a vector in this N-dimensional space. The distance between the two vectors is an indication 
035 * of the similarity of the two texts. The cosine of the angle between the two vectors is the most common distance measure.
036 * @author Lorenz Buehmann
037 *
038 */
039public class VSMCosineDocumentSimilarity {
040        
041        enum TermWeighting {
042                TF, TF_IDF
043        }
044        
045        public static final String CONTENT = "Content";
046    public static final FieldType TYPE_STORED = new FieldType();
047    
048    private final Set<String> terms = new HashSet<>();
049    private final RealVector v1;
050    private final RealVector v2;
051    
052    static {
053        TYPE_STORED.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
054        TYPE_STORED.setTokenized(true);
055        TYPE_STORED.setStored(true);
056        TYPE_STORED.setStoreTermVectors(true);
057        TYPE_STORED.setStoreTermVectorPositions(true);
058        TYPE_STORED.freeze();
059    }
060    
061    public VSMCosineDocumentSimilarity(String s1, String s2, TermWeighting termWeighting) throws IOException {
062        //create the index
063        Directory directory = createIndex(s1, s2);
064        IndexReader reader = DirectoryReader.open(directory);
065        //generate the document vectors
066        if(termWeighting == TermWeighting.TF){//based on term frequency only
067                //compute the term frequencies for document 1
068            Map<String, Integer> f1 = getTermFrequencies(reader, 0);
069            //compute the term frequencies for document 2
070            Map<String, Integer> f2 = getTermFrequencies(reader, 1);
071            reader.close();
072            //map both documents to vector objects
073            v1 = getTermVectorInteger(f1);
074            v2 = getTermVectorInteger(f2);
075        } else if(termWeighting == TermWeighting.TF_IDF){//based on tf*idf weighting
076                //compute the term frequencies for document 1
077            Map<String, Double> f1 = getTermWeights(reader, 0);
078            //compute the term frequencies for document 2
079            Map<String, Double> f2 = getTermWeights(reader, 1);
080            reader.close();
081            //map both documents to vector objects
082            v1 = getTermVectorDouble(f1);
083            v2 = getTermVectorDouble(f2);
084        } else {
085                v1 = null;
086                v2 = null;
087        }
088    }
089    
090    public VSMCosineDocumentSimilarity(String s1, String s2) throws IOException {
091        this(s1, s2, TermWeighting.TF_IDF);
092    }
093    
094    /**
095     * Returns the cosine document similarity between document {@code doc1} and {@code doc2} using TF-IDF as weighting for each term.
096     * The resulting similarity ranges from -1 meaning exactly opposite, to 1 meaning exactly the same, 
097     * with 0 usually indicating independence, and in-between values indicating intermediate similarity or dissimilarity.
098     * @param doc1
099     * @param doc2
100     * @return
101     * @throws IOException
102     */
103    public static double getCosineSimilarity(String doc1, String doc2)
104            throws IOException {
105        return new VSMCosineDocumentSimilarity(doc1, doc2).getCosineSimilarity();
106    }
107    
108    /**
109     * Returns the cosine document similarity between document {@code doc1} and {@code doc2} based on {@code termWeighting} to compute the weight
110     * for each term in the documents.
111     * The resulting similarity ranges from -1 meaning exactly opposite, to 1 meaning exactly the same, 
112     * with 0 usually indicating independence, and in-between values indicating intermediate similarity or dissimilarity.
113     * @param doc1
114     * @param doc2
115     * @return
116     * @throws IOException
117     */
118    public static double getCosineSimilarity(String doc1, String doc2, TermWeighting termWeighting)
119            throws IOException {
120        return new VSMCosineDocumentSimilarity(doc1, doc2, termWeighting).getCosineSimilarity();
121    }
122    
123    /**
124     * Create a in-memory Lucene index for both documents.
125     * @param s1
126     * @param s2
127     * @return
128     * @throws IOException
129     */
130    private Directory createIndex(String s1, String s2) throws IOException {
131        Directory directory = new MMapDirectory(Files.createTempDirectory("Lucene"));
132        Analyzer analyzer = new SimpleAnalyzer();
133        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
134        IndexWriter writer = new IndexWriter(directory, iwc);
135        addDocument(writer, s1);
136        addDocument(writer, s2);
137        writer.close();
138        return directory;
139    }
140    
141    /**
142     * Add the document to the Lucene index.
143     * @param writer
144     * @param content
145     * @throws IOException
146     */
147    private void addDocument(IndexWriter writer, String content) throws IOException {
148        Document doc = new Document();
149        Field field = new Field(CONTENT, content, TYPE_STORED);
150        doc.add(field);
151        writer.addDocument(doc);
152    }
153    
154    /**
155     * Get the frequency of each term contained in the document.
156     * @param reader
157     * @param docId
158     * @return
159     * @throws IOException
160     */
161    private Map<String, Integer> getTermFrequencies(IndexReader reader, int docId)
162            throws IOException {
163        Terms vector = reader.getTermVector(docId, CONTENT);
164        TermsEnum termsEnum = vector.iterator();
165        Map<String, Integer> frequencies = new HashMap<>();
166        BytesRef text = null;
167        while ((text = termsEnum.next()) != null) {
168            String term = text.utf8ToString();
169            int freq = (int) termsEnum.totalTermFreq();
170            frequencies.put(term, freq);
171            terms.add(term);
172        }
173        return frequencies;
174    }
175    
176    /**
177     * Get the weight(tf*idf) of each term contained in the document.
178     * @param reader
179     * @param docId
180     * @return
181     * @throws IOException
182     */
183    private Map<String, Double> getTermWeights(IndexReader reader, int docId)
184            throws IOException {
185        Terms vector = reader.getTermVector(docId, CONTENT);
186        //TODO: not sure if this is reasonable but it prevents NPEs
187        if (vector == null) {
188            return new HashMap<>();
189        }
190        TermsEnum termsEnum = vector.iterator();
191        Map<String, Double> weights = new HashMap<>();
192        BytesRef text = null;
193        while ((text = termsEnum.next()) != null) {
194            String term = text.utf8ToString();
195            //get the term frequency
196            int tf = (int) termsEnum.totalTermFreq();
197            //get the document frequency
198            int df = reader.docFreq(new Term(CONTENT, text));
199            //compute the inverse document frequency
200            double idf = getIDF(reader.numDocs(), df);
201            //compute tf*idf
202            double weight = tf * idf;
203            
204            weights.put(term, weight);
205            terms.add(term);
206        }
207        return weights;
208    }
209    
210    private double getIDF(int totalNumberOfDocuments, int documentFrequency){
211        return 1 + Math.log(totalNumberOfDocuments/documentFrequency);
212    }
213    
214    private double getCosineSimilarity() {
215        return (v1.dotProduct(v2)) / (v1.getNorm() * v2.getNorm());
216    }
217    
218    private RealVector getTermVectorInteger(Map<String, Integer> map) {
219        RealVector vector = new ArrayRealVector(terms.size());
220        int i = 0;
221        for (String term : terms) {
222            int value = map.containsKey(term) ? map.get(term) : 0;
223            vector.setEntry(i++, value);
224        }
225        return vector.mapDivide(vector.getL1Norm());
226    }
227    
228    private RealVector getTermVectorDouble(Map<String, Double> map) {
229        RealVector vector = new ArrayRealVector(terms.size());
230        int i = 0;
231        for (String term : terms) {
232            double value = map.containsKey(term) ? map.get(term) : 0d;
233            vector.setEntry(i++, value);
234        }
235        return vector.mapDivide(vector.getL1Norm());
236    }
237    
238    public static void main(String[] args) throws Exception {
239                double cosineSimilarity = VSMCosineDocumentSimilarity.getCosineSimilarity("The king is here", "The salad is cold");
240                System.out.println(cosineSimilarity);
241        }
242
243}