001/**
002 * Copyright (C) 2007 - 2016, Jens Lehmann
003 *
004 * This file is part of DL-Learner.
005 *
006 * DL-Learner is free software; you can redistribute it and/or modify
007 * it under the terms of the GNU General Public License as published by
008 * the Free Software Foundation; either version 3 of the License, or
009 * (at your option) any later version.
010 *
011 * DL-Learner is distributed in the hope that it will be useful,
012 * but WITHOUT ANY WARRANTY; without even the implied warranty of
013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
014 * GNU General Public License for more details.
015 *
016 * You should have received a copy of the GNU General Public License
017 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
018 */
019package org.dllearner.algorithms.qtl.filters;
020
021import org.apache.jena.rdf.model.RDFNode;
022import org.apache.jena.rdf.model.Statement;
023import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric;
024import uk.ac.shef.wit.simmetrics.similaritymetrics.JaroWinkler;
025import uk.ac.shef.wit.simmetrics.similaritymetrics.Levenshtein;
026import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance;
027
028import java.util.*;
029import java.util.Map.Entry;
030import java.util.function.Predicate;
031
032public class KeywordBasedStatementFilter implements Predicate<Statement> {
033        
034        private Set<String> questionWords;
035        
036        private AbstractStringMetric qGramMetric;
037        private AbstractStringMetric levensteinMetric;
038        private AbstractStringMetric jaroWinklerMetric;
039        private I_Sub substringMetric;
040        
041        private double threshold = 0.4;
042        
043        private int topK = 3;
044        private double topKSumThreshold = 0.8;
045        
046        private Map<Statement, Double> statement2Similarity = new HashMap<>();
047        
048        private Map<RDFNode, Boolean> cache = new HashMap<>();
049        
050        int cnt = 0;
051        
052        public KeywordBasedStatementFilter(Set<String> questionWords){
053                this.questionWords = questionWords;
054                qGramMetric = new QGramsDistance();
055                levensteinMetric = new Levenshtein();
056                jaroWinklerMetric = new JaroWinkler();
057                substringMetric = new I_Sub();
058                
059        }
060
061        private boolean isSimiliar2QuestionWord(String s, Statement st){
062                for(String word : questionWords){
063                        if(areSimiliar(word, s, st)){
064                                return true;
065                        }
066                } 
067                return isSimlarWithSubstringMetrik(s);
068        }
069        
070        private boolean areSimiliar(String s1, String s2, Statement st){
071                return (qGramMetric.getSimilarity(s1, s2) >= threshold) || 
072                (levensteinMetric.getSimilarity(s1, s2) >= threshold);
073        }
074        
075        private boolean isSimlarWithSubstringMetrik(String s){
076                SortedSet<Double> values = new TreeSet<>(Collections.reverseOrder());
077                for(String word : questionWords){
078                        double v = substringMetric.score(word, s, true);
079                        if(v >= threshold){
080                                return true;
081                        } else {
082                                values.add(v);
083                        }
084                } 
085                double sum = 0;
086                for(Double v : getTopK(values)){
087                        if(v >= 0){
088                                sum += v;
089                        }
090                        
091                }
092                return sum >= topKSumThreshold;
093        }
094        
095        private Set<Double> getTopK(SortedSet<Double> values){
096                Set<Double> top = new HashSet<>();
097                int k = 0;
098                for(Double v : values){
099                        if(k == topK){
100                                break;
101                        }
102                        top.add(v);
103                        k++;
104                }
105                return top;
106        }
107        
108        
109        private String getFragment(String uri){
110                int i = uri.lastIndexOf("#");
111                if(i > 0){
112                        return uri.substring(i+1);
113                } else {
114                        return uri.substring(uri.lastIndexOf("/")+1);
115                }
116        }
117
118        @Override
119        public boolean test(Statement s) {
120                Boolean similarPredicate = cache.get(s.getPredicate());
121                Boolean similarObject = cache.get(s.getObject());
122                if(similarPredicate != null && similarObject != null){
123                        return similarPredicate || similarObject;
124                } else if(similarPredicate == null && similarObject != null){
125                        if(similarObject){
126                                return true;
127                        } else {
128                                String predicate = s.getPredicate().getURI().substring(s.getPredicate().getURI().lastIndexOf("/"));
129                                if (isSimiliar2QuestionWord(predicate, s)){
130                                        cache.put(s.getPredicate(), true);
131                                        return true;
132                                } else {
133                                        cache.put(s.getPredicate(), false);
134                                        return false;
135                                }
136                        }
137                } else if(similarPredicate != null && similarObject == null){
138                        if(similarPredicate){
139                                return true;
140                        } else {
141                                String object = null;
142                                if(s.getObject().isURIResource()){
143                                        object = s.getObject().asResource().getURI();
144                                        object = getFragment(s.getObject().asResource().getURI());
145                                } else if(s.getObject().isLiteral()){
146                                        object = s.getObject().asLiteral().getLexicalForm();
147                                }
148                                if(isSimiliar2QuestionWord(object, s)){
149                                        cache.put(s.getObject(), true);
150                                        return true;
151                                } else {
152                                        cache.put(s.getObject(), false);
153                                        return false;
154                                }
155                        }
156                } else {
157                        String predicate = s.getPredicate().getURI().substring(s.getPredicate().getURI().lastIndexOf("/"));
158                        if (isSimiliar2QuestionWord(predicate, s)){
159                                cache.put(s.getPredicate(), true);
160                                return true;
161                        } else {
162                                cache.put(s.getPredicate(), false);
163                        }
164                        String object = null;
165                        if(s.getObject().isURIResource()){
166                                object = s.getObject().asResource().getURI();
167                                object = getFragment(s.getObject().asResource().getURI());
168                        } else if(s.getObject().isLiteral()){
169                                object = s.getObject().asLiteral().getLexicalForm();
170                        }
171                        if(isSimiliar2QuestionWord(object, s)){
172                                cache.put(s.getObject(), true);
173                                return true;
174                        } else {
175                                cache.put(s.getObject(), false);
176                        }
177                        return false;
178                }
179        }
180        
181//      @Override
182//      public boolean accept(Statement s) {
183//              String predicate = s.getPredicate().getURI().substring(s.getPredicate().getURI().lastIndexOf("/"));
184//              String object = null;
185//              if(s.getObject().isURIResource()){
186//                      object = s.getObject().asResource().getURI();
187//                      object = getFragment(s.getObject().asResource().getURI());
188//              } else if(s.getObject().isLiteral()){
189//                      object = s.getObject().asLiteral().getLexicalForm();
190//              }
191//              return isSimiliar2QuestionWord(predicate, s) || isSimiliar2QuestionWord(object, s);
192//      }
193        
194        public void setThreshold(double threshold){
195                this.threshold = threshold;
196        }
197        
198        public double getThreshold(){
199                return threshold;
200        }
201        
202        public Set<Statement> getStatementsBelowThreshold(double threshold){
203                Set<Statement> statements = new HashSet<>();
204                for(Entry<Statement, Double> entry : statement2Similarity.entrySet()){
205                        if(entry.getValue() < threshold){
206                                statements.add(entry.getKey());
207                        }
208                }
209                return statements;
210        }
211
212}