001/** 002 * Copyright (C) 2007 - 2016, Jens Lehmann 003 * 004 * This file is part of DL-Learner. 005 * 006 * DL-Learner is free software; you can redistribute it and/or modify 007 * it under the terms of the GNU General Public License as published by 008 * the Free Software Foundation; either version 3 of the License, or 009 * (at your option) any later version. 010 * 011 * DL-Learner is distributed in the hope that it will be useful, 012 * but WITHOUT ANY WARRANTY; without even the implied warranty of 013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 014 * GNU General Public License for more details. 015 * 016 * You should have received a copy of the GNU General Public License 017 * along with this program. If not, see <http://www.gnu.org/licenses/>. 018 */ 019package org.dllearner.algorithms.qtl.filters; 020 021import org.apache.jena.rdf.model.RDFNode; 022import org.apache.jena.rdf.model.Statement; 023import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric; 024import uk.ac.shef.wit.simmetrics.similaritymetrics.JaroWinkler; 025import uk.ac.shef.wit.simmetrics.similaritymetrics.Levenshtein; 026import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance; 027 028import java.util.*; 029import java.util.Map.Entry; 030import java.util.function.Predicate; 031 032public class KeywordBasedStatementFilter implements Predicate<Statement> { 033 034 private Set<String> questionWords; 035 036 private AbstractStringMetric qGramMetric; 037 private AbstractStringMetric levensteinMetric; 038 private AbstractStringMetric jaroWinklerMetric; 039 private I_Sub substringMetric; 040 041 private double threshold = 0.4; 042 043 private int topK = 3; 044 private double topKSumThreshold = 0.8; 045 046 private Map<Statement, Double> statement2Similarity = new HashMap<>(); 047 048 private Map<RDFNode, Boolean> cache = new HashMap<>(); 049 050 int cnt = 0; 051 052 public KeywordBasedStatementFilter(Set<String> questionWords){ 053 this.questionWords = questionWords; 054 qGramMetric = new QGramsDistance(); 055 levensteinMetric = new Levenshtein(); 056 jaroWinklerMetric = new JaroWinkler(); 057 substringMetric = new I_Sub(); 058 059 } 060 061 private boolean isSimiliar2QuestionWord(String s, Statement st){ 062 for(String word : questionWords){ 063 if(areSimiliar(word, s, st)){ 064 return true; 065 } 066 } 067 return isSimlarWithSubstringMetrik(s); 068 } 069 070 private boolean areSimiliar(String s1, String s2, Statement st){ 071 return (qGramMetric.getSimilarity(s1, s2) >= threshold) || 072 (levensteinMetric.getSimilarity(s1, s2) >= threshold); 073 } 074 075 private boolean isSimlarWithSubstringMetrik(String s){ 076 SortedSet<Double> values = new TreeSet<>(Collections.reverseOrder()); 077 for(String word : questionWords){ 078 double v = substringMetric.score(word, s, true); 079 if(v >= threshold){ 080 return true; 081 } else { 082 values.add(v); 083 } 084 } 085 double sum = 0; 086 for(Double v : getTopK(values)){ 087 if(v >= 0){ 088 sum += v; 089 } 090 091 } 092 return sum >= topKSumThreshold; 093 } 094 095 private Set<Double> getTopK(SortedSet<Double> values){ 096 Set<Double> top = new HashSet<>(); 097 int k = 0; 098 for(Double v : values){ 099 if(k == topK){ 100 break; 101 } 102 top.add(v); 103 k++; 104 } 105 return top; 106 } 107 108 109 private String getFragment(String uri){ 110 int i = uri.lastIndexOf("#"); 111 if(i > 0){ 112 return uri.substring(i+1); 113 } else { 114 return uri.substring(uri.lastIndexOf("/")+1); 115 } 116 } 117 118 @Override 119 public boolean test(Statement s) { 120 Boolean similarPredicate = cache.get(s.getPredicate()); 121 Boolean similarObject = cache.get(s.getObject()); 122 if(similarPredicate != null && similarObject != null){ 123 return similarPredicate || similarObject; 124 } else if(similarPredicate == null && similarObject != null){ 125 if(similarObject){ 126 return true; 127 } else { 128 String predicate = s.getPredicate().getURI().substring(s.getPredicate().getURI().lastIndexOf("/")); 129 if (isSimiliar2QuestionWord(predicate, s)){ 130 cache.put(s.getPredicate(), true); 131 return true; 132 } else { 133 cache.put(s.getPredicate(), false); 134 return false; 135 } 136 } 137 } else if(similarPredicate != null && similarObject == null){ 138 if(similarPredicate){ 139 return true; 140 } else { 141 String object = null; 142 if(s.getObject().isURIResource()){ 143 object = s.getObject().asResource().getURI(); 144 object = getFragment(s.getObject().asResource().getURI()); 145 } else if(s.getObject().isLiteral()){ 146 object = s.getObject().asLiteral().getLexicalForm(); 147 } 148 if(isSimiliar2QuestionWord(object, s)){ 149 cache.put(s.getObject(), true); 150 return true; 151 } else { 152 cache.put(s.getObject(), false); 153 return false; 154 } 155 } 156 } else { 157 String predicate = s.getPredicate().getURI().substring(s.getPredicate().getURI().lastIndexOf("/")); 158 if (isSimiliar2QuestionWord(predicate, s)){ 159 cache.put(s.getPredicate(), true); 160 return true; 161 } else { 162 cache.put(s.getPredicate(), false); 163 } 164 String object = null; 165 if(s.getObject().isURIResource()){ 166 object = s.getObject().asResource().getURI(); 167 object = getFragment(s.getObject().asResource().getURI()); 168 } else if(s.getObject().isLiteral()){ 169 object = s.getObject().asLiteral().getLexicalForm(); 170 } 171 if(isSimiliar2QuestionWord(object, s)){ 172 cache.put(s.getObject(), true); 173 return true; 174 } else { 175 cache.put(s.getObject(), false); 176 } 177 return false; 178 } 179 } 180 181// @Override 182// public boolean accept(Statement s) { 183// String predicate = s.getPredicate().getURI().substring(s.getPredicate().getURI().lastIndexOf("/")); 184// String object = null; 185// if(s.getObject().isURIResource()){ 186// object = s.getObject().asResource().getURI(); 187// object = getFragment(s.getObject().asResource().getURI()); 188// } else if(s.getObject().isLiteral()){ 189// object = s.getObject().asLiteral().getLexicalForm(); 190// } 191// return isSimiliar2QuestionWord(predicate, s) || isSimiliar2QuestionWord(object, s); 192// } 193 194 public void setThreshold(double threshold){ 195 this.threshold = threshold; 196 } 197 198 public double getThreshold(){ 199 return threshold; 200 } 201 202 public Set<Statement> getStatementsBelowThreshold(double threshold){ 203 Set<Statement> statements = new HashSet<>(); 204 for(Entry<Statement, Double> entry : statement2Similarity.entrySet()){ 205 if(entry.getValue() < threshold){ 206 statements.add(entry.getKey()); 207 } 208 } 209 return statements; 210 } 211 212}