001/** 002 * Copyright (C) 2007 - 2016, Jens Lehmann 003 * 004 * This file is part of DL-Learner. 005 * 006 * DL-Learner is free software; you can redistribute it and/or modify 007 * it under the terms of the GNU General Public License as published by 008 * the Free Software Foundation; either version 3 of the License, or 009 * (at your option) any later version. 010 * 011 * DL-Learner is distributed in the hope that it will be useful, 012 * but WITHOUT ANY WARRANTY; without even the implied warranty of 013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 014 * GNU General Public License for more details. 015 * 016 * You should have received a copy of the GNU General Public License 017 * along with this program. If not, see <http://www.gnu.org/licenses/>. 018 */ 019package org.dllearner.algorithms.qtl.filters; 020 021import java.util.Collections; 022import java.util.HashSet; 023import java.util.List; 024import java.util.Set; 025import java.util.SortedSet; 026import java.util.TreeSet; 027 028import org.dllearner.algorithms.qtl.datastructures.QueryTree; 029import org.dllearner.algorithms.qtl.datastructures.impl.QueryTreeImpl; 030 031import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric; 032import uk.ac.shef.wit.simmetrics.similaritymetrics.Levenshtein; 033import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance; 034 035public class QuestionBasedQueryTreeFilterAggressive implements QueryTreeFilter{ 036 037private Set<String> questionWords; 038 039 private AbstractStringMetric qGramMetric; 040 private AbstractStringMetric levensteinMetric; 041 private I_Sub substringMetric; 042 043 private double threshold = 0.4; 044 private int topK = 3; 045 private double topKSumThreshold = 0.8; 046 047 private Set<Integer> numbers = new HashSet<>(); 048 049 public QuestionBasedQueryTreeFilterAggressive(Set<String> questionWords){ 050 this.questionWords = questionWords; 051 qGramMetric = new QGramsDistance(); 052 levensteinMetric = new Levenshtein(); 053 substringMetric = new I_Sub(); 054 extractNumbers(); 055 056 } 057 058 @Override 059 public QueryTree<String> getFilteredQueryTree(QueryTree<String> tree){ 060 if(tree.getChildren().isEmpty()){ 061 return tree; 062 } 063 QueryTree<String> copy = new QueryTreeImpl<>(tree); 064 filterTree(copy); 065 return copy; 066 } 067 068 public void setThreshold(double threshold){ 069 this.threshold = threshold; 070 } 071 072 private void filterTree(QueryTree<String> tree){ 073 List<QueryTree<String>> leafs = tree.getLeafs(); 074 QueryTree<String> parent = leafs.get(0).getParent(); 075 String edge = (String) parent.getEdge(leafs.get(0)); 076 String label; 077 for(QueryTree<String> leaf : leafs){ 078 if(!leaf.getParent().getEdge(leaf).equals(edge) || leaf.getParent()!= parent){ 079 removeUnnecessaryEdges(parent, edge); 080 parent = leaf.getParent(); 081 edge = (String) parent.getEdge(leaf); 082 } 083 label = leaf.getUserObject(); 084 edge = (String) leaf.getParent().getEdge(leaf); 085 boolean replace = false; 086 if(leaf.isLiteralNode()){ 087 replace = !literalIsSimiliar2QuestionWord(label); 088 } else { 089 replace = !resourceIsSimilar2QuestionWord(label); 090 } 091 if(replace){ 092 leaf.setUserObject("?"); 093 } 094 095 } 096 } 097 098 private void removeUnnecessaryEdges(QueryTree<String> node, String edge){ 099 List<QueryTree<String>> children = node.getChildren(edge); 100 if(children.size() >= 2){ 101 int removed = 0; 102 for(QueryTree<String> child : children){ 103 if(child.getUserObject().equals("?") && removed < children.size()){ 104 node.removeChild((QueryTreeImpl<String>) child); 105 removed++; 106 } 107 } 108 } 109 110 } 111 112 private boolean resourceIsSimilar2QuestionWord(String resource){ 113 String label = getFragment(resource); 114 for(String word : questionWords){ 115 if(areSimiliar(word, label)){ 116 return true; 117 } 118 } 119 return isSimlarWithSubstringMetrik(label); 120 } 121 122 private boolean literalIsSimiliar2QuestionWord(String literal){ 123 String value = extractLiteralValue(literal); 124 if(isNumber(value)){ 125 if(numbers.isEmpty()){ 126 return false; 127 } else { 128 int i = Integer.parseInt(value); 129 return numbers.contains(i); 130 } 131 } 132 for(String word : questionWords){ 133 if(areSimiliar(word, value)){ 134 return true; 135 } 136 } 137 return isSimlarWithSubstringMetrik(value); 138 } 139 140 private String extractLiteralValue(String literal){ 141 String value = literal; 142 int index = literal.indexOf("^^"); 143 if(index != -1){ 144 value = literal.substring(1, index-1); 145 } else { 146 index = literal.indexOf("@"); 147 if(index != -1){ 148 value = literal.substring(1, index-1); 149 } 150 } 151 return value; 152 153 } 154 155 private void extractNumbers(){ 156 for(String word : questionWords){ 157 if(isNumber(word)){ 158 numbers.add(Integer.valueOf(word)); 159 } 160 } 161 } 162 163 private boolean isNumber(String s){ 164 for (int i = 0; i < s.length(); i++) { 165 if(!Character.isDigit(s.charAt(i))){ 166 return false; 167 } 168 } 169 return true; 170 171 } 172 173 private boolean areSimiliar(String s1, String s2){ 174 return (qGramMetric.getSimilarity(s1, s2) >= threshold) || 175 (levensteinMetric.getSimilarity(s1, s2) >= threshold); 176 } 177 178 private boolean isSimlarWithSubstringMetrik(String s){ 179 SortedSet<Double> values = new TreeSet<>(Collections.reverseOrder()); 180 for(String word : questionWords){ 181 double v = substringMetric.score(word, s, true); 182 if(v >= threshold){ 183 return true; 184 } else { 185 values.add(v); 186 } 187 } 188 double sum = 0; 189 for(Double v : getTopK(values)){ 190 if(v >= 0){ 191 sum += v; 192 } 193 194 } 195 return sum >= topKSumThreshold; 196 } 197 198 private Set<Double> getTopK(SortedSet<Double> values){ 199 Set<Double> top = new HashSet<>(); 200 int k = 0; 201 for(Double v : values){ 202 if(k == topK){ 203 break; 204 } 205 top.add(v); 206 k++; 207 } 208 return top; 209 } 210 211 private String getFragment(String uri){ 212 int i = uri.lastIndexOf("#"); 213 if(i > 0){ 214 return uri.substring(i+1); 215 } else { 216 return uri.substring(uri.lastIndexOf("/")+1); 217 } 218 } 219 220 221}