001/**
002 * Copyright (C) 2007 - 2016, Jens Lehmann
003 *
004 * This file is part of DL-Learner.
005 *
006 * DL-Learner is free software; you can redistribute it and/or modify
007 * it under the terms of the GNU General Public License as published by
008 * the Free Software Foundation; either version 3 of the License, or
009 * (at your option) any later version.
010 *
011 * DL-Learner is distributed in the hope that it will be useful,
012 * but WITHOUT ANY WARRANTY; without even the implied warranty of
013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
014 * GNU General Public License for more details.
015 *
016 * You should have received a copy of the GNU General Public License
017 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
018 */
019package org.dllearner.algorithms.qtl.filters;
020
021import java.util.Collections;
022import java.util.HashSet;
023import java.util.List;
024import java.util.Set;
025import java.util.SortedSet;
026import java.util.TreeSet;
027
028import org.dllearner.algorithms.qtl.datastructures.QueryTree;
029import org.dllearner.algorithms.qtl.datastructures.impl.QueryTreeImpl;
030
031import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric;
032import uk.ac.shef.wit.simmetrics.similaritymetrics.Levenshtein;
033import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance;
034
035public class QuestionBasedQueryTreeFilterAggressive implements QueryTreeFilter{
036        
037private Set<String> questionWords;
038        
039        private AbstractStringMetric qGramMetric;
040        private AbstractStringMetric levensteinMetric;
041        private I_Sub substringMetric;
042        
043        private double threshold = 0.4;
044        private int topK = 3;
045        private double topKSumThreshold = 0.8;
046        
047        private Set<Integer> numbers = new HashSet<>();
048        
049        public QuestionBasedQueryTreeFilterAggressive(Set<String> questionWords){
050                this.questionWords = questionWords;
051                qGramMetric = new QGramsDistance();
052                levensteinMetric = new Levenshtein();
053                substringMetric = new I_Sub();
054                extractNumbers();
055                
056        }
057        
058        @Override
059        public QueryTree<String> getFilteredQueryTree(QueryTree<String> tree){
060                if(tree.getChildren().isEmpty()){
061                        return tree;
062                }
063                QueryTree<String> copy = new QueryTreeImpl<>(tree);
064                filterTree(copy);
065                return copy;
066        }
067        
068        public void setThreshold(double threshold){
069                this.threshold = threshold;
070        }
071        
072        private void filterTree(QueryTree<String> tree){
073                List<QueryTree<String>> leafs = tree.getLeafs();
074                QueryTree<String> parent = leafs.get(0).getParent();
075                String edge = (String) parent.getEdge(leafs.get(0));
076                String label;
077                for(QueryTree<String> leaf : leafs){
078                        if(!leaf.getParent().getEdge(leaf).equals(edge) || leaf.getParent()!= parent){
079                                removeUnnecessaryEdges(parent, edge);
080                                parent = leaf.getParent();
081                                edge = (String) parent.getEdge(leaf);
082                        }
083                        label = leaf.getUserObject();
084                        edge = (String) leaf.getParent().getEdge(leaf);
085                        boolean replace = false;
086                        if(leaf.isLiteralNode()){
087                                replace = !literalIsSimiliar2QuestionWord(label);
088                        } else {
089                                replace = !resourceIsSimilar2QuestionWord(label);
090                        }
091                        if(replace){
092                                leaf.setUserObject("?");
093                        }
094                        
095                }
096        }
097        
098        private void removeUnnecessaryEdges(QueryTree<String> node, String edge){
099                List<QueryTree<String>> children = node.getChildren(edge);
100                if(children.size() >= 2){
101                        int removed = 0;
102                        for(QueryTree<String> child : children){
103                                if(child.getUserObject().equals("?") && removed < children.size()){
104                                        node.removeChild((QueryTreeImpl<String>) child);
105                                        removed++;
106                                }
107                        }
108                }
109                
110        }
111        
112        private boolean resourceIsSimilar2QuestionWord(String resource){
113                String label = getFragment(resource);
114                for(String word : questionWords){
115                        if(areSimiliar(word, label)){
116                                return true;
117                        }
118                } 
119                return isSimlarWithSubstringMetrik(label);
120        }
121        
122        private boolean literalIsSimiliar2QuestionWord(String literal){
123                String value = extractLiteralValue(literal);
124                if(isNumber(value)){
125                        if(numbers.isEmpty()){
126                                return false;
127                        } else {
128                                int i = Integer.parseInt(value);
129                                return numbers.contains(i);
130                        }
131                }
132                for(String word : questionWords){
133                        if(areSimiliar(word, value)){
134                                return true;
135                        }
136                } 
137                return isSimlarWithSubstringMetrik(value);
138        }
139        
140        private String extractLiteralValue(String literal){
141                String value = literal;
142                int index = literal.indexOf("^^");
143                if(index != -1){
144                        value = literal.substring(1, index-1);
145                } else {
146                        index = literal.indexOf("@");
147                        if(index != -1){
148                                value = literal.substring(1, index-1);
149                        }
150                }
151                return value;
152                
153        }
154        
155        private void extractNumbers(){
156                for(String word : questionWords){
157                        if(isNumber(word)){
158                                numbers.add(Integer.valueOf(word));
159                        }
160                }
161        }
162        
163        private boolean isNumber(String s){
164                for (int i = 0; i < s.length(); i++) {
165                        if(!Character.isDigit(s.charAt(i))){
166                                return false;
167                        }
168                }
169                return true;
170                     
171        }
172        
173        private boolean areSimiliar(String s1, String s2){
174                return (qGramMetric.getSimilarity(s1, s2) >= threshold) || 
175                (levensteinMetric.getSimilarity(s1, s2) >= threshold);
176        }
177        
178        private boolean isSimlarWithSubstringMetrik(String s){
179                SortedSet<Double> values = new TreeSet<>(Collections.reverseOrder());
180                for(String word : questionWords){
181                        double v = substringMetric.score(word, s, true);
182                        if(v >= threshold){
183                                return true;
184                        } else {
185                                values.add(v);
186                        }
187                } 
188                double sum = 0;
189                for(Double v : getTopK(values)){
190                        if(v >= 0){
191                                sum += v;
192                        }
193                        
194                }
195                return sum >= topKSumThreshold;
196        }
197        
198        private Set<Double> getTopK(SortedSet<Double> values){
199                Set<Double> top = new HashSet<>();
200                int k = 0;
201                for(Double v : values){
202                        if(k == topK){
203                                break;
204                        }
205                        top.add(v);
206                        k++;
207                }
208                return top;
209        }
210        
211        private String getFragment(String uri){
212                int i = uri.lastIndexOf("#");
213                if(i > 0){
214                        return uri.substring(i+1);
215                } else {
216                        return uri.substring(uri.lastIndexOf("/")+1);
217                }
218        }
219        
220
221}