001/**
002 * Copyright (C) 2007 - 2016, Jens Lehmann
003 *
004 * This file is part of DL-Learner.
005 *
006 * DL-Learner is free software; you can redistribute it and/or modify
007 * it under the terms of the GNU General Public License as published by
008 * the Free Software Foundation; either version 3 of the License, or
009 * (at your option) any later version.
010 *
011 * DL-Learner is distributed in the hope that it will be useful,
012 * but WITHOUT ANY WARRANTY; without even the implied warranty of
013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
014 * GNU General Public License for more details.
015 *
016 * You should have received a copy of the GNU General Public License
017 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
018 */
019package org.dllearner.algorithms.el;
020
021import org.apache.log4j.Logger;
022import org.dllearner.core.*;
023import org.dllearner.core.config.ConfigOption;
024import org.dllearner.learningproblems.PosNegLP;
025import org.dllearner.learningproblems.ScoreSimple;
026import org.dllearner.refinementoperators.ELDown;
027import org.dllearner.utilities.OWLAPIUtils;
028import org.dllearner.utilities.ReasoningUtils;
029import org.dllearner.utilities.owl.OWLAPIRenderers;
030import org.dllearner.utilities.owl.OWLClassExpressionMinimizer;
031import org.semanticweb.owlapi.manchestersyntax.parser.ManchesterOWLSyntaxParserException;
032import org.semanticweb.owlapi.model.OWLClassExpression;
033import org.semanticweb.owlapi.model.OWLIndividual;
034
035import java.text.DecimalFormat;
036import java.util.*;
037
038/**
039 * A learning algorithm for EL, which will based on an
040 * ideal refinement operator.
041 * 
042 * The algorithm learns disjunctions of EL trees as follows:
043 * - given pos. and neg. examples, noise in %, min coverage per tree x %
044 * - it searches for an EL tree, which covers at least x % of all positive examples
045 *   and at most (coverage_on_positives * noise) negative examples
046 * - the covered examples are removed from the pos. and neg. examples
047 * - termination: all(?) positive examples covered
048 * 
049 * ev. besser: feste Suchzeiten pro Baum => es wird dann jeweils der beste Baum gewählt
050 * => Terminierung, wenn alles gecovered ist oder kein Baum mit ausreichender Qualität
051 * in dem Zeitfenster gefunden wird
052 * 
053 * In contrast to many other algorithms, only one solution is returned. Additionally,
054 * the algorithm is not really an anytime algorithm, since the solution is constructed
055 * stepwise as a set of trees. 
056 * 
057 * Parameter optimisation:
058 *  - runtime per tree: 10 seconds
059 *  - tradeoff pos/neg: 1.0 1.2 1.4 1.6. 1.8 2.0
060 *  - min score: 0 -2.5 -5 -7.5 -10
061 *  - tests: 30
062 *  - runtime per test: 200 seconds => 2000 seconds cross val => 60000 seconds overall
063 * 
064 * Next idea: 
065 *  - reduce tradeoff for each tree added (start with 2.0 and reduce by 0.1)
066 *  - for the last tress it is not very important to cover less negatives
067 *  - minimum is something between 0 and -1 (ensures that in the worst case as many
068 *    positives as negatives are covered)
069 *  - only high impact parameter is runtime (and maybe start tradeoff)
070 * 
071 * @author Jens Lehmann
072 *
073 */
074@ComponentAnn(name="Disjunctive ELTL", shortName="deltl", version=0.5, description="Disjunctive ELTL is an algorithm based on the refinement operator in http://jens-lehmann.org/files/2009/el_ilp.pdf with support for disjunctions.")
075public class ELLearningAlgorithmDisjunctive extends AbstractCELA {
076
077        private static Logger logger = Logger.getLogger(ELLearningAlgorithmDisjunctive.class);  
078        
079        private ELDown operator;
080        private OWLClassExpressionMinimizer minimizer;
081        
082        private SearchTreeNode startNode;
083        private ELHeuristic heuristic;
084        private TreeSet<SearchTreeNode> candidates;
085        // all trees (for fast redundancy check)
086        private TreeSet<ELDescriptionTree> trees;
087        private double noise;
088        
089        @ConfigOption(defaultValue = "1.0", description="Specifies how long the algorithm should search for a partial solution (a tree).")
090        private double treeSearchTimeSeconds = 1.0;
091        
092        @ConfigOption(defaultValue = "false", description="If yes, then the algorithm tries to cover all positive examples. Note that while this improves accuracy on the testing set, it may lead to overfitting.")
093        private boolean tryFullCoverage = false;
094        
095        @ConfigOption(defaultValue="false", description="algorithm will terminate immediately when a correct definition is found")
096        private boolean stopOnFirstDefinition = false;
097        
098        @ConfigOption(defaultValue="0.0", description="the (approximated) percentage of noise within the examples")
099        private double noisePercentage = 0.0;
100        
101        // the class with which we start the refinement process
102        @ConfigOption(defaultValue="owl:Thing", description="You can specify a start class for the algorithm. To do this, you have to use Manchester OWL syntax without using prefixes.")
103        private OWLClassExpression startClass;
104        
105//      private double noise = 0;
106        private List<ELDescriptionTree> currentSolution = new LinkedList<>();
107        private EvaluatedDescription<? extends Score> bestEvaluatedDescription;
108        // how important not to cover negatives
109        private double posWeight = 1.2; // 2;
110        private int startPosExamplesSize;
111//      private int startNegExamplesSize;
112        private Set<OWLIndividual> currentPosExamples;
113        private Set<OWLIndividual> currentNegExamples;
114        private SearchTreeNode bestCurrentNode;
115        private Score bestCurrentScore = new ScoreSimple(0);
116        private long treeStartTime;
117
118        // minimum score a tree must have to be part of the solution
119        @ConfigOption(defaultValue = "-1", description = "the minimum quality a tree must have to proceed", required = false)
120        private double minimumTreeScore = -1;
121
122        @ConfigOption(description="whether to do real disjoint tests or check that two named classes do not have common instances")
123        private boolean instanceBasedDisjoints;
124
125        private DecimalFormat decFormat = new DecimalFormat("0.00"); 
126        
127        public ELLearningAlgorithmDisjunctive() {}      
128        
129        public ELLearningAlgorithmDisjunctive(AbstractClassExpressionLearningProblem problem, AbstractReasonerComponent reasoner) {
130                super(problem, reasoner);
131        }
132
133        public static Collection<Class<? extends AbstractClassExpressionLearningProblem>> supportedLearningProblems() {
134                Collection<Class<? extends AbstractClassExpressionLearningProblem>> problems = new LinkedList<>();
135                problems.add(PosNegLP.class);
136                return problems;
137        }
138        
139        @Override
140        public void init() throws ComponentInitException {
141                heuristic = new DisjunctiveHeuristic();
142                candidates = new TreeSet<>(heuristic);
143                trees = new TreeSet<>(new ELDescriptionTreeComparator());
144                
145                if(startClass == null) {
146                        startClass = dataFactory.getOWLThing();
147                } else {
148                        try {
149                                this.startClass = OWLAPIUtils.classExpressionPropertyExpander(startClass, reasoner, dataFactory);
150                        } catch (ManchesterOWLSyntaxParserException e) {
151                                logger.info("Error parsing startClass: " + e.getMessage());
152                                this.startClass = dataFactory.getOWLThing();
153                        }
154                }
155                operator = new ELDown(reasoner, instanceBasedDisjoints);
156                operator.init();
157                
158                baseURI = reasoner.getBaseURI();
159                prefixes = reasoner.getPrefixes();
160                
161                minimizer = new OWLClassExpressionMinimizer(dataFactory, reasoner);
162                
163                noise = noisePercentage/100d;
164                
165                initialized = true;
166        }       
167        
168        @Override
169        public void start() {
170//              System.out.println("starting disjunctive ELTL algorithm");
171                stop = false;
172                isRunning = true;
173                reset();
174                int treeCount = 0;
175                
176                while(!stop && !stoppingCriteriaSatisfied()) {
177                        
178                        treeStartTime = System.nanoTime();
179                        // create start node
180                        ELDescriptionTree startTree = new ELDescriptionTree(reasoner, startClass);
181                        addDescriptionTree(startTree, null);
182//                      bestCurrentTree = top;
183                        bestCurrentScore = new ScoreSimple(ScoreSimple.MIN.getAccuracy());
184                        
185                        // main loop
186                        int loop = 0;
187                        while(!stop && !treeCriteriaSatisfied()) {
188                                // pick the best candidate according to the heuristic
189                                SearchTreeNode best = candidates.pollLast();
190//                              System.out.println("best: " + best);
191                                
192                                // apply operator
193                                System.out.print("applying operator ...");
194                                List<ELDescriptionTree> refinements = operator.refine(best.getDescriptionTree());
195                                System.out.println("done " + refinements.size() + " refinements");
196                                // add all refinements to search tree, candidates, best descriptions
197                                for(ELDescriptionTree refinement : refinements) {
198                                        addDescriptionTree(refinement, best);
199                                }
200                                loop++;
201                                // logging
202                                if(logger.isTraceEnabled()) {
203                                        logger.trace("Choosen node " + best);
204                                        logger.trace(startNode.getTreeString(renderer));
205                                        logger.trace("Loop " + loop + " completed.");
206                                }
207                                
208//                              for(SearchTreeNode node : candidates) {
209//                                      System.out.println(node);
210//                              }
211//                              System.out.println(candidates.last());
212//                              System.out.println(candidates.first());
213//                              System.out.println("==");
214                        }
215                        
216                        if(Double.compare(bestCurrentScore.getAccuracy(), minimumTreeScore) > 0) {
217                                // we found a tree (partial solution)
218                                currentSolution.add(bestCurrentNode.getDescriptionTree());
219                                OWLClassExpression bestDescription = bestCurrentNode.getDescriptionTree().transformToClassExpression();
220                                OWLClassExpression bestCombinedDescription = bestDescription;
221                                // form union of trees found so far with 
222                                if(treeCount==0) {
223                                        bestEvaluatedDescription = learningProblem.evaluate(bestDescription);
224                                        bestEvaluatedDescriptions.add(bestEvaluatedDescription);
225                                } else {
226                                        if(!bestEvaluatedDescription.getDescription().equals(dataFactory.getOWLThing())){
227                                                bestCombinedDescription = dataFactory.getOWLObjectUnionOf(bestEvaluatedDescription.getDescription(), bestDescription);
228                                        }
229                                        bestEvaluatedDescription = learningProblem.evaluate(bestCombinedDescription);
230                                        bestEvaluatedDescriptions.add(bestEvaluatedDescription);
231                                }
232                                
233                                // remove already covered examples
234                                Iterator<OWLIndividual> it = currentPosExamples.iterator();
235                                int posCov = 0;
236                                while(it.hasNext()) {
237                                        OWLIndividual ind = it.next();
238                                        if(reasoner.hasType(bestDescription, ind)) {
239//                                              System.out.println("covered pos: " + ind);
240                                                it.remove();
241                                                posCov++;
242                                        }
243                                }
244                                it = currentNegExamples.iterator();
245                                int negCov = 0;
246                                while(it.hasNext()) {
247                                        OWLIndividual ind = it.next();
248                                        if(reasoner.hasType(bestDescription, ind)) {
249//                                              System.out.println("covered neg: " + ind);
250                                                it.remove();
251                                                negCov++;
252                                        }
253                                }
254                                logger.info("tree found: " + OWLAPIRenderers.toManchesterOWLSyntax(bestDescription) + " (" + posCov + " pos covered, " + currentPosExamples.size() + " remaining, " + negCov + " neg covered, " + currentNegExamples.size() + " remaining, score: " + bestCurrentNode.getScore() + ")");
255                                logger.info("combined accuracy: " + decFormat.format(bestEvaluatedDescription.getAccuracy()));
256                        } else {
257                                logger.info("no tree found, which satisfies the minimum criteria - the best was: " + ( bestCurrentNode == null ? "(none)" : OWLAPIRenderers.toManchesterOWLSyntax(bestCurrentNode.getDescriptionTree().transformToClassExpression()) + " with score " + bestCurrentNode.getScore() ));
258                        }
259                        
260                        logger.info(trees.size() + " trees checked");
261                        
262                        // reduce importance of not covering negatives
263                        posWeight = Math.max(1.0, posWeight-0.1);
264                        
265                        // reset temporary variables
266                        candidates.clear();
267                        trees.clear();
268                        
269                        treeCount++;
270                }
271                
272                // simplify solution (in particular necessary when start class is specified)
273                OWLClassExpression niceDescription = minimizer.minimizeClone(bestEvaluatedDescription.getDescription());
274                bestEvaluatedDescription = learningProblem.evaluate(niceDescription);
275                
276                // print solution
277                logger.info("solution : " + OWLAPIRenderers.toManchesterOWLSyntax(bestEvaluatedDescription.getDescription()) + "(acc: " + bestEvaluatedDescription.getAccuracy() + ")");
278                
279                isRunning = false;
280        }
281
282        // evaluates a class expression in tree form
283        private void addDescriptionTree(ELDescriptionTree descriptionTree, SearchTreeNode parentNode) {
284                
285                // redundancy check
286                boolean nonRedundant = trees.add(descriptionTree);
287                if(!nonRedundant) {
288                        return;
289                }
290                
291                // create search tree node
292                SearchTreeNode node = new SearchTreeNode(descriptionTree);
293                
294                // compute score
295                Score score = getTreeScore(descriptionTree);
296                node.setScore(score);
297                
298                // link to parent (unless start node)
299                if(parentNode == null) {
300                        startNode = node;
301                } else {
302                        parentNode.addChild(node);
303                }
304                
305                // TODO: define "too weak" as a coverage on negative examples, which is
306                // too high for the tree to be considered
307                if(score.getAccuracy() != Double.NEGATIVE_INFINITY) {
308                        candidates.add(node);
309                }
310                
311                // check whether this is the best tree
312                if(Double.compare(score.getAccuracy(), bestCurrentScore.getAccuracy()) > 0) {
313                        bestCurrentNode = node;
314                        bestCurrentScore = score;
315                }
316        }
317        
318        private Score getTreeScore(ELDescriptionTree tree) {
319                
320                OWLClassExpression d = tree.transformToClassExpression();
321                
322                double score = 0;
323                
324                // test coverage on current positive examples
325                ReasoningUtils reasoningUtils = new ReasoningUtils(reasoner);
326
327                ReasoningUtils.CoverageCount[] posCoverageCount = reasoningUtils.getCoverageCount(d, currentPosExamples);
328                score += 1 * posCoverageCount[0].trueCount;
329
330                // penalty if a minimum coverage is not achieved (avoids too many trees where
331                // each tree has only little impact)
332                if((startPosExamplesSize > 10 && posCoverageCount[0].trueCount<3) || posCoverageCount[0].trueCount < 1) {
333//                      score -= 100;
334                        // further refining such a tree will not cover more positives
335                        // => reject
336                        return ScoreSimple.MIN;
337                }
338                
339                // test coverage on current negative examples
340                ReasoningUtils.CoverageCount[] negCoverageCount = reasoningUtils.getCoverageCount(d, currentNegExamples);
341                score -= posWeight * negCoverageCount[0].trueCount;
342
343                // remove - does not make sense
344                // check whether tree is too weak, i.e. covers more than noise % negatives
345//              int maxNegCov = (int) Math.round(noise * currentNegExamples.size());
346//              if(negCovered > maxNegCov) {
347//                      return Double.NEGATIVE_INFINITY;
348//              }
349                
350                // length penalty
351                score -= 0.1 * tree.getSize();
352                
353//              System.out.println("score: " + score);
354                
355                return new ScoreSimple(score);
356        }
357        
358        private boolean treeCriteriaSatisfied() {
359                // stop if there are no more candidates (unlikely, but can happen)
360                if(candidates.isEmpty()) {
361                        return true;
362                }               
363                
364                long runTime = System.nanoTime() - treeStartTime;
365                double runTimeSeconds = runTime / (double) 1000000000;
366
367        return runTimeSeconds >= treeSearchTimeSeconds;
368        }
369        
370        private boolean stoppingCriteriaSatisfied() {
371        
372                // stop if we have a node covering all positives and none of the negatives
373//              SearchTreeNode bestNode = candidates.last();
374//              return (bestNode.getCoveredNegatives() == 0);
375                
376                // stop if there are no more positive examples to cover
377                if(stopOnFirstDefinition && currentPosExamples.size()==0) {
378                        return true;
379                }
380                
381                // we stop when the score of the last tree added is too low
382                // (indicating that the algorithm could not find anything appropriate 
383                // in the timeframe set)
384                if(Double.compare(bestCurrentScore.getAccuracy(), minimumTreeScore) <= 0) {
385                        return true;
386                }
387                
388                // stop when almost all positive examples have been covered
389                if(tryFullCoverage) {
390                        return false;
391                } else {
392                        int maxPosRemaining = (int) Math.ceil(startPosExamplesSize * 0.05d);
393                        return (currentPosExamples.size()<=maxPosRemaining);
394                }
395        }
396        
397        private void reset() {
398                // set all values back to their default values (used for running
399                // the algorithm more than once)
400                candidates.clear();
401                trees.clear();
402                currentSolution.clear();
403                bestEvaluatedDescription = learningProblem.evaluate(dataFactory.getOWLThing());
404                // we need to clone in order not to modify the learning problem
405                currentPosExamples = new TreeSet<>(((PosNegLP) getLearningProblem()).getPositiveExamples());
406                currentNegExamples = new TreeSet<>(((PosNegLP) getLearningProblem()).getNegativeExamples());
407                startPosExamplesSize = currentPosExamples.size();
408//              startNegExamplesSize = currentNegExamples.size();
409        }
410        
411        /**
412         * @return the startNode
413         */
414        public SearchTreeNode getStartNode() {
415                return startNode;
416        }
417
418        public OWLClassExpression getStartClass() {
419                return startClass;
420        }
421
422        public void setStartClass(OWLClassExpression startClass) {
423                this.startClass = startClass;
424        }
425
426        public boolean isInstanceBasedDisjoints() {
427                return instanceBasedDisjoints;
428        }
429
430        public void setInstanceBasedDisjoints(boolean instanceBasedDisjoints) {
431                this.instanceBasedDisjoints = instanceBasedDisjoints;
432        }
433
434        public double getTreeSearchTimeSeconds() {
435                return treeSearchTimeSeconds;
436        }
437
438        public void setTreeSearchTimeSeconds(double treeSearchTimeSeconds) {
439                this.treeSearchTimeSeconds = treeSearchTimeSeconds;
440        }
441
442        public boolean isTryFullCoverage() {
443                return tryFullCoverage;
444        }
445
446        public void setTryFullCoverage(boolean tryFullCoverage) {
447                this.tryFullCoverage = tryFullCoverage;
448        }       
449        
450        /**
451         * @return the estimated noise value in percentage
452         */
453        public double getNoisePercentage() {
454                return noisePercentage;
455        }
456        
457        /**
458         * @param noisePercentage the estimated noise value in percentage to set
459         */
460        public void setNoisePercentage(double noisePercentage) {
461                this.noisePercentage = noisePercentage;
462        }
463
464        public double getMinimumTreeScore() {
465                return minimumTreeScore;
466        }
467
468        public void setMinimumTreeScore(double minimumTreeScore) {
469                this.minimumTreeScore = minimumTreeScore;
470        }
471
472        public boolean isStopOnFirstDefinition() {
473                return stopOnFirstDefinition;
474        }
475
476        public void setStopOnFirstDefinition(boolean stopOnFirstDefinition) {
477                this.stopOnFirstDefinition = stopOnFirstDefinition;
478        }
479}