001/** 002 * Copyright (C) 2007 - 2016, Jens Lehmann 003 * 004 * This file is part of DL-Learner. 005 * 006 * DL-Learner is free software; you can redistribute it and/or modify 007 * it under the terms of the GNU General Public License as published by 008 * the Free Software Foundation; either version 3 of the License, or 009 * (at your option) any later version. 010 * 011 * DL-Learner is distributed in the hope that it will be useful, 012 * but WITHOUT ANY WARRANTY; without even the implied warranty of 013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 014 * GNU General Public License for more details. 015 * 016 * You should have received a copy of the GNU General Public License 017 * along with this program. If not, see <http://www.gnu.org/licenses/>. 018 */ 019package org.dllearner.algorithms.el; 020 021import org.apache.log4j.Logger; 022import org.dllearner.core.*; 023import org.dllearner.core.config.ConfigOption; 024import org.dllearner.learningproblems.PosNegLP; 025import org.dllearner.learningproblems.ScoreSimple; 026import org.dllearner.refinementoperators.ELDown; 027import org.dllearner.utilities.OWLAPIUtils; 028import org.dllearner.utilities.ReasoningUtils; 029import org.dllearner.utilities.owl.OWLAPIRenderers; 030import org.dllearner.utilities.owl.OWLClassExpressionMinimizer; 031import org.semanticweb.owlapi.manchestersyntax.parser.ManchesterOWLSyntaxParserException; 032import org.semanticweb.owlapi.model.OWLClassExpression; 033import org.semanticweb.owlapi.model.OWLIndividual; 034 035import java.text.DecimalFormat; 036import java.util.*; 037 038/** 039 * A learning algorithm for EL, which will based on an 040 * ideal refinement operator. 041 * 042 * The algorithm learns disjunctions of EL trees as follows: 043 * - given pos. and neg. examples, noise in %, min coverage per tree x % 044 * - it searches for an EL tree, which covers at least x % of all positive examples 045 * and at most (coverage_on_positives * noise) negative examples 046 * - the covered examples are removed from the pos. and neg. examples 047 * - termination: all(?) positive examples covered 048 * 049 * ev. besser: feste Suchzeiten pro Baum => es wird dann jeweils der beste Baum gewählt 050 * => Terminierung, wenn alles gecovered ist oder kein Baum mit ausreichender Qualität 051 * in dem Zeitfenster gefunden wird 052 * 053 * In contrast to many other algorithms, only one solution is returned. Additionally, 054 * the algorithm is not really an anytime algorithm, since the solution is constructed 055 * stepwise as a set of trees. 056 * 057 * Parameter optimisation: 058 * - runtime per tree: 10 seconds 059 * - tradeoff pos/neg: 1.0 1.2 1.4 1.6. 1.8 2.0 060 * - min score: 0 -2.5 -5 -7.5 -10 061 * - tests: 30 062 * - runtime per test: 200 seconds => 2000 seconds cross val => 60000 seconds overall 063 * 064 * Next idea: 065 * - reduce tradeoff for each tree added (start with 2.0 and reduce by 0.1) 066 * - for the last tress it is not very important to cover less negatives 067 * - minimum is something between 0 and -1 (ensures that in the worst case as many 068 * positives as negatives are covered) 069 * - only high impact parameter is runtime (and maybe start tradeoff) 070 * 071 * @author Jens Lehmann 072 * 073 */ 074@ComponentAnn(name="Disjunctive ELTL", shortName="deltl", version=0.5, description="Disjunctive ELTL is an algorithm based on the refinement operator in http://jens-lehmann.org/files/2009/el_ilp.pdf with support for disjunctions.") 075public class ELLearningAlgorithmDisjunctive extends AbstractCELA { 076 077 private static Logger logger = Logger.getLogger(ELLearningAlgorithmDisjunctive.class); 078 079 private ELDown operator; 080 private OWLClassExpressionMinimizer minimizer; 081 082 private SearchTreeNode startNode; 083 private ELHeuristic heuristic; 084 private TreeSet<SearchTreeNode> candidates; 085 // all trees (for fast redundancy check) 086 private TreeSet<ELDescriptionTree> trees; 087 private double noise; 088 089 @ConfigOption(defaultValue = "1.0", description="Specifies how long the algorithm should search for a partial solution (a tree).") 090 private double treeSearchTimeSeconds = 1.0; 091 092 @ConfigOption(defaultValue = "false", description="If yes, then the algorithm tries to cover all positive examples. Note that while this improves accuracy on the testing set, it may lead to overfitting.") 093 private boolean tryFullCoverage = false; 094 095 @ConfigOption(defaultValue="false", description="algorithm will terminate immediately when a correct definition is found") 096 private boolean stopOnFirstDefinition = false; 097 098 @ConfigOption(defaultValue="0.0", description="the (approximated) percentage of noise within the examples") 099 private double noisePercentage = 0.0; 100 101 // the class with which we start the refinement process 102 @ConfigOption(defaultValue="owl:Thing", description="You can specify a start class for the algorithm. To do this, you have to use Manchester OWL syntax without using prefixes.") 103 private OWLClassExpression startClass; 104 105// private double noise = 0; 106 private List<ELDescriptionTree> currentSolution = new LinkedList<>(); 107 private EvaluatedDescription<? extends Score> bestEvaluatedDescription; 108 // how important not to cover negatives 109 private double posWeight = 1.2; // 2; 110 private int startPosExamplesSize; 111// private int startNegExamplesSize; 112 private Set<OWLIndividual> currentPosExamples; 113 private Set<OWLIndividual> currentNegExamples; 114 private SearchTreeNode bestCurrentNode; 115 private Score bestCurrentScore = new ScoreSimple(0); 116 private long treeStartTime; 117 118 // minimum score a tree must have to be part of the solution 119 @ConfigOption(defaultValue = "-1", description = "the minimum quality a tree must have to proceed", required = false) 120 private double minimumTreeScore = -1; 121 122 @ConfigOption(description="whether to do real disjoint tests or check that two named classes do not have common instances") 123 private boolean instanceBasedDisjoints; 124 125 private DecimalFormat decFormat = new DecimalFormat("0.00"); 126 127 public ELLearningAlgorithmDisjunctive() {} 128 129 public ELLearningAlgorithmDisjunctive(AbstractClassExpressionLearningProblem problem, AbstractReasonerComponent reasoner) { 130 super(problem, reasoner); 131 } 132 133 public static Collection<Class<? extends AbstractClassExpressionLearningProblem>> supportedLearningProblems() { 134 Collection<Class<? extends AbstractClassExpressionLearningProblem>> problems = new LinkedList<>(); 135 problems.add(PosNegLP.class); 136 return problems; 137 } 138 139 @Override 140 public void init() throws ComponentInitException { 141 heuristic = new DisjunctiveHeuristic(); 142 candidates = new TreeSet<>(heuristic); 143 trees = new TreeSet<>(new ELDescriptionTreeComparator()); 144 145 if(startClass == null) { 146 startClass = dataFactory.getOWLThing(); 147 } else { 148 try { 149 this.startClass = OWLAPIUtils.classExpressionPropertyExpander(startClass, reasoner, dataFactory); 150 } catch (ManchesterOWLSyntaxParserException e) { 151 logger.info("Error parsing startClass: " + e.getMessage()); 152 this.startClass = dataFactory.getOWLThing(); 153 } 154 } 155 operator = new ELDown(reasoner, instanceBasedDisjoints); 156 operator.init(); 157 158 baseURI = reasoner.getBaseURI(); 159 prefixes = reasoner.getPrefixes(); 160 161 minimizer = new OWLClassExpressionMinimizer(dataFactory, reasoner); 162 163 noise = noisePercentage/100d; 164 165 initialized = true; 166 } 167 168 @Override 169 public void start() { 170// System.out.println("starting disjunctive ELTL algorithm"); 171 stop = false; 172 isRunning = true; 173 reset(); 174 int treeCount = 0; 175 176 while(!stop && !stoppingCriteriaSatisfied()) { 177 178 treeStartTime = System.nanoTime(); 179 // create start node 180 ELDescriptionTree startTree = new ELDescriptionTree(reasoner, startClass); 181 addDescriptionTree(startTree, null); 182// bestCurrentTree = top; 183 bestCurrentScore = new ScoreSimple(ScoreSimple.MIN.getAccuracy()); 184 185 // main loop 186 int loop = 0; 187 while(!stop && !treeCriteriaSatisfied()) { 188 // pick the best candidate according to the heuristic 189 SearchTreeNode best = candidates.pollLast(); 190// System.out.println("best: " + best); 191 192 // apply operator 193 System.out.print("applying operator ..."); 194 List<ELDescriptionTree> refinements = operator.refine(best.getDescriptionTree()); 195 System.out.println("done " + refinements.size() + " refinements"); 196 // add all refinements to search tree, candidates, best descriptions 197 for(ELDescriptionTree refinement : refinements) { 198 addDescriptionTree(refinement, best); 199 } 200 loop++; 201 // logging 202 if(logger.isTraceEnabled()) { 203 logger.trace("Choosen node " + best); 204 logger.trace(startNode.getTreeString(renderer)); 205 logger.trace("Loop " + loop + " completed."); 206 } 207 208// for(SearchTreeNode node : candidates) { 209// System.out.println(node); 210// } 211// System.out.println(candidates.last()); 212// System.out.println(candidates.first()); 213// System.out.println("=="); 214 } 215 216 if(Double.compare(bestCurrentScore.getAccuracy(), minimumTreeScore) > 0) { 217 // we found a tree (partial solution) 218 currentSolution.add(bestCurrentNode.getDescriptionTree()); 219 OWLClassExpression bestDescription = bestCurrentNode.getDescriptionTree().transformToClassExpression(); 220 OWLClassExpression bestCombinedDescription = bestDescription; 221 // form union of trees found so far with 222 if(treeCount==0) { 223 bestEvaluatedDescription = learningProblem.evaluate(bestDescription); 224 bestEvaluatedDescriptions.add(bestEvaluatedDescription); 225 } else { 226 if(!bestEvaluatedDescription.getDescription().equals(dataFactory.getOWLThing())){ 227 bestCombinedDescription = dataFactory.getOWLObjectUnionOf(bestEvaluatedDescription.getDescription(), bestDescription); 228 } 229 bestEvaluatedDescription = learningProblem.evaluate(bestCombinedDescription); 230 bestEvaluatedDescriptions.add(bestEvaluatedDescription); 231 } 232 233 // remove already covered examples 234 Iterator<OWLIndividual> it = currentPosExamples.iterator(); 235 int posCov = 0; 236 while(it.hasNext()) { 237 OWLIndividual ind = it.next(); 238 if(reasoner.hasType(bestDescription, ind)) { 239// System.out.println("covered pos: " + ind); 240 it.remove(); 241 posCov++; 242 } 243 } 244 it = currentNegExamples.iterator(); 245 int negCov = 0; 246 while(it.hasNext()) { 247 OWLIndividual ind = it.next(); 248 if(reasoner.hasType(bestDescription, ind)) { 249// System.out.println("covered neg: " + ind); 250 it.remove(); 251 negCov++; 252 } 253 } 254 logger.info("tree found: " + OWLAPIRenderers.toManchesterOWLSyntax(bestDescription) + " (" + posCov + " pos covered, " + currentPosExamples.size() + " remaining, " + negCov + " neg covered, " + currentNegExamples.size() + " remaining, score: " + bestCurrentNode.getScore() + ")"); 255 logger.info("combined accuracy: " + decFormat.format(bestEvaluatedDescription.getAccuracy())); 256 } else { 257 logger.info("no tree found, which satisfies the minimum criteria - the best was: " + ( bestCurrentNode == null ? "(none)" : OWLAPIRenderers.toManchesterOWLSyntax(bestCurrentNode.getDescriptionTree().transformToClassExpression()) + " with score " + bestCurrentNode.getScore() )); 258 } 259 260 logger.info(trees.size() + " trees checked"); 261 262 // reduce importance of not covering negatives 263 posWeight = Math.max(1.0, posWeight-0.1); 264 265 // reset temporary variables 266 candidates.clear(); 267 trees.clear(); 268 269 treeCount++; 270 } 271 272 // simplify solution (in particular necessary when start class is specified) 273 OWLClassExpression niceDescription = minimizer.minimizeClone(bestEvaluatedDescription.getDescription()); 274 bestEvaluatedDescription = learningProblem.evaluate(niceDescription); 275 276 // print solution 277 logger.info("solution : " + OWLAPIRenderers.toManchesterOWLSyntax(bestEvaluatedDescription.getDescription()) + "(acc: " + bestEvaluatedDescription.getAccuracy() + ")"); 278 279 isRunning = false; 280 } 281 282 // evaluates a class expression in tree form 283 private void addDescriptionTree(ELDescriptionTree descriptionTree, SearchTreeNode parentNode) { 284 285 // redundancy check 286 boolean nonRedundant = trees.add(descriptionTree); 287 if(!nonRedundant) { 288 return; 289 } 290 291 // create search tree node 292 SearchTreeNode node = new SearchTreeNode(descriptionTree); 293 294 // compute score 295 Score score = getTreeScore(descriptionTree); 296 node.setScore(score); 297 298 // link to parent (unless start node) 299 if(parentNode == null) { 300 startNode = node; 301 } else { 302 parentNode.addChild(node); 303 } 304 305 // TODO: define "too weak" as a coverage on negative examples, which is 306 // too high for the tree to be considered 307 if(score.getAccuracy() != Double.NEGATIVE_INFINITY) { 308 candidates.add(node); 309 } 310 311 // check whether this is the best tree 312 if(Double.compare(score.getAccuracy(), bestCurrentScore.getAccuracy()) > 0) { 313 bestCurrentNode = node; 314 bestCurrentScore = score; 315 } 316 } 317 318 private Score getTreeScore(ELDescriptionTree tree) { 319 320 OWLClassExpression d = tree.transformToClassExpression(); 321 322 double score = 0; 323 324 // test coverage on current positive examples 325 ReasoningUtils reasoningUtils = new ReasoningUtils(reasoner); 326 327 ReasoningUtils.CoverageCount[] posCoverageCount = reasoningUtils.getCoverageCount(d, currentPosExamples); 328 score += 1 * posCoverageCount[0].trueCount; 329 330 // penalty if a minimum coverage is not achieved (avoids too many trees where 331 // each tree has only little impact) 332 if((startPosExamplesSize > 10 && posCoverageCount[0].trueCount<3) || posCoverageCount[0].trueCount < 1) { 333// score -= 100; 334 // further refining such a tree will not cover more positives 335 // => reject 336 return ScoreSimple.MIN; 337 } 338 339 // test coverage on current negative examples 340 ReasoningUtils.CoverageCount[] negCoverageCount = reasoningUtils.getCoverageCount(d, currentNegExamples); 341 score -= posWeight * negCoverageCount[0].trueCount; 342 343 // remove - does not make sense 344 // check whether tree is too weak, i.e. covers more than noise % negatives 345// int maxNegCov = (int) Math.round(noise * currentNegExamples.size()); 346// if(negCovered > maxNegCov) { 347// return Double.NEGATIVE_INFINITY; 348// } 349 350 // length penalty 351 score -= 0.1 * tree.getSize(); 352 353// System.out.println("score: " + score); 354 355 return new ScoreSimple(score); 356 } 357 358 private boolean treeCriteriaSatisfied() { 359 // stop if there are no more candidates (unlikely, but can happen) 360 if(candidates.isEmpty()) { 361 return true; 362 } 363 364 long runTime = System.nanoTime() - treeStartTime; 365 double runTimeSeconds = runTime / (double) 1000000000; 366 367 return runTimeSeconds >= treeSearchTimeSeconds; 368 } 369 370 private boolean stoppingCriteriaSatisfied() { 371 372 // stop if we have a node covering all positives and none of the negatives 373// SearchTreeNode bestNode = candidates.last(); 374// return (bestNode.getCoveredNegatives() == 0); 375 376 // stop if there are no more positive examples to cover 377 if(stopOnFirstDefinition && currentPosExamples.size()==0) { 378 return true; 379 } 380 381 // we stop when the score of the last tree added is too low 382 // (indicating that the algorithm could not find anything appropriate 383 // in the timeframe set) 384 if(Double.compare(bestCurrentScore.getAccuracy(), minimumTreeScore) <= 0) { 385 return true; 386 } 387 388 // stop when almost all positive examples have been covered 389 if(tryFullCoverage) { 390 return false; 391 } else { 392 int maxPosRemaining = (int) Math.ceil(startPosExamplesSize * 0.05d); 393 return (currentPosExamples.size()<=maxPosRemaining); 394 } 395 } 396 397 private void reset() { 398 // set all values back to their default values (used for running 399 // the algorithm more than once) 400 candidates.clear(); 401 trees.clear(); 402 currentSolution.clear(); 403 bestEvaluatedDescription = learningProblem.evaluate(dataFactory.getOWLThing()); 404 // we need to clone in order not to modify the learning problem 405 currentPosExamples = new TreeSet<>(((PosNegLP) getLearningProblem()).getPositiveExamples()); 406 currentNegExamples = new TreeSet<>(((PosNegLP) getLearningProblem()).getNegativeExamples()); 407 startPosExamplesSize = currentPosExamples.size(); 408// startNegExamplesSize = currentNegExamples.size(); 409 } 410 411 /** 412 * @return the startNode 413 */ 414 public SearchTreeNode getStartNode() { 415 return startNode; 416 } 417 418 public OWLClassExpression getStartClass() { 419 return startClass; 420 } 421 422 public void setStartClass(OWLClassExpression startClass) { 423 this.startClass = startClass; 424 } 425 426 public boolean isInstanceBasedDisjoints() { 427 return instanceBasedDisjoints; 428 } 429 430 public void setInstanceBasedDisjoints(boolean instanceBasedDisjoints) { 431 this.instanceBasedDisjoints = instanceBasedDisjoints; 432 } 433 434 public double getTreeSearchTimeSeconds() { 435 return treeSearchTimeSeconds; 436 } 437 438 public void setTreeSearchTimeSeconds(double treeSearchTimeSeconds) { 439 this.treeSearchTimeSeconds = treeSearchTimeSeconds; 440 } 441 442 public boolean isTryFullCoverage() { 443 return tryFullCoverage; 444 } 445 446 public void setTryFullCoverage(boolean tryFullCoverage) { 447 this.tryFullCoverage = tryFullCoverage; 448 } 449 450 /** 451 * @return the estimated noise value in percentage 452 */ 453 public double getNoisePercentage() { 454 return noisePercentage; 455 } 456 457 /** 458 * @param noisePercentage the estimated noise value in percentage to set 459 */ 460 public void setNoisePercentage(double noisePercentage) { 461 this.noisePercentage = noisePercentage; 462 } 463 464 public double getMinimumTreeScore() { 465 return minimumTreeScore; 466 } 467 468 public void setMinimumTreeScore(double minimumTreeScore) { 469 this.minimumTreeScore = minimumTreeScore; 470 } 471 472 public boolean isStopOnFirstDefinition() { 473 return stopOnFirstDefinition; 474 } 475 476 public void setStopOnFirstDefinition(boolean stopOnFirstDefinition) { 477 this.stopOnFirstDefinition = stopOnFirstDefinition; 478 } 479}