/**
 * Copyright (C) 2007 - 2016, Jens Lehmann
 *
 * This file is part of DL-Learner.
 *
 * DL-Learner is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * DL-Learner is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package org.dllearner.learningproblems;

import com.google.common.collect.Sets;
import org.apache.log4j.Logger;
import org.dllearner.core.*;
import org.dllearner.core.config.ConfigOption;
import org.dllearner.reasoning.SPARQLReasoner;
import org.dllearner.utilities.Helper;
import org.semanticweb.owlapi.model.OWLClassExpression;
import org.semanticweb.owlapi.model.OWLIndividual;
import org.semanticweb.owlapi.model.OWLNamedIndividual;

import java.util.*;

/**
 * A learning problem, where we learn from positive examples only.
 *
 * Accuracy is a combination of coverage (fraction of positive examples
 * covered by a class expression) and "protusion" (fraction of the
 * expression's instances that are positive examples); see
 * {@link #getAccuracy(double, double)}.
 *
 * @author Jens Lehmann
 *
 */
@ComponentAnn(name = "positive only learning problem", shortName = "posonlylp", version = 0.6)
public class PosOnlyLP extends AbstractClassExpressionLearningProblem<ScorePosOnly<OWLNamedIndividual>> {

	private static final Logger logger = Logger.getLogger(PosOnlyLP.class);

	// NOTE(review): written in getAccuracyOrTooWeakExact() but never read in this
	// class — possibly left over from earlier timing/instrumentation code.
	private long nanoStartTime;

	@ConfigOption(description = "the positive examples", required = true)
	protected SortedSet<OWLIndividual> positiveExamples;

	// shuffled copy of the positive examples, used for sampling in the
	// approximate accuracy computation (fixed seed, see init())
	private List<OWLIndividual> positiveExamplesShuffled;
//	protected SortedSet<OWLIndividual> pseudoNegatives;

	// shuffled list of all individuals in the KB; only populated when a
	// reasoner is available (see init())
	private List<OWLIndividual> individuals;

	private boolean useApproximations = false;

	// approximation of accuracy +- 0.03 %
	private static final double approx = 0.03;

	// factor for higher weight on recall (needed for subclass learning)
	private double coverageFactor;

	public PosOnlyLP() {}

	public PosOnlyLP(AbstractReasonerComponent reasoningService) {
		super(reasoningService);
	}

	public PosOnlyLP(AbstractReasonerComponent reasoningService, SortedSet<OWLIndividual> positiveExamples) {
		super(reasoningService);
		this.positiveExamples = positiveExamples;
	}

	/**
	 * Initialises the learning problem: optionally loads examples via the
	 * example loader helper, sanity-checks that all positive examples occur in
	 * the knowledge base, and prepares shuffled example/individual lists
	 * (deterministic, seed 1) for the approximate accuracy computation.
	 *
	 * @throws ComponentInitException if a SPARQL reasoner is configured
	 *         (unsupported for this learning problem) or example loading fails
	 */
	@Override
	public void init() throws ComponentInitException {
		ExampleLoader exampleLoaderHelper = this.getExampleLoaderHelper();
		if (exampleLoaderHelper != null && !exampleLoaderHelper.isInitialized()) {
			logger.info("Loading examples by expression");
			exampleLoaderHelper.setPosOnlyLP(this);
			exampleLoaderHelper.init();
		}

		// fixed seed so runs are reproducible
		Random rand = new Random(1);

		if(getReasoner() != null) {
			if(reasoner instanceof SPARQLReasoner) {
				throw new ComponentInitException("SPARQL reasoner currently not supported for PosOnly learning problem");
			}

			// sanity check whether examples are contained in KB
			Helper.checkIndividuals(reasoner, positiveExamples);

			SortedSet<OWLIndividual> allIndividuals = getReasoner().getIndividuals();

			this.individuals = new LinkedList<>(allIndividuals);
			Collections.shuffle(this.individuals, rand);
		}

		positiveExamplesShuffled = new LinkedList<>(positiveExamples);
		Collections.shuffle(positiveExamplesShuffled, rand);

		initialized = true;
	}

	public SortedSet<OWLIndividual> getPositiveExamples() {
		return positiveExamples;
	}

	/**
	 * @param useApproximations the useApproximations to set
	 */
	public void setUseApproximations(boolean useApproximations) {
		this.useApproximations = useApproximations;
	}

	/* (non-Javadoc)
	 * @see org.dllearner.core.LearningProblem#computeScore(org.dllearner.core.owl.Description)
	 */
	@Override
	@SuppressWarnings({"unchecked", "rawtypes"})
	public ScorePosOnly<OWLNamedIndividual> computeScore(OWLClassExpression description, double noise) {
		// retrieve all instances of the description and partition the positive
		// examples into covered / not covered
		Set<OWLIndividual> retrieval = getReasoner().getIndividuals(description);

		Set<OWLIndividual> instancesCovered = new TreeSet<>();
		Set<OWLIndividual> instancesNotCovered = new TreeSet<>();
		for(OWLIndividual ind : positiveExamples) {
			if(retrieval.contains(ind)) {
				instancesCovered.add(ind);
			} else {
				instancesNotCovered.add(ind);
			}
		}

		double coverage = instancesCovered.size()/(double)positiveExamples.size();
		double protusion = retrieval.isEmpty() ? 0 : instancesCovered.size()/(double)retrieval.size();

		// pass only additional (non-positive) instances to the score object
		retrieval.removeAll(instancesCovered);
		return new ScorePosOnly(instancesCovered, instancesNotCovered, coverage, retrieval, protusion, getAccuracy(coverage, protusion));
	}

	/**
	 * Approximate accuracy computation: instead of full retrieval, instance
	 * checks are performed on the shuffled example/individual lists until a
	 * 95% confidence interval (see {@link #p1(int, int)} / {@link #p3(double, int)})
	 * is narrow enough to estimate coverage and protusion.
	 *
	 * @return the (approximate) accuracy, or -1 ("too weak") if too many
	 *         positive examples are provably uncovered
	 */
	private double getAccuracyOrTooWeakApprox(OWLClassExpression description, double noise) {

		// instead of using the standard operation, we use optimisation
		// and approximation here

		// we abort when there are too many uncovered positives
		int maxNotCovered = (int) Math.ceil(noise*positiveExamples.size());
		int instancesCovered = 0;
		int instancesNotCovered = 0;
		int total;
		boolean estimatedA = false;

		double lowerBorderA = 0;
		int lowerEstimateA = 0;
		double upperBorderA = 1;
		int upperEstimateA = positiveExamples.size();

		// phase 1: estimate coverage on the positive examples
		for(OWLIndividual ind : positiveExamplesShuffled) {
			if(getReasoner().hasType(description, ind)) {
				instancesCovered++;
			} else {
				instancesNotCovered ++;
				if(instancesNotCovered > maxNotCovered) {
					return -1;
				}
			}

			// approximation step (starting after 10 tests)
			total = instancesCovered + instancesNotCovered;
			if(total > 10) {
				// compute confidence interval
				double p1 = p1(instancesCovered, total);
				double p2 = p3(p1, total);
				lowerBorderA = Math.max(0, p1 - p2);
				upperBorderA = Math.min(1, p1 + p2);
				double size = upperBorderA - lowerBorderA;
				// if the interval has a size smaller than 10%, we can be confident
				if(size < 2 * approx) {
					// we have to distinguish the cases that the accuracy limit is
					// below, within, or above the limit and that the mean is below
					// or above the limit
					double mean = instancesCovered/(double)total;

					// if the mean is greater than the required minimum, we can accept;
					// we also accept if the interval is small and close to the minimum
					// (worst case is to accept a few inaccurate descriptions)
					// NOTE(review): upperBorderA > mean holds whenever the interval
					// has positive width, so the second condition effectively reduces
					// to size < 0.03 — verify against the original paper whether a
					// comparison against noise was intended here.
					if(mean > noise || (upperBorderA > mean && size < 0.03)) {
						instancesCovered = (int) (instancesCovered/(double)total * positiveExamples.size());
						upperEstimateA = (int) (upperBorderA * positiveExamples.size());
						lowerEstimateA = (int) (lowerBorderA * positiveExamples.size());
						estimatedA = true;
						break;
					}

					// reject only if the upper border is far away (we are very
					// certain not to lose a potential solution)
					if(upperBorderA + 0.1 < noise) {
						return -1;
					}
				}
			}
		}

		double coverage = instancesCovered/(double)positiveExamples.size();

		// phase 2: estimate how many individuals overall are instances of the
		// description (to approximate protusion)
		int testsPerformed = 0;
		int instancesDescription = 0;

		for(OWLIndividual ind : individuals) {

			if(getReasoner().hasType(description, ind)) {
				instancesDescription++;
			}

			testsPerformed++;

			if(testsPerformed > 10) {

				// compute confidence interval
				double p1 = p1(instancesDescription, testsPerformed);
				double p2 = p3(p1, testsPerformed);
				double lowerBorder = Math.max(0, p1 - p2);
				double upperBorder = Math.min(1, p1 + p2);
				int lowerEstimate = (int) (lowerBorder * individuals.size());
				int upperEstimate = (int) (upperBorder * individuals.size());

				// width of the accuracy interval implied by the current estimates;
				// alternative closed-form expressions (kept for reference):
				//   1/(coverageFactor+1) * (coverageFactor * (upperBorderA-lowerBorderA)
				//     + sqrt(upperEstimateA/(upperEstimateA+lowerEstimate))
				//     + sqrt(lowerEstimateA/(lowerEstimateA+upperEstimate)))
				double size;
				if(estimatedA) {
					size = getAccuracy(upperBorderA, upperEstimateA/(double)(upperEstimateA+lowerEstimate)) - getAccuracy(lowerBorderA, lowerEstimateA/(double)(lowerEstimateA+upperEstimate));
				} else {
					size = getAccuracy(coverage, instancesCovered/(double)(instancesCovered+lowerEstimate)) - getAccuracy(coverage, instancesCovered/(double)(instancesCovered+upperEstimate));
				}

				if(size < 0.1) {
					// extrapolate to the total number of instances
					instancesDescription = (int) (instancesDescription/(double)testsPerformed * individuals.size());
					break;
				}
			}
		}

		// since we measured/estimated accuracy only on instances outside A (superClassInstances
		// does not include instances of A), we need to add it in the denominator
		// NOTE(review): in this class 'individuals' is the full individual list
		// (see init()), so the comment above — inherited from the superclass-learning
		// variant — may not match; confirm against the class-learning problem code.
		// guard the denominator before dividing (avoids a transient 0/0 -> NaN)
		int denominator = instancesDescription + instancesCovered;
		double protusion = (denominator == 0) ? 0 : instancesCovered/(double)denominator;

		return getAccuracy(coverage, protusion);
	}

	/* (non-Javadoc)
	 * @see org.dllearner.core.AbstractLearningProblem#getAccuracyOrTooWeak(org.dllearner.core.owl.Description, double)
	 */
	@Override
	public double getAccuracyOrTooWeak(OWLClassExpression description, double noise) {
		return useApproximations ? getAccuracyOrTooWeakApprox(description, noise) : getAccuracyOrTooWeakExact(description, noise);
	}

	// exact computation for 5 heuristics; each one adapted to super class
	// learning;
	// each one takes the noise parameter into account
	private double getAccuracyOrTooWeakExact(OWLClassExpression description, double noise) {

		nanoStartTime = System.nanoTime();

		SortedSet<OWLIndividual> individualsC = reasoner.getIndividuals(description);

		// computing R(C) restricted to relevant instances
		int additionalInstances = Sets.difference(individualsC, positiveExamples).size();

		// computing R(A)
		int coveredInstances = Sets.intersection(individualsC, positiveExamples).size();

		double recall = coveredInstances / (double) positiveExamples.size();

		// noise computation is incorrect
		// if(recall < 1 - noise) {
		// 	return -1;
		// }

		double precision = (additionalInstances + coveredInstances == 0) ? 0 : coveredInstances
				/ (double) (coveredInstances + additionalInstances);

		// best reachable concept has same recall and precision 1:
		if (((1 + Math.sqrt(coverageFactor)) * recall) / (Math.sqrt(coverageFactor) + 1) < 1 - noise) {
			return -1;
		} else {
			return Heuristics.getFScore(recall, precision, coverageFactor);
		}

	}

	// see paper: expression used in confidence interval estimation (95% z-score)
	private static double p3(double p1, int total) {
		return 1.96 * Math.sqrt(p1*(1-p1)/(total+4));
	}

	// see paper: p' (adjusted proportion estimate, Agresti-Coull style)
	private static double p1(int success, int total) {
		return (success+2)/(double)(total+4);
	}

	@Override
	@SuppressWarnings("unchecked")
	public EvaluatedDescription<ScorePosOnly<OWLNamedIndividual>> evaluate(OWLClassExpression description, double noise) {
		ScorePosOnly score = computeScore(description, noise);
		return new EvaluatedDescriptionPosOnly(description, score);
	}

	/**
	 * Combined accuracy: average of coverage and the square root of protusion
	 * (the sqrt dampens the influence of low precision).
	 */
	private double getAccuracy(double coverage, double protusion) {
		return 0.5 * (coverage + Math.sqrt(protusion));
	}

	public void setPositiveExamples(SortedSet<OWLIndividual> positiveExamples) {
		this.positiveExamples = positiveExamples;
	}

	public void setPositiveExamples(Set<OWLIndividual> positiveExamples) {
		this.positiveExamples = new TreeSet<>(positiveExamples);
	}
}