/**
 * Copyright (C) 2007 - 2016, Jens Lehmann
 *
 * This file is part of DL-Learner.
 *
 * DL-Learner is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * DL-Learner is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package org.dllearner.learningproblems;

import com.google.common.collect.Sets;
import org.apache.log4j.Logger;
import org.dllearner.core.*;
import org.dllearner.core.config.ConfigOption;
import org.dllearner.reasoning.SPARQLReasoner;
import org.dllearner.utilities.Helper;
import org.semanticweb.owlapi.model.OWLClassExpression;
import org.semanticweb.owlapi.model.OWLIndividual;
import org.semanticweb.owlapi.model.OWLNamedIndividual;

import java.util.*;

/**
 * A learning problem, where we learn from positive examples only.
 *
 * Accuracy is a combination of coverage (fraction of positive examples
 * covered by a class expression) and "protusion" (fraction of the
 * expression's instances that are positive examples); see
 * {@link #getAccuracy(double, double)}.
 *
 * @author Jens Lehmann
 *
 */
@ComponentAnn(name = "positive only learning problem", shortName = "posonlylp", version = 0.6)
public class PosOnlyLP extends AbstractClassExpressionLearningProblem<ScorePosOnly<OWLNamedIndividual>> {

	private static final Logger logger = Logger.getLogger(PosOnlyLP.class);

	// NOTE(review): written in getAccuracyOrTooWeakExact() but never read in this
	// class — possibly left over from earlier timing/instrumentation code.
	private long nanoStartTime;

	@ConfigOption(description = "the positive examples", required = true)
	protected SortedSet<OWLIndividual> positiveExamples;

	// shuffled copy of the positive examples, used for sampling in the
	// approximate accuracy computation (fixed seed, see init())
	private List<OWLIndividual> positiveExamplesShuffled;
//	protected SortedSet<OWLIndividual> pseudoNegatives;

	// shuffled list of all individuals in the KB; only populated when a
	// reasoner is available (see init())
	private List<OWLIndividual> individuals;

	private boolean useApproximations = false;

	// approximation of accuracy +- 0.03 %
	private static final double approx = 0.03;

	// factor for higher weight on recall (needed for subclass learning)
	private double coverageFactor;

	public PosOnlyLP() {}

	public PosOnlyLP(AbstractReasonerComponent reasoningService) {
		super(reasoningService);
	}

	public PosOnlyLP(AbstractReasonerComponent reasoningService, SortedSet<OWLIndividual> positiveExamples) {
		super(reasoningService);
		this.positiveExamples = positiveExamples;
	}

	/**
	 * Initialises the learning problem: optionally loads examples via the
	 * example loader helper, sanity-checks that all positive examples occur in
	 * the knowledge base, and prepares shuffled example/individual lists
	 * (deterministic, seed 1) for the approximate accuracy computation.
	 *
	 * @throws ComponentInitException if a SPARQL reasoner is configured
	 *         (unsupported for this learning problem) or example loading fails
	 */
	@Override
	public void init() throws ComponentInitException {
		ExampleLoader exampleLoaderHelper = this.getExampleLoaderHelper();
		if (exampleLoaderHelper != null && !exampleLoaderHelper.isInitialized()) {
			logger.info("Loading examples by expression");
			exampleLoaderHelper.setPosOnlyLP(this);
			exampleLoaderHelper.init();
		}

		// fixed seed so runs are reproducible
		Random rand = new Random(1);

		if(getReasoner() != null) {
			if(reasoner instanceof SPARQLReasoner) {
				throw new ComponentInitException("SPARQL reasoner currently not supported for PosOnly learning problem");
			}

			// sanity check whether examples are contained in KB
			Helper.checkIndividuals(reasoner, positiveExamples);

			SortedSet<OWLIndividual> allIndividuals = getReasoner().getIndividuals();

			this.individuals = new LinkedList<>(allIndividuals);
			Collections.shuffle(this.individuals, rand);
		}

		positiveExamplesShuffled = new LinkedList<>(positiveExamples);
		Collections.shuffle(positiveExamplesShuffled, rand);

		initialized = true;
	}

	public SortedSet<OWLIndividual> getPositiveExamples() {
		return positiveExamples;
	}

	/**
	 * @param useApproximations the useApproximations to set
	 */
	public void setUseApproximations(boolean useApproximations) {
		this.useApproximations = useApproximations;
	}

	/* (non-Javadoc)
	 * @see org.dllearner.core.LearningProblem#computeScore(org.dllearner.core.owl.Description)
	 */
	@Override
	@SuppressWarnings({"unchecked", "rawtypes"})
	public ScorePosOnly<OWLNamedIndividual> computeScore(OWLClassExpression description, double noise) {
		// retrieve all instances of the description and partition the positive
		// examples into covered / not covered
		Set<OWLIndividual> retrieval = getReasoner().getIndividuals(description);

		Set<OWLIndividual> instancesCovered = new TreeSet<>();
		Set<OWLIndividual> instancesNotCovered = new TreeSet<>();
		for(OWLIndividual ind : positiveExamples) {
			if(retrieval.contains(ind)) {
				instancesCovered.add(ind);
			} else {
				instancesNotCovered.add(ind);
			}
		}

		double coverage = instancesCovered.size()/(double)positiveExamples.size();
		double protusion = retrieval.isEmpty() ? 0 : instancesCovered.size()/(double)retrieval.size();

		// pass only additional (non-positive) instances to the score object
		retrieval.removeAll(instancesCovered);
		return new ScorePosOnly(instancesCovered, instancesNotCovered, coverage, retrieval, protusion, getAccuracy(coverage, protusion));
	}

	/**
	 * Approximate accuracy computation: instead of full retrieval, instance
	 * checks are performed on the shuffled example/individual lists until a
	 * 95% confidence interval (see {@link #p1(int, int)} / {@link #p3(double, int)})
	 * is narrow enough to estimate coverage and protusion.
	 *
	 * @return the (approximate) accuracy, or -1 ("too weak") if too many
	 *         positive examples are provably uncovered
	 */
	private double getAccuracyOrTooWeakApprox(OWLClassExpression description, double noise) {

		// instead of using the standard operation, we use optimisation
		// and approximation here

		// we abort when there are too many uncovered positives
		int maxNotCovered = (int) Math.ceil(noise*positiveExamples.size());
		int instancesCovered = 0;
		int instancesNotCovered = 0;
		int total;
		boolean estimatedA = false;

		double lowerBorderA = 0;
		int lowerEstimateA = 0;
		double upperBorderA = 1;
		int upperEstimateA = positiveExamples.size();

		// phase 1: estimate coverage on the positive examples
		for(OWLIndividual ind : positiveExamplesShuffled) {
			if(getReasoner().hasType(description, ind)) {
				instancesCovered++;
			} else {
				instancesNotCovered ++;
				if(instancesNotCovered > maxNotCovered) {
					return -1;
				}
			}

			// approximation step (starting after 10 tests)
			total = instancesCovered + instancesNotCovered;
			if(total > 10) {
				// compute confidence interval
				double p1 = p1(instancesCovered, total);
				double p2 = p3(p1, total);
				lowerBorderA = Math.max(0, p1 - p2);
				upperBorderA = Math.min(1, p1 + p2);
				double size = upperBorderA - lowerBorderA;
				// if the interval has a size smaller than 10%, we can be confident
				if(size < 2 * approx) {
					// we have to distinguish the cases that the accuracy limit is
					// below, within, or above the limit and that the mean is below
					// or above the limit
					double mean = instancesCovered/(double)total;

					// if the mean is greater than the required minimum, we can accept;
					// we also accept if the interval is small and close to the minimum
					// (worst case is to accept a few inaccurate descriptions)
					// NOTE(review): upperBorderA > mean holds whenever the interval
					// has positive width, so the second condition effectively reduces
					// to size < 0.03 — verify against the original paper whether a
					// comparison against noise was intended here.
					if(mean > noise || (upperBorderA > mean && size < 0.03)) {
						instancesCovered = (int) (instancesCovered/(double)total * positiveExamples.size());
						upperEstimateA = (int) (upperBorderA * positiveExamples.size());
						lowerEstimateA = (int) (lowerBorderA * positiveExamples.size());
						estimatedA = true;
						break;
					}

					// reject only if the upper border is far away (we are very
					// certain not to lose a potential solution)
					if(upperBorderA + 0.1 < noise) {
						return -1;
					}
				}
			}
		}

		double coverage = instancesCovered/(double)positiveExamples.size();

		// phase 2: estimate how many individuals overall are instances of the
		// description (to approximate protusion)
		int testsPerformed = 0;
		int instancesDescription = 0;

		for(OWLIndividual ind : individuals) {

			if(getReasoner().hasType(description, ind)) {
				instancesDescription++;
			}

			testsPerformed++;

			if(testsPerformed > 10) {

				// compute confidence interval
				double p1 = p1(instancesDescription, testsPerformed);
				double p2 = p3(p1, testsPerformed);
				double lowerBorder = Math.max(0, p1 - p2);
				double upperBorder = Math.min(1, p1 + p2);
				int lowerEstimate = (int) (lowerBorder * individuals.size());
				int upperEstimate = (int) (upperBorder * individuals.size());

				// width of the accuracy interval implied by the current estimates;
				// alternative closed-form expressions (kept for reference):
				//   1/(coverageFactor+1) * (coverageFactor * (upperBorderA-lowerBorderA)
				//     + sqrt(upperEstimateA/(upperEstimateA+lowerEstimate))
				//     + sqrt(lowerEstimateA/(lowerEstimateA+upperEstimate)))
				double size;
				if(estimatedA) {
					size = getAccuracy(upperBorderA, upperEstimateA/(double)(upperEstimateA+lowerEstimate)) - getAccuracy(lowerBorderA, lowerEstimateA/(double)(lowerEstimateA+upperEstimate));
				} else {
					size = getAccuracy(coverage, instancesCovered/(double)(instancesCovered+lowerEstimate)) - getAccuracy(coverage, instancesCovered/(double)(instancesCovered+upperEstimate));
				}

				if(size < 0.1) {
					// extrapolate to the total number of instances
					instancesDescription = (int) (instancesDescription/(double)testsPerformed * individuals.size());
					break;
				}
			}
		}

		// since we measured/estimated accuracy only on instances outside A (superClassInstances
		// does not include instances of A), we need to add it in the denominator
		// NOTE(review): in this class 'individuals' is the full individual list
		// (see init()), so the comment above — inherited from the superclass-learning
		// variant — may not match; confirm against the class-learning problem code.
		// guard the denominator before dividing (avoids a transient 0/0 -> NaN)
		int denominator = instancesDescription + instancesCovered;
		double protusion = (denominator == 0) ? 0 : instancesCovered/(double)denominator;

		return getAccuracy(coverage, protusion);
	}

	/* (non-Javadoc)
	 * @see org.dllearner.core.AbstractLearningProblem#getAccuracyOrTooWeak(org.dllearner.core.owl.Description, double)
	 */
	@Override
	public double getAccuracyOrTooWeak(OWLClassExpression description, double noise) {
		return useApproximations ? getAccuracyOrTooWeakApprox(description, noise) : getAccuracyOrTooWeakExact(description, noise);
	}

	// exact computation for 5 heuristics; each one adapted to super class
	// learning;
	// each one takes the noise parameter into account
	private double getAccuracyOrTooWeakExact(OWLClassExpression description, double noise) {

		nanoStartTime = System.nanoTime();

		SortedSet<OWLIndividual> individualsC = reasoner.getIndividuals(description);

		// computing R(C) restricted to relevant instances
		int additionalInstances = Sets.difference(individualsC, positiveExamples).size();

		// computing R(A)
		int coveredInstances = Sets.intersection(individualsC, positiveExamples).size();

		double recall = coveredInstances / (double) positiveExamples.size();

		// noise computation is incorrect
		// if(recall < 1 - noise) {
		// 	return -1;
		// }

		double precision = (additionalInstances + coveredInstances == 0) ? 0 : coveredInstances
				/ (double) (coveredInstances + additionalInstances);

		// best reachable concept has same recall and precision 1:
		if (((1 + Math.sqrt(coverageFactor)) * recall) / (Math.sqrt(coverageFactor) + 1) < 1 - noise) {
			return -1;
		} else {
			return Heuristics.getFScore(recall, precision, coverageFactor);
		}

	}

	// see paper: expression used in confidence interval estimation (95% z-score)
	private static double p3(double p1, int total) {
		return 1.96 * Math.sqrt(p1*(1-p1)/(total+4));
	}

	// see paper: p' (adjusted proportion estimate, Agresti-Coull style)
	private static double p1(int success, int total) {
		return (success+2)/(double)(total+4);
	}

	@Override
	@SuppressWarnings("unchecked")
	public EvaluatedDescription<ScorePosOnly<OWLNamedIndividual>> evaluate(OWLClassExpression description, double noise) {
		ScorePosOnly score = computeScore(description, noise);
		return new EvaluatedDescriptionPosOnly(description, score);
	}

	/**
	 * Combined accuracy: average of coverage and the square root of protusion
	 * (the sqrt dampens the influence of low precision).
	 */
	private double getAccuracy(double coverage, double protusion) {
		return 0.5 * (coverage + Math.sqrt(protusion));
	}

	public void setPositiveExamples(SortedSet<OWLIndividual> positiveExamples) {
		this.positiveExamples = positiveExamples;
	}

	public void setPositiveExamples(Set<OWLIndividual> positiveExamples) {
		this.positiveExamples = new TreeSet<>(positiveExamples);
	}
}