package org.dllearner.algorithms.isle.wsd;

import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import org.dllearner.algorithms.isle.TextDocumentGenerator;
import org.dllearner.algorithms.isle.index.Token;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;

/**
 * Context extractor that returns a window of at most {@code tokensLeft}
 * tokens before and {@code tokensRight} tokens after the first occurrence
 * of an annotated token, clamped to the boundaries of the enclosing
 * sentence.
 *
 * @author Lorenz Buehmann
 */
public class WindowBasedContextExtractor implements ContextExtractor {

    private final StanfordCoreNLP pipeline;
    private final int tokensLeft;
    private final int tokensRight;

    public WindowBasedContextExtractor(int tokensLeft, int tokensRight) {
        this.tokensLeft = tokensLeft;
        this.tokensRight = tokensRight;

        // tokenization and sentence splitting are sufficient for window extraction
        Properties props = new Properties();
        props.put("annotators", "tokenize, ssplit");
        pipeline = new StanfordCoreNLP(props);
    }

    public WindowBasedContextExtractor(int tokensLeftRight) {
        this(tokensLeftRight, tokensLeftRight);
    }

    public WindowBasedContextExtractor() {
        this(10, 10);
    }

    /*
     * (non-Javadoc)
     *
     * @see org.dllearner.algorithms.isle.wsd.ContextExtractor#extractContext(org.dllearner.algorithms.isle.index.Annotation)
     */
    @Override
    public List<String> extractContext(org.dllearner.algorithms.isle.index.Annotation annotation) {
        // split the document text into sentences
        List<CoreMap> sentences = getSentences(annotation.getReferencedDocument().getRawContent());

        // find the sentence containing the first token of the annotation and
        // return the token window around its position
        Token firstToken = annotation.getTokens().get(0);
        for (CoreMap sentence : sentences) {
            List<CoreLabel> labels = sentence.get(TokensAnnotation.class);
            for (int i = 0; i < labels.size(); i++) {
                // this is the text of the token
                String word = labels.get(i).get(TextAnnotation.class);
                if (word.equals(firstToken.getRawForm())) {
                    return extractWindow(labels, i);
                }
            }
        }
        throw new RuntimeException("Token " + annotation.getString() + " not found in text "
                + annotation.getReferencedDocument().getRawContent());
    }

    /**
     * Returns the words in a window of at most tokensLeft tokens before and
     * tokensRight tokens after the given position, clamped to the sentence
     * boundaries.
     */
    private List<String> extractWindow(List<CoreLabel> labels, int position) {
        int from = Math.max(0, position - tokensLeft);
        int to = Math.min(labels.size(), position + tokensRight + 1);
        List<String> context = new ArrayList<>(to - from);
        for (CoreLabel label : labels.subList(from, to)) {
            context.add(label.get(TextAnnotation.class));
        }
        return context;
    }

    private List<CoreMap> getSentences(String document) {
        // create an empty Annotation just with the given text
        Annotation annotation = new Annotation(document);

        // run the tokenizer and sentence splitter on the text
        pipeline.annotate(annotation);

        // these are all the sentences in this document;
        // a CoreMap is essentially a Map that uses class objects as keys and
        // has values with custom types
        return annotation.get(SentencesAnnotation.class);
    }
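
    // Illustrative convenience overload, not part of the original class: a
    // minimal sketch showing how the same window logic applies when only a
    // raw token form and a plain text are available, without constructing an
    // Annotation first. The method name and signature are assumptions added
    // for illustration.
    public List<String> extractContext(String tokenForm, String text) {
        for (CoreMap sentence : getSentences(text)) {
            List<CoreLabel> labels = sentence.get(TokensAnnotation.class);
            for (int i = 0; i < labels.size(); i++) {
                if (labels.get(i).get(TextAnnotation.class).equals(tokenForm)) {
                    return extractWindow(labels, i);
                }
            }
        }
        throw new RuntimeException("Token " + tokenForm + " not found in text " + text);
    }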

    public static void main(String[] args) throws Exception {
        String s = "International Business Machines Corporation, or IBM, is an American multinational services technology and consulting corporation, with headquarters in Armonk, New York, United States. IBM manufactures and markets computer hardware and software,"
                + " and offers infrastructure, hosting and consulting services in areas ranging from mainframe computers to nanotechnology.";

        String token = "American";
        WindowBasedContextExtractor extractor = new WindowBasedContextExtractor();
        List<String> context = extractor.extractContext(new org.dllearner.algorithms.isle.index.Annotation(
                TextDocumentGenerator.getInstance().generateDocument(s), Arrays.asList(new Token(token))));
        System.out.println(context);
    }

}