/**
 *
 */
package org.dllearner.algorithms.isle.wsd;

import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import org.dllearner.algorithms.isle.TextDocumentGenerator;
import org.dllearner.algorithms.isle.index.Token;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;

/**
 * A {@link ContextExtractor} that uses the whole sentence containing an
 * annotated token as the disambiguation context: the document is split into
 * sentences with Stanford CoreNLP, and the words of the first sentence that
 * contains the annotation's first token are returned.
 *
 * @author Lorenz Buehmann
 */
public class SentenceBasedContextExtractor implements ContextExtractor {

    /** Minimal CoreNLP pipeline (tokenizer + sentence splitter only). */
    private final StanfordCoreNLP pipeline;

    /**
     * Creates the extractor with a CoreNLP pipeline restricted to tokenization
     * and sentence splitting, which is all this extractor needs.
     */
    public SentenceBasedContextExtractor() {
        Properties props = new Properties();
        // setProperty instead of raw put(): keeps the Properties string-typed
        props.setProperty("annotators", "tokenize, ssplit");
        pipeline = new StanfordCoreNLP(props);
    }

    /* (non-Javadoc)
     * @see org.dllearner.algorithms.isle.wsd.ContextExtractor#extractContext(java.lang.String, java.lang.String)
     */
    @Override
    public List<String> extractContext(org.dllearner.algorithms.isle.index.Annotation annotation) {
        // split the referenced document's raw text into sentences
        List<CoreMap> sentences = getSentences(annotation.getReferencedDocument().getRawContent());

        // return the words of the first sentence containing the annotation's
        // first token (exact surface-form match, same as the original check)
        String target = annotation.getTokens().get(0).getRawForm();
        for (CoreMap sentence : sentences) {
            List<String> words = getWords(sentence);
            if (words.contains(target)) {
                return words;
            }
        }
        throw new RuntimeException("Token " + annotation.getString() + " not found in text " + annotation.getReferencedDocument().getRawContent());
    }

    /**
     * Returns the surface forms of all tokens of the given sentence, in order.
     *
     * @param sentence a CoreNLP sentence carrying a {@link TokensAnnotation}
     * @return the token texts of the sentence
     */
    private List<String> getWords(CoreMap sentence) {
        List<String> words = new ArrayList<>();
        for (CoreLabel label : sentence.get(TokensAnnotation.class)) {
            // TextAnnotation holds the token's surface text
            words.add(label.get(TextAnnotation.class));
        }
        return words;
    }

    /**
     * Splits the given document text into sentences using the CoreNLP pipeline.
     *
     * @param document the raw document text
     * @return the sentences as CoreNLP {@link CoreMap}s
     */
    private List<CoreMap> getSentences(String document) {
        // create an empty Annotation just with the given text
        Annotation annotation = new Annotation(document);

        // run all Annotators on this text
        pipeline.annotate(annotation);

        // these are all the sentences in this document
        // a CoreMap is essentially a Map that uses class objects as keys and
        // has values with custom types
        List<CoreMap> sentences = annotation.get(SentencesAnnotation.class);

        return sentences;
    }

    /** Small manual smoke test: extracts the sentence around "American". */
    public static void main(String[] args) throws Exception {
        String s = "International Business Machines Corporation, or IBM, is an American multinational services technology and consulting corporation, with headquarters in Armonk, New York, United States. IBM manufactures and markets computer hardware and software,"
                + " and offers infrastructure, hosting and consulting services in areas ranging from mainframe computers to nanotechnology.";

        SentenceBasedContextExtractor extractor = new SentenceBasedContextExtractor();
        List<String> context = extractor.extractContext(new org.dllearner.algorithms.isle.index.Annotation(TextDocumentGenerator.getInstance().generateDocument(s), Arrays.asList(new Token("American"))));
        System.out.println(context);
    }

}