/**
 *
 */
package org.dllearner.algorithms.isle.wsd;

import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import org.dllearner.algorithms.isle.TextDocumentGenerator;
import org.dllearner.algorithms.isle.index.Token;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;

/**
 * A {@link ContextExtractor} that uses the whole sentence containing an
 * annotated token as the disambiguation context: the document is split into
 * sentences with Stanford CoreNLP, and the words of the first sentence that
 * contains the annotation's first token are returned.
 *
 * @author Lorenz Buehmann
 */
public class SentenceBasedContextExtractor implements ContextExtractor {

    /** Minimal CoreNLP pipeline (tokenizer + sentence splitter only). */
    private final StanfordCoreNLP pipeline;

    /**
     * Creates the extractor with a CoreNLP pipeline restricted to tokenization
     * and sentence splitting, which is all this extractor needs.
     */
    public SentenceBasedContextExtractor() {
        Properties props = new Properties();
        // setProperty instead of raw put(): keeps the Properties string-typed
        props.setProperty("annotators", "tokenize, ssplit");
        pipeline = new StanfordCoreNLP(props);
    }

    /* (non-Javadoc)
     * @see org.dllearner.algorithms.isle.wsd.ContextExtractor#extractContext(java.lang.String, java.lang.String)
     */
    @Override
    public List<String> extractContext(org.dllearner.algorithms.isle.index.Annotation annotation) {
        // split the referenced document's raw text into sentences
        List<CoreMap> sentences = getSentences(annotation.getReferencedDocument().getRawContent());

        // return the words of the first sentence containing the annotation's
        // first token (exact surface-form match, same as the original check)
        String target = annotation.getTokens().get(0).getRawForm();
        for (CoreMap sentence : sentences) {
            List<String> words = getWords(sentence);
            if (words.contains(target)) {
                return words;
            }
        }
        throw new RuntimeException("Token " + annotation.getString() + " not found in text " + annotation.getReferencedDocument().getRawContent());
    }

    /**
     * Returns the surface forms of all tokens of the given sentence, in order.
     *
     * @param sentence a CoreNLP sentence carrying a {@link TokensAnnotation}
     * @return the token texts of the sentence
     */
    private List<String> getWords(CoreMap sentence) {
        List<String> words = new ArrayList<>();
        for (CoreLabel label : sentence.get(TokensAnnotation.class)) {
            // TextAnnotation holds the token's surface text
            words.add(label.get(TextAnnotation.class));
        }
        return words;
    }

    /**
     * Splits the given document text into sentences using the CoreNLP pipeline.
     *
     * @param document the raw document text
     * @return the sentences as CoreNLP {@link CoreMap}s
     */
    private List<CoreMap> getSentences(String document) {
        // create an empty Annotation just with the given text
        Annotation annotation = new Annotation(document);

        // run all Annotators on this text
        pipeline.annotate(annotation);

        // these are all the sentences in this document
        // a CoreMap is essentially a Map that uses class objects as keys and
        // has values with custom types
        List<CoreMap> sentences = annotation.get(SentencesAnnotation.class);

        return sentences;
    }

    /** Small manual smoke test: extracts the sentence around "American". */
    public static void main(String[] args) throws Exception {
        String s = "International Business Machines Corporation, or IBM, is an American multinational services technology and consulting corporation, with headquarters in Armonk, New York, United States. IBM manufactures and markets computer hardware and software,"
                + " and offers infrastructure, hosting and consulting services in areas ranging from mainframe computers to nanotechnology.";

        SentenceBasedContextExtractor extractor = new SentenceBasedContextExtractor();
        List<String> context = extractor.extractContext(new org.dllearner.algorithms.isle.index.Annotation(TextDocumentGenerator.getInstance().generateDocument(s), Arrays.asList(new Token("American"))));
        System.out.println(context);
    }

}