package org.dllearner.algorithms.isle.wsd;

import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import org.dllearner.algorithms.isle.TextDocumentGenerator;
import org.dllearner.algorithms.isle.index.Token;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;

/**
 * Context extractor that returns a window of at most {@code tokensLeft}
 * tokens before and {@code tokensRight} tokens after the first occurrence
 * of an annotated token, clamped to the boundaries of the enclosing
 * sentence.
 *
 * @author Lorenz Buehmann
 */
public class WindowBasedContextExtractor implements ContextExtractor {

    private final StanfordCoreNLP pipeline;
    private final int tokensLeft;
    private final int tokensRight;

    public WindowBasedContextExtractor(int tokensLeft, int tokensRight) {
        this.tokensLeft = tokensLeft;
        this.tokensRight = tokensRight;

        // tokenization and sentence splitting are sufficient for window extraction
        Properties props = new Properties();
        props.put("annotators", "tokenize, ssplit");
        pipeline = new StanfordCoreNLP(props);
    }

    public WindowBasedContextExtractor(int tokensLeftRight) {
        this(tokensLeftRight, tokensLeftRight);
    }

    public WindowBasedContextExtractor() {
        this(10, 10);
    }

    /*
     * (non-Javadoc)
     *
     * @see org.dllearner.algorithms.isle.wsd.ContextExtractor#extractContext(org.dllearner.algorithms.isle.index.Annotation)
     */
    @Override
    public List<String> extractContext(org.dllearner.algorithms.isle.index.Annotation annotation) {
        // split the document text into sentences
        List<CoreMap> sentences = getSentences(annotation.getReferencedDocument().getRawContent());

        // find the sentence containing the first token of the annotation and
        // return the token window around its position
        Token firstToken = annotation.getTokens().get(0);
        for (CoreMap sentence : sentences) {
            List<CoreLabel> labels = sentence.get(TokensAnnotation.class);
            for (int i = 0; i < labels.size(); i++) {
                // this is the text of the token
                String word = labels.get(i).get(TextAnnotation.class);
                if (word.equals(firstToken.getRawForm())) {
                    return extractWindow(labels, i);
                }
            }
        }
        throw new RuntimeException("Token " + annotation.getString() + " not found in text "
                + annotation.getReferencedDocument().getRawContent());
    }

    /**
     * Returns the words in a window of at most tokensLeft tokens before and
     * tokensRight tokens after the given position, clamped to the sentence
     * boundaries.
     */
    private List<String> extractWindow(List<CoreLabel> labels, int position) {
        int from = Math.max(0, position - tokensLeft);
        int to = Math.min(labels.size(), position + tokensRight + 1);
        List<String> context = new ArrayList<>(to - from);
        for (CoreLabel label : labels.subList(from, to)) {
            context.add(label.get(TextAnnotation.class));
        }
        return context;
    }

    private List<CoreMap> getSentences(String document) {
        // create an empty Annotation just with the given text
        Annotation annotation = new Annotation(document);

        // run the tokenizer and sentence splitter on the text
        pipeline.annotate(annotation);

        // these are all the sentences in this document;
        // a CoreMap is essentially a Map that uses class objects as keys and
        // has values with custom types
        return annotation.get(SentencesAnnotation.class);
    }
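
    // Illustrative convenience overload, not part of the original class: a
    // minimal sketch showing how the same window logic applies when only a
    // raw token form and a plain text are available, without constructing an
    // Annotation first. The method name and signature are assumptions added
    // for illustration.
    public List<String> extractContext(String tokenForm, String text) {
        for (CoreMap sentence : getSentences(text)) {
            List<CoreLabel> labels = sentence.get(TokensAnnotation.class);
            for (int i = 0; i < labels.size(); i++) {
                if (labels.get(i).get(TextAnnotation.class).equals(tokenForm)) {
                    return extractWindow(labels, i);
                }
            }
        }
        throw new RuntimeException("Token " + tokenForm + " not found in text " + text);
    }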

    public static void main(String[] args) throws Exception {
        String s = "International Business Machines Corporation, or IBM, is an American multinational services technology and consulting corporation, with headquarters in Armonk, New York, United States. IBM manufactures and markets computer hardware and software,"
                + " and offers infrastructure, hosting and consulting services in areas ranging from mainframe computers to nanotechnology.";

        String token = "American";
        WindowBasedContextExtractor extractor = new WindowBasedContextExtractor();
        List<String> context = extractor.extractContext(new org.dllearner.algorithms.isle.index.Annotation(
                TextDocumentGenerator.getInstance().generateDocument(s), Arrays.asList(new Token(token))));
        System.out.println(context);
    }

}