package org.dllearner.algorithms.isle;

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import org.dllearner.algorithms.isle.index.TextDocument;
import org.dllearner.algorithms.isle.index.Token;

import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.trees.CollinsHeadFinder;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.util.CoreMap;

/**
 * Generates {@link TextDocument}s from plain text using the Stanford CoreNLP pipeline.
 * A lightweight pipeline (tokenization and sentence splitting only) is kept for simple
 * tokenization, while the full pipeline additionally provides POS tags, lemmas and parse trees.
 */
public class TextDocumentGenerator {

    private static TextDocumentGenerator instance;

    private StanfordCoreNLP pipeline;
    private StanfordCoreNLP pipelineSimple;
    private final String punctuationPattern = "\\p{Punct}";
    private final StopWordFilter stopWordFilter = new StopWordFilter();

    private TextDocumentGenerator(){
        // lightweight pipeline: tokenization and sentence splitting only
        Properties props = new Properties();
        props.put("annotators", "tokenize, ssplit");
        pipelineSimple = new StanfordCoreNLP(props);

        // full pipeline: additionally POS tagging, lemmatization and parsing
        props = new Properties();
        props.put("annotators", "tokenize, ssplit, pos, lemma, parse");
        pipeline = new StanfordCoreNLP(props);
    }

    /**
     * Returns the singleton instance, creating it (and the CoreNLP pipelines) on first access.
     */
    public static synchronized TextDocumentGenerator getInstance(){
        if(instance == null){
            instance = new TextDocumentGenerator();
        }
        return instance;
    }

    /**
     * Generates a {@link TextDocument} from the given text without head noun detection.
     */
    public TextDocument generateDocument(String text) {
        return generateDocument(text, false);
    }

    /**
     * Generates a {@link TextDocument} from the given text. If {@code determineHead} is set,
     * the head noun of each sentence is determined and the corresponding tokens are marked.
     */
    public TextDocument generateDocument(String text, boolean determineHead) {
        TextDocument document = new TextDocument();
        // create an empty Annotation just with the given text
        Annotation annotatedDocument = new Annotation(text);

        // run all Annotators on this text
        pipeline.annotate(annotatedDocument);

        // these are all the sentences in this document
        // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
        List<CoreMap> sentences = annotatedDocument.get(SentencesAnnotation.class);

        for(CoreMap sentence: sentences) {

            // determine the head noun
            String head = null;
            if(determineHead){
                // if the phrase only contains a single token, the task is trivial
                if(sentence.get(TokensAnnotation.class).size() == 1){
                    head = sentence.get(TokensAnnotation.class).get(0).get(TextAnnotation.class);
                } else {
                    Tree tree = sentence.get(TreeAnnotation.class);
                    CollinsHeadFinder headFinder = new CollinsHeadFinder();
                    head = tree.headTerminal(headFinder).toString();
                }
            }

            for (CoreLabel label: sentence.get(TokensAnnotation.class)) {
                // this is the text of the token
                String word = label.get(TextAnnotation.class);
                // this is the POS tag of the token
                String pos = label.get(PartOfSpeechAnnotation.class);
                // this is the lemma of the token
                String lemma = label.get(LemmaAnnotation.class);
                // check whether the token is punctuation (incl. bracket tags and leading apostrophes)
                boolean isPunctuation = word.matches(punctuationPattern)
                        || (pos != null && (pos.equalsIgnoreCase("-lrb-") || pos.equalsIgnoreCase("-rrb-")))
                        || word.startsWith("'");
                // check whether it is a stop word
                boolean isStopWord = stopWordFilter.isStopWord(word.toLowerCase());

                Token token = new Token(word, lemma, pos, isPunctuation, isStopWord);

                // mark the token if it is the head noun of the sentence
                if(determineHead && word.equals(head)){
                    token.setIsHead(true);
                }

                document.add(token);
            }
        }

        return document;
    }

    /**
     * Tokenizes the given text using the lightweight pipeline and returns the plain token strings.
     */
    public List<String> generateDocumentSimple(String text) {
        List<String> tokens = new ArrayList<>();

        // create an empty Annotation just with the given text
        Annotation annotatedDocument = new Annotation(text);

        // run all Annotators on this text
        pipelineSimple.annotate(annotatedDocument);

        // these are all the sentences in this document
        // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
        List<CoreMap> sentences = annotatedDocument.get(SentencesAnnotation.class);

        for(CoreMap sentence: sentences) {
            for (CoreLabel label: sentence.get(TokensAnnotation.class)) {
                // this is the text of the token
                String word = label.get(TextAnnotation.class);

                tokens.add(word);
            }
        }

        return tokens;
    }

    public static void main(String[] args) throws Exception {
        TextDocument document = TextDocumentGenerator.getInstance().generateDocument(
                "And he said, Amos, what seest thou? And I said, A basket of summer fruit. Then said the LORD unto me, The end is come upon my people of Israel; I will not again pass by them any more. ");
        System.out.println(document);
    }
}