001package org.dllearner.algorithms.isle;
002
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.regex.Pattern;

import org.dllearner.algorithms.isle.index.TextDocument;
import org.dllearner.algorithms.isle.index.Token;

import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.trees.CollinsHeadFinder;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.util.CoreMap;
022
023public class TextDocumentGenerator {
024
025        private static TextDocumentGenerator instance;
026        
027        private StanfordCoreNLP pipeline;
028        private StanfordCoreNLP pipelineSimple;
029        private final String punctuationPattern = "\\p{Punct}";
030        private final StopWordFilter stopWordFilter = new StopWordFilter();
031        
032        private TextDocumentGenerator(){
033                Properties props = new Properties();
034            props.put("annotators", "tokenize, ssplit");//, pos, lemma, parse");
035            pipelineSimple = new StanfordCoreNLP(props);
036            
037            props = new Properties();
038            props.put("annotators", "tokenize, ssplit, pos, lemma, parse");
039            pipeline = new StanfordCoreNLP(props);
040        }
041        
042        public static synchronized TextDocumentGenerator getInstance(){
043                if(instance == null){
044                        instance = new TextDocumentGenerator();
045                }
046                return instance;
047        }
048
049        public TextDocument generateDocument(String text) {
050                return generateDocument(text, false);
051        }
052        
053        public TextDocument generateDocument(String text, boolean determineHead) {
054                TextDocument document = new TextDocument();
055            // create an empty Annotation just with the given text
056            Annotation annotatedDocument = new Annotation(text);
057            
058            // run all Annotators on this text
059            pipeline.annotate(annotatedDocument);
060            
061            // these are all the sentences in this document
062            // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
063            List<CoreMap> sentences = annotatedDocument.get(SentencesAnnotation.class);
064            
065            for(CoreMap sentence: sentences) {
066                
067                //determine the head noun
068                String head = null;
069                if(determineHead){
070                        //if phrase only contains one single token, the task is trivial
071                        if(sentence.get(TokensAnnotation.class).size() == 1){
072                                head = sentence.get(TokensAnnotation.class).get(0).get(TextAnnotation.class);
073                        } else {
074                                Tree tree = sentence.get(TreeAnnotation.class);
075                            CollinsHeadFinder headFinder = new CollinsHeadFinder();
076//                          Tree head = headFinder.determineHead(tree);
077//                          System.out.println(sentence);
078//                          System.out.println(tree.headTerminal(headFinder));
079                            head = tree.headTerminal(headFinder).toString();
080                            
081//                          // Create a reusable pattern object 
082//                          TregexPattern patternMW = TregexPattern.compile("__ >># NP"); 
083//                          // Run the pattern on one particular tree 
084//                          TregexMatcher matcher = patternMW.matcher(tree); 
085//                          // Iterate over all of the subtrees that matched 
086//                          while (matcher.findNextMatchingNode()) { 
087//                            Tree match = matcher.getMatch(); 
088//                            // do what we want to with the subtree 
089//                          }
090                        }
091                }
092           
093                for (CoreLabel label: sentence.get(TokensAnnotation.class)) {
094                        // this is the text of the token
095                    String word = label.get(TextAnnotation.class);
096                    // this is the POS tag of the token
097                    String pos = label.get(PartOfSpeechAnnotation.class);
098                    //this is the POS tag of the token
099                    String lemma = label.get(LemmaAnnotation.class);
100                    //check if token is punctuation
101                    boolean isPunctuation = word.matches(punctuationPattern) 
102                                || (pos != null && (pos.equalsIgnoreCase("-lrb-") || pos.equalsIgnoreCase("-rrb-")))
103                                || word.startsWith("'")
104                                ;
105                    //check if it is a stop word
106                    boolean isStopWord = stopWordFilter.isStopWord(word.toLowerCase());
107                   
108                    Token token = new Token(word, lemma, pos, isPunctuation, isStopWord);
109                    
110                    if(determineHead && word.equals(head)){
111                        token.setIsHead(true);
112                    }
113                    
114                    document.add(token);
115                  }
116            }
117                
118                return document;
119        }
120        
121        public List<String> generateDocumentSimple(String text) {
122                List<String> tokens = new ArrayList<>();
123                
124            // create an empty Annotation just with the given text
125            Annotation annotatedDocument = new Annotation(text);
126            
127            // run all Annotators on this text
128            pipelineSimple.annotate(annotatedDocument);
129            
130            // these are all the sentences in this document
131            // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
132            List<CoreMap> sentences = annotatedDocument.get(SentencesAnnotation.class);
133            
134            for(CoreMap sentence: sentences) {
135                
136           
137                for (CoreLabel label: sentence.get(TokensAnnotation.class)) {
138                        // this is the text of the token
139                    String word = label.get(TextAnnotation.class);
140                    
141                    tokens.add(word);
142                }
143            }
144                
145                return tokens;
146        }
147        
148        public static void main(String[] args) throws Exception {
149                TextDocument document = TextDocumentGenerator.getInstance().generateDocument("And he said, Amos, what seest thou? And I said, A basket of summer fruit. Then said the LORD unto me, The end is come upon my people of Israel; I will not again pass by them any more. ");
150                System.out.println(document);
151        }
152}