001package org.dllearner.algorithms.isle;
002
003import java.util.List;
004import java.util.Properties;
005
006import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
007import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
008import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
009import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
010import edu.stanford.nlp.ling.CoreLabel;
011import edu.stanford.nlp.pipeline.Annotation;
012import edu.stanford.nlp.pipeline.StanfordCoreNLP;
013import edu.stanford.nlp.util.CoreMap;
014
015public class StanfordPartOfSpeechTagger {
016
017        private static StanfordPartOfSpeechTagger instance;
018        private StanfordCoreNLP pipeline;
019        
020        private StanfordPartOfSpeechTagger(){
021                Properties props = new Properties();
022            props.put("annotators", "tokenize, ssplit, pos");
023            pipeline = new StanfordCoreNLP(props);
024        }
025        
026        public static synchronized StanfordPartOfSpeechTagger getInstance(){
027                if(instance == null){
028                        instance = new StanfordPartOfSpeechTagger();
029                }
030                return instance;
031        }
032
033        public String tag(String text) {
034                String out = "";
035                
036            // create an empty Annotation just with the given text
037            Annotation document = new Annotation(text);
038            
039            // run all Annotators on this text
040            pipeline.annotate(document);
041            
042            // these are all the sentences in this document
043            // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
044            List<CoreMap> sentences = document.get(SentencesAnnotation.class);
045            
046            for(CoreMap sentence: sentences) {
047                for (CoreLabel token: sentence.get(TokensAnnotation.class)) {
048                        // this is the text of the token
049                    String word = token.get(TextAnnotation.class);
050                    // this is the POS tag of the token
051                    String pos = token.get(PartOfSpeechAnnotation.class);
052                   
053                    out += " " + word + "/" + pos;
054                  }
055            }
056                
057                return out.trim();
058        }
059}