Source code

001package org.dllearner.algorithms.isle.index;
002
003import java.util.ArrayList;
004import java.util.LinkedList;
005import java.util.List;
006
007import org.dllearner.algorithms.isle.TextDocumentGenerator;
008
009/**
010 * A simple text document without further formatting or markup.
011 *
012 * @author Daniel Fleischhacker
013 */
014public class TextDocument extends LinkedList<Token> implements Document {
015    public static void main(String[] args) {
016        String s = "This is a very long, nice text for testing our new implementation of TextDocument.";
017        TextDocument doc = TextDocumentGenerator.getInstance().generateDocument(s);
018
019        System.out.println(doc.getRawContent());
020    }
021
022    @Override
023    public String getContent() {
024        return getContentStartingAtToken(this.getFirst(), SurfaceFormLevel.STEMMED);
025    }
026
027    @Override
028    public String getRawContent() {
029        return getContentStartingAtToken(this.getFirst(), SurfaceFormLevel.RAW);
030    }
031
032    @Override
033    public String getPOSTaggedContent() {
034        return getContentStartingAtToken(this.getFirst(), SurfaceFormLevel.POS_TAGGED);
035    }
036
037    /**
038     * Returns a string containing all tokens starting at the token {@code start} until the end of the list. The
039     * surface forms according to {@code level} are used to build the string.
040     *
041     * @param start token to start building the string at, i.e., the first token in the returned string
042     * @param l     level of surface forms to use
043     * @return built string
044     */
045    public String getContentStartingAtToken(Token start, SurfaceFormLevel l) {
046        StringBuilder sb = new StringBuilder();
047        boolean found = false;
048        for (Token t : this) {
049            if (found) {
050                sb.append(" ");
051                String surfaceForm = getStringForLevel(t, l);
052                if (surfaceForm != null) {
053                    sb.append(surfaceForm);
054                }
055            }
056            else if (t == start) {
057                found = true;
058                sb.append(getStringForLevel(t, l));
059            }
060        }
061
062        return sb.toString();
063    }
064
065    /**
066     * Returns a list containing {@code numberOfTokens} successive tokens from this document starting at the given start
067     * token. If {@code ignorePunctuation} is set, tokens which represent punctuation are added to the result but not
068     * counted for the number of tokens.
069     *
070     * @param start             token to start collecting tokens from the document
071     * @param numberOfTokens    number of tokens to collect from the document
072     * @param ignorePunctuation if true, punctuation are not counted towards the number of tokens to return
073     * @return list containing the given number of relevant tokens, depending in the value of ignorePunctuation, the
074     *          list might contain additional non-relevant (punctuation) tokens
075     */
076    public List<Token> getTokensStartingAtToken(Token start, int numberOfTokens, boolean ignorePunctuation) {
077        ArrayList<Token> tokens = new ArrayList<>();
078
079        int relevantTokens = 0;
080        boolean found = false;
081
082        for (Token t : this) {
083            if (found) {
084                tokens.add(t);
085                if (!ignorePunctuation || !t.isPunctuation()) {
086                    relevantTokens++;
087                }
088            }
089            else if (t == start) {
090                found = true;
091                tokens.add(t);
092            }
093            if (relevantTokens == numberOfTokens) {
094                break;
095            }
096        }
097
098        return tokens;
099    }
100
101    /**
102     * Returns a list containing all successive tokens from this document starting at the given start
103     * token. If {@code ignorePunctuation} is set, tokens which represent punctuation are added to the result but not
104     * counted for the number of tokens.
105     *
106     * @param start             token to start collecting tokens from the document
107     * @param ignorePunctuation if true, punctuation are not counted towards the number of tokens to return
108     * @return list containing all relevant tokens, depending in the value of ignorePunctuation, the
109     *          list might contain additional non-relevant (punctuation) tokens
110     */
111    public List<Token> getTokensStartingAtToken(Token start, boolean ignorePunctuation) {
112        ArrayList<Token> tokens = new ArrayList<>();
113
114        int relevantTokens = 0;
115        boolean found = false;
116
117        for (Token t : this) {
118            if (found) {
119                tokens.add(t);
120                if (!ignorePunctuation || !t.isPunctuation()) {
121                    relevantTokens++;
122                }
123            }
124            else if (t == start) {
125                found = true;
126                tokens.add(t);
127            }
128        }
129
130        return tokens;
131    }
132
133    private String getStringForLevel(Token t, SurfaceFormLevel l) {
134        switch (l) {
135            case RAW:
136                return t.getRawForm();
137            case POS_TAGGED:
138                return t.getPOSTag();
139            case STEMMED:
140                return t.isPunctuation() ? null : t.getStemmedForm();
141        }
142
143        return null;
144    }
145}