package org.dllearner.algorithms.isle.index;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import org.dllearner.algorithms.isle.TextDocumentGenerator;

/**
 * A simple text document without further formatting or markup.
 * <p>
 * The document is the ordered list of its tokens. Methods taking a start token locate it by reference identity
 * ({@code ==}), not by {@code equals}.
 *
 * @author Daniel Fleischhacker
 */
public class TextDocument extends LinkedList<Token> implements Document {
    public static void main(String[] args) {
        String s = "This is a very long, nice text for testing our new implementation of TextDocument.";
        TextDocument doc = TextDocumentGenerator.getInstance().generateDocument(s);

        System.out.println(doc.getRawContent());
    }

    /**
     * Returns the stemmed forms of all tokens of this document, separated by single spaces. Punctuation tokens
     * contribute no text of their own.
     *
     * @return stemmed content, or the empty string if this document contains no tokens
     */
    @Override
    public String getContent() {
        return contentAtLevel(SurfaceFormLevel.STEMMED);
    }

    /**
     * Returns the raw forms of all tokens of this document, separated by single spaces.
     *
     * @return raw content, or the empty string if this document contains no tokens
     */
    @Override
    public String getRawContent() {
        return contentAtLevel(SurfaceFormLevel.RAW);
    }

    /**
     * Returns the POS tags of all tokens of this document, separated by single spaces.
     *
     * @return POS tag content, or the empty string if this document contains no tokens
     */
    @Override
    public String getPOSTaggedContent() {
        return contentAtLevel(SurfaceFormLevel.POS_TAGGED);
    }

    /**
     * Builds the content of the whole document for the given surface form level.
     *
     * @param level level of surface forms to use
     * @return built string, or the empty string if this document contains no tokens
     */
    private String contentAtLevel(SurfaceFormLevel level) {
        // getFirst() throws NoSuchElementException on an empty list, so guard against empty documents
        if (isEmpty()) {
            return "";
        }
        return getContentStartingAtToken(getFirst(), level);
    }

    /**
     * Returns a string containing all tokens starting at the token {@code start} until the end of the list. The
     * surface forms according to {@code level} are used to build the string.
     * <p>
     * A single space is written before every token following {@code start}, even if that token has no surface
     * form at the requested level; such tokens (including {@code start} itself) contribute no text.
     *
     * @param start token to start building the string at, i.e., the first token in the returned string
     * @param l     level of surface forms to use
     * @return built string, or the empty string if {@code start} is not contained in this document
     */
    public String getContentStartingAtToken(Token start, SurfaceFormLevel l) {
        StringBuilder sb = new StringBuilder();
        boolean found = false;
        for (Token t : this) {
            if (found) {
                sb.append(" ");
            }
            else if (t == start) {
                found = true;
            }
            else {
                // still in front of the start token
                continue;
            }

            // A missing surface form must be skipped: StringBuilder.append(null) would write the text "null"
            String surfaceForm = getStringForLevel(t, l);
            if (surfaceForm != null) {
                sb.append(surfaceForm);
            }
        }

        return sb.toString();
    }

    /**
     * Returns a list containing the token {@code start} followed by successive tokens from this document until
     * {@code numberOfTokens} relevant tokens after {@code start} have been collected or the document ends. The
     * start token itself is always part of the result and is not counted. If {@code ignorePunctuation} is set,
     * tokens which represent punctuation are added to the result but not counted for the number of tokens.
     *
     * @param start             token to start collecting tokens from the document
     * @param numberOfTokens    number of relevant tokens to collect after the start token; for {@code 0} only the
     *                          start token is returned, for negative values all tokens up to the end of the
     *                          document are returned
     * @param ignorePunctuation if true, punctuation are not counted towards the number of tokens to return
     * @return list containing the start token and the given number of relevant tokens following it; depending on
     * the value of ignorePunctuation, the list might contain additional non-relevant (punctuation) tokens; empty
     * if {@code start} is not contained in this document
     */
    public List<Token> getTokensStartingAtToken(Token start, int numberOfTokens, boolean ignorePunctuation) {
        List<Token> tokens = new ArrayList<>();

        int relevantTokens = 0;
        boolean found = false;

        for (Token t : this) {
            if (found) {
                tokens.add(t);
                if (!ignorePunctuation || !t.isPunctuation()) {
                    relevantTokens++;
                }
            }
            else if (t == start) {
                found = true;
                tokens.add(t);
            }
            // Only stop once the start token has been seen; otherwise a request for zero tokens would end the
            // scan at the first token of the document, no matter where the start token is located
            if (found && relevantTokens == numberOfTokens) {
                break;
            }
        }

        return tokens;
    }

    /**
     * Returns a list containing all successive tokens from this document starting at the given start token up to
     * the end of the document.
     * <p>
     * Since every token from {@code start} on is returned, {@code ignorePunctuation} has no effect on the result;
     * punctuation tokens are always included.
     *
     * @param start             token to start collecting tokens from the document
     * @param ignorePunctuation not evaluated, kept for symmetry with
     *                          {@link #getTokensStartingAtToken(Token, int, boolean)}
     * @return list containing all tokens from {@code start} to the end of the document; empty if {@code start} is
     * not contained in this document
     */
    public List<Token> getTokensStartingAtToken(Token start, boolean ignorePunctuation) {
        List<Token> tokens = new ArrayList<>();

        boolean found = false;

        for (Token t : this) {
            if (!found && t == start) {
                found = true;
            }
            if (found) {
                tokens.add(t);
            }
        }

        return tokens;
    }

    /**
     * Returns the surface form of the given token for the given level.
     *
     * @param t token to get the surface form for
     * @param l level of the surface form to return
     * @return raw form, POS tag or stemmed form of the token; {@code null} for punctuation tokens at the
     * {@code STEMMED} level and for levels not handled here
     */
    private String getStringForLevel(Token t, SurfaceFormLevel l) {
        switch (l) {
            case RAW:
                return t.getRawForm();
            case POS_TAGGED:
                return t.getPOSTag();
            case STEMMED:
                return t.isPunctuation() ? null : t.getStemmedForm();
        }

        return null;
    }
}