001package org.dllearner.algorithms.isle.index; 002 003/** 004 * Interface for classes representing documents. 005 * 006 * @author Daniel Fleischhacker 007 */ 008public interface Document { 009 /** 010 * Returns the cleaned content of this document represented as a string. This returns the cleaned content, 011 * thus markup and other structure is removed. The raw content can be retrieved using {@link #getRawContent}. 012 * Methods for retrieving more specialized content formats might be implemented by the actual implementations. 013 * 014 * @return this document's text content 015 */ 016 String getContent(); 017 018 /** 019 * Returns the uncleaned content, i.e., as originally retrieved, of this document represented as string. 020 * 021 * @return uncleaned content of this document 022 */ 023 String getRawContent(); 024 025 /** 026 * Returns the uncleaned content with POS tags in form of word1/pos1 word2/pos2 ... as string. 027 * 028 * @return uncleaned content with POS tags 029 */ 030 String getPOSTaggedContent(); 031}