001package org.dllearner.algorithms.isle.index;
002
003/**
004 * Interface for classes representing documents.
005 *
006 * @author Daniel Fleischhacker
007 */
008public interface Document {
009    /**
010     * Returns the cleaned content of this document represented as a string. This returns the cleaned content,
011     * thus markup and other structure is removed. The raw content can be retrieved using {@link #getRawContent}.
012     * Methods for retrieving more specialized content formats might be implemented by the actual implementations.
013     *
014     * @return this document's text content
015     */
016    String getContent();
017
018    /**
019     * Returns the uncleaned content, i.e., as originally retrieved, of this document represented as string.
020     *
021     * @return uncleaned content of this document
022     */
023    String getRawContent();
024    
025    /**
026     * Returns the uncleaned content with POS tags in form of word1/pos1 word2/pos2 ... as string.
027     *
028     * @return uncleaned content with POS tags
029     */
030    String getPOSTaggedContent();
031}