/**
 * Installs the reader that documents will subsequently be read from.
 * The supplied reader is passed through {@code getBufferedReader} and the
 * result is stored in the <tt>in</tt> field. Subclasses that do not want
 * that wrapping step can override this method and assign <tt>in</tt>
 * directly.
 *
 * @param in the reader to read documents from
 */
public void setReader(Reader in) {
  this.in = getBufferedReader(in);
}
/**
 * Opens the given URL and returns a Reader over its content.
 * The URL's stream is handed to the stream-based {@code getReader}
 * overload; the caller owns the returned Reader.
 *
 * @param url the location to read from
 * @return a Reader over the stream opened from {@code url}
 * @throws IOException if the stream cannot be opened or wrapped
 */
public static Reader getReader(URL url) throws IOException {
  return getReader(url.openStream());
}
/**
 * Produces the next document from the reader. The default implementation
 * fetches the raw text with {@link #readNextDocumentText} and hands it to
 * {@link #parseDocumentText}. Subclasses can override either of those hooks
 * (to change how a collection is split into documents, or how one
 * document's text is parsed), or replace this method entirely.
 *
 * @return the parsed document, or null when
 *     {@link #readNextDocumentText} returns null
 * @throws IOException if reading the text fails
 */
public BasicDocument<L> readDocument() throws IOException {
  final String text = readNextDocumentText();
  // a null text means there is nothing to parse
  return (text == null) ? null : parseDocumentText(text);
}
/**
 * Creates a DocumentReader over the given Reader, configured with the given
 * tokenizer factory. A non-null reader is installed through
 * {@code setReader}, so callers do not need to pre-wrap it in a
 * BufferedReader when the default {@code setReader} is in effect. This
 * class also provides several <tt>getReader</tt> methods for conveniently
 * obtaining a Reader from other input sources.
 *
 * @param in the reader to read documents from; if null, no reader is set
 * @param tokenizerFactory passed to {@code setTokenizerFactory}
 * @param keepOriginalText stored in the {@code keepOriginalText} field
 */
public DocumentReader(Reader in, TokenizerFactory<? extends HasWord> tokenizerFactory, boolean keepOriginalText) {
  // a null reader leaves the current reader untouched
  if (in != null) {
    setReader(in);
  }
  setTokenizerFactory(tokenizerFactory);
  this.keepOriginalText = keepOriginalText;
}
/**
 * Fetches the raw text of the next document. The default implementation
 * treats the whole input as a single document and returns whatever
 * {@code readText} yields for the current reader. Subclasses that read
 * several documents from one input source should instead read up to the
 * next document delimiter and return the text collected so far, returning
 * null once no text remains.
 *
 * @return the text of the next document; the default returns the result of
 *     {@code readText(in)}
 * @throws IOException if reading fails
 */
protected String readNextDocumentText() throws IOException {
  return readText(in);
}
/**
 * Creates a DocumentReader over the given Reader, configured with the given
 * tokenizer factory. A non-null reader is installed through
 * {@code setReader}, so callers do not need to pre-wrap it in a
 * BufferedReader when the default {@code setReader} is in effect. This
 * class also provides several <tt>getReader</tt> methods for conveniently
 * obtaining a Reader from other input sources.
 *
 * @param in the reader to read documents from; if null, no reader is set
 * @param tokenizerFactory passed to {@code setTokenizerFactory}
 * @param keepOriginalText stored in the {@code keepOriginalText} field
 */
public DocumentReader(Reader in, TokenizerFactory<? extends HasWord> tokenizerFactory, boolean keepOriginalText) {
  // a null reader leaves the current reader untouched
  if (in != null) {
    setReader(in);
  }
  setTokenizerFactory(tokenizerFactory);
  this.keepOriginalText = keepOriginalText;
}
/**
 * Inits a new BasicDocument from everything that can be read from the
 * given Reader. The reader's content is read via
 * {@code DocumentReader.readText} and then passed to the String-based
 * {@code init}. This method does not close the reader.
 *
 * @param textReader source of the document text
 * @param title forwarded unchanged to the String-based {@code init}
 * @param keepOriginalText forwarded unchanged to the String-based {@code init}
 * @return the document produced by the String-based {@code init}
 * @throws IOException if reading from {@code textReader} fails
 * @see #init(String,String,boolean)
 */
public static <L> BasicDocument<L> init(Reader textReader, String title, boolean keepOriginalText) throws IOException {
  String text = DocumentReader.readText(textReader);
  return init(text, title, keepOriginalText);
}
/**
 * Creates a DocumentReader over the given Reader, configured with the given
 * tokenizer factory. A non-null reader is installed through
 * {@code setReader}, so callers do not need to pre-wrap it in a
 * BufferedReader when the default {@code setReader} is in effect. This
 * class also provides several <tt>getReader</tt> methods for conveniently
 * obtaining a Reader from other input sources.
 *
 * @param in the reader to read documents from; if null, no reader is set
 * @param tokenizerFactory passed to {@code setTokenizerFactory}
 * @param keepOriginalText stored in the {@code keepOriginalText} field
 */
public DocumentReader(Reader in, TokenizerFactory<? extends HasWord> tokenizerFactory, boolean keepOriginalText) {
  // a null reader leaves the current reader untouched
  if (in != null) {
    setReader(in);
  }
  setTokenizerFactory(tokenizerFactory);
  this.keepOriginalText = keepOriginalText;
}
/**
 * Produces the next document from the reader. The default implementation
 * fetches the raw text with {@link #readNextDocumentText} and hands it to
 * {@link #parseDocumentText}. Subclasses can override either of those hooks
 * (to change how a collection is split into documents, or how one
 * document's text is parsed), or replace this method entirely.
 *
 * @return the parsed document, or null when
 *     {@link #readNextDocumentText} returns null
 * @throws IOException if reading the text fails
 */
public BasicDocument<L> readDocument() throws IOException {
  final String text = readNextDocumentText();
  // a null text means there is nothing to parse
  return (text == null) ? null : parseDocumentText(text);
}
/**
 * Inits a new BasicDocument by reading in the text from the given File.
 * The file is opened via {@code DocumentReader.getReader}, read through the
 * Reader-based {@code init}, and the reader is always closed before this
 * method returns, including when reading fails.
 *
 * @param textFile the file holding the document text
 * @param title forwarded unchanged to the Reader-based {@code init}
 * @param keepOriginalText forwarded unchanged to the Reader-based {@code init}
 * @return the document produced by the Reader-based {@code init}
 * @throws IOException if the file cannot be opened, read, or closed
 * @see #init(String,String,boolean)
 */
public BasicDocument<L> init(File textFile, String title, boolean keepOriginalText) throws IOException {
  Reader in = DocumentReader.getReader(textFile);
  try {
    return init(in, title, keepOriginalText);
  } finally {
    // release the file handle even if init(...) throws
    in.close();
  }
}
/** * Returns everything that can be read from the given Reader as a String. * Returns null if the given Reader is null. */ public static String readText(Reader in) throws IOException { // returns null if the reader is null if (in == null) { return (null); } // ensures the reader is buffered BufferedReader br = getBufferedReader(in); // reads all the chars into a buffer StringBuilder sb = new StringBuilder(16000); // make biggish int c; while ((c = br.read()) >= 0) { sb.append((char) c); } return sb.toString(); }
/**
 * Fetches the raw text of the next document. The default implementation
 * treats the whole input as a single document and returns whatever
 * {@code readText} yields for the current reader. Subclasses that read
 * several documents from one input source should instead read up to the
 * next document delimiter and return the text collected so far, returning
 * null once no text remains.
 *
 * @return the text of the next document; the default returns the result of
 *     {@code readText(in)}
 * @throws IOException if reading fails
 */
protected String readNextDocumentText() throws IOException {
  return readText(in);
}
/**
 * Creates a DocumentReader over the given Reader, configured with the given
 * tokenizer factory. A non-null reader is installed through
 * {@code setReader}, so callers do not need to pre-wrap it in a
 * BufferedReader when the default {@code setReader} is in effect. This
 * class also provides several <tt>getReader</tt> methods for conveniently
 * obtaining a Reader from other input sources.
 *
 * @param in the reader to read documents from; if null, no reader is set
 * @param tokenizerFactory passed to {@code setTokenizerFactory}
 * @param keepOriginalText stored in the {@code keepOriginalText} field
 */
public DocumentReader(Reader in, TokenizerFactory<? extends HasWord> tokenizerFactory, boolean keepOriginalText) {
  // a null reader leaves the current reader untouched
  if (in != null) {
    setReader(in);
  }
  setTokenizerFactory(tokenizerFactory);
  this.keepOriginalText = keepOriginalText;
}
/**
 * Produces the next document from the reader. The default implementation
 * fetches the raw text with {@link #readNextDocumentText} and hands it to
 * {@link #parseDocumentText}. Subclasses can override either of those hooks
 * (to change how a collection is split into documents, or how one
 * document's text is parsed), or replace this method entirely.
 *
 * @return the parsed document, or null when
 *     {@link #readNextDocumentText} returns null
 * @throws IOException if reading the text fails
 */
public BasicDocument<L> readDocument() throws IOException {
  final String text = readNextDocumentText();
  // a null text means there is nothing to parse
  return (text == null) ? null : parseDocumentText(text);
}
/**
 * Inits a new BasicDocument by reading in the text from the given URL.
 * The URL is opened via {@code DocumentReader.getReader}, read through the
 * Reader-based {@code init}, and the reader is always closed before this
 * method returns, so the underlying stream is not leaked.
 *
 * @param textURL the location holding the document text
 * @param title forwarded unchanged to the Reader-based {@code init}
 * @param keepOriginalText forwarded unchanged to the Reader-based {@code init}
 * @return the document produced by the Reader-based {@code init}
 * @throws IOException if the URL cannot be opened, read, or closed
 * @see #init(String,String,boolean)
 */
public BasicDocument<L> init(URL textURL, String title, boolean keepOriginalText) throws IOException {
  Reader in = DocumentReader.getReader(textURL);
  try {
    return init(in, title, keepOriginalText);
  } finally {
    // the reader is local to this call, so it must be released here
    in.close();
  }
}
/**
 * Installs the reader that documents will subsequently be read from.
 * The supplied reader is passed through {@code getBufferedReader} and the
 * result is stored in the <tt>in</tt> field. Subclasses that do not want
 * that wrapping step can override this method and assign <tt>in</tt>
 * directly.
 *
 * @param in the reader to read documents from
 */
public void setReader(Reader in) {
  this.in = getBufferedReader(in);
}
/**
 * Fetches the raw text of the next document. The default implementation
 * treats the whole input as a single document and returns whatever
 * {@code readText} yields for the current reader. Subclasses that read
 * several documents from one input source should instead read up to the
 * next document delimiter and return the text collected so far, returning
 * null once no text remains.
 *
 * @return the text of the next document; the default returns the result of
 *     {@code readText(in)}
 * @throws IOException if reading fails
 */
protected String readNextDocumentText() throws IOException {
  return readText(in);
}
/**
 * Creates a DocumentReader over the given Reader, configured with the given
 * tokenizer factory. A non-null reader is installed through
 * {@code setReader}, so callers do not need to pre-wrap it in a
 * BufferedReader when the default {@code setReader} is in effect. This
 * class also provides several <tt>getReader</tt> methods for conveniently
 * obtaining a Reader from other input sources.
 *
 * @param in the reader to read documents from; if null, no reader is set
 * @param tokenizerFactory passed to {@code setTokenizerFactory}
 * @param keepOriginalText stored in the {@code keepOriginalText} field
 */
public DocumentReader(Reader in, TokenizerFactory<? extends HasWord> tokenizerFactory, boolean keepOriginalText) {
  // a null reader leaves the current reader untouched
  if (in != null) {
    setReader(in);
  }
  setTokenizerFactory(tokenizerFactory);
  this.keepOriginalText = keepOriginalText;
}
/**
 * Produces the next document from the reader. The default implementation
 * fetches the raw text with {@link #readNextDocumentText} and hands it to
 * {@link #parseDocumentText}. Subclasses can override either of those hooks
 * (to change how a collection is split into documents, or how one
 * document's text is parsed), or replace this method entirely.
 *
 * @return the parsed document, or null when
 *     {@link #readNextDocumentText} returns null
 * @throws IOException if reading the text fails
 */
public BasicDocument<L> readDocument() throws IOException {
  final String text = readNextDocumentText();
  // a null text means there is nothing to parse
  return (text == null) ? null : parseDocumentText(text);
}
/**
 * Opens the given URL and returns a Reader over its content.
 * The URL's stream is handed to the stream-based {@code getReader}
 * overload; the caller owns the returned Reader.
 *
 * @param url the location to read from
 * @return a Reader over the stream opened from {@code url}
 * @throws IOException if the stream cannot be opened or wrapped
 */
public static Reader getReader(URL url) throws IOException {
  return getReader(url.openStream());
}