/**
 * Convenience overload that forwards to the two-argument {@code init},
 * passing {@code null} as the second argument.
 *
 * @param words the words handed on unchanged to the two-argument overload
 * @return whatever the two-argument {@code init} returns
 */
public BasicDocument<L> init(List<? extends Word> words) {
  final BasicDocument<L> document = init(words, null);
  return document;
}
/**
 * No-argument convenience overload; equivalent to
 * {@code init((String) null, null, true)}.
 *
 * @param <L> label type of the returned document
 * @return the result of the three-argument String overload
 */
public static <L> BasicDocument<L> init() {
  final boolean keepOriginal = true;
  return init((String) null, null, keepOriginal);
}
/**
 * Convenience overload; equivalent to
 * {@code init(text, null, keepOriginalText)}.
 *
 * @param <L> label type of the returned document
 * @param text the text forwarded to the three-argument overload
 * @param keepOriginalText forwarded unchanged to the three-argument overload
 * @return the result of the three-argument overload
 */
public static <L> BasicDocument<L> init(String text, boolean keepOriginalText) {
  final BasicDocument<L> document = init(text, null, keepOriginalText);
  return document;
}
/**
 * Convenience overload; equivalent to {@code init(textReader, null, true)}.
 *
 * @param textReader the reader forwarded to the three-argument overload
 * @return the result of the three-argument overload
 * @throws IOException if the three-argument overload throws it
 */
public BasicDocument<L> init(Reader textReader) throws IOException {
  final boolean keepOriginal = true;
  return init(textReader, null, keepOriginal);
}
/**
 * Convenience overload; equivalent to {@code init(textURL, title, true)}.
 *
 * @param textURL the URL forwarded to the three-argument overload
 * @param title the title forwarded to the three-argument overload
 * @return the result of the three-argument overload
 * @throws IOException if the three-argument overload throws it
 */
public BasicDocument<L> init(URL textURL, String title) throws IOException {
  final boolean keepOriginal = true;
  return init(textURL, title, keepOriginal);
}
/**
 * Convenience overload; equivalent to {@code init(text, title, true)}.
 *
 * @param <L> label type of the returned document
 * @param text the text forwarded to the three-argument overload
 * @param title the title forwarded to the three-argument overload
 * @return the result of the three-argument overload
 */
public static <L> BasicDocument<L> init(String text, String title) {
  final boolean keepOriginal = true;
  return init(text, title, keepOriginal);
}
/**
 * Convenience overload; equivalent to {@code init(text, null, true)}.
 *
 * @param <L> label type of the returned document
 * @param text the text forwarded to the three-argument overload
 * @return the result of the three-argument overload
 */
public static <L> BasicDocument<L> init(String text) {
  final boolean keepOriginal = true;
  return init(text, null, keepOriginal);
}
/**
 * Convenience overload; equivalent to {@code init(textReader, title, true)}.
 *
 * @param textReader the reader forwarded to the three-argument overload
 * @param title the title forwarded to the three-argument overload
 * @return the result of the three-argument overload
 * @throws IOException if the three-argument overload throws it
 */
public BasicDocument<L> init(Reader textReader, String title) throws IOException {
  final boolean keepOriginal = true;
  return init(textReader, title, keepOriginal);
}
/**
 * Convenience overload; equivalent to {@code init(textFile, title, true)}.
 *
 * @param textFile the file forwarded to the three-argument overload
 * @param title the title forwarded to the three-argument overload
 * @return the result of the three-argument overload
 * @throws IOException if the three-argument overload throws it
 */
public BasicDocument<L> init(File textFile, String title) throws IOException {
  final boolean keepOriginal = true;
  return init(textFile, title, keepOriginal);
}
/**
 * Convenience overload; equivalent to
 * {@code init(textReader, null, keepOriginalText)}.
 *
 * @param textReader the reader forwarded to the three-argument overload
 * @param keepOriginalText forwarded unchanged to the three-argument overload
 * @return the result of the three-argument overload
 * @throws IOException if the three-argument overload throws it
 */
public BasicDocument<L> init(Reader textReader, boolean keepOriginalText) throws IOException {
  final BasicDocument<L> document = init(textReader, null, keepOriginalText);
  return document;
}
/**
 * Convenience overload that uses the URL's external form as the title;
 * equivalent to {@code init(textURL, textURL.toExternalForm(), true)}.
 *
 * @param textURL the URL forwarded to the three-argument overload
 * @return the result of the three-argument overload
 * @throws IOException if the three-argument overload throws it
 */
public BasicDocument<L> init(URL textURL) throws IOException {
  final String title = textURL.toExternalForm();
  final boolean keepOriginal = true;
  return init(textURL, title, keepOriginal);
}
/**
 * Convenience overload that uses the file's canonical path as the title;
 * equivalent to {@code init(textFile, textFile.getCanonicalPath(), true)}.
 *
 * @param textFile the file forwarded to the three-argument overload
 * @return the result of the three-argument overload
 * @throws IOException if resolving the canonical path fails or the
 *         three-argument overload throws it
 */
public BasicDocument<L> init(File textFile) throws IOException {
  final String title = textFile.getCanonicalPath();
  final boolean keepOriginal = true;
  return init(textFile, title, keepOriginal);
}
/**
 * Convenience overload that uses the URL's external form as the title;
 * equivalent to
 * {@code init(textURL, textURL.toExternalForm(), keepOriginalText)}.
 *
 * @param textURL the URL forwarded to the three-argument overload
 * @param keepOriginalText forwarded unchanged to the three-argument overload
 * @return the result of the three-argument overload
 * @throws IOException if the three-argument overload throws it
 */
public BasicDocument<L> init(URL textURL, boolean keepOriginalText) throws IOException {
  final String title = textURL.toExternalForm();
  return init(textURL, title, keepOriginalText);
}
/**
 * Convenience overload that uses the file's canonical path as the title;
 * equivalent to
 * {@code init(textFile, textFile.getCanonicalPath(), keepOriginalText)}.
 *
 * @param textFile the file forwarded to the three-argument overload
 * @param keepOriginalText forwarded unchanged to the three-argument overload
 * @return the result of the three-argument overload
 * @throws IOException if resolving the canonical path fails or the
 *         three-argument overload throws it
 */
public BasicDocument<L> init(File textFile, boolean keepOriginalText) throws IOException {
  final String title = textFile.getCanonicalPath();
  return init(textFile, title, keepOriginalText);
}
/**
 * Inits a new BasicDocument by reading in the text from the given File.
 * The reader obtained from {@code DocumentReader.getReader(textFile)} is
 * closed before this method returns, whether or not reading succeeds.
 *
 * @param textFile the file to read the text from
 * @param title the title forwarded to the Reader-based overload
 * @param keepOriginalText forwarded unchanged to the Reader-based overload
 * @return the document produced by the Reader-based overload
 * @throws IOException if opening, reading or closing the reader fails
 * @see #init(String,String,boolean)
 */
public BasicDocument<L> init(File textFile, String title, boolean keepOriginalText) throws IOException {
  Reader in = DocumentReader.getReader(textFile);
  try {
    return init(in, title, keepOriginalText);
  } finally {
    // Close even when init(...) throws, so the file handle is not leaked.
    // NOTE(review): getReader is not visible here; the null guard is
    // defensive in case it can return null -- confirm.
    if (in != null) {
      in.close();
    }
  }
}
/**
 * Creates a new document for the given text. The default implementation
 * delegates to {@code BasicDocument.init(text, keepOriginalText)}, passing
 * this object's {@code keepOriginalText} setting. Subclasses may wish to
 * extract additional information from the text and/or return another
 * document subclass with additional meta-data.
 *
 * @param text the text to build the document from
 * @return the document returned by {@code BasicDocument.init}
 */
protected BasicDocument<L> parseDocumentText(String text) {
  // init is static, so no throwaway BasicDocument instance is needed.
  return BasicDocument.init(text, keepOriginalText);
}
/**
 * Inits a new BasicDocument from the text supplied by the given Reader.
 * The reader's content is obtained via {@code DocumentReader.readText} and
 * handed, together with the title and flag, to another {@code init} overload.
 *
 * @param <L> label type of the returned document
 * @param textReader the reader passed to {@code DocumentReader.readText}
 * @param title the title forwarded to the delegated overload
 * @param keepOriginalText forwarded unchanged to the delegated overload
 * @return the document produced by the delegated overload
 * @throws IOException if reading from the reader fails
 * @see #init(String,String,boolean)
 */
public static <L> BasicDocument<L> init(Reader textReader, String title, boolean keepOriginalText) throws IOException {
  final BasicDocument<L> document =
      init(DocumentReader.readText(textReader), title, keepOriginalText);
  return document;
}
/**
 * Inits a new BasicDocument by reading in the text from the given URL.
 * The reader obtained from {@code DocumentReader.getReader(textURL)} is
 * closed before this method returns, whether or not reading succeeds.
 *
 * @param textURL the URL to read the text from
 * @param title the title forwarded to the Reader-based overload
 * @param keepOriginalText forwarded unchanged to the Reader-based overload
 * @return the document produced by the Reader-based overload
 * @throws IOException if opening, reading or closing the reader fails
 * @see #init(String,String,boolean)
 */
public BasicDocument<L> init(URL textURL, String title, boolean keepOriginalText) throws IOException {
  Reader in = DocumentReader.getReader(textURL);
  try {
    return init(in, title, keepOriginalText);
  } finally {
    // Previously the reader was never closed, leaking the underlying stream.
    // The null guard keeps a null reader (if getReader can return one --
    // not visible here) behaving as it did before this change.
    if (in != null) {
      in.close();
    }
  }
}
/**
 * For internal debugging purposes only. Builds a BasicDocument from a
 * String, a Reader, a temporary File and a URL in turn, passing each
 * result to {@code printState}. Any exception is caught and its stack
 * trace printed.
 *
 * @param args ignored
 */
public static void main(String[] args) {
  try {
    final String text = "this is the text";

    // String and Reader sources.
    printState(BasicDocument.init(text, "this is the title [String]", true));
    printState(BasicDocument.init(new StringReader(text), "this is the title [Reader]", true));

    // File source: write the sample text to a temp file that is removed on JVM exit.
    File tempFile = File.createTempFile("BasicDocumentTestFile", null);
    tempFile.deleteOnExit();
    PrintWriter writer = new PrintWriter(new FileWriter(tempFile));
    writer.print(text);
    writer.flush();
    writer.close();
    printState(new BasicDocument<String>().init(tempFile, "this is the title [File]", true));

    // URL source.
    printState(new BasicDocument<String>().init(
        new URL("http://www.stanford.edu/~jsmarr/BasicDocumentTestFile.txt"),
        "this is the title [URL]",
        true));
  } catch (Exception e) {
    e.printStackTrace();
  }
}
/**
 * For internal debugging purposes only. Builds a document from a small
 * HTML snippet, runs it through a {@code StripTagsProcessor} and then a
 * {@code WordToSentenceProcessor}, printing the document at each stage.
 *
 * @param args ignored
 */
public static void main(String[] args) {
  // BasicDocument.init is static; no throwaway instance is required.
  Document<String, Word, Word> htmlDoc = BasicDocument.init("top text <h1>HEADING text</h1> this is <p>new paragraph<br>next line<br/>xhtml break etc.");
  System.out.println("Before:");
  System.out.println(htmlDoc);

  Document<String, Word, Word> txtDoc = new StripTagsProcessor<String, Word>(true).processDocument(htmlDoc);
  System.out.println("After:");
  System.out.println(txtDoc);

  Document<String, Word, List<Word>> sentences = new WordToSentenceProcessor<Word>().processDocument(txtDoc);
  System.out.println("Sentences:");
  System.out.println(sentences);
}
}