/**
 * Creates a new Document for the given text. The default implementation
 * delegates to {@code BasicDocument.init(text, keepOriginalText)} and returns
 * its result. Subclasses may wish to extract additional information from the
 * text and/or return another document subclass with additional meta-data.
 * <p>
 * NOTE(review): which tokenizer is applied is decided inside
 * {@code BasicDocument.init}; no tokenizer is passed from here -- confirm
 * that this matches the tokenizer configured on this object.
 *
 * @param text the raw text to turn into a document
 * @return the document produced by {@code BasicDocument.init}
 */
protected BasicDocument<L> parseDocumentText(String text) {
  // init is invoked through the class name, so it cannot see a separately
  // constructed instance; the former throwaway allocation is dropped.
  return BasicDocument.init(text, keepOriginalText);
}
/**
 * For internal debugging purposes only. Builds BasicDocument instances from
 * a String, a Reader, a temporary File and a URL, and hands each one to
 * {@code printState}. Any exception raised along the way is reported with
 * {@code printStackTrace()} rather than propagated.
 * <p>
 * NOTE(review): the URL case points at a remote address, so it presumably
 * requires network access to succeed.
 *
 * @param args command-line arguments (not read)
 */
public static void main(String[] args) {
  try {
    printState(BasicDocument.init("this is the text", "this is the title [String]", true));
    printState(BasicDocument.init(new StringReader("this is the text"), "this is the title [Reader]", true));

    // Write the sample text to a temp file that is removed on JVM exit.
    File f = File.createTempFile("BasicDocumentTestFile", null);
    f.deleteOnExit();
    // try-with-resources flushes and closes the writer on every path.
    try (PrintWriter out = new PrintWriter(new FileWriter(f))) {
      out.print("this is the text");
    }

    printState(new BasicDocument<String>().init(f, "this is the title [File]", true));
    printState(new BasicDocument<String>().init(new URL("http://www.stanford.edu/~jsmarr/BasicDocumentTestFile.txt"), "this is the title [URL]", true));
  } catch (Exception e) {
    e.printStackTrace();
  }
}
/**
 * For internal debugging purposes only. Builds a document from a small HTML
 * snippet, prints it, runs it through a {@code StripTagsProcessor} and prints
 * the result, then runs that result through a {@code WordToSentenceProcessor}
 * and prints the sentences.
 *
 * @param args command-line arguments (not read)
 */
public static void main(String[] args) {
  // init is called statically, so no throwaway BasicDocument instance is
  // allocated beforehand.
  Document<String, Word, Word> htmlDoc = BasicDocument.init("top text <h1>HEADING text</h1> this is <p>new paragraph<br>next line<br/>xhtml break etc.");
  System.out.println("Before:");
  System.out.println(htmlDoc);

  Document<String, Word, Word> txtDoc = new StripTagsProcessor<String, Word>(true).processDocument(htmlDoc);
  System.out.println("After:");
  System.out.println(txtDoc);

  Document<String, Word, List<Word>> sentences = new WordToSentenceProcessor<Word>().processDocument(txtDoc);
  System.out.println("Sentences:");
  System.out.println(sentences);
}
}
/** * Inits a new BasicDocument with the given text contents and title. * The text is tokenized using {@link #parse(String)} to populate the list of words * ("" is used if text is null). If specified, a reference to the * original text is also maintained so that the text() method returns the * text given to this constructor. Returns a reference to this * BasicDocument * for convenience (so it's more like a constructor, but inherited). */ public static <L> BasicDocument<L> init(String text, String title, boolean keepOriginalText) { BasicDocument<L> basicDocument = new BasicDocument<>(); // initializes the List of labels and sets the title basicDocument.setTitle(title); // stores the original text as specified if (keepOriginalText) { basicDocument.originalText = text; } else { basicDocument.originalText = null; } // populates the words by parsing the text basicDocument.parse(text == null ? "" : text); return basicDocument; }
Document<HasWord, Word, Word> d; if (filename.startsWith("http://")) { Document<HasWord, Word, Word> dpre = new BasicDocument<HasWord>().init(new URL(filename)); DocumentProcessor<Word, Word, HasWord, Word> notags = new StripTagsProcessor<>(); d = notags.processDocument(dpre); } else { d = new BasicDocument<HasWord>().init(new File(filename));
if (urlOrFile.startsWith("http://") || urlOrFile.endsWith(".htm") || urlOrFile.endsWith(".html")) { Document<Object, Word, Word> docPre = new BasicDocument<>().init(new URL(urlOrFile)); DocumentProcessor<Word, Word, Object, Word> noTags = new StripTagsProcessor<>(); doc = noTags.processDocument(docPre); } else { doc = new BasicDocument<>(this.getTokenizerFactory()).init(new InputStreamReader(new FileInputStream(filename), encoding));
Document<String, Word, Word> dpre = new BasicDocument<String>(WhitespaceTokenizer.factory()).init(new URL(filename)); DocumentProcessor<Word, Word, String, Word> notags = new StripTagsProcessor<>(); d = notags.processDocument(dpre); } else { d = new BasicDocument<String>(WhitespaceTokenizer.factory()).init(new File(filename));
bd = ErasureUtils.<BasicDocument<L>>uncheckedCast(getClass().newInstance()); } catch (Exception e) { bd = new BasicDocument<>();
/**
 * Creates a new Document for the given text. The default implementation
 * delegates to {@code BasicDocument.init(text, keepOriginalText)} and returns
 * its result. Subclasses may wish to extract additional information from the
 * text and/or return another document subclass with additional meta-data.
 * <p>
 * NOTE(review): which tokenizer is applied is decided inside
 * {@code BasicDocument.init}; no tokenizer is passed from here -- confirm
 * that this matches the tokenizer configured on this object.
 *
 * @param text the raw text to turn into a document
 * @return the document produced by {@code BasicDocument.init}
 */
protected BasicDocument<L> parseDocumentText(String text) {
  // init is invoked through the class name, so it cannot see a separately
  // constructed instance; the former throwaway allocation is dropped.
  return BasicDocument.init(text, keepOriginalText);
}
/**
 * Creates a new Document for the given text. The default implementation
 * delegates to {@code BasicDocument.init(text, keepOriginalText)} and returns
 * its result. Subclasses may wish to extract additional information from the
 * text and/or return another document subclass with additional meta-data.
 * <p>
 * NOTE(review): which tokenizer is applied is decided inside
 * {@code BasicDocument.init}; no tokenizer is passed from here -- confirm
 * that this matches the tokenizer configured on this object.
 *
 * @param text the raw text to turn into a document
 * @return the document produced by {@code BasicDocument.init}
 */
protected BasicDocument<L> parseDocumentText(String text) {
  // init is invoked through the class name, so it cannot see a separately
  // constructed instance; the former throwaway allocation is dropped.
  return BasicDocument.init(text, keepOriginalText);
}
/**
 * Creates a new Document for the given text. The default implementation
 * delegates to {@code BasicDocument.init(text, keepOriginalText)} and returns
 * its result. Subclasses may wish to extract additional information from the
 * text and/or return another document subclass with additional meta-data.
 * <p>
 * NOTE(review): which tokenizer is applied is decided inside
 * {@code BasicDocument.init}; no tokenizer is passed from here -- confirm
 * that this matches the tokenizer configured on this object.
 *
 * @param text the raw text to turn into a document
 * @return the document produced by {@code BasicDocument.init}
 */
protected BasicDocument<L> parseDocumentText(String text) {
  // init is invoked through the class name, so it cannot see a separately
  // constructed instance; the former throwaway allocation is dropped.
  return BasicDocument.init(text, keepOriginalText);
}
/**
 * Creates a new Document for the given text. The default implementation
 * delegates to {@code BasicDocument.init(text, keepOriginalText)} and returns
 * its result. Subclasses may wish to extract additional information from the
 * text and/or return another document subclass with additional meta-data.
 * <p>
 * NOTE(review): which tokenizer is applied is decided inside
 * {@code BasicDocument.init}; no tokenizer is passed from here -- confirm
 * that this matches the tokenizer configured on this object.
 *
 * @param text the raw text to turn into a document
 * @return the document produced by {@code BasicDocument.init}
 */
protected BasicDocument<L> parseDocumentText(String text) {
  // init is invoked through the class name, so it cannot see a separately
  // constructed instance; the former throwaway allocation is dropped.
  return BasicDocument.init(text, keepOriginalText);
}
/**
 * For internal debugging purposes only. Builds BasicDocument instances from
 * a String, a Reader, a temporary File and a URL, and hands each one to
 * {@code printState}. Any exception raised along the way is reported with
 * {@code printStackTrace()} rather than propagated.
 * <p>
 * NOTE(review): the URL case points at a remote address, so it presumably
 * requires network access to succeed.
 *
 * @param args command-line arguments (not read)
 */
public static void main(String[] args) {
  try {
    printState(BasicDocument.init("this is the text", "this is the title [String]", true));
    printState(BasicDocument.init(new StringReader("this is the text"), "this is the title [Reader]", true));

    // Write the sample text to a temp file that is removed on JVM exit.
    File f = File.createTempFile("BasicDocumentTestFile", null);
    f.deleteOnExit();
    // try-with-resources flushes and closes the writer on every path.
    try (PrintWriter out = new PrintWriter(new FileWriter(f))) {
      out.print("this is the text");
    }

    printState(new BasicDocument<String>().init(f, "this is the title [File]", true));
    printState(new BasicDocument<String>().init(new URL("http://www.stanford.edu/~jsmarr/BasicDocumentTestFile.txt"), "this is the title [URL]", true));
  } catch (Exception e) {
    e.printStackTrace();
  }
}
/**
 * For internal debugging purposes only. Builds a document from a small HTML
 * snippet, prints it, runs it through a {@code StripTagsProcessor} and prints
 * the result, then runs that result through a {@code WordToSentenceProcessor}
 * and prints the sentences.
 *
 * @param args command-line arguments (not read)
 */
public static void main(String[] args) {
  // init is called statically, so no throwaway BasicDocument instance is
  // allocated beforehand.
  Document<String, Word, Word> htmlDoc = BasicDocument.init("top text <h1>HEADING text</h1> this is <p>new paragraph<br>next line<br/>xhtml break etc.");
  System.out.println("Before:");
  System.out.println(htmlDoc);

  Document<String, Word, Word> txtDoc = new StripTagsProcessor<String, Word>(true).processDocument(htmlDoc);
  System.out.println("After:");
  System.out.println(txtDoc);

  Document<String, Word, List<Word>> sentences = new WordToSentenceProcessor<Word>().processDocument(txtDoc);
  System.out.println("Sentences:");
  System.out.println(sentences);
}
}
/**
 * For internal debugging purposes only. Builds a document from a small HTML
 * snippet, prints it, runs it through a {@code StripTagsProcessor} and prints
 * the result, then runs that result through a {@code WordToSentenceProcessor}
 * and prints the sentences.
 *
 * @param args command-line arguments (not read)
 */
public static void main(String[] args) {
  // init is called statically, so no throwaway BasicDocument instance is
  // allocated beforehand.
  Document<String, Word, Word> htmlDoc = BasicDocument.init("top text <h1>HEADING text</h1> this is <p>new paragraph<br>next line<br/>xhtml break etc.");
  System.out.println("Before:");
  System.out.println(htmlDoc);

  Document<String, Word, Word> txtDoc = new StripTagsProcessor<String, Word>(true).processDocument(htmlDoc);
  System.out.println("After:");
  System.out.println(txtDoc);

  Document<String, Word, List<Word>> sentences = new WordToSentenceProcessor<Word>().processDocument(txtDoc);
  System.out.println("Sentences:");
  System.out.println(sentences);
}
}
/**
 * For internal debugging purposes only. Builds a document from a small HTML
 * snippet, prints it, runs it through a {@code StripTagsProcessor} and prints
 * the result, then runs that result through a {@code WordToSentenceProcessor}
 * and prints the sentences.
 *
 * @param args command-line arguments (not read)
 */
public static void main(String[] args) {
  // init is called statically, so no throwaway BasicDocument instance is
  // allocated beforehand.
  Document<String, Word, Word> htmlDoc = BasicDocument.init("top text <h1>HEADING text</h1> this is <p>new paragraph<br>next line<br/>xhtml break etc.");
  System.out.println("Before:");
  System.out.println(htmlDoc);

  Document<String, Word, Word> txtDoc = new StripTagsProcessor<String, Word>(true).processDocument(htmlDoc);
  System.out.println("After:");
  System.out.println(txtDoc);

  Document<String, Word, List<Word>> sentences = new WordToSentenceProcessor<Word>().processDocument(txtDoc);
  System.out.println("Sentences:");
  System.out.println(sentences);
}
}
/**
 * For internal debugging purposes only. Builds a document from a small HTML
 * snippet, prints it, runs it through a {@code StripTagsProcessor} and prints
 * the result, then runs that result through a {@code WordToSentenceProcessor}
 * and prints the sentences.
 *
 * @param args command-line arguments (not read)
 */
public static void main(String[] args) {
  // init is called statically, so no throwaway BasicDocument instance is
  // allocated beforehand.
  Document<String, Word, Word> htmlDoc = BasicDocument.init("top text <h1>HEADING text</h1> this is <p>new paragraph<br>next line<br/>xhtml break etc.");
  System.out.println("Before:");
  System.out.println(htmlDoc);

  Document<String, Word, Word> txtDoc = new StripTagsProcessor<String, Word>(true).processDocument(htmlDoc);
  System.out.println("After:");
  System.out.println(txtDoc);

  Document<String, Word, List<Word>> sentences = new WordToSentenceProcessor<Word>().processDocument(txtDoc);
  System.out.println("Sentences:");
  System.out.println(sentences);
}
}
/**
 * For internal debugging purposes only. Builds BasicDocument instances from
 * a String, a Reader, a temporary File and a URL, and hands each one to
 * {@code printState}. Any exception raised along the way is reported with
 * {@code printStackTrace()} rather than propagated.
 * <p>
 * NOTE(review): the URL case points at a remote address, so it presumably
 * requires network access to succeed.
 *
 * @param args command-line arguments (not read)
 */
public static void main(String[] args) {
  try {
    printState(BasicDocument.init("this is the text", "this is the title [String]", true));
    printState(BasicDocument.init(new StringReader("this is the text"), "this is the title [Reader]", true));

    // Write the sample text to a temp file that is removed on JVM exit.
    File f = File.createTempFile("BasicDocumentTestFile", null);
    f.deleteOnExit();
    // try-with-resources flushes and closes the writer on every path.
    try (PrintWriter out = new PrintWriter(new FileWriter(f))) {
      out.print("this is the text");
    }

    printState(new BasicDocument<String>().init(f, "this is the title [File]", true));
    printState(new BasicDocument<String>().init(new URL("http://www.stanford.edu/~jsmarr/BasicDocumentTestFile.txt"), "this is the title [URL]", true));
  } catch (Exception e) {
    e.printStackTrace();
  }
}
/**
 * For internal debugging purposes only. Builds BasicDocument instances from
 * a String, a Reader, a temporary File and a URL, and hands each one to
 * {@code printState}. Any exception raised along the way is reported with
 * {@code printStackTrace()} rather than propagated.
 * <p>
 * NOTE(review): the URL case points at a remote address, so it presumably
 * requires network access to succeed.
 *
 * @param args command-line arguments (not read)
 */
public static void main(String[] args) {
  try {
    printState(BasicDocument.init("this is the text", "this is the title [String]", true));
    printState(BasicDocument.init(new StringReader("this is the text"), "this is the title [Reader]", true));

    // Write the sample text to a temp file that is removed on JVM exit.
    File f = File.createTempFile("BasicDocumentTestFile", null);
    f.deleteOnExit();
    // try-with-resources flushes and closes the writer on every path.
    try (PrintWriter out = new PrintWriter(new FileWriter(f))) {
      out.print("this is the text");
    }

    printState(new BasicDocument<String>().init(f, "this is the title [File]", true));
    printState(new BasicDocument<String>().init(new URL("http://www.stanford.edu/~jsmarr/BasicDocumentTestFile.txt"), "this is the title [URL]", true));
  } catch (Exception e) {
    e.printStackTrace();
  }
}
/**
 * For internal debugging purposes only. Builds BasicDocument instances from
 * a String, a Reader, a temporary File and a URL, and hands each one to
 * {@code printState}. Any exception raised along the way is reported with
 * {@code printStackTrace()} rather than propagated.
 * <p>
 * NOTE(review): the URL case points at a remote address, so it presumably
 * requires network access to succeed.
 *
 * @param args command-line arguments (not read)
 */
public static void main(String[] args) {
  try {
    printState(BasicDocument.init("this is the text", "this is the title [String]", true));
    printState(BasicDocument.init(new StringReader("this is the text"), "this is the title [Reader]", true));

    // Write the sample text to a temp file that is removed on JVM exit.
    File f = File.createTempFile("BasicDocumentTestFile", null);
    f.deleteOnExit();
    // try-with-resources flushes and closes the writer on every path.
    try (PrintWriter out = new PrintWriter(new FileWriter(f))) {
      out.print("this is the text");
    }

    printState(new BasicDocument<String>().init(f, "this is the title [File]", true));
    printState(new BasicDocument<String>().init(new URL("http://www.stanford.edu/~jsmarr/BasicDocumentTestFile.txt"), "this is the title [URL]", true));
  } catch (Exception e) {
    e.printStackTrace();
  }
}