edu.stanford.nlp.ling.BasicDocument.<init> java code examples

/**
 * Creates a new Document for the given text by delegating to
 * {@code BasicDocument.init(text, keepOriginalText)}. Subclasses may wish to
 * extract additional information from the text and/or return another
 * document subclass with additional meta-data.
 *
 * @param text the raw text to build the document from
 * @return the BasicDocument returned by {@code init}
 */
protected BasicDocument<L> parseDocumentText(String text) {
 // No throwaway instance is created first: init is invoked statically here,
 // so a freshly constructed, unreferenced BasicDocument would never be used.
 return BasicDocument.init(text, keepOriginalText);
}

/**
 * For internal debugging purposes only. Builds BasicDocument instances from a
 * String, a Reader, a temporary File and a URL, and hands each one to
 * printState. Any exception is caught and its stack trace is printed.
 *
 * @param args command-line arguments (not read)
 */
public static void main(String[] args) {
 try {
  printState(BasicDocument.init("this is the text", "this is the title [String]", true));
  printState(BasicDocument.init(new StringReader("this is the text"), "this is the title [Reader]", true));
  File f = File.createTempFile("BasicDocumentTestFile", null);
  f.deleteOnExit();
  // try-with-resources closes (and therefore flushes) the writer on every
  // exit path, so the temp file is complete before it is read back below
  try (PrintWriter out = new PrintWriter(new FileWriter(f))) {
   out.print("this is the text");
  }
  printState(new BasicDocument<String>().init(f, "this is the title [File]", true));
  printState(new BasicDocument<String>().init(new URL("http://www.stanford.edu/~jsmarr/BasicDocumentTestFile.txt"), "this is the title [URL]", true));
 } catch (Exception e) {
  e.printStackTrace();
 }
}

 /**
  * For internal debugging purposes only. Builds a document from a hard-coded
  * HTML-like string and prints it, runs it through a StripTagsProcessor and
  * prints the result, then runs that result through a WordToSentenceProcessor
  * and prints the sentences.
  *
  * @param args command-line arguments (not read)
  */
 public static void main(String[] args) {
  // init is called statically, so no throwaway BasicDocument is constructed first
  Document<String, Word, Word> htmlDoc = BasicDocument.init("top text <h1>HEADING text</h1> this is <p>new paragraph<br>next line<br/>xhtml break etc.");
  System.out.println("Before:");
  System.out.println(htmlDoc);
  Document<String, Word, Word> txtDoc = new StripTagsProcessor<String, Word>(true).processDocument(htmlDoc);
  System.out.println("After:");
  System.out.println(txtDoc);
  Document<String, Word, List<Word>> sentences = new WordToSentenceProcessor<Word>().processDocument(txtDoc);
  System.out.println("Sentences:");
  System.out.println(sentences);
 }
}

/**
 * Creates a new BasicDocument from the given text contents and title.
 * The title is applied via {@code setTitle}, and the text is tokenized using
 * {@link #parse(String)} to populate the list of words ("" is parsed if text
 * is null). If requested, a reference to the original text is kept on the
 * document; otherwise the original text is set to null.
 *
 * @param text the text to tokenize; may be null
 * @param title the title passed to {@code setTitle}
 * @param keepOriginalText whether the document should retain {@code text}
 * @return the newly created and populated BasicDocument
 */
public static <L> BasicDocument<L> init(String text, String title, boolean keepOriginalText) {
 final BasicDocument<L> doc = new BasicDocument<>();
 doc.setTitle(title);
 // keep the raw text only on request; otherwise clear it explicitly
 doc.originalText = keepOriginalText ? text : null;
 // a null text is tokenized as the empty string
 final String toParse = (text != null) ? text : "";
 doc.parse(toParse);
 return doc;
}

Document<HasWord, Word, Word> d;
if (filename.startsWith("http://")) {
 Document<HasWord, Word, Word> dpre = new BasicDocument<HasWord>().init(new URL(filename));
 DocumentProcessor<Word, Word, HasWord, Word> notags = new StripTagsProcessor<>();
 d = notags.processDocument(dpre);
} else {
 d = new BasicDocument<HasWord>().init(new File(filename));

if (urlOrFile.startsWith("http://") || urlOrFile.endsWith(".htm") || urlOrFile.endsWith(".html")) {
 Document<Object, Word, Word> docPre = new BasicDocument<>().init(new URL(urlOrFile));
 DocumentProcessor<Word, Word, Object, Word> noTags = new StripTagsProcessor<>();
 doc = noTags.processDocument(docPre);
} else {
 doc = new BasicDocument<>(this.getTokenizerFactory()).init(new InputStreamReader(new FileInputStream(filename), encoding));

 Document<String, Word, Word> dpre = new BasicDocument<String>(WhitespaceTokenizer.factory()).init(new URL(filename));
 DocumentProcessor<Word, Word, String, Word> notags = new StripTagsProcessor<>();
 d = notags.processDocument(dpre);
} else {
 d = new BasicDocument<String>(WhitespaceTokenizer.factory()).init(new File(filename));

 bd = ErasureUtils.<BasicDocument<L>>uncheckedCast(getClass().newInstance());
} catch (Exception e) {
 bd = new BasicDocument<>();

/**
 * Creates a new Document for the given text by delegating to
 * {@code BasicDocument.init(text, keepOriginalText)}. Subclasses may wish to
 * extract additional information from the text and/or return another
 * document subclass with additional meta-data.
 *
 * @param text the raw text to build the document from
 * @return the BasicDocument returned by {@code init}
 */
protected BasicDocument<L> parseDocumentText(String text) {
 // No throwaway instance is created first: init is invoked statically here,
 // so a freshly constructed, unreferenced BasicDocument would never be used.
 return BasicDocument.init(text, keepOriginalText);
}

/**
 * Creates a new Document for the given text by delegating to
 * {@code BasicDocument.init(text, keepOriginalText)}. Subclasses may wish to
 * extract additional information from the text and/or return another
 * document subclass with additional meta-data.
 *
 * @param text the raw text to build the document from
 * @return the BasicDocument returned by {@code init}
 */
protected BasicDocument<L> parseDocumentText(String text) {
 // No throwaway instance is created first: init is invoked statically here,
 // so a freshly constructed, unreferenced BasicDocument would never be used.
 return BasicDocument.init(text, keepOriginalText);
}

/**
 * Creates a new Document for the given text by delegating to
 * {@code BasicDocument.init(text, keepOriginalText)}. Subclasses may wish to
 * extract additional information from the text and/or return another
 * document subclass with additional meta-data.
 *
 * @param text the raw text to build the document from
 * @return the BasicDocument returned by {@code init}
 */
protected BasicDocument<L> parseDocumentText(String text) {
 // No throwaway instance is created first: init is invoked statically here,
 // so a freshly constructed, unreferenced BasicDocument would never be used.
 return BasicDocument.init(text, keepOriginalText);
}

/**
 * Creates a new Document for the given text by delegating to
 * {@code BasicDocument.init(text, keepOriginalText)}. Subclasses may wish to
 * extract additional information from the text and/or return another
 * document subclass with additional meta-data.
 *
 * @param text the raw text to build the document from
 * @return the BasicDocument returned by {@code init}
 */
protected BasicDocument<L> parseDocumentText(String text) {
 // No throwaway instance is created first: init is invoked statically here,
 // so a freshly constructed, unreferenced BasicDocument would never be used.
 return BasicDocument.init(text, keepOriginalText);
}

/**
 * For internal debugging purposes only. Builds BasicDocument instances from a
 * String, a Reader, a temporary File and a URL, and hands each one to
 * printState. Any exception is caught and its stack trace is printed.
 *
 * @param args command-line arguments (not read)
 */
public static void main(String[] args) {
 try {
  printState(BasicDocument.init("this is the text", "this is the title [String]", true));
  printState(BasicDocument.init(new StringReader("this is the text"), "this is the title [Reader]", true));
  File f = File.createTempFile("BasicDocumentTestFile", null);
  f.deleteOnExit();
  // try-with-resources closes (and therefore flushes) the writer on every
  // exit path, so the temp file is complete before it is read back below
  try (PrintWriter out = new PrintWriter(new FileWriter(f))) {
   out.print("this is the text");
  }
  printState(new BasicDocument<String>().init(f, "this is the title [File]", true));
  printState(new BasicDocument<String>().init(new URL("http://www.stanford.edu/~jsmarr/BasicDocumentTestFile.txt"), "this is the title [URL]", true));
 } catch (Exception e) {
  e.printStackTrace();
 }
}

 /**
  * For internal debugging purposes only. Builds a document from a hard-coded
  * HTML-like string and prints it, runs it through a StripTagsProcessor and
  * prints the result, then runs that result through a WordToSentenceProcessor
  * and prints the sentences.
  *
  * @param args command-line arguments (not read)
  */
 public static void main(String[] args) {
  // init is called statically, so no throwaway BasicDocument is constructed first
  Document<String, Word, Word> htmlDoc = BasicDocument.init("top text <h1>HEADING text</h1> this is <p>new paragraph<br>next line<br/>xhtml break etc.");
  System.out.println("Before:");
  System.out.println(htmlDoc);
  Document<String, Word, Word> txtDoc = new StripTagsProcessor<String, Word>(true).processDocument(htmlDoc);
  System.out.println("After:");
  System.out.println(txtDoc);
  Document<String, Word, List<Word>> sentences = new WordToSentenceProcessor<Word>().processDocument(txtDoc);
  System.out.println("Sentences:");
  System.out.println(sentences);
 }
}

 /**
  * For internal debugging purposes only. Builds a document from a hard-coded
  * HTML-like string and prints it, runs it through a StripTagsProcessor and
  * prints the result, then runs that result through a WordToSentenceProcessor
  * and prints the sentences.
  *
  * @param args command-line arguments (not read)
  */
 public static void main(String[] args) {
  // init is called statically, so no throwaway BasicDocument is constructed first
  Document<String, Word, Word> htmlDoc = BasicDocument.init("top text <h1>HEADING text</h1> this is <p>new paragraph<br>next line<br/>xhtml break etc.");
  System.out.println("Before:");
  System.out.println(htmlDoc);
  Document<String, Word, Word> txtDoc = new StripTagsProcessor<String, Word>(true).processDocument(htmlDoc);
  System.out.println("After:");
  System.out.println(txtDoc);
  Document<String, Word, List<Word>> sentences = new WordToSentenceProcessor<Word>().processDocument(txtDoc);
  System.out.println("Sentences:");
  System.out.println(sentences);
 }
}

 /**
  * For internal debugging purposes only. Builds a document from a hard-coded
  * HTML-like string and prints it, runs it through a StripTagsProcessor and
  * prints the result, then runs that result through a WordToSentenceProcessor
  * and prints the sentences.
  *
  * @param args command-line arguments (not read)
  */
 public static void main(String[] args) {
  // init is called statically, so no throwaway BasicDocument is constructed first
  Document<String, Word, Word> htmlDoc = BasicDocument.init("top text <h1>HEADING text</h1> this is <p>new paragraph<br>next line<br/>xhtml break etc.");
  System.out.println("Before:");
  System.out.println(htmlDoc);
  Document<String, Word, Word> txtDoc = new StripTagsProcessor<String, Word>(true).processDocument(htmlDoc);
  System.out.println("After:");
  System.out.println(txtDoc);
  Document<String, Word, List<Word>> sentences = new WordToSentenceProcessor<Word>().processDocument(txtDoc);
  System.out.println("Sentences:");
  System.out.println(sentences);
 }
}

 /**
  * For internal debugging purposes only. Builds a document from a hard-coded
  * HTML-like string and prints it, runs it through a StripTagsProcessor and
  * prints the result, then runs that result through a WordToSentenceProcessor
  * and prints the sentences.
  *
  * @param args command-line arguments (not read)
  */
 public static void main(String[] args) {
  // init is called statically, so no throwaway BasicDocument is constructed first
  Document<String, Word, Word> htmlDoc = BasicDocument.init("top text <h1>HEADING text</h1> this is <p>new paragraph<br>next line<br/>xhtml break etc.");
  System.out.println("Before:");
  System.out.println(htmlDoc);
  Document<String, Word, Word> txtDoc = new StripTagsProcessor<String, Word>(true).processDocument(htmlDoc);
  System.out.println("After:");
  System.out.println(txtDoc);
  Document<String, Word, List<Word>> sentences = new WordToSentenceProcessor<Word>().processDocument(txtDoc);
  System.out.println("Sentences:");
  System.out.println(sentences);
 }
}

/**
 * For internal debugging purposes only. Builds BasicDocument instances from a
 * String, a Reader, a temporary File and a URL, and hands each one to
 * printState. Any exception is caught and its stack trace is printed.
 *
 * @param args command-line arguments (not read)
 */
public static void main(String[] args) {
 try {
  printState(BasicDocument.init("this is the text", "this is the title [String]", true));
  printState(BasicDocument.init(new StringReader("this is the text"), "this is the title [Reader]", true));
  File f = File.createTempFile("BasicDocumentTestFile", null);
  f.deleteOnExit();
  // try-with-resources closes (and therefore flushes) the writer on every
  // exit path, so the temp file is complete before it is read back below
  try (PrintWriter out = new PrintWriter(new FileWriter(f))) {
   out.print("this is the text");
  }
  printState(new BasicDocument<String>().init(f, "this is the title [File]", true));
  printState(new BasicDocument<String>().init(new URL("http://www.stanford.edu/~jsmarr/BasicDocumentTestFile.txt"), "this is the title [URL]", true));
 } catch (Exception e) {
  e.printStackTrace();
 }
}

/**
 * For internal debugging purposes only. Builds BasicDocument instances from a
 * String, a Reader, a temporary File and a URL, and hands each one to
 * printState. Any exception is caught and its stack trace is printed.
 *
 * @param args command-line arguments (not read)
 */
public static void main(String[] args) {
 try {
  printState(BasicDocument.init("this is the text", "this is the title [String]", true));
  printState(BasicDocument.init(new StringReader("this is the text"), "this is the title [Reader]", true));
  File f = File.createTempFile("BasicDocumentTestFile", null);
  f.deleteOnExit();
  // try-with-resources closes (and therefore flushes) the writer on every
  // exit path, so the temp file is complete before it is read back below
  try (PrintWriter out = new PrintWriter(new FileWriter(f))) {
   out.print("this is the text");
  }
  printState(new BasicDocument<String>().init(f, "this is the title [File]", true));
  printState(new BasicDocument<String>().init(new URL("http://www.stanford.edu/~jsmarr/BasicDocumentTestFile.txt"), "this is the title [URL]", true));
 } catch (Exception e) {
  e.printStackTrace();
 }
}

/**
 * For internal debugging purposes only. Builds BasicDocument instances from a
 * String, a Reader, a temporary File and a URL, and hands each one to
 * printState. Any exception is caught and its stack trace is printed.
 *
 * @param args command-line arguments (not read)
 */
public static void main(String[] args) {
 try {
  printState(BasicDocument.init("this is the text", "this is the title [String]", true));
  printState(BasicDocument.init(new StringReader("this is the text"), "this is the title [Reader]", true));
  File f = File.createTempFile("BasicDocumentTestFile", null);
  f.deleteOnExit();
  // try-with-resources closes (and therefore flushes) the writer on every
  // exit path, so the temp file is complete before it is read back below
  try (PrintWriter out = new PrintWriter(new FileWriter(f))) {
   out.print("this is the text");
  }
  printState(new BasicDocument<String>().init(f, "this is the title [File]", true));
  printState(new BasicDocument<String>().init(new URL("http://www.stanford.edu/~jsmarr/BasicDocumentTestFile.txt"), "this is the title [URL]", true));
 } catch (Exception e) {
  e.printStackTrace();
 }
}

Javadoc

Constructs a new (empty) BasicDocument using a PTBTokenizer. Call one of the init * methods to populate the document from a desired source.

Popular methods of BasicDocument

addAll
addLabel
Adds the given label to the List of labels for this Document if it is not null.
init
Initializes a new BasicDocument with the given list of words and title.
labels
Returns the complete List of labels for this Document. This is an empty collection if none have been
originalText
Returns the text originally used to construct this document, or null if there was no original text.
parse
Tokenizes the given text to populate the list of words this Document represents. The default impleme
printState
For internal debugging purposes only. Prints the state of the given BasicDocument to stderr.
setLabels
Removes all currently assigned labels for this Document then adds all of the given labels.
setTitle
Sets the title of this Document to the given title. If the given title is null, sets the title to ""
setTokenizerFactory
Sets the tokenizerFactory to be used by #parse(String). Set this tokenizer before calling one of the
title
Returns the title of this document. The title may be empty ("") but will never be null.

title

Popular in Java

Finding current android device location
getSystemService (Context)
getSharedPreferences (Context)
orElseThrow (Optional)
Return the contained value, if present, otherwise throw an exception to be created by the provided s
RandomAccessFile (java.io)
Allows reading from and writing to a file in a random-access manner. This is different from the uni-
Iterator (java.util)
An iterator over a sequence of objects, such as a collection. If a collection has been changed since
Scanner (java.util)
A parser that parses a text string of primitive types and strings with the help of regular expressio
SortedSet (java.util)
SortedSet is a Set which iterates over its elements in a sorted order. The order is determined eithe
Reference (javax.naming)
JFrame (javax.swing)
From CI to AI: The AI layer in your organization

How to use edu.stanford.nlp.ling.BasicDocument constructor

Best Java code snippets using edu.stanford.nlp.ling.BasicDocument.<init> (Showing top 20 results out of 315)

How to use
edu.stanford.nlp.ling.BasicDocument
constructor