new InputStreamReader(System.in, "UTF-8")); WhitespaceTokenizer<Word> tokenizer = new WhitespaceTokenizer<>(new WordTokenFactory(), reader, eolIsSignificant); PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8"), true); while (tokenizer.hasNext()) { Word w = tokenizer.next(); if (w.value().equals(WhitespaceLexer.NEWLINE)) { pw.println("***CR***");
/**
 * Return a tokenizer which might be suitable for tokenizing text that
 * will be used with this Treebank/Language pair, without tokenizing
 * carriage returns (i.e., treating them as white space). The
 * implementation in AbstractTreebankLanguagePack returns a factory for
 * {@link WhitespaceTokenizer}.
 *
 * @return A tokenizer factory; the {@code false} argument means newlines
 *     are NOT returned as tokens
 */
@Override public TokenizerFactory<? extends HasWord> getTokenizerFactory() { return WhitespaceTokenizer.factory(false); }
@Override public Tree next() { if (line == null) { throw new NoSuchElementException(); } Reader lineReader = new StringReader(line); line = null; List<Word> words; if (tokenized) { words = WhitespaceTokenizer.newWordWhitespaceTokenizer(lineReader).tokenize(); } else { words = PTBTokenizer.newPTBTokenizer(lineReader).tokenize(); } if (!words.isEmpty()) { // the parser throws an exception if told to parse an empty sentence. Tree parseTree = lp.apply(words); return parseTree; } else { return new SimpleTree(); } }
/**
 * Returns a tokenizer that splits the given reader's text on whitespace,
 * using this factory's token factory and its current newline setting.
 *
 * @param r Reader whose contents will be tokenized
 * @return A WhitespaceTokenizer over the reader
 */
@Override public Tokenizer<T> getTokenizer(Reader r) { return new WhitespaceTokenizer<>(factory, r, tokenizeNLs); }
/**
 * Constructs a whitespace tokenizer producing Word tokens from the given
 * reader; newlines are treated as plain whitespace (not returned as tokens).
 *
 * @param r Reader whose contents will be tokenized
 * @return A WhitespaceTokenizer over the reader
 */
public static WhitespaceTokenizer<Word> newWordWhitespaceTokenizer(Reader r) { return newWordWhitespaceTokenizer(r, false); }
/**
 * Constructs a word-segmenting tokenizer that first whitespace-tokenizes
 * the reader into CoreLabels (newlines not significant — see the
 * delegated-to constructor) and then applies the given segmenter.
 *
 * @param segmenter Segmenter applied to the whitespace-split tokens
 * @param r Reader whose contents will be tokenized
 */
public WordSegmentingTokenizer(WordSegmenter segmenter, Reader r) { this(segmenter, WhitespaceTokenizer.newCoreLabelWhitespaceTokenizer(r)); }
/**
 * Constructs a whitespace tokenizer producing Word tokens from the given
 * reader.
 *
 * @param r Reader whose contents will be tokenized
 * @param eolIsSignificant If true, end-of-line characters are returned as
 *     tokens rather than skipped as whitespace
 * @return A WhitespaceTokenizer over the reader
 */
public static WhitespaceTokenizer<Word> newWordWhitespaceTokenizer(Reader r, boolean eolIsSignificant) {
  final WordTokenFactory tokenFactory = new WordTokenFactory();
  return new WhitespaceTokenizer<>(tokenFactory, r, eolIsSignificant);
}
newWordWhitespaceTokenizer(new StringReader(input), true); String parseInside = props.getProperty("parseInside"); if (parseInside == null) parseInside = "";
/**
 * Returns a word-segmenting tokenizer over the given reader.
 *
 * @param r Reader whose contents will be tokenized
 * @param extraOptions Comma-separated options string; only "tokenizeNLs"
 *     is consulted here. May be null, in which case this factory's
 *     current newline setting is used.
 * @return A WordSegmentingTokenizer wrapping a CoreLabel whitespace
 *     tokenizer
 */
public Tokenizer<HasWord> getTokenizer(Reader r, String extraOptions) {
  boolean newlinesAreTokens = this.tokenizeNLs;
  if (extraOptions != null) {
    Properties options = StringUtils.stringToProperties(extraOptions);
    newlinesAreTokens = PropertiesUtils.getBool(options, "tokenizeNLs", this.tokenizeNLs);
  }
  return new WordSegmentingTokenizer(
      segmenter, WhitespaceTokenizer.newCoreLabelWhitespaceTokenizer(r, newlinesAreTokens));
}
new InputStreamReader(System.in, "UTF-8")); WhitespaceTokenizer<Word> tokenizer = new WhitespaceTokenizer<Word>(new WordTokenFactory(), reader, eolIsSignificant); PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8"), true); while (tokenizer.hasNext()) { Word w = tokenizer.next(); if (w.value().equals(WhitespaceLexer.NEWLINE)) { pw.println("***CR***");
/**
 * Returns a whitespace tokenizer over the given reader, optionally
 * overriding whether newlines are returned as tokens.
 *
 * @param r Reader whose contents will be tokenized
 * @param extraOptions Comma-separated options string; only "tokenizeNLs"
 *     is consulted here. May be null, in which case this factory's
 *     current newline setting is used. (Previously a null argument threw
 *     NullPointerException inside stringToProperties; the null guard
 *     matches the sibling WordSegmentingTokenizer factory's behavior.)
 * @return A WhitespaceTokenizer over the reader
 */
@Override
public Tokenizer<T> getTokenizer(Reader r, String extraOptions) {
  boolean tokenizeNewlines = this.tokenizeNLs;
  if (extraOptions != null) {
    Properties prop = StringUtils.stringToProperties(extraOptions);
    tokenizeNewlines = PropertiesUtils.getBool(prop, "tokenizeNLs", this.tokenizeNLs);
  }
  return new WhitespaceTokenizer<>(factory, r, tokenizeNewlines);
}
protected static TokenizerFactory<? extends HasWord> chooseTokenizerFactory(boolean tokenize, String tokenizerFactory, String tokenizerOptions, boolean invertible) { if (tokenize && tokenizerFactory.trim().length() != 0) { //return (TokenizerFactory<? extends HasWord>) Class.forName(getTokenizerFactory()).newInstance(); try { @SuppressWarnings({"unchecked"}) Class<TokenizerFactory<? extends HasWord>> clazz = (Class<TokenizerFactory<? extends HasWord>>) Class.forName(tokenizerFactory.trim()); Method factoryMethod = clazz.getMethod("newTokenizerFactory"); @SuppressWarnings({"unchecked"}) TokenizerFactory<? extends HasWord> factory = (TokenizerFactory<? extends HasWord>) factoryMethod.invoke(tokenizerOptions); return factory; } catch (Exception e) { throw new RuntimeException("Could not load tokenizer factory", e); } } else if (tokenize) { if (invertible) { if (tokenizerOptions.equals("")) { tokenizerOptions = "invertible=true"; } else if (!tokenizerOptions.matches("(^|.*,)invertible=true")) { tokenizerOptions += ",invertible=true"; } return PTBTokenizerFactory.newCoreLabelTokenizerFactory(tokenizerOptions); } else { return PTBTokenizerFactory.newWordTokenizerFactory(tokenizerOptions); } } else { return WhitespaceTokenizer.factory(); } }
/**
 * Parses the pending input line and returns its parse tree.
 * The stored line is cleared before parsing so the iterator advances.
 *
 * @return The parse tree for the pending line, or an empty SimpleTree
 *     if the line tokenized to no words
 * @throws NoSuchElementException If there is no pending line
 */
@Override public Tree next() {
  if (line == null) {
    throw new NoSuchElementException();
  }
  Reader lineReader = new StringReader(line);
  // clear the pending line so hasNext()/next() move on even if parsing throws
  line = null;
  List<Word> words;
  if (tokenized) {
    // input is pre-tokenized: just split on whitespace
    words = WhitespaceTokenizer.newWordWhitespaceTokenizer(lineReader).tokenize();
  } else {
    words = PTBTokenizer.newPTBTokenizer(lineReader).tokenize();
  }
  if (!words.isEmpty()) {
    // the parser throws an exception if told to parse an empty sentence.
    Tree parseTree = lp.apply(words);
    return parseTree;
  } else {
    return new SimpleTree();
  }
}
/**
 * Constructs a whitespace tokenizer producing Word tokens from the given
 * reader; newlines are treated as plain whitespace (not returned as tokens).
 *
 * @param r Reader whose contents will be tokenized
 * @return A WhitespaceTokenizer over the reader
 */
public static WhitespaceTokenizer<Word> newWordWhitespaceTokenizer(Reader r) { return newWordWhitespaceTokenizer(r, false); }
eolIsSignificant = sentDelims.contains(WhitespaceLexer.NEWLINE); tokenizer = WhitespaceTokenizer. newCoreLabelWhitespaceTokenizer(inputReader, eolIsSignificant); } else { if (eolIsSignificant) {
new InputStreamReader(System.in, "UTF-8")); WhitespaceTokenizer<Word> tokenizer = new WhitespaceTokenizer<>(new WordTokenFactory(), reader, eolIsSignificant); PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8"), true); while (tokenizer.hasNext()) { Word w = tokenizer.next(); if (w.value().equals(WhitespaceLexer.NEWLINE)) { pw.println("***CR***");
/**
 * Constructs a whitespace tokenizer producing CoreLabel tokens from the
 * given reader; newlines are treated as plain whitespace (the {@code false}
 * argument means they are not returned as tokens).
 *
 * @param r Reader whose contents will be tokenized
 * @return A WhitespaceTokenizer over the reader
 */
public static WhitespaceTokenizer<CoreLabel> newCoreLabelWhitespaceTokenizer(Reader r) { return new WhitespaceTokenizer<>(new CoreLabelTokenFactory(), r, false); }
Document<String, Word, Word> dpre = new BasicDocument<String>(WhitespaceTokenizer.factory()).init(new URL(filename)); DocumentProcessor<Word, Word, String, Word> notags = new StripTagsProcessor<>(); d = notags.processDocument(dpre); } else { d = new BasicDocument<String>(WhitespaceTokenizer.factory()).init(new File(filename));
@Override public Tree next() { if (line == null) { throw new NoSuchElementException(); } Reader lineReader = new StringReader(line); line = null; List<Word> words; if (tokenized) { words = WhitespaceTokenizer.newWordWhitespaceTokenizer(lineReader).tokenize(); } else { words = PTBTokenizer.newPTBTokenizer(lineReader).tokenize(); } if (!words.isEmpty()) { // the parser throws an exception if told to parse an empty sentence. Tree parseTree = lp.apply(words); return parseTree; } else { return new SimpleTree(); } }
/**
 * Constructs a whitespace tokenizer producing Word tokens from the given
 * reader; newlines are treated as plain whitespace (not returned as tokens).
 *
 * @param r Reader whose contents will be tokenized
 * @return A WhitespaceTokenizer over the reader
 */
public static WhitespaceTokenizer<Word> newWordWhitespaceTokenizer(Reader r) { return newWordWhitespaceTokenizer(r, false); }