new InputStreamReader(System.in, "UTF-8")); WhitespaceTokenizer<Word> tokenizer = new WhitespaceTokenizer<>(new WordTokenFactory(), reader, eolIsSignificant); PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8"), true); while (tokenizer.hasNext()) { Word w = tokenizer.next(); if (w.value().equals(WhitespaceLexer.NEWLINE)) { pw.println("***CR***");
/**
 * Return a tokenizer which might be suitable for tokenizing text that
 * will be used with this Treebank/Language pair, without tokenizing
 * carriage returns (i.e., treating them as white space). The
 * implementation in AbstractTreebankLanguagePack returns a factory for
 * {@link WhitespaceTokenizer}.
 *
 * @return A tokenizer factory; the {@code false} argument means newlines
 *     are NOT returned as tokens
 */
@Override public TokenizerFactory<? extends HasWord> getTokenizerFactory() { return WhitespaceTokenizer.factory(false); }
@Override public Tree next() { if (line == null) { throw new NoSuchElementException(); } Reader lineReader = new StringReader(line); line = null; List<Word> words; if (tokenized) { words = WhitespaceTokenizer.newWordWhitespaceTokenizer(lineReader).tokenize(); } else { words = PTBTokenizer.newPTBTokenizer(lineReader).tokenize(); } if (!words.isEmpty()) { // the parser throws an exception if told to parse an empty sentence. Tree parseTree = lp.apply(words); return parseTree; } else { return new SimpleTree(); } }
/**
 * Returns a tokenizer that splits the given reader's text on whitespace,
 * using this factory's token factory and its current newline setting.
 *
 * @param r Reader whose contents will be tokenized
 * @return A WhitespaceTokenizer over the reader
 */
@Override public Tokenizer<T> getTokenizer(Reader r) { return new WhitespaceTokenizer<>(factory, r, tokenizeNLs); }
/**
 * Constructs a whitespace tokenizer producing Word tokens from the given
 * reader; newlines are treated as plain whitespace (not returned as tokens).
 *
 * @param r Reader whose contents will be tokenized
 * @return A WhitespaceTokenizer over the reader
 */
public static WhitespaceTokenizer<Word> newWordWhitespaceTokenizer(Reader r) { return newWordWhitespaceTokenizer(r, false); }
/**
 * Constructs a word-segmenting tokenizer that first whitespace-tokenizes
 * the reader into CoreLabels (newlines not significant — see the
 * delegated-to constructor) and then applies the given segmenter.
 *
 * @param segmenter Segmenter applied to the whitespace-split tokens
 * @param r Reader whose contents will be tokenized
 */
public WordSegmentingTokenizer(WordSegmenter segmenter, Reader r) { this(segmenter, WhitespaceTokenizer.newCoreLabelWhitespaceTokenizer(r)); }
/**
 * Constructs a whitespace tokenizer producing Word tokens from the given
 * reader.
 *
 * @param r Reader whose contents will be tokenized
 * @param eolIsSignificant If true, end-of-line characters are returned as
 *     tokens rather than skipped as whitespace
 * @return A WhitespaceTokenizer over the reader
 */
public static WhitespaceTokenizer<Word> newWordWhitespaceTokenizer(Reader r, boolean eolIsSignificant) {
  final WordTokenFactory tokenFactory = new WordTokenFactory();
  return new WhitespaceTokenizer<>(tokenFactory, r, eolIsSignificant);
}
newWordWhitespaceTokenizer(new StringReader(input), true); String parseInside = props.getProperty("parseInside"); if (parseInside == null) parseInside = "";
/**
 * Returns a word-segmenting tokenizer over the given reader.
 *
 * @param r Reader whose contents will be tokenized
 * @param extraOptions Comma-separated options string; only "tokenizeNLs"
 *     is consulted here. May be null, in which case this factory's
 *     current newline setting is used.
 * @return A WordSegmentingTokenizer wrapping a CoreLabel whitespace
 *     tokenizer
 */
public Tokenizer<HasWord> getTokenizer(Reader r, String extraOptions) {
  boolean newlinesAreTokens = this.tokenizeNLs;
  if (extraOptions != null) {
    Properties options = StringUtils.stringToProperties(extraOptions);
    newlinesAreTokens = PropertiesUtils.getBool(options, "tokenizeNLs", this.tokenizeNLs);
  }
  return new WordSegmentingTokenizer(
      segmenter, WhitespaceTokenizer.newCoreLabelWhitespaceTokenizer(r, newlinesAreTokens));
}
new InputStreamReader(System.in, "UTF-8")); WhitespaceTokenizer<Word> tokenizer = new WhitespaceTokenizer<Word>(new WordTokenFactory(), reader, eolIsSignificant); PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8"), true); while (tokenizer.hasNext()) { Word w = tokenizer.next(); if (w.value().equals(WhitespaceLexer.NEWLINE)) { pw.println("***CR***");
/**
 * Returns a whitespace tokenizer over the given reader, optionally
 * overriding whether newlines are returned as tokens.
 *
 * @param r Reader whose contents will be tokenized
 * @param extraOptions Comma-separated options string; only "tokenizeNLs"
 *     is consulted here. May be null, in which case this factory's
 *     current newline setting is used. (Previously a null argument threw
 *     NullPointerException inside stringToProperties; the null guard
 *     matches the sibling WordSegmentingTokenizer factory's behavior.)
 * @return A WhitespaceTokenizer over the reader
 */
@Override
public Tokenizer<T> getTokenizer(Reader r, String extraOptions) {
  boolean tokenizeNewlines = this.tokenizeNLs;
  if (extraOptions != null) {
    Properties prop = StringUtils.stringToProperties(extraOptions);
    tokenizeNewlines = PropertiesUtils.getBool(prop, "tokenizeNLs", this.tokenizeNLs);
  }
  return new WhitespaceTokenizer<>(factory, r, tokenizeNewlines);
}
protected static TokenizerFactory<? extends HasWord> chooseTokenizerFactory(boolean tokenize, String tokenizerFactory, String tokenizerOptions, boolean invertible) { if (tokenize && tokenizerFactory.trim().length() != 0) { //return (TokenizerFactory<? extends HasWord>) Class.forName(getTokenizerFactory()).newInstance(); try { @SuppressWarnings({"unchecked"}) Class<TokenizerFactory<? extends HasWord>> clazz = (Class<TokenizerFactory<? extends HasWord>>) Class.forName(tokenizerFactory.trim()); Method factoryMethod = clazz.getMethod("newTokenizerFactory"); @SuppressWarnings({"unchecked"}) TokenizerFactory<? extends HasWord> factory = (TokenizerFactory<? extends HasWord>) factoryMethod.invoke(tokenizerOptions); return factory; } catch (Exception e) { throw new RuntimeException("Could not load tokenizer factory", e); } } else if (tokenize) { if (invertible) { if (tokenizerOptions.equals("")) { tokenizerOptions = "invertible=true"; } else if (!tokenizerOptions.matches("(^|.*,)invertible=true")) { tokenizerOptions += ",invertible=true"; } return PTBTokenizerFactory.newCoreLabelTokenizerFactory(tokenizerOptions); } else { return PTBTokenizerFactory.newWordTokenizerFactory(tokenizerOptions); } } else { return WhitespaceTokenizer.factory(); } }
/**
 * Parses the pending input line and returns its parse tree.
 * The stored line is cleared before parsing so the iterator advances.
 *
 * @return The parse tree for the pending line, or an empty SimpleTree
 *     if the line tokenized to no words
 * @throws NoSuchElementException If there is no pending line
 */
@Override public Tree next() {
  if (line == null) {
    throw new NoSuchElementException();
  }
  Reader lineReader = new StringReader(line);
  // clear the pending line so hasNext()/next() move on even if parsing throws
  line = null;
  List<Word> words;
  if (tokenized) {
    // input is pre-tokenized: just split on whitespace
    words = WhitespaceTokenizer.newWordWhitespaceTokenizer(lineReader).tokenize();
  } else {
    words = PTBTokenizer.newPTBTokenizer(lineReader).tokenize();
  }
  if (!words.isEmpty()) {
    // the parser throws an exception if told to parse an empty sentence.
    Tree parseTree = lp.apply(words);
    return parseTree;
  } else {
    return new SimpleTree();
  }
}
/**
 * Constructs a whitespace tokenizer producing Word tokens from the given
 * reader; newlines are treated as plain whitespace (not returned as tokens).
 *
 * @param r Reader whose contents will be tokenized
 * @return A WhitespaceTokenizer over the reader
 */
public static WhitespaceTokenizer<Word> newWordWhitespaceTokenizer(Reader r) { return newWordWhitespaceTokenizer(r, false); }
eolIsSignificant = sentDelims.contains(WhitespaceLexer.NEWLINE); tokenizer = WhitespaceTokenizer. newCoreLabelWhitespaceTokenizer(inputReader, eolIsSignificant); } else { if (eolIsSignificant) {
new InputStreamReader(System.in, "UTF-8")); WhitespaceTokenizer<Word> tokenizer = new WhitespaceTokenizer<>(new WordTokenFactory(), reader, eolIsSignificant); PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8"), true); while (tokenizer.hasNext()) { Word w = tokenizer.next(); if (w.value().equals(WhitespaceLexer.NEWLINE)) { pw.println("***CR***");
/**
 * Constructs a whitespace tokenizer producing CoreLabel tokens from the
 * given reader; newlines are treated as plain whitespace (the {@code false}
 * argument means they are not returned as tokens).
 *
 * @param r Reader whose contents will be tokenized
 * @return A WhitespaceTokenizer over the reader
 */
public static WhitespaceTokenizer<CoreLabel> newCoreLabelWhitespaceTokenizer(Reader r) { return new WhitespaceTokenizer<>(new CoreLabelTokenFactory(), r, false); }
Document<String, Word, Word> dpre = new BasicDocument<String>(WhitespaceTokenizer.factory()).init(new URL(filename)); DocumentProcessor<Word, Word, String, Word> notags = new StripTagsProcessor<>(); d = notags.processDocument(dpre); } else { d = new BasicDocument<String>(WhitespaceTokenizer.factory()).init(new File(filename));
@Override public Tree next() { if (line == null) { throw new NoSuchElementException(); } Reader lineReader = new StringReader(line); line = null; List<Word> words; if (tokenized) { words = WhitespaceTokenizer.newWordWhitespaceTokenizer(lineReader).tokenize(); } else { words = PTBTokenizer.newPTBTokenizer(lineReader).tokenize(); } if (!words.isEmpty()) { // the parser throws an exception if told to parse an empty sentence. Tree parseTree = lp.apply(words); return parseTree; } else { return new SimpleTree(); } }
/**
 * Constructs a whitespace tokenizer producing Word tokens from the given
 * reader; newlines are treated as plain whitespace (not returned as tokens).
 *
 * @param r Reader whose contents will be tokenized
 * @return A WhitespaceTokenizer over the reader
 */
public static WhitespaceTokenizer<Word> newWordWhitespaceTokenizer(Reader r) { return newWordWhitespaceTokenizer(r, false); }