/**
 * Returns a "pretty" version of the words in this Document suitable for
 * display. The default implementation returns each of the words in this
 * Document separated by spaces. Specifically, each element that implements
 * {@link HasWord} has its {@link HasWord#word} printed, and other elements
 * are skipped.
 *
 * <p>Subclasses that maintain additional information may wish to override
 * this method.
 *
 * @return the words of this Document joined by single spaces
 */
public String presentableText() {
  StringBuilder sb = new StringBuilder();
  for (Word cur : this) {
    if (sb.length() > 0) {
      sb.append(' ');  // separator only between words, not before the first
    }
    sb.append(cur.word());
  }
  return sb.toString();
}
/**
 * Formats an array of tokens for debugging: each token's text followed by
 * its character offsets as {@code word{begin, end}}, separated by spaces.
 *
 * @param tokens the tokens to render
 * @return a space-separated rendering of all tokens with their offsets
 */
public static String tokensToString(Word[] tokens) {
  StringBuilder sb = new StringBuilder(512);
  for (int i = 0; i < tokens.length; i++) {
    if (i > 0) {
      sb.append(' ');
    }
    Word token = tokens[i];
    // Append pieces individually; concatenating with '+' inside append()
    // builds a throwaway intermediate String on every iteration, which
    // defeats the point of using a StringBuilder.
    sb.append(token.word())
      .append('{')
      .append(token.beginPosition())
      .append(", ")
      .append(token.endPosition())
      .append('}');
  }
  return sb.toString();
}
/** Stems {@code w} and returns the stemmed {@code Word}. */
public Word stem(Word w) {
  String stemmed = stem(w.word());
  return new Word(stemmed);
}
// NOTE(review): fragment — the enclosing method is not visible in this view,
// and the braces opened here are closed elsewhere.
boolean justInsertedNewline = false; // to prevent contiguous newlines
// Scan the input tokens; a token of the form "<...>" is treated as a
// markup tag (presumably SGML/XML — confirm against the full method) and,
// when markLineBreaks is set, may trigger insertion of a line break.
for (Word w : in) {
  String ws = w.word();
  if (ws.startsWith("<") && ws.endsWith(">")) {
    if (markLineBreaks && !justInsertedNewline) {
/** Return the tokens using PTB tokenizer. * * @param str String to tokenize * @return List of tokens */ private String[] ptbTokenize(String str) { // todo [cdm 2017]: Someday should generalize this to allow use of other tokenizers if (ptbFactory==null) { ptbFactory = PTBTokenizer.factory(); } Tokenizer<Word> tokenizer = ptbFactory.getTokenizer(new StringReader(str)); List<Word> words = tokenizer.tokenize(); String[] res = new String[words.size()]; for (int i = 0, sz = words.size(); i < sz; i++) { res[i] = words.get(i).word(); } return res; }
/**
 * Test program demonstrating the Stemmer. With {@code -file <name>} the
 * named file is tokenized with a PTB tokenizer and each token is stemmed;
 * otherwise each command-line argument is stemmed directly. Results are
 * written to standard output, space-separated. Note that the word stemmed
 * is expected to be in lower case: forcing lower case must be done outside
 * the Stemmer class.
 *
 * Usage: Stemmer file-name file-name ...
 *
 * @param args either {@code -file <filename>} or a list of words to stem
 * @throws IOException if the input file cannot be opened or read
 */
public static void main(String[] args) throws IOException {
  Stemmer s = new Stemmer();
  if (args[0].equals("-file")) {
    // try-with-resources: the original leaked the FileInputStream/Reader.
    try (InputStreamReader reader =
             new InputStreamReader(new FileInputStream(args[1]), "utf-8")) {
      Iterator<Word> it = PTBTokenizer.newPTBTokenizer(reader);
      while (it.hasNext()) {
        Word token = it.next();
        System.out.print(s.stem(token.word()));
        System.out.print(' ');
      }
    }
  } else {
    for (String arg : args) {
      System.out.print(s.stem(arg));
      System.out.print(' ');
    }
  }
  System.out.println();
}
// NOTE(review): fragment — the enclosing method and loop body continuation
// are not visible in this view.
int numAdded = 0;
// Drain the tokenizer; each token's surface text is processed below.
while (tok.hasNext()) {
  String s = tok.next().word();
// NOTE(review): fragment — enclosing loop/method not visible. Appears to walk
// a transition backwards through a DFSA: prepend the transition's input word
// (skipping bare single-space tokens) and step to the transition's source state.
DFSAState<Word, Integer> fromState = tr.getSource();
Word word = tr.getInput();
if (!word.word().equals(" ")) segmentedWords.add(0, word);
i = fromState.stateID();
// NOTE(review): fragment — the loop's exit (presumably a break when the
// iterator is exhausted) lies outside the visible span.
// Prints the sentence's words separated by single spaces.
for (; ;) {
  Word word = (Word) sentIter.next();
  pw.print(word.word());
  if (sentIter.hasNext()) {
    pw.print(" ");
import edu.stanford.nlp.ling.Word;

// NOTE(review): example snippet, not compilable as-is — "List<Word> words = ..."
// is a placeholder, and "args(1)" looks like Scala call syntax; in Java this
// would presumably be args[1]. Confirm against the original context before use.
List<Word> words = ...
for (Word word : words) {
  if (word.word().equals(args(1))) {
    System.err.println("Yes!");
  }
}
/**
 * Tokenizes the given string and returns the surface text of each token.
 *
 * @param string the text to tokenize
 * @return a list containing each token's text, in order
 */
public List<String> tokenizeString(String string) {
  List<String> result = new ArrayList<String>();
  for (Word token : tokenize(string)) {
    result.add(token.word());
  }
  return result;
}
/**
 * Returns a "pretty" version of the words in this Document suitable for
 * display. The default implementation returns each of the words in this
 * Document separated by spaces. Specifically, each element that implements
 * {@link HasWord} has its {@link HasWord#word} printed, and other elements
 * are skipped.
 *
 * <p>Subclasses that maintain additional information may wish to override
 * this method.
 *
 * @return the words of this Document joined by single spaces
 */
public String presentableText() {
  StringBuilder sb = new StringBuilder();
  for (Word cur : this) {
    if (sb.length() > 0) {
      sb.append(' ');  // separator only between words, not before the first
    }
    sb.append(cur.word());
  }
  return sb.toString();
}
/**
 * Returns a "pretty" version of the words in this Document suitable for
 * display. The default implementation returns each of the words in this
 * Document separated by spaces. Specifically, each element that implements
 * {@link HasWord} has its {@link HasWord#word} printed, and other elements
 * are skipped.
 *
 * <p>Subclasses that maintain additional information may wish to override
 * this method.
 *
 * @return the words of this Document joined by single spaces
 */
public String presentableText() {
  StringBuilder sb = new StringBuilder();
  for (Word cur : this) {
    if (sb.length() > 0) {
      sb.append(' ');  // separator only between words, not before the first
    }
    sb.append(cur.word());
  }
  return sb.toString();
}
/**
 * Tokenizes a sentence with the PTB tokenizer, prepending the CoNLL-09 ROOT
 * marker so the result aligns with 1-based token indexing.
 *
 * @param sentence the sentence to tokenize
 * @return an array whose element 0 is the ROOT marker, followed by the tokens
 */
@Override
public String[] tokenize(String sentence) {
  Reader reader = new StringReader(sentence);
  PTBTokenizer<Word> tokenizer = PTBTokenizer.newPTBTokenizer(reader);
  List<String> pieces = new ArrayList<String>();
  while (tokenizer.hasNext()) {
    pieces.add(tokenizer.next().word());
  }
  String[] result = new String[pieces.size() + 1];
  result[0] = is2.io.CONLLReader09.ROOT;
  int pos = 1;
  for (String piece : pieces) {
    result[pos++] = piece;
  }
  return result;
}
/**
 * Tokenizes a sentence with the PTB tokenizer; element 0 of the result is
 * the CoNLL-09 ROOT marker, followed by the token texts in order.
 *
 * @param sentence the sentence to tokenize
 * @return the ROOT marker plus the tokens
 */
@Override
public String[] tokenize(String sentence) {
  PTBTokenizer<Word> tokenizer =
      PTBTokenizer.newPTBTokenizer(new StringReader(sentence));
  List<String> texts = new ArrayList<>();
  while (tokenizer.hasNext()) {
    texts.add(tokenizer.next().word());
  }
  String[] out = new String[texts.size() + 1];
  out[0] = is2.io.CONLLReader09.ROOT;
  for (int i = 0; i < texts.size(); i++) {
    out[i + 1] = texts.get(i);
  }
  return out;
}
/** Return the tokens using PTB tokenizer. * * @param str String to tokenize * @return List of tokens */ private String[] ptbTokenize(String str) { // todo [cdm 2017]: Someday should generalize this to allow use of other tokenizers if (ptbFactory==null) { ptbFactory = PTBTokenizer.factory(); } Tokenizer<Word> tokenizer = ptbFactory.getTokenizer(new StringReader(str)); List<Word> words = tokenizer.tokenize(); String[] res = new String[words.size()]; for (int i = 0, sz = words.size(); i < sz; i++) { res[i] = words.get(i).word(); } return res; }
/**
 * Returns the next segmented word, pulling and segmenting whole tokens from
 * the underlying tokenizer on demand; returns null once input is exhausted
 * or a null token text is encountered.
 */
@Override
protected Word getNext() {
  // Refill the per-token word iterator until it has something to yield.
  while (wordIter == null || !wordIter.hasNext()) {
    if (!tok.hasNext()) {
      return null;
    }
    String text = tok.next().word();
    if (text == null) {
      return null;
    }
    wordIter = segmentWords(text).iterator();
  }
  return wordIter.next();
}
@Override public StringInText[] tokenizeplus(String sentence) { Reader r = new StringReader(sentence); List<StringInText> l = new ArrayList<>(); for (String s : tokenize(sentence)) { Word w = new Word(s); l.add(new StringInText(w.word(), w.beginPosition() + startpos, w .endPosition() + startpos)); } StringInText[] tok = new StringInText[l.size()]; // tok[0]=new StringInText(is2.io.CONLLReader09.ROOT,0,0); int i = 0; for (StringInText s : l) tok[i++] = s; startpos += (1 + sentence.length()); return tok; } }
/**
 * Tokenizes a sentence with the PTB tokenizer, returning each token together
 * with its character span shifted by the running offset {@code startpos};
 * element 0 is the CoNLL-09 ROOT marker with a zero-length span. As a side
 * effect, advances {@code startpos} past this sentence.
 *
 * @param sentence the sentence to tokenize
 * @return the ROOT marker followed by the offset-shifted tokens
 */
public StringInText[] tokenizeplus(String sentence) {
  Reader reader = new StringReader(sentence);
  PTBTokenizer<Word> tokenizer = PTBTokenizer.newPTBTokenizer(reader);
  List<StringInText> spans = new ArrayList<>();
  while (tokenizer.hasNext()) {
    Word w = tokenizer.next();
    spans.add(new StringInText(w.word(),
        w.beginPosition() + startpos, w.endPosition() + startpos));
  }
  StringInText[] result = new StringInText[spans.size() + 1];
  result[0] = new StringInText(is2.io.CONLLReader09.ROOT, 0, 0);
  int pos = 1;
  for (StringInText s : spans) {
    result[pos++] = s;
  }
  startpos += (1 + sentence.length());
  return result;
}