/**
 * Builds a {@code Word} label whose text is the supplied string.
 *
 * @param word the string handed to the {@code Word} constructor
 * @return the freshly created {@code Word}, typed as a {@code Label}
 */
public Label newLabelFromString(String word) {
  Label created = new Word(word);
  return created;
}
/**
 * Wraps the given string in a new {@code Word} and returns it as a
 * {@code Label}.
 *
 * @param word the text for the new {@code Word}
 * @return the new label
 */
public Label newLabel(String word) {
  Label fresh = new Word(word);
  return fresh;
}
/**
 * Creates a {@code Word} token from a string and a span given as
 * start plus length.
 *
 * @param str the token text
 * @param begin the start value passed to the {@code Word} constructor
 * @param length the span length; the end value passed to the constructor
 *     is {@code begin + length}
 * @return the new {@code Word}
 */
@Override
public Word makeToken(String str, int begin, int length) {
  final int end = begin + length;
  return new Word(str, begin, end);
}
/**
 * Wraps the given string in a new {@code Word}. The {@code options}
 * argument is accepted but not used by this method.
 *
 * @param word the text for the new {@code Word}
 * @param options ignored
 * @return the new label
 */
public Label newLabel(String word, int options) {
  Label fresh = new Word(word);
  return fresh;
}
/**
 * Creates a new {@code Word} from an existing {@code Label} by passing
 * it to the {@code Word(Label)} constructor. Which parts of the old label
 * carry over is decided by that constructor.
 *
 * @param oldLabel the label the new {@code Word} is built from
 * @return the new {@code Word}, typed as a {@code Label}
 */
public Label newLabel(Label oldLabel) {
  Label copy = new Word(oldLabel);
  return copy;
}
/**
 * Adds some extremely common words to the stoplist.
 *
 * Each entry of the local word array is wrapped in a {@code Word} and
 * added to {@code wordSet}.
 */
private void addGenericWords() {
  String[] genericWords = {"a", "an", "the", "and", "or", "but", "nor"};
  // Iterate over the array itself rather than fixed index bounds so that
  // no entry (in particular the first one, "a") is skipped.
  for (String generic : genericWords) {
    wordSet.add(new Word(generic));
  }
}
/**
 * Tests whether the stoplist holds the given word.
 *
 * @param word the string to look up; it is wrapped in a {@code Word}
 *     before the lookup in {@code wordSet}
 * @return true if {@code wordSet} contains the corresponding {@code Word}
 */
public boolean contains(String word) {
  Word probe = new Word(word);
  return wordSet.contains(probe);
}
/** {@inheritDoc} */
@Override
public List<Word> defaultTestSentence() {
  // Fixed six-token sample sentence, one Word per token.
  String[] tokens = {"This", "is", "just", "a", "test", "."};
  List<Word> sentence = new ArrayList<>();
  for (int i = 0; i < tokens.length; i++) {
    sentence.add(new Word(tokens[i]));
  }
  return sentence;
}
private static ArrayList<Word> addLast(ArrayList<? extends Word> s) { ArrayList<Word> s2 = new ArrayList<>(s); //s2.add(new StringLabel(Lexicon.BOUNDARY)); s2.add(new Word(Lexicon.BOUNDARY)); return s2; }
/** * Create a Sentence as a list of {@code Word} objects from * an array of String objects. * * @param words The words to make it from * @return The Sentence */ //TODO wsg2010: This should be deprecated in favor of the method below with new labels public static ArrayList<Word> toUntaggedList(String... words) { ArrayList<Word> sent = new ArrayList<>(); for (String str : words) { sent.add(new Word(str)); } return sent; }
/**
 * Converts a list of arbitrary objects into a list of {@code Word}
 * objects, using each element's {@code toString()} as the word text.
 *
 * @param wList the objects to convert; elements must be non-null because
 *     {@code toString()} is invoked on each of them
 * @return a new list with one {@code Word} per element, in order
 */
private static ArrayList<Word> wordify(List<?> wList) {
  // An unbounded wildcard accepts any list a raw-typed parameter did,
  // without triggering raw-type warnings.
  ArrayList<Word> s = new ArrayList<>();
  for (Object obj : wList) {
    s.add(new Word(obj.toString()));
  }
  return s;
}
/** * Create an ArrayList as a list of {@code Word} from a * list of {@code String}. * * @param lex a list whose items are of type {@code String} and * are the words * @return The Sentence */ //TODO wsg2010: This should be deprecated in favor of the method below with new labels public static ArrayList<Word> toUntaggedList(List<String> lex) { ArrayList<Word> sent = new ArrayList<>(); for (String str : lex) { sent.add(new Word(str)); } return sent; }
/**
 * Fetches the next string from the lexer and wraps it in a {@code Word}.
 *
 * @return the next {@code Word}, or {@code null} when the lexer returns
 *     {@code null}
 * @throws IOException if the lexer's {@code next()} call throws it
 */
public Word next() throws IOException {
  String token = lexer.next();
  return (token == null) ? null : new Word(token);
}
/**
 * Wraps each string in a {@code Word} and hands the resulting list to
 * {@code parse}.
 *
 * @param lst the tokens of the sentence, in order
 * @return the tree returned by {@code parse} for those words
 */
public Tree parseStrings(List<String> lst) {
  List<Word> sentence = new ArrayList<>();
  for (String token : lst) {
    Word wrapped = new Word(token);
    sentence.add(wrapped);
  }
  return parse(sentence);
}
/**
 * Constructs a new stoplist from the contents of a file. It is
 * assumed that the file contains stopwords, one on a line.
 * The stopwords need not be in any order.
 *
 * @param list the file to read stopwords from
 * @throws RuntimeException wrapping the {@code IOException} if the file
 *     cannot be opened or read
 */
public StopList(File list) {
  wordSet = Generics.newHashSet();
  // try-with-resources guarantees the reader is closed, also on failure.
  try (BufferedReader reader = new BufferedReader(new FileReader(list))) {
    String line;
    // readLine() returning null is the end-of-stream signal; ready() only
    // reports whether a read would block, so it is not a reliable EOF test
    // and could let a null line through.
    while ((line = reader.readLine()) != null) {
      wordSet.add(new Word(line));
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
/**
 * Stems the text of {@code w} and wraps the result in a new {@code Word}.
 *
 * @param w the word whose {@code word()} text is stemmed
 * @return a new {@code Word} holding the stemmed text
 */
public Word stem(Word w) {
  String surface = w.word();
  return new Word(stem(surface));
}
/**
 * Stems the value of {@code w} and wraps the result in a new {@code Word}.
 *
 * @param w the word whose {@code value()} text is stemmed
 * @return a new {@code Word} holding the stemmed text
 */
public Word stem(Word w) {
  String surface = w.value();
  return new Word(stem(surface));
}
/**
 * Tokenizes via {@code tokenizeToWordTokens()} and converts the result
 * into an array of {@code Word} objects. Each {@code Word} is built from
 * the token's {@code getWord()}, {@code getStart()} and {@code getEnd()}
 * values.
 *
 * @return one {@code Word} per token, in token order
 */
public Word[] tokenizeToWords() {
  List<WordToken> toks = tokenizeToWordTokens();
  Word[] words = new Word[toks.size()];
  int idx = 0;
  for (WordToken tok : toks) {
    words[idx++] = new Word(tok.getWord(), tok.getStart(), tok.getEnd());
  }
  return words;
}
/**
 * Appends the words at the leaves under this node to {@code y},
 * visiting children in order.
 *
 * @param y the list to append to; it is modified in place
 * @return the same list {@code y}
 */
public ArrayList<Word> yieldWords(ArrayList<Word> y) {
  if (!isLeaf()) {
    // Internal node: let every child contribute its own leaves.
    for (Tree child : children()) {
      child.yieldWords(y);
    }
    return y;
  }
  // Leaf: contribute a Word built from this node's label.
  y.add(new Word(label()));
  return y;
}
/**
 * Segments a string into words. The steps are: build the segmentation
 * lattice, run max-match segmentation, post-process the sentence, run the
 * result through a {@code CTPPostProcessor}, and split its output on
 * whitespace. Intermediate results are reported through
 * {@code printlnErr}.
 *
 * @param s the string to segment
 * @return a new list with one {@code Word} per whitespace-separated piece
 *     of the post-processor output
 */
public List<HasWord> segment(String s) {
  buildSegmentationLattice(s);

  ArrayList<Word> rawSegmentation = maxMatchSegmentation();
  printlnErr("raw output: "+ SentenceUtils.listToString(rawSegmentation));

  ArrayList<Word> cleaned = postProcessSentence(rawSegmentation);
  printlnErr("processed output: "+ SentenceUtils.listToString(cleaned));

  ChineseStringUtils.CTPPostProcessor ctp = new ChineseStringUtils.CTPPostProcessor();
  // NOTE(review): this passes the list's toString() form, not the
  // SentenceUtils.listToString form used for logging above; confirm
  // that this is intended.
  String ctpOutput = ctp.postProcessingAnswer(cleaned.toString(), false);
  printlnErr("Sighan2005 output: "+ctpOutput);

  List<HasWord> segmented = new ArrayList<>();
  for (String piece : ctpOutput.split("\\s+")) {
    segmented.add(new Word(piece));
  }
  return segmented;
}