/**
 * Renders a tokenized sentence as one string, with a single space between
 * consecutive words.
 * The {@code value()} of each item is printed, which gives the expected
 * short-form representation of the "sentence" over a range of cases.
 * This is equivalent to calling {@code listToString(list, true)}.
 *
 * TODO: Sentence used to be a subclass of ArrayList, with this
 * method as the toString. Therefore, there may be instances of
 * ArrayList being printed that expect this method to be used.
 *
 * @param list The tokenized sentence to print out
 * @return The tokenized sentence as a String
 */
public static <T> String listToString(List<T> list) {
  // The one-argument form always asks for just the value of each item.
  final boolean justValue = true;
  return listToString(list, justValue);
}
/**
 * Reconstructs the sentence as a string using the original text and spacing
 * from before tokenization.
 * The extra information is assumed to have been encoded in CoreLabel objects
 * for each token of the sentence, which preserve the original spacing
 * (done with "invertible=true" for PTBTokenizer).
 * The typing is deliberately loose for easier inter-operation with old code
 * that still works with a {@code List<HasWord>}.
 * This overload delegates to the two-argument form, passing {@code true} as
 * the second argument.
 *
 * @param list The sentence (List of tokens) to print out
 * @return The original sentence String, which may contain newlines or other artifacts of spacing
 */
public static <T extends HasWord> String listToOriginalTextString(List<T> list) {
  final boolean secondArgument = true;
  return listToOriginalTextString(list, secondArgument);
}
public static List<CoreLabel> StringToIOB(String str, Character segMarker) { // Whitespace tokenization List<CoreLabel> toks = SentenceUtils.toCoreLabelList(str.trim().split("\\s+")); return StringToIOB(toks, segMarker, false); }
// Read one line from infile, split it on runs of whitespace into untagged words,
// and print those words to outfile joined by SentenceUtils.listToString.
// NOTE(review): if infile.readLine() can return null (e.g. end of stream on a
// BufferedReader), the split call will throw a NullPointerException - confirm the
// surrounding code guards against that.
ArrayList<Word> sent = SentenceUtils.toUntaggedList(infile.readLine().split("\\s+")); outfile.println(SentenceUtils.listToString(sent));
public static void main(String[] args) throws Exception { if (args.length != 2) { log.info("usage: java TaggerDemo2 modelFile fileToTag"); return; } MaxentTagger tagger = new MaxentTagger(args[0]); TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep"); BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8")); PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8")); DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r); documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory); for (List<HasWord> sentence : documentPreprocessor) { List<TaggedWord> tSentence = tagger.tagSentence(sentence); pw.println(SentenceUtils.listToString(tSentence, false)); } // print the adjectives in one more sentence. This shows how to get at words and tags in a tagged sentence. List<HasWord> sent = SentenceUtils.toWordList("The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", "."); List<TaggedWord> taggedSent = tagger.tagSentence(sent); for (TaggedWord tw : taggedSent) { if (tw.tag().startsWith("JJ")) { pw.println(tw.word()); } } pw.close(); }
/**
 * Supplies a fixed six-token example sentence for the language, intended for
 * testing. The tokens are written directly as UTF-8 text in the source.
 *
 * @return The example sentence as a list of words
 */
public List<? extends HasWord> defaultTestSentence() {
  return SentenceUtils.toWordList("هو", "استنكر", "الحكومة", "يوم", "امس", ".");
}
/**
 * Converts a single sentence element to its string form by delegating to the
 * three-argument {@code wordToString}, with {@code null} as the separator.
 *
 * @param o The element to render
 * @param justValue Forwarded unchanged to the three-argument overload
 * @return The string produced by the three-argument overload
 */
public static <T> String wordToString(T o, final boolean justValue) {
  final String separator = null;
  return wordToString(o, justValue, separator);
}
/**
 * Supplies a fixed six-token example sentence for the language, intended for
 * testing. The tokens are written as Unicode escapes.
 *
 * NOTE(review): the first, second, fourth and sixth tokens are all the
 * sequence U+951F U+65A4 U+62F7, a pattern commonly produced when text is
 * corrupted by an encoding round trip. It looks like the original tokens
 * may have been lost - confirm against an earlier revision.
 *
 * @return The example sentence as a list of untagged words
 */
@Override
public ArrayList<Word> defaultTestSentence() {
  final String[] tokens = {
      "\u951f\u65a4\u62f7",
      "\u951f\u65a4\u62f7",
      "\u5b66\u6821",
      "\u951f\u65a4\u62f7",
      "\u5b66\u4e60",
      "\u951f\u65a4\u62f7"
  };
  return SentenceUtils.toUntaggedList(tokens);
}
/**
 * Supplies a fixed eleven-token example sentence for the language, intended
 * for testing.
 *
 * @return The example sentence as a list of words
 */
public List<? extends HasWord> defaultTestSentence() {
  return SentenceUtils.toWordList(
      "Solch", "einen", "Zuspruch", "hat", "Angela", "Merkel",
      "lange", "nicht", "mehr", "erlebt", ".");
}
/**
 * Renders a tokenized sentence as one string, with a single space between
 * consecutive words. Each element is converted with
 * {@code wordToString(element, justValue, separator)}.
 * If {@code separator} is not null, it is the string used to separate the
 * parts of objects such as TaggedWord (e.g. the Word and the Tag).
 *
 * @param list The tokenized sentence to print out
 * @param justValue Forwarded to {@code wordToString} for every element
 * @param separator The string used to separate Word and Tag
 *                  in TaggedWord, etc
 * @return The sentence in String form; empty if the list is empty
 */
public static <T> String listToString(List<T> list, final boolean justValue, final String separator) {
  StringBuilder out = new StringBuilder();
  boolean first = true;
  for (T item : list) {
    // A space goes between elements only, never before the first or after the last.
    if (!first) {
      out.append(' ');
    }
    first = false;
    out.append(wordToString(item, justValue, separator));
  }
  return out.toString();
}
/**
 * Tags the tokenized input string and returns the tagged version.
 * This method requires the input to already be tokenized.
 * The tagger wants input that is whitespace separated tokens, tokenized
 * according to the conventions of the training data. (For instance,
 * for the Penn Treebank, punctuation marks and possessive "'s" should
 * be separated from words.)
 * Leading and trailing whitespace in the input is ignored.
 *
 * @param toTag The untagged input String
 * @return The same string with tags inserted in the form word/tag
 */
public String tagTokenizedString(String toTag) {
  // Trim before splitting: String.split keeps a leading empty string when the
  // input starts with a match, so " a b" would otherwise yield the tokens
  // "", "a", "b" and the empty token would be tagged as if it were a word.
  String[] tokens = toTag.trim().split("\\s+");
  List<Word> sent = SentenceUtils.toUntaggedList(Arrays.asList(tokens));
  TestSentence testSentence = new TestSentence(this);
  testSentence.tagSentence(sent, false);
  return testSentence.getTaggedNice();
}
/**
 * Renders a tokenized sentence as one string, with a single space between
 * consecutive words.
 * Designed to work robustly, even if the elements stored in the
 * 'Sentence' are not of type Label.
 *
 * This overload uses the default separators for any word type that uses
 * separators, such as TaggedWord; it does so by passing a {@code null}
 * separator to the three-argument overload.
 *
 * @param list The tokenized sentence to print out
 * @param justValue If {@code true} and the elements are of type
 *                  {@code Label}, return just the
 *                  {@code value()} of the {@code Label} of each word;
 *                  otherwise,
 *                  call the {@code toString()} method on each item.
 * @return The sentence in String form
 */
public static <T> String listToString(List<T> list, final boolean justValue) {
  final String defaultSeparator = null;
  return listToString(list, justValue, defaultSeparator);
}
/**
 * Returns the first sentence of TueBaDZ as a five-token word list.
 *
 * @return The example sentence as a list of words
 */
@Override
public List<? extends HasWord> defaultTestSentence() {
  final String[] tokens = {"Veruntreute", "die", "AWO", "Spendengeld", "?"};
  return SentenceUtils.toWordList(tokens);
}
// Append the leaf's value attribute to sb: the label is rendered with
// SentenceUtils.wordToString(label, true) and XML-escaped before being placed
// between the opening and closing double quotes.
sb.append("leaf value=\""); sb.append(XMLUtils.escapeXML(SentenceUtils.wordToString(label, true))); sb.append('"'); if (printScores) {
/**
 * Returns the input sentence for the parser, built from the yield of a tree.
 * <ul>
 *   <li>If {@code forceTags} is off, the plain words of the tree are used.</li>
 *   <li>If {@code forceTags} and {@code preTag} are on, the words are tagged
 *       by {@code tagger} and the tagged words are used.</li>
 *   <li>If {@code forceTags} and {@code noFunctionalForcing} are on, the
 *       tree's own tags are used after cutting everything from the first
 *       interior hyphen onwards.</li>
 *   <li>Otherwise the tree's own tagged yield is used unchanged.</li>
 * </ul>
 *
 * @param t The tree whose yield supplies the sentence
 * @return The sentence as a list of CoreLabels
 */
private List<CoreLabel> getInputSentence(Tree t) {
  if (op.testOptions.forceTags) {
    if (op.testOptions.preTag) {
      List<TaggedWord> s = tagger.apply(t.yieldWords());
      if (op.testOptions.verbose) {
        log.info("Guess tags: " + Arrays.toString(s.toArray()));
        log.info("Gold tags: " + t.labeledYield().toString());
      }
      return SentenceUtils.toCoreLabelList(s);
    } else if (op.testOptions.noFunctionalForcing) {
      ArrayList<? extends HasWord> s = t.taggedYield();
      for (HasWord word : s) {
        HasTag tagged = (HasTag) word;
        tagged.setTag(stripFunctionalSuffix(tagged.tag()));
      }
      return SentenceUtils.toCoreLabelList(s);
    } else {
      return SentenceUtils.toCoreLabelList(t.taggedYield());
    }
  } else {
    return SentenceUtils.toCoreLabelList(t.yieldWords());
  }
}

/**
 * Cuts a tag at its first hyphen, provided that hyphen is not the first
 * character. Tags that begin with a hyphen (for example {@code -LRB-} or
 * {@code -NONE-}) and tags without a hyphen are returned unchanged; the
 * previous {@code split("-")[0]} turned the former into the empty string and
 * threw on the tag {@code "-"}.
 *
 * @param tag The tag to shorten; may be null
 * @return The tag without its hyphenated suffix, or the tag itself if there is nothing to cut
 */
private String stripFunctionalSuffix(String tag) {
  int dash = (tag == null) ? -1 : tag.indexOf('-');
  return (dash > 0) ? tag.substring(0, dash) : tag;
}
/**
 * Supplies a fixed six-token example sentence for the language, intended for
 * testing. The tokens are written as Unicode escapes.
 *
 * NOTE(review): the first, second, fourth and sixth tokens are all the
 * sequence U+951F U+65A4 U+62F7, a pattern commonly produced when text is
 * corrupted by an encoding round trip. It looks like the original tokens
 * may have been lost - confirm against an earlier revision.
 *
 * @return The example sentence as a list of untagged words
 */
@Override
public ArrayList<Word> defaultTestSentence() {
  final String[] tokens = {
      "\u951f\u65a4\u62f7",
      "\u951f\u65a4\u62f7",
      "\u5b66\u6821",
      "\u951f\u65a4\u62f7",
      "\u5b66\u4e60",
      "\u951f\u65a4\u62f7"
  };
  return SentenceUtils.toUntaggedList(tokens);
}
/**
 * Reconstructs the sentence as a string using the original text and spacing
 * from before tokenization.
 * The extra information is assumed to have been encoded in CoreLabel objects
 * for each token of the sentence, which preserve the original spacing
 * (done with "invertible=true" for PTBTokenizer).
 * The typing is deliberately loose for easier inter-operation with old code
 * that still works with a {@code List<HasWord>}.
 * This overload delegates to the two-argument form, passing {@code true} as
 * the second argument.
 *
 * @param list The sentence (List of tokens) to print out
 * @return The original sentence String, which may contain newlines or other artifacts of spacing
 */
public static <T extends HasWord> String listToOriginalTextString(List<T> list) {
  final boolean secondArgument = true;
  return listToOriginalTextString(list, secondArgument);
}
/**
 * Replaces the labels on the leaves of a tree with the words of
 * {@code originalSentence}, in order. Does nothing if either the stored
 * sentence or the tree is null. Words that are not {@code Label}s are
 * skipped, but their corresponding leaf is still consumed.
 *
 * @param tree The tree whose leaves receive the original words
 * @throws IllegalStateException If the number of leaves differs from the
 *         size of the original sentence (plus one when {@code addedPunct} is set)
 */
@Override
public void restoreOriginalWords(Tree tree) {
  if (originalSentence == null || tree == null) {
    return;
  }

  List<Tree> treeLeaves = tree.getLeaves();

  // When addedPunct is set, the tree is expected to have one leaf more than the original sentence.
  int expectedSize = originalSentence.size();
  if (addedPunct) {
    expectedSize = expectedSize + 1;
  }

  if (treeLeaves.size() != expectedSize) {
    throw new IllegalStateException("originalWords and sentence of different sizes: " + expectedSize + " vs. " + treeLeaves.size() +
                                    "\n Orig: " + SentenceUtils.listToString(originalSentence) +
                                    "\n Pars: " + SentenceUtils.listToString(treeLeaves));
  }

  Iterator<Tree> leafCursor = treeLeaves.iterator();
  for (HasWord word : originalSentence) {
    // Advance the cursor for every word so that leaves and words stay aligned.
    Tree currentLeaf = leafCursor.next();
    if (word instanceof Label) {
      currentLeaf.setLabel((Label) word);
    }
  }
}
/**
 * Supplies a fixed seventeen-token example sentence for the language,
 * written with Latin-letter tokens.
 *
 * @return The example sentence as a list of words
 */
public List<? extends HasWord> defaultTestSentence() {
  return SentenceUtils.toWordList(
      "H", "MWX", "MTPLC", "LA", "RQ", "M", "H", "TWPEH", "H",
      "MBIFH", "ALA", "GM", "M", "DRKI", "H", "HERMH", "yyDOT");
}
/**
 * Converts a single sentence element to its string form by delegating to the
 * three-argument {@code wordToString}, with {@code null} as the separator.
 *
 * @param o The element to render
 * @param justValue Forwarded unchanged to the three-argument overload
 * @return The string produced by the three-argument overload
 */
public static <T> String wordToString(T o, final boolean justValue) {
  final String separator = null;
  return wordToString(o, justValue, separator);
}