/**
 * Renders a tokenized sentence as one string with a space between words,
 * using the {@code value()} of each item. This is the short-form
 * representation of the "sentence" and is shorthand for
 * {@code listToString(list, true)}.
 *
 * TODO: Sentence used to be a subclass of ArrayList, with this
 * method as the toString. Therefore, there may be instances of
 * ArrayList being printed that expect this method to be used.
 *
 * @param list The tokenized sentence to print out
 * @return The tokenized sentence as a String
 */
public static <T> String listToString(List<T> list) {
  // Short form: ask the two-argument overload for just the value of each item.
  final boolean justValue = true;
  return listToString(list, justValue);
}
/**
 * Renders a tokenized sentence as one string with a space between words.
 * Intended to behave sensibly even when the elements of the 'Sentence'
 * are not of type Label.
 *
 * No explicit separator is supplied here, so word types that use a
 * separator (such as TaggedWord) fall back to their default one.
 *
 * @param list The tokenized sentence to print out
 * @param justValue If {@code true} and the elements are of type
 *                  {@code Label}, return just the {@code value()} of the
 *                  {@code Label} of each word; otherwise, call the
 *                  {@code toString()} method on each item.
 * @return The sentence in String form
 */
public static <T> String listToString(List<T> list, final boolean justValue) {
  // A null separator is passed through to the three-argument overload.
  final String defaultSeparator = null;
  return listToString(list, justValue, defaultSeparator);
}
/**
 * Writes the words of {@code originalSentence} back onto the leaves of the
 * given tree, in order. Does nothing when either the stored sentence or the
 * tree is {@code null}. Words that are not {@code Label}s leave their leaf
 * untouched.
 *
 * @param tree The tree whose leaf labels are to be replaced
 * @throws IllegalStateException if the number of leaves does not match the
 *         size of the original sentence (plus one when {@code addedPunct} is set)
 */
@Override
public void restoreOriginalWords(Tree tree) {
  if (originalSentence == null || tree == null) {
    return;
  }

  final List<Tree> treeLeaves = tree.getLeaves();

  // One extra leaf is expected when addedPunct is set.
  int wantedLeafCount = originalSentence.size();
  if (addedPunct) {
    wantedLeafCount = originalSentence.size() + 1;
  }

  if (treeLeaves.size() != wantedLeafCount) {
    throw new IllegalStateException("originalWords and sentence of different sizes: "
        + wantedLeafCount + " vs. " + treeLeaves.size()
        + "\n Orig: " + SentenceUtils.listToString(originalSentence)
        + "\n Pars: " + SentenceUtils.listToString(treeLeaves));
  }

  // Walk words and leaves in lock-step; a trailing extra leaf is never visited.
  final Iterator<Tree> leafCursor = treeLeaves.iterator();
  for (HasWord word : originalSentence) {
    final Tree leaf = leafCursor.next();
    if (word instanceof Label) {
      leaf.setLabel((Label) word);
    }
  }
}
/** * TODO: clearly this should be a default method in ParserQuery once Java 8 comes out */ @Override public void restoreOriginalWords(Tree tree) { if (originalSentence == null || tree == null) { return; } List<Tree> leaves = tree.getLeaves(); if (leaves.size() != originalSentence.size()) { throw new IllegalStateException("originalWords and sentence of different sizes: " + originalSentence.size() + " vs. " + leaves.size() + "\n Orig: " + SentenceUtils.listToString(originalSentence) + "\n Pars: " + SentenceUtils.listToString(leaves)); } // TODO: get rid of this cast Iterator<? extends Label> wordsIterator = (Iterator<? extends Label>) originalSentence.iterator(); for (Tree leaf : leaves) { leaf.setLabel(wordsIterator.next()); } }
/**
 * For every sentence of the tagged input, prints a random-length, non-empty
 * prefix of it, optionally followed by the full sentence.
 *
 * Properties read from the command line: {@code input} (passed to
 * {@code TaggedFileRecord.createRecord}), {@code tagSeparator} (defaults to
 * {@code TaggerConfig.TAG_SEPARATOR}) and {@code fullSentence} (defaults to
 * {@code false}).
 *
 * @param args Command-line arguments, converted to properties
 */
public static void main(String[] args) {
  Properties config = StringUtils.argsToProperties(args);
  log.info(config);

  boolean fullSentence = PropertiesUtils.getBool(config, "fullSentence", false);

  Random random = new Random();
  String tagSeparator = config.getProperty("tagSeparator", TaggerConfig.TAG_SEPARATOR);

  TaggedFileRecord record = TaggedFileRecord.createRecord(config, config.getProperty("input"));

  for (List<TaggedWord> sentence : record.reader()) {
    if (sentence.isEmpty()) {
      // Random.nextInt(0) throws IllegalArgumentException, and an empty
      // sentence has no prefix to print anyway.
      continue;
    }
    // Prefix length is uniform in [1, sentence.size()].
    int len = random.nextInt(sentence.size()) + 1;
    System.out.println(SentenceUtils.listToString(sentence.subList(0, len), false, tagSeparator));
    if (fullSentence) {
      System.out.println(SentenceUtils.listToString(sentence, false, tagSeparator));
    }
  }
}
/**
 * Tags every sentence of a text file and prints the tagged sentences to
 * standard output, one per line.
 *
 * @param args Exactly two arguments: the tagger model file and the file to tag
 * @throws Exception if the model or the input file cannot be read
 */
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    log.info("usage: java TaggerDemo modelFile fileToTag");
    return;
  }
  MaxentTagger tagger = new MaxentTagger(args[0]);
  // try-with-resources closes the input file even if tokenizing or tagging
  // throws. The loop stays inside the block so the reader is still open
  // while the sentences are consumed.
  try (BufferedReader reader = new BufferedReader(new FileReader(args[1]))) {
    List<List<HasWord>> sentences = MaxentTagger.tokenizeText(reader);
    for (List<HasWord> sentence : sentences) {
      List<TaggedWord> tSentence = tagger.tagSentence(sentence);
      System.out.println(SentenceUtils.listToString(tSentence, false));
    }
  }
}
public void printSentences(Iterable<List<? extends HasWord>> sentences, String filename) { try { PrintWriter pw = IOUtils.getPrintWriter(filename); for (List<? extends HasWord> sentence:sentences) { pw.print("<s> "); // Note: Use <s sentence-id > to identify sentences String sentString = SentenceUtils.listToString(sentence); if (sentence.size() > maxSentenceLength) { logger.warning("Sentence length=" + sentence.size() + " is longer than maximum set length " + maxSentenceLength); logger.warning("Long Sentence: " + sentString); } pw.print(sentString); pw.println(" </s>"); } pw.close(); } catch (IOException ex) { throw new RuntimeException(ex); } }
/**
 * Builds a report of the stored n-grams: for each class, and for each n-gram
 * length recorded for that class, lists the trees from last to first in
 * {@code scoreComparator(prediction)} order, each with its yield and its
 * prediction value for that class.
 *
 * @return The multi-line report
 */
public String toString() {
  final StringBuilder report = new StringBuilder();
  for (int prediction = 0; prediction < numClasses; ++prediction) {
    report.append("Best scores for class ").append(prediction).append("\n");
    final Map<Integer, PriorityQueue<Tree>> byLength = classToNGrams.get(prediction);
    for (Map.Entry<Integer, PriorityQueue<Tree>> lengthEntry : byLength.entrySet()) {
      // Copy the queue into a list so it can be ordered without disturbing it.
      final List<Tree> ordered = Generics.newArrayList(lengthEntry.getValue());
      Collections.sort(ordered, scoreComparator(prediction));
      // Reverse the sorted copy so iteration runs from the last element to the first.
      Collections.reverse(ordered);
      report.append(" Len ").append(lengthEntry.getKey()).append("\n");
      for (Tree tree : ordered) {
        report.append(" ")
            .append(SentenceUtils.listToString(tree.yield()))
            .append(" [")
            .append(RNNCoreAnnotations.getPredictions(tree).get(prediction))
            .append("]\n");
      }
    }
  }
  return report.toString();
}
} // closes the enclosing type, whose header lies outside this excerpt
/**
 * Returns the string associated with the input parse tree. Traces and
 * ATB-specific escape sequences (e.g., "-RRB-" for ")") are removed.
 *
 * The tree is first pruned with {@code emptyFilter}; the yield of the pruned
 * tree is then joined into a single string.
 *
 * @param t - A parse tree
 * @return The yield of the input parse tree
 */
public static String flattenTree(Tree t) {
  final Tree pruned = t.prune(emptyFilter, tf);
  return SentenceUtils.listToString(pruned.yield());
}
/**
 * Evaluates a guess tree against a gold tree. A {@code null} tree on either
 * side is reported on standard error and skipped. A difference in yield
 * length is logged as a warning, after which evaluation still proceeds.
 *
 * @param guess The tree produced by the system
 * @param gold  The reference tree
 * @param pw    Passed through to the superclass evaluation
 */
@Override
public void evaluate(Tree guess, Tree gold, PrintWriter pw) {
  // Guard clause: nothing can be compared without both trees.
  if (gold == null || guess == null) {
    System.err.printf("%s: Cannot compare against a null gold or guess tree!\n", this.getClass().getName());
    return;
  }

  if (guess.yield().size() != gold.yield().size()) {
    log.info("Warning: yield differs:");
    log.info("Guess: " + SentenceUtils.listToString(guess.yield()));
    log.info("Gold: " + SentenceUtils.listToString(gold.yield()));
  }

  super.evaluate(guess, gold, pw);
}
/**
 * Evaluates a guess tree against a gold tree. When either tree is
 * {@code null}, a message is printed to standard error and nothing is
 * evaluated. When the two yields differ in length, both are logged before
 * the superclass evaluation runs.
 *
 * @param guess The tree produced by the system
 * @param gold  The reference tree
 * @param pw    Passed through to the superclass evaluation
 */
@Override
public void evaluate(Tree guess, Tree gold, PrintWriter pw) {
  final boolean missingTree = (gold == null) || (guess == null);
  if (missingTree) {
    final String evaluatorName = this.getClass().getName();
    System.err.printf("%s: Cannot compare against a null gold or guess tree!\n", evaluatorName);
    return;
  }

  final boolean sameLength = guess.yield().size() == gold.yield().size();
  if (!sameLength) {
    log.info("Warning: yield differs:");
    log.info("Guess: " + SentenceUtils.listToString(guess.yield()));
    log.info("Gold: " + SentenceUtils.listToString(gold.yield()));
  }

  super.evaluate(guess, gold, pw);
}
/**
 * Post-processes a multi-word-expression subtree. If its yield, with all
 * whitespace removed, consists only of digits and punctuation, the subtree
 * is replaced by a new node with the same value and a single leaf holding
 * that compacted yield. Otherwise the value of the given tree is prefixed
 * with {@code MWE_PHRASAL} in place.
 *
 * @param t The subtree to process
 * @return The replacement node, or {@code t} itself after relabelling
 */
private Tree postProcessMWE(Tree t) {
  final String compactYield = SentenceUtils.listToString(t.yield()).replaceAll("\\s+", "");

  if (!compactYield.matches("[\\d\\p{Punct}]*")) {
    t.setValue(MWE_PHRASAL + t.value());
    return t;
  }

  // Collapse the whole subtree into one leaf under a node with the old value.
  final List<Tree> singleLeaf = new ArrayList<>();
  singleLeaf.add(treeFactory.newLeaf(compactYield));
  return treeFactory.newTreeNode(t.value(), singleLeaf);
}
/** Make a new Annotation from a List of tokenized sentences. */ public Annotation(List<CoreMap> sentences) { super(); this.set(CoreAnnotations.SentencesAnnotation.class, sentences); List<CoreLabel> tokens = new ArrayList<>(); StringBuilder text = new StringBuilder(); for (CoreMap sentence : sentences) { List<CoreLabel> sentenceTokens = sentence.get(CoreAnnotations.TokensAnnotation.class); tokens.addAll(sentenceTokens); if (sentence.containsKey(CoreAnnotations.TextAnnotation.class)) { text.append(sentence.get(CoreAnnotations.TextAnnotation.class)); } else { // If there is no text in the sentence, fake it as best as we can if (text.length() > 0) { text.append('\n'); } text.append(SentenceUtils.listToString(sentenceTokens)); } } this.set(CoreAnnotations.TokensAnnotation.class, tokens); this.set(CoreAnnotations.TextAnnotation.class, text.toString()); }
/**
 * Writes one tagged sentence to the given writer in the requested style.
 *
 * @param sentence        The tagged sentence to write
 * @param outputLemmas    Whether lemmas are included (TSV and XML styles)
 * @param outputStyle     The output format to use
 * @param outputVerbosity Passed to the TSV formatter
 * @param numSentences    The sentence index used for XML output
 * @param separator       Written after the sentence in SLASH_TAGS style
 * @param writer          Where to write the output to
 * @throws IllegalArgumentException if the output style is not supported
 * @throws RuntimeIOException if writing fails
 */
public void outputTaggedSentence(List<? extends HasWord> sentence, boolean outputLemmas,
                                 OutputStyle outputStyle, boolean outputVerbosity,
                                 int numSentences, String separator, Writer writer) {
  try {
    switch (outputStyle) {
      case SLASH_TAGS: {
        final String slashTagged =
            SentenceUtils.listToString(sentence, false, config.getTagSeparator());
        writer.write(slashTagged);
        writer.write(separator);
        break;
      }
      case XML:
      case INLINE_XML: {
        writeXMLSentence(writer, sentence, numSentences, outputLemmas);
        break;
      }
      case TSV: {
        writer.write(getTsvWords(outputVerbosity, outputLemmas, sentence));
        break;
      }
      default:
        throw new IllegalArgumentException("Unsupported output style " + outputStyle);
    }
  } catch (IOException ioFailure) {
    throw new RuntimeIOException(ioFailure);
  }
}
/**
 * Writes the XML rendering of a tagged sentence. If the write fails, the
 * sentence is logged and the failure is rethrown unchecked.
 *
 * @param w Where to write the output to
 * @param sent A tagged sentence
 * @param sentNum The sentence index for XML printout
 * @param outputLemmas Whether to write the lemmas of words
 * @throws RuntimeIOException wrapping the underlying {@code IOException}
 */
private static void writeXMLSentence(Writer w, List<? extends HasWord> sent,
                                     int sentNum, boolean outputLemmas) {
  try {
    w.write(getXMLWords(sent, sentNum, outputLemmas));
  } catch (IOException writeFailure) {
    // Record which sentence could not be written before propagating.
    final String failureNote =
        "Error writing sentence " + sentNum + ": " + SentenceUtils.listToString(sent);
    log.info(failureNote);
    throw new RuntimeIOException(writeFailure);
  }
}
/**
 * Segments a string into words: builds the segmentation lattice, runs
 * max-match segmentation, post-processes the result, and finally passes the
 * space-separated sentence through {@code ChineseStringUtils.CTPPostProcessor}.
 * Intermediate results are reported through {@code printlnErr}.
 *
 * @param s The text to segment
 * @return The segmented words, obtained by splitting the post-processed
 *         string on whitespace
 */
public List<HasWord> segment(String s) {
  buildSegmentationLattice(s);
  ArrayList<Word> sent = maxMatchSegmentation();
  printlnErr("raw output: " + SentenceUtils.listToString(sent));

  ArrayList<Word> postProcessedSent = postProcessSentence(sent);
  // Render the sentence once as space-separated words. The list's own
  // toString() would produce "[w1, w2, ...]", feeding brackets and commas
  // into the post-processor and the whitespace split below.
  String processedString = SentenceUtils.listToString(postProcessedSent);
  printlnErr("processed output: " + processedString);

  ChineseStringUtils.CTPPostProcessor postProcessor = new ChineseStringUtils.CTPPostProcessor();
  String postSentString = postProcessor.postProcessingAnswer(processedString, false);
  printlnErr("Sighan2005 output: " + postSentString);

  // Build the result directly as List<HasWord>; no second copy is needed.
  List<HasWord> postSent = new ArrayList<>();
  for (String w : postSentString.split("\\s+")) {
    postSent.add(new Word(w));
  }
  return postSent;
}
public TreeFromFile(Tree t) { this.treeString = t.toString(); sentence = SentenceUtils.listToString(t.yield()); if(t.label() instanceof HasIndex) { sentId = ((CoreLabel)t.label()).sentIndex(); filename = ((CoreLabel)t.label()).docID(); if(sentId != -1 && filename != null && !filename.equals("")) sentence = String.format("%s-%d %s", filename,sentId,sentence); } }
static public void countMWEStatistics(Tree t, TwoDimensionalCounter<String, String> unigramTagger, TwoDimensionalCounter<String, String> labelPreterm, TwoDimensionalCounter<String, String> pretermLabel, TwoDimensionalCounter<String, String> labelTerm, TwoDimensionalCounter<String, String> termLabel) { updateTagger(unigramTagger,t); //Count MWE statistics TregexMatcher m = pMWE.matcher(t); while (m.findNextMatchingNode()) { Tree match = m.getMatch(); String label = match.value(); if(RESOLVE_DUMMY_TAGS && label.equals(FrenchXMLTreeReader.MISSING_PHRASAL)) continue; String preterm = SentenceUtils.listToString(match.preTerminalYield()); String term = SentenceUtils.listToString(match.yield()); labelPreterm.incrementCount(label,preterm); pretermLabel.incrementCount(preterm,label); labelTerm.incrementCount(label,term); termLabel.incrementCount(term, label); } }
/** * Perform (possibly destructive) operations on the tree. Do a top-down DFS on the tree. */ public void visitTree(Tree tree) { if (tree == null) return; String yield = SentenceUtils.listToString(tree.yield()); if (mweDictionary.contains(yield)) { List<Tree> children = getPreterminalSubtrees(tree); String newLabel = "MW" + tree.value(); tree.setValue(newLabel); tree.setChildren(children); // Bottom out of the recursion return; } else { for (Tree subTree : tree.children()) { if (subTree.isPhrasal()) { // Only phrasal trees can have yields > 1!! visitTree(subTree); } } } }
/**
 * Converts a parse tree into a string of tokens. Each token is a word and
 * its POS tag, joined by {@code separator}. The tree is pruned with
 * {@code emptyFilter} first, and the word and value of every token are
 * overwritten with the (optionally unescaped) word before rendering.
 *
 * @param t - A parse tree
 * @param removeEscaping - If true, remove LDC escape characters. Otherwise, leave them.
 * @param separator Word/tag separator
 * @return A string of tagged words
 */
public static String taggedStringFromTree(Tree t, boolean removeEscaping, String separator) {
  final Tree pruned = t.prune(emptyFilter, tf);
  final List<CoreLabel> tokens = pruned.taggedLabeledYield();

  for (CoreLabel token : tokens) {
    final String word;
    if (removeEscaping) {
      word = unEscape(token.word());
    } else {
      word = token.word();
    }
    token.setWord(word);
    token.setValue(word);
  }

  return SentenceUtils.listToString(tokens, false, separator);
}