/**
 * Tags the given tokens and runs each tagged word through {@code Morphology}.
 * <p>
 * English only: the {@code Morphology} class used here is hard-coded and
 * handles English exclusively.
 *
 * @param tokens the words of a single sentence
 * @return one {@code CoreLabel} per tagged word, carrying the word and its
 *         tag, after {@code Morphology.stem} has been applied to it
 */
public List<CoreLabel> lemmatize(List<? extends HasWord> tokens) {
  final List<TaggedWord> taggedWords;
  if (!getOp().testOptions.preTag) {
    // No pre-tagging requested: take the tags from a full parse.
    Tree parsed = parse(tokens);
    taggedWords = parsed.taggedYield();
  } else {
    Function<List<? extends HasWord>, List<TaggedWord>> preTagger = loadTagger();
    taggedWords = preTagger.apply(tokens);
  }

  Morphology morphology = new Morphology();
  List<CoreLabel> result = Generics.newArrayList();
  for (TaggedWord taggedWord : taggedWords) {
    CoreLabel coreLabel = new CoreLabel();
    coreLabel.setWord(taggedWord.word());
    coreLabel.setTag(taggedWord.tag());
    // stem() works on the label in place, using the word and tag set above.
    morphology.stem(coreLabel);
    result.add(coreLabel);
  }
  return result;
}
/**
 * Builds a copy of a tagged sentence in which every tag has been replaced
 * by the value {@code tlp.basicCategory} returns for it.
 *
 * @param twList the tagged words to convert; the list itself is not modified
 * @param tlp    language pack whose {@code basicCategory} is applied to each tag
 * @return a new list with the same length and order as {@code twList}
 */
private static List<TaggedWord> cleanTags(List<TaggedWord> twList, TreebankLanguagePack tlp) {
  List<TaggedWord> cleaned = new ArrayList<>(twList.size());
  for (TaggedWord source : twList) {
    String word = source.word();
    String basicTag = tlp.basicCategory(source.tag());
    cleaned.add(new TaggedWord(word, basicTag));
  }
  return cleaned;
}
outputTree = outputTree.prune(new BobChrisTreeNormalizer.EmptyFilter()); ArrayList<Label> sentUnstemmed = outputTree.yield(); pw.println(" <words>"); int i = 1; String sent = SentenceUtils.listToString(outputTree.yield(), false); if(ptb2text) { pw.println(PTBTokenizer.ptb2Text(sent)); int i = 1; for (TaggedWord tw : sent) { pw.println(" <word ind=\"" + i + "\" pos=\"" + XMLUtils.escapeXML(tw.tag()) + "\">" + XMLUtils.escapeXML(tw.word()) + "</word>"); i++; Tree indexedTree = outputTree.deepCopy(outputTree.treeFactory(), CoreLabel.factory()); indexedTree.indexLeaves(); Set<Dependency<Label, Label, Object>> depsSet = indexedTree.mapDependencies(dependencyWordFilter, hf); Tree it = outputTree.deepCopy(outputTree.treeFactory(), CoreLabel.factory()); it.indexLeaves(); String tag = PTBTokenizer.ptbToken2Text(w.tag()); String word = PTBTokenizer.ptbToken2Text(w.word());
private Tree outsideBinarizeLocalTree(Tree t, String labelStr, String finalCat, int headNum, TaggedWord head, int leftProcessed, String leftStr, int rightProcessed, String rightStr) { List<Tree> newChildren = new ArrayList<>(2); Label label = new CategoryWordTag(labelStr, head.word(), head.tag()); if (t.numChildren() - leftProcessed - rightProcessed <= 2) { newChildren.add(t.getChild(leftProcessed)); if (t.numChildren() - leftProcessed - rightProcessed == 2) { newChildren.add(t.getChild(leftProcessed + 1));
/**
 * Fills in the tag of every sentence token that has none, copying it from
 * the tagged yield of {@code tree}. The tree leaf at the same position
 * receives the same tag when its label implements {@code HasTag}.
 * <p>
 * Tokens that already carry a tag are left untouched. The tree's tagged
 * yield and leaf list are only computed once an untagged token is found.
 *
 * @param sentence the sentence whose {@code TokensAnnotation} is updated in place
 * @param tree     the tree supplying the tags; token i is matched with
 *                 entry i of its tagged yield and of its yield
 */
private static void setMissingTags(CoreMap sentence, Tree tree) {
  List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
  List<TaggedWord> treeTags = null;
  List<Label> treeLeaves = null;

  final int count = tokens.size();
  for (int idx = 0; idx < count; idx++) {
    CoreLabel token = tokens.get(idx);
    if (token.tag() != null) {
      continue;  // already tagged, nothing to do
    }

    // Lazily pull the yields out of the tree on first use.
    if (treeTags == null) {
      treeTags = tree.taggedYield();
    }
    if (treeLeaves == null) {
      treeLeaves = tree.yield();
    }

    token.setTag(treeTags.get(idx).tag());
    Label leaf = treeLeaves.get(idx);
    if (leaf instanceof HasTag) {
      HasTag taggableLeaf = (HasTag) leaf;
      taggableLeaf.setTag(treeTags.get(idx).tag());
    }
  }
}
private CoreMap doOneSentence(CoreMap sentence) { List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class); List<TaggedWord> tagged = null; if (tokens.size() <= maxSentenceLength) { try { tagged = pos.tagSentence(tokens, this.reuseTags); } catch (OutOfMemoryError e) { log.error(e); // Beware that we can now get an OOM in logging, too. log.warn("Tagging of sentence ran out of memory. " + "Will ignore and continue: " + SentenceUtils.listToString(tokens)); } } if (tagged != null) { for (int i = 0, sz = tokens.size(); i < sz; i++) { tokens.get(i).set(CoreAnnotations.PartOfSpeechAnnotation.class, tagged.get(i).tag()); } } else { for (CoreLabel token : tokens) { token.set(CoreAnnotations.PartOfSpeechAnnotation.class, "X"); } } return sentence; }
/**
 * Converts the tagged yield of a tree into a list of {@code WordTag}s,
 * copying each word and tag as-is.
 *
 * @param t the tree to read
 * @return one {@code WordTag} per entry of {@code t.taggedYield()}, in the same order
 */
private static List<WordTag> getNonStemmedWordTagsFromTree(Tree t) {
  List<WordTag> result = Generics.newArrayList();
  for (TaggedWord tagged : t.taggedYield()) {
    result.add(new WordTag(tagged.word(), tagged.tag()));
  }
  return result;
}
/**
 * Tags a sentence of CoreLabels and stores each resulting tag on the
 * corresponding CoreLabel, so the input sentence is modified.
 * If {@code reuseTags} is true, any tags already supplied with the
 * CoreLabels are taken as correct.
 *
 * @param sentence  the tokens to tag; updated in place
 * @param reuseTags passed through to {@code tagSentence}
 * @throws AssertionError if the tagger returns a different number of
 *         tagged words than there are tokens in {@code sentence}
 */
public void tagCoreLabels(List<CoreLabel> sentence, boolean reuseTags) {
  List<TaggedWord> tags = tagSentence(sentence, reuseTags);
  if (tags.size() != sentence.size()) {
    throw new AssertionError("Tagged word list not the same length " +
                             "as the original sentence");
  }

  final int length = sentence.size();
  for (int idx = 0; idx < length; idx++) {
    CoreLabel token = sentence.get(idx);
    token.setTag(tags.get(idx).tag());
  }
}
String cat = t.label().value(); if (t.isLeaf()) { Label label = new Word(cat);//new CategoryWordTag(cat,cat,""); return tf.newLeaf(label); if (t.isPreTerminal()) { Tree childResult = transformTree(t.getChild(0)); String word = childResult.value(); // would be nicer if Word/CWT ?? TaggedWord head = new TaggedWord(word, tag); result = binarizeLocalTree(result, headNum, head);
TaggedWord tw = new TaggedWord(word.word(), ((HasTag) word).tag()); sentenceB.add(tw); } else { CoreLabel boundary = new CoreLabel(); boundary.setWord(Lexicon.BOUNDARY); boundary.setValue(Lexicon.BOUNDARY); boundary.setTag(Lexicon.BOUNDARY_TAG); boundary.setIndex(sentence.size()+1);//1-based indexing used in the parser sentenceB.add(boundary); } else { sentenceB.add(new TaggedWord(Lexicon.BOUNDARY, Lexicon.BOUNDARY_TAG));
private int initialBetasAndLexicon(Tree tree, int position, double weight) { if (tree.isLeaf()) { if (tree.isPreTerminal()) { String tag = tree.label().value(); String word = tree.children()[0].label().value(); TaggedWord tw = new TaggedWord(word, state(tag, 0)); lex.train(tw, position, weight); return (position + 1);
/**
 * Appends the tagged yield of this tree to {@code ty}: for every
 * preterminal reached, a {@code TaggedWord} built from the label of its
 * first child (the word) and its own label (the tag). Entries are added in
 * the order the subtrees are visited, i.e. children left to right.
 * A single list is threaded through the whole recursion.
 * <p>
 * <i>Implementation note:</i> when we summon up enough courage, this
 * method will be changed to take and return a
 * {@code List<W extends TaggedWord>}.
 *
 * @param ty the list that receives the tagged yield. Normally it is empty
 *           when the routine is called; if not, the new entries are added
 *           to the end of the list.
 * @return the same list {@code ty} that was passed in
 */
public <X extends List<TaggedWord>> X taggedYield(X ty) {
  if (!isPreTerminal()) {
    // Not a preterminal: collect from each subtree in turn.
    for (Tree child : children()) {
      child.taggedYield(ty);
    }
    return ty;
  }
  ty.add(new TaggedWord(firstChild().label(), label()));
  return ty;
}
Tree binarizeLocalTree(Tree t, int headNum, TaggedWord head) { //System.out.println("Working on: "+headNum+" -- "+t.label()); if (markovFactor) { String topCat = t.label().value(); Label newLabel = new CategoryWordTag(topCat, head.word(), head.tag()); t.setLabel(newLabel); Tree t2; if (insideFactor) { t2 = markovInsideBinarizeLocalTreeNew(t, headNum, 0, t.numChildren() - 1, true); // t2 = markovInsideBinarizeLocalTree(t, head, headNum, topCat, false); } else { t2 = markovOutsideBinarizeLocalTree(t, head, headNum, topCat, new LinkedList<>(), false); } if (DEBUG) { CategoryWordTag.printWordTag = false; StringBuilder sb1 = new StringBuilder(); localTreeString(t, sb1, 0); StringBuilder sb2 = new StringBuilder(); localTreeString(t2, sb2, 0); System.out.println("Old Local Tree: " + sb1); System.out.println("New Local Tree: " + sb2); CategoryWordTag.printWordTag = true; } return t2; } if (insideFactor) { return insideBinarizeLocalTree(t, headNum, head, 0, 0); } return outsideBinarizeLocalTree(t, t.label().value(), t.label().value(), headNum, head, 0, "", 0, ""); }
/**
 * Sets a lemma on the label of every leaf of {@code t}.
 * <p>
 * The tag passed to {@code Morphology.lemma} is the one stored on the leaf
 * label when the label implements {@code HasTag} and has a non-null tag;
 * otherwise it is looked up in the tree's tagged yield at the leaf's
 * position. The tagged yield is computed at most once, on first need.
 * Leaves whose label is {@code null} are skipped.
 *
 * @param t the tree to lemmatize; its leaf labels are modified in place
 * @return the same tree {@code t}
 * @throws IllegalArgumentException if a leaf label does not implement {@code HasLemma}
 */
@Override
public Tree transformTree(Tree t) {
  Morphology morphology = new Morphology();
  List<TaggedWord> tagged = null;
  int index = 0;
  for (Tree leaf : t.getLeaves()) {
    // The position must advance for EVERY leaf, including skipped ones,
    // so that it keeps lining up with the entries of the tagged yield.
    final int position = index++;

    Label label = leaf.label();
    if (label == null) {
      continue;
    }

    String tag;
    if (!(label instanceof HasTag) || ((HasTag) label).tag() == null) {
      if (tagged == null) {
        tagged = t.taggedYield();
      }
      tag = tagged.get(position).tag();
    } else {
      tag = ((HasTag) label).tag();
    }

    if (!(label instanceof HasLemma)) {
      throw new IllegalArgumentException("Got a tree with labels which do not support lemma");
    }
    // NOTE(review): the meaning of the third argument (true) is defined by
    // Morphology.lemma and is not visible here -- confirm against its documentation.
    ((HasLemma) label).setLemma(morphology.lemma(label.value(), tag, true));
  }
  return t;
}
public void tagPOS(List<CoreLabel> tokens, Tree tree) { try { List<TaggedWord> posList = tree.getChild(0).taggedYield(); for (int i = 0; i < tokens.size(); i++) { String pos = posList.get(i).tag(); tokens.get(i).setTag(pos); } } catch (Exception e) { tagPOS(tokens); // At least gives you something. LOG.warn("POS Failed:\n" + tree.pennString()); } }
List<TaggedWord> sentence3 = new ArrayList<>(); for (int i = 0; i < sent3.length; i++) { sentence3.add(new TaggedWord(sent3[i], tag3[i])); parse.pennPrint(); parse.pennPrint(); System.out.println(); GrammaticalStructure gs = gsf.newGrammaticalStructure(parse); for (Label lab : parse.yield()) { if (lab instanceof CoreLabel) { System.out.println(((CoreLabel) lab).toString(CoreLabel.OutputFormat.VALUE_MAP)); } else { System.out.println(lab);
/**
 * Re-tags a tree in place: the words of the tree are run through
 * {@code tagger}, and the value of each preterminal label is overwritten
 * with the tag produced for the word at the same position.
 *
 * @param tree   the tree whose preterminal labels are updated
 * @param tagger the tagger applied to the tree's words
 * @throws AssertionError if the tagger returns a different number of items
 *         than the tree has preterminals
 */
public static void redoTags(Tree tree, Tagger tagger) {
  List<Word> words = tree.yieldWords();
  List<TaggedWord> retagged = tagger.apply(words);
  List<Label> preTerminals = tree.preTerminalYield();

  if (preTerminals.size() != retagged.size()) {
    throw new AssertionError("Tags are not the same size");
  }

  int position = 0;
  while (position < preTerminals.size()) {
    String newTag = retagged.get(position).tag();
    preTerminals.get(position).setValue(newTag);
    position++;
  }
}
Set<Constituent> goldConstituents = goldTree.constituents(LabeledConstituent.factory()); Set<Constituent> guessConstituents = guessTree.constituents(LabeledConstituent.factory()); List<TaggedWord> goldWords = goldTree.taggedYield(); List<TaggedWord> guessWords = guessTree.taggedYield(); int len = Math.min(goldWords.size(), guessWords.size()); for (int i = 0; i < len; ++i) { String goldTag = tlp.basicCategory(goldWords.get(i).tag()); String guessTag = tlp.basicCategory(guessWords.get(i).tag()); if (!goldTag.equals(guessTag)) {