/**
 * Records every word of an already-tagged line under its tag in the given map.
 * A word is recorded only when {@code closedTags} is {@code null} or contains
 * the word's tag. The per-tag word sets are {@link TreeSet}s created on demand.
 *
 * @param line the tagged words to record
 * @param tagWordMap map from tag to the set of words seen with that tag;
 *                   updated in place
 */
void addTaggedWords(List<TaggedWord> line, Map<String, Set<String>> tagWordMap) {
  for (TaggedWord taggedWord : line) {
    String word = taggedWord.word();
    String tag = taggedWord.tag();
    // A null closedTags disables filtering: every tag is recorded.
    if (closedTags == null || closedTags.contains(tag)) {
      // One lookup that also creates the per-tag set the first time a tag is seen.
      tagWordMap.computeIfAbsent(tag, unusedTag -> new TreeSet<>()).add(word);
    }
  }
}
/**
 * Scores a tree by counting the positions at which the basic category of the
 * tree's tag differs from the tag stored at the same position in
 * {@code tagged}. Only the first {@code min(yield size, tagged size)}
 * positions are compared.
 *
 * @param tree the tree whose tagged yield is checked
 * @return the number of mismatching positions multiplied by {@code weight}
 */
public double score(Tree tree) {
  List<TaggedWord> treeYield = tree.taggedYield();
  int limit = Math.min(treeYield.size(), tagged.size());
  int mismatches = 0;
  for (int pos = 0; pos < limit; pos++) {
    String basicTag = op.langpack().basicCategory(treeYield.get(pos).tag());
    if (basicTag.equals(tagged.get(pos).tag())) {
      continue;
    }
    mismatches++;
  }
  return mismatches * weight;
}
// Closes the enclosing class, whose header lies outside this view.
}
/**
 * Converts a list of tagged words into a list of {@code IntTaggedWord}s,
 * building each one from the word and tag strings together with
 * {@code wordIndex} and {@code tagIndex}.
 *
 * @param taggedWords the tagged words to convert
 * @return a new list with one {@code IntTaggedWord} per input element, in input order
 */
protected List<IntTaggedWord> listToEvents(List<TaggedWord> taggedWords) {
  List<IntTaggedWord> events = new ArrayList<>();
  for (TaggedWord taggedWord : taggedWords) {
    events.add(new IntTaggedWord(taggedWord.word(), taggedWord.tag(), wordIndex, tagIndex));
  }
  return events;
}
/**
 * Tags a sentence of CoreLabels and writes the resulting tags back onto the
 * CoreLabels, modifying the input sentence. If {@code reuseTags} is true,
 * any tags supplied with the CoreLabels are taken as correct.
 *
 * @param sentence the tokens to tag; each receives its tag via {@code setTag}
 * @param reuseTags forwarded to {@code tagSentence}
 * @throws AssertionError if the tagger returns a list whose length differs
 *         from the sentence length
 */
public void tagCoreLabels(List<CoreLabel> sentence, boolean reuseTags) {
  List<TaggedWord> tagging = tagSentence(sentence, reuseTags);
  if (tagging.size() != sentence.size()) {
    throw new AssertionError("Tagged word list not the same length " +
                             "as the original sentence");
  }
  int tokenCount = sentence.size();
  for (int pos = 0; pos < tokenCount; pos++) {
    String tag = tagging.get(pos).tag();
    sentence.get(pos).setTag(tag);
  }
}
/**
 * Re-tags a tree in place: the tree's words are run through the tagger and
 * each preterminal label's value is overwritten with the newly assigned tag.
 *
 * @param tree the tree whose preterminal labels are updated
 * @param tagger the tagger applied to the tree's words
 * @throws AssertionError if the tagger output and the preterminal yield
 *         differ in length
 */
public static void redoTags(Tree tree, Tagger tagger) {
  List<Word> leafWords = tree.yieldWords();
  List<TaggedWord> retagged = tagger.apply(leafWords);
  List<Label> preterminals = tree.preTerminalYield();
  if (preterminals.size() != retagged.size()) {
    throw new AssertionError("Tags are not the same size");
  }
  int pos = 0;
  while (pos < preterminals.size()) {
    preterminals.get(pos).setValue(retagged.get(pos).tag());
    pos++;
  }
}
/**
 * Returns a copy of the tagged words in which every tag is replaced by its
 * basic category according to the given language pack. The input list and
 * its elements are not modified.
 *
 * @param twList the tagged words to clean
 * @param tlp the language pack supplying {@code basicCategory}
 * @return a new list of new {@code TaggedWord}s, in input order
 */
private static List<TaggedWord> cleanTags(List<TaggedWord> twList, TreebankLanguagePack tlp) {
  List<TaggedWord> cleaned = new ArrayList<>(twList.size());
  for (TaggedWord original : twList) {
    String word = original.word();
    String basicTag = tlp.basicCategory(original.tag());
    cleaned.add(new TaggedWord(word, basicTag));
  }
  return cleaned;
}
/**
 * Builds a {@code WordTag} carrying the same word and tag as the given
 * {@code TaggedWord}.
 *
 * @param tw the tagged word to convert
 * @return a new {@code WordTag} for the word/tag pair
 */
private static WordTag toWordTag(TaggedWord tw) {
  String word = tw.word();
  String tag = tw.tag();
  return new WordTag(word, tag);
}
/**
 * Prints, one per line, every tagged word in the treebank whose tag is
 * accepted by the language pack's punctuation tag filter. If no language
 * pack is given, a message is logged and nothing is printed.
 *
 * @param treebank the trees to scan
 * @param tlp the language pack providing the punctuation tag filter; may be null
 * @param pw where the matching tagged words are written
 */
private static void printPunct(Treebank treebank, TreebankLanguagePack tlp, PrintWriter pw) {
  if (tlp == null) {
    log.info("The -punct option requires you to specify -tlp");
    return;
  }
  Predicate<String> isPunctuationTag = tlp.punctuationTagAcceptFilter();
  for (Tree tree : treebank) {
    for (TaggedWord taggedWord : tree.taggedYield()) {
      if (isPunctuationTag.test(taggedWord.tag())) {
        pw.println(taggedWord);
      }
    }
  }
}
/**
 * Evaluates tag guesses against gold tags. For each test word, the guess is
 * the basic category (via {@code ctlp}) of {@code getTag(word)} and it is
 * compared with the word's own tag.
 *
 * @param testWords the gold-tagged words to evaluate on
 * @return a two-element array: index 0 is the number of words tested,
 *         index 1 is the number whose guessed tag equals the gold tag
 */
private int[] testOnTreebank(Collection<TaggedWord> testWords) {
  int total = 0;
  int correct = 0;
  for (TaggedWord testWord : testWords) {
    String gold = testWord.tag();
    String guess = ctlp.basicCategory(getTag(testWord.word()));
    total++;
    if (gold.equals(guess)) {
      correct++;
    }
  }
  return new int[] {total, correct};
}
/**
 * Collects the tagged yield of a tree as {@code WordTag}s, keeping each
 * word exactly as it appears (no stemming).
 *
 * @param t a tree
 * @return one {@code WordTag} per element of the tree's tagged yield, in order
 */
private static List<WordTag> getNonStemmedWordTagsFromTree(Tree t) {
  List<WordTag> result = Generics.newArrayList();
  for (TaggedWord leaf : t.taggedYield()) {
    result.add(new WordTag(leaf.word(), leaf.tag()));
  }
  return result;
}
public void train(TaggedWord tw, double weight) { tokens = tokens + weight; String word = tw.word(); String tag = tw.tag(); // TaggedWord has crummy equality conditions Pair<String,String> wt = new Pair<>(word, tag); wtCount.incrementCount(wt, weight); tagCount.incrementCount(tag, weight); seenWords.add(word); }
/**
 * Collects the tagged yield of a tree as {@code WordTag}s, passing each
 * word and its tag through {@code Morphology.stemStatic}.
 *
 * @param t a tree
 * @return the WordTags corresponding to the leaves of the tree,
 *         stemmed according to their POS tags in the tree
 */
private static List<WordTag> getStemmedWordTagsFromTree(Tree t) {
  List<WordTag> result = Generics.newArrayList();
  for (TaggedWord leaf : t.taggedYield()) {
    result.add(Morphology.stemStatic(leaf.word(), leaf.tag()));
  }
  return result;
}
/**
 * Adds the given sentence to the accumulated statistics. May be called
 * repeatedly with different sentences.
 *
 * @param sentence the tagged words to count
 * @param weight the amount added to {@code datumCounter} for each tagged
 *        word; also forwarded to {@code featExtractor}
 */
@Override
public void train(List<TaggedWord> sentence, double weight) {
  // The feature extractor sees the whole sentence before any per-word counting.
  featExtractor.train(sentence, weight);

  for (TaggedWord taggedWord : sentence) {
    datumCounter.incrementCount(taggedWord, weight);
    // Remember which tags have been observed with this word.
    tagsForWord.add(taggedWord.word(), taggedWord.tag());
  }
}
/**
 * Trains on one tagged sentence. The sentence is passed to {@code lex} with
 * weight 1.0, every tag is added to {@code tagIndex}, the first tag is
 * counted in {@code initial}, and each following tag is counted in
 * {@code ruleCounter} as a (previous tag, current tag) pair.
 *
 * @param sentence the tagged words of the sentence, in order
 */
@Override
public void train(List<TaggedWord> sentence) {
  lex.train(sentence, 1.0);

  String previousTag = null;
  for (TaggedWord taggedWord : sentence) {
    String currentTag = taggedWord.tag();
    tagIndex.add(currentTag);
    if (previousTag != null) {
      ruleCounter.incrementCount2D(previousTag, currentTag);
    } else {
      // No previous tag to pair with: count this tag as an initial one.
      initial.incrementCount(currentTag);
    }
    previousTag = currentTag;
  }
}
/**
 * Counts how often each word occurs with each tag across the treebank and
 * prints one line per word: the word, a tab, then for every tag seen with
 * that word the tag, a tab, its count and another tab.
 *
 * @param tb the treebank whose tagged yields are counted
 * @param pw where the table is written
 */
private static void countTaggings(Treebank tb, final PrintWriter pw) {
  final TwoDimensionalCounter<String,String> wordTagCounts = new TwoDimensionalCounter<>();
  tb.apply(tree -> {
    for (TaggedWord taggedWord : tree.taggedYield()) {
      wordTagCounts.incrementCount(taggedWord.word(), taggedWord.tag());
    }
  });

  for (String word : wordTagCounts.firstKeySet()) {
    pw.print(word);
    pw.print('\t');
    Counter<String> tagCounts = wordTagCounts.getCounter(word);
    for (String tag : tagCounts.keySet()) {
      pw.print(tag);
      pw.print('\t');
      pw.print(tagCounts.getCount(tag));
      pw.print('\t');
    }
    pw.println();
  }
}
/** Turns a sentence into a flat phrasal tree.
 *  The structure is S -> tag*, and each tag node dominates a single word.
 *  For a {@code TaggedWord} the tag node carries a {@code StringLabel}
 *  holding the word's tag; for any other word the tag node is labelled
 *  "WD" using the given LabelFactory. Word leaves are labelled through the
 *  LabelFactory and the root "S" node has a {@code StringLabel}.
 *
 *  @param s The Sentence to make the Tree from
 *  @param lf The LabelFactory with which to create the new Tree labels
 *  @return The one phrasal level Tree
 */
public static Tree toFlatTree(List<? extends HasWord> s, LabelFactory lf) {
  List<Tree> preterminals = new ArrayList<>(s.size());
  for (HasWord token : s) {
    Tree leaf = new LabeledScoredTreeNode(lf.newLabel(token.word()));
    final Tree preterminal;
    if (!(token instanceof TaggedWord)) {
      // Untagged word: fall back to the placeholder tag.
      preterminal = new LabeledScoredTreeNode(lf.newLabel("WD"),
                                              Collections.singletonList(leaf));
    } else {
      String tag = ((TaggedWord) token).tag();
      preterminal = new LabeledScoredTreeNode(new StringLabel(tag),
                                              Collections.singletonList(leaf));
    }
    preterminals.add(preterminal);
  }
  return new LabeledScoredTreeNode(new StringLabel("S"), preterminals);
}
/**
 * Trains on a single tagged word with the given weight.
 *
 * <p>The word is first forwarded to {@code uwModelTrainer}. Then
 * {@code seenCounter} is incremented by {@code weight} for four entries:
 * the (word, tag) pair, the tag paired with {@code nullWord}, the word
 * paired with {@code nullTag}, and the ({@code nullWord}, {@code nullTag})
 * entry. Finally the raw tag's count is incremented in the counter stored
 * in {@code baseTagCounts} under the tag's basic category (as given by
 * {@code op.langpack().basicCategory}); that counter is created and stored
 * if it does not exist yet.
 *
 * <p>NOTE(review): in this flattened one-line form it is not visible where
 * the inline {@code // rules.add(iTW);} comment ends, so whether
 * {@code tags.add(iT)} and {@code words.add(iW)} are live statements should
 * be confirmed against the originally formatted source.
 *
 * @param tw the tagged word to train on
 * @param loc passed through unchanged to {@code uwModelTrainer.train}
 * @param weight the amount added to each affected count
 */
@Override public void train(TaggedWord tw, int loc, double weight) { uwModelTrainer.train(tw, loc, weight); IntTaggedWord iTW = new IntTaggedWord(tw.word(), tw.tag(), wordIndex, tagIndex); seenCounter.incrementCount(iTW, weight); IntTaggedWord iT = new IntTaggedWord(nullWord, iTW.tag); seenCounter.incrementCount(iT, weight); IntTaggedWord iW = new IntTaggedWord(iTW.word, nullTag); seenCounter.incrementCount(iW, weight); IntTaggedWord i = new IntTaggedWord(nullWord, nullTag); seenCounter.incrementCount(i, weight); // rules.add(iTW); tags.add(iT); words.add(iW); String tag = tw.tag(); String baseTag = op.langpack().basicCategory(tag); Counter<String> counts = baseTagCounts.get(baseTag); if (counts == null) { counts = new ClassicCounter<>(); baseTagCounts.put(baseTag, counts); } counts.incrementCount(tag, weight); }
/**
 * Tags the given tokens and returns one CoreLabel per tagged word, each
 * carrying the word and its tag and then passed through
 * {@code Morphology.stem}. Tags come from the loaded tagger when the
 * {@code preTag} test option is set, otherwise from the tagged yield of the
 * parse of the tokens.
 *
 * <p>Only works on English, as it is hard coded for using the
 * Morphology class, which is English-only.
 *
 * @param tokens the words to lemmatize
 * @return a new list of CoreLabels, one per tagged word, in order
 */
public List<CoreLabel> lemmatize(List<? extends HasWord> tokens) {
  final List<TaggedWord> taggedTokens;
  if (!getOp().testOptions.preTag) {
    // No pre-tagger requested: take the tags from the parse.
    taggedTokens = parse(tokens).taggedYield();
  } else {
    Function<List<? extends HasWord>, List<TaggedWord>> preTagger = loadTagger();
    taggedTokens = preTagger.apply(tokens);
  }

  Morphology stemmer = new Morphology();
  List<CoreLabel> result = Generics.newArrayList();
  for (TaggedWord taggedToken : taggedTokens) {
    CoreLabel lemma = new CoreLabel();
    lemma.setWord(taggedToken.word());
    lemma.setTag(taggedToken.tag());
    stemmer.stem(lemma);
    result.add(lemma);
  }
  return result;
}
/**
 * Trains on a sentence by spreading each word's weight over every tag
 * recorded in {@code baseTagCounts} for the basic category of the word's
 * tag. For each such tag, {@code train} is called with a new
 * {@code TaggedWord} and the weight
 * {@code weight * count(tag) / totalCount}. Words whose basic category has
 * no counter, or a counter with a total of zero, contribute nothing but
 * still advance the position.
 *
 * @param sentence the tagged words of the sentence, in order
 * @param weight the weight of the sentence; also passed to
 *        {@code uwModelTrainer.incrementTreesRead}
 */
@Override
public final void trainUnannotated(List<TaggedWord> sentence, double weight) {
  uwModelTrainer.incrementTreesRead(weight);

  int loc = 0;
  for (TaggedWord tw : sentence) {
    String baseTag = op.langpack().basicCategory(tw.tag());
    Counter<String> counts = baseTagCounts.get(baseTag);
    if (counts != null) {
      // The total is read once, before any train() call made in the loop below.
      double totalCount = counts.totalCount();
      if (totalCount != 0) {
        for (String tag : counts.keySet()) {
          TaggedWord newTW = new TaggedWord(tw.word(), tag);
          train(newTW, loc, weight * counts.getCount(tag) / totalCount);
        }
      }
    }
    // The position advances for every word, trained or not.
    ++loc;
  }
}
/** * Trains this lexicon on the Collection of trees. */ public void train(TaggedWord tw, int loc, double weight) { IntTaggedWord iTW = new IntTaggedWord(tw.word(), tw.tag(), wordIndex, tagIndex); IntTaggedWord iT = new IntTaggedWord(nullWord, iTW.tag); IntTaggedWord iW = new IntTaggedWord(iTW.word, nullTag); seenCounter.incrementCount(iW, weight); IntTaggedWord i = NULL_ITW; if (treesRead > indexToStartUnkCounting) { // start doing this once some way through trees; // treesRead is 1 based counting if (seenCounter.getCount(iW) < 2) { // it's an entirely unknown word int s = model.getSignatureIndex(iTW.word, loc, wordIndex.get(iTW.word)); IntTaggedWord iTS = new IntTaggedWord(s, iTW.tag); IntTaggedWord iS = new IntTaggedWord(s, nullTag); unSeenCounter.incrementCount(iTS, weight); unSeenCounter.incrementCount(iT, weight); unSeenCounter.incrementCount(iS, weight); unSeenCounter.incrementCount(i, weight); } } }