/**
 * Returns this item's word and tag joined by the given divider string.
 *
 * @param divider the separator placed between word and tag
 * @return word + divider + tag
 */
public String toString(String divider) {
  final String w = word();
  return w + divider + tag;
}
/**
 * Merges the words of the tree's tagged yield into one underscore-joined
 * string (e.g. "New_York_City").
 *
 * @param t the tree whose leaf words are merged
 * @return the underscore-joined words, or the empty string for an empty yield
 *     (the old {@code sb.substring(0, sb.length() - 1)} threw
 *     StringIndexOutOfBoundsException on an empty yield)
 */
private static String mergeLeavesIntoCollocatedString(Tree t) {
  StringBuilder sb = new StringBuilder(160);
  for (TaggedWord tw : t.taggedYield()) {
    if (sb.length() > 0) {
      sb.append('_');
    }
    sb.append(tw.word());
  }
  return sb.toString();
}
/**
 * Counts every word of the sentence into {@code wordCounter} with the
 * given weight; tags are ignored here.
 *
 * @param sentence tagged words to tally
 * @param weight count increment applied to each word
 */
public void train(List<TaggedWord> sentence, double weight) {
  for (TaggedWord tw : sentence) {
    wordCounter.incrementCount(tw.word(), weight);
  }
}
/**
 * Merges the words of all trees' tagged yields into one underscore-joined
 * string, in tree order.
 *
 * @param trees the trees whose leaf words are merged
 * @return the underscore-joined words, or the empty string when no tree
 *     yields any word (the old {@code sb.substring(0, sb.length() - 1)}
 *     threw StringIndexOutOfBoundsException in that case)
 */
private static String mergeLeavesIntoCollocatedString(Tree[] trees) {
  StringBuilder sb = new StringBuilder(160);
  for (Tree t : trees) {
    for (TaggedWord tw : t.taggedYield()) {
      if (sb.length() > 0) {
        sb.append('_');
      }
      sb.append(tw.word());
    }
  }
  return sb.toString();
}
/**
 * Given a line, split it into tagged words and add each word to
 * the given tagWordMap. Only tags present in {@code closedTags} are
 * recorded; a null {@code closedTags} means every tag is accepted.
 *
 * @param line the tagged words to add
 * @param tagWordMap map from tag to the sorted set of words seen with it
 */
void addTaggedWords(List<TaggedWord> line, Map<String, Set<String>> tagWordMap) {
  for (TaggedWord taggedWord : line) {
    String word = taggedWord.word();
    String tag = taggedWord.tag();
    if (closedTags == null || closedTags.contains(tag)) {
      // computeIfAbsent replaces the containsKey/put/get triple
      tagWordMap.computeIfAbsent(tag, k -> new TreeSet<>()).add(word);
    }
  }
}
/**
 * Converts a list of tagged words into IntTaggedWord events using the
 * lexicon's word and tag indices.
 *
 * @param taggedWords words to convert
 * @return one IntTaggedWord per input word, in order
 */
protected List<IntTaggedWord> listToEvents(List<TaggedWord> taggedWords) {
  // Presize to avoid resizing; matches the sizing convention used elsewhere
  List<IntTaggedWord> itwList = new ArrayList<>(taggedWords.size());
  for (TaggedWord tw : taggedWords) {
    itwList.add(new IntTaggedWord(tw.word(), tw.tag(), wordIndex, tagIndex));
  }
  return itwList;
}
/**
 * Replaces each tag in the list with its basic category, leaving the
 * words unchanged.
 *
 * @param twList tagged words to clean
 * @param tlp language pack supplying {@code basicCategory}
 * @return a new list of tagged words with simplified tags
 */
private static List<TaggedWord> cleanTags(List<TaggedWord> twList, TreebankLanguagePack tlp) {
  List<TaggedWord> cleaned = new ArrayList<>(twList.size());
  for (TaggedWord tw : twList) {
    cleaned.add(new TaggedWord(tw.word(), tlp.basicCategory(tw.tag())));
  }
  return cleaned;
}
/** Converts a TaggedWord into the equivalent WordTag. */
private static WordTag toWordTag(TaggedWord tw) {
  String word = tw.word();
  String tag = tw.tag();
  return new WordTag(word, tag);
}
/**
 * Scores the tagger against a collection of gold-tagged words.
 *
 * @param testWords gold-standard tagged words to evaluate on
 * @return a two-element array: index 0 holds the total number of words,
 *     index 1 the number tagged correctly (after mapping the guess to its
 *     basic category)
 */
private int[] testOnTreebank(Collection<TaggedWord> testWords) {
  // new int[2] is already zero-initialized in Java; explicit zeroing removed
  int[] totalAndCorrect = new int[2];
  for (TaggedWord word : testWords) {
    String goldTag = word.tag();
    String guessTag = ctlp.basicCategory(getTag(word.word()));
    totalAndCorrect[0]++;
    if (goldTag.equals(guessTag)) {
      totalAndCorrect[1]++;
    }
  }
  return totalAndCorrect;
}
/**
 * Extracts the (unstemmed) word/tag pairs from the leaves of the tree.
 *
 * @param t the tree to read
 * @return a WordTag for each leaf, in yield order
 */
private static List<WordTag> getNonStemmedWordTagsFromTree(Tree t) {
  List<WordTag> wordTags = Generics.newArrayList();
  for (TaggedWord leaf : t.taggedYield()) {
    wordTags.add(new WordTag(leaf.word(), leaf.tag()));
  }
  return wordTags;
}
public void train(TaggedWord tw, double weight) { tokens = tokens + weight; String word = tw.word(); String tag = tw.tag(); // TaggedWord has crummy equality conditions Pair<String,String> wt = new Pair<>(word, tag); wtCount.incrementCount(wt, weight); tagCount.incrementCount(tag, weight); seenWords.add(word); }
/**
 * @param t a tree
 * @return the WordTags corresponding to the leaves of the tree,
 *     stemmed according to their POS tags in the tree
 */
private static List<WordTag> getStemmedWordTagsFromTree(Tree t) {
  List<WordTag> stemmedWordTags = Generics.newArrayList();
  for (TaggedWord leaf : t.taggedYield()) {
    stemmedWordTags.add(Morphology.stemStatic(leaf.word(), leaf.tag()));
  }
  return stemmedWordTags;
}
/**
 * Add the given sentence to the statistics counted. Can
 * be called multiple times with different sentences.
 *
 * @param sentence tagged words to record
 * @param weight count increment for each datum
 */
@Override
public void train(List<TaggedWord> sentence, double weight) {
  featExtractor.train(sentence, weight);
  for (TaggedWord tw : sentence) {
    datumCounter.incrementCount(tw, weight);
    tagsForWord.add(tw.word(), tw.tag());
  }
}
/**
 * Tallies word/tag co-occurrence counts over the treebank and prints one
 * tab-separated line per word: the word, then alternating tag and count
 * fields for every tag seen with it.
 *
 * @param tb the treebank to count taggings over
 * @param pw destination for the tab-separated report
 */
private static void countTaggings(Treebank tb, final PrintWriter pw) {
  final TwoDimensionalCounter<String,String> wtc = new TwoDimensionalCounter<>();
  tb.apply(tree -> {
    for (TaggedWord tw : tree.taggedYield()) {
      wtc.incrementCount(tw.word(), tw.tag());
    }
  });
  for (String word : wtc.firstKeySet()) {
    pw.print(word);
    pw.print('\t');
    Counter<String> tagCounts = wtc.getCounter(word);
    for (String tag : tagCounts.keySet()) {
      pw.print(tag + '\t' + tagCounts.getCount(tag) + '\t');
    }
    pw.println();
  }
}
@Override public void train(TaggedWord tw, int loc, double weight) { uwModelTrainer.train(tw, loc, weight); IntTaggedWord iTW = new IntTaggedWord(tw.word(), tw.tag(), wordIndex, tagIndex); seenCounter.incrementCount(iTW, weight); IntTaggedWord iT = new IntTaggedWord(nullWord, iTW.tag); seenCounter.incrementCount(iT, weight); IntTaggedWord iW = new IntTaggedWord(iTW.word, nullTag); seenCounter.incrementCount(iW, weight); IntTaggedWord i = new IntTaggedWord(nullWord, nullTag); seenCounter.incrementCount(i, weight); // rules.add(iTW); tags.add(iT); words.add(iW); String tag = tw.tag(); String baseTag = op.langpack().basicCategory(tag); Counter<String> counts = baseTagCounts.get(baseTag); if (counts == null) { counts = new ClassicCounter<>(); baseTagCounts.put(baseTag, counts); } counts.incrementCount(tag, weight); }
/**
 * Only works on English, as it is hard coded for using the
 * Morphology class, which is English-only.
 *
 * @param tokens the words to lemmatize
 * @return CoreLabels carrying each token's word, POS tag, and lemma
 */
public List<CoreLabel> lemmatize(List<? extends HasWord> tokens) {
  // Obtain POS tags either from the standalone tagger or by parsing
  final List<TaggedWord> tagged;
  if (getOp().testOptions.preTag) {
    Function<List<? extends HasWord>, List<TaggedWord>> tagger = loadTagger();
    tagged = tagger.apply(tokens);
  } else {
    tagged = parse(tokens).taggedYield();
  }

  Morphology morpha = new Morphology();
  List<CoreLabel> lemmas = Generics.newArrayList();
  for (TaggedWord tw : tagged) {
    CoreLabel label = new CoreLabel();
    label.setWord(tw.word());
    label.setTag(tw.tag());
    morpha.stem(label);  // sets the lemma on the label in place
    lemmas.add(label);
  }
  return lemmas;
}
/**
 * Trains on an unannotated (tag-unreliable) sentence by redistributing each
 * word's weight over all full tags previously seen for the word's basic
 * category, proportionally to their observed counts. Words whose basic
 * category has no recorded counts are skipped; the sentence position
 * {@code loc} advances for every word regardless.
 *
 * @param sentence the tagged sentence to train from
 * @param weight total weight to distribute per word
 */
@Override
public final void trainUnannotated(List<TaggedWord> sentence, double weight) {
  uwModelTrainer.incrementTreesRead(weight);
  int loc = 0;
  for (TaggedWord tw : sentence) {
    String baseTag = op.langpack().basicCategory(tw.tag());
    Counter<String> counts = baseTagCounts.get(baseTag);
    // Skip words with no usable distribution, but still advance loc
    if (counts != null && counts.totalCount() != 0) {
      double totalCount = counts.totalCount();
      for (String tag : counts.keySet()) {
        double share = counts.getCount(tag) / totalCount;
        train(new TaggedWord(tw.word(), tag), loc, weight * share);
      }
    }
    ++loc;
  }
}
/**
 * Trains this lexicon on a single tagged word at sentence position
 * {@code loc}. Increments the seen count for the word marginal, and —
 * once enough trees have been read — also accumulates unknown-word
 * signature counts for words seen fewer than twice.
 *
 * NOTE(review): the original javadoc said "Collection of trees", but this
 * method handles one TaggedWord; presumably the tree-level train() calls it
 * per leaf — confirm against the caller.
 */
public void train(TaggedWord tw, int loc, double weight) { IntTaggedWord iTW = new IntTaggedWord(tw.word(), tw.tag(), wordIndex, tagIndex); IntTaggedWord iT = new IntTaggedWord(nullWord, iTW.tag); IntTaggedWord iW = new IntTaggedWord(iTW.word, nullTag); seenCounter.incrementCount(iW, weight); // word marginal counted unconditionally; iTW/iT themselves are not added to seenCounter here
 IntTaggedWord i = NULL_ITW; if (treesRead > indexToStartUnkCounting) { // start doing this once some way through trees; // treesRead is 1 based counting
 if (seenCounter.getCount(iW) < 2) { // it's an entirely unknown word (count includes the increment just made above)
 int s = model.getSignatureIndex(iTW.word, loc, wordIndex.get(iTW.word)); // map the unknown word to its signature class
 IntTaggedWord iTS = new IntTaggedWord(s, iTW.tag); IntTaggedWord iS = new IntTaggedWord(s, nullTag); // count signature+tag, signature marginal, tag marginal, and the overall total
 unSeenCounter.incrementCount(iTS, weight); unSeenCounter.incrementCount(iT, weight); unSeenCounter.incrementCount(iS, weight); unSeenCounter.incrementCount(i, weight); } } }