/**
 * Builds a new n-gram made of all tokens of {@code ngram}, in order, followed
 * by {@code word} as the last token.
 *
 * @param ngram the n-gram to extend
 * @param word the token to append
 * @return a new {@link StringList} with one more token than {@code ngram}
 */
private static StringList getNPlusOneNgram(StringList ngram, String word) {
  final int originalLength = ngram.size();
  final String[] extended = new String[originalLength + 1];

  int position = 0;
  while (position < originalLength) {
    extended[position] = ngram.getToken(position);
    position++;
  }
  // The appended word occupies the single remaining slot.
  extended[originalLength] = word;

  return new StringList(extended);
}
/**
 * Checks that {@link StringList#compareToIgnoreCase(StringList)} reports
 * {@code true} for two lists whose tokens differ only in letter case.
 */
@Test
public void testCompareToIgnoreCase() {
  StringList lowerCase = new StringList("a", "b");
  StringList upperCase = new StringList("A", "B");

  Assert.assertTrue(lowerCase.compareToIgnoreCase(upperCase));
}
/**
 * Checks {@link StringList#equals(Object)}: lists with the same tokens are
 * equal, lists whose tokens differ in letter case are not.
 */
@Test
public void testEquals() {
  StringList expected = new StringList("a", "b");
  StringList sameTokens = new StringList("a", "b");
  Assert.assertEquals(expected, sameTokens);

  StringList lowerCase = new StringList("a", "b");
  StringList upperCase = new StringList("A", "B");
  Assert.assertFalse(lowerCase.equals(upperCase));
}
/**
 * Computes the maximum likelihood probability of a unigram in a vocabulary:
 * the count of the unigram divided by the total number of tokens in all
 * sentences of the vocabulary.
 *
 * @param word the only word in the unigram
 * @param set the vocabulary
 * @return the maximum likelihood probability
 */
public static double calculateUnigramMLProbability(String word, Collection<StringList> set) {
  // Total number of tokens across every sentence of the vocabulary.
  double totalTokens = 0d;
  for (StringList sentence : set) {
    totalTokens += sentence.size();
  }

  final StringList unigram = new StringList(word);
  return count(unigram, set) / totalTokens;
}
/**
 * Reports whether {@code obj} is a {@link String} whose single-token
 * {@code StringList}, wrapped in a {@code StringListWrapper}, is present in
 * {@code entrySet}. Any non-String argument yields {@code false}.
 */
@Override
public boolean contains(Object obj) {
  if (!(obj instanceof String)) {
    return false;
  }
  StringListWrapper key = new StringListWrapper(new StringList((String) obj));
  return entrySet.contains(key);
}
};
/**
 * Extends the context produced by the superclass with token n-gram features.
 *
 * <p>The document is normalized, tokenized with {@code SimpleTokenizer}, and
 * the tokens are fed to an {@code NGramModel} (called with bounds 1 and 3,
 * presumably the minimum and maximum n-gram length). Every non-empty n-gram
 * of the model is appended to the context as {@code "tg=" + ngram}.
 *
 * @param document the text to extract features from
 * @return the superclass context followed by the token n-gram features
 */
@Override
public String[] getContext(CharSequence document) {
  String[] superContext = super.getContext(document);
  // Parameterized copy: the original raw ArrayList caused an unchecked conversion.
  List<String> context = new ArrayList<String>(Arrays.asList(superContext));

  document = this.normalizer.normalize(document);
  String[] words = SimpleTokenizer.INSTANCE.tokenize(document.toString());

  if (words.length > 0) {
    // Only build the model when there is at least one token to add.
    NGramModel tokenNgramModel = new NGramModel();
    tokenNgramModel.add(new StringList(words), 1, 3);

    // Wildcard instead of a raw Iterator; the element cast is kept as in the original.
    Iterator<?> tokenNgramIterator = tokenNgramModel.iterator();
    while (tokenNgramIterator.hasNext()) {
      StringList tokenList = (StringList) tokenNgramIterator.next();
      if (tokenList.size() > 0) {
        context.add("tg=" + tokenList.toString());
      }
    }
  }

  return context.toArray(new String[context.size()]);
}
}
/**
 * Tests {@link StringList} which uses {@link String#intern}: the same token
 * text stored in two different lists must be the very same {@code String}
 * instance.
 */
@Test
public void testIntern() {
  StringList l1 = new StringList("a");
  StringList l2 = new StringList("a", "b");

  // assertSame checks reference identity and reports both objects on failure,
  // unlike assertTrue(x == y) which only reports "expected true".
  Assert.assertSame(l1.getToken(0), l2.getToken(0));
}
/**
 * Checks the text produced by {@link StringList#toString()} for a list with
 * one token and for a list with two tokens.
 */
@Test
public void testToString() {
  StringList single = new StringList("a");
  Assert.assertEquals("[a]", single.toString());

  StringList pair = new StringList("a", "b");
  Assert.assertEquals("[a,b]", pair.toString());
}
}
/**
 * Checks {@link StringList#hashCode()}: lists with the same tokens share a
 * hash code, and the lists ("a", "b") and ("a", "c") do not.
 */
@Test
public void testHashCode() {
  int firstHash = new StringList("a", "b").hashCode();
  int sameTokensHash = new StringList("a", "b").hashCode();
  Assert.assertEquals(firstHash, sameTokensHash);

  int abHash = new StringList("a", "b").hashCode();
  int acHash = new StringList("a", "c").hashCode();
  Assert.assertNotEquals(abHash, acHash);
}
/** * Tests {@link StringList#iterator()}. */ @Test public void testIterator() { StringList l = new StringList("a"); Iterator<String> it = l.iterator(); Assert.assertTrue(it.hasNext()); Assert.assertEquals("a", it.next()); Assert.assertFalse(it.hasNext()); // now test with more than one string l = new StringList("a", "b", "c"); it = l.iterator(); Assert.assertTrue(it.hasNext()); Assert.assertEquals("a", it.next()); Assert.assertTrue(it.hasNext()); Assert.assertEquals("b", it.next()); Assert.assertTrue(it.hasNext()); Assert.assertEquals("c", it.next()); Assert.assertFalse(it.hasNext()); }
/**
 * Reports whether the current index is still below {@code size()}.
 *
 * @return {@code true} while {@code index < size()}
 */
public boolean hasNext() {
  final boolean moreTokens = index < size();
  return moreTokens;
}
/**
 * Compares this wrapper with another object.
 *
 * <p>The same instance is always equal. Another {@code StringListWrapper} is
 * equal when the wrapped lists match: via {@code equals} if
 * {@code isCaseSensitive} is set, via {@code compareToIgnoreCase} otherwise.
 * Any other object is not equal.
 *
 * @param obj the object to compare with
 * @return {@code true} if the objects are considered equal
 */
@Override
public boolean equals(Object obj) {
  if (obj == this) {
    return true;
  }
  if (!(obj instanceof StringListWrapper)) {
    return false;
  }

  StringListWrapper that = (StringListWrapper) obj;
  return isCaseSensitive
      ? this.stringList.equals(that.getStringList())
      : this.stringList.compareToIgnoreCase(that.getStringList());
}
/**
 * Returns the token at the current index and advances the index by one.
 *
 * @return the next token
 * @throws NoSuchElementException if {@code hasNext()} is {@code false}
 */
public String next() {
  if (!hasNext()) {
    throw new NoSuchElementException();
  }
  final String token = getToken(index++);
  return token;
}
/**
 * Delegates to the wrapped list.
 *
 * @return the result of {@code toString()} on the wrapped {@code stringList}
 */
@Override
public String toString() {
  return stringList.toString();
}
}
for (Iterator<String> it = tokens.iterator(); it.hasNext(); ) {
/**
 * Computes the maximum likelihood probability of a bigram in a vocabulary by
 * delegating to {@code calculateNgramMLProbability} with the two-token n-gram
 * {@code (x0, x1)}.
 *
 * @param x0 first word in the bigram
 * @param x1 second word in the bigram
 * @param set the vocabulary
 * @return the maximum likelihood probability
 */
public static double calculateBigramMLProbability(String x0, String x1,
    Collection<StringList> set) {
  final StringList bigram = new StringList(x0, x1);
  return calculateNgramMLProbability(bigram, set);
}
/**
 * Counts the occurrences of an n-gram in a collection of sentences.
 *
 * <p>Every start position of every sentence is examined, so an n-gram that
 * appears several times in one sentence is counted each time (overlapping
 * occurrences included). The previous implementation only inspected the
 * position of the first occurrence of the n-gram's first token, so it missed
 * later matches and never counted more than one per sentence.
 *
 * <p>NOTE(review): the first token used to be located through
 * {@code indexOf(sentence, token)}; this version compares all tokens with
 * {@code equals}, like the original did for the remaining tokens. Confirm
 * that {@code indexOf} used the same comparison.
 *
 * @param ngram the n-gram to look for
 * @param sentences the sentences to search
 * @return the number of occurrences; {@code 0} for an n-gram without tokens
 */
private static Double count(StringList ngram, Iterable<StringList> sentences) {
  final int ngramLength = ngram.size();
  if (ngramLength == 0) {
    // An n-gram without tokens cannot occur anywhere.
    return 0d;
  }

  // Primitive accumulator: avoids unboxing and re-boxing a Double on every hit.
  double occurrences = 0d;
  for (StringList sentence : sentences) {
    // Last index at which the whole n-gram still fits into the sentence.
    final int lastStart = sentence.size() - ngramLength;
    for (int start = 0; start <= lastStart; start++) {
      boolean match = true;
      // Stop comparing as soon as one token differs.
      for (int i = 0; match && i < ngramLength; i++) {
        match = sentence.getToken(start + i).equals(ngram.getToken(i));
      }
      if (match) {
        occurrences++;
      }
    }
  }
  return occurrences;
}
/**
 * Maximum likelihood estimate of a unigram's probability in a vocabulary.
 * The result is the unigram's count divided by the number of tokens summed
 * over every sentence of the vocabulary.
 *
 * @param word the only word in the unigram
 * @param set the vocabulary
 * @return the maximum likelihood probability
 */
public static double calculateUnigramMLProbability(String word, Collection<StringList> set) {
  double vocabularyTokenCount = 0d;
  for (StringList entry : set) {
    vocabularyTokenCount += entry.size();
  }

  // The unigram is wrapped in a one-token list so it can be counted like any n-gram.
  final StringList singleWord = new StringList(word);
  return count(singleWord, set) / vocabularyTokenCount;
}