/**
 * Computes the maximum likelihood probability of a bigram over a vocabulary.
 *
 * <p>The two words are wrapped in a {@link StringList} and the computation is delegated to
 * {@code calculateNgramMLProbability}.
 *
 * @param x0  first word in the bigram
 * @param x1  second word in the bigram
 * @param set the vocabulary
 * @return the maximum likelihood probability
 */
public static double calculateBigramMLProbability(String x0, String x1,
                                                  Collection<StringList> set) {
  StringList bigram = new StringList(x0, x1);
  return calculateNgramMLProbability(bigram, set);
}
/**
 * Builds the next entry from the underlying iterator.
 *
 * <p>The entry's token list holds the single word taken from {@code iterator}; its
 * {@code "tags"} attribute is the result of {@code tagsToString(getTags(word))}.
 *
 * @return an entry pairing the word with its serialized tags
 */
public Entry next() {
  String currentWord = iterator.next();

  Attributes entryAttributes = new Attributes();
  entryAttributes.setValue("tags", tagsToString(getTags(currentWord)));

  StringList tokens = new StringList(currentWord);
  return new Entry(tokens, entryAttributes);
}
/**
 * Builds the next entry from the underlying iterator.
 *
 * <p>The entry's token list holds the single token taken from {@code iterator}; its
 * {@code "operation"} attribute is the string form of {@code getOperation(token)}.
 *
 * @return an entry pairing the token with its operation
 */
public Entry next() {
  String currentToken = iterator.next();

  Attributes entryAttributes = new Attributes();
  String operationName = getOperation(currentToken).toString();
  entryAttributes.setValue("operation", operationName);

  StringList tokens = new StringList(currentToken);
  return new Entry(tokens, entryAttributes);
}
/**
 * Checks {@code NGramUtils.calculateNgramMLProbability} on a small corpus: the n-gram
 * (I, am, Sam) is expected to yield 0.5 and (Sam, I, am) to yield 1.0.
 */
@Test
public void testNgramMLProbability() {
  Collection<StringList> corpus = new LinkedList<>();
  corpus.add(new StringList("<s>", "I", "am", "Sam", "</s>"));
  corpus.add(new StringList("<s>", "Sam", "I", "am", "</s>"));
  corpus.add(new StringList("<s>", "I", "do", "not", "like", "green", "eggs", "and", "ham", "</s>"));
  corpus.add(new StringList(""));

  StringList firstNgram = new StringList("I", "am", "Sam");
  Double firstProbability = NGramUtils.calculateNgramMLProbability(firstNgram, corpus);
  Assert.assertEquals(Double.valueOf(0.5), firstProbability);

  StringList secondNgram = new StringList("Sam", "I", "am");
  Double secondProbability = NGramUtils.calculateNgramMLProbability(secondNgram, corpus);
  Assert.assertEquals(Double.valueOf(1d), secondProbability);
}
/**
 * Populates {@code mDictionary} with the name entries used by this test class:
 * "Vanessa", "Vanessa Williams", "Max" and "Michael Jordan".
 */
public DictionaryNameFinderTest() {
  // Single-token names are passed as explicit arrays, multi-token names as varargs.
  mDictionary.put(new StringList(new String[]{"Vanessa"}));
  mDictionary.put(new StringList("Vanessa", "Williams"));
  mDictionary.put(new StringList(new String[]{"Max"}));
  mDictionary.put(new StringList("Michael", "Jordan"));
}
/**
 * Checks {@code NGramUtils.calculateTrigramMLProbability} on a small corpus: the trigram
 * (I, am, Sam) is expected to yield 0.5 and (Sam, I, am) to yield 1.0.
 */
@Test
public void testTrigramMLProbability() {
  Collection<StringList> corpus = new LinkedList<>();
  corpus.add(new StringList("<s>", "I", "am", "Sam", "</s>"));
  corpus.add(new StringList("<s>", "Sam", "I", "am", "</s>"));
  corpus.add(new StringList("<s>", "I", "do", "not", "like", "green", "eggs", "and", "ham", "</s>"));
  corpus.add(new StringList(""));

  Double firstProbability = NGramUtils.calculateTrigramMLProbability("I", "am", "Sam", corpus);
  Assert.assertEquals(Double.valueOf(0.5), firstProbability);

  Double secondProbability = NGramUtils.calculateTrigramMLProbability("Sam", "I", "am", corpus);
  Assert.assertEquals(Double.valueOf(1d), secondProbability);
}
/**
 * Checks {@code NGramUtils.calculateTrigramLinearInterpolationProbability} for the trigram
 * (N, V, STOP) over two identical sequences, with all three lambdas set to one third;
 * the expected result is 0.75.
 */
@Test
public void testLinearInterpolation2() throws Exception {
  Collection<StringList> corpus = new LinkedList<>();
  corpus.add(new StringList("D", "N", "V", "STOP"));
  corpus.add(new StringList("D", "N", "V", "STOP"));

  // The same weight is used for every interpolation term.
  Double weight = 1d / 3d;
  Double probability = NGramUtils.calculateTrigramLinearInterpolationProbability(
      "N", "V", "STOP", corpus, weight, weight, weight);

  Assert.assertNotNull(probability);
  Assert.assertEquals("wrong result", Double.valueOf(0.75d), probability);
}
/**
 * Tests that {@link StringList} relies on {@link String#intern}: the first token of two
 * separately built lists must be the very same object.
 */
@Test
public void testIntern() {
  StringList single = new StringList("a");
  StringList pair = new StringList("a", "b");

  // Reference comparison is deliberate: the test is about identity, not equality.
  boolean sameInstance = single.getToken(0) == pair.getToken(0);
  Assert.assertTrue(sameInstance);
}
/**
 * Tests that an entry stored in the dictionary returned by {@code getCaseInsensitive()}
 * is found when looked up with tokens of different case.
 */
@Test
public void testDifferentCaseLookup() {
  StringList lowerCased = new StringList("1a", "1b");
  StringList upperCased = new StringList("1A", "1B");

  Dictionary dictionary = getCaseInsensitive();
  dictionary.put(lowerCased);

  boolean found = dictionary.contains(upperCased);
  Assert.assertTrue(found);
}
/**
 * Builds a dictionary from the sentences of the given samples.
 *
 * <p>Every non-empty sentence is added to an {@link NGramModel} with arguments
 * {@code (1, 1)}; the model is then pruned via {@code cutoff(cutoff, Integer.MAX_VALUE)}
 * and converted with {@code toDictionary(true)}.
 *
 * @param samples the stream of samples to read until it returns {@code null}
 * @param cutoff  the lower bound passed to {@link NGramModel#cutoff}
 * @return the dictionary produced from the pruned model
 * @throws IOException if reading from {@code samples} fails
 */
public static Dictionary buildNGramDictionary(ObjectStream<POSSample> samples, int cutoff)
    throws IOException {

  NGramModel model = new NGramModel();

  for (POSSample current = samples.read(); current != null; current = samples.read()) {
    String[] sentence = current.getSentence();
    if (sentence.length == 0) {
      continue; // nothing to add for an empty sentence
    }
    model.add(new StringList(sentence), 1, 1);
  }

  model.cutoff(cutoff, Integer.MAX_VALUE);
  return model.toDictionary(true);
}
/**
 * Checks that after adding (the, bro, wn) with arguments (1, 3) the model contains the
 * single-token entry "the".
 */
@Test
public void testContains2() throws Exception {
  NGramModel model = new NGramModel();
  model.add(new StringList("the", "bro", "wn"), 1, 3);

  StringList unigram = new StringList("the");
  Assert.assertTrue(model.contains(unigram));
}
/**
 * Checks that adding (the, brown, fox) with arguments (2, 3) yields a count of one for the
 * full sequence and for both of its two-token sub-sequences, and a model size of three.
 */
@Test
public void testAdd3() throws Exception {
  NGramModel model = new NGramModel();
  model.add(new StringList("the", "brown", "fox"), 2, 3);

  int trigramCount = model.getCount(new StringList("the", "brown", "fox"));
  Assert.assertEquals(1, trigramCount);

  int leadingBigramCount = model.getCount(new StringList("the", "brown"));
  Assert.assertEquals(1, leadingBigramCount);

  int trailingBigramCount = model.getCount(new StringList("brown", "fox"));
  Assert.assertEquals(1, trailingBigramCount);

  Assert.assertEquals(3, model.size());
}
/**
 * Checks that adding (the, bro, wn) with arguments (2, 3) yields a count of one for the
 * full sequence and a model size of three.
 */
@Test
public void testAdd2() throws Exception {
  NGramModel model = new NGramModel();
  model.add(new StringList("the", "bro", "wn"), 2, 3);

  StringList fullSequence = new StringList("the", "bro", "wn");
  int occurrences = model.getCount(fullSequence);

  Assert.assertEquals(1, occurrences);
  Assert.assertEquals(3, model.size());
}
/**
 * Checks that looking up an entry that was never added ("fox") returns a count of zero,
 * while the model still reports a size of one for the single entry that was added.
 */
@Test
public void testZeroGetCount2() throws Exception {
  NGramModel model = new NGramModel();
  model.add(new StringList("the", "bro", "wn"));

  StringList absent = new StringList("fox");
  int occurrences = model.getCount(absent);

  Assert.assertEquals(0, occurrences);
  Assert.assertEquals(1, model.size());
}
/**
 * Checks that adding (the, bro, wn) with arguments (1, 3) makes
 * {@code numberOfGrams()} report six.
 */
@Test
public void testNumberOfGrams() throws Exception {
  NGramModel model = new NGramModel();
  model.add(new StringList("the", "bro", "wn"), 1, 3);

  Assert.assertEquals(6, model.numberOfGrams());
}
/**
 * Tests the {@link Dictionary#toString()} method by invoking it on the dictionary
 * returned by {@code getCaseInsensitive()}, once before and once after an entry is put.
 * Only successful completion is checked; the returned strings are not inspected.
 */
@Test
public void testToString() {
  Dictionary dictionary = getCaseInsensitive();

  dictionary.toString();

  dictionary.put(new StringList("1a", "1b"));
  dictionary.toString();
}
/**
 * Checks that removing the only entry that was added leaves the model with size zero.
 */
@Test
public void testRemove() throws Exception {
  StringList entry = new StringList("the", "bro", "wn");

  NGramModel model = new NGramModel();
  model.add(entry);
  model.remove(entry);

  Assert.assertEquals(0, model.size());
}
/**
 * Checks that after adding (the, brown, fox, jumped) with arguments (1, 3), calling
 * {@code cutoff(2, 4)} leaves the model empty.
 */
@Test
public void testCutoff1() throws Exception {
  NGramModel model = new NGramModel();
  model.add(new StringList("the", "brown", "fox", "jumped"), 1, 3);

  model.cutoff(2, 4);

  Assert.assertEquals(0, model.size());
}
/**
 * Checks that querying a freshly created model with a list holding one empty string
 * returns a count of zero and leaves the model size at zero.
 */
@Test
public void testZeroGetCount() throws Exception {
  NGramModel model = new NGramModel();

  StringList emptyToken = new StringList("");
  int occurrences = model.getCount(emptyToken);

  Assert.assertEquals(0, occurrences);
  Assert.assertEquals(0, model.size());
}
/**
 * Checks that after adding (the, brown, fox, jumped) with arguments (1, 3), calling
 * {@code cutoff(1, 3)} leaves the model with size nine.
 */
@Test
public void testCutoff2() throws Exception {
  NGramModel model = new NGramModel();
  model.add(new StringList("the", "brown", "fox", "jumped"), 1, 3);

  model.cutoff(1, 3);

  Assert.assertEquals(9, model.size());
}