@Override public DictionaryBuilder aggregate(DictionaryBuilder toAgg) throws Exception { Map<String, int[]>[] toAggDicts = toAgg.getDictionaries(false); if (toAggDicts.length != m_dictsPerClass.length) { throw new Exception("Number of dictionaries from the builder to " + "be aggregated does not match our number of dictionaries"); } // we assume that the order of class values is consistent for (int i = 0; i < toAggDicts.length; i++) { Map<String, int[]> toAggDictForClass = toAggDicts[i]; for (Map.Entry<String, int[]> e : toAggDictForClass.entrySet()) { int[] ourCounts = m_dictsPerClass[i].get(e.getKey()); if (ourCounts == null) { ourCounts = new int[2]; m_dictsPerClass[i].put(e.getKey(), ourCounts); } ourCounts[0] += e.getValue()[0]; // word count ourCounts[1] += e.getValue()[1]; // doc count } } m_count += toAgg.m_count; m_docLengthSum += toAgg.m_docLengthSum; return this; }
@Override public DictionaryBuilder aggregate(DictionaryBuilder toAgg) throws Exception { Map<String, int[]>[] toAggDicts = toAgg.getDictionaries(false); if (toAggDicts.length != m_dictsPerClass.length) { throw new Exception("Number of dictionaries from the builder to " + "be aggregated does not match our number of dictionaries"); } // we assume that the order of class values is consistent for (int i = 0; i < toAggDicts.length; i++) { Map<String, int[]> toAggDictForClass = toAggDicts[i]; for (Map.Entry<String, int[]> e : toAggDictForClass.entrySet()) { int[] ourCounts = m_dictsPerClass[i].get(e.getKey()); if (ourCounts == null) { ourCounts = new int[2]; m_dictsPerClass[i].put(e.getKey(), ourCounts); } ourCounts[0] += e.getValue()[0]; // word count ourCounts[1] += e.getValue()[1]; // doc count } } m_count += toAgg.m_count; m_docLengthSum += toAgg.m_docLengthSum; return this; }
public void testTypicalClassAttPresent() throws Exception { Instances data2 = getData2(); Instances structure = new Instances(data2, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data2.numInstances(); i++) { builder.processInstance(data2.instance(i)); } // should be two dictionaries (one for each class) assertEquals(2, builder.getDictionaries(false).length); assertEquals(8, builder.getDictionaries(false)[0].size()); assertEquals(9, builder.getDictionaries(false)[1].size()); // check a couple of words assertTrue(builder.getDictionaries(false)[0].get("the") != null); // first dictionary: word count (index 0) should be 2 assertEquals(2, builder.getDictionaries(false)[0].get("the")[0]); // first dictionary: doc count (index 1) should be 1 assertEquals(1, builder.getDictionaries(false)[0].get("the")[1]); }
public void testTypicalClassAttPresent() throws Exception { Instances data2 = getData2(); Instances structure = new Instances(data2, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data2.numInstances(); i++) { builder.processInstance(data2.instance(i)); } // should be two dictionaries (one for each class) assertEquals(2, builder.getDictionaries(false).length); assertEquals(8, builder.getDictionaries(false)[0].size()); assertEquals(9, builder.getDictionaries(false)[1].size()); // check a couple of words assertTrue(builder.getDictionaries(false)[0].get("the") != null); // first dictionary: word count (index 0) should be 2 assertEquals(2, builder.getDictionaries(false)[0].get("the")[0]); // first dictionary: doc count (index 1) should be 1 assertEquals(1, builder.getDictionaries(false)[0].get("the")[1]); }
public void testTypicalNoClassExtraAtts() throws Exception { Instances data1 = getData3(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } assertEquals(15, builder.getDictionaries(false)[0].size()); // check a couple of words assertTrue(builder.getDictionaries(false)[0].get("the") != null); // word count (index 0) should be 4 assertEquals(4, builder.getDictionaries(false)[0].get("the")[0]); // doc count (index 1) should be 2 assertEquals(2, builder.getDictionaries(false)[0].get("the")[1]); }
public void testTypicalNoClassExtraAtts() throws Exception { Instances data1 = getData3(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } assertEquals(15, builder.getDictionaries(false)[0].size()); // check a couple of words assertTrue(builder.getDictionaries(false)[0].get("the") != null); // word count (index 0) should be 4 assertEquals(4, builder.getDictionaries(false)[0].get("the")[0]); // doc count (index 1) should be 2 assertEquals(2, builder.getDictionaries(false)[0].get("the")[1]); }
public void testTypicalNoClass() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } assertEquals(15, builder.getDictionaries(false)[0].size()); // check a couple of words assertTrue(builder.getDictionaries(false)[0].get("the") != null); // word count (index 0) should be 4 assertEquals(4, builder.getDictionaries(false)[0].get("the")[0]); // doc count (index 1) should be 2 assertEquals(2, builder.getDictionaries(false)[0].get("the")[1]); }
public void testTypicalNoClass() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } assertEquals(15, builder.getDictionaries(false)[0].size()); // check a couple of words assertTrue(builder.getDictionaries(false)[0].get("the") != null); // word count (index 0) should be 4 assertEquals(4, builder.getDictionaries(false)[0].get("the")[0]); // doc count (index 1) should be 2 assertEquals(2, builder.getDictionaries(false)[0].get("the")[1]); }
public void testInit() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); // should be just one dictionary (i.e. no class attribute, so no per-class // dictionaries) assertEquals(1, builder.getDictionaries(false).length); }
public void testInit() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); // should be just one dictionary (i.e. no class attribute, so no per-class // dictionaries) assertEquals(1, builder.getDictionaries(false).length); }
public void testPruneMinFreq() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(1); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } assertEquals(15, builder.getDictionaries(false)[0].size()); Map<String, int[]> consolidated = builder.finalizeDictionary(); // min freq of 1 should keep all terms assertEquals(15, consolidated.size()); }
public void testPruneMinFreq() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(1); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } assertEquals(15, builder.getDictionaries(false)[0].size()); Map<String, int[]> consolidated = builder.finalizeDictionary(); // min freq of 1 should keep all terms assertEquals(15, consolidated.size()); }
public void testFinalizeDictionaryNoClassExtraAtts() throws Exception { Instances data1 = getData3(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } assertEquals(15, builder.getDictionaries(false)[0].size()); Map<String, int[]> consolidated = builder.finalizeDictionary(); // all but "the" and "over" should have been pruned from the dictionary assertEquals(2, consolidated.size()); }
public void testFinalizeDictionaryNoClassExtraAtts() throws Exception { Instances data1 = getData3(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } assertEquals(15, builder.getDictionaries(false)[0].size()); Map<String, int[]> consolidated = builder.finalizeDictionary(); // all but "the" and "over" should have been pruned from the dictionary assertEquals(2, consolidated.size()); }
public void testFinalizeDictionaryNoClass() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } assertEquals(15, builder.getDictionaries(false)[0].size()); Map<String, int[]> consolidated = builder.finalizeDictionary(); // all but "the" and "over" should have been pruned from the dictionary // according to the default min freq of 2 assertEquals(2, consolidated.size()); }
public void testFinalizeDictionaryNoClass() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } assertEquals(15, builder.getDictionaries(false)[0].size()); Map<String, int[]> consolidated = builder.finalizeDictionary(); // all but "the" and "over" should have been pruned from the dictionary // according to the default min freq of 2 assertEquals(2, consolidated.size()); }