/**
 * Completes an aggregation by delegating to {@code finalizeDictionary()}.
 * Whatever that call returns is intentionally not used here.
 *
 * @throws Exception if {@code finalizeDictionary()} fails
 */
@Override
public void finalizeAggregation() throws Exception {
  finalizeDictionary();
}
/**
 * Finishes aggregation; the only work done is a call to
 * {@code finalizeDictionary()}, whose result is discarded.
 *
 * @throws Exception propagated from {@code finalizeDictionary()}
 */
@Override
public void finalizeAggregation() throws Exception {
  finalizeDictionary();
}
public void testSaveLoadDictionaryPlainTextNoNormalize() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } builder.finalizeDictionary(); StringWriter sw = new StringWriter(); builder.saveDictionary(sw); StringReader sr = new StringReader(sw.toString()); DictionaryBuilder builder2 = new DictionaryBuilder(); builder2.setup(structure); builder2.loadDictionary(sr); // just returns the loaded dictionary Map<String, int[]> consolidated = builder2.finalizeDictionary(); assertEquals(2, consolidated.size()); }
public void testSaveLoadDictionaryPlainTextNoNormalize() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } builder.finalizeDictionary(); StringWriter sw = new StringWriter(); builder.saveDictionary(sw); StringReader sr = new StringReader(sw.toString()); DictionaryBuilder builder2 = new DictionaryBuilder(); builder2.setup(structure); builder2.loadDictionary(sr); // just returns the loaded dictionary Map<String, int[]> consolidated = builder2.finalizeDictionary(); assertEquals(2, consolidated.size()); }
public void testSaveLoadDictionaryPlainTextNormalize() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setNormalize(true); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } builder.finalizeDictionary(); StringWriter sw = new StringWriter(); builder.saveDictionary(sw); String dictText = sw.toString(); assertTrue(dictText.startsWith("@@@3.39036")); StringReader sr = new StringReader(dictText); DictionaryBuilder builder2 = new DictionaryBuilder(); builder2.setup(structure); builder2.loadDictionary(sr); // just returns the loaded dictionary Map<String, int[]> consolidated = builder2.finalizeDictionary(); assertEquals(2, consolidated.size()); }
public void testFinalizeDictionaryNoClassExtraAtts() throws Exception { Instances data1 = getData3(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } assertEquals(15, builder.getDictionaries(false)[0].size()); Map<String, int[]> consolidated = builder.finalizeDictionary(); // all but "the" and "over" should have been pruned from the dictionary assertEquals(2, consolidated.size()); }
public void testPruneMinFreq() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(1); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } assertEquals(15, builder.getDictionaries(false)[0].size()); Map<String, int[]> consolidated = builder.finalizeDictionary(); // min freq of 1 should keep all terms assertEquals(15, consolidated.size()); }
public void testFinalizeDictionaryNoClassExtraAtts() throws Exception { Instances data1 = getData3(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } assertEquals(15, builder.getDictionaries(false)[0].size()); Map<String, int[]> consolidated = builder.finalizeDictionary(); // all but "the" and "over" should have been pruned from the dictionary assertEquals(2, consolidated.size()); }
public void testFinalizeDictionaryNoClass() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } assertEquals(15, builder.getDictionaries(false)[0].size()); Map<String, int[]> consolidated = builder.finalizeDictionary(); // all but "the" and "over" should have been pruned from the dictionary // according to the default min freq of 2 assertEquals(2, consolidated.size()); }
public void testPruneMinFreq() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(1); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } assertEquals(15, builder.getDictionaries(false)[0].size()); Map<String, int[]> consolidated = builder.finalizeDictionary(); // min freq of 1 should keep all terms assertEquals(15, consolidated.size()); }
public void testFinalizeDictionaryNoClass() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } assertEquals(15, builder.getDictionaries(false)[0].size()); Map<String, int[]> consolidated = builder.finalizeDictionary(); // all but "the" and "over" should have been pruned from the dictionary // according to the default min freq of 2 assertEquals(2, consolidated.size()); }
/**
 * Checks the vectorized format produced for the {@code getData3()} data set.
 *
 * <p>After building and finalizing a dictionary with a minimum term
 * frequency of 2, {@code getVectorizedFormat()} must return a non-null
 * structure with 4 attributes.
 *
 * @throws Exception if setting up, processing or finalizing fails
 */
public void testGetVectorizedStructureNoClassExtraAtts() throws Exception {
  Instances input = getData3();
  Instances header = new Instances(input, 0);

  DictionaryBuilder dictBuilder = new DictionaryBuilder();
  dictBuilder.setMinTermFreq(2);
  dictBuilder.setup(header);

  // Feed every instance to the builder, in order.
  int idx = 0;
  while (idx < input.numInstances()) {
    dictBuilder.processInstance(input.instance(idx));
    idx++;
  }
  dictBuilder.finalizeDictionary();

  Instances format = dictBuilder.getVectorizedFormat();
  assertTrue(format != null);

  int attributeCount = format.numAttributes();
  assertEquals(4, attributeCount);
}
/**
 * Checks the vectorized format produced for the {@code getData1()} data set.
 *
 * <p>After building and finalizing a dictionary with a minimum term
 * frequency of 2, {@code getVectorizedFormat()} must return a non-null
 * structure with 2 attributes.
 *
 * @throws Exception if setting up, processing or finalizing fails
 */
public void testGetVectorizedStructureNoClass() throws Exception {
  Instances input = getData1();
  Instances header = new Instances(input, 0);

  DictionaryBuilder dictBuilder = new DictionaryBuilder();
  dictBuilder.setMinTermFreq(2);
  dictBuilder.setup(header);

  // Feed every instance to the builder, in order.
  int idx = 0;
  while (idx < input.numInstances()) {
    dictBuilder.processInstance(input.instance(idx));
    idx++;
  }
  dictBuilder.finalizeDictionary();

  Instances format = dictBuilder.getVectorizedFormat();
  assertTrue(format != null);

  int attributeCount = format.numAttributes();
  assertEquals(2, attributeCount);
}
/**
 * Verifies the structure returned by {@code getVectorizedFormat()} when the
 * dictionary is built from {@code getData3()} with a minimum term frequency
 * of 2: it must be non-null and have 4 attributes.
 *
 * @throws Exception if setting up, processing or finalizing fails
 */
public void testGetVectorizedStructureNoClassExtraAtts() throws Exception {
  Instances source = getData3();
  Instances emptyCopy = new Instances(source, 0);

  DictionaryBuilder underTest = new DictionaryBuilder();
  underTest.setMinTermFreq(2);
  underTest.setup(emptyCopy);

  // Process instances one by one in their original order.
  int row = 0;
  while (row < source.numInstances()) {
    underTest.processInstance(source.instance(row));
    row++;
  }
  underTest.finalizeDictionary();

  Instances format = underTest.getVectorizedFormat();
  assertTrue(format != null);

  int numAtts = format.numAttributes();
  assertEquals(4, numAtts);
}
/**
 * Verifies the structure returned by {@code getVectorizedFormat()} when the
 * dictionary is built from {@code getData1()} with a minimum term frequency
 * of 2: it must be non-null and have 2 attributes.
 *
 * @throws Exception if setting up, processing or finalizing fails
 */
public void testGetVectorizedStructureNoClass() throws Exception {
  Instances source = getData1();
  Instances emptyCopy = new Instances(source, 0);

  DictionaryBuilder underTest = new DictionaryBuilder();
  underTest.setMinTermFreq(2);
  underTest.setup(emptyCopy);

  // Process instances one by one in their original order.
  int row = 0;
  while (row < source.numInstances()) {
    underTest.processInstance(source.instance(row));
    row++;
  }
  underTest.finalizeDictionary();

  Instances format = underTest.getVectorizedFormat();
  assertTrue(format != null);

  int numAtts = format.numAttributes();
  assertEquals(2, numAtts);
}
/**
 * Aggregates two independently built dictionaries and checks the result.
 *
 * <p>One builder processes {@code getData1()}, the other {@code getData4()},
 * both with a minimum term frequency of 1. The first builder is replaced by
 * the result of {@code aggregate(second)}, then {@code finalizeAggregation()}
 * and {@code finalizeDictionary()} are called in that order. The finalized
 * map is expected to hold 17 entries.
 *
 * @throws Exception if building, aggregating or finalizing fails
 */
public void testAggregateDictionaries() throws Exception {
  Instances firstData = getData1();
  Instances secondData = getData4();

  // First builder: populated from getData1().
  Instances firstHeader = new Instances(firstData, 0);
  DictionaryBuilder first = new DictionaryBuilder();
  first.setMinTermFreq(1);
  first.setup(firstHeader);
  int i = 0;
  while (i < firstData.numInstances()) {
    first.processInstance(firstData.instance(i));
    i++;
  }

  // Second builder: populated from getData4().
  Instances secondHeader = new Instances(secondData, 0);
  DictionaryBuilder second = new DictionaryBuilder();
  second.setMinTermFreq(1);
  second.setup(secondHeader);
  int j = 0;
  while (j < secondData.numInstances()) {
    second.processInstance(secondData.instance(j));
    j++;
  }

  // Continue with whatever aggregate() hands back.
  first = first.aggregate(second);
  first.finalizeAggregation();

  Map<String, int[]> merged = first.finalizeDictionary();
  int mergedSize = merged.size();
  assertEquals(17, mergedSize);
}
public void testVectorizeInstanceWordPresenceNoClass() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } builder.finalizeDictionary(); Instance vectorized = builder.vectorizeInstance(data1.instance(0)); assertEquals(2, vectorized.numAttributes()); // values of the two attributes should be 1 (presence indicators) assertEquals(1, (int) vectorized.value(0)); assertEquals(1, (int) vectorized.value(1)); }
public void testVectorizeInstanceWordPresenceNoClass() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } builder.finalizeDictionary(); Instance vectorized = builder.vectorizeInstance(data1.instance(0)); assertEquals(2, vectorized.numAttributes()); // values of the two attributes should be 1 (presence indicators) assertEquals(1, (int) vectorized.value(0)); assertEquals(1, (int) vectorized.value(1)); }
public void testVectorizeInstanceWordCountsNoClass() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setOutputWordCounts(true); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } builder.finalizeDictionary(); Instance vectorized = builder.vectorizeInstance(data1.instance(0)); assertEquals(2, vectorized.numAttributes()); // "the" occurs twice in the first index and "over" once assertEquals(2, (int) vectorized.value(0)); assertEquals(1, (int) vectorized.value(1)); }
public void testVectorizeInstanceWordCountsNoClass() throws Exception { Instances data1 = getData1(); Instances structure = new Instances(data1, 0); DictionaryBuilder builder = new DictionaryBuilder(); builder.setMinTermFreq(2); builder.setOutputWordCounts(true); builder.setup(structure); for (int i = 0; i < data1.numInstances(); i++) { builder.processInstance(data1.instance(i)); } builder.finalizeDictionary(); Instance vectorized = builder.vectorizeInstance(data1.instance(0)); assertEquals(2, vectorized.numAttributes()); // "the" occurs twice in the first index and "over" once assertEquals(2, (int) vectorized.value(0)); assertEquals(1, (int) vectorized.value(1)); }