/**
 * Builds the dictionary by delegating to the wrapped {@code builder}, handing it this
 * object's {@code baseId}.
 *
 * @return whatever {@code builder.build(baseId)} produces
 * @throws IOException declared by the overridden signature
 */
@Override
public Dictionary<String> build() throws IOException {
    final Dictionary<String> built = builder.build(baseId);
    return built;
}
public TrieDictionaryForest<T> build() { if (trieBuilder.isHasValue()) { //last tree TrieDictionary<T> tree = trieBuilder.build(0); addTree(tree); reset(); } TrieDictionaryForest<T> forest = new TrieDictionaryForest<T>(this.trees, this.valueDivide, this.accuOffset, this.bytesConverter, baseId); // if input values are not in ascending order and tree num>1,TrieDictionaryForest can not work correctly. if (forest.getTrees().size() > 1 && !isOrdered) { throw new IllegalStateException("Invalid input data. Unordered data can not be split into multi trees"); } return forest; }
private void addValue(byte[] valueBytes) { ByteArray valueByteArray = new ByteArray(valueBytes); if (previousValue != null && isOrdered) { int comp = previousValue.compareTo(valueByteArray); if (comp == 0) { return; //duplicate value } if (comp > 0) { logger.info("values not in ascending order, previous '{}', current '{}'", previousValue, valueByteArray); isOrdered = false; if (trees.size() > 0) { throw new IllegalStateException("Invalid input data. Unordered data cannot be split into multi trees"); } } } previousValue = valueByteArray; trieBuilder.addValue(valueBytes); curTreeSize += valueBytes.length; if (curTreeSize >= maxTrieTreeSize && isOrdered) { TrieDictionary<T> tree = trieBuilder.build(0); addTree(tree); reset(); } }
this.dict = b.build(0); for (String[] row : rows) { int[] rowIndex = new int[n];
@Test public void dictionaryContainTest() { ArrayList<String> str = new ArrayList<String>(); str.add("part"); str.add("part"); // meant to be dup str.add("par"); str.add("partition"); str.add("party"); str.add("parties"); str.add("paint"); TrieDictionaryBuilder<String> b = newDictBuilder(str); int baseId = new Random().nextInt(100); TrieDictionary<String> dict = b.build(baseId); str.add("py"); b = newDictBuilder(str); baseId = new Random().nextInt(100); TrieDictionary<String> dict2 = b.build(baseId); assertEquals(true, dict2.contains(dict)); assertEquals(false, dict.contains(dict2)); }
/**
 * Loads the strings from {@code file}, builds a dictionary from them, and verifies that
 * {@code enumeratorValuesByParent()} and {@code enumeratorValues()} yield the same set of
 * values. The dictionary size and the elapsed time of each enumeration are printed to
 * standard output.
 *
 * @param file path of the file the strings are loaded from
 * @throws Exception if opening or reading the file fails, or anything else in the test throws
 */
private void testEnumeratorValues(String file) throws Exception {
    final ArrayList<String> str;
    // try-with-resources guarantees the stream is released even when loadStrings throws;
    // closing an already-closed FileInputStream is harmless should loadStrings close it too.
    try (InputStream is = new FileInputStream(file)) {
        str = loadStrings(is);
    }

    TrieDictionaryBuilder<String> b = newDictBuilder(str);
    TrieDictionary<String> dict = b.build(0);
    System.out.println("Dictionary size for file " + file + " is " + dict.getSize());

    Stopwatch sw = new Stopwatch();
    sw.start();
    List<String> values1 = dict.enumeratorValuesByParent();
    System.out.println("By iterating id visit the time cost " + sw.elapsed(TimeUnit.MILLISECONDS) + " ms");

    // Restart the stopwatch so the second measurement does not include the first.
    sw.reset();
    sw.start();
    List<String> values2 = dict.enumeratorValues();
    System.out.println("By pre-order visit the time cost " + sw.elapsed(TimeUnit.MILLISECONDS) + " ms");
    sw.stop();

    // Compared as sets: only the collection of values matters, not their order.
    assertEquals(Sets.newHashSet(values1), Sets.newHashSet(values2));
}
b.addValue("paint"); b.print(); TrieDictionary<String> dict = b.build(0);
TrieDictionary<String> dict = b.build(0);
private static void benchmarkStringDictionary(Iterable<String> str) throws IOException { TrieDictionaryBuilder<String> b = newDictBuilder(str); b.stats().print(); TrieDictionary<String> dict = b.build(0);
/**
 * Builds a dictionary whose only value is the empty string and checks that it has
 * size 1 and that the empty string maps to id 0.
 */
@Test
public void testAllNullValue() {
    final ArrayList<String> onlyEmpty = new ArrayList<>();
    onlyEmpty.add("");

    final TrieDictionaryBuilder<String> emptyValueBuilder = newDictBuilder(onlyEmpty);
    final TrieDictionary<String> built = emptyValueBuilder.build(0);

    assertEquals(1, built.getSize());
    assertEquals(0, built.getIdFromValue(""));
}
this.dict = b.build(0);
for (String str : testData) oldTrieBuilder.addValue(str); TrieDictionary<String> oldDict = oldTrieBuilder.build(0); keep |= oldDict.getIdFromValue(testData.get(0)); oldDictTotalBuildTime += (System.currentTimeMillis() - startTime);
/**
 * Feeds increasingly long strings to a {@code TrieDictionaryBuilder}.
 *
 * <p>The first build, after adding a value of about 200 characters, the 200-character value
 * and a value made of 25 copies of it, must succeed and is dumped to standard output. After
 * also adding a value made of 7 copies of the previous one, a second build is expected to
 * throw.
 */
@Test
public void testSuperLongStringValue() {
    final String longPrefix = "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" + "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789";

    final TrieDictionaryBuilder<String> b = new TrieDictionaryBuilder<String>(new StringBytesConverter());

    final String prefixed = longPrefix + "xyz";
    b.addValue(prefixed);

    final String strLen200 = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghid";
    b.addValue(strLen200);

    // 25 copies of the 200-character value.
    final StringBuilder mediumText = new StringBuilder();
    int copies = 0;
    while (copies < 25) {
        mediumText.append(strLen200);
        copies++;
    }
    final String strLen5000 = mediumText.toString();
    b.addValue(strLen5000);

    final TrieDictionary<String> dict = b.build(0);
    dict.dump(System.out);

    // 7 copies of the previous value.
    final StringBuilder hugeText = new StringBuilder();
    for (int round = 0; round < 7; round++) {
        hugeText.append(strLen5000);
    }
    final String strLen35000 = hugeText.toString();
    b.addValue(strLen35000);

    // With the longest value added, building again must fail with some exception.
    Exception failure = null;
    try {
        b.build(0);
    } catch (Exception e) {
        failure = e;
    }
    Assert.assertNotNull(failure);
}
dictBuilder.addValue(str); Dictionary<String> dict = dictBuilder.build(0);
/**
 * Builds a dictionary containing every string of the given collection.
 *
 * @param strs the strings to add, in the collection's iteration order
 * @return the dictionary built with base id 0
 */
@SuppressWarnings("rawtypes")
private static Dictionary strsToDict(Collection<String> strs) {
    final TrieDictionaryBuilder<String> trieBuilder = new TrieDictionaryBuilder<>(new StringBytesConverter());
    for (String entry : strs) {
        trieBuilder.addValue(entry);
    }
    final Dictionary result = trieBuilder.build(0);
    return result;
}
private static void testStringDictionary(ArrayList<String> str, ArrayList<String> notFound) { TrieDictionaryBuilder<String> b = newDictBuilder(str); int baseId = new Random().nextInt(100); TrieDictionary<String> dict = b.build(baseId);
/**
 * Test fixture: generates a string data set of 100 * 10000 entries and feeds every entry to
 * both a {@code TrieDictionaryBuilder} and a {@code TrieDictionaryForestBuilder}. The results
 * are stored in {@code oldDict} and {@code newDict}, and the number of trees in the forest
 * is printed.
 */
@Before
public void before() {
    final int rowCount = 100 * 10000;

    final TrieDictionaryBuilder<String> singleTrieBuilder = new TrieDictionaryBuilder<>(new StringBytesConverter());
    final TrieDictionaryForestBuilder<String> forestBuilder = new TrieDictionaryForestBuilder<String>(
            new StringBytesConverter(), 0, 5);

    this.rawData = genStringDataSet(rowCount);
    for (String value : this.rawData) {
        singleTrieBuilder.addValue(value);
        forestBuilder.addValue(value);
    }

    this.oldDict = singleTrieBuilder.build(0);
    this.newDict = forestBuilder.build();

    final int treeCount = ((TrieDictionaryForest<String>) newDict).getTrees().size();
    System.out.println("new dict split tree size : " + treeCount);
}
/**
 * Creates a string dictionary holding a fixed list of ten names, built with base id 0.
 *
 * @return the dictionary built from the fixed name list
 */
private static Dictionary newDictionaryOfString() {
    final TrieDictionaryBuilder<String> builder = new TrieDictionaryBuilder<>(new StringBytesConverter());

    // Values are added in exactly this order.
    final String[] names = { "Dong", "George", "Jason", "Kejia", "Luke", "Mahone", "Qianhao", "Shaofeng", "Xu", "Yang" };
    for (String name : names) {
        builder.addValue(name);
    }

    return builder.build(0);
}
/**
 * Builds a string dictionary from byte-encoded values and collects sample values on the way.
 *
 * <p>Each value is converted with {@code Bytes.toString} and added to the builder. While
 * {@code samples} holds fewer than {@code nSamples} entries, a converted value that is not
 * yet in {@code samples} is appended to it.
 *
 * @param values   byte-encoded values to add
 * @param baseId   base id passed to the builder's {@code build}
 * @param nSamples maximum size {@code samples} is allowed to grow to here
 * @param samples  list that receives the sampled values; modified in place
 * @return the dictionary built from all values
 */
private static Dictionary buildStringDict(List<byte[]> values, int baseId, int nSamples, ArrayList samples) {
    TrieDictionaryBuilder dictBuilder = new TrieDictionaryBuilder(new StringBytesConverter());
    for (byte[] raw : values) {
        final String text = Bytes.toString(raw);
        dictBuilder.addValue(text);

        // Sample only while there is room, and never the same value twice.
        final boolean roomLeft = samples.size() < nSamples;
        if (roomLeft && !samples.contains(text)) {
            samples.add(text);
        }
    }
    return dictBuilder.build(baseId);
}
/**
 * Checks a forest dictionary against a plain trie dictionary built from the same strings.
 *
 * <p>The reference trie is built from every non-null element of {@code strs} with the given
 * base id; max id, min id, size, id size and value size must then match between the two.
 *
 * @param dict   the forest dictionary under test
 * @param strs   the strings the reference trie is built from; null elements are skipped
 * @param baseId base id used for the reference trie
 */
private void assertSameBehaviorAsTrie(TrieDictionaryForest<String> dict, ArrayList<String> strs, int baseId) {
    final TrieDictionaryBuilder<String> referenceBuilder = new TrieDictionaryBuilder<>(new StringBytesConverter());
    for (String candidate : strs) {
        if (candidate == null) {
            continue;
        }
        referenceBuilder.addValue(candidate);
    }
    final TrieDictionary<String> reference = referenceBuilder.build(baseId);

    assertEquals(reference.getMaxId(), dict.getMaxId());
    assertEquals(reference.getMinId(), dict.getMinId());
    assertEquals(reference.getSize(), dict.getSize());
    assertEquals(reference.getSizeOfId(), dict.getSizeOfId());
    assertEquals(reference.getSizeOfValue(), dict.getSizeOfValue());
}