/**
 * Creates a string forest builder, applies the given maximum trie tree size and
 * feeds it every element of {@code strs} in iteration order.
 *
 * @param strs     values to add to the builder
 * @param baseId   base id handed to the builder's constructor
 * @param treeSize value handed to {@code setMaxTrieTreeSize}
 * @return the populated builder; {@code build()} has not been called on it
 */
public static TrieDictionaryForestBuilder<String> newDictBuilder(Iterable<String> strs, int baseId, int treeSize) {
    StringBytesConverter converter = new StringBytesConverter();
    TrieDictionaryForestBuilder<String> forestBuilder = new TrieDictionaryForestBuilder<String>(converter, baseId);
    forestBuilder.setMaxTrieTreeSize(treeSize);
    for (String value : strs) {
        forestBuilder.addValue(value);
    }
    return forestBuilder;
}
/**
 * Builds the dictionary by delegating to the wrapped {@code builder}.
 *
 * @return whatever {@code builder.build()} produces
 * @throws IOException declared by the overridden method
 */
@Override
public Dictionary<String> build() throws IOException {
    return builder.build();
}
public TrieDictionaryForest<T> build() { if (trieBuilder.isHasValue()) { //last tree TrieDictionary<T> tree = trieBuilder.build(0); addTree(tree); reset(); } TrieDictionaryForest<T> forest = new TrieDictionaryForest<T>(this.trees, this.valueDivide, this.accuOffset, this.bytesConverter, baseId); // if input values are not in ascending order and tree num>1,TrieDictionaryForest can not work correctly. if (forest.getTrees().size() > 1 && !isOrdered) { throw new IllegalStateException("Invalid input data. Unordered data can not be split into multi trees"); } return forest; }
/**
 * Creates a string forest builder and feeds it every element of {@code strs}
 * in iteration order. The maximum trie tree size is left untouched.
 *
 * @param strs   values to add to the builder
 * @param baseId base id handed to the builder's constructor
 * @return the populated builder; {@code build()} has not been called on it
 */
public static TrieDictionaryForestBuilder<String> newDictBuilder(Iterable<String> strs, int baseId) {
    StringBytesConverter converter = new StringBytesConverter();
    TrieDictionaryForestBuilder<String> forestBuilder = new TrieDictionaryForestBuilder<String>(converter, baseId);
    for (String value : strs) {
        forestBuilder.addValue(value);
    }
    return forestBuilder;
}
// Test fragment: logs data size and max tree size in KB, feeds strs into a builder capped at
// maxTreeSize, and asserts the built forest has exactly one tree. It then appends "f" and "a"
// to strs and rebuilds with a fresh builder, calling fail(...) unless IllegalStateException is thrown.
// NOTE(review): braces are unbalanced and the catch body is cut off in this view — confirm against the full test.
System.out.println("data size:" + totalSize / 1024 + "KB max tree size:" + maxTreeSize / 1024 + "KB"); TrieDictionaryForestBuilder<String> builder = new TrieDictionaryForestBuilder<String>(converter); builder.setMaxTrieTreeSize(maxTreeSize); for (String str : strs) { builder.addValue(str); TrieDictionaryForest<String> dict = builder.build(); assertEquals(1, dict.getTrees().size()); strs.add("f"); strs.add("a"); builder = new TrieDictionaryForestBuilder<String>(converter); builder.setMaxTrieTreeSize(maxTreeSize); try { for (String str : strs) builder.addValue(str); dict = builder.build(); fail("Input data no sorted and builder have multi trees. Should throw IllegalStateException"); } catch (IllegalStateException e) {
// Timing fragment: builds a forest dictionary (base id 0) from testData, looks up the id of the
// first test value and ORs it into `keep`, then adds the elapsed wall-clock milliseconds to
// newDictTotalBuildTime.
// NOTE(review): `keep` presumably exists so the lookup result is not optimised away — confirm in the enclosing benchmark.
long startTime = System.currentTimeMillis(); BytesConverter<String> converter = new StringBytesConverter(); TrieDictionaryForestBuilder<String> newTrieBuilder = new TrieDictionaryForestBuilder<String>(converter, 0); for (String str : testData) newTrieBuilder.addValue(str); TrieDictionaryForest<String> newDict = newTrieBuilder.build(); keep |= newDict.getIdFromValue(testData.get(0)); newDictTotalBuildTime += (System.currentTimeMillis() - startTime);
// Fragment: sets the max trie tree size to size / treeNum, adds every element of `set` in
// iterator order, builds the forest and prints how many trees it contains.
// NOTE(review): `builder`, `set`, `size` and `treeNum` are defined outside this view.
builder.setMaxTrieTreeSize(size / treeNum); Iterator<String> it = set.iterator(); while (it.hasNext()) builder.addValue(it.next()); TrieDictionaryForest<String> dict = builder.build(); System.out.println(dict.getTrees().size());
@Test public void emptyDictTest() throws Exception { TrieDictionaryForestBuilder<String> b = new TrieDictionaryForestBuilder<String>(new StringBytesConverter()); TrieDictionaryForest<String> dict = b.build(); try { int id = dict.getIdFromValue("123", 0); fail("id should not exist"); } catch (IllegalArgumentException e) { //right } try { String value = dict.getValueFromIdImpl(123); fail("value should not exist"); } catch (IllegalArgumentException e) { //right } }
/**
 * Forwards a non-null value to the wrapped {@code builder}.
 *
 * @param value the value to add; {@code null} is ignored
 * @return {@code false} when {@code value} is {@code null}, otherwise {@code true}
 */
@Override
public boolean addValue(String value) {
    boolean accepted = value != null;
    if (accepted) {
        builder.addValue(value);
    }
    return accepted;
}
/**
 * Prepares a fresh forest builder for string values.
 *
 * @param info    not used by this implementation
 * @param baseId  base id handed to the {@code TrieDictionaryForestBuilder} constructor
 * @param hdfsDir not used by this implementation
 * @throws IOException declared by the overridden method; nothing in this body throws it
 */
@Override
public void init(DictionaryInfo info, int baseId, String hdfsDir) throws IOException {
    // Parameterise the builder instead of using the raw type so the compiler
    // can check that only String values flow through it.
    builder = new TrieDictionaryForestBuilder<String>(new StringBytesConverter(), baseId);
}
/**
 * Appends a finished tree to the forest bookkeeping: stores the tree, records the
 * offset at which it starts, records the bytes of its minimum-id value as the
 * divider, advances the running offset and finally runs {@code checkDictSize()}.
 *
 * @param tree the tree to append
 */
private void addTree(TrieDictionary<T> tree) {
    trees.add(tree);

    final int smallestId = tree.getMinId();
    // The offset is recorded before it is advanced past this tree.
    accuOffset.add(curOffset);

    final byte[] firstValue = tree.getValueBytesFromIdWithoutCache(smallestId);
    final ByteArray divider = new ByteArray(firstValue, 0, firstValue.length);
    valueDivide.add(divider);

    curOffset += (tree.getMaxId() + 1);
    checkDictSize();
}
/**
 * Creates a builder, delegating to the three-argument constructor with
 * {@code getMaxTrieSizeInMB()} as the third argument.
 *
 * @param bytesConverter converter forwarded to the three-argument constructor
 * @param baseId         base id forwarded to the three-argument constructor
 */
public TrieDictionaryForestBuilder(BytesConverter<T> bytesConverter, int baseId) {
    this(bytesConverter, baseId, getMaxTrieSizeInMB());
}
/**
 * Generates the shared string data set and builds two dictionaries from it:
 * a single trie ({@code oldDict}) and a forest ({@code newDict}).
 */
@Before
public void before() {
    final int dataSize = 100 * 10000;
    TrieDictionaryBuilder<String> trieBuilder = new TrieDictionaryBuilder<String>(new StringBytesConverter());
    TrieDictionaryForestBuilder<String> forestBuilder =
            new TrieDictionaryForestBuilder<String>(new StringBytesConverter(), 0, 5);

    this.rawData = genStringDataSet(dataSize);
    // Both builders receive exactly the same values in the same order.
    for (String value : this.rawData) {
        trieBuilder.addValue(value);
        forestBuilder.addValue(value);
    }

    this.oldDict = trieBuilder.build(0);
    TrieDictionaryForest<String> forest = forestBuilder.build();
    this.newDict = forest;
    System.out.println("new dict split tree size : " + forest.getTrees().size());
}
/**
 * Converts {@code value} to bytes with the configured converter and adds those bytes.
 *
 * @param value the value to add; {@code null} is silently ignored
 */
public void addValue(T value) {
    if (value != null) {
        byte[] encoded = bytesConverter.convertToBytes(value);
        addValue(encoded);
    }
}
/**
 * Prepares a fresh forest builder for string values.
 *
 * @param info    not used by this implementation
 * @param baseId  base id handed to the {@code TrieDictionaryForestBuilder} constructor
 * @param hdfsDir not used by this implementation
 * @throws IOException declared by the overridden method; nothing in this body throws it
 */
@Override
public void init(DictionaryInfo info, int baseId, String hdfsDir) throws IOException {
    // Parameterise the builder instead of using the raw type so the compiler
    // can check that only String values flow through it.
    builder = new TrieDictionaryForestBuilder<String>(new StringBytesConverter(), baseId);
}
/**
 * Appends a finished tree to the forest bookkeeping: stores the tree, records the
 * offset at which it starts, records the bytes of its minimum-id value as the
 * divider, advances the running offset and finally runs {@code checkDictSize()}.
 *
 * @param tree the tree to append
 */
private void addTree(TrieDictionary<T> tree) {
    trees.add(tree);

    final int smallestId = tree.getMinId();
    // The offset is recorded before it is advanced past this tree.
    accuOffset.add(curOffset);

    final byte[] firstValue = tree.getValueBytesFromIdWithoutCache(smallestId);
    final ByteArray divider = new ByteArray(firstValue, 0, firstValue.length);
    valueDivide.add(divider);

    curOffset += (tree.getMaxId() + 1);
    checkDictSize();
}
/**
 * Creates a builder, delegating to the three-argument constructor with
 * {@code getMaxTrieSizeInMB()} as the third argument.
 *
 * @param bytesConverter converter forwarded to the three-argument constructor
 * @param baseId         base id forwarded to the three-argument constructor
 */
public TrieDictionaryForestBuilder(BytesConverter<T> bytesConverter, int baseId) {
    this(bytesConverter, baseId, getMaxTrieSizeInMB());
}
/**
 * Builds a forest from an empty list (base id 0) and checks it with
 * {@code assertSameBehaviorAsTrie} against that same empty list.
 */
@Test
public void testEmptyDict() {
    ArrayList<String> noValues = new ArrayList<String>();
    TrieDictionaryForest<String> emptyForest = newDictBuilder(noValues, 0).build();
    assertSameBehaviorAsTrie(emptyForest, noValues, 0);
}
/**
 * Creates a string forest builder, applies the given maximum trie tree size and
 * drains {@code strs} into it.
 *
 * @param strs     iterator supplying the values; it is fully consumed
 * @param baseId   base id handed to the builder's constructor
 * @param treeSize value handed to {@code setMaxTrieTreeSize}
 * @return the populated builder; {@code build()} has not been called on it
 */
public static TrieDictionaryForestBuilder<String> newDictBuilder(Iterator<String> strs, int baseId, int treeSize) {
    StringBytesConverter converter = new StringBytesConverter();
    TrieDictionaryForestBuilder<String> forestBuilder = new TrieDictionaryForestBuilder<String>(converter, baseId);
    forestBuilder.setMaxTrieTreeSize(treeSize);
    while (strs.hasNext()) {
        String next = strs.next();
        forestBuilder.addValue(next);
    }
    return forestBuilder;
}
/**
 * Forwards a non-null value to the wrapped {@code builder}.
 *
 * @param value the value to add; {@code null} is ignored
 * @return {@code false} when {@code value} is {@code null}, otherwise {@code true}
 */
@Override
public boolean addValue(String value) {
    boolean accepted = value != null;
    if (accepted) {
        builder.addValue(value);
    }
    return accepted;
}