/**
 * Builds a small string dictionary fixture: a fixed list of names is fed,
 * in the order listed, into a {@code TrieDictionaryBuilder} backed by a
 * {@code StringBytesConverter}, and the result is built with base id 0.
 */
private static Dictionary newDictionaryOfString() {
    String[] names = { "Dong", "George", "Jason", "Kejia", "Luke", "Mahone", "Qianhao", "Shaofeng", "Xu", "Yang" };
    TrieDictionaryBuilder<String> builder = new TrieDictionaryBuilder<>(new StringBytesConverter());
    for (String name : names) {
        builder.addValue(name);
    }
    return builder.build(0);
}
protected byte[] buildTrieBytes(int baseId) { checkOverflowParts(this.root); Stats stats = stats(); int sizeNoValuesBeneath = stats.mbpn_sizeNoValueBeneath; int sizeChildOffset = stats.mbpn_sizeChildOffset; headOut.write(sizeChildOffset); headOut.write(sizeNoValuesBeneath); positiveShortPreCheck(baseId, "baseId"); headOut.writeShort(baseId); positiveShortPreCheck(stats.maxValueLength, "stats.maxValueLength"); headOut.writeShort(stats.maxValueLength); headOut.writeUTF(bytesConverter == null ? "" : bytesConverter.getClass().getName()); o = build_writeNode(root, o, true, sizeNoValuesBeneath, sizeChildOffset, trieBytes); if (root.children.isEmpty() == false) open.addLast(root); build_overwriteChildOffset(offsetMap.get(parent), o - head.length, sizeChildOffset, trieBytes); for (int i = 0; i < parent.children.size(); i++) { Node c = parent.children.get(i); boolean isLastChild = (i == parent.children.size() - 1); offsetMap.put(c, o); o = build_writeNode(c, o, isLastChild, sizeNoValuesBeneath, sizeChildOffset, trieBytes); if (c.children.isEmpty() == false) open.addLast(c);
/**
 * Adds a value, given as raw bytes, to the trie by delegating to
 * {@code addValueR} on the root node.
 */
void addValue(byte[] value) {
    // Third argument of addValueR; presumably the start offset into value -- confirm against addValueR.
    final int start = 0;
    addValueR(root, value, start);
}
public static void main(String[] args) throws Exception { TrieDictionaryBuilder<String> b = new TrieDictionaryBuilder<String>(new StringBytesConverter()); b.addValue("part"); b.print(); b.addValue("part"); b.print(); b.addValue("par"); b.print(); b.addValue("partition"); b.print(); b.addValue("party"); b.print(); b.addValue("parties"); b.print(); b.addValue("paint"); b.print(); TrieDictionary<String> dict = b.build(0);
public TrieDictionaryForest<T> build() { if (trieBuilder.isHasValue()) { //last tree TrieDictionary<T> tree = trieBuilder.build(0); addTree(tree); reset(); } TrieDictionaryForest<T> forest = new TrieDictionaryForest<T>(this.trees, this.valueDivide, this.accuOffset, this.bytesConverter, baseId); // if input values are not in ascending order and tree num>1,TrieDictionaryForest can not work correctly. if (forest.getTrees().size() > 1 && !isOrdered) { throw new IllegalStateException("Invalid input data. Unordered data can not be split into multi trees"); } return forest; }
/**
 * Builds the dictionary by delegating to the wrapped builder, passing this
 * instance's {@code baseId}.
 *
 * @return the dictionary produced by the wrapped builder
 * @throws IOException declared by the overridden method
 */
@Override
public Dictionary<String> build() throws IOException {
    final Dictionary<String> dictionary = builder.build(baseId);
    return dictionary;
}
/**
 * Creates a string trie builder and feeds it every element of {@code str}
 * in iteration order.
 *
 * @param str values to add
 * @return the populated (not yet built) builder
 */
private static TrieDictionaryBuilder<String> newDictBuilder(Iterable<String> str) {
    TrieDictionaryBuilder<String> dictBuilder = new TrieDictionaryBuilder<String>(new StringBytesConverter());
    for (String value : str) {
        dictBuilder.addValue(value);
    }
    return dictBuilder;
}
/**
 * Forwards a non-null value to the wrapped builder.
 *
 * @param value the value to add; {@code null} is rejected
 * @return {@code false} if {@code value} is {@code null} (nothing is added),
 *         {@code true} after the value was handed to the wrapped builder
 */
@Override
public boolean addValue(String value) {
    boolean accepted = value != null;
    if (accepted) {
        builder.addValue(value);
    }
    return accepted;
}
private void addValue(byte[] valueBytes) { ByteArray valueByteArray = new ByteArray(valueBytes); if (previousValue != null && isOrdered) { int comp = previousValue.compareTo(valueByteArray); if (comp == 0) { return; //duplicate value } if (comp > 0) { logger.info("values not in ascending order, previous '{}', current '{}'", previousValue, valueByteArray); isOrdered = false; if (trees.size() > 0) { throw new IllegalStateException("Invalid input data. Unordered data cannot be split into multi trees"); } } } previousValue = valueByteArray; trieBuilder.addValue(valueBytes); curTreeSize += valueBytes.length; if (curTreeSize >= maxTrieTreeSize && isOrdered) { TrieDictionary<T> tree = trieBuilder.build(0); addTree(tree); reset(); } }
private void checkOverflowParts(Node node) { LinkedList<Node> childrenCopy = new LinkedList<Node>(node.children); for (Node child : childrenCopy) { if (child.part.length > 255) { byte[] first255 = Arrays.copyOf(child.part, 255); completeParts.append(node.part); completeParts.append(first255); byte[] visited = completeParts.retrieve(); this.addValue(visited); completeParts.withdraw(255); completeParts.withdraw(node.part.length); } } completeParts.append(node.part); // by here the node.children may have been changed for (Node child : node.children) { checkOverflowParts(child); } completeParts.withdraw(node.part.length); }
public TrieDictionaryForestBuilder(BytesConverter<T> bytesConverter, int baseId, int maxTrieTreeSizeMB) { this.bytesConverter = bytesConverter; this.trieBuilder = new TrieDictionaryBuilder<T>(bytesConverter); this.baseId = baseId; this.curOffset = 0; this.maxTrieTreeSize = maxTrieTreeSizeMB * 1024 * 1024; }
/**
 * Flattens the trie into a byte array and wraps it in a {@code TrieDictionary}.
 * The flat form minimizes the memory footprint while lookup remains fast;
 * the cost is that the result can no longer be modified (it is immutable).
 * <p>
 * The flattened layout is HEAD + NODEs, where each node consists of:
 * <ul>
 * <li>o bytes, offset to child node, o = stats.mbpn_sizeChildOffset
 *   <ul>
 *   <li>1 bit, isLastChild flag, the 1st MSB of o</li>
 *   <li>1 bit, isEndOfValue flag, the 2nd MSB of o</li>
 *   </ul>
 * </li>
 * <li>c bytes, number of values beneath, c = stats.mbpn_sizeNoValueBeneath</li>
 * <li>1 byte, number of value bytes</li>
 * <li>n bytes, value bytes</li>
 * </ul>
 *
 * @param baseId base id passed on to {@code buildTrieBytes}
 * @return a dictionary backed by the flattened trie bytes
 */
public TrieDictionary<T> build(int baseId) {
    return new TrieDictionary<T>(buildTrieBytes(baseId));
}
/**
 * Manual smoke run: adds a handful of sample strings (including the empty
 * string and a repeated "part"), printing the builder after every insertion,
 * then builds the dictionary with base id 0, dumps it to standard output and
 * performs one lookup on a zero-length slice of a 10-byte array.
 */
public static void main(String[] args) throws Exception {
    TrieDictionaryBuilder<String> b = new TrieDictionaryBuilder<String>(new StringBytesConverter());

    String[] samples = { "", "part", "part", "par", "partition", "party", "parties", "paint" };
    for (String sample : samples) {
        b.addValue(sample);
        b.print();
    }

    TrieDictionary<String> dict = b.build(0);
    dict.dump(System.out);
    dict.getIdFromValueBytes(new byte[10], 0, 0);
}
}
@Test public void dictionaryContainTest() { ArrayList<String> str = new ArrayList<String>(); str.add("part"); str.add("part"); // meant to be dup str.add("par"); str.add("partition"); str.add("party"); str.add("parties"); str.add("paint"); TrieDictionaryBuilder<String> b = newDictBuilder(str); int baseId = new Random().nextInt(100); TrieDictionary<String> dict = b.build(baseId); str.add("py"); b = newDictBuilder(str); baseId = new Random().nextInt(100); TrieDictionary<String> dict2 = b.build(baseId); assertEquals(true, dict2.contains(dict)); assertEquals(false, dict.contains(dict2)); }
/**
 * Creates a string trie builder and adds every element of {@code str} to it,
 * from first to last.
 *
 * @param str values to add
 * @return the populated (not yet built) builder
 */
private static TrieDictionaryBuilder<String> newDictBuilder(ArrayList<String> str) {
    TrieDictionaryBuilder<String> dictBuilder = new TrieDictionaryBuilder<String>(new StringBytesConverter());
    for (int i = 0; i < str.size(); i++) {
        dictBuilder.addValue(str.get(i));
    }
    return dictBuilder;
}
/**
 * Adds a value by converting it to bytes with {@code bytesConverter} and
 * passing the bytes to the {@code byte[]} overload of {@code addValue}.
 *
 * @param value the value to add
 */
public void addValue(T value) {
    byte[] valueBytes = bytesConverter.convertToBytes(value);
    addValue(valueBytes);
}
private void addValue(byte[] valueBytes) { ByteArray valueByteArray = new ByteArray(valueBytes); if (previousValue != null && isOrdered) { int comp = previousValue.compareTo(valueByteArray); if (comp == 0) { return; //duplicate value } if (comp > 0) { logger.info("values not in ascending order, previous '{}', current '{}'", previousValue, valueByteArray); isOrdered = false; if (trees.size() > 0) { throw new IllegalStateException("Invalid input data. Unordered data cannot be split into multi trees"); } } } previousValue = valueByteArray; trieBuilder.addValue(valueBytes); curTreeSize += valueBytes.length; if (curTreeSize >= maxTrieTreeSize && isOrdered) { TrieDictionary<T> tree = trieBuilder.build(0); addTree(tree); reset(); } }
private void checkOverflowParts(Node node) { LinkedList<Node> childrenCopy = new LinkedList<Node>(node.children); for (Node child : childrenCopy) { if (child.part.length > 255) { byte[] first255 = Arrays.copyOf(child.part, 255); completeParts.append(node.part); completeParts.append(first255); byte[] visited = completeParts.retrieve(); this.addValue(visited); completeParts.withdraw(255); completeParts.withdraw(node.part.length); } } completeParts.append(node.part);// by here the node.children may have // been changed for (Node child : node.children) { checkOverflowParts(child); } completeParts.withdraw(node.part.length); }
/**
 * Starts a fresh tree: zeroes the running size counter and replaces the
 * trie builder with a new one that uses the same bytes converter.
 */
private void reset() {
    this.curTreeSize = 0;
    this.trieBuilder = new TrieDictionaryBuilder<T>(this.bytesConverter);
}