/**
 * Feeds {@code size} random values drawn from {@code [0, size / 2)} into a DENSE HyperLogLog
 * and compares its estimate with the exact distinct count tracked in a {@link HashSet}.
 */
@Test
public void testHLLAddHalfDistinct() {
  Random generator = new Random(size);
  HyperLogLog sketch =
      HyperLogLog.builder().setEncoding(HyperLogLog.EncodingType.DENSE).build();
  int bound = size / 2;
  Set<Long> seen = new HashSet<>();

  int added = 0;
  while (added < size) {
    long value = generator.nextInt(bound);
    seen.add(value);
    sketch.addLong(value);
    added++;
  }

  // Tolerance is a percentage; the allowed error scales with the exact distinct count.
  double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
  double delta = threshold * seen.size() / 100;

  double expected = seen.size();
  double estimated = sketch.count();
  assertEquals(expected, estimated, delta);
}
/**
 * Round-trips a DENSE HyperLogLog through {@code testFile} after adding {@code size} random
 * values drawn from {@code [0, size / 2)}. Verifies that the deserialized instance matches the
 * original and that both estimates are within tolerance of the exact distinct count.
 *
 * @throws IOException if writing to or reading from {@code testFile} fails
 */
@Test
public void testHLLDenseSerializationHalfDistinct() throws IOException {
  HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build();
  Random rand = new Random(SEED);
  Set<Integer> hashset = new HashSet<>();
  for (int i = 0; i < size; i++) {
    int val = rand.nextInt(size / 2);
    hll.addLong(val);
    hashset.add(val);
  }

  // try-with-resources closes the output before the file is read back and
  // guarantees the streams are released even if serialization throws.
  try (FileOutputStream fos = new FileOutputStream(testFile);
      DataOutputStream out = new DataOutputStream(fos)) {
    HyperLogLogUtils.serializeHLL(out, hll);
  }

  // Tolerance is a percentage; the allowed error scales with the exact distinct count.
  double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
  double delta = threshold * hashset.size() / 100;

  HyperLogLog deserializedHLL;
  try (FileInputStream fis = new FileInputStream(testFile);
      DataInputStream in = new DataInputStream(fis)) {
    deserializedHLL = HyperLogLogUtils.deserializeHLL(in);
  }

  assertEquals(hll, deserializedHLL);
  assertEquals(hll.toString(), deserializedHLL.toString());
  assertEquals(hll.toStringExtended(), deserializedHLL.toStringExtended());
  assertEquals(hll.hashCode(), deserializedHLL.hashCode());
  assertEquals(hll.count(), deserializedHLL.count());
  assertEquals(hashset.size(), hll.count(), delta);
  assertEquals(hashset.size(), deserializedHLL.count(), delta);
}
/**
 * Round-trips a DENSE HyperLogLog built with bit packing disabled through {@code testFile}
 * after adding {@code size} random values drawn from {@code [0, size / 2)}. Verifies that the
 * deserialized instance matches the original and that both estimates are within tolerance of
 * the exact distinct count.
 *
 * @throws IOException if writing to or reading from {@code testFile} fails
 */
@Test
public void testHLLDenseNoBitPackingHalfDistinct() throws IOException {
  HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.DENSE).enableBitPacking(false)
      .build();
  Random rand = new Random(SEED);
  Set<Integer> hashset = new HashSet<>();
  for (int i = 0; i < size; i++) {
    int val = rand.nextInt(size / 2);
    hll.addLong(val);
    hashset.add(val);
  }

  // try-with-resources closes the output before the file is read back and
  // guarantees the streams are released even if serialization throws.
  try (FileOutputStream fos = new FileOutputStream(testFile);
      DataOutputStream out = new DataOutputStream(fos)) {
    HyperLogLogUtils.serializeHLL(out, hll);
  }

  // Tolerance is a percentage; the allowed error scales with the exact distinct count.
  double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
  double delta = threshold * hashset.size() / 100;

  HyperLogLog deserializedHLL;
  try (FileInputStream fis = new FileInputStream(testFile);
      DataInputStream in = new DataInputStream(fis)) {
    deserializedHLL = HyperLogLogUtils.deserializeHLL(in);
  }

  assertEquals(hll, deserializedHLL);
  assertEquals(hll.toString(), deserializedHLL.toString());
  assertEquals(hll.toStringExtended(), deserializedHLL.toStringExtended());
  assertEquals(hll.hashCode(), deserializedHLL.hashCode());
  assertEquals(hll.count(), deserializedHLL.count());
  assertEquals(hashset.size(), hll.count(), delta);
  assertEquals(hashset.size(), deserializedHLL.count(), delta);
}
}
/**
 * Builds a lower-precision HyperLogLog from this one by reducing the number of register
 * index bits.
 *
 * @param p0 number of register index bits for the result; must not exceed this instance's
 *        current value
 * @return this instance when {@code p0} equals the current number of index bits, otherwise a
 *         new DENSE-encoded HyperLogLog built with {@code p0} index bits
 * @throws IllegalArgumentException if {@code p0} is larger than the current number of index
 *         bits
 */
public HyperLogLog squash(final int p0) {
  // Nothing to reduce: hand back the same instance rather than a copy.
  if (p0 == p) {
    return this;
  }
  if (p0 > p) {
    throw new IllegalArgumentException(
        "HyperLogLog cannot be be squashed to be bigger. Current: "
            + toString() + " Provided: " + p0);
  }

  // The result is always DENSE, whatever encoding this instance currently uses.
  final HyperLogLog squashed = new HyperLogLogBuilder()
      .setNumRegisterIndexBits(p0)
      .setEncoding(EncodingType.DENSE)
      .enableNoBias(noBias)
      .build();
  final HLLDenseRegister target = squashed.denseRegister;

  if (encoding == EncodingType.DENSE) {
    denseRegister.extractLowBitsTo(target);
  } else if (encoding == EncodingType.SPARSE) {
    sparseRegister.extractLowBitsTo(target);
  }
  return squashed;
}
if (encoding.equals(EncodingType.SPARSE)) { result = HyperLogLog.builder().setNumRegisterIndexBits(p) .setEncoding(EncodingType.SPARSE).build(); int numRegisterEntries = (int) readVulong(in); int[] reg = new int[numRegisterEntries]; .setEncoding(EncodingType.DENSE).enableBitPacking(false).build(); } else { result = HyperLogLog.builder().setNumRegisterIndexBits(p) .setEncoding(EncodingType.DENSE).enableBitPacking(true).build();
@Test(expected = IllegalArgumentException.class) public void testHLLSparseMerge() { HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); HyperLogLog hll2 = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); HyperLogLog hll3 = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); HyperLogLog hll4 = HyperLogLog.builder().setNumRegisterIndexBits(16) .setEncoding(EncodingType.SPARSE).build(); HyperLogLog hll5 = HyperLogLog.builder().setNumRegisterIndexBits(12) .setEncoding(EncodingType.SPARSE).build(); int size = 500; for (int i = 0; i < size; i++) {
@Test(expected = IllegalArgumentException.class) public void testHLLDenseMerge() { HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build(); HyperLogLog hll2 = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build(); HyperLogLog hll3 = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build(); HyperLogLog hll4 = HyperLogLog.builder().setNumRegisterIndexBits(16) .setEncoding(EncodingType.DENSE).build(); HyperLogLog hll5 = HyperLogLog.builder().setNumRegisterIndexBits(12) .setEncoding(EncodingType.DENSE).build(); int size = 1000; for (int i = 0; i < size; i++) {
@Test(expected = IllegalArgumentException.class) public void testHLLDenseSparseMerge() { HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build(); HyperLogLog hll2 = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build(); HyperLogLog hll3 = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); HyperLogLog hll4 = HyperLogLog.builder().setNumRegisterIndexBits(16) .setEncoding(EncodingType.SPARSE).build(); HyperLogLog hll5 = HyperLogLog.builder().setNumRegisterIndexBits(12) .setEncoding(EncodingType.SPARSE).build(); int size = 1000; for (int i = 0; i < size; i++) {
@Test(expected = IllegalArgumentException.class) public void testHLLSparseDenseMerge() { HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); HyperLogLog hll2 = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); HyperLogLog hll3 = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build(); HyperLogLog hll4 = HyperLogLog.builder().setNumRegisterIndexBits(16) .setEncoding(EncodingType.DENSE).build(); HyperLogLog hll5 = HyperLogLog.builder().setNumRegisterIndexBits(12) .setEncoding(EncodingType.DENSE).build(); int size = 1000; for (int i = 0; i < size; i++) {
@Test(expected = IllegalArgumentException.class) public void testHLLSparseOverflowMerge() { HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); HyperLogLog hll2 = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); HyperLogLog hll3 = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); HyperLogLog hll4 = HyperLogLog.builder().setNumRegisterIndexBits(16) .setEncoding(EncodingType.SPARSE).build(); HyperLogLog hll5 = HyperLogLog.builder().setNumRegisterIndexBits(12) .setEncoding(EncodingType.SPARSE).build(); int size = 1000; for (int i = 0; i < size; i++) {
/**
 * Round-trips a SPARSE HyperLogLog through {@code testFile} after adding {@code size} random
 * values drawn from {@code [0, size / 2)}. Verifies that the deserialized instance matches the
 * original and that both estimates are within tolerance of the exact distinct count.
 *
 * @throws IOException if writing to or reading from {@code testFile} fails
 */
@Test
public void testHLLSparseSerializationHalfDistinct() throws IOException {
  HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build();
  Random rand = new Random(SEED);
  Set<Integer> hashset = new HashSet<>();
  for (int i = 0; i < size; i++) {
    int val = rand.nextInt(size / 2);
    hll.addLong(val);
    hashset.add(val);
  }

  // try-with-resources closes the output before the file is read back and
  // guarantees the streams are released even if serialization throws.
  try (FileOutputStream fos = new FileOutputStream(testFile);
      DataOutputStream out = new DataOutputStream(fos)) {
    HyperLogLogUtils.serializeHLL(out, hll);
  }

  // Tolerance is a percentage; the allowed error scales with the exact distinct count.
  double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
  double delta = threshold * hashset.size() / 100;

  HyperLogLog deserializedHLL;
  try (FileInputStream fis = new FileInputStream(testFile);
      DataInputStream in = new DataInputStream(fis)) {
    deserializedHLL = HyperLogLogUtils.deserializeHLL(in);
  }

  assertEquals(hll, deserializedHLL);
  assertEquals(hll.toString(), deserializedHLL.toString());
  assertEquals(hll.toStringExtended(), deserializedHLL.toStringExtended());
  assertEquals(hll.hashCode(), deserializedHLL.hashCode());
  assertEquals(hll.count(), deserializedHLL.count());
  assertEquals(hashset.size(), hll.count(), delta);
  assertEquals(hashset.size(), deserializedHLL.count(), delta);
}
/**
 * Builds a lower-precision HyperLogLog from this one by reducing the number of register
 * index bits.
 *
 * @param p0 number of register index bits for the result; must not exceed this instance's
 *        current value
 * @return this instance when {@code p0} equals the current number of index bits, otherwise a
 *         new DENSE-encoded HyperLogLog built with {@code p0} index bits
 * @throws IllegalArgumentException if {@code p0} is larger than the current number of index
 *         bits
 */
public HyperLogLog squash(final int p0) {
  // Nothing to reduce: hand back the same instance rather than a copy.
  if (p0 == p) {
    return this;
  }
  if (p0 > p) {
    throw new IllegalArgumentException(
        "HyperLogLog cannot be be squashed to be bigger. Current: "
            + toString() + " Provided: " + p0);
  }

  // The result is always DENSE, whatever encoding this instance currently uses.
  final HyperLogLog squashed = new HyperLogLogBuilder()
      .setNumRegisterIndexBits(p0)
      .setEncoding(EncodingType.DENSE)
      .enableNoBias(noBias)
      .build();
  final HLLDenseRegister target = squashed.denseRegister;

  if (encoding == EncodingType.DENSE) {
    denseRegister.extractLowBitsTo(target);
  } else if (encoding == EncodingType.SPARSE) {
    sparseRegister.extractLowBitsTo(target);
  }
  return squashed;
}
/**
 * For several input sizes, builds DENSE HyperLogLogs with 9 to 15 register index bits over the
 * same values, then squashes every larger instance down to each smaller bit width. The squashed
 * estimate must equal the estimate of the instance built directly at that width, and must lie
 * within the smaller instance's standard error of the true size.
 */
@Test
public void testHLLSquash() {
  final int[] cardinalities = {500, 1000, 2300, 4096};
  final int lowestBits = 9;

  for (final int cardinality : cardinalities) {
    // Indexed by register-index-bit count; slots below lowestBits stay unused.
    final HyperLogLog[] byBits = new HyperLogLog[16];
    for (int bits = lowestBits; bits < byBits.length; bits++) {
      final HyperLogLog sketch = HyperLogLog.builder()
          .setEncoding(EncodingType.DENSE).setNumRegisterIndexBits(bits).build();
      for (int value = 0; value < cardinality; value++) {
        sketch.addLong(value);
      }
      byBits[bits] = sketch;
    }

    for (int lo = lowestBits; lo < byBits.length; lo++) {
      final HyperLogLog small = byBits[lo];
      for (int hi = lo + 1; hi < byBits.length; hi++) {
        final HyperLogLog large = byBits[hi];
        final HyperLogLog mush = large.squash(small.getNumRegisterIndexBits());
        assertEquals(small.count(), mush.count(), 0);
        double delta = Math.ceil(small.getStandardError() * cardinality);
        assertEquals((double) cardinality, (double) mush.count(), delta);
      }
    }
  }
}
/**
 * Round-trips a SPARSE HyperLogLog built with bit packing disabled through {@code testFile}
 * after adding {@code size} random values drawn from {@code [0, size / 2)}. Verifies that the
 * deserialized instance matches the original and that both estimates are within tolerance of
 * the exact distinct count.
 *
 * @throws IOException if writing to or reading from {@code testFile} fails
 */
@Test
public void testHLLSparseNoBitPackingHalfDistinct() throws IOException {
  HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.SPARSE)
      .enableBitPacking(false).build();
  Random rand = new Random(SEED);
  Set<Integer> hashset = new HashSet<>();
  for (int i = 0; i < size; i++) {
    int val = rand.nextInt(size / 2);
    hll.addLong(val);
    hashset.add(val);
  }

  // try-with-resources closes the output before the file is read back and
  // guarantees the streams are released even if serialization throws.
  try (FileOutputStream fos = new FileOutputStream(testFile);
      DataOutputStream out = new DataOutputStream(fos)) {
    HyperLogLogUtils.serializeHLL(out, hll);
  }

  // Tolerance is a percentage; the allowed error scales with the exact distinct count.
  double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
  double delta = threshold * hashset.size() / 100;

  HyperLogLog deserializedHLL;
  try (FileInputStream fis = new FileInputStream(testFile);
      DataInputStream in = new DataInputStream(fis)) {
    deserializedHLL = HyperLogLogUtils.deserializeHLL(in);
  }

  assertEquals(hll, deserializedHLL);
  assertEquals(hll.toString(), deserializedHLL.toString());
  assertEquals(hll.toStringExtended(), deserializedHLL.toStringExtended());
  assertEquals(hll.hashCode(), deserializedHLL.hashCode());
  assertEquals(hll.count(), deserializedHLL.count());
  assertEquals(hashset.size(), hll.count(), delta);
  assertEquals(hashset.size(), deserializedHLL.count(), delta);
}
/**
 * Round-trips a SPARSE HyperLogLog built with bit packing disabled through {@code testFile}
 * after adding {@code size} random longs, and verifies that the deserialized instance matches
 * the original (equality, string forms, hash code and count).
 *
 * @throws IOException if writing to or reading from {@code testFile} fails
 */
@Test
public void testHLLSparseNoBitPacking() throws IOException {
  HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.SPARSE)
      .enableBitPacking(false).build();
  Random rand = new Random(SEED);
  for (int i = 0; i < size; i++) {
    hll.addLong(rand.nextLong());
  }

  // try-with-resources closes the output before the file is read back and
  // guarantees the streams are released even if serialization throws.
  try (FileOutputStream fos = new FileOutputStream(testFile);
      DataOutputStream out = new DataOutputStream(fos)) {
    HyperLogLogUtils.serializeHLL(out, hll);
  }

  HyperLogLog deserializedHLL;
  try (FileInputStream fis = new FileInputStream(testFile);
      DataInputStream in = new DataInputStream(fis)) {
    deserializedHLL = HyperLogLogUtils.deserializeHLL(in);
  }

  assertEquals(hll, deserializedHLL);
  assertEquals(hll.toString(), deserializedHLL.toString());
  assertEquals(hll.toStringExtended(), deserializedHLL.toStringExtended());
  assertEquals(hll.hashCode(), deserializedHLL.hashCode());
  assertEquals(hll.count(), deserializedHLL.count());
}
/**
 * Round-trips a DENSE HyperLogLog through {@code testFile} after adding {@code size} random
 * longs, and verifies that the deserialized instance matches the original (equality, string
 * forms, hash code and count).
 *
 * @throws IOException if writing to or reading from {@code testFile} fails
 */
@Test
public void testHLLDenseSerialization() throws IOException {
  HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build();
  Random rand = new Random(SEED);
  for (int i = 0; i < size; i++) {
    hll.addLong(rand.nextLong());
  }

  // try-with-resources closes the output before the file is read back and
  // guarantees the streams are released even if serialization throws.
  try (FileOutputStream fos = new FileOutputStream(testFile);
      DataOutputStream out = new DataOutputStream(fos)) {
    HyperLogLogUtils.serializeHLL(out, hll);
  }

  HyperLogLog deserializedHLL;
  try (FileInputStream fis = new FileInputStream(testFile);
      DataInputStream in = new DataInputStream(fis)) {
    deserializedHLL = HyperLogLogUtils.deserializeHLL(in);
  }

  assertEquals(hll, deserializedHLL);
  assertEquals(hll.toString(), deserializedHLL.toString());
  assertEquals(hll.toStringExtended(), deserializedHLL.toStringExtended());
  assertEquals(hll.hashCode(), deserializedHLL.hashCode());
  assertEquals(hll.count(), deserializedHLL.count());
}
/**
 * Round-trips a SPARSE HyperLogLog through {@code testFile} after adding {@code size} random
 * longs, and verifies that the deserialized instance matches the original (equality, string
 * forms, hash code and count).
 *
 * @throws IOException if writing to or reading from {@code testFile} fails
 */
@Test
public void testHLLSparseSerialization() throws IOException {
  HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build();
  Random rand = new Random(SEED);
  for (int i = 0; i < size; i++) {
    hll.addLong(rand.nextLong());
  }

  // try-with-resources closes the output before the file is read back and
  // guarantees the streams are released even if serialization throws.
  try (FileOutputStream fos = new FileOutputStream(testFile);
      DataOutputStream out = new DataOutputStream(fos)) {
    HyperLogLogUtils.serializeHLL(out, hll);
  }

  HyperLogLog deserializedHLL;
  try (FileInputStream fis = new FileInputStream(testFile);
      DataInputStream in = new DataInputStream(fis)) {
    deserializedHLL = HyperLogLogUtils.deserializeHLL(in);
  }

  assertEquals(hll, deserializedHLL);
  assertEquals(hll.toString(), deserializedHLL.toString());
  assertEquals(hll.toStringExtended(), deserializedHLL.toStringExtended());
  assertEquals(hll.hashCode(), deserializedHLL.hashCode());
  assertEquals(hll.count(), deserializedHLL.count());
}
/**
 * Round-trips a DENSE HyperLogLog built with bit packing disabled through {@code testFile}
 * after adding {@code size} random longs, and verifies that the deserialized instance matches
 * the original (equality, string forms, hash code and count).
 *
 * @throws IOException if writing to or reading from {@code testFile} fails
 */
@Test
public void testHLLDenseNoBitPacking() throws IOException {
  HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.DENSE).enableBitPacking(false)
      .build();
  Random rand = new Random(SEED);
  for (int i = 0; i < size; i++) {
    hll.addLong(rand.nextLong());
  }

  // try-with-resources closes the output before the file is read back and
  // guarantees the streams are released even if serialization throws.
  try (FileOutputStream fos = new FileOutputStream(testFile);
      DataOutputStream out = new DataOutputStream(fos)) {
    HyperLogLogUtils.serializeHLL(out, hll);
  }

  HyperLogLog deserializedHLL;
  try (FileInputStream fis = new FileInputStream(testFile);
      DataInputStream in = new DataInputStream(fis)) {
    deserializedHLL = HyperLogLogUtils.deserializeHLL(in);
  }

  assertEquals(hll, deserializedHLL);
  assertEquals(hll.toString(), deserializedHLL.toString());
  assertEquals(hll.toStringExtended(), deserializedHLL.toStringExtended());
  assertEquals(hll.hashCode(), deserializedHLL.hashCode());
  assertEquals(hll.count(), deserializedHLL.count());
}
/**
 * Adds 100 random longs to a DENSE HyperLogLog and checks that the estimate is within the
 * tolerance of 100.
 */
@Test
public void testHLLAdd() {
  // The seed is the `size` visible from the enclosing scope, not the local value count below.
  Random generator = new Random(size);
  HyperLogLog sketch =
      HyperLogLog.builder().setEncoding(HyperLogLog.EncodingType.DENSE).build();

  int valueCount = 100;
  int added = 0;
  while (added < valueCount) {
    sketch.addLong(generator.nextLong());
    added++;
  }

  // Tolerance is a percentage; the allowed error scales with the number of values added.
  double threshold = valueCount > 40000 ? longRangeTolerance : shortRangeTolerance;
  double delta = threshold * valueCount / 100;

  double expected = valueCount;
  double estimated = sketch.count();
  assertEquals(expected, estimated, delta);
}
/**
 * Adds the values 0..999 to a SPARSE HyperLogLog configured with 16 register index bits and
 * checks that the estimate is within the tolerance of 1000.
 */
@Test
public void testHLLSparseMoreRegisterBits() {
  HyperLogLog sketch = HyperLogLog.builder()
      .setEncoding(EncodingType.SPARSE)
      .setNumRegisterIndexBits(16)
      .build();

  int valueCount = 1000;
  int next = 0;
  while (next < valueCount) {
    sketch.addLong(next);
    next++;
  }

  // Tolerance is a percentage; the allowed error scales with the number of values added.
  double threshold = valueCount > 40000 ? longRangeTolerance : shortRangeTolerance;
  double delta = threshold * valueCount / 100;

  double expected = valueCount;
  double estimated = sketch.count();
  assertEquals(expected, estimated, delta);
}