/**
 * Reduces the accuracy of this HyperLogLog by squashing it down to a
 * smaller number of register index bits.
 *
 * @param p0
 *          - new p size for the new HyperLogLog (smaller or no change)
 * @return this very instance when {@code p0} equals the current p;
 *         otherwise a new DENSE-encoded HyperLogLog built with {@code p0}
 *         register index bits and the same noBias setting
 * @throws IllegalArgumentException
 *           if {@code p0} is larger than the current p
 */
public HyperLogLog squash(final int p0) {
  if (p0 > p) {
    throw new IllegalArgumentException(
        "HyperLogLog cannot be squashed to be bigger. Current: "
            + toString() + " Provided: " + p0);
  }

  if (p0 == p) {
    // Nothing to reduce: the caller gets this same instance back, not a copy.
    return this;
  }

  // The squashed instance is always DENSE, whatever encoding this one uses.
  final HyperLogLog hll = new HyperLogLogBuilder()
      .setNumRegisterIndexBits(p0).setEncoding(EncodingType.DENSE)
      .enableNoBias(noBias).build();
  final HLLDenseRegister result = hll.denseRegister;

  // Fill the new dense register from whichever register backs this instance.
  if (encoding == EncodingType.SPARSE) {
    sparseRegister.extractLowBitsTo(result);
  } else if (encoding == EncodingType.DENSE) {
    denseRegister.extractLowBitsTo(result);
  }
  return hll;
}
/**
 * Builds DENSE HLLs for every register-index-bit width from 9 to 15 over
 * the same input, then squashes each wider HLL down to every narrower width
 * and checks that the squashed count equals the count of the HLL natively
 * built at that width, and that it stays within the narrower HLL's standard
 * error of the true cardinality.
 */
@Test
public void testHLLSquash() {
  final int lowestBits = 9;
  final int bitsLimit = 16;
  final int[] cardinalities = { 500, 1000, 2300, 4096 };

  for (final int cardinality : cardinalities) {
    // Indexed by register index bits; slots below lowestBits stay unused.
    final HyperLogLog[] byBits = new HyperLogLog[bitsLimit];
    for (int bits = lowestBits; bits < bitsLimit; bits++) {
      final HyperLogLog filled = HyperLogLog.builder()
          .setEncoding(EncodingType.DENSE)
          .setNumRegisterIndexBits(bits)
          .build();
      for (int value = 0; value < cardinality; value++) {
        filled.addLong(value);
      }
      byBits[bits] = filled;
    }

    for (int narrow = lowestBits; narrow < bitsLimit; narrow++) {
      final HyperLogLog small = byBits[narrow];
      for (int wide = narrow + 1; wide < bitsLimit; wide++) {
        final HyperLogLog squashed =
            byBits[wide].squash(small.getNumRegisterIndexBits());

        // Squashing must reproduce the natively built narrow HLL exactly.
        assertEquals(small.count(), squashed.count(), 0);

        final double tolerance =
            Math.ceil(small.getStandardError() * cardinality);
        assertEquals((double) cardinality, (double) squashed.count(), tolerance);
      }
    }
  }
}
result = HyperLogLog.builder().setNumRegisterIndexBits(p) .setEncoding(EncodingType.SPARSE).build(); int numRegisterEntries = (int) readVulong(in); result = HyperLogLog.builder().setNumRegisterIndexBits(p) .setEncoding(EncodingType.DENSE).enableBitPacking(false).build(); } else { result = HyperLogLog.builder().setNumRegisterIndexBits(p) .setEncoding(EncodingType.DENSE).enableBitPacking(true).build();
/**
 * Reduces the accuracy of this HyperLogLog by squashing it down to a
 * smaller number of register index bits.
 *
 * @param p0
 *          - new p size for the new HyperLogLog (smaller or no change)
 * @return this very instance when {@code p0} equals the current p;
 *         otherwise a new DENSE-encoded HyperLogLog built with {@code p0}
 *         register index bits and the same noBias setting
 * @throws IllegalArgumentException
 *           if {@code p0} is larger than the current p
 */
public HyperLogLog squash(final int p0) {
  if (p0 > p) {
    throw new IllegalArgumentException(
        "HyperLogLog cannot be squashed to be bigger. Current: "
            + toString() + " Provided: " + p0);
  }

  if (p0 == p) {
    // Nothing to reduce: the caller gets this same instance back, not a copy.
    return this;
  }

  // The squashed instance is always DENSE, whatever encoding this one uses.
  final HyperLogLog hll = new HyperLogLogBuilder()
      .setNumRegisterIndexBits(p0).setEncoding(EncodingType.DENSE)
      .enableNoBias(noBias).build();
  final HLLDenseRegister result = hll.denseRegister;

  // Fill the new dense register from whichever register backs this instance.
  if (encoding == EncodingType.SPARSE) {
    sparseRegister.extractLowBitsTo(result);
  } else if (encoding == EncodingType.DENSE) {
    denseRegister.extractLowBitsTo(result);
  }
  return hll;
}
HyperLogLog hll2 = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); HyperLogLog hll3 = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); HyperLogLog hll4 = HyperLogLog.builder().setNumRegisterIndexBits(16) .setEncoding(EncodingType.SPARSE).build(); HyperLogLog hll5 = HyperLogLog.builder().setNumRegisterIndexBits(12) .setEncoding(EncodingType.SPARSE).build(); int size = 500;
HyperLogLog hll2 = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build(); HyperLogLog hll3 = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build(); HyperLogLog hll4 = HyperLogLog.builder().setNumRegisterIndexBits(16) .setEncoding(EncodingType.DENSE).build(); HyperLogLog hll5 = HyperLogLog.builder().setNumRegisterIndexBits(12) .setEncoding(EncodingType.DENSE).build(); int size = 1000;
HyperLogLog hll2 = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build(); HyperLogLog hll3 = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); HyperLogLog hll4 = HyperLogLog.builder().setNumRegisterIndexBits(16) .setEncoding(EncodingType.SPARSE).build(); HyperLogLog hll5 = HyperLogLog.builder().setNumRegisterIndexBits(12) .setEncoding(EncodingType.SPARSE).build(); int size = 1000;
HyperLogLog hll2 = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); HyperLogLog hll3 = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build(); HyperLogLog hll4 = HyperLogLog.builder().setNumRegisterIndexBits(16) .setEncoding(EncodingType.DENSE).build(); HyperLogLog hll5 = HyperLogLog.builder().setNumRegisterIndexBits(12) .setEncoding(EncodingType.DENSE).build(); int size = 1000;
HyperLogLog hll2 = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); HyperLogLog hll3 = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); HyperLogLog hll4 = HyperLogLog.builder().setNumRegisterIndexBits(16) .setEncoding(EncodingType.SPARSE).build(); HyperLogLog hll5 = HyperLogLog.builder().setNumRegisterIndexBits(12) .setEncoding(EncodingType.SPARSE).build(); int size = 1000;
/**
 * Merges two 16-bit HLLs whose input ranges overlap by 75% and checks that
 * the merged estimate is close to the 1.25 * size distinct values.
 */
@Test
public void testHLLMerge75PercentOverlap() {
  final HyperLogLog target = HyperLogLog.builder().setNumRegisterIndexBits(16).build();
  for (int value = 0; value < size; value++) {
    target.addLong(value);
  }

  // Second range is [0.25 * size, 1.25 * size): three quarters of it repeat the first.
  final int start = (int) (0.25 * size);
  final int end = (int) (size * 1.25);
  final HyperLogLog other = HyperLogLog.builder().setNumRegisterIndexBits(16).build();
  for (int value = start; value < end; value++) {
    other.addLong(value);
  }

  target.merge(other);

  // Tolerance is a percentage of size, picked by input range.
  final double tolerancePercent;
  if (size > 40000) {
    tolerancePercent = longRangeTolerance;
  } else {
    tolerancePercent = shortRangeTolerance;
  }
  final double delta = tolerancePercent * size / 100;

  final long expected = (long) (1.25 * size);
  final long actual = target.count();
  assertEquals(expected, actual, delta);
}
/**
 * Merges two 16-bit HLLs whose input ranges overlap by 50% and checks that
 * the merged estimate is close to the 1.5 * size distinct values.
 */
@Test
public void testHLLMerge50PercentOverlap() {
  final HyperLogLog target = HyperLogLog.builder().setNumRegisterIndexBits(16).build();
  for (int value = 0; value < size; value++) {
    target.addLong(value);
  }

  // Second range is [0.5 * size, 1.5 * size): half of it repeats the first.
  final int start = (int) (0.5 * size);
  final int end = (int) (size * 1.5);
  final HyperLogLog other = HyperLogLog.builder().setNumRegisterIndexBits(16).build();
  for (int value = start; value < end; value++) {
    other.addLong(value);
  }

  target.merge(other);

  // Tolerance is a percentage of size, picked by input range.
  final double tolerancePercent;
  if (size > 40000) {
    tolerancePercent = longRangeTolerance;
  } else {
    tolerancePercent = shortRangeTolerance;
  }
  final double delta = tolerancePercent * size / 100;

  final long expected = (long) (1.5 * size);
  final long actual = target.count();
  assertEquals(expected, actual, delta);
}
/**
 * Merges two 16-bit HLLs whose input ranges overlap by 25% and checks that
 * the merged estimate is close to the 1.75 * size distinct values.
 */
@Test
public void testHLLMerge25PercentOverlap() {
  final HyperLogLog target = HyperLogLog.builder().setNumRegisterIndexBits(16).build();
  for (int value = 0; value < size; value++) {
    target.addLong(value);
  }

  // Second range is [0.75 * size, 1.75 * size): a quarter of it repeats the first.
  final int start = (int) (0.75 * size);
  final int end = (int) (size * 1.75);
  final HyperLogLog other = HyperLogLog.builder().setNumRegisterIndexBits(16).build();
  for (int value = start; value < end; value++) {
    other.addLong(value);
  }

  target.merge(other);

  // Tolerance is a percentage of size, picked by input range.
  final double tolerancePercent;
  if (size > 40000) {
    tolerancePercent = longRangeTolerance;
  } else {
    tolerancePercent = shortRangeTolerance;
  }
  final double delta = tolerancePercent * size / 100;

  final long expected = (long) (1.75 * size);
  final long actual = target.count();
  assertEquals(expected, actual, delta);
}
/**
 * Merges two 16-bit HLLs built from disjoint input ranges and checks that
 * the merged estimate is close to the 2 * size distinct values.
 */
@Test
public void testHLLMergeDisjoint() {
  final HyperLogLog target = HyperLogLog.builder().setNumRegisterIndexBits(16).build();
  for (int value = 0; value < size; value++) {
    target.addLong(value);
  }

  // Second range is [size, 2 * size): no value repeats the first range.
  final HyperLogLog other = HyperLogLog.builder().setNumRegisterIndexBits(16).build();
  for (int value = size; value < 2 * size; value++) {
    other.addLong(value);
  }

  target.merge(other);

  // Tolerance is a percentage of size, picked by input range.
  final double tolerancePercent;
  if (size > 40000) {
    tolerancePercent = longRangeTolerance;
  } else {
    tolerancePercent = shortRangeTolerance;
  }
  final double delta = tolerancePercent * size / 100;

  final long expected = 2 * size;
  final long actual = target.count();
  assertEquals(expected, actual, delta);
}
/**
 * Merges two 16-bit HLLs built from the identical input range and checks
 * that the merged estimate is still close to size distinct values.
 */
@Test
public void testHLLMerge100PercentOverlap() {
  final HyperLogLog target = HyperLogLog.builder().setNumRegisterIndexBits(16).build();
  for (int value = 0; value < size; value++) {
    target.addLong(value);
  }

  // Second HLL sees exactly the same values as the first.
  final HyperLogLog other = HyperLogLog.builder().setNumRegisterIndexBits(16).build();
  for (int value = 0; value < size; value++) {
    other.addLong(value);
  }

  target.merge(other);

  // Tolerance is a percentage of size, picked by input range.
  final double tolerancePercent;
  if (size > 40000) {
    tolerancePercent = longRangeTolerance;
  } else {
    tolerancePercent = shortRangeTolerance;
  }
  final double delta = tolerancePercent * size / 100;

  final long expected = size;
  final long actual = target.count();
  assertEquals(expected, actual, delta);
}
/**
 * Discards the current sketch by assigning a freshly built HyperLogLog,
 * configured with 12 register index bits, to the {@code hll} field.
 */
public void reset() { hll = HyperLogLog.builder().setNumRegisterIndexBits(12).build(); } }
/**
 * Adds 1000 distinct longs to a SPARSE-encoded HLL configured with 16
 * register index bits and checks the estimate against the tolerance.
 */
@Test
public void testHLLSparseMoreRegisterBits() {
  final int size = 1000;
  final HyperLogLog sketch = HyperLogLog.builder()
      .setEncoding(EncodingType.SPARSE)
      .setNumRegisterIndexBits(16)
      .build();

  for (int value = 0; value < size; value++) {
    sketch.addLong(value);
  }

  // Tolerance is a percentage of size, picked by input range.
  final double tolerancePercent;
  if (size > 40000) {
    tolerancePercent = longRangeTolerance;
  } else {
    tolerancePercent = shortRangeTolerance;
  }
  final double delta = tolerancePercent * size / 100;

  assertEquals((double) size, (double) sketch.count(), delta);
}