/**
 * Creates an empty distinct-value estimator for the requested algorithm.
 *
 * <p>The algorithm name is matched case-insensitively: {@code "fm"} yields an {@link FMSketch}
 * built with {@code numBitVectors}; {@code "hll"} yields a size-optimized {@link HyperLogLog}
 * (in that case {@code numBitVectors} is not used).
 *
 * @param func name of the estimator algorithm, {@code "fm"} or {@code "hll"} in any letter case
 * @param numBitVectors number of bit vectors handed to the {@link FMSketch} constructor
 * @return a newly created estimator
 * @throws RuntimeException if {@code func} is {@code null} or names neither algorithm
 */
public static NumDistinctValueEstimator getEmptyNumDistinctValueEstimator(String func, int numBitVectors) {
  // equalsIgnoreCase does not depend on the default locale (unlike toLowerCase()) and is
  // null-safe, so a null name reaches the descriptive exception below instead of a bare NPE.
  if ("fm".equalsIgnoreCase(func)) {
    return new FMSketch(numBitVectors);
  }
  if ("hll".equalsIgnoreCase(func)) {
    return HyperLogLog.builder().setSizeOptimized().build();
  }
  throw new RuntimeException("Can not recognize " + func);
}
/**
 * Adds {@code size} pseudo-random values drawn from {@code [0, size / 2)} to a default-built
 * HyperLogLog and checks that its estimate matches the exact distinct count (tracked in a
 * {@link HashSet}) within the configured tolerance, which is applied as a percentage.
 */
@Test
public void testHLLAddHalfDistinct() {
  final int distinctBound = size / 2;
  final Random generator = new Random(size);
  final HyperLogLog sketch = HyperLogLog.builder().build();
  final Set<Long> seen = new HashSet<>();

  int remaining = size;
  while (remaining-- > 0) {
    long value = generator.nextInt(distinctBound);
    seen.add(value);
    sketch.addLong(value);
  }

  // Inputs above 40000 values use the long-range (no-bias) tolerance, smaller ones the short-range one.
  double tolerancePercent;
  if (size > 40000) {
    tolerancePercent = noBiaslongRangeTolerance;
  } else {
    tolerancePercent = shortRangeTolerance;
  }
  final double allowedError = tolerancePercent * seen.size() / 100;

  final double exactCount = seen.size();
  final double estimatedCount = sketch.count();
  assertEquals(exactCount, estimatedCount, allowedError);
}
/**
 * Adds {@code size} pseudo-random values drawn from {@code [0, size / 2)} to a default-built
 * HyperLogLog and checks that its estimate matches the exact distinct count (tracked in a
 * {@link HashSet}) within the configured tolerance, which is applied as a percentage.
 */
@Test
public void testHLLAddHalfDistinct() {
  final int distinctBound = size / 2;
  final Random generator = new Random(size);
  final HyperLogLog sketch = HyperLogLog.builder().build();
  final Set<Long> seen = new HashSet<>();

  for (int added = 0; added != size; ++added) {
    long value = generator.nextInt(distinctBound);
    seen.add(value);
    sketch.addLong(value);
  }

  // Inputs above 40000 values use the long-range tolerance, smaller ones the short-range one.
  double tolerancePercent;
  if (size > 40000) {
    tolerancePercent = longRangeTolerance;
  } else {
    tolerancePercent = shortRangeTolerance;
  }
  final double allowedError = tolerancePercent * seen.size() / 100;

  final double exactCount = seen.size();
  final double estimatedCount = sketch.count();
  assertEquals(exactCount, estimatedCount, allowedError);
}
}
/**
 * Round-trips a sparse-encoded HyperLogLog through {@code testFile}: fills the sketch with
 * {@code size} pseudo-random longs (seeded with {@code SEED}), serializes it, deserializes it
 * again, and checks that the copy agrees with the original on equality, both string forms,
 * hash code and count.
 *
 * @throws IOException if writing to or reading from {@code testFile} fails
 */
@Test
public void testHLLSparseSerialization() throws IOException {
  HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build();
  Random rand = new Random(SEED);
  for (int i = 0; i < size; i++) {
    hll.addLong(rand.nextLong());
  }

  // try-with-resources releases the file handles even if serialization throws; the original
  // never closed either stream.
  try (DataOutputStream out = new DataOutputStream(new FileOutputStream(testFile))) {
    HyperLogLogUtils.serializeHLL(out, hll);
  }

  HyperLogLog deserializedHLL;
  try (DataInputStream in = new DataInputStream(new FileInputStream(testFile))) {
    deserializedHLL = HyperLogLogUtils.deserializeHLL(in);
  }

  assertEquals(hll, deserializedHLL);
  assertEquals(hll.toString(), deserializedHLL.toString());
  assertEquals(hll.toStringExtended(), deserializedHLL.toStringExtended());
  assertEquals(hll.hashCode(), deserializedHLL.hashCode());
  assertEquals(hll.count(), deserializedHLL.count());
}
/**
 * Merges two 16-register-index-bit sketches whose inputs share a quarter of their values:
 * the first holds {@code [0, size)}, the second {@code [0.75 * size, 1.75 * size)}. The merged
 * estimate must be within tolerance of {@code 1.75 * size}.
 */
@Test
public void testHLLMerge25PercentOverlap() {
  final HyperLogLog left = HyperLogLog.builder().setNumRegisterIndexBits(16).build();
  int value = 0;
  while (value < size) {
    left.addLong(value);
    value++;
  }

  final HyperLogLog right = HyperLogLog.builder().setNumRegisterIndexBits(16).build();
  final int rangeStart = (int) (0.75 * size);
  final int rangeEnd = (int) (size * 1.75);
  for (value = rangeStart; value < rangeEnd; value++) {
    right.addLong(value);
  }

  left.merge(right);

  // The tolerance is a percentage, selected by input size and scaled by size.
  double tolerancePercent;
  if (size > 40000) {
    tolerancePercent = longRangeTolerance;
  } else {
    tolerancePercent = shortRangeTolerance;
  }
  final double allowedError = tolerancePercent * size / 100;

  final long expectedCount = (long) (1.75 * size);
  final long mergedCount = left.count();
  assertEquals(expectedCount, mergedCount, allowedError);
}
/**
 * Merges two 16-register-index-bit sketches whose inputs share half of their values:
 * the first holds {@code [0, size)}, the second {@code [0.5 * size, 1.5 * size)}. The merged
 * estimate must be within tolerance of {@code 1.5 * size}.
 */
@Test
public void testHLLMerge50PercentOverlap() {
  final HyperLogLog left = HyperLogLog.builder().setNumRegisterIndexBits(16).build();
  int value = 0;
  while (value < size) {
    left.addLong(value);
    value++;
  }

  final HyperLogLog right = HyperLogLog.builder().setNumRegisterIndexBits(16).build();
  final int rangeStart = (int) (0.5 * size);
  final int rangeEnd = (int) (size * 1.5);
  for (value = rangeStart; value < rangeEnd; value++) {
    right.addLong(value);
  }

  left.merge(right);

  // The tolerance is a percentage, selected by input size and scaled by size.
  double tolerancePercent;
  if (size > 40000) {
    tolerancePercent = longRangeTolerance;
  } else {
    tolerancePercent = shortRangeTolerance;
  }
  final double allowedError = tolerancePercent * size / 100;

  final long expectedCount = (long) (1.5 * size);
  final long mergedCount = left.count();
  assertEquals(expectedCount, mergedCount, allowedError);
}
/**
 * Round-trips a dense-encoded HyperLogLog built with bit packing disabled through
 * {@code testFile}: fills the sketch with {@code size} pseudo-random longs (seeded with
 * {@code SEED}), serializes it, deserializes it again, and checks that the copy agrees with the
 * original on equality, both string forms, hash code and count.
 *
 * @throws IOException if writing to or reading from {@code testFile} fails
 */
@Test
public void testHLLDenseNoBitPacking() throws IOException {
  HyperLogLog hll = HyperLogLog.builder()
      .setEncoding(EncodingType.DENSE)
      .enableBitPacking(false)
      .build();
  Random rand = new Random(SEED);
  for (int i = 0; i < size; i++) {
    hll.addLong(rand.nextLong());
  }

  // try-with-resources releases the file handles even if serialization throws; the original
  // never closed either stream.
  try (DataOutputStream out = new DataOutputStream(new FileOutputStream(testFile))) {
    HyperLogLogUtils.serializeHLL(out, hll);
  }

  HyperLogLog deserializedHLL;
  try (DataInputStream in = new DataInputStream(new FileInputStream(testFile))) {
    deserializedHLL = HyperLogLogUtils.deserializeHLL(in);
  }

  assertEquals(hll, deserializedHLL);
  assertEquals(hll.toString(), deserializedHLL.toString());
  assertEquals(hll.toStringExtended(), deserializedHLL.toStringExtended());
  assertEquals(hll.hashCode(), deserializedHLL.hashCode());
  assertEquals(hll.count(), deserializedHLL.count());
}
/**
 * Merges two 16-register-index-bit sketches whose inputs share three quarters of their values:
 * the first holds {@code [0, size)}, the second {@code [0.25 * size, 1.25 * size)}. The merged
 * estimate must be within tolerance of {@code 1.25 * size}.
 */
@Test
public void testHLLMerge75PercentOverlap() {
  final HyperLogLog left = HyperLogLog.builder().setNumRegisterIndexBits(16).build();
  int value = 0;
  while (value < size) {
    left.addLong(value);
    value++;
  }

  final HyperLogLog right = HyperLogLog.builder().setNumRegisterIndexBits(16).build();
  final int rangeStart = (int) (0.25 * size);
  final int rangeEnd = (int) (size * 1.25);
  for (value = rangeStart; value < rangeEnd; value++) {
    right.addLong(value);
  }

  left.merge(right);

  // The tolerance is a percentage, selected by input size and scaled by size.
  double tolerancePercent;
  if (size > 40000) {
    tolerancePercent = longRangeTolerance;
  } else {
    tolerancePercent = shortRangeTolerance;
  }
  final double allowedError = tolerancePercent * size / 100;

  final long expectedCount = (long) (1.25 * size);
  final long mergedCount = left.count();
  assertEquals(expectedCount, mergedCount, allowedError);
}
/**
 * Merges two 16-register-index-bit sketches that were both fed the identical values
 * {@code [0, size)}. The merged estimate must be within tolerance of {@code size}.
 */
@Test
public void testHLLMerge100PercentOverlap() {
  final HyperLogLog left = HyperLogLog.builder().setNumRegisterIndexBits(16).build();
  int value = 0;
  while (value < size) {
    left.addLong(value);
    value++;
  }

  final HyperLogLog right = HyperLogLog.builder().setNumRegisterIndexBits(16).build();
  for (value = 0; value < size; value++) {
    right.addLong(value);
  }

  left.merge(right);

  // The tolerance is a percentage, selected by input size and scaled by size.
  double tolerancePercent;
  if (size > 40000) {
    tolerancePercent = longRangeTolerance;
  } else {
    tolerancePercent = shortRangeTolerance;
  }
  final double allowedError = tolerancePercent * size / 100;

  final long expectedCount = size;
  final long mergedCount = left.count();
  assertEquals(expectedCount, mergedCount, allowedError);
}
/**
 * Merges two 16-register-index-bit sketches with no values in common: the first holds
 * {@code [0, size)}, the second {@code [size, 2 * size)}. The merged estimate must be within
 * tolerance of {@code 2 * size}.
 */
@Test
public void testHLLMergeDisjoint() {
  final HyperLogLog left = HyperLogLog.builder().setNumRegisterIndexBits(16).build();
  int value = 0;
  while (value < size) {
    left.addLong(value);
    value++;
  }

  final HyperLogLog right = HyperLogLog.builder().setNumRegisterIndexBits(16).build();
  // Computed in int arithmetic, exactly as the loop bound and expected value were originally.
  final int doubledSize = 2 * size;
  for (value = size; value < doubledSize; value++) {
    right.addLong(value);
  }

  left.merge(right);

  // The tolerance is a percentage, selected by input size and scaled by size.
  double tolerancePercent;
  if (size > 40000) {
    tolerancePercent = longRangeTolerance;
  } else {
    tolerancePercent = shortRangeTolerance;
  }
  final double allowedError = tolerancePercent * size / 100;

  final long expectedCount = doubledSize;
  final long mergedCount = left.count();
  assertEquals(expectedCount, mergedCount, allowedError);
}
/**
 * Prepares the fixture before each test: builds a metastore configuration with
 * {@code STATS_FETCH_BITVECTOR} set to {@code false}, creates an {@link ObjectStore} on that
 * configuration, drops all existing store objects, creates the default catalog, and fills
 * {@code bitVectors} with two serialized HyperLogLog sketches.
 *
 * @throws Exception if any of the setup calls fails
 */
@Before
public void setUp() throws Exception {
  conf = MetastoreConf.newMetastoreConf();
  MetastoreConf.setBoolVar(conf, MetastoreConf.ConfVars.STATS_FETCH_BITVECTOR, false);
  MetaStoreTestUtils.setConfForStandloneMode(conf);

  store = new ObjectStore();
  store.setConf(conf);
  dropAllStoreObjects(store);

  final Warehouse warehouse = new Warehouse(conf);
  HiveMetaStore.HMSHandler.createDefaultCatalog(store, warehouse);

  // Slot 1 holds a sketch that has seen only the value 1.
  final HyperLogLog singleValueSketch = HyperLogLog.builder().build();
  singleValueSketch.addLong(1);
  bitVectors[1] = singleValueSketch.serialize();

  // Slot 0 holds a sketch fed 2, 3, 3, 4 (the value 3 is added twice).
  final HyperLogLog multiValueSketch = HyperLogLog.builder().build();
  multiValueSketch.addLong(2);
  multiValueSketch.addLong(3);
  multiValueSketch.addLong(3);
  multiValueSketch.addLong(4);
  bitVectors[0] = multiValueSketch.serialize();
}
/**
 * Adds {@code size} pseudo-random values drawn from {@code [0, size / 2)} to a dense-encoded
 * HyperLogLog and checks that its estimate matches the exact distinct count (tracked in a
 * {@link HashSet}) within the configured tolerance, which is applied as a percentage.
 */
@Test
public void testHLLAddHalfDistinct() {
  final int distinctBound = size / 2;
  final Random generator = new Random(size);
  final HyperLogLog sketch =
      HyperLogLog.builder().setEncoding(HyperLogLog.EncodingType.DENSE).build();
  final Set<Long> seen = new HashSet<>();

  int remaining = size;
  while (remaining-- > 0) {
    long value = generator.nextInt(distinctBound);
    seen.add(value);
    sketch.addLong(value);
  }

  // Inputs above 40000 values use the long-range tolerance, smaller ones the short-range one.
  double tolerancePercent;
  if (size > 40000) {
    tolerancePercent = longRangeTolerance;
  } else {
    tolerancePercent = shortRangeTolerance;
  }
  final double allowedError = tolerancePercent * seen.size() / 100;

  final double exactCount = seen.size();
  final double estimatedCount = sketch.count();
  assertEquals(exactCount, estimatedCount, allowedError);
}
/**
 * Adds {@code size} pseudo-random values drawn from {@code [0, size / 2)} to a HyperLogLog
 * built with {@code enableNoBias(false)} and checks that its estimate matches the exact
 * distinct count (tracked in a {@link HashSet}) within the configured tolerance, which is
 * applied as a percentage.
 */
@Test
public void testHLLNoBiasDisabledHalfDistinct() {
  final int distinctBound = size / 2;
  final Random generator = new Random(size);
  final HyperLogLog sketch = HyperLogLog.builder().enableNoBias(false).build();
  final Set<Long> seen = new HashSet<>();

  for (int added = 0; added != size; ++added) {
    long value = generator.nextInt(distinctBound);
    seen.add(value);
    sketch.addLong(value);
  }

  // Inputs above 40000 values use the biased long-range tolerance, smaller ones the short-range one.
  double tolerancePercent;
  if (size > 40000) {
    tolerancePercent = biasedlongRangeTolerance;
  } else {
    tolerancePercent = shortRangeTolerance;
  }
  final double allowedError = tolerancePercent * seen.size() / 100;

  final double exactCount = seen.size();
  final double estimatedCount = sketch.count();
  assertEquals(exactCount, estimatedCount, allowedError);
}
/**
 * Discards the current sketch by replacing {@code hll} with a freshly built HyperLogLog that
 * uses 12 register index bits.
 */
public void reset() {
  final int registerIndexBits = 12;
  hll = HyperLogLog.builder()
      .setNumRegisterIndexBits(registerIndexBits)
      .build();
}
}
/**
 * Adds 100 pseudo-random longs to a dense-encoded HyperLogLog and checks that the estimate is
 * within tolerance of 100.
 *
 * <p>The generator is seeded with the class-level {@code size}; the number of values added is
 * the fixed local count, not that field.
 */
@Test
public void testHLLAdd() {
  final Random generator = new Random(size);
  final HyperLogLog sketch =
      HyperLogLog.builder().setEncoding(HyperLogLog.EncodingType.DENSE).build();

  final int valueCount = 100;
  int added = 0;
  while (added < valueCount) {
    sketch.addLong(generator.nextLong());
    added++;
  }

  // The tolerance is a percentage, selected by the number of values added.
  double tolerancePercent;
  if (valueCount > 40000) {
    tolerancePercent = longRangeTolerance;
  } else {
    tolerancePercent = shortRangeTolerance;
  }
  final double allowedError = tolerancePercent * valueCount / 100;

  final double estimatedCount = sketch.count();
  assertEquals((double) valueCount, estimatedCount, allowedError);
}
/**
 * Creates an empty estimator of the same kind as the given one.
 *
 * <p>For an {@link FMSketch} the result is a new {@link FMSketch} with the same number of bit
 * vectors; for any other argument (including {@code null}) it is a size-optimized
 * {@link HyperLogLog}.
 *
 * @param n the estimator whose kind should be mirrored
 * @return a newly created estimator
 */
public static NumDistinctValueEstimator getEmptyNumDistinctValueEstimator(
    NumDistinctValueEstimator n) {
  if (!(n instanceof FMSketch)) {
    return HyperLogLog.builder().setSizeOptimized().build();
  }
  final FMSketch template = (FMSketch) n;
  return new FMSketch(template.getNumBitVectors());
}
/**
 * Adds the values {@code 0..999} to a sparse-encoded HyperLogLog configured with 16 register
 * index bits and checks that the estimate is within tolerance of 1000.
 */
@Test
public void testHLLSparseMoreRegisterBits() {
  final HyperLogLog sketch = HyperLogLog.builder()
      .setEncoding(EncodingType.SPARSE)
      .setNumRegisterIndexBits(16)
      .build();

  final int valueCount = 1000;
  int value = 0;
  while (value < valueCount) {
    sketch.addLong(value);
    value++;
  }

  // The tolerance is a percentage, selected by the number of values added.
  double tolerancePercent;
  if (valueCount > 40000) {
    tolerancePercent = longRangeTolerance;
  } else {
    tolerancePercent = shortRangeTolerance;
  }
  final double allowedError = tolerancePercent * valueCount / 100;

  final double estimatedCount = sketch.count();
  assertEquals((double) valueCount, estimatedCount, allowedError);
}
/**
 * Adds 100 pseudo-random longs to a default-built HyperLogLog and checks that the estimate is
 * within tolerance of 100.
 *
 * <p>The generator is seeded with the class-level {@code size}; the number of values added is
 * the fixed local count, not that field.
 */
@Test
public void testHLLAdd() {
  final Random generator = new Random(size);
  final HyperLogLog sketch = HyperLogLog.builder().build();

  final int valueCount = 100;
  int added = 0;
  while (added < valueCount) {
    sketch.addLong(generator.nextLong());
    added++;
  }

  // The tolerance is a percentage, selected by the number of values added.
  double tolerancePercent;
  if (valueCount > 40000) {
    tolerancePercent = longRangeTolerance;
  } else {
    tolerancePercent = shortRangeTolerance;
  }
  final double allowedError = tolerancePercent * valueCount / 100;

  final double estimatedCount = sketch.count();
  assertEquals((double) valueCount, estimatedCount, allowedError);
}
/**
 * Adds 100 pseudo-random longs to a default-built HyperLogLog and checks that the estimate is
 * within tolerance of 100.
 *
 * <p>The generator is seeded with the class-level {@code size}; the number of values added is
 * the fixed local count, not that field.
 */
@Test
public void testHLLAdd() {
  final Random generator = new Random(size);
  final HyperLogLog sketch = HyperLogLog.builder().build();

  final int valueCount = 100;
  for (int added = 0; added != valueCount; ++added) {
    sketch.addLong(generator.nextLong());
  }

  // The tolerance is a percentage, selected by the number of values added.
  double tolerancePercent;
  if (valueCount > 40000) {
    tolerancePercent = noBiaslongRangeTolerance;
  } else {
    tolerancePercent = shortRangeTolerance;
  }
  final double allowedError = tolerancePercent * valueCount / 100;

  final double estimatedCount = sketch.count();
  assertEquals((double) valueCount, estimatedCount, allowedError);
}
/**
 * Adds 100 pseudo-random longs to a HyperLogLog built with {@code enableNoBias(false)} and
 * checks that the estimate is within tolerance of 100.
 *
 * <p>The generator is seeded with the class-level {@code size}; the number of values added is
 * the fixed local count, not that field.
 */
@Test
public void testHLLNoBiasDisabled() {
  final Random generator = new Random(size);
  final HyperLogLog sketch = HyperLogLog.builder().enableNoBias(false).build();

  final int valueCount = 100;
  for (int added = 0; added != valueCount; ++added) {
    sketch.addLong(generator.nextLong());
  }

  // The tolerance is a percentage, selected by the number of values added.
  double tolerancePercent;
  if (valueCount > 40000) {
    tolerancePercent = biasedlongRangeTolerance;
  } else {
    tolerancePercent = shortRangeTolerance;
  }
  final double allowedError = tolerancePercent * valueCount / 100;

  final double estimatedCount = sketch.count();
  assertEquals((double) valueCount, estimatedCount, allowedError);
}