@Override
public long estimateCount(Object item) {
  if (item instanceof String) {
    return estimateCountForStringItem((String) item);
  } else if (item instanceof byte[]) {
    return estimateCountForBinaryItem((byte[]) item);
  } else {
    return estimateCountForLongItem(Utils.integralToLong(item));
  }
}
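A minimal usage sketch (not from the source snippets) showing how the type-dispatching add/estimateCount methods above are reached through Spark's public CountMinSketch factory; the depth, width, seed, and item values are illustrative assumptions.

import org.apache.spark.util.sketch.CountMinSketch;

public class CountMinSketchUsageSketch {
  public static void main(String[] args) {
    // Illustrative parameters: depth = 10, width = 20, seed = 42.
    CountMinSketch sketch = CountMinSketch.create(10, 20, 42);
    sketch.add("spark", 3);               // String item -> addString
    sketch.add(new byte[] {1, 2, 3}, 2);  // byte[] item -> addBinary
    sketch.add(7L, 5);                    // integral item -> addLong
    // Count-min estimates are upper bounds: at least the true count, possibly more.
    System.out.println(sketch.estimateCount("spark"));              // >= 3
    System.out.println(sketch.estimateCount(new byte[] {1, 2, 3})); // >= 2
    System.out.println(sketch.estimateCount(7L));                   // >= 5
  }
}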
@Override
public boolean put(Object item) {
  if (item instanceof String) {
    return putString((String) item);
  } else if (item instanceof byte[]) {
    return putBinary((byte[]) item);
  } else {
    return putLong(Utils.integralToLong(item));
  }
}
@Test
public void testCountMinSketch() {
  Dataset<Long> df = spark.range(1000);

  CountMinSketch sketch1 = df.stat().countMinSketch("id", 10, 20, 42);
  Assert.assertEquals(1000, sketch1.totalCount());
  Assert.assertEquals(10, sketch1.depth());
  Assert.assertEquals(20, sketch1.width());

  CountMinSketch sketch2 = df.stat().countMinSketch(col("id"), 10, 20, 42);
  Assert.assertEquals(1000, sketch2.totalCount());
  Assert.assertEquals(10, sketch2.depth());
  Assert.assertEquals(20, sketch2.width());

  CountMinSketch sketch3 = df.stat().countMinSketch("id", 0.001, 0.99, 42);
  Assert.assertEquals(1000, sketch3.totalCount());
  Assert.assertEquals(0.001, sketch3.relativeError(), 1.0e-4);
  Assert.assertEquals(0.99, sketch3.confidence(), 5.0e-3);

  CountMinSketch sketch4 = df.stat().countMinSketch(col("id"), 0.001, 0.99, 42);
  Assert.assertEquals(1000, sketch4.totalCount());
  Assert.assertEquals(0.001, sketch4.relativeError(), 1.0e-4);
  Assert.assertEquals(0.99, sketch4.confidence(), 5.0e-3);
}
@Override
public void add(Object item, long count) {
  if (item instanceof String) {
    addString((String) item, count);
  } else if (item instanceof byte[]) {
    addBinary((byte[]) item, count);
  } else {
    addLong(Utils.integralToLong(item), count);
  }
}
@Override
public boolean mightContain(Object item) {
  if (item instanceof String) {
    return mightContainString((String) item);
  } else if (item instanceof byte[]) {
    return mightContainBinary((byte[]) item);
  } else {
    return mightContainLong(Utils.integralToLong(item));
  }
}
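A minimal usage sketch (not from the source snippets) showing the put/mightContain dispatchers above via Spark's public BloomFilter factory; the expected item count, false-positive rate, and items are illustrative assumptions.

import org.apache.spark.util.sketch.BloomFilter;

public class BloomFilterUsageSketch {
  public static void main(String[] args) {
    // Illustrative parameters: 1000 expected items, 3% target false-positive probability.
    BloomFilter filter = BloomFilter.create(1000, 0.03);
    filter.put("spark");                // String item -> putString
    filter.put(new byte[] {1, 2, 3});   // byte[] item -> putBinary
    filter.put(42L);                    // integral item -> putLong
    System.out.println(filter.mightContain("spark")); // true (no false negatives)
    System.out.println(filter.mightContain(99L));     // false with high probability
  }
}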
@Override
public long estimateCount(Object item) {
  if (item instanceof String) {
    return estimateCountForStringItem((String) item);
  } else {
    return estimateCountForLongItem(Utils.integralToLong(item));
  }
}
@Override
public void add(Object item, long count) {
  if (item instanceof String) {
    addString((String) item, count);
  } else {
    addLong(Utils.integralToLong(item), count);
  }
}
@Test
public void testBloomFilter() {
  Dataset<Long> df = spark.range(1000);

  BloomFilter filter1 = df.stat().bloomFilter("id", 1000, 0.03);
  Assert.assertTrue(filter1.expectedFpp() - 0.03 < 1e-3);
  for (int i = 0; i < 1000; i++) {
    Assert.assertTrue(filter1.mightContain(i));
  }

  BloomFilter filter2 = df.stat().bloomFilter(col("id").multiply(3), 1000, 0.03);
  Assert.assertTrue(filter2.expectedFpp() - 0.03 < 1e-3);
  for (int i = 0; i < 1000; i++) {
    Assert.assertTrue(filter2.mightContain(i * 3));
  }

  BloomFilter filter3 = df.stat().bloomFilter("id", 1000, 64 * 5);
  Assert.assertEquals(64 * 5, filter3.bitSize());
  for (int i = 0; i < 1000; i++) {
    Assert.assertTrue(filter3.mightContain(i));
  }

  BloomFilter filter4 = df.stat().bloomFilter(col("id").multiply(3), 1000, 64 * 5);
  Assert.assertEquals(64 * 5, filter4.bitSize());
  for (int i = 0; i < 1000; i++) {
    Assert.assertTrue(filter4.mightContain(i * 3));
  }
}
@Override
public boolean putString(String item) {
  return putBinary(Utils.getBytesFromUTF8String(item));
}
public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed) {
  // This is not compatible with the original and other Murmur3 implementations,
  // but it is kept for backward compatibility with components that existed before 2.3.
  assert (lengthInBytes >= 0) : "lengthInBytes cannot be negative";
  int lengthAligned = lengthInBytes - lengthInBytes % 4;
  int h1 = hashBytesByInt(base, offset, lengthAligned, seed);
  for (int i = lengthAligned; i < lengthInBytes; i++) {
    int halfWord = Platform.getByte(base, offset + i);
    int k1 = mixK1(halfWord);
    h1 = mixH1(h1, k1);
  }
  return fmix(h1, lengthInBytes);
}
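A minimal usage sketch (not from the source snippets), assuming the method above is Spark's Murmur3_x86_32.hashUnsafeBytes; it hashes an on-heap byte array by passing Platform.BYTE_ARRAY_OFFSET as the base offset, with an illustrative seed.

import java.nio.charset.StandardCharsets;
import org.apache.spark.unsafe.Platform;
import org.apache.spark.unsafe.hash.Murmur3_x86_32;

public class HashUnsafeBytesUsageSketch {
  public static void main(String[] args) {
    byte[] data = "count-min".getBytes(StandardCharsets.UTF_8);
    // The byte[] itself is the base object; BYTE_ARRAY_OFFSET points at its first element.
    int hash = Murmur3_x86_32.hashUnsafeBytes(data, Platform.BYTE_ARRAY_OFFSET, data.length, 42);
    System.out.println(hash);
  }
}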