/**
 * Builds a {@link BloomFilter} sized for the given number of expected insertions, using the
 * default expected false positive probability of 3% ({@code DEFAULT_FPP}).
 *
 * Inserting significantly more elements than {@code expectedNumItems} saturates the filter and
 * sharply deteriorates its false positive probability.
 *
 * @param expectedNumItems the number of insertions the filter is sized for
 * @return the filter produced by {@code create(expectedNumItems, DEFAULT_FPP)}
 */
public static BloomFilter create(long expectedNumItems) {
  final double defaultFpp = DEFAULT_FPP;
  return create(expectedNumItems, defaultFpp);
}
/**
 * Builds bloom filters over {@code spark.range(1000)} through {@code df.stat().bloomFilter},
 * once per overload (column name or column expression, sized by false positive probability or
 * by bit count), and checks that every inserted value is reported as possibly contained.
 */
@Test
public void testBloomFilter() {
  // Kept as int/double locals so the same bloomFilter/assertEquals overloads are selected
  // as with the inline literals.
  final int numItems = 1000;
  final int numBits = 64 * 5;
  final double fpp = 0.03;
  final double tolerance = 1e-3;

  Dataset<Long> df = spark.range(numItems);

  // Column name, sized by false positive probability.
  BloomFilter filter1 = df.stat().bloomFilter("id", numItems, fpp);
  Assert.assertTrue(filter1.expectedFpp() - fpp < tolerance);
  for (int value = 0; value < numItems; value++) {
    Assert.assertTrue(filter1.mightContain(value));
  }

  // Column expression (id * 3), sized by false positive probability.
  BloomFilter filter2 = df.stat().bloomFilter(col("id").multiply(3), numItems, fpp);
  Assert.assertTrue(filter2.expectedFpp() - fpp < tolerance);
  for (int tripled = 0; tripled < 3 * numItems; tripled += 3) {
    Assert.assertTrue(filter2.mightContain(tripled));
  }

  // Column name, sized by an explicit number of bits.
  BloomFilter filter3 = df.stat().bloomFilter("id", numItems, numBits);
  Assert.assertEquals(numBits, filter3.bitSize());
  for (int value = 0; value < numItems; value++) {
    Assert.assertTrue(filter3.mightContain(value));
  }

  // Column expression (id * 3), sized by an explicit number of bits.
  BloomFilter filter4 = df.stat().bloomFilter(col("id").multiply(3), numItems, numBits);
  Assert.assertEquals(numBits, filter4.bitSize());
  for (int tripled = 0; tripled < 3 * numItems; tripled += 3) {
    Assert.assertTrue(filter4.mightContain(tripled));
  }
}
/**
 * Creates a {@link BloomFilter} with the expected number of insertions and expected false
 * positive probability.
 *
 * Note that overflowing a {@code BloomFilter} with significantly more elements than specified,
 * will result in its saturation, and a sharp deterioration of its false positive probability.
 *
 * @param expectedNumItems the expected number of insertions; forwarded to
 *        {@code optimalNumOfBits} and to {@code create(long, long)}
 * @param fpp the expected false positive probability, strictly between 0.0 and 1.0
 * @return the filter produced by {@code create(long, long)} with the number of bits computed
 *         by {@code optimalNumOfBits(expectedNumItems, fpp)}
 * @throws IllegalArgumentException if {@code fpp} is NaN or lies outside the open range
 *         (0.0, 1.0)
 */
public static BloomFilter create(long expectedNumItems, double fpp) {
  // Expressed as a negated "in range" test: every ordered comparison with NaN is false, so a
  // check of the form (fpp <= 0 || fpp >= 1) would let NaN through to the sizing computation.
  if (!(fpp > 0D && fpp < 1D)) {
    throw new IllegalArgumentException(
      "False positive probability must be within range (0.0, 1.0)"
    );
  }
  return create(expectedNumItems, optimalNumOfBits(expectedNumItems, fpp));
}
/**
 * Builds a {@link BloomFilter} from an explicit {@code expectedNumItems} and {@code numBits}.
 * The number of hash functions is chosen by {@code optimalNumOfHashFunctions}, which picks the
 * value that can minimize {@code fpp} for the bloom filter.
 *
 * @param expectedNumItems the expected number of insertions; must be positive
 * @param numBits the number of bits of the filter; must be positive
 * @return a new {@code BloomFilterImpl} with the chosen number of hash functions and
 *         {@code numBits} bits
 * @throws IllegalArgumentException if {@code expectedNumItems} or {@code numBits} is not
 *         positive
 */
public static BloomFilter create(long expectedNumItems, long numBits) {
  // expectedNumItems is validated first, so it is the one reported when both are invalid.
  if (expectedNumItems < 1) {
    throw new IllegalArgumentException("Expected insertions must be positive");
  } else if (numBits < 1) {
    throw new IllegalArgumentException("Number of bits must be positive");
  }

  return new BloomFilterImpl(optimalNumOfHashFunctions(expectedNumItems, numBits), numBits);
}
}
/**
 * Creates a {@link BloomFilter} with the expected number of insertions and expected false
 * positive probability.
 *
 * Note that overflowing a {@code BloomFilter} with significantly more elements than specified,
 * will result in its saturation, and a sharp deterioration of its false positive probability.
 *
 * @param expectedNumItems the expected number of insertions; forwarded to
 *        {@code optimalNumOfBits} and to {@code create(long, long)}
 * @param fpp the expected false positive probability, strictly between 0.0 and 1.0
 * @return the filter produced by {@code create(long, long)} with the number of bits computed
 *         by {@code optimalNumOfBits(expectedNumItems, fpp)}
 * @throws IllegalArgumentException if {@code fpp} is NaN or lies outside the open range
 *         (0.0, 1.0)
 */
public static BloomFilter create(long expectedNumItems, double fpp) {
  // Expressed as a negated "in range" test: every ordered comparison with NaN is false, so a
  // check of the form (fpp <= 0 || fpp >= 1) would let NaN through to the sizing computation.
  if (!(fpp > 0D && fpp < 1D)) {
    throw new IllegalArgumentException(
      "False positive probability must be within range (0.0, 1.0)"
    );
  }
  return create(expectedNumItems, optimalNumOfBits(expectedNumItems, fpp));
}
/**
 * Builds a {@link BloomFilter} from an explicit {@code expectedNumItems} and {@code numBits}.
 * The number of hash functions is chosen by {@code optimalNumOfHashFunctions}, which picks the
 * value that can minimize {@code fpp} for the bloom filter.
 *
 * @param expectedNumItems the expected number of insertions; must be positive
 * @param numBits the number of bits of the filter; must be positive
 * @return a new {@code BloomFilterImpl} with the chosen number of hash functions and
 *         {@code numBits} bits
 * @throws IllegalArgumentException if {@code expectedNumItems} or {@code numBits} is not
 *         positive
 */
public static BloomFilter create(long expectedNumItems, long numBits) {
  // expectedNumItems is validated first, so it is the one reported when both are invalid.
  if (expectedNumItems < 1) {
    throw new IllegalArgumentException("Expected insertions must be positive");
  } else if (numBits < 1) {
    throw new IllegalArgumentException("Number of bits must be positive");
  }

  return new BloomFilterImpl(optimalNumOfHashFunctions(expectedNumItems, numBits), numBits);
}
}
/**
 * Creates a {@link BloomFilter} with the expected number of insertions and expected false
 * positive probability.
 *
 * Note that overflowing a {@code BloomFilter} with significantly more elements than specified,
 * will result in its saturation, and a sharp deterioration of its false positive probability.
 *
 * @param expectedNumItems the expected number of insertions; forwarded to
 *        {@code optimalNumOfBits} and to {@code create(long, long)}
 * @param fpp the expected false positive probability, strictly between 0.0 and 1.0
 * @return the filter produced by {@code create(long, long)} with the number of bits computed
 *         by {@code optimalNumOfBits(expectedNumItems, fpp)}
 * @throws IllegalArgumentException if {@code fpp} is NaN or lies outside the open range
 *         (0.0, 1.0)
 */
public static BloomFilter create(long expectedNumItems, double fpp) {
  // Expressed as a negated "in range" test: every ordered comparison with NaN is false, so a
  // check of the form (fpp <= 0 || fpp >= 1) would let NaN through to the sizing computation.
  if (!(fpp > 0D && fpp < 1D)) {
    throw new IllegalArgumentException(
      "False positive probability must be within range (0.0, 1.0)"
    );
  }
  return create(expectedNumItems, optimalNumOfBits(expectedNumItems, fpp));
}
/**
 * Builds bloom filters over {@code spark.range(1000)} through {@code df.stat().bloomFilter},
 * once per overload (column name or column expression, sized by false positive probability or
 * by bit count), and checks that every inserted value is reported as possibly contained.
 */
@Test
public void testBloomFilter() {
  // Kept as int/double locals so the same bloomFilter/assertEquals overloads are selected
  // as with the inline literals.
  final int numItems = 1000;
  final int numBits = 64 * 5;
  final double fpp = 0.03;
  final double tolerance = 1e-3;

  Dataset<Long> df = spark.range(numItems);

  // Column name, sized by false positive probability.
  BloomFilter filter1 = df.stat().bloomFilter("id", numItems, fpp);
  Assert.assertTrue(filter1.expectedFpp() - fpp < tolerance);
  for (int value = 0; value < numItems; value++) {
    Assert.assertTrue(filter1.mightContain(value));
  }

  // Column expression (id * 3), sized by false positive probability.
  BloomFilter filter2 = df.stat().bloomFilter(col("id").multiply(3), numItems, fpp);
  Assert.assertTrue(filter2.expectedFpp() - fpp < tolerance);
  for (int tripled = 0; tripled < 3 * numItems; tripled += 3) {
    Assert.assertTrue(filter2.mightContain(tripled));
  }

  // Column name, sized by an explicit number of bits.
  BloomFilter filter3 = df.stat().bloomFilter("id", numItems, numBits);
  Assert.assertEquals(numBits, filter3.bitSize());
  for (int value = 0; value < numItems; value++) {
    Assert.assertTrue(filter3.mightContain(value));
  }

  // Column expression (id * 3), sized by an explicit number of bits.
  BloomFilter filter4 = df.stat().bloomFilter(col("id").multiply(3), numItems, numBits);
  Assert.assertEquals(numBits, filter4.bitSize());
  for (int tripled = 0; tripled < 3 * numItems; tripled += 3) {
    Assert.assertTrue(filter4.mightContain(tripled));
  }
}
/**
 * Builds a {@link BloomFilter} sized for the given number of expected insertions, using the
 * default expected false positive probability of 3% ({@code DEFAULT_FPP}).
 *
 * Inserting significantly more elements than {@code expectedNumItems} saturates the filter and
 * sharply deteriorates its false positive probability.
 *
 * @param expectedNumItems the number of insertions the filter is sized for
 * @return the filter produced by {@code create(expectedNumItems, DEFAULT_FPP)}
 */
public static BloomFilter create(long expectedNumItems) {
  final double defaultFpp = DEFAULT_FPP;
  return create(expectedNumItems, defaultFpp);
}
/**
 * Builds a {@link BloomFilter} from an explicit {@code expectedNumItems} and {@code numBits}.
 * The number of hash functions is chosen by {@code optimalNumOfHashFunctions}, which picks the
 * value that can minimize {@code fpp} for the bloom filter.
 *
 * @param expectedNumItems the expected number of insertions; must be positive
 * @param numBits the number of bits of the filter; must be positive
 * @return a new {@code BloomFilterImpl} with the chosen number of hash functions and
 *         {@code numBits} bits
 * @throws IllegalArgumentException if {@code expectedNumItems} or {@code numBits} is not
 *         positive
 */
public static BloomFilter create(long expectedNumItems, long numBits) {
  // expectedNumItems is validated first, so it is the one reported when both are invalid.
  if (expectedNumItems < 1) {
    throw new IllegalArgumentException("Expected insertions must be positive");
  } else if (numBits < 1) {
    throw new IllegalArgumentException("Number of bits must be positive");
  }

  return new BloomFilterImpl(optimalNumOfHashFunctions(expectedNumItems, numBits), numBits);
}
}
/**
 * Creates a {@link BloomFilter} with the expected number of insertions and expected false
 * positive probability.
 *
 * Note that overflowing a {@code BloomFilter} with significantly more elements than specified,
 * will result in its saturation, and a sharp deterioration of its false positive probability.
 *
 * @param expectedNumItems the expected number of insertions; forwarded to
 *        {@code optimalNumOfBits} and to {@code create(long, long)}
 * @param fpp the expected false positive probability, strictly between 0.0 and 1.0
 * @return the filter produced by {@code create(long, long)} with the number of bits computed
 *         by {@code optimalNumOfBits(expectedNumItems, fpp)}
 * @throws IllegalArgumentException if {@code fpp} is NaN or lies outside the open range
 *         (0.0, 1.0)
 */
public static BloomFilter create(long expectedNumItems, double fpp) {
  // Expressed as a negated "in range" test: every ordered comparison with NaN is false, so a
  // check of the form (fpp <= 0 || fpp >= 1) would let NaN through to the sizing computation.
  if (!(fpp > 0D && fpp < 1D)) {
    throw new IllegalArgumentException(
      "False positive probability must be within range (0.0, 1.0)"
    );
  }
  return create(expectedNumItems, optimalNumOfBits(expectedNumItems, fpp));
}
/**
 * Builds bloom filters over {@code spark.range(1000)} through {@code df.stat().bloomFilter},
 * once per overload (column name or column expression, sized by false positive probability or
 * by bit count), and checks that every inserted value is reported as possibly contained.
 */
@Test
public void testBloomFilter() {
  // Kept as int/double locals so the same bloomFilter/assertEquals overloads are selected
  // as with the inline literals.
  final int numItems = 1000;
  final int numBits = 64 * 5;
  final double fpp = 0.03;
  final double tolerance = 1e-3;

  Dataset<Long> df = spark.range(numItems);

  // Column name, sized by false positive probability.
  BloomFilter filter1 = df.stat().bloomFilter("id", numItems, fpp);
  Assert.assertTrue(filter1.expectedFpp() - fpp < tolerance);
  for (int value = 0; value < numItems; value++) {
    Assert.assertTrue(filter1.mightContain(value));
  }

  // Column expression (id * 3), sized by false positive probability.
  BloomFilter filter2 = df.stat().bloomFilter(col("id").multiply(3), numItems, fpp);
  Assert.assertTrue(filter2.expectedFpp() - fpp < tolerance);
  for (int tripled = 0; tripled < 3 * numItems; tripled += 3) {
    Assert.assertTrue(filter2.mightContain(tripled));
  }

  // Column name, sized by an explicit number of bits.
  BloomFilter filter3 = df.stat().bloomFilter("id", numItems, numBits);
  Assert.assertEquals(numBits, filter3.bitSize());
  for (int value = 0; value < numItems; value++) {
    Assert.assertTrue(filter3.mightContain(value));
  }

  // Column expression (id * 3), sized by an explicit number of bits.
  BloomFilter filter4 = df.stat().bloomFilter(col("id").multiply(3), numItems, numBits);
  Assert.assertEquals(numBits, filter4.bitSize());
  for (int tripled = 0; tripled < 3 * numItems; tripled += 3) {
    Assert.assertTrue(filter4.mightContain(tripled));
  }
}
/**
 * Builds a {@link BloomFilter} sized for the given number of expected insertions, using the
 * default expected false positive probability of 3% ({@code DEFAULT_FPP}).
 *
 * Inserting significantly more elements than {@code expectedNumItems} saturates the filter and
 * sharply deteriorates its false positive probability.
 *
 * @param expectedNumItems the number of insertions the filter is sized for
 * @return the filter produced by {@code create(expectedNumItems, DEFAULT_FPP)}
 */
public static BloomFilter create(long expectedNumItems) {
  final double defaultFpp = DEFAULT_FPP;
  return create(expectedNumItems, defaultFpp);
}
/**
 * Builds a {@link BloomFilter} from an explicit {@code expectedNumItems} and {@code numBits}.
 * The number of hash functions is chosen by {@code optimalNumOfHashFunctions}, which picks the
 * value that can minimize {@code fpp} for the bloom filter.
 *
 * @param expectedNumItems the expected number of insertions; must be positive
 * @param numBits the number of bits of the filter; must be positive
 * @return a new {@code BloomFilterImpl} with the chosen number of hash functions and
 *         {@code numBits} bits
 * @throws IllegalArgumentException if {@code expectedNumItems} or {@code numBits} is not
 *         positive
 */
public static BloomFilter create(long expectedNumItems, long numBits) {
  // expectedNumItems is validated first, so it is the one reported when both are invalid.
  if (expectedNumItems < 1) {
    throw new IllegalArgumentException("Expected insertions must be positive");
  } else if (numBits < 1) {
    throw new IllegalArgumentException("Number of bits must be positive");
  }

  return new BloomFilterImpl(optimalNumOfHashFunctions(expectedNumItems, numBits), numBits);
}
}
/**
 * Builds a {@link BloomFilter} sized for the given number of expected insertions, using the
 * default expected false positive probability of 3% ({@code DEFAULT_FPP}).
 *
 * Inserting significantly more elements than {@code expectedNumItems} saturates the filter and
 * sharply deteriorates its false positive probability.
 *
 * @param expectedNumItems the number of insertions the filter is sized for
 * @return the filter produced by {@code create(expectedNumItems, DEFAULT_FPP)}
 */
public static BloomFilter create(long expectedNumItems) {
  final double defaultFpp = DEFAULT_FPP;
  return create(expectedNumItems, defaultFpp);
}