org.apache.spark.util.sketch.BloomFilter Java code examples

/**
 * Builds a {@link BloomFilter} sized for {@code expectedNumItems} insertions, using the default
 * false positive probability {@code DEFAULT_FPP} (documented as 3%).
 *
 * Inserting significantly more elements than {@code expectedNumItems} saturates the filter and
 * sharply degrades its false positive probability.
 *
 * @param expectedNumItems number of items the filter is expected to hold
 * @return the filter produced by the {@code create} overload that takes an explicit probability
 */
public static BloomFilter create(long expectedNumItems) {
 final double defaultFpp = DEFAULT_FPP;
 return create(expectedNumItems, defaultFpp);
}

/**
 * Builds Bloom filters through {@code df.stat().bloomFilter(...)} in four ways (column name or
 * column expression, sized by false positive probability or by bit count) and checks that each
 * one reports every expected value as possibly present.
 */
@Test
public void testBloomFilter() {
 final int numItems = 1000;
 final double targetFpp = 0.03;
 final int numBits = 64 * 5;
 Dataset<Long> ids = spark.range(numItems);

 // Column name, sized from a target false positive probability.
 BloomFilter byNameFpp = ids.stat().bloomFilter("id", numItems, targetFpp);
 double byNameExcess = byNameFpp.expectedFpp() - targetFpp;
 Assert.assertTrue(byNameExcess < 1e-3);
 for (int id = 0; id < numItems; id++) {
  Assert.assertTrue(byNameFpp.mightContain(id));
 }

 // Column expression (id * 3), sized from a target false positive probability.
 BloomFilter byExprFpp = ids.stat().bloomFilter(col("id").multiply(3), numItems, targetFpp);
 double byExprExcess = byExprFpp.expectedFpp() - targetFpp;
 Assert.assertTrue(byExprExcess < 1e-3);
 for (int id = 0; id < numItems; id++) {
  int tripled = id * 3;
  Assert.assertTrue(byExprFpp.mightContain(tripled));
 }

 // Column name, with an explicit bit count.
 BloomFilter byNameBits = ids.stat().bloomFilter("id", numItems, numBits);
 Assert.assertEquals(numBits, byNameBits.bitSize());
 for (int id = 0; id < numItems; id++) {
  Assert.assertTrue(byNameBits.mightContain(id));
 }

 // Column expression (id * 3), with an explicit bit count.
 BloomFilter byExprBits = ids.stat().bloomFilter(col("id").multiply(3), numItems, numBits);
 Assert.assertEquals(numBits, byExprBits.bitSize());
 for (int id = 0; id < numItems; id++) {
  int tripled = id * 3;
  Assert.assertTrue(byExprBits.mightContain(tripled));
 }
}

/**
 * Creates a {@link BloomFilter} with the expected number of insertions and expected false
 * positive probability.
 *
 * The bit count is obtained from {@code optimalNumOfBits(expectedNumItems, fpp)} and the
 * construction is delegated to the {@code create} overload that accepts it.
 *
 * Note that overflowing a {@code BloomFilter} with significantly more elements than specified,
 * will result in its saturation, and a sharp deterioration of its false positive probability.
 *
 * @param expectedNumItems number of items the filter is expected to hold
 * @param fpp desired false positive probability, strictly between 0.0 and 1.0
 * @return the filter built by the delegated {@code create} overload
 * @throws IllegalArgumentException if {@code fpp} is NaN or lies outside the open range
 *         (0.0, 1.0)
 */
public static BloomFilter create(long expectedNumItems, double fpp) {
 // Expressed as a negated in-range test: every comparison with NaN is false, so the previous
 // "fpp <= 0 || fpp >= 1" form let NaN through to the bit-count computation.
 if (!(fpp > 0D && fpp < 1D)) {
  throw new IllegalArgumentException(
   "False positive probability must be within range (0.0, 1.0)"
  );
 }
 return create(expectedNumItems, optimalNumOfBits(expectedNumItems, fpp));
}

 /**
  * Builds a {@link BloomFilter} from an expected item count and an explicit bit count. The
  * number of hash functions is taken from
  * {@code optimalNumOfHashFunctions(expectedNumItems, numBits)}.
  *
  * @param expectedNumItems number of items the filter is expected to hold; must be positive
  * @param numBits number of bits for the filter; must be positive
  * @return a new {@code BloomFilterImpl}
  * @throws IllegalArgumentException if either argument is zero or negative
  */
 public static BloomFilter create(long expectedNumItems, long numBits) {
  // The item count is validated first, so it decides the message when both are invalid.
  final boolean itemsPositive = expectedNumItems > 0;
  if (!itemsPositive) {
   throw new IllegalArgumentException("Expected insertions must be positive");
  }

  final boolean bitsPositive = numBits > 0;
  if (!bitsPositive) {
   throw new IllegalArgumentException("Number of bits must be positive");
  }

  return new BloomFilterImpl(
   optimalNumOfHashFunctions(expectedNumItems, numBits),
   numBits
  );
 }
}

/**
 * Creates a {@link BloomFilter} with the expected number of insertions and expected false
 * positive probability.
 *
 * The bit count is obtained from {@code optimalNumOfBits(expectedNumItems, fpp)} and the
 * construction is delegated to the {@code create} overload that accepts it.
 *
 * Note that overflowing a {@code BloomFilter} with significantly more elements than specified,
 * will result in its saturation, and a sharp deterioration of its false positive probability.
 *
 * @param expectedNumItems number of items the filter is expected to hold
 * @param fpp desired false positive probability, strictly between 0.0 and 1.0
 * @return the filter built by the delegated {@code create} overload
 * @throws IllegalArgumentException if {@code fpp} is NaN or lies outside the open range
 *         (0.0, 1.0)
 */
public static BloomFilter create(long expectedNumItems, double fpp) {
 // NaN compares false against everything, so test for "inside the range" and negate it;
 // the earlier "outside the range" test silently accepted NaN.
 final boolean fppInRange = fpp > 0D && fpp < 1D;
 if (!fppInRange) {
  throw new IllegalArgumentException(
   "False positive probability must be within range (0.0, 1.0)"
  );
 }
 return create(expectedNumItems, optimalNumOfBits(expectedNumItems, fpp));
}

 /**
  * Creates a {@link BloomFilter} for the given {@code expectedNumItems} and {@code numBits}.
  * The hash-function count passed to the implementation comes from
  * {@code optimalNumOfHashFunctions(expectedNumItems, numBits)}.
  *
  * @param expectedNumItems expected number of insertions; must be greater than zero
  * @param numBits size of the filter in bits; must be greater than zero
  * @return a new {@code BloomFilterImpl}
  * @throws IllegalArgumentException if {@code expectedNumItems} or {@code numBits} is not
  *         positive
  */
 public static BloomFilter create(long expectedNumItems, long numBits) {
  // Guard clauses; the order fixes which message is reported when both values are invalid.
  if (!(expectedNumItems > 0)) {
   throw new IllegalArgumentException("Expected insertions must be positive");
  }
  if (!(numBits > 0)) {
   throw new IllegalArgumentException("Number of bits must be positive");
  }

  return new BloomFilterImpl(optimalNumOfHashFunctions(expectedNumItems, numBits), numBits);
 }
}

/**
 * Creates a {@link BloomFilter} with the expected number of insertions and expected false
 * positive probability.
 *
 * The bit count is obtained from {@code optimalNumOfBits(expectedNumItems, fpp)} and the
 * construction is delegated to the {@code create} overload that accepts it.
 *
 * Note that overflowing a {@code BloomFilter} with significantly more elements than specified,
 * will result in its saturation, and a sharp deterioration of its false positive probability.
 *
 * @param expectedNumItems number of items the filter is expected to hold
 * @param fpp desired false positive probability, strictly between 0.0 and 1.0
 * @return the filter built by the delegated {@code create} overload
 * @throws IllegalArgumentException if {@code fpp} is NaN or lies outside the open range
 *         (0.0, 1.0)
 */
public static BloomFilter create(long expectedNumItems, double fpp) {
 // Expressed as a negated in-range test: every comparison with NaN is false, so the previous
 // "fpp <= 0 || fpp >= 1" form let NaN through to the bit-count computation.
 if (!(fpp > 0D && fpp < 1D)) {
  throw new IllegalArgumentException(
   "False positive probability must be within range (0.0, 1.0)"
  );
 }
 return create(expectedNumItems, optimalNumOfBits(expectedNumItems, fpp));
}

/**
 * Exercises {@code df.stat().bloomFilter(...)} with a column name and with a column expression,
 * sizing the filter once by false positive probability and once by bit count, and asserts that
 * every expected value is reported as possibly contained.
 */
@Test
public void testBloomFilter() {
 final int rowCount = 1000;
 final double requestedFpp = 0.03;
 final int requestedBits = 64 * 5;
 Dataset<Long> rows = spark.range(rowCount);

 // Sized by probability, keyed on the "id" column.
 BloomFilter nameFppFilter = rows.stat().bloomFilter("id", rowCount, requestedFpp);
 Assert.assertTrue(nameFppFilter.expectedFpp() - requestedFpp < 1e-3);
 for (int value = 0; value < rowCount; value++) {
  Assert.assertTrue(nameFppFilter.mightContain(value));
 }

 // Sized by probability, keyed on the expression id * 3.
 BloomFilter exprFppFilter =
  rows.stat().bloomFilter(col("id").multiply(3), rowCount, requestedFpp);
 Assert.assertTrue(exprFppFilter.expectedFpp() - requestedFpp < 1e-3);
 for (int value = 0; value < rowCount; value++) {
  Assert.assertTrue(exprFppFilter.mightContain(value * 3));
 }

 // Sized by bit count, keyed on the "id" column.
 BloomFilter nameBitsFilter = rows.stat().bloomFilter("id", rowCount, requestedBits);
 Assert.assertEquals(requestedBits, nameBitsFilter.bitSize());
 for (int value = 0; value < rowCount; value++) {
  Assert.assertTrue(nameBitsFilter.mightContain(value));
 }

 // Sized by bit count, keyed on the expression id * 3.
 BloomFilter exprBitsFilter =
  rows.stat().bloomFilter(col("id").multiply(3), rowCount, requestedBits);
 Assert.assertEquals(requestedBits, exprBitsFilter.bitSize());
 for (int value = 0; value < rowCount; value++) {
  Assert.assertTrue(exprBitsFilter.mightContain(value * 3));
 }
}

/**
 * Creates a {@link BloomFilter} for {@code expectedNumItems} insertions with the default false
 * positive probability, {@code DEFAULT_FPP} (documented as 3%).
 *
 * Adding significantly more elements than {@code expectedNumItems} saturates the filter and
 * sharply worsens its false positive probability.
 *
 * @param expectedNumItems number of items the filter is expected to hold
 * @return the filter produced by the {@code create} overload that takes an explicit probability
 */
public static BloomFilter create(long expectedNumItems) {
 final double fpp = DEFAULT_FPP;
 return create(expectedNumItems, fpp);
}

 /**
  * Builds a {@link BloomFilter} from an expected item count and an explicit bit count. The
  * number of hash functions is taken from
  * {@code optimalNumOfHashFunctions(expectedNumItems, numBits)}.
  *
  * @param expectedNumItems number of items the filter is expected to hold; must be positive
  * @param numBits number of bits for the filter; must be positive
  * @return a new {@code BloomFilterImpl}
  * @throws IllegalArgumentException if either argument is zero or negative
  */
 public static BloomFilter create(long expectedNumItems, long numBits) {
  // The item count is validated first, so it decides the message when both are invalid.
  final boolean itemsPositive = expectedNumItems > 0;
  if (!itemsPositive) {
   throw new IllegalArgumentException("Expected insertions must be positive");
  }

  final boolean bitsPositive = numBits > 0;
  if (!bitsPositive) {
   throw new IllegalArgumentException("Number of bits must be positive");
  }

  return new BloomFilterImpl(
   optimalNumOfHashFunctions(expectedNumItems, numBits),
   numBits
  );
 }
}

/**
 * Creates a {@link BloomFilter} with the expected number of insertions and expected false
 * positive probability.
 *
 * The bit count is obtained from {@code optimalNumOfBits(expectedNumItems, fpp)} and the
 * construction is delegated to the {@code create} overload that accepts it.
 *
 * Note that overflowing a {@code BloomFilter} with significantly more elements than specified,
 * will result in its saturation, and a sharp deterioration of its false positive probability.
 *
 * @param expectedNumItems number of items the filter is expected to hold
 * @param fpp desired false positive probability, strictly between 0.0 and 1.0
 * @return the filter built by the delegated {@code create} overload
 * @throws IllegalArgumentException if {@code fpp} is NaN or lies outside the open range
 *         (0.0, 1.0)
 */
public static BloomFilter create(long expectedNumItems, double fpp) {
 // NaN compares false against everything, so test for "inside the range" and negate it;
 // the earlier "outside the range" test silently accepted NaN.
 final boolean fppInRange = fpp > 0D && fpp < 1D;
 if (!fppInRange) {
  throw new IllegalArgumentException(
   "False positive probability must be within range (0.0, 1.0)"
  );
 }
 return create(expectedNumItems, optimalNumOfBits(expectedNumItems, fpp));
}

/**
 * Builds Bloom filters through {@code df.stat().bloomFilter(...)} in four ways (column name or
 * column expression, sized by false positive probability or by bit count) and checks that each
 * one reports every expected value as possibly present.
 */
@Test
public void testBloomFilter() {
 final int numItems = 1000;
 final double targetFpp = 0.03;
 final int numBits = 64 * 5;
 Dataset<Long> ids = spark.range(numItems);

 // Column name, sized from a target false positive probability.
 BloomFilter byNameFpp = ids.stat().bloomFilter("id", numItems, targetFpp);
 double byNameExcess = byNameFpp.expectedFpp() - targetFpp;
 Assert.assertTrue(byNameExcess < 1e-3);
 for (int id = 0; id < numItems; id++) {
  Assert.assertTrue(byNameFpp.mightContain(id));
 }

 // Column expression (id * 3), sized from a target false positive probability.
 BloomFilter byExprFpp = ids.stat().bloomFilter(col("id").multiply(3), numItems, targetFpp);
 double byExprExcess = byExprFpp.expectedFpp() - targetFpp;
 Assert.assertTrue(byExprExcess < 1e-3);
 for (int id = 0; id < numItems; id++) {
  int tripled = id * 3;
  Assert.assertTrue(byExprFpp.mightContain(tripled));
 }

 // Column name, with an explicit bit count.
 BloomFilter byNameBits = ids.stat().bloomFilter("id", numItems, numBits);
 Assert.assertEquals(numBits, byNameBits.bitSize());
 for (int id = 0; id < numItems; id++) {
  Assert.assertTrue(byNameBits.mightContain(id));
 }

 // Column expression (id * 3), with an explicit bit count.
 BloomFilter byExprBits = ids.stat().bloomFilter(col("id").multiply(3), numItems, numBits);
 Assert.assertEquals(numBits, byExprBits.bitSize());
 for (int id = 0; id < numItems; id++) {
  int tripled = id * 3;
  Assert.assertTrue(byExprBits.mightContain(tripled));
 }
}

/**
 * Builds a {@link BloomFilter} sized for {@code expectedNumItems} insertions, using the default
 * false positive probability {@code DEFAULT_FPP} (documented as 3%).
 *
 * Inserting significantly more elements than {@code expectedNumItems} saturates the filter and
 * sharply degrades its false positive probability.
 *
 * @param expectedNumItems number of items the filter is expected to hold
 * @return the filter produced by the {@code create} overload that takes an explicit probability
 */
public static BloomFilter create(long expectedNumItems) {
 final double defaultFpp = DEFAULT_FPP;
 return create(expectedNumItems, defaultFpp);
}

 /**
  * Creates a {@link BloomFilter} for the given {@code expectedNumItems} and {@code numBits}.
  * The hash-function count passed to the implementation comes from
  * {@code optimalNumOfHashFunctions(expectedNumItems, numBits)}.
  *
  * @param expectedNumItems expected number of insertions; must be greater than zero
  * @param numBits size of the filter in bits; must be greater than zero
  * @return a new {@code BloomFilterImpl}
  * @throws IllegalArgumentException if {@code expectedNumItems} or {@code numBits} is not
  *         positive
  */
 public static BloomFilter create(long expectedNumItems, long numBits) {
  // Guard clauses; the order fixes which message is reported when both values are invalid.
  if (!(expectedNumItems > 0)) {
   throw new IllegalArgumentException("Expected insertions must be positive");
  }
  if (!(numBits > 0)) {
   throw new IllegalArgumentException("Number of bits must be positive");
  }

  return new BloomFilterImpl(optimalNumOfHashFunctions(expectedNumItems, numBits), numBits);
 }
}

/**
 * Creates a {@link BloomFilter} for {@code expectedNumItems} insertions with the default false
 * positive probability, {@code DEFAULT_FPP} (documented as 3%).
 *
 * Adding significantly more elements than {@code expectedNumItems} saturates the filter and
 * sharply worsens its false positive probability.
 *
 * @param expectedNumItems number of items the filter is expected to hold
 * @return the filter produced by the {@code create} overload that takes an explicit probability
 */
public static BloomFilter create(long expectedNumItems) {
 final double fpp = DEFAULT_FPP;
 return create(expectedNumItems, fpp);
}

Javadoc

A Bloom filter is a space-efficient probabilistic data structure that offers an approximate containment test with one-sided error: if it claims that an item is contained in it, this might be in error, but if it claims that an item is not contained in it, then this is definitely true. Currently supported data types include:

Byte
Short
Integer
Long
String

The false positive probability (FPP) of a Bloom filter is defined as the probability that #mightContain(Object) will erroneously return true for an object that has not actually been put in the BloomFilter. The implementation is largely based on the BloomFilter class from Guava.

Most used methods

create
Creates a BloomFilter with given expectedNumItems and numBits, it will pick an optimal numHashFuncti
optimalNumOfBits
Computes m (total bits of Bloom filter) which is expected to achieve, for the specified expected ins
optimalNumOfHashFunctions
Computes the optimal k (number of hashes per item inserted in Bloom filter), given the expected inse
bitSize
Returns the number of bits in the underlying bit array.
expectedFpp
Returns the probability that #mightContain(Object) erroneously returns true for an object that has not
mightContain
Returns true if the element might have been put in this Bloom filter, false if this is definitely no

Popular in Java

Reactive rest calls using spring rest template
setScale (BigDecimal)
getExternalFilesDir (Context)
getApplicationContext (Context)
OutputStream (java.io)
A writable sink for bytes. Most clients will use output streams that write data to the file system (
UnknownHostException (java.net)
Thrown when a hostname can not be resolved.
SimpleDateFormat (java.text)
Formats and parses dates in a locale-sensitive manner. Formatting turns a Date into a String, and pa
ConcurrentHashMap (java.util.concurrent)
A plug-in replacement for JDK1.5 java.util.concurrent.ConcurrentHashMap. This version is based on or
SAXParseException (org.xml.sax)
Encapsulate an XML parse error or warning. This module, both source code and documentation, is in t
Color (java.awt)
The Color class is used to encapsulate colors in the default sRGB color space or colors in arbitrary
Top plugins for Android Studio

How to use BloomFilter in org.apache.spark.util.sketch

Best Java code snippets using org.apache.spark.util.sketch.BloomFilter (Showing top 15 results out of 315)

How to use
BloomFilter
in
org.apache.spark.util.sketch