/** * Determines & initializes bloom filter meta data from user config. Call * {@link #allocBloom()} to allocate bloom filter data. * * @param maxKeys Maximum expected number of keys that will be stored in this * bloom * @param errorRate Desired false positive error rate. Lower rate = more * storage required * @param hashType Type of hash function to use * @param foldFactor When finished adding entries, you may be able to 'fold' * this bloom to save space. Tradeoff potentially excess bytes in * bloom for ability to fold if keyCount is exponentially greater * than maxKeys. * @throws IllegalArgumentException */ // Used only in testcases public BloomFilterChunk(int maxKeys, double errorRate, int hashType, int foldFactor) throws IllegalArgumentException { this(hashType, BloomType.ROW); long bitSize = BloomFilterUtil.computeBitSize(maxKeys, errorRate); hashCount = BloomFilterUtil.optimalFunctionCount(maxKeys, bitSize); this.maxKeys = maxKeys; // increase byteSize so folding is possible byteSize = BloomFilterUtil.computeFoldableByteSize(bitSize, foldFactor); sanityCheck(); }
/**
 * Tests whether the given byte range may be present in the Bloom filter stored in
 * {@code bloomBuf}. The range is wrapped in a {@link ByteArrayHashKey} and the check is
 * delegated to the hash-key based {@code contains} overload.
 *
 * @param buf         array holding the key
 * @param offset      offset of the key inside {@code buf}
 * @param length      length of the key
 * @param bloomBuf    buffer holding the Bloom filter bits
 * @param bloomOffset offset of the Bloom filter data inside {@code bloomBuf}
 * @param bloomSize   size of the Bloom filter data
 * @param hash        hash function to apply to the key
 * @param hashCount   number of hash probes to perform
 * @return the result of the delegated {@code contains} check
 */
public static boolean contains(byte[] buf, int offset, int length, ByteBuff bloomBuf,
  int bloomOffset, int bloomSize, Hash hash, int hashCount) {
  return contains(bloomBuf, bloomOffset, bloomSize, hash, hashCount,
    new ByteArrayHashKey(buf, offset, length));
}
/** * Creates a Bloom filter chunk of the given size. * * @param byteSizeHint the desired number of bytes for the Bloom filter bit * array. Will be increased so that folding is possible. * @param errorRate target false positive rate of the Bloom filter * @param hashType Bloom filter hash function type * @param foldFactor * @param bloomType * @return the new Bloom filter of the desired size */ public static BloomFilterChunk createBySize(int byteSizeHint, double errorRate, int hashType, int foldFactor, BloomType bloomType) { BloomFilterChunk bbf = new BloomFilterChunk(hashType, bloomType); bbf.byteSize = computeFoldableByteSize(byteSizeHint * 8L, foldFactor); long bitSize = bbf.byteSize * 8; bbf.maxKeys = (int) idealMaxKeys(bitSize, errorRate); bbf.hashCount = optimalFunctionCount(bbf.maxKeys, bitSize); // Adjust max keys to bring error rate closer to what was requested, // because byteSize was adjusted to allow for folding, and hashCount was // rounded. bbf.maxKeys = (int) computeMaxKeys(bitSize, errorRate, bbf.hashCount); return bbf; }
public void testSizing() { int bitSize = 8 * 128 * 1024; // 128 KB double errorRate = 0.025; // target false positive rate // How many keys can we store in a Bloom filter of this size maintaining // the given false positive rate, not taking into account that the n long maxKeys = BloomFilterUtil.idealMaxKeys(bitSize, errorRate); assertEquals(136570, maxKeys); // A reverse operation: how many bits would we need to store this many keys // and keep the same low false positive rate? long bitSize2 = BloomFilterUtil.computeBitSize(maxKeys, errorRate); // The bit size comes out a little different due to rounding. assertTrue(Math.abs(bitSize2 - bitSize) * 1.0 / bitSize < 1e-5); }
/**
 * Stores the writer settings and derives the per-chunk byte size from the supplied hint.
 *
 * @param chunkByteSizeHint each chunk's size in bytes; the real chunk size might be different
 *                          as required by the fold factor
 * @param errorRate         target false positive rate
 * @param hashType          hash function type to use
 * @param maxFold           maximum degree of folding allowed
 * @param cacheOnWrite      stored as-is on the writer; presumably controls whether written
 *                          blocks are cached (TODO confirm against the usages of the field)
 * @param comparator        cell comparator stored as-is on the writer
 * @param bloomType         the bloom type
 */
public CompoundBloomFilterWriter(int chunkByteSizeHint, float errorRate, int hashType,
  int maxFold, boolean cacheOnWrite, CellComparator comparator, BloomType bloomType) {
  this.errorRate = errorRate;
  this.hashType = hashType;
  this.maxFold = maxFold;
  this.cacheOnWrite = cacheOnWrite;
  this.comparator = comparator;
  this.bloomType = bloomType;
  // The hint is given in bytes, while computeFoldableByteSize expects a size in bits.
  this.chunkByteSize = BloomFilterUtil.computeFoldableByteSize(chunkByteSizeHint * 8L, maxFold);
}
/**
 * Validates the Bloom filter settings of a column family by asking
 * {@code BloomFilterUtil.getBloomFilterParam} to resolve them; the resolved value itself is
 * discarded.
 *
 * @param cfd descriptor of the column family to check
 * @throws IOException a {@link DoNotRetryIOException} wrapping the
 *                     {@link IllegalArgumentException} raised while resolving the parameter
 */
private static void checkBloomFilterType(ColumnFamilyDescriptor cfd) throws IOException {
  // Built outside the try block so that only the parameter lookup is translated.
  final Configuration familyConf =
    new CompoundConfiguration().addStringMap(cfd.getConfiguration());
  try {
    BloomFilterUtil.getBloomFilterParam(cfd.getBloomFilterType(), familyConf);
  } catch (IllegalArgumentException invalidParam) {
    throw new DoNotRetryIOException("Failed to get bloom filter param", invalidParam);
  }
}
/**
 * Renders the general Bloom filter statistics followed by the number of chunks and the simple
 * class name of the comparator in use. When no comparator is set, the class name of
 * {@code Bytes.BYTES_RAWCOMPARATOR} is reported instead.
 *
 * @return the statistics records joined by {@code BloomFilterUtil.STATS_RECORD_SEP}
 */
@Override
public String toString() {
  return BloomFilterUtil.formatStats(this)
    + BloomFilterUtil.STATS_RECORD_SEP + "Number of chunks: " + numChunks
    + BloomFilterUtil.STATS_RECORD_SEP + "Comparator: "
    + ((comparator != null)
      ? comparator.getClass().getSimpleName()
      : Bytes.BYTES_RAWCOMPARATOR.getClass().getSimpleName());
}
/**
 * Reports the error rate of this Bloom filter based on the keys inserted so far, the size of
 * the bit array and the number of hash functions actually in use. Because the key count is an
 * input, the value changes while the filter is being populated. Used for reporting the actual
 * error rate of compound Bloom filters when writing them out.
 *
 * @return error rate for this particular Bloom filter
 */
public double actualErrorRate() {
  final long bitCount = byteSize * 8;
  return BloomFilterUtil.actualErrorRate(keyCount, bitCount, hashCount);
}
private static <T> boolean contains(ByteBuff bloomBuf, int bloomOffset, int bloomSize, Hash hash, int hashCount, HashKey<T> hashKey) { int hash1 = hash.hash(hashKey, 0); int bloomBitSize = bloomSize << 3; int hash2 = 0; int compositeHash = 0; if (randomGeneratorForTest == null) { // Production mode compositeHash = hash1; hash2 = hash.hash(hashKey, hash1); } for (int i = 0; i < hashCount; i++) { int hashLoc = (randomGeneratorForTest == null // Production mode ? Math.abs(compositeHash % bloomBitSize) // Test mode with "fake look-ups" to estimate "ideal false positive rate" : randomGeneratorForTest.nextInt(bloomBitSize)); compositeHash += hash2; if (!checkBit(hashLoc, bloomBuf, bloomOffset)) { return false; } } return true; }
/**
 * Replaces {@code chunk} with a freshly allocated Bloom filter chunk and increments the chunk
 * counter. The first chunk is sized from the writer settings; every later chunk is created
 * from the previous chunk via {@code createAnother()}.
 *
 * @throws IllegalStateException if the new chunk reports a non-zero key count
 */
private void allocateNewChunk() {
  // First chunk: build from the configured parameters. Otherwise use the same parameters as
  // the last chunk, but a new array and a zero key count.
  chunk = (prevChunk == null)
    ? BloomFilterUtil.createBySize(chunkByteSize, errorRate, hashType, maxFold, bloomType)
    : prevChunk.createAnother();

  if (chunk.getKeyCount() != 0) {
    throw new IllegalStateException("keyCount=" + chunk.getKeyCount() + " > 0");
  }

  chunk.allocBloom();
  ++numChunks;
}

@Override
/**
 * Checks that converting a 4096-byte Bloom block to an ideal key count and back to a bit size
 * yields a size whose ratio to the original is within 0.0001 of 0.9999.
 */
@Test
public void testCompoundBloomSizing() {
  final int blockBytes = 4096;
  final int blockBits = blockBytes * 8;
  final double errorRate = 0.01;

  long keysPerChunk = BloomFilterUtil.idealMaxKeys(blockBits, errorRate);

  long nominalBits = blockBytes * 8;
  long recomputedBits = BloomFilterUtil.computeBitSize(keysPerChunk, errorRate);

  double sizeRatio = recomputedBits * 1.0 / nominalBits;
  assertTrue(Math.abs(sizeRatio - 0.9999) < 0.0001);
}
/**
 * Pins two known results of {@code BloomFilterUtil.computeFoldableByteSize}: a bit size of
 * 1000 with fold factor 5 yields 128 bytes, and a bit size of 5001 with fold factor 4 yields
 * 640 bytes.
 */
public void testFoldableByteSize() {
  final int expectedForFold5 = 128;
  final int expectedForFold4 = 640;

  assertEquals(expectedForFold5, BloomFilterUtil.computeFoldableByteSize(1000, 5));
  assertEquals(expectedForFold4, BloomFilterUtil.computeFoldableByteSize(5001, 4));
}
this.bloomParam = BloomFilterUtil.getBloomFilterParam(bloomType, conf); if (LOG.isTraceEnabled()) { LOG.trace("Bloom filter type for " + path + ": " + this.bloomType + ", param: "
/**
 * Produces the statistics of a Bloom filter chunk followed by its actual error rate, printed
 * with eight decimal places.
 *
 * @param bloomFilter the chunk to describe
 * @return the formatted statistics, a record separator and the actual error rate
 */
public static String toString(BloomFilterChunk bloomFilter) {
  final StringBuilder report = new StringBuilder();
  report.append(formatStats(bloomFilter));
  report.append(STATS_RECORD_SEP);
  report.append("Actual error rate: ");
  report.append(String.format("%.8f", bloomFilter.actualErrorRate()));
  return report.toString();
}
public void testSizing() { int bitSize = 8 * 128 * 1024; // 128 KB double errorRate = 0.025; // target false positive rate // How many keys can we store in a Bloom filter of this size maintaining // the given false positive rate, not taking into account that the n long maxKeys = BloomFilterUtil.idealMaxKeys(bitSize, errorRate); assertEquals(136570, maxKeys); // A reverse operation: how many bits would we need to store this many keys // and keep the same low false positive rate? long bitSize2 = BloomFilterUtil.computeBitSize(maxKeys, errorRate); // The bit size comes out a little different due to rounding. assertTrue(Math.abs(bitSize2 - bitSize) * 1.0 / bitSize < 1e-5); }
/**
 * Tests whether the given cell may be present in the Bloom filter stored in {@code bloomBuf}.
 * The cell is wrapped in a {@link RowColBloomHashKey} when the Bloom type is
 * {@link BloomType#ROWCOL} and in a {@link RowBloomHashKey} for every other type; the check is
 * then delegated to the hash-key based {@code contains} overload.
 *
 * @param cell        cell to look up
 * @param bloomBuf    buffer holding the Bloom filter bits
 * @param bloomOffset offset of the Bloom filter data inside {@code bloomBuf}
 * @param bloomSize   size of the Bloom filter data
 * @param hash        hash function to apply to the key
 * @param hashCount   number of hash probes to perform
 * @param type        Bloom type that selects the kind of hash key
 * @return the result of the delegated {@code contains} check
 */
public static boolean contains(Cell cell, ByteBuff bloomBuf, int bloomOffset, int bloomSize,
  Hash hash, int hashCount, BloomType type) {
  final HashKey<Cell> cellKey;
  if (type == BloomType.ROWCOL) {
    cellKey = new RowColBloomHashKey(cell);
  } else {
    cellKey = new RowBloomHashKey(cell);
  }
  return contains(bloomBuf, bloomOffset, bloomSize, hash, hashCount, cellKey);
}
/**
 * Pins two known results of {@code BloomFilterUtil.computeFoldableByteSize}: a bit size of
 * 1000 with fold factor 5 yields 128 bytes, and a bit size of 5001 with fold factor 4 yields
 * 640 bytes.
 */
public void testFoldableByteSize() {
  final int expectedForFold5 = 128;
  final int expectedForFold4 = 640;

  assertEquals(expectedForFold5, BloomFilterUtil.computeFoldableByteSize(1000, 5));
  assertEquals(expectedForFold4, BloomFilterUtil.computeFoldableByteSize(5001, 4));
}
/**
 * Checks that converting a 4096-byte Bloom block to an ideal key count and back to a bit size
 * yields a size whose ratio to the original is within 0.0001 of 0.9999.
 */
@Test
public void testCompoundBloomSizing() {
  final int blockBytes = 4096;
  final int blockBits = blockBytes * 8;
  final double errorRate = 0.01;

  long keysPerChunk = BloomFilterUtil.idealMaxKeys(blockBits, errorRate);

  long nominalBits = blockBytes * 8;
  long recomputedBits = BloomFilterUtil.computeBitSize(keysPerChunk, errorRate);

  double sizeRatio = recomputedBits * 1.0 / nominalBits;
  assertTrue(Math.abs(sizeRatio - 0.9999) < 0.0001);
}
@Override public boolean contains(byte[] key, int keyOffset, int keyLength, ByteBuff bloom) { int block = index.rootBlockContainingKey(key, keyOffset, keyLength); if (block < 0) { return false; // This key is not in the file. } boolean result; HFileBlock bloomBlock = getBloomBlock(block); try { ByteBuff bloomBuf = bloomBlock.getBufferReadOnly(); result = BloomFilterUtil.contains(key, keyOffset, keyLength, bloomBuf, bloomBlock.headerSize(), bloomBlock.getUncompressedSizeWithoutHeader(), hash, hashCount); } finally { // After the use return back the block if it was served from a cache. reader.returnBlock(bloomBlock); } if (numPositivesPerChunk != null && result) { // Update statistics. Only used in unit tests. ++numPositivesPerChunk[block]; } return result; }
@Override public boolean contains(Cell keyCell, ByteBuff bloom, BloomType type) { int block = index.rootBlockContainingKey(keyCell); if (block < 0) { return false; // This key is not in the file. } boolean result; HFileBlock bloomBlock = getBloomBlock(block); try { ByteBuff bloomBuf = bloomBlock.getBufferReadOnly(); result = BloomFilterUtil.contains(keyCell, bloomBuf, bloomBlock.headerSize(), bloomBlock.getUncompressedSizeWithoutHeader(), hash, hashCount, type); } finally { // After the use return back the block if it was served from a cache. reader.returnBlock(bloomBlock); } if (numPositivesPerChunk != null && result) { // Update statistics. Only used in unit tests. ++numPositivesPerChunk[block]; } return result; }