@Test
public void testRoundDownToPowerOf2() {
    assertEquals(0, MathUtils.roundDownToPowerOf2(0));
    assertEquals(1, MathUtils.roundDownToPowerOf2(1));
    assertEquals(2, MathUtils.roundDownToPowerOf2(2));
    assertEquals(2, MathUtils.roundDownToPowerOf2(3));
    assertEquals(4, MathUtils.roundDownToPowerOf2(4));
    assertEquals(4, MathUtils.roundDownToPowerOf2(5));
    assertEquals(4, MathUtils.roundDownToPowerOf2(6));
    assertEquals(4, MathUtils.roundDownToPowerOf2(7));
    assertEquals(8, MathUtils.roundDownToPowerOf2(8));
    assertEquals(8, MathUtils.roundDownToPowerOf2(9));
    assertEquals(8, MathUtils.roundDownToPowerOf2(15));
    assertEquals(16, MathUtils.roundDownToPowerOf2(16));
    assertEquals(16, MathUtils.roundDownToPowerOf2(17));
    assertEquals(16, MathUtils.roundDownToPowerOf2(31));
    assertEquals(32, MathUtils.roundDownToPowerOf2(32));
    assertEquals(32, MathUtils.roundDownToPowerOf2(33));
    assertEquals(32, MathUtils.roundDownToPowerOf2(42));
    assertEquals(32, MathUtils.roundDownToPowerOf2(63));
    assertEquals(64, MathUtils.roundDownToPowerOf2(64));
    assertEquals(64, MathUtils.roundDownToPowerOf2(125));
    assertEquals(16384, MathUtils.roundDownToPowerOf2(25654));
    assertEquals(33554432, MathUtils.roundDownToPowerOf2(34366363));
    assertEquals(33554432, MathUtils.roundDownToPowerOf2(63463463));
    assertEquals(1073741824, MathUtils.roundDownToPowerOf2(1852987883));
    assertEquals(1073741824, MathUtils.roundDownToPowerOf2(Integer.MAX_VALUE));
}
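// For reference, a minimal sketch of roundDownToPowerOf2 that satisfies every assertion
// above. This is an assumed implementation for illustration, not necessarily the actual
// MathUtils source: the largest power of two <= a non-negative int is its highest set bit.
public final class MathUtilsSketch {

    /** Returns the largest power of two <= value, or 0 if value is 0. */
    public static int roundDownToPowerOf2(int value) {
        // Integer.highestOneBit keeps only the most significant set bit, which for
        // non-negative inputs is exactly the power of two we want (and 0 for 0).
        return Integer.highestOneBit(value);
    }
}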
private int calcInitialNumBucketSegments() {
    int recordLength = buildSideSerializer.getLength();
    double fraction; // fraction of memory to use for the buckets
    if (recordLength == -1) {
        // We don't know the record length, so we start with a small number of buckets and
        // do resizes if necessary.
        // Resizing seems to be quite efficient, so we can err here on the side of too few
        // bucket segments. Even with small records, we lose only ~15% speed.
        fraction = 0.1;
    } else {
        // We know the record length, so we can find a good value for the number of buckets
        // right away, and won't need any resizes later. (enableResize is false in this case,
        // so no resizing will happen.)
        // Reasoning behind the formula:
        // We are aiming for one bucket per record, and one bucket contains one 8-byte pointer.
        // The total memory overhead of an element will be approximately 8+8 bytes, as the
        // record in the record area is preceded by a pointer (for the linked list).
        fraction = 8.0 / (16 + recordLength);
    }

    // We make the number of buckets a power of 2 so that taking the modulo is efficient.
    int ret = Math.max(1, MathUtils.roundDownToPowerOf2((int) (numAllMemorySegments * fraction)));

    // We can't handle more than Integer.MAX_VALUE buckets (e.g. because hash functions return an int).
    if ((long) ret * numBucketsPerSegment > Integer.MAX_VALUE) {
        ret = MathUtils.roundDownToPowerOf2(Integer.MAX_VALUE / numBucketsPerSegment);
    }
    return ret;
}
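// A hypothetical, self-contained demo (names and values are ours) of the comment above:
// when the bucket count n is a power of two, the bucket index hash % n can be computed
// with a single bit mask, hash & (n - 1), avoiding an integer division.
public final class PowerOf2ModuloDemo {
    public static void main(String[] args) {
        int numBuckets = 64;                                // a power of two
        int hash = -123456789;                              // hashes may be negative
        int viaFloorMod = Math.floorMod(hash, numBuckets);  // non-negative remainder
        int viaMask = hash & (numBuckets - 1);              // single AND, no division
        System.out.println(viaFloorMod + " == " + viaMask); // prints: 43 == 43
    }
}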
private static int getBucketBuffersByRowCount(long rowCount, int maxSegs, int segmentSize) {
    int minNumBuckets = (int) Math.ceil(rowCount / 0.5);
    Preconditions.checkArgument(segmentSize % 16 == 0);
    return MathUtils.roundDownToPowerOf2(
            (int) Math.max(1, Math.min(maxSegs, Math.ceil(((double) minNumBuckets) * 16 / segmentSize))));
}
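// A worked call with values of our choosing, to make the sizing arithmetic concrete
// (Integer.highestOneBit stands in for MathUtils.roundDownToPowerOf2, as sketched above):
public final class BucketBuffersWorkedExample {
    public static void main(String[] args) {
        long rowCount = 100_000;
        int segmentSize = 32 * 1024;
        int maxSegs = 256;
        int minNumBuckets = (int) Math.ceil(rowCount / 0.5);              // 200_000 buckets
        int segs = (int) Math.max(1, Math.min(maxSegs,
                Math.ceil(((double) minNumBuckets) * 16 / segmentSize))); // ceil(97.66) = 98
        System.out.println(Integer.highestOneBit(segs));                  // prints: 64
    }
}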
private int calcNumBucketSegments(InternalType[] keyTypes, InternalType[] valueTypes) {
    int calcRecordLength = reusedValue.getFixedLengthPartSize() + BinaryRowUtil.getVariableLength(valueTypes)
            + reusedKey.getFixedLengthPartSize() + BinaryRowUtil.getVariableLength(keyTypes);

    // We aim for a 200% utilization of the bucket table.
    double averageBucketSize = BUCKET_SIZE / LOAD_FACTOR;
    double fraction = averageBucketSize / (averageBucketSize + calcRecordLength + RECORD_EXTRA_LENGTH);

    // We make the number of buckets a power of 2 so that taking the modulo is efficient.
    // Rounding down keeps the bucket count within reservedNumBuffers * fraction.
    int ret = Math.max(1, MathUtils.roundDownToPowerOf2((int) (reservedNumBuffers * fraction)));

    // We can't handle more than Integer.MAX_VALUE buckets (e.g. because hash functions return an int).
    if ((long) ret * numBucketsPerSegment > Integer.MAX_VALUE) {
        ret = MathUtils.roundDownToPowerOf2(Integer.MAX_VALUE / numBucketsPerSegment);
    }
    return ret;
}
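// A hypothetical demo (values are ours) of why the overflow guard above widens to long
// before multiplying: the same product in int arithmetic overflows silently.
public final class OverflowGuardDemo {
    public static void main(String[] args) {
        int ret = 1 << 20;                                     // 1_048_576 segments
        int numBucketsPerSegment = 1 << 12;                    // 4_096 buckets each
        System.out.println(ret * numBucketsPerSegment);        // prints: 0 (int wrap-around)
        System.out.println((long) ret * numBucketsPerSegment); // prints: 4294967296
    }
}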
private BinaryHashBucketArea(BinaryHashTable table, double estimatedRowCount, int maxSegs, double loadFactor) {
    this.table = table;
    this.estimatedRowCount = estimatedRowCount;
    this.loadFactor = loadFactor;
    this.size = 0;

    int minNumBuckets = (int) Math.ceil(estimatedRowCount / loadFactor / NUM_ENTRIES_PER_BUCKET);
    int bucketNumSegs = Math.max(1, Math.min(maxSegs,
            (minNumBuckets >>> table.bucketsPerSegmentBits)
                    + ((minNumBuckets & table.bucketsPerSegmentMask) == 0 ? 0 : 1)));
    int numBuckets = MathUtils.roundDownToPowerOf2(bucketNumSegs << table.bucketsPerSegmentBits);
    int threshold = (int) (numBuckets * NUM_ENTRIES_PER_BUCKET * loadFactor);

    MemorySegment[] buckets = new MemorySegment[bucketNumSegs];
    table.ensureNumBuffersReturned(bucketNumSegs);

    // Go over all segments that are part of the table.
    for (int i = 0; i < bucketNumSegs; i++) {
        final MemorySegment seg = table.getNextBuffer();
        initMemorySegment(seg);
        buckets[i] = seg;
    }
    setNewBuckets(buckets, numBuckets, threshold);
}
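// A hypothetical demo (our names and values) of the shift-and-mask idiom above: with
// bucketsPerSegment = 2^bits, it is just ceiling division by a power of two.
public final class CeilDivByPowerOf2Demo {
    public static void main(String[] args) {
        int bucketsPerSegmentBits = 7;                           // 128 buckets per segment
        int bucketsPerSegmentMask = (1 << bucketsPerSegmentBits) - 1;
        int minNumBuckets = 300;
        int viaShift = (minNumBuckets >>> bucketsPerSegmentBits)
                + ((minNumBuckets & bucketsPerSegmentMask) == 0 ? 0 : 1);
        int viaCeil = (int) Math.ceil(minNumBuckets / 128.0);
        System.out.println(viaShift + " == " + viaCeil);         // prints: 3 == 3
    }
}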
/**
 * Entrance 2: build the table from a spilled partition when the partition fits entirely into main memory.
 */
LongHashPartition(
        LongHashContext context,
        int partitionNum,
        BinaryRowSerializer buildSideSerializer,
        int bucketNumSegs,
        int recursionLevel,
        List<MemorySegment> buffers,
        int lastSegmentLimit) {
    this(context, buildSideSerializer, listToArray(buffers));
    this.partitionNum = partitionNum;
    this.recursionLevel = recursionLevel;

    int numBuckets = MathUtils.roundDownToPowerOf2(bucketNumSegs * segmentSize / 16);
    MemorySegment[] buckets = new MemorySegment[bucketNumSegs];
    for (int i = 0; i < bucketNumSegs; i++) {
        buckets[i] = context.nextSegment();
    }
    setNewBuckets(buckets, numBuckets);
    this.finalBufferLimit = lastSegmentLimit;
}
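// Note on the division by 16 above: it implies each bucket slot spans 16 bytes (presumably
// an 8-byte key plus an 8-byte pointer, though the layout is not shown here), so
// bucketNumSegs * segmentSize / 16 is the total slot capacity of the requested segments,
// rounded down to a power of two.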
private void resize() throws IOException {
    MemorySegment[] oldBuckets = this.buckets;
    int oldNumBuckets = numBuckets;
    int newNumSegs = oldBuckets.length * 2;
    int newNumBuckets = MathUtils.roundDownToPowerOf2(newNumSegs * segmentSize / 16);

    // Request the new bucket segments.
    MemorySegment[] newBuckets = new MemorySegment[newNumSegs];
    for (int i = 0; i < newNumSegs; i++) {
        MemorySegment seg = context.getNextBuffer();
        if (seg == null) {
            final int spilledPart = context.spillPartition();
            if (spilledPart == partitionNum) {
                // This partition is no longer in-memory, so free the new segments.
                context.returnAll(Arrays.asList(newBuckets));
                return;
            }
            seg = context.getNextBuffer();
            if (seg == null) {
                throw new RuntimeException(
                        "Bug in HybridHashJoin: No memory became available after spilling a partition.");
            }
        }
        newBuckets[i] = seg;
    }
    setNewBuckets(newBuckets, newNumBuckets);
    reHash(oldBuckets, oldNumBuckets);
}
table.getNextBuffers(MathUtils.roundDownToPowerOf2(segSize)), numRecords);
MemorySegment[] oldOverflowSegments = overflowSegments;
int newNumSegs = oldBuckets.length * 2;
int newNumBuckets = MathUtils.roundDownToPowerOf2(newNumSegs << table.bucketsPerSegmentBits);
int newThreshold = (int) (newNumBuckets * NUM_ENTRIES_PER_BUCKET * loadFactor);
} else {
    checkArgument(minMemorySize > INIT_BUCKET_MEMORY_IN_BYTES, "The minBucketMemorySize is not valid!");
    this.initBucketSegmentNum = MathUtils.roundDownToPowerOf2((int) (INIT_BUCKET_MEMORY_IN_BYTES / segmentSize));