public void merge(PercentileCounter counter) {
  assert this.compression == counter.compression;
  registers.add(counter.registers);
}
public void add(double v) {
  registers.add(v);
}
@Nonnull
@Override
public TDigest merge(@Nonnull TDigest intermediateResult1, @Nonnull TDigest intermediateResult2) {
  intermediateResult1.add(intermediateResult2);
  return intermediateResult1;
}
@Override
public TDigest applyAggregatedValue(TDigest value, TDigest aggregatedValue) {
  value.add(aggregatedValue);
  _maxByteSize = Math.max(_maxByteSize, value.byteSize());
  return value;
}
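The two merge hooks above both rely on TDigest's in-place merge. A minimal, self-contained sketch of that call, assuming the com.tdunning.math.stats library these snippets appear to target (the class name below is illustrative, not from the snippets):

import com.tdunning.math.stats.TDigest;

public class MergeSketch {
  public static void main(String[] args) {
    TDigest left = TDigest.createMergingDigest(100);   // compression = 100
    TDigest right = TDigest.createMergingDigest(100);
    for (int i = 0; i < 1000; i++) {
      left.add(i / 1000.0);        // values in [0, 1)
      right.add(1 + i / 1000.0);   // values in [1, 2)
    }
    // add(TDigest) folds the other digest's centroids into this one, as in merge(...) above.
    left.add(right);
    // The merged digest covers [0, 2), so the median should be close to 1.0.
    System.out.println(left.quantile(0.5));
  }
}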
@org.openjdk.jmh.annotations.Benchmark
public void add(ThreadState state) {
  if (state.index >= data.length) {
    state.index = 0;
  }
  td.add(data[state.index++]);
}
@Benchmark
public void timeAdd(MergeBench.ThreadState state) {
  if (state.index >= data.length) {
    state.index = 0;
  }
  tdigest.add(data[state.index++]);
}
@Override
public void aggregateGroupBySV(int length, @Nonnull int[] groupKeyArray,
    @Nonnull GroupByResultHolder groupByResultHolder, @Nonnull BlockValSet... blockValSets) {
  double[][] valuesArray = blockValSets[0].getDoubleValuesMV();
  for (int i = 0; i < length; i++) {
    TDigest tDigest = getTDigest(groupByResultHolder, groupKeyArray[i]);
    for (double value : valuesArray[i]) {
      tDigest.add(value);
    }
  }
}
@Override
public void aggregate(int length, @Nonnull AggregationResultHolder aggregationResultHolder,
    @Nonnull BlockValSet... blockValSets) {
  double[][] valuesArray = blockValSets[0].getDoubleValuesMV();
  TDigest tDigest = getTDigest(aggregationResultHolder);
  for (int i = 0; i < length; i++) {
    for (double value : valuesArray[i]) {
      tDigest.add(value);
    }
  }
}
@Override
public void aggregateGroupByMV(int length, @Nonnull int[][] groupKeysArray,
    @Nonnull GroupByResultHolder groupByResultHolder, @Nonnull BlockValSet... blockValSets) {
  double[][] valuesArray = blockValSets[0].getDoubleValuesMV();
  for (int i = 0; i < length; i++) {
    double[] values = valuesArray[i];
    for (int groupKey : groupKeysArray[i]) {
      TDigest tDigest = getTDigest(groupByResultHolder, groupKey);
      for (double value : values) {
        tDigest.add(value);
      }
    }
  }
}
@Override
public TDigest getInitialAggregatedValue(Object rawValue) {
  TDigest initialValue;
  if (rawValue instanceof byte[]) {
    byte[] bytes = (byte[]) rawValue;
    initialValue = deserializeAggregatedValue(bytes);
    _maxByteSize = Math.max(_maxByteSize, bytes.length);
  } else {
    initialValue = TDigest.createMergingDigest(PercentileTDigestAggregationFunction.DEFAULT_TDIGEST_COMPRESSION);
    initialValue.add(((Number) rawValue).doubleValue());
    _maxByteSize = Math.max(_maxByteSize, initialValue.byteSize());
  }
  return initialValue;
}
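getInitialAggregatedValue branches on whether the raw value is already a serialized digest. A hedged sketch of the byte-level round trip behind that branch, using the t-digest library's own asBytes/fromBytes (deserializeAggregatedValue in the snippet is assumed to wrap something equivalent; the class name below is illustrative):

import java.nio.ByteBuffer;
import com.tdunning.math.stats.MergingDigest;
import com.tdunning.math.stats.TDigest;

public class SerDeSketch {
  public static void main(String[] args) {
    TDigest original = TDigest.createMergingDigest(100);
    for (int i = 0; i < 10000; i++) {
      original.add(Math.random());
    }
    // byteSize() reports the buffer size needed to encode the digest's current state.
    ByteBuffer buffer = ByteBuffer.allocate(original.byteSize());
    original.asBytes(buffer);
    TDigest restored = MergingDigest.fromBytes(ByteBuffer.wrap(buffer.array()));
    // Quantile estimates survive the round trip.
    System.out.println(original.quantile(0.5) + " ~ " + restored.quantile(0.5));
  }
}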
@Setup
public void setup() {
  data = new double[10000000];
  for (int i = 0; i < data.length; i++) {
    data[i] = gen.nextDouble();
  }
  if (method.equals("tree")) {
    td = new AVLTreeDigest(compression);
  } else {
    td = new MergingDigest(compression);
  }
  // First values are very cheap to add; we are more interested in the steady state,
  // when the summary is full. Summaries are expected to contain about 5 * compression
  // centroids, hence the factor of 5.
  for (int i = 0; i < 5 * compression; ++i) {
    td.add(gen.nextDouble());
  }
}
@Override
public void aggregate(int length, @Nonnull AggregationResultHolder aggregationResultHolder,
    @Nonnull BlockValSet... blockValSets) {
  TDigest tDigest = getTDigest(aggregationResultHolder);
  FieldSpec.DataType valueType = blockValSets[0].getValueType();
  switch (valueType) {
    case INT:
    case LONG:
    case FLOAT:
    case DOUBLE:
      double[] valueArray = blockValSets[0].getDoubleValuesSV();
      for (int i = 0; i < length; i++) {
        tDigest.add(valueArray[i]);
      }
      break;
    case BYTES:
      // Serialized TDigest
      byte[][] bytesValues = blockValSets[0].getBytesValuesSV();
      for (int i = 0; i < length; i++) {
        tDigest.add(ObjectSerDeUtils.TDIGEST_SER_DE.deserialize(ByteBuffer.wrap(bytesValues[i])));
      }
      break;
    default:
      throw new IllegalStateException("Illegal data type for PERCENTILE_TDIGEST aggregation function: " + valueType);
  }
}
@Override
public void aggregateGroupBySV(int length, @Nonnull int[] groupKeyArray,
    @Nonnull GroupByResultHolder groupByResultHolder, @Nonnull BlockValSet... blockValSets) {
  FieldSpec.DataType valueType = blockValSets[0].getValueType();
  switch (valueType) {
    case INT:
    case LONG:
    case FLOAT:
    case DOUBLE:
      double[] valueArray = blockValSets[0].getDoubleValuesSV();
      for (int i = 0; i < length; i++) {
        TDigest tDigest = getTDigest(groupByResultHolder, groupKeyArray[i]);
        tDigest.add(valueArray[i]);
      }
      break;
    case BYTES:
      // Serialized TDigest
      byte[][] bytesValues = blockValSets[0].getBytesValuesSV();
      for (int i = 0; i < length; i++) {
        TDigest tDigest = getTDigest(groupByResultHolder, groupKeyArray[i]);
        tDigest.add(ObjectSerDeUtils.TDIGEST_SER_DE.deserialize(ByteBuffer.wrap(bytesValues[i])));
      }
      break;
    default:
      throw new IllegalStateException("Illegal data type for PERCENTILE_TDIGEST aggregation function: " + valueType);
  }
}
@Setup
public void setUp() {
  random = ThreadLocalRandom.current();
  tdigest = tdigestFactory.create(compression);
  distribution = distributionFactory.create(random);
  // First values are cheap to add, so pre-fill the t-digest to get more realistic results.
  for (int i = 0; i < 10000; ++i) {
    tdigest.add(distribution.nextDouble());
  }
  for (int i = 0; i < data.length; ++i) {
    data[i] = distribution.nextDouble();
  }
}
@Test
public void testTDigest() {
  for (int i = 0; i < NUM_ITERATIONS; i++) {
    TDigest expected = TDigest.createMergingDigest(PercentileTDigestAggregationFunction.DEFAULT_TDIGEST_COMPRESSION);
    int size = RANDOM.nextInt(100) + 1;
    for (int j = 0; j < size; j++) {
      expected.add(RANDOM.nextDouble());
    }
    byte[] bytes = ObjectSerDeUtils.serialize(expected);
    TDigest actual = ObjectSerDeUtils.deserialize(bytes, ObjectSerDeUtils.ObjectType.TDigest);
    for (int j = 0; j <= 100; j++) {
      assertEquals(actual.quantile(j / 100.0), expected.quantile(j / 100.0), 1e-5);
    }
  }
}
@Test
public void testBasic() {
  int times = 1;
  int compression = 100;
  for (int t = 0; t < times; t++) {
    TDigest tDigest = TDigest.createAvlTreeDigest(compression);
    Random random = new Random();
    int dataSize = 10000;
    List<Double> dataset = Lists.newArrayListWithCapacity(dataSize);
    for (int i = 0; i < dataSize; i++) {
      double d = random.nextDouble();
      tDigest.add(d);
      dataset.add(d);
    }
    Collections.sort(dataset);
    double actualResult = tDigest.quantile(0.5);
    double expectedResult = MathUtil.findMedianInSortedList(dataset);
    assertEquals(expectedResult, actualResult, 0.01);
  }
}
@Test
public void testTDigest() {
  double compression = 100;
  double quantile = 0.5;
  PercentileCounter counter = new PercentileCounter(compression, quantile);
  TDigest tDigest = TDigest.createAvlTreeDigest(compression);
  Random random = new Random();
  int dataSize = 10000;
  List<Double> dataset = Lists.newArrayListWithCapacity(dataSize);
  for (int i = 0; i < dataSize; i++) {
    double d = random.nextDouble();
    counter.add(d);
    tDigest.add(d);
    dataset.add(d);
  }
  double actualResult = counter.getResultEstimate();
  Collections.sort(dataset);
  double expectedResult = tDigest.quantile(quantile);
  assertEquals(expectedResult, actualResult, 0);
}
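For reference, the smallest end-to-end use of the API the tests above exercise: fill a digest with uniform values and read quantiles back. This is a sketch, not from any of the snippets above; the class name and seed are arbitrary.

import java.util.Random;
import com.tdunning.math.stats.TDigest;

public class QuantileSketch {
  public static void main(String[] args) {
    TDigest digest = TDigest.createAvlTreeDigest(100);  // compression = 100
    Random random = new Random(42);
    for (int i = 0; i < 10000; i++) {
      digest.add(random.nextDouble());  // uniform on [0, 1)
    }
    // For uniform data on [0, 1), the q-quantile is approximately q.
    System.out.printf("p50=%.3f p90=%.3f p99=%.3f%n",
        digest.quantile(0.5), digest.quantile(0.9), digest.quantile(0.99));
  }
}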