@Override
public TDigest recordAllData() {
    if (summary.size() != 0) {
        throw new IllegalStateException("Can only ask to record added data on an empty summary");
    }
    summary = new AVLGroupTree(true);
    return super.recordAllData();
}
@Override
public void add(double x, int w, List<Double> data) {
    checkValue(x);
    if (x < min) {
        min = x;
    }
    if (x > max) {
        max = x;
    }
    int start = summary.floor(x);
    if (start == IntAVLTree.NIL) {
        start = summary.first();
    }

    if (start == IntAVLTree.NIL) {
        // empty summary: the new point becomes the first centroid
        assert summary.size() == 0;
        summary.add(x, w, data);
        count = w;
    } else {
        // find the centroid(s) nearest to x
        double minDistance = Double.MAX_VALUE;
        int lastNeighbor = IntAVLTree.NIL;
        for (int neighbor = start; neighbor != IntAVLTree.NIL; neighbor = summary.next(neighbor)) {
            double z = Math.abs(summary.mean(neighbor) - x);
            if (z < minDistance) {
                start = neighbor;
                minDistance = z;
            } else if (z > minDistance) {
                // the distance only grows once we pass the nearest neighbor, so stop
                lastNeighbor = neighbor;
                break;
            }
        }

        // sample uniformly among the nearest centroids that still have room;
        // this improves accuracy when there are many repeated points
        int closest = IntAVLTree.NIL;
        long sum = summary.headSum(start);
        double n = 0;
        for (int neighbor = start; neighbor != lastNeighbor; neighbor = summary.next(neighbor)) {
            assert minDistance == Math.abs(summary.mean(neighbor) - x);
            double q0 = (double) sum / count;
            double q1 = (double) (sum + summary.count(neighbor)) / count;
            double k = count * Math.min(scale.max(q0, compression, count), scale.max(q1, compression, count));
            if (summary.count(neighbor) + w <= k) {
                n++;
                if (gen.nextDouble() < 1 / n) {
                    closest = neighbor;
                }
            }
            sum += summary.count(neighbor);
        }

        if (closest == IntAVLTree.NIL) {
            summary.add(x, w, data);
        } else {
            // merge the new point into the selected centroid
            double centroid = summary.mean(closest);
            int c = summary.count(closest);
            List<Double> d = summary.data(closest);
            if (d != null) {
                if (w == 1) {
                    d.add(x);
                } else {
                    d.addAll(data);
                }
            }
            centroid = weightedAverage(centroid, c, x, w);
            summary.update(closest, centroid, c + w, d, false);
        }
        count += w;

        if (summary.size() > 20 * compression) {
            // may happen with sequential input; force a compaction
            compress();
        }
    }
}
void checkAggregates() {
    checkAggregates(tree.root());
}
@Override
public void compress() {
    if (summary.size() <= 1) {
        return;
    }

    // n0/n1 track the mass before and through the current node;
    // scale.max returns a fraction of the total count, hence the count * scaling
    double n0 = 0;
    double k0 = count * scale.max(n0 / count, compression, count);
    int node = summary.first();
    int w0 = summary.count(node);
    double n1 = n0 + summary.count(node);

    while (node != IntAVLTree.NIL) {
        int after = summary.next(node);
        while (after != IntAVLTree.NIL) {
            int w1 = summary.count(after);
            double k1 = count * scale.max((n1 + w1) / count, compression, count);
            if (w0 + w1 > Math.min(k0, k1)) {
                // the merged centroid would be too heavy; stop merging into this node
                break;
            } else {
                // merge the next centroid into the current one
                double mean = weightedAverage(summary.mean(node), w0, summary.mean(after), w1);
                List<Double> d1 = summary.data(node);
                List<Double> d2 = summary.data(after);
                if (d1 != null && d2 != null) {
                    d1.addAll(d2);
                }
                summary.update(node, mean, w0 + w1, d1, true);
                int tmp = summary.next(after);
                summary.remove(after);
                after = tmp;
                n1 += w1;
                w0 += w1;
            }
        }
        node = after;
        if (node != IntAVLTree.NIL) {
            n0 = n1;
            k0 = count * scale.max(n0 / count, compression, count);
            w0 = summary.count(node);
            n1 = n0 + w0;
        }
    }
}
@Override
public double quantile(double q) {
    if (q < 0 || q > 1) {
        throw new IllegalArgumentException("q should be in [0,1], got " + q);
    }

    AVLGroupTree values = summary;
    if (values.size() == 0) {
        // no data means no quantile
        return Double.NaN;
    } else if (values.size() == 1) {
        // with a single centroid, every quantile is its mean
        return values.iterator().next().mean();
    }

    // index is the offset we want into a notional sorted array of all samples
    final double index = q * count;
    if (index < 1) {
        return min;
    }
    if (index > count - 1) {
        return max;
    }

    int currentNode = values.first();
    int currentWeight = values.count(currentNode);

    // a doublet at either end has one sample at the extreme, so the other
    // sample's position can be inferred exactly
    if (currentWeight == 2 && index <= 2) {
        return 2 * values.mean(currentNode) - min;
    }
    if (values.count(values.last()) == 2 && index > count - 2) {
        return 2 * values.mean(values.last()) - max;
    }

    double weightSoFar = currentWeight / 2.0;
    if (index < weightSoFar) {
        // interpolate between min and the first centroid
        return weightedAverage(min, weightSoFar - index, values.mean(currentNode), index - 1);
    }

    for (int i = 0; i < values.size() - 1; i++) {
        int nextNode = values.next(currentNode);
        int nextWeight = values.count(nextNode);
        // mass between the centers of the current and next centroids
        double dw = (currentWeight + nextWeight) / 2.0;
        if (index < weightSoFar + dw) {
            // singletons are exact samples; do not interpolate across them
            double leftExclusion = 0;
            double rightExclusion = 0;
            if (currentWeight == 1) {
                if (index < weightSoFar + 0.5) {
                    return values.mean(currentNode);
                } else {
                    leftExclusion = 0.5;
                }
            }
            if (nextWeight == 1) {
                if (index >= weightSoFar + dw - 0.5) {
                    return values.mean(nextNode);
                } else {
                    rightExclusion = 0.5;
                }
            }
            double dwNoSingleton = dw - leftExclusion - rightExclusion;
            double base = weightSoFar + leftExclusion;
            return weightedAverage(values.mean(currentNode), base + dwNoSingleton - index,
                    values.mean(nextNode), index - base);
        }
        weightSoFar += dw;
        currentNode = nextNode;
        currentWeight = nextWeight;
    }

    // past the center of the last centroid: interpolate between it and max
    return weightedAverage(values.mean(currentNode), count - index, max, index - weightSoFar);
}
@Override
public void compress() {
    if (summary.size() <= 1) {
        return;
    }

    AVLGroupTree centroids = summary;
    this.summary = new AVLGroupTree(recordAllData);

    // snapshot the centroids in ascending order
    final int[] nodes = new int[centroids.size()];
    nodes[0] = centroids.first();
    for (int i = 1; i < nodes.length; ++i) {
        nodes[i] = centroids.next(nodes[i - 1]);
        assert nodes[i] != IntAVLTree.NIL;
    }
    assert centroids.next(nodes[nodes.length - 1]) == IntAVLTree.NIL;

    // Fisher-Yates shuffle so the centroids are re-inserted in random order
    for (int i = centroids.size() - 1; i > 0; --i) {
        final int other = gen.nextInt(i + 1);
        final int tmp = nodes[other];
        nodes[other] = nodes[i];
        nodes[i] = tmp;
    }

    // re-add every centroid; add() merges them under the current size limits
    for (int node : nodes) {
        add(centroids.mean(node), centroids.count(node), centroids.data(node));
    }
}
@Override
public double quantile(double q) {
    if (q < 0 || q > 1) {
        throw new IllegalArgumentException("q should be in [0,1], got " + q);
    }

    AVLGroupTree values = summary;
    if (values.size() == 0) {
        return Double.NaN;
    } else if (values.size() == 1) {
        return values.iterator().next().mean();
    }

    // if values were stored in a sorted array, index would be the offset we want
    final double index = q * (count - 1);

    double previousMean = Double.NaN;
    double previousIndex = 0;
    int next = values.floorSum((long) index);
    assert next != IntAVLTree.NIL;
    long total = values.headSum(next);
    final int prev = values.prev(next);
    if (prev != IntAVLTree.NIL) {
        previousMean = values.mean(prev);
        previousIndex = total - (values.count(prev) + 1.0) / 2;
    }

    while (true) {
        final double nextIndex = total + (values.count(next) - 1.0) / 2;
        if (nextIndex >= index) {
            if (Double.isNaN(previousMean)) {
                // special case 1: the target index lies before the first centroid
                assert total == 0 : total;
                if (nextIndex == previousIndex) {
                    return values.mean(next);
                }
                // assume values with the same mean are spread uniformly around it
                int next2 = values.next(next);
                final double nextIndex2 = total + values.count(next) + (values.count(next2) - 1.0) / 2;
                previousMean = (nextIndex2 * values.mean(next) - nextIndex * values.mean(next2)) / (nextIndex2 - nextIndex);
            }
            // common case: the target index lies between two centroids
            return quantile(previousIndex, index, nextIndex, previousMean, values.mean(next));
        } else if (values.next(next) == IntAVLTree.NIL) {
            // special case 2: the target index lies beyond the last centroid
            final double nextIndex2 = count - 1;
            final double nextMean2 = (values.mean(next) * (nextIndex2 - previousIndex) - previousMean * (nextIndex2 - nextIndex)) / (nextIndex - previousIndex);
            return quantile(nextIndex, index, nextIndex2, values.mean(next), nextMean2);
        }
        total += values.count(next);
        previousMean = values.mean(next);
        previousIndex = nextIndex;
        next = values.next(next);
    }
}
public double cdf(double x) {
    AVLGroupTree values = summary;
    if (values.size() == 0) {
        return Double.NaN;
    } else if (values.size() == 1) {
        // a single centroid gives a step function at its mean
        if (x < values.mean(values.first())) {
            return 0;
        } else if (x > values.mean(values.first())) {
            return 1;
        } else {
            return 0.5;
        }
    } else {
        if (x < min) {
            return 0;
        } else if (x > max) {
            return 1;
        } else if (x == min) {
            // half of the single sample known to sit exactly at min
            return 0.5 / size();
        } else if (x == max) {
            return 1 - 0.5 / size();
        }

        int first = values.first();
        double firstMean = values.mean(first);
        if (x > min && x < firstMean) {
            return interpolateTail(values, x, first, firstMean, min);
        }

        int last = values.last();
        double lastMean = values.mean(last);
        if (x < max && x > lastMean) {
            return 1 - interpolateTail(values, x, last, lastMean, max);
        }
        assert values.size() >= 2;
        assert x >= firstMean;
        assert x <= lastMean;

        // between the extreme centroids, interpolate linearly between adjacent
        // means, treating each centroid's weight as centered on its mean
        Iterator<Centroid> it = values.iterator();
        Centroid a = it.next();
        double aMean = a.mean();
        double weightSoFar = 0;
        while (it.hasNext()) {
            Centroid b = it.next();
            double bMean = b.mean();
            if (x < bMean) {
                double left = weightSoFar + a.count() / 2.0;
                double dw = (a.count() + b.count()) / 2.0;
                return (left + dw * (x - aMean) / (bMean - aMean)) / size();
            }
            weightSoFar += a.count();
            a = b;
            aMean = bMean;
        }
        // x coincides with the last centroid's mean
        return (weightSoFar + a.count() / 2.0) / size();
    }
}
private double interpolateTail(AVLGroupTree values, double x, int node, double mean, double extremeValue) {
    int count = values.count(node);
    assert count > 1;
    if (count == 2) {
        // other sample must be on the other side of the mean
        return 1.0 / size();
    } else {
        // how much weight is available for interpolation?
        double weight = count / 2.0 - 1;
        // how much is between min and here?
        double partialWeight = (extremeValue - x) / (extremeValue - mean) * weight;
        // account for sample at min along with interpolated weight
        return (partialWeight + 1.0) / size();
    }
}
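// Worked example for interpolateTail (all numbers assumed for illustration):
// a tail centroid with count = 6 at mean = 10, extremeValue = min = 0, and
// size() = 100, queried at x = 5, gives weight = 6/2 - 1 = 2 interpolatable
// samples and partialWeight = (0 - 5) / (0 - 10) * 2 = 1, so the result is
// (1 + 1) / 100 = 0.02: the known sample at min plus one interpolated sample.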
@Override
public boolean add(Centroid centroid) {
    add(centroid.mean(), centroid.count(), centroid.data());
    return true;
}
/**
 * A histogram structure that will record a sketch of a distribution.
 *
 * @param compression How should accuracy be traded for size? A value of N here will give quantile errors
 *                    almost always less than 3/N with considerably smaller errors expected for extreme
 *                    quantiles. Conversely, you should expect to track about 5 N centroids for this
 *                    accuracy.
 */
@SuppressWarnings("WeakerAccess")
public AVLTreeDigest(double compression) {
    this.compression = compression;
    summary = new AVLGroupTree(false);
}
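// Usage sketch (illustrative only; the compression value, seed, and data
// volume below are assumptions, not requirements). With compression = 100,
// quantile errors should almost always stay below 3/100 = 0.03 and roughly
// 500 centroids are retained, per the contract described above.
public static void exampleUsage() {
    AVLTreeDigest digest = new AVLTreeDigest(100);
    java.util.Random rand = new java.util.Random(42);
    for (int i = 0; i < 100_000; i++) {
        digest.add(rand.nextGaussian());
    }
    double median = digest.quantile(0.5);   // near 0 for a standard normal
    double p99 = digest.quantile(0.99);     // extreme quantiles have the smallest errors
}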
/**
 * Return the last node whose centroid is less than <code>centroid</code>.
 */
@SuppressWarnings("WeakerAccess")
public int floor(double centroid) {
    int floor = IntAVLTree.NIL;
    for (int node = tree.root(); node != IntAVLTree.NIL; ) {
        final int cmp = Double.compare(centroid, mean(node));
        if (cmp <= 0) {
            node = tree.left(node);
        } else {
            floor = node;
            node = tree.right(node);
        }
    }
    return floor;
}
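// Illustrative sketch (a hypothetical helper, not part of the class): floor()
// is what the add() method above uses to find the starting point for its
// nearest-neighbor scan. The tree contents assumed here are for illustration.
static void floorExample(AVLGroupTree tree) {
    // suppose tree holds centroids with means 1.0, 2.0 and 3.0
    int node = tree.floor(2.5);       // the node with mean 2.0
    int none = tree.floor(0.5);       // IntAVLTree.NIL: no centroid lies below 0.5
    if (none == IntAVLTree.NIL) {
        node = tree.first();          // callers fall back to the smallest centroid
    }
}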
/**
 * Return the last node so that the sum of counts of nodes that are before
 * it is less than or equal to <code>sum</code>.
 */
@SuppressWarnings("WeakerAccess")
public int floorSum(long sum) {
    int floor = IntAVLTree.NIL;
    for (int node = tree.root(); node != IntAVLTree.NIL; ) {
        final int left = tree.left(node);
        final long leftCount = aggregatedCounts[left];
        if (leftCount <= sum) {
            floor = node;
            sum -= leftCount + count(node);
            node = tree.right(node);
        } else {
            node = tree.left(node);
        }
    }
    return floor;
}
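// Illustrative sketch (a hypothetical helper; the rank value is assumed):
// floorSum() and headSum() together locate the centroid that contains a given
// rank, which is the basis of the rank-based quantile lookup above.
static void rankExample(AVLGroupTree tree) {
    long rank = 42;
    int node = tree.floorSum(rank);   // last node whose preceding counts sum to <= rank
    long before = tree.headSum(node); // total count of all centroids before node
    // rank falls inside node whenever before <= rank < before + tree.count(node),
    // which holds as long as rank is less than the total count in the tree
}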
public void add(Centroid centroid) {
    add(centroid.mean(), centroid.count(), centroid.data());
}