/**
 * Evaluates the wrapped field as an ArrayOfDoublesSketch and reports its estimate
 * together with the lower and upper bounds taken at {@code numStdDevs}.
 *
 * @param combinedAggregators aggregator values passed through to the wrapped field
 * @return a three-element array laid out as {estimate, lower bound, upper bound}
 */
@Override
public double[] compute(final Map<String, Object> combinedAggregators) {
  final ArrayOfDoublesSketch sketch =
      (ArrayOfDoublesSketch) getField().compute(combinedAggregators);
  final double estimate = sketch.getEstimate();
  final double lowerBound = sketch.getLowerBound(numStdDevs);
  final double upperBound = sketch.getUpperBound(numStdDevs);
  return new double[] {estimate, lowerBound, upperBound};
}
/**
 * Evaluates the wrapped field as an ArrayOfDoublesSketch and reports how many
 * entries it retains.
 *
 * @param combinedAggregators aggregator values passed through to the wrapped field
 * @return the number of retained entries of the computed sketch
 */
@Override
public Integer compute(final Map<String, Object> combinedAggregators) {
  final ArrayOfDoublesSketch sketch =
      (ArrayOfDoublesSketch) getField().compute(combinedAggregators);
  final int retainedEntries = sketch.getRetainedEntries();
  return retainedEntries;
}
/**
 * Builds one SummaryStatistics accumulator per value position of the sketch and
 * feeds every retained entry's values into the accumulator at the same index.
 *
 * @param sketch the sketch whose retained values are summarized
 * @return an array of length {@code sketch.getNumValues()} with the filled accumulators
 */
private static SummaryStatistics[] getStats(final ArrayOfDoublesSketch sketch) {
  final int numValues = sketch.getNumValues();
  final SummaryStatistics[] perPosition = new SummaryStatistics[numValues];
  for (int pos = 0; pos < numValues; pos++) {
    perPosition[pos] = new SummaryStatistics();
  }

  final ArrayOfDoublesSketchIterator entries = sketch.iterator();
  while (entries.next()) {
    int pos = 0;
    for (final double value : entries.getValues()) {
      perPosition[pos++].addValue(value);
    }
  }
  return perPosition;
}
/**
 * Renders a multi-line, human-readable summary of this sketch: estimate and bounds
 * (at 2 standard deviations), theta, mode flags, retained entries and the seed hash.
 * The updatable-only fields are included when this instance is an
 * ArrayOfDoublesUpdatableSketch.
 *
 * @return the summary text, each line terminated with {@code LS}
 */
@Override
public String toString() {
  final int unsignedSeedHash = Short.toUnsignedInt(getSeedHash());
  final StringBuilder out = new StringBuilder();

  out.append("### ").append(this.getClass().getSimpleName()).append(" SUMMARY: ").append(LS);
  out.append(" Estimate : ").append(getEstimate()).append(LS);
  out.append(" Upper Bound, 95% conf : ").append(getUpperBound(2)).append(LS);
  out.append(" Lower Bound, 95% conf : ").append(getLowerBound(2)).append(LS);
  out.append(" Theta (double) : ").append(getTheta()).append(LS);
  out.append(" Theta (long) : ").append(getThetaLong()).append(LS);
  out.append(" EstMode? : ").append(isEstimationMode()).append(LS);
  out.append(" Empty? : ").append(isEmpty()).append(LS);
  out.append(" Retained Entries : ").append(getRetainedEntries()).append(LS);

  // Fields below exist only on the updatable flavour of the sketch.
  if (this instanceof ArrayOfDoublesUpdatableSketch) {
    final ArrayOfDoublesUpdatableSketch self = (ArrayOfDoublesUpdatableSketch) this;
    out.append(" Nominal Entries (k) : ").append(self.getNominalEntries()).append(LS);
    out.append(" Current Capacity : ").append(self.getCurrentCapacity()).append(LS);
    out.append(" Resize Factor : ").append(self.getResizeFactor().getValue()).append(LS);
    out.append(" Sampling Probability (p): ").append(self.getSamplingProbability()).append(LS);
  }

  // Seed hash is shown both in hex and in decimal.
  out.append(" Seed Hash : ")
      .append(Integer.toHexString(unsignedSeedHash))
      .append(" | ")
      .append(unsignedSeedHash)
      .append(LS);
  out.append("### END SKETCH SUMMARY").append(LS);
  return out.toString();
}
@Override public void update(final ArrayOfDoublesSketch a, final ArrayOfDoublesSketch b) { if (a != null) { Util.checkSeedHashes(seedHash_, a.getSeedHash()); } if (b != null) { Util.checkSeedHashes(seedHash_, b.getSeedHash()); } if (a != null) { //stays this way even if we end up with no result entries isEmpty_ = a.isEmpty(); final long thetaA = a == null ? Long.MAX_VALUE : a.getThetaLong(); final long thetaB = b == null ? Long.MAX_VALUE : b.getThetaLong(); theta_ = Math.min(thetaA, thetaB); if (a == null || a.getRetainedEntries() == 0) { return; } if (b == null || b.getRetainedEntries() == 0) { getNoMatchSetFromSketch(a); } else { hashTable = convertToHashTable(b); final int lgHashTableSize = Integer.numberOfTrailingZeros(hashTable.length); final int noMatchSize = a.getRetainedEntries(); keys_ = new long[noMatchSize]; values_ = new double[noMatchSize * numValues_]; final ArrayOfDoublesSketchIterator it = a.iterator(); while (it.next()) { final int index = HashOperations.hashSearch(hashTable, lgHashTableSize, it.getKey());
/**
 * Round-trips a sketch built with sampling probability 0.5 through a byte array
 * (heapify) and checks estimate, retained fraction and theta on the restored copy.
 */
@Test
public void serializeDeserializeSampling() {
  final int nominalEntries = 16384;
  final int uniques = nominalEntries;

  final ArrayOfDoublesUpdatableSketch original = new ArrayOfDoublesUpdatableSketchBuilder()
      .setNominalEntries(nominalEntries)
      .setSamplingProbability(0.5f)
      .build();
  for (int key = 0; key < uniques; key++) {
    original.update(key, new double[] {1.0});
  }

  final byte[] serialized = original.toByteArray();
  final ArrayOfDoublesSketch restored = ArrayOfDoublesSketch.heapify(Memory.wrap(serialized));

  Assert.assertTrue(restored.isEstimationMode());
  Assert.assertEquals(restored.getEstimate() / uniques, 1.0, 0.01);
  Assert.assertEquals(restored.getRetainedEntries() / (double) uniques, 0.5, 0.01);
  Assert.assertEquals(original.getTheta(), restored.getTheta());
}
/**
 * Round-trips a memory-backed sketch built with sampling probability 0.5 through a
 * byte array (wrap) and checks estimate, retained fraction and theta on the wrapped copy.
 */
@Test
public void serializeDeserializeSampling() {
  final int nominalEntries = 16384;
  final int uniques = nominalEntries;

  final ArrayOfDoublesUpdatableSketch original = new ArrayOfDoublesUpdatableSketchBuilder()
      .setNominalEntries(nominalEntries)
      .setSamplingProbability(0.5f)
      .build(WritableMemory.wrap(new byte[1000000]));
  for (int key = 0; key < uniques; key++) {
    original.update(key, new double[] {1.0});
  }

  final byte[] serialized = original.toByteArray();
  final ArrayOfDoublesSketch wrapped = ArrayOfDoublesSketch.wrap(WritableMemory.wrap(serialized));

  Assert.assertTrue(wrapped.isEstimationMode());
  Assert.assertEquals(wrapped.getEstimate() / uniques, 1.0, 0.01);
  Assert.assertEquals(wrapped.getRetainedEntries() / (double) uniques, 0.5, 0.01);
  Assert.assertEquals(original.getTheta(), wrapped.getTheta());
}
/**
 * Estimates the cardinality of the set, i.e. the number of unique values presented
 * to the sketch. Outside estimation mode this is the retained-entry count itself;
 * in estimation mode the retained-entry count is divided by theta.
 * @return best estimate of the number of unique values
 */
public double getEstimate() {
  return isEstimationMode()
      ? getRetainedEntries() / getTheta()
      : getRetainedEntries();
}
@Test public void serializeDeserializeEstimationNoResize() throws Exception { ArrayOfDoublesUpdatableSketch sketch1 = new ArrayOfDoublesUpdatableSketchBuilder(). setResizeFactor(ResizeFactor.X1).build(); for (int j = 0; j < 10; j++) { for (int i = 0; i < 8192; i++) sketch1.update(i, new double[] {1.0}); } byte[] byteArray = sketch1.toByteArray(); //for visual testing //TestUtil.writeBytesToFile(byteArray, "ArrayOfDoublesQuickSelectSketch4K.data"); ArrayOfDoublesSketch sketch2 = ArrayOfDoublesSketch.heapify(Memory.wrap(byteArray)); Assert.assertTrue(sketch2.isEstimationMode()); Assert.assertEquals(sketch2.getEstimate(), 8192, 8192 * 0.99); Assert.assertEquals(sketch1.getTheta(), sketch2.getTheta()); double[][] values = sketch2.getValues(); Assert.assertTrue(values.length >= 4096); for (double[] array: values) Assert.assertEquals(array[0], 10.0); }
@Test public void serializeDeserializeEstimationNoResize() throws Exception { ArrayOfDoublesUpdatableSketch sketch1 = new ArrayOfDoublesUpdatableSketchBuilder().setResizeFactor(ResizeFactor.X1). build(WritableMemory.wrap(new byte[1000000])); for (int j = 0; j < 10; j++) { for (int i = 0; i < 8192; i++) { sketch1.update(i, new double[] {1.0}); } } byte[] byteArray = sketch1.toByteArray(); //for visual testing //TestUtil.writeBytesToFile(byteArray, "ArrayOfDoublesQuickSelectSketch4K.data"); ArrayOfDoublesSketch sketch2 = ArrayOfDoublesSketch.wrap(WritableMemory.wrap(byteArray)); Assert.assertTrue(sketch2.isEstimationMode()); Assert.assertEquals(sketch2.getEstimate(), 8192, 8192 * 0.99); Assert.assertEquals(sketch1.getTheta(), sketch2.getTheta()); double[][] values = sketch2.getValues(); Assert.assertTrue(values.length >= 4096); for (double[] array: values) { Assert.assertEquals(array[0], 10.0); } }
/**
 * Compacts a memory-backed sketch holding 8192 keys, wraps its serialized bytes
 * and checks that the wrapped sketch agrees with the compact one on estimate and theta.
 */
@Test
public void serializeDeserializeEstimation() {
  final ArrayOfDoublesUpdatableSketch updatable =
      new ArrayOfDoublesUpdatableSketchBuilder().build(WritableMemory.wrap(new byte[1000000]));
  for (int key = 0; key < 8192; key++) {
    updatable.update(key, new double[] {1.0});
  }

  final ArrayOfDoublesCompactSketch compact =
      updatable.compact(WritableMemory.wrap(new byte[1000000]));
  final byte[] serialized = compact.toByteArray();
  final ArrayOfDoublesSketch wrapped =
      ArrayOfDoublesSketches.wrapSketch(WritableMemory.wrap(serialized));

  Assert.assertFalse(wrapped.isEmpty());
  Assert.assertTrue(wrapped.isEstimationMode());
  Assert.assertEquals(wrapped.getEstimate(), compact.getEstimate());
  Assert.assertEquals(wrapped.getThetaLong(), compact.getThetaLong());
}
/**
 * Updates the union by merging in every retained entry of the given sketch.
 * A null input is ignored. Otherwise the seed hash and the number of values per
 * entry are validated first; an empty input is then ignored as well.
 * @param sketchIn sketch to add to the union
 */
public void update(final ArrayOfDoublesSketch sketchIn) {
  if (sketchIn == null) {
    return;
  }
  Util.checkSeedHashes(seedHash_, sketchIn.getSeedHash());

  final int unionNumValues = sketch_.getNumValues();
  final int incomingNumValues = sketchIn.getNumValues();
  if (unionNumValues != incomingNumValues) {
    throw new SketchesArgumentException("Incompatible sketches: number of values mismatch "
        + unionNumValues + " and " + incomingNumValues);
  }
  if (sketchIn.isEmpty()) {
    return;
  }

  // Keep the smaller of the two thetas.
  final long incomingTheta = sketchIn.getThetaLong();
  if (incomingTheta < theta_) {
    theta_ = incomingTheta;
  }

  for (final ArrayOfDoublesSketchIterator entries = sketchIn.iterator(); entries.next(); ) {
    sketch_.merge(entries.getKey(), entries.getValues());
  }
}
/**
 * Copies every retained entry of the given sketch into this object's result
 * arrays: {@code count_} receives the entry count, {@code keys_} the keys, and
 * {@code values_} the values, stored as consecutive runs of {@code numValues_} per entry.
 *
 * @param sketch the sketch whose entries are copied verbatim
 */
private void getNoMatchSetFromSketch(final ArrayOfDoublesSketch sketch) {
  count_ = sketch.getRetainedEntries();
  keys_ = new long[count_];
  values_ = new double[count_ * numValues_];

  int slot = 0;
  int valuesOffset = 0;
  for (final ArrayOfDoublesSketchIterator entries = sketch.iterator(); entries.next(); ) {
    keys_[slot] = entries.getKey();
    System.arraycopy(entries.getValues(), 0, values_, valuesOffset, numValues_);
    slot++;
    valuesOffset += numValues_;
  }
}
/**
 * Reduces the given object, which is cast to ArrayOfDoublesSketch, to its estimate.
 *
 * @param object the sketch to finalize
 * @return the sketch's estimate
 */
@Override
public Object finalizeComputation(final Object object) {
  final ArrayOfDoublesSketch sketch = (ArrayOfDoublesSketch) object;
  return sketch.getEstimate();
}
@Test public void heapifyAndUpdateUnion() { int numUniques = 10000; int key = 0; ArrayOfDoublesUpdatableSketch sketch1 = new ArrayOfDoublesUpdatableSketchBuilder().build(); for (int i = 0; i < numUniques; i++) { sketch1.update(key++, new double[] {1}); } ArrayOfDoublesUnion union1 = new ArrayOfDoublesSetOperationBuilder().buildUnion(); union1.update(sketch1); ArrayOfDoublesUnion union2 = ArrayOfDoublesSketches.heapifyUnion(Memory.wrap(union1.toByteArray())); ArrayOfDoublesSketch resultSketch = union2.getResult(); Assert.assertTrue(resultSketch.isEstimationMode()); Assert.assertEquals(resultSketch.getEstimate(), numUniques, numUniques * 0.04); // make sure union update actually needs to modify the union ArrayOfDoublesUpdatableSketch sketch2 = new ArrayOfDoublesUpdatableSketchBuilder().build(); for (int i = 0; i < numUniques; i++) { sketch2.update(key++, new double[] {1}); } union2.update(sketch2); }
/**
 * Heapifies the given Memory, using the given seed, as an ArrayOfDoublesSketch.
 * Delegates to {@code ArrayOfDoublesSketch.heapify(mem, seed)}.
 * @param mem the given Memory
 * @param seed the given seed
 * @return an ArrayOfDoublesSketch
 */
public static ArrayOfDoublesSketch heapifySketch(final Memory mem, final long seed) {
  final ArrayOfDoublesSketch heapified = ArrayOfDoublesSketch.heapify(mem, seed);
  return heapified;
}
/**
 * Evaluates the first two fields as ArrayOfDoublesSketches and runs a t-test for
 * each value position, comparing the statistics gathered from both sketches.
 *
 * @param combinedAggregators aggregator values passed through to the wrapped fields
 * @return one p-value per value position
 * @throws IAE if the two sketches do not have the same number of values
 */
@Override
public double[] compute(final Map<String, Object> combinedAggregators) {
  final ArrayOfDoublesSketch first =
      (ArrayOfDoublesSketch) getFields().get(0).compute(combinedAggregators);
  final ArrayOfDoublesSketch second =
      (ArrayOfDoublesSketch) getFields().get(1).compute(combinedAggregators);

  final int firstNumValues = first.getNumValues();
  final int secondNumValues = second.getNumValues();
  if (firstNumValues != secondNumValues) {
    throw new IAE(
        "Sketches have different number of values: %d and %d",
        firstNumValues,
        secondNumValues
    );
  }

  final SummaryStatistics[] firstStats = getStats(first);
  final SummaryStatistics[] secondStats = getStats(second);

  final double[] pValues = new double[firstNumValues];
  final TTest tTest = new TTest();
  int position = 0;
  while (position < pValues.length) {
    pValues[position] = tTest.tTest(firstStats[position], secondStats[position]);
    position++;
  }
  return pValues;
}
/**
 * Tells whether the sketch is in Estimation Mode (as opposed to Exact Mode).
 * That is the case when {@code theta_} is below {@code Long.MAX_VALUE}
 * (theta &lt; 1.0) AND isEmpty() is false.
 * @return true if the sketch is in estimation mode.
 */
public boolean isEstimationMode() {
  if (theta_ < Long.MAX_VALUE) {
    return !isEmpty();
  }
  return false;
}
/**
 * Compacts a three-key sketch, heapifies its serialized bytes and checks that the
 * restored sketch is non-empty, in exact mode, and reports exactly three entries
 * whose single value is 1.0.
 */
@Test
public void serializeDeserializeSmallExact() {
  final ArrayOfDoublesUpdatableSketch updatable =
      new ArrayOfDoublesUpdatableSketchBuilder().build();
  for (final String key : new String[] {"a", "b", "c"}) {
    updatable.update(key, new double[] {1.0});
  }

  final ArrayOfDoublesCompactSketch compact = updatable.compact();
  final byte[] serialized = compact.toByteArray();
  final ArrayOfDoublesSketch restored =
      ArrayOfDoublesSketches.heapifySketch(Memory.wrap(serialized));

  Assert.assertFalse(restored.isEmpty());
  Assert.assertFalse(restored.isEstimationMode());
  Assert.assertEquals(restored.getEstimate(), 3.0);
  Assert.assertEquals(restored.getLowerBound(1), 3.0);
  Assert.assertEquals(restored.getUpperBound(1), 3.0);
  Assert.assertEquals(restored.getRetainedEntries(), 3);
  Assert.assertEquals(restored.getThetaLong(), Long.MAX_VALUE);
  Assert.assertEquals(restored.getTheta(), 1.0);

  final double[][] retainedValues = restored.getValues();
  Assert.assertEquals(retainedValues.length, 3);
  for (int row = 0; row < retainedValues.length; row++) {
    Assert.assertEquals(retainedValues[row][0], 1.0);
  }
}
return; Util.checkSeedHashes(seedHash_, sketchIn.getSeedHash()); theta_ = min(theta_, sketchIn.getThetaLong()); isEmpty_ |= sketchIn.isEmpty(); if (isEmpty_ || sketchIn.getRetainedEntries() == 0) { sketch_ = null; return; sketch_ = createSketch(sketchIn.getRetainedEntries(), numValues_, seed_); final ArrayOfDoublesSketchIterator it = sketchIn.iterator(); while (it.next()) { sketch_.insert(it.getKey(), it.getValues()); final int matchSize = min(sketch_.getRetainedEntries(), sketchIn.getRetainedEntries()); final long[] matchKeys = new long[matchSize]; final double[][] matchValues = new double[matchSize][]; int matchCount = 0; final ArrayOfDoublesSketchIterator it = sketchIn.iterator(); while (it.next()) { final double[] values = sketch_.find(it.getKey());