/**
 * Extracts the distinct-values count of {@code columnName} from a plan node estimate.
 *
 * @param planNodeStatsEstimate estimate in which the column's symbol statistics are looked up
 * @param statsContext context passed through to {@code getSymbolStatistics}
 * @return the distinct-values count after conversion by {@code asOptional}
 */
@Override
public OptionalDouble getValueFromPlanNodeEstimate(PlanNodeStatsEstimate planNodeStatsEstimate, StatsContext statsContext)
{
    double distinctValues = getSymbolStatistics(planNodeStatsEstimate, columnName, statsContext).getDistinctValuesCount();
    return asOptional(distinctValues);
}
/**
 * Computes anti-join statistics by delegating to {@code compute} with an anti-join specific
 * distinct-values function.
 *
 * <p>The function yields the larger of (a) the source symbol's distinct-values count scaled by
 * {@code MIN_ANTI_JOIN_FILTER_COEFFICIENT} and (b) the source symbol's distinct-values count
 * minus the filtering symbol's distinct-values count.
 *
 * @param sourceStats statistics of the source side
 * @param filteringSourceStats statistics of the filtering side
 * @param sourceJoinSymbol join symbol on the source side
 * @param filteringSourceJoinSymbol join symbol on the filtering side
 * @return whatever {@code compute} produces for these inputs
 */
public static PlanNodeStatsEstimate computeAntiJoin(PlanNodeStatsEstimate sourceStats, PlanNodeStatsEstimate filteringSourceStats, Symbol sourceJoinSymbol, Symbol filteringSourceJoinSymbol)
{
    return compute(
            sourceStats,
            filteringSourceStats,
            sourceJoinSymbol,
            filteringSourceJoinSymbol,
            (sourceSide, filteringSide) -> {
                double sourceDistinct = sourceSide.getDistinctValuesCount();
                double lowerBound = sourceDistinct * MIN_ANTI_JOIN_FILTER_COEFFICIENT;
                double unmatched = sourceDistinct - filteringSide.getDistinctValuesCount();
                return max(lowerBound, unmatched);
            });
}
/**
 * Computes semi-join statistics by delegating to {@code compute} with a semi-join specific
 * distinct-values function.
 *
 * <p>The function yields the smaller of the filtering symbol's and the source symbol's
 * distinct-values counts.
 *
 * @param sourceStats statistics of the source side
 * @param filteringSourceStats statistics of the filtering side
 * @param sourceJoinSymbol join symbol on the source side
 * @param filteringSourceJoinSymbol join symbol on the filtering side
 * @return whatever {@code compute} produces for these inputs
 */
public static PlanNodeStatsEstimate computeSemiJoin(PlanNodeStatsEstimate sourceStats, PlanNodeStatsEstimate filteringSourceStats, Symbol sourceJoinSymbol, Symbol filteringSourceJoinSymbol)
{
    return compute(
            sourceStats,
            filteringSourceStats,
            sourceJoinSymbol,
            filteringSourceJoinSymbol,
            (sourceSide, filteringSide) -> {
                double filteringDistinct = filteringSide.getDistinctValuesCount();
                double sourceDistinct = sourceSide.getDistinctValuesCount();
                return min(filteringDistinct, sourceDistinct);
            });
}
/**
 * Asserts that the distinct-values count of the statistics under test is NaN.
 *
 * @return this assertion, for chaining
 */
public SymbolStatsAssertion distinctValuesCountUnknown()
{
    double actual = statistics.getDistinctValuesCount();
    assertTrue(isNaN(actual), "expected unknown distinctValuesCount but got " + actual);
    return this;
}
/**
 * Creates a range from a symbol estimate's low value, high value and distinct-values count.
 *
 * @param estimate symbol statistics to read the three values from
 * @return a new {@code StatisticRange} built from those values
 */
public static StatisticRange from(SymbolStatsEstimate estimate)
{
    double low = estimate.getLowValue();
    double high = estimate.getHighValue();
    double distinctValues = estimate.getDistinctValuesCount();
    return new StatisticRange(low, high, distinctValues);
}
/**
 * Asserts, via {@code assertEstimateEquals}, that the distinct-values count of the statistics
 * under test matches the expected value.
 *
 * @param expected expected distinct-values count
 * @return this assertion, for chaining
 */
public SymbolStatsAssertion distinctValuesCount(double expected)
{
    double actual = statistics.getDistinctValuesCount();
    assertEstimateEquals(actual, expected, "distinctValuesCount mismatch");
    return this;
}
/**
 * Lowers a symbol's distinct-values count when it exceeds the number of non-null rows.
 *
 * <p>The input is returned unchanged when the distinct-values count, the row count or the nulls
 * fraction is NaN, or when the distinct-values count already fits within
 * {@code rowCount * (1 - nullsFraction)}. Otherwise the distinct-values count is replaced by the
 * average of {@code min(ndv, rowCount)} and the non-null row count, and the nulls fraction is halved.
 *
 * @param symbolStats statistics of the symbol to cap
 * @param rowCount row count to cap against
 * @return the original statistics, or a copy with the adjusted values
 */
private SymbolStatsEstimate capNDV(SymbolStatsEstimate symbolStats, double rowCount)
{
    double distinctValues = symbolStats.getDistinctValuesCount();
    double nullsFraction = symbolStats.getNullsFraction();
    double nonNullRows = rowCount * (1 - nullsFraction);

    boolean anyUnknown = isNaN(distinctValues) || isNaN(rowCount) || isNaN(nullsFraction);
    if (anyUnknown || distinctValues <= nonNullRows) {
        return symbolStats;
    }

    // Both replacement values depend only on the inputs, so compute them up front.
    double cappedDistinctValues = (min(distinctValues, rowCount) + nonNullRows) / 2;
    double halvedNullsFraction = nullsFraction / 2;
    return symbolStats
            .mapDistinctValuesCount(ignored -> cappedDistinctValues)
            .mapNullsFraction(ignored -> halvedNullsFraction);
}
/**
 * Builds the estimate for grouping {@code sourceStats} by the given symbols.
 *
 * <p>For every grouping symbol the source statistics are carried over with the nulls fraction
 * remapped: a fraction of exactly 0 stays 0, anything else becomes {@code 1 / (ndv + 1)}.
 * The output row count is the product over all grouping symbols of {@code ndv + 1} (or just
 * {@code ndv} when the symbol's nulls fraction is exactly 0), capped by the source row count.
 * Statistics for each aggregation output come from {@code estimateAggregationStats}.
 *
 * @param sourceStats statistics of the input being grouped
 * @param groupBySymbols grouping key symbols
 * @param aggregations aggregation output symbols mapped to their aggregations
 * @return the estimate assembled as described above
 */
public static PlanNodeStatsEstimate groupBy(PlanNodeStatsEstimate sourceStats, Collection<Symbol> groupBySymbols, Map<Symbol, Aggregation> aggregations)
{
    PlanNodeStatsEstimate.Builder result = PlanNodeStatsEstimate.builder();

    double groupCount = 1;
    for (Symbol key : groupBySymbols) {
        SymbolStatsEstimate keyStats = sourceStats.getSymbolStatistics(key);
        double distinctValues = keyStats.getDistinctValuesCount();

        result.addSymbolStatistics(
                key,
                keyStats.mapNullsFraction(fraction -> fraction == 0.0 ? 0.0 : 1.0 / (distinctValues + 1)));

        // A key that may be null contributes one extra group for the null value.
        boolean mayBeNull = !(keyStats.getNullsFraction() == 0.0);
        groupCount *= distinctValues + (mayBeNull ? 1 : 0);
    }
    result.setOutputRowCount(min(groupCount, sourceStats.getOutputRowCount()));

    for (Map.Entry<Symbol, Aggregation> entry : aggregations.entrySet()) {
        Symbol outputSymbol = entry.getKey();
        Aggregation aggregation = entry.getValue();
        result.addSymbolStatistics(outputSymbol, estimateAggregationStats(aggregation, sourceStats));
    }

    return result.build();
}
/**
 * Asserts that the statistics under test describe an empty range: low and high values are both
 * NaN, the distinct-values count and average row size are 0, and the nulls fraction is 1.
 *
 * @return this assertion, for chaining
 */
public SymbolStatsAssertion emptyRange()
{
    double low = statistics.getLowValue();
    double high = statistics.getHighValue();
    boolean bothBoundsUnknown = isNaN(low) && isNaN(high);
    assertTrue(bothBoundsUnknown, "expected empty range (NaN, NaN) but got (" + low + ", " + high + ") instead");

    assertEquals(statistics.getDistinctValuesCount(), 0., "expected no distinctValuesCount");
    assertEquals(statistics.getAverageRowSize(), 0., "expected 0 average row size");
    assertEquals(statistics.getNullsFraction(), 1., "expected all nulls");
    return this;
}
/**
 * Asserts that {@code subtractSubsetStats(first, second)} yields the expected distinct-values
 * count for {@code SYMBOL}.
 *
 * @param first first argument passed to {@code subtractSubsetStats}
 * @param second second argument passed to {@code subtractSubsetStats}
 * @param expected expected distinct-values count of {@code SYMBOL} in the result
 */
private static void assertSubtractNumberOfDistinctValues(PlanNodeStatsEstimate first, PlanNodeStatsEstimate second, double expected)
{
    double actual = subtractSubsetStats(first, second)
            .getSymbolStatistics(SYMBOL)
            .getDistinctValuesCount();
    assertEquals(actual, expected);
}
/**
 * Asserts that {@code capStats(stats, cap)} yields the expected distinct-values count for
 * {@code SYMBOL}.
 *
 * @param stats statistics to be capped
 * @param cap statistics used as the cap
 * @param expected expected distinct-values count of {@code SYMBOL} in the result
 */
private static void assertCapNumberOfDistinctValues(PlanNodeStatsEstimate stats, PlanNodeStatsEstimate cap, double expected)
{
    double actual = capStats(stats, cap)
            .getSymbolStatistics(SYMBOL)
            .getDistinctValuesCount();
    assertEquals(actual, expected);
}
/**
 * Asserts that {@code addStatsAndMaxDistinctValues(first, second)} yields the expected
 * distinct-values count for {@code SYMBOL}.
 *
 * @param first first argument passed to {@code addStatsAndMaxDistinctValues}
 * @param second second argument passed to {@code addStatsAndMaxDistinctValues}
 * @param expected expected distinct-values count of {@code SYMBOL} in the result
 */
private static void assertMaxNumberOfDistinctValues(PlanNodeStatsEstimate first, PlanNodeStatsEstimate second, double expected)
{
    double actual = addStatsAndMaxDistinctValues(first, second)
            .getSymbolStatistics(SYMBOL)
            .getDistinctValuesCount();
    assertEquals(actual, expected);
}
/**
 * Asserts that {@code addStatsAndSumDistinctValues(first, second)} yields the expected
 * distinct-values count for {@code SYMBOL}.
 *
 * @param first first argument passed to {@code addStatsAndSumDistinctValues}
 * @param second second argument passed to {@code addStatsAndSumDistinctValues}
 * @param expected expected distinct-values count of {@code SYMBOL} in the result
 */
private static void assertSumNumberOfDistinctValues(PlanNodeStatsEstimate first, PlanNodeStatsEstimate second, double expected)
{
    double actual = addStatsAndSumDistinctValues(first, second)
            .getSymbolStatistics(SYMBOL)
            .getDistinctValuesCount();
    assertEquals(actual, expected);
}
/**
 * Estimates the statistics remaining after filtering with "expression is not equal to literal".
 *
 * <p>The excluded range is the single point at the literal when its value is known, otherwise
 * the whole real line; in both cases it is given a distinct-values count of 1. The output row
 * count is the input row count scaled by the non-null fraction of the expression and by one
 * minus the overlap of the expression range with the excluded range. When the expression maps
 * to a symbol, that symbol's statistics are replaced with a copy that has no nulls and one
 * fewer distinct value (never below 0).
 *
 * @param inputStatistics statistics of the filter input
 * @param expressionStatistics statistics of the compared expression
 * @param expressionSymbol symbol the expression corresponds to, if any
 * @param literalValue numeric value of the literal, if known
 * @return the estimate after applying the filter
 */
private static PlanNodeStatsEstimate estimateExpressionNotEqualToLiteral(
        PlanNodeStatsEstimate inputStatistics,
        SymbolStatsEstimate expressionStatistics,
        Optional<Symbol> expressionSymbol,
        OptionalDouble literalValue)
{
    double excludedLow = literalValue.isPresent() ? literalValue.getAsDouble() : NEGATIVE_INFINITY;
    double excludedHigh = literalValue.isPresent() ? literalValue.getAsDouble() : POSITIVE_INFINITY;
    StatisticRange excludedRange = new StatisticRange(excludedLow, excludedHigh, 1);

    StatisticRange expressionRange = StatisticRange.from(expressionStatistics);
    StatisticRange overlap = expressionRange.intersect(excludedRange);
    double keptFraction = 1 - expressionRange.overlapPercentWith(overlap);
    double nonNullFraction = 1 - expressionStatistics.getNullsFraction();

    PlanNodeStatsEstimate.Builder result = PlanNodeStatsEstimate.buildFrom(inputStatistics);
    result.setOutputRowCount(keptFraction * nonNullFraction * inputStatistics.getOutputRowCount());

    if (expressionSymbol.isPresent()) {
        double remainingDistinctValues = max(expressionStatistics.getDistinctValuesCount() - 1, 0);
        SymbolStatsEstimate filteredSymbolStats = buildFrom(expressionStatistics)
                .setNullsFraction(0.0)
                .setDistinctValuesCount(remainingDistinctValues)
                .build();
        result = result.addSymbolStatistics(expressionSymbol.get(), filteredSymbolStats);
    }

    return result.build();
}
/**
 * Creates a builder pre-populated with every value of an existing estimate: low value, high
 * value, nulls fraction, average row size and distinct-values count.
 *
 * @param other estimate to copy the values from
 * @return a builder carrying the copied values
 */
public static Builder buildFrom(SymbolStatsEstimate other)
{
    double low = other.getLowValue();
    double high = other.getHighValue();
    double nullsFraction = other.getNullsFraction();
    double averageRowSize = other.getAverageRowSize();
    double distinctValues = other.getDistinctValuesCount();

    return builder()
            .setLowValue(low)
            .setHighValue(high)
            .setNullsFraction(nullsFraction)
            .setAverageRowSize(averageRowSize)
            .setDistinctValuesCount(distinctValues);
}
/**
 * Returns a check that compares a symbol's low value, high value, distinct-values count and
 * nulls fraction against the given estimate. The average row size is not compared.
 *
 * @param estimate estimate providing the expected values
 * @return a consumer applying those four assertions to a {@code SymbolStatsAssertion}
 */
private Consumer<SymbolStatsAssertion> equalTo(SymbolStatsEstimate estimate)
{
    return symbolAssert -> symbolAssert
            .lowValue(estimate.getLowValue())
            .highValue(estimate.getHighValue())
            .distinctValuesCount(estimate.getDistinctValuesCount())
            .nullsFraction(estimate.getNullsFraction());
}
private SymbolStatsEstimate estimateCoalesce(SymbolStatsEstimate left, SymbolStatsEstimate right) { // Question to reviewer: do you have a method to check if fraction is empty or saturated? if (left.getNullsFraction() == 0) { return left; } else if (left.getNullsFraction() == 1.0) { return right; } else { return SymbolStatsEstimate.builder() .setLowValue(min(left.getLowValue(), right.getLowValue())) .setHighValue(max(left.getHighValue(), right.getHighValue())) .setDistinctValuesCount(left.getDistinctValuesCount() + min(right.getDistinctValuesCount(), input.getOutputRowCount() * left.getNullsFraction())) .setNullsFraction(left.getNullsFraction() * right.getNullsFraction()) // TODO check if dataSize estimation method is correct .setAverageRowSize(max(left.getAverageRowSize(), right.getAverageRowSize())) .build(); } } }
/**
 * Compares two symbol estimates field by field with {@code assertEstimateEquals}, in the order
 * nulls fraction, low value, high value, distinct-values count, average row size.
 *
 * @param symbol symbol whose name is passed as the message argument
 * @param actual estimate under test
 * @param expected estimate holding the expected values
 */
private void assertSymbolStatsEqual(Symbol symbol, SymbolStatsEstimate actual, SymbolStatsEstimate expected)
{
    String name = symbol.getName();

    assertEstimateEquals(actual.getNullsFraction(), expected.getNullsFraction(), "nullsFraction mismatch for %s", name);
    assertEstimateEquals(actual.getLowValue(), expected.getLowValue(), "lowValue mismatch for %s", name);
    assertEstimateEquals(actual.getHighValue(), expected.getHighValue(), "highValue mismatch for %s", name);
    assertEstimateEquals(actual.getDistinctValuesCount(), expected.getDistinctValuesCount(), "distinct values count mismatch for %s", name);
    assertEstimateEquals(actual.getAverageRowSize(), expected.getAverageRowSize(), "average row size mismatch for %s", name);
}
}
public static PlanNodeStatsEstimate capStats(PlanNodeStatsEstimate stats, PlanNodeStatsEstimate cap) { if (stats.isOutputRowCountUnknown() || cap.isOutputRowCountUnknown()) { return PlanNodeStatsEstimate.unknown(); } PlanNodeStatsEstimate.Builder result = PlanNodeStatsEstimate.builder(); double cappedRowCount = min(stats.getOutputRowCount(), cap.getOutputRowCount()); result.setOutputRowCount(cappedRowCount); stats.getSymbolsWithKnownStatistics().forEach(symbol -> { SymbolStatsEstimate symbolStats = stats.getSymbolStatistics(symbol); SymbolStatsEstimate capSymbolStats = cap.getSymbolStatistics(symbol); SymbolStatsEstimate.Builder newSymbolStats = SymbolStatsEstimate.builder(); // for simplicity keep the average row size the same as in the input // in most cases the average row size doesn't change after applying filters newSymbolStats.setAverageRowSize(symbolStats.getAverageRowSize()); newSymbolStats.setDistinctValuesCount(min(symbolStats.getDistinctValuesCount(), capSymbolStats.getDistinctValuesCount())); newSymbolStats.setLowValue(max(symbolStats.getLowValue(), capSymbolStats.getLowValue())); newSymbolStats.setHighValue(min(symbolStats.getHighValue(), capSymbolStats.getHighValue())); double numberOfNulls = stats.getOutputRowCount() * symbolStats.getNullsFraction(); double capNumberOfNulls = cap.getOutputRowCount() * capSymbolStats.getNullsFraction(); double cappedNumberOfNulls = min(numberOfNulls, capNumberOfNulls); double cappedNullsFraction = cappedRowCount == 0 ? 1 : cappedNumberOfNulls / cappedRowCount; newSymbolStats.setNullsFraction(cappedNullsFraction); result.addSymbolStatistics(symbol, newSymbolStats.build()); }); return result.build(); }
/**
 * Asserts that the statistics under test match the expected estimate on nulls fraction, low
 * value, high value, distinct-values count and average row size, checked in that order.
 *
 * @param expected estimate holding the expected values
 * @return the assertion returned by the last check in the chain
 */
public SymbolStatsAssertion isEqualTo(SymbolStatsEstimate expected)
{
    double expectedNullsFraction = expected.getNullsFraction();
    double expectedLow = expected.getLowValue();
    double expectedHigh = expected.getHighValue();
    double expectedDistinctValues = expected.getDistinctValuesCount();
    double expectedAverageRowSize = expected.getAverageRowSize();

    return this.nullsFraction(expectedNullsFraction)
            .lowValue(expectedLow)
            .highValue(expectedHigh)
            .distinctValuesCount(expectedDistinctValues)
            .averageRowSize(expectedAverageRowSize);
}
}