/**
 * Extracts the nulls fraction of this metric's column from a plan node estimate.
 *
 * @param planNodeStatsEstimate estimate to read the column statistics from
 * @param statsContext context forwarded to the symbol statistics lookup
 * @return the nulls fraction, converted by {@code asOptional}
 */
@Override
public OptionalDouble getValueFromPlanNodeEstimate(PlanNodeStatsEstimate planNodeStatsEstimate, StatsContext statsContext) {
    double nullsFraction = getSymbolStatistics(planNodeStatsEstimate, columnName, statsContext).getNullsFraction();
    return asOptional(nullsFraction);
}
/**
 * Asserts that the nulls fraction of the inspected statistics is NaN.
 *
 * @return this assertion, to allow chaining
 */
public SymbolStatsAssertion nullsFractionUnknown() {
    double actual = statistics.getNullsFraction();
    assertTrue(isNaN(actual), "expected unknown nullsFraction but got " + actual);
    return this;
}
private double getOutputSizeForSymbol(SymbolStatsEstimate symbolStatistics, Type type) { checkArgument(type != null, "type is null"); double averageRowSize = symbolStatistics.getAverageRowSize(); double nullsFraction = firstNonNaN(symbolStatistics.getNullsFraction(), 0d); double numberOfNonNullRows = outputRowCount * (1.0 - nullsFraction); if (isNaN(averageRowSize)) { if (type instanceof FixedWidthType) { averageRowSize = ((FixedWidthType) type).getFixedSize(); } else { averageRowSize = DEFAULT_DATA_SIZE_PER_COLUMN; } } double outputSize = numberOfNonNullRows * averageRowSize; // account for "is null" boolean array outputSize += outputRowCount * Byte.BYTES; // account for offsets array for variable width types if (type instanceof VariableWidthType) { outputSize += outputRowCount * Integer.BYTES; } return outputSize; }
/**
 * Asserts that the nulls fraction of the inspected statistics matches the expected value,
 * as judged by {@code assertEstimateEquals}.
 *
 * @param expected expected nulls fraction
 * @return this assertion, to allow chaining
 */
public SymbolStatsAssertion nullsFraction(double expected) {
    double actual = statistics.getNullsFraction();
    assertEstimateEquals(actual, expected, "nullsFraction mismatch");
    return this;
}
/**
 * Caps the distinct values count so that it is consistent with the row count.
 *
 * <p>The statistics are returned unchanged when any of the distinct values count, row count
 * or nulls fraction is NaN, or when the distinct values count already fits within the
 * number of non-null rows. Otherwise the distinct values count becomes the midpoint between
 * {@code min(ndv, rowCount)} and the non-null row count, and the nulls fraction is halved.
 *
 * @param symbolStats statistics to adjust
 * @param rowCount row count to cap against
 * @return the original or the adjusted statistics
 */
private SymbolStatsEstimate capNDV(SymbolStatsEstimate symbolStats, double rowCount) {
    double distinctValues = symbolStats.getDistinctValuesCount();
    double nullsFraction = symbolStats.getNullsFraction();

    boolean anyUnknown = isNaN(distinctValues) || isNaN(rowCount) || isNaN(nullsFraction);
    double nonNullRows = rowCount * (1 - nullsFraction);
    if (anyUnknown || distinctValues <= nonNullRows) {
        return symbolStats;
    }

    double cappedDistinctValues = (min(distinctValues, rowCount) + nonNullRows) / 2;
    double halvedNullsFraction = nullsFraction / 2;
    return symbolStats
            .mapDistinctValuesCount(ignored -> cappedDistinctValues)
            .mapNullsFraction(ignored -> halvedNullsFraction);
}
/**
 * Asserts that the inspected statistics describe an empty range: both bounds are NaN,
 * the distinct values count and average row size are 0, and the nulls fraction is 1.
 *
 * @return this assertion, to allow chaining
 */
public SymbolStatsAssertion emptyRange() {
    double low = statistics.getLowValue();
    double high = statistics.getHighValue();
    boolean boundsAreNaN = isNaN(low) && isNaN(high);
    assertTrue(boundsAreNaN, "expected empty range (NaN, NaN) but got (" + low + ", " + high + ") instead");

    assertEquals(statistics.getDistinctValuesCount(), 0.0, "expected no distinctValuesCount");
    assertEquals(statistics.getAverageRowSize(), 0.0, "expected 0 average row size");
    assertEquals(statistics.getNullsFraction(), 1.0, "expected all nulls");
    return this;
}
private SymbolStatsEstimate estimateCoalesce(SymbolStatsEstimate left, SymbolStatsEstimate right) { // Question to reviewer: do you have a method to check if fraction is empty or saturated? if (left.getNullsFraction() == 0) { return left; } else if (left.getNullsFraction() == 1.0) { return right; } else { return SymbolStatsEstimate.builder() .setLowValue(min(left.getLowValue(), right.getLowValue())) .setHighValue(max(left.getHighValue(), right.getHighValue())) .setDistinctValuesCount(left.getDistinctValuesCount() + min(right.getDistinctValuesCount(), input.getOutputRowCount() * left.getNullsFraction())) .setNullsFraction(left.getNullsFraction() * right.getNullsFraction()) // TODO check if dataSize estimation method is correct .setAverageRowSize(max(left.getAverageRowSize(), right.getAverageRowSize())) .build(); } } }
private static SymbolStatsEstimate addColumnStats(SymbolStatsEstimate leftStats, double leftRows, SymbolStatsEstimate rightStats, double rightRows, double newRowCount, RangeAdditionStrategy strategy) { checkArgument(newRowCount > 0, "newRowCount must be greater than zero"); StatisticRange leftRange = StatisticRange.from(leftStats); StatisticRange rightRange = StatisticRange.from(rightStats); StatisticRange sum = strategy.add(leftRange, rightRange); double nullsCountRight = rightStats.getNullsFraction() * rightRows; double nullsCountLeft = leftStats.getNullsFraction() * leftRows; double totalSizeLeft = (leftRows - nullsCountLeft) * leftStats.getAverageRowSize(); double totalSizeRight = (rightRows - nullsCountRight) * rightStats.getAverageRowSize(); double newNullsFraction = (nullsCountLeft + nullsCountRight) / newRowCount; double newNonNullsRowCount = newRowCount * (1.0 - newNullsFraction); // FIXME, weights to average. left and right should be equal in most cases anyway double newAverageRowSize = newNonNullsRowCount == 0 ? 0 : ((totalSizeLeft + totalSizeRight) / newNonNullsRowCount); return SymbolStatsEstimate.builder() .setStatisticsRange(sum) .setAverageRowSize(newAverageRowSize) .setNullsFraction(newNullsFraction) .build(); } }
/**
 * Estimates the statistics of a group-by over the given source.
 *
 * <p>Each grouping symbol keeps its source statistics except for the nulls fraction, which
 * stays 0 when it was 0 and otherwise becomes {@code 1 / (distinctValues + 1)}. The output
 * row count is the product, over all grouping symbols, of the distinct values count (plus
 * one when the symbol's nulls fraction is not 0), capped by the source row count.
 * Statistics for every aggregation are added from {@code estimateAggregationStats}.
 *
 * @param sourceStats statistics of the input
 * @param groupBySymbols grouping symbols
 * @param aggregations aggregations keyed by their output symbol
 * @return the estimated statistics of the aggregation output
 */
public static PlanNodeStatsEstimate groupBy(PlanNodeStatsEstimate sourceStats, Collection<Symbol> groupBySymbols, Map<Symbol, Aggregation> aggregations) {
    PlanNodeStatsEstimate.Builder result = PlanNodeStatsEstimate.builder();

    double groupCount = 1;
    for (Symbol groupBySymbol : groupBySymbols) {
        SymbolStatsEstimate keyStats = sourceStats.getSymbolStatistics(groupBySymbol);
        double distinctValues = keyStats.getDistinctValuesCount();

        result.addSymbolStatistics(
                groupBySymbol,
                keyStats.mapNullsFraction(nullsFraction -> nullsFraction == 0.0 ? 0.0 : 1.0 / (distinctValues + 1)));

        // A key with nulls yields one extra group for the null value.
        int nullGroup = (keyStats.getNullsFraction() == 0.0) ? 0 : 1;
        groupCount *= distinctValues + nullGroup;
    }

    result.setOutputRowCount(min(groupCount, sourceStats.getOutputRowCount()));

    aggregations.forEach((outputSymbol, aggregation) ->
            result.addSymbolStatistics(outputSymbol, estimateAggregationStats(aggregation, sourceStats)));

    return result.build();
}
/**
 * Asserts the nulls fraction of {@code SYMBOL} after {@code capStats(stats, cap)}.
 *
 * @param stats statistics to cap
 * @param cap statistics used as the cap
 * @param expected expected nulls fraction of the result
 */
private static void assertCapNullsFraction(PlanNodeStatsEstimate stats, PlanNodeStatsEstimate cap, double expected) {
    double actual = capStats(stats, cap).getSymbolStatistics(SYMBOL).getNullsFraction();
    assertEquals(actual, expected);
}
/**
 * Asserts the nulls fraction of {@code SYMBOL} after
 * {@code addStatsAndSumDistinctValues(first, second)}.
 *
 * @param first first operand of the addition
 * @param second second operand of the addition
 * @param expected expected nulls fraction of the result
 */
private static void assertAddNullsFraction(PlanNodeStatsEstimate first, PlanNodeStatsEstimate second, double expected) {
    double actual = addStatsAndSumDistinctValues(first, second).getSymbolStatistics(SYMBOL).getNullsFraction();
    assertEquals(actual, expected);
}
/**
 * Asserts the nulls fraction of {@code SYMBOL} after {@code subtractSubsetStats(first, second)}.
 *
 * @param first statistics to subtract from
 * @param second statistics being subtracted
 * @param expected expected nulls fraction of the result
 */
private static void assertSubtractNullsFraction(PlanNodeStatsEstimate first, PlanNodeStatsEstimate second, double expected) {
    double actual = subtractSubsetStats(first, second).getSymbolStatistics(SYMBOL).getNullsFraction();
    assertEquals(actual, expected);
}
/**
 * Estimates the effect of restricting an expression to a value range.
 *
 * <p>The output row count is scaled by the share of the expression's range that overlaps
 * the filter range and by the expression's non-null fraction. When the expression is a
 * symbol, its statistics are replaced by the intersected range with a nulls fraction of 0
 * and the original average row size.
 *
 * @param inputStatistics statistics before filtering
 * @param expressionStatistics statistics of the filtered expression
 * @param expressionSymbol the symbol behind the expression, if it is one
 * @param filterRange range the expression is restricted to
 * @return the estimated statistics after filtering
 */
private static PlanNodeStatsEstimate estimateFilterRange(
        PlanNodeStatsEstimate inputStatistics,
        SymbolStatsEstimate expressionStatistics,
        Optional<Symbol> expressionSymbol,
        StatisticRange filterRange) {
    StatisticRange sourceRange = StatisticRange.from(expressionStatistics);
    StatisticRange keptRange = sourceRange.intersect(filterRange);

    double selectivity = sourceRange.overlapPercentWith(keptRange) * (1 - expressionStatistics.getNullsFraction());
    PlanNodeStatsEstimate filtered = inputStatistics.mapOutputRowCount(rowCount -> selectivity * rowCount);

    if (!expressionSymbol.isPresent()) {
        return filtered;
    }

    SymbolStatsEstimate narrowedStats = SymbolStatsEstimate.builder()
            .setAverageRowSize(expressionStatistics.getAverageRowSize())
            .setStatisticsRange(keptRange)
            .setNullsFraction(0.0)
            .build();
    return filtered.mapSymbolColumnStatistics(expressionSymbol.get(), previous -> narrowedStats);
}
/**
 * Estimates the effect of an "expression != literal" comparison.
 *
 * <p>The literal is modelled as a range holding one distinct value: the single point of the
 * literal when its value is known, otherwise the whole real line. The output row count is
 * scaled by the share of the expression's range that does not overlap that range and by
 * the expression's non-null fraction. When the expression is a symbol, its statistics get a
 * nulls fraction of 0 and one distinct value fewer (not below 0).
 *
 * @param inputStatistics statistics before filtering
 * @param expressionStatistics statistics of the compared expression
 * @param expressionSymbol the symbol behind the expression, if it is one
 * @param literalValue numeric value of the literal, if known
 * @return the estimated statistics after filtering
 */
private static PlanNodeStatsEstimate estimateExpressionNotEqualToLiteral(
        PlanNodeStatsEstimate inputStatistics,
        SymbolStatsEstimate expressionStatistics,
        Optional<Symbol> expressionSymbol,
        OptionalDouble literalValue) {
    StatisticRange sourceRange = StatisticRange.from(expressionStatistics);

    // Known literal: a single point. Unknown literal: an unbounded range.
    double literalLow = literalValue.orElse(NEGATIVE_INFINITY);
    double literalHigh = literalValue.orElse(POSITIVE_INFINITY);
    StatisticRange literalRange = new StatisticRange(literalLow, literalHigh, 1);

    StatisticRange excludedRange = sourceRange.intersect(literalRange);
    double keptShare = 1 - sourceRange.overlapPercentWith(excludedRange);

    PlanNodeStatsEstimate.Builder result = PlanNodeStatsEstimate.buildFrom(inputStatistics);
    result.setOutputRowCount(keptShare * (1 - expressionStatistics.getNullsFraction()) * inputStatistics.getOutputRowCount());

    if (!expressionSymbol.isPresent()) {
        return result.build();
    }

    SymbolStatsEstimate filteredSymbolStats = buildFrom(expressionStatistics)
            .setNullsFraction(0.0)
            .setDistinctValuesCount(max(expressionStatistics.getDistinctValuesCount() - 1, 0))
            .build();
    return result.addSymbolStatistics(expressionSymbol.get(), filteredSymbolStats).build();
}
/**
 * Creates a builder pre-populated with the values of an existing estimate.
 *
 * @param other estimate whose low value, high value, nulls fraction, average row size and
 *        distinct values count are copied
 * @return a builder initialised from {@code other}
 */
public static Builder buildFrom(SymbolStatsEstimate other) {
    double low = other.getLowValue();
    double high = other.getHighValue();
    double nullsFraction = other.getNullsFraction();
    double averageRowSize = other.getAverageRowSize();
    double distinctValues = other.getDistinctValuesCount();

    return builder()
            .setLowValue(low)
            .setHighValue(high)
            .setNullsFraction(nullsFraction)
            .setAverageRowSize(averageRowSize)
            .setDistinctValuesCount(distinctValues);
}
/**
 * Estimates the statistics after an {@code IS NOT NULL} predicate.
 *
 * <p>Only a predicate over a plain symbol reference is estimated: the row count is scaled
 * by the symbol's non-null fraction and the symbol's nulls fraction is set to 0. Any other
 * operand yields an unknown estimate.
 *
 * @param node the predicate
 * @param context unused
 * @return the estimated statistics, or unknown
 */
@Override
protected PlanNodeStatsEstimate visitIsNotNullPredicate(IsNotNullPredicate node, Void context) {
    if (!(node.getValue() instanceof SymbolReference)) {
        return PlanNodeStatsEstimate.unknown();
    }

    Symbol symbol = Symbol.from(node.getValue());
    SymbolStatsEstimate symbolStats = input.getSymbolStatistics(symbol);
    double nonNullFraction = 1 - symbolStats.getNullsFraction();

    PlanNodeStatsEstimate.Builder filtered = PlanNodeStatsEstimate.buildFrom(input);
    filtered.setOutputRowCount(input.getOutputRowCount() * nonNullFraction);
    filtered.addSymbolStatistics(symbol, symbolStats.mapNullsFraction(ignored -> 0.0));
    return filtered.build();
}
/**
 * Builds a check that compares a symbol's statistics against the given estimate.
 *
 * <p>The check covers the low value, high value, distinct values count and nulls fraction.
 *
 * @param estimate the expected statistics
 * @return a consumer applying those assertions to a {@code SymbolStatsAssertion}
 */
private Consumer<SymbolStatsAssertion> equalTo(SymbolStatsEstimate estimate) {
    return symbolAssert -> symbolAssert
            .lowValue(estimate.getLowValue())
            .highValue(estimate.getHighValue())
            .distinctValuesCount(estimate.getDistinctValuesCount())
            .nullsFraction(estimate.getNullsFraction());
}
/**
 * Asserts that two symbol estimates agree on nulls fraction, low value, high value,
 * distinct values count and average row size, naming the symbol in each failure message.
 *
 * @param symbol symbol the statistics belong to
 * @param actual statistics under test
 * @param expected reference statistics
 */
private void assertSymbolStatsEqual(Symbol symbol, SymbolStatsEstimate actual, SymbolStatsEstimate expected) {
    String symbolName = symbol.getName();
    assertEstimateEquals(actual.getNullsFraction(), expected.getNullsFraction(), "nullsFraction mismatch for %s", symbolName);
    assertEstimateEquals(actual.getLowValue(), expected.getLowValue(), "lowValue mismatch for %s", symbolName);
    assertEstimateEquals(actual.getHighValue(), expected.getHighValue(), "highValue mismatch for %s", symbolName);
    assertEstimateEquals(actual.getDistinctValuesCount(), expected.getDistinctValuesCount(), "distinct values count mismatch for %s", symbolName);
    assertEstimateEquals(actual.getAverageRowSize(), expected.getAverageRowSize(), "average row size mismatch for %s", symbolName);
}
} // closes a scope opened outside this view
/**
 * Asserts that the inspected statistics match the expected estimate on nulls fraction,
 * low value, high value, distinct values count and average row size, in that order.
 *
 * @param expected the reference statistics
 * @return the assertion returned by the last check, to allow chaining
 */
public SymbolStatsAssertion isEqualTo(SymbolStatsEstimate expected) {
    SymbolStatsAssertion checked = nullsFraction(expected.getNullsFraction());
    checked = checked.lowValue(expected.getLowValue());
    checked = checked.highValue(expected.getHighValue());
    checked = checked.distinctValuesCount(expected.getDistinctValuesCount());
    return checked.averageRowSize(expected.getAverageRowSize());
}
} // closes a scope opened outside this view
/**
 * Estimates the statistics after an {@code IS NULL} predicate.
 *
 * <p>Only a predicate over a plain symbol reference is estimated: the row count is scaled
 * by the symbol's nulls fraction and the symbol's statistics become all-null (nulls
 * fraction 1, NaN bounds, 0 distinct values). Any other operand yields an unknown estimate.
 *
 * @param node the predicate
 * @param context unused
 * @return the estimated statistics, or unknown
 */
@Override
protected PlanNodeStatsEstimate visitIsNullPredicate(IsNullPredicate node, Void context) {
    if (!(node.getValue() instanceof SymbolReference)) {
        return PlanNodeStatsEstimate.unknown();
    }

    Symbol symbol = Symbol.from(node.getValue());
    SymbolStatsEstimate symbolStats = input.getSymbolStatistics(symbol);

    // After the filter every remaining value of the symbol is null.
    SymbolStatsEstimate allNullStats = SymbolStatsEstimate.builder()
            .setNullsFraction(1.0)
            .setLowValue(NaN)
            .setHighValue(NaN)
            .setDistinctValuesCount(0.0)
            .build();

    PlanNodeStatsEstimate.Builder filtered = PlanNodeStatsEstimate.buildFrom(input);
    filtered.setOutputRowCount(input.getOutputRowCount() * symbolStats.getNullsFraction());
    filtered.addSymbolStatistics(symbol, allNullStats);
    return filtered.build();
}