/**
 * Builds a stats estimate for {@code symbols} from a total output size.
 *
 * <p>The output row count is {@code outputSizeInBytes / symbols.size() / AVERAGE_ROW_SIZE}.
 * Every symbol is registered with a nulls fraction of 0 and an average row size of
 * {@code AVERAGE_ROW_SIZE}.
 *
 * @param symbols non-empty collection of distinct symbols (both conditions are enforced via {@code checkArgument})
 * @param outputSizeInBytes total output size used to derive the row count
 * @return the assembled estimate
 */
private static PlanNodeStatsEstimate statsEstimate(Collection<Symbol> symbols, double outputSizeInBytes)
{
    checkArgument(!symbols.isEmpty(), "No symbols");
    int symbolCount = symbols.size();
    checkArgument(ImmutableSet.copyOf(symbols).size() == symbolCount, "Duplicate symbols");

    PlanNodeStatsEstimate.Builder estimate = PlanNodeStatsEstimate.builder()
            .setOutputRowCount(outputSizeInBytes / symbolCount / AVERAGE_ROW_SIZE);

    // A fresh per-symbol estimate is built for each symbol, all with the same values.
    symbols.forEach(symbol -> estimate.addSymbolStatistics(
            symbol,
            SymbolStatsEstimate.builder()
                    .setNullsFraction(0)
                    .setAverageRowSize(AVERAGE_ROW_SIZE)
                    .build()));

    return estimate.build();
}
.setDistinctValuesCount(rowsPerPartition) .setNullsFraction(0.0) .setAverageRowSize(BIGINT.getFixedSize()) .build()) .build());
/**
 * Estimates the effect of restricting an expression to {@code filterRange}.
 *
 * <p>The output row count is scaled by the overlap between the expression's range and its
 * intersection with {@code filterRange}, and by the expression's non-null fraction. When
 * {@code expressionSymbol} is present, that symbol's statistics are replaced with the
 * intersected range, a nulls fraction of 0, and the expression's original average row size.
 *
 * @param inputStatistics statistics of the input
 * @param expressionStatistics statistics of the filtered expression
 * @param expressionSymbol symbol whose statistics should be updated, if any
 * @param filterRange range the filter restricts the expression to
 * @return the filtered estimate
 */
private static PlanNodeStatsEstimate estimateFilterRange(
        PlanNodeStatsEstimate inputStatistics,
        SymbolStatsEstimate expressionStatistics,
        Optional<Symbol> expressionSymbol,
        StatisticRange filterRange)
{
    StatisticRange sourceRange = StatisticRange.from(expressionStatistics);
    StatisticRange retainedRange = sourceRange.intersect(filterRange);
    double overlap = sourceRange.overlapPercentWith(retainedRange);

    PlanNodeStatsEstimate filtered = inputStatistics.mapOutputRowCount(
            rowCount -> overlap * (1 - expressionStatistics.getNullsFraction()) * rowCount);

    // Nothing to rewrite when the expression is not backed by a symbol.
    if (!expressionSymbol.isPresent()) {
        return filtered;
    }

    SymbolStatsEstimate narrowedStats = SymbolStatsEstimate.builder()
            .setAverageRowSize(expressionStatistics.getAverageRowSize())
            .setStatisticsRange(retainedRange)
            .setNullsFraction(0.0)
            .build();
    return filtered.mapSymbolColumnStatistics(expressionSymbol.get(), previous -> narrowedStats);
}
.setDistinctValuesCount(4) .setNullsFraction(0.1) .setAverageRowSize(2.0) .build()) .addSymbolStatistics(new Symbol("y"), SymbolStatsEstimate.builder() .setDistinctValuesCount(3) .setNullsFraction(0.2) .setAverageRowSize(2.0) .build()) .setOutputRowCount(10)
.setDistinctValuesCount(4) .setNullsFraction(0.1) .setAverageRowSize(2.0) .build()) .addSymbolStatistics(new Symbol("y"), SymbolStatsEstimate.builder() .setDistinctValuesCount(3) .setNullsFraction(0.2) .setAverageRowSize(2.0) .build()) .setOutputRowCount(10)
/**
 * Estimates the result of an equality comparison between two expressions.
 *
 * <p>Returns {@code PlanNodeStatsEstimate.unknown()} when either side's distinct values
 * count is NaN. Otherwise the output row count is the input row count multiplied by the
 * product of both non-null fractions and by {@code 1 / max(leftNdv, rightNdv, 1)}. Each
 * present symbol receives the same statistics: the intersected range, a nulls fraction of 0,
 * a distinct values count of {@code min(leftNdv, rightNdv)}, and the NaN-excluding average
 * of both average row sizes.
 *
 * @param inputStatistics statistics of the input
 * @param leftExpressionStatistics statistics of the left expression
 * @param leftExpressionSymbol symbol backing the left expression, if any
 * @param rightExpressionStatistics statistics of the right expression
 * @param rightExpressionSymbol symbol backing the right expression, if any
 * @return the estimate after applying the equality
 */
private static PlanNodeStatsEstimate estimateExpressionEqualToExpression(
        PlanNodeStatsEstimate inputStatistics,
        SymbolStatsEstimate leftExpressionStatistics,
        Optional<Symbol> leftExpressionSymbol,
        SymbolStatsEstimate rightExpressionStatistics,
        Optional<Symbol> rightExpressionSymbol)
{
    boolean leftNdvUnknown = isNaN(leftExpressionStatistics.getDistinctValuesCount());
    if (leftNdvUnknown || isNaN(rightExpressionStatistics.getDistinctValuesCount())) {
        return PlanNodeStatsEstimate.unknown();
    }

    StatisticRange leftRange = StatisticRange.from(leftExpressionStatistics);
    StatisticRange rightRange = StatisticRange.from(rightExpressionStatistics);
    StatisticRange commonRange = leftRange.intersect(rightRange);

    double nonNullFactor = (1 - leftExpressionStatistics.getNullsFraction()) * (1 - rightExpressionStatistics.getNullsFraction());
    double ndvLeft = leftRange.getDistinctValuesCount();
    double ndvRight = rightRange.getDistinctValuesCount();
    double selectivity = 1.0 / max(ndvLeft, ndvRight, 1);
    double ndvAfterJoin = min(ndvLeft, ndvRight);

    PlanNodeStatsEstimate.Builder result = PlanNodeStatsEstimate.buildFrom(inputStatistics)
            .setOutputRowCount(inputStatistics.getOutputRowCount() * nonNullFactor * selectivity);

    double mergedRowSize = averageExcludingNaNs(
            leftExpressionStatistics.getAverageRowSize(),
            rightExpressionStatistics.getAverageRowSize());
    // The distinct values count is set after the range, so it is the last write for that field.
    SymbolStatsEstimate sharedStats = SymbolStatsEstimate.builder()
            .setAverageRowSize(mergedRowSize)
            .setNullsFraction(0)
            .setStatisticsRange(commonRange)
            .setDistinctValuesCount(ndvAfterJoin)
            .build();

    if (leftExpressionSymbol.isPresent()) {
        result.addSymbolStatistics(leftExpressionSymbol.get(), sharedStats);
    }
    if (rightExpressionSymbol.isPresent()) {
        result.addSymbolStatistics(rightExpressionSymbol.get(), sharedStats);
    }
    return result.build();
}
.setDistinctValuesCount(4) .setNullsFraction(0.1) .setAverageRowSize(0) .build()) .addSymbolStatistics(new Symbol("all_null"), allNullStats)
private SymbolStatsEstimate estimateCoalesce(SymbolStatsEstimate left, SymbolStatsEstimate right) { // Question to reviewer: do you have a method to check if fraction is empty or saturated? if (left.getNullsFraction() == 0) { return left; } else if (left.getNullsFraction() == 1.0) { return right; } else { return SymbolStatsEstimate.builder() .setLowValue(min(left.getLowValue(), right.getLowValue())) .setHighValue(max(left.getHighValue(), right.getHighValue())) .setDistinctValuesCount(left.getDistinctValuesCount() + min(right.getDistinctValuesCount(), input.getOutputRowCount() * left.getNullsFraction())) .setNullsFraction(left.getNullsFraction() * right.getNullsFraction()) // TODO check if dataSize estimation method is correct .setAverageRowSize(max(left.getAverageRowSize(), right.getAverageRowSize())) .build(); } } }
public static PlanNodeStatsEstimate capStats(PlanNodeStatsEstimate stats, PlanNodeStatsEstimate cap) { if (stats.isOutputRowCountUnknown() || cap.isOutputRowCountUnknown()) { return PlanNodeStatsEstimate.unknown(); } PlanNodeStatsEstimate.Builder result = PlanNodeStatsEstimate.builder(); double cappedRowCount = min(stats.getOutputRowCount(), cap.getOutputRowCount()); result.setOutputRowCount(cappedRowCount); stats.getSymbolsWithKnownStatistics().forEach(symbol -> { SymbolStatsEstimate symbolStats = stats.getSymbolStatistics(symbol); SymbolStatsEstimate capSymbolStats = cap.getSymbolStatistics(symbol); SymbolStatsEstimate.Builder newSymbolStats = SymbolStatsEstimate.builder(); // for simplicity keep the average row size the same as in the input // in most cases the average row size doesn't change after applying filters newSymbolStats.setAverageRowSize(symbolStats.getAverageRowSize()); newSymbolStats.setDistinctValuesCount(min(symbolStats.getDistinctValuesCount(), capSymbolStats.getDistinctValuesCount())); newSymbolStats.setLowValue(max(symbolStats.getLowValue(), capSymbolStats.getLowValue())); newSymbolStats.setHighValue(min(symbolStats.getHighValue(), capSymbolStats.getHighValue())); double numberOfNulls = stats.getOutputRowCount() * symbolStats.getNullsFraction(); double capNumberOfNulls = cap.getOutputRowCount() * capSymbolStats.getNullsFraction(); double cappedNumberOfNulls = min(numberOfNulls, capNumberOfNulls); double cappedNullsFraction = cappedRowCount == 0 ? 1 : cappedNumberOfNulls / cappedRowCount; newSymbolStats.setNullsFraction(cappedNullsFraction); result.addSymbolStatistics(symbol, newSymbolStats.build()); }); return result.build(); }
private static SymbolStatsEstimate addColumnStats(SymbolStatsEstimate leftStats, double leftRows, SymbolStatsEstimate rightStats, double rightRows, double newRowCount, RangeAdditionStrategy strategy) { checkArgument(newRowCount > 0, "newRowCount must be greater than zero"); StatisticRange leftRange = StatisticRange.from(leftStats); StatisticRange rightRange = StatisticRange.from(rightStats); StatisticRange sum = strategy.add(leftRange, rightRange); double nullsCountRight = rightStats.getNullsFraction() * rightRows; double nullsCountLeft = leftStats.getNullsFraction() * leftRows; double totalSizeLeft = (leftRows - nullsCountLeft) * leftStats.getAverageRowSize(); double totalSizeRight = (rightRows - nullsCountRight) * rightStats.getAverageRowSize(); double newNullsFraction = (nullsCountLeft + nullsCountRight) / newRowCount; double newNonNullsRowCount = newRowCount * (1.0 - newNullsFraction); // FIXME, weights to average. left and right should be equal in most cases anyway double newAverageRowSize = newNonNullsRowCount == 0 ? 0 : ((totalSizeLeft + totalSizeRight) / newNonNullsRowCount); return SymbolStatsEstimate.builder() .setStatisticsRange(sum) .setAverageRowSize(newAverageRowSize) .setNullsFraction(newNullsFraction) .build(); } }
/**
 * Casting symbol "a" to double is expected to keep its low/high values, distinct values
 * count and nulls fraction, and to report the data size as unknown.
 */
@Test
public void testCastBigintToDouble()
{
    SymbolStatsEstimate sourceStats = SymbolStatsEstimate.builder()
            .setNullsFraction(0.3)
            .setLowValue(2.0)
            .setHighValue(10.0)
            .setDistinctValuesCount(4)
            .setAverageRowSize(2.0)
            .build();
    PlanNodeStatsEstimate inputStatistics = PlanNodeStatsEstimate.builder()
            .addSymbolStatistics(new Symbol("a"), sourceStats)
            .build();

    Cast castToDouble = new Cast(new SymbolReference("a"), "double");

    assertCalculate(castToDouble, inputStatistics)
            .lowValue(2.0)
            .highValue(10.0)
            .distinctValuesCount(4)
            .nullsFraction(0.3)
            .dataSizeUnknown();
}
/**
 * Casting symbol "a" with range [1.6, 17.3] to bigint is expected to yield the range
 * [2.0, 17.0], keep the distinct values count and nulls fraction, and report the data size
 * as unknown.
 */
@Test
public void testCastDoubleToBigint()
{
    SymbolStatsEstimate sourceStats = SymbolStatsEstimate.builder()
            .setNullsFraction(0.3)
            .setLowValue(1.6)
            .setHighValue(17.3)
            .setDistinctValuesCount(10)
            .setAverageRowSize(2.0)
            .build();
    PlanNodeStatsEstimate inputStatistics = PlanNodeStatsEstimate.builder()
            .addSymbolStatistics(new Symbol("a"), sourceStats)
            .build();

    Cast castToBigint = new Cast(new SymbolReference("a"), "bigint");

    assertCalculate(castToBigint, inputStatistics)
            .lowValue(2.0)
            .highValue(17.0)
            .distinctValuesCount(10)
            .nullsFraction(0.3)
            .dataSizeUnknown();
}
/**
 * Casting symbol "a" with range [1.6, 3.3] and no distinct values count to bigint is
 * expected to yield the range [2.0, 3.0], leave the distinct values count unknown, keep the
 * nulls fraction, and report the data size as unknown.
 */
@Test
public void testCastDoubleToShortRangeUnknownDistinctValuesCount()
{
    // The distinct values count is deliberately not set on the input.
    SymbolStatsEstimate sourceStats = SymbolStatsEstimate.builder()
            .setNullsFraction(0.3)
            .setLowValue(1.6)
            .setHighValue(3.3)
            .setAverageRowSize(2.0)
            .build();
    PlanNodeStatsEstimate inputStatistics = PlanNodeStatsEstimate.builder()
            .addSymbolStatistics(new Symbol("a"), sourceStats)
            .build();

    Cast castToBigint = new Cast(new SymbolReference("a"), "bigint");

    assertCalculate(castToBigint, inputStatistics)
            .lowValue(2.0)
            .highValue(3.0)
            .distinctValuesCountUnknown()
            .nullsFraction(0.3)
            .dataSizeUnknown();
}
/**
 * Casting symbol "a" with range [1.6, 3.3] and 10 distinct values to bigint is expected to
 * yield the range [2.0, 3.0] with the distinct values count reduced to 2, keep the nulls
 * fraction, and report the data size as unknown.
 */
@Test
public void testCastDoubleToShortRange()
{
    SymbolStatsEstimate sourceStats = SymbolStatsEstimate.builder()
            .setNullsFraction(0.3)
            .setLowValue(1.6)
            .setHighValue(3.3)
            .setDistinctValuesCount(10)
            .setAverageRowSize(2.0)
            .build();
    PlanNodeStatsEstimate inputStatistics = PlanNodeStatsEstimate.builder()
            .addSymbolStatistics(new Symbol("a"), sourceStats)
            .build();

    Cast castToBigint = new Cast(new SymbolReference("a"), "bigint");

    assertCalculate(castToBigint, inputStatistics)
            .lowValue(2.0)
            .highValue(3.0)
            .distinctValuesCount(2)
            .nullsFraction(0.3)
            .dataSizeUnknown();
}
/**
 * Converts table-level and column-level statistics into a symbol estimate.
 *
 * <p>The average row size is the column's data size divided by the number of non-null rows
 * (table row count times one minus the nulls fraction), or 0 when there are no non-null
 * rows. Low and high values are set only when the column reports a range.
 *
 * @param tableStatistics statistics of the table, used for its row count
 * @param columnStatistics statistics of the column being converted
 * @return the symbol estimate for the column
 */
private SymbolStatsEstimate toSymbolStatistics(TableStatistics tableStatistics, ColumnStatistics columnStatistics)
{
    double nullsFraction = columnStatistics.getNullsFraction().getValue();
    double nonNullRows = tableStatistics.getRowCount().getValue() * (1.0 - nullsFraction);

    // The data size is only read when there is at least one non-null row to divide by.
    double averageRowSize;
    if (nonNullRows == 0) {
        averageRowSize = 0;
    }
    else {
        averageRowSize = columnStatistics.getDataSize().getValue() / nonNullRows;
    }

    SymbolStatsEstimate.Builder symbolStats = SymbolStatsEstimate.builder()
            .setNullsFraction(nullsFraction)
            .setDistinctValuesCount(columnStatistics.getDistinctValuesCount().getValue())
            .setAverageRowSize(averageRowSize);

    columnStatistics.getRange().ifPresent(valueRange -> {
        symbolStats.setLowValue(valueRange.getMin());
        symbolStats.setHighValue(valueRange.getMax());
    });

    return symbolStats.build();
}
} // closes the enclosing scope opened before this block
/**
 * A reference to a symbol present in the input is expected to return that symbol's
 * statistics as-is; a reference to a symbol absent from the input is expected to return
 * {@code SymbolStatsEstimate.unknown()}.
 */
@Test
public void testSymbolReference()
{
    Symbol knownSymbol = new Symbol("x");
    SymbolStatsEstimate knownStats = SymbolStatsEstimate.builder()
            .setLowValue(-1)
            .setHighValue(10)
            .setDistinctValuesCount(4)
            .setNullsFraction(0.1)
            .setAverageRowSize(2.0)
            .build();

    PlanNodeStatsEstimate inputStatistics = PlanNodeStatsEstimate.builder()
            .addSymbolStatistics(knownSymbol, knownStats)
            .build();

    // "x" is registered in the input, "y" is not.
    assertCalculate(expression("x"), inputStatistics).isEqualTo(knownStats);
    assertCalculate(expression("y"), inputStatistics).isEqualTo(SymbolStatsEstimate.unknown());
}
/**
 * Checks the estimate for the filter {@code x = x}: 750 output rows are expected.
 *
 * <p>NOTE(review): the lambda passed to {@code symbolStats} ignores its {@code symbolStats}
 * argument and builds a standalone {@code SymbolStatsEstimate}. Whether that value is
 * compared against the actual statistics of "x" or simply discarded depends on the
 * signature of {@code symbolStats}, which is not visible here. Confirm that this actually
 * asserts something about "x".
 */
@Test public void testSymbolEqualsSameSymbolFilter() { assertExpression("x = x") .outputRowsCount(750) .symbolStats("x", symbolStats -> SymbolStatsEstimate.builder() .setAverageRowSize(4.0) .setDistinctValuesCount(40.0) .setLowValue(-10.0) .setHighValue(10.0) .build()); }
/**
 * Computes statistics for an {@code AssignUniqueId} node.
 *
 * <p>The result starts from the source node's statistics and adds statistics for the id
 * column: a distinct values count equal to the source's output row count, a nulls fraction
 * of 0, and an average row size of {@code BIGINT.getFixedSize()}.
 *
 * @return the estimate, always present
 */
@Override
public Optional<PlanNodeStatsEstimate> calculate(AssignUniqueId assignUniqueId, StatsProvider statsProvider, Lookup lookup, Session session, TypeProvider types)
{
    PlanNodeStatsEstimate sourceStats = statsProvider.getStats(assignUniqueId.getSource());
    PlanNodeStatsEstimate.Builder withIdColumn = PlanNodeStatsEstimate.buildFrom(sourceStats);

    SymbolStatsEstimate idColumnStats = SymbolStatsEstimate.builder()
            .setDistinctValuesCount(sourceStats.getOutputRowCount())
            .setNullsFraction(0.0)
            .setAverageRowSize(BIGINT.getFixedSize())
            .build();
    withIdColumn.addSymbolStatistics(assignUniqueId.getIdColumn(), idColumnStats);

    return Optional.of(withIdColumn.build());
}
} // closes the enclosing scope opened before this block
/**
 * Builds an estimate with the given row count and a single entry for {@code SYMBOL}
 * carrying the given nulls fraction, average row size and statistics range.
 *
 * @param rowCount output row count of the estimate
 * @param nullsFraction nulls fraction recorded for {@code SYMBOL}
 * @param averageRowSize average row size recorded for {@code SYMBOL}
 * @param range statistics range recorded for {@code SYMBOL}
 * @return the assembled estimate
 */
private static PlanNodeStatsEstimate statistics(double rowCount, double nullsFraction, double averageRowSize, StatisticRange range)
{
    SymbolStatsEstimate symbolStats = SymbolStatsEstimate.builder()
            .setNullsFraction(nullsFraction)
            .setAverageRowSize(averageRowSize)
            .setStatisticsRange(range)
            .build();

    return PlanNodeStatsEstimate.builder()
            .setOutputRowCount(rowCount)
            .addSymbolStatistics(SYMBOL, symbolStats)
            .build();
}
/**
 * Creates a builder pre-populated from {@code other}: low value, high value, nulls
 * fraction, average row size and distinct values count are copied, in that order.
 *
 * @param other estimate to copy values from
 * @return a builder holding the copied values
 */
public static Builder buildFrom(SymbolStatsEstimate other)
{
    Builder copy = builder();
    copy.setLowValue(other.getLowValue());
    copy.setHighValue(other.getHighValue());
    copy.setNullsFraction(other.getNullsFraction());
    copy.setAverageRowSize(other.getAverageRowSize());
    copy.setDistinctValuesCount(other.getDistinctValuesCount());
    return copy;
}