/**
 * Derives a statistics estimate from an explicit list of values.
 *
 * <p>Nulls are excluded from the range and distinct-count computation and
 * are reported through the nulls fraction. When every value is null (or the
 * list is empty) {@code SymbolStatsEstimate.zero()} is returned.
 *
 * @param values the values to summarize; may contain nulls
 * @param session passed through to {@code toStatsRepresentation}
 * @param type passed through to {@code toStatsRepresentation}
 * @return the estimate built from the values
 */
private SymbolStatsEstimate buildSymbolStatistics(List<Object> values, Session session, Type type) {
    List<Object> presentValues = values.stream()
            .filter(Objects::nonNull)
            .collect(toImmutableList());
    if (presentValues.isEmpty()) {
        return SymbolStatsEstimate.zero();
    }

    // Fold the range over the values that have a double representation.
    boolean sawNumeric = false;
    double lowValue = Double.POSITIVE_INFINITY;
    double highValue = Double.NEGATIVE_INFINITY;
    for (Object value : presentValues) {
        OptionalDouble numeric = toStatsRepresentation(metadata, session, type, value);
        if (numeric.isPresent()) {
            sawNumeric = true;
            lowValue = Math.min(lowValue, numeric.getAsDouble());
            highValue = Math.max(highValue, numeric.getAsDouble());
        }
    }
    if (!sawNumeric) {
        // No value could be represented as a double: fall back to an unbounded range.
        lowValue = Double.NEGATIVE_INFINITY;
        highValue = Double.POSITIVE_INFINITY;
    }

    double totalCount = values.size();
    double presentCount = presentValues.size();
    long distinctValuesCount = presentValues.stream().distinct().count();

    SymbolStatsEstimate.Builder estimate = SymbolStatsEstimate.builder();
    estimate.setNullsFraction((totalCount - presentCount) / totalCount);
    estimate.setLowValue(lowValue);
    estimate.setHighValue(highValue);
    estimate.setDistinctValuesCount(distinctValuesCount);
    return estimate.build();
}
}
/**
 * Estimates statistics for a literal: no nulls, exactly one distinct value,
 * and, when the value has a double representation, a range collapsed to that
 * single value.
 */
@Override
protected SymbolStatsEstimate visitLiteral(Literal node, Void context) {
    Object literalValue = evaluate(metadata, session.toConnectorSession(), node);
    Type literalType = ExpressionAnalyzer
            .createConstantAnalyzer(metadata, session, ImmutableList.of(), WarningCollector.NOOP)
            .analyze(node, Scope.create());
    OptionalDouble numeric = toStatsRepresentation(metadata, session, literalType, literalValue);

    SymbolStatsEstimate.Builder result = SymbolStatsEstimate.builder()
            .setNullsFraction(0)
            .setDistinctValuesCount(1);
    // The range is only set when the literal can be expressed as a double.
    numeric.ifPresent(point -> {
        result.setLowValue(point);
        result.setHighValue(point);
    });
    return result.build();
}
.setLowValue(lowValue) .setHighValue(highValue) .setDistinctValuesCount(distinctValuesCount)
PlanNodeStatsEstimate relationStats = PlanNodeStatsEstimate.builder() .addSymbolStatistics(new Symbol("x"), SymbolStatsEstimate.builder() .setLowValue(-1) .setHighValue(10) .setDistinctValuesCount(4)
public static PlanNodeStatsEstimate capStats(PlanNodeStatsEstimate stats, PlanNodeStatsEstimate cap) { if (stats.isOutputRowCountUnknown() || cap.isOutputRowCountUnknown()) { return PlanNodeStatsEstimate.unknown(); } PlanNodeStatsEstimate.Builder result = PlanNodeStatsEstimate.builder(); double cappedRowCount = min(stats.getOutputRowCount(), cap.getOutputRowCount()); result.setOutputRowCount(cappedRowCount); stats.getSymbolsWithKnownStatistics().forEach(symbol -> { SymbolStatsEstimate symbolStats = stats.getSymbolStatistics(symbol); SymbolStatsEstimate capSymbolStats = cap.getSymbolStatistics(symbol); SymbolStatsEstimate.Builder newSymbolStats = SymbolStatsEstimate.builder(); // for simplicity keep the average row size the same as in the input // in most cases the average row size doesn't change after applying filters newSymbolStats.setAverageRowSize(symbolStats.getAverageRowSize()); newSymbolStats.setDistinctValuesCount(min(symbolStats.getDistinctValuesCount(), capSymbolStats.getDistinctValuesCount())); newSymbolStats.setLowValue(max(symbolStats.getLowValue(), capSymbolStats.getLowValue())); newSymbolStats.setHighValue(min(symbolStats.getHighValue(), capSymbolStats.getHighValue())); double numberOfNulls = stats.getOutputRowCount() * symbolStats.getNullsFraction(); double capNumberOfNulls = cap.getOutputRowCount() * capSymbolStats.getNullsFraction(); double cappedNumberOfNulls = min(numberOfNulls, capNumberOfNulls); double cappedNullsFraction = cappedRowCount == 0 ? 1 : cappedNumberOfNulls / cappedRowCount; newSymbolStats.setNullsFraction(cappedNullsFraction); result.addSymbolStatistics(symbol, newSymbolStats.build()); }); return result.build(); }
private SymbolStatsEstimate estimateCoalesce(SymbolStatsEstimate left, SymbolStatsEstimate right) { // Question to reviewer: do you have a method to check if fraction is empty or saturated? if (left.getNullsFraction() == 0) { return left; } else if (left.getNullsFraction() == 1.0) { return right; } else { return SymbolStatsEstimate.builder() .setLowValue(min(left.getLowValue(), right.getLowValue())) .setHighValue(max(left.getHighValue(), right.getHighValue())) .setDistinctValuesCount(left.getDistinctValuesCount() + min(right.getDistinctValuesCount(), input.getOutputRowCount() * left.getNullsFraction())) .setNullsFraction(left.getNullsFraction() * right.getNullsFraction()) // TODO check if dataSize estimation method is correct .setAverageRowSize(max(left.getAverageRowSize(), right.getAverageRowSize())) .build(); } } }
/**
 * Casting a symbol with range [1.6, 3.3] and 10 distinct values to bigint is
 * expected to give range [2.0, 3.0], 2 distinct values, an unchanged nulls
 * fraction and an unknown data size.
 */
@Test
public void testCastDoubleToShortRange() {
    SymbolStatsEstimate sourceStats = SymbolStatsEstimate.builder()
            .setNullsFraction(0.3)
            .setLowValue(1.6)
            .setHighValue(3.3)
            .setDistinctValuesCount(10)
            .setAverageRowSize(2.0)
            .build();
    PlanNodeStatsEstimate inputStatistics = PlanNodeStatsEstimate.builder()
            .addSymbolStatistics(new Symbol("a"), sourceStats)
            .build();
    Cast castToBigint = new Cast(new SymbolReference("a"), "bigint");

    assertCalculate(castToBigint, inputStatistics)
            .lowValue(2.0)
            .highValue(3.0)
            .distinctValuesCount(2)
            .nullsFraction(0.3)
            .dataSizeUnknown();
}
/**
 * Casting a symbol with range [2.0, 10.0] and 4 distinct values to double is
 * expected to keep range, distinct-values count and nulls fraction, and to
 * report an unknown data size.
 */
@Test
public void testCastBigintToDouble() {
    SymbolStatsEstimate sourceStats = SymbolStatsEstimate.builder()
            .setNullsFraction(0.3)
            .setLowValue(2.0)
            .setHighValue(10.0)
            .setDistinctValuesCount(4)
            .setAverageRowSize(2.0)
            .build();
    PlanNodeStatsEstimate inputStatistics = PlanNodeStatsEstimate.builder()
            .addSymbolStatistics(new Symbol("a"), sourceStats)
            .build();
    Cast castToDouble = new Cast(new SymbolReference("a"), "double");

    assertCalculate(castToDouble, inputStatistics)
            .lowValue(2.0)
            .highValue(10.0)
            .distinctValuesCount(4)
            .nullsFraction(0.3)
            .dataSizeUnknown();
}
/**
 * Casting a symbol with range [1.6, 17.3] and 10 distinct values to bigint is
 * expected to give range [2.0, 17.0] while keeping 10 distinct values and the
 * nulls fraction; the data size becomes unknown.
 */
@Test
public void testCastDoubleToBigint() {
    SymbolStatsEstimate sourceStats = SymbolStatsEstimate.builder()
            .setNullsFraction(0.3)
            .setLowValue(1.6)
            .setHighValue(17.3)
            .setDistinctValuesCount(10)
            .setAverageRowSize(2.0)
            .build();
    PlanNodeStatsEstimate inputStatistics = PlanNodeStatsEstimate.builder()
            .addSymbolStatistics(new Symbol("a"), sourceStats)
            .build();
    Cast castToBigint = new Cast(new SymbolReference("a"), "bigint");

    assertCalculate(castToBigint, inputStatistics)
            .lowValue(2.0)
            .highValue(17.0)
            .distinctValuesCount(10)
            .nullsFraction(0.3)
            .dataSizeUnknown();
}
/**
 * Same short-range cast to bigint as above, but the input carries no
 * distinct-values count; the result is expected to report the
 * distinct-values count as unknown.
 */
@Test
public void testCastDoubleToShortRangeUnknownDistinctValuesCount() {
    // Distinct-values count is deliberately left unset on the input.
    SymbolStatsEstimate sourceStats = SymbolStatsEstimate.builder()
            .setNullsFraction(0.3)
            .setLowValue(1.6)
            .setHighValue(3.3)
            .setAverageRowSize(2.0)
            .build();
    PlanNodeStatsEstimate inputStatistics = PlanNodeStatsEstimate.builder()
            .addSymbolStatistics(new Symbol("a"), sourceStats)
            .build();
    Cast castToBigint = new Cast(new SymbolReference("a"), "bigint");

    assertCalculate(castToBigint, inputStatistics)
            .lowValue(2.0)
            .highValue(3.0)
            .distinctValuesCountUnknown()
            .nullsFraction(0.3)
            .dataSizeUnknown();
}
/**
 * Estimates {@code IS NULL} on a symbol reference: the row count is scaled by
 * the symbol's nulls fraction and the symbol's statistics are replaced with
 * "all nulls" (nulls fraction 1, NaN range, zero distinct values).
 * Any other operand yields {@code PlanNodeStatsEstimate.unknown()}.
 */
@Override
protected PlanNodeStatsEstimate visitIsNullPredicate(IsNullPredicate node, Void context) {
    if (!(node.getValue() instanceof SymbolReference)) {
        return PlanNodeStatsEstimate.unknown();
    }

    Symbol symbol = Symbol.from(node.getValue());
    SymbolStatsEstimate inputSymbolStats = input.getSymbolStatistics(symbol);

    PlanNodeStatsEstimate.Builder filtered = PlanNodeStatsEstimate.buildFrom(input);
    filtered.setOutputRowCount(input.getOutputRowCount() * inputSymbolStats.getNullsFraction());

    SymbolStatsEstimate allNulls = SymbolStatsEstimate.builder()
            .setNullsFraction(1.0)
            .setLowValue(NaN)
            .setHighValue(NaN)
            .setDistinctValuesCount(0.0)
            .build();
    filtered.addSymbolStatistics(symbol, allNulls);
    return filtered.build();
}
/**
 * Estimates a unary plus/minus expression. Plus leaves the operand's
 * statistics untouched; minus negates and swaps the range bounds while
 * copying every other statistic.
 *
 * @throws IllegalStateException if the sign is neither PLUS nor MINUS
 */
@Override
protected SymbolStatsEstimate visitArithmeticUnary(ArithmeticUnaryExpression node, Void context) {
    SymbolStatsEstimate operand = process(node.getValue());
    switch (node.getSign()) {
        case PLUS:
            return operand;
        case MINUS: {
            // Negation mirrors the range: the old high bound becomes the new low bound.
            SymbolStatsEstimate.Builder negated = SymbolStatsEstimate.buildFrom(operand);
            negated.setLowValue(-operand.getHighValue());
            negated.setHighValue(-operand.getLowValue());
            return negated.build();
        }
        default:
            throw new IllegalStateException("Unexpected sign: " + node.getSign());
    }
}
/**
 * A reference to a symbol with known statistics is expected to return those
 * statistics unchanged; a reference to a symbol without statistics is
 * expected to return {@code SymbolStatsEstimate.unknown()}.
 */
@Test
public void testSymbolReference() {
    SymbolStatsEstimate.Builder xStatsBuilder = SymbolStatsEstimate.builder();
    xStatsBuilder.setLowValue(-1);
    xStatsBuilder.setHighValue(10);
    xStatsBuilder.setDistinctValuesCount(4);
    xStatsBuilder.setNullsFraction(0.1);
    xStatsBuilder.setAverageRowSize(2.0);
    SymbolStatsEstimate xStats = xStatsBuilder.build();

    // Only "x" is registered; "y" has no statistics.
    PlanNodeStatsEstimate inputStatistics = PlanNodeStatsEstimate.builder()
            .addSymbolStatistics(new Symbol("x"), xStats)
            .build();

    assertCalculate(expression("x"), inputStatistics).isEqualTo(xStats);
    assertCalculate(expression("y"), inputStatistics).isEqualTo(SymbolStatsEstimate.unknown());
}
/**
 * Converts column statistics into a symbol statistics estimate.
 *
 * <p>The average row size is the column's data size divided by the number of
 * non-null rows, or 0 when there are no non-null rows. The value range is
 * only set when the column statistics provide one.
 */
private SymbolStatsEstimate toSymbolStatistics(TableStatistics tableStatistics, ColumnStatistics columnStatistics) {
    double nullsFraction = columnStatistics.getNullsFraction().getValue();
    double nonNullRows = tableStatistics.getRowCount().getValue() * (1.0 - nullsFraction);

    double averageRowSize;
    if (nonNullRows == 0) {
        // Avoid dividing by zero when the column holds no non-null rows.
        averageRowSize = 0;
    }
    else {
        averageRowSize = columnStatistics.getDataSize().getValue() / nonNullRows;
    }

    SymbolStatsEstimate.Builder estimate = SymbolStatsEstimate.builder();
    estimate.setNullsFraction(nullsFraction);
    estimate.setDistinctValuesCount(columnStatistics.getDistinctValuesCount().getValue());
    estimate.setAverageRowSize(averageRowSize);
    columnStatistics.getRange().ifPresent(valueRange -> {
        estimate.setLowValue(valueRange.getMin());
        estimate.setHighValue(valueRange.getMax());
    });
    return estimate.build();
}
}
/**
 * Builds an estimate for a single symbol {@code x} with the given type,
 * distinct-values count and range, normalizes it, and asserts the
 * distinct-values count after normalization.
 *
 * @param type the type registered for the symbol
 * @param ndv the distinct-values count before normalization
 * @param low the low bound, converted with {@code asStatsValue}
 * @param high the high bound, converted with {@code asStatsValue}
 * @param expectedNormalizedNdv the expected distinct-values count after normalization
 */
private void testCapDistinctValuesByToDomainRangeLength(Type type, double ndv, Object low, Object high, double expectedNormalizedNdv) {
    Symbol symbol = new Symbol("x");

    SymbolStatsEstimate.Builder symbolStatsBuilder = SymbolStatsEstimate.builder();
    symbolStatsBuilder.setNullsFraction(0);
    symbolStatsBuilder.setDistinctValuesCount(ndv);
    symbolStatsBuilder.setLowValue(asStatsValue(low, type));
    symbolStatsBuilder.setHighValue(asStatsValue(high, type));
    SymbolStatsEstimate symbolStats = symbolStatsBuilder.build();

    PlanNodeStatsEstimate estimate = PlanNodeStatsEstimate.builder()
            .setOutputRowCount(10000000000L)
            .addSymbolStatistics(symbol, symbolStats)
            .build();
    TypeProvider types = TypeProvider.copyOf(ImmutableMap.of(symbol, type));

    assertNormalized(estimate, types)
            .symbolStats(symbol, symbolAssert -> symbolAssert.distinctValuesCount(expectedNormalizedNdv));
}
/**
 * Creates an estimate holding range-only statistics for symbols {@code x}
 * and {@code y}.
 *
 * @param lowX low bound for {@code x}
 * @param highX high bound for {@code x}
 * @param lowY low bound for {@code y}
 * @param highY high bound for {@code y}
 */
private PlanNodeStatsEstimate xyStats(double lowX, double highX, double lowY, double highY) {
    SymbolStatsEstimate xRange = SymbolStatsEstimate.builder()
            .setLowValue(lowX)
            .setHighValue(highX)
            .build();
    SymbolStatsEstimate yRange = SymbolStatsEstimate.builder()
            .setLowValue(lowY)
            .setHighValue(highY)
            .build();

    return PlanNodeStatsEstimate.builder()
            .addSymbolStatistics(new Symbol("x"), xRange)
            .addSymbolStatistics(new Symbol("y"), yRange)
            .build();
}
/**
 * Filtering on {@code x = x} is expected to produce 750 output rows and to
 * leave {@code x} with range [-10.0, 10.0] and 40 distinct values.
 */
@Test
public void testSymbolEqualsSameSymbolFilter() {
    assertExpression("x = x")
            .outputRowsCount(750)
            // The callback must invoke assertions on the object it is handed.
            // Building a SymbolStatsEstimate here and discarding it (as this
            // test used to do) verifies nothing about "x".
            .symbolStats("x", symbolStats -> symbolStats
                    .lowValue(-10.0)
                    .highValue(10.0)
                    .distinctValuesCount(40.0));
    // NOTE(review): the discarded estimate also carried an average row size of
    // 4.0. No average-row-size assertion is visible from this part of the
    // file, so that expectation is not checked here - TODO add it if the
    // assertion helper supports it.
}
/**
 * Creates a builder pre-populated with every statistic of {@code other}:
 * low and high value, nulls fraction, average row size and distinct-values
 * count.
 *
 * @param other the estimate to copy from
 * @return a builder initialized with the values of {@code other}
 */
public static Builder buildFrom(SymbolStatsEstimate other) {
    Builder copy = builder();
    copy.setLowValue(other.getLowValue());
    copy.setHighValue(other.getHighValue());
    copy.setNullsFraction(other.getNullsFraction());
    copy.setAverageRowSize(other.getAverageRowSize());
    return copy.setDistinctValuesCount(other.getDistinctValuesCount());
}
/**
 * Sets the low value, high value and distinct-values count from the given
 * range in one call.
 *
 * @param range supplies the low bound, high bound and distinct-values count
 * @return the builder returned by the final setter, for chaining
 */
public Builder setStatisticsRange(StatisticRange range) {
    Builder withBounds = setLowValue(range.getLow())
            .setHighValue(range.getHigh());
    return withBounds.setDistinctValuesCount(range.getDistinctValuesCount());
}
/**
 * Convenience factory pairing a symbol with an estimate built from the given
 * range, nulls fraction and distinct-values count.
 *
 * @param symbolName name of the symbol
 * @param low low value of the range
 * @param high high value of the range
 * @param nullsFraction fraction of null values
 * @param ndv distinct-values count
 */
private static SymbolStatistics symbolStatistics(String symbolName, double low, double high, double nullsFraction, double ndv) {
    Symbol symbol = new Symbol(symbolName);
    SymbolStatsEstimate estimate = SymbolStatsEstimate.builder()
            .setLowValue(low)
            .setHighValue(high)
            .setNullsFraction(nullsFraction)
            .setDistinctValuesCount(ndv)
            .build();
    return new SymbolStatistics(symbol, estimate);
}