Refine search
/**
 * Verifies that schemas outside the supported scale factors report no statistics:
 * the row count is unknown and no per-column statistics are returned.
 */
@Test
public void testNoTableStatsForNotSupportedSchema()
{
    Stream.of("sf0.001", "sf0.1", "sf10")
            .forEach(unsupportedSchema -> Table.getBaseTables()
                    .forEach(baseTable -> {
                        SchemaTableName name = new SchemaTableName(unsupportedSchema, baseTable.getName());
                        ConnectorTableHandle handle = metadata.getTableHandle(session, name);
                        TableStatistics stats = metadata.getTableStatistics(session, handle, alwaysTrue());
                        // Nothing should be estimated for an unsupported schema.
                        assertTrue(stats.getRowCount().isUnknown());
                        assertTrue(stats.getColumnStatistics().isEmpty());
                    }));
}
/**
 * Creates table statistics from a row-count estimate and per-column statistics.
 *
 * @param rowCount known non-negative estimate, or unknown; never null
 * @param columnStatistics per-column statistics, stored as an unmodifiable view; never null
 * @throws IllegalArgumentException if a known row count is negative
 */
public TableStatistics(Estimate rowCount, Map<ColumnHandle, ColumnStatistics> columnStatistics)
{
    this.rowCount = requireNonNull(rowCount, "rowCount can not be null");
    // A known estimate must be non-negative; unknown estimates carry no value to validate.
    if (!rowCount.isUnknown() && rowCount.getValue() < 0) {
        throw new IllegalArgumentException(format("rowCount must be greater than or equal to 0: %s", rowCount.getValue()));
    }
    requireNonNull(columnStatistics, "columnStatistics can not be null");
    this.columnStatistics = unmodifiableMap(columnStatistics);
}
/**
 * Converts raw column statistics data into connector {@code ColumnStatistics}.
 * Nulls fraction is fixed at zero; absent counts/sizes map to unknown estimates.
 */
private ColumnStatistics toColumnStatistics(ColumnStatisticsData stats, Type columnType)
{
    Estimate distinctValues = stats.getDistinctValuesCount().map(Estimate::of).orElse(Estimate.unknown());
    Estimate dataSize = stats.getDataSize().map(Estimate::of).orElse(Estimate.unknown());
    return ColumnStatistics.builder()
            .setNullsFraction(Estimate.zero())
            .setDistinctValuesCount(distinctValues)
            .setDataSize(dataSize)
            .setRange(toRange(stats.getMin(), stats.getMax(), columnType))
            .build();
}
/**
 * Estimates the total data size contributed by a partitioning key column:
 * for each partition, the key value's size times the partition's row count
 * (falling back to the average rows per partition when the count is absent).
 * Returns unknown for types that have no meaningful data size.
 */
@VisibleForTesting
static Estimate calculateDataSizeForPartitioningKey(
        HiveColumnHandle column,
        Type type,
        List<HivePartition> partitions,
        Map<String, PartitionStatistics> statistics,
        double averageRowsPerPartition)
{
    if (!hasDataSize(type)) {
        return Estimate.unknown();
    }
    double totalSize = 0;
    for (HivePartition partition : partitions) {
        double rows = getPartitionRowCount(partition.getPartitionId(), statistics).orElse(averageRowsPerPartition);
        totalSize += getSize(partition.getKeys().get(column)) * rows;
    }
    return Estimate.of(totalSize);
}
/**
 * Returns the maximum distinct-values count found across the given column
 * statistics, or unknown when none of them carries a count.
 * Every present count is validated to be non-negative.
 */
@VisibleForTesting
static Estimate calculateDistinctValuesCount(List<HiveColumnStatistics> columnStatistics)
{
    OptionalLong best = OptionalLong.empty();
    for (HiveColumnStatistics statistics : columnStatistics) {
        OptionalLong count = getDistinctValuesCount(statistics);
        if (!count.isPresent()) {
            continue;
        }
        long value = count.getAsLong();
        verify(value >= 0, "distinctValuesCount must be greater than or equal to zero");
        if (!best.isPresent() || value > best.getAsLong()) {
            best = OptionalLong.of(value);
        }
    }
    return best.isPresent() ? Estimate.of(best.getAsLong()) : Estimate.unknown();
}
// NOTE(review): truncated code-search snippet — the enclosing method, the collect() target,
// and the assertFalse(...) wrappers around the trailing conditions were elided by the search
// tool, so this line is not compilable as-is. Restore from the original file before editing.
// NOTE(review): the final assertion message appears to be missing a trailing space
// ("unknown data size for" + columnName) and duplicates the previous check — looks like a
// copy-paste defect in the original source; confirm against the full file.
TableStatistics tableStatistics = metadata.getTableStatistics(session, tableHandle, Constraint.alwaysTrue()); assertFalse(tableStatistics.getRowCount().isUnknown(), "row count is unknown"); .getColumnStatistics() .entrySet() .stream() .collect( toImmutableMap( entry -> ((HiveColumnHandle) entry.getKey()).getName(), columnStatistics.getNullsFraction().isUnknown(), "unknown nulls fraction for " + columnName); columnStatistics.getDistinctValuesCount().isUnknown(), "unknown distinct values count for " + columnName); columnStatistics.getDataSize().isUnknown(), "unknown data size for " + columnName); columnStatistics.getDataSize().isUnknown(), "unknown data size for" + columnName);
/**
 * Builds statistics for an empty table: zero rows, and per-column zero
 * nulls fraction, zero distinct values, and (where the type has one) zero data size.
 *
 * @throws VerifyException if a column has no corresponding type in {@code columnTypes}
 */
private TableStatistics createZeroStatistics(Map<String, ColumnHandle> columns, Map<String, Type> columnTypes)
{
    TableStatistics.Builder tableStats = TableStatistics.builder();
    tableStats.setRowCount(Estimate.of(0));
    for (Map.Entry<String, ColumnHandle> entry : columns.entrySet()) {
        String name = entry.getKey();
        Type type = columnTypes.get(name);
        verify(type != null, "columnType is missing for column: %s", name);
        ColumnStatistics.Builder columnStats = ColumnStatistics.builder();
        columnStats.setNullsFraction(Estimate.of(0));
        columnStats.setDistinctValuesCount(Estimate.of(0));
        // Data size only applies to types that have one (e.g. varchar).
        if (hasDataSize(type)) {
            columnStats.setDataSize(Estimate.of(0));
        }
        tableStats.setColumnStatistics(entry.getValue(), columnStats.build());
    }
    return tableStats.build();
}
// NOTE(review): truncated code-search snippet — the method's braces and the argument lists of
// the last two calculateDataSize(...) calls were elided, leaving unbalanced parentheses; this
// line is not compilable as-is. Restore from the original test file before editing.
// NOTE(review): the final two assertions appear identical — presumably one of them used a
// different rows/size combination in the full source; verify against the original.
@Test public void testCalculateDataSize() assertEquals(calculateDataSize(COLUMN, ImmutableList.of(), 0), Estimate.unknown()); assertEquals(calculateDataSize(COLUMN, ImmutableList.of(), 1000), Estimate.unknown()); assertEquals(calculateDataSize(COLUMN, ImmutableList.of(PartitionStatistics.empty()), 1000), Estimate.unknown()); assertEquals(calculateDataSize(COLUMN, ImmutableList.of(rowsCount(1000)), 1000), Estimate.unknown()); assertEquals(calculateDataSize(COLUMN, ImmutableList.of(dataSize(1000)), 1000), Estimate.unknown()); assertEquals(calculateDataSize(COLUMN, ImmutableList.of(dataSize(1000), rowsCount(1000)), 1000), Estimate.unknown()); assertEquals(calculateDataSize(COLUMN, ImmutableList.of(rowsCountAndDataSize(500, 1000)), 2000), Estimate.of(4000)); assertEquals(calculateDataSize(COLUMN, ImmutableList.of(rowsCountAndDataSize(0, 0)), 2000), Estimate.unknown()); assertEquals(calculateDataSize(COLUMN, ImmutableList.of(rowsCountAndDataSize(0, 0)), 0), Estimate.zero()); assertEquals(calculateDataSize(COLUMN, ImmutableList.of(rowsCountAndDataSize(1000, 0)), 2000), Estimate.of(0)); assertEquals( calculateDataSize( rowsCountAndDataSize(1000, 5000)), 5000), Estimate.of(20000)); assertEquals( calculateDataSize( rowsCountAndDataSize(1000, 5000)), 5000), Estimate.of(20000));
/**
 * Asserts that the table's row-count estimate is known and within
 * {@code TOLERANCE} (relative) of the expected value under the given constraint.
 */
private void testTableStats(String schema, TpchTable<?> table, Constraint<ColumnHandle> constraint, double expectedRowCount)
{
    TpchTableHandle handle = tpchMetadata.getTableHandle(session, new SchemaTableName(schema, table.getTableName()));
    TableStatistics stats = tpchMetadata.getTableStatistics(session, handle, constraint);
    double actualRowCount = stats.getRowCount().getValue();
    // Round-tripping through Estimate.of verifies the estimate is a known, finite value.
    assertEquals(stats.getRowCount(), Estimate.of(actualRowCount));
    assertEquals(actualRowCount, expectedRowCount, expectedRowCount * TOLERANCE);
}
// NOTE(review): truncated code-search snippet — the assertColumnStatistics(...) wrappers and
// the ColumnStatistics.builder() heads preceding each .setNullsFraction chain were elided, so
// this line is not compilable as-is. Restore from the original test file before editing.
estimateAssertion.assertClose(tableStatistics.getRowCount(), Estimate.of(6), "Row count does not match"); .setNullsFraction(Estimate.of(0)) .setDistinctValuesCount(Estimate.of(6)) .setRange(new DoubleRange(1, 6)) .build()); .setNullsFraction(Estimate.of(0)) .setDistinctValuesCount(Estimate.of(3)) .setDataSize(Estimate.of(48.0)) .build()); .setNullsFraction(Estimate.of(0)) .setDistinctValuesCount(Estimate.of(1)) .setDataSize(Estimate.of(5.0)) .build()); .setNullsFraction(Estimate.of(0)) .setDistinctValuesCount(Estimate.of(1)) .setRange(new DoubleRange(-5, -5)) .build()); .setNullsFraction(Estimate.of(0)) .setDistinctValuesCount(Estimate.of(4)) .setRange(new DoubleRange(10227L, 11688L)) .build()); .setNullsFraction(Estimate.of(1)) .setDistinctValuesCount(Estimate.of(0))
// NOTE(review): truncated code-search snippet — the guard conditions preceding the two early
// returns and the body of the per-column loop were elided, so this line is not compilable
// as-is. Presumably the empty()-returns guard against missing statistics or zero partitions;
// confirm against the original file before editing.
return TableStatistics.empty(); return TableStatistics.empty(); double rowCount = averageRowsPerPartition * queriedPartitionsCount; TableStatistics.Builder result = TableStatistics.builder(); result.setRowCount(Estimate.of(rowCount)); for (Map.Entry<String, ColumnHandle> column : columns.entrySet()) { String columnName = column.getKey();
@Override protected Optional<PlanNodeStatsEstimate> doCalculate(TableScanNode node, StatsProvider sourceStats, Lookup lookup, Session session, TypeProvider types) { // TODO Construct predicate like AddExchanges's LayoutConstraintEvaluator Constraint<ColumnHandle> constraint = new Constraint<>(node.getCurrentConstraint()); TableStatistics tableStatistics = metadata.getTableStatistics(session, node.getTable(), constraint); Map<Symbol, SymbolStatsEstimate> outputSymbolStats = new HashMap<>(); for (Map.Entry<Symbol, ColumnHandle> entry : node.getAssignments().entrySet()) { Symbol symbol = entry.getKey(); Optional<ColumnStatistics> columnStatistics = Optional.ofNullable(tableStatistics.getColumnStatistics().get(entry.getValue())); outputSymbolStats.put(symbol, columnStatistics.map(statistics -> toSymbolStatistics(tableStatistics, statistics)).orElse(SymbolStatsEstimate.unknown())); } return Optional.of(PlanNodeStatsEstimate.builder() .setOutputRowCount(tableStatistics.getRowCount().getValue()) .addSymbolStatistics(outputSymbolStats) .build()); }
@Test public void testNullFraction() { SchemaTableName schemaTableName = new SchemaTableName("sf1", Table.WEB_SITE.getName()); ConnectorTableHandle tableHandle = metadata.getTableHandle(session, schemaTableName); TableStatistics tableStatistics = metadata.getTableStatistics(session, tableHandle, alwaysTrue()); Map<String, ColumnHandle> columnHandles = metadata.getColumnHandles(session, tableHandle); // some null values assertColumnStatistics( tableStatistics.getColumnStatistics().get(columnHandles.get(WebSiteColumn.WEB_REC_END_DATE.getName())), ColumnStatistics.builder() .setNullsFraction(Estimate.of(0.5)) .setDistinctValuesCount(Estimate.of(3)) .setRange(new DoubleRange(10819L, 11549L)) .build()); }
/**
 * Asserts that no row-count estimate is available for the given table/schema.
 */
private void testNoTableStats(String schema, TpchTable<?> table)
{
    TpchTableHandle handle = tpchMetadata.getTableHandle(session, new SchemaTableName(schema, table.getTableName()));
    TableStatistics stats = tpchMetadata.getTableStatistics(session, handle, alwaysTrue());
    assertTrue(stats.getRowCount().isUnknown());
}
/**
 * Converts connector column statistics into a per-symbol estimate.
 * Average row size is the column's total data size divided by the number of
 * non-null rows; a zero non-null row count maps to zero size (not NaN/Infinity).
 */
private SymbolStatsEstimate toSymbolStatistics(TableStatistics tableStatistics, ColumnStatistics columnStatistics)
{
    double nullsFraction = columnStatistics.getNullsFraction().getValue();
    double nonNullRows = tableStatistics.getRowCount().getValue() * (1.0 - nullsFraction);
    double averageRowSize;
    if (nonNullRows == 0) {
        averageRowSize = 0;
    }
    else {
        averageRowSize = columnStatistics.getDataSize().getValue() / nonNullRows;
    }
    SymbolStatsEstimate.Builder estimate = SymbolStatsEstimate.builder();
    estimate.setNullsFraction(nullsFraction);
    estimate.setDistinctValuesCount(columnStatistics.getDistinctValuesCount().getValue());
    estimate.setAverageRowSize(averageRowSize);
    // Only numeric-range-bearing columns contribute low/high bounds.
    columnStatistics.getRange().ifPresent(range -> {
        estimate.setLowValue(range.getMin());
        estimate.setHighValue(range.getMax());
    });
    return estimate.build();
}
}
/**
 * Converts an optional double into an {@code Estimate}, mapping absence to unknown.
 */
private static Estimate toEstimate(Optional<Double> value)
{
    return value.map(Estimate::of).orElseGet(Estimate::unknown);
}
}
/**
 * Builds column statistics for a partitioning-key column by deriving distinct
 * values, nulls fraction, range, and data size from the partition list and the
 * per-partition statistics.
 */
private static ColumnStatistics createPartitionColumnStatistics(
        HiveColumnHandle column,
        Type type,
        List<HivePartition> partitions,
        Map<String, PartitionStatistics> statistics,
        double averageRowsPerPartition,
        double rowCount)
{
    Estimate distinctValues = Estimate.of(calculateDistinctPartitionKeys(column, partitions, statistics, averageRowsPerPartition));
    Estimate nullsFraction = Estimate.of(calculateNullsFractionForPartitioningKey(column, partitions, statistics, averageRowsPerPartition, rowCount));
    return ColumnStatistics.builder()
            .setDistinctValuesCount(distinctValues)
            .setNullsFraction(nullsFraction)
            .setRange(calculateRangeForPartitioningKey(column, type, partitions))
            .setDataSize(calculateDataSizeForPartitioningKey(column, type, partitions, statistics, averageRowsPerPartition))
            .build();
}
/**
 * Builds column statistics with a fixed zero nulls fraction; absent distinct-values
 * counts and data sizes become unknown estimates.
 */
private static ColumnStatistics createColumnStatistics(Optional<Double> distinctValuesCount, Optional<DoubleRange> range, Optional<Double> dataSize)
{
    ColumnStatistics.Builder builder = ColumnStatistics.builder();
    builder.setNullsFraction(Estimate.zero());
    builder.setDistinctValuesCount(toEstimate(distinctValuesCount));
    builder.setRange(range);
    builder.setDataSize(toEstimate(dataSize));
    return builder.build();
}
/**
 * Creates an estimate from a finite double value.
 *
 * @throws IllegalArgumentException if {@code value} is NaN or infinite
 */
public static Estimate of(double value)
{
    if (Double.isNaN(value)) {
        throw new IllegalArgumentException("value is NaN");
    }
    if (Double.isInfinite(value)) {
        throw new IllegalArgumentException("value is infinite");
    }
    return new Estimate(value);
}