/**
 * Converts the given {@link ColumnStatisticsData} into {@link ColumnStatistics}.
 * <p>
 * The nulls fraction is always reported as zero. The distinct values count and the
 * data size become {@link Estimate#unknown()} when {@code stats} does not carry them.
 *
 * @param stats source statistics for a single column
 * @param columnType type handed to {@code toRange} together with the min/max values
 * @return the converted column statistics
 */
private ColumnStatistics toColumnStatistics(ColumnStatisticsData stats, Type columnType)
{
    ColumnStatistics.Builder builder = ColumnStatistics.builder();
    builder.setNullsFraction(Estimate.zero());
    builder.setDistinctValuesCount(stats.getDistinctValuesCount().map(Estimate::of).orElse(Estimate.unknown()));
    builder.setDataSize(stats.getDataSize().map(Estimate::of).orElse(Estimate.unknown()));
    builder.setRange(toRange(stats.getMin(), stats.getMax(), columnType));
    return builder.build();
}
/**
 * Builds one {@link Row} describing the statistics of a single column.
 * <p>
 * The row values are, in order: column name, data size, distinct values count,
 * nulls fraction, the {@code NULL_DOUBLE} placeholder, range minimum and range maximum.
 *
 * @param columnName name emitted as the first (string literal) value
 * @param type type passed to {@code toStringLiteral} for the range bounds
 * @param columnStatistics statistics to render
 * @return the row expression holding the seven values
 */
private Row createColumnStatsRow(String columnName, Type type, ColumnStatistics columnStatistics)
{
    return new Row(ImmutableList.<Expression>builder()
            .add(new StringLiteral(columnName))
            .add(createEstimateRepresentation(columnStatistics.getDataSize()))
            .add(createEstimateRepresentation(columnStatistics.getDistinctValuesCount()))
            .add(createEstimateRepresentation(columnStatistics.getNullsFraction()))
            .add(NULL_DOUBLE)
            .add(toStringLiteral(type, columnStatistics.getRange().map(DoubleRange::getMin)))
            .add(toStringLiteral(type, columnStatistics.getRange().map(DoubleRange::getMax)))
            .build());
}
/**
 * Combines the per-partition statistics of one data column into a single {@link ColumnStatistics}.
 * <p>
 * Partitions that have no entry for {@code column} are ignored; when none of them has one,
 * {@link ColumnStatistics#empty()} is returned.
 *
 * @param column name of the column to look up in every partition's column statistics
 * @param type column type, forwarded to {@code calculateRange}
 * @param rowsCount row count forwarded to {@code calculateDataSize}
 * @param partitionStatistics statistics of the partitions to combine
 * @return the combined statistics, or the empty statistics if the column is not present anywhere
 */
@VisibleForTesting
static ColumnStatistics createDataColumnStatistics(String column, Type type, double rowsCount, Collection<PartitionStatistics> partitionStatistics)
{
    List<HiveColumnStatistics> columnStatistics = partitionStatistics.stream()
            .map(partition -> partition.getColumnStatistics().get(column))
            .filter(Objects::nonNull)
            .collect(toImmutableList());

    if (columnStatistics.isEmpty()) {
        return ColumnStatistics.empty();
    }

    ColumnStatistics.Builder result = ColumnStatistics.builder();
    result.setDistinctValuesCount(calculateDistinctValuesCount(columnStatistics));
    result.setNullsFraction(calculateNullsFraction(column, partitionStatistics));
    result.setDataSize(calculateDataSize(column, partitionStatistics, rowsCount));
    result.setRange(calculateRange(type, columnStatistics));
    return result.build();
}
columnStatistics.getNullsFraction().isUnknown(), "unknown nulls fraction for " + columnName); columnStatistics.getDistinctValuesCount().isUnknown(), "unknown distinct values count for " + columnName); columnStatistics.getDataSize().isUnknown(), "unknown data size for " + columnName); columnStatistics.getDataSize().isUnknown(), "unknown data size for" + columnName);
/**
 * Verifies that {@code createDataColumnStatistics} yields empty column statistics when
 * no partition carries statistics for {@code COLUMN}.
 */
@Test
public void testCreateDataColumnStatistics()
{
    // no partitions at all
    ColumnStatistics withoutPartitions = createDataColumnStatistics(COLUMN, BIGINT, 1000, ImmutableList.of());
    assertEquals(withoutPartitions, ColumnStatistics.empty());

    // partitions present, but all of them have empty statistics
    ColumnStatistics withEmptyPartitions = createDataColumnStatistics(
            COLUMN,
            BIGINT,
            1000,
            ImmutableList.of(PartitionStatistics.empty(), PartitionStatistics.empty()));
    assertEquals(withEmptyPartitions, ColumnStatistics.empty());

    // the only partition has statistics for a different column
    ColumnStatistics withOtherColumnOnly = createDataColumnStatistics(
            COLUMN,
            BIGINT,
            1000,
            ImmutableList.of(new PartitionStatistics(
                    HiveBasicStatistics.createZeroStatistics(),
                    ImmutableMap.of("column2", HiveColumnStatistics.empty()))));
    assertEquals(withOtherColumnOnly, ColumnStatistics.empty());
}
columnStatistics.getNullsFraction().isUnknown(), "unknown nulls fraction for " + columnName); columnStatistics.getDistinctValuesCount().isUnknown(), "unknown distinct values count for " + columnName); columnStatistics.getDataSize().isUnknown(), "unknown data size for " + columnName); columnStatistics.getDataSize().isUnknown(), "unknown data size for" + columnName);
/**
 * Verifies that {@code createDataColumnStatistics} yields empty column statistics when
 * no partition carries statistics for {@code COLUMN}.
 */
@Test
public void testCreateDataColumnStatistics()
{
    // no partitions at all
    ColumnStatistics withoutPartitions = createDataColumnStatistics(COLUMN, BIGINT, 1000, ImmutableList.of());
    assertEquals(withoutPartitions, ColumnStatistics.empty());

    // partitions present, but all of them have empty statistics
    ColumnStatistics withEmptyPartitions = createDataColumnStatistics(
            COLUMN,
            BIGINT,
            1000,
            ImmutableList.of(PartitionStatistics.empty(), PartitionStatistics.empty()));
    assertEquals(withEmptyPartitions, ColumnStatistics.empty());

    // the only partition has statistics for a different column
    ColumnStatistics withOtherColumnOnly = createDataColumnStatistics(
            COLUMN,
            BIGINT,
            1000,
            ImmutableList.of(new PartitionStatistics(
                    HiveBasicStatistics.createZeroStatistics(),
                    ImmutableMap.of("column2", HiveColumnStatistics.empty()))));
    assertEquals(withOtherColumnOnly, ColumnStatistics.empty());
}
/**
 * Compares two {@link ColumnStatistics} field by field.
 * <p>
 * Nulls fraction, data size and distinct values count are compared through
 * {@code estimateAssertion.assertClose}; the ranges must be equal.
 *
 * @param actual statistics produced by the code under test
 * @param expected statistics the test expects
 */
private void assertColumnStatistics(ColumnStatistics actual, ColumnStatistics expected)
{
    // estimates are checked via the shared estimate assertion helper
    estimateAssertion.assertClose(
            actual.getNullsFraction(),
            expected.getNullsFraction(),
            "Nulls fraction");
    estimateAssertion.assertClose(
            actual.getDataSize(),
            expected.getDataSize(),
            "Data size");
    estimateAssertion.assertClose(
            actual.getDistinctValuesCount(),
            expected.getDistinctValuesCount(),
            "Distinct values count");

    // ranges are compared with plain equality
    assertEquals(actual.getRange(), expected.getRange());
}
}
/**
 * Converts the given {@link ColumnStatisticsData} into {@link ColumnStatistics}.
 * <p>
 * The nulls fraction is derived as {@code nullsCount / rowCount}. When {@code rowCount} is zero
 * that ratio is undefined, so the nulls fraction is reported as {@link Estimate#unknown()}
 * instead of feeding a NaN or infinite value into {@link Estimate#of}.
 *
 * @param columnStatisticsData source statistics for a single column
 * @param type type handed to {@code toRange} together with the min/max values
 * @param rowCount number of rows used as the denominator of the nulls fraction
 * @return the converted column statistics
 */
private ColumnStatistics toColumnStatistics(ColumnStatisticsData columnStatisticsData, Type type, long rowCount)
{
    ColumnStatistics.Builder columnStatistics = ColumnStatistics.builder();
    long nullCount = columnStatisticsData.getNullsCount();
    if (rowCount == 0) {
        // Floating-point division by zero yields NaN (0/0) or infinity (n/0), neither of which is a fraction.
        columnStatistics.setNullsFraction(Estimate.unknown());
    }
    else {
        columnStatistics.setNullsFraction(Estimate.of((double) nullCount / rowCount));
    }
    columnStatistics.setRange(toRange(columnStatisticsData.getMin(), columnStatisticsData.getMax(), type));
    columnStatistics.setDistinctValuesCount(Estimate.of(columnStatisticsData.getDistinctValuesCount()));
    columnStatistics.setDataSize(columnStatisticsData.getDataSize().map(Estimate::of).orElse(Estimate.unknown()));
    return columnStatistics.build();
}
/**
 * Translates connector-level column statistics into a {@link SymbolStatsEstimate}.
 * <p>
 * The average row size is the column data size divided by the number of non-null rows
 * ({@code rowCount * (1 - nullsFraction)}); it is zero when there are no non-null rows.
 * Low and high values are only set when the column statistics carry a range.
 *
 * @param tableStatistics table statistics supplying the row count
 * @param columnStatistics statistics of the column being translated
 * @return the resulting symbol statistics estimate
 */
private SymbolStatsEstimate toSymbolStatistics(TableStatistics tableStatistics, ColumnStatistics columnStatistics)
{
    double nullsFraction = columnStatistics.getNullsFraction().getValue();
    double nonNullRowsCount = tableStatistics.getRowCount().getValue() * (1.0 - nullsFraction);

    double averageRowSize = 0;
    if (nonNullRowsCount != 0) {
        averageRowSize = columnStatistics.getDataSize().getValue() / nonNullRowsCount;
    }

    SymbolStatsEstimate.Builder result = SymbolStatsEstimate.builder();
    result.setNullsFraction(nullsFraction);
    result.setDistinctValuesCount(columnStatistics.getDistinctValuesCount().getValue());
    result.setAverageRowSize(averageRowSize);
    columnStatistics.getRange().ifPresent(range -> {
        result.setLowValue(range.getMin());
        result.setHighValue(range.getMax());
    });
    return result.build();
}
}
/**
 * Creates {@link TableStatistics} with a row count of zero.
 * <p>
 * Every column gets a nulls fraction and a distinct values count of zero; the data size
 * is set to zero only for types for which {@code hasDataSize} returns true.
 *
 * @param columns column handles keyed by column name
 * @param columnTypes column types keyed by column name; must contain every key of {@code columns}
 * @return the zero-valued table statistics
 */
private TableStatistics createZeroStatistics(Map<String, ColumnHandle> columns, Map<String, Type> columnTypes)
{
    TableStatistics.Builder result = TableStatistics.builder();
    result.setRowCount(Estimate.of(0));
    for (Map.Entry<String, ColumnHandle> entry : columns.entrySet()) {
        String columnName = entry.getKey();
        Type columnType = columnTypes.get(columnName);
        verify(columnType != null, "columnType is missing for column: %s", columnName);

        ColumnStatistics.Builder zeroStatistics = ColumnStatistics.builder();
        zeroStatistics.setNullsFraction(Estimate.of(0));
        zeroStatistics.setDistinctValuesCount(Estimate.of(0));
        if (hasDataSize(columnType)) {
            zeroStatistics.setDataSize(Estimate.of(0));
        }
        result.setColumnStatistics(entry.getValue(), zeroStatistics.build());
    }
    return result.build();
}
/**
 * Fetches the statistics of one column through {@code tpchMetadata} and checks them against
 * the expected values within {@code TOLERANCE}.
 *
 * @param schema schema name of the table
 * @param table table whose statistics are requested
 * @param column column to check, looked up by its simplified name
 * @param constraint constraint passed to {@code getTableStatistics}
 * @param expected statistics the column is expected to have
 */
private void testColumnStats(String schema, TpchTable<?> table, TpchColumn<?> column, Constraint<ColumnHandle> constraint, ColumnStatistics expected)
{
    SchemaTableName tableName = new SchemaTableName(schema, table.getTableName());
    TpchTableHandle handle = tpchMetadata.getTableHandle(session, tableName);
    TableStatistics statistics = tpchMetadata.getTableStatistics(session, handle, constraint);
    ColumnHandle columnHandle = tpchMetadata.getColumnHandles(session, handle).get(column.getSimplifiedColumnName());

    ColumnStatistics actual = statistics.getColumnStatistics().get(columnHandle);

    EstimateAssertion closeTo = new EstimateAssertion(TOLERANCE);
    closeTo.assertClose(actual.getDistinctValuesCount(), expected.getDistinctValuesCount(), "distinctValuesCount");
    closeTo.assertClose(actual.getDataSize(), expected.getDataSize(), "dataSize");
    closeTo.assertClose(actual.getNullsFraction(), expected.getNullsFraction(), "nullsFraction");
    closeTo.assertClose(actual.getRange(), expected.getRange(), "range");
}
/**
 * Builds the {@link ColumnStatistics} of a partitioning key column from the given partitions
 * and their statistics, delegating each individual figure to the matching
 * {@code calculate...} helper.
 *
 * @param column partition key column
 * @param type type of the column
 * @param partitions partitions taken into account
 * @param statistics partition statistics passed through to the helpers
 * @param averageRowsPerPartition average row count per partition, passed through to the helpers
 * @param rowCount row count passed to the nulls fraction helper
 * @return the statistics for the partitioning key column
 */
private static ColumnStatistics createPartitionColumnStatistics(
        HiveColumnHandle column,
        Type type,
        List<HivePartition> partitions,
        Map<String, PartitionStatistics> statistics,
        double averageRowsPerPartition,
        double rowCount)
{
    ColumnStatistics.Builder result = ColumnStatistics.builder();
    result.setDistinctValuesCount(Estimate.of(calculateDistinctPartitionKeys(column, partitions, statistics, averageRowsPerPartition)));
    result.setNullsFraction(Estimate.of(calculateNullsFractionForPartitioningKey(column, partitions, statistics, averageRowsPerPartition, rowCount)));
    result.setRange(calculateRangeForPartitioningKey(column, type, partitions));
    result.setDataSize(calculateDataSizeForPartitioningKey(column, type, partitions, statistics, averageRowsPerPartition));
    return result.build();
}
/**
 * Compares two {@link ColumnStatistics} field by field.
 * <p>
 * Nulls fraction, data size and distinct values count are compared through
 * {@code estimateAssertion.assertClose}; the ranges must be equal.
 *
 * @param actual statistics produced by the code under test
 * @param expected statistics the test expects
 */
private void assertColumnStatistics(ColumnStatistics actual, ColumnStatistics expected)
{
    // estimates are checked via the shared estimate assertion helper
    estimateAssertion.assertClose(
            actual.getNullsFraction(),
            expected.getNullsFraction(),
            "Nulls fraction");
    estimateAssertion.assertClose(
            actual.getDataSize(),
            expected.getDataSize(),
            "Data size");
    estimateAssertion.assertClose(
            actual.getDistinctValuesCount(),
            expected.getDistinctValuesCount(),
            "Distinct values count");

    // ranges are compared with plain equality
    assertEquals(actual.getRange(), expected.getRange());
}
}
/**
 * Creates {@link ColumnStatistics} from optional figures, with the nulls fraction fixed at zero.
 *
 * @param distinctValuesCount distinct values count, converted with {@code toEstimate}
 * @param range value range of the column, if any
 * @param dataSize data size, converted with {@code toEstimate}
 * @return the assembled column statistics
 */
private static ColumnStatistics createColumnStatistics(Optional<Double> distinctValuesCount, Optional<DoubleRange> range, Optional<Double> dataSize)
{
    ColumnStatistics.Builder result = ColumnStatistics.builder();
    result.setNullsFraction(Estimate.zero());
    result.setDistinctValuesCount(toEstimate(distinctValuesCount));
    result.setRange(range);
    result.setDataSize(toEstimate(dataSize));
    return result.build();
}
/**
 * Fetches the statistics of one column through {@code tpchMetadata} and checks them against
 * the expected values within {@code TOLERANCE}.
 *
 * @param schema schema name of the table
 * @param table table whose statistics are requested
 * @param column column to check, looked up by its simplified name
 * @param constraint constraint passed to {@code getTableStatistics}
 * @param expected statistics the column is expected to have
 */
private void testColumnStats(String schema, TpchTable<?> table, TpchColumn<?> column, Constraint<ColumnHandle> constraint, ColumnStatistics expected)
{
    SchemaTableName tableName = new SchemaTableName(schema, table.getTableName());
    TpchTableHandle handle = tpchMetadata.getTableHandle(session, tableName);
    TableStatistics statistics = tpchMetadata.getTableStatistics(session, handle, constraint);
    ColumnHandle columnHandle = tpchMetadata.getColumnHandles(session, handle).get(column.getSimplifiedColumnName());

    ColumnStatistics actual = statistics.getColumnStatistics().get(columnHandle);

    EstimateAssertion closeTo = new EstimateAssertion(TOLERANCE);
    closeTo.assertClose(actual.getDistinctValuesCount(), expected.getDistinctValuesCount(), "distinctValuesCount");
    closeTo.assertClose(actual.getDataSize(), expected.getDataSize(), "dataSize");
    closeTo.assertClose(actual.getNullsFraction(), expected.getNullsFraction(), "nullsFraction");
    closeTo.assertClose(actual.getRange(), expected.getRange(), "range");
}
@Test public void testNullFraction() { SchemaTableName schemaTableName = new SchemaTableName("sf1", Table.WEB_SITE.getName()); ConnectorTableHandle tableHandle = metadata.getTableHandle(session, schemaTableName); TableStatistics tableStatistics = metadata.getTableStatistics(session, tableHandle, alwaysTrue()); Map<String, ColumnHandle> columnHandles = metadata.getColumnHandles(session, tableHandle); // some null values assertColumnStatistics( tableStatistics.getColumnStatistics().get(columnHandles.get(WebSiteColumn.WEB_REC_END_DATE.getName())), ColumnStatistics.builder() .setNullsFraction(Estimate.of(0.5)) .setDistinctValuesCount(Estimate.of(3)) .setRange(new DoubleRange(10819L, 11549L)) .build()); }
/**
 * Verifies the table statistics computed by {@link MetastoreHiveStatisticsProvider} for an
 * unpartitioned table with 1000 rows and integer column statistics for {@code COLUMN}.
 */
@Test
public void testGetTableStatisticsUnpartitioned()
{
    // metastore-side statistics served for the single unpartitioned "partition"
    HiveBasicStatistics basicStatistics = new HiveBasicStatistics(OptionalLong.empty(), OptionalLong.of(1000), OptionalLong.empty(), OptionalLong.empty());
    PartitionStatistics statistics = PartitionStatistics.builder()
            .setBasicStatistics(basicStatistics)
            .setColumnStatistics(ImmutableMap.of(COLUMN, createIntegerColumnStatistics(OptionalLong.of(-100), OptionalLong.of(100), OptionalLong.of(500), OptionalLong.of(300))))
            .build();
    MetastoreHiveStatisticsProvider statisticsProvider = new MetastoreHiveStatisticsProvider((table, hivePartitions) -> ImmutableMap.of(UNPARTITIONED_ID, statistics));

    HiveSessionProperties sessionProperties = new HiveSessionProperties(new HiveClientConfig(), new OrcFileWriterConfig(), new ParquetFileWriterConfig());
    TestingConnectorSession session = new TestingConnectorSession(sessionProperties.getSessionProperties());
    HiveColumnHandle columnHandle = new HiveColumnHandle(COLUMN, HIVE_LONG, BIGINT.getTypeSignature(), 2, REGULAR, Optional.empty());

    // expected result
    ColumnStatistics.Builder expectedColumn = ColumnStatistics.builder();
    expectedColumn.setRange(new DoubleRange(-100, 100));
    expectedColumn.setNullsFraction(Estimate.of(0.5));
    expectedColumn.setDistinctValuesCount(Estimate.of(300));
    TableStatistics expected = TableStatistics.builder()
            .setRowCount(Estimate.of(1000))
            .setColumnStatistics(columnHandle, expectedColumn.build())
            .build();

    TableStatistics actual = statisticsProvider.getTableStatistics(
            session,
            TABLE,
            ImmutableMap.of(COLUMN, columnHandle),
            ImmutableMap.of(COLUMN, BIGINT),
            ImmutableList.of(new HivePartition(TABLE)));
    assertEquals(actual, expected);
}
ColumnStatistics.builder() .setNullsFraction(Estimate.of(0)) .setDistinctValuesCount(Estimate.of(6)) ColumnStatistics.builder() .setNullsFraction(Estimate.of(0)) .setDistinctValuesCount(Estimate.of(3)) ColumnStatistics.builder() .setNullsFraction(Estimate.of(0)) .setDistinctValuesCount(Estimate.of(1)) ColumnStatistics.builder() .setNullsFraction(Estimate.of(0)) .setDistinctValuesCount(Estimate.of(1)) ColumnStatistics.builder() .setNullsFraction(Estimate.of(0)) .setDistinctValuesCount(Estimate.of(4)) ColumnStatistics.builder() .setNullsFraction(Estimate.of(1)) .setDistinctValuesCount(Estimate.of(0))
.setColumnStatistics( PARTITION_COLUMN_1, ColumnStatistics.builder() .setDataSize(Estimate.of(7000)) .setNullsFraction(Estimate.of(0)) .setColumnStatistics( PARTITION_COLUMN_2, ColumnStatistics.builder() .setRange(new DoubleRange(1234, 1234)) .setNullsFraction(Estimate.of(0)) .setColumnStatistics( columnHandle, ColumnStatistics.builder() .setRange(new DoubleRange(-100, 100)) .setNullsFraction(Estimate.of(0.5))