/**
 * Builds statistics for a table with no rows.
 *
 * <p>The row count is set to zero and every entry of {@code columns} gets column
 * statistics with a zero nulls fraction and a zero distinct-values count. A zero
 * data size is added only when {@code hasDataSize} accepts the column's type.
 *
 * @param columns column handles keyed by column name
 * @param columnTypes column types keyed by column name; must contain every key of {@code columns}
 * @return the assembled zero-valued table statistics
 */
private TableStatistics createZeroStatistics(Map<String, ColumnHandle> columns, Map<String, Type> columnTypes) {
    TableStatistics.Builder statistics = TableStatistics.builder();
    statistics.setRowCount(Estimate.of(0));

    for (Map.Entry<String, ColumnHandle> column : columns.entrySet()) {
        String name = column.getKey();
        Type type = columnTypes.get(name);
        // A column without a known type cannot be described; fail verification.
        verify(type != null, "columnType is missing for column: %s", name);

        ColumnStatistics.Builder zeroed = ColumnStatistics.builder()
                .setNullsFraction(Estimate.of(0))
                .setDistinctValuesCount(Estimate.of(0));
        if (hasDataSize(type)) {
            zeroed.setDataSize(Estimate.of(0));
        }
        statistics.setColumnStatistics(column.getValue(), zeroed.build());
    }

    return statistics.build();
}
/**
 * Converts recorded table statistics data into {@link TableStatistics}.
 *
 * <p>The row count is always set. Column statistics are attached only when the
 * row count is positive; each one is produced by {@code toColumnStatistics} from
 * the column data stored under the same name as the handle.
 *
 * @param columnHandles column handles keyed by column name; values are cast to {@code TpcdsColumnHandle}
 * @param statisticsData source of the row count and the per-column data
 * @return the assembled table statistics
 */
private TableStatistics toTableStatistics(Map<String, ColumnHandle> columnHandles, TableStatisticsData statisticsData) {
    long rowCount = statisticsData.getRowCount();
    TableStatistics.Builder tableStatistics = TableStatistics.builder()
            .setRowCount(Estimate.of(rowCount));

    // Without rows there is nothing to derive per-column statistics from.
    if (rowCount <= 0) {
        return tableStatistics.build();
    }

    Map<String, ColumnStatisticsData> perColumn = statisticsData.getColumns();
    columnHandles.forEach((name, handle) -> {
        TpcdsColumnHandle tpcdsHandle = (TpcdsColumnHandle) handle;
        tableStatistics.setColumnStatistics(
                handle,
                toColumnStatistics(perColumn.get(name), tpcdsHandle.getType(), rowCount));
    });
    return tableStatistics.build();
}
/**
 * Converts the recorded data of a single column into {@link ColumnStatistics}.
 *
 * <p>The nulls fraction is the nulls count divided by {@code rowCount}. The range
 * comes from {@code toRange} applied to the recorded min and max. The data size
 * is unknown when the column data carries none.
 *
 * @param columnStatisticsData recorded nulls count, min, max, distinct count and optional data size
 * @param type column type, forwarded to {@code toRange}
 * @param rowCount divisor for the nulls fraction
 * @return the assembled column statistics
 */
private ColumnStatistics toColumnStatistics(ColumnStatisticsData columnStatisticsData, Type type, long rowCount) {
    long nulls = columnStatisticsData.getNullsCount();
    return ColumnStatistics.builder()
            // Cast first so the division is done in floating point.
            .setNullsFraction(Estimate.of((double) nulls / rowCount))
            .setRange(toRange(columnStatisticsData.getMin(), columnStatisticsData.getMax(), type))
            .setDistinctValuesCount(Estimate.of(columnStatisticsData.getDistinctValuesCount()))
            .setDataSize(columnStatisticsData.getDataSize().map(Estimate::of).orElse(Estimate.unknown()))
            .build();
}
result.setRowCount(Estimate.of(rowCount)); for (Map.Entry<String, ColumnHandle> column : columns.entrySet()) { String columnName = column.getKey();
/**
 * Estimates the total data size of a partitioning key column.
 *
 * <p>For every partition, the size of the key value (per {@code getSize}) is
 * multiplied by that partition's row count, and the products are summed. A
 * partition for which {@code getPartitionRowCount} yields no value contributes
 * {@code averageRowsPerPartition} rows instead.
 *
 * @param column partitioning key column whose values are measured
 * @param type column type; when {@code hasDataSize} rejects it the result is unknown
 * @param partitions partitions to aggregate over
 * @param statistics partition statistics keyed by partition id
 * @param averageRowsPerPartition fallback row count for partitions without one
 * @return the summed data size, or {@link Estimate#unknown()} when the type has no data size
 */
@VisibleForTesting
static Estimate calculateDataSizeForPartitioningKey(
        HiveColumnHandle column,
        Type type,
        List<HivePartition> partitions,
        Map<String, PartitionStatistics> statistics,
        double averageRowsPerPartition) {
    if (hasDataSize(type)) {
        double total = 0;
        for (HivePartition current : partitions) {
            int valueSize = getSize(current.getKeys().get(column));
            double rows = getPartitionRowCount(current.getPartitionId(), statistics).orElse(averageRowsPerPartition);
            total += valueSize * rows;
        }
        return Estimate.of(total);
    }
    return Estimate.unknown();
}
/**
 * Assembles column statistics for a partitioning key column.
 *
 * <p>Each statistic is delegated to its dedicated helper: distinct values count,
 * nulls fraction, range and data size.
 *
 * @param column partitioning key column
 * @param type column type, used for the range and data size
 * @param partitions partitions being described
 * @param statistics partition statistics keyed by partition id
 * @param averageRowsPerPartition forwarded to the helpers that accept it
 * @param rowCount forwarded to the nulls fraction helper
 * @return the assembled column statistics
 */
private static ColumnStatistics createPartitionColumnStatistics(
        HiveColumnHandle column,
        Type type,
        List<HivePartition> partitions,
        Map<String, PartitionStatistics> statistics,
        double averageRowsPerPartition,
        double rowCount) {
    ColumnStatistics.Builder result = ColumnStatistics.builder();
    result.setDistinctValuesCount(
            Estimate.of(calculateDistinctPartitionKeys(column, partitions, statistics, averageRowsPerPartition)));
    result.setNullsFraction(
            Estimate.of(calculateNullsFractionForPartitioningKey(column, partitions, statistics, averageRowsPerPartition, rowCount)));
    result.setRange(calculateRangeForPartitioningKey(column, type, partitions));
    result.setDataSize(calculateDataSizeForPartitioningKey(column, type, partitions, statistics, averageRowsPerPartition));
    return result.build();
}
/**
 * Converts recorded table statistics data into {@link TableStatistics}.
 *
 * <p>The row count is copied over. For every column present in the statistics
 * data, the handle is resolved through {@code getColumnHandle} and the converted
 * column statistics are attached to it.
 *
 * @param tableStatisticsData source of the row count and the per-column data
 * @param tpchTableHandle table handle forwarded to {@code getColumnHandle}
 * @param columnHandles column handles forwarded to {@code getColumnHandle}
 * @return the assembled table statistics
 */
private TableStatistics toTableStatistics(TableStatisticsData tableStatisticsData, TpchTableHandle tpchTableHandle, Map<String, ColumnHandle> columnHandles) {
    TableStatistics.Builder statistics = TableStatistics.builder();
    statistics.setRowCount(Estimate.of(tableStatisticsData.getRowCount()));

    tableStatisticsData.getColumns().forEach((name, columnData) -> {
        TpchColumnHandle handle = (TpchColumnHandle) getColumnHandle(tpchTableHandle, columnHandles, name);
        ColumnStatistics converted = toColumnStatistics(columnData, handle.getType());
        statistics.setColumnStatistics(handle, converted);
    });

    return statistics.build();
}
/**
 * Checks {@code calculateNullsFraction} for partition lists with missing,
 * partial and complete row/nulls counts.
 */
@Test
public void testCalculateNullsFraction() {
    Estimate unknown = Estimate.unknown();

    // Inputs expected to yield an unknown estimate.
    assertEquals(calculateNullsFraction(COLUMN, ImmutableList.of()), unknown);
    assertEquals(calculateNullsFraction(COLUMN, ImmutableList.of(PartitionStatistics.empty())), unknown);
    assertEquals(calculateNullsFraction(COLUMN, ImmutableList.of(rowsCount(1000))), unknown);
    assertEquals(calculateNullsFraction(COLUMN, ImmutableList.of(rowsCount(1000), nullsCount(500))), unknown);

    // Inputs expected to yield a concrete fraction.
    assertEquals(
            calculateNullsFraction(COLUMN, ImmutableList.of(rowsCount(1000), nullsCount(500), rowsCountAndNullsCount(1000, 500))),
            Estimate.of(0.5));
    assertEquals(
            calculateNullsFraction(COLUMN, ImmutableList.of(rowsCountAndNullsCount(2000, 200), rowsCountAndNullsCount(1000, 100))),
            Estimate.of(0.1));
    assertEquals(
            calculateNullsFraction(COLUMN, ImmutableList.of(rowsCountAndNullsCount(0, 0), rowsCountAndNullsCount(0, 0))),
            Estimate.of(0));
}
/**
 * Checks {@code calculateDistinctValuesCount} for empty statistics, explicit
 * distinct-value counts, and boolean column statistics.
 */
@Test
public void testCalculateDistinctValuesCount() {
    Estimate unknown = Estimate.unknown();
    OptionalLong absent = OptionalLong.empty();

    // No statistics, or only empty statistics.
    assertEquals(calculateDistinctValuesCount(ImmutableList.of()), unknown);
    assertEquals(calculateDistinctValuesCount(ImmutableList.of(HiveColumnStatistics.empty())), unknown);
    assertEquals(calculateDistinctValuesCount(ImmutableList.of(HiveColumnStatistics.empty(), HiveColumnStatistics.empty())), unknown);

    // Explicit distinct-value counts.
    assertEquals(calculateDistinctValuesCount(ImmutableList.of(distinctValuesCount(1))), Estimate.of(1));
    assertEquals(calculateDistinctValuesCount(ImmutableList.of(distinctValuesCount(1), distinctValuesCount(2))), Estimate.of(2));
    assertEquals(calculateDistinctValuesCount(ImmutableList.of(distinctValuesCount(1), HiveColumnStatistics.empty())), Estimate.of(1));

    // Boolean column statistics in a single partition.
    assertEquals(calculateDistinctValuesCount(ImmutableList.of(createBooleanColumnStatistics(absent, absent, absent))), unknown);
    assertEquals(calculateDistinctValuesCount(ImmutableList.of(createBooleanColumnStatistics(OptionalLong.of(1), OptionalLong.of(0), absent))), Estimate.of(1));
    assertEquals(calculateDistinctValuesCount(ImmutableList.of(createBooleanColumnStatistics(OptionalLong.of(10), absent, absent))), unknown);
    assertEquals(calculateDistinctValuesCount(ImmutableList.of(createBooleanColumnStatistics(OptionalLong.of(10), OptionalLong.of(10), absent))), Estimate.of(2));
    assertEquals(calculateDistinctValuesCount(ImmutableList.of(createBooleanColumnStatistics(absent, OptionalLong.of(10), absent))), unknown);
    assertEquals(calculateDistinctValuesCount(ImmutableList.of(createBooleanColumnStatistics(OptionalLong.of(0), OptionalLong.of(10), absent))), Estimate.of(1));
    assertEquals(calculateDistinctValuesCount(ImmutableList.of(createBooleanColumnStatistics(OptionalLong.of(0), OptionalLong.of(0), absent))), Estimate.of(0));

    // Boolean column statistics across two partitions.
    assertEquals(
            calculateDistinctValuesCount(ImmutableList.of(
                    createBooleanColumnStatistics(OptionalLong.of(0), OptionalLong.of(10), absent),
                    createBooleanColumnStatistics(OptionalLong.of(1), OptionalLong.of(10), absent))),
            Estimate.of(2));
}
/**
 * Fetches the statistics of a table under a constraint and checks its row count.
 *
 * <p>The reported row count must equal an estimate rebuilt from its own value,
 * and that value must be within {@code expectedRowCount * TOLERANCE} of the
 * expected row count.
 *
 * @param schema schema the table is looked up in
 * @param table table whose statistics are fetched
 * @param constraint constraint passed to the statistics lookup
 * @param expectedRowCount row count the statistics are expected to report
 */
private void testTableStats(String schema, TpchTable<?> table, Constraint<ColumnHandle> constraint, double expectedRowCount) {
    SchemaTableName name = new SchemaTableName(schema, table.getTableName());
    TpchTableHandle handle = tpchMetadata.getTableHandle(session, name);
    TableStatistics stats = tpchMetadata.getTableStatistics(session, handle, constraint);

    double actualRows = stats.getRowCount().getValue();
    double allowedError = expectedRowCount * TOLERANCE;

    assertEquals(stats.getRowCount(), Estimate.of(actualRows));
    assertEquals(actualRows, expectedRowCount, allowedError);
}
return Estimate.of(averageValueDataSizeInBytes * totalRowCount);
totalNullsCount, totalRowCount); return Estimate.of(((double) totalNullsCount) / totalRowCount);
ImmutableMap.of("p1=string1/p2=1234", rowsCount(1000)), 2000), Estimate.of(7000)); assertEquals( calculateDataSizeForPartitioningKey( ImmutableMap.of("p1=string1/p2=1234", PartitionStatistics.empty()), 2000), Estimate.of(14000)); assertEquals( calculateDataSizeForPartitioningKey( ImmutableMap.of("p1=string1/p2=1234", rowsCount(1000), "p1=str2/p2=1234", rowsCount(2000)), 3000), Estimate.of(15000)); assertEquals( calculateDataSizeForPartitioningKey( ImmutableMap.of("p1=string1/p2=1234", rowsCount(1000), "p1=str2/p2=1234", PartitionStatistics.empty()), 3000), Estimate.of(19000)); assertEquals( calculateDataSizeForPartitioningKey( ImmutableMap.of(), 3000), Estimate.of(33000)); assertEquals( calculateDataSizeForPartitioningKey(
assertEquals(calculateDataSize(COLUMN, ImmutableList.of(dataSize(1000)), 1000), Estimate.unknown()); assertEquals(calculateDataSize(COLUMN, ImmutableList.of(dataSize(1000), rowsCount(1000)), 1000), Estimate.unknown()); assertEquals(calculateDataSize(COLUMN, ImmutableList.of(rowsCountAndDataSize(500, 1000)), 2000), Estimate.of(4000)); assertEquals(calculateDataSize(COLUMN, ImmutableList.of(rowsCountAndDataSize(0, 0)), 2000), Estimate.unknown()); assertEquals(calculateDataSize(COLUMN, ImmutableList.of(rowsCountAndDataSize(0, 0)), 0), Estimate.zero()); assertEquals(calculateDataSize(COLUMN, ImmutableList.of(rowsCountAndDataSize(1000, 0)), 2000), Estimate.of(0)); assertEquals( calculateDataSize( rowsCountAndDataSize(1000, 5000)), 5000), Estimate.of(20000)); assertEquals( calculateDataSize( rowsCountAndDataSize(1000, 5000)), 5000), Estimate.of(20000));
@Test public void testNullFraction() { SchemaTableName schemaTableName = new SchemaTableName("sf1", Table.WEB_SITE.getName()); ConnectorTableHandle tableHandle = metadata.getTableHandle(session, schemaTableName); TableStatistics tableStatistics = metadata.getTableStatistics(session, tableHandle, alwaysTrue()); Map<String, ColumnHandle> columnHandles = metadata.getColumnHandles(session, tableHandle); // some null values assertColumnStatistics( tableStatistics.getColumnStatistics().get(columnHandles.get(WebSiteColumn.WEB_REC_END_DATE.getName())), ColumnStatistics.builder() .setNullsFraction(Estimate.of(0.5)) .setDistinctValuesCount(Estimate.of(3)) .setRange(new DoubleRange(10819L, 11549L)) .build()); }
estimateAssertion.assertClose(tableStatistics.getRowCount(), Estimate.of(6), "Row count does not match"); tableStatistics.getColumnStatistics().get(columnHandles.get(CallCenterColumn.CC_CALL_CENTER_SK.getName())), ColumnStatistics.builder() .setNullsFraction(Estimate.of(0)) .setDistinctValuesCount(Estimate.of(6)) .setRange(new DoubleRange(1, 6)) .build()); tableStatistics.getColumnStatistics().get(columnHandles.get(CallCenterColumn.CC_CALL_CENTER_ID.getName())), ColumnStatistics.builder() .setNullsFraction(Estimate.of(0)) .setDistinctValuesCount(Estimate.of(3)) .setDataSize(Estimate.of(48.0)) .build()); tableStatistics.getColumnStatistics().get(columnHandles.get(CallCenterColumn.CC_ZIP.getName())), ColumnStatistics.builder() .setNullsFraction(Estimate.of(0)) .setDistinctValuesCount(Estimate.of(1)) .setDataSize(Estimate.of(5.0)) .build()); tableStatistics.getColumnStatistics().get(columnHandles.get(CallCenterColumn.CC_GMT_OFFSET.getName())), ColumnStatistics.builder() .setNullsFraction(Estimate.of(0)) .setDistinctValuesCount(Estimate.of(1)) .setRange(new DoubleRange(-5, -5)) .build());
/**
 * Checks {@code getTableStatistics} for an unpartitioned table.
 *
 * <p>The provider is fed a single statistics entry under {@code UNPARTITIONED_ID}.
 * The result must carry the row count and the column's range, nulls fraction and
 * distinct values count derived from that entry.
 */
@Test
public void testGetTableStatisticsUnpartitioned() {
    // Fixture: basic statistics plus integer column statistics for COLUMN.
    HiveBasicStatistics basicStatistics = new HiveBasicStatistics(OptionalLong.empty(), OptionalLong.of(1000), OptionalLong.empty(), OptionalLong.empty());
    PartitionStatistics statistics = PartitionStatistics.builder()
            .setBasicStatistics(basicStatistics)
            .setColumnStatistics(ImmutableMap.of(COLUMN, createIntegerColumnStatistics(OptionalLong.of(-100), OptionalLong.of(100), OptionalLong.of(500), OptionalLong.of(300))))
            .build();
    MetastoreHiveStatisticsProvider statisticsProvider = new MetastoreHiveStatisticsProvider(
            (table, hivePartitions) -> ImmutableMap.of(UNPARTITIONED_ID, statistics));

    HiveSessionProperties sessionProperties = new HiveSessionProperties(new HiveClientConfig(), new OrcFileWriterConfig(), new ParquetFileWriterConfig());
    TestingConnectorSession session = new TestingConnectorSession(sessionProperties.getSessionProperties());
    HiveColumnHandle columnHandle = new HiveColumnHandle(COLUMN, HIVE_LONG, BIGINT.getTypeSignature(), 2, REGULAR, Optional.empty());

    // Expected statistics for the fixture above.
    ColumnStatistics expectedColumnStatistics = ColumnStatistics.builder()
            .setRange(new DoubleRange(-100, 100))
            .setNullsFraction(Estimate.of(0.5))
            .setDistinctValuesCount(Estimate.of(300))
            .build();
    TableStatistics expected = TableStatistics.builder()
            .setRowCount(Estimate.of(1000))
            .setColumnStatistics(columnHandle, expectedColumnStatistics)
            .build();

    TableStatistics actual = statisticsProvider.getTableStatistics(
            session,
            TABLE,
            ImmutableMap.of(COLUMN, columnHandle),
            ImmutableMap.of(COLUMN, BIGINT),
            ImmutableList.of(new HivePartition(TABLE)));

    assertEquals(actual, expected);
}
HiveColumnHandle columnHandle = new HiveColumnHandle(COLUMN, HIVE_LONG, BIGINT.getTypeSignature(), 2, REGULAR, Optional.empty()); TableStatistics expected = TableStatistics.builder() .setRowCount(Estimate.of(1000)) .setColumnStatistics( PARTITION_COLUMN_1, ColumnStatistics.builder() .setDataSize(Estimate.of(7000)) .setNullsFraction(Estimate.of(0)) .setDistinctValuesCount(Estimate.of(1)) .build()) .setColumnStatistics( ColumnStatistics.builder() .setRange(new DoubleRange(1234, 1234)) .setNullsFraction(Estimate.of(0)) .setDistinctValuesCount(Estimate.of(1)) .build()) .setColumnStatistics( ColumnStatistics.builder() .setRange(new DoubleRange(-100, 100)) .setNullsFraction(Estimate.of(0.5)) .setDistinctValuesCount(Estimate.of(300)) .build()) .build();
/**
 * Checks {@code calculateNullsFraction} for partition lists with missing,
 * partial and complete row/nulls counts.
 */
@Test
public void testCalculateNullsFraction() {
    Estimate unknown = Estimate.unknown();

    // Inputs expected to yield an unknown estimate.
    assertEquals(calculateNullsFraction(COLUMN, ImmutableList.of()), unknown);
    assertEquals(calculateNullsFraction(COLUMN, ImmutableList.of(PartitionStatistics.empty())), unknown);
    assertEquals(calculateNullsFraction(COLUMN, ImmutableList.of(rowsCount(1000))), unknown);
    assertEquals(calculateNullsFraction(COLUMN, ImmutableList.of(rowsCount(1000), nullsCount(500))), unknown);

    // Inputs expected to yield a concrete fraction.
    assertEquals(
            calculateNullsFraction(COLUMN, ImmutableList.of(rowsCount(1000), nullsCount(500), rowsCountAndNullsCount(1000, 500))),
            Estimate.of(0.5));
    assertEquals(
            calculateNullsFraction(COLUMN, ImmutableList.of(rowsCountAndNullsCount(2000, 200), rowsCountAndNullsCount(1000, 100))),
            Estimate.of(0.1));
    assertEquals(
            calculateNullsFraction(COLUMN, ImmutableList.of(rowsCountAndNullsCount(0, 0), rowsCountAndNullsCount(0, 0))),
            Estimate.of(0));
}
/**
 * Fetches the statistics of a table under a constraint and checks its row count.
 *
 * <p>The reported row count must equal an estimate rebuilt from its own value,
 * and that value must be within {@code expectedRowCount * TOLERANCE} of the
 * expected row count.
 *
 * @param schema schema the table is looked up in
 * @param table table whose statistics are fetched
 * @param constraint constraint passed to the statistics lookup
 * @param expectedRowCount row count the statistics are expected to report
 */
private void testTableStats(String schema, TpchTable<?> table, Constraint<ColumnHandle> constraint, double expectedRowCount) {
    SchemaTableName name = new SchemaTableName(schema, table.getTableName());
    TpchTableHandle handle = tpchMetadata.getTableHandle(session, name);
    TableStatistics stats = tpchMetadata.getTableStatistics(session, handle, constraint);

    double actualRows = stats.getRowCount().getValue();
    double allowedError = expectedRowCount * TOLERANCE;

    assertEquals(stats.getRowCount(), Estimate.of(actualRows));
    assertEquals(actualRows, expectedRowCount, allowedError);
}