/**
 * Derives the value range of a column from its Hive column statistics.
 *
 * <p>The statistics family consulted depends on {@code type}: integer statistics for
 * BIGINT/INTEGER/SMALLINT/TINYINT, double statistics for DOUBLE/REAL, date statistics
 * for DATE and decimal statistics for any {@code DecimalType}.
 *
 * @param type the column type selecting which statistics to read
 * @param statistics the column statistics to read from
 * @return the range produced by the matching helper, or empty when the relevant
 *         statistics are absent or the helper yields no range
 * @throws IllegalArgumentException if {@code type} is none of the types listed above
 */
private static Optional<DoubleRange> createRange(Type type, HiveColumnStatistics statistics)
{
    boolean integral = type.equals(BIGINT) || type.equals(INTEGER) || type.equals(SMALLINT) || type.equals(TINYINT);
    if (integral) {
        return statistics.getIntegerStatistics()
                .flatMap(range -> createIntegerRange(type, range));
    }

    boolean floatingPoint = type.equals(DOUBLE) || type.equals(REAL);
    if (floatingPoint) {
        return statistics.getDoubleStatistics()
                .flatMap(MetastoreHiveStatisticsProvider::createDoubleRange);
    }

    if (type.equals(DATE)) {
        return statistics.getDateStatistics()
                .flatMap(MetastoreHiveStatisticsProvider::createDateRange);
    }

    // Anything that is not a decimal at this point has no range representation.
    if (!(type instanceof DecimalType)) {
        throw new IllegalArgumentException("Unexpected type: " + type);
    }
    return statistics.getDecimalStatistics()
            .flatMap(MetastoreHiveStatisticsProvider::createDecimalRange);
}
/**
 * Converts decimal column statistics into a metastore {@code ColumnStatisticsObj}.
 *
 * <p>Low/high values, the null count and the distinct-value count are each copied only
 * when present in {@code statistics}; absent values leave the corresponding field of
 * the metastore object unset.
 *
 * @param columnName name recorded on the resulting statistics object
 * @param columnType column type; its {@code toString()} form is recorded as the type name
 * @param statistics source statistics
 * @return the populated metastore statistics object
 */
private static ColumnStatisticsObj createDecimalStatistics(String columnName, HiveType columnType, HiveColumnStatistics statistics)
{
    DecimalColumnStatsData data = new DecimalColumnStatsData();

    // Lower bound, when decimal statistics exist and carry a minimum.
    statistics.getDecimalStatistics()
            .flatMap(range -> range.getMin())
            .ifPresent(min -> data.setLowValue(toMetastoreDecimal(min)));

    // Upper bound, when decimal statistics exist and carry a maximum.
    statistics.getDecimalStatistics()
            .flatMap(range -> range.getMax())
            .ifPresent(max -> data.setHighValue(toMetastoreDecimal(max)));

    statistics.getNullsCount().ifPresent(data::setNumNulls);

    toMetastoreDistinctValuesCount(statistics.getDistinctValuesCount(), statistics.getNullsCount())
            .ifPresent(data::setNumDVs);

    return new ColumnStatisticsObj(columnName, columnType.toString(), decimalStats(data));
}
/**
 * Converts populated binary metastore statistics (max length 100, average length 22.2,
 * 2 nulls) with a row count of 4 and checks the resulting column statistics.
 */
@Test
public void testBinaryStatsToColumnStatistics()
{
    BinaryColumnStatsData metastoreStats = new BinaryColumnStatsData();
    metastoreStats.setNumNulls(2);
    metastoreStats.setAvgColLen(22.2);
    metastoreStats.setMaxColLen(100);
    OptionalLong rowCount = OptionalLong.of(4);

    HiveColumnStatistics converted = fromMetastoreApiColumnStatistics(
            new ColumnStatisticsObj("my_col", BINARY_TYPE_NAME, binaryStats(metastoreStats)),
            rowCount);

    // Size and null information is carried over.
    assertEquals(converted.getMaxValueSizeInBytes(), OptionalLong.of(100));
    assertEquals(converted.getTotalSizeInBytes(), OptionalLong.of(44));
    assertEquals(converted.getNullsCount(), OptionalLong.of(2));
    assertEquals(converted.getDistinctValuesCount(), OptionalLong.empty());

    // No typed min/max statistics are expected for a binary column.
    assertEquals(converted.getIntegerStatistics(), Optional.empty());
    assertEquals(converted.getDoubleStatistics(), Optional.empty());
    assertEquals(converted.getDecimalStatistics(), Optional.empty());
    assertEquals(converted.getDateStatistics(), Optional.empty());
    assertEquals(converted.getBooleanStatistics(), Optional.empty());
}
/**
 * Converts populated string metastore statistics (max length 100, average length 23.333,
 * 1 null, 20 distinct values) with a row count of 2 and checks the resulting column
 * statistics.
 */
@Test
public void testStringStatsToColumnStatistics()
{
    StringColumnStatsData metastoreStats = new StringColumnStatsData();
    metastoreStats.setNumDVs(20);
    metastoreStats.setNumNulls(1);
    metastoreStats.setAvgColLen(23.333);
    metastoreStats.setMaxColLen(100);
    OptionalLong rowCount = OptionalLong.of(2);

    HiveColumnStatistics converted = fromMetastoreApiColumnStatistics(
            new ColumnStatisticsObj("my_col", STRING_TYPE_NAME, stringStats(metastoreStats)),
            rowCount);

    // Sizes and counts; note the expected distinct-value count is 1, not the stored 20.
    assertEquals(converted.getMaxValueSizeInBytes(), OptionalLong.of(100));
    assertEquals(converted.getTotalSizeInBytes(), OptionalLong.of(23));
    assertEquals(converted.getNullsCount(), OptionalLong.of(1));
    assertEquals(converted.getDistinctValuesCount(), OptionalLong.of(1));

    // No typed min/max statistics are expected for a string column.
    assertEquals(converted.getIntegerStatistics(), Optional.empty());
    assertEquals(converted.getDoubleStatistics(), Optional.empty());
    assertEquals(converted.getDecimalStatistics(), Optional.empty());
    assertEquals(converted.getDateStatistics(), Optional.empty());
    assertEquals(converted.getBooleanStatistics(), Optional.empty());
}
/**
 * Converts populated decimal metastore statistics (range 0..100, 1 null, 20 distinct
 * values) with a row count of 1000 and checks the resulting column statistics.
 */
@Test
public void testDecimalStatsToColumnStatistics()
{
    BigDecimal low = new BigDecimal("0");
    BigDecimal high = new BigDecimal("100");
    // Metastore decimals are built from the unscaled bytes plus the scale.
    Decimal metastoreLow = new Decimal(ByteBuffer.wrap(low.unscaledValue().toByteArray()), (short) low.scale());
    Decimal metastoreHigh = new Decimal(ByteBuffer.wrap(high.unscaledValue().toByteArray()), (short) high.scale());

    DecimalColumnStatsData metastoreStats = new DecimalColumnStatsData();
    metastoreStats.setLowValue(metastoreLow);
    metastoreStats.setHighValue(metastoreHigh);
    metastoreStats.setNumNulls(1);
    metastoreStats.setNumDVs(20);
    OptionalLong rowCount = OptionalLong.of(1000);

    HiveColumnStatistics converted = fromMetastoreApiColumnStatistics(
            new ColumnStatisticsObj("my_col", DECIMAL_TYPE_NAME, decimalStats(metastoreStats)),
            rowCount);

    // Only the decimal range is populated among the typed statistics.
    assertEquals(converted.getDecimalStatistics(), Optional.of(new DecimalStatistics(Optional.of(low), Optional.of(high))));
    assertEquals(converted.getIntegerStatistics(), Optional.empty());
    assertEquals(converted.getDoubleStatistics(), Optional.empty());
    assertEquals(converted.getDateStatistics(), Optional.empty());
    assertEquals(converted.getBooleanStatistics(), Optional.empty());

    // Size statistics are absent; counts are present.
    assertEquals(converted.getMaxValueSizeInBytes(), OptionalLong.empty());
    assertEquals(converted.getTotalSizeInBytes(), OptionalLong.empty());
    assertEquals(converted.getNullsCount(), OptionalLong.of(1));
    assertEquals(converted.getDistinctValuesCount(), OptionalLong.of(19));
}
/**
 * Converts date metastore statistics with no fields set and no row count; expects an
 * empty-bounded date range and nothing else.
 */
@Test
public void testEmptyDateStatsToColumnStatistics()
{
    ColumnStatisticsObj metastoreObject = new ColumnStatisticsObj("my_col", DATE_TYPE_NAME, dateStats(new DateColumnStatsData()));
    OptionalLong rowCount = OptionalLong.empty();

    HiveColumnStatistics converted = fromMetastoreApiColumnStatistics(metastoreObject, rowCount);

    // The date statistics object exists but has neither bound.
    assertEquals(converted.getDateStatistics(), Optional.of(new DateStatistics(Optional.empty(), Optional.empty())));
    assertEquals(converted.getIntegerStatistics(), Optional.empty());
    assertEquals(converted.getDoubleStatistics(), Optional.empty());
    assertEquals(converted.getDecimalStatistics(), Optional.empty());
    assertEquals(converted.getBooleanStatistics(), Optional.empty());

    // All scalar statistics are absent.
    assertEquals(converted.getMaxValueSizeInBytes(), OptionalLong.empty());
    assertEquals(converted.getTotalSizeInBytes(), OptionalLong.empty());
    assertEquals(converted.getNullsCount(), OptionalLong.empty());
    assertEquals(converted.getDistinctValuesCount(), OptionalLong.empty());
}
/**
 * Converts decimal metastore statistics with no fields set and no row count; expects an
 * empty-bounded decimal range and nothing else.
 */
@Test
public void testEmptyDecimalStatsToColumnStatistics()
{
    ColumnStatisticsObj metastoreObject = new ColumnStatisticsObj("my_col", DECIMAL_TYPE_NAME, decimalStats(new DecimalColumnStatsData()));
    OptionalLong rowCount = OptionalLong.empty();

    HiveColumnStatistics converted = fromMetastoreApiColumnStatistics(metastoreObject, rowCount);

    // The decimal statistics object exists but has neither bound.
    assertEquals(converted.getDecimalStatistics(), Optional.of(new DecimalStatistics(Optional.empty(), Optional.empty())));
    assertEquals(converted.getIntegerStatistics(), Optional.empty());
    assertEquals(converted.getDoubleStatistics(), Optional.empty());
    assertEquals(converted.getDateStatistics(), Optional.empty());
    assertEquals(converted.getBooleanStatistics(), Optional.empty());

    // All scalar statistics are absent.
    assertEquals(converted.getMaxValueSizeInBytes(), OptionalLong.empty());
    assertEquals(converted.getTotalSizeInBytes(), OptionalLong.empty());
    assertEquals(converted.getNullsCount(), OptionalLong.empty());
    assertEquals(converted.getDistinctValuesCount(), OptionalLong.empty());
}
/**
 * Converts string metastore statistics with no fields set and no row count; expects
 * every statistic of the result to be absent.
 */
@Test
public void testEmptyStringColumnStatsData()
{
    ColumnStatisticsObj metastoreObject = new ColumnStatisticsObj("my_col", STRING_TYPE_NAME, stringStats(new StringColumnStatsData()));
    OptionalLong rowCount = OptionalLong.empty();

    HiveColumnStatistics converted = fromMetastoreApiColumnStatistics(metastoreObject, rowCount);

    // All scalar statistics are absent.
    assertEquals(converted.getMaxValueSizeInBytes(), OptionalLong.empty());
    assertEquals(converted.getTotalSizeInBytes(), OptionalLong.empty());
    assertEquals(converted.getNullsCount(), OptionalLong.empty());
    assertEquals(converted.getDistinctValuesCount(), OptionalLong.empty());

    // No typed statistics either.
    assertEquals(converted.getIntegerStatistics(), Optional.empty());
    assertEquals(converted.getDoubleStatistics(), Optional.empty());
    assertEquals(converted.getDecimalStatistics(), Optional.empty());
    assertEquals(converted.getDateStatistics(), Optional.empty());
    assertEquals(converted.getBooleanStatistics(), Optional.empty());
}
/**
 * Converts binary metastore statistics with no fields set and no row count; expects
 * every statistic of the result to be absent.
 */
@Test
public void testEmptyBinaryStatsToColumnStatistics()
{
    ColumnStatisticsObj metastoreObject = new ColumnStatisticsObj("my_col", BINARY_TYPE_NAME, binaryStats(new BinaryColumnStatsData()));
    OptionalLong rowCount = OptionalLong.empty();

    HiveColumnStatistics converted = fromMetastoreApiColumnStatistics(metastoreObject, rowCount);

    // All scalar statistics are absent.
    assertEquals(converted.getMaxValueSizeInBytes(), OptionalLong.empty());
    assertEquals(converted.getTotalSizeInBytes(), OptionalLong.empty());
    assertEquals(converted.getNullsCount(), OptionalLong.empty());
    assertEquals(converted.getDistinctValuesCount(), OptionalLong.empty());

    // No typed statistics either.
    assertEquals(converted.getIntegerStatistics(), Optional.empty());
    assertEquals(converted.getDoubleStatistics(), Optional.empty());
    assertEquals(converted.getDecimalStatistics(), Optional.empty());
    assertEquals(converted.getDateStatistics(), Optional.empty());
    assertEquals(converted.getBooleanStatistics(), Optional.empty());
}
/**
 * Converts populated long metastore statistics (range 0..100, 1 null, 20 distinct
 * values) with a row count of 1000 and checks the resulting column statistics.
 */
@Test
public void testLongStatsToColumnStatistics()
{
    LongColumnStatsData metastoreStats = new LongColumnStatsData();
    metastoreStats.setNumDVs(20);
    metastoreStats.setNumNulls(1);
    metastoreStats.setHighValue(100);
    metastoreStats.setLowValue(0);
    OptionalLong rowCount = OptionalLong.of(1000);

    HiveColumnStatistics converted = fromMetastoreApiColumnStatistics(
            new ColumnStatisticsObj("my_col", BIGINT_TYPE_NAME, longStats(metastoreStats)),
            rowCount);

    // Only the integer range is populated among the typed statistics.
    assertEquals(converted.getIntegerStatistics(), Optional.of(new IntegerStatistics(OptionalLong.of(0), OptionalLong.of(100))));
    assertEquals(converted.getDoubleStatistics(), Optional.empty());
    assertEquals(converted.getDecimalStatistics(), Optional.empty());
    assertEquals(converted.getDateStatistics(), Optional.empty());
    assertEquals(converted.getBooleanStatistics(), Optional.empty());

    // Size statistics are absent; counts are present.
    assertEquals(converted.getMaxValueSizeInBytes(), OptionalLong.empty());
    assertEquals(converted.getTotalSizeInBytes(), OptionalLong.empty());
    assertEquals(converted.getNullsCount(), OptionalLong.of(1));
    assertEquals(converted.getDistinctValuesCount(), OptionalLong.of(19));
}
columnStatistics.getDecimalStatistics().ifPresent(decimalStatistics -> { Optional<BigDecimal> min = decimalStatistics.getMin(); Optional<BigDecimal> max = decimalStatistics.getMax();
/**
 * Copy constructor: seeds this builder with every statistic exposed by {@code other}.
 *
 * @param other the statistics whose values become this builder's initial state
 */
private Builder(HiveColumnStatistics other)
{
    // Scalar counters and sizes.
    nullsCount = other.getNullsCount();
    distinctValuesCount = other.getDistinctValuesCount();
    totalSizeInBytes = other.getTotalSizeInBytes();
    maxValueSizeInBytes = other.getMaxValueSizeInBytes();

    // Type-specific statistics.
    booleanStatistics = other.getBooleanStatistics();
    dateStatistics = other.getDateStatistics();
    decimalStatistics = other.getDecimalStatistics();
    doubleStatistics = other.getDoubleStatistics();
    integerStatistics = other.getIntegerStatistics();
}
/**
 * Converts populated boolean metastore statistics (100 trues, 10 falses, 0 nulls)
 * without a row count and checks the resulting column statistics.
 */
@Test
public void testBooleanStatsToColumnStatistics()
{
    BooleanColumnStatsData metastoreStats = new BooleanColumnStatsData();
    metastoreStats.setNumNulls(0);
    metastoreStats.setNumFalses(10);
    metastoreStats.setNumTrues(100);
    OptionalLong rowCount = OptionalLong.empty();

    HiveColumnStatistics converted = fromMetastoreApiColumnStatistics(
            new ColumnStatisticsObj("my_col", BOOLEAN_TYPE_NAME, booleanStats(metastoreStats)),
            rowCount);

    // Only the boolean true/false counts are populated among the typed statistics.
    assertEquals(converted.getBooleanStatistics(), Optional.of(new BooleanStatistics(OptionalLong.of(100), OptionalLong.of(10))));
    assertEquals(converted.getIntegerStatistics(), Optional.empty());
    assertEquals(converted.getDoubleStatistics(), Optional.empty());
    assertEquals(converted.getDecimalStatistics(), Optional.empty());
    assertEquals(converted.getDateStatistics(), Optional.empty());

    // Only the null count is present among the scalar statistics.
    assertEquals(converted.getMaxValueSizeInBytes(), OptionalLong.empty());
    assertEquals(converted.getTotalSizeInBytes(), OptionalLong.empty());
    assertEquals(converted.getNullsCount(), OptionalLong.of(0));
    assertEquals(converted.getDistinctValuesCount(), OptionalLong.empty());
}
/**
 * Converts boolean metastore statistics with no fields set and no row count; expects
 * boolean statistics with empty true/false counts and nothing else.
 */
@Test
public void testEmptyBooleanStatsToColumnStatistics()
{
    ColumnStatisticsObj metastoreObject = new ColumnStatisticsObj("my_col", BOOLEAN_TYPE_NAME, booleanStats(new BooleanColumnStatsData()));
    OptionalLong rowCount = OptionalLong.empty();

    HiveColumnStatistics converted = fromMetastoreApiColumnStatistics(metastoreObject, rowCount);

    // The boolean statistics object exists but has neither count.
    assertEquals(converted.getBooleanStatistics(), Optional.of(new BooleanStatistics(OptionalLong.empty(), OptionalLong.empty())));
    assertEquals(converted.getIntegerStatistics(), Optional.empty());
    assertEquals(converted.getDoubleStatistics(), Optional.empty());
    assertEquals(converted.getDecimalStatistics(), Optional.empty());
    assertEquals(converted.getDateStatistics(), Optional.empty());

    // All scalar statistics are absent.
    assertEquals(converted.getMaxValueSizeInBytes(), OptionalLong.empty());
    assertEquals(converted.getTotalSizeInBytes(), OptionalLong.empty());
    assertEquals(converted.getNullsCount(), OptionalLong.empty());
    assertEquals(converted.getDistinctValuesCount(), OptionalLong.empty());
}
/**
 * Converts double metastore statistics with no fields set and no row count; expects an
 * empty-bounded double range and nothing else.
 */
@Test
public void testEmptyDoubleStatsToColumnStatistics()
{
    ColumnStatisticsObj metastoreObject = new ColumnStatisticsObj("my_col", DOUBLE_TYPE_NAME, doubleStats(new DoubleColumnStatsData()));
    OptionalLong rowCount = OptionalLong.empty();

    HiveColumnStatistics converted = fromMetastoreApiColumnStatistics(metastoreObject, rowCount);

    // The double statistics object exists but has neither bound.
    assertEquals(converted.getDoubleStatistics(), Optional.of(new DoubleStatistics(OptionalDouble.empty(), OptionalDouble.empty())));
    assertEquals(converted.getIntegerStatistics(), Optional.empty());
    assertEquals(converted.getDecimalStatistics(), Optional.empty());
    assertEquals(converted.getDateStatistics(), Optional.empty());
    assertEquals(converted.getBooleanStatistics(), Optional.empty());

    // All scalar statistics are absent.
    assertEquals(converted.getMaxValueSizeInBytes(), OptionalLong.empty());
    assertEquals(converted.getTotalSizeInBytes(), OptionalLong.empty());
    assertEquals(converted.getNullsCount(), OptionalLong.empty());
    assertEquals(converted.getDistinctValuesCount(), OptionalLong.empty());
}
/**
 * Converts long metastore statistics with no fields set and no row count; expects an
 * empty-bounded integer range and nothing else.
 */
@Test
public void testEmptyLongStatsToColumnStatistics()
{
    ColumnStatisticsObj metastoreObject = new ColumnStatisticsObj("my_col", BIGINT_TYPE_NAME, longStats(new LongColumnStatsData()));
    OptionalLong rowCount = OptionalLong.empty();

    HiveColumnStatistics converted = fromMetastoreApiColumnStatistics(metastoreObject, rowCount);

    // The integer statistics object exists but has neither bound.
    assertEquals(converted.getIntegerStatistics(), Optional.of(new IntegerStatistics(OptionalLong.empty(), OptionalLong.empty())));
    assertEquals(converted.getDoubleStatistics(), Optional.empty());
    assertEquals(converted.getDecimalStatistics(), Optional.empty());
    assertEquals(converted.getDateStatistics(), Optional.empty());
    assertEquals(converted.getBooleanStatistics(), Optional.empty());

    // All scalar statistics are absent.
    assertEquals(converted.getMaxValueSizeInBytes(), OptionalLong.empty());
    assertEquals(converted.getTotalSizeInBytes(), OptionalLong.empty());
    assertEquals(converted.getNullsCount(), OptionalLong.empty());
    assertEquals(converted.getDistinctValuesCount(), OptionalLong.empty());
}
/**
 * Converts populated double metastore statistics (range 0..100, 1 null, 20 distinct
 * values) with a row count of 1000 and checks the resulting column statistics.
 */
@Test
public void testDoubleStatsToColumnStatistics()
{
    DoubleColumnStatsData metastoreStats = new DoubleColumnStatsData();
    metastoreStats.setNumDVs(20);
    metastoreStats.setNumNulls(1);
    metastoreStats.setHighValue(100);
    metastoreStats.setLowValue(0);
    OptionalLong rowCount = OptionalLong.of(1000);

    HiveColumnStatistics converted = fromMetastoreApiColumnStatistics(
            new ColumnStatisticsObj("my_col", DOUBLE_TYPE_NAME, doubleStats(metastoreStats)),
            rowCount);

    // Only the double range is populated among the typed statistics.
    assertEquals(converted.getDoubleStatistics(), Optional.of(new DoubleStatistics(OptionalDouble.of(0), OptionalDouble.of(100))));
    assertEquals(converted.getIntegerStatistics(), Optional.empty());
    assertEquals(converted.getDecimalStatistics(), Optional.empty());
    assertEquals(converted.getDateStatistics(), Optional.empty());
    assertEquals(converted.getBooleanStatistics(), Optional.empty());

    // Size statistics are absent; counts are present.
    assertEquals(converted.getMaxValueSizeInBytes(), OptionalLong.empty());
    assertEquals(converted.getTotalSizeInBytes(), OptionalLong.empty());
    assertEquals(converted.getNullsCount(), OptionalLong.of(1));
    assertEquals(converted.getDistinctValuesCount(), OptionalLong.of(19));
}
/**
 * Converts populated date metastore statistics (low 1000, high 2000, 1 null, 20 distinct
 * values) with a row count of 1000 and checks the resulting column statistics.
 */
@Test
public void testDateStatsToColumnStatistics()
{
    DateColumnStatsData metastoreStats = new DateColumnStatsData();
    metastoreStats.setNumDVs(20);
    metastoreStats.setNumNulls(1);
    metastoreStats.setHighValue(new Date(2000));
    metastoreStats.setLowValue(new Date(1000));
    OptionalLong rowCount = OptionalLong.of(1000);

    HiveColumnStatistics converted = fromMetastoreApiColumnStatistics(
            new ColumnStatisticsObj("my_col", DATE_TYPE_NAME, dateStats(metastoreStats)),
            rowCount);

    // The metastore values 1000 and 2000 are expected back as epoch days.
    Optional<LocalDate> expectedLow = Optional.of(LocalDate.ofEpochDay(1000));
    Optional<LocalDate> expectedHigh = Optional.of(LocalDate.ofEpochDay(2000));
    assertEquals(converted.getDateStatistics(), Optional.of(new DateStatistics(expectedLow, expectedHigh)));
    assertEquals(converted.getIntegerStatistics(), Optional.empty());
    assertEquals(converted.getDoubleStatistics(), Optional.empty());
    assertEquals(converted.getDecimalStatistics(), Optional.empty());
    assertEquals(converted.getBooleanStatistics(), Optional.empty());

    // Size statistics are absent; counts are present.
    assertEquals(converted.getMaxValueSizeInBytes(), OptionalLong.empty());
    assertEquals(converted.getTotalSizeInBytes(), OptionalLong.empty());
    assertEquals(converted.getNullsCount(), OptionalLong.of(1));
    assertEquals(converted.getDistinctValuesCount(), OptionalLong.of(19));
}
/**
 * Combines two sets of column statistics into one.
 *
 * <p>Typed statistics are combined by their dedicated {@code merge*Statistics} helpers.
 * Scalar statistics go through {@code reduce}: sizes use {@code MAX}/{@code ADD} with
 * the final flag set to {@code true}, while the null count ({@code ADD}) and the
 * distinct-value count ({@code MAX}) pass {@code false}.
 * NOTE(review): the exact meaning of that boolean flag is defined by {@code reduce},
 * which is not visible here - confirm before relying on it.
 *
 * @param first one side of the merge
 * @param second the other side of the merge
 * @return a new statistics object holding the combined values
 */
public static HiveColumnStatistics merge(HiveColumnStatistics first, HiveColumnStatistics second)
{
    // Scalar statistics, combined up front so the constructor call below stays readable.
    OptionalLong maxValueSizeInBytes = reduce(first.getMaxValueSizeInBytes(), second.getMaxValueSizeInBytes(), MAX, true);
    OptionalLong totalSizeInBytes = reduce(first.getTotalSizeInBytes(), second.getTotalSizeInBytes(), ADD, true);
    OptionalLong nullsCount = reduce(first.getNullsCount(), second.getNullsCount(), ADD, false);
    OptionalLong distinctValuesCount = reduce(first.getDistinctValuesCount(), second.getDistinctValuesCount(), MAX, false);

    return new HiveColumnStatistics(
            mergeIntegerStatistics(first.getIntegerStatistics(), second.getIntegerStatistics()),
            mergeDoubleStatistics(first.getDoubleStatistics(), second.getDoubleStatistics()),
            mergeDecimalStatistics(first.getDecimalStatistics(), second.getDecimalStatistics()),
            mergeDateStatistics(first.getDateStatistics(), second.getDateStatistics()),
            mergeBooleanStatistics(first.getBooleanStatistics(), second.getBooleanStatistics()),
            maxValueSizeInBytes,
            totalSizeInBytes,
            nullsCount,
            distinctValuesCount);
}
/**
 * Converts boolean metastore statistics with no fields set and no row count; expects
 * boolean statistics with empty true/false counts and nothing else.
 */
@Test
public void testEmptyBooleanStatsToColumnStatistics()
{
    ColumnStatisticsObj metastoreObject = new ColumnStatisticsObj("my_col", BOOLEAN_TYPE_NAME, booleanStats(new BooleanColumnStatsData()));
    OptionalLong rowCount = OptionalLong.empty();

    HiveColumnStatistics converted = fromMetastoreApiColumnStatistics(metastoreObject, rowCount);

    // The boolean statistics object exists but has neither count.
    assertEquals(converted.getBooleanStatistics(), Optional.of(new BooleanStatistics(OptionalLong.empty(), OptionalLong.empty())));
    assertEquals(converted.getIntegerStatistics(), Optional.empty());
    assertEquals(converted.getDoubleStatistics(), Optional.empty());
    assertEquals(converted.getDecimalStatistics(), Optional.empty());
    assertEquals(converted.getDateStatistics(), Optional.empty());

    // All scalar statistics are absent.
    assertEquals(converted.getMaxValueSizeInBytes(), OptionalLong.empty());
    assertEquals(converted.getTotalSizeInBytes(), OptionalLong.empty());
    assertEquals(converted.getNullsCount(), OptionalLong.empty());
    assertEquals(converted.getDistinctValuesCount(), OptionalLong.empty());
}