/**
 * Renders a short human-readable summary of these statistics.
 *
 * <p>The text reports min/max when a non-null value is present, the null count when it is
 * set, and a fixed placeholder when the statistics are empty.
 *
 * @return the summary string
 */
@Override
public String toString() {
  if (!this.hasNonNullValue()) {
    // No min/max available: either only the null count is known, or nothing at all.
    if (this.isEmpty()) {
      return "no stats for this column";
    }
    return String.format("num_nulls: %d, min/max not defined", this.getNumNulls());
  }

  if (!isNumNullsSet()) {
    return String.format("min: %s, max: %s, num_nulls not defined", minAsString(), maxAsString());
  }
  return String.format(
      "min: %s, max: %s, num_nulls: %d", minAsString(), maxAsString(), this.getNumNulls());
}
@Override public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) { Integer id = ref.fieldId(); Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_CANNOT_MATCH; } Statistics<?> colStats = stats.get(id); if (colStats != null && !colStats.isEmpty()) { if (!colStats.hasNonNullValue()) { return ROWS_CANNOT_MATCH; } T lower = min(colStats, id); int cmp = lit.comparator().compare(lower, lit.value()); if (cmp >= 0) { return ROWS_CANNOT_MATCH; } } return ROWS_MIGHT_MATCH; }
@Override public <T> Boolean gtEq(BoundReference<T> ref, Literal<T> lit) { Integer id = ref.fieldId(); Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_CANNOT_MATCH; } Statistics<?> colStats = stats.get(id); if (colStats != null && !colStats.isEmpty()) { if (!colStats.hasNonNullValue()) { return ROWS_CANNOT_MATCH; } T upper = max(colStats, id); int cmp = lit.comparator().compare(upper, lit.value()); if (cmp < 0) { return ROWS_CANNOT_MATCH; } } return ROWS_MIGHT_MATCH; }
/**
 * Builds a one-line description of these statistics.
 *
 * <p>Three shapes are produced: min/max with (or without) the null count, the null count
 * alone when min/max are unavailable, or a fixed message when the statistics are empty.
 *
 * @return the description
 */
@Override
public String toString() {
  if (this.hasNonNullValue()) {
    final String template =
        isNumNullsSet()
            ? "min: %s, max: %s, num_nulls: %d"
            : "min: %s, max: %s, num_nulls not defined";
    if (isNumNullsSet()) {
      return String.format(template, minAsString(), maxAsString(), this.getNumNulls());
    }
    return String.format(template, minAsString(), maxAsString());
  }

  if (this.isEmpty()) {
    return "no stats for this column";
  }

  // min/max are missing but the statistics are not empty, so only the null count is shown
  return String.format("num_nulls: %d, min/max not defined", this.getNumNulls());
}
@Override public <T> Boolean ltEq(BoundReference<T> ref, Literal<T> lit) { Integer id = ref.fieldId(); Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_CANNOT_MATCH; } Statistics<?> colStats = stats.get(id); if (colStats != null && !colStats.isEmpty()) { if (!colStats.hasNonNullValue()) { return ROWS_CANNOT_MATCH; } T lower = min(colStats, id); int cmp = lit.comparator().compare(lower, lit.value()); if (cmp > 0) { return ROWS_CANNOT_MATCH; } } return ROWS_MIGHT_MATCH; }
@Override public <T> Boolean gt(BoundReference<T> ref, Literal<T> lit) { Integer id = ref.fieldId(); Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_CANNOT_MATCH; } Statistics<?> colStats = stats.get(id); if (colStats != null && !colStats.isEmpty()) { if (!colStats.hasNonNullValue()) { return ROWS_CANNOT_MATCH; } T upper = max(colStats, id); int cmp = lit.comparator().compare(upper, lit.value()); if (cmp <= 0) { return ROWS_CANNOT_MATCH; } } return ROWS_MIGHT_MATCH; }
@Override public <T> Boolean eq(BoundReference<T> ref, Literal<T> lit) { Integer id = ref.fieldId(); Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_CANNOT_MATCH; } Statistics<?> colStats = stats.get(id); if (colStats != null && !colStats.isEmpty()) { if (!colStats.hasNonNullValue()) { return ROWS_CANNOT_MATCH; } T lower = min(colStats, id); int cmp = lit.comparator().compare(lower, lit.value()); if (cmp > 0) { return ROWS_CANNOT_MATCH; } T upper = max(colStats, id); cmp = lit.comparator().compare(upper, lit.value()); if (cmp < 0) { return ROWS_CANNOT_MATCH; } } return ROWS_MIGHT_MATCH; }
/**
 * IS FALSE predicate.
 *
 * <p>Classifies the rows from boolean min/max statistics: none match when every value is null
 * or the minimum is {@code true}; some match when min/max are unavailable or the maximum is
 * {@code true}; otherwise the outcome is delegated to {@code checkNull}.
 *
 * @param expr expression the predicate is applied to
 * @return the predicate expression
 */
private static LogicalExpression createIsFalsePredicate(LogicalExpression expr) {
  return new ParquetIsPredicate<Boolean>(expr, (exprStat, evaluator) -> {
    if (isAllNulls(exprStat, evaluator.getRowCount())) {
      return RowsMatch.NONE;
    }
    if (!exprStat.hasNonNullValue()) {
      return RowsMatch.SOME;
    }

    BooleanStatistics boolStat = (BooleanStatistics) exprStat;
    if (boolStat.getMin()) {
      // the smallest value is already true, so no value is false
      return RowsMatch.NONE;
    }
    if (boolStat.getMax()) {
      // both false and true values are present
      return RowsMatch.SOME;
    }
    // all non-null values are false; the result depends on nulls
    return checkNull(exprStat);
  });
}
/**
 * IS NOT FALSE predicate.
 *
 * <p>Classifies the rows from boolean min/max statistics: all match when every value is null
 * or the minimum is {@code true}; none match when the maximum is {@code false} and there are
 * no nulls; some match in every other case.
 *
 * @param expr expression the predicate is applied to
 * @return the predicate expression
 */
private static LogicalExpression createIsNotFalsePredicate(LogicalExpression expr) {
  return new ParquetIsPredicate<Boolean>(expr, (exprStat, evaluator) -> {
    if (isAllNulls(exprStat, evaluator.getRowCount())) {
      return RowsMatch.ALL;
    }
    if (!exprStat.hasNonNullValue()) {
      return RowsMatch.SOME;
    }

    BooleanStatistics boolStat = (BooleanStatistics) exprStat;
    if (!boolStat.getMax()) {
      // every non-null value is false, so only null rows can match
      if (hasNoNulls(exprStat)) {
        return RowsMatch.NONE;
      }
      return RowsMatch.SOME;
    }
    if (boolStat.getMin()) {
      // every non-null value is true
      return RowsMatch.ALL;
    }
    return RowsMatch.SOME;
  });
}
/**
 * IS TRUE predicate.
 *
 * <p>Classifies the rows from boolean min/max statistics: none match when every value is null
 * or the maximum is {@code false}; some match when min/max are unavailable or the minimum is
 * {@code false}; otherwise the outcome is delegated to {@code checkNull}.
 *
 * @param expr expression the predicate is applied to
 * @return the predicate expression
 */
private static LogicalExpression createIsTruePredicate(LogicalExpression expr) {
  return new ParquetIsPredicate<Boolean>(expr, (exprStat, evaluator) -> {
    if (isAllNulls(exprStat, evaluator.getRowCount())) {
      return RowsMatch.NONE;
    }
    if (!exprStat.hasNonNullValue()) {
      return RowsMatch.SOME;
    }

    BooleanStatistics boolStat = (BooleanStatistics) exprStat;
    if (!boolStat.getMax()) {
      // the largest value is false, so no value is true
      return RowsMatch.NONE;
    }
    if (boolStat.getMin()) {
      // all non-null values are true; the result depends on nulls
      return checkNull(exprStat);
    }
    // both false and true values are present
    return RowsMatch.SOME;
  });
}
/**
 * IS NOT TRUE predicate.
 *
 * <p>Classifies the rows from boolean min/max statistics: all match when every value is null
 * or the maximum is {@code false}; none match when the minimum is {@code true} and there are
 * no nulls; some match in every other case.
 *
 * @param expr expression the predicate is applied to
 * @return the predicate expression
 */
private static LogicalExpression createIsNotTruePredicate(LogicalExpression expr) {
  return new ParquetIsPredicate<Boolean>(expr, (exprStat, evaluator) -> {
    if (isAllNulls(exprStat, evaluator.getRowCount())) {
      return RowsMatch.ALL;
    }
    if (!exprStat.hasNonNullValue()) {
      return RowsMatch.SOME;
    }

    BooleanStatistics boolStat = (BooleanStatistics) exprStat;
    if (boolStat.getMin()) {
      // every non-null value is true, so only null rows can match
      if (hasNoNulls(exprStat)) {
        return RowsMatch.NONE;
      }
      return RowsMatch.SOME;
    }
    if (boolStat.getMax()) {
      // both false and true values are present
      return RowsMatch.SOME;
    }
    // every non-null value is false
    return RowsMatch.ALL;
  });
}
/** * Method to merge this statistics object with the object passed * as parameter. Merging keeps the smallest of min values, largest of max * values and combines the number of null counts. * @param stats Statistics object to merge with */ public void mergeStatistics(Statistics stats) { if (stats.isEmpty()) return; // Merge stats only if they have the same type if (type.equals(stats.type)) { incrementNumNulls(stats.getNumNulls()); if (stats.hasNonNullValue()) { mergeStatisticsMinMax(stats); markAsNotEmpty(); } } else { throw StatisticsClassException.create(this, stats); } }
/** * Method to merge this statistics object with the object passed * as parameter. Merging keeps the smallest of min values, largest of max * values and combines the number of null counts. * @param stats Statistics object to merge with */ public void mergeStatistics(Statistics stats) { if (stats.isEmpty()) return; // Merge stats only if they have the same type if (type.equals(stats.type)) { incrementNumNulls(stats.getNumNulls()); if (stats.hasNonNullValue()) { mergeStatisticsMinMax(stats); markAsNotEmpty(); } } else { throw StatisticsClassException.create(this, stats); } }
@Override @SuppressWarnings("unchecked") public <T extends Comparable<T>> Boolean visit(LtEq<T> ltEq) { Column<T> filterColumn = ltEq.getColumn(); ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath()); if (meta == null) { // the column is missing and always null, which is never less than or // equal to a value. for all x, null is never <= x. return BLOCK_CANNOT_MATCH; } Statistics<T> stats = meta.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } if (isAllNulls(meta)) { // we are looking for records where v <= someValue // this chunk is all nulls, so we can drop it return BLOCK_CANNOT_MATCH; } if (!stats.hasNonNullValue()) { // stats does not contain min/max values, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } T value = ltEq.getValue(); // drop if value < min return stats.compareMinToValue(value) > 0; }
@Override @SuppressWarnings("unchecked") public <T extends Comparable<T>> Boolean visit(Lt<T> lt) { Column<T> filterColumn = lt.getColumn(); ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath()); if (meta == null) { // the column is missing and always null, which is never less than a // value. for all x, null is never < x. return BLOCK_CANNOT_MATCH; } Statistics<T> stats = meta.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } if (isAllNulls(meta)) { // we are looking for records where v < someValue // this chunk is all nulls, so we can drop it return BLOCK_CANNOT_MATCH; } if (!stats.hasNonNullValue()) { // stats does not contain min/max values, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } T value = lt.getValue(); // drop if value <= min return stats.compareMinToValue(value) >= 0; }
@Override @SuppressWarnings("unchecked") public <T extends Comparable<T>> Boolean visit(GtEq<T> gtEq) { Column<T> filterColumn = gtEq.getColumn(); ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath()); if (meta == null) { // the column is missing and always null, which is never greater than or // equal to a value. for all x, null is never >= x. return BLOCK_CANNOT_MATCH; } Statistics<T> stats = meta.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } if (isAllNulls(meta)) { // we are looking for records where v >= someValue // this chunk is all nulls, so we can drop it return BLOCK_CANNOT_MATCH; } if (!stats.hasNonNullValue()) { // stats does not contain min/max values, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } T value = gtEq.getValue(); // drop if value > max return stats.compareMaxToValue(value) < 0; }
@Override @SuppressWarnings("unchecked") public <T extends Comparable<T>> Boolean visit(Gt<T> gt) { Column<T> filterColumn = gt.getColumn(); ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath()); if (meta == null) { // the column is missing and always null, which is never greater than a // value. for all x, null is never > x. return BLOCK_CANNOT_MATCH; } Statistics<T> stats = meta.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } if (isAllNulls(meta)) { // we are looking for records where v > someValue // this chunk is all nulls, so we can drop it return BLOCK_CANNOT_MATCH; } if (!stats.hasNonNullValue()) { // stats does not contain min/max values, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } T value = gt.getValue(); // drop if value >= max return stats.compareMaxToValue(value) <= 0; }
public static Statistics toParquetStatistics( org.apache.parquet.column.statistics.Statistics stats) { Statistics formatStats = new Statistics(); // Don't write stats larger than the max size rather than truncating. The // rationale is that some engines may use the minimum value in the page as // the true minimum for aggregations and there is no way to mark that a // value has been truncated and is a lower bound and not in the page. if (!stats.isEmpty() && stats.isSmallerThan(MAX_STATS_SIZE)) { formatStats.setNull_count(stats.getNumNulls()); if (stats.hasNonNullValue()) { byte[] min = stats.getMinBytes(); byte[] max = stats.getMaxBytes(); // Fill the former min-max statistics only if the comparison logic is // signed so the logic of V1 and V2 stats are the same (which is // trivially true for equal min-max values) if (sortOrder(stats.type()) == SortOrder.SIGNED || Arrays.equals(min, max)) { formatStats.setMin(min); formatStats.setMax(max); } if (isMinMaxStatsSupported(stats.type()) || Arrays.equals(min, max)) { formatStats.setMin_value(min); formatStats.setMax_value(max); } } } return formatStats; }
@Override @SuppressWarnings("unchecked") public <T extends Comparable<T>> Boolean visit(Gt<T> gt) { Column<T> filterColumn = gt.getColumn(); ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath()); if (meta == null) { // the column is missing and always null, which is never greater than a // value. for all x, null is never > x. return BLOCK_CANNOT_MATCH; } Statistics<T> stats = meta.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } if (isAllNulls(meta)) { // we are looking for records where v > someValue // this chunk is all nulls, so we can drop it return BLOCK_CANNOT_MATCH; } if (!stats.hasNonNullValue()) { // stats does not contain min/max values, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } T value = gt.getValue(); // drop if value >= max return stats.compareMaxToValue(value) <= 0; }
/**
 * Records one more page in this builder from the given statistics.
 *
 * <p>A page whose statistics hold no non-null value is flagged as a null page. Otherwise its
 * min/max are stored, its page index is remembered, and the running min/max size is
 * increased. The null count is recorded and the page counter advanced in both cases.
 *
 * @param stats
 *          the statistics to be added
 */
public void add(Statistics<?> stats) {
  boolean nullPage = !stats.hasNonNullValue();
  nullPages.add(nullPage);

  if (!nullPage) {
    Object lower = stats.genericGetMin();
    Object upper = stats.genericGetMax();
    addMinMax(lower, upper);
    pageIndexes.add(nextPageIndex);
    minMaxSize += sizeOf(lower);
    minMaxSize += sizeOf(upper);
  }

  nullCounts.add(stats.getNumNulls());
  nextPageIndex++;
}