/** * @param stat statistics object * @return <tt>true</tt> if the input stat object has valid statistics; false otherwise */ static boolean isNullOrEmpty(Statistics stat) { return stat == null || stat.isEmpty(); }
@Override public <T> Boolean isNull(BoundReference<T> ref) { // no need to check whether the field is required because binding evaluates that case // if the column has no null values, the expression cannot match Integer id = ref.fieldId(); Preconditions.checkNotNull(struct.field(id), "Cannot filter by nested column: %s", schema.findField(id)); Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_MIGHT_MATCH; } Statistics<?> colStats = stats.get(id); if (colStats != null && !colStats.isEmpty() && colStats.getNumNulls() == 0) { // there are stats and no values are null => all values are non-null return ROWS_CANNOT_MATCH; } return ROWS_MIGHT_MATCH; }
@Override public String toString() { if (this.hasNonNullValue()) { if (isNumNullsSet()) { return String.format("min: %s, max: %s, num_nulls: %d", minAsString(), maxAsString(), this.getNumNulls()); } else { return String.format("min: %s, max: %s, num_nulls not defined", minAsString(), maxAsString()); } } else if (!this.isEmpty()) return String.format("num_nulls: %d, min/max not defined", this.getNumNulls()); else return "no stats for this column"; }
@Override public boolean matches(long numberOfRows, Map<ColumnDescriptor, Statistics<?>> statistics, ParquetDataSourceId id, boolean failOnCorruptedParquetStatistics) throws ParquetCorruptionException { if (numberOfRows == 0) { return false; } ImmutableMap.Builder<ColumnDescriptor, Domain> domains = ImmutableMap.builder(); for (RichColumnDescriptor column : columns) { Statistics<?> columnStatistics = statistics.get(column); Domain domain; Type type = getPrestoType(effectivePredicate, column); if (columnStatistics == null || columnStatistics.isEmpty()) { // no stats for column domain = Domain.all(type); } else { domain = getDomain(type, numberOfRows, columnStatistics, id, column.toString(), failOnCorruptedParquetStatistics); } domains.put(column, domain); } TupleDomain<ColumnDescriptor> stripeDomain = TupleDomain.withColumnDomains(domains.build()); return effectivePredicate.overlaps(stripeDomain); }
@Override public String toString() { if (this.hasNonNullValue()) { if (isNumNullsSet()) { return String.format("min: %s, max: %s, num_nulls: %d", minAsString(), maxAsString(), this.getNumNulls()); } else { return String.format("min: %s, max: %s, num_nulls not defined", minAsString(), maxAsString()); } } else if (!this.isEmpty()) return String.format("num_nulls: %d, min/max not defined", this.getNumNulls()); else return "no stats for this column"; }
@Override public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) { Integer id = ref.fieldId(); Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_CANNOT_MATCH; } Statistics<?> colStats = stats.get(id); if (colStats != null && !colStats.isEmpty()) { if (!colStats.hasNonNullValue()) { return ROWS_CANNOT_MATCH; } T lower = min(colStats, id); int cmp = lit.comparator().compare(lower, lit.value()); if (cmp >= 0) { return ROWS_CANNOT_MATCH; } } return ROWS_MIGHT_MATCH; }
@Override public <T> Boolean gtEq(BoundReference<T> ref, Literal<T> lit) { Integer id = ref.fieldId(); Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_CANNOT_MATCH; } Statistics<?> colStats = stats.get(id); if (colStats != null && !colStats.isEmpty()) { if (!colStats.hasNonNullValue()) { return ROWS_CANNOT_MATCH; } T upper = max(colStats, id); int cmp = lit.comparator().compare(upper, lit.value()); if (cmp < 0) { return ROWS_CANNOT_MATCH; } } return ROWS_MIGHT_MATCH; }
@Override public <T> Boolean ltEq(BoundReference<T> ref, Literal<T> lit) { Integer id = ref.fieldId(); Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_CANNOT_MATCH; } Statistics<?> colStats = stats.get(id); if (colStats != null && !colStats.isEmpty()) { if (!colStats.hasNonNullValue()) { return ROWS_CANNOT_MATCH; } T lower = min(colStats, id); int cmp = lit.comparator().compare(lower, lit.value()); if (cmp > 0) { return ROWS_CANNOT_MATCH; } } return ROWS_MIGHT_MATCH; }
@Override public <T> Boolean gt(BoundReference<T> ref, Literal<T> lit) { Integer id = ref.fieldId(); Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_CANNOT_MATCH; } Statistics<?> colStats = stats.get(id); if (colStats != null && !colStats.isEmpty()) { if (!colStats.hasNonNullValue()) { return ROWS_CANNOT_MATCH; } T upper = max(colStats, id); int cmp = lit.comparator().compare(upper, lit.value()); if (cmp <= 0) { return ROWS_CANNOT_MATCH; } } return ROWS_MIGHT_MATCH; }
@Override public boolean matches(long numberOfRows, Map<ColumnDescriptor, Statistics<?>> statistics, ParquetDataSourceId id, boolean failOnCorruptedParquetStatistics) throws ParquetCorruptionException { if (numberOfRows == 0) { return false; } if (effectivePredicate.isNone()) { return false; } Map<ColumnDescriptor, Domain> effectivePredicateDomains = effectivePredicate.getDomains() .orElseThrow(() -> new IllegalStateException("Effective predicate other than none should have domains")); for (RichColumnDescriptor column : columns) { Domain effectivePredicateDomain = effectivePredicateDomains.get(column); if (effectivePredicateDomain == null) { continue; } Statistics<?> columnStatistics = statistics.get(column); if (columnStatistics == null || columnStatistics.isEmpty()) { // no stats for column } else { Domain domain = getDomain(effectivePredicateDomain.getType(), numberOfRows, columnStatistics, id, column.toString(), failOnCorruptedParquetStatistics); if (effectivePredicateDomain.intersect(domain).isNone()) { return false; } } } return true; }
@Override public <T> Boolean eq(BoundReference<T> ref, Literal<T> lit) { Integer id = ref.fieldId(); Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_CANNOT_MATCH; } Statistics<?> colStats = stats.get(id); if (colStats != null && !colStats.isEmpty()) { if (!colStats.hasNonNullValue()) { return ROWS_CANNOT_MATCH; } T lower = min(colStats, id); int cmp = lit.comparator().compare(lower, lit.value()); if (cmp > 0) { return ROWS_CANNOT_MATCH; } T upper = max(colStats, id); cmp = lit.comparator().compare(upper, lit.value()); if (cmp < 0) { return ROWS_CANNOT_MATCH; } } return ROWS_MIGHT_MATCH; }
@Override public Statistics<T> visitFunctionHolderExpression(FunctionHolderExpression holderExpr, Void value) throws RuntimeException { FuncHolder funcHolder = holderExpr.getHolder(); if (! (funcHolder instanceof DrillSimpleFuncHolder)) { // Only Drill function is allowed. return null; } final String funcName = ((DrillSimpleFuncHolder) funcHolder).getRegisteredNames()[0]; if (FunctionReplacementUtils.isCastFunction(funcName)) { Statistics stat = holderExpr.args.get(0).accept(this, null); if (stat != null && ! stat.isEmpty()) { return evalCastFunc(holderExpr, stat); } } return null; }
private PageHeader newDataPageV2Header( int uncompressedSize, int compressedSize, int valueCount, int nullCount, int rowCount, org.apache.parquet.column.statistics.Statistics<?> statistics, org.apache.parquet.column.Encoding dataEncoding, int rlByteLength, int dlByteLength) { // TODO: pageHeader.crc = ...; DataPageHeaderV2 dataPageHeaderV2 = new DataPageHeaderV2( valueCount, nullCount, rowCount, getEncoding(dataEncoding), dlByteLength, rlByteLength); if (!statistics.isEmpty()) { dataPageHeaderV2.setStatistics( toParquetStatistics(statistics)); } PageHeader pageHeader = new PageHeader(PageType.DATA_PAGE_V2, uncompressedSize, compressedSize); pageHeader.setData_page_header_v2(dataPageHeaderV2); return pageHeader; }
/** * Method to merge this statistics object with the object passed * as parameter. Merging keeps the smallest of min values, largest of max * values and combines the number of null counts. * @param stats Statistics object to merge with */ public void mergeStatistics(Statistics stats) { if (stats.isEmpty()) return; // Merge stats only if they have the same type if (type.equals(stats.type)) { incrementNumNulls(stats.getNumNulls()); if (stats.hasNonNullValue()) { mergeStatisticsMinMax(stats); markAsNotEmpty(); } } else { throw StatisticsClassException.create(this, stats); } }
/** * Method to merge this statistics object with the object passed * as parameter. Merging keeps the smallest of min values, largest of max * values and combines the number of null counts. * @param stats Statistics object to merge with */ public void mergeStatistics(Statistics stats) { if (stats.isEmpty()) return; // Merge stats only if they have the same type if (type.equals(stats.type)) { incrementNumNulls(stats.getNumNulls()); if (stats.hasNonNullValue()) { mergeStatisticsMinMax(stats); markAsNotEmpty(); } } else { throw StatisticsClassException.create(this, stats); } }
@Override @SuppressWarnings("unchecked") public <T extends Comparable<T>> Boolean visit(LtEq<T> ltEq) { Column<T> filterColumn = ltEq.getColumn(); ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath()); if (meta == null) { // the column is missing and always null, which is never less than or // equal to a value. for all x, null is never <= x. return BLOCK_CANNOT_MATCH; } Statistics<T> stats = meta.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } if (isAllNulls(meta)) { // we are looking for records where v <= someValue // this chunk is all nulls, so we can drop it return BLOCK_CANNOT_MATCH; } if (!stats.hasNonNullValue()) { // stats does not contain min/max values, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } T value = ltEq.getValue(); // drop if value < min return stats.compareMinToValue(value) > 0; }
private PageHeader newDataPageHeader( int uncompressedSize, int compressedSize, int valueCount, org.apache.parquet.column.statistics.Statistics statistics, org.apache.parquet.column.Encoding rlEncoding, org.apache.parquet.column.Encoding dlEncoding, org.apache.parquet.column.Encoding valuesEncoding) { PageHeader pageHeader = new PageHeader(PageType.DATA_PAGE, uncompressedSize, compressedSize); // TODO: pageHeader.crc = ...; pageHeader.setData_page_header(new DataPageHeader( valueCount, getEncoding(valuesEncoding), getEncoding(dlEncoding), getEncoding(rlEncoding))); if (!statistics.isEmpty()) { pageHeader.getData_page_header().setStatistics(toParquetStatistics(statistics)); } return pageHeader; }
public static Statistics toParquetStatistics( org.apache.parquet.column.statistics.Statistics stats) { Statistics formatStats = new Statistics(); // Don't write stats larger than the max size rather than truncating. The // rationale is that some engines may use the minimum value in the page as // the true minimum for aggregations and there is no way to mark that a // value has been truncated and is a lower bound and not in the page. if (!stats.isEmpty() && stats.isSmallerThan(MAX_STATS_SIZE)) { formatStats.setNull_count(stats.getNumNulls()); if (stats.hasNonNullValue()) { byte[] min = stats.getMinBytes(); byte[] max = stats.getMaxBytes(); // Fill the former min-max statistics only if the comparison logic is // signed so the logic of V1 and V2 stats are the same (which is // trivially true for equal min-max values) if (sortOrder(stats.type()) == SortOrder.SIGNED || Arrays.equals(min, max)) { formatStats.setMin(min); formatStats.setMax(max); } if (isMinMaxStatsSupported(stats.type()) || Arrays.equals(min, max)) { formatStats.setMin_value(min); formatStats.setMax_value(max); } } } return formatStats; }
@Override @SuppressWarnings("unchecked") public <T extends Comparable<T>> Boolean visit(Gt<T> gt) { Column<T> filterColumn = gt.getColumn(); ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath()); if (meta == null) { // the column is missing and always null, which is never greater than a // value. for all x, null is never > x. return BLOCK_CANNOT_MATCH; } Statistics<T> stats = meta.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } if (isAllNulls(meta)) { // we are looking for records where v > someValue // this chunk is all nulls, so we can drop it return BLOCK_CANNOT_MATCH; } if (!stats.hasNonNullValue()) { // stats does not contain min/max values, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } T value = gt.getValue(); // drop if value >= max return stats.compareMaxToValue(value) <= 0; }
@Override @SuppressWarnings("unchecked") public <T extends Comparable<T>> Boolean visit(Gt<T> gt) { Column<T> filterColumn = gt.getColumn(); ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath()); if (meta == null) { // the column is missing and always null, which is never greater than a // value. for all x, null is never > x. return BLOCK_CANNOT_MATCH; } Statistics<T> stats = meta.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } if (isAllNulls(meta)) { // we are looking for records where v > someValue // this chunk is all nulls, so we can drop it return BLOCK_CANNOT_MATCH; } if (!stats.hasNonNullValue()) { // stats does not contain min/max values, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } T value = gt.getValue(); // drop if value >= max return stats.compareMaxToValue(value) <= 0; }