/**
 * NE (!=) predicate.
 *
 * Builds a comparison predicate whose evaluator classifies a pair of column statistics:
 * when the two [min, max] ranges do not overlap the result is delegated to
 * {@code checkNull(leftStat, rightStat)}; when both sides hold one and the same single
 * value the result is {@code RowsMatch.NONE}; in every other case it is
 * {@code RowsMatch.SOME}.
 *
 * @param left  left operand of the comparison
 * @param right right operand of the comparison
 * @return predicate that evaluates {@code left != right} against statistics
 */
private static <C extends Comparable<C>> LogicalExpression createNEPredicate(
    LogicalExpression left,
    LogicalExpression right
) {
  return new ParquetComparisonPredicate<C>(left, right, (leftStat, rightStat) -> {
    // Disjoint ranges: no value on one side can equal a value on the other side.
    if (leftStat.compareMaxToValue(rightStat.genericGetMin()) < 0
        || rightStat.compareMaxToValue(leftStat.genericGetMin()) < 0) {
      return checkNull(leftStat, rightStat);
    }

    boolean sameBounds = leftStat.compareMaxToValue(rightStat.genericGetMax()) == 0
        && leftStat.compareMinToValue(rightStat.genericGetMin()) == 0;

    // Identical bounds prove that every value is equal only when the shared range is a
    // single point (min == max). Two operands that both span e.g. [1, 5] may still differ
    // row by row, so such a pair must not be reported as NONE.
    boolean allValuesEqual = sameBounds
        && leftStat.compareMinToValue(leftStat.genericGetMax()) == 0;

    return allValuesEqual ? RowsMatch.NONE : RowsMatch.SOME;
  });
}
/**
 * Renders these statistics as a one-line summary.
 *
 * The text reports min/max and the null count when they are available, says which part
 * is not defined when only one of them is, and falls back to a fixed message when the
 * statistics are empty.
 *
 * @return human-readable description of the statistics
 */
@Override
public String toString() {
  if (!this.hasNonNullValue()) {
    // No min/max: either nothing at all is known, or only the null count is.
    if (this.isEmpty()) {
      return "no stats for this column";
    }
    return String.format("num_nulls: %d, min/max not defined", this.getNumNulls());
  }

  if (!isNumNullsSet()) {
    return String.format("min: %s, max: %s, num_nulls not defined", minAsString(), maxAsString());
  }
  return String.format("min: %s, max: %s, num_nulls: %d", minAsString(), maxAsString(), this.getNumNulls());
}
/**
 * Compares this statistics object with another object for equality.
 *
 * Two statistics objects are equal when their types are equal, their max bytes and min
 * bytes have the same content, and their null counts are the same.
 *
 * @param other Object to compare against
 * @return true if objects are equal, false otherwise
 */
@Override
public boolean equals(Object other) {
  if (this == other) {
    return true;
  }
  if (!(other instanceof Statistics)) {
    return false;
  }

  Statistics that = (Statistics) other;
  if (!type.equals(that.type)) {
    return false;
  }
  if (!Arrays.equals(that.getMaxBytes(), this.getMaxBytes())) {
    return false;
  }
  if (!Arrays.equals(that.getMinBytes(), this.getMinBytes())) {
    return false;
  }
  return that.getNumNulls() == this.getNumNulls();
}
/**
 * Formats the minimum value as text, intended for debugging and logging.
 *
 * The value obtained from {@code genericGetMin()} is passed through {@code stringify}.
 *
 * @return the min value as a string
 */
public String minAsString() {
  return this.stringify(this.genericGetMin());
}
/**
 * Creates the statistics object described by this builder.
 *
 * A fresh instance is obtained from {@code createStats(type)}. Min and max are applied
 * only when both are present; the null count is always copied over.
 *
 * @return the newly built statistics
 */
public Statistics<?> build() {
  final Statistics<?> built = createStats(type);

  final boolean hasBothBounds = min != null && max != null;
  if (hasBothBounds) {
    built.setMinMaxFromBytes(min, max);
  }

  built.num_nulls = this.numNulls;
  return built;
}
} // closes the enclosing type, whose header lies outside this excerpt
/**
 * Formats the maximum value as text, intended for debugging and logging.
 *
 * The value obtained from {@code genericGetMax()} is passed through {@code stringify}.
 *
 * @return the max value as a string
 */
public String maxAsString() {
  return this.stringify(this.genericGetMax());
}
/**
 * Appends the data of one statistics object to this builder.
 *
 * Every call records a null-page flag and a null count and advances the page index.
 * When the statistics carry a non-null value, their min/max are also handed to
 * {@code addMinMax}, the current page index is remembered, and the sizes of both
 * values are added to the running min/max size.
 *
 * @param stats
 *          the statistics to be added
 */
public void add(Statistics<?> stats) {
  final boolean hasValues = stats.hasNonNullValue();
  // A page is flagged as a "null page" exactly when it has no non-null value.
  nullPages.add(!hasValues);

  if (hasValues) {
    Object lower = stats.genericGetMin();
    Object upper = stats.genericGetMax();
    addMinMax(lower, upper);
    pageIndexes.add(nextPageIndex);
    minMaxSize += sizeOf(lower);
    minMaxSize += sizeOf(upper);
  }

  nullCounts.add(stats.getNumNulls());
  nextPageIndex++;
}
public static Statistics toParquetStatistics( org.apache.parquet.column.statistics.Statistics stats) { Statistics formatStats = new Statistics(); // Don't write stats larger than the max size rather than truncating. The // rationale is that some engines may use the minimum value in the page as // the true minimum for aggregations and there is no way to mark that a // value has been truncated and is a lower bound and not in the page. if (!stats.isEmpty() && stats.isSmallerThan(MAX_STATS_SIZE)) { formatStats.setNull_count(stats.getNumNulls()); if (stats.hasNonNullValue()) { byte[] min = stats.getMinBytes(); byte[] max = stats.getMaxBytes(); // Fill the former min-max statistics only if the comparison logic is // signed so the logic of V1 and V2 stats are the same (which is // trivially true for equal min-max values) if (sortOrder(stats.type()) == SortOrder.SIGNED || Arrays.equals(min, max)) { formatStats.setMin(min); formatStats.setMax(max); } if (isMinMaxStatsSupported(stats.type()) || Arrays.equals(min, max)) { formatStats.setMin_value(min); formatStats.setMax_value(max); } } } return formatStats; }
if (stats.isEmpty()) { if (stats.isNumNullsSet() && hasNulls(meta)) { if (!stats.hasNonNullValue()) { return stats.compareMinToValue(value) == 0 && stats.compareMaxToValue(value) == 0;
if (stats.isEmpty()) { if (!stats.hasNonNullValue()) { new org.apache.parquet.filter2.predicate.Statistics<T>(stats.genericGetMin(), stats.genericGetMax(), stats.comparator());
/** * Method to merge this statistics object with the object passed * as parameter. Merging keeps the smallest of min values, largest of max * values and combines the number of null counts. * @param stats Statistics object to merge with */ public void mergeStatistics(Statistics stats) { if (stats.isEmpty()) return; // Merge stats only if they have the same type if (type.equals(stats.type)) { incrementNumNulls(stats.getNumNulls()); if (stats.hasNonNullValue()) { mergeStatisticsMinMax(stats); markAsNotEmpty(); } } else { throw StatisticsClassException.create(this, stats); } }
@Override @SuppressWarnings("unchecked") public <T extends Comparable<T>> Boolean visit(Lt<T> lt) { Column<T> filterColumn = lt.getColumn(); ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath()); if (meta == null) { // the column is missing and always null, which is never less than a // value. for all x, null is never < x. return BLOCK_CANNOT_MATCH; } Statistics<T> stats = meta.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } if (isAllNulls(meta)) { // we are looking for records where v < someValue // this chunk is all nulls, so we can drop it return BLOCK_CANNOT_MATCH; } if (!stats.hasNonNullValue()) { // stats does not contain min/max values, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } T value = lt.getValue(); // drop if value <= min return stats.compareMinToValue(value) >= 0; }
@Override @SuppressWarnings("unchecked") public <T extends Comparable<T>> Boolean visit(Gt<T> gt) { Column<T> filterColumn = gt.getColumn(); ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath()); if (meta == null) { // the column is missing and always null, which is never greater than a // value. for all x, null is never > x. return BLOCK_CANNOT_MATCH; } Statistics<T> stats = meta.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } if (isAllNulls(meta)) { // we are looking for records where v > someValue // this chunk is all nulls, so we can drop it return BLOCK_CANNOT_MATCH; } if (!stats.hasNonNullValue()) { // stats does not contain min/max values, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } T value = gt.getValue(); // drop if value >= max return stats.compareMaxToValue(value) <= 0; }
/**
 * Creates decimal statistics whose min and max values are re-created using the given scale.
 *
 * Each bound's bytes are read as a {@link BigInteger}, wrapped in a {@link BigDecimal},
 * set to {@code scale} with {@link RoundingMode#HALF_UP}, and the resulting unscaled
 * value is written back as bytes. The null count is carried over unchanged.
 *
 * @param statistics statistics that needs to be adjusted
 * @param scale adjustment scale
 * @return adjusted statistics
 */
@SuppressWarnings("unchecked")
private Statistics<C> adjustDecimalStatistics(Statistics<C> statistics, int scale) {
  BigDecimal rescaledMin = new BigDecimal(new BigInteger(statistics.getMinBytes()))
      .setScale(scale, RoundingMode.HALF_UP);
  byte[] minBytes = rescaledMin.unscaledValue().toByteArray();

  BigDecimal rescaledMax = new BigDecimal(new BigInteger(statistics.getMaxBytes()))
      .setScale(scale, RoundingMode.HALF_UP);
  byte[] maxBytes = rescaledMax.unscaledValue().toByteArray();

  Statistics<?> adjusted = Statistics.getBuilderForReading(statistics.type())
      .withMin(minBytes)
      .withMax(maxBytes)
      .withNumNulls(statistics.getNumNulls())
      .build();
  return (Statistics<C>) adjusted;
}
@Override public <T> Boolean ltEq(BoundReference<T> ref, Literal<T> lit) { Integer id = ref.fieldId(); Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_CANNOT_MATCH; } Statistics<?> colStats = stats.get(id); if (colStats != null && !colStats.isEmpty()) { if (!colStats.hasNonNullValue()) { return ROWS_CANNOT_MATCH; } T lower = min(colStats, id); int cmp = lit.comparator().compare(lower, lit.value()); if (cmp > 0) { return ROWS_CANNOT_MATCH; } } return ROWS_MIGHT_MATCH; }
/**
 * Tells whether the given statistics report a null count of exactly zero.
 *
 * @param stat parquet column statistics
 * @return {@code true} if the reported number of nulls is zero and {@code false} otherwise
 */
static boolean hasNoNulls(Statistics stat) {
  final long nullCount = stat.getNumNulls();
  return nullCount == 0;
}
/**
 * Tells whether the given statistics are unusable, i.e. absent or empty.
 *
 * @param stat statistics object, may be {@code null}
 * @return {@code true} if {@code stat} is {@code null} or reports itself as empty;
 *         {@code false} if it is a non-empty statistics object
 */
static boolean isNullOrEmpty(Statistics stat) {
  return stat == null || stat.isEmpty();
}
@Override public <T> Boolean isNull(BoundReference<T> ref) { // no need to check whether the field is required because binding evaluates that case // if the column has no null values, the expression cannot match Integer id = ref.fieldId(); Preconditions.checkNotNull(struct.field(id), "Cannot filter by nested column: %s", schema.findField(id)); Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_MIGHT_MATCH; } Statistics<?> colStats = stats.get(id); if (colStats != null && !colStats.isEmpty() && colStats.getNumNulls() == 0) { // there are stats and no values are null => all values are non-null return ROWS_CANNOT_MATCH; } return ROWS_MIGHT_MATCH; }
Statistics stat = Statistics.getStatsBasedOnType(primitiveType); Statistics convertedStat = stat; stat.setNumNulls(numNulls); case DATE: convertedStat = new LongStatistics(); convertedStat.setNumNulls(stat.getNumNulls()); long minMS = convertToDrillDateValue(Integer.parseInt(min.toString())); long maxMS = convertToDrillDateValue(Integer.parseInt(max.toString())); .named("decimal_type"); convertedStat = Statistics.getBuilderForReading(decimalType) .withMin(minBytes) .withMax(maxBytes)
/**
 * Builds in-memory column statistics from their file-format representation.
 *
 * A statistics object matching {@code type} is always returned. When {@code statistics}
 * is {@code null} it is returned as created; otherwise min/max are copied in only if both
 * are set, and the null count is copied unconditionally.
 *
 * @param statistics format-level statistics, may be {@code null}
 * @param type primitive type of the column
 * @return the populated (or untouched, for {@code null} input) statistics object
 */
public static org.apache.parquet.column.statistics.Statistics<?> readStats(Statistics statistics, PrimitiveTypeName type) {
  org.apache.parquet.column.statistics.Statistics<?> result =
      org.apache.parquet.column.statistics.Statistics.getStatsBasedOnType(type);
  if (statistics == null) {
    return result;
  }

  boolean hasBothBounds = statistics.isSetMax() && statistics.isSetMin();
  if (hasBothBounds) {
    // NOTE(review): if min/max are java.nio.ByteBuffer, array() exposes the whole backing
    // array regardless of position/limit -- confirm the buffers are exactly sized.
    result.setMinMaxFromBytes(statistics.min.array(), statistics.max.array());
  }
  result.setNumNulls(statistics.null_count);
  return result;
}