/**
 * Tells whether the statistics of a column chunk report zero null values.
 *
 * @param stat parquet column statistics
 * @return {@code true} if the null count reported by the statistics is exactly zero,
 *         {@code false} otherwise
 */
static boolean hasNoNulls(Statistics stat) {
  final long nullCount = stat.getNumNulls();
  return nullCount == 0;
}
/**
 * Checks whether the statistics of a column chunk report that every row is null.
 *
 * <p>The {@code rowCount} argument is validated through {@code Preconditions.checkArgument};
 * a negative value is rejected with the message {@code "negative rowCount <n> is not valid"}.
 *
 * @param stat parquet column statistics
 * @param rowCount number of rows in the parquet file, must not be negative
 * @return {@code true} if the null count reported by the statistics equals {@code rowCount},
 *         {@code false} otherwise
 */
static boolean isAllNulls(Statistics stat, long rowCount) {
  // Hand the template and its argument to Preconditions separately instead of calling
  // String.format up front, so this method no longer formats a message on every valid call.
  Preconditions.checkArgument(rowCount >= 0, "negative rowCount %s is not valid", rowCount);
  return stat.getNumNulls() == rowCount;
}
/**
 * Reports whether the statistics of the given column chunk record at least one null.
 *
 * @param column metadata of the column chunk to inspect
 * @return {@code true} when the recorded null count is greater than zero
 */
private boolean hasNulls(ColumnChunkMetaData column) {
  final long nullCount = column.getStatistics().getNumNulls();
  return nullCount > 0;
}
/**
 * Determines from the column chunk's statistics whether any null value was counted.
 *
 * @param column metadata of the column chunk whose statistics are read
 * @return {@code true} if the statistics' null count is strictly positive
 */
private boolean hasNulls(ColumnChunkMetaData column) {
  final long countedNulls = column.getStatistics().getNumNulls();
  return countedNulls > 0;
}
/**
 * Computes the hash code of this statistics object from its type, its max bytes,
 * its min bytes and its null count.
 *
 * @return hash code int
 */
@Override
public int hashCode() {
  int hash = 31 * type.hashCode();
  hash += 31 * Arrays.hashCode(getMaxBytes());
  hash += 17 * Arrays.hashCode(getMinBytes());
  final long nullCount = this.getNumNulls();
  // Same value a boxed Long's hashCode() yields: upper and lower 32 bits xor-ed together.
  hash += (int) (nullCount ^ (nullCount >>> 32));
  return hash;
}
/**
 * Derives a hash code from the four components that {@code hashCode} combines here:
 * the type, the max bytes, the min bytes and the number of nulls.
 *
 * @return hash code int
 */
@Override
public int hashCode() {
  final int typePart = 31 * type.hashCode();
  final int maxPart = 31 * Arrays.hashCode(getMaxBytes());
  final int minPart = 17 * Arrays.hashCode(getMinBytes());
  final long nulls = this.getNumNulls();
  // Fold the 64-bit count into 32 bits exactly as a boxed Long's hashCode() does.
  final int nullsPart = (int) (nulls ^ (nulls >>> 32));
  return typePart + maxPart + minPart + nullsPart;
}
/**
 * Reports whether the null count in the column chunk's statistics equals the chunk's value count.
 *
 * @param column metadata of the column chunk to inspect
 * @return {@code true} when the statistics' null count equals {@code column.getValueCount()}
 */
private boolean isAllNulls(ColumnChunkMetaData column) {
  final long nullCount = column.getStatistics().getNumNulls();
  final long valueCount = column.getValueCount();
  return nullCount == valueCount;
}
/**
 * Compares the number of nulls recorded in the column chunk's statistics with the
 * number of values in the chunk.
 *
 * @param column metadata of the column chunk whose counts are compared
 * @return {@code true} if both counts are equal, {@code false} otherwise
 */
private boolean isAllNulls(ColumnChunkMetaData column) {
  final long countedNulls = column.getStatistics().getNumNulls();
  final long totalValues = column.getValueCount();
  return countedNulls == totalValues;
}
/**
 * Compares this statistics object with another object for equality.
 * Two statistics objects are equal when their types, max bytes, min bytes
 * and null counts all match.
 *
 * @param other Object to compare against
 * @return true if objects are equal, false otherwise
 */
@Override
public boolean equals(Object other) {
  if (this == other) {
    return true;
  }
  if (!(other instanceof Statistics)) {
    return false;
  }
  Statistics that = (Statistics) other;
  if (!type.equals(that.type)) {
    return false;
  }
  if (!Arrays.equals(that.getMaxBytes(), this.getMaxBytes())) {
    return false;
  }
  if (!Arrays.equals(that.getMinBytes(), this.getMinBytes())) {
    return false;
  }
  return that.getNumNulls() == this.getNumNulls();
}
@Override public <T> Boolean notNull(BoundReference<T> ref) { // no need to check whether the field is required because binding evaluates that case // if the column has no non-null values, the expression cannot match Integer id = ref.fieldId(); Preconditions.checkNotNull(struct.field(id), "Cannot filter by nested column: %s", schema.findField(id)); Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_CANNOT_MATCH; } Statistics<?> colStats = stats.get(id); if (colStats != null && valueCount - colStats.getNumNulls() == 0) { // (num nulls == value count) => all values are null => no non-null values return ROWS_CANNOT_MATCH; } return ROWS_MIGHT_MATCH; }
/**
 * Equality check between this statistics object and an arbitrary object.
 * The other object must be a {@code Statistics} whose type, max bytes, min bytes
 * and number of nulls are the same as this object's.
 *
 * @param other Object to compare against
 * @return true if objects are equal, false otherwise
 */
@Override
public boolean equals(Object other) {
  if (other == this) {
    return true;
  }
  if (other instanceof Statistics) {
    Statistics candidate = (Statistics) other;
    boolean sameType = type.equals(candidate.type);
    // The remaining comparisons only run while everything so far has matched.
    boolean sameMax = sameType && Arrays.equals(candidate.getMaxBytes(), this.getMaxBytes());
    boolean sameMin = sameMax && Arrays.equals(candidate.getMinBytes(), this.getMinBytes());
    return sameMin && candidate.getNumNulls() == this.getNumNulls();
  }
  return false;
}
/**
 * Renders these statistics as text.
 * With a non-null value present, min and max are printed together with the null count
 * (or a note that it is not defined). Without one, only the null count is printed,
 * unless the statistics are empty, in which case a fixed "no stats" text is returned.
 *
 * @return textual form of the statistics
 */
@Override
public String toString() {
  if (!this.hasNonNullValue()) {
    if (this.isEmpty()) {
      return "no stats for this column";
    }
    return String.format("num_nulls: %d, min/max not defined", this.getNumNulls());
  }
  if (!isNumNullsSet()) {
    return String.format("min: %s, max: %s, num_nulls not defined", minAsString(), maxAsString());
  }
  return String.format("min: %s, max: %s, num_nulls: %d",
      minAsString(), maxAsString(), this.getNumNulls());
}
/**
 * Builds the string form of these statistics. Four outcomes are possible:
 * min/max with the null count, min/max with the null count marked as not defined,
 * the null count alone, or a fixed text when there are no statistics at all.
 *
 * @return description of min, max and null count as far as they are available
 */
@Override
public String toString() {
  final String description;
  if (this.hasNonNullValue()) {
    description = isNumNullsSet()
        ? String.format("min: %s, max: %s, num_nulls: %d",
            minAsString(), maxAsString(), this.getNumNulls())
        : String.format("min: %s, max: %s, num_nulls not defined",
            minAsString(), maxAsString());
  } else if (this.isEmpty()) {
    description = "no stats for this column";
  } else {
    description = String.format("num_nulls: %d, min/max not defined", this.getNumNulls());
  }
  return description;
}
/**
 * Builds new decimal statistics whose min and max are the originals re-expressed at the given scale.
 *
 * <p>Each of the min and max byte arrays is read as a {@link BigInteger}, wrapped in a
 * {@link BigDecimal}, moved to {@code scale} with {@link RoundingMode#HALF_UP}, and the
 * resulting unscaled value is written back as bytes. The null count is carried over unchanged.
 *
 * @param statistics statistics that needs to be adjusted
 * @param scale adjustment scale
 * @return adjusted statistics
 */
@SuppressWarnings("unchecked")
private Statistics<C> adjustDecimalStatistics(Statistics<C> statistics, int scale) {
  BigDecimal originalMin = new BigDecimal(new BigInteger(statistics.getMinBytes()));
  byte[] rescaledMin = originalMin.setScale(scale, RoundingMode.HALF_UP)
      .unscaledValue()
      .toByteArray();

  BigDecimal originalMax = new BigDecimal(new BigInteger(statistics.getMaxBytes()));
  byte[] rescaledMax = originalMax.setScale(scale, RoundingMode.HALF_UP)
      .unscaledValue()
      .toByteArray();

  return (Statistics<C>) Statistics.getBuilderForReading(statistics.type())
      .withMin(rescaledMin)
      .withMax(rescaledMax)
      .withNumNulls(statistics.getNumNulls())
      .build();
}
@Override public <T> Boolean isNull(BoundReference<T> ref) { // no need to check whether the field is required because binding evaluates that case // if the column has no null values, the expression cannot match Integer id = ref.fieldId(); Preconditions.checkNotNull(struct.field(id), "Cannot filter by nested column: %s", schema.findField(id)); Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_MIGHT_MATCH; } Statistics<?> colStats = stats.get(id); if (colStats != null && !colStats.isEmpty() && colStats.getNumNulls() == 0) { // there are stats and no values are null => all values are non-null return ROWS_CANNOT_MATCH; } return ROWS_MIGHT_MATCH; }
@Override void writePage(int rowCount, int valueCount, Statistics<?> statistics, ValuesWriter repetitionLevels, ValuesWriter definitionLevels, ValuesWriter values) throws IOException { // TODO: rework this API. The bytes shall be retrieved before the encoding (encoding might be different otherwise) BytesInput bytes = values.getBytes(); Encoding encoding = values.getEncoding(); pageWriter.writePageV2( rowCount, Ints.checkedCast(statistics.getNumNulls()), valueCount, repetitionLevels.getBytes(), definitionLevels.getBytes(), encoding, bytes, statistics); } }
/** * Method to merge this statistics object with the object passed * as parameter. Merging keeps the smallest of min values, largest of max * values and combines the number of null counts. * @param stats Statistics object to merge with */ public void mergeStatistics(Statistics stats) { if (stats.isEmpty()) return; // Merge stats only if they have the same type if (type.equals(stats.type)) { incrementNumNulls(stats.getNumNulls()); if (stats.hasNonNullValue()) { mergeStatisticsMinMax(stats); markAsNotEmpty(); } } else { throw StatisticsClassException.create(this, stats); } }
/** * Method to merge this statistics object with the object passed * as parameter. Merging keeps the smallest of min values, largest of max * values and combines the number of null counts. * @param stats Statistics object to merge with */ public void mergeStatistics(Statistics stats) { if (stats.isEmpty()) return; // Merge stats only if they have the same type if (type.equals(stats.type)) { incrementNumNulls(stats.getNumNulls()); if (stats.hasNonNullValue()) { mergeStatisticsMinMax(stats); markAsNotEmpty(); } } else { throw StatisticsClassException.create(this, stats); } }
public static Statistics toParquetStatistics( org.apache.parquet.column.statistics.Statistics stats) { Statistics formatStats = new Statistics(); // Don't write stats larger than the max size rather than truncating. The // rationale is that some engines may use the minimum value in the page as // the true minimum for aggregations and there is no way to mark that a // value has been truncated and is a lower bound and not in the page. if (!stats.isEmpty() && stats.isSmallerThan(MAX_STATS_SIZE)) { formatStats.setNull_count(stats.getNumNulls()); if (stats.hasNonNullValue()) { byte[] min = stats.getMinBytes(); byte[] max = stats.getMaxBytes(); // Fill the former min-max statistics only if the comparison logic is // signed so the logic of V1 and V2 stats are the same (which is // trivially true for equal min-max values) if (sortOrder(stats.type()) == SortOrder.SIGNED || Arrays.equals(min, max)) { formatStats.setMin(min); formatStats.setMax(max); } if (isMinMaxStatsSupported(stats.type()) || Arrays.equals(min, max)) { formatStats.setMin_value(min); formatStats.setMax_value(max); } } } return formatStats; }
/**
 * Adds the data from the specified statistics to this builder.
 *
 * <p>Statistics without a non-null value are recorded as a null page. Otherwise the page is
 * recorded as a non-null page, its min and max are added, the current page index is stored and
 * the sizes of min and max are added to {@code minMaxSize}. In both cases the null count is
 * appended and the next page index is advanced by one.
 *
 * @param stats
 *          the statistics to be added
 */
public void add(Statistics<?> stats) {
  if (!stats.hasNonNullValue()) {
    nullPages.add(true);
  } else {
    nullPages.add(false);
    Object pageMin = stats.genericGetMin();
    Object pageMax = stats.genericGetMax();
    addMinMax(pageMin, pageMax);
    pageIndexes.add(nextPageIndex);
    minMaxSize += sizeOf(pageMin);
    minMaxSize += sizeOf(pageMax);
  }
  nullCounts.add(stats.getNumNulls());
  nextPageIndex++;
}