/**
 * Converts statistics read from file metadata into the in-memory statistics
 * object that matches the given primitive type.
 *
 * @param statistics the metadata-format statistics; may be {@code null}
 * @param type the primitive type used to pick the statistics implementation
 * @return the statistics object created for {@code type}; min/max and the null
 *         count are only filled in when {@code statistics} is not {@code null}
 */
public static parquet.column.statistics.Statistics<?> readStats(Statistics statistics, PrimitiveTypeName type) {
  parquet.column.statistics.Statistics<?> result =
      parquet.column.statistics.Statistics.getStatsBasedOnType(type);
  if (statistics == null) {
    return result;
  }

  // min/max are applied only when the metadata carries both bounds
  boolean hasBothBounds = statistics.isSetMax() && statistics.isSetMin();
  if (hasBothBounds) {
    // NOTE(review): array() hands over the whole backing array; presumably the
    // buffers are exactly sized with no offset -- confirm against the reader.
    result.setMinMaxFromBytes(statistics.min.array(), statistics.max.array());
  }
  result.setNumNulls(statistics.null_count);
  return result;
}
throws ParquetCorruptionException if (statistics == null || statistics.isEmpty()) { return Domain.all(type); if (statistics.getNumNulls() == rowCount) { return Domain.onlyNull(type); boolean hasNullValue = statistics.getNumNulls() != 0L; if (statistics.genericGetMin() == null || statistics.genericGetMax() == null) { return Domain.create(ValueSet.all(type), hasNullValue);
@Override public <T extends Comparable<T>> Boolean visit(Eq<T> eq) { Column<T> filterColumn = eq.getColumn(); T value = eq.getValue(); ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath()); Statistics<T> stats = columnChunk.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return false; } if (value == null) { // we are looking for records where v eq(null) // so drop if there are no nulls in this chunk return !hasNulls(columnChunk); } if (isAllNulls(columnChunk)) { // we are looking for records where v eq(someNonNull) // and this is a column of all nulls, so drop it return true; } // drop if value < min || value > max return value.compareTo(stats.genericGetMin()) < 0 || value.compareTo(stats.genericGetMax()) > 0; }
/**
 * Folds the given statistics object into this one. The null counts are added
 * together and, when the other object holds a non-null value, its min/max are
 * merged into this object's min/max.
 *
 * @param stats Statistics object to merge with; an empty object is ignored
 * @throws StatisticsClassException if {@code stats} is not empty and is of a
 *         different class than this object
 */
public void mergeStatistics(Statistics stats) {
  if (stats.isEmpty()) {
    return;
  }

  // only statistics of exactly the same class can be combined
  if (this.getClass() != stats.getClass()) {
    throw new StatisticsClassException(this.getClass().toString(), stats.getClass().toString());
  }

  incrementNumNulls(stats.getNumNulls());
  if (!stats.hasNonNullValue()) {
    // nothing but nulls on the other side: there is no min/max to merge
    return;
  }
  mergeStatisticsMinMax(stats);
  markAsNotEmpty();
}
/**
 * Converts in-memory column statistics into the metadata-format statistics object.
 *
 * @param statistics the in-memory statistics to convert
 * @return a new statistics object; it is left untouched when {@code statistics}
 *         is empty, and min/max are only set when a non-null value was recorded
 */
public static Statistics toParquetStatistics(parquet.column.statistics.Statistics statistics) {
  Statistics converted = new Statistics();
  if (statistics.isEmpty()) {
    return converted;
  }

  converted.setNull_count(statistics.getNumNulls());
  if (statistics.hasNonNullValue()) {
    converted.setMax(statistics.getMaxBytes());
    converted.setMin(statistics.getMinBytes());
  }
  return converted;
}
/**
 * Replaces the current statistics with a new instance obtained for the type of
 * this object's column path.
 */
private void resetStatistics() {
  statistics = Statistics.getStatsBasedOnType(path.getType());
}
@Override public <T extends Comparable<T>> Boolean visit(LtEq<T> ltEq) { Column<T> filterColumn = ltEq.getColumn(); T value = ltEq.getValue(); ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath()); Statistics<T> stats = columnChunk.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return false; } if (isAllNulls(columnChunk)) { // we are looking for records where v <= someValue // this chunk is all nulls, so we can drop it return true; } // drop if value < min return value.compareTo(stats.genericGetMin()) < 0; }
@Override public <T extends Comparable<T>> Boolean visit(GtEq<T> gtEq) { Column<T> filterColumn = gtEq.getColumn(); T value = gtEq.getValue(); ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath()); Statistics<T> stats = columnChunk.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return false; } if (isAllNulls(columnChunk)) { // we are looking for records where v >= someValue // this chunk is all nulls, so we can drop it return true; } // drop if value >= max return value.compareTo(stats.genericGetMax()) > 0; }
/**
 * Compares this statistics object with another object for equality. Two
 * statistics objects are equal when their max bytes, their min bytes and
 * their null counts are all equal.
 *
 * <p>This overrides {@link Object#equals(Object)} (rather than overloading it
 * with a narrower parameter type) so that equality-based APIs such as
 * collections use this comparison instead of reference identity.
 *
 * @param stats object to compare against; may be {@code null}
 * @return true if {@code stats} is a Statistics object with equal content,
 *         false otherwise (including for {@code null} and foreign types)
 */
@Override
public boolean equals(Object stats) {
  if (stats == this) {
    return true;
  }
  // instanceof is false for null, so this also covers equals(null)
  if (!(stats instanceof Statistics)) {
    return false;
  }
  Statistics other = (Statistics) stats;
  return Arrays.equals(other.getMaxBytes(), this.getMaxBytes())
      && Arrays.equals(other.getMinBytes(), this.getMinBytes())
      && other.getNumNulls() == this.getNumNulls();
}
@Override public boolean matches(long numberOfRows, Map<ColumnDescriptor, Statistics<?>> statistics, ParquetDataSourceId id, boolean failOnCorruptedParquetStatistics) throws ParquetCorruptionException { if (numberOfRows == 0) { return false; } ImmutableMap.Builder<ColumnDescriptor, Domain> domains = ImmutableMap.builder(); for (RichColumnDescriptor column : columns) { Statistics<?> columnStatistics = statistics.get(column); Domain domain; Type type = getPrestoType(effectivePredicate, column); if (columnStatistics == null || columnStatistics.isEmpty()) { // no stats for column domain = Domain.all(type); } else { domain = getDomain(type, numberOfRows, columnStatistics, id, column.toString(), failOnCorruptedParquetStatistics); } domains.put(column, domain); } TupleDomain<ColumnDescriptor> stripeDomain = TupleDomain.withColumnDomains(domains.build()); return effectivePredicate.overlaps(stripeDomain); }
/**
 * Tells whether the statistics of the given column chunk report at least one null.
 *
 * @param column the column chunk metadata to inspect
 * @return {@code true} if the recorded null count is greater than zero
 */
private boolean hasNulls(ColumnChunkMetaData column) {
  long nullCount = column.getStatistics().getNumNulls();
  return nullCount > 0;
}
/**
 * Compresses a page, writes its data page header followed by the compressed
 * bytes into the buffer, and updates the running totals (lengths, value
 * count, page count, statistics and encodings).
 *
 * @param bytes the uncompressed page content
 * @param valueCount number of values in the page
 * @param statistics statistics of the page, merged into the running total
 * @param rlEncoding encoding recorded in the header as {@code rlEncoding}
 * @param dlEncoding encoding recorded in the header as {@code dlEncoding}
 * @param valuesEncoding encoding recorded in the header as {@code valuesEncoding}
 * @throws IOException if writing fails, or if the uncompressed or compressed
 *         page size does not fit in an {@code int}
 */
@Override
public void writePage(BytesInput bytes, int valueCount, Statistics statistics, Encoding rlEncoding, Encoding dlEncoding, Encoding valuesEncoding) throws IOException {
  long uncompressedSize = bytes.size();
  // The page header stores sizes as int; a plain narrowing cast would silently
  // wrap around and produce a corrupt header, so reject oversized pages.
  if (uncompressedSize > Integer.MAX_VALUE) {
    throw new IOException(
        "Cannot write page larger than Integer.MAX_VALUE bytes: " + uncompressedSize);
  }

  BytesInput compressedBytes = compressor.compress(bytes);
  long compressedSize = compressedBytes.size();
  if (compressedSize > Integer.MAX_VALUE) {
    throw new IOException(
        "Cannot write compressed page larger than Integer.MAX_VALUE bytes: " + compressedSize);
  }

  parquetMetadataConverter.writeDataPageHeader(
      (int) uncompressedSize,
      (int) compressedSize,
      valueCount,
      statistics,
      rlEncoding,
      dlEncoding,
      valuesEncoding,
      buf);

  this.uncompressedLength += uncompressedSize;
  this.compressedLength += compressedSize;
  this.totalValueCount += valueCount;
  this.pageCount += 1;
  this.totalStatistics.mergeStatistics(statistics);

  // the page body follows its header in the buffer
  compressedBytes.writeAllTo(buf);

  encodings.add(rlEncoding);
  encodings.add(dlEncoding);
  encodings.add(valuesEncoding);
}
@Override public <T extends Comparable<T>> Boolean visit(NotEq<T> notEq) { Column<T> filterColumn = notEq.getColumn(); T value = notEq.getValue(); ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath()); Statistics<T> stats = columnChunk.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return false; } if (value == null) { // we are looking for records where v notEq(null) // so, if this is a column of all nulls, we can drop it return isAllNulls(columnChunk); } if (hasNulls(columnChunk)) { // we are looking for records where v notEq(someNonNull) // but this chunk contains nulls, we cannot drop it return false; } // drop if this is a column where min = max = value return value.compareTo(stats.genericGetMin()) == 0 && value.compareTo(stats.genericGetMax()) == 0; }
/**
 * Discards the current statistics and installs a new instance obtained for the
 * type of this object's column path.
 */
private void resetStatistics() {
  statistics = Statistics.getStatsBasedOnType(path.getType());
}
@Override public <T extends Comparable<T>> Boolean visit(Lt<T> lt) { Column<T> filterColumn = lt.getColumn(); T value = lt.getValue(); ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath()); Statistics<T> stats = columnChunk.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return false; } if (isAllNulls(columnChunk)) { // we are looking for records where v < someValue // this chunk is all nulls, so we can drop it return true; } // drop if value <= min return value.compareTo(stats.genericGetMin()) <= 0; }
@Override public <T extends Comparable<T>> Boolean visit(Gt<T> gt) { Column<T> filterColumn = gt.getColumn(); T value = gt.getValue(); ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath()); Statistics<T> stats = columnChunk.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return false; } if (isAllNulls(columnChunk)) { // we are looking for records where v > someValue // this chunk is all nulls, so we can drop it return true; } // drop if value >= max return value.compareTo(stats.genericGetMax()) >= 0; }
/**
 * Computes the hash code of this statistics object from its max bytes, its
 * min bytes and its null count.
 *
 * @return hash code int
 */
public int hashCode() {
  int maxHash = Arrays.hashCode(getMaxBytes());
  int minHash = Arrays.hashCode(getMinBytes());
  long nullCount = this.getNumNulls();
  // same value as Long.hashCode(): upper and lower 32 bits xor-ed together
  int nullCountHash = (int) (nullCount ^ (nullCount >>> 32));
  return 31 * maxHash + 17 * minHash + nullCountHash;
}
@Override public boolean matches(long numberOfRows, Map<Integer, Statistics<?>> statisticsByColumnIndex) { if (numberOfRows == 0) { return false; } ImmutableMap.Builder<C, Domain> domains = ImmutableMap.builder(); for (ColumnReference<C> columnReference : columnReferences) { Statistics<?> statistics = statisticsByColumnIndex.get(columnReference.getOrdinal()); Domain domain; if (statistics == null || statistics.isEmpty()) { // no stats for column domain = Domain.all(columnReference.getType()); } else { domain = getDomain(columnReference.getType(), numberOfRows, statistics); } domains.put(columnReference.getColumn(), domain); } TupleDomain<C> stripeDomain = TupleDomain.withColumnDomains(domains.build()); return effectivePredicate.overlaps(stripeDomain); }
/**
 * Tells whether every value of the given column chunk is null, judged by
 * comparing the recorded null count with the chunk's value count.
 *
 * @param column the column chunk metadata to inspect
 * @return {@code true} if the null count equals the value count
 */
private boolean isAllNulls(ColumnChunkMetaData column) {
  long nullCount = column.getStatistics().getNumNulls();
  return nullCount == column.getValueCount();
}
this.totalValueCount += valueCount; this.pageCount += 1; this.totalStatistics.mergeStatistics(statistics);