@Override public boolean matches(long numberOfRows, Map<ColumnDescriptor, Statistics<?>> statistics, ParquetDataSourceId id, boolean failOnCorruptedParquetStatistics) throws ParquetCorruptionException { if (numberOfRows == 0) { return false; } ImmutableMap.Builder<ColumnDescriptor, Domain> domains = ImmutableMap.builder(); for (RichColumnDescriptor column : columns) { Statistics<?> columnStatistics = statistics.get(column); Domain domain; Type type = getPrestoType(effectivePredicate, column); if (columnStatistics == null || columnStatistics.isEmpty()) { // no stats for column domain = Domain.all(type); } else { domain = getDomain(type, numberOfRows, columnStatistics, id, column.toString(), failOnCorruptedParquetStatistics); } domains.put(column, domain); } TupleDomain<ColumnDescriptor> stripeDomain = TupleDomain.withColumnDomains(domains.build()); return effectivePredicate.overlaps(stripeDomain); }
throws ParquetCorruptionException if (statistics == null || statistics.isEmpty()) { return Domain.all(type);
/**
 * Merges the given statistics into this object. Per the contract of this method, merging
 * keeps the smallest min, the largest max, and accumulates the null counts.
 *
 * <p>Empty statistics are ignored. Min/max are merged (and this object is marked as
 * non-empty) only when {@code stats} reports a non-null value.
 *
 * @param stats statistics object to merge into this one
 * @throws StatisticsClassException if {@code stats} is non-empty and its runtime class
 *         differs from the runtime class of this object
 */
public void mergeStatistics(Statistics stats) {
  if (stats.isEmpty()) {
    return;
  }

  // Statistics of different concrete classes cannot be combined.
  if (this.getClass() != stats.getClass()) {
    throw new StatisticsClassException(this.getClass().toString(), stats.getClass().toString());
  }

  incrementNumNulls(stats.getNumNulls());
  if (stats.hasNonNullValue()) {
    mergeStatisticsMinMax(stats);
    markAsNotEmpty();
  }
}
@Override public boolean matches(long numberOfRows, Map<Integer, Statistics<?>> statisticsByColumnIndex) { if (numberOfRows == 0) { return false; } ImmutableMap.Builder<C, Domain> domains = ImmutableMap.builder(); for (ColumnReference<C> columnReference : columnReferences) { Statistics<?> statistics = statisticsByColumnIndex.get(columnReference.getOrdinal()); Domain domain; if (statistics == null || statistics.isEmpty()) { // no stats for column domain = Domain.all(columnReference.getType()); } else { domain = getDomain(columnReference.getType(), numberOfRows, statistics); } domains.put(columnReference.getColumn(), domain); } TupleDomain<C> stripeDomain = TupleDomain.withColumnDomains(domains.build()); return effectivePredicate.overlaps(stripeDomain); }
@Override public <T extends Comparable<T>> Boolean visit(Lt<T> lt) { Column<T> filterColumn = lt.getColumn(); T value = lt.getValue(); ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath()); Statistics<T> stats = columnChunk.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return false; } if (isAllNulls(columnChunk)) { // we are looking for records where v < someValue // this chunk is all nulls, so we can drop it return true; } // drop if value <= min return value.compareTo(stats.genericGetMin()) <= 0; }
@Override public <T extends Comparable<T>> Boolean visit(Gt<T> gt) { Column<T> filterColumn = gt.getColumn(); T value = gt.getValue(); ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath()); Statistics<T> stats = columnChunk.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return false; } if (isAllNulls(columnChunk)) { // we are looking for records where v > someValue // this chunk is all nulls, so we can drop it return true; } // drop if value >= max return value.compareTo(stats.genericGetMax()) >= 0; }
@Override public <T extends Comparable<T>> Boolean visit(LtEq<T> ltEq) { Column<T> filterColumn = ltEq.getColumn(); T value = ltEq.getValue(); ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath()); Statistics<T> stats = columnChunk.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return false; } if (isAllNulls(columnChunk)) { // we are looking for records where v <= someValue // this chunk is all nulls, so we can drop it return true; } // drop if value < min return value.compareTo(stats.genericGetMin()) < 0; }
@Override public <T extends Comparable<T>> Boolean visit(GtEq<T> gtEq) { Column<T> filterColumn = gtEq.getColumn(); T value = gtEq.getValue(); ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath()); Statistics<T> stats = columnChunk.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return false; } if (isAllNulls(columnChunk)) { // we are looking for records where v >= someValue // this chunk is all nulls, so we can drop it return true; } // drop if value >= max return value.compareTo(stats.genericGetMax()) > 0; }
@Override public <T extends Comparable<T>> Boolean visit(Eq<T> eq) { Column<T> filterColumn = eq.getColumn(); T value = eq.getValue(); ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath()); Statistics<T> stats = columnChunk.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return false; } if (value == null) { // we are looking for records where v eq(null) // so drop if there are no nulls in this chunk return !hasNulls(columnChunk); } if (isAllNulls(columnChunk)) { // we are looking for records where v eq(someNonNull) // and this is a column of all nulls, so drop it return true; } // drop if value < min || value > max return value.compareTo(stats.genericGetMin()) < 0 || value.compareTo(stats.genericGetMax()) > 0; }
@Override public <T extends Comparable<T>> Boolean visit(NotEq<T> notEq) { Column<T> filterColumn = notEq.getColumn(); T value = notEq.getValue(); ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath()); Statistics<T> stats = columnChunk.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return false; } if (value == null) { // we are looking for records where v notEq(null) // so, if this is a column of all nulls, we can drop it return isAllNulls(columnChunk); } if (hasNulls(columnChunk)) { // we are looking for records where v notEq(someNonNull) // but this chunk contains nulls, we cannot drop it return false; } // drop if this is a column where min = max = value return value.compareTo(stats.genericGetMin()) == 0 && value.compareTo(stats.genericGetMax()) == 0; }
private PageHeader newDataPageV2Header( int uncompressedSize, int compressedSize, int valueCount, int nullCount, int rowCount, parquet.column.statistics.Statistics<?> statistics, parquet.column.Encoding dataEncoding, int rlByteLength, int dlByteLength) { // TODO: pageHeader.crc = ...; DataPageHeaderV2 dataPageHeaderV2 = new DataPageHeaderV2( valueCount, nullCount, rowCount, getEncoding(dataEncoding), dlByteLength, rlByteLength); if (!statistics.isEmpty()) { dataPageHeaderV2.setStatistics(toParquetStatistics(statistics)); } PageHeader pageHeader = new PageHeader(PageType.DATA_PAGE_V2, uncompressedSize, compressedSize); pageHeader.setData_page_header_v2(dataPageHeaderV2); return pageHeader; }
/**
 * Converts column statistics into a new {@code Statistics} object.
 *
 * <p>Empty input yields a freshly constructed object with no setters called. Otherwise the
 * null count is copied, and max/min bytes are copied only when the input reports a
 * non-null value.
 *
 * @param statistics the column statistics to convert
 * @return a new {@code Statistics} instance, never {@code null}
 */
public static Statistics toParquetStatistics(parquet.column.statistics.Statistics statistics) {
  Statistics converted = new Statistics();
  if (statistics.isEmpty()) {
    return converted;
  }

  converted.setNull_count(statistics.getNumNulls());
  if (statistics.hasNonNullValue()) {
    converted.setMax(statistics.getMaxBytes());
    converted.setMin(statistics.getMinBytes());
  }
  return converted;
}
private PageHeader newDataPageHeader( int uncompressedSize, int compressedSize, int valueCount, parquet.column.statistics.Statistics statistics, parquet.column.Encoding rlEncoding, parquet.column.Encoding dlEncoding, parquet.column.Encoding valuesEncoding) { PageHeader pageHeader = new PageHeader(PageType.DATA_PAGE, uncompressedSize, compressedSize); // TODO: pageHeader.crc = ...; pageHeader.setData_page_header(new DataPageHeader( valueCount, getEncoding(valuesEncoding), getEncoding(dlEncoding), getEncoding(rlEncoding))); if (!statistics.isEmpty()) { pageHeader.getData_page_header().setStatistics(toParquetStatistics(statistics)); } return pageHeader; }
private void addRowGroup(ParquetMetadata parquetMetadata, List<RowGroup> rowGroups, BlockMetaData block) { //rowGroup.total_byte_size = ; List<ColumnChunkMetaData> columns = block.getColumns(); List<ColumnChunk> parquetColumns = new ArrayList<ColumnChunk>(); for (ColumnChunkMetaData columnMetaData : columns) { ColumnChunk columnChunk = new ColumnChunk(columnMetaData.getFirstDataPageOffset()); // verify this is the right offset columnChunk.file_path = block.getPath(); // they are in the same file for now columnChunk.meta_data = new parquet.format.ColumnMetaData( getType(columnMetaData.getType()), toFormatEncodings(columnMetaData.getEncodings()), Arrays.asList(columnMetaData.getPath().toArray()), columnMetaData.getCodec().getParquetCompressionCodec(), columnMetaData.getValueCount(), columnMetaData.getTotalUncompressedSize(), columnMetaData.getTotalSize(), columnMetaData.getFirstDataPageOffset()); columnChunk.meta_data.dictionary_page_offset = columnMetaData.getDictionaryPageOffset(); if (!columnMetaData.getStatistics().isEmpty()) { columnChunk.meta_data.setStatistics(toParquetStatistics(columnMetaData.getStatistics())); } // columnChunk.meta_data.index_page_offset = ; // columnChunk.meta_data.key_value_metadata = ; // nothing yet parquetColumns.add(columnChunk); } RowGroup rowGroup = new RowGroup(parquetColumns, block.getTotalByteSize(), block.getRowCount()); rowGroups.add(rowGroup); }
private <T extends Comparable<T>, U extends UserDefinedPredicate<T>> Boolean visit(UserDefined<T, U> ud, boolean inverted) { Column<T> filterColumn = ud.getColumn(); ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath()); U udp = ud.getUserDefinedPredicate(); Statistics<T> stats = columnChunk.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return false; } if (isAllNulls(columnChunk)) { // there is no min max, there is nothing // else we can say about this chunk, we // cannot drop it. return false; } parquet.filter2.predicate.Statistics<T> udpStats = new parquet.filter2.predicate.Statistics<T>(stats.genericGetMin(), stats.genericGetMax()); if (inverted) { return udp.inverseCanDrop(udpStats); } else { return udp.canDrop(udpStats); } }
@VisibleForTesting public static Domain getDomain(Type type, long rowCount, Statistics<?> statistics) if (statistics == null || statistics.isEmpty()) { return Domain.all(type);