/**
 * Collects the statistics of the column chunks in a block, keyed by column descriptor.
 *
 * <p>A chunk is skipped when its statistics are {@code null} or when its path has no
 * entry in {@code descriptorsByPath}.
 *
 * @param blockMetadata the block whose column chunks are inspected
 * @param descriptorsByPath lookup from a column path (as a list of path elements) to its descriptor
 * @return an immutable map from descriptor to the statistics of the matching chunk
 */
private static Map<ColumnDescriptor, Statistics<?>> getStatistics(BlockMetaData blockMetadata, Map<List<String>, RichColumnDescriptor> descriptorsByPath) {
  ImmutableMap.Builder<ColumnDescriptor, Statistics<?>> result = ImmutableMap.builder();
  for (ColumnChunkMetaData chunk : blockMetadata.getColumns()) {
    Statistics<?> chunkStatistics = chunk.getStatistics();
    if (chunkStatistics == null) {
      // nothing recorded for this chunk
      continue;
    }
    RichColumnDescriptor descriptor = descriptorsByPath.get(Arrays.asList(chunk.getPath().toArray()));
    if (descriptor == null) {
      // the caller did not supply a descriptor for this path
      continue;
    }
    result.put(descriptor, chunkStatistics);
  }
  return result.build();
}
/**
 * Tells whether the statistics of a column chunk report at least one null.
 *
 * @param column the column chunk to inspect
 * @return {@code true} when the null count in the chunk's statistics is greater than zero
 */
private boolean hasNulls(ColumnChunkMetaData column) {
  final boolean containsNulls = column.getStatistics().getNumNulls() > 0;
  return containsNulls;
}
/**
 * Collects the statistics of the column chunks in a block, keyed by the chunk's
 * position in {@code blockMetadata.getColumns()}.
 *
 * <p>Positions whose chunk has {@code null} statistics are left out of the map.
 *
 * @param blockMetadata the block whose column chunks are inspected
 * @return an immutable map from column position to that chunk's statistics
 */
private static Map<Integer, Statistics<?>> getStatisticsByColumnOrdinal(BlockMetaData blockMetadata) {
  ImmutableMap.Builder<Integer, Statistics<?>> byOrdinal = ImmutableMap.builder();
  for (int index = 0; index < blockMetadata.getColumns().size(); index++) {
    Statistics<?> chunkStatistics = blockMetadata.getColumns().get(index).getStatistics();
    if (chunkStatistics == null) {
      // nothing recorded for this position
      continue;
    }
    byOrdinal.put(index, chunkStatistics);
  }
  return byOrdinal.build();
}
/**
 * Tells whether every value of a column chunk is null according to its statistics.
 *
 * @param column the column chunk to inspect
 * @return {@code true} when the null count in the chunk's statistics equals the chunk's value count
 */
private boolean isAllNulls(ColumnChunkMetaData column) {
  final boolean onlyNulls = column.getStatistics().getNumNulls() == column.getValueCount();
  return onlyNulls;
}
@Override public <T extends Comparable<T>> Boolean visit(LtEq<T> ltEq) { Column<T> filterColumn = ltEq.getColumn(); T value = ltEq.getValue(); ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath()); Statistics<T> stats = columnChunk.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return false; } if (isAllNulls(columnChunk)) { // we are looking for records where v <= someValue // this chunk is all nulls, so we can drop it return true; } // drop if value < min return value.compareTo(stats.genericGetMin()) < 0; }
@Override public <T extends Comparable<T>> Boolean visit(Lt<T> lt) { Column<T> filterColumn = lt.getColumn(); T value = lt.getValue(); ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath()); Statistics<T> stats = columnChunk.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return false; } if (isAllNulls(columnChunk)) { // we are looking for records where v < someValue // this chunk is all nulls, so we can drop it return true; } // drop if value <= min return value.compareTo(stats.genericGetMin()) <= 0; }
@Override public <T extends Comparable<T>> Boolean visit(Gt<T> gt) { Column<T> filterColumn = gt.getColumn(); T value = gt.getValue(); ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath()); Statistics<T> stats = columnChunk.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return false; } if (isAllNulls(columnChunk)) { // we are looking for records where v > someValue // this chunk is all nulls, so we can drop it return true; } // drop if value >= max return value.compareTo(stats.genericGetMax()) >= 0; }
@Override public <T extends Comparable<T>> Boolean visit(GtEq<T> gtEq) { Column<T> filterColumn = gtEq.getColumn(); T value = gtEq.getValue(); ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath()); Statistics<T> stats = columnChunk.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return false; } if (isAllNulls(columnChunk)) { // we are looking for records where v >= someValue // this chunk is all nulls, so we can drop it return true; } // drop if value >= max return value.compareTo(stats.genericGetMax()) > 0; }
@Override public <T extends Comparable<T>> Boolean visit(Eq<T> eq) { Column<T> filterColumn = eq.getColumn(); T value = eq.getValue(); ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath()); Statistics<T> stats = columnChunk.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return false; } if (value == null) { // we are looking for records where v eq(null) // so drop if there are no nulls in this chunk return !hasNulls(columnChunk); } if (isAllNulls(columnChunk)) { // we are looking for records where v eq(someNonNull) // and this is a column of all nulls, so drop it return true; } // drop if value < min || value > max return value.compareTo(stats.genericGetMin()) < 0 || value.compareTo(stats.genericGetMax()) > 0; }
@Override public <T extends Comparable<T>> Boolean visit(NotEq<T> notEq) { Column<T> filterColumn = notEq.getColumn(); T value = notEq.getValue(); ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath()); Statistics<T> stats = columnChunk.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return false; } if (value == null) { // we are looking for records where v notEq(null) // so, if this is a column of all nulls, we can drop it return isAllNulls(columnChunk); } if (hasNulls(columnChunk)) { // we are looking for records where v notEq(someNonNull) // but this chunk contains nulls, we cannot drop it return false; } // drop if this is a column where min = max = value return value.compareTo(stats.genericGetMin()) == 0 && value.compareTo(stats.genericGetMax()) == 0; }
private void addRowGroup(ParquetMetadata parquetMetadata, List<RowGroup> rowGroups, BlockMetaData block) { //rowGroup.total_byte_size = ; List<ColumnChunkMetaData> columns = block.getColumns(); List<ColumnChunk> parquetColumns = new ArrayList<ColumnChunk>(); for (ColumnChunkMetaData columnMetaData : columns) { ColumnChunk columnChunk = new ColumnChunk(columnMetaData.getFirstDataPageOffset()); // verify this is the right offset columnChunk.file_path = block.getPath(); // they are in the same file for now columnChunk.meta_data = new parquet.format.ColumnMetaData( getType(columnMetaData.getType()), toFormatEncodings(columnMetaData.getEncodings()), Arrays.asList(columnMetaData.getPath().toArray()), columnMetaData.getCodec().getParquetCompressionCodec(), columnMetaData.getValueCount(), columnMetaData.getTotalUncompressedSize(), columnMetaData.getTotalSize(), columnMetaData.getFirstDataPageOffset()); columnChunk.meta_data.dictionary_page_offset = columnMetaData.getDictionaryPageOffset(); if (!columnMetaData.getStatistics().isEmpty()) { columnChunk.meta_data.setStatistics(toParquetStatistics(columnMetaData.getStatistics())); } // columnChunk.meta_data.index_page_offset = ; // columnChunk.meta_data.key_value_metadata = ; // nothing yet parquetColumns.add(columnChunk); } RowGroup rowGroup = new RowGroup(parquetColumns, block.getTotalByteSize(), block.getRowCount()); rowGroups.add(rowGroup); }
private <T extends Comparable<T>, U extends UserDefinedPredicate<T>> Boolean visit(UserDefined<T, U> ud, boolean inverted) { Column<T> filterColumn = ud.getColumn(); ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath()); U udp = ud.getUserDefinedPredicate(); Statistics<T> stats = columnChunk.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return false; } if (isAllNulls(columnChunk)) { // there is no min max, there is nothing // else we can say about this chunk, we // cannot drop it. return false; } parquet.filter2.predicate.Statistics<T> udpStats = new parquet.filter2.predicate.Statistics<T>(stats.genericGetMin(), stats.genericGetMax()); if (inverted) { return udp.inverseCanDrop(udpStats); } else { return udp.canDrop(udpStats); } }
/**
 * Accumulates one footer into the running totals.
 *
 * <p>For every block, increments {@code blockCount}, adds the block's row count to
 * {@code recordCount}, and passes each column chunk's descriptor, counts, sizes,
 * encodings and statistics to the per-column {@code add} overload.
 *
 * @param footer the file footer to accumulate
 */
private static void add(ParquetMetadata footer) {
  for (BlockMetaData block : footer.getBlocks()) {
    blockCount++;
    // looked up per block, so a footer without blocks never touches the file metadata
    MessageType fileSchema = footer.getFileMetaData().getSchema();
    recordCount += block.getRowCount();
    for (ColumnChunkMetaData chunk : block.getColumns()) {
      ColumnDescriptor descriptor = fileSchema.getColumnDescription(chunk.getPath().toArray());
      add(
          descriptor,
          chunk.getValueCount(),
          chunk.getTotalSize(),
          chunk.getTotalUncompressedSize(),
          chunk.getEncodings(),
          chunk.getStatistics());
    }
  }
}