/**
 * Collects the per-column statistics for a single row group.
 *
 * @param blockMetadata row group whose column chunks are scanned
 * @param descriptorsByPath lookup from column path components to the column descriptor
 * @return descriptor -> statistics for every chunk that both carries statistics
 *         and has a matching descriptor; all other chunks are omitted
 */
private static Map<ColumnDescriptor, Statistics<?>> getStatistics(BlockMetaData blockMetadata, Map<List<String>, RichColumnDescriptor> descriptorsByPath)
{
    ImmutableMap.Builder<ColumnDescriptor, Statistics<?>> result = ImmutableMap.builder();
    for (ColumnChunkMetaData chunk : blockMetadata.getColumns()) {
        Statistics<?> chunkStatistics = chunk.getStatistics();
        if (chunkStatistics == null) {
            continue;
        }
        RichColumnDescriptor descriptor = descriptorsByPath.get(Arrays.asList(chunk.getPath().toArray()));
        if (descriptor != null) {
            result.put(descriptor, chunkStatistics);
        }
    }
    return result.build();
}
/**
 * Loads the dictionary page for at most one dictionary-encoded predicate column
 * in the given row group.
 *
 * @param blockMetadata row group whose column chunks are inspected
 * @param dataSource source used to read the raw chunk bytes
 * @param descriptorsByPath lookup from column path components to the column descriptor
 * @param parquetTupleDomain predicate domain used to decide which columns matter
 * @return descriptor -> dictionary descriptor for the first qualifying column, if any
 */
private static Map<ColumnDescriptor, DictionaryDescriptor> getDictionaries(BlockMetaData blockMetadata, ParquetDataSource dataSource, Map<List<String>, RichColumnDescriptor> descriptorsByPath, TupleDomain<ColumnDescriptor> parquetTupleDomain)
{
    ImmutableMap.Builder<ColumnDescriptor, DictionaryDescriptor> dictionaries = ImmutableMap.builder();
    for (ColumnChunkMetaData columnMetaData : blockMetadata.getColumns()) {
        RichColumnDescriptor descriptor = descriptorsByPath.get(Arrays.asList(columnMetaData.getPath().toArray()));
        if (descriptor != null) {
            // A dictionary is only useful when ALL pages of the chunk are
            // dictionary-encoded and the column participates in the predicate
            if (isOnlyDictionaryEncodingPages(columnMetaData.getEncodings()) && isColumnPredicate(descriptor, parquetTupleDomain)) {
                // Read the entire compressed chunk; the dictionary page sits at its start
                int totalSize = toIntExact(columnMetaData.getTotalSize());
                byte[] buffer = new byte[totalSize];
                dataSource.readFully(columnMetaData.getStartingPos(), buffer);
                Optional<DictionaryPage> dictionaryPage = readDictionaryPage(buffer, columnMetaData.getCodec());
                dictionaries.put(descriptor, new DictionaryDescriptor(descriptor, dictionaryPage));
                // NOTE(review): the loop stops after the first qualifying column,
                // presumably to bound I/O — other predicate columns never get a
                // dictionary loaded; confirm this is intentional
                break;
            }
        }
    }
    return dictionaries.build();
}
/**
 * Finds the chunk metadata for the given column in the current row group.
 *
 * @param columnDescriptor column to locate
 * @return the matching chunk metadata
 * @throws ParquetCorruptionException if the row group has no chunk for the column
 */
private ColumnChunkMetaData getColumnChunkMetaData(ColumnDescriptor columnDescriptor)
        throws IOException
{
    // Build the target path once instead of re-deriving it for every candidate
    ColumnPath targetPath = ColumnPath.get(columnDescriptor.getPath());
    for (ColumnChunkMetaData candidate : currentBlockMetadata.getColumns()) {
        if (candidate.getPath().equals(targetPath)) {
            return candidate;
        }
    }
    throw new ParquetCorruptionException("Metadata is missing for column: %s", columnDescriptor);
}
long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset(); if (firstDataPage >= start && firstDataPage < start + length) { footerBlocks.add(block);
/**
 * Maps each column ordinal in the row group to that chunk's statistics.
 * Ordinals whose chunks carry no statistics are omitted from the result.
 *
 * @param blockMetadata row group to inspect
 * @return ordinal -> statistics for every column that has statistics
 */
private static Map<Integer, Statistics<?>> getStatisticsByColumnOrdinal(BlockMetaData blockMetadata)
{
    ImmutableMap.Builder<Integer, Statistics<?>> statistics = ImmutableMap.builder();
    // Hoist the column list: the original called getColumns() twice per iteration
    List<ColumnChunkMetaData> columns = blockMetadata.getColumns();
    for (int ordinal = 0; ordinal < columns.size(); ordinal++) {
        Statistics<?> columnStatistics = columns.get(ordinal).getStatistics();
        if (columnStatistics != null) {
            statistics.put(ordinal, columnStatistics);
        }
    }
    return statistics.build();
}
/**
 * @return the starting pos of first column
 */
public long getStartingPos()
{
    // Delegates to the first column chunk.
    // NOTE(review): throws IndexOutOfBoundsException for a row group with no
    // columns — presumably callers guarantee at least one column; confirm
    return getColumns().get(0).getStartingPos();
}

@Override
/**
 * @return the compressed size of all columns
 */
public long getCompressedSize()
{
    long total = 0;
    for (ColumnChunkMetaData column : getColumns()) {
        total += column.getTotalSize();
    }
    return total;
}
}
@Override public List<BlockMetaData> visit(FilterCompat.FilterPredicateCompat filterPredicateCompat) { FilterPredicate filterPredicate = filterPredicateCompat.getFilterPredicate(); // check that the schema of the filter matches the schema of the file SchemaCompatibilityValidator.validate(filterPredicate, schema); List<BlockMetaData> filteredBlocks = new ArrayList<BlockMetaData>(); for (BlockMetaData block : blocks) { if (!StatisticsFilter.canDrop(filterPredicate, block.getColumns())) { filteredBlocks.add(block); } } return filteredBlocks; }
List<ColumnChunkMetaData> columns = block.getColumns(); if (columns.isEmpty()) { return Collections.emptyList(); for (ColumnChunkMetaData column : block.getColumns()) { long off = column.getFirstDataPageOffset(); long len = column.getTotalSize();
/**
 * get the standard deviation of the column chunk sizes.
 *
 * @param avgSize per-column average chunk size indexed by column ordinal,
 *                as produced by {@link #getAvgColumnChunkSize()}
 * @return per-column standard deviation of the chunk sizes across all row groups
 */
@Override
public double[] getColumnChunkSizeStdDev(double[] avgSize)
{
    // Java arrays are zero-initialized on allocation; the original's explicit
    // clearing loop was redundant
    double[] dev = new double[this.columnCount];
    // Accumulate squared deviations; assumes every row group lists its columns
    // in the same order, so ordinal i lines up with avgSize[i]
    for (BlockMetaData block : getBlocks()) {
        int i = 0;
        for (ColumnChunkMetaData column : block.getColumns()) {
            dev[i] += Math.pow(column.getTotalSize() - avgSize[i], 2);
            i++;
        }
    }
    long blockCount = this.getRowGroupCount();
    for (int i = 0; i < this.columnCount; ++i) {
        dev[i] = Math.sqrt(dev[i] / blockCount);
    }
    return dev;
}
false); for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) { if (block.getColumns().isEmpty()) { continue; for (ColumnChunkMetaData column : block.getColumns()) { long offset = column.getFirstDataPageOffset(); long size = column.getTotalSize();
/**
 * Looks up the chunk metadata for {@code columnDescriptor} within the current
 * row group by comparing column paths.
 *
 * @param columnDescriptor column to locate
 * @return the chunk metadata whose path matches the descriptor
 * @throws ParquetCorruptionException when no chunk in the row group matches
 */
private ColumnChunkMetaData getColumnChunkMetaData(ColumnDescriptor columnDescriptor)
        throws IOException
{
    for (ColumnChunkMetaData metadata : currentBlockMetadata.getColumns()) {
        boolean pathMatches = metadata.getPath().equals(ColumnPath.get(columnDescriptor.getPath()));
        if (pathMatches) {
            return metadata;
        }
    }
    throw new ParquetCorruptionException("Malformed Parquet file. Could not find column metadata %s", columnDescriptor);
}
/**
 * get the average column chunk size of all the row groups
 *
 * @return per-column average chunk size indexed by column ordinal; assumes
 *         every row group lists its columns in the same order
 */
@Override
public double[] getAvgColumnChunkSize()
{
    // Java arrays are zero-initialized on allocation; the original's explicit
    // clearing loop was redundant
    double[] sum = new double[this.columnCount];
    for (BlockMetaData block : getBlocks()) {
        int i = 0;
        for (ColumnChunkMetaData column : block.getColumns()) {
            sum[i] += column.getTotalSize();
            i++;
        }
    }
    long blockCount = this.getRowGroupCount();
    for (int i = 0; i < this.columnCount; ++i) {
        sum[i] /= blockCount;
    }
    return sum;
}
/**
 * Sums the compressed sizes of all column chunks, across the given row groups,
 * that belong to the requested projection.
 *
 * @param blocks row groups to measure
 * @param requestedSchema the projected schema, as a parseable message type string
 * @return total compressed bytes of the projected columns
 */
private static long end(List<BlockMetaData> blocks, String requestedSchema)
{
    MessageType requested = MessageTypeParser.parseMessageType(requestedSchema);
    long total = 0;
    for (BlockMetaData block : blocks) {
        for (ColumnChunkMetaData column : block.getColumns()) {
            if (requested.containsPath(column.getPath().toArray())) {
                total += column.getTotalSize();
            }
        }
    }
    return total;
}
for (int ordinal = 0; ordinal < blockMetadata.getColumns().size(); ordinal++) { ColumnChunkMetaData columnChunkMetaData = blockMetadata.getColumns().get(ordinal);
/**
 * Prints a one-line summary of a row group (record count, total byte size,
 * starting offset) followed by the per-column details.
 *
 * @param out destination writer
 * @param meta row group to describe
 * @param num optional ordinal shown in the header; omitted when null
 */
private static void showDetails(PrettyPrintWriter out, BlockMetaData meta, Long num)
{
    String label = (num == null) ? "" : " " + num;
    out.format("row group%s: RC:%d TS:%d OFFSET:%d%n",
            label, meta.getRowCount(), meta.getTotalByteSize(), meta.getStartingPos());
    out.rule('-');
    showDetails(out, meta.getColumns());
}
for (ColumnChunkMetaData mc : block.getColumns()) { ColumnPath pathKey = mc.getPath(); BenchmarkCounter.incrementTotalBytes(mc.getTotalSize());
/**
 * Builds a {@link ParquetInputSplit} covering this footer's row groups.
 * The split length counts only the compressed bytes of the columns present
 * in the requested projection; the split end is the end of the last row group.
 *
 * @param fileStatus file the split points into
 * @param requestedSchema projected schema, as a parseable message type string
 * @param readSupportMetadata read-support metadata
 *        (NOTE(review): appears unused in this body — confirm against callers)
 * @return the input split for this set of row groups
 * @throws IOException declared for callers; no I/O is performed here
 */
public ParquetInputSplit getParquetInputSplit(FileStatus fileStatus, String requestedSchema, Map<String, String> readSupportMetadata)
        throws IOException
{
    MessageType requested = MessageTypeParser.parseMessageType(requestedSchema);
    // Hoist the row-group list: the original re-fetched it on every access,
    // including once per iteration of the offsets loop
    List<BlockMetaData> rowGroups = this.getRowGroups();
    long length = 0;
    for (BlockMetaData block : rowGroups) {
        for (ColumnChunkMetaData column : block.getColumns()) {
            // Count only the columns the caller actually projects
            if (requested.containsPath(column.getPath().toArray())) {
                length += column.getTotalSize();
            }
        }
    }
    // The split ends where the last row group's data ends
    BlockMetaData lastRowGroup = rowGroups.get(this.getRowGroupCount() - 1);
    long end = lastRowGroup.getStartingPos() + lastRowGroup.getTotalByteSize();
    long[] rowGroupOffsets = new long[this.getRowGroupCount()];
    for (int i = 0; i < rowGroupOffsets.length; i++) {
        rowGroupOffsets[i] = rowGroups.get(i).getStartingPos();
    }
    return new ParquetInputSplit(
            fileStatus.getPath(),
            hdfsBlock.getOffset(),
            end,
            length,
            hdfsBlock.getHosts(),
            rowGroupOffsets);
}
}
/**
 * Converts one in-memory row group ({@link BlockMetaData}) into its Thrift
 * {@code RowGroup} representation and appends it to {@code rowGroups}.
 *
 * @param parquetMetadata file metadata
 *        (NOTE(review): appears unused in this body — confirm against callers)
 * @param rowGroups output list the converted row group is appended to
 * @param block row group to convert
 */
private void addRowGroup(ParquetMetadata parquetMetadata, List<RowGroup> rowGroups, BlockMetaData block)
{
    //rowGroup.total_byte_size = ;
    List<ColumnChunkMetaData> columns = block.getColumns();
    List<ColumnChunk> parquetColumns = new ArrayList<ColumnChunk>();
    for (ColumnChunkMetaData columnMetaData : columns) {
        ColumnChunk columnChunk = new ColumnChunk(columnMetaData.getFirstDataPageOffset()); // verify this is the right offset
        columnChunk.file_path = block.getPath(); // they are in the same file for now
        // Populate the Thrift ColumnMetaData field-by-field from the in-memory chunk
        columnChunk.meta_data = new parquet.format.ColumnMetaData(
                getType(columnMetaData.getType()),
                toFormatEncodings(columnMetaData.getEncodings()),
                Arrays.asList(columnMetaData.getPath().toArray()),
                columnMetaData.getCodec().getParquetCompressionCodec(),
                columnMetaData.getValueCount(),
                columnMetaData.getTotalUncompressedSize(),
                columnMetaData.getTotalSize(),
                columnMetaData.getFirstDataPageOffset());
        columnChunk.meta_data.dictionary_page_offset = columnMetaData.getDictionaryPageOffset();
        // Statistics are optional in the Thrift metadata; only set when present
        if (!columnMetaData.getStatistics().isEmpty()) {
            columnChunk.meta_data.setStatistics(toParquetStatistics(columnMetaData.getStatistics()));
        }
        // columnChunk.meta_data.index_page_offset = ;
        // columnChunk.meta_data.key_value_metadata = ; // nothing yet
        parquetColumns.add(columnChunk);
    }
    RowGroup rowGroup = new RowGroup(parquetColumns, block.getTotalByteSize(), block.getRowCount());
    rowGroups.add(rowGroup);
}
/**
 * Accumulates per-column statistics for every row group in the footer,
 * updating the static block/record counters along the way.
 *
 * @param footer file footer whose row groups are aggregated
 */
private static void add(ParquetMetadata footer)
{
    // The schema is a property of the file, not of a row group — hoist it
    // out of the loop instead of re-fetching it per row group
    MessageType schema = footer.getFileMetaData().getSchema();
    for (BlockMetaData blockMetaData : footer.getBlocks()) {
        ++ blockCount;
        recordCount += blockMetaData.getRowCount();
        for (ColumnChunkMetaData columnMetaData : blockMetaData.getColumns()) {
            ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
            add(
                    desc,
                    columnMetaData.getValueCount(),
                    columnMetaData.getTotalSize(),
                    columnMetaData.getTotalUncompressedSize(),
                    columnMetaData.getEncodings(),
                    columnMetaData.getStatistics());
        }
    }
}