/**
 * Collects the dictionary page of a predicate column in the given row group.
 *
 * <p>A column chunk qualifies when it is mapped in {@code descriptorsByPath}, its
 * encodings pass {@code isOnlyDictionaryEncodingPages}, and
 * {@code isColumnPredicate} accepts its descriptor for {@code parquetTupleDomain}.
 * For the first qualifying chunk the whole chunk ({@code getTotalSize()} bytes from
 * {@code getStartingPos()}) is read and its dictionary page is extracted.
 *
 * <p>The scan stops after the first qualifying chunk, so the result holds at most
 * one entry.
 * NOTE(review): stopping at the first match may be a deliberate limit on the amount
 * of data read per row group -- confirm before changing.
 *
 * @param blockMetadata metadata of the row group to inspect
 * @param dataSource source the column chunk bytes are read from
 * @param descriptorsByPath column descriptors keyed by column path
 * @param parquetTupleDomain predicate used to decide which columns matter
 * @return map from column descriptor to its dictionary descriptor (zero or one entry)
 */
private static Map<ColumnDescriptor, DictionaryDescriptor> getDictionaries(BlockMetaData blockMetadata, ParquetDataSource dataSource, Map<List<String>, RichColumnDescriptor> descriptorsByPath, TupleDomain<ColumnDescriptor> parquetTupleDomain) {
    ImmutableMap.Builder<ColumnDescriptor, DictionaryDescriptor> result = ImmutableMap.builder();
    for (ColumnChunkMetaData chunk : blockMetadata.getColumns()) {
        RichColumnDescriptor descriptor = descriptorsByPath.get(Arrays.asList(chunk.getPath().toArray()));
        if (descriptor == null) {
            continue;
        }
        if (!isOnlyDictionaryEncodingPages(chunk.getEncodings()) || !isColumnPredicate(descriptor, parquetTupleDomain)) {
            continue;
        }

        // Load the complete column chunk; the dictionary page is parsed out of it.
        byte[] chunkBytes = new byte[toIntExact(chunk.getTotalSize())];
        dataSource.readFully(chunk.getStartingPos(), chunkBytes);
        Optional<DictionaryPage> dictionaryPage = readDictionaryPage(chunkBytes, chunk.getCodec());
        result.put(descriptor, new DictionaryDescriptor(descriptor, dictionaryPage));

        // Only the first qualifying column is processed.
        break;
    }
    return result.build();
}
long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset(); if (firstDataPage >= start && firstDataPage < start + length) { footerBlocks.add(block);
ColumnPath columnPath = ColumnPath.get(path); PrimitiveTypeName primitiveTypeName = messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName(); ColumnChunkMetaData column = ColumnChunkMetaData.get( columnPath, primitiveTypeName,
/**
 * Gathers the statistics of every column chunk in the row group that both carries
 * statistics and is mapped in {@code descriptorsByPath}.
 *
 * @param blockMetadata metadata of the row group to inspect
 * @param descriptorsByPath column descriptors keyed by column path
 * @return map from column descriptor to that column chunk's statistics
 */
private static Map<ColumnDescriptor, Statistics<?>> getStatistics(BlockMetaData blockMetadata, Map<List<String>, RichColumnDescriptor> descriptorsByPath) {
    ImmutableMap.Builder<ColumnDescriptor, Statistics<?>> result = ImmutableMap.builder();
    for (ColumnChunkMetaData chunk : blockMetadata.getColumns()) {
        Statistics<?> chunkStatistics = chunk.getStatistics();
        if (chunkStatistics == null) {
            // Nothing recorded for this chunk.
            continue;
        }
        RichColumnDescriptor descriptor = descriptorsByPath.get(Arrays.asList(chunk.getPath().toArray()));
        if (descriptor == null) {
            // Column is not among the requested descriptors.
            continue;
        }
        result.put(descriptor, chunkStatistics);
    }
    return result.build();
}
/**
 * Reads the values of a primitive field through its column reader.
 *
 * <p>If the reader has no page reader attached, the field's column chunk is loaded
 * first: the current row group is validated to be non-empty, the chunk's bytes
 * ({@code getTotalSize()} bytes from {@code getStartingPos()}) are read from the data
 * source, and the pages parsed from them are handed to the reader.
 *
 * @param field the primitive field to read
 * @return the column chunk produced by the column reader
 * @throws IOException if reading or parsing the column chunk fails
 */
private ColumnChunk readPrimitive(PrimitiveField field) throws IOException {
    ColumnDescriptor columnDescriptor = field.getDescriptor();
    PrimitiveColumnReader reader = columnReaders[field.getId()];

    if (reader.getPageReader() == null) {
        validateParquet(currentBlockMetadata.getRowCount() > 0, "Row group has 0 rows");

        ColumnChunkMetaData chunkMetadata = getColumnChunkMetaData(columnDescriptor);
        long chunkStart = chunkMetadata.getStartingPos();
        int chunkSize = toIntExact(chunkMetadata.getTotalSize());

        // Pull the whole column chunk into memory before parsing its pages.
        byte[] chunkBytes = allocateBlock(chunkSize);
        dataSource.readFully(chunkStart, chunkBytes);

        ColumnChunkDescriptor chunkDescriptor = new ColumnChunkDescriptor(columnDescriptor, chunkMetadata, chunkSize);
        ParquetColumnChunk parquetChunk = new ParquetColumnChunk(chunkDescriptor, chunkBytes, 0);
        reader.setPageReader(parquetChunk.readAllPages());
    }

    return reader.readPrimitive(field);
}
/**
 * Prints a one-line summary of a column chunk: optional dotted path, type, codec,
 * dictionary page offset (DO), first data page offset (FPO), sizes (SZ: total /
 * uncompressed / ratio of uncompressed to total), value count (VC) and, when
 * present, the comma-separated encodings (ENC).
 *
 * @param out writer receiving the summary
 * @param meta column chunk metadata to describe
 * @param name whether to prefix the line with the column path
 */
private static void showDetails(PrettyPrintWriter out, ColumnChunkMetaData meta, boolean name) {
    long dictionaryOffset = meta.getDictionaryPageOffset();
    long firstDataPageOffset = meta.getFirstDataPageOffset();
    long totalSize = meta.getTotalSize();
    long uncompressedSize = meta.getTotalUncompressedSize();
    long valueCount = meta.getValueCount();
    // Floating-point division: uncompressed bytes per stored byte.
    double ratio = (double) uncompressedSize / totalSize;
    String encodingList = Joiner.on(',').skipNulls().join(meta.getEncodings());

    if (name) {
        String dottedPath = Joiner.on('.').skipNulls().join(meta.getPath());
        out.format("%s: ", dottedPath);
    }

    out.format(" %s", meta.getType());
    out.format(" %s", meta.getCodec());
    out.format(" DO:%d", dictionaryOffset);
    out.format(" FPO:%d", firstDataPageOffset);
    out.format(" SZ:%d/%d/%.2f", totalSize, uncompressedSize, ratio);
    out.format(" VC:%d", valueCount);
    if (!encodingList.isEmpty()) {
        out.format(" ENC:%s", encodingList);
    }
    out.println();
}
/**
 * Folds one file footer into the running totals.
 *
 * <p>For every row group the block counter is incremented and its row count is added
 * to the record counter; every column chunk is then passed on to the per-column
 * {@code add} overload together with its descriptor, value count, sizes, encodings
 * and statistics.
 *
 * @param footer footer whose row groups are accumulated
 */
private static void add(ParquetMetadata footer) {
    for (BlockMetaData rowGroup : footer.getBlocks()) {
        blockCount++;
        MessageType schema = footer.getFileMetaData().getSchema();
        recordCount += rowGroup.getRowCount();

        for (ColumnChunkMetaData chunk : rowGroup.getColumns()) {
            String[] path = chunk.getPath().toArray();
            ColumnDescriptor desc = schema.getColumnDescription(path);
            add(
                desc,
                chunk.getValueCount(),
                chunk.getTotalSize(),
                chunk.getTotalUncompressedSize(),
                chunk.getEncodings(),
                chunk.getStatistics());
        }
    }
}
long end = -1L; for (ColumnChunkMetaData column : block.getColumns()) { long off = column.getFirstDataPageOffset(); long len = column.getTotalSize(); begin = Math.min(begin, off); end = Math.max(end, off + len);
/**
 * Sums the total size of every column chunk, across all given row groups, whose path
 * is contained in the requested schema.
 *
 * @param blocks row groups to scan
 * @param requestedSchema textual message type listing the requested columns
 * @return accumulated {@code getTotalSize()} of the matching column chunks
 */
private static long end(List<BlockMetaData> blocks, String requestedSchema) {
    MessageType requested = MessageTypeParser.parseMessageType(requestedSchema);
    long total = 0;
    for (BlockMetaData rowGroup : blocks) {
        for (ColumnChunkMetaData chunk : rowGroup.getColumns()) {
            if (!requested.containsPath(chunk.getPath().toArray())) {
                continue;
            }
            total += chunk.getTotalSize();
        }
    }
    return total;
}
ConsecutiveChunkList currentChunks = null; for (ColumnChunkMetaData mc : block.getColumns()) { ColumnPath pathKey = mc.getPath(); BenchmarkCounter.incrementTotalBytes(mc.getTotalSize()); ColumnDescriptor columnDescriptor = paths.get(pathKey); if (columnDescriptor != null) { long startingPos = mc.getStartingPos(); allChunks.add(currentChunks); currentChunks.addChunk(new ChunkDescriptor(columnDescriptor, mc, startingPos, (int)mc.getTotalSize()));
DictionaryPage dictionaryPage = null; long valuesCountReadSoFar = 0; while (valuesCountReadSoFar < descriptor.metadata.getValueCount()) { PageHeader pageHeader = readPageHeader(); int uncompressedPageSize = pageHeader.getUncompressed_page_size(); if (valuesCountReadSoFar != descriptor.metadata.getValueCount()) { "Expected " + descriptor.metadata.getValueCount() + " values in column chunk at " + filePath + " offset " + descriptor.metadata.getFirstDataPageOffset() + " but got " + valuesCountReadSoFar + " values instead over " + pagesInChunk.size() + " pages ending at file offset " + (descriptor.fileOffset + pos())); BytesDecompressor decompressor = codecFactory.getDecompressor(descriptor.metadata.getCodec()); return new ColumnChunkPageReader(decompressor, pagesInChunk, dictionaryPage);
/**
 * Parses page headers from the column chunk until the data pages read so far account
 * for the value count recorded in the chunk metadata.
 *
 * <p>A dictionary page is remembered (more than one is reported as corruption), V1 and
 * V2 data pages are appended to the page list and their value counts accumulated, and
 * any other page type is skipped by its compressed size.
 *
 * @return a page reader over the collected data pages and optional dictionary page
 * @throws IOException if a page cannot be read or the chunk holds several dictionary pages
 */
public PageReader readAllPages() throws IOException {
    List<DataPage> dataPages = new ArrayList<>();
    DictionaryPage dictionary = null;
    long valuesRead = 0;

    while (valuesRead < descriptor.getColumnChunkMetaData().getValueCount()) {
        PageHeader header = readPageHeader();
        int uncompressedSize = header.getUncompressed_page_size();
        int compressedSize = header.getCompressed_page_size();

        switch (header.type) {
            case DICTIONARY_PAGE:
                if (dictionary != null) {
                    throw new ParquetCorruptionException("%s has more than one dictionary page in column chunk", descriptor.getColumnDescriptor());
                }
                dictionary = readDictionaryPage(header, uncompressedSize, compressedSize);
                break;
            case DATA_PAGE:
                valuesRead += readDataPageV1(header, uncompressedSize, compressedSize, dataPages);
                break;
            case DATA_PAGE_V2:
                valuesRead += readDataPageV2(header, uncompressedSize, compressedSize, dataPages);
                break;
            default:
                // Unhandled page type: step over its payload.
                skip(compressedSize);
                break;
        }
    }

    return new PageReader(descriptor.getColumnChunkMetaData().getCodec(), dataPages, dictionary);
}
/**
 * Adds up {@code getTotalSize()} over every column chunk of this block.
 *
 * @return the compressed size of all columns
 */
public long getCompressedSize() {
    long compressedBytes = 0;
    for (ColumnChunkMetaData column : getColumns()) {
        compressedBytes = compressedBytes + column.getTotalSize();
    }
    return compressedBytes;
}
}
/**
 * Tells whether the column chunk's statistics record at least one null value.
 *
 * @param column column chunk whose statistics are consulted
 * @return {@code true} if the recorded null count is greater than zero
 */
private boolean hasNulls(ColumnChunkMetaData column) {
    long nullCount = column.getStatistics().getNumNulls();
    return nullCount > 0;
}
/**
 * Finds the metadata of the column chunk in the current row group whose path matches
 * the given column descriptor.
 *
 * @param columnDescriptor descriptor of the column to look up
 * @return the metadata of the matching column chunk
 * @throws IOException (as {@code ParquetCorruptionException}) if the current row group
 *         has no column chunk with that path
 */
private ColumnChunkMetaData getColumnChunkMetaData(ColumnDescriptor columnDescriptor) throws IOException {
    // The wanted path does not change between iterations, so build it once
    // instead of once per column chunk.
    ColumnPath wantedPath = ColumnPath.get(columnDescriptor.getPath());
    for (ColumnChunkMetaData metadata : currentBlockMetadata.getColumns()) {
        if (metadata.getPath().equals(wantedPath)) {
            return metadata;
        }
    }
    throw new ParquetCorruptionException("Metadata is missing for column: %s", columnDescriptor);
}
/** * @return the offset of the first byte in the chunk */ public long getStartingPos() { long dictionaryPageOffset = getDictionaryPageOffset(); long firstDataPageOffset = getFirstDataPageOffset(); if (dictionaryPageOffset > 0 && dictionaryPageOffset < firstDataPageOffset) { // if there's a dictionary and it's before the first data page, start from there return dictionaryPageOffset; } return firstDataPageOffset; }
/**
 * Tells whether every value of the column chunk is null, i.e. the null count recorded
 * in its statistics equals its value count.
 *
 * @param column column chunk to check
 * @return {@code true} if the recorded null count equals the value count
 */
private boolean isAllNulls(ColumnChunkMetaData column) {
    long nullCount = column.getStatistics().getNumNulls();
    return nullCount == column.getValueCount();
}
/**
 * Delegates to the first column chunk of this block (index 0 of {@code getColumns()}).
 *
 * @return the starting pos of first column
 */
public long getStartingPos() {
    long firstColumnStart = getColumns().get(0).getStartingPos();
    return firstColumnStart;
}

@Override
private void addRowGroup(ParquetMetadata parquetMetadata, List<RowGroup> rowGroups, BlockMetaData block) { //rowGroup.total_byte_size = ; List<ColumnChunkMetaData> columns = block.getColumns(); List<ColumnChunk> parquetColumns = new ArrayList<ColumnChunk>(); for (ColumnChunkMetaData columnMetaData : columns) { ColumnChunk columnChunk = new ColumnChunk(columnMetaData.getFirstDataPageOffset()); // verify this is the right offset columnChunk.file_path = block.getPath(); // they are in the same file for now columnChunk.meta_data = new parquet.format.ColumnMetaData( getType(columnMetaData.getType()), toFormatEncodings(columnMetaData.getEncodings()), Arrays.asList(columnMetaData.getPath().toArray()), columnMetaData.getCodec().getParquetCompressionCodec(), columnMetaData.getValueCount(), columnMetaData.getTotalUncompressedSize(), columnMetaData.getTotalSize(), columnMetaData.getFirstDataPageOffset()); columnChunk.meta_data.dictionary_page_offset = columnMetaData.getDictionaryPageOffset(); if (!columnMetaData.getStatistics().isEmpty()) { columnChunk.meta_data.setStatistics(toParquetStatistics(columnMetaData.getStatistics())); } // columnChunk.meta_data.index_page_offset = ; // columnChunk.meta_data.key_value_metadata = ; // nothing yet parquetColumns.add(columnChunk); } RowGroup rowGroup = new RowGroup(parquetColumns, block.getTotalByteSize(), block.getRowCount()); rowGroups.add(rowGroup); }
long end = -1L; for (ColumnChunkMetaData column : block.getColumns()) { long offset = column.getFirstDataPageOffset(); long size = column.getTotalSize(); begin = Math.min(begin, offset); end = Math.max(end, offset + size);