private static Map<ColumnDescriptor, Statistics<?>> getStatistics(BlockMetaData blockMetadata, Map<List<String>, RichColumnDescriptor> descriptorsByPath)
{
    ImmutableMap.Builder<ColumnDescriptor, Statistics<?>> statistics = ImmutableMap.builder();
    for (ColumnChunkMetaData columnMetaData : blockMetadata.getColumns()) {
        Statistics<?> columnStatistics = columnMetaData.getStatistics();
        if (columnStatistics != null) {
            RichColumnDescriptor descriptor = descriptorsByPath.get(Arrays.asList(columnMetaData.getPath().toArray()));
            if (descriptor != null) {
                statistics.put(descriptor, columnStatistics);
            }
        }
    }
    return statistics.build();
}
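A minimal sketch of how the map returned by getStatistics might be consumed, for example to surface per-column min/max for row-group pruning. The printing loop is illustrative, not part of the original code; the Statistics accessors (isEmpty, genericGetMin, genericGetMax, getNumNulls) are from the parquet-column Statistics class.

// Hypothetical consumer of getStatistics(...): dump per-column stats that a
// predicate-pushdown layer could use to decide whether to skip a row group.
Map<ColumnDescriptor, Statistics<?>> stats = getStatistics(blockMetadata, descriptorsByPath);
for (Map.Entry<ColumnDescriptor, Statistics<?>> entry : stats.entrySet()) {
    Statistics<?> s = entry.getValue();
    if (!s.isEmpty()) {
        System.out.printf("%s: min=%s max=%s nulls=%d%n",
                Arrays.toString(entry.getKey().getPath()),
                s.genericGetMin(), s.genericGetMax(), s.getNumNulls());
    }
}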
private boolean advanceToNextRowGroup()
{
    currentRowGroupMemoryContext.close();
    currentRowGroupMemoryContext = systemMemoryContext.newAggregatedMemoryContext();
    if (currentBlock == blocks.size()) {
        return false;
    }
    currentBlockMetadata = blocks.get(currentBlock);
    currentBlock = currentBlock + 1;
    nextRowInGroup = 0L;
    currentGroupRowCount = currentBlockMetadata.getRowCount();
    initializeColumnReaders();
    return true;
}
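A hypothetical driver loop for advanceToNextRowGroup(), reusing the same fields (nextRowInGroup, currentGroupRowCount); the batch size and the decode step are sketched assumptions, not taken from the source.

// Sketch: read the file row group by row group until none remain.
while (true) {
    if (nextRowInGroup >= currentGroupRowCount) {
        if (!advanceToNextRowGroup()) {
            break; // no more row groups in this file
        }
    }
    long batchSize = Math.min(1024, currentGroupRowCount - nextRowInGroup);
    // ... decode batchSize rows via the column readers initialized above ...
    nextRowInGroup += batchSize;
}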
if (rowGroups != null) {
    for (RowGroup rowGroup : rowGroups) {
        BlockMetaData blockMetaData = new BlockMetaData();
        blockMetaData.setRowCount(rowGroup.getNum_rows());
        blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
        List<ColumnChunk> columns = rowGroup.getColumns();
        validateParquet(!columns.isEmpty(), "No columns in row group: %s", rowGroup);
        // ... (truncated in the original snippet: a ColumnChunkMetaData named
        // "column" is built per chunk from its thrift ColumnMetaData, whose
        // trailing arguments were
        //     metaData.total_compressed_size,
        //     metaData.total_uncompressed_size);
        // and "filePath" comes from the chunks of this row group)
        blockMetaData.addColumn(column);
        blockMetaData.setPath(filePath);
        blocks.add(blockMetaData);
    }
}
List<ColumnChunkMetaData> columns = block.getColumns();
if (columns.isEmpty()) {
    return Collections.emptyList();
}
for (ColumnChunkMetaData column : block.getColumns()) {
    long off = column.getFirstDataPageOffset();
    long len = column.getTotalSize();
    // ... (truncated in the original snippet: off/len presumably accumulate
    // into the begin/end range tested below)
}
if (this.offset <= begin && end <= this.offset + this.fragmentSize && block.getRowCount() != 0) {
    if (LOG.isDebugEnabled()) {
        LOG.debug(MessageFormat.format(
            // (the format string continues in the companion fragment further below)
private static void showDetails(PrettyPrintWriter out, BlockMetaData meta, Long num) {
    long rows = meta.getRowCount();
    long tbs = meta.getTotalByteSize();
    long offset = meta.getStartingPos();

    out.format("row group%s: RC:%d TS:%d OFFSET:%d%n", (num == null ? "" : " " + num), rows, tbs, offset);
    out.rule('-');
    showDetails(out, meta.getColumns());
}
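Given the format string above, a row group of 1,000 rows and 51,211 total bytes starting at offset 4 would render roughly as follows (the numbers are illustrative; the dashed line comes from out.rule('-')):

row group 1: RC:1000 TS:51211 OFFSET:4
--------------------------------------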
private void addRowGroup(ParquetMetadata parquetMetadata, List<RowGroup> rowGroups, BlockMetaData block) {
    //rowGroup.total_byte_size = ;
    List<ColumnChunkMetaData> columns = block.getColumns();
    List<ColumnChunk> parquetColumns = new ArrayList<ColumnChunk>();
    for (ColumnChunkMetaData columnMetaData : columns) {
        ColumnChunk columnChunk = new ColumnChunk(columnMetaData.getFirstDataPageOffset()); // verify this is the right offset
        columnChunk.file_path = block.getPath(); // they are in the same file for now
        columnChunk.meta_data = new parquet.format.ColumnMetaData(
                getType(columnMetaData.getType()),
                toFormatEncodings(columnMetaData.getEncodings()),
                Arrays.asList(columnMetaData.getPath().toArray()),
                columnMetaData.getCodec().getParquetCompressionCodec(),
                columnMetaData.getValueCount(),
                columnMetaData.getTotalUncompressedSize(),
                columnMetaData.getTotalSize(),
                columnMetaData.getFirstDataPageOffset());
        columnChunk.meta_data.dictionary_page_offset = columnMetaData.getDictionaryPageOffset();
        if (!columnMetaData.getStatistics().isEmpty()) {
            columnChunk.meta_data.setStatistics(toParquetStatistics(columnMetaData.getStatistics()));
        }
        // columnChunk.meta_data.index_page_offset = ;
        // columnChunk.meta_data.key_value_metadata = ; // nothing yet
        parquetColumns.add(columnChunk);
    }
    RowGroup rowGroup = new RowGroup(parquetColumns, block.getTotalByteSize(), block.getRowCount());
    rowGroups.add(rowGroup);
}
/**
 * end a column (once all rep, def and data have been written)
 * @throws IOException
 */
public void endColumn() throws IOException {
    state = state.endColumn();
    if (DEBUG) LOG.debug(out.getPos() + ": end column");
    currentBlock.addColumn(ColumnChunkMetaData.get(
            currentChunkPath,
            currentChunkType,
            currentChunkCodec,
            currentEncodings,
            currentStatistics,
            currentChunkFirstDataPage,
            currentChunkDictionaryPageOffset,
            currentChunkValueCount,
            compressedLength,
            uncompressedLength));
    if (DEBUG) LOG.info("ended column chunk: " + currentColumn);
    currentColumn = null;
    this.currentBlock.setTotalByteSize(currentBlock.getTotalByteSize() + uncompressedLength);
    this.uncompressedLength = 0;
    this.compressedLength = 0;
}
public ParquetInputSplit getParquetInputSplit(FileStatus fileStatus, String requestedSchema, Map<String, String> readSupportMetadata) throws IOException {
    MessageType requested = MessageTypeParser.parseMessageType(requestedSchema);
    // count only the bytes of the column chunks the requested schema actually reads
    long length = 0;
    for (BlockMetaData block : this.getRowGroups()) {
        List<ColumnChunkMetaData> columns = block.getColumns();
        for (ColumnChunkMetaData column : columns) {
            if (requested.containsPath(column.getPath().toArray())) {
                length += column.getTotalSize();
            }
        }
    }
    BlockMetaData lastRowGroup = this.getRowGroups().get(this.getRowGroupCount() - 1);
    long end = lastRowGroup.getStartingPos() + lastRowGroup.getTotalByteSize();
    long[] rowGroupOffsets = new long[this.getRowGroupCount()];
    for (int i = 0; i < rowGroupOffsets.length; i++) {
        rowGroupOffsets[i] = this.getRowGroups().get(i).getStartingPos();
    }
    // hdfsBlock is presumably a field of the enclosing class (the HDFS block this split wraps)
    return new ParquetInputSplit(
            fileStatus.getPath(),
            hdfsBlock.getOffset(),
            end,
            length,
            hdfsBlock.getHosts(),
            rowGroupOffsets);
}
long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
if (firstDataPage >= start && firstDataPage < start + length) {
    splitGroup.add(block);
}
// ... (truncated in the original snippet between the filter above and the loop below)
for (int i = 0; i < splitGroup.size(); i++) {
    BlockMetaData block = splitGroup.get(i);
    offsets[i] = block.getStartingPos();
}
/**
 * start a block
 * @param recordCount the record count in this block
 * @throws IOException
 */
public void startBlock(long recordCount) throws IOException {
    state = state.startBlock();
    if (DEBUG) LOG.debug(out.getPos() + ": start block");
    // out.write(MAGIC); // TODO: add a magic delimiter
    currentBlock = new BlockMetaData();
    currentRecordCount = recordCount;
}
/**
 * ends a block once all column chunks have been written
 * @throws IOException
 */
public void endBlock() throws IOException {
    state = state.endBlock();
    if (DEBUG) LOG.debug(out.getPos() + ": end block");
    currentBlock.setRowCount(currentRecordCount);
    blocks.add(currentBlock);
    currentBlock = null;
}
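Taken together, startBlock, endColumn, and endBlock imply a per-row-group call sequence like the sketch below. startColumn and the page writes are paraphrased from the same writer's API, and exact signatures may differ across parquet versions.

// Hypothetical caller of the block lifecycle:
writer.startBlock(recordCount);                  // one block (row group) per batch of records
for (ColumnDescriptor path : schema.getColumns()) {
    writer.startColumn(path, valueCount, codec); // begin a column chunk
    // ... write dictionary/data pages for this column ...
    writer.endColumn();                          // records ColumnChunkMetaData on currentBlock
}
writer.endBlock();                               // stamps the row count and queues the block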
static ParquetMetadata mergeFooters(Path root, List<Footer> footers) {
    String rootPath = root.toUri().getPath();
    GlobalMetaData fileMetaData = null;
    List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
    for (Footer footer : footers) {
        String footerPath = footer.getFile().toUri().getPath();
        if (!footerPath.startsWith(rootPath)) {
            throw new ParquetEncodingException(footerPath + " invalid: all the files must be contained in the root " + root);
        }
        footerPath = footerPath.substring(rootPath.length());
        while (footerPath.startsWith("/")) {
            footerPath = footerPath.substring(1);
        }
        fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData);
        for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
            block.setPath(footerPath);
            blocks.add(block);
        }
    }
    return new ParquetMetadata(fileMetaData.merge(), blocks);
}
private static long computeTotalRecords(List<BlockMetaData> blocks) {
    long result = 0L;
    for (BlockMetaData block : blocks) {
        // sum row counts; the original summed getTotalByteSize(), which
        // contradicts the method name
        result += block.getRowCount();
    }
    return result;
}
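A quick sanity check of the fix above, using the BlockMetaData setters that appear elsewhere in this section (package parquet.hadoop.metadata in the old namespace; newer releases use org.apache.parquet.hadoop.metadata):

BlockMetaData a = new BlockMetaData();
a.setRowCount(10);
a.setTotalByteSize(1_000_000);
BlockMetaData b = new BlockMetaData();
b.setRowCount(20);
b.setTotalByteSize(2_000_000);
// 30 records in total; summing byte sizes would have returned 3,000,000
assert computeTotalRecords(Arrays.asList(a, b)) == 30;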
/**
 * @param rowGroupMetadata the row group to test
 * @return true if the midpoint of the row group falls in a new HDFS block,
 *         advancing the current-HDFS-block pointer to the index that contains
 *         the row group; false if the midpoint is in the same HDFS block
 */
private boolean checkBelongingToANewHDFSBlock(BlockMetaData rowGroupMetadata) {
    boolean isNewHdfsBlock = false;
    long rowGroupMidPoint = rowGroupMetadata.getStartingPos() + (rowGroupMetadata.getCompressedSize() / 2);
    // if the midpoint is no longer in the current HDFS block, return true
    while (rowGroupMidPoint > getHDFSBlockEndingPosition(currentMidPointHDFSBlockIndex)) {
        isNewHdfsBlock = true;
        currentMidPointHDFSBlockIndex++;
        if (currentMidPointHDFSBlockIndex >= hdfsBlocks.length) {
            throw new ParquetDecodingException("the row group is not in hdfs blocks in the file: midpoint of row groups is "
                    + rowGroupMidPoint + ", the end of the hdfs block is "
                    + getHDFSBlockEndingPosition(currentMidPointHDFSBlockIndex - 1));
        }
    }
    while (rowGroupMetadata.getStartingPos() > getHDFSBlockEndingPosition(currentStartHdfsBlockIndex)) {
        currentStartHdfsBlockIndex++;
        if (currentStartHdfsBlockIndex >= hdfsBlocks.length) {
            throw new ParquetDecodingException("The row group does not start in this file: row group offset is "
                    + rowGroupMetadata.getStartingPos() + " but the end of hdfs blocks of file is "
                    + getHDFSBlockEndingPosition(currentStartHdfsBlockIndex));
        }
    }
    return isNewHdfsBlock;
}
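The midpoint rule above can be distilled into a self-contained helper. The method name and the blockEndings parameter are hypothetical, but the assignment logic matches the code above: a row group belongs to whichever HDFS block contains its midpoint.

// blockEndings[i] is the ending byte position of HDFS block i, ascending.
static int blockIndexForRowGroup(long startingPos, long compressedSize, long[] blockEndings) {
    long midPoint = startingPos + compressedSize / 2;
    for (int i = 0; i < blockEndings.length; i++) {
        if (midPoint <= blockEndings[i]) {
            return i; // first block whose end is at or past the midpoint
        }
    }
    throw new IllegalArgumentException("row group midpoint " + midPoint + " is past the end of the file");
}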
private static long[] offsets(List<BlockMetaData> blocks) {
    long[] offsets = new long[blocks.size()];
    for (int i = 0; i < offsets.length; i++) {
        offsets[i] = blocks.get(i).getStartingPos();
    }
    return offsets;
}
// ... (truncated in the original snippet: the call whose final argument is the
// "false" below is missing)
        false);
for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
    if (block.getColumns().isEmpty()) {
        continue;
    }
    for (ColumnChunkMetaData column : block.getColumns()) {
        long offset = column.getFirstDataPageOffset();
        long size = column.getTotalSize();
        // ...
    }
}
// Continuation of the MessageFormat.format(...) call from the companion fragment above:
//         + "path={0}, rows={1}, range={2}+{3}, allocation={4}", //$NON-NLS-1$
//         status.getPath(), block.getRowCount(), begin, end - begin,
// Variant of mergeFooters above that compares full Path strings (including any
// URI scheme/authority) rather than just the URI path component.
private static ParquetMetadata mergeFooters(Path root, List<Footer> footers) {
    String rootPath = root.toString();
    GlobalMetaData fileMetaData = null;
    List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
    for (Footer footer : footers) {
        String path = footer.getFile().toString();
        if (!path.startsWith(rootPath)) {
            throw new ParquetEncodingException(path + " invalid: all the files must be contained in the root " + root);
        }
        path = path.substring(rootPath.length());
        while (path.startsWith("/")) {
            path = path.substring(1);
        }
        fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData);
        for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
            block.setPath(path);
            blocks.add(block);
        }
    }
    return new ParquetMetadata(fileMetaData.merge(), blocks);
}