private static long computeTotalRecords(List<BlockMetaData> blocks) { long result = 0L; for (BlockMetaData block : blocks) { result += block.getTotalByteSize(); } return result; }
private static void showDetails(PrettyPrintWriter out, BlockMetaData meta, Long num) { long rows = meta.getRowCount(); long tbs = meta.getTotalByteSize(); long offset = meta.getStartingPos(); out.format("row group%s: RC:%d TS:%d OFFSET:%d%n", (num == null ? "" : " " + num), rows, tbs, offset); out.rule('-'); showDetails(out, meta.getColumns()); }
/** * end a column (once all rep, def and data have been written) * @throws IOException */ public void endColumn() throws IOException { state = state.endColumn(); if (DEBUG) LOG.debug(out.getPos() + ": end column"); currentBlock.addColumn(ColumnChunkMetaData.get( currentChunkPath, currentChunkType, currentChunkCodec, currentEncodings, currentStatistics, currentChunkFirstDataPage, currentChunkDictionaryPageOffset, currentChunkValueCount, compressedLength, uncompressedLength)); if (DEBUG) LOG.info("ended Column chumk: " + currentColumn); currentColumn = null; this.currentBlock.setTotalByteSize(currentBlock.getTotalByteSize() + uncompressedLength); this.uncompressedLength = 0; this.compressedLength = 0; }
public ParquetInputSplit getParquetInputSplit(FileStatus fileStatus, String requestedSchema, Map<String, String> readSupportMetadata) throws IOException { MessageType requested = MessageTypeParser.parseMessageType(requestedSchema); long length = 0; for (BlockMetaData block : this.getRowGroups()) { List<ColumnChunkMetaData> columns = block.getColumns(); for (ColumnChunkMetaData column : columns) { if (requested.containsPath(column.getPath().toArray())) { length += column.getTotalSize(); } } } BlockMetaData lastRowGroup = this.getRowGroups().get(this.getRowGroupCount() - 1); long end = lastRowGroup.getStartingPos() + lastRowGroup.getTotalByteSize(); long[] rowGroupOffsets = new long[this.getRowGroupCount()]; for (int i = 0; i < rowGroupOffsets.length; i++) { rowGroupOffsets[i] = this.getRowGroups().get(i).getStartingPos(); } return new ParquetInputSplit( fileStatus.getPath(), hdfsBlock.getOffset(), end, length, hdfsBlock.getHosts(), rowGroupOffsets ); } }
/** * end a column (once all rep, def and data have been written) * @throws IOException */ public void endColumn() throws IOException { state = state.endColumn(); if (DEBUG) LOG.debug(out.getPos() + ": end column"); currentBlock.addColumn(ColumnChunkMetaData.get( currentChunkPath, currentChunkType, currentChunkCodec, currentEncodings, currentStatistics, currentChunkFirstDataPage, currentChunkDictionaryPageOffset, currentChunkValueCount, compressedLength, uncompressedLength)); if (DEBUG) LOG.info("ended Column chumk: " + currentColumn); currentColumn = null; this.currentBlock.setTotalByteSize(currentBlock.getTotalByteSize() + uncompressedLength); this.uncompressedLength = 0; this.compressedLength = 0; }
private void addRowGroup(ParquetMetadata parquetMetadata, List<RowGroup> rowGroups, BlockMetaData block) { //rowGroup.total_byte_size = ; List<ColumnChunkMetaData> columns = block.getColumns(); List<ColumnChunk> parquetColumns = new ArrayList<ColumnChunk>(); for (ColumnChunkMetaData columnMetaData : columns) { ColumnChunk columnChunk = new ColumnChunk(columnMetaData.getFirstDataPageOffset()); // verify this is the right offset columnChunk.file_path = block.getPath(); // they are in the same file for now columnChunk.meta_data = new parquet.format.ColumnMetaData( getType(columnMetaData.getType()), toFormatEncodings(columnMetaData.getEncodings()), Arrays.asList(columnMetaData.getPath().toArray()), columnMetaData.getCodec().getParquetCompressionCodec(), columnMetaData.getValueCount(), columnMetaData.getTotalUncompressedSize(), columnMetaData.getTotalSize(), columnMetaData.getFirstDataPageOffset()); columnChunk.meta_data.dictionary_page_offset = columnMetaData.getDictionaryPageOffset(); if (!columnMetaData.getStatistics().isEmpty()) { columnChunk.meta_data.setStatistics(toParquetStatistics(columnMetaData.getStatistics())); } // columnChunk.meta_data.index_page_offset = ; // columnChunk.meta_data.key_value_metadata = ; // nothing yet parquetColumns.add(columnChunk); } RowGroup rowGroup = new RowGroup(parquetColumns, block.getTotalByteSize(), block.getRowCount()); rowGroups.add(rowGroup); }