private static Map<ColumnDescriptor, Statistics<?>> getStatistics(BlockMetaData blockMetadata, Map<List<String>, RichColumnDescriptor> descriptorsByPath)
{
    ImmutableMap.Builder<ColumnDescriptor, Statistics<?>> statistics = ImmutableMap.builder();
    for (ColumnChunkMetaData columnMetaData : blockMetadata.getColumns()) {
        Statistics<?> columnStatistics = columnMetaData.getStatistics();
        if (columnStatistics != null) {
            RichColumnDescriptor descriptor = descriptorsByPath.get(Arrays.asList(columnMetaData.getPath().toArray()));
            if (descriptor != null) {
                statistics.put(descriptor, columnStatistics);
            }
        }
    }
    return statistics.build();
}
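A minimal sketch of how the map returned by getStatistics might be consumed, for example to surface per-column min/max for row-group pruning. The printing loop is illustrative, not part of the original code; the Statistics accessors (isEmpty, genericGetMin, genericGetMax, getNumNulls) are from the parquet-column Statistics class.

// Hypothetical consumer of getStatistics(...): dump per-column stats that a
// predicate-pushdown layer could use to decide whether to skip a row group.
Map<ColumnDescriptor, Statistics<?>> stats = getStatistics(blockMetadata, descriptorsByPath);
for (Map.Entry<ColumnDescriptor, Statistics<?>> entry : stats.entrySet()) {
    Statistics<?> s = entry.getValue();
    if (!s.isEmpty()) {
        System.out.printf("%s: min=%s max=%s nulls=%d%n",
                Arrays.toString(entry.getKey().getPath()),
                s.genericGetMin(), s.genericGetMax(), s.getNumNulls());
    }
}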
private boolean advanceToNextRowGroup()
{
    currentRowGroupMemoryContext.close();
    currentRowGroupMemoryContext = systemMemoryContext.newAggregatedMemoryContext();
    if (currentBlock == blocks.size()) {
        return false;
    }
    currentBlockMetadata = blocks.get(currentBlock);
    currentBlock = currentBlock + 1;
    nextRowInGroup = 0L;
    currentGroupRowCount = currentBlockMetadata.getRowCount();
    initializeColumnReaders();
    return true;
}
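A hypothetical driver loop for advanceToNextRowGroup(), reusing the same fields (nextRowInGroup, currentGroupRowCount); the batch size and the decode step are sketched assumptions, not taken from the source.

// Sketch: read the file row group by row group until none remain.
while (true) {
    if (nextRowInGroup >= currentGroupRowCount) {
        if (!advanceToNextRowGroup()) {
            break; // no more row groups in this file
        }
    }
    long batchSize = Math.min(1024, currentGroupRowCount - nextRowInGroup);
    // ... decode batchSize rows via the column readers initialized above ...
    nextRowInGroup += batchSize;
}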
if (rowGroups != null) {
    for (RowGroup rowGroup : rowGroups) {
        BlockMetaData blockMetaData = new BlockMetaData();
        blockMetaData.setRowCount(rowGroup.getNum_rows());
        blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
        List<ColumnChunk> columns = rowGroup.getColumns();
        validateParquet(!columns.isEmpty(), "No columns in row group: %s", rowGroup);
        // ... (truncated in the original snippet: a ColumnChunkMetaData named
        // "column" is built per chunk from its thrift ColumnMetaData, whose
        // trailing arguments were
        //     metaData.total_compressed_size,
        //     metaData.total_uncompressed_size);
        // and "filePath" comes from the chunks of this row group)
        blockMetaData.addColumn(column);
        blockMetaData.setPath(filePath);
        blocks.add(blockMetaData);
    }
}
List<ColumnChunkMetaData> columns = block.getColumns();
if (columns.isEmpty()) {
    return Collections.emptyList();
}
for (ColumnChunkMetaData column : block.getColumns()) {
    long off = column.getFirstDataPageOffset();
    long len = column.getTotalSize();
    // ... (truncated in the original snippet: off/len presumably accumulate
    // into the begin/end range tested below)
}
if (this.offset <= begin && end <= this.offset + this.fragmentSize && block.getRowCount() != 0) {
    if (LOG.isDebugEnabled()) {
        LOG.debug(MessageFormat.format(
            // (the format string continues in the companion fragment further below)
private static void showDetails(PrettyPrintWriter out, BlockMetaData meta, Long num) {
    long rows = meta.getRowCount();
    long tbs = meta.getTotalByteSize();
    long offset = meta.getStartingPos();

    out.format("row group%s: RC:%d TS:%d OFFSET:%d%n", (num == null ? "" : " " + num), rows, tbs, offset);
    out.rule('-');
    showDetails(out, meta.getColumns());
}
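Given the format string above, a row group of 1,000 rows and 51,211 total bytes starting at offset 4 would render roughly as follows (the numbers are illustrative; the dashed line comes from out.rule('-')):

row group 1: RC:1000 TS:51211 OFFSET:4
--------------------------------------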
private void addRowGroup(ParquetMetadata parquetMetadata, List<RowGroup> rowGroups, BlockMetaData block) {
    //rowGroup.total_byte_size = ;
    List<ColumnChunkMetaData> columns = block.getColumns();
    List<ColumnChunk> parquetColumns = new ArrayList<ColumnChunk>();
    for (ColumnChunkMetaData columnMetaData : columns) {
        ColumnChunk columnChunk = new ColumnChunk(columnMetaData.getFirstDataPageOffset()); // verify this is the right offset
        columnChunk.file_path = block.getPath(); // they are in the same file for now
        columnChunk.meta_data = new parquet.format.ColumnMetaData(
                getType(columnMetaData.getType()),
                toFormatEncodings(columnMetaData.getEncodings()),
                Arrays.asList(columnMetaData.getPath().toArray()),
                columnMetaData.getCodec().getParquetCompressionCodec(),
                columnMetaData.getValueCount(),
                columnMetaData.getTotalUncompressedSize(),
                columnMetaData.getTotalSize(),
                columnMetaData.getFirstDataPageOffset());
        columnChunk.meta_data.dictionary_page_offset = columnMetaData.getDictionaryPageOffset();
        if (!columnMetaData.getStatistics().isEmpty()) {
            columnChunk.meta_data.setStatistics(toParquetStatistics(columnMetaData.getStatistics()));
        }
        // columnChunk.meta_data.index_page_offset = ;
        // columnChunk.meta_data.key_value_metadata = ; // nothing yet
        parquetColumns.add(columnChunk);
    }
    RowGroup rowGroup = new RowGroup(parquetColumns, block.getTotalByteSize(), block.getRowCount());
    rowGroups.add(rowGroup);
}
/**
 * end a column (once all rep, def and data have been written)
 * @throws IOException
 */
public void endColumn() throws IOException {
    state = state.endColumn();
    if (DEBUG) LOG.debug(out.getPos() + ": end column");
    currentBlock.addColumn(ColumnChunkMetaData.get(
            currentChunkPath,
            currentChunkType,
            currentChunkCodec,
            currentEncodings,
            currentStatistics,
            currentChunkFirstDataPage,
            currentChunkDictionaryPageOffset,
            currentChunkValueCount,
            compressedLength,
            uncompressedLength));
    if (DEBUG) LOG.info("ended column chunk: " + currentColumn);
    currentColumn = null;
    this.currentBlock.setTotalByteSize(currentBlock.getTotalByteSize() + uncompressedLength);
    this.uncompressedLength = 0;
    this.compressedLength = 0;
}
public ParquetInputSplit getParquetInputSplit(FileStatus fileStatus, String requestedSchema, Map<String, String> readSupportMetadata) throws IOException {
    MessageType requested = MessageTypeParser.parseMessageType(requestedSchema);
    // count only the bytes of the column chunks the requested schema actually reads
    long length = 0;
    for (BlockMetaData block : this.getRowGroups()) {
        List<ColumnChunkMetaData> columns = block.getColumns();
        for (ColumnChunkMetaData column : columns) {
            if (requested.containsPath(column.getPath().toArray())) {
                length += column.getTotalSize();
            }
        }
    }
    BlockMetaData lastRowGroup = this.getRowGroups().get(this.getRowGroupCount() - 1);
    long end = lastRowGroup.getStartingPos() + lastRowGroup.getTotalByteSize();
    long[] rowGroupOffsets = new long[this.getRowGroupCount()];
    for (int i = 0; i < rowGroupOffsets.length; i++) {
        rowGroupOffsets[i] = this.getRowGroups().get(i).getStartingPos();
    }
    // hdfsBlock is presumably a field of the enclosing class (the HDFS block this split wraps)
    return new ParquetInputSplit(
            fileStatus.getPath(),
            hdfsBlock.getOffset(),
            end,
            length,
            hdfsBlock.getHosts(),
            rowGroupOffsets);
}
long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
if (firstDataPage >= start && firstDataPage < start + length) {
    splitGroup.add(block);
}
// ... (truncated in the original snippet between the filter above and the loop below)
for (int i = 0; i < splitGroup.size(); i++) {
    BlockMetaData block = splitGroup.get(i);
    offsets[i] = block.getStartingPos();
}
/**
 * start a block
 * @param recordCount the record count in this block
 * @throws IOException
 */
public void startBlock(long recordCount) throws IOException {
    state = state.startBlock();
    if (DEBUG) LOG.debug(out.getPos() + ": start block");
    // out.write(MAGIC); // TODO: add a magic delimiter
    currentBlock = new BlockMetaData();
    currentRecordCount = recordCount;
}
/**
 * ends a block once all column chunks have been written
 * @throws IOException
 */
public void endBlock() throws IOException {
    state = state.endBlock();
    if (DEBUG) LOG.debug(out.getPos() + ": end block");
    currentBlock.setRowCount(currentRecordCount);
    blocks.add(currentBlock);
    currentBlock = null;
}
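Taken together, startBlock, endColumn, and endBlock imply a per-row-group call sequence like the sketch below. startColumn and the page writes are paraphrased from the same writer's API, and exact signatures may differ across parquet versions.

// Hypothetical caller of the block lifecycle:
writer.startBlock(recordCount);                  // one block (row group) per batch of records
for (ColumnDescriptor path : schema.getColumns()) {
    writer.startColumn(path, valueCount, codec); // begin a column chunk
    // ... write dictionary/data pages for this column ...
    writer.endColumn();                          // records ColumnChunkMetaData on currentBlock
}
writer.endBlock();                               // stamps the row count and queues the block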
static ParquetMetadata mergeFooters(Path root, List<Footer> footers) {
    String rootPath = root.toUri().getPath();
    GlobalMetaData fileMetaData = null;
    List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
    for (Footer footer : footers) {
        String footerPath = footer.getFile().toUri().getPath();
        if (!footerPath.startsWith(rootPath)) {
            throw new ParquetEncodingException(footerPath + " invalid: all the files must be contained in the root " + root);
        }
        footerPath = footerPath.substring(rootPath.length());
        while (footerPath.startsWith("/")) {
            footerPath = footerPath.substring(1);
        }
        fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData);
        for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
            block.setPath(footerPath);
            blocks.add(block);
        }
    }
    return new ParquetMetadata(fileMetaData.merge(), blocks);
}
private static long computeTotalRecords(List<BlockMetaData> blocks) {
    long result = 0L;
    for (BlockMetaData block : blocks) {
        // sum row counts; the original summed getTotalByteSize(), which
        // contradicts the method name
        result += block.getRowCount();
    }
    return result;
}
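A quick sanity check of the fix above, using the BlockMetaData setters that appear elsewhere in this section (package parquet.hadoop.metadata in the old namespace; newer releases use org.apache.parquet.hadoop.metadata):

BlockMetaData a = new BlockMetaData();
a.setRowCount(10);
a.setTotalByteSize(1_000_000);
BlockMetaData b = new BlockMetaData();
b.setRowCount(20);
b.setTotalByteSize(2_000_000);
// 30 records in total; summing byte sizes would have returned 3,000,000
assert computeTotalRecords(Arrays.asList(a, b)) == 30;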
/**
 * @param rowGroupMetadata the row group to test
 * @return true if the midpoint of the row group falls in a new HDFS block,
 *         advancing the current-HDFS-block pointer to the index that contains
 *         the row group; false if the midpoint is in the same HDFS block
 */
private boolean checkBelongingToANewHDFSBlock(BlockMetaData rowGroupMetadata) {
    boolean isNewHdfsBlock = false;
    long rowGroupMidPoint = rowGroupMetadata.getStartingPos() + (rowGroupMetadata.getCompressedSize() / 2);
    // if the midpoint is no longer in the current HDFS block, return true
    while (rowGroupMidPoint > getHDFSBlockEndingPosition(currentMidPointHDFSBlockIndex)) {
        isNewHdfsBlock = true;
        currentMidPointHDFSBlockIndex++;
        if (currentMidPointHDFSBlockIndex >= hdfsBlocks.length) {
            throw new ParquetDecodingException("the row group is not in hdfs blocks in the file: midpoint of row groups is "
                    + rowGroupMidPoint + ", the end of the hdfs block is "
                    + getHDFSBlockEndingPosition(currentMidPointHDFSBlockIndex - 1));
        }
    }
    while (rowGroupMetadata.getStartingPos() > getHDFSBlockEndingPosition(currentStartHdfsBlockIndex)) {
        currentStartHdfsBlockIndex++;
        if (currentStartHdfsBlockIndex >= hdfsBlocks.length) {
            throw new ParquetDecodingException("The row group does not start in this file: row group offset is "
                    + rowGroupMetadata.getStartingPos() + " but the end of hdfs blocks of file is "
                    + getHDFSBlockEndingPosition(currentStartHdfsBlockIndex));
        }
    }
    return isNewHdfsBlock;
}
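The midpoint rule above can be distilled into a self-contained helper. The method name and the blockEndings parameter are hypothetical, but the assignment logic matches the code above: a row group belongs to whichever HDFS block contains its midpoint.

// blockEndings[i] is the ending byte position of HDFS block i, ascending.
static int blockIndexForRowGroup(long startingPos, long compressedSize, long[] blockEndings) {
    long midPoint = startingPos + compressedSize / 2;
    for (int i = 0; i < blockEndings.length; i++) {
        if (midPoint <= blockEndings[i]) {
            return i; // first block whose end is at or past the midpoint
        }
    }
    throw new IllegalArgumentException("row group midpoint " + midPoint + " is past the end of the file");
}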
private static long[] offsets(List<BlockMetaData> blocks) {
    long[] offsets = new long[blocks.size()];
    for (int i = 0; i < offsets.length; i++) {
        offsets[i] = blocks.get(i).getStartingPos();
    }
    return offsets;
}
// ... (truncated in the original snippet: the call whose final argument is the
// "false" below is missing)
        false);
for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
    if (block.getColumns().isEmpty()) {
        continue;
    }
    for (ColumnChunkMetaData column : block.getColumns()) {
        long offset = column.getFirstDataPageOffset();
        long size = column.getTotalSize();
        // ...
    }
}
// Continuation of the MessageFormat.format(...) call from the companion fragment above:
//         + "path={0}, rows={1}, range={2}+{3}, allocation={4}", //$NON-NLS-1$
//         status.getPath(), block.getRowCount(), begin, end - begin,
// Variant of mergeFooters above that compares full Path strings (including any
// URI scheme/authority) rather than just the URI path component.
private static ParquetMetadata mergeFooters(Path root, List<Footer> footers) {
    String rootPath = root.toString();
    GlobalMetaData fileMetaData = null;
    List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
    for (Footer footer : footers) {
        String path = footer.getFile().toString();
        if (!path.startsWith(rootPath)) {
            throw new ParquetEncodingException(path + " invalid: all the files must be contained in the root " + root);
        }
        path = path.substring(rootPath.length());
        while (path.startsWith("/")) {
            path = path.substring(1);
        }
        fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData);
        for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
            block.setPath(path);
            blocks.add(block);
        }
    }
    return new ParquetMetadata(fileMetaData.merge(), blocks);
}