Refine search
FSDataInputStream inputStream = fileSystem.open(path); ParquetMetadata parquetMetadata = MetadataReader.readFooter(inputStream, path, fileSize); FileMetaData fileMetaData = parquetMetadata.getFileMetaData(); MessageType fileSchema = fileMetaData.getSchema(); dataSource = buildHdfsParquetDataSource(inputStream, path, fileSize, stats); for (BlockMetaData block : parquetMetadata.getBlocks()) { long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset(); if (firstDataPage >= start && firstDataPage < start + length) {
return new ParquetMetadata(new parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, fileMetaData.getCreated_by()), blocks);
/**
 * Converts a parquet-mr {@link ParquetMetadata} footer into its Thrift
 * {@link FileMetaData} representation (schema, row groups, key/value
 * metadata, created-by string).
 *
 * @param currentVersion  the format version number to record in the footer
 * @param parquetMetadata the in-memory footer to convert
 * @return the Thrift footer ready to be serialized
 */
public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
  List<BlockMetaData> blocks = parquetMetadata.getBlocks();
  List<RowGroup> rowGroups = new ArrayList<RowGroup>();
  // BlockMetaData.getRowCount() returns a long and the Thrift num_rows field
  // is i64; the previous int accumulator silently overflowed for files with
  // more than ~2^31 total rows.
  long numRows = 0;
  for (BlockMetaData block : blocks) {
    numRows += block.getRowCount();
    addRowGroup(parquetMetadata, rowGroups, block);
  }
  FileMetaData fileMetaData = new FileMetaData(
      currentVersion,
      toParquetSchema(parquetMetadata.getFileMetaData().getSchema()),
      numRows,
      rowGroups);
  // Copy the application-level key/value metadata into the Thrift footer.
  Set<Entry<String, String>> keyValues =
      parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
  for (Entry<String, String> keyValue : keyValues) {
    addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
  }
  fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());
  return fileMetaData;
}
/**
 * Reads the Parquet schema stored in the footer of a data file.
 *
 * @param parquetFilePath path of the Parquet file to inspect
 * @return the file-level {@link MessageType} schema from the footer
 * @throws IOException if reading the footer fails
 * @throws IllegalArgumentException if the file does not exist
 */
private MessageType readSchemaFromDataFile(Path parquetFilePath) throws IOException {
  LOG.info("Reading schema from " + parquetFilePath);
  if (!fs.exists(parquetFilePath)) {
    throw new IllegalArgumentException(
        "Failed to read schema from data file " + parquetFilePath + ". File does not exist.");
  }
  // Read the footer without any filtering and pull the schema out of it.
  ParquetMetadata footer =
      ParquetFileReader.readFooter(fs.getConf(), parquetFilePath, ParquetMetadataConverter.NO_FILTER);
  return footer.getFileMetaData().getSchema();
}
/**
 * Splits a merged summary-file footer back into one {@link Footer} per data
 * file, grouping row groups by the file path recorded on each block.
 *
 * @param parent        directory containing the data files the summary describes
 * @param mergedFooters merged footer read from the summary file
 * @return one footer per referenced file, each holding only its own row groups
 */
static List<Footer> footersFromSummaryFile(final Path parent, ParquetMetadata mergedFooters) {
  // Group row groups by the fully-resolved path of the file they came from.
  Map<Path, ParquetMetadata> footerByFile = new HashMap<Path, ParquetMetadata>();
  for (BlockMetaData rowGroup : mergedFooters.getBlocks()) {
    Path fullPath = new Path(parent, rowGroup.getPath());
    ParquetMetadata footer = footerByFile.get(fullPath);
    if (footer == null) {
      // First row group seen for this file: start a footer that shares the
      // merged file-level metadata but has its own (initially empty) block list.
      footer = new ParquetMetadata(mergedFooters.getFileMetaData(), new ArrayList<BlockMetaData>());
      footerByFile.put(fullPath, footer);
    }
    footer.getBlocks().add(rowGroup);
  }
  List<Footer> result = new ArrayList<Footer>();
  for (Entry<Path, ParquetMetadata> entry : footerByFile.entrySet()) {
    result.add(new Footer(entry.getKey(), entry.getValue()));
  }
  return result;
}
/**
 * Closes the current record reader (if any) and, when another footer is
 * available, creates and initializes a reader for the next file.
 *
 * @throws IOException if closing the old reader or initializing the new one fails
 */
private void initReader() throws IOException {
  // Always release the previous reader before moving on.
  if (reader != null) {
    reader.close();
    reader = null;
  }
  if (!footersIterator.hasNext()) {
    // No more files to read; leave reader as null.
    return;
  }
  Footer footer = footersIterator.next();
  reader = new InternalParquetRecordReader<T>(readSupport, filter);
  reader.initialize(
      readContext.getRequestedSchema(),
      globalMetaData.getSchema(),
      footer.getParquetMetadata().getFileMetaData().getKeyValueMetaData(),
      readContext.getReadSupportMetadata(),
      footer.getFile(),
      footer.getParquetMetadata().getBlocks(),
      conf);
}
/**
 * Prints the file-level metadata followed by the details of each row group,
 * numbering row groups starting at 1.
 *
 * @param out  writer receiving the formatted output
 * @param meta footer whose file metadata and row groups are printed
 */
public static void showDetails(PrettyPrintWriter out, ParquetMetadata meta) {
  showDetails(out, meta.getFileMetaData());
  long rowGroupOrdinal = 0;
  for (BlockMetaData blockMeta : meta.getBlocks()) {
    out.println();
    // Row groups are numbered from 1 in the output.
    showDetails(out, blockMeta, ++rowGroupOrdinal);
  }
}
/**
 * @return the Parquet file-level metadata held by the wrapped footer
 */
public parquet.hadoop.metadata.FileMetaData getFileMetaData() {
  return metaData.getFileMetaData();
}
/**
 * @return the row-group (block) metadata list from the wrapped footer
 */
public List<BlockMetaData> getBlocks() {
  return metaData.getBlocks();
}
}
/**
 * Serializes the given footer metadata to a compact (single-line) JSON string
 * using the shared default {@code objectMapper}.
 *
 * @param parquetMetaData the footer metadata to serialize
 * @return the json representation
 */
public static String toJSON(ParquetMetadata parquetMetaData) {
  return toJSON(parquetMetaData, objectMapper);
}
/**
 * Closes the current record reader (if any) and, when another footer is
 * available, initializes a reader for the next file using only the row
 * groups that survive the configured filter.
 *
 * @throws IOException if closing the old reader or initializing the new one fails
 */
private void initReader() throws IOException {
  // Always release the previous reader before moving on.
  if (reader != null) {
    reader.close();
    reader = null;
  }
  if (!footersIterator.hasNext()) {
    // No more files to read; leave reader as null.
    return;
  }
  Footer footer = footersIterator.next();
  ParquetMetadata footerMeta = footer.getParquetMetadata();
  MessageType fileSchema = footerMeta.getFileMetaData().getSchema();
  // Drop row groups the filter can prove contain no matching records.
  List<BlockMetaData> filteredBlocks =
      RowGroupFilter.filterRowGroups(filter, footerMeta.getBlocks(), fileSchema);
  reader = new InternalParquetRecordReader<T>(readSupport, filter);
  reader.initialize(
      fileSchema,
      footerMeta.getFileMetaData().getKeyValueMetaData(),
      footer.getFile(),
      filteredBlocks,
      conf);
}
/**
 * Reads the schema from the parquet file. This is different from ParquetUtils
 * as it uses the twitter parquet artifact, to stay compatible with Hive 1.1.0.
 *
 * @param conf            Hadoop configuration used to open the file
 * @param parquetFilePath path of the Parquet file to inspect
 * @return the file-level schema stored in the footer
 * @throws HoodieIOException if the footer cannot be read
 */
private static MessageType readSchema(Configuration conf, Path parquetFilePath) {
  ParquetMetadata footer;
  try {
    footer = ParquetFileReader.readFooter(conf, parquetFilePath);
  } catch (IOException e) {
    // Wrap the checked IOException, preserving it as the cause.
    throw new HoodieIOException("Failed to read footer for parquet " + parquetFilePath, e);
  }
  return footer.getFileMetaData().getSchema();
}
/**
 * Merges the footers of several files under {@code root} into a single
 * summary footer, rewriting each row group's path to be relative to the root.
 *
 * @param root    common root directory that must contain every footer's file
 * @param footers per-file footers to merge; must be non-empty (an empty list
 *                would leave the merged metadata null)
 * @return a single {@link ParquetMetadata} covering all row groups
 * @throws ParquetEncodingException if a file is not under {@code root}
 */
static ParquetMetadata mergeFooters(Path root, List<Footer> footers) {
  String rootPath = root.toUri().getPath();
  // Containment must match a full path component: a plain startsWith(rootPath)
  // check wrongly accepted e.g. "/root2/file" for root "/root".
  String rootPrefix = rootPath.endsWith("/") ? rootPath : rootPath + "/";
  GlobalMetaData fileMetaData = null;
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
  for (Footer footer : footers) {
    String footerPath = footer.getFile().toUri().getPath();
    if (!(footerPath.equals(rootPath) || footerPath.startsWith(rootPrefix))) {
      throw new ParquetEncodingException(footerPath + " invalid: all the files must be contained in the root " + root);
    }
    // Make the path relative to the root and strip any leading separators.
    footerPath = footerPath.substring(rootPath.length());
    while (footerPath.startsWith("/")) {
      footerPath = footerPath.substring(1);
    }
    fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData);
    for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
      block.setPath(footerPath);
      blocks.add(block);
    }
  }
  return new ParquetMetadata(fileMetaData.merge(), blocks);
}
/**
 * Folds the file-level metadata of every footer into one {@link GlobalMetaData}.
 *
 * @param footers footers whose file metadata is merged
 * @param strict  whether the merge should reject incompatible metadata
 * @return the merged metadata, or {@code null} when {@code footers} is empty
 */
static GlobalMetaData getGlobalMetaData(List<Footer> footers, boolean strict) {
  GlobalMetaData merged = null;
  for (Footer footer : footers) {
    merged = mergeInto(footer.getParquetMetadata().getFileMetaData(), merged, strict);
  }
  return merged;
}
/**
 * Writes a _metadata and _common_metadata file for the given footers.
 *
 * @param configuration the configuration to use to get the FileSystem
 * @param outputPath    the directory to write the _metadata file to
 * @param footers       the list of footers to merge
 * @throws IOException if merging or writing the summary files fails
 */
public static void writeMetadataFile(Configuration configuration, Path outputPath, List<Footer> footers) throws IOException {
  ParquetMetadata mergedFooter = mergeFooters(outputPath, footers);
  FileSystem fs = outputPath.getFileSystem(configuration);
  Path qualifiedPath = outputPath.makeQualified(fs);
  // _metadata: the full merged footer, including every row group.
  writeMetadataFile(qualifiedPath, mergedFooter, fs, PARQUET_METADATA_FILE);
  // _common_metadata: the same footer with all row groups stripped out.
  mergedFooter.getBlocks().clear();
  writeMetadataFile(qualifiedPath, mergedFooter, fs, PARQUET_COMMON_METADATA_FILE);
}
/**
 * Serializes the given footer metadata to an indented, human-readable JSON
 * string using the shared {@code prettyObjectMapper}.
 *
 * @param parquetMetaData the footer metadata to serialize
 * @return the pretty printed json representation
 */
public static String toPrettyJSON(ParquetMetadata parquetMetaData) {
  return toJSON(parquetMetaData, prettyObjectMapper);
}
MessageType fileSchema = footer.getFileMetaData().getSchema(); Filter filter = getFilter(configuration); filteredBlocks = filterRowGroups(filter, footer.getBlocks(), fileSchema); } else { for (BlockMetaData block : footer.getBlocks()) { if (offsets.contains(block.getStartingPos())) { filteredBlocks.add(block); long[] foundRowGroupOffsets = new long[footer.getBlocks().size()]; for (int i = 0; i < foundRowGroupOffsets.length; i++) { foundRowGroupOffsets[i] = footer.getBlocks().get(i).getStartingPos(); MessageType fileSchema = footer.getFileMetaData().getSchema(); Map<String, String> fileMetaData = footer.getFileMetaData().getKeyValueMetaData(); internalReader.initialize( fileSchema, fileMetaData, path, filteredBlocks, configuration);
List<BlockMetaData> blocks = filterBlocks(footer.getBlocks()); if (blocks.isEmpty()) { return null; this.materializer = new DataModelMaterializer( descriptor, footer.getFileMetaData().getSchema(), mappingConfiguration); this.columnIo = new ColumnIOFactory().getColumnIO( materializer.getMaterializeSchema(), footer.getFileMetaData().getSchema());
// Builds a ParquetFileReader for the given footer and row groups.
// FILE_READER_NEWER_CTOR is presumably a reflectively-resolved constructor
// handle for a newer ParquetFileReader signature that accepts the file
// metadata directly — TODO confirm where it is initialized.
private ParquetFileReader createFileReader(ParquetMetadata meta, List<BlockMetaData> blocks) throws IOException {
  FileMetaData fileMetaData = meta.getFileMetaData();
  if (FILE_READER_NEWER_CTOR != null) {
    try {
      // Prefer the newer constructor when it was resolved at class load time.
      return FILE_READER_NEWER_CTOR.newInstance(
          hadoopConfiguration, fileMetaData, path, blocks,
          fileMetaData.getSchema().getColumns());
    } catch (ReflectiveOperationException | IllegalArgumentException | SecurityException e) {
      // Non-fatal: log at debug and fall through to the older constructor.
      LOG.debug("failed ParquetFileReader.<init>", e);
    }
  }
  // Fallback: older ParquetFileReader constructor without FileMetaData.
  return new ParquetFileReader(
      hadoopConfiguration, path, blocks, fileMetaData.getSchema().getColumns());
}
/**
 * Merges the footers of several files under {@code root} into a single
 * summary footer, rewriting each row group's path to be relative to the root.
 *
 * @param root    common root directory that must contain every footer's file
 * @param footers per-file footers to merge; must be non-empty (an empty list
 *                would leave the merged metadata null)
 * @return a single {@link ParquetMetadata} covering all row groups
 * @throws ParquetEncodingException if a file is not under {@code root}
 */
private static ParquetMetadata mergeFooters(Path root, List<Footer> footers) {
  String rootPath = root.toString();
  // Containment must match a full path component: a plain startsWith(rootPath)
  // check wrongly accepted e.g. "/root2/file" for root "/root".
  String rootPrefix = rootPath.endsWith("/") ? rootPath : rootPath + "/";
  GlobalMetaData fileMetaData = null;
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
  for (Footer footer : footers) {
    String path = footer.getFile().toString();
    if (!(path.equals(rootPath) || path.startsWith(rootPrefix))) {
      throw new ParquetEncodingException(path + " invalid: all the files must be contained in the root " + root);
    }
    // Make the path relative to the root and strip any leading separators.
    path = path.substring(rootPath.length());
    while (path.startsWith("/")) {
      path = path.substring(1);
    }
    fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData);
    for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
      block.setPath(path);
      blocks.add(block);
    }
  }
  return new ParquetMetadata(fileMetaData.merge(), blocks);
}