FSDataInputStream inputStream = fileSystem.open(path);
ParquetMetadata parquetMetadata = MetadataReader.readFooter(inputStream, path, fileSize);
FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
MessageType fileSchema = fileMetaData.getSchema();
dataSource = buildHdfsParquetDataSource(inputStream, path, fileSize, stats);
for (BlockMetaData block : parquetMetadata.getBlocks()) {
  long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
  if (firstDataPage >= start && firstDataPage < start + length) {
    // ...
  }
}
return new ParquetMetadata(new parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, fileMetaData.getCreated_by()), blocks);
public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
  List<BlockMetaData> blocks = parquetMetadata.getBlocks();
  List<RowGroup> rowGroups = new ArrayList<RowGroup>();
  // row counts are long; using int here would risk overflow on very large files
  long numRows = 0;
  for (BlockMetaData block : blocks) {
    numRows += block.getRowCount();
    addRowGroup(parquetMetadata, rowGroups, block);
  }
  FileMetaData fileMetaData = new FileMetaData(
      currentVersion,
      toParquetSchema(parquetMetadata.getFileMetaData().getSchema()),
      numRows,
      rowGroups);
  Set<Entry<String, String>> keyValues = parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
  for (Entry<String, String> keyValue : keyValues) {
    addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
  }
  fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());
  return fileMetaData;
}
/**
 * Read the Parquet schema from a Parquet file.
 */
private MessageType readSchemaFromDataFile(Path parquetFilePath) throws IOException {
  LOG.info("Reading schema from " + parquetFilePath);
  if (!fs.exists(parquetFilePath)) {
    throw new IllegalArgumentException(
        "Failed to read schema from data file " + parquetFilePath + ". File does not exist.");
  }
  ParquetMetadata fileFooter =
      ParquetFileReader.readFooter(fs.getConf(), parquetFilePath, ParquetMetadataConverter.NO_FILTER);
  return fileFooter.getFileMetaData().getSchema();
}
private void initReader() throws IOException {
  if (reader != null) {
    reader.close();
    reader = null;
  }
  if (footersIterator.hasNext()) {
    Footer footer = footersIterator.next();
    reader = new InternalParquetRecordReader<T>(readSupport, filter);
    reader.initialize(
        readContext.getRequestedSchema(),
        globalMetaData.getSchema(),
        footer.getParquetMetadata().getFileMetaData().getKeyValueMetaData(),
        readContext.getReadSupportMetadata(),
        footer.getFile(),
        footer.getParquetMetadata().getBlocks(),
        conf);
  }
}
createdBy.addAll(mergedMetadata.getCreatedBy());
if ((schema == null && toMerge.getSchema() != null)
    || (schema != null && !schema.equals(toMerge.getSchema()))) {
  schema = mergeInto(toMerge.getSchema(), schema);
}
for (Entry<String, String> entry : toMerge.getKeyValueMetaData().entrySet()) {
  Set<String> values = newKeyValues.get(entry.getKey());
  if (values == null) {
    // ...
  }
}
createdBy.add(toMerge.getCreatedBy());
return new GlobalMetaData(
    schema,
    // ...
/**
 * Will merge the metadata as if it was coming from a single file.
 * (for all part files written together this will always work)
 * If there are conflicting values an exception will be thrown
 * @return the merged version of this
 */
public FileMetaData merge() {
  String createdByString = createdBy.size() == 1 ?
      createdBy.iterator().next() :
      createdBy.toString();
  Map<String, String> mergedKeyValues = new HashMap<String, String>();
  for (Entry<String, Set<String>> entry : keyValueMetaData.entrySet()) {
    if (entry.getValue().size() > 1) {
      throw new RuntimeException("could not merge metadata: key " + entry.getKey()
          + " has conflicting values: " + entry.getValue());
    }
    mergedKeyValues.put(entry.getKey(), entry.getValue().iterator().next());
  }
  return new FileMetaData(schema, mergedKeyValues, createdByString);
}
private void initReader() throws IOException {
  if (reader != null) {
    reader.close();
    reader = null;
  }
  if (footersIterator.hasNext()) {
    Footer footer = footersIterator.next();
    List<BlockMetaData> blocks = footer.getParquetMetadata().getBlocks();
    MessageType fileSchema = footer.getParquetMetadata().getFileMetaData().getSchema();
    List<BlockMetaData> filteredBlocks = RowGroupFilter.filterRowGroups(filter, blocks, fileSchema);
    reader = new InternalParquetRecordReader<T>(readSupport, filter);
    reader.initialize(
        fileSchema,
        footer.getParquetMetadata().getFileMetaData().getKeyValueMetaData(),
        footer.getFile(),
        filteredBlocks,
        conf);
  }
}
/**
 * Reads the schema from the parquet file. This is different from ParquetUtils as it uses the
 * twitter parquet to support hive 1.1.0
 */
private static MessageType readSchema(Configuration conf, Path parquetFilePath) {
  try {
    return ParquetFileReader.readFooter(conf, parquetFilePath).getFileMetaData().getSchema();
  } catch (IOException e) {
    throw new HoodieIOException("Failed to read footer for parquet " + parquetFilePath, e);
  }
}
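The usages above all share the same basic shape: read only the footer, take its FileMetaData, and query the schema and key/value metadata from there. A minimal, self-contained sketch of that pattern follows. The class name and the command-line argument for the path are illustrative assumptions, the pre-Apache parquet.* package names match the snippets here (newer releases use org.apache.parquet.*), and readFooter(Configuration, Path) is the same overload used in the snippet above.

import java.io.IOException;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.FileMetaData;
import parquet.hadoop.metadata.ParquetMetadata;
import parquet.schema.MessageType;

// Hypothetical utility: prints the footer-level metadata of one Parquet file.
public class FooterSchemaDump {
  public static void main(String[] args) throws IOException {
    Path path = new Path(args[0]); // path to an existing Parquet file (assumption)
    Configuration conf = new Configuration();

    // Read only the footer, not the data pages.
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, path);
    FileMetaData fileMetaData = footer.getFileMetaData();

    // Writer version, schema, and application key/value pairs all hang off FileMetaData.
    MessageType schema = fileMetaData.getSchema();
    System.out.println("created by: " + fileMetaData.getCreatedBy());
    System.out.println("schema:\n" + schema);
    for (Map.Entry<String, String> entry : fileMetaData.getKeyValueMetaData().entrySet()) {
      System.out.println("extra: " + entry.getKey() + " = " + entry.getValue());
    }
  }
}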
public static void showDetails(PrettyPrintWriter out, FileMetaData meta) {
  out.format("creator: %s%n", meta.getCreatedBy());
  Map<String, String> extra = meta.getKeyValueMetaData();
  if (extra != null) {
    for (Map.Entry<String, String> entry : meta.getKeyValueMetaData().entrySet()) {
      out.print("extra: ");
      out.incrementTabLevel();
      out.format("%s = %s%n", entry.getKey(), entry.getValue());
      out.decrementTabLevel();
    }
  }
  out.println();
  out.format("file schema: %s%n", meta.getSchema().getName());
  out.rule('-');
  showDetails(out, meta.getSchema());
}
long fileSize = fileToStore.length();
String size = (fileSize > 1024) ? (fileSize / 1024) + "Kb" : fileSize + "b";
fileMetaData = new FileMetaData(fileName, size, true);
MessageType fileSchema = footer.getFileMetaData().getSchema();
Filter filter = getFilter(configuration);
filteredBlocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
} else {
  for (BlockMetaData block : footer.getBlocks()) {
    if (offsets.contains(block.getStartingPos())) {
      filteredBlocks.add(block);
    }
  }
}
// ...
MessageType fileSchema = footer.getFileMetaData().getSchema();
Map<String, String> fileMetaData = footer.getFileMetaData().getKeyValueMetaData();
internalReader.initialize(
    fileSchema, fileMetaData, path, filteredBlocks, configuration);
List<BlockMetaData> blocks = filterBlocks(footer.getBlocks());
if (blocks.isEmpty()) {
  return null;
}
this.materializer = new DataModelMaterializer(
    descriptor, footer.getFileMetaData().getSchema(), mappingConfiguration);
this.columnIo = new ColumnIOFactory().getColumnIO(
    materializer.getMaterializeSchema(), footer.getFileMetaData().getSchema());
private ParquetFileReader createFileReader(ParquetMetadata meta, List<BlockMetaData> blocks) throws IOException {
  FileMetaData fileMetaData = meta.getFileMetaData();
  if (FILE_READER_NEWER_CTOR != null) {
    try {
      return FILE_READER_NEWER_CTOR.newInstance(
          hadoopConfiguration, fileMetaData, path, blocks, fileMetaData.getSchema().getColumns());
    } catch (ReflectiveOperationException | IllegalArgumentException | SecurityException e) {
      LOG.debug("failed ParquetFileReader.<init>", e);
    }
  }
  return new ParquetFileReader(
      hadoopConfiguration, path, blocks, fileMetaData.getSchema().getColumns());
}
static GlobalMetaData mergeInto(
    FileMetaData toMerge,
    GlobalMetaData mergedMetadata,
    boolean strict) {
  MessageType schema = null;
  Map<String, Set<String>> newKeyValues = new HashMap<String, Set<String>>();
  Set<String> createdBy = new HashSet<String>();
  if (mergedMetadata != null) {
    schema = mergedMetadata.getSchema();
    newKeyValues.putAll(mergedMetadata.getKeyValueMetaData());
    createdBy.addAll(mergedMetadata.getCreatedBy());
  }
  if ((schema == null && toMerge.getSchema() != null)
      || (schema != null && !schema.equals(toMerge.getSchema()))) {
    schema = mergeInto(toMerge.getSchema(), schema, strict);
  }
  for (Entry<String, String> entry : toMerge.getKeyValueMetaData().entrySet()) {
    Set<String> values = newKeyValues.get(entry.getKey());
    if (values == null) {
      values = new HashSet<String>();
      newKeyValues.put(entry.getKey(), values);
    }
    values.add(entry.getValue());
  }
  createdBy.add(toMerge.getCreatedBy());
  return new GlobalMetaData(
      schema,
      newKeyValues,
      createdBy);
}
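Read together with the merge() method shown earlier, mergeInto folds one file's FileMetaData into a running GlobalMetaData, and merge() then collapses the accumulated value sets back into a single FileMetaData, throwing if any key has conflicting values. A rough sketch of that flow, assuming a non-empty list of footers and that the code sits somewhere the package-private mergeInto helper above is visible (in parquet-mr it lives alongside ParquetFileWriter):

// Sketch only: 'footers' and the method placement are assumptions.
static FileMetaData mergeFooters(List<Footer> footers) {
  GlobalMetaData global = null;
  for (Footer footer : footers) {
    // Fold each file's FileMetaData into the running GlobalMetaData;
    // the boolean 'strict' flag is passed through to the schema merge above.
    global = mergeInto(footer.getParquetMetadata().getFileMetaData(), global, true);
  }
  // Collapses each key's value set back to a single value;
  // throws if a key saw conflicting values across files.
  return global.merge();
}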
/**
 * ends a file once all blocks have been written.
 * closes the file.
 * @param extraMetaData the extra meta data to write in the footer
 * @throws IOException
 */
public void end(Map<String, String> extraMetaData) throws IOException {
  state = state.end();
  if (DEBUG) LOG.debug(out.getPos() + ": end");
  ParquetMetadata footer = new ParquetMetadata(new FileMetaData(schema, extraMetaData, Version.FULL_VERSION), blocks);
  serializeFooter(footer, out);
  out.close();
}
try {
  ParquetMetadata parquetMetadata = ParquetMetadataReader.readFooter(configuration, path);
  FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
  MessageType fileSchema = fileMetaData.getSchema();
  for (BlockMetaData block : parquetMetadata.getBlocks()) {
    long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
    if (firstDataPage >= start && firstDataPage < start + length) {
      // ...
    }
  }
  ParquetPredicate parquetPredicate = buildParquetPredicate(columns, effectivePredicate, fileMetaData.getSchema(), typeManager);
  blocks = blocks.stream()
      .filter(block -> predicateMatches(parquetPredicate, block, configuration, dataSource, requestedSchema, effectivePredicate))
  // ...
      fileMetaData.getSchema(),
      fileMetaData.getKeyValueMetaData(),
      requestedSchema,
      blocks,
FileStatus fileStatus = fs.getFileStatus(file);
ParquetMetadata parquetMetaData = footer.getParquetMetadata();
List<BlockMetaData> blocks = parquetMetaData.getBlocks();
filteredBlocks = RowGroupFilter.filterRowGroups(filter, blocks, parquetMetaData.getFileMetaData().getSchema());
rowGroupsDropped += blocks.size() - filteredBlocks.size();
MessageType schema = metaData.getFileMetaData().getSchema();