ParquetMetadata parquetMetadata = MetadataReader.readFooter(inputStream, path, fileSize);
FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
MessageType fileSchema = fileMetaData.getSchema();
dataSource = buildHdfsParquetDataSource(inputStream, path, fileSize, stats);
/**
 * Read the Parquet schema from a Parquet data file.
 */
private MessageType readSchemaFromDataFile(Path parquetFilePath) throws IOException {
  LOG.info("Reading schema from " + parquetFilePath);
  if (!fs.exists(parquetFilePath)) {
    throw new IllegalArgumentException(
        "Failed to read schema from data file " + parquetFilePath + ". File does not exist.");
  }
  ParquetMetadata fileFooter = ParquetFileReader.readFooter(
      fs.getConf(), parquetFilePath, ParquetMetadataConverter.NO_FILTER);
  return fileFooter.getFileMetaData().getSchema();
}
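For reference, a self-contained sketch of the same footer-read pattern outside any helper class. The class name and the path argument are illustrative; ParquetFileReader.readFooter with a MetadataFilter is the same call used above, though newer parquet-mr releases deprecate it in favor of ParquetFileReader.open.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;

public class PrintParquetSchema {
  public static void main(String[] args) throws Exception {
    // args[0] is the Parquet file to inspect, e.g. file:///tmp/data.parquet (hypothetical)
    ParquetMetadata footer = ParquetFileReader.readFooter(
        new Configuration(), new Path(args[0]), ParquetMetadataConverter.NO_FILTER);
    MessageType schema = footer.getFileMetaData().getSchema();
    System.out.println(schema);
  }
}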
private ParquetFileReader createFileReader(ParquetMetadata meta, List<BlockMetaData> blocks)
    throws IOException {
  FileMetaData fileMetaData = meta.getFileMetaData();
  if (FILE_READER_NEWER_CTOR != null) {
    try {
      return FILE_READER_NEWER_CTOR.newInstance(
          hadoopConfiguration, fileMetaData, path, blocks, fileMetaData.getSchema().getColumns());
    } catch (ReflectiveOperationException | IllegalArgumentException | SecurityException e) {
      LOG.debug("failed ParquetFileReader.<init>", e);
    }
  }
  return new ParquetFileReader(
      hadoopConfiguration, path, blocks, fileMetaData.getSchema().getColumns());
}
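The reflective constructor lookup here is a compatibility shim rather than the canonical API: when a newer ParquetFileReader constructor that also accepts FileMetaData is present it is preferred, and any reflective failure is logged at debug level and falls back to the older public constructor, so the same jar can run against several parquet-mr versions.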
static GlobalMetaData mergeInto(
    FileMetaData toMerge,
    GlobalMetaData mergedMetadata,
    boolean strict) {
  MessageType schema = null;
  Map<String, Set<String>> newKeyValues = new HashMap<String, Set<String>>();
  Set<String> createdBy = new HashSet<String>();
  if (mergedMetadata != null) {
    schema = mergedMetadata.getSchema();
    newKeyValues.putAll(mergedMetadata.getKeyValueMetaData());
    createdBy.addAll(mergedMetadata.getCreatedBy());
  }
  if ((schema == null && toMerge.getSchema() != null)
      || (schema != null && !schema.equals(toMerge.getSchema()))) {
    schema = mergeInto(toMerge.getSchema(), schema, strict);
  }
  for (Entry<String, String> entry : toMerge.getKeyValueMetaData().entrySet()) {
    Set<String> values = newKeyValues.get(entry.getKey());
    if (values == null) {
      values = new HashSet<String>();
      newKeyValues.put(entry.getKey(), values);
    }
    values.add(entry.getValue());
  }
  createdBy.add(toMerge.getCreatedBy());
  return new GlobalMetaData(
      schema,
      newKeyValues,
      createdBy);
}
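A hedged sketch of how such a merge helper is typically driven, assuming a list of footers has already been read (the footers variable is illustrative; Footer is parquet-mr's org.apache.parquet.hadoop.Footer):

GlobalMetaData merged = null;
for (Footer footer : footers) {
  // fold each file's metadata into the running union; with strict = true,
  // incompatible schemas throw instead of being silently widened
  merged = mergeInto(footer.getParquetMetadata().getFileMetaData(), merged, true);
}
MessageType unionSchema = merged.getSchema();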
/**
 * Reads the schema from the Parquet file. This differs from ParquetUtils in that it uses the
 * Twitter Parquet artifact, to stay compatible with Hive 1.1.0.
 */
private static MessageType readSchema(Configuration conf, Path parquetFilePath) {
  try {
    return ParquetFileReader.readFooter(conf, parquetFilePath).getFileMetaData().getSchema();
  } catch (IOException e) {
    throw new HoodieIOException("Failed to read footer for parquet " + parquetFilePath, e);
  }
}
MessageType schema = metaData.getFileMetaData().getSchema();
private MessageType readSchema(FlowProcess<JobConf> flowProcess, Tap tap) {
  try {
    Hfs hfs;
    if (tap instanceof CompositeTap)
      hfs = (Hfs) ((CompositeTap) tap).getChildTaps().next();
    else
      hfs = (Hfs) tap;
    List<Footer> footers = getFooters(flowProcess, hfs);
    if (footers.isEmpty()) {
      throw new TapException("Could not read Parquet metadata at " + hfs.getPath());
    } else {
      return footers.get(0).getParquetMetadata().getFileMetaData().getSchema();
    }
  } catch (IOException e) {
    throw new TapException(e);
  }
}
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String[] args = options.getArgs();
  String input = args[0];

  Configuration conf = new Configuration();
  Path inpath = new Path(input);

  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inpath);
  MessageType schema = metaData.getFileMetaData().getSchema();

  PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
      .withAutoColumn()
      .withAutoCrop()
      .withWhitespaceHandler(WhiteSpaceHandler.ELIMINATE_NEWLINES)
      .withColumnPadding(1)
      .withMaxBufferedLines(1000000)
      .withFlushOnTab()
      .build();

  boolean showmd = !options.hasOption('m');
  boolean showdt = !options.hasOption('d');

  Set<String> showColumns = null;
  if (options.hasOption('c')) {
    String[] cols = options.getOptionValues('c');
    showColumns = new HashSet<String>(Arrays.asList(cols));
  }

  dump(out, metaData, schema, inpath, showmd, showdt, showColumns);
}
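Judging from the flag handling, this is parquet-tools' dump command, and the options are inverted: -m suppresses the metadata section, -d suppresses the data section, and -c restricts output to the named columns. A plausible invocation (file name hypothetical):

parquet-tools dump -d -c my_col /tmp/example.parquet   // metadata only, for column my_col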
public static void showDetails(PrettyPrintWriter out, FileMetaData meta) {
  out.format("creator: %s%n", meta.getCreatedBy());

  Map<String, String> extra = meta.getKeyValueMetaData();
  if (extra != null) {
    for (Map.Entry<String, String> entry : extra.entrySet()) {
      out.print("extra: ");
      out.incrementTabLevel();
      out.format("%s = %s%n", entry.getKey(), entry.getValue());
      out.decrementTabLevel();
    }
  }

  out.println();
  out.format("file schema: %s%n", meta.getSchema().getName());
  out.rule('-');
  showDetails(out, meta.getSchema());
}
filteredBlocks = RowGroupFilter.filterRowGroups(
    filter, blocks, parquetMetaData.getFileMetaData().getSchema());
rowGroupsDropped += blocks.size() - filteredBlocks.size();
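For context, a minimal sketch of constructing the filter argument with parquet-mr's public predicate API (the column name id is illustrative):

import static org.apache.parquet.filter2.predicate.FilterApi.gt;
import static org.apache.parquet.filter2.predicate.FilterApi.longColumn;
import org.apache.parquet.filter2.compat.FilterCompat;

// keep only row groups whose statistics might contain rows with id > 100
FilterCompat.Filter filter = FilterCompat.get(gt(longColumn("id"), 100L));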
ParquetMetadata metadata = metadatas[i];
MessageType schema = metadata.getFileMetaData().getSchema();
public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
  List<BlockMetaData> blocks = parquetMetadata.getBlocks();
  List<RowGroup> rowGroups = new ArrayList<RowGroup>();
  long numRows = 0; // long, not int: total row count can exceed Integer.MAX_VALUE
  for (BlockMetaData block : blocks) {
    numRows += block.getRowCount();
    addRowGroup(parquetMetadata, rowGroups, block);
  }
  FileMetaData fileMetaData = new FileMetaData(
      currentVersion,
      toParquetSchema(parquetMetadata.getFileMetaData().getSchema()),
      numRows,
      rowGroups);
  Set<Entry<String, String>> keyValues =
      parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
  for (Entry<String, String> keyValue : keyValues) {
    addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
  }
  fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());
  return fileMetaData;
}
MessageType fileSchema = footer.getFileMetaData().getSchema();
Filter filter = getFilter(configuration);
filteredBlocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
Map<String, String> fileMetaData = footer.getFileMetaData().getKeyValueMetaData();
internalReader.initialize(
private void initReader() throws IOException {
  if (reader != null) {
    reader.close();
    reader = null;
  }
  if (footersIterator.hasNext()) {
    Footer footer = footersIterator.next();
    List<BlockMetaData> blocks = footer.getParquetMetadata().getBlocks();
    MessageType fileSchema = footer.getParquetMetadata().getFileMetaData().getSchema();
    List<BlockMetaData> filteredBlocks = RowGroupFilter.filterRowGroups(
        filter, blocks, fileSchema);
    reader = new InternalParquetRecordReader<T>(readSupport, filter);
    reader.initialize(fileSchema,
        footer.getParquetMetadata().getFileMetaData().getKeyValueMetaData(),
        footer.getFile(), filteredBlocks, conf);
  }
}
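For completeness, a sketch of the read loop that typically drives initReader(), in the style of parquet-mr's ParquetReader (reader and initReader are the field and method shown above):

public T read() throws IOException {
  try {
    if (reader != null && reader.nextKeyValue()) {
      return reader.getCurrentValue();
    } else {
      initReader();                           // advance to the next footer, if any
      return reader == null ? null : read();  // null once all footers are exhausted
    }
  } catch (InterruptedException e) {
    throw new IOException(e);
  }
}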
    null, fileMetaData.getSchema()));
schemaSize = MessageTypeParser.parseMessageType(readContext.getReadSupportMetadata()
    .get(DataWritableReadSupport.HIVE_TABLE_AS_PARQUET_SCHEMA)).getFieldCount();
filtedBlocks = RowGroupFilter.filterRowGroups(filter, splitGroup, fileMetaData.getSchema());
if (filtedBlocks.isEmpty()) {
  LOG.debug("All row groups are dropped due to filter predicates");
filtedBlocks, readContext.getRequestedSchema().toString(),
    fileMetaData.getSchema().toString(), fileMetaData.getKeyValueMetaData(),
    readContext.getReadSupportMetadata());
ParquetMetadata parquetMetadata = ParquetMetadataReader.readFooter(jobConf, path);
FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
MessageType fileSchema = fileMetaData.getSchema();
private static void add(ParquetMetadata footer) {
  // the schema is per-file, so look it up once rather than once per row group
  MessageType schema = footer.getFileMetaData().getSchema();
  for (BlockMetaData blockMetaData : footer.getBlocks()) {
    ++blockCount;
    recordCount += blockMetaData.getRowCount();
    for (ColumnChunkMetaData columnMetaData : blockMetaData.getColumns()) {
      ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
      add(desc,
          columnMetaData.getValueCount(),
          columnMetaData.getTotalSize(),
          columnMetaData.getTotalUncompressedSize(),
          columnMetaData.getEncodings(),
          columnMetaData.getStatistics());
    }
  }
}
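A hedged sketch of a driver for this accumulator, assuming the static blockCount/recordCount counters implied above (rootPath is hypothetical; readFooters is the older footer-listing call in parquet-mr):

Configuration conf = new Configuration();
Path rootPath = new Path("hdfs:///warehouse/events");  // hypothetical location
for (Footer footer : ParquetFileReader.readFooters(conf, rootPath)) {
  add(footer.getParquetMetadata());  // accumulate block, record, and column stats
}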