FSDataInputStream inputStream = fileSystem.open(path);
ParquetMetadata parquetMetadata = MetadataReader.readFooter(inputStream, path, fileSize);
FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
MessageType fileSchema = fileMetaData.getSchema();
dataSource = buildHdfsParquetDataSource(inputStream, path, fileSize, stats);
public parquet.hadoop.metadata.FileMetaData getFileMetaData() {
    return this.metaData.getFileMetaData();
}
/**
 * Reads the Parquet schema from a Parquet data file.
 */
private MessageType readSchemaFromDataFile(Path parquetFilePath) throws IOException {
    LOG.info("Reading schema from " + parquetFilePath);
    if (!fs.exists(parquetFilePath)) {
        throw new IllegalArgumentException(
            "Failed to read schema from data file " + parquetFilePath + ". File does not exist.");
    }
    ParquetMetadata fileFooter =
        ParquetFileReader.readFooter(fs.getConf(), parquetFilePath, ParquetMetadataConverter.NO_FILTER);
    return fileFooter.getFileMetaData().getSchema();
}
static GlobalMetaData getGlobalMetaData(List<Footer> footers, boolean strict) {
    GlobalMetaData fileMetaData = null;
    for (Footer footer : footers) {
        ParquetMetadata currentMetadata = footer.getParquetMetadata();
        fileMetaData = mergeInto(currentMetadata.getFileMetaData(), fileMetaData, strict);
    }
    return fileMetaData;
}
/**
 * Merges the metadata of all the footers together.
 * @param footers the footers of the files to merge
 * @return the global metadata for all the footers
 */
static GlobalMetaData getGlobalMetaData(List<Footer> footers) {
    GlobalMetaData fileMetaData = null;
    for (Footer footer : footers) {
        ParquetMetadata currentMetadata = footer.getParquetMetadata();
        fileMetaData = mergeInto(currentMetadata.getFileMetaData(), fileMetaData);
    }
    return fileMetaData;
}
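The merge helpers above consume a List<Footer>. A minimal sketch of how such a list is typically obtained and walked, assuming the org.apache.parquet coordinates and the (deprecated but still available) ParquetFileReader.readAllFootersInParallel helper:

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.Footer;
import org.apache.parquet.hadoop.ParquetFileReader;

public class ListFootersExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path dir = new Path(args[0]); // directory containing Parquet files
        FileStatus status = FileSystem.get(conf).getFileStatus(dir);

        // Read the footer of every Parquet file under the directory in parallel.
        List<Footer> footers = ParquetFileReader.readAllFootersInParallel(conf, status);

        // Each footer exposes the per-file ParquetMetadata -> FileMetaData -> schema chain
        // that the merge helpers above fold into a single GlobalMetaData.
        for (Footer footer : footers) {
            System.out.println(footer.getFile() + " -> "
                + footer.getParquetMetadata().getFileMetaData().getSchema());
        }
    }
}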
static List<Footer> footersFromSummaryFile(final Path parent, ParquetMetadata mergedFooters) {
    Map<Path, ParquetMetadata> footers = new HashMap<Path, ParquetMetadata>();
    List<BlockMetaData> blocks = mergedFooters.getBlocks();
    for (BlockMetaData block : blocks) {
        String path = block.getPath();
        Path fullPath = new Path(parent, path);
        ParquetMetadata current = footers.get(fullPath);
        if (current == null) {
            current = new ParquetMetadata(mergedFooters.getFileMetaData(), new ArrayList<BlockMetaData>());
            footers.put(fullPath, current);
        }
        current.getBlocks().add(block);
    }
    List<Footer> result = new ArrayList<Footer>();
    for (Entry<Path, ParquetMetadata> entry : footers.entrySet()) {
        result.add(new Footer(entry.getKey(), entry.getValue()));
    }
    return result;
}
/**
 * Reads the schema from the Parquet file. This differs from ParquetUtils in that it uses the
 * Twitter parquet artifacts to support Hive 1.1.0.
 */
private static MessageType readSchema(Configuration conf, Path parquetFilePath) {
    try {
        return ParquetFileReader.readFooter(conf, parquetFilePath).getFileMetaData().getSchema();
    } catch (IOException e) {
        throw new HoodieIOException("Failed to read footer for parquet " + parquetFilePath, e);
    }
}
static ParquetMetadata mergeFooters(Path root, List<Footer> footers) {
    String rootPath = root.toUri().getPath();
    GlobalMetaData fileMetaData = null;
    List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
    for (Footer footer : footers) {
        String footerPath = footer.getFile().toUri().getPath();
        if (!footerPath.startsWith(rootPath)) {
            throw new ParquetEncodingException(
                footerPath + " invalid: all the files must be contained in the root " + root);
        }
        footerPath = footerPath.substring(rootPath.length());
        while (footerPath.startsWith("/")) {
            footerPath = footerPath.substring(1);
        }
        fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData);
        for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
            block.setPath(footerPath);
            blocks.add(block);
        }
    }
    return new ParquetMetadata(fileMetaData.merge(), blocks);
}
MessageType schema = metaData.getFileMetaData().getSchema();
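Once the MessageType is in hand, it can be inspected field by field. A small hedged sketch (the describe helper and its output format are illustrative, not part of any snippet above):

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Type;

final class SchemaDescriber {
    // Hypothetical helper: lists the top-level fields and leaf columns of a Parquet schema.
    static void describe(MessageType schema) {
        for (Type field : schema.getFields()) {
            System.out.println("field:  " + field.getName());
        }
        for (ColumnDescriptor column : schema.getColumns()) {
            System.out.println("column: " + String.join(".", column.getPath()));
        }
    }
}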
public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
    List<BlockMetaData> blocks = parquetMetadata.getBlocks();
    List<RowGroup> rowGroups = new ArrayList<RowGroup>();
    long numRows = 0; // long, so large row counts are not truncated by the compound assignment below
    for (BlockMetaData block : blocks) {
        numRows += block.getRowCount();
        addRowGroup(parquetMetadata, rowGroups, block);
    }
    FileMetaData fileMetaData = new FileMetaData(
        currentVersion,
        toParquetSchema(parquetMetadata.getFileMetaData().getSchema()),
        numRows,
        rowGroups);
    Set<Entry<String, String>> keyValues =
        parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
    for (Entry<String, String> keyValue : keyValues) {
        addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
    }
    fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());
    return fileMetaData;
}
private static ParquetMetadata mergeFooters(Path root, List<Footer> footers) {
    String rootPath = root.toString();
    GlobalMetaData fileMetaData = null;
    List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
    for (Footer footer : footers) {
        String path = footer.getFile().toString();
        if (!path.startsWith(rootPath)) {
            throw new ParquetEncodingException(
                path + " invalid: all the files must be contained in the root " + root);
        }
        path = path.substring(rootPath.length());
        while (path.startsWith("/")) {
            path = path.substring(1);
        }
        fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData);
        for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
            block.setPath(path);
            blocks.add(block);
        }
    }
    return new ParquetMetadata(fileMetaData.merge(), blocks);
}
public static void showDetails(PrettyPrintWriter out, ParquetMetadata meta) {
    showDetails(out, meta.getFileMetaData());
    long i = 1;
    for (BlockMetaData bmeta : meta.getBlocks()) {
        out.println();
        showDetails(out, bmeta, i++);
    }
}
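A hedged sketch of how this dump routine might be driven, reusing the readFooter and PrettyPrintWriter calls that appear in other snippets of this section (the input path is hypothetical):

Configuration conf = new Configuration();
Path path = new Path("/tmp/example.parquet"); // hypothetical input file
ParquetMetadata meta =
    ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);

PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
    .withAutoColumn()
    .withColumnPadding(1)
    .build();

// Prints the file-level metadata followed by one section per row group.
showDetails(out, meta);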
private ParquetFileReader createFileReader(ParquetMetadata meta, List<BlockMetaData> blocks) throws IOException {
    FileMetaData fileMetaData = meta.getFileMetaData();
    if (FILE_READER_NEWER_CTOR != null) {
        try {
            return FILE_READER_NEWER_CTOR.newInstance(
                hadoopConfiguration, fileMetaData, path, blocks, fileMetaData.getSchema().getColumns());
        } catch (ReflectiveOperationException | IllegalArgumentException | SecurityException e) {
            LOG.debug("failed ParquetFileReader.<init>", e);
        }
    }
    return new ParquetFileReader(
        hadoopConfiguration, path, blocks, fileMetaData.getSchema().getColumns());
}
private MessageType readSchema(FlowProcess<JobConf> flowProcess, Tap tap) {
    try {
        Hfs hfs;
        if (tap instanceof CompositeTap) {
            hfs = (Hfs) ((CompositeTap) tap).getChildTaps().next();
        } else {
            hfs = (Hfs) tap;
        }
        List<Footer> footers = getFooters(flowProcess, hfs);
        if (footers.isEmpty()) {
            throw new TapException("Could not read Parquet metadata at " + hfs.getPath());
        } else {
            return footers.get(0).getParquetMetadata().getFileMetaData().getSchema();
        }
    } catch (IOException e) {
        throw new TapException(e);
    }
}
@Override
public void execute(CommandLine options) throws Exception {
    super.execute(options);

    String[] args = options.getArgs();
    String input = args[0];

    Configuration conf = new Configuration();
    Path inpath = new Path(input);

    ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inpath);
    MessageType schema = metaData.getFileMetaData().getSchema();

    PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
        .withAutoColumn()
        .withAutoCrop()
        .withWhitespaceHandler(WhiteSpaceHandler.ELIMINATE_NEWLINES)
        .withColumnPadding(1)
        .withMaxBufferedLines(1000000)
        .withFlushOnTab()
        .build();

    boolean showmd = !options.hasOption('m');
    boolean showdt = !options.hasOption('d');

    Set<String> showColumns = null;
    if (options.hasOption('c')) {
        String[] cols = options.getOptionValues('c');
        showColumns = new HashSet<String>(Arrays.asList(cols));
    }

    dump(out, metaData, schema, inpath, showmd, showdt, showColumns);
}
private void initReader() throws IOException {
    if (reader != null) {
        reader.close();
        reader = null;
    }
    if (footersIterator.hasNext()) {
        Footer footer = footersIterator.next();

        List<BlockMetaData> blocks = footer.getParquetMetadata().getBlocks();
        MessageType fileSchema = footer.getParquetMetadata().getFileMetaData().getSchema();

        List<BlockMetaData> filteredBlocks = RowGroupFilter.filterRowGroups(filter, blocks, fileSchema);

        reader = new InternalParquetRecordReader<T>(readSupport, filter);
        reader.initialize(fileSchema,
            footer.getParquetMetadata().getFileMetaData().getKeyValueMetaData(),
            footer.getFile(), filteredBlocks, conf);
    }
}
this.materializer = new DataModelMaterializer(
    descriptor, footer.getFileMetaData().getSchema(), mappingConfiguration);
this.columnIo = new ColumnIOFactory().getColumnIO(
    materializer.getMaterializeSchema(), footer.getFileMetaData().getSchema());
filteredBlocks = RowGroupFilter.filterRowGroups(filter, blocks, parquetMetaData.getFileMetaData().getSchema());
rowGroupsDropped += blocks.size() - filteredBlocks.size();
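The filter passed to RowGroupFilter.filterRowGroups is typically built from a statistics predicate. A minimal sketch assuming an int64 column named "id" (the column name and value are illustrative):

import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.filter2.predicate.FilterPredicate;

// Keep only row groups whose statistics may contain id == 42.
FilterPredicate predicate = FilterApi.eq(FilterApi.longColumn("id"), 42L);
FilterCompat.Filter filter = FilterCompat.get(predicate);

List<BlockMetaData> filteredBlocks =
    RowGroupFilter.filterRowGroups(filter, blocks, parquetMetaData.getFileMetaData().getSchema());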
private void initReader() throws IOException {
    if (reader != null) {
        reader.close();
        reader = null;
    }
    if (footersIterator.hasNext()) {
        Footer footer = footersIterator.next();
        reader = new InternalParquetRecordReader<T>(readSupport, filter);
        reader.initialize(
            readContext.getRequestedSchema(),
            globalMetaData.getSchema(),
            footer.getParquetMetadata().getFileMetaData().getKeyValueMetaData(),
            readContext.getReadSupportMetadata(),
            footer.getFile(),
            footer.getParquetMetadata().getBlocks(),
            conf);
    }
}
private static void add(ParquetMetadata footer) {
    for (BlockMetaData blockMetaData : footer.getBlocks()) {
        ++blockCount;
        MessageType schema = footer.getFileMetaData().getSchema();
        recordCount += blockMetaData.getRowCount();
        List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
        for (ColumnChunkMetaData columnMetaData : columns) {
            ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
            add(desc,
                columnMetaData.getValueCount(),
                columnMetaData.getTotalSize(),
                columnMetaData.getTotalUncompressedSize(),
                columnMetaData.getEncodings(),
                columnMetaData.getStatistics());
        }
    }
}