// Read the Parquet footer, preferring the LLAP metadata cache when available.
MemoryBufferOrBuffers footerData =
    (cacheKey == null || metadataCache == null) ? null : metadataCache.getFileMetadata(cacheKey);
if (footerData != null) {
  try {
    return ParquetFileReader.readFooter(new ParquetFooterInputFromCache(footerData), filter);
  } finally {
    metadataCache.decRefBuffer(footerData);
  }
}
final FileSystem fs = file.getFileSystem(configuration);
final FileStatus stat = fs.getFileStatus(file);
if (cacheKey == null || metadataCache == null) {
  return readFooterFromFile(file, fs, stat, filter);
}
// Cache the footer bytes first, then serve the read from the cache so the
// footer is only pulled from the file system once.
try (SeekableInputStream stream = HadoopStreams.wrap(fs.open(file))) {
  long footerLengthIndex = stat.getLen()
      - ParquetFooterInputFromCache.FOOTER_LENGTH_SIZE - ParquetFileWriter.MAGIC.length;
  // ... the original snippet elides seeking to footerLengthIndex and inserting
  // the footer bytes into metadataCache here ...
  try {
    return ParquetFileReader.readFooter(new ParquetFooterInputFromCache(footerData), filter);
  } finally {
    metadataCache.decRefBuffer(footerData);
  }
}
/**
 * Load Decision Tree model.
 *
 * @param pathToMdl Path to model.
 */
private static Model loadDecisionTreeModel(String pathToMdl) {
    try (ParquetFileReader r = ParquetFileReader.open(
        HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
        final Map<Integer, NodeData> nodes = new TreeMap<>();

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));

            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                NodeData nodeData = extractNodeDataFromParquetRow(g);
                nodes.put(nodeData.id, nodeData);
            }
        }

        return buildDecisionTreeModel(nodes);
    }
    catch (IOException e) {
        System.out.println("Error reading parquet file.");
        e.printStackTrace();
    }

    return null;
}
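// The row-group loop above is the standard low-level read pattern in parquet-mr:
// read the footer schema, then materialize each row group through a RecordReader.
// Below is a minimal, self-contained sketch of the same pattern, assuming only
// parquet-mr and Hadoop on the classpath; the class name and argument handling
// are illustrative, not from the original source.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.io.RecordReader;
import org.apache.parquet.schema.MessageType;

public class DumpParquetRows {
    public static void main(String[] args) throws Exception {
        try (ParquetFileReader r = ParquetFileReader.open(
            HadoopInputFile.fromPath(new Path(args[0]), new Configuration()))) {
            MessageType schema = r.getFooter().getFileMetaData().getSchema();
            MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

            PageReadStore pages;
            while (null != (pages = r.readNextRowGroup())) {
                RecordReader<Group> recordReader =
                    colIO.getRecordReader(pages, new GroupRecordConverter(schema));
                for (long i = 0, rows = pages.getRowCount(); i < rows; i++) {
                    // Group#toString prints the record's field/value pairs
                    System.out.println(recordReader.read());
                }
            }
        }
    }
}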
@Override
public void close() throws IOException {
  if (cacheFsPath != null) {
    LlapCacheAwareFs.unregisterFile(cacheFsPath);
  }
  if (reader != null) {
    reader.close();
  }
}
Configuration config = new Configuration();
config.set("spark.sql.parquet.binaryAsString", "false");
config.set("spark.sql.parquet.int96AsTimestamp", "false");

this.file = new Path(path);
long length = this.file.getFileSystem(config).getFileStatus(this.file).getLen();
ParquetMetadata footer = readFooter(config, file, range(0, length));

List<BlockMetaData> blocks = footer.getBlocks();
this.fileSchema = footer.getFileMetaData().getSchema();

// Project the requested columns onto the file schema.
Types.MessageTypeBuilder builder = Types.buildMessage();
for (String s : columns) {
  if (!fileSchema.containsField(s)) {
    throw new IOException("Can only project existing columns. Unknown field: " + s +
        " File schema:\n" + fileSchema);
  }
  builder.addFields(fileSchema.getType(s));
}
this.requestedSchema = builder.named("spark_schema");

this.reader = new ParquetFileReader(
    config, footer.getFileMetaData(), file, blocks, requestedSchema.getColumns());
for (BlockMetaData block : reader.getRowGroups()) {
  this.totalRowCount += block.getRowCount();
}
final FileStatus[] fileStatuses = fs.listStatus(new Path(parquetDir));
// assumption: the original snippet elides how the entry is picked; the last
// listed status is used here
final FileStatus lastEntry = fileStatuses[fileStatuses.length - 1];
Optional<Path> parquetFilePath = Optional.absent(); // Guava Optional assumed, matching Optional.of below
if (lastEntry.isFile()) {
  parquetFilePath = Optional.of(lastEntry.getPath());
} else if (lastEntry.isDirectory()) {
  final FileStatus[] directoryEntries = fs.listStatus(lastEntry.getPath());
  // assumption: the last directory entry is the data file to read the schema from
  final FileStatus lastFile = directoryEntries[directoryEntries.length - 1];
  if (lastFile.isFile()) {
    log.info("Reading schema data from : {}", lastFile.getPath().toString());
    parquetFilePath = Optional.of(lastFile.getPath());
  } else {
    throw new JobRuntimeException("Encountered a directory where there should only be files. Path: "
        + lastFile.getPath().toString());
  }
}
final ParquetMetadata metadata = ParquetFileReader.readFooter(new Configuration(),
    parquetFilePath.get(), ParquetMetadataConverter.NO_FILTER);
final MessageType messageType = metadata.getFileMetaData().getSchema();
final ParquetSchemaConverter converter = new ParquetSchemaConverter(new SQLConf());
final StructType structType = converter.convert(messageType);
public ParquetFileLoader(String filename, boolean lazy) {
  super(filename);
  this.path = new Path(this.filename);
  this.lazy = lazy;
  this.configuration = new Configuration();
  System.setProperty("hadoop.home.dir", "/");
  this.configuration.set("hadoop.security.authentication", "simple");
  this.configuration.set("hadoop.security.authorization", "false");
  try {
    this.metadata = ParquetFileReader.readFooter(this.configuration, this.path,
        ParquetMetadataConverter.NO_FILTER);
  } catch (IOException ex) {
    throw new RuntimeException(ex);
  }
}
public void runTestAndValidate(String selection, String validationSelection,
    String inputTable, String outputFile, boolean sort) throws Exception {
  try {
    deleteTableIfExists(outputFile);
    test("use dfs_test");
    // test("ALTER SESSION SET \"planner.add_producer_consumer\" = false");
    String query = select(selection, inputTable, sort);
    System.out.println(outputFile);
    String create = "CREATE TABLE " + outputFile + " AS " + query;
    String validateQuery = select(validationSelection, outputFile, sort);
    test(create);
    test(validateQuery); // TODO: remove
    testBuilder()
        .unOrdered()
        .sqlQuery(validateQuery)
        .sqlBaselineQuery(query)
        .go();

    Configuration hadoopConf = new Configuration();
    Path output = new Path(getDfsTestTmpSchemaLocation(), outputFile);
    FileSystem fs = output.getFileSystem(hadoopConf);
    for (FileStatus file : fs.listStatus(output)) {
      ParquetMetadata footer = ParquetFileReader.readFooter(hadoopConf, file, SKIP_ROW_GROUPS);
      String version = footer.getFileMetaData().getKeyValueMetaData().get(DREMIO_VERSION_PROPERTY);
      assertEquals(DremioVersionInfo.getVersion(), version);
      PageHeaderUtil.validatePageHeaders(file.getPath(), footer);
    }
  } finally {
    deleteTableIfExists(outputFile);
  }
}
static ParquetMetadata readSummaryMetadata(Configuration configuration, Path basePath,
    boolean skipRowGroups) throws IOException {
  Path metadataFile = new Path(basePath, PARQUET_METADATA_FILE);
  Path commonMetaDataFile = new Path(basePath, PARQUET_COMMON_METADATA_FILE);
  FileSystem fileSystem = basePath.getFileSystem(configuration);
  if (skipRowGroups && fileSystem.exists(commonMetaDataFile)) {
    // reading the summary file that does not contain the row groups
    LOG.info("reading summary file: {}", commonMetaDataFile);
    return readFooter(configuration, commonMetaDataFile, filter(skipRowGroups));
  } else if (fileSystem.exists(metadataFile)) {
    LOG.info("reading summary file: {}", metadataFile);
    return readFooter(configuration, metadataFile, filter(skipRowGroups));
  } else {
    return null;
  }
}
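// When neither summary file exists this helper returns null, so callers need a
// fallback. A hedged caller sketch, assuming the deprecated single-file
// ParquetFileReader.readFooter overload; the part-file name is illustrative,
// not from the original source.
static ParquetMetadata readSummaryOrFileFooter(Configuration configuration, Path basePath)
    throws IOException {
  ParquetMetadata summary = readSummaryMetadata(configuration, basePath, true);
  if (summary != null) {
    return summary;
  }
  Path partFile = new Path(basePath, "part-00000.parquet"); // illustrative file name
  return ParquetFileReader.readFooter(configuration, partFile,
      ParquetMetadataConverter.SKIP_ROW_GROUPS);
}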
public void initialize(ParquetFileReader reader, ParquetReadOptions options) {
  // copy custom configuration to the Configuration passed to the ReadSupport
  Configuration conf = new Configuration();
  if (options instanceof HadoopReadOptions) {
    conf = ((HadoopReadOptions) options).getConf();
  }
  for (String property : options.getPropertyNames()) {
    conf.set(property, options.getProperty(property));
  }

  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(conf, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(conf, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = options.isEnabled(STRICT_TYPE_CHECKING, true);
  this.total = reader.getRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(options, total);
  this.filterRecords = options.useRecordFilter();
  reader.setRequestedSchema(requestedSchema);
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
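// For callers of the initializer above, the ParquetReadOptions are typically
// built from a Hadoop Configuration via parquet-mr's HadoopReadOptions builder,
// which is what makes the instanceof branch above recover the original conf.
// A minimal sketch; the file path is illustrative.
Configuration conf = new Configuration();
ParquetReadOptions options = HadoopReadOptions.builder(conf).build();
try (ParquetFileReader reader = ParquetFileReader.open(
    HadoopInputFile.fromPath(new Path("/tmp/example.parquet"), conf), options)) {
  initialize(reader, options); // hands both pieces to the method above
}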
// Copy the input to a temp file so parquet-mr can do seekable reads on it
// (the original leaked the FileOutputStream; try-with-resources closes it).
try (FileOutputStream out = new FileOutputStream(tempFile)) {
  IOUtils.copy(in, out);
}
Configuration conf = new Configuration();
Path path = new Path(tempFile.getAbsolutePath());
ParquetMetadata metaData = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);
MessageType schema = metaData.getFileMetaData().getSchema();
parserHandler.setTotalRows(getRowCount(metaData));

try (ParquetReader<SimpleRecord> reader = ParquetReader.builder(new SimpleReadSupport(),
    new Path(tempFile.getAbsolutePath())).build()) {
  parserHandler.newSheet("");
  int rowNum = 0;
  SimpleRecord record;
  // assumption: the original snippet elides this read loop; only its body survived
  while ((record = reader.read()) != null) {
    Map<String, Object> row = new LinkedHashMap<>();
    for (SimpleRecord.NameValue nameValue : record.getValues()) {
      String name = nameValue.getName();
      Object val = nameValue.getValue();
      if (!(val instanceof SimpleRecord)) {
        // map the raw value through the file schema's type for this column
        Type type = schema.getType(schema.getFieldIndex(name));
        row.put(name, getRecordValue(val, type));
      }
    }
    // ... the original snippet elides handing the row to parserHandler and
    // advancing rowNum ...
  }
}
final ParquetMetadata metadata;
final UserGroupInformation processUserUgi = ImpersonationUtil.getProcessUserUGI();
final Configuration conf = new Configuration(fs.getConf());
try {
  // Read the footer as the process user rather than the query user.
  metadata = processUserUgi.doAs((PrivilegedExceptionAction<ParquetMetadata>) () -> {
    try (ParquetFileReader parquetFileReader =
        ParquetFileReader.open(HadoopInputFile.fromStatus(file, conf), readerConfig.toReadOptions())) {
      return parquetFileReader.getFooter();
    }
  });
} catch (IOException | InterruptedException e) {
  // assumption: the original snippet elides the error handling for the doAs call
  throw new IOException("Failed to read footer for " + file.getPath(), e);
}

MessageType schema = metadata.getFileMetaData().getSchema();
Map<SchemaPath, ColTypeInfo> colTypeInfoMap = new HashMap<>();
for (String[] path : schema.getPaths()) {
  colTypeInfoMap.put(SchemaPath.getCompoundPath(path), getColTypeInfo(schema, schema, path, 0));
}

logger.debug("Contains corrupt dates: {}.", containsCorruptDates);
for (BlockMetaData rowGroup : metadata.getBlocks()) {
  List<ColumnMetadata_v3> columnMetadataList = new ArrayList<>();
  long length = 0;
  // ... per-column chunk metadata collection elided in the original snippet ...
}
String path = Path.getPathWithoutSchemeAndAuthority(file.getPath()).toString();
if (rowGroupOffsets == null) {
  // assumption: this guard is elided in the original snippet; the else branch implies it
  footer = readFooter(configuration, file, range(split.getStart(), split.getEnd()));
  MessageType fileSchema = footer.getFileMetaData().getSchema();
  FilterCompat.Filter filter = getFilter(configuration);
  blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
} else {
  // Row-group offsets were handed in up front: read the whole footer and keep
  // only the row groups that start at one of the requested offsets.
  footer = readFooter(configuration, file, NO_FILTER);
  Set<Long> offsets = new HashSet<>();
  for (long offset : rowGroupOffsets) {
    offsets.add(offset);
  }
  blocks = new ArrayList<>();
  for (BlockMetaData block : footer.getBlocks()) {
    if (offsets.contains(block.getStartingPos())) {
      blocks.add(block);
    }
  }
}
this.fileSchema = footer.getFileMetaData().getSchema();
Map<String, String> fileMetadata = footer.getFileMetaData().getKeyValueMetaData();
ReadSupport<T> readSupport = getReadSupportInstance(getReadSupportClass(configuration));
ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
    configuration, toSetMultiMap(fileMetadata), fileSchema));
this.requestedSchema = readContext.getRequestedSchema();
String sparkRequestedSchemaString =
    configuration.get(ParquetReadSupport$.MODULE$.SPARK_ROW_REQUESTED_SCHEMA());
this.sparkSchema = StructType$.MODULE$.fromString(sparkRequestedSchemaString);
this.reader = new ParquetFileReader(
    configuration, footer.getFileMetaData(), file, blocks, requestedSchema.getColumns());
for (BlockMetaData block : reader.getRowGroups()) {
  this.totalRowCount += block.getRowCount();
}
private ParquetMetadata readFooter(Configuration conf, String path, ParquetReaderConfig readerConfig) throws IOException {
  try (ParquetFileReader reader = ParquetFileReader.open(
      HadoopInputFile.fromPath(new Path(path), readerConfig.addCountersToConf(conf)),
      readerConfig.toReadOptions())) {
    return reader.getFooter();
  }
}
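// The helper above is a thin wrapper; the same one-shot footer read works
// standalone with plain parquet-mr types. A minimal sketch, assuming only
// Hadoop and parquet-mr on the classpath; the file path is illustrative.
Configuration conf = new Configuration();
try (ParquetFileReader reader = ParquetFileReader.open(
    HadoopInputFile.fromPath(new Path("/data/example.parquet"), conf))) {
  ParquetMetadata footer = reader.getFooter();
  for (BlockMetaData block : footer.getBlocks()) {
    // each block is one row group; print where it starts and how many rows it holds
    System.out.println("row group @ " + block.getStartingPos() + ": "
        + block.getRowCount() + " rows");
  }
}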
public static ParquetMetadata readMetadata(Configuration conf, Path parquetFilePath) {
  ParquetMetadata footer;
  try {
    // TODO(vc): Should we use the parallel reading version here?
    footer = ParquetFileReader
        .readFooter(getFs(parquetFilePath.toString(), conf).getConf(), parquetFilePath);
  } catch (IOException e) {
    throw new HoodieIOException("Failed to read footer for parquet " + parquetFilePath, e);
  }
  return footer;
}
final ParquetMetadata parquetMetadata =
    ParquetFileReader.readFooter(fs.getConf(), filePath, ParquetMetadataConverter.NO_FILTER);
if (parquetMetadata.getBlocks().size() > 1) {
  throw new IOException(
      format("Global dictionaries can only be built on a parquet file with a single row group, found %d row groups for file %s",
          parquetMetadata.getBlocks().size(), filePath));
}
final BlockMetaData rowGroupMetadata = parquetMetadata.getBlocks().get(0);
// Index the column descriptors by path so column chunks can be matched up below.
final Map<ColumnPath, ColumnDescriptor> columnDescriptorMap = Maps.newHashMap();
for (ColumnDescriptor columnDescriptor : parquetMetadata.getFileMetaData().getSchema().getColumns()) {
  columnDescriptorMap.put(ColumnPath.get(columnDescriptor.getPath()), columnDescriptor);
}
try (final FSDataInputStream in = fs.open(filePath)) {
  for (ColumnChunkMetaData columnChunkMetaData : rowGroupMetadata.getColumns()) {
    if (isBinaryType(columnChunkMetaData.getType())) {
      // ... dictionary construction for binary columns elided in the original snippet ...
    }
  }
}
public void initialize(ParquetFileReader reader, Configuration configuration) throws IOException {
  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  this.total = reader.getRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  this.filterRecords = configuration.getBoolean(RECORD_FILTERING_ENABLED, true);
  reader.setRequestedSchema(requestedSchema);
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
public void initialize(FileMetaData parquetFileMetadata,
    Path file, List<BlockMetaData> blocks, Configuration configuration)
    throws IOException {
  // initialize a ReadContext for this file
  // set the file schema before readSupport.init consults it (the original
  // assigned it afterwards, passing an unset field into InitContext)
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.file = file;
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
cacheKey = HdfsUtils.getFileId(file.getFileSystem(configuration), file,
    HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_CACHE_ALLOW_SYNTHETIC_FILEID),
    HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_CACHE_DEFAULT_FS_FILE_ID));
// note: the argument list above is truncated in the original snippet
FileSystem fs = file.getFileSystem(configuration);
if (cacheKey instanceof Long && HiveConf.getBoolVar(
    cacheConf, ConfVars.LLAP_IO_USE_FILEID_PATH)) {
  // ... the original snippet elides swapping in a fileId-based path here ...
}
// ... the footer read itself is elided; row groups are then selected either by
// filter predicate or by pre-computed offsets ...
if (rowGroupOffsets == null) {
  MessageType fileSchema = footer.getFileMetaData().getSchema();
  FilterCompat.Filter filter = getFilter(configuration);
  blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
} else {
  for (BlockMetaData block : footer.getBlocks()) {
    if (offsets.contains(block.getStartingPos())) {
      blocks.add(block);
    }
  }
}
for (BlockMetaData block : blocks) {
  this.totalRowCount += block.getRowCount();
}
this.fileSchema = footer.getFileMetaData().getSchema();
this.reader = new ParquetFileReader(
    configuration, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns());
if (oldSplit instanceof FileSplit) {
  final Path finalPath = ((FileSplit) oldSplit).getPath();
  jobConf = projectionPusher.pushProjectionsAndFilters(conf, finalPath.getParent());

  final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(jobConf, finalPath);
  final List<BlockMetaData> blocks = parquetMetadata.getBlocks();
  final FileMetaData fileMetaData = parquetMetadata.getFileMetaData();

  final ReadSupport.ReadContext readContext = new DataWritableReadSupport().init(
      new InitContext(jobConf, null, fileMetaData.getSchema()));
  schemaSize = MessageTypeParser.parseMessageType(readContext.getReadSupportMetadata()
      .get(DataWritableReadSupport.HIVE_TABLE_AS_PARQUET_SCHEMA)).getFieldCount();

  final List<BlockMetaData> splitGroup = new ArrayList<BlockMetaData>();
  final long splitStart = ((FileSplit) oldSplit).getStart();
  final long splitLength = ((FileSplit) oldSplit).getLength();
  // Keep only the row groups whose first data page falls inside this split.
  for (final BlockMetaData block : blocks) {
    final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
    if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
      splitGroup.add(block);
    }
  }

  FilterCompat.Filter filter = setFilter(jobConf, fileMetaData.getSchema());
  if (filter != null) {
    filtedBlocks = RowGroupFilter.filterRowGroups(filter, splitGroup, fileMetaData.getSchema());
    if (filtedBlocks.isEmpty()) {
      LOG.debug("All row groups are dropped due to filter predicates");
      return null;
    }
  } else {
    filtedBlocks = splitGroup;
  }

  return new ParquetInputSplit(finalPath,
      splitStart,
      splitLength,
      oldSplit.getLocations(),
      filtedBlocks,
      readContext.getRequestedSchema().toString(),
      fileMetaData.getSchema().toString(),
      fileMetaData.getKeyValueMetaData(),
      readContext.getReadSupportMetadata());
}
// ... the original snippet elides the non-FileSplit branch ...
/**
 * Specifically reads a given summary file
 * @param configuration a configuration
 * @param summaryStatus file status for a summary file
 * @return the metadata translated for each file
 * @throws IOException if an exception is thrown while reading the summary file
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static List<Footer> readSummaryFile(Configuration configuration, FileStatus summaryStatus) throws IOException {
  final Path parent = summaryStatus.getPath().getParent();
  ParquetMetadata mergedFooters = readFooter(configuration, summaryStatus, filter(false));
  return footersFromSummaryFile(parent, mergedFooters);
}
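// Since summary files are deprecated, this path is mostly hit when reading
// legacy directories. A hedged usage sketch: the directory is illustrative,
// and Footer#getFile / Footer#getParquetMetadata are the standard accessors
// for the per-file entries the merged footer expands into.
Configuration conf = new Configuration();
Path summaryPath = new Path("/data/legacy_table/_metadata"); // illustrative path
FileSystem fs = summaryPath.getFileSystem(conf);
for (Footer footer : readSummaryFile(conf, fs.getFileStatus(summaryPath))) {
  System.out.println(footer.getFile() + " -> "
      + footer.getParquetMetadata().getBlocks().size() + " row groups");
}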