public ParquetFileMetadata(Configuration conf, Path hdfsFilePath) throws IOException {
  this.metaData = ParquetFileReader.readFooter(conf, hdfsFilePath, NO_FILTER);
}
public void close() throws IOException {
  if (reader != null) {
    reader.close();
  }
}
/**
 * Read the footers of all the files under that path (recursively),
 * not using summary files. Row groups are not skipped.
 * @param configuration the configuration to access the FS
 * @param fileStatus the root dir
 * @return all the footers
 * @throws IOException
 */
public static List<Footer> readAllFootersInParallel(Configuration configuration, FileStatus fileStatus) throws IOException {
  List<FileStatus> statuses = listFiles(configuration, fileStatus);
  return readAllFootersInParallel(configuration, statuses, false);
}
/**
 * Read the footers of all the files under that path (recursively),
 * using summary files if possible.
 * @param configuration the configuration to access the FS
 * @param pathStatus the root dir
 * @param skipRowGroups whether to skip reading the row group metadata
 * @return all the footers
 * @throws IOException
 */
public static List<Footer> readFooters(Configuration configuration, FileStatus pathStatus, boolean skipRowGroups) throws IOException {
  List<FileStatus> files = listFiles(configuration, pathStatus);
  return readAllFootersInParallelUsingSummaryFiles(configuration, files, skipRowGroups);
}
/**
 * Specifically reads a given summary file.
 * @param configuration the configuration to access the FS
 * @param summaryStatus the summary file to read
 * @return the metadata translated for each file
 * @throws IOException
 */
public static List<Footer> readSummaryFile(Configuration configuration, FileStatus summaryStatus) throws IOException {
  final Path parent = summaryStatus.getPath().getParent();
  ParquetMetadata mergedFooters = readFooter(configuration, summaryStatus, filter(false));
  return footersFromSummaryFile(parent, mergedFooters);
}
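// Hedged usage sketch for the footer-reading API above: collect the footers for every
// Parquet file under a directory and total their row counts. Assumes the classes shown
// above (ParquetFileReader, Footer, BlockMetaData from parquet-hadoop, plus Hadoop's
// Configuration/FileStatus/Path); the directory path is hypothetical.
Configuration conf = new Configuration();
Path root = new Path("hdfs:///warehouse/events");
FileStatus rootStatus = root.getFileSystem(conf).getFileStatus(root);
// skipRowGroups = false so the block (row group) metadata is available on each footer
List<Footer> footers = ParquetFileReader.readFooters(conf, rootStatus, false);
long totalRows = 0;
for (Footer footer : footers) {
  for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
    totalRows += block.getRowCount();
  }
  System.out.println(footer.getFile());
}
System.out.println("total rows: " + totalRows);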
readers.add(new ParquetFileReader(conf, status.getPath(), metadata.getBlocks(), columnDescriptors));

while ((pageReadStore = reader.readNextRowGroup()) != null) {
  // consume the pages of this row group
}
reader.close();
ParquetMetadata footer = ParquetFileReader.readFooter(
    hadoopConfiguration, path, ParquetMetadataConverter.NO_FILTER);
List<BlockMetaData> blocks = filterBlocks(footer.getBlocks());
MessageType schema = footer.getFileMetaData().getSchema();
ParquetFileReader fileReader = new ParquetFileReader(
    hadoopConfiguration, path, blocks, schema.getColumns());
return fileReader.readNextRowGroup();
static ParquetMetadata readSummaryMetadata(Configuration configuration, Path basePath, boolean skipRowGroups) throws IOException {
  Path metadataFile = new Path(basePath, PARQUET_METADATA_FILE);
  Path commonMetaDataFile = new Path(basePath, PARQUET_COMMON_METADATA_FILE);
  FileSystem fileSystem = basePath.getFileSystem(configuration);
  if (skipRowGroups && fileSystem.exists(commonMetaDataFile)) {
    // reading the summary file that does not contain the row groups
    if (Log.INFO) LOG.info("reading summary file: " + commonMetaDataFile);
    return readFooter(configuration, commonMetaDataFile, filter(skipRowGroups));
  } else if (fileSystem.exists(metadataFile)) {
    if (Log.INFO) LOG.info("reading summary file: " + metadataFile);
    return readFooter(configuration, metadataFile, filter(skipRowGroups));
  } else {
    return null;
  }
}
private ParquetFileReader createFileReader(ParquetMetadata meta, List<BlockMetaData> blocks) throws IOException {
  FileMetaData fileMetaData = meta.getFileMetaData();
  if (FILE_READER_NEWER_CTOR != null) {
    try {
      return FILE_READER_NEWER_CTOR.newInstance(
          hadoopConfiguration, fileMetaData, path, blocks, fileMetaData.getSchema().getColumns());
    } catch (ReflectiveOperationException | IllegalArgumentException | SecurityException e) {
      LOG.debug("failed ParquetFileReader.<init>", e);
    }
  }
  return new ParquetFileReader(
      hadoopConfiguration, path, blocks, fileMetaData.getSchema().getColumns());
}
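// Hedged sketch of how FILE_READER_NEWER_CTOR above could be resolved: probe reflectively for
// the newer ParquetFileReader constructor (the one also taking a FileMetaData, matching the
// newInstance call above) so the code still runs against older parquet-hadoop jars that lack it.
// This lookup helper is an assumption for illustration, not taken from the original source.
private static final Constructor<ParquetFileReader> FILE_READER_NEWER_CTOR = findNewerCtor();

private static Constructor<ParquetFileReader> findNewerCtor() {
  try {
    return ParquetFileReader.class.getConstructor(
        Configuration.class, FileMetaData.class, Path.class, List.class, List.class);
  } catch (NoSuchMethodException e) {
    // older parquet version on the classpath; callers fall back to the classic constructor
    return null;
  }
}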
private void checkRead() throws IOException {
  if (current == totalCountLoadedSoFar) {
    if (current != 0) {
      long timeAssembling = System.currentTimeMillis() - startedAssemblingCurrentBlockAt;
      totalTimeSpentProcessingRecords += timeAssembling;
      LOG.info("Assembled and processed " + totalCountLoadedSoFar + " records from " + columnCount
          + " columns in " + totalTimeSpentProcessingRecords + " ms: "
          + ((float) totalCountLoadedSoFar / totalTimeSpentProcessingRecords) + " rec/ms, "
          + ((float) totalCountLoadedSoFar * columnCount / totalTimeSpentProcessingRecords) + " cell/ms");
      long totalTime = totalTimeSpentProcessingRecords + totalTimeSpentReadingBytes;
      long percentReading = 100 * totalTimeSpentReadingBytes / totalTime;
      long percentProcessing = 100 * totalTimeSpentProcessingRecords / totalTime;
      LOG.info("time spent so far " + percentReading + "% reading (" + totalTimeSpentReadingBytes + " ms) and "
          + percentProcessing + "% processing (" + totalTimeSpentProcessingRecords + " ms)");
    }
    LOG.info("at row " + current + ". reading next block");
    long t0 = System.currentTimeMillis();
    PageReadStore pages = reader.readNextRowGroup();
    if (pages == null) {
      throw new IOException("expecting more rows but reached last block. Read " + current + " out of " + total);
    }
    long timeSpentReading = System.currentTimeMillis() - t0;
    totalTimeSpentReadingBytes += timeSpentReading;
    BenchmarkCounter.incrementTime(timeSpentReading);
    LOG.info("block read in memory in " + timeSpentReading + " ms. row count = " + pages.getRowCount());
    if (Log.DEBUG) LOG.debug("initializing Record assembly with requested schema " + requestedSchema);
    MessageColumnIO columnIO = columnIOFactory.getColumnIO(requestedSchema, fileSchema);
    recordReader = columnIO.getRecordReader(pages, recordConverter, recordFilter);
    startedAssemblingCurrentBlockAt = System.currentTimeMillis();
    totalCountLoadedSoFar += pages.getRowCount();
    ++currentBlock;
  }
}
@Deprecated
public static List<Footer> readFooters(Configuration configuration, Path path) throws IOException {
  return readFooters(configuration, status(configuration, path));
}
/**
 * For each file provided, check whether there is a summary file.
 * If a summary file is found it is used; otherwise the file footer is read directly.
 * @param configuration the hadoop conf to connect to the file system
 * @param partFiles the part files to read
 * @return the footers for those files, using the summary file if possible
 * @throws IOException
 */
@Deprecated
public static List<Footer> readAllFootersInParallelUsingSummaryFiles(Configuration configuration, List<FileStatus> partFiles) throws IOException {
  return readAllFootersInParallelUsingSummaryFiles(configuration, partFiles, false);
}
@Deprecated
public static List<Footer> readAllFootersInParallel(final Configuration configuration, List<FileStatus> partFiles) throws IOException {
  return readAllFootersInParallel(configuration, partFiles, false);
}
List<Map<Path, Footer>> footersFromSummaries =
    runAllInParallel(configuration.getInt(PARQUET_READ_PARALLELISM, 5), summaries);
for (Map<Path, Footer> footers : footersFromSummaries) {
  cache.putAll(footers);
}
// files not covered by a summary are read directly, in parallel
result.addAll(readAllFootersInParallel(configuration, toRead, skipRowGroups));
/**
 * This always returns the row groups.
 * @param configuration the configuration to access the FS
 * @param pathStatus the root dir
 * @return all the footers
 * @throws IOException
 */
@Deprecated
public static List<Footer> readFooters(Configuration configuration, FileStatus pathStatus) throws IOException {
  return readFooters(configuration, pathStatus, false);
}
freader = new ParquetFileReader(conf, inpath, rblocks, columns);
PageReadStore store = freader.readNextRowGroup();
while (store != null) {
  out.incrementTabLevel();
  // dump the pages of this row group (logic elided here)
  store = freader.readNextRowGroup();
}
freader.close();

long total = blocks.size();
long offset = 1;
freader = new ParquetFileReader(conf, inpath, blocks, Collections.singletonList(column));
PageReadStore store = freader.readNextRowGroup();
while (store != null) {
  ColumnReadStoreImpl crstore = new ColumnReadStoreImpl(store, new DumpGroupConverter(), schema);
  // dump the column values for this row group (logic elided here)
  store = freader.readNextRowGroup();
}
out.flushColumns();
if (freader != null) {
  freader.close();
}
public void initialize(MessageType requestedSchema, MessageType fileSchema,
    Map<String, String> extraMetadata, Map<String, String> readSupportMetadata,
    Path file, List<BlockMetaData> blocks, Configuration configuration) throws IOException {
  this.requestedSchema = requestedSchema;
  this.fileSchema = fileSchema;
  this.file = file;
  this.columnCount = this.requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, extraMetadata, fileSchema,
      new ReadSupport.ReadContext(requestedSchema, readSupportMetadata));
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, file, blocks, columns);
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
private ParquetReader(Configuration conf, Path file, ReadSupport<T> readSupport, Filter filter) throws IOException {
  this.readSupport = readSupport;
  this.filter = checkNotNull(filter, "filter");
  this.conf = conf;
  FileSystem fs = file.getFileSystem(conf);
  List<FileStatus> statuses = Arrays.asList(fs.listStatus(file, HiddenFileFilter.INSTANCE));
  List<Footer> footers = ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, statuses, false);
  this.footersIterator = footers.iterator();
}
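// Hedged usage sketch for the reader above: iterating all records in a file (or a directory of
// part files) with the example Group record model. Assumes parquet-hadoop's ParquetReader.builder()
// and GroupReadSupport (org.apache.parquet.hadoop.example, or parquet.hadoop.example in older
// releases); the input path is hypothetical.
Path input = new Path("hdfs:///warehouse/events/part-00000.parquet");
try (ParquetReader<Group> groups = ParquetReader.builder(new GroupReadSupport(), input).build()) {
  Group record;
  while ((record = groups.read()) != null) {  // read() returns null once all footers/row groups are exhausted
    System.out.println(record);
  }
}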
public static void writeMetaDataFile(Configuration configuration, Path outputPath) {
  if (configuration.getBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, true)) {
    try {
      final FileSystem fileSystem = outputPath.getFileSystem(configuration);
      FileStatus outputStatus = fileSystem.getFileStatus(outputPath);
      List<Footer> footers = ParquetFileReader.readAllFootersInParallel(configuration, outputStatus);
      try {
        ParquetFileWriter.writeMetadataFile(configuration, outputPath, footers);
      } catch (Exception e) {
        LOG.warn("could not write summary file for " + outputPath, e);
        final Path metadataPath = new Path(outputPath, ParquetFileWriter.PARQUET_METADATA_FILE);
        if (fileSystem.exists(metadataPath)) {
          fileSystem.delete(metadataPath, true);
        }
      }
    } catch (Exception e) {
      LOG.warn("could not write summary file for " + outputPath, e);
    }
  }
}
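// Hedged configuration sketch: the summary write above is gated on ENABLE_JOB_SUMMARY
// ("parquet.enable.summary-metadata"), which defaults to true. A job that does not want
// _metadata/_common_metadata files in its output directory can turn it off before committing;
// the surrounding job setup shown here is an assumption about the caller, not original code.
Configuration conf = job.getConfiguration();
conf.setBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, false);  // skip writing the summary file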