public ParquetFileMetadata(Configuration conf, Path hdfsFilePath) throws IOException {
  this.metaData = ParquetFileReader.readFooter(conf, hdfsFilePath, NO_FILTER);
}
public void close() throws IOException {
  if (reader != null) {
    reader.close();
  }
}
/**
 * Read the footers of all the files under that path (recursively),
 * not using summary files. Row groups are not skipped.
 * @param configuration the configuration to access the FS
 * @param fileStatus the root dir
 * @return all the footers
 * @throws IOException
 */
public static List<Footer> readAllFootersInParallel(Configuration configuration, FileStatus fileStatus) throws IOException {
  List<FileStatus> statuses = listFiles(configuration, fileStatus);
  return readAllFootersInParallel(configuration, statuses, false);
}
/**
 * Read the footers of all the files under that path (recursively),
 * using summary files if possible.
 * @param configuration the configuration to access the FS
 * @param pathStatus the root dir
 * @param skipRowGroups whether to skip reading the row group metadata
 * @return all the footers
 * @throws IOException
 */
public static List<Footer> readFooters(Configuration configuration, FileStatus pathStatus, boolean skipRowGroups) throws IOException {
  List<FileStatus> files = listFiles(configuration, pathStatus);
  return readAllFootersInParallelUsingSummaryFiles(configuration, files, skipRowGroups);
}
/**
 * Specifically reads a given summary file.
 * @param configuration the configuration to access the FS
 * @param summaryStatus the summary file to read
 * @return the metadata translated for each file
 * @throws IOException
 */
public static List<Footer> readSummaryFile(Configuration configuration, FileStatus summaryStatus) throws IOException {
  final Path parent = summaryStatus.getPath().getParent();
  ParquetMetadata mergedFooters = readFooter(configuration, summaryStatus, filter(false));
  return footersFromSummaryFile(parent, mergedFooters);
}
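// Hedged usage sketch for the footer-reading API above: collect the footers for every
// Parquet file under a directory and total their row counts. Assumes the classes shown
// above (ParquetFileReader, Footer, BlockMetaData from parquet-hadoop, plus Hadoop's
// Configuration/FileStatus/Path); the directory path is hypothetical.
Configuration conf = new Configuration();
Path root = new Path("hdfs:///warehouse/events");
FileStatus rootStatus = root.getFileSystem(conf).getFileStatus(root);
// skipRowGroups = false so the block (row group) metadata is available on each footer
List<Footer> footers = ParquetFileReader.readFooters(conf, rootStatus, false);
long totalRows = 0;
for (Footer footer : footers) {
  for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
    totalRows += block.getRowCount();
  }
  System.out.println(footer.getFile());
}
System.out.println("total rows: " + totalRows);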
readers.add(new ParquetFileReader(conf, status.getPath(), metadata.getBlocks(), columnDescriptors));

while ((pageReadStore = reader.readNextRowGroup()) != null) {
  // consume the pages of this row group
}
reader.close();
ParquetMetadata footer = ParquetFileReader.readFooter(
    hadoopConfiguration, path, ParquetMetadataConverter.NO_FILTER);
List<BlockMetaData> blocks = filterBlocks(footer.getBlocks());
MessageType schema = footer.getFileMetaData().getSchema();
ParquetFileReader fileReader = new ParquetFileReader(
    hadoopConfiguration, path, blocks, schema.getColumns());
return fileReader.readNextRowGroup();
static ParquetMetadata readSummaryMetadata(Configuration configuration, Path basePath, boolean skipRowGroups) throws IOException {
  Path metadataFile = new Path(basePath, PARQUET_METADATA_FILE);
  Path commonMetaDataFile = new Path(basePath, PARQUET_COMMON_METADATA_FILE);
  FileSystem fileSystem = basePath.getFileSystem(configuration);
  if (skipRowGroups && fileSystem.exists(commonMetaDataFile)) {
    // reading the summary file that does not contain the row groups
    if (Log.INFO) LOG.info("reading summary file: " + commonMetaDataFile);
    return readFooter(configuration, commonMetaDataFile, filter(skipRowGroups));
  } else if (fileSystem.exists(metadataFile)) {
    if (Log.INFO) LOG.info("reading summary file: " + metadataFile);
    return readFooter(configuration, metadataFile, filter(skipRowGroups));
  } else {
    return null;
  }
}
private ParquetFileReader createFileReader(ParquetMetadata meta, List<BlockMetaData> blocks) throws IOException {
  FileMetaData fileMetaData = meta.getFileMetaData();
  if (FILE_READER_NEWER_CTOR != null) {
    try {
      return FILE_READER_NEWER_CTOR.newInstance(
          hadoopConfiguration, fileMetaData, path, blocks, fileMetaData.getSchema().getColumns());
    } catch (ReflectiveOperationException | IllegalArgumentException | SecurityException e) {
      LOG.debug("failed ParquetFileReader.<init>", e);
    }
  }
  return new ParquetFileReader(
      hadoopConfiguration, path, blocks, fileMetaData.getSchema().getColumns());
}
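// Hedged sketch of how FILE_READER_NEWER_CTOR above could be resolved: probe reflectively for
// the newer ParquetFileReader constructor (the one also taking a FileMetaData, matching the
// newInstance call above) so the code still runs against older parquet-hadoop jars that lack it.
// This lookup helper is an assumption for illustration, not taken from the original source.
private static final Constructor<ParquetFileReader> FILE_READER_NEWER_CTOR = findNewerCtor();

private static Constructor<ParquetFileReader> findNewerCtor() {
  try {
    return ParquetFileReader.class.getConstructor(
        Configuration.class, FileMetaData.class, Path.class, List.class, List.class);
  } catch (NoSuchMethodException e) {
    // older parquet version on the classpath; callers fall back to the classic constructor
    return null;
  }
}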
private void checkRead() throws IOException {
  if (current == totalCountLoadedSoFar) {
    if (current != 0) {
      long timeAssembling = System.currentTimeMillis() - startedAssemblingCurrentBlockAt;
      totalTimeSpentProcessingRecords += timeAssembling;
      LOG.info("Assembled and processed " + totalCountLoadedSoFar + " records from " + columnCount
          + " columns in " + totalTimeSpentProcessingRecords + " ms: "
          + ((float) totalCountLoadedSoFar / totalTimeSpentProcessingRecords) + " rec/ms, "
          + ((float) totalCountLoadedSoFar * columnCount / totalTimeSpentProcessingRecords) + " cell/ms");
      long totalTime = totalTimeSpentProcessingRecords + totalTimeSpentReadingBytes;
      long percentReading = 100 * totalTimeSpentReadingBytes / totalTime;
      long percentProcessing = 100 * totalTimeSpentProcessingRecords / totalTime;
      LOG.info("time spent so far " + percentReading + "% reading (" + totalTimeSpentReadingBytes + " ms) and "
          + percentProcessing + "% processing (" + totalTimeSpentProcessingRecords + " ms)");
    }
    LOG.info("at row " + current + ". reading next block");
    long t0 = System.currentTimeMillis();
    PageReadStore pages = reader.readNextRowGroup();
    if (pages == null) {
      throw new IOException("expecting more rows but reached last block. Read " + current + " out of " + total);
    }
    long timeSpentReading = System.currentTimeMillis() - t0;
    totalTimeSpentReadingBytes += timeSpentReading;
    BenchmarkCounter.incrementTime(timeSpentReading);
    LOG.info("block read in memory in " + timeSpentReading + " ms. row count = " + pages.getRowCount());
    if (Log.DEBUG) LOG.debug("initializing Record assembly with requested schema " + requestedSchema);
    MessageColumnIO columnIO = columnIOFactory.getColumnIO(requestedSchema, fileSchema);
    recordReader = columnIO.getRecordReader(pages, recordConverter, recordFilter);
    startedAssemblingCurrentBlockAt = System.currentTimeMillis();
    totalCountLoadedSoFar += pages.getRowCount();
    ++currentBlock;
  }
}
@Deprecated
public static List<Footer> readFooters(Configuration configuration, Path path) throws IOException {
  return readFooters(configuration, status(configuration, path));
}
/**
 * For each file provided, check whether there is a summary file.
 * If a summary file is found it is used; otherwise the file footer is read directly.
 * @param configuration the hadoop conf to connect to the file system
 * @param partFiles the part files to read
 * @return the footers for those files, using the summary file if possible
 * @throws IOException
 */
@Deprecated
public static List<Footer> readAllFootersInParallelUsingSummaryFiles(Configuration configuration, List<FileStatus> partFiles) throws IOException {
  return readAllFootersInParallelUsingSummaryFiles(configuration, partFiles, false);
}
@Deprecated
public static List<Footer> readAllFootersInParallel(final Configuration configuration, List<FileStatus> partFiles) throws IOException {
  return readAllFootersInParallel(configuration, partFiles, false);
}
List<Map<Path, Footer>> footersFromSummaries =
    runAllInParallel(configuration.getInt(PARQUET_READ_PARALLELISM, 5), summaries);
for (Map<Path, Footer> footers : footersFromSummaries) {
  cache.putAll(footers);
}
// files not covered by a summary are read directly, in parallel
result.addAll(readAllFootersInParallel(configuration, toRead, skipRowGroups));
/**
 * This always returns the row groups.
 * @param configuration the configuration to access the FS
 * @param pathStatus the root dir
 * @return all the footers
 * @throws IOException
 */
@Deprecated
public static List<Footer> readFooters(Configuration configuration, FileStatus pathStatus) throws IOException {
  return readFooters(configuration, pathStatus, false);
}
freader = new ParquetFileReader(conf, inpath, rblocks, columns);
PageReadStore store = freader.readNextRowGroup();
while (store != null) {
  out.incrementTabLevel();
  // dump the pages of this row group (logic elided here)
  store = freader.readNextRowGroup();
}
freader.close();

long total = blocks.size();
long offset = 1;
freader = new ParquetFileReader(conf, inpath, blocks, Collections.singletonList(column));
PageReadStore store = freader.readNextRowGroup();
while (store != null) {
  ColumnReadStoreImpl crstore = new ColumnReadStoreImpl(store, new DumpGroupConverter(), schema);
  // dump the column values for this row group (logic elided here)
  store = freader.readNextRowGroup();
}
out.flushColumns();
if (freader != null) {
  freader.close();
}
public void initialize(MessageType requestedSchema, MessageType fileSchema,
    Map<String, String> extraMetadata, Map<String, String> readSupportMetadata,
    Path file, List<BlockMetaData> blocks, Configuration configuration) throws IOException {
  this.requestedSchema = requestedSchema;
  this.fileSchema = fileSchema;
  this.file = file;
  this.columnCount = this.requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, extraMetadata, fileSchema,
      new ReadSupport.ReadContext(requestedSchema, readSupportMetadata));
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, file, blocks, columns);
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
private ParquetReader(Configuration conf, Path file, ReadSupport<T> readSupport, Filter filter) throws IOException {
  this.readSupport = readSupport;
  this.filter = checkNotNull(filter, "filter");
  this.conf = conf;
  FileSystem fs = file.getFileSystem(conf);
  List<FileStatus> statuses = Arrays.asList(fs.listStatus(file, HiddenFileFilter.INSTANCE));
  List<Footer> footers = ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, statuses, false);
  this.footersIterator = footers.iterator();
}
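// Hedged usage sketch for the reader above: iterating all records in a file (or a directory of
// part files) with the example Group record model. Assumes parquet-hadoop's ParquetReader.builder()
// and GroupReadSupport (org.apache.parquet.hadoop.example, or parquet.hadoop.example in older
// releases); the input path is hypothetical.
Path input = new Path("hdfs:///warehouse/events/part-00000.parquet");
try (ParquetReader<Group> groups = ParquetReader.builder(new GroupReadSupport(), input).build()) {
  Group record;
  while ((record = groups.read()) != null) {  // read() returns null once all footers/row groups are exhausted
    System.out.println(record);
  }
}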
public static void writeMetaDataFile(Configuration configuration, Path outputPath) {
  if (configuration.getBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, true)) {
    try {
      final FileSystem fileSystem = outputPath.getFileSystem(configuration);
      FileStatus outputStatus = fileSystem.getFileStatus(outputPath);
      List<Footer> footers = ParquetFileReader.readAllFootersInParallel(configuration, outputStatus);
      try {
        ParquetFileWriter.writeMetadataFile(configuration, outputPath, footers);
      } catch (Exception e) {
        LOG.warn("could not write summary file for " + outputPath, e);
        final Path metadataPath = new Path(outputPath, ParquetFileWriter.PARQUET_METADATA_FILE);
        if (fileSystem.exists(metadataPath)) {
          fileSystem.delete(metadataPath, true);
        }
      }
    } catch (Exception e) {
      LOG.warn("could not write summary file for " + outputPath, e);
    }
  }
}
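// Hedged configuration sketch: the summary write above is gated on ENABLE_JOB_SUMMARY
// ("parquet.enable.summary-metadata"), which defaults to true. A job that does not want
// _metadata/_common_metadata files in its output directory can turn it off before committing;
// the surrounding job setup shown here is an assumption about the caller, not original code.
Configuration conf = job.getConfiguration();
conf.setBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, false);  // skip writing the summary file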