/**
 * Reads the footers for the supplied part files, checking for a summary file first.
 * If a summary file is found it is used; otherwise the footer of the file itself is used.
 *
 * <p>This is a convenience form that forwards to the three-argument overload with
 * {@code false} as the boolean argument.
 *
 * @param configuration the hadoop conf to connect to the file system
 * @param partFiles the part files to read
 * @return the footers for those files, using the summary file if possible
 * @throws IOException if the three-argument overload this delegates to throws it
 * @deprecated call the three-argument overload and pass the boolean explicitly
 */
@Deprecated
public static List<Footer> readAllFootersInParallelUsingSummaryFiles(
    Configuration configuration,
    List<FileStatus> partFiles) throws IOException {
  // Named only to make the meaning of the forwarded literal visible at the call site.
  final boolean skipRowGroups = false;
  return readAllFootersInParallelUsingSummaryFiles(configuration, partFiles, skipRowGroups);
}
/**
 * Builds a reader over {@code file}: stores the collaborators, lists the statuses
 * under the path using {@code HiddenFileFilter.INSTANCE} as the path filter, reads
 * their footers and keeps an iterator over them.
 *
 * @param conf the configuration used to resolve the file system
 * @param file the path to list and read
 * @param readSupport to materialize records
 * @param filter the record filter; passed through {@code checkNotNull} under the name "filter"
 * @throws IOException if resolving the file system, listing the path or reading footers fails
 */
private ParquetReader(Configuration conf,
                      Path file,
                      ReadSupport<T> readSupport,
                      Filter filter) throws IOException {
  this.readSupport = readSupport;
  this.filter = checkNotNull(filter, "filter");
  this.conf = conf;

  final FileSystem fileSystem = file.getFileSystem(conf);
  final FileStatus[] listed = fileSystem.listStatus(file, HiddenFileFilter.INSTANCE);
  final List<FileStatus> candidates = Arrays.asList(listed);

  // Footers are requested with the boolean argument set to false, as before.
  this.footersIterator = ParquetFileReader
      .readAllFootersInParallelUsingSummaryFiles(conf, candidates, false)
      .iterator();
}
/**
 * Reads the footers of all the files under the given path (recursively),
 * using summary files if possible.
 *
 * @param configuration the configuration to access the FS
 * @param pathStatus the root dir
 * @param skipRowGroups forwarded unchanged to
 *        {@code readAllFootersInParallelUsingSummaryFiles}
 * @return all the footers
 * @throws IOException if listing the files or reading the footers fails
 */
public static List<Footer> readFooters(
    Configuration configuration,
    FileStatus pathStatus,
    boolean skipRowGroups) throws IOException {
  // The listing is evaluated first, then handed straight to the footer reader.
  return readAllFootersInParallelUsingSummaryFiles(
      configuration,
      listFiles(configuration, pathStatus),
      skipRowGroups);
}
/**
 * Returns the footers for the given files.
 *
 * <p>The value of {@code isTaskSideMetaData(configuration)} is passed as the boolean
 * argument of {@code ParquetFileReader.readAllFootersInParallelUsingSummaryFiles}.
 *
 * @param configuration to connect to the file system
 * @param statuses the files to open
 * @return the footers of the files
 * @throws IOException if reading the footers fails
 */
public List<Footer> getFooters(
    Configuration configuration,
    Collection<FileStatus> statuses) throws IOException {
  if (Log.DEBUG) {
    LOG.debug("reading " + statuses.size() + " files");
  }
  return ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(
      configuration,
      statuses,
      isTaskSideMetaData(configuration));
}
/**
 * Creates a reader over {@code file}: lists the statuses under the path, reads their
 * footers, derives the global metadata from those footers and initialises the read
 * support with the resulting schema and key/value metadata.
 *
 * @param conf the configuration
 * @param file the file to read
 * @param readSupport to materialize records
 * @param filter the filter to use to filter records
 * @throws IOException if resolving the file system, listing the path or reading footers fails
 */
public ParquetReader(Configuration conf,
                     Path file,
                     ReadSupport<T> readSupport,
                     UnboundRecordFilter filter) throws IOException {
  this.readSupport = readSupport;
  this.filter = filter;
  this.conf = conf;

  FileSystem fs = file.getFileSystem(conf);
  // NOTE(review): the listing is unfiltered here; confirm whether hidden files
  // should be excluded with a PathFilter.
  List<FileStatus> statuses = Arrays.asList(fs.listStatus(file));
  // NOTE(review): this two-argument overload appears to be marked @Deprecated in favour
  // of a three-argument form; confirm that form is available here before switching.
  List<Footer> footers = ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, statuses);
  this.footersIterator = footers.iterator();
  globalMetaData = ParquetFileWriter.getGlobalMetaData(footers);

  // The previous version also copied every footer's block metadata into a local list
  // that was never read afterwards; that dead accumulation has been removed.

  MessageType schema = globalMetaData.getSchema();
  Map<String, Set<String>> extraMetadata = globalMetaData.getKeyValueMetaData();
  readContext = readSupport.init(new InitContext(conf, extraMetadata, schema));
}