/**
 * Folds the file metadata of every footer into a single global metadata object.
 *
 * @param footers the footers whose file metadata is merged, in iteration order
 * @param strict forwarded unchanged to every {@code mergeInto} call
 * @return the accumulated metadata, or {@code null} when {@code footers} is empty
 */
static GlobalMetaData getGlobalMetaData(List<Footer> footers, boolean strict) {
  GlobalMetaData merged = null;
  for (Footer current : footers) {
    // each step merges the next file's metadata into what has been accumulated so far
    merged = mergeInto(current.getParquetMetadata().getFileMetaData(), merged, strict);
  }
  return merged;
}
/**
 * Merges the file metadata of all the given footers together.
 *
 * @param footers the footers of the files to merge
 * @return the global metadata covering all the footers, or {@code null} when
 *         {@code footers} is empty
 */
static GlobalMetaData getGlobalMetaData(List<Footer> footers) {
  GlobalMetaData merged = null;
  for (Footer current : footers) {
    // each step merges the next file's metadata into what has been accumulated so far
    merged = mergeInto(current.getParquetMetadata().getFileMetaData(), merged);
  }
  return merged;
}
/**
 * Merges the given footers into a single {@code ParquetMetadata} whose blocks carry
 * the path of their file relative to {@code root}.
 *
 * <p>Each block of each footer is updated in place through {@code setPath} before
 * being added to the merged block list.
 *
 * @param root the directory that must contain every footer's file
 * @param footers the footers to merge
 * @return the merged metadata: the merged file metadata plus all blocks of all footers
 * @throws ParquetEncodingException if a footer's file is not located under {@code root}
 * @throws NullPointerException if {@code footers} is empty, since there is then no
 *         file metadata to merge
 */
static ParquetMetadata mergeFooters(Path root, List<Footer> footers) {
  String rootPath = root.toUri().getPath();
  GlobalMetaData fileMetaData = null;
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
  for (Footer footer : footers) {
    String footerPath = footer.getFile().toUri().getPath();
    boolean contained = footerPath.startsWith(rootPath);
    if (contained && !rootPath.endsWith("/") && footerPath.length() > rootPath.length()) {
      // A bare prefix match is not enough: root "/data/tab" must not accept
      // "/data/table2/part". The prefix has to end on a path-component boundary.
      contained = footerPath.charAt(rootPath.length()) == '/';
    }
    if (!contained) {
      throw new ParquetEncodingException(footerPath + " invalid: all the files must be contained in the root " + root);
    }
    // make the path relative to the root and drop any leading separators
    footerPath = footerPath.substring(rootPath.length());
    while (footerPath.startsWith("/")) {
      footerPath = footerPath.substring(1);
    }
    ParquetMetadata footerMetadata = footer.getParquetMetadata();
    fileMetaData = mergeInto(footerMetadata.getFileMetaData(), fileMetaData);
    for (BlockMetaData block : footerMetadata.getBlocks()) {
      block.setPath(footerPath);
      blocks.add(block);
    }
  }
  return new ParquetMetadata(fileMetaData.merge(), blocks);
}
@Override public Map<Path, Footer> call() throws Exception { ParquetMetadata mergedMetadata = readSummaryMetadata(configuration, path, skipRowGroups); if (mergedMetadata != null) { final List<Footer> footers; if (skipRowGroups) { footers = new ArrayList<Footer>(); for (FileStatus f : partFiles) { footers.add(new Footer(f.getPath(), mergedMetadata)); } } else { footers = footersFromSummaryFile(path, mergedMetadata); } Map<Path, Footer> map = new HashMap<Path, Footer>(); for (Footer footer : footers) { // the folder may have been moved footer = new Footer(new Path(path, footer.getFile().getName()), footer.getParquetMetadata()); map.put(footer.getFile(), footer); } return map; } else { return Collections.emptyMap(); } } });
/**
 * Merges the given footers into a single {@code ParquetMetadata} whose blocks carry
 * the path of their file relative to {@code root}.
 *
 * <p>Each block of each footer is updated in place through {@code setPath} before
 * being added to the merged block list.
 *
 * @param root the directory that must contain every footer's file
 * @param footers the footers to merge
 * @return the merged metadata: the merged file metadata plus all blocks of all footers
 * @throws ParquetEncodingException if a footer's file is not located under {@code root}
 * @throws NullPointerException if {@code footers} is empty, since there is then no
 *         file metadata to merge
 */
private static ParquetMetadata mergeFooters(Path root, List<Footer> footers) {
  String rootPath = root.toString();
  GlobalMetaData fileMetaData = null;
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
  for (Footer footer : footers) {
    String path = footer.getFile().toString();
    boolean contained = path.startsWith(rootPath);
    if (contained && !rootPath.endsWith("/") && path.length() > rootPath.length()) {
      // A bare prefix match is not enough: root ".../tab" must not accept
      // ".../table2/part". The prefix has to end on a path-component boundary.
      contained = path.charAt(rootPath.length()) == '/';
    }
    if (!contained) {
      throw new ParquetEncodingException(path + " invalid: all the files must be contained in the root " + root);
    }
    // make the path relative to the root and drop any leading separators
    path = path.substring(rootPath.length());
    while (path.startsWith("/")) {
      path = path.substring(1);
    }
    ParquetMetadata footerMetadata = footer.getParquetMetadata();
    fileMetaData = mergeInto(footerMetadata.getFileMetaData(), fileMetaData);
    for (BlockMetaData block : footerMetadata.getBlocks()) {
      block.setPath(path);
      blocks.add(block);
    }
  }
  return new ParquetMetadata(fileMetaData.merge(), blocks);
}
public FootersCacheValue(FileStatusWrapper status, Footer footer) { this.modificationTime = status.getModificationTime(); this.footer = new Footer(footer.getFile(), footer.getParquetMetadata()); }
/**
 * Closes the current reader, if there is one, and opens a reader on the next footer.
 *
 * <p>The next footer's blocks are passed through {@code RowGroupFilter.filterRowGroups}
 * before the new reader is initialized. When no footer is left, {@code reader} stays
 * {@code null}.
 *
 * @throws IOException if closing the previous reader or initializing the new one fails
 */
private void initReader() throws IOException {
  if (reader != null) {
    reader.close();
    reader = null;
  }
  if (!footersIterator.hasNext()) {
    return;
  }

  Footer next = footersIterator.next();
  MessageType schema = next.getParquetMetadata().getFileMetaData().getSchema();
  List<BlockMetaData> selectedBlocks =
      RowGroupFilter.filterRowGroups(filter, next.getParquetMetadata().getBlocks(), schema);

  reader = new InternalParquetRecordReader<T>(readSupport, filter);
  reader.initialize(
      schema,
      next.getParquetMetadata().getFileMetaData().getKeyValueMetaData(),
      next.getFile(),
      selectedBlocks,
      conf);
}
/**
 * Closes the current reader, if there is one, and opens a reader on the next footer.
 *
 * <p>The new reader is initialized with the requested schema and read-support metadata
 * from {@code readContext}, the schema of {@code globalMetaData}, and the next footer's
 * key/value metadata, file and blocks. When no footer is left, {@code reader} stays
 * {@code null}.
 *
 * @throws IOException if closing the previous reader or initializing the new one fails
 */
private void initReader() throws IOException {
  if (reader != null) {
    reader.close();
    reader = null;
  }
  if (!footersIterator.hasNext()) {
    return;
  }

  Footer next = footersIterator.next();
  reader = new InternalParquetRecordReader<T>(readSupport, filter);
  reader.initialize(
      readContext.getRequestedSchema(),
      globalMetaData.getSchema(),
      next.getParquetMetadata().getFileMetaData().getKeyValueMetaData(),
      readContext.getReadSupportMetadata(),
      next.getFile(),
      next.getParquetMetadata().getBlocks(),
      conf);
}
/**
 * Reads the Parquet schema from the first footer found for the given tap.
 *
 * @param flowProcess passed through to {@code getFooters}
 * @param tap an {@code Hfs} tap, or a {@code CompositeTap} whose first child tap is
 *        an {@code Hfs} tap
 * @return the schema held in the file metadata of the first footer
 * @throws TapException if no footer is found, or wrapping an {@code IOException}
 *         raised while reading the footers
 */
private MessageType readSchema(FlowProcess<JobConf> flowProcess, Tap tap) {
  try {
    // for a composite tap only the first child is inspected
    Hfs hfs = (tap instanceof CompositeTap)
        ? (Hfs) ((CompositeTap) tap).getChildTaps().next()
        : (Hfs) tap;

    List<Footer> footers = getFooters(flowProcess, hfs);
    if (footers.isEmpty()) {
      throw new TapException("Could not read Parquet metadata at " + hfs.getPath());
    }
    return footers.get(0).getParquetMetadata().getFileMetaData().getSchema();
  } catch (IOException e) {
    throw new TapException(e);
  }
}
/**
 * Creates a reader over the footers found at {@code file}.
 *
 * <p>The statuses listed at {@code file} are used to read all footers, their metadata is
 * merged into {@code globalMetaData}, and the read support is initialized with the merged
 * schema and key/value metadata.
 *
 * @param conf the configuration
 * @param file the file to read
 * @param readSupport to materialize records
 * @param filter the filter to use to filter records
 * @throws IOException if listing {@code file} or reading its footers fails
 */
public ParquetReader(Configuration conf, Path file, ReadSupport<T> readSupport, UnboundRecordFilter filter) throws IOException {
  this.readSupport = readSupport;
  this.filter = filter;
  this.conf = conf;

  FileSystem fs = file.getFileSystem(conf);
  List<FileStatus> statuses = Arrays.asList(fs.listStatus(file));
  List<Footer> footers = ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, statuses);
  this.footersIterator = footers.iterator();
  globalMetaData = ParquetFileWriter.getGlobalMetaData(footers);

  // The per-footer block lists are not collected here: each footer's blocks are
  // read later, when a reader is opened on that footer.
  MessageType schema = globalMetaData.getSchema();
  Map<String, Set<String>> extraMetadata = globalMetaData.getKeyValueMetaData();
  readContext = readSupport.init(new InitContext(conf, extraMetadata, schema));
}
List<Footer> readSummaryFile = ParquetFileReader.readSummaryFile(configuration, summaryStatus); for (Footer footer : readSummaryFile) { add(footer.getParquetMetadata());
/**
 * Prints the metadata details of every footer read from the input path given as the
 * first command-line argument.
 *
 * @param options the parsed command line; its first argument is the input path
 * @throws Exception if the file status or the footers cannot be read
 */
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String target = options.getArgs()[0];
  Configuration conf = new Configuration();
  Path inputPath = new Path(target);
  FileStatus inputFileStatus = inputPath.getFileSystem(conf).getFileStatus(inputPath);
  List<Footer> footers = ParquetFileReader.readFooters(conf, inputFileStatus, false);

  PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
      .withAutoColumn()
      .withWhitespaceHandler(WhiteSpaceHandler.COLLAPSE_WHITESPACE)
      .withColumnPadding(1)
      .build();

  // one section per footer: the file it came from, then its metadata details
  for (Footer footer : footers) {
    out.format("file: %s%n", footer.getFile());
    MetadataUtils.showDetails(out, footer.getParquetMetadata());
    out.flushColumns();
  }
}
}
BlockMap.computeBlocks(context.getFileSystem(), status), false); for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) { if (block.getColumns().isEmpty()) { continue;
// Obtain the FileSystem for `file` from the configuration, then ask it for the file's status.
FileSystem fs = file.getFileSystem(configuration);
FileStatus fileStatus = fs.getFileStatus(file);
// Take the block metadata list from the footer's Parquet metadata.
ParquetMetadata parquetMetaData = footer.getParquetMetadata();
List<BlockMetaData> blocks = parquetMetaData.getBlocks();