static GlobalMetaData getGlobalMetaData(List<Footer> footers, boolean strict) { GlobalMetaData fileMetaData = null; for (Footer footer : footers) { ParquetMetadata currentMetadata = footer.getParquetMetadata(); fileMetaData = mergeInto(currentMetadata.getFileMetaData(), fileMetaData, strict); } return fileMetaData; }
static GlobalMetaData getGlobalMetaData(List<Footer> footers, boolean strict) { GlobalMetaData fileMetaData = null; for (Footer footer : footers) { ParquetMetadata currentMetadata = footer.getParquetMetadata(); fileMetaData = mergeInto(currentMetadata.getFileMetaData(), fileMetaData, strict); } return fileMetaData; }
private ParquetReader(Configuration conf, Path file, ReadSupport<T> readSupport, Filter filter) throws IOException { this.readSupport = readSupport; this.filter = checkNotNull(filter, "filter"); this.conf = conf; FileSystem fs = file.getFileSystem(conf); List<FileStatus> statuses = Arrays.asList(fs.listStatus(file, HiddenFileFilter.INSTANCE)); List<Footer> footers = ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, statuses, false); this.footersIterator = footers.iterator(); for (Footer footer : footers) { for(BlockMetaData block : footer.getParquetMetadata().getBlocks()) { totalRowCount += block.getRowCount(); } } }
private ParquetReader(Configuration conf, Path file, ReadSupport<T> readSupport, Filter filter) throws IOException { this.readSupport = readSupport; this.filter = checkNotNull(filter, "filter"); this.conf = conf; FileSystem fs = file.getFileSystem(conf); List<FileStatus> statuses = Arrays.asList(fs.listStatus(file, HiddenFileFilter.INSTANCE)); List<Footer> footers = ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, statuses, false); this.footersIterator = footers.iterator(); for (Footer footer : footers) { for(BlockMetaData block : footer.getParquetMetadata().getBlocks()) { totalRowCount += block.getRowCount(); } } }
static ParquetMetadata mergeFooters(Path root, List<Footer> footers) { String rootPath = root.toUri().getPath(); GlobalMetaData fileMetaData = null; List<BlockMetaData> blocks = new ArrayList<BlockMetaData>(); for (Footer footer : footers) { String footerPath = footer.getFile().toUri().getPath(); if (!footerPath.startsWith(rootPath)) { throw new ParquetEncodingException(footerPath + " invalid: all the files must be contained in the root " + root); } footerPath = footerPath.substring(rootPath.length()); while (footerPath.startsWith("/")) { footerPath = footerPath.substring(1); } fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData); for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) { block.setPath(footerPath); blocks.add(block); } } return new ParquetMetadata(fileMetaData.merge(), blocks); }
static ParquetMetadata mergeFooters(Path root, List<Footer> footers) { String rootPath = root.toUri().getPath(); GlobalMetaData fileMetaData = null; List<BlockMetaData> blocks = new ArrayList<BlockMetaData>(); for (Footer footer : footers) { String footerPath = footer.getFile().toUri().getPath(); if (!footerPath.startsWith(rootPath)) { throw new ParquetEncodingException(footerPath + " invalid: all the files must be contained in the root " + root); } footerPath = footerPath.substring(rootPath.length()); while (footerPath.startsWith("/")) { footerPath = footerPath.substring(1); } fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData); for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) { block.setPath(footerPath); blocks.add(block); } } return new ParquetMetadata(fileMetaData.merge(), blocks); }
private void initReader() throws IOException { if (reader != null) { reader.close(); reader = null; } if (footersIterator.hasNext()) { Footer footer = footersIterator.next(); List<BlockMetaData> blocks = footer.getParquetMetadata().getBlocks(); MessageType fileSchema = footer.getParquetMetadata().getFileMetaData().getSchema(); List<BlockMetaData> filteredBlocks = RowGroupFilter.filterRowGroups( filter, blocks, fileSchema); reader = new InternalParquetRecordReader<T>(readSupport, filter); reader.initialize(footer.getParquetMetadata().getFileMetaData(), footer.getFile(), filteredBlocks, conf); } }
@Override public Map<Path, Footer> call() throws Exception { ParquetMetadata mergedMetadata = readSummaryMetadata(configuration, path, skipRowGroups); if (mergedMetadata != null) { final List<Footer> footers; if (skipRowGroups) { footers = new ArrayList<Footer>(); for (FileStatus f : partFiles) { footers.add(new Footer(f.getPath(), mergedMetadata)); } } else { footers = footersFromSummaryFile(path, mergedMetadata); } Map<Path, Footer> map = new HashMap<Path, Footer>(); for (Footer footer : footers) { // the folder may have been moved footer = new Footer(new Path(path, footer.getFile().getName()), footer.getParquetMetadata()); map.put(footer.getFile(), footer); } return map; } else { return Collections.emptyMap(); } } });
@Override public Map<Path, Footer> call() throws Exception { ParquetMetadata mergedMetadata = readSummaryMetadata(configuration, path, skipRowGroups); if (mergedMetadata != null) { final List<Footer> footers; if (skipRowGroups) { footers = new ArrayList<Footer>(); for (FileStatus f : partFiles) { footers.add(new Footer(f.getPath(), mergedMetadata)); } } else { footers = footersFromSummaryFile(path, mergedMetadata); } Map<Path, Footer> map = new HashMap<Path, Footer>(); for (Footer footer : footers) { // the folder may have been moved footer = new Footer(new Path(path, footer.getFile().getName()), footer.getParquetMetadata()); map.put(footer.getFile(), footer); } return map; } else { return Collections.emptyMap(); } } });
public FootersCacheValue(FileStatusWrapper status, Footer footer) { this.modificationTime = status.getModificationTime(); this.footer = new Footer(footer.getFile(), footer.getParquetMetadata()); }
private void initReader() throws IOException { if (reader != null) { reader.close(); reader = null; } if (footersIterator.hasNext()) { Footer footer = footersIterator.next(); List<BlockMetaData> blocks = footer.getParquetMetadata().getBlocks(); MessageType fileSchema = footer.getParquetMetadata().getFileMetaData().getSchema(); List<BlockMetaData> filteredBlocks = RowGroupFilter.filterRowGroups( filter, blocks, fileSchema); reader = new InternalParquetRecordReader<T>(readSupport, filter); reader.initialize(footer.getParquetMetadata().getFileMetaData(), footer.getFile(), filteredBlocks, conf); } }
private ParquetMetadata getParquetMetadata() { return getFooters().stream().findFirst().get().getParquetMetadata(); }
public FootersCacheValue(FileStatusWrapper status, Footer footer) { this.modificationTime = status.getModificationTime(); this.footer = new Footer(footer.getFile(), footer.getParquetMetadata()); }
@Override public List<IParquetInputField> readSchema( String file ) throws Exception { return inClassloader( () -> { ConfigurationProxy conf = new ConfigurationProxy(); S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( file, conf ); Path filePath = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( file ) ); FileSystem fs = FileSystem.get( filePath.toUri(), conf ); FileStatus fileStatus = fs.getFileStatus( filePath ); List<Footer> footers = ParquetFileReader.readFooters( conf, fileStatus, true ); if ( footers.isEmpty() ) { return new ArrayList<IParquetInputField>(); } else { ParquetMetadata meta = footers.get( 0 ).getParquetMetadata(); MessageType schema = meta.getFileMetaData().getSchema(); return ParquetConverter.buildInputFields( schema ); } } ); } }
private MessageType readSchema(FlowProcess<? extends JobConf> flowProcess, Tap tap) { try { Hfs hfs; if( tap instanceof CompositeTap ) hfs = (Hfs) ( (CompositeTap) tap ).getChildTaps().next(); else hfs = (Hfs) tap; List<Footer> footers = getFooters(flowProcess, hfs); if(footers.isEmpty()) { throw new TapException("Could not read Parquet metadata at " + hfs.getPath()); } else { return footers.get(0).getParquetMetadata().getFileMetaData().getSchema(); } } catch (IOException e) { throw new TapException(e); } }
private MessageType readSchema(FlowProcess<? extends JobConf> flowProcess, Tap tap) { try { Hfs hfs; if( tap instanceof CompositeTap ) hfs = (Hfs) ( (CompositeTap) tap ).getChildTaps().next(); else hfs = (Hfs) tap; List<Footer> footers = getFooters(flowProcess, hfs); if(footers.isEmpty()) { throw new TapException("Could not read Parquet metadata at " + hfs.getPath()); } else { return footers.get(0).getParquetMetadata().getFileMetaData().getSchema(); } } catch (IOException e) { throw new TapException(e); } }
private MessageType readSchema(FlowProcess<JobConf> flowProcess, Tap tap) { try { Hfs hfs; if( tap instanceof CompositeTap ) hfs = (Hfs) ( (CompositeTap) tap ).getChildTaps().next(); else hfs = (Hfs) tap; List<Footer> footers = getFooters(flowProcess, hfs); if(footers.isEmpty()) { throw new TapException("Could not read Parquet metadata at " + hfs.getPath()); } else { return footers.get(0).getParquetMetadata().getFileMetaData().getSchema(); } } catch (IOException e) { throw new TapException(e); } }
FileSystem fs = file.getFileSystem(configuration); FileStatus fileStatus = fs.getFileStatus(file); ParquetMetadata parquetMetaData = footer.getParquetMetadata(); List<BlockMetaData> blocks = parquetMetaData.getBlocks();
FileSystem fs = file.getFileSystem(configuration); FileStatus fileStatus = fs.getFileStatus(file); ParquetMetadata parquetMetaData = footer.getParquetMetadata(); List<BlockMetaData> blocks = parquetMetaData.getBlocks();
private void validateFooters(final List<Footer> metadata) { logger.debug(metadata.toString()); assertEquals(3, metadata.size()); for (Footer footer : metadata) { final File file = new File(footer.getFile().toUri()); assertTrue(file.getName(), file.getName().startsWith("part")); assertTrue(file.getPath(), file.exists()); final ParquetMetadata parquetMetadata = footer.getParquetMetadata(); assertEquals(2, parquetMetadata.getBlocks().size()); final Map<String, String> keyValueMetaData = parquetMetadata.getFileMetaData().getKeyValueMetaData(); assertEquals("bar", keyValueMetaData.get("foo")); assertEquals(footer.getFile().getName(), keyValueMetaData.get(footer.getFile().getName())); } }