for (BlockMetaData block : parquetMetadata.getBlocks()) { long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset(); if (firstDataPage >= start && firstDataPage < start + length) {
/**
 * Returns the row groups (blocks) recorded in the underlying footer metadata.
 *
 * @return the list of {@link BlockMetaData} held by this metadata object
 */
public List<BlockMetaData> getBlocks() {
  return metaData.getBlocks();
}
}
/**
 * Splits a merged summary-file footer back into one {@link Footer} per original file.
 * Each block in the merged metadata carries the relative path of the file it came from;
 * blocks are grouped by that path (resolved against {@code parent}) and every group
 * shares the merged file-level metadata.
 *
 * @param parent the directory the summary file describes
 * @param mergedFooters the merged metadata read from the summary file
 * @return one footer per distinct file referenced by the merged blocks
 */
static List<Footer> footersFromSummaryFile(final Path parent, ParquetMetadata mergedFooters) {
  // group the merged row groups back by the file they came from
  Map<Path, ParquetMetadata> metadataByFile = new HashMap<Path, ParquetMetadata>();
  for (BlockMetaData block : mergedFooters.getBlocks()) {
    Path file = new Path(parent, block.getPath());
    ParquetMetadata fileMetadata = metadataByFile.get(file);
    if (fileMetadata == null) {
      // first block for this file: start it with the shared file-level metadata
      fileMetadata = new ParquetMetadata(mergedFooters.getFileMetaData(), new ArrayList<BlockMetaData>());
      metadataByFile.put(file, fileMetadata);
    }
    fileMetadata.getBlocks().add(block);
  }
  List<Footer> footers = new ArrayList<Footer>();
  for (Entry<Path, ParquetMetadata> entry : metadataByFile.entrySet()) {
    footers.add(new Footer(entry.getKey(), entry.getValue()));
  }
  return footers;
}
/**
 * Writes a _metadata and a _common_metadata file from the given footers.
 * The _metadata file contains the merged footers including all row groups;
 * the _common_metadata file contains only the file-level metadata (schema,
 * key/value pairs) with the row groups stripped out.
 *
 * @param configuration the configuration to use to get the FileSystem
 * @param outputPath the directory to write the _metadata file to
 * @param footers the list of footers to merge
 * @throws IOException if writing either summary file fails
 */
public static void writeMetadataFile(Configuration configuration, Path outputPath, List<Footer> footers) throws IOException {
  ParquetMetadata metadataFooter = mergeFooters(outputPath, footers);
  FileSystem fs = outputPath.getFileSystem(configuration);
  outputPath = outputPath.makeQualified(fs);
  writeMetadataFile(outputPath, metadataFooter, fs, PARQUET_METADATA_FILE);
  // NOTE: order matters — the full metadata must be written before the blocks are
  // cleared, because _common_metadata deliberately omits the row-group information
  metadataFooter.getBlocks().clear();
  writeMetadataFile(outputPath, metadataFooter, fs, PARQUET_COMMON_METADATA_FILE);
}
/**
 * Merges the given footers into a single {@link ParquetMetadata}. Each block is
 * re-pathed relative to {@code root} so the merged metadata can later be split
 * back per file (see {@code footersFromSummaryFile}).
 *
 * @param root the common root directory of all the footers' files
 * @param footers the footers to merge; must be non-empty
 * @return the merged metadata, with block paths relative to {@code root}
 * @throws ParquetEncodingException if a footer's file is not under {@code root}
 */
static ParquetMetadata mergeFooters(Path root, List<Footer> footers) {
  if (footers.isEmpty()) {
    // fail with a clear message instead of an NPE on fileMetaData.merge() below
    throw new IllegalArgumentException("cannot merge an empty list of footers for root " + root);
  }
  String rootPath = root.toUri().getPath();
  GlobalMetaData fileMetaData = null;
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
  for (Footer footer : footers) {
    String footerPath = footer.getFile().toUri().getPath();
    // require a path-segment boundary after the root: a plain startsWith would let
    // root "/tmp/out" spuriously accept "/tmp/output/part-0.parquet"
    boolean contained = footerPath.startsWith(rootPath)
        && (footerPath.length() == rootPath.length()
            || rootPath.endsWith("/")
            || footerPath.charAt(rootPath.length()) == '/');
    if (!contained) {
      throw new ParquetEncodingException(footerPath + " invalid: all the files must be contained in the root " + root);
    }
    // make the path relative to the root
    footerPath = footerPath.substring(rootPath.length());
    while (footerPath.startsWith("/")) {
      footerPath = footerPath.substring(1);
    }
    fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData);
    for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
      block.setPath(footerPath);
      blocks.add(block);
    }
  }
  return new ParquetMetadata(fileMetaData.merge(), blocks);
}
/**
 * Merges the given footers into a single {@link ParquetMetadata}, re-pathing each
 * block relative to {@code root}.
 *
 * @param root the common root directory of all the footers' files
 * @param footers the footers to merge
 * @return the merged metadata, with block paths relative to {@code root}
 * @throws ParquetEncodingException if a footer's file is not under {@code root}
 */
private static ParquetMetadata mergeFooters(Path root, List<Footer> footers) {
  // compare URI paths rather than Path.toString(): toString() includes the scheme
  // and authority, so a qualified root ("hdfs://nn/out") would never match an
  // unqualified footer path ("/out/part-0.parquet") even when it is contained
  String rootPath = root.toUri().getPath();
  GlobalMetaData fileMetaData = null;
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
  for (Footer footer : footers) {
    String path = footer.getFile().toUri().getPath();
    if (!path.startsWith(rootPath)) {
      throw new ParquetEncodingException(path + " invalid: all the files must be contained in the root " + root);
    }
    // make the path relative to the root
    path = path.substring(rootPath.length());
    while (path.startsWith("/")) {
      path = path.substring(1);
    }
    fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData);
    for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
      block.setPath(path);
      blocks.add(block);
    }
  }
  return new ParquetMetadata(fileMetaData.merge(), blocks);
}
/**
 * Creates a reader over the Parquet file(s) at the given path.
 *
 * @param conf the configuration
 * @param file the file to read
 * @param readSupport to materialize records
 * @param filter the filter to use to filter records
 * @throws IOException if the footers cannot be listed or read
 */
public ParquetReader(Configuration conf, Path file, ReadSupport<T> readSupport, UnboundRecordFilter filter) throws IOException {
  this.readSupport = readSupport;
  this.filter = filter;
  this.conf = conf;
  FileSystem fs = file.getFileSystem(conf);
  List<FileStatus> statuses = Arrays.asList(fs.listStatus(file));
  List<Footer> footers = ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, statuses);
  this.footersIterator = footers.iterator();
  globalMetaData = ParquetFileWriter.getGlobalMetaData(footers);
  // NOTE: a previous version also collected all blocks into a local list here,
  // but it was never used — removed as dead code
  MessageType schema = globalMetaData.getSchema();
  Map<String, Set<String>> extraMetadata = globalMetaData.getKeyValueMetaData();
  readContext = readSupport.init(new InitContext(conf, extraMetadata, schema));
}
/**
 * Prints the file-level metadata followed by the details of each row group,
 * numbered starting from 1.
 *
 * @param out the writer to print to
 * @param meta the footer metadata to display
 */
public static void showDetails(PrettyPrintWriter out, ParquetMetadata meta) {
  showDetails(out, meta.getFileMetaData());
  long rowGroupNum = 1;
  for (BlockMetaData block : meta.getBlocks()) {
    out.println();
    showDetails(out, block, rowGroupNum);
    rowGroupNum++;
  }
}
BlockMap.computeBlocks(context.getFileSystem(), status), false); for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) { if (block.getColumns().isEmpty()) { continue;
MessageType fileSchema = footer.getFileMetaData().getSchema(); Filter filter = getFilter(configuration); filteredBlocks = filterRowGroups(filter, footer.getBlocks(), fileSchema); } else { for (BlockMetaData block : footer.getBlocks()) { if (offsets.contains(block.getStartingPos())) { filteredBlocks.add(block); long[] foundRowGroupOffsets = new long[footer.getBlocks().size()]; for (int i = 0; i < foundRowGroupOffsets.length; i++) { foundRowGroupOffsets[i] = footer.getBlocks().get(i).getStartingPos();
readers.add(new ParquetFileReader(conf, status.getPath(), metadata.getBlocks(), columnDescriptors));
try { ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(configuration, path, NO_FILTER); List<BlockMetaData> blocks = parquetMetadata.getBlocks(); FileMetaData fileMetaData = parquetMetadata.getFileMetaData(); MessageType fileSchema = fileMetaData.getSchema();
FileStatus fileStatus = fs.getFileStatus(file); ParquetMetadata parquetMetaData = footer.getParquetMetadata(); List<BlockMetaData> blocks = parquetMetaData.getBlocks();
/**
 * Closes the current record reader (if any) and, when more footers remain,
 * opens a new one over the next footer's blocks.
 *
 * @throws IOException if closing or initializing a reader fails
 */
private void initReader() throws IOException {
  if (reader != null) {
    reader.close();
    reader = null;
  }
  if (!footersIterator.hasNext()) {
    // no more footers: leave reader null to signal end of input
    return;
  }
  Footer footer = footersIterator.next();
  reader = new InternalParquetRecordReader<T>(readSupport, filter);
  reader.initialize(
      readContext.getRequestedSchema(),
      globalMetaData.getSchema(),
      footer.getParquetMetadata().getFileMetaData().getKeyValueMetaData(),
      readContext.getReadSupportMetadata(),
      footer.getFile(),
      footer.getParquetMetadata().getBlocks(),
      conf);
}
/**
 * Converts the in-memory {@link ParquetMetadata} into its Thrift
 * {@link FileMetaData} representation for serialization into the footer.
 *
 * @param currentVersion the format version to record
 * @param parquetMetadata the metadata to convert
 * @return the Thrift file metadata, including schema, row groups and key/values
 */
public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
  List<BlockMetaData> blocks = parquetMetadata.getBlocks();
  List<RowGroup> rowGroups = new ArrayList<RowGroup>();
  // use a long: the total row count across row groups can exceed Integer.MAX_VALUE,
  // and the previous `int` compound assignment silently truncated the sum
  long numRows = 0;
  for (BlockMetaData block : blocks) {
    numRows += block.getRowCount();
    addRowGroup(parquetMetadata, rowGroups, block);
  }
  FileMetaData fileMetaData = new FileMetaData(
      currentVersion,
      toParquetSchema(parquetMetadata.getFileMetaData().getSchema()),
      numRows,
      rowGroups);
  Set<Entry<String, String>> keyValues = parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
  for (Entry<String, String> keyValue : keyValues) {
    addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
  }
  fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());
  return fileMetaData;
}
List<BlockMetaData> blocks = filterBlocks(footer.getBlocks()); if (blocks.isEmpty()) { return null;
/**
 * Closes the current record reader (if any) and, when more footers remain,
 * opens a new one over the next footer's row groups after applying the
 * row-group filter.
 *
 * @throws IOException if closing or initializing a reader fails
 */
private void initReader() throws IOException {
  if (reader != null) {
    reader.close();
    reader = null;
  }
  if (!footersIterator.hasNext()) {
    // no more footers: leave reader null to signal end of input
    return;
  }
  Footer footer = footersIterator.next();
  ParquetMetadata footerMetadata = footer.getParquetMetadata();
  MessageType fileSchema = footerMetadata.getFileMetaData().getSchema();
  List<BlockMetaData> filteredBlocks =
      RowGroupFilter.filterRowGroups(filter, footerMetadata.getBlocks(), fileSchema);
  reader = new InternalParquetRecordReader<T>(readSupport, filter);
  reader.initialize(
      fileSchema,
      footerMetadata.getFileMetaData().getKeyValueMetaData(),
      footer.getFile(),
      filteredBlocks,
      conf);
}
fileMetaData.getKeyValueMetaData(), fileSchema, parquetMetadata.getBlocks(), jobConf, dataSource);
final List<BlockMetaData> blocks = parquetMetadata.getBlocks(); final FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
/**
 * Accumulates the footer's row groups into the static counters: bumps the block
 * count, adds row counts, and records per-column value counts, sizes, encodings
 * and statistics via the per-column {@code add} overload.
 *
 * @param footer the footer whose blocks and columns are tallied
 */
private static void add(ParquetMetadata footer) {
  // the schema belongs to the footer, not to a row group: hoist the lookup out
  // of the loop instead of recomputing it for every block
  MessageType schema = footer.getFileMetaData().getSchema();
  for (BlockMetaData blockMetaData : footer.getBlocks()) {
    ++blockCount;
    recordCount += blockMetaData.getRowCount();
    List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
    for (ColumnChunkMetaData columnMetaData : columns) {
      ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
      add(
          desc,
          columnMetaData.getValueCount(),
          columnMetaData.getTotalSize(),
          columnMetaData.getTotalUncompressedSize(),
          columnMetaData.getEncodings(),
          columnMetaData.getStatistics());
    }
  }
}