FSDataInputStream inputStream = fileSystem.open(path);
ParquetMetadata parquetMetadata = MetadataReader.readFooter(inputStream, path, fileSize);
FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
MessageType fileSchema = fileMetaData.getSchema();
dataSource = buildHdfsParquetDataSource(inputStream, path, fileSize, stats);
public parquet.hadoop.metadata.FileMetaData getFileMetaData() {
    return this.metaData.getFileMetaData();
}
/**
 * Reads the Parquet schema from a Parquet data file.
 */
private MessageType readSchemaFromDataFile(Path parquetFilePath) throws IOException {
    LOG.info("Reading schema from " + parquetFilePath);
    if (!fs.exists(parquetFilePath)) {
        throw new IllegalArgumentException(
            "Failed to read schema from data file " + parquetFilePath + ". File does not exist.");
    }
    ParquetMetadata fileFooter =
        ParquetFileReader.readFooter(fs.getConf(), parquetFilePath, ParquetMetadataConverter.NO_FILTER);
    return fileFooter.getFileMetaData().getSchema();
}
static GlobalMetaData getGlobalMetaData(List<Footer> footers, boolean strict) {
    GlobalMetaData fileMetaData = null;
    for (Footer footer : footers) {
        ParquetMetadata currentMetadata = footer.getParquetMetadata();
        fileMetaData = mergeInto(currentMetadata.getFileMetaData(), fileMetaData, strict);
    }
    return fileMetaData;
}
/**
 * Merges the metadata of all the footers together.
 * @param footers the footers of the files to merge
 * @return the global metadata for all the footers
 */
static GlobalMetaData getGlobalMetaData(List<Footer> footers) {
    GlobalMetaData fileMetaData = null;
    for (Footer footer : footers) {
        ParquetMetadata currentMetadata = footer.getParquetMetadata();
        fileMetaData = mergeInto(currentMetadata.getFileMetaData(), fileMetaData);
    }
    return fileMetaData;
}
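The merge helpers above consume a List<Footer>. A minimal sketch of how such a list is typically obtained and walked, assuming the org.apache.parquet coordinates and the (deprecated but still available) ParquetFileReader.readAllFootersInParallel helper:

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.Footer;
import org.apache.parquet.hadoop.ParquetFileReader;

public class ListFootersExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path dir = new Path(args[0]); // directory containing Parquet files
        FileStatus status = FileSystem.get(conf).getFileStatus(dir);

        // Read the footer of every Parquet file under the directory in parallel.
        List<Footer> footers = ParquetFileReader.readAllFootersInParallel(conf, status);

        // Each footer exposes the per-file ParquetMetadata -> FileMetaData -> schema chain
        // that the merge helpers above fold into a single GlobalMetaData.
        for (Footer footer : footers) {
            System.out.println(footer.getFile() + " -> "
                + footer.getParquetMetadata().getFileMetaData().getSchema());
        }
    }
}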
static List<Footer> footersFromSummaryFile(final Path parent, ParquetMetadata mergedFooters) {
    Map<Path, ParquetMetadata> footers = new HashMap<Path, ParquetMetadata>();
    List<BlockMetaData> blocks = mergedFooters.getBlocks();
    for (BlockMetaData block : blocks) {
        String path = block.getPath();
        Path fullPath = new Path(parent, path);
        ParquetMetadata current = footers.get(fullPath);
        if (current == null) {
            current = new ParquetMetadata(mergedFooters.getFileMetaData(), new ArrayList<BlockMetaData>());
            footers.put(fullPath, current);
        }
        current.getBlocks().add(block);
    }
    List<Footer> result = new ArrayList<Footer>();
    for (Entry<Path, ParquetMetadata> entry : footers.entrySet()) {
        result.add(new Footer(entry.getKey(), entry.getValue()));
    }
    return result;
}
/**
 * Reads the schema from the Parquet file. This differs from ParquetUtils in that it uses the
 * Twitter parquet artifacts to support Hive 1.1.0.
 */
private static MessageType readSchema(Configuration conf, Path parquetFilePath) {
    try {
        return ParquetFileReader.readFooter(conf, parquetFilePath).getFileMetaData().getSchema();
    } catch (IOException e) {
        throw new HoodieIOException("Failed to read footer for parquet " + parquetFilePath, e);
    }
}
static ParquetMetadata mergeFooters(Path root, List<Footer> footers) {
    String rootPath = root.toUri().getPath();
    GlobalMetaData fileMetaData = null;
    List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
    for (Footer footer : footers) {
        String footerPath = footer.getFile().toUri().getPath();
        if (!footerPath.startsWith(rootPath)) {
            throw new ParquetEncodingException(
                footerPath + " invalid: all the files must be contained in the root " + root);
        }
        footerPath = footerPath.substring(rootPath.length());
        while (footerPath.startsWith("/")) {
            footerPath = footerPath.substring(1);
        }
        fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData);
        for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
            block.setPath(footerPath);
            blocks.add(block);
        }
    }
    return new ParquetMetadata(fileMetaData.merge(), blocks);
}
MessageType schema = metaData.getFileMetaData().getSchema();
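Once the MessageType is in hand, it can be inspected field by field. A small hedged sketch (the describe helper and its output format are illustrative, not part of any snippet above):

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Type;

final class SchemaDescriber {
    // Hypothetical helper: lists the top-level fields and leaf columns of a Parquet schema.
    static void describe(MessageType schema) {
        for (Type field : schema.getFields()) {
            System.out.println("field:  " + field.getName());
        }
        for (ColumnDescriptor column : schema.getColumns()) {
            System.out.println("column: " + String.join(".", column.getPath()));
        }
    }
}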
public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
    List<BlockMetaData> blocks = parquetMetadata.getBlocks();
    List<RowGroup> rowGroups = new ArrayList<RowGroup>();
    long numRows = 0; // long, so large row counts are not truncated by the compound assignment below
    for (BlockMetaData block : blocks) {
        numRows += block.getRowCount();
        addRowGroup(parquetMetadata, rowGroups, block);
    }
    FileMetaData fileMetaData = new FileMetaData(
        currentVersion,
        toParquetSchema(parquetMetadata.getFileMetaData().getSchema()),
        numRows,
        rowGroups);
    Set<Entry<String, String>> keyValues =
        parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
    for (Entry<String, String> keyValue : keyValues) {
        addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
    }
    fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());
    return fileMetaData;
}
private static ParquetMetadata mergeFooters(Path root, List<Footer> footers) {
    String rootPath = root.toString();
    GlobalMetaData fileMetaData = null;
    List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
    for (Footer footer : footers) {
        String path = footer.getFile().toString();
        if (!path.startsWith(rootPath)) {
            throw new ParquetEncodingException(
                path + " invalid: all the files must be contained in the root " + root);
        }
        path = path.substring(rootPath.length());
        while (path.startsWith("/")) {
            path = path.substring(1);
        }
        fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData);
        for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
            block.setPath(path);
            blocks.add(block);
        }
    }
    return new ParquetMetadata(fileMetaData.merge(), blocks);
}
public static void showDetails(PrettyPrintWriter out, ParquetMetadata meta) {
    showDetails(out, meta.getFileMetaData());
    long i = 1;
    for (BlockMetaData bmeta : meta.getBlocks()) {
        out.println();
        showDetails(out, bmeta, i++);
    }
}
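A hedged sketch of how this dump routine might be driven, reusing the readFooter and PrettyPrintWriter calls that appear in other snippets of this section (the input path is hypothetical):

Configuration conf = new Configuration();
Path path = new Path("/tmp/example.parquet"); // hypothetical input file
ParquetMetadata meta =
    ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);

PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
    .withAutoColumn()
    .withColumnPadding(1)
    .build();

// Prints the file-level metadata followed by one section per row group.
showDetails(out, meta);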
private ParquetFileReader createFileReader(ParquetMetadata meta, List<BlockMetaData> blocks) throws IOException {
    FileMetaData fileMetaData = meta.getFileMetaData();
    if (FILE_READER_NEWER_CTOR != null) {
        try {
            return FILE_READER_NEWER_CTOR.newInstance(
                hadoopConfiguration, fileMetaData, path, blocks, fileMetaData.getSchema().getColumns());
        } catch (ReflectiveOperationException | IllegalArgumentException | SecurityException e) {
            LOG.debug("failed ParquetFileReader.<init>", e);
        }
    }
    return new ParquetFileReader(
        hadoopConfiguration, path, blocks, fileMetaData.getSchema().getColumns());
}
private MessageType readSchema(FlowProcess<JobConf> flowProcess, Tap tap) {
    try {
        Hfs hfs;
        if (tap instanceof CompositeTap) {
            hfs = (Hfs) ((CompositeTap) tap).getChildTaps().next();
        } else {
            hfs = (Hfs) tap;
        }
        List<Footer> footers = getFooters(flowProcess, hfs);
        if (footers.isEmpty()) {
            throw new TapException("Could not read Parquet metadata at " + hfs.getPath());
        } else {
            return footers.get(0).getParquetMetadata().getFileMetaData().getSchema();
        }
    } catch (IOException e) {
        throw new TapException(e);
    }
}
@Override
public void execute(CommandLine options) throws Exception {
    super.execute(options);

    String[] args = options.getArgs();
    String input = args[0];

    Configuration conf = new Configuration();
    Path inpath = new Path(input);

    ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inpath);
    MessageType schema = metaData.getFileMetaData().getSchema();

    PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
        .withAutoColumn()
        .withAutoCrop()
        .withWhitespaceHandler(WhiteSpaceHandler.ELIMINATE_NEWLINES)
        .withColumnPadding(1)
        .withMaxBufferedLines(1000000)
        .withFlushOnTab()
        .build();

    boolean showmd = !options.hasOption('m');
    boolean showdt = !options.hasOption('d');

    Set<String> showColumns = null;
    if (options.hasOption('c')) {
        String[] cols = options.getOptionValues('c');
        showColumns = new HashSet<String>(Arrays.asList(cols));
    }

    dump(out, metaData, schema, inpath, showmd, showdt, showColumns);
}
private void initReader() throws IOException {
    if (reader != null) {
        reader.close();
        reader = null;
    }
    if (footersIterator.hasNext()) {
        Footer footer = footersIterator.next();

        List<BlockMetaData> blocks = footer.getParquetMetadata().getBlocks();
        MessageType fileSchema = footer.getParquetMetadata().getFileMetaData().getSchema();

        List<BlockMetaData> filteredBlocks = RowGroupFilter.filterRowGroups(filter, blocks, fileSchema);

        reader = new InternalParquetRecordReader<T>(readSupport, filter);
        reader.initialize(fileSchema,
            footer.getParquetMetadata().getFileMetaData().getKeyValueMetaData(),
            footer.getFile(), filteredBlocks, conf);
    }
}
this.materializer = new DataModelMaterializer(
    descriptor, footer.getFileMetaData().getSchema(), mappingConfiguration);
this.columnIo = new ColumnIOFactory().getColumnIO(
    materializer.getMaterializeSchema(), footer.getFileMetaData().getSchema());
filteredBlocks = RowGroupFilter.filterRowGroups(filter, blocks, parquetMetaData.getFileMetaData().getSchema());
rowGroupsDropped += blocks.size() - filteredBlocks.size();
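The filter passed to RowGroupFilter.filterRowGroups is typically built from a statistics predicate. A minimal sketch assuming an int64 column named "id" (the column name and value are illustrative):

import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.filter2.predicate.FilterPredicate;

// Keep only row groups whose statistics may contain id == 42.
FilterPredicate predicate = FilterApi.eq(FilterApi.longColumn("id"), 42L);
FilterCompat.Filter filter = FilterCompat.get(predicate);

List<BlockMetaData> filteredBlocks =
    RowGroupFilter.filterRowGroups(filter, blocks, parquetMetaData.getFileMetaData().getSchema());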
private void initReader() throws IOException {
    if (reader != null) {
        reader.close();
        reader = null;
    }
    if (footersIterator.hasNext()) {
        Footer footer = footersIterator.next();
        reader = new InternalParquetRecordReader<T>(readSupport, filter);
        reader.initialize(
            readContext.getRequestedSchema(),
            globalMetaData.getSchema(),
            footer.getParquetMetadata().getFileMetaData().getKeyValueMetaData(),
            readContext.getReadSupportMetadata(),
            footer.getFile(),
            footer.getParquetMetadata().getBlocks(),
            conf);
    }
}
private static void add(ParquetMetadata footer) {
    for (BlockMetaData blockMetaData : footer.getBlocks()) {
        ++blockCount;
        MessageType schema = footer.getFileMetaData().getSchema();
        recordCount += blockMetaData.getRowCount();
        List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
        for (ColumnChunkMetaData columnMetaData : columns) {
            ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
            add(desc,
                columnMetaData.getValueCount(),
                columnMetaData.getTotalSize(),
                columnMetaData.getTotalUncompressedSize(),
                columnMetaData.getEncodings(),
                columnMetaData.getStatistics());
        }
    }
}