Refine search
FSDataInputStream inputStream = fileSystem.open(path); ParquetMetadata parquetMetadata = MetadataReader.readFooter(inputStream, path, fileSize); FileMetaData fileMetaData = parquetMetadata.getFileMetaData(); MessageType fileSchema = fileMetaData.getSchema(); dataSource = buildHdfsParquetDataSource(inputStream, path, fileSize, stats); for (BlockMetaData block : parquetMetadata.getBlocks()) { long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset(); if (firstDataPage >= start && firstDataPage < start + length) {
return new ParquetMetadata(new parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, fileMetaData.getCreated_by()), blocks);
/**
 * Converts a parquet-mr {@link ParquetMetadata} footer into its Thrift
 * {@link FileMetaData} representation (schema, row groups, key/value
 * metadata, created-by string).
 *
 * @param currentVersion  the format version number to record in the footer
 * @param parquetMetadata the in-memory footer to convert
 * @return the Thrift footer ready to be serialized
 */
public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
  List<BlockMetaData> blocks = parquetMetadata.getBlocks();
  List<RowGroup> rowGroups = new ArrayList<RowGroup>();
  // BlockMetaData.getRowCount() returns a long and the Thrift num_rows field
  // is i64; the previous int accumulator silently overflowed for files with
  // more than ~2^31 total rows.
  long numRows = 0;
  for (BlockMetaData block : blocks) {
    numRows += block.getRowCount();
    addRowGroup(parquetMetadata, rowGroups, block);
  }
  FileMetaData fileMetaData = new FileMetaData(
      currentVersion,
      toParquetSchema(parquetMetadata.getFileMetaData().getSchema()),
      numRows,
      rowGroups);
  // Copy the application-level key/value metadata into the Thrift footer.
  Set<Entry<String, String>> keyValues =
      parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
  for (Entry<String, String> keyValue : keyValues) {
    addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
  }
  fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());
  return fileMetaData;
}
/**
 * Reads the Parquet schema stored in the footer of a data file.
 *
 * @param parquetFilePath path of the Parquet file to inspect
 * @return the file-level {@link MessageType} schema from the footer
 * @throws IOException if reading the footer fails
 * @throws IllegalArgumentException if the file does not exist
 */
private MessageType readSchemaFromDataFile(Path parquetFilePath) throws IOException {
  LOG.info("Reading schema from " + parquetFilePath);
  if (!fs.exists(parquetFilePath)) {
    throw new IllegalArgumentException(
        "Failed to read schema from data file " + parquetFilePath + ". File does not exist.");
  }
  // Read the footer without any filtering and pull the schema out of it.
  ParquetMetadata footer =
      ParquetFileReader.readFooter(fs.getConf(), parquetFilePath, ParquetMetadataConverter.NO_FILTER);
  return footer.getFileMetaData().getSchema();
}
/**
 * Splits a merged summary-file footer back into one {@link Footer} per data
 * file, grouping row groups by the file path recorded on each block.
 *
 * @param parent        directory containing the data files the summary describes
 * @param mergedFooters merged footer read from the summary file
 * @return one footer per referenced file, each holding only its own row groups
 */
static List<Footer> footersFromSummaryFile(final Path parent, ParquetMetadata mergedFooters) {
  // Group row groups by the fully-resolved path of the file they came from.
  Map<Path, ParquetMetadata> footerByFile = new HashMap<Path, ParquetMetadata>();
  for (BlockMetaData rowGroup : mergedFooters.getBlocks()) {
    Path fullPath = new Path(parent, rowGroup.getPath());
    ParquetMetadata footer = footerByFile.get(fullPath);
    if (footer == null) {
      // First row group seen for this file: start a footer that shares the
      // merged file-level metadata but has its own (initially empty) block list.
      footer = new ParquetMetadata(mergedFooters.getFileMetaData(), new ArrayList<BlockMetaData>());
      footerByFile.put(fullPath, footer);
    }
    footer.getBlocks().add(rowGroup);
  }
  List<Footer> result = new ArrayList<Footer>();
  for (Entry<Path, ParquetMetadata> entry : footerByFile.entrySet()) {
    result.add(new Footer(entry.getKey(), entry.getValue()));
  }
  return result;
}
/**
 * Closes the current record reader (if any) and, when another footer is
 * available, creates and initializes a reader for the next file.
 *
 * @throws IOException if closing the old reader or initializing the new one fails
 */
private void initReader() throws IOException {
  // Always release the previous reader before moving on.
  if (reader != null) {
    reader.close();
    reader = null;
  }
  if (!footersIterator.hasNext()) {
    // No more files to read; leave reader as null.
    return;
  }
  Footer footer = footersIterator.next();
  reader = new InternalParquetRecordReader<T>(readSupport, filter);
  reader.initialize(
      readContext.getRequestedSchema(),
      globalMetaData.getSchema(),
      footer.getParquetMetadata().getFileMetaData().getKeyValueMetaData(),
      readContext.getReadSupportMetadata(),
      footer.getFile(),
      footer.getParquetMetadata().getBlocks(),
      conf);
}
/**
 * Prints the file-level metadata followed by the details of each row group,
 * numbering row groups starting at 1.
 *
 * @param out  writer receiving the formatted output
 * @param meta footer whose file metadata and row groups are printed
 */
public static void showDetails(PrettyPrintWriter out, ParquetMetadata meta) {
  showDetails(out, meta.getFileMetaData());
  long rowGroupOrdinal = 0;
  for (BlockMetaData blockMeta : meta.getBlocks()) {
    out.println();
    // Row groups are numbered from 1 in the output.
    showDetails(out, blockMeta, ++rowGroupOrdinal);
  }
}
/**
 * @return the Parquet file-level metadata held by the wrapped footer
 */
public parquet.hadoop.metadata.FileMetaData getFileMetaData() {
  return metaData.getFileMetaData();
}
/**
 * @return the row-group (block) metadata list from the wrapped footer
 */
public List<BlockMetaData> getBlocks() {
  return metaData.getBlocks();
}
}
/**
 * Serializes the given footer metadata to a compact (single-line) JSON string
 * using the shared default {@code objectMapper}.
 *
 * @param parquetMetaData the footer metadata to serialize
 * @return the json representation
 */
public static String toJSON(ParquetMetadata parquetMetaData) {
  return toJSON(parquetMetaData, objectMapper);
}
/**
 * Closes the current record reader (if any) and, when another footer is
 * available, initializes a reader for the next file using only the row
 * groups that survive the configured filter.
 *
 * @throws IOException if closing the old reader or initializing the new one fails
 */
private void initReader() throws IOException {
  // Always release the previous reader before moving on.
  if (reader != null) {
    reader.close();
    reader = null;
  }
  if (!footersIterator.hasNext()) {
    // No more files to read; leave reader as null.
    return;
  }
  Footer footer = footersIterator.next();
  ParquetMetadata footerMeta = footer.getParquetMetadata();
  MessageType fileSchema = footerMeta.getFileMetaData().getSchema();
  // Drop row groups the filter can prove contain no matching records.
  List<BlockMetaData> filteredBlocks =
      RowGroupFilter.filterRowGroups(filter, footerMeta.getBlocks(), fileSchema);
  reader = new InternalParquetRecordReader<T>(readSupport, filter);
  reader.initialize(
      fileSchema,
      footerMeta.getFileMetaData().getKeyValueMetaData(),
      footer.getFile(),
      filteredBlocks,
      conf);
}
/**
 * Reads the schema from the parquet file. This is different from ParquetUtils
 * as it uses the twitter parquet artifact, to stay compatible with Hive 1.1.0.
 *
 * @param conf            Hadoop configuration used to open the file
 * @param parquetFilePath path of the Parquet file to inspect
 * @return the file-level schema stored in the footer
 * @throws HoodieIOException if the footer cannot be read
 */
private static MessageType readSchema(Configuration conf, Path parquetFilePath) {
  ParquetMetadata footer;
  try {
    footer = ParquetFileReader.readFooter(conf, parquetFilePath);
  } catch (IOException e) {
    // Wrap the checked IOException, preserving it as the cause.
    throw new HoodieIOException("Failed to read footer for parquet " + parquetFilePath, e);
  }
  return footer.getFileMetaData().getSchema();
}
/**
 * Merges the footers of several files under {@code root} into a single
 * summary footer, rewriting each row group's path to be relative to the root.
 *
 * @param root    common root directory that must contain every footer's file
 * @param footers per-file footers to merge; must be non-empty (an empty list
 *                would leave the merged metadata null)
 * @return a single {@link ParquetMetadata} covering all row groups
 * @throws ParquetEncodingException if a file is not under {@code root}
 */
static ParquetMetadata mergeFooters(Path root, List<Footer> footers) {
  String rootPath = root.toUri().getPath();
  // Containment must match a full path component: a plain startsWith(rootPath)
  // check wrongly accepted e.g. "/root2/file" for root "/root".
  String rootPrefix = rootPath.endsWith("/") ? rootPath : rootPath + "/";
  GlobalMetaData fileMetaData = null;
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
  for (Footer footer : footers) {
    String footerPath = footer.getFile().toUri().getPath();
    if (!(footerPath.equals(rootPath) || footerPath.startsWith(rootPrefix))) {
      throw new ParquetEncodingException(footerPath + " invalid: all the files must be contained in the root " + root);
    }
    // Make the path relative to the root and strip any leading separators.
    footerPath = footerPath.substring(rootPath.length());
    while (footerPath.startsWith("/")) {
      footerPath = footerPath.substring(1);
    }
    fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData);
    for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
      block.setPath(footerPath);
      blocks.add(block);
    }
  }
  return new ParquetMetadata(fileMetaData.merge(), blocks);
}
/**
 * Folds the file-level metadata of every footer into one {@link GlobalMetaData}.
 *
 * @param footers footers whose file metadata is merged
 * @param strict  whether the merge should reject incompatible metadata
 * @return the merged metadata, or {@code null} when {@code footers} is empty
 */
static GlobalMetaData getGlobalMetaData(List<Footer> footers, boolean strict) {
  GlobalMetaData merged = null;
  for (Footer footer : footers) {
    merged = mergeInto(footer.getParquetMetadata().getFileMetaData(), merged, strict);
  }
  return merged;
}
/**
 * Writes a _metadata and _common_metadata file for the given footers.
 *
 * @param configuration the configuration to use to get the FileSystem
 * @param outputPath    the directory to write the _metadata file to
 * @param footers       the list of footers to merge
 * @throws IOException if merging or writing the summary files fails
 */
public static void writeMetadataFile(Configuration configuration, Path outputPath, List<Footer> footers) throws IOException {
  ParquetMetadata mergedFooter = mergeFooters(outputPath, footers);
  FileSystem fs = outputPath.getFileSystem(configuration);
  Path qualifiedPath = outputPath.makeQualified(fs);
  // _metadata: the full merged footer, including every row group.
  writeMetadataFile(qualifiedPath, mergedFooter, fs, PARQUET_METADATA_FILE);
  // _common_metadata: the same footer with all row groups stripped out.
  mergedFooter.getBlocks().clear();
  writeMetadataFile(qualifiedPath, mergedFooter, fs, PARQUET_COMMON_METADATA_FILE);
}
/**
 * Serializes the given footer metadata to an indented, human-readable JSON
 * string using the shared {@code prettyObjectMapper}.
 *
 * @param parquetMetaData the footer metadata to serialize
 * @return the pretty printed json representation
 */
public static String toPrettyJSON(ParquetMetadata parquetMetaData) {
  return toJSON(parquetMetaData, prettyObjectMapper);
}
MessageType fileSchema = footer.getFileMetaData().getSchema(); Filter filter = getFilter(configuration); filteredBlocks = filterRowGroups(filter, footer.getBlocks(), fileSchema); } else { for (BlockMetaData block : footer.getBlocks()) { if (offsets.contains(block.getStartingPos())) { filteredBlocks.add(block); long[] foundRowGroupOffsets = new long[footer.getBlocks().size()]; for (int i = 0; i < foundRowGroupOffsets.length; i++) { foundRowGroupOffsets[i] = footer.getBlocks().get(i).getStartingPos(); MessageType fileSchema = footer.getFileMetaData().getSchema(); Map<String, String> fileMetaData = footer.getFileMetaData().getKeyValueMetaData(); internalReader.initialize( fileSchema, fileMetaData, path, filteredBlocks, configuration);
List<BlockMetaData> blocks = filterBlocks(footer.getBlocks()); if (blocks.isEmpty()) { return null; this.materializer = new DataModelMaterializer( descriptor, footer.getFileMetaData().getSchema(), mappingConfiguration); this.columnIo = new ColumnIOFactory().getColumnIO( materializer.getMaterializeSchema(), footer.getFileMetaData().getSchema());
// Builds a ParquetFileReader for the given footer and row groups.
// FILE_READER_NEWER_CTOR is presumably a reflectively-resolved constructor
// handle for a newer ParquetFileReader signature that accepts the file
// metadata directly — TODO confirm where it is initialized.
private ParquetFileReader createFileReader(ParquetMetadata meta, List<BlockMetaData> blocks) throws IOException {
  FileMetaData fileMetaData = meta.getFileMetaData();
  if (FILE_READER_NEWER_CTOR != null) {
    try {
      // Prefer the newer constructor when it was resolved at class load time.
      return FILE_READER_NEWER_CTOR.newInstance(
          hadoopConfiguration, fileMetaData, path, blocks,
          fileMetaData.getSchema().getColumns());
    } catch (ReflectiveOperationException | IllegalArgumentException | SecurityException e) {
      // Non-fatal: log at debug and fall through to the older constructor.
      LOG.debug("failed ParquetFileReader.<init>", e);
    }
  }
  // Fallback: older ParquetFileReader constructor without FileMetaData.
  return new ParquetFileReader(
      hadoopConfiguration, path, blocks, fileMetaData.getSchema().getColumns());
}
/**
 * Merges the footers of several files under {@code root} into a single
 * summary footer, rewriting each row group's path to be relative to the root.
 *
 * @param root    common root directory that must contain every footer's file
 * @param footers per-file footers to merge; must be non-empty (an empty list
 *                would leave the merged metadata null)
 * @return a single {@link ParquetMetadata} covering all row groups
 * @throws ParquetEncodingException if a file is not under {@code root}
 */
private static ParquetMetadata mergeFooters(Path root, List<Footer> footers) {
  String rootPath = root.toString();
  // Containment must match a full path component: a plain startsWith(rootPath)
  // check wrongly accepted e.g. "/root2/file" for root "/root".
  String rootPrefix = rootPath.endsWith("/") ? rootPath : rootPath + "/";
  GlobalMetaData fileMetaData = null;
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
  for (Footer footer : footers) {
    String path = footer.getFile().toString();
    if (!(path.equals(rootPath) || path.startsWith(rootPrefix))) {
      throw new ParquetEncodingException(path + " invalid: all the files must be contained in the root " + root);
    }
    // Make the path relative to the root and strip any leading separators.
    path = path.substring(rootPath.length());
    while (path.startsWith("/")) {
      path = path.substring(1);
    }
    fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData);
    for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
      block.setPath(path);
      blocks.add(block);
    }
  }
  return new ParquetMetadata(fileMetaData.merge(), blocks);
}