FSDataInputStream inputStream = fileSystem.open(path);
ParquetMetadata parquetMetadata = MetadataReader.readFooter(inputStream, path, fileSize);
FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
MessageType fileSchema = fileMetaData.getSchema();
dataSource = buildHdfsParquetDataSource(inputStream, path, fileSize, stats);
for (BlockMetaData block : parquetMetadata.getBlocks()) {
  long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
  if (firstDataPage >= start && firstDataPage < start + length) {
    // ...
  }
}
return new ParquetMetadata(new parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, fileMetaData.getCreated_by()), blocks);
public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
  List<BlockMetaData> blocks = parquetMetadata.getBlocks();
  List<RowGroup> rowGroups = new ArrayList<RowGroup>();
  // row counts are long; using int here would risk overflow on very large files
  long numRows = 0;
  for (BlockMetaData block : blocks) {
    numRows += block.getRowCount();
    addRowGroup(parquetMetadata, rowGroups, block);
  }
  FileMetaData fileMetaData = new FileMetaData(
      currentVersion,
      toParquetSchema(parquetMetadata.getFileMetaData().getSchema()),
      numRows,
      rowGroups);
  Set<Entry<String, String>> keyValues = parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
  for (Entry<String, String> keyValue : keyValues) {
    addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
  }
  fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());
  return fileMetaData;
}
/**
 * Read the Parquet schema from a Parquet file.
 */
private MessageType readSchemaFromDataFile(Path parquetFilePath) throws IOException {
  LOG.info("Reading schema from " + parquetFilePath);
  if (!fs.exists(parquetFilePath)) {
    throw new IllegalArgumentException(
        "Failed to read schema from data file " + parquetFilePath + ". File does not exist.");
  }
  ParquetMetadata fileFooter =
      ParquetFileReader.readFooter(fs.getConf(), parquetFilePath, ParquetMetadataConverter.NO_FILTER);
  return fileFooter.getFileMetaData().getSchema();
}
private void initReader() throws IOException {
  if (reader != null) {
    reader.close();
    reader = null;
  }
  if (footersIterator.hasNext()) {
    Footer footer = footersIterator.next();
    reader = new InternalParquetRecordReader<T>(readSupport, filter);
    reader.initialize(
        readContext.getRequestedSchema(),
        globalMetaData.getSchema(),
        footer.getParquetMetadata().getFileMetaData().getKeyValueMetaData(),
        readContext.getReadSupportMetadata(),
        footer.getFile(),
        footer.getParquetMetadata().getBlocks(),
        conf);
  }
}
createdBy.addAll(mergedMetadata.getCreatedBy());
if ((schema == null && toMerge.getSchema() != null)
    || (schema != null && !schema.equals(toMerge.getSchema()))) {
  schema = mergeInto(toMerge.getSchema(), schema);
}
for (Entry<String, String> entry : toMerge.getKeyValueMetaData().entrySet()) {
  Set<String> values = newKeyValues.get(entry.getKey());
  if (values == null) {
    // ...
  }
}
createdBy.add(toMerge.getCreatedBy());
return new GlobalMetaData(
    schema,
    // ...
/**
 * Will merge the metadata as if it was coming from a single file.
 * (for all part files written together this will always work)
 * If there are conflicting values an exception will be thrown
 * @return the merged version of this
 */
public FileMetaData merge() {
  String createdByString = createdBy.size() == 1 ?
      createdBy.iterator().next() :
      createdBy.toString();
  Map<String, String> mergedKeyValues = new HashMap<String, String>();
  for (Entry<String, Set<String>> entry : keyValueMetaData.entrySet()) {
    if (entry.getValue().size() > 1) {
      throw new RuntimeException("could not merge metadata: key " + entry.getKey()
          + " has conflicting values: " + entry.getValue());
    }
    mergedKeyValues.put(entry.getKey(), entry.getValue().iterator().next());
  }
  return new FileMetaData(schema, mergedKeyValues, createdByString);
}
private void initReader() throws IOException {
  if (reader != null) {
    reader.close();
    reader = null;
  }
  if (footersIterator.hasNext()) {
    Footer footer = footersIterator.next();
    List<BlockMetaData> blocks = footer.getParquetMetadata().getBlocks();
    MessageType fileSchema = footer.getParquetMetadata().getFileMetaData().getSchema();
    List<BlockMetaData> filteredBlocks = RowGroupFilter.filterRowGroups(filter, blocks, fileSchema);
    reader = new InternalParquetRecordReader<T>(readSupport, filter);
    reader.initialize(
        fileSchema,
        footer.getParquetMetadata().getFileMetaData().getKeyValueMetaData(),
        footer.getFile(),
        filteredBlocks,
        conf);
  }
}
/**
 * Reads the schema from the parquet file. This is different from ParquetUtils as it uses the
 * twitter parquet to support hive 1.1.0
 */
private static MessageType readSchema(Configuration conf, Path parquetFilePath) {
  try {
    return ParquetFileReader.readFooter(conf, parquetFilePath).getFileMetaData().getSchema();
  } catch (IOException e) {
    throw new HoodieIOException("Failed to read footer for parquet " + parquetFilePath, e);
  }
}
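The usages above all share the same basic shape: read only the footer, take its FileMetaData, and query the schema and key/value metadata from there. A minimal, self-contained sketch of that pattern follows. The class name and the command-line argument for the path are illustrative assumptions, the pre-Apache parquet.* package names match the snippets here (newer releases use org.apache.parquet.*), and readFooter(Configuration, Path) is the same overload used in the snippet above.

import java.io.IOException;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.FileMetaData;
import parquet.hadoop.metadata.ParquetMetadata;
import parquet.schema.MessageType;

// Hypothetical utility: prints the footer-level metadata of one Parquet file.
public class FooterSchemaDump {
  public static void main(String[] args) throws IOException {
    Path path = new Path(args[0]); // path to an existing Parquet file (assumption)
    Configuration conf = new Configuration();

    // Read only the footer, not the data pages.
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, path);
    FileMetaData fileMetaData = footer.getFileMetaData();

    // Writer version, schema, and application key/value pairs all hang off FileMetaData.
    MessageType schema = fileMetaData.getSchema();
    System.out.println("created by: " + fileMetaData.getCreatedBy());
    System.out.println("schema:\n" + schema);
    for (Map.Entry<String, String> entry : fileMetaData.getKeyValueMetaData().entrySet()) {
      System.out.println("extra: " + entry.getKey() + " = " + entry.getValue());
    }
  }
}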
public static void showDetails(PrettyPrintWriter out, FileMetaData meta) {
  out.format("creator: %s%n", meta.getCreatedBy());
  Map<String, String> extra = meta.getKeyValueMetaData();
  if (extra != null) {
    for (Map.Entry<String, String> entry : meta.getKeyValueMetaData().entrySet()) {
      out.print("extra: ");
      out.incrementTabLevel();
      out.format("%s = %s%n", entry.getKey(), entry.getValue());
      out.decrementTabLevel();
    }
  }
  out.println();
  out.format("file schema: %s%n", meta.getSchema().getName());
  out.rule('-');
  showDetails(out, meta.getSchema());
}
long fileSize = fileToStore.length();
String size = (fileSize > 1024) ? (fileSize / 1024) + "Kb" : fileSize + "b";
fileMetaData = new FileMetaData(fileName, size, true);
MessageType fileSchema = footer.getFileMetaData().getSchema();
Filter filter = getFilter(configuration);
filteredBlocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
} else {
  for (BlockMetaData block : footer.getBlocks()) {
    if (offsets.contains(block.getStartingPos())) {
      filteredBlocks.add(block);
    }
  }
}
// ...
MessageType fileSchema = footer.getFileMetaData().getSchema();
Map<String, String> fileMetaData = footer.getFileMetaData().getKeyValueMetaData();
internalReader.initialize(
    fileSchema, fileMetaData, path, filteredBlocks, configuration);
List<BlockMetaData> blocks = filterBlocks(footer.getBlocks());
if (blocks.isEmpty()) {
  return null;
}
this.materializer = new DataModelMaterializer(
    descriptor, footer.getFileMetaData().getSchema(), mappingConfiguration);
this.columnIo = new ColumnIOFactory().getColumnIO(
    materializer.getMaterializeSchema(), footer.getFileMetaData().getSchema());
private ParquetFileReader createFileReader(ParquetMetadata meta, List<BlockMetaData> blocks) throws IOException {
  FileMetaData fileMetaData = meta.getFileMetaData();
  if (FILE_READER_NEWER_CTOR != null) {
    try {
      return FILE_READER_NEWER_CTOR.newInstance(
          hadoopConfiguration, fileMetaData, path, blocks, fileMetaData.getSchema().getColumns());
    } catch (ReflectiveOperationException | IllegalArgumentException | SecurityException e) {
      LOG.debug("failed ParquetFileReader.<init>", e);
    }
  }
  return new ParquetFileReader(
      hadoopConfiguration, path, blocks, fileMetaData.getSchema().getColumns());
}
static GlobalMetaData mergeInto(
    FileMetaData toMerge,
    GlobalMetaData mergedMetadata,
    boolean strict) {
  MessageType schema = null;
  Map<String, Set<String>> newKeyValues = new HashMap<String, Set<String>>();
  Set<String> createdBy = new HashSet<String>();
  if (mergedMetadata != null) {
    schema = mergedMetadata.getSchema();
    newKeyValues.putAll(mergedMetadata.getKeyValueMetaData());
    createdBy.addAll(mergedMetadata.getCreatedBy());
  }
  if ((schema == null && toMerge.getSchema() != null)
      || (schema != null && !schema.equals(toMerge.getSchema()))) {
    schema = mergeInto(toMerge.getSchema(), schema, strict);
  }
  for (Entry<String, String> entry : toMerge.getKeyValueMetaData().entrySet()) {
    Set<String> values = newKeyValues.get(entry.getKey());
    if (values == null) {
      values = new HashSet<String>();
      newKeyValues.put(entry.getKey(), values);
    }
    values.add(entry.getValue());
  }
  createdBy.add(toMerge.getCreatedBy());
  return new GlobalMetaData(
      schema,
      newKeyValues,
      createdBy);
}
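Read together with the merge() method shown earlier, mergeInto folds one file's FileMetaData into a running GlobalMetaData, and merge() then collapses the accumulated value sets back into a single FileMetaData, throwing if any key has conflicting values. A rough sketch of that flow, assuming a non-empty list of footers and that the code sits somewhere the package-private mergeInto helper above is visible (in parquet-mr it lives alongside ParquetFileWriter):

// Sketch only: 'footers' and the method placement are assumptions.
static FileMetaData mergeFooters(List<Footer> footers) {
  GlobalMetaData global = null;
  for (Footer footer : footers) {
    // Fold each file's FileMetaData into the running GlobalMetaData;
    // the boolean 'strict' flag is passed through to the schema merge above.
    global = mergeInto(footer.getParquetMetadata().getFileMetaData(), global, true);
  }
  // Collapses each key's value set back to a single value;
  // throws if a key saw conflicting values across files.
  return global.merge();
}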
/**
 * ends a file once all blocks have been written.
 * closes the file.
 * @param extraMetaData the extra meta data to write in the footer
 * @throws IOException
 */
public void end(Map<String, String> extraMetaData) throws IOException {
  state = state.end();
  if (DEBUG) LOG.debug(out.getPos() + ": end");
  ParquetMetadata footer = new ParquetMetadata(new FileMetaData(schema, extraMetaData, Version.FULL_VERSION), blocks);
  serializeFooter(footer, out);
  out.close();
}
try {
  ParquetMetadata parquetMetadata = ParquetMetadataReader.readFooter(configuration, path);
  FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
  MessageType fileSchema = fileMetaData.getSchema();
  for (BlockMetaData block : parquetMetadata.getBlocks()) {
    long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
    if (firstDataPage >= start && firstDataPage < start + length) {
      // ...
    }
  }
  ParquetPredicate parquetPredicate = buildParquetPredicate(columns, effectivePredicate, fileMetaData.getSchema(), typeManager);
  blocks = blocks.stream()
      .filter(block -> predicateMatches(parquetPredicate, block, configuration, dataSource, requestedSchema, effectivePredicate))
  // ...
      fileMetaData.getSchema(),
      fileMetaData.getKeyValueMetaData(),
      requestedSchema,
      blocks,
FileStatus fileStatus = fs.getFileStatus(file);
ParquetMetadata parquetMetaData = footer.getParquetMetadata();
List<BlockMetaData> blocks = parquetMetaData.getBlocks();
filteredBlocks = RowGroupFilter.filterRowGroups(filter, blocks, parquetMetaData.getFileMetaData().getSchema());
rowGroupsDropped += blocks.size() - filteredBlocks.size();
MessageType schema = metaData.getFileMetaData().getSchema();