// Read the Parquet footer, preferring the LLAP metadata cache when available.
MemoryBufferOrBuffers footerData =
    (cacheKey == null || metadataCache == null) ? null : metadataCache.getFileMetadata(cacheKey);
if (footerData != null) {
  try {
    return ParquetFileReader.readFooter(new ParquetFooterInputFromCache(footerData), filter);
  } finally {
    metadataCache.decRefBuffer(footerData);
  }
}
final FileSystem fs = file.getFileSystem(configuration);
final FileStatus stat = fs.getFileStatus(file);
if (cacheKey == null || metadataCache == null) {
  return readFooterFromFile(file, fs, stat, filter);
}
// Cache the footer bytes first, then serve the read from the cache so the
// footer is only pulled from the file system once.
try (SeekableInputStream stream = HadoopStreams.wrap(fs.open(file))) {
  long footerLengthIndex = stat.getLen()
      - ParquetFooterInputFromCache.FOOTER_LENGTH_SIZE - ParquetFileWriter.MAGIC.length;
  // ... the original snippet elides seeking to footerLengthIndex and inserting
  // the footer bytes into metadataCache here ...
  try {
    return ParquetFileReader.readFooter(new ParquetFooterInputFromCache(footerData), filter);
  } finally {
    metadataCache.decRefBuffer(footerData);
  }
}
/**
 * Load Decision Tree model.
 *
 * @param pathToMdl Path to model.
 */
private static Model loadDecisionTreeModel(String pathToMdl) {
    try (ParquetFileReader r = ParquetFileReader.open(
        HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
        final Map<Integer, NodeData> nodes = new TreeMap<>();

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));

            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                NodeData nodeData = extractNodeDataFromParquetRow(g);
                nodes.put(nodeData.id, nodeData);
            }
        }

        return buildDecisionTreeModel(nodes);
    }
    catch (IOException e) {
        System.out.println("Error reading parquet file.");
        e.printStackTrace();
    }

    return null;
}
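// The row-group loop above is the standard low-level read pattern in parquet-mr:
// read the footer schema, then materialize each row group through a RecordReader.
// Below is a minimal, self-contained sketch of the same pattern, assuming only
// parquet-mr and Hadoop on the classpath; the class name and argument handling
// are illustrative, not from the original source.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.io.RecordReader;
import org.apache.parquet.schema.MessageType;

public class DumpParquetRows {
    public static void main(String[] args) throws Exception {
        try (ParquetFileReader r = ParquetFileReader.open(
            HadoopInputFile.fromPath(new Path(args[0]), new Configuration()))) {
            MessageType schema = r.getFooter().getFileMetaData().getSchema();
            MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

            PageReadStore pages;
            while (null != (pages = r.readNextRowGroup())) {
                RecordReader<Group> recordReader =
                    colIO.getRecordReader(pages, new GroupRecordConverter(schema));
                for (long i = 0, rows = pages.getRowCount(); i < rows; i++) {
                    // Group#toString prints the record's field/value pairs
                    System.out.println(recordReader.read());
                }
            }
        }
    }
}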
@Override
public void close() throws IOException {
  if (cacheFsPath != null) {
    LlapCacheAwareFs.unregisterFile(cacheFsPath);
  }
  if (reader != null) {
    reader.close();
  }
}
Configuration config = new Configuration();
config.set("spark.sql.parquet.binaryAsString", "false");
config.set("spark.sql.parquet.int96AsTimestamp", "false");

this.file = new Path(path);
long length = this.file.getFileSystem(config).getFileStatus(this.file).getLen();
ParquetMetadata footer = readFooter(config, file, range(0, length));

List<BlockMetaData> blocks = footer.getBlocks();
this.fileSchema = footer.getFileMetaData().getSchema();

// Project the requested columns onto the file schema.
Types.MessageTypeBuilder builder = Types.buildMessage();
for (String s : columns) {
  if (!fileSchema.containsField(s)) {
    throw new IOException("Can only project existing columns. Unknown field: " + s +
        " File schema:\n" + fileSchema);
  }
  builder.addFields(fileSchema.getType(s));
}
this.requestedSchema = builder.named("spark_schema");

this.reader = new ParquetFileReader(
    config, footer.getFileMetaData(), file, blocks, requestedSchema.getColumns());
for (BlockMetaData block : reader.getRowGroups()) {
  this.totalRowCount += block.getRowCount();
}
final FileStatus[] fileStatuses = fs.listStatus(new Path(parquetDir));
// assumption: the original snippet elides how the entry is picked; the last
// listed status is used here
final FileStatus lastEntry = fileStatuses[fileStatuses.length - 1];
Optional<Path> parquetFilePath = Optional.absent(); // Guava Optional assumed, matching Optional.of below
if (lastEntry.isFile()) {
  parquetFilePath = Optional.of(lastEntry.getPath());
} else if (lastEntry.isDirectory()) {
  final FileStatus[] directoryEntries = fs.listStatus(lastEntry.getPath());
  // assumption: the last directory entry is the data file to read the schema from
  final FileStatus lastFile = directoryEntries[directoryEntries.length - 1];
  if (lastFile.isFile()) {
    log.info("Reading schema data from : {}", lastFile.getPath().toString());
    parquetFilePath = Optional.of(lastFile.getPath());
  } else {
    throw new JobRuntimeException("Encountered a directory where there should only be files. Path: "
        + lastFile.getPath().toString());
  }
}
final ParquetMetadata metadata = ParquetFileReader.readFooter(new Configuration(),
    parquetFilePath.get(), ParquetMetadataConverter.NO_FILTER);
final MessageType messageType = metadata.getFileMetaData().getSchema();
final ParquetSchemaConverter converter = new ParquetSchemaConverter(new SQLConf());
final StructType structType = converter.convert(messageType);
public ParquetFileLoader(String filename, boolean lazy) {
  super(filename);
  this.path = new Path(this.filename);
  this.lazy = lazy;
  this.configuration = new Configuration();
  System.setProperty("hadoop.home.dir", "/");
  this.configuration.set("hadoop.security.authentication", "simple");
  this.configuration.set("hadoop.security.authorization", "false");
  try {
    this.metadata = ParquetFileReader.readFooter(this.configuration, this.path,
        ParquetMetadataConverter.NO_FILTER);
  } catch (IOException ex) {
    throw new RuntimeException(ex);
  }
}
public void runTestAndValidate(String selection, String validationSelection,
    String inputTable, String outputFile, boolean sort) throws Exception {
  try {
    deleteTableIfExists(outputFile);
    test("use dfs_test");
    // test("ALTER SESSION SET \"planner.add_producer_consumer\" = false");
    String query = select(selection, inputTable, sort);
    System.out.println(outputFile);
    String create = "CREATE TABLE " + outputFile + " AS " + query;
    String validateQuery = select(validationSelection, outputFile, sort);
    test(create);
    test(validateQuery); // TODO: remove
    testBuilder()
        .unOrdered()
        .sqlQuery(validateQuery)
        .sqlBaselineQuery(query)
        .go();

    Configuration hadoopConf = new Configuration();
    Path output = new Path(getDfsTestTmpSchemaLocation(), outputFile);
    FileSystem fs = output.getFileSystem(hadoopConf);
    for (FileStatus file : fs.listStatus(output)) {
      ParquetMetadata footer = ParquetFileReader.readFooter(hadoopConf, file, SKIP_ROW_GROUPS);
      String version = footer.getFileMetaData().getKeyValueMetaData().get(DREMIO_VERSION_PROPERTY);
      assertEquals(DremioVersionInfo.getVersion(), version);
      PageHeaderUtil.validatePageHeaders(file.getPath(), footer);
    }
  } finally {
    deleteTableIfExists(outputFile);
  }
}
static ParquetMetadata readSummaryMetadata(Configuration configuration, Path basePath,
    boolean skipRowGroups) throws IOException {
  Path metadataFile = new Path(basePath, PARQUET_METADATA_FILE);
  Path commonMetaDataFile = new Path(basePath, PARQUET_COMMON_METADATA_FILE);
  FileSystem fileSystem = basePath.getFileSystem(configuration);
  if (skipRowGroups && fileSystem.exists(commonMetaDataFile)) {
    // reading the summary file that does not contain the row groups
    LOG.info("reading summary file: {}", commonMetaDataFile);
    return readFooter(configuration, commonMetaDataFile, filter(skipRowGroups));
  } else if (fileSystem.exists(metadataFile)) {
    LOG.info("reading summary file: {}", metadataFile);
    return readFooter(configuration, metadataFile, filter(skipRowGroups));
  } else {
    return null;
  }
}
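// When neither summary file exists this helper returns null, so callers need a
// fallback. A hedged caller sketch, assuming the deprecated single-file
// ParquetFileReader.readFooter overload; the part-file name is illustrative,
// not from the original source.
static ParquetMetadata readSummaryOrFileFooter(Configuration configuration, Path basePath)
    throws IOException {
  ParquetMetadata summary = readSummaryMetadata(configuration, basePath, true);
  if (summary != null) {
    return summary;
  }
  Path partFile = new Path(basePath, "part-00000.parquet"); // illustrative file name
  return ParquetFileReader.readFooter(configuration, partFile,
      ParquetMetadataConverter.SKIP_ROW_GROUPS);
}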
public void initialize(ParquetFileReader reader, ParquetReadOptions options) {
  // copy custom configuration to the Configuration passed to the ReadSupport
  Configuration conf = new Configuration();
  if (options instanceof HadoopReadOptions) {
    conf = ((HadoopReadOptions) options).getConf();
  }
  for (String property : options.getPropertyNames()) {
    conf.set(property, options.getProperty(property));
  }

  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(conf, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(conf, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = options.isEnabled(STRICT_TYPE_CHECKING, true);
  this.total = reader.getRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(options, total);
  this.filterRecords = options.useRecordFilter();
  reader.setRequestedSchema(requestedSchema);
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
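// For callers of the initializer above, the ParquetReadOptions are typically
// built from a Hadoop Configuration via parquet-mr's HadoopReadOptions builder,
// which is what makes the instanceof branch above recover the original conf.
// A minimal sketch; the file path is illustrative.
Configuration conf = new Configuration();
ParquetReadOptions options = HadoopReadOptions.builder(conf).build();
try (ParquetFileReader reader = ParquetFileReader.open(
    HadoopInputFile.fromPath(new Path("/tmp/example.parquet"), conf), options)) {
  initialize(reader, options); // hands both pieces to the method above
}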
// Copy the input to a temp file so parquet-mr can do seekable reads on it
// (the original leaked the FileOutputStream; try-with-resources closes it).
try (FileOutputStream out = new FileOutputStream(tempFile)) {
  IOUtils.copy(in, out);
}
Configuration conf = new Configuration();
Path path = new Path(tempFile.getAbsolutePath());
ParquetMetadata metaData = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);
MessageType schema = metaData.getFileMetaData().getSchema();
parserHandler.setTotalRows(getRowCount(metaData));

try (ParquetReader<SimpleRecord> reader = ParquetReader.builder(new SimpleReadSupport(),
    new Path(tempFile.getAbsolutePath())).build()) {
  parserHandler.newSheet("");
  int rowNum = 0;
  SimpleRecord record;
  // assumption: the original snippet elides this read loop; only its body survived
  while ((record = reader.read()) != null) {
    Map<String, Object> row = new LinkedHashMap<>();
    for (SimpleRecord.NameValue nameValue : record.getValues()) {
      String name = nameValue.getName();
      Object val = nameValue.getValue();
      if (!(val instanceof SimpleRecord)) {
        // map the raw value through the file schema's type for this column
        Type type = schema.getType(schema.getFieldIndex(name));
        row.put(name, getRecordValue(val, type));
      }
    }
    // ... the original snippet elides handing the row to parserHandler and
    // advancing rowNum ...
  }
}
final ParquetMetadata metadata;
final UserGroupInformation processUserUgi = ImpersonationUtil.getProcessUserUGI();
final Configuration conf = new Configuration(fs.getConf());
try {
  // Read the footer as the process user rather than the query user.
  metadata = processUserUgi.doAs((PrivilegedExceptionAction<ParquetMetadata>) () -> {
    try (ParquetFileReader parquetFileReader =
        ParquetFileReader.open(HadoopInputFile.fromStatus(file, conf), readerConfig.toReadOptions())) {
      return parquetFileReader.getFooter();
    }
  });
} catch (IOException | InterruptedException e) {
  // assumption: the original snippet elides the error handling for the doAs call
  throw new IOException("Failed to read footer for " + file.getPath(), e);
}

MessageType schema = metadata.getFileMetaData().getSchema();
Map<SchemaPath, ColTypeInfo> colTypeInfoMap = new HashMap<>();
for (String[] path : schema.getPaths()) {
  colTypeInfoMap.put(SchemaPath.getCompoundPath(path), getColTypeInfo(schema, schema, path, 0));
}

logger.debug("Contains corrupt dates: {}.", containsCorruptDates);
for (BlockMetaData rowGroup : metadata.getBlocks()) {
  List<ColumnMetadata_v3> columnMetadataList = new ArrayList<>();
  long length = 0;
  // ... per-column chunk metadata collection elided in the original snippet ...
}
String path = Path.getPathWithoutSchemeAndAuthority(file.getPath()).toString();
if (rowGroupOffsets == null) {
  // assumption: this guard is elided in the original snippet; the else branch implies it
  footer = readFooter(configuration, file, range(split.getStart(), split.getEnd()));
  MessageType fileSchema = footer.getFileMetaData().getSchema();
  FilterCompat.Filter filter = getFilter(configuration);
  blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
} else {
  // Row-group offsets were handed in up front: read the whole footer and keep
  // only the row groups that start at one of the requested offsets.
  footer = readFooter(configuration, file, NO_FILTER);
  Set<Long> offsets = new HashSet<>();
  for (long offset : rowGroupOffsets) {
    offsets.add(offset);
  }
  blocks = new ArrayList<>();
  for (BlockMetaData block : footer.getBlocks()) {
    if (offsets.contains(block.getStartingPos())) {
      blocks.add(block);
    }
  }
}
this.fileSchema = footer.getFileMetaData().getSchema();
Map<String, String> fileMetadata = footer.getFileMetaData().getKeyValueMetaData();
ReadSupport<T> readSupport = getReadSupportInstance(getReadSupportClass(configuration));
ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
    configuration, toSetMultiMap(fileMetadata), fileSchema));
this.requestedSchema = readContext.getRequestedSchema();
String sparkRequestedSchemaString =
    configuration.get(ParquetReadSupport$.MODULE$.SPARK_ROW_REQUESTED_SCHEMA());
this.sparkSchema = StructType$.MODULE$.fromString(sparkRequestedSchemaString);
this.reader = new ParquetFileReader(
    configuration, footer.getFileMetaData(), file, blocks, requestedSchema.getColumns());
for (BlockMetaData block : reader.getRowGroups()) {
  this.totalRowCount += block.getRowCount();
}
private ParquetMetadata readFooter(Configuration conf, String path, ParquetReaderConfig readerConfig) throws IOException {
  try (ParquetFileReader reader = ParquetFileReader.open(
      HadoopInputFile.fromPath(new Path(path), readerConfig.addCountersToConf(conf)),
      readerConfig.toReadOptions())) {
    return reader.getFooter();
  }
}
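// The helper above is a thin wrapper; the same one-shot footer read works
// standalone with plain parquet-mr types. A minimal sketch, assuming only
// Hadoop and parquet-mr on the classpath; the file path is illustrative.
Configuration conf = new Configuration();
try (ParquetFileReader reader = ParquetFileReader.open(
    HadoopInputFile.fromPath(new Path("/data/example.parquet"), conf))) {
  ParquetMetadata footer = reader.getFooter();
  for (BlockMetaData block : footer.getBlocks()) {
    // each block is one row group; print where it starts and how many rows it holds
    System.out.println("row group @ " + block.getStartingPos() + ": "
        + block.getRowCount() + " rows");
  }
}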
public static ParquetMetadata readMetadata(Configuration conf, Path parquetFilePath) {
  ParquetMetadata footer;
  try {
    // TODO(vc): Should we use the parallel reading version here?
    footer = ParquetFileReader
        .readFooter(getFs(parquetFilePath.toString(), conf).getConf(), parquetFilePath);
  } catch (IOException e) {
    throw new HoodieIOException("Failed to read footer for parquet " + parquetFilePath, e);
  }
  return footer;
}
final ParquetMetadata parquetMetadata =
    ParquetFileReader.readFooter(fs.getConf(), filePath, ParquetMetadataConverter.NO_FILTER);
if (parquetMetadata.getBlocks().size() > 1) {
  throw new IOException(
      format("Global dictionaries can only be built on a parquet file with a single row group, found %d row groups for file %s",
          parquetMetadata.getBlocks().size(), filePath));
}
final BlockMetaData rowGroupMetadata = parquetMetadata.getBlocks().get(0);
// Index the column descriptors by path so column chunks can be matched up below.
final Map<ColumnPath, ColumnDescriptor> columnDescriptorMap = Maps.newHashMap();
for (ColumnDescriptor columnDescriptor : parquetMetadata.getFileMetaData().getSchema().getColumns()) {
  columnDescriptorMap.put(ColumnPath.get(columnDescriptor.getPath()), columnDescriptor);
}
try (final FSDataInputStream in = fs.open(filePath)) {
  for (ColumnChunkMetaData columnChunkMetaData : rowGroupMetadata.getColumns()) {
    if (isBinaryType(columnChunkMetaData.getType())) {
      // ... dictionary construction for binary columns elided in the original snippet ...
    }
  }
}
public void initialize(ParquetFileReader reader, Configuration configuration) throws IOException {
  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  this.total = reader.getRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  this.filterRecords = configuration.getBoolean(RECORD_FILTERING_ENABLED, true);
  reader.setRequestedSchema(requestedSchema);
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
public void initialize(FileMetaData parquetFileMetadata,
    Path file, List<BlockMetaData> blocks, Configuration configuration)
    throws IOException {
  // initialize a ReadContext for this file
  // set the file schema before readSupport.init consults it (the original
  // assigned it afterwards, passing an unset field into InitContext)
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.file = file;
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
cacheKey = HdfsUtils.getFileId(file.getFileSystem(configuration), file,
    HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_CACHE_ALLOW_SYNTHETIC_FILEID),
    HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_CACHE_DEFAULT_FS_FILE_ID));
// note: the argument list above is truncated in the original snippet
FileSystem fs = file.getFileSystem(configuration);
if (cacheKey instanceof Long && HiveConf.getBoolVar(
    cacheConf, ConfVars.LLAP_IO_USE_FILEID_PATH)) {
  // ... the original snippet elides swapping in a fileId-based path here ...
}
// ... the footer read itself is elided; row groups are then selected either by
// filter predicate or by pre-computed offsets ...
if (rowGroupOffsets == null) {
  MessageType fileSchema = footer.getFileMetaData().getSchema();
  FilterCompat.Filter filter = getFilter(configuration);
  blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
} else {
  for (BlockMetaData block : footer.getBlocks()) {
    if (offsets.contains(block.getStartingPos())) {
      blocks.add(block);
    }
  }
}
for (BlockMetaData block : blocks) {
  this.totalRowCount += block.getRowCount();
}
this.fileSchema = footer.getFileMetaData().getSchema();
this.reader = new ParquetFileReader(
    configuration, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns());
if (oldSplit instanceof FileSplit) {
  final Path finalPath = ((FileSplit) oldSplit).getPath();
  jobConf = projectionPusher.pushProjectionsAndFilters(conf, finalPath.getParent());

  final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(jobConf, finalPath);
  final List<BlockMetaData> blocks = parquetMetadata.getBlocks();
  final FileMetaData fileMetaData = parquetMetadata.getFileMetaData();

  final ReadSupport.ReadContext readContext = new DataWritableReadSupport().init(
      new InitContext(jobConf, null, fileMetaData.getSchema()));
  schemaSize = MessageTypeParser.parseMessageType(readContext.getReadSupportMetadata()
      .get(DataWritableReadSupport.HIVE_TABLE_AS_PARQUET_SCHEMA)).getFieldCount();

  final List<BlockMetaData> splitGroup = new ArrayList<BlockMetaData>();
  final long splitStart = ((FileSplit) oldSplit).getStart();
  final long splitLength = ((FileSplit) oldSplit).getLength();
  // Keep only the row groups whose first data page falls inside this split.
  for (final BlockMetaData block : blocks) {
    final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
    if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
      splitGroup.add(block);
    }
  }

  FilterCompat.Filter filter = setFilter(jobConf, fileMetaData.getSchema());
  if (filter != null) {
    filtedBlocks = RowGroupFilter.filterRowGroups(filter, splitGroup, fileMetaData.getSchema());
    if (filtedBlocks.isEmpty()) {
      LOG.debug("All row groups are dropped due to filter predicates");
      return null;
    }
  } else {
    filtedBlocks = splitGroup;
  }

  return new ParquetInputSplit(finalPath,
      splitStart,
      splitLength,
      oldSplit.getLocations(),
      filtedBlocks,
      readContext.getRequestedSchema().toString(),
      fileMetaData.getSchema().toString(),
      fileMetaData.getKeyValueMetaData(),
      readContext.getReadSupportMetadata());
}
// ... the original snippet elides the non-FileSplit branch ...
/**
 * Specifically reads a given summary file
 * @param configuration a configuration
 * @param summaryStatus file status for a summary file
 * @return the metadata translated for each file
 * @throws IOException if an exception is thrown while reading the summary file
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static List<Footer> readSummaryFile(Configuration configuration, FileStatus summaryStatus) throws IOException {
  final Path parent = summaryStatus.getPath().getParent();
  ParquetMetadata mergedFooters = readFooter(configuration, summaryStatus, filter(false));
  return footersFromSummaryFile(parent, mergedFooters);
}
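// Since summary files are deprecated, this path is mostly hit when reading
// legacy directories. A hedged usage sketch: the directory is illustrative,
// and Footer#getFile / Footer#getParquetMetadata are the standard accessors
// for the per-file entries the merged footer expands into.
Configuration conf = new Configuration();
Path summaryPath = new Path("/data/legacy_table/_metadata"); // illustrative path
FileSystem fs = summaryPath.getFileSystem(conf);
for (Footer footer : readSummaryFile(conf, fs.getFileStatus(summaryPath))) {
  System.out.println(footer.getFile() + " -> "
      + footer.getParquetMetadata().getBlocks().size() + " row groups");
}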