configuration, file, cacheKey, range(split.getStart(), split.getEnd()), cacheTag); MessageType fileSchema = footer.getFileMetaData().getSchema(); FilterCompat.Filter filter = getFilter(configuration);
ParquetMetadata footer = readFooter(config, file, range(0, length));
ParquetMetadata footer = readFooter(config, file, range(0, length));
ParquetMetadata footer = readFooter(config, file, range(0, length));
footer = readFooter(configuration, file, range(split.getStart(), split.getEnd())); MessageType fileSchema = footer.getFileMetaData().getSchema(); FilterCompat.Filter filter = getFilter(configuration);
footer = readFooter(configuration, file, range(split.getStart(), split.getEnd())); MessageType fileSchema = footer.getFileMetaData().getSchema(); FilterCompat.Filter filter = getFilter(configuration);
footer = readFooter(configuration, file, range(split.getStart(), split.getEnd())); MessageType fileSchema = footer.getFileMetaData().getSchema(); FilterCompat.Filter filter = getFilter(configuration);
/**
 * Configures a footer metadata filter restricted to the byte range {@code [start, end)},
 * so only the metadata overlapping that range is read.
 *
 * @param start inclusive start offset of the range, in bytes
 * @param end exclusive end offset of the range, in bytes
 * @return this builder, for chaining
 */
public Builder withRange(long start, long end) {
  metadataFilter = ParquetMetadataConverter.range(start, end);
  return this;
}
/**
 * Limits footer reading to the given byte range {@code [start, end)} by installing
 * a range-based metadata filter on this builder.
 *
 * @param start range start offset in bytes (inclusive)
 * @param end range end offset in bytes (exclusive)
 * @return this builder instance
 */
public Builder withRange(long start, long end) {
  this.metadataFilter = ParquetMetadataConverter.range(start, end);
  return this;
}
/**
 * Checks whether the Parquet schema of the given split matches the configured Flink schema.
 *
 * <p>Every configured field name must exist in the Parquet file schema, and its converted
 * internal type must equal the configured internal type.
 *
 * @param hadoopConf Hadoop configuration used to read the Parquet footer
 * @param split the input split whose file footer is inspected
 * @throws IOException if reading the Parquet footer fails
 * @throws IllegalArgumentException if a configured field is missing from the Parquet schema
 *         or its Parquet-derived type does not equal the configured type
 */
private void checkSchema(Configuration hadoopConf, ParquetInputSplit split) throws IOException {
  // Restrict footer reading to this split's byte range so only relevant metadata is loaded.
  ParquetMetadataConverter.MetadataFilter metadataFilter =
      ParquetMetadataConverter.range(split.getStart(), split.getEnd());
  ParquetMetadata parquetMetadata =
      ParquetFileReader.readFooter(hadoopConf, split.getPath(), metadataFilter);
  FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
  MessageType parquetSchema = fileMetaData.getSchema();
  ParquetSchemaConverter schemaConverter = new ParquetSchemaConverter();
  Map<String, InternalType> fieldName2TypeInfoMap =
      schemaConverter.convertToInternalType(parquetSchema);
  for (int i = 0; i < fieldNames.length; ++i) {
    String fieldName = fieldNames[i];
    InternalType fieldType = fieldTypes[i];
    if (!fieldName2TypeInfoMap.containsKey(fieldName)) {
      // Fixed message grammar ("can not be found" -> "cannot be found").
      throw new IllegalArgumentException(fieldName + " cannot be found in parquet schema");
    }
    InternalType parquetFieldType = fieldName2TypeInfoMap.get(fieldName);
    if (!fieldType.equals(parquetFieldType)) {
      // Fixed message grammar ("can not be convert to" -> "cannot be converted to").
      throw new IllegalArgumentException(
          parquetFieldType + " cannot be converted to " + fieldType);
    }
  }
}
ParquetMetadata footer = readFooter(config, file, range(0, length));
/**
 * Parses the Parquet row groups that belong to chunk {@code cidx} and writes the
 * decoded records into {@code dout}.
 *
 * <p>Only readers backed by a Vec are supported: the underlying chunk is needed to
 * compute the byte range {@code [chunk.start(), chunk.start() + chunk.len())} used
 * to filter the footer metadata down to this chunk's row groups.
 *
 * @param cidx index of the chunk being parsed (used for logging)
 * @param din parse reader; must be an {@code FVecParseReader} (Vec-backed)
 * @param dout writer receiving the parsed records; also returned
 * @return {@code dout}, after all records of this chunk's row groups were written
 * @throws IllegalStateException if {@code din} is not Vec-backed
 * @throws RuntimeException wrapping any {@code IOException} from record reading
 */
@Override
protected final ParseWriter parseChunk(int cidx, ParseReader din, ParseWriter dout) {
  if (! (din instanceof FVecParseReader)) {
    // TODO: Should we modify the interface to expose the underlying chunk for non-streaming parsers?
    throw new IllegalStateException("We only accept parser readers backed by a Vec (no streaming support!).");
  }
  Chunk chunk = ((FVecParseReader) din).getChunk();
  Vec vec = chunk.vec();
  // extract metadata, we want to read only the row groups that have centers in this chunk
  ParquetMetadataConverter.MetadataFilter chunkFilter = ParquetMetadataConverter.range(
      chunk.start(), chunk.start() + chunk.len());
  // NOTE(review): _metadata presumably holds the serialized footer bytes read elsewhere — confirm.
  ParquetMetadata metadata = VecParquetReader.readFooter(_metadata, chunkFilter);
  if (metadata.getBlocks().isEmpty()) {
    // No row group has its mid-point in this chunk; nothing to parse here.
    Log.trace("Chunk #", cidx, " doesn't contain any Parquet block center.");
    return dout;
  }
  Log.info("Processing ", metadata.getBlocks().size(), " blocks of chunk #", cidx);
  VecParquetReader reader = new VecParquetReader(vec, metadata, dout, _setup.getColumnTypes(),
      _keepColumns, _setup.get_parse_columns_indices().length);
  try {
    // read() returns null when the filtered row groups are exhausted.
    Long recordNumber;
    do {
      recordNumber = reader.read();
    } while (recordNumber != null);
  } catch (IOException e) {
    // Parsing is all-or-nothing per chunk: surface the I/O failure as unchecked.
    throw new RuntimeException("Failed to parse records", e);
  }
  return dout;
}
footer = readFooter(configuration, file, range(split.getStart(), split.getEnd())); MessageType fileSchema = footer.getFileMetaData().getSchema(); FilterCompat.Filter filter = getFilter(configuration);
footer = readFooter(configuration, file, range(split.getStart(), split.getEnd())); MessageType fileSchema = footer.getFileMetaData().getSchema(); FilterCompat.Filter filter = getFilter(configuration);