public Builder copy(ParquetReadOptions options) {
  useSignedStringMinMax(options.useSignedStringMinMax);
  useStatsFilter(options.useStatsFilter);
  useDictionaryFilter(options.useDictionaryFilter);
  useRecordFilter(options.useRecordFilter);
  withRecordFilter(options.recordFilter);
  withMetadataFilter(options.metadataFilter);
  withCodecFactory(options.codecFactory);
  withAllocator(options.allocator);
  for (Map.Entry<String, String> keyValue : options.properties.entrySet()) {
    set(keyValue.getKey(), keyValue.getValue());
  }
  return this;
}
private void initializeInternalReader(ParquetInputSplit split, Configuration configuration)
    throws IOException {
  Path path = split.getPath();
  long[] rowGroupOffsets = split.getRowGroupOffsets();

  // if task.side.metadata is set, rowGroupOffsets is null
  ParquetReadOptions.Builder optionsBuilder = HadoopReadOptions.builder(configuration);
  if (rowGroupOffsets != null) {
    optionsBuilder.withOffsets(rowGroupOffsets);
  } else {
    optionsBuilder.withRange(split.getStart(), split.getEnd());
  }

  // open a reader with the metadata filter
  ParquetFileReader reader = ParquetFileReader.open(
      HadoopInputFile.fromPath(path, configuration), optionsBuilder.build());

  if (rowGroupOffsets != null) {
    // verify a row group was found for each offset
    List<BlockMetaData> blocks = reader.getFooter().getBlocks();
    if (blocks.size() != rowGroupOffsets.length) {
      throw new IllegalStateException(
          "All of the offsets in the split should be found in the file."
          + " expected: " + Arrays.toString(rowGroupOffsets)
          + " found: " + blocks);
    }
  }

  if (!reader.getRowGroups().isEmpty()) {
    checkDeltaByteArrayProblem(
        reader.getFooter().getFileMetaData(), configuration,
        reader.getRowGroups().get(0));
  }

  internalReader.initialize(reader, configuration);
}
/**
 * Reads the metadata block in the footer of the file.
 * @param file an {@link InputFile} to read
 * @param filter the filter to apply to row groups
 * @return the metadata blocks in the footer
 * @throws IOException if an error occurs while reading the file
 * @deprecated will be removed in 2.0.0;
 *             use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
 */
@Deprecated
public static final ParquetMetadata readFooter(InputFile file, MetadataFilter filter)
    throws IOException {
  ParquetReadOptions options;
  if (file instanceof HadoopInputFile) {
    options = HadoopReadOptions.builder(((HadoopInputFile) file).getConfiguration())
        .withMetadataFilter(filter).build();
  } else {
    options = ParquetReadOptions.builder().withMetadataFilter(filter).build();
  }

  try (SeekableInputStream in = file.newStream()) {
    return readFooter(file, options, in);
  }
}
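// A minimal sketch of the non-deprecated replacement for readFooter: open the
// file through ParquetFileReader.open(InputFile, ParquetReadOptions) and take
// the footer from the reader. The helper name readFooterViaOpen is
// hypothetical; open() and getFooter() are the calls used elsewhere here.
private static ParquetMetadata readFooterViaOpen(InputFile file, ParquetReadOptions options)
    throws IOException {
  try (ParquetFileReader reader = ParquetFileReader.open(file, options)) {
    return reader.getFooter();
  }
}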
public ParquetReader<T> build() throws IOException {
  ParquetReadOptions options = optionsBuilder.build();

  if (path != null) {
    FileSystem fs = path.getFileSystem(conf);
    FileStatus stat = fs.getFileStatus(path);

    if (stat.isFile()) {
      return new ParquetReader<>(
          Collections.singletonList((InputFile) HadoopInputFile.fromStatus(stat, conf)),
          options,
          getReadSupport());
    } else {
      List<InputFile> files = new ArrayList<>();
      for (FileStatus fileStatus : fs.listStatus(path, HiddenFileFilter.INSTANCE)) {
        files.add(HadoopInputFile.fromStatus(fileStatus, conf));
      }
      return new ParquetReader<T>(files, options, getReadSupport());
    }

  } else {
    return new ParquetReader<>(Collections.singletonList(file), options, getReadSupport());
  }
}
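// Usage sketch for the Builder methods below; the helper and its arguments are
// hypothetical, but ParquetReader.builder(readSupport, path) is the existing
// factory for this Builder. The boolean setters toggle the corresponding
// row-group filters in the underlying ParquetReadOptions.
static <T> ParquetReader<T> openWithFilters(ReadSupport<T> readSupport, Path path)
    throws IOException {
  return ParquetReader.builder(readSupport, path)
      .useStatsFilter(true)      // filter row groups using column statistics
      .useDictionaryFilter(true) // filter row groups using dictionary pages
      .build();
}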
public Builder<T> withFileRange(long start, long end) {
  optionsBuilder.withRange(start, end);
  return this;
}

public Builder<T> useSignedStringMinMax() {
  optionsBuilder.useSignedStringMinMax();
  return this;
}

public Builder<T> useSignedStringMinMax(boolean useSignedStringMinMax) {
  optionsBuilder.useSignedStringMinMax(useSignedStringMinMax);
  return this;
}

public Builder<T> useStatsFilter(boolean useStatsFilter) {
  optionsBuilder.useStatsFilter(useStatsFilter);
  return this;
}

public Builder<T> useDictionaryFilter() {
  optionsBuilder.useDictionaryFilter();
  return this;
}
/**
 * @param conf a configuration
 * @param file a file path to open
 * @param filter a metadata filter
 * @return a parquet file reader
 * @throws IOException if there is an error while opening the file
 * @deprecated will be removed in 2.0.0; use {@link #open(InputFile,ParquetReadOptions)}
 */
@Deprecated
public static ParquetFileReader open(Configuration conf, Path file, MetadataFilter filter)
    throws IOException {
  return open(HadoopInputFile.fromPath(file, conf),
      HadoopReadOptions.builder(conf).withMetadataFilter(filter).build());
}
public Builder<T> useStatsFilter() {
  optionsBuilder.useStatsFilter();
  return this;
}

public Builder<T> set(String key, String value) {
  optionsBuilder.set(key, value);
  return this;
}

public Builder<T> useRecordFilter() {
  optionsBuilder.useRecordFilter();
  return this;
}

public Builder<T> useDictionaryFilter(boolean useDictionaryFilter) {
  optionsBuilder.useDictionaryFilter(useDictionaryFilter);
  return this;
}

public static Builder builder() {
  return new Builder();
}
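// Sketch of using the plain builder() above for an InputFile that is not a
// HadoopInputFile, mirroring the else branch of readFooter. The helper name
// is hypothetical; NO_FILTER comes from ParquetMetadataConverter, and the
// setters are the ones the reader Builder delegates to in this change.
static ParquetReadOptions defaultOptionsExample() {
  return ParquetReadOptions.builder()
      .useStatsFilter(true)
      .withMetadataFilter(ParquetMetadataConverter.NO_FILTER)
      .build();
}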