/**
 * Builds the {@code ParquetReader} from the accumulated builder state.
 *
 * <p>If a {@code path} was configured, it is resolved against its Hadoop
 * {@code FileSystem}: a regular file is read on its own, while a directory is
 * expanded to every non-hidden file inside it (via
 * {@code HiddenFileFilter.INSTANCE}). Otherwise the explicitly supplied
 * {@code InputFile} is used.
 *
 * @return an open {@link ParquetReader} over the resolved input file(s)
 * @throws IOException if the file system or file status cannot be accessed
 */
public ParquetReader<T> build() throws IOException {
  // Freeze the builder's read options once; shared by every branch below.
  ParquetReadOptions options = optionsBuilder.build();
  if (path != null) {
    FileSystem fs = path.getFileSystem(conf);
    FileStatus stat = fs.getFileStatus(path);
    if (stat.isFile()) {
      // Cast forces singletonList to be List<InputFile>, not List<HadoopInputFile>.
      return new ParquetReader<>(
          Collections.singletonList((InputFile) HadoopInputFile.fromStatus(stat, conf)),
          options,
          getReadSupport());
    } else {
      // Directory: collect every visible (non-hidden) child file.
      List<InputFile> files = new ArrayList<>();
      for (FileStatus fileStatus : fs.listStatus(path, HiddenFileFilter.INSTANCE)) {
        files.add(HadoopInputFile.fromStatus(fileStatus, conf));
      }
      // Use the diamond operator, consistent with the other branches.
      return new ParquetReader<>(files, options, getReadSupport());
    }
  } else {
    // No path configured: fall back to the InputFile given to the builder.
    return new ParquetReader<>(Collections.singletonList(file), options, getReadSupport());
  }
}
}
/**
 * Builds the {@code ParquetReader} from the accumulated builder state.
 *
 * <p>If a {@code path} was configured, a regular file is read directly and a
 * directory is expanded to its non-hidden children; otherwise the explicitly
 * supplied {@code InputFile} is used.
 *
 * @return an open {@link ParquetReader} over the resolved input file(s)
 * @throws IOException if the file system or file status cannot be accessed
 */
public ParquetReader<T> build() throws IOException {
  // Freeze the builder's read options once; shared by every branch below.
  ParquetReadOptions options = optionsBuilder.build();
  if (path != null) {
    FileSystem fs = path.getFileSystem(conf);
    FileStatus stat = fs.getFileStatus(path);
    if (stat.isFile()) {
      // Cast forces singletonList to be List<InputFile>, not List<HadoopInputFile>.
      return new ParquetReader<>(
          Collections.singletonList((InputFile) HadoopInputFile.fromStatus(stat, conf)),
          options,
          getReadSupport());
    } else {
      // Directory: collect every visible (non-hidden) child file.
      List<InputFile> files = new ArrayList<>();
      for (FileStatus fileStatus : fs.listStatus(path, HiddenFileFilter.INSTANCE)) {
        files.add(HadoopInputFile.fromStatus(fileStatus, conf));
      }
      return new ParquetReader<T>(files, options, getReadSupport());
    }
  } else {
    // No path configured: fall back to the InputFile given to the builder.
    return new ParquetReader<>(Collections.singletonList(file), options, getReadSupport());
  }
}
}
/**
 * Opens a reader for a single Hadoop path with a record filter.
 *
 * <p>Delegates to the list-based constructor, wrapping {@code file} as a
 * single {@link HadoopInputFile} and building Hadoop-backed read options
 * carrying the (required, non-null) record filter.
 *
 * @param conf a Hadoop configuration
 * @param file path of the parquet file to read
 * @param readSupport the materialization support for records
 * @param filter a record filter; must not be null
 * @throws IOException if the file cannot be opened
 */
private ParquetReader(Configuration conf,
                      Path file,
                      ReadSupport<T> readSupport,
                      FilterCompat.Filter filter) throws IOException {
  // Cast keeps singletonList typed as List<InputFile> for the target constructor.
  this(Collections.singletonList((InputFile) HadoopInputFile.fromPath(file, conf)),
      HadoopReadOptions.builder(conf)
          .withRecordFilter(checkNotNull(filter, "filter"))
          .build(),
      readSupport);
}
// Freeze the accumulated builder settings into an immutable ParquetReadOptions.
// NOTE(review): lone statement fragment — presumably excerpted from a build()
// method like the ones above; verify against the enclosing context.
ParquetReadOptions options = optionsBuilder.build();
private void initializeInternalReader(ParquetInputSplit split, Configuration configuration) throws IOException { Path path = split.getPath(); long[] rowGroupOffsets = split.getRowGroupOffsets(); // if task.side.metadata is set, rowGroupOffsets is null ParquetReadOptions.Builder optionsBuilder = HadoopReadOptions.builder(configuration); if (rowGroupOffsets != null) { optionsBuilder.withOffsets(rowGroupOffsets); } else { optionsBuilder.withRange(split.getStart(), split.getEnd()); } // open a reader with the metadata filter ParquetFileReader reader = ParquetFileReader.open( HadoopInputFile.fromPath(path, configuration), optionsBuilder.build()); if (rowGroupOffsets != null) { // verify a row group was found for each offset List<BlockMetaData> blocks = reader.getFooter().getBlocks(); if (blocks.size() != rowGroupOffsets.length) { throw new IllegalStateException( "All of the offsets in the split should be found in the file." + " expected: " + Arrays.toString(rowGroupOffsets) + " found: " + blocks); } } if (!reader.getRowGroups().isEmpty()) { checkDeltaByteArrayProblem( reader.getFooter().getFileMetaData(), configuration, reader.getRowGroups().get(0)); } internalReader.initialize(reader, configuration); }
/**
 * Opens a {@link ParquetFileReader} for the given split and hands it to the
 * internal reader.
 *
 * <p>When the split carries explicit row-group offsets they are passed as a
 * metadata filter and validated against the footer; otherwise the split's
 * byte range is used (task-side metadata).
 *
 * @param split the input split describing the file region to read
 * @param configuration the Hadoop configuration used to open the file
 * @throws IOException if the file cannot be opened or initialized
 */
private void initializeInternalReader(ParquetInputSplit split, Configuration configuration)
    throws IOException {
  Path path = split.getPath();
  long[] rowGroupOffsets = split.getRowGroupOffsets();

  // if task.side.metadata is set, rowGroupOffsets is null
  ParquetReadOptions.Builder optionsBuilder = HadoopReadOptions.builder(configuration);
  if (rowGroupOffsets != null) {
    optionsBuilder.withOffsets(rowGroupOffsets);
  } else {
    optionsBuilder.withRange(split.getStart(), split.getEnd());
  }

  // open a reader with the metadata filter
  ParquetFileReader reader = ParquetFileReader.open(
      HadoopInputFile.fromPath(path, configuration), optionsBuilder.build());

  if (rowGroupOffsets != null) {
    // verify a row group was found for each offset
    List<BlockMetaData> blocks = reader.getFooter().getBlocks();
    if (blocks.size() != rowGroupOffsets.length) {
      throw new IllegalStateException(
          "All of the offsets in the split should be found in the file."
          + " expected: " + Arrays.toString(rowGroupOffsets)
          + " found: " + blocks);
    }
  }

  // NOTE(review): only the first row group is checked here — presumably the
  // delta-byte-array problem is file-wide, not per-group; confirm with the helper.
  if (!reader.getRowGroups().isEmpty()) {
    checkDeltaByteArrayProblem(
        reader.getFooter().getFileMetaData(), configuration,
        reader.getRowGroups().get(0));
  }

  internalReader.initialize(reader, configuration);
}
/**
 * Reads the metadata block in the footer of the given file.
 *
 * <p>If the file is a {@link HadoopInputFile}, its configuration is used to
 * build Hadoop-backed read options; otherwise default options are used.
 *
 * @param file an {@link InputFile} to read
 * @param filter the filter to apply to row groups
 * @return the parquet metadata read from the footer
 * @throws IOException if an error occurs while reading the file
 * @deprecated will be removed in 2.0.0;
 * use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
 */
@Deprecated
public static final ParquetMetadata readFooter(InputFile file, MetadataFilter filter)
    throws IOException {
  final ParquetReadOptions options;
  if (file instanceof HadoopInputFile) {
    HadoopInputFile hadoopFile = (HadoopInputFile) file;
    options = HadoopReadOptions.builder(hadoopFile.getConfiguration())
        .withMetadataFilter(filter)
        .build();
  } else {
    options = ParquetReadOptions.builder()
        .withMetadataFilter(filter)
        .build();
  }

  try (SeekableInputStream in = file.newStream()) {
    return readFooter(file, options, in);
  }
}
/**
 * Reads the metadata block in the footer of the file.
 *
 * <p>Hadoop-backed files contribute their own configuration to the read
 * options; any other {@link InputFile} gets default options. The stream is
 * opened and closed within this call.
 *
 * @param file an {@link InputFile} to read
 * @param filter the filter to apply to row groups
 * @return the parquet metadata read from the footer
 * @throws IOException if an error occurs while reading the file
 * @deprecated will be removed in 2.0.0;
 * use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
 */
@Deprecated
public static final ParquetMetadata readFooter(InputFile file, MetadataFilter filter)
    throws IOException {
  ParquetReadOptions options;
  if (file instanceof HadoopInputFile) {
    // Preserve the Hadoop configuration carried by the input file.
    options = HadoopReadOptions.builder(((HadoopInputFile) file).getConfiguration())
        .withMetadataFilter(filter).build();
  } else {
    options = ParquetReadOptions.builder().withMetadataFilter(filter).build();
  }

  // try-with-resources guarantees the stream is closed after the footer is read.
  try (SeekableInputStream in = file.newStream()) {
    return readFooter(file, options, in);
  }
}
/**
 * Opens a parquet file reader for a Hadoop path with a metadata filter.
 *
 * @param conf a configuration
 * @param file a file path to open
 * @param filter a metadata filter
 * @return a parquet file reader
 * @throws IOException if there is an error while opening the file
 * @deprecated will be removed in 2.0.0; use {@link #open(InputFile,ParquetReadOptions)}
 */
@Deprecated
public static ParquetFileReader open(Configuration conf, Path file, MetadataFilter filter)
    throws IOException {
  InputFile inputFile = HadoopInputFile.fromPath(file, conf);
  ParquetReadOptions options = HadoopReadOptions.builder(conf)
      .withMetadataFilter(filter)
      .build();
  return open(inputFile, options);
}
/**
 * Opens a parquet file reader for a Hadoop path with a metadata filter.
 *
 * @param conf a configuration
 * @param file a file path to open
 * @param filter a metadata filter
 * @return a parquet file reader
 * @throws IOException if there is an error while opening the file
 * @deprecated will be removed in 2.0.0; use {@link #open(InputFile,ParquetReadOptions)}
 */
@Deprecated
public static ParquetFileReader open(Configuration conf, Path file, MetadataFilter filter)
    throws IOException {
  // Adapt the legacy (conf, path, filter) signature to the InputFile-based API.
  return open(HadoopInputFile.fromPath(file, conf),
      HadoopReadOptions.builder(conf).withMetadataFilter(filter).build());
}
/**
 * Opens a reader for a single Hadoop path with a record filter.
 *
 * <p>Delegates to the list-based constructor, wrapping {@code file} as a
 * single {@link HadoopInputFile} and building Hadoop-backed read options
 * carrying the (required, non-null) record filter.
 *
 * @param conf a Hadoop configuration
 * @param file path of the parquet file to read
 * @param readSupport the materialization support for records
 * @param filter a record filter; must not be null
 * @throws IOException if the file cannot be opened
 */
private ParquetReader(Configuration conf,
                      Path file,
                      ReadSupport<T> readSupport,
                      FilterCompat.Filter filter) throws IOException {
  // Cast keeps singletonList typed as List<InputFile> for the target constructor.
  this(Collections.singletonList((InputFile) HadoopInputFile.fromPath(file, conf)),
      HadoopReadOptions.builder(conf)
          .withRecordFilter(checkNotNull(filter, "filter"))
          .build(),
      readSupport);
}
/**
 * Open an {@link InputFile file} with default read options.
 *
 * @param file an input file
 * @return an open ParquetFileReader
 * @throws IOException if there is an error while opening the file
 */
public static ParquetFileReader open(InputFile file) throws IOException {
  ParquetReadOptions defaultOptions = ParquetReadOptions.builder().build();
  return new ParquetFileReader(file, defaultOptions);
}
/**
 * Opens a parquet file reader for a Hadoop path with a metadata filter.
 *
 * @param conf the Hadoop Configuration
 * @param file Path to a parquet file
 * @param filter a {@link MetadataFilter} for selecting row groups
 * @throws IOException if the file can not be opened
 * @deprecated will be removed in 2.0.0; use the {@code InputFile}-based constructor.
 */
@Deprecated
public ParquetFileReader(Configuration conf, Path file, MetadataFilter filter)
    throws IOException {
  // Adapt the legacy (conf, path, filter) signature to the InputFile-based constructor.
  this(HadoopInputFile.fromPath(file, conf),
      HadoopReadOptions.builder(conf).withMetadataFilter(filter).build());
}
/**
 * Converts this configuration into a {@link ParquetReadOptions}, carrying
 * over the signed-string min/max setting.
 *
 * @return read options reflecting this configuration
 */
public ParquetReadOptions toReadOptions() {
  ParquetReadOptions.Builder optionsBuilder = ParquetReadOptions.builder();
  optionsBuilder.useSignedStringMinMax(enableStringsSignedMinMax);
  return optionsBuilder.build();
}
/**
 * Opens a parquet file reader for a Hadoop path with a metadata filter.
 *
 * @param conf the Hadoop Configuration
 * @param file Path to a parquet file
 * @param filter a {@link MetadataFilter} for selecting row groups
 * @throws IOException if the file can not be opened
 * @deprecated will be removed in 2.0.0; use the {@code InputFile}-based constructor.
 */
@Deprecated
public ParquetFileReader(Configuration conf, Path file, MetadataFilter filter)
    throws IOException {
  // Adapt the legacy (conf, path, filter) signature to the InputFile-based constructor.
  this(HadoopInputFile.fromPath(file, conf),
      HadoopReadOptions.builder(conf).withMetadataFilter(filter).build());
}
/**
 * Open an {@link InputFile file} with default read options.
 *
 * @param file an input file
 * @return an open ParquetFileReader
 * @throws IOException if there is an error while opening the file
 */
public static ParquetFileReader open(InputFile file) throws IOException {
  // Default ParquetReadOptions: no metadata filter, no record filter.
  return new ParquetFileReader(file, ParquetReadOptions.builder().build());
}