boolean indexAccess = configuration.getBoolean(
    DataWritableReadSupport.PARQUET_COLUMN_INDEX_ACCESS, false);
this.file = split.getPath();
long[] rowGroupOffsets = split.getRowGroupOffsets();
// read the footer through the metadata cache, limited to this split's byte range
footer = readSplitFooter(
    configuration, file, cacheKey, range(split.getStart(), split.getEnd()), cacheTag);
MessageType fileSchema = footer.getFileMetaData().getSchema();
FilterCompat.Filter filter = getFilter(configuration);
// ... (matching of the client-side rowGroupOffsets against the footer's blocks elided)
// verify a row group was found for each offset listed in the split
if (blocks.size() != rowGroupOffsets.length) {
  throw new IllegalStateException(
      "All the offsets listed in the split should be found in the file."
      + " expected: " + Arrays.toString(rowGroupOffsets)
      + " found: " + blocks
      + " out of: " + Arrays.toString(foundRowGroupOffsets)
      + " in range " + split.getStart() + ", " + split.getEnd());
}
skipTimestampConversion =
    !Strings.nullToEmpty(fileMetaData.getCreatedBy()).startsWith("parquet-mr");
split = new ParquetInputSplit(finalPath, splitStart, splitLength,
    // (remaining constructor arguments not shown in the source)
@Override
public String toString() {
  String hosts;
  try {
    hosts = Arrays.toString(getLocations());
  } catch (Exception e) {
    // IOException/InterruptedException could be thrown
    hosts = "(" + e + ")";
  }
  return this.getClass().getSimpleName() + "{"
      + "part: " + getPath()
      + " start: " + getStart()
      + " end: " + getEnd()
      + " length: " + getLength()
      + " hosts: " + hosts
      + (rowGroupOffsets == null ? "" : (" row groups: " + Arrays.toString(rowGroupOffsets)))
      + "}";
}
/**
 * Checks whether the Parquet schema matches the given Flink schema.
 */
private void checkSchema(Configuration hadoopConf, ParquetInputSplit split) throws IOException {
  ParquetMetadataConverter.MetadataFilter metadataFilter =
      ParquetMetadataConverter.range(split.getStart(), split.getEnd());
  ParquetMetadata parquetMetadata =
      ParquetFileReader.readFooter(hadoopConf, split.getPath(), metadataFilter);
  FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
  MessageType parquetSchema = fileMetaData.getSchema();
  ParquetSchemaConverter schemaConverter = new ParquetSchemaConverter();
  Map<String, InternalType> fieldName2TypeInfoMap =
      schemaConverter.convertToInternalType(parquetSchema);
  for (int i = 0; i < fieldNames.length; ++i) {
    String fieldName = fieldNames[i];
    InternalType fieldType = fieldTypes[i];
    if (!fieldName2TypeInfoMap.containsKey(fieldName)) {
      throw new IllegalArgumentException(fieldName + " cannot be found in the Parquet schema");
    }
    InternalType parquetFieldType = fieldName2TypeInfoMap.get(fieldName);
    if (!fieldType.equals(parquetFieldType)) {
      throw new IllegalArgumentException(parquetFieldType + " cannot be converted to " + fieldType);
    }
  }
}
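The name-existence part of this check can be illustrated with parquet-mr types alone. A minimal sketch, assuming the Flink-specific ParquetSchemaConverter is unavailable; the schema text and field names are made up:

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

// Verify that every expected field name exists in the Parquet schema,
// mirroring the name check above (the type check needs the Flink converter).
public class SchemaNameCheck {
  public static void main(String[] args) {
    MessageType parquetSchema = MessageTypeParser.parseMessageType(
        "message event { required int64 id; optional binary name (UTF8); }");
    for (String fieldName : new String[] {"id", "name", "ts"}) {
      if (!parquetSchema.containsField(fieldName)) {
        throw new IllegalArgumentException(fieldName + " cannot be found in the Parquet schema");
      }
    }
  }
}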
@Override
public void readFields(DataInput in) throws IOException {
  realSplit = new ParquetInputSplit();
  realSplit.readFields(in);
}
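ParquetInputSplit is a Hadoop Writable, so the wrapper above can rehydrate it from any DataInput. A minimal round-trip sketch, assuming the no-arg and six-argument constructors shown elsewhere in this section are public; the path and sizes are arbitrary:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetInputSplit;

// Serialize a split to bytes and read it back, the same way the framework
// ships splits to tasks. rowGroupOffsets is null, as with task-side metadata.
public class SplitRoundTrip {
  public static void main(String[] args) throws IOException {
    ParquetInputSplit original = new ParquetInputSplit(
        new Path("/tmp/data.parquet"), 0L, 4096L, 4096L, new String[0], null);

    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    original.write(new DataOutputStream(bytes));

    ParquetInputSplit copy = new ParquetInputSplit();
    copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
    System.out.println(copy); // uses the toString() shown above
  }
}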
private ParquetInputSplit toParquetSplit(InputSplit split) throws IOException {
  if (split instanceof ParquetInputSplit) {
    return (ParquetInputSplit) split;
  } else if (split instanceof FileSplit) {
    return ParquetInputSplit.from((FileSplit) split);
  } else if (split instanceof org.apache.hadoop.mapred.FileSplit) {
    return ParquetInputSplit.from((org.apache.hadoop.mapred.FileSplit) split);
  } else {
    throw new IllegalArgumentException(
        "Invalid split (not a FileSplit or ParquetInputSplit): " + split);
  }
}
Configuration configuration = taskAttemptContext.getConfiguration();
ParquetInputSplit split = (ParquetInputSplit) inputSplit;
this.file = split.getPath();
long[] rowGroupOffsets = split.getRowGroupOffsets();
footer = readFooter(configuration, file, range(split.getStart(), split.getEnd()));
MessageType fileSchema = footer.getFileMetaData().getSchema();
FilterCompat.Filter filter = getFilter(configuration);
// ... (matching of the client-side rowGroupOffsets against the footer's blocks elided)
// verify a row group was found for each offset listed in the split
if (blocks.size() != rowGroupOffsets.length) {
  throw new IllegalStateException(
      "All the offsets listed in the split should be found in the file."
      + " expected: " + Arrays.toString(rowGroupOffsets)
      + " found: " + blocks
      + " out of: " + Arrays.toString(foundRowGroupOffsets)
      + " in range " + split.getStart() + ", " + split.getEnd());
}
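The range(...) metadata filter keeps only the row groups whose midpoint falls inside the given byte range. A minimal, self-contained sketch of the same footer read; the file path and range are made up:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

// Read only the footer metadata for row groups overlapping [0, 64 MB).
public class RangeFooterRead {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path file = new Path("/tmp/data.parquet"); // hypothetical file
    ParquetMetadata footer = ParquetFileReader.readFooter(
        conf, file, ParquetMetadataConverter.range(0L, 64L * 1024 * 1024));
    System.out.println("row groups in range: " + footer.getBlocks().size());
  }
}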
/**
 * Builds a {@code ParquetInputSplit} from a mapreduce {@link FileSplit}.
 *
 * @param split a mapreduce FileSplit
 * @return a ParquetInputSplit
 * @throws IOException if there is an error while creating the Parquet split
 */
static ParquetInputSplit from(FileSplit split) throws IOException {
  return new ParquetInputSplit(split.getPath(),
      split.getStart(), split.getStart() + split.getLength(), split.getLength(),
      split.getLocations(), null);
}
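A hedged usage sketch: the factory is package-private, so this assumes the calling code lives in org.apache.parquet.hadoop; the path and sizes are arbitrary:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// Wrap a plain FileSplit; the resulting split's end offset is start + length,
// and its row-group offsets stay null until task-side metadata resolves them.
FileSplit fileSplit = new FileSplit(new Path("/tmp/data.parquet"), 0L, 4096L, new String[0]);
ParquetInputSplit parquetSplit = ParquetInputSplit.from(fileSplit);
// parquetSplit.getStart() == 0, parquetSplit.getEnd() == 4096, getRowGroupOffsets() == null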
/**
 * {@inheritDoc}
 */
@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException {
  Configuration configuration = ContextUtil.getConfiguration(jobContext);
  List<InputSplit> splits = new ArrayList<InputSplit>();

  if (isTaskSideMetaData(configuration)) {
    // Although not required by the API, some clients may depend on always
    // receiving ParquetInputSplit. Translation is required at some point.
    for (InputSplit split : super.getSplits(jobContext)) {
      Preconditions.checkArgument(split instanceof FileSplit,
          "Cannot wrap non-FileSplit: " + split);
      splits.add(ParquetInputSplit.from((FileSplit) split));
    }
    return splits;
  } else {
    splits.addAll(getSplits(configuration, getFooters(jobContext)));
  }

  return splits;
}
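Which branch runs is controlled by the parquet.task.side.metadata setting, exposed as ParquetInputFormat.TASK_SIDE_METADATA. A small sketch of toggling it:

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.hadoop.ParquetInputFormat;

Configuration conf = new Configuration();
// true (the default): plain FileSplits are wrapped; footers are read in the tasks.
// false: footers are read up front and splits carry explicit row-group offsets.
conf.setBoolean(ParquetInputFormat.TASK_SIDE_METADATA, true);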
/**
 * Builds a {@code ParquetInputSplit} from a mapred
 * {@link org.apache.hadoop.mapred.FileSplit}.
 *
 * @param split a mapred FileSplit
 * @return a ParquetInputSplit
 * @throws IOException if there is an error while creating the Parquet split
 */
static ParquetInputSplit from(org.apache.hadoop.mapred.FileSplit split) throws IOException {
  return new ParquetInputSplit(split.getPath(),
      split.getStart(), split.getStart() + split.getLength(), split.getLength(),
      split.getLocations(), null);
}
private void initializeInternalReader(ParquetInputSplit split, Configuration configuration)
    throws IOException {
  Path path = split.getPath();
  long[] rowGroupOffsets = split.getRowGroupOffsets();

  // if task.side.metadata is set, rowGroupOffsets is null
  ParquetReadOptions.Builder optionsBuilder = HadoopReadOptions.builder(configuration);
  if (rowGroupOffsets != null) {
    optionsBuilder.withOffsets(rowGroupOffsets);
  } else {
    optionsBuilder.withRange(split.getStart(), split.getEnd());
  }

  // open a reader with the metadata filter
  ParquetFileReader reader = ParquetFileReader.open(
      HadoopInputFile.fromPath(path, configuration), optionsBuilder.build());

  if (rowGroupOffsets != null) {
    // verify a row group was found for each offset
    List<BlockMetaData> blocks = reader.getFooter().getBlocks();
    if (blocks.size() != rowGroupOffsets.length) {
      throw new IllegalStateException(
          "All of the offsets in the split should be found in the file."
          + " expected: " + Arrays.toString(rowGroupOffsets)
          + " found: " + blocks);
    }
  }

  if (!reader.getRowGroups().isEmpty()) {
    checkDeltaByteArrayProblem(
        reader.getFooter().getFileMetaData(), configuration,
        reader.getRowGroups().get(0));
  }

  internalReader.initialize(reader, configuration);
}
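A minimal, self-contained sketch of the same open-with-filter pattern using the newer ParquetReadOptions API; the path and range values are made up:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.HadoopReadOptions;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;

// Open a file so that only row groups inside the byte range are visible.
public class RangeOpen {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path path = new Path("/tmp/data.parquet"); // hypothetical file
    try (ParquetFileReader reader = ParquetFileReader.open(
        HadoopInputFile.fromPath(path, conf),
        HadoopReadOptions.builder(conf).withRange(0L, 128L * 1024 * 1024).build())) {
      System.out.println("row groups in range: " + reader.getRowGroups().size());
    }
  }
}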