@Override
public void readFields(DataInput in) throws IOException {
  realSplit = new ParquetInputSplit();
  realSplit.readFields(in);
}
private ParquetInputSplit toParquetSplit(InputSplit split) throws IOException {
  if (split instanceof ParquetInputSplit) {
    return (ParquetInputSplit) split;
  } else if (split instanceof FileSplit) {
    return ParquetInputSplit.from((FileSplit) split);
  } else if (split instanceof org.apache.hadoop.mapred.FileSplit) {
    return ParquetInputSplit.from(
        (org.apache.hadoop.mapred.FileSplit) split);
  } else {
    throw new IllegalArgumentException(
        "Invalid split (not a FileSplit or ParquetInputSplit): " + split);
  }
}
}
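A minimal usage sketch of the dispatch above, called from within the same class (the helper is private); the path, offsets, and hosts are hypothetical:

// Hypothetical values; any mapreduce FileSplit can be normalized this way.
FileSplit fileSplit = new FileSplit(
    new Path("/data/example.parquet"),   // hypothetical file
    0L,                                  // start offset
    134217728L,                          // length (illustrative: one 128 MB block)
    new String[] { "host1", "host2" }); // replica hosts
ParquetInputSplit parquetSplit = toParquetSplit(fileSplit); // pass-through if already
                                                            // a ParquetInputSplit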
@Override
public String toString() {
  String hosts;
  try {
    hosts = Arrays.toString(getLocations());
  } catch (Exception e) {
    // IOException/InterruptedException could be thrown
    hosts = "(" + e + ")";
  }
  return this.getClass().getSimpleName() + "{"
      + "part: " + getPath()
      + " start: " + getStart()
      + " end: " + getEnd()
      + " length: " + getLength()
      + " hosts: " + hosts
      + (rowGroupOffsets == null ? "" : (" row groups: " + Arrays.toString(rowGroupOffsets)))
      + "}";
}
@Override
public void initialize(GuaguaFileSplit split) throws IOException {
  ReadSupport<Tuple> readSupport = getReadSupportInstance(this.conf);
  this.parquetRecordReader = new ParquetRecordReader<Tuple>(readSupport, getFilter(this.conf));
  ParquetInputSplit parquetInputSplit = new ParquetInputSplit(
      new Path(split.getPath()),
      split.getOffset(),                     // start
      split.getOffset() + split.getLength(), // end
      split.getLength(),
      null,  // no preferred hosts
      null); // no row-group offsets: row groups are resolved when reading
  try {
    this.parquetRecordReader.initialize(parquetInputSplit, buildContext());
  } catch (InterruptedException e) {
    throw new GuaguaRuntimeException(e);
  }
}
private void initializeInternalReader(ParquetInputSplit split, Configuration configuration)
    throws IOException {
  Path path = split.getPath();
  long[] rowGroupOffsets = split.getRowGroupOffsets();
  List<BlockMetaData> filteredBlocks;
  ParquetMetadata footer;
  // with task-side metadata the split carries no row-group offsets and the
  // filter is applied here; otherwise only the row groups selected on the
  // client (identified by their starting offsets) are kept
  if (rowGroupOffsets == null) {
    footer = readFooter(configuration, path, range(split.getStart(), split.getEnd()));
    MessageType fileSchema = footer.getFileMetaData().getSchema();
    Filter filter = getFilter(configuration);
    filteredBlocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
  } else {
    footer = readFooter(configuration, path, NO_FILTER);
    Set<Long> offsets = new HashSet<Long>();
    for (long offset : rowGroupOffsets) {
      offsets.add(offset);
    }
    filteredBlocks = new ArrayList<BlockMetaData>();
    for (BlockMetaData block : footer.getBlocks()) {
      if (offsets.contains(block.getStartingPos())) {
        filteredBlocks.add(block);
      }
    }
    // verify that every offset listed in the split was found in the footer
    if (filteredBlocks.size() != rowGroupOffsets.length) {
      long[] foundRowGroupOffsets = new long[footer.getBlocks().size()];
      for (int i = 0; i < foundRowGroupOffsets.length; i++) {
        foundRowGroupOffsets[i] = footer.getBlocks().get(i).getStartingPos();
      }
      throw new IllegalStateException(
          "All the offsets listed in the split should be found in the file."
          + " expected: " + Arrays.toString(rowGroupOffsets)
          + " found: " + filteredBlocks
          + " out of: " + Arrays.toString(foundRowGroupOffsets)
          + " in range " + split.getStart() + ", " + split.getEnd());
    }
  }
  // filteredBlocks then seeds the underlying record reader
}
@Override
public long getLength() throws IOException {
  return realSplit.getLength();
}

@Override
public String[] getLocations() throws IOException {
  return realSplit.getLocations();
}
ParquetInputSplit split = new ParquetInputSplit(path, start, start + length, length, null, offsets);
/**
 * Builds a {@code ParquetInputSplit} from a mapreduce {@link FileSplit}.
 *
 * @param split a mapreduce FileSplit
 * @return a ParquetInputSplit
 * @throws IOException if the split's block locations cannot be determined
 */
static ParquetInputSplit from(FileSplit split) throws IOException {
  return new ParquetInputSplit(split.getPath(),
      split.getStart(), split.getStart() + split.getLength(), split.getLength(),
      split.getLocations(), null);
}
/**
 * {@inheritDoc}
 */
@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException {
  Configuration configuration = ContextUtil.getConfiguration(jobContext);
  List<InputSplit> splits = new ArrayList<InputSplit>();

  if (isTaskSideMetaData(configuration)) {
    // Although not required by the API, some clients may depend on always
    // receiving ParquetInputSplit. Translation is required at some point.
    for (InputSplit split : super.getSplits(jobContext)) {
      Preconditions.checkArgument(split instanceof FileSplit,
          "Cannot wrap non-FileSplit: " + split);
      splits.add(ParquetInputSplit.from((FileSplit) split));
    }
    return splits;
  } else {
    splits.addAll(getSplits(configuration, getFooters(jobContext)));
  }

  return splits;
}
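Which branch runs is controlled by the task-side-metadata setting; a driver-side sketch, assuming parquet-hadoop's ParquetInputFormat.setTaskSideMetaData helper and the "parquet.task.side.metadata" configuration key:

Job job = Job.getInstance(new Configuration(), "parquet-read");
// Task-side metadata: the driver hands out plain FileSplits (wrapped into
// ParquetInputSplits above) and row-group filtering happens in the tasks.
ParquetInputFormat.setTaskSideMetaData(job, true);
// Equivalent low-level form, assuming the config key name:
// job.getConfiguration().setBoolean("parquet.task.side.metadata", true);
job.setInputFormatClass(ParquetInputFormat.class);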
/**
 * Builds a {@code ParquetInputSplit} from a mapred
 * {@link org.apache.hadoop.mapred.FileSplit}.
 *
 * @param split a mapred FileSplit
 * @return a ParquetInputSplit
 * @throws IOException if the split's block locations cannot be determined
 */
static ParquetInputSplit from(org.apache.hadoop.mapred.FileSplit split) throws IOException {
  return new ParquetInputSplit(split.getPath(),
      split.getStart(), split.getStart() + split.getLength(), split.getLength(),
      split.getLocations(), null);
}
public ParquetInputSplit getParquetInputSplit(FileStatus fileStatus,
    String requestedSchema, Map<String, String> readSupportMetadata) throws IOException {
  MessageType requested = MessageTypeParser.parseMessageType(requestedSchema);
  // the split length is the total size of the column chunks that belong to
  // the requested (projected) columns only
  long length = 0;
  for (BlockMetaData block : this.getRowGroups()) {
    List<ColumnChunkMetaData> columns = block.getColumns();
    for (ColumnChunkMetaData column : columns) {
      if (requested.containsPath(column.getPath().toArray())) {
        length += column.getTotalSize();
      }
    }
  }
  BlockMetaData lastRowGroup = this.getRowGroups().get(this.getRowGroupCount() - 1);
  long end = lastRowGroup.getStartingPos() + lastRowGroup.getTotalByteSize();
  long[] rowGroupOffsets = new long[this.getRowGroupCount()];
  for (int i = 0; i < rowGroupOffsets.length; i++) {
    rowGroupOffsets[i] = this.getRowGroups().get(i).getStartingPos();
  }
  return new ParquetInputSplit(
      fileStatus.getPath(),
      hdfsBlock.getOffset(), // hdfsBlock is a field of the enclosing split-info class
      end,
      length,
      hdfsBlock.getHosts(),
      rowGroupOffsets);
}
}
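To make the split arithmetic above concrete, a worked example with made-up footer values:

// Hypothetical footer: two row groups in the file.
//   row group 0: startingPos = 4         totalByteSize = 1048572
//   row group 1: startingPos = 1048576   totalByteSize = 524288
// Then:
//   rowGroupOffsets = { 4, 1048576 }
//   end    = 1048576 + 524288 = 1572864   // end of the last row group
//   length = sum of getTotalSize() over the column chunks of the requested
//            columns only, so a narrow projection can make length much
//            smaller than (end - start).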
// skip the timestamp conversion for files not written by parquet-mr
skipTimestampConversion = !Strings.nullToEmpty(fileMetaData.getCreatedBy()).startsWith("parquet-mr");
split = new ParquetInputSplit(finalPath, splitStart, splitLength,