@Override
public void readFields(DataInput in) throws IOException {
  realSplit = new ParquetInputSplit();
  realSplit.readFields(in);
}
private ParquetInputSplit toParquetSplit(InputSplit split) throws IOException {
  if (split instanceof ParquetInputSplit) {
    return (ParquetInputSplit) split;
  } else if (split instanceof FileSplit) {
    return ParquetInputSplit.from((FileSplit) split);
  } else if (split instanceof org.apache.hadoop.mapred.FileSplit) {
    return ParquetInputSplit.from(
        (org.apache.hadoop.mapred.FileSplit) split);
  } else {
    throw new IllegalArgumentException(
        "Invalid split (not a FileSplit or ParquetInputSplit): " + split);
  }
}
}
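A minimal usage sketch of the dispatch above, called from within the same class (the helper is private); the path, offsets, and hosts are hypothetical:

// Hypothetical values; any mapreduce FileSplit can be normalized this way.
FileSplit fileSplit = new FileSplit(
    new Path("/data/example.parquet"),   // hypothetical file
    0L,                                  // start offset
    134217728L,                          // length (illustrative: one 128 MB block)
    new String[] { "host1", "host2" }); // replica hosts
ParquetInputSplit parquetSplit = toParquetSplit(fileSplit); // pass-through if already
                                                            // a ParquetInputSplit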
@Override
public String toString() {
  String hosts;
  try {
    hosts = Arrays.toString(getLocations());
  } catch (Exception e) {
    // IOException/InterruptedException could be thrown
    hosts = "(" + e + ")";
  }
  return this.getClass().getSimpleName() + "{"
      + "part: " + getPath()
      + " start: " + getStart()
      + " end: " + getEnd()
      + " length: " + getLength()
      + " hosts: " + hosts
      + (rowGroupOffsets == null ? "" : (" row groups: " + Arrays.toString(rowGroupOffsets)))
      + "}";
}
@Override
public void initialize(GuaguaFileSplit split) throws IOException {
  ReadSupport<Tuple> readSupport = getReadSupportInstance(this.conf);
  this.parquetRecordReader = new ParquetRecordReader<Tuple>(readSupport, getFilter(this.conf));
  ParquetInputSplit parquetInputSplit = new ParquetInputSplit(
      new Path(split.getPath()),
      split.getOffset(),                     // start
      split.getOffset() + split.getLength(), // end
      split.getLength(),
      null,  // no preferred hosts
      null); // no row-group offsets: row groups are resolved when reading
  try {
    this.parquetRecordReader.initialize(parquetInputSplit, buildContext());
  } catch (InterruptedException e) {
    throw new GuaguaRuntimeException(e);
  }
}
private void initializeInternalReader(ParquetInputSplit split, Configuration configuration)
    throws IOException {
  Path path = split.getPath();
  long[] rowGroupOffsets = split.getRowGroupOffsets();
  List<BlockMetaData> filteredBlocks;
  ParquetMetadata footer;
  // with task-side metadata the split carries no row-group offsets and the
  // filter is applied here; otherwise only the row groups selected on the
  // client (identified by their starting offsets) are kept
  if (rowGroupOffsets == null) {
    footer = readFooter(configuration, path, range(split.getStart(), split.getEnd()));
    MessageType fileSchema = footer.getFileMetaData().getSchema();
    Filter filter = getFilter(configuration);
    filteredBlocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
  } else {
    footer = readFooter(configuration, path, NO_FILTER);
    Set<Long> offsets = new HashSet<Long>();
    for (long offset : rowGroupOffsets) {
      offsets.add(offset);
    }
    filteredBlocks = new ArrayList<BlockMetaData>();
    for (BlockMetaData block : footer.getBlocks()) {
      if (offsets.contains(block.getStartingPos())) {
        filteredBlocks.add(block);
      }
    }
    // verify that every offset listed in the split was found in the footer
    if (filteredBlocks.size() != rowGroupOffsets.length) {
      long[] foundRowGroupOffsets = new long[footer.getBlocks().size()];
      for (int i = 0; i < foundRowGroupOffsets.length; i++) {
        foundRowGroupOffsets[i] = footer.getBlocks().get(i).getStartingPos();
      }
      throw new IllegalStateException(
          "All the offsets listed in the split should be found in the file."
          + " expected: " + Arrays.toString(rowGroupOffsets)
          + " found: " + filteredBlocks
          + " out of: " + Arrays.toString(foundRowGroupOffsets)
          + " in range " + split.getStart() + ", " + split.getEnd());
    }
  }
  // filteredBlocks then seeds the underlying record reader
}
@Override
public long getLength() throws IOException {
  return realSplit.getLength();
}

@Override
public String[] getLocations() throws IOException {
  return realSplit.getLocations();
}
ParquetInputSplit split = new ParquetInputSplit(path, start, start + length, length, null, offsets);
/**
 * Builds a {@code ParquetInputSplit} from a mapreduce {@link FileSplit}.
 *
 * @param split a mapreduce FileSplit
 * @return a ParquetInputSplit
 * @throws IOException if the split's block locations cannot be determined
 */
static ParquetInputSplit from(FileSplit split) throws IOException {
  return new ParquetInputSplit(split.getPath(),
      split.getStart(), split.getStart() + split.getLength(), split.getLength(),
      split.getLocations(), null);
}
/**
 * {@inheritDoc}
 */
@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException {
  Configuration configuration = ContextUtil.getConfiguration(jobContext);
  List<InputSplit> splits = new ArrayList<InputSplit>();

  if (isTaskSideMetaData(configuration)) {
    // Although not required by the API, some clients may depend on always
    // receiving ParquetInputSplit. Translation is required at some point.
    for (InputSplit split : super.getSplits(jobContext)) {
      Preconditions.checkArgument(split instanceof FileSplit,
          "Cannot wrap non-FileSplit: " + split);
      splits.add(ParquetInputSplit.from((FileSplit) split));
    }
    return splits;
  } else {
    splits.addAll(getSplits(configuration, getFooters(jobContext)));
  }

  return splits;
}
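Which branch runs is controlled by the task-side-metadata setting; a driver-side sketch, assuming parquet-hadoop's ParquetInputFormat.setTaskSideMetaData helper and the "parquet.task.side.metadata" configuration key:

Job job = Job.getInstance(new Configuration(), "parquet-read");
// Task-side metadata: the driver hands out plain FileSplits (wrapped into
// ParquetInputSplits above) and row-group filtering happens in the tasks.
ParquetInputFormat.setTaskSideMetaData(job, true);
// Equivalent low-level form, assuming the config key name:
// job.getConfiguration().setBoolean("parquet.task.side.metadata", true);
job.setInputFormatClass(ParquetInputFormat.class);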
/**
 * Builds a {@code ParquetInputSplit} from a mapred
 * {@link org.apache.hadoop.mapred.FileSplit}.
 *
 * @param split a mapred FileSplit
 * @return a ParquetInputSplit
 * @throws IOException if the split's block locations cannot be determined
 */
static ParquetInputSplit from(org.apache.hadoop.mapred.FileSplit split) throws IOException {
  return new ParquetInputSplit(split.getPath(),
      split.getStart(), split.getStart() + split.getLength(), split.getLength(),
      split.getLocations(), null);
}
public ParquetInputSplit getParquetInputSplit(FileStatus fileStatus,
    String requestedSchema, Map<String, String> readSupportMetadata) throws IOException {
  MessageType requested = MessageTypeParser.parseMessageType(requestedSchema);
  // the split length is the total size of the column chunks that belong to
  // the requested (projected) columns only
  long length = 0;
  for (BlockMetaData block : this.getRowGroups()) {
    List<ColumnChunkMetaData> columns = block.getColumns();
    for (ColumnChunkMetaData column : columns) {
      if (requested.containsPath(column.getPath().toArray())) {
        length += column.getTotalSize();
      }
    }
  }
  BlockMetaData lastRowGroup = this.getRowGroups().get(this.getRowGroupCount() - 1);
  long end = lastRowGroup.getStartingPos() + lastRowGroup.getTotalByteSize();
  long[] rowGroupOffsets = new long[this.getRowGroupCount()];
  for (int i = 0; i < rowGroupOffsets.length; i++) {
    rowGroupOffsets[i] = this.getRowGroups().get(i).getStartingPos();
  }
  return new ParquetInputSplit(
      fileStatus.getPath(),
      hdfsBlock.getOffset(), // hdfsBlock is a field of the enclosing split-info class
      end,
      length,
      hdfsBlock.getHosts(),
      rowGroupOffsets);
}
}
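To make the split arithmetic above concrete, a worked example with made-up footer values:

// Hypothetical footer: two row groups in the file.
//   row group 0: startingPos = 4         totalByteSize = 1048572
//   row group 1: startingPos = 1048576   totalByteSize = 524288
// Then:
//   rowGroupOffsets = { 4, 1048576 }
//   end    = 1048576 + 524288 = 1572864   // end of the last row group
//   length = sum of getTotalSize() over the column chunks of the requested
//            columns only, so a narrow projection can make length much
//            smaller than (end - start).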
// skip the timestamp conversion for files not written by parquet-mr
skipTimestampConversion = !Strings.nullToEmpty(fileMetaData.getCreatedBy()).startsWith("parquet-mr");
split = new ParquetInputSplit(finalPath, splitStart, splitLength,