boolean indexAccess = configuration.getBoolean(
    DataWritableReadSupport.PARQUET_COLUMN_INDEX_ACCESS, false);
this.file = split.getPath();
long[] rowGroupOffsets = split.getRowGroupOffsets();
// read the footer through the metadata cache, limited to this split's byte range
footer = readSplitFooter(
    configuration, file, cacheKey, range(split.getStart(), split.getEnd()), cacheTag);
MessageType fileSchema = footer.getFileMetaData().getSchema();
FilterCompat.Filter filter = getFilter(configuration);
// ... (matching of the client-side rowGroupOffsets against the footer's blocks elided)
// verify a row group was found for each offset listed in the split
if (blocks.size() != rowGroupOffsets.length) {
  throw new IllegalStateException(
      "All the offsets listed in the split should be found in the file."
      + " expected: " + Arrays.toString(rowGroupOffsets)
      + " found: " + blocks
      + " out of: " + Arrays.toString(foundRowGroupOffsets)
      + " in range " + split.getStart() + ", " + split.getEnd());
}
skipTimestampConversion =
    !Strings.nullToEmpty(fileMetaData.getCreatedBy()).startsWith("parquet-mr");
split = new ParquetInputSplit(finalPath, splitStart, splitLength,
    // (remaining constructor arguments not shown in the source)
@Override
public String toString() {
  String hosts;
  try {
    hosts = Arrays.toString(getLocations());
  } catch (Exception e) {
    // IOException/InterruptedException could be thrown
    hosts = "(" + e + ")";
  }
  return this.getClass().getSimpleName() + "{"
      + "part: " + getPath()
      + " start: " + getStart()
      + " end: " + getEnd()
      + " length: " + getLength()
      + " hosts: " + hosts
      + (rowGroupOffsets == null ? "" : (" row groups: " + Arrays.toString(rowGroupOffsets)))
      + "}";
}
/**
 * Checks whether the Parquet schema matches the given Flink schema.
 */
private void checkSchema(Configuration hadoopConf, ParquetInputSplit split) throws IOException {
  ParquetMetadataConverter.MetadataFilter metadataFilter =
      ParquetMetadataConverter.range(split.getStart(), split.getEnd());
  ParquetMetadata parquetMetadata =
      ParquetFileReader.readFooter(hadoopConf, split.getPath(), metadataFilter);
  FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
  MessageType parquetSchema = fileMetaData.getSchema();
  ParquetSchemaConverter schemaConverter = new ParquetSchemaConverter();
  Map<String, InternalType> fieldName2TypeInfoMap =
      schemaConverter.convertToInternalType(parquetSchema);
  for (int i = 0; i < fieldNames.length; ++i) {
    String fieldName = fieldNames[i];
    InternalType fieldType = fieldTypes[i];
    if (!fieldName2TypeInfoMap.containsKey(fieldName)) {
      throw new IllegalArgumentException(fieldName + " cannot be found in the Parquet schema");
    }
    InternalType parquetFieldType = fieldName2TypeInfoMap.get(fieldName);
    if (!fieldType.equals(parquetFieldType)) {
      throw new IllegalArgumentException(parquetFieldType + " cannot be converted to " + fieldType);
    }
  }
}
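The name-existence part of this check can be illustrated with parquet-mr types alone. A minimal sketch, assuming the Flink-specific ParquetSchemaConverter is unavailable; the schema text and field names are made up:

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

// Verify that every expected field name exists in the Parquet schema,
// mirroring the name check above (the type check needs the Flink converter).
public class SchemaNameCheck {
  public static void main(String[] args) {
    MessageType parquetSchema = MessageTypeParser.parseMessageType(
        "message event { required int64 id; optional binary name (UTF8); }");
    for (String fieldName : new String[] {"id", "name", "ts"}) {
      if (!parquetSchema.containsField(fieldName)) {
        throw new IllegalArgumentException(fieldName + " cannot be found in the Parquet schema");
      }
    }
  }
}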
@Override
public void readFields(DataInput in) throws IOException {
  realSplit = new ParquetInputSplit();
  realSplit.readFields(in);
}
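ParquetInputSplit is a Hadoop Writable, so the wrapper above can rehydrate it from any DataInput. A minimal round-trip sketch, assuming the no-arg and six-argument constructors shown elsewhere in this section are public; the path and sizes are arbitrary:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetInputSplit;

// Serialize a split to bytes and read it back, the same way the framework
// ships splits to tasks. rowGroupOffsets is null, as with task-side metadata.
public class SplitRoundTrip {
  public static void main(String[] args) throws IOException {
    ParquetInputSplit original = new ParquetInputSplit(
        new Path("/tmp/data.parquet"), 0L, 4096L, 4096L, new String[0], null);

    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    original.write(new DataOutputStream(bytes));

    ParquetInputSplit copy = new ParquetInputSplit();
    copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
    System.out.println(copy); // uses the toString() shown above
  }
}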
private ParquetInputSplit toParquetSplit(InputSplit split) throws IOException {
  if (split instanceof ParquetInputSplit) {
    return (ParquetInputSplit) split;
  } else if (split instanceof FileSplit) {
    return ParquetInputSplit.from((FileSplit) split);
  } else if (split instanceof org.apache.hadoop.mapred.FileSplit) {
    return ParquetInputSplit.from((org.apache.hadoop.mapred.FileSplit) split);
  } else {
    throw new IllegalArgumentException(
        "Invalid split (not a FileSplit or ParquetInputSplit): " + split);
  }
}
Configuration configuration = taskAttemptContext.getConfiguration();
ParquetInputSplit split = (ParquetInputSplit) inputSplit;
this.file = split.getPath();
long[] rowGroupOffsets = split.getRowGroupOffsets();
footer = readFooter(configuration, file, range(split.getStart(), split.getEnd()));
MessageType fileSchema = footer.getFileMetaData().getSchema();
FilterCompat.Filter filter = getFilter(configuration);
// ... (matching of the client-side rowGroupOffsets against the footer's blocks elided)
// verify a row group was found for each offset listed in the split
if (blocks.size() != rowGroupOffsets.length) {
  throw new IllegalStateException(
      "All the offsets listed in the split should be found in the file."
      + " expected: " + Arrays.toString(rowGroupOffsets)
      + " found: " + blocks
      + " out of: " + Arrays.toString(foundRowGroupOffsets)
      + " in range " + split.getStart() + ", " + split.getEnd());
}
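The range(...) metadata filter keeps only the row groups whose midpoint falls inside the given byte range. A minimal, self-contained sketch of the same footer read; the file path and range are made up:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

// Read only the footer metadata for row groups overlapping [0, 64 MB).
public class RangeFooterRead {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path file = new Path("/tmp/data.parquet"); // hypothetical file
    ParquetMetadata footer = ParquetFileReader.readFooter(
        conf, file, ParquetMetadataConverter.range(0L, 64L * 1024 * 1024));
    System.out.println("row groups in range: " + footer.getBlocks().size());
  }
}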
/**
 * Builds a {@code ParquetInputSplit} from a mapreduce {@link FileSplit}.
 *
 * @param split a mapreduce FileSplit
 * @return a ParquetInputSplit
 * @throws IOException if there is an error while creating the Parquet split
 */
static ParquetInputSplit from(FileSplit split) throws IOException {
  return new ParquetInputSplit(split.getPath(),
      split.getStart(), split.getStart() + split.getLength(), split.getLength(),
      split.getLocations(), null);
}
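A hedged usage sketch: the factory is package-private, so this assumes the calling code lives in org.apache.parquet.hadoop; the path and sizes are arbitrary:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// Wrap a plain FileSplit; the resulting split's end offset is start + length,
// and its row-group offsets stay null until task-side metadata resolves them.
FileSplit fileSplit = new FileSplit(new Path("/tmp/data.parquet"), 0L, 4096L, new String[0]);
ParquetInputSplit parquetSplit = ParquetInputSplit.from(fileSplit);
// parquetSplit.getStart() == 0, parquetSplit.getEnd() == 4096, getRowGroupOffsets() == null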
/**
 * {@inheritDoc}
 */
@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException {
  Configuration configuration = ContextUtil.getConfiguration(jobContext);
  List<InputSplit> splits = new ArrayList<InputSplit>();

  if (isTaskSideMetaData(configuration)) {
    // Although not required by the API, some clients may depend on always
    // receiving ParquetInputSplit. Translation is required at some point.
    for (InputSplit split : super.getSplits(jobContext)) {
      Preconditions.checkArgument(split instanceof FileSplit,
          "Cannot wrap non-FileSplit: " + split);
      splits.add(ParquetInputSplit.from((FileSplit) split));
    }
    return splits;
  } else {
    splits.addAll(getSplits(configuration, getFooters(jobContext)));
  }

  return splits;
}
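Which branch runs is controlled by the parquet.task.side.metadata setting, exposed as ParquetInputFormat.TASK_SIDE_METADATA. A small sketch of toggling it:

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.hadoop.ParquetInputFormat;

Configuration conf = new Configuration();
// true (the default): plain FileSplits are wrapped; footers are read in the tasks.
// false: footers are read up front and splits carry explicit row-group offsets.
conf.setBoolean(ParquetInputFormat.TASK_SIDE_METADATA, true);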
/**
 * Builds a {@code ParquetInputSplit} from a mapred
 * {@link org.apache.hadoop.mapred.FileSplit}.
 *
 * @param split a mapred FileSplit
 * @return a ParquetInputSplit
 * @throws IOException if there is an error while creating the Parquet split
 */
static ParquetInputSplit from(org.apache.hadoop.mapred.FileSplit split) throws IOException {
  return new ParquetInputSplit(split.getPath(),
      split.getStart(), split.getStart() + split.getLength(), split.getLength(),
      split.getLocations(), null);
}
private void initializeInternalReader(ParquetInputSplit split, Configuration configuration)
    throws IOException {
  Path path = split.getPath();
  long[] rowGroupOffsets = split.getRowGroupOffsets();

  // if task.side.metadata is set, rowGroupOffsets is null
  ParquetReadOptions.Builder optionsBuilder = HadoopReadOptions.builder(configuration);
  if (rowGroupOffsets != null) {
    optionsBuilder.withOffsets(rowGroupOffsets);
  } else {
    optionsBuilder.withRange(split.getStart(), split.getEnd());
  }

  // open a reader with the metadata filter
  ParquetFileReader reader = ParquetFileReader.open(
      HadoopInputFile.fromPath(path, configuration), optionsBuilder.build());

  if (rowGroupOffsets != null) {
    // verify a row group was found for each offset
    List<BlockMetaData> blocks = reader.getFooter().getBlocks();
    if (blocks.size() != rowGroupOffsets.length) {
      throw new IllegalStateException(
          "All of the offsets in the split should be found in the file."
          + " expected: " + Arrays.toString(rowGroupOffsets)
          + " found: " + blocks);
    }
  }

  if (!reader.getRowGroups().isEmpty()) {
    checkDeltaByteArrayProblem(
        reader.getFooter().getFileMetaData(), configuration,
        reader.getRowGroups().get(0));
  }

  internalReader.initialize(reader, configuration);
}
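A minimal, self-contained sketch of the same open-with-filter pattern using the newer ParquetReadOptions API; the path and range values are made up:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.HadoopReadOptions;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;

// Open a file so that only row groups inside the byte range are visible.
public class RangeOpen {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path path = new Path("/tmp/data.parquet"); // hypothetical file
    try (ParquetFileReader reader = ParquetFileReader.open(
        HadoopInputFile.fromPath(path, conf),
        HadoopReadOptions.builder(conf).withRange(0L, 128L * 1024 * 1024).build())) {
      System.out.println("row groups in range: " + reader.getRowGroups().size());
    }
  }
}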