public MapredParquetInputFormat() {
  this(new ParquetInputFormat<ArrayWritable>(DataWritableReadSupport.class));
}
@Override
public RecordReader<Void, Tuple> createRecordReader(
    InputSplit inputSplit,
    TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
  // for local mode we don't want to keep that around
  inputFormatCache.remove(location);
  return super.createRecordReader(inputSplit, taskAttemptContext);
}
};
private void setPredicatePushdown(JobConf jobConf) {
  if (this.config.filterPredicate != null) {
    ParquetInputFormat.setFilterPredicate(jobConf, this.config.filterPredicate);
  }
}
/**
 * Returns a non-null Filter, which is a wrapper around either a
 * FilterPredicate, an UnboundRecordFilter, or a no-op filter.
 */
public static Filter getFilter(Configuration conf) {
  return FilterCompat.get(getFilterPredicate(conf), getUnboundRecordFilterInstance(conf));
}
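A minimal usage sketch for the filter API above, assuming the org.apache.parquet package names (older releases use the bare parquet. prefix); the column name "age" and the threshold 18 are invented for illustration, not part of the code being shown.

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.filter2.compat.FilterCompat.Filter;
import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.hadoop.ParquetInputFormat;

public class FilterSetupSketch {
  public static Filter buildFilter() {
    Configuration conf = new Configuration();
    // Keep only rows whose INT32 column "age" (hypothetical) is greater than 18.
    FilterPredicate pred = FilterApi.gt(FilterApi.intColumn("age"), 18);
    ParquetInputFormat.setFilterPredicate(conf, pred);
    // getFilter never returns null: with nothing configured it falls back to the no-op filter.
    return ParquetInputFormat.getFilter(conf);
  }
}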
/**
 * {@inheritDoc}
 */
@Override
public RecordReader<Void, T> createRecordReader(
    InputSplit inputSplit,
    TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
  Configuration conf = ContextUtil.getConfiguration(taskAttemptContext);
  ReadSupport<T> readSupport = getReadSupport(conf);
  return new ParquetRecordReader<T>(readSupport, getFilter(conf));
}
/**
 * {@inheritDoc}
 */
@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException {
  Configuration configuration = ContextUtil.getConfiguration(jobContext);
  List<InputSplit> splits = new ArrayList<InputSplit>();

  if (isTaskSideMetaData(configuration)) {
    // Although not required by the API, some clients may depend on always
    // receiving ParquetInputSplit. Translation is required at some point.
    for (InputSplit split : super.getSplits(jobContext)) {
      Preconditions.checkArgument(split instanceof FileSplit,
          "Cannot wrap non-FileSplit: " + split);
      splits.add(ParquetInputSplit.from((FileSplit) split));
    }
    return splits;
  } else {
    splits.addAll(getSplits(configuration, getFooters(jobContext)));
  }

  return splits;
}
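The branch above is driven by isTaskSideMetaData(configuration). A small sketch of flipping that switch, assuming the property name is "parquet.task.side.metadata" (the key commonly bound to TASK_SIDE_METADATA; verify against your release).

import org.apache.hadoop.conf.Configuration;

public class SplitPlanningToggleSketch {
  public static Configuration clientSidePlanning() {
    Configuration conf = new Configuration();
    // false: read footers on the submitting client and use the footer-based planning;
    // true: keep the plain FileSplits from FileInputFormat and wrap them as ParquetInputSplit.
    conf.setBoolean("parquet.task.side.metadata", false);
    return conf;
  }
}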
/**
 * Create Hadoop job according to arguments from main.
 */
@Override
public synchronized Job createJob(String[] args) throws IOException {
  Job job = super.createJob(args);
  // For a Parquet-format job we have to append the Parquet schema field. We can only set
  // parquet.pig.schema here because of the 'Job' dependency, while the other two required
  // list parameters are set in TrainModelProcessor.
  @SuppressWarnings("rawtypes")
  final GlobalMetaData globalMetaData = new ParquetInputFormat().getGlobalMetaData(job);
  Schema schema = getPigSchemaFromMultipleFiles(globalMetaData.getSchema(),
      globalMetaData.getKeyValueMetaData());
  String schemaStr = pigSchemaToString(schema);
  job.getConfiguration().set("parquet.pig.schema", schemaStr);
  return job;
}
@SuppressWarnings("rawtypes") @Override public void sourceConfInit(FlowProcess<JobConf> fp, Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) { if (filterPredicate != null) { ParquetInputFormat.setFilterPredicate(jobConf, filterPredicate); } jobConf.setInputFormat(DeprecatedParquetInputFormat.class); ParquetInputFormat.setReadSupportClass(jobConf, TupleReadSupport.class); TupleReadSupport.setRequestedFields(jobConf, getSourceFields()); }
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  if (isTaskSideMetaData(job)) {
    return super.getSplits(job, numSplits);
  }

  List<Footer> footers = getFooters(job);
  List<ParquetInputSplit> splits = realInputFormat.getSplits(job, footers);
  if (splits == null) {
    return null;
  }

  InputSplit[] resultSplits = new InputSplit[splits.size()];
  int i = 0;
  for (ParquetInputSplit split : splits) {
    resultSplits[i++] = new ParquetInputSplitWrapper(split);
  }
  return resultSplits;
}
/**
 * @param configuration the configuration to connect to the file system
 * @param footers the footers of the files to read
 * @return the splits for the footers
 * @throws IOException
 * @deprecated split planning using file footers will be removed
 */
@Deprecated
public List<ParquetInputSplit> getSplits(Configuration configuration, List<Footer> footers)
    throws IOException {
  boolean strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  final long maxSplitSize = configuration.getLong("mapred.max.split.size", Long.MAX_VALUE);
  final long minSplitSize = Math.max(getFormatMinSplitSize(),
      configuration.getLong("mapred.min.split.size", 0L));
  if (maxSplitSize < 0 || minSplitSize < 0) {
    throw new ParquetDecodingException("maxSplitSize or minSplitSize should not be negative: maxSplitSize = "
        + maxSplitSize + "; minSplitSize = " + minSplitSize);
  }
  GlobalMetaData globalMetaData = ParquetFileWriter.getGlobalMetaData(footers, strictTypeChecking);
  ReadContext readContext = getReadSupport(configuration).init(new InitContext(
      configuration, globalMetaData.getKeyValueMetaData(), globalMetaData.getSchema()));
  return new ClientSideMetadataSplitStrategy().getSplits(
      configuration, footers, maxSplitSize, minSplitSize, readContext);
}
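A short sketch of bounding split sizes for this footer-based planning, using the legacy mapred.* property names read above; the 256 MB and 64 MB values are arbitrary examples, not recommendations.

import org.apache.hadoop.conf.Configuration;

public class SplitSizeSketch {
  public static Configuration boundedSplits() {
    Configuration conf = new Configuration();
    // Row groups are packed into splits no larger than maxSplitSize,
    // and the planner aims for at least minSplitSize per split.
    conf.setLong("mapred.max.split.size", 256L * 1024 * 1024);
    conf.setLong("mapred.min.split.size", 64L * 1024 * 1024);
    return conf;
  }
}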
List<FileStatus> statuses = listStatus(jobContext);
if (statuses.isEmpty()) {
  return Collections.emptyList();
}

List<Footer> newFooters = getFooters(config, missingStatuses);
for (Footer newFooter : newFooters) {
public RecordReaderWrapper(
    InputSplit oldSplit, JobConf oldJobConf, Reporter reporter) throws IOException {
  splitLen = oldSplit.getLength();

  try {
    realReader = new ParquetRecordReader<V>(
        ParquetInputFormat.<V>getReadSupportInstance(oldJobConf),
        ParquetInputFormat.getFilter(oldJobConf));

    if (oldSplit instanceof ParquetInputSplitWrapper) {
      realReader.initialize(((ParquetInputSplitWrapper) oldSplit).realSplit, oldJobConf, reporter);
    } else if (oldSplit instanceof FileSplit) {
      realReader.initialize((FileSplit) oldSplit, oldJobConf, reporter);
    } else {
      throw new IllegalArgumentException(
          "Invalid split (not a FileSplit or ParquetInputSplitWrapper): " + oldSplit);
    }

    // read once to gain access to key and value objects
    if (realReader.nextKeyValue()) {
      firstRecord = true;
      valueContainer = new Container<V>();
      valueContainer.set(realReader.getCurrentValue());
    } else {
      eof = true;
    }
  } catch (InterruptedException e) {
    Thread.interrupted();
    throw new IOException(e);
  }
}
private void initSchema(Job job) throws IOException {
  if (schema != null) {
    return;
  }
  if (schema == null && requestedSchema != null) {
    // this is only true in front-end
    schema = requestedSchema;
  }
  if (schema == null) {
    // no requested schema => use the schema from the file
    final GlobalMetaData globalMetaData = getParquetInputFormat().getGlobalMetaData(job);
    schema = getPigSchemaFromMultipleFiles(globalMetaData.getSchema(), globalMetaData.getKeyValueMetaData());
  }
  if (isElephantBirdCompatible(job)) {
    convertToElephantBirdCompatibleSchema(schema);
  }
}
public List<Footer> getFooters(Configuration configuration, List<FileStatus> statuses) throws IOException {
  return getFooters(configuration, (Collection<FileStatus>) statuses);
}
    throws IOException {
  List<ParquetInputSplit> splits = new ArrayList<ParquetInputSplit>();
  Filter filter = ParquetInputFormat.getFilter(configuration);
public static void setUnboundRecordFilter(Job job, Class<? extends UnboundRecordFilter> filterClass) {
  Configuration conf = ContextUtil.getConfiguration(job);
  checkArgument(getFilterPredicate(conf) == null,
      "You cannot provide an UnboundRecordFilter after providing a FilterPredicate");

  conf.set(UNBOUND_RECORD_FILTER, filterClass.getName());
}
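A hedged sketch of the older, record-level filter that setUnboundRecordFilter registers. AdultsOnlyFilter is a made-up class name, and ColumnRecordFilter / ColumnPredicates are assumed to live under org.apache.parquet.filter (older releases use the parquet.filter package); the "age" column is again purely illustrative.

import org.apache.hadoop.mapreduce.Job;
import org.apache.parquet.column.ColumnReader;
import org.apache.parquet.filter.ColumnPredicates;
import org.apache.parquet.filter.ColumnRecordFilter;
import org.apache.parquet.filter.RecordFilter;
import org.apache.parquet.filter.UnboundRecordFilter;
import org.apache.parquet.hadoop.ParquetInputFormat;

public class AdultsOnlyFilter implements UnboundRecordFilter {
  @Override
  public RecordFilter bind(Iterable<ColumnReader> readers) {
    // Keep records whose hypothetical "age" column equals 18.
    return ColumnRecordFilter.column("age", ColumnPredicates.equalTo(18)).bind(readers);
  }

  public static void configure(Job job) {
    // Fails if a FilterPredicate was already set on the same configuration (see the check above).
    ParquetInputFormat.setUnboundRecordFilter(job, AdultsOnlyFilter.class);
  }
}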
@Override
protected List<FileStatus> listStatus(JobContext jobContext) throws IOException {
  return getAllFileRecursively(super.listStatus(jobContext), ContextUtil.getConfiguration(jobContext));
}
@Override
public ResourceStatistics getStatistics(String location, Job job) throws IOException {
  if (DEBUG) LOG.debug("LoadMetadata.getStatistics(" + location + ", " + job + ")");
  /* We need to call setInput since setLocation is not guaranteed to be called before this */
  setInput(location, job);
  long length = 0;
  try {
    for (InputSplit split : getParquetInputFormat().getSplits(job)) {
      length += split.getLength();
    }
  } catch (InterruptedException e) {
    LOG.warn("Interrupted: ", e);
    return null;
  }
  ResourceStatistics stats = new ResourceStatistics();
  // TODO use pig-0.12 setBytes api when it's available
  stats.setmBytes(length / 1024 / 1024);
  return stats;
}
/**
 * @param jobContext the current job context
 * @return the merged metadata from the footers
 * @throws IOException
 */
public GlobalMetaData getGlobalMetaData(JobContext jobContext) throws IOException {
  return ParquetFileWriter.getGlobalMetaData(getFooters(jobContext));
}
  Filter filter = getFilter(configuration);
  filteredBlocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
} else {