public MapredParquetInputFormat() {
  this(new ParquetInputFormat<ArrayWritable>(DataWritableReadSupport.class));
}
@Override
public RecordReader<Void, Tuple> createRecordReader(
    InputSplit inputSplit,
    TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
  // for local mode we don't want to keep that around
  inputFormatCache.remove(location);
  return super.createRecordReader(inputSplit, taskAttemptContext);
}
};
private void setPredicatePushdown(JobConf jobConf) {
  if (this.config.filterPredicate != null) {
    ParquetInputFormat.setFilterPredicate(jobConf, this.config.filterPredicate);
  }
}
/**
 * Returns a non-null Filter, which is a wrapper around either a
 * FilterPredicate, an UnboundRecordFilter, or a no-op filter.
 */
public static Filter getFilter(Configuration conf) {
  return FilterCompat.get(getFilterPredicate(conf), getUnboundRecordFilterInstance(conf));
}
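A minimal usage sketch for the filter API above, assuming the org.apache.parquet package names (older releases use the bare parquet. prefix); the column name "age" and the threshold 18 are invented for illustration, not part of the code being shown.

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.filter2.compat.FilterCompat.Filter;
import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.hadoop.ParquetInputFormat;

public class FilterSetupSketch {
  public static Filter buildFilter() {
    Configuration conf = new Configuration();
    // Keep only rows whose INT32 column "age" (hypothetical) is greater than 18.
    FilterPredicate pred = FilterApi.gt(FilterApi.intColumn("age"), 18);
    ParquetInputFormat.setFilterPredicate(conf, pred);
    // getFilter never returns null: with nothing configured it falls back to the no-op filter.
    return ParquetInputFormat.getFilter(conf);
  }
}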
/**
 * {@inheritDoc}
 */
@Override
public RecordReader<Void, T> createRecordReader(
    InputSplit inputSplit,
    TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
  Configuration conf = ContextUtil.getConfiguration(taskAttemptContext);
  ReadSupport<T> readSupport = getReadSupport(conf);
  return new ParquetRecordReader<T>(readSupport, getFilter(conf));
}
/**
 * {@inheritDoc}
 */
@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException {
  Configuration configuration = ContextUtil.getConfiguration(jobContext);
  List<InputSplit> splits = new ArrayList<InputSplit>();

  if (isTaskSideMetaData(configuration)) {
    // Although not required by the API, some clients may depend on always
    // receiving ParquetInputSplit. Translation is required at some point.
    for (InputSplit split : super.getSplits(jobContext)) {
      Preconditions.checkArgument(split instanceof FileSplit,
          "Cannot wrap non-FileSplit: " + split);
      splits.add(ParquetInputSplit.from((FileSplit) split));
    }
    return splits;
  } else {
    splits.addAll(getSplits(configuration, getFooters(jobContext)));
  }

  return splits;
}
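The branch above is driven by isTaskSideMetaData(configuration). A small sketch of flipping that switch, assuming the property name is "parquet.task.side.metadata" (the key commonly bound to TASK_SIDE_METADATA; verify against your release).

import org.apache.hadoop.conf.Configuration;

public class SplitPlanningToggleSketch {
  public static Configuration clientSidePlanning() {
    Configuration conf = new Configuration();
    // false: read footers on the submitting client and use the footer-based planning;
    // true: keep the plain FileSplits from FileInputFormat and wrap them as ParquetInputSplit.
    conf.setBoolean("parquet.task.side.metadata", false);
    return conf;
  }
}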
/**
 * Create Hadoop job according to arguments from main.
 */
@Override
public synchronized Job createJob(String[] args) throws IOException {
  Job job = super.createJob(args);
  // For a Parquet-format job we have to append the Parquet schema field. We can only set
  // parquet.pig.schema here because of the 'Job' dependency, while the other two required
  // list parameters are set in TrainModelProcessor.
  @SuppressWarnings("rawtypes")
  final GlobalMetaData globalMetaData = new ParquetInputFormat().getGlobalMetaData(job);
  Schema schema = getPigSchemaFromMultipleFiles(globalMetaData.getSchema(),
      globalMetaData.getKeyValueMetaData());
  String schemaStr = pigSchemaToString(schema);
  job.getConfiguration().set("parquet.pig.schema", schemaStr);
  return job;
}
@SuppressWarnings("rawtypes") @Override public void sourceConfInit(FlowProcess<JobConf> fp, Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) { if (filterPredicate != null) { ParquetInputFormat.setFilterPredicate(jobConf, filterPredicate); } jobConf.setInputFormat(DeprecatedParquetInputFormat.class); ParquetInputFormat.setReadSupportClass(jobConf, TupleReadSupport.class); TupleReadSupport.setRequestedFields(jobConf, getSourceFields()); }
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  if (isTaskSideMetaData(job)) {
    return super.getSplits(job, numSplits);
  }

  List<Footer> footers = getFooters(job);
  List<ParquetInputSplit> splits = realInputFormat.getSplits(job, footers);
  if (splits == null) {
    return null;
  }

  InputSplit[] resultSplits = new InputSplit[splits.size()];
  int i = 0;
  for (ParquetInputSplit split : splits) {
    resultSplits[i++] = new ParquetInputSplitWrapper(split);
  }
  return resultSplits;
}
/**
 * @param configuration the configuration to connect to the file system
 * @param footers the footers of the files to read
 * @return the splits for the footers
 * @throws IOException
 * @deprecated split planning using file footers will be removed
 */
@Deprecated
public List<ParquetInputSplit> getSplits(Configuration configuration, List<Footer> footers)
    throws IOException {
  boolean strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  final long maxSplitSize = configuration.getLong("mapred.max.split.size", Long.MAX_VALUE);
  final long minSplitSize = Math.max(getFormatMinSplitSize(),
      configuration.getLong("mapred.min.split.size", 0L));
  if (maxSplitSize < 0 || minSplitSize < 0) {
    throw new ParquetDecodingException("maxSplitSize or minSplitSize should not be negative: maxSplitSize = "
        + maxSplitSize + "; minSplitSize = " + minSplitSize);
  }
  GlobalMetaData globalMetaData = ParquetFileWriter.getGlobalMetaData(footers, strictTypeChecking);
  ReadContext readContext = getReadSupport(configuration).init(new InitContext(
      configuration, globalMetaData.getKeyValueMetaData(), globalMetaData.getSchema()));
  return new ClientSideMetadataSplitStrategy().getSplits(
      configuration, footers, maxSplitSize, minSplitSize, readContext);
}
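A short sketch of bounding split sizes for this footer-based planning, using the legacy mapred.* property names read above; the 256 MB and 64 MB values are arbitrary examples, not recommendations.

import org.apache.hadoop.conf.Configuration;

public class SplitSizeSketch {
  public static Configuration boundedSplits() {
    Configuration conf = new Configuration();
    // Row groups are packed into splits no larger than maxSplitSize,
    // and the planner aims for at least minSplitSize per split.
    conf.setLong("mapred.max.split.size", 256L * 1024 * 1024);
    conf.setLong("mapred.min.split.size", 64L * 1024 * 1024);
    return conf;
  }
}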
List<FileStatus> statuses = listStatus(jobContext);
if (statuses.isEmpty()) {
  return Collections.emptyList();
}

List<Footer> newFooters = getFooters(config, missingStatuses);
for (Footer newFooter : newFooters) {
public RecordReaderWrapper(
    InputSplit oldSplit, JobConf oldJobConf, Reporter reporter) throws IOException {
  splitLen = oldSplit.getLength();

  try {
    realReader = new ParquetRecordReader<V>(
        ParquetInputFormat.<V>getReadSupportInstance(oldJobConf),
        ParquetInputFormat.getFilter(oldJobConf));

    if (oldSplit instanceof ParquetInputSplitWrapper) {
      realReader.initialize(((ParquetInputSplitWrapper) oldSplit).realSplit, oldJobConf, reporter);
    } else if (oldSplit instanceof FileSplit) {
      realReader.initialize((FileSplit) oldSplit, oldJobConf, reporter);
    } else {
      throw new IllegalArgumentException(
          "Invalid split (not a FileSplit or ParquetInputSplitWrapper): " + oldSplit);
    }

    // read once to gain access to key and value objects
    if (realReader.nextKeyValue()) {
      firstRecord = true;
      valueContainer = new Container<V>();
      valueContainer.set(realReader.getCurrentValue());
    } else {
      eof = true;
    }
  } catch (InterruptedException e) {
    Thread.interrupted();
    throw new IOException(e);
  }
}
private void initSchema(Job job) throws IOException {
  if (schema != null) {
    return;
  }
  if (schema == null && requestedSchema != null) {
    // this is only true in front-end
    schema = requestedSchema;
  }
  if (schema == null) {
    // no requested schema => use the schema from the file
    final GlobalMetaData globalMetaData = getParquetInputFormat().getGlobalMetaData(job);
    schema = getPigSchemaFromMultipleFiles(globalMetaData.getSchema(), globalMetaData.getKeyValueMetaData());
  }
  if (isElephantBirdCompatible(job)) {
    convertToElephantBirdCompatibleSchema(schema);
  }
}
public List<Footer> getFooters(Configuration configuration, List<FileStatus> statuses) throws IOException {
  return getFooters(configuration, (Collection<FileStatus>) statuses);
}
    throws IOException {
  List<ParquetInputSplit> splits = new ArrayList<ParquetInputSplit>();
  Filter filter = ParquetInputFormat.getFilter(configuration);
public static void setUnboundRecordFilter(Job job, Class<? extends UnboundRecordFilter> filterClass) {
  Configuration conf = ContextUtil.getConfiguration(job);
  checkArgument(getFilterPredicate(conf) == null,
      "You cannot provide an UnboundRecordFilter after providing a FilterPredicate");

  conf.set(UNBOUND_RECORD_FILTER, filterClass.getName());
}
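A hedged sketch of the older, record-level filter that setUnboundRecordFilter registers. AdultsOnlyFilter is a made-up class name, and ColumnRecordFilter / ColumnPredicates are assumed to live under org.apache.parquet.filter (older releases use the parquet.filter package); the "age" column is again purely illustrative.

import org.apache.hadoop.mapreduce.Job;
import org.apache.parquet.column.ColumnReader;
import org.apache.parquet.filter.ColumnPredicates;
import org.apache.parquet.filter.ColumnRecordFilter;
import org.apache.parquet.filter.RecordFilter;
import org.apache.parquet.filter.UnboundRecordFilter;
import org.apache.parquet.hadoop.ParquetInputFormat;

public class AdultsOnlyFilter implements UnboundRecordFilter {
  @Override
  public RecordFilter bind(Iterable<ColumnReader> readers) {
    // Keep records whose hypothetical "age" column equals 18.
    return ColumnRecordFilter.column("age", ColumnPredicates.equalTo(18)).bind(readers);
  }

  public static void configure(Job job) {
    // Fails if a FilterPredicate was already set on the same configuration (see the check above).
    ParquetInputFormat.setUnboundRecordFilter(job, AdultsOnlyFilter.class);
  }
}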
@Override
protected List<FileStatus> listStatus(JobContext jobContext) throws IOException {
  return getAllFileRecursively(super.listStatus(jobContext), ContextUtil.getConfiguration(jobContext));
}
@Override
public ResourceStatistics getStatistics(String location, Job job) throws IOException {
  if (DEBUG) LOG.debug("LoadMetadata.getStatistics(" + location + ", " + job + ")");
  /* We need to call setInput since setLocation is not guaranteed to be called before this */
  setInput(location, job);
  long length = 0;
  try {
    for (InputSplit split : getParquetInputFormat().getSplits(job)) {
      length += split.getLength();
    }
  } catch (InterruptedException e) {
    LOG.warn("Interrupted: ", e);
    return null;
  }
  ResourceStatistics stats = new ResourceStatistics();
  // TODO use pig-0.12 setBytes api when it's available
  stats.setmBytes(length / 1024 / 1024);
  return stats;
}
/**
 * @param jobContext the current job context
 * @return the merged metadata from the footers
 * @throws IOException
 */
public GlobalMetaData getGlobalMetaData(JobContext jobContext) throws IOException {
  return ParquetFileWriter.getGlobalMetaData(getFooters(jobContext));
}
  Filter filter = getFilter(configuration);
  filteredBlocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
} else {