/**
 * Create an {@link org.apache.tez.mapreduce.input.MRInput.MRInputConfigBuilder}
 *
 * @param conf Configuration for the {@link MRInputLegacy}
 * @param inputFormat InputFormat derived class
 * @return {@link org.apache.tez.mapreduce.input.MRInput.MRInputConfigBuilder}
 */
public static MRInputConfigBuilder createConfigBuilder(Configuration conf,
    Class<?> inputFormat) {
  return MRInput.createConfigBuilder(conf, inputFormat)
      .setInputClassName(MRInputLegacy.class.getName());
}
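// Usage sketch (not from the source above; the Configuration values and the
// input path are assumptions): the builder is typically finished with build()
// to produce a DataSourceDescriptor for a vertex that reads via MRInputLegacy.
Configuration conf = new Configuration();
conf.set(FileInputFormat.INPUT_DIR, "/path/to/input"); // hypothetical path
DataSourceDescriptor dataSource =
    MRInputLegacy.createConfigBuilder(conf, TextInputFormat.class).build();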
@Override
public void prepare() {
  LOG.info( "calling {}#start() on: {}", logicalInput.getClass().getSimpleName(), getSource() );

  logicalInput.start();

  Hadoop2TezFlowProcess tezFlowProcess = (Hadoop2TezFlowProcess) FlowProcessWrapper.undelegate( flowProcess );
  TezConfiguration configuration = tezFlowProcess.getConfiguration();

  try {
    reader = (MRReader) logicalInput.getReader();
  } catch( IOException exception ) {
    throw new CascadeException( "unable to get reader", exception );
  }

  // set the cascading.source.path property for the current split;
  // if the split is a TezGroupedSplit, the property currently won't be set
  TezUtil.setSourcePathForSplit( logicalInput, reader, configuration );
}
@Override
public boolean next() throws IOException {
  getContext().notifyProgress();
  return false;
}
void processSplitEvent(InputDataInformationEvent event) throws IOException {
  rrLock.lock();
  try {
    initFromEventInternal(event);
    if (LOG.isDebugEnabled()) {
      LOG.debug(getContext().getSourceVertexName() + " notifying on RecordReader initialized");
    }
    rrInited.signal();
  } finally {
    rrLock.unlock();
  }
}
@Override
public List<Event> initialize() throws IOException {
  super.initialize();
  getContext().inputIsReady();
  this.splitInfoViaEvents = jobConf.getBoolean(MRJobConfig.MR_TEZ_SPLITS_VIA_EVENTS,
      MRJobConfig.MR_TEZ_SPLITS_VIA_EVENTS_DEFAULT);
  LOG.info(getContext().getSourceVertexName() + " using new mapreduce API=" + useNewApi
      + ", split via event=" + splitInfoViaEvents + ", numPhysicalInputs="
      + getNumPhysicalInputs());
  initializeInternal();
  return null;
}
@Override
public void handleEvents(List<Event> inputEvents) throws Exception {
  if (getNumPhysicalInputs() == 0) {
    throw new IllegalStateException(
        "Unexpected event. MRInput has been setup to receive 0 events");
  }
  if (eventReceived || inputEvents.size() != 1) {
    throw new IllegalStateException(
        "MRInput expects only a single input. Received: current eventListSize: "
            + inputEvents.size() + ". Received previous input: " + eventReceived);
  }
  Event event = inputEvents.iterator().next();
  Preconditions.checkArgument(event instanceof InputDataInformationEvent,
      getClass().getSimpleName() + " can only handle a single event of type: "
          + InputDataInformationEvent.class.getSimpleName());
  processSplitEvent((InputDataInformationEvent) event);
}
private NewRecordReader(MRInput in) throws IOException {
  this.in = in;
  this.reader = in.getReader();
}
"Only a single instance of record reader can be created for this input."); readerCreated = true; if (getNumPhysicalInputs() == 0) { return new KeyValueReader() { @Override try { if (!mrReader.isSetup()) checkAndAwaitRecordReaderInitialization(); } finally { rrLock.unlock();
@Override
public void start() {
  Preconditions.checkState(getNumPhysicalInputs() == 0 || getNumPhysicalInputs() == 1,
      "Expecting 0 or 1 physical input for MRInput");
}
@Override
public void run() throws Exception {
  Preconditions.checkArgument(getInputs().size() == 1);
  boolean inUnion = true;
  if (getContext().getTaskVertexName().equals("map3")) {
    inUnion = false;
  }
  Preconditions.checkArgument(getOutputs().size() == (inUnion ? 2 : 1));
  Preconditions.checkArgument(getOutputs().containsKey("checker"));
  MRInput input = (MRInput) getInputs().values().iterator().next();
  KeyValueReader kvReader = input.getReader();
  Output output = getOutputs().get("checker");
  KeyValueWriter kvWriter = (KeyValueWriter) output.getWriter();
  MROutput parts = null;
  KeyValueWriter partsWriter = null;
  if (inUnion) {
    parts = (MROutput) getOutputs().get("parts");
    partsWriter = parts.getWriter();
  }
  while (kvReader.next()) {
    StringTokenizer itr = new StringTokenizer(kvReader.getCurrentValue().toString());
    while (itr.hasMoreTokens()) {
      word.set(itr.nextToken());
      kvWriter.write(word, one);
      if (inUnion) {
        partsWriter.write(word, one);
      }
    }
  }
}
/**
 * Create an {@link org.apache.tez.mapreduce.input.MRInput.MRInputConfigBuilder} for a
 * FileInputFormat
 *
 * @param conf Configuration for the {@link MRInputLegacy}
 * @param inputFormat FileInputFormat derived class
 * @param inputPaths Comma separated input paths
 * @return {@link org.apache.tez.mapreduce.input.MRInput.MRInputConfigBuilder}
 */
public static MRInputConfigBuilder createConfigBuilder(Configuration conf,
    Class<?> inputFormat, String inputPaths) {
  return MRInput.createConfigBuilder(conf, inputFormat, inputPaths)
      .setInputClassName(MRInputLegacy.class.getName());
}
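// Hedged usage sketch for the FileInputFormat variant ("wordcount-vertex",
// tezConf, and the input path are illustrative assumptions, not from this
// source): the resulting descriptor is attached to a DAG vertex by name.
DataSourceDescriptor legacySource = MRInputLegacy
    .createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, "/data/in")
    .build();
Vertex mapVertex = Vertex.create("wordcount-vertex",
    ProcessorDescriptor.create(TokenProcessor.class.getName()));
mapVertex.addDataSource("MRInput", legacySource);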
reader = input.getReader();
void checkAndAwaitRecordReaderInitialization() throws IOException {
  assert rrLock.getHoldCount() == 1;
  rrLock.lock();
  try {
    if (LOG.isDebugEnabled()) {
      LOG.debug(getContext().getSourceVertexName() + " awaiting RecordReader initialization");
    }
    rrInited.await();
  } catch (Exception e) {
    // Chain the cause instead of swallowing it.
    throw new IOException("Interrupted waiting for RecordReader initialization", e);
  } finally {
    rrLock.unlock();
  }
}
inputConf.set("mapred.input.format.class", TextInputFormat.class.getName());
inputConf.set(FileInputFormat.INPUT_DIR, inputPath);
MRInput.MRInputConfigBuilder configurer = MRInput.createConfigBuilder(inputConf, null);
DataSourceDescriptor dataSource = configurer.generateSplitsInAM(false).build();
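// Hedged follow-up (vertex and input names are assumptions): with
// generateSplitsInAM(false) the splits are generated on the client side, and
// the descriptor built above is wired into the consuming vertex like so.
Vertex mapVertex = Vertex.create("map",
    ProcessorDescriptor.create(MapProcessor.class.getName()));
mapVertex.addDataSource("MRInput", dataSource);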
@Override
public List<Event> close() throws IOException {
  mrReader.close();
  long inputRecords = getContext().getCounters()
      .findCounter(TaskCounter.INPUT_RECORDS_PROCESSED).getValue();
  getContext().getStatisticsReporter().reportItemsProcessed(inputRecords);
  return null;
}
Vertex v1 = Vertex.create(VERTEX1, ProcessorDescriptor.create(TokenProcessor.class.getName()));
v1.addDataSource(INPUT,
    MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath1)
        .groupSplits(false).build());
Vertex v2 = Vertex.create(VERTEX2, ProcessorDescriptor.create(TokenProcessor.class.getName()));
v2.addDataSource(INPUT,
    MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath2)
        .groupSplits(false).build());
Vertex v3 = Vertex.create(VERTEX3, ProcessorDescriptor.create(TokenProcessor.class.getName()));
v3.addDataSource(INPUT,
    MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath3)
        .groupSplits(false).build());

CartesianProductConfig cartesianProductConfig;
private void initFromEventInternal(InputDataInformationEvent initEvent) throws IOException {
  if (LOG.isDebugEnabled()) {
    LOG.debug(getContext().getSourceVertexName() + " initializing RecordReader from event");
  }
  // Deserialize the split carried in the event payload, using the new or old API.
  MRSplitProto splitProto =
      MRSplitProto.parseFrom(ByteString.copyFrom(initEvent.getUserPayload()));
  Object splitObj;
  long splitLength;
  if (useNewApi) {
    org.apache.hadoop.mapreduce.InputSplit split =
        MRInputUtils.getNewSplitDetailsFromEvent(splitProto, jobConf);
    try {
      splitLength = split.getLength();
    } catch (InterruptedException e) {
      throw new IOException("Interrupted while querying split length", e);
    }
    splitObj = split;
    if (LOG.isDebugEnabled()) {
      LOG.debug(getContext().getSourceVertexName() + " split Details -> SplitClass: "
          + split.getClass().getName() + ", NewSplit: " + split + ", length: " + splitLength);
    }
  } else {
    org.apache.hadoop.mapred.InputSplit split =
        MRInputUtils.getOldSplitDetailsFromEvent(splitProto, jobConf);
    splitLength = split.getLength();
    splitObj = split;
    if (LOG.isDebugEnabled()) {
      LOG.debug(getContext().getSourceVertexName() + " split Details -> SplitClass: "
          + split.getClass().getName() + ", OldSplit: " + split + ", length: " + splitLength);
    }
  }
  getContext().getCounters().findCounter(TaskCounter.INPUT_SPLIT_LENGTH_BYTES)
      .increment(splitLength);
  mrReader.setSplit(splitObj);
  LOG.info(getContext().getSourceVertexName() + " initialized RecordReader from event");
}
MRInput.MRInputConfigBuilder configBuilder = MRInput.createConfigBuilder( sourceConf, null );
private void initializeInternal() throws IOException {
  rrLock.lock();
  try {
    if (splitInfoViaEvents) {
      // The split arrives later via an InputDataInformationEvent, so the
      // reader is created without a split and initialized when the event lands.
      if (useNewApi) {
        mrReader = new MRReaderMapReduce(jobConf, getContext().getCounters(), inputRecordCounter,
            getContext().getApplicationId().getClusterTimestamp(),
            getContext().getTaskVertexIndex(), getContext().getApplicationId().getId(),
            getContext().getTaskIndex(), getContext().getTaskAttemptNumber(), getContext());
      } else {
        mrReader = new MRReaderMapred(jobConf, getContext().getCounters(), inputRecordCounter,
            getContext());
      }
    } else {
      // Split meta information was persisted to disk; look up this task's split.
      TaskSplitMetaInfo[] allMetaInfo = MRInputUtils.readSplits(jobConf);
      TaskSplitMetaInfo thisTaskMetaInfo = allMetaInfo[getContext().getTaskIndex()];
      TaskSplitIndex splitMetaInfo = new TaskSplitIndex(thisTaskMetaInfo.getSplitLocation(),
          thisTaskMetaInfo.getStartOffset());
      long splitLength = -1;
      if (useNewApi) {
        org.apache.hadoop.mapreduce.InputSplit newInputSplit = MRInputUtils
            .getNewSplitDetailsFromDisk(splitMetaInfo, jobConf, getContext().getCounters()
                .findCounter(TaskCounter.SPLIT_RAW_BYTES));
        try {
          splitLength = newInputSplit.getLength();
        } catch (InterruptedException e) {
          LOG.warn("Interrupted while reading split length", e);
        }
        mrReader = new MRReaderMapReduce(jobConf, newInputSplit, getContext().getCounters(),
            inputRecordCounter, getContext().getApplicationId().getClusterTimestamp(),
            getContext().getTaskVertexIndex(), getContext().getApplicationId().getId(),
            getContext().getTaskIndex(), getContext().getTaskAttemptNumber(), getContext());
      } else {
        org.apache.hadoop.mapred.InputSplit oldInputSplit = MRInputUtils
            .getOldSplitDetailsFromDisk(splitMetaInfo, jobConf, getContext().getCounters()
                .findCounter(TaskCounter.SPLIT_RAW_BYTES));
        splitLength = oldInputSplit.getLength();
        mrReader = new MRReaderMapred(jobConf, oldInputSplit, getContext().getCounters(),
            inputRecordCounter, getContext());
      }
      if (splitLength != -1) {
        getContext().getCounters().findCounter(TaskCounter.INPUT_SPLIT_LENGTH_BYTES)
            .increment(splitLength);
      }
    }
  } finally {
    rrLock.unlock();
  }
}