/**
 * Builds the source tap for the given source name.
 * <p>
 * The resources registered under {@code sourceName} are turned into taps via
 * {@code createTapFor} using the {@code Role.source} role. A single resulting tap is
 * returned as-is; several are wrapped in one {@link MultiSourceTap}.
 *
 * @param sourceName the name used to look up the source resources
 * @param stereotype the stereotype handed to {@code createTapFor}
 * @return the lone tap, a {@link MultiSourceTap} over all taps, or {@code null} when
 *         {@code createTapFor} returns {@code null}
 */
protected Tap getSourceTapFor( String sourceName, Stereotype<Protocol, Format> stereotype ) {
  List<Resource<Protocol, Format, SinkMode>> sourceResources = getSourceResources( sourceName );
  Tap[] created = createTapFor( stereotype, sourceResources, Role.source );

  if( created == null ) {
    return null;
  }

  // only wrap when there is something other than exactly one tap
  return created.length == 1 ? created[ 0 ] : new MultiSourceTap( created );
}
/**
 * Walks the child taps of the given {@link MultiSourceTap} and passes each one,
 * together with {@code taps}, to {@code accumulate}.
 *
 * @param taps  the set handed through to {@code accumulate} for every child
 * @param value the multi-source tap whose children are visited
 */
public void iterate( Set<Hfs> taps, MultiSourceTap value ) {
  for( Iterator<Tap> children = value.getChildTaps(); children.hasNext(); ) {
    accumulate( taps, children.next() );
  }
}
// Wrap the collected taps in a single MultiSourceTap and delegate the read to it.
return new MultiSourceTap( taps.toArray( new Tap[ taps.size() ] ) ).openForRead( flowProcess, input );
/**
 * Opens this tap for reading.
 * <p>
 * When an {@code input} is supplied, the read is delegated to the tap returned by
 * {@code findMatchingTap}. Otherwise every tap from {@code getTaps()} is opened and the
 * resulting iterators are chained, in array order, into one
 * {@link TupleEntryChainIterator} over {@code getSourceFields()}.
 *
 * @param flowProcess the current flow process
 * @param input       the input to read from, or {@code null} to read all child taps
 * @return an iterator over the matching tap, or over all child taps chained together
 * @throws IOException declared by the signature; may come from the child taps' {@code openForRead}
 */
@Override
public TupleEntryIterator openForRead( FlowProcess<? extends Config> flowProcess, Input input ) throws IOException {
  if( input != null ) {
    return findMatchingTap( flowProcess ).openForRead( flowProcess, input );
  }

  Iterator[] childIterators = new Iterator[ getTaps().length ];

  int index = 0;
  while( index < getTaps().length ) {
    childIterators[ index ] = new TupleIterator( getTaps()[ index ].openForRead( flowProcess ) );
    index++;
  }

  return new TupleEntryChainIterator( getSourceFields(), childIterators );
}
@Test public void testNestedMultiSourceGlobHfs() throws Exception { getPlatform().copyFromLocal( inputFileLower ); getPlatform().copyFromLocal( inputFileUpper ); GlobHfs source1 = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "?{ppe[_r]}.txt" ); GlobHfs source2 = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "?{owe?}.txt" ); MultiSourceTap source = new MultiSourceTap( source1, source2 ); assertEquals( 2, source.getNumChildTaps() ); // using null pos so all fields are written Tap sink = new Hfs( new TextLine(), getOutputPath( "globmultisource" ), SinkMode.REPLACE ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), "\\s" ); Pipe concatPipe = new Each( new Pipe( "concat" ), new Fields( "line" ), splitter ); Flow concatFlow = getPlatform().getFlowConnector( getProperties() ).connect( "first", source, sink, concatPipe ); Tap nextSink = new Hfs( new TextLine(), getOutputPath( "globmultiource2" ), SinkMode.REPLACE ); Flow nextFlow = getPlatform().getFlowConnector( getProperties() ).connect( "second", sink, nextSink, concatPipe ); Cascade cascade = new CascadeConnector( getProperties() ).connect( concatFlow, nextFlow ); cascade.complete(); validateLength( concatFlow, 10 ); }
@Test public void testNestedMultiSourceGlobHfs() throws Exception { getPlatform().copyFromLocal( inputFileLower ); getPlatform().copyFromLocal( inputFileUpper ); GlobHfs source1 = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "?{ppe[_r]}.txt" ); GlobHfs source2 = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "?{owe?}.txt" ); MultiSourceTap source = new MultiSourceTap( source1, source2 ); assertEquals( 2, source.getNumChildTaps() ); // using null pos so all fields are written Tap sink = new Hfs( new TextLine(), getOutputPath( "globmultisource" ), SinkMode.REPLACE ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), "\\s" ); Pipe concatPipe = new Each( new Pipe( "concat" ), new Fields( "line" ), splitter ); Flow concatFlow = getPlatform().getFlowConnector( getProperties() ).connect( "first", source, sink, concatPipe ); Tap nextSink = new Hfs( new TextLine(), getOutputPath( "globmultiource2" ), SinkMode.REPLACE ); Flow nextFlow = getPlatform().getFlowConnector( getProperties() ).connect( "second", sink, nextSink, concatPipe ); Cascade cascade = new CascadeConnector( getProperties() ).connect( concatFlow, nextFlow ); cascade.complete(); validateLength( concatFlow, 10 ); }
/**
 * Connects a flow named "multi-tap" that pipes the given sources, combined in one
 * {@link MultiSourceTap}, through an {@code Identity} function into a text file sink.
 *
 * @param sources the taps to combine into the flow's source
 * @param path    output sub-path; the sink is placed at {@code path + "/multitap"}
 * @return the connected flow
 */
private Flow multiTapFlow( Tap[] sources, String path ) {
  Pipe identityPipe = new Each( new Pipe( "multitap" ), new Identity() );

  Tap combinedSource = new MultiSourceTap( sources );
  Tap textSink = getPlatform().getTextFile( getOutputPath( path + "/multitap" ), SinkMode.REPLACE );

  return getPlatform().getFlowConnector().connect( "multi-tap", combinedSource, textSink, identityPipe );
}
// Wrap the collected taps in a single MultiSourceTap and delegate the read to it.
return new MultiSourceTap( taps.toArray( new Tap[ taps.size() ] ) ).openForRead( flowProcess, input );
/**
 * Walks the child taps of the given {@link MultiSourceTap} and passes each one,
 * together with {@code taps}, to {@code accumulate}.
 *
 * @param taps  the set handed through to {@code accumulate} for every child
 * @param value the multi-source tap whose children are visited
 */
public void iterate( Set<Hfs> taps, MultiSourceTap value ) {
  for( Iterator<Tap> children = value.getChildTaps(); children.hasNext(); ) {
    accumulate( taps, children.next() );
  }
}
/**
 * Connects a flow named "multi-tap" that pipes the given sources, combined in one
 * {@link MultiSourceTap}, through an {@code Identity} function into a text file sink.
 *
 * @param sources the taps to combine into the flow's source
 * @param path    output sub-path; the sink is placed at {@code path + "/multitap"}
 * @return the connected flow
 */
private Flow multiTapFlow( Tap[] sources, String path ) {
  Pipe identityPipe = new Each( new Pipe( "multitap" ), new Identity() );

  Tap combinedSource = new MultiSourceTap( sources );
  Tap textSink = getPlatform().getTextFile( getOutputPath( path + "/multitap" ), SinkMode.REPLACE );

  return getPlatform().getFlowConnector().connect( "multi-tap", combinedSource, textSink, identityPipe );
}
/**
 * Reads directly from a {@link MultiSourceTap}, first over two separate {@link GlobHfs}
 * taps and then over a single {@link GlobHfs} whose pattern covers both file groups,
 * validating the length of each read.
 */
@Test
public void testMultiSourceIterator() throws Exception {
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );

  String firstPattern = InputData.inputPath + "?{ppe[_r]}.txt";
  String secondPattern = InputData.inputPath + "?{owe?}.txt";

  GlobHfs firstGlob = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), firstPattern );
  GlobHfs secondGlob = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), secondPattern );

  // two globs, one per file group
  MultiSourceTap pairedSource = new MultiSourceTap( firstGlob, secondGlob );
  validateLength( pairedSource.openForRead( getPlatform().getFlowProcess() ), 10 );

  // one glob whose pattern lists both alternatives
  String combinedPattern = InputData.inputPath + "?{ppe[_r],owe?}.txt";
  GlobHfs combinedGlob = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), combinedPattern );

  MultiSourceTap singleSource = new MultiSourceTap( combinedGlob );
  validateLength( singleSource.openForRead( getPlatform().getFlowProcess() ), 10, null );
}
private static List<String> getPrettyNamesForTaps(Set<Tap> taps, boolean removeRandomSuffixFromTempTaps) { List<String> prettyNames = new ArrayList<String>(); for (Tap tap : taps) { if (tap instanceof NullTap || tap instanceof MemorySourceTap) { // MemorySourceTap and NullTap both have really annoying random identifiers that aren't important to note prettyNames.add(tap.getClass().getSimpleName()); } else if (tap instanceof MultiSourceTap) { // concatenate all sources in a multi source tap Iterator children = ((MultiSourceTap) tap).getChildTaps(); while (children.hasNext()) { Object object = children.next(); if (object instanceof Tap) { prettyNames.add(getPrettyNameForTap((Tap) object, removeRandomSuffixFromTempTaps)); } } } else { prettyNames.add(getPrettyNameForTap(tap, removeRandomSuffixFromTempTaps)); } } return prettyNames; }
/**
 * Reads directly from a {@link MultiSourceTap} over the lower and upper text files
 * and validates the length of what is read.
 */
@Test
public void testMultiSourceIterator() throws Exception {
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );

  Tap lowerTap = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
  Tap upperTap = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );

  Tap combined = new MultiSourceTap( lowerTap, upperTap );

  validateLength( combined.openForRead( getPlatform().getFlowProcess() ), 10 );
}
/**
 * Reads directly from a {@link MultiSourceTap}, first over two separate {@link GlobHfs}
 * taps and then over a single {@link GlobHfs} whose pattern covers both file groups,
 * validating the length of each read.
 */
@Test
public void testMultiSourceIterator() throws Exception {
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );

  String firstPattern = InputData.inputPath + "?{ppe[_r]}.txt";
  String secondPattern = InputData.inputPath + "?{owe?}.txt";

  GlobHfs firstGlob = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), firstPattern );
  GlobHfs secondGlob = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), secondPattern );

  // two globs, one per file group
  MultiSourceTap pairedSource = new MultiSourceTap( firstGlob, secondGlob );
  validateLength( pairedSource.openForRead( getPlatform().getFlowProcess() ), 10 );

  // one glob whose pattern lists both alternatives
  String combinedPattern = InputData.inputPath + "?{ppe[_r],owe?}.txt";
  GlobHfs combinedGlob = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), combinedPattern );

  MultiSourceTap singleSource = new MultiSourceTap( combinedGlob );
  validateLength( singleSource.openForRead( getPlatform().getFlowProcess() ), 10, null );
}
public static long getSourceModified( Object confCopy, Iterator<Tap> values, long sinkModified ) throws IOException { long sourceModified = 0; while( values.hasNext() ) { Tap source = values.next(); if( source instanceof MultiSourceTap ) return getSourceModified( confCopy, ( (MultiSourceTap) source ).getChildTaps(), sinkModified ); sourceModified = source.getModifiedTime( confCopy ); // source modified returns zero if does not exist // this should minimize number of times we touch any file meta-data server if( sourceModified == 0 && !source.resourceExists( confCopy ) ) throw new FlowException( "source does not exist: " + source ); if( sinkModified < sourceModified ) return sourceModified; } return sourceModified; }
/**
 * Reads directly from a {@link MultiSourceTap} over the lower and upper text files
 * and validates the length of what is read.
 */
@Test
public void testMultiSourceIterator() throws Exception {
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );

  Tap lowerTap = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
  Tap upperTap = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );

  Tap combined = new MultiSourceTap( lowerTap, upperTap );

  validateLength( combined.openForRead( getPlatform().getFlowProcess() ), 10 );
}
@Test public void testCombinedHfs() throws Exception { getPlatform().copyFromLocal( inputFileLower ); getPlatform().copyFromLocal( inputFileUpper ); Hfs sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputFileLower ); Hfs sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputFileUpper ); // create a CombinedHfs instance on these files Tap source = new MultiSourceTap<Hfs, JobConf, RecordReader>( sourceLower, sourceUpper ); FlowProcess<JobConf> process = getPlatform().getFlowProcess(); JobConf conf = process.getConfigCopy(); // set the combine flag conf.setBoolean( HfsProps.COMBINE_INPUT_FILES, true ); conf.set( "cascading.flow.platform", "hadoop" ); // only supported on mr based platforms // test the input format and the split source.sourceConfInit( process, conf ); InputFormat inputFormat = conf.getInputFormat(); assertEquals( Hfs.CombinedInputFormat.class, inputFormat.getClass() ); InputSplit[] splits = inputFormat.getSplits( conf, 1 ); assertEquals( 1, splits.length ); validateLength( source.openForRead( process ), 10 ); }
@Test public void testCombinedHfs() throws Exception { getPlatform().copyFromLocal( inputFileLower ); getPlatform().copyFromLocal( inputFileUpper ); Hfs sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputFileLower ); Hfs sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputFileUpper ); // create a CombinedHfs instance on these files Tap source = new MultiSourceTap<Hfs, JobConf, RecordReader>( sourceLower, sourceUpper ); FlowProcess<JobConf> process = getPlatform().getFlowProcess(); JobConf conf = process.getConfigCopy(); // set the combine flag conf.setBoolean( HfsProps.COMBINE_INPUT_FILES, true ); conf.set( "cascading.flow.platform", "hadoop" ); // only supported on mr based platforms // test the input format and the split source.sourceConfInit( process, conf ); InputFormat inputFormat = conf.getInputFormat(); assertEquals( Hfs.CombinedInputFormat.class, inputFormat.getClass() ); InputSplit[] splits = inputFormat.getSplits( conf, 1 ); assertEquals( 1, splits.length ); validateLength( source.openForRead( process ), 10 ); }
// Add a partition tap wrapping the delimited file at the "/partitioned/eee" output path
// (field "upper"; "+" is presumably the delimiter -- confirm against getDelimitedFile),
// then combine every tap collected in `taps` into a single MultiSourceTap.
taps.add( getPlatform().getPartitionTap( getPlatform().getDelimitedFile( new Fields( "upper" ), "+", getOutputPath( "/partitioned/eee" ) ), sourcePartition, 1 ) ); MultiSourceTap multiSourceTap = new MultiSourceTap( taps.toArray( new Tap[ taps.size() ] ) );
// Add a partition tap wrapping the delimited file at the "/partitioned/eee" output path
// (field "upper"; "+" is presumably the delimiter -- confirm against getDelimitedFile),
// then combine every tap collected in `taps` into a single MultiSourceTap.
taps.add( getPlatform().getPartitionTap( getPlatform().getDelimitedFile( new Fields( "upper" ), "+", getOutputPath( "/partitioned/eee" ) ), sourcePartition, 1 ) ); MultiSourceTap multiSourceTap = new MultiSourceTap( taps.toArray( new Tap[ taps.size() ] ) );