/**
 * Builds a graph with two branches, "lower" and "upper", each wrapped in two consecutive
 * {@link Checkpoint} instances, merged into a single tail named "sink".
 * <p>
 * Every source and the sink is bound to a {@code NonTap} declaring the fields "offset" and "line",
 * and the result is handed to {@code initialize( sources, sinks, tail )}.
 */
public MulitStepFlowGraph()
  {
  Pipe lowerBranch = new Pipe( "lower" );
  Pipe upperBranch = new Pipe( "upper" );

  // first layer of checkpoints, lower before upper
  lowerBranch = new Checkpoint( lowerBranch );
  upperBranch = new Checkpoint( upperBranch );

  // second layer of checkpoints, directly on top of the first
  lowerBranch = new Checkpoint( lowerBranch );
  upperBranch = new Checkpoint( upperBranch );

  Pipe merged = new Merge( "sink", lowerBranch, upperBranch );

  // sources are keyed by the name each branch reports after wrapping
  Map<String, Tap> sourceTaps = createHashMap();

  sourceTaps.put( lowerBranch.getName(), new NonTap( new Fields( "offset", "line" ) ) );
  sourceTaps.put( upperBranch.getName(), new NonTap( new Fields( "offset", "line" ) ) );

  Map<String, Tap> sinkTaps = createHashMap();

  sinkTaps.put( merged.getName(), new NonTap( new Fields( "offset", "line" ) ) );

  initialize( sourceTaps, sinkTaps, merged );
  }
}
/**
 * Registers the given {@link Tap} as a checkpoint for use in the resulting {@link Flow}, keyed by
 * the name of the given {@link Checkpoint}.
 * <p>
 * Delegates to {@code addCheckpoint( String, Tap )} using {@code pipe.getName()}.
 *
 * @param pipe       the Checkpoint whose name identifies the checkpoint
 * @param checkpoint the Tap to register under that name
 * @return this FlowDef instance
 */
public FlowDef addCheckpoint( Checkpoint pipe, Tap checkpoint )
  {
  String checkpointName = pipe.getName();

  addCheckpoint( checkpointName, checkpoint );

  return this;
  }
/**
 * Runs a HashJoin of the "lower" and "upper" inputs on the "num" field, with the "upper" branch
 * wrapped in a {@link Checkpoint}, while the checkpoint tap decorator class property is set to
 * "cascading.tap.hadoop.DistCacheTap".
 * <p>
 * Asserts that the sink holds 5 entries and contains the joined lines for "1" and "2".
 *
 * @throws Exception if copying inputs, planning, or running the flow fails
 */
@Test
public void testHashJoinCheckpointWithDistCacheDecorator() throws Exception
  {
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );

  Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
  Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );

  // typed map instead of the raw Map/HashMap, so the compiler checks keys and values
  Map<String, Tap> sources = new HashMap<String, Tap>();

  sources.put( "lower", sourceLower );
  sources.put( "upper", sourceUpper );

  Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "join" ), SinkMode.REPLACE );

  // one splitter instance is shared by both branches
  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
  Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );

  // only the right-hand side of the join is checkpointed
  pipeUpper = new Checkpoint( pipeUpper );

  Pipe splice = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) );

  Map<Object, Object> properties = getProperties();

  FlowConnectorProps.setCheckpointTapDecoratorClass( properties, "cascading.tap.hadoop.DistCacheTap" );

  Flow flow = getPlatform().getFlowConnector( properties ).connect( sources, sink, splice );

  flow.complete();

  validateLength( flow, 5 );

  List<Tuple> values = getSinkAsList( flow );

  assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) );
  assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) );
  }
// Casts flowElement to Checkpoint and reads its name into a local.
String name = ( (Checkpoint) flowElement ).getName();
/**
 * Runs a HashJoin of the "lower" and "upper" inputs on the "num" field, with the "upper" branch
 * wrapped in a {@link Checkpoint}, while the checkpoint tap decorator class property is set to
 * "cascading.tap.hadoop.DistCacheTap".
 * <p>
 * Asserts that the sink holds 5 entries and contains the joined lines for "1" and "2".
 *
 * @throws Exception if copying inputs, planning, or running the flow fails
 */
@Test
public void testHashJoinCheckpointWithDistCacheDecorator() throws Exception
  {
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );

  Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
  Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );

  // typed map instead of the raw Map/HashMap, so the compiler checks keys and values
  Map<String, Tap> sources = new HashMap<String, Tap>();

  sources.put( "lower", sourceLower );
  sources.put( "upper", sourceUpper );

  Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "join" ), SinkMode.REPLACE );

  // one splitter instance is shared by both branches
  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
  Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );

  // only the right-hand side of the join is checkpointed
  pipeUpper = new Checkpoint( pipeUpper );

  Pipe splice = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) );

  Map<Object, Object> properties = getProperties();

  FlowConnectorProps.setCheckpointTapDecoratorClass( properties, "cascading.tap.hadoop.DistCacheTap" );

  Flow flow = getPlatform().getFlowConnector( properties ).connect( sources, sink, splice );

  flow.complete();

  validateLength( flow, 5 );

  List<Tuple> values = getSinkAsList( flow );

  assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) );
  assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) );
  }
// Wraps pipe in three consecutive Checkpoint instances, each one taking the previous as its input.
// NOTE(review): the trailing remark presumably applies to the last Checkpoint only — confirm.
pipe = new Checkpoint( pipe ); pipe = new Checkpoint( pipe ); pipe = new Checkpoint( pipe ); // this should be collapsed into the sink tap, not be a fourth job
// Wraps pipe in three consecutive Checkpoint instances, each one taking the previous as its input.
// NOTE(review): the trailing remark presumably applies to the last Checkpoint only — confirm.
pipe = new Checkpoint( pipe ); pipe = new Checkpoint( pipe ); pipe = new Checkpoint( pipe ); // this should be collapsed into the sink tap, not be a fourth job
// Builds the second tail: a Pipe named "sink2" on top of splice, an Identity function over the
// "line" field, and finally a Checkpoint wrapping the resulting branch.
Pipe tail2 = new Each( new Pipe( "sink2", splice ), new Fields( "line" ), new Identity( new Fields( "line" ) ) ); tail2 = new Checkpoint( tail2 );
// Builds the second tail: a Pipe named "sink2" on top of splice, an Identity function over the
// "line" field, and finally a Checkpoint wrapping the resulting branch.
Pipe tail2 = new Each( new Pipe( "sink2", splice ), new Fields( "line" ), new Identity( new Fields( "line" ) ) ); tail2 = new Checkpoint( tail2 );
@Test public void testDuplicateCheckpoint() throws Exception { getPlatform().copyFromLocal( inputFileApache ); Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache ); Pipe pipe = new Pipe( "test" ); pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) ); pipe = new Checkpoint( "checkpoint", pipe ); pipe = new GroupBy( pipe, new Fields( "ip" ) ); pipe = new Every( pipe, new Count(), new Fields( "ip", "count" ) ); pipe = new Checkpoint( "checkpoint", pipe ); Tap sink = getPlatform().getTextFile( getOutputPath( "duplicatecheckpoint" ), SinkMode.REPLACE ); FlowDef flowDef = FlowDef.flowDef() .setName( "restartable" ) .addSource( "test", source ) .addTailSink( pipe, sink ) .setRunID( "restartable" ); try { Flow flow = getPlatform().getFlowConnector().connect( flowDef ); fail( "should throw element graph exception" ); } catch( Exception exception ) { // ignore } }
/**
 * Plans a flow named by its head pipe "third": a FieldJoiner producing "mangled" with "-" as
 * delimiter, a {@link Checkpoint} named "checkpoint", then an Identity function.
 * <p>
 * The checkpoint is bound to a tab delimited file at the output path derived from {@code path};
 * the tail sink is a tab delimited file at the output path "unusedpath".
 *
 * @param source the Tap bound to the head of the assembly
 * @param path   the relative output path used for the checkpoint Tap
 * @return the Flow returned by the platform's FlowConnector
 */
private Flow thirdCheckpointFlow( Tap source, String path )
  {
  Pipe assembly = new Pipe( "third" );

  assembly = new Each( assembly, new FieldJoiner( new Fields( "mangled" ), "-" ) );
  assembly = new Checkpoint( "checkpoint", assembly );
  assembly = new Each( assembly, new Identity() );

  Tap sinkTap = getPlatform().getTabDelimitedFile( new Fields( "mangled" ), getOutputPath( "unusedpath" ), SinkMode.REPLACE );

  Tap checkpointTap = getPlatform().getTabDelimitedFile( Fields.ALL, getOutputPath( path ), SinkMode.REPLACE );

  FlowDef definition = FlowDef.flowDef()
    .addSource( assembly, source )
    .addTailSink( assembly, sinkTap )
    .addCheckpoint( "checkpoint", checkpointTap );

  return getPlatform().getFlowConnector().connect( definition );
  }
/**
 * Plans a flow named by its head pipe "third": a FieldJoiner producing "mangled" with "-" as
 * delimiter, a {@link Checkpoint} named "checkpoint", then an Identity function.
 * <p>
 * The checkpoint is bound to a tab delimited file at the output path derived from {@code path};
 * the tail sink is a tab delimited file at the output path "unusedpath".
 *
 * @param source the Tap bound to the head of the assembly
 * @param path   the relative output path used for the checkpoint Tap
 * @return the Flow returned by the platform's FlowConnector
 */
private Flow thirdCheckpointFlow( Tap source, String path )
  {
  Pipe assembly = new Pipe( "third" );

  assembly = new Each( assembly, new FieldJoiner( new Fields( "mangled" ), "-" ) );
  assembly = new Checkpoint( "checkpoint", assembly );
  assembly = new Each( assembly, new Identity() );

  Tap sinkTap = getPlatform().getTabDelimitedFile( new Fields( "mangled" ), getOutputPath( "unusedpath" ), SinkMode.REPLACE );

  Tap checkpointTap = getPlatform().getTabDelimitedFile( Fields.ALL, getOutputPath( path ), SinkMode.REPLACE );

  FlowDef definition = FlowDef.flowDef()
    .addSource( assembly, source )
    .addTailSink( assembly, sinkTap )
    .addCheckpoint( "checkpoint", checkpointTap );

  return getPlatform().getFlowConnector().connect( definition );
  }
@Test public void testDuplicateCheckpoint() throws Exception { getPlatform().copyFromLocal( inputFileApache ); Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache ); Pipe pipe = new Pipe( "test" ); pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) ); pipe = new Checkpoint( "checkpoint", pipe ); pipe = new GroupBy( pipe, new Fields( "ip" ) ); pipe = new Every( pipe, new Count(), new Fields( "ip", "count" ) ); pipe = new Checkpoint( "checkpoint", pipe ); Tap sink = getPlatform().getTextFile( getOutputPath( "duplicatecheckpoint" ), SinkMode.REPLACE ); FlowDef flowDef = FlowDef.flowDef() .setName( "restartable" ) .addSource( "test", source ) .addTailSink( pipe, sink ) .setRunID( "restartable" ); try { Flow flow = getPlatform().getFlowConnector().connect( flowDef ); fail( "should throw element graph exception" ); } catch( Exception exception ) { // ignore } }
// Starts the branch named "rhs": a RegexSplitter with pattern " " is applied to the "line" field,
// declaring the fields "numRHS" and "charRHS"; the branch is then wrapped in a Checkpoint.
Pipe pipeUpper = new Each( "rhs", new Fields( "line" ), new RegexSplitter( new Fields( "numRHS", "charRHS" ), " " ) ); pipeUpper = new Checkpoint( pipeUpper );
/**
 * Runs a flow that parses the "ip" field out of each line, passes through an unnamed
 * {@link Checkpoint}, then groups on "ip" and counts.
 * <p>
 * Asserts the sink length is 8 and, when the platform reports MapReduce, that the flow was
 * planned into exactly 2 steps.
 *
 * @throws Exception if copying the input, planning, or running the flow fails
 */
@Test
public void testSimpleCheckpoint() throws Exception
  {
  getPlatform().copyFromLocal( inputFileApache );

  Tap input = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );

  Pipe assembly = new Pipe( "test" );

  assembly = new Each( assembly, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) );

  assembly = new Checkpoint( assembly );

  assembly = new GroupBy( assembly, new Fields( "ip" ) );

  assembly = new Every( assembly, new Count(), new Fields( "ip", "count" ) );

  Tap output = getPlatform().getTextFile( getOutputPath( "simplecheckpoint" ), SinkMode.REPLACE );

  Flow flow = getPlatform().getFlowConnector().connect( input, output, assembly );

  flow.complete();

  validateLength( flow, 8, null );

  // the step count is only asserted on MapReduce platforms
  if( getPlatform().isMapReduce() )
    {
    List<FlowStep> steps = flow.getFlowSteps();

    assertEquals( "wrong size", 2, steps.size() );
    }
  }
// Starts the branch named "rhs": a RegexSplitter with pattern " " is applied to the "line" field,
// declaring the fields "numRHS" and "charRHS"; the branch is then wrapped in a Checkpoint.
Pipe pipeUpper = new Each( "rhs", new Fields( "line" ), new RegexSplitter( new Fields( "numRHS", "charRHS" ), " " ) ); pipeUpper = new Checkpoint( pipeUpper );
/**
 * Runs a flow that parses the "ip" field out of each line, passes through an unnamed
 * {@link Checkpoint}, then groups on "ip" and counts.
 * <p>
 * Asserts the sink length is 8 and, when the platform reports MapReduce, that the flow was
 * planned into exactly 2 steps.
 *
 * @throws Exception if copying the input, planning, or running the flow fails
 */
@Test
public void testSimpleCheckpoint() throws Exception
  {
  getPlatform().copyFromLocal( inputFileApache );

  Tap input = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );

  Pipe assembly = new Pipe( "test" );

  assembly = new Each( assembly, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) );

  assembly = new Checkpoint( assembly );

  assembly = new GroupBy( assembly, new Fields( "ip" ) );

  assembly = new Every( assembly, new Count(), new Fields( "ip", "count" ) );

  Tap output = getPlatform().getTextFile( getOutputPath( "simplecheckpoint" ), SinkMode.REPLACE );

  Flow flow = getPlatform().getFlowConnector().connect( input, output, assembly );

  flow.complete();

  validateLength( flow, 8, null );

  // the step count is only asserted on MapReduce platforms
  if( getPlatform().isMapReduce() )
    {
    List<FlowStep> steps = flow.getFlowSteps();

    assertEquals( "wrong size", 2, steps.size() );
    }
  }
/**
 * Runs the same ip-count assembly as the simple checkpoint case, but binds the {@link Checkpoint}
 * named "checkpoint" to an explicit delimited-file Tap at the output path "checkpoint/tap".
 * <p>
 * Asserts the sink length is 8 and, when the platform reports MapReduce, that the flow has
 * exactly 2 steps and that reading the checkpoint Tap back yields a length of 10.
 *
 * @throws Exception if copying the input, planning, running, or reading the checkpoint fails
 */
@Test
public void testSimpleCheckpointTextIntermediate() throws Exception
  {
  getPlatform().copyFromLocal( inputFileApache );

  Tap input = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );

  Pipe assembly = new Pipe( "test" );

  assembly = new Each( assembly, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) );

  assembly = new Checkpoint( "checkpoint", assembly );

  assembly = new GroupBy( assembly, new Fields( "ip" ) );

  assembly = new Every( assembly, new Count(), new Fields( "ip", "count" ) );

  Tap output = getPlatform().getTextFile( getOutputPath( "checkpoint/sink" ), SinkMode.REPLACE );

  Tap checkpointTap = getPlatform().getDelimitedFile( Fields.ALL, true, "\t", "\"", getOutputPath( "checkpoint/tap" ), SinkMode.REPLACE );

  FlowDef definition = flowDef()
    .addSource( assembly, input )
    .addTailSink( assembly, output )
    .addCheckpoint( "checkpoint", checkpointTap );

  Flow flow = getPlatform().getFlowConnector().connect( definition );

  flow.complete();

  validateLength( flow, 8 );

  // step count and checkpoint contents are only asserted on MapReduce platforms
  if( getPlatform().isMapReduce() )
    {
    List<FlowStep> steps = flow.getFlowSteps();

    assertEquals( "wrong size", 2, steps.size() );

    validateLength( flow.openTapForRead( checkpointTap ), 10 );
    }
  }
/**
 * Runs the same ip-count assembly as the simple checkpoint case, but binds the {@link Checkpoint}
 * named "checkpoint" to an explicit delimited-file Tap at the output path "checkpoint/tap".
 * <p>
 * Asserts the sink length is 8 and, when the platform reports MapReduce, that the flow has
 * exactly 2 steps and that reading the checkpoint Tap back yields a length of 10.
 *
 * @throws Exception if copying the input, planning, running, or reading the checkpoint fails
 */
@Test
public void testSimpleCheckpointTextIntermediate() throws Exception
  {
  getPlatform().copyFromLocal( inputFileApache );

  Tap input = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );

  Pipe assembly = new Pipe( "test" );

  assembly = new Each( assembly, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) );

  assembly = new Checkpoint( "checkpoint", assembly );

  assembly = new GroupBy( assembly, new Fields( "ip" ) );

  assembly = new Every( assembly, new Count(), new Fields( "ip", "count" ) );

  Tap output = getPlatform().getTextFile( getOutputPath( "checkpoint/sink" ), SinkMode.REPLACE );

  Tap checkpointTap = getPlatform().getDelimitedFile( Fields.ALL, true, "\t", "\"", getOutputPath( "checkpoint/tap" ), SinkMode.REPLACE );

  FlowDef definition = flowDef()
    .addSource( assembly, input )
    .addTailSink( assembly, output )
    .addCheckpoint( "checkpoint", checkpointTap );

  Flow flow = getPlatform().getFlowConnector().connect( definition );

  flow.complete();

  validateLength( flow, 8 );

  // step count and checkpoint contents are only asserted on MapReduce platforms
  if( getPlatform().isMapReduce() )
    {
    List<FlowStep> steps = flow.getFlowSteps();

    assertEquals( "wrong size", 2, steps.size() );

    validateLength( flow.openTapForRead( checkpointTap ), 10 );
    }
  }
/**
 * Plans a flow named "restartable" with run id "restartable": parse "ip" from each line, pass an
 * unnamed {@link Checkpoint}, group and count by "ip", then apply a TestFunction that inserts the
 * field "insert" with the value "value".
 *
 * @param sinkPath the path of the text file used as the tail sink
 * @param fail     when true the TestFunction receives 2 as its last argument, otherwise -1
 *                 (NOTE(review): presumably the point at which the function fails — confirm
 *                 against TestFunction)
 * @return the Flow returned by the platform's FlowConnector
 */
private Flow createRestartableFlow( String sinkPath, boolean fail )
  {
  Tap input = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );

  Pipe assembly = new Pipe( "test" );

  assembly = new Each( assembly, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) );

  assembly = new Checkpoint( assembly );

  assembly = new GroupBy( assembly, new Fields( "ip" ) );

  assembly = new Every( assembly, new Count(), new Fields( "ip", "count" ) );

  int failValue;

  if( fail )
    failValue = 2;
  else
    failValue = -1;

  assembly = new Each( assembly, new TestFunction( new Fields( "insert" ), new Tuple( "value" ), failValue ) );

  Tap output = getPlatform().getTextFile( sinkPath, SinkMode.REPLACE );

  FlowDef definition = FlowDef.flowDef()
    .setName( "restartable" )
    .addSource( "test", input )
    .addTailSink( assembly, output )
    .setRunID( "restartable" );

  return getPlatform().getFlowConnector().connect( definition );
  }