/**
 * Merges several maps of containers into one, summing the values of containers whose titles
 * share the same suffix.
 *
 * <p>The containers of all maps are grouped by their title with the first six characters
 * removed ({@code getTitle().substring(6)}). Each group is collapsed into a new
 * {@code Container} that carries the sum of the group's {@code getVal()} results and the
 * full title of the first container of that group. The keys of the incoming maps are ignored.
 *
 * @param maps the maps whose values are merged
 * @return a new map from the shortened title to the merged container
 * @throws StringIndexOutOfBoundsException if any title has fewer than six characters
 * @throws NullPointerException if {@code maps}, one of the maps, or a title is {@code null}
 */
@SafeVarargs // the varargs array is only read, never stored into or exposed
public static Map<String, Container> mergeMaps(Map<String, Container>... maps) {
    return Arrays.stream(maps)
            .flatMap(source -> source.values().stream())
            .collect(Collectors.groupingBy(container -> container.getTitle().substring(6)))
            .entrySet().stream()
            // Container is an inner class of Merge, so each merged container needs an owner.
            .map(group -> new Merge().new Container(
                    group.getValue().stream().mapToInt(container -> container.getVal()).sum(),
                    group.getValue().get(0).getTitle()))
            .collect(Collectors.toMap(merged -> merged.getTitle().substring(6), merged -> merged));
}
/**
 * Builds an {@code Each} over "foo", merges it with a pipe named "bar", and hands the
 * merge's trace to {@code assertEqualsTrace} together with the expected location prefix
 * of this test method.
 */
@Test
public void testPipeMerge() {
  // every element is created inside this method; the expected trace names this method
  Pipe head = new Pipe( "foo" );
  Pipe identity = new Each( head, new Fields( "a" ), new Identity() );
  Pipe merged = new Merge( identity, new Pipe( "bar" ) );

  assertEqualsTrace( "cascading.TraceTest.testPipeMerge(TraceTest.java", merged.getTrace() );
}
/**
 * Builds a graph in which two branches, "lower" and "upper", are each wrapped in two
 * consecutive {@code Checkpoint}s and then merged into a pipe named "sink".
 *
 * <p>Sources and the sink are registered under the names reported by the respective pipes
 * and are all {@code NonTap}s declaring the fields "offset" and "line".
 */
public MulitStepFlowGraph() {
  Map<String, Tap> sources = createHashMap();
  Map<String, Tap> sinks = createHashMap();

  Pipe lowerBranch = new Pipe( "lower" );
  Pipe upperBranch = new Pipe( "upper" );

  // two rounds of checkpoints, alternating between the branches
  for( int round = 0; round < 2; round++ ) {
    lowerBranch = new Checkpoint( lowerBranch );
    upperBranch = new Checkpoint( upperBranch );
  }

  Pipe merged = new Merge( "sink", lowerBranch, upperBranch );

  sources.put( lowerBranch.getName(), new NonTap( new Fields( "offset", "line" ) ) );
  sources.put( upperBranch.getName(), new NonTap( new Fields( "offset", "line" ) ) );
  sinks.put( merged.getName(), new NonTap( new Fields( "offset", "line" ) ) );

  initialize( sources, sinks, merged );
}
}
/**
 * Merges the "lower" and "upper" text inputs, each split on a single space into the fields
 * "num" and "char", and checks the merged output.
 *
 * <p>The flow is validated with an expected length of 10, and the sink must contain the
 * tuples {@code "1\ta"} and {@code "1\tA"}.
 *
 * @throws Exception if copying the inputs or running the flow fails
 */
@Test
public void testSimpleMerge() throws Exception {
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );

  Tap sourceLower = getPlatform().getTextFile( inputFileLower );
  Tap sourceUpper = getPlatform().getTextFile( inputFileUpper );

  // head name -> source tap
  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "lower", sourceLower );
  sources.put( "upper", sourceUpper );

  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "simplemerge" ), SinkMode.REPLACE );

  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
  Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );

  Pipe splice = new Merge( "merge", pipeLower, pipeUpper );

  Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );

  flow.complete();

  validateLength( flow, 10 );

  Collection<Tuple> results = getSinkAsList( flow );

  assertTrue( "missing value", results.contains( new Tuple( "1\ta" ) ) );
  assertTrue( "missing value", results.contains( new Tuple( "1\tA" ) ) );
}
/**
 * Merges the "lower" and "upper" text inputs, each split on a single space into the fields
 * "num" and "char", and checks the merged output.
 *
 * <p>The flow is validated with an expected length of 10, and the sink must contain the
 * tuples {@code "1\ta"} and {@code "1\tA"}.
 *
 * @throws Exception if copying the inputs or running the flow fails
 */
@Test
public void testSimpleMerge() throws Exception {
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );

  Tap sourceLower = getPlatform().getTextFile( inputFileLower );
  Tap sourceUpper = getPlatform().getTextFile( inputFileUpper );

  // head name -> source tap
  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "lower", sourceLower );
  sources.put( "upper", sourceUpper );

  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "simplemerge" ), SinkMode.REPLACE );

  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
  Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );

  Pipe splice = new Merge( "merge", pipeLower, pipeUpper );

  Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );

  flow.complete();

  validateLength( flow, 10 );

  Collection<Tuple> results = getSinkAsList( flow );

  assertTrue( "missing value", results.contains( new Tuple( "1\ta" ) ) );
  assertTrue( "missing value", results.contains( new Tuple( "1\tA" ) ) );
}
@Test public void testGroupByInsensitive() throws Exception { getPlatform().copyFromLocal( inputFileLower ); getPlatform().copyFromLocal( inputFileUpper ); Tap sourceLower = getPlatform().getDelimitedFile( new Fields( "num", "char" ), " ", inputFileLower ); Tap sourceUpper = getPlatform().getDelimitedFile( new Fields( "num", "char" ), " ", inputFileUpper ); Map sources = new HashMap(); sources.put( "lower", sourceLower ); sources.put( "upper", sourceUpper ); Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "insensitivegrouping" + NONDETERMINISTIC ), SinkMode.REPLACE ); Pipe pipeLower = new Pipe( "lower" ); Pipe pipeUpper = new Pipe( "upper" ); Pipe merge = new Merge( pipeLower, pipeUpper ); Fields charFields = new Fields( "char" ); charFields.setComparator( "char", new LowerComparator() ); Pipe splice = new GroupBy( "groupby", merge, charFields ); splice = new Every( splice, new Fields( "char" ), new Count() ); Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice ); flow.complete(); // we can't guarantee if the grouping key will be upper or lower validateLength( flow, 5, 1, Pattern.compile( "^\\w+\\s2$" ) ); } }
@Test public void testGroupByInsensitive() throws Exception { getPlatform().copyFromLocal( inputFileLower ); getPlatform().copyFromLocal( inputFileUpper ); Tap sourceLower = getPlatform().getDelimitedFile( new Fields( "num", "char" ), " ", inputFileLower ); Tap sourceUpper = getPlatform().getDelimitedFile( new Fields( "num", "char" ), " ", inputFileUpper ); Map sources = new HashMap(); sources.put( "lower", sourceLower ); sources.put( "upper", sourceUpper ); Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "insensitivegrouping" + NONDETERMINISTIC ), SinkMode.REPLACE ); Pipe pipeLower = new Pipe( "lower" ); Pipe pipeUpper = new Pipe( "upper" ); Pipe merge = new Merge( pipeLower, pipeUpper ); Fields charFields = new Fields( "char" ); charFields.setComparator( "char", new LowerComparator() ); Pipe splice = new GroupBy( "groupby", merge, charFields ); splice = new Every( splice, new Fields( "char" ), new Count() ); Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice ); flow.complete(); // we can't guarantee if the grouping key will be upper or lower validateLength( flow, 5, 1, Pattern.compile( "^\\w+\\s2$" ) ); } }
/**
 * Chains two merges over three split text inputs: "lower" and "upper" are merged first,
 * that result is merged with "offset", and an {@code Identity} is applied to "num" and
 * "char".
 *
 * <p>The flow is validated with an expected length of 14.
 *
 * @throws Exception if copying the inputs or running the flow fails
 */
@Test
public void testSimpleMergeThreeChain() throws Exception {
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );
  getPlatform().copyFromLocal( inputFileLowerOffset );

  Tap sourceLower = getPlatform().getTextFile( inputFileLower );
  Tap sourceUpper = getPlatform().getTextFile( inputFileUpper );
  Tap sourceLowerOffset = getPlatform().getTextFile( inputFileLowerOffset );

  // head name -> source tap
  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "lower", sourceLower );
  sources.put( "upper", sourceUpper );
  sources.put( "offset", sourceLowerOffset );

  Tap sink = getPlatform().getTextFile( getOutputPath( "simplemergethreechain" ), SinkMode.REPLACE );

  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
  Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
  Pipe pipeOffset = new Each( new Pipe( "offset" ), new Fields( "line" ), splitter );

  Pipe splice = new Merge( "merge", pipeLower, pipeUpper );

  splice = new Merge( splice, pipeOffset );

  splice = new Each( splice, new Fields( "num", "char" ), new Identity() );

  Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );

  flow.complete();

  validateLength( flow, 14 );
}
@Test public void testSameSourceMergeThreeChainGroup() throws Exception { getPlatform().copyFromLocal( inputFileLower ); Tap sourceLower = getPlatform().getTextFile( inputFileLower ); Map sources = new HashMap(); sources.put( "split", sourceLower ); Tap sink = getPlatform().getTextFile( getOutputPath( "samemergethreechaingroup" ), SinkMode.REPLACE ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); Pipe pipe = new Pipe( "split" ); Pipe pipeLower = new Each( new Pipe( "lower", pipe ), new Fields( "line" ), splitter ); Pipe pipeUpper = new Each( new Pipe( "upper", pipe ), new Fields( "line" ), splitter ); Pipe pipeOffset = new Each( new Pipe( "offset", pipe ), new Fields( "line" ), splitter ); Pipe splice = new Merge( "merge", pipeLower, pipeUpper ); //put group before merge to test path counts splice = new GroupBy( splice, new Fields( "num" ) ); splice = new Merge( splice, pipeOffset ); // this group has its incoming paths counted, gated by the previous group splice = new GroupBy( splice, new Fields( "num" ) ); Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice ); if( getPlatform().isMapReduce() ) assertEquals( "wrong num jobs", 2, flow.getFlowSteps().size() ); flow.complete(); validateLength( flow, 15 ); }
/**
 * Builds a graph that alternates hash joins and merges over three sources.
 *
 * <p>"lower" is hash-joined with "offset" on num1 = num2, reduced to "num1" and "char1",
 * and merged with "upper" ("merge1"). That join / retain / merge sequence is repeated once
 * more ("merge2"), followed by a final hash join with "offset" that feeds the pipe named
 * "sink". The "upper" and "offset" branches are reused by every stage.
 */
public HashJoinMergeIntoHashJoinStreamedStreamedMergeGraph() {
  // name -> tap, for the three heads
  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "lower", new NonTap( "lower", new Fields( "offset", "line" ) ) );
  sources.put( "upper", new NonTap( "upper", new Fields( "offset", "line" ) ) );
  sources.put( "offset", new NonTap( "offset", new Fields( "offset", "line" ) ) );

  // name -> tap, for the single tail
  Map<String, Tap> sinks = new HashMap<String, Tap>();
  sinks.put( "sink", new NonTap( "sink", new Fields( "offset", "line" ) ) );

  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), new RegexSplitter( new Fields( "num1", "char1" ), " " ) );
  Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), new RegexSplitter( new Fields( "num1", "char1" ), " " ) );
  Pipe pipeOffset = new Each( new Pipe( "offset" ), new Fields( "line" ), new RegexSplitter( new Fields( "num2", "char2" ), " " ) );

  // first stage: join, trim back to the lhs fields, merge with upper
  Pipe splice = new HashJoin( pipeLower, new Fields( "num1" ), pipeOffset, new Fields( "num2" ) );
  splice = new Retain( splice, new Fields( "num1", "char1" ) );
  splice = new Merge( "merge1", splice, pipeUpper );

  // second stage: same shape, fed by the first merge
  splice = new HashJoin( splice, new Fields( "num1" ), pipeOffset, new Fields( "num2" ) );
  splice = new Retain( splice, new Fields( "num1", "char1" ) );
  splice = new Merge( "merge2", splice, pipeUpper );

  // final join feeding the tail
  splice = new HashJoin( splice, new Fields( "num1" ), pipeOffset, new Fields( "num2" ) );

  splice = new Pipe( "sink", splice );

  initialize( sources, sinks, splice );
}
}
public HashJoinsIntoMerge() { Map sources = new HashMap(); sources.put( "lower", new NonTap( "lower", new Fields( "offset", "line" ) ) ); sources.put( "upper", new NonTap( "upper", new Fields( "offset", "line" ) ) ); sources.put( "lhs", new NonTap( "lhs", new Fields( "offset", "line" ) ) ); sources.put( "rhs", new NonTap( "rhs", new Fields( "offset", "line" ) ) ); Map sinks = new HashMap(); sinks.put( "sink", new NonTap( "sink", new Fields( "offset", "line" ) ) ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); Pipe pipeLhs = new Each( new Pipe( "lhs" ), new Fields( "line" ), splitter ); Pipe pipeRhs = new Each( new Pipe( "rhs" ), new Fields( "line" ), splitter ); Pipe upperLower = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) ); upperLower = new Each( upperLower, new Identity() ); Pipe lhsRhs = new HashJoin( pipeLhs, new Fields( "num" ), pipeRhs, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) ); lhsRhs = new Each( lhsRhs, new Identity() ); Pipe merge = new Merge( "sink", Pipe.pipes( upperLower, lhsRhs ) ); initialize( sources, sinks, merge ); } }
/**
 * Merges the split "lower" and "upper" inputs and co-groups the result with the split
 * "offset" input on num1 = num2.
 *
 * <p>On a MapReduce platform the plan must consist of 1 flow step. The flow is validated
 * with an expected length of 6.
 *
 * @throws Exception if copying the inputs or running the flow fails
 */
@Test
public void testSimpleMergeThreeChainCoGroup() throws Exception {
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );
  getPlatform().copyFromLocal( inputFileLowerOffset );

  Tap sourceLower = getPlatform().getTextFile( inputFileLower );
  Tap sourceUpper = getPlatform().getTextFile( inputFileUpper );
  Tap sourceLowerOffset = getPlatform().getTextFile( inputFileLowerOffset );

  // head name -> source tap
  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "lower", sourceLower );
  sources.put( "upper", sourceUpper );
  sources.put( "offset", sourceLowerOffset );

  Tap sink = getPlatform().getTextFile( getOutputPath( "simplemergethreechaincogroup" ), SinkMode.REPLACE );

  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), new RegexSplitter( new Fields( "num1", "char1" ), " " ) );
  Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), new RegexSplitter( new Fields( "num1", "char1" ), " " ) );
  Pipe pipeOffset = new Each( new Pipe( "offset" ), new Fields( "line" ), new RegexSplitter( new Fields( "num2", "char2" ), " " ) );

  Pipe splice = new Merge( "merge", pipeLower, pipeUpper );

  splice = new CoGroup( splice, new Fields( "num1" ), pipeOffset, new Fields( "num2" ) );

  Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );

  if( getPlatform().isMapReduce() ) {
    assertEquals( "wrong num jobs", 1, flow.getFlowSteps().size() );
  }

  flow.complete();

  validateLength( flow, 6 );
}
/**
 * Merges the split "lower" and "upper" inputs and co-groups the result with the split
 * "offset" input on num1 = num2.
 *
 * <p>On a MapReduce platform the plan must consist of 1 flow step. The flow is validated
 * with an expected length of 6.
 *
 * @throws Exception if copying the inputs or running the flow fails
 */
@Test
public void testSimpleMergeThreeChainCoGroup() throws Exception {
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );
  getPlatform().copyFromLocal( inputFileLowerOffset );

  Tap sourceLower = getPlatform().getTextFile( inputFileLower );
  Tap sourceUpper = getPlatform().getTextFile( inputFileUpper );
  Tap sourceLowerOffset = getPlatform().getTextFile( inputFileLowerOffset );

  // head name -> source tap
  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "lower", sourceLower );
  sources.put( "upper", sourceUpper );
  sources.put( "offset", sourceLowerOffset );

  Tap sink = getPlatform().getTextFile( getOutputPath( "simplemergethreechaincogroup" ), SinkMode.REPLACE );

  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), new RegexSplitter( new Fields( "num1", "char1" ), " " ) );
  Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), new RegexSplitter( new Fields( "num1", "char1" ), " " ) );
  Pipe pipeOffset = new Each( new Pipe( "offset" ), new Fields( "line" ), new RegexSplitter( new Fields( "num2", "char2" ), " " ) );

  Pipe splice = new Merge( "merge", pipeLower, pipeUpper );

  splice = new CoGroup( splice, new Fields( "num1" ), pipeOffset, new Fields( "num2" ) );

  Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );

  if( getPlatform().isMapReduce() ) {
    assertEquals( "wrong num jobs", 1, flow.getFlowSteps().size() );
  }

  flow.complete();

  validateLength( flow, 6 );
}
@Test public void testSameSourceMerge() throws Exception { getPlatform().copyFromLocal( inputFileLower ); Tap source = getPlatform().getDelimitedFile( new Fields( "num", "char" ), " ", inputFileLower ); Tap sink = getPlatform().getTextFile( getOutputPath(), SinkMode.REPLACE ); Pipe lhs = new Pipe( "lhs" ); lhs = new Pipe( "lhs", lhs ); // rule should catch adjacent pipes Pipe rhs = new Pipe( "rhs" ); Pipe merge = new Merge( "merge", lhs, rhs ); FlowDef flowDef = FlowDef.flowDef() .addSource( lhs, source ) .addSource( rhs, source ) .addTailSink( merge, sink ); Flow flow = getPlatform().getFlowConnector().connect( flowDef ); flow.complete(); validateLength( flow, 10 ); }
@Test public void testSameSourceMerge() throws Exception { getPlatform().copyFromLocal( inputFileLower ); Tap source = getPlatform().getDelimitedFile( new Fields( "num", "char" ), " ", inputFileLower ); Tap sink = getPlatform().getTextFile( getOutputPath(), SinkMode.REPLACE ); Pipe lhs = new Pipe( "lhs" ); lhs = new Pipe( "lhs", lhs ); // rule should catch adjacent pipes Pipe rhs = new Pipe( "rhs" ); Pipe merge = new Merge( "merge", lhs, rhs ); FlowDef flowDef = FlowDef.flowDef() .addSource( lhs, source ) .addSource( rhs, source ) .addTailSink( merge, sink ); Flow flow = getPlatform().getFlowConnector().connect( flowDef ); flow.complete(); validateLength( flow, 10 ); }
/**
 * Runs two consecutive split-and-merge rounds over a single filtered source.
 *
 * <p>Lines matching {@code ^68.*} are split into a "left" branch (filter {@code .*46.*})
 * and a "right" branch (filter {@code .*102.*}), merged as "merged-first", and passed
 * through an {@code Identity}. The same split, filters and merge ("merged-second") are then
 * applied to that result. On a MapReduce platform the plan must consist of 1 flow step; the
 * flow is validated with an expected length of 3.
 *
 * @throws Exception if copying the input or running the flow fails
 */
@Test
public void testSplitSameSourceMergedComplex() throws Exception {
  getPlatform().copyFromLocal( inputFileApache );

  Tap input = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );
  Tap output = getPlatform().getTextFile( getOutputPath( "splitsourcemergedcomplex" ), SinkMode.REPLACE );

  Pipe head = new Pipe( "split" );
  Pipe filtered = new Each( head, new Fields( "line" ), new RegexFilter( "^68.*" ) );

  // first round: split the filtered stream and merge it back together
  Pipe firstLeft = new Each( new Pipe( "left", filtered ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
  Pipe firstRight = new Each( new Pipe( "right", filtered ), new Fields( "line" ), new RegexFilter( ".*102.*" ) );
  Pipe firstMerge = new Merge( "merged-first", firstLeft, firstRight );
  Pipe firstTail = new Each( firstMerge, new Fields( "line" ), new Identity() );

  // second round: same shape, fed by the first round's output
  Pipe secondLeft = new Each( new Pipe( "left", firstTail ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
  Pipe secondRight = new Each( new Pipe( "right", firstTail ), new Fields( "line" ), new RegexFilter( ".*102.*" ) );
  Pipe secondMerge = new Merge( "merged-second", secondLeft, secondRight );
  Pipe tail = new Each( secondMerge, new Fields( "line" ), new Identity() );

  Flow flow = getPlatform().getFlowConnector().connect( input, output, tail );

  if( getPlatform().isMapReduce() ) {
    assertEquals( "wrong num jobs", 1, flow.getFlowSteps().size() );
  }

  flow.complete();

  validateLength( flow, 3 );
}
/**
 * Runs two consecutive split-and-merge rounds over a single filtered source.
 *
 * <p>Lines matching {@code ^68.*} are split into a "left" branch (filter {@code .*46.*})
 * and a "right" branch (filter {@code .*102.*}), merged as "merged-first", and passed
 * through an {@code Identity}. The same split, filters and merge ("merged-second") are then
 * applied to that result. On a MapReduce platform the plan must consist of 1 flow step; the
 * flow is validated with an expected length of 3.
 *
 * @throws Exception if copying the input or running the flow fails
 */
@Test
public void testSplitSameSourceMergedComplex() throws Exception {
  getPlatform().copyFromLocal( inputFileApache );

  Tap input = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );
  Tap output = getPlatform().getTextFile( getOutputPath( "splitsourcemergedcomplex" ), SinkMode.REPLACE );

  Pipe head = new Pipe( "split" );
  Pipe filtered = new Each( head, new Fields( "line" ), new RegexFilter( "^68.*" ) );

  // first round: split the filtered stream and merge it back together
  Pipe firstLeft = new Each( new Pipe( "left", filtered ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
  Pipe firstRight = new Each( new Pipe( "right", filtered ), new Fields( "line" ), new RegexFilter( ".*102.*" ) );
  Pipe firstMerge = new Merge( "merged-first", firstLeft, firstRight );
  Pipe firstTail = new Each( firstMerge, new Fields( "line" ), new Identity() );

  // second round: same shape, fed by the first round's output
  Pipe secondLeft = new Each( new Pipe( "left", firstTail ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
  Pipe secondRight = new Each( new Pipe( "right", firstTail ), new Fields( "line" ), new RegexFilter( ".*102.*" ) );
  Pipe secondMerge = new Merge( "merged-second", secondLeft, secondRight );
  Pipe tail = new Each( secondMerge, new Fields( "line" ), new Identity() );

  Flow flow = getPlatform().getFlowConnector().connect( input, output, tail );

  if( getPlatform().isMapReduce() ) {
    assertEquals( "wrong num jobs", 1, flow.getFlowSteps().size() );
  }

  flow.complete();

  validateLength( flow, 3 );
}
/**
 * Splits a single filtered source into two branches and merges them back together.
 *
 * <p>Lines matching {@code ^68.*} are split into a "left" branch (filter {@code .*46.*})
 * and a "right" branch (filter {@code .*102.*}), merged as "merged", and passed through an
 * {@code Identity}. On a MapReduce platform the plan must consist of 1 flow step; the flow
 * is validated with an expected length of 3.
 *
 * @throws Exception if copying the input or running the flow fails
 */
@Test
public void testSplitSameSourceMerged() throws Exception {
  getPlatform().copyFromLocal( inputFileApache );

  Tap input = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );
  Tap output = getPlatform().getTextFile( getOutputPath( "splitsourcemerged" ), SinkMode.REPLACE );

  Pipe head = new Pipe( "split" );
  Pipe filtered = new Each( head, new Fields( "line" ), new RegexFilter( "^68.*" ) );

  // both branches hang off the same filtered stream
  Pipe leftBranch = new Each( new Pipe( "left", filtered ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
  Pipe rightBranch = new Each( new Pipe( "right", filtered ), new Fields( "line" ), new RegexFilter( ".*102.*" ) );

  Pipe rejoined = new Merge( "merged", leftBranch, rightBranch );
  Pipe tail = new Each( rejoined, new Fields( "line" ), new Identity() );

  Flow flow = getPlatform().getFlowConnector().connect( input, output, tail );

  if( getPlatform().isMapReduce() ) {
    assertEquals( "wrong num jobs", 1, flow.getFlowSteps().size() );
  }

  flow.complete();

  validateLength( flow, 3 );
}
/**
 * Splits a single filtered source into two branches and merges them back together.
 *
 * <p>Lines matching {@code ^68.*} are split into a "left" branch (filter {@code .*46.*})
 * and a "right" branch (filter {@code .*102.*}), merged as "merged", and passed through an
 * {@code Identity}. On a MapReduce platform the plan must consist of 1 flow step; the flow
 * is validated with an expected length of 3.
 *
 * @throws Exception if copying the input or running the flow fails
 */
@Test
public void testSplitSameSourceMerged() throws Exception {
  getPlatform().copyFromLocal( inputFileApache );

  Tap input = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );
  Tap output = getPlatform().getTextFile( getOutputPath( "splitsourcemerged" ), SinkMode.REPLACE );

  Pipe head = new Pipe( "split" );
  Pipe filtered = new Each( head, new Fields( "line" ), new RegexFilter( "^68.*" ) );

  // both branches hang off the same filtered stream
  Pipe leftBranch = new Each( new Pipe( "left", filtered ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
  Pipe rightBranch = new Each( new Pipe( "right", filtered ), new Fields( "line" ), new RegexFilter( ".*102.*" ) );

  Pipe rejoined = new Merge( "merged", leftBranch, rightBranch );
  Pipe tail = new Each( rejoined, new Fields( "line" ), new Identity() );

  Flow flow = getPlatform().getFlowConnector().connect( input, output, tail );

  if( getPlatform().isMapReduce() ) {
    assertEquals( "wrong num jobs", 1, flow.getFlowSteps().size() );
  }

  flow.complete();

  validateLength( flow, 3 );
}
/**
 * Binds one delimited source to three heads: "lhs" and "rhs" are merged, and the merge is
 * hash-joined against "join" on merged.num = rhs.num.
 *
 * <p>Fields are renamed before the join so both sides are distinct; afterwards only
 * "merged.num", "merged.char" and "rhs.char" are retained and renamed to "num", "merged"
 * and "char". The flow is validated with an expected length of 10.
 *
 * @throws Exception if copying the input or running the flow fails
 */
@Test
public void testSameSourceMergeHashJoin() throws Exception {
  getPlatform().copyFromLocal( inputFileLower );

  Tap input = getPlatform().getDelimitedFile( new Fields( "num", "char" ), " ", inputFileLower );
  Tap output = getPlatform().getTextFile( getOutputPath(), SinkMode.REPLACE );

  // merge side
  Pipe leftHead = new Pipe( "lhs" );
  Pipe rightHead = new Pipe( "rhs" );
  Pipe merged = new Merge( "merge", leftHead, rightHead );
  Pipe mergedRenamed = new Rename( merged, new Fields( "num", "char" ), new Fields( "merged.num", "merged.char" ) );

  // join side; the renamed pipe is what gets bound to the source below
  Pipe joinSide = new Rename( new Pipe( "join" ), new Fields( "num", "char" ), new Fields( "rhs.num", "rhs.char" ) );

  Pipe joined = new HashJoin( mergedRenamed, new Fields( "merged.num" ), joinSide, new Fields( "rhs.num" ) );
  Pipe retained = new Retain( joined, new Fields( "merged.num", "merged.char", "rhs.char" ) );
  Pipe tail = new Rename( retained, new Fields( "merged.num", "merged.char", "rhs.char" ), new Fields( "num", "merged", "char" ) );

  FlowDef definition = FlowDef.flowDef()
    .addSource( leftHead, input )
    .addSource( rightHead, input )
    .addSource( joinSide, input )
    .addTailSink( tail, output );

  Flow flow = getPlatform().getFlowConnector().connect( definition );
  flow.complete();

  validateLength( flow, 10 );
}