/**
 * Merges the values of all given maps into a single map.
 * <p>
 * Containers are grouped by their title with the first six characters removed. For every
 * group a new {@code Container} is created whose value is the sum of {@code getVal()} over
 * the group and whose title is the title of the first container encountered in that group.
 * The keys of the incoming maps are ignored; only their values are used.
 *
 * @param maps the maps whose values are merged
 * @return a new map from the key derived from each merged container's title to that container
 * @throws StringIndexOutOfBoundsException if a title is shorter than six characters
 */
@SafeVarargs // the varargs array is only read, never stored or written to
public static Map<String, Container> mergeMaps(Map<String, Container>... maps) {
    return Arrays.stream(maps)
            .flatMap(source -> source.values().stream())
            .collect(Collectors.groupingBy(container -> container.getTitle().substring(6)))
            .values().stream()
            .map(group -> new Merge().new Container(
                    group.stream().mapToInt(container -> container.getVal()).sum(),
                    group.get(0).getTitle()))
            // key is derived from the merged container's own title, as before, without the cast
            .collect(Collectors.toMap(merged -> merged.getTitle().substring(6), merged -> merged));
}
/**
 * Picks the incoming scope whose name equals {@code toAccept} and delegates to the
 * superclass with only that scope.
 * <p>
 * When several scopes match, the last one in iteration order is used; when none match,
 * the superclass receives a singleton set containing {@code null}.
 *
 * @param incomingScopes the scopes arriving at this element
 * @return a new {@code Scope} built from the superclass result for the selected scope
 */
@Override
public Scope outgoingScopeFor(Set<Scope> incomingScopes) {
    Scope accepted = null;

    for (Scope candidate : incomingScopes) {
        if (!candidate.getName().equals(toAccept)) {
            continue;
        }

        // no early exit: a later match replaces an earlier one
        accepted = candidate;
    }

    Set<Scope> selected = Collections.singleton(accepted);

    return new Scope(super.outgoingScopeFor(selected));
}
/**
 * Builds an Each over pipe "foo", merges it with pipe "bar" and checks the trace
 * reported by the resulting Merge against this test method's location.
 */
@Test
public void testPipeMerge() {
  Pipe lhs = new Each( new Pipe( "foo" ), new Fields( "a" ), new Identity() );
  Pipe merged = new Merge( lhs, new Pipe( "bar" ) );

  String expected = "cascading.TraceTest.testPipeMerge(TraceTest.java";

  assertEqualsTrace( expected, merged.getTrace() );
}
/**
 * Builds a graph with two branches, "lower" and "upper", each wrapped in two Checkpoints,
 * merged into a Merge named "sink". Every branch name and the sink name is bound to a
 * NonTap declaring the fields "offset" and "line" before the graph is initialized.
 */
public MulitStepFlowGraph() {
  Pipe lowerBranch = new Pipe( "lower" );
  Pipe upperBranch = new Pipe( "upper" );

  // two Checkpoints per branch, created alternately (lower, upper, lower, upper)
  for( int round = 0; round < 2; round++ ) {
    lowerBranch = new Checkpoint( lowerBranch );
    upperBranch = new Checkpoint( upperBranch );
  }

  Pipe tail = new Merge( "sink", lowerBranch, upperBranch );

  Map<String, Tap> sourceTaps = createHashMap();
  sourceTaps.put( lowerBranch.getName(), new NonTap( new Fields( "offset", "line" ) ) );
  sourceTaps.put( upperBranch.getName(), new NonTap( new Fields( "offset", "line" ) ) );

  Map<String, Tap> sinkTaps = createHashMap();
  sinkTaps.put( tail.getName(), new NonTap( new Fields( "offset", "line" ) ) );

  initialize( sourceTaps, sinkTaps, tail );
}
}
// Combine the pruned and rhs pipes with a Merge, then feed the result into a GroupBy keyed on the "id2" field.
Pipe merged = new Merge( pruned, rhs ); Pipe grouped = new GroupBy( merged, new Fields( "id2" ) );
// Combine the pruned and rhs pipes with a Merge, then feed the result into a GroupBy keyed on the "id2" field.
Pipe merged = new Merge( pruned, rhs ); Pipe grouped = new GroupBy( merged, new Fields( "id2" ) );
/**
 * Confirms support for Merge->GroupBy
 * <p>
 * On Tez, this results in an identity node
 * <p>
 * TODO: tez planner logical optimization - remove Merge and push 'merge' to GroupBy
 * <p>
 * Reads the lower and upper input files as delimited ("num", "char") sources, merges the two
 * branches, groups on all fields, and validates a result length of 10 that includes both the
 * lower-case and upper-case "1" rows.
 *
 * @throws Exception if copying the inputs or running the flow fails
 */
@Test
public void testMergeGroupBy() throws Exception {
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );

  Tap sourceLower = getPlatform().getDelimitedFile( new Fields( "num", "char" ), " ", inputFileLower );
  Tap sourceUpper = getPlatform().getDelimitedFile( new Fields( "num", "char" ), " ", inputFileUpper );

  // parameterized instead of a raw Map so only name -> Tap bindings can be registered
  Map<String, Tap> sources = new HashMap<String, Tap>();

  sources.put( "lower", sourceLower );
  sources.put( "upper", sourceUpper );

  Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath(), SinkMode.REPLACE );

  Pipe pipeLower = new Pipe( "lower" );
  Pipe pipeUpper = new Pipe( "upper" );

  Pipe splice = new Merge( "merge", pipeLower, pipeUpper );

  splice = new GroupBy( splice, Fields.ALL );

  Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );

  flow.complete();

  validateLength( flow, 10 );

  Collection<Tuple> results = getSinkAsList( flow );

  assertTrue( "missing value", results.contains( new Tuple( "1\ta" ) ) );
  assertTrue( "missing value", results.contains( new Tuple( "1\tA" ) ) );
}
/**
 * Confirms support for Merge->GroupBy
 * <p>
 * On Tez, this results in an identity node
 * <p>
 * TODO: tez planner logical optimization - remove Merge and push 'merge' to GroupBy
 * <p>
 * Reads the lower and upper input files as delimited ("num", "char") sources, merges the two
 * branches, groups on all fields, and validates a result length of 10 that includes both the
 * lower-case and upper-case "1" rows.
 *
 * @throws Exception if copying the inputs or running the flow fails
 */
@Test
public void testMergeGroupBy() throws Exception {
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );

  Tap sourceLower = getPlatform().getDelimitedFile( new Fields( "num", "char" ), " ", inputFileLower );
  Tap sourceUpper = getPlatform().getDelimitedFile( new Fields( "num", "char" ), " ", inputFileUpper );

  // parameterized instead of a raw Map so only name -> Tap bindings can be registered
  Map<String, Tap> sources = new HashMap<String, Tap>();

  sources.put( "lower", sourceLower );
  sources.put( "upper", sourceUpper );

  Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath(), SinkMode.REPLACE );

  Pipe pipeLower = new Pipe( "lower" );
  Pipe pipeUpper = new Pipe( "upper" );

  Pipe splice = new Merge( "merge", pipeLower, pipeUpper );

  splice = new GroupBy( splice, Fields.ALL );

  Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );

  flow.complete();

  validateLength( flow, 10 );

  Collection<Tuple> results = getSinkAsList( flow );

  assertTrue( "missing value", results.contains( new Tuple( "1\ta" ) ) );
  assertTrue( "missing value", results.contains( new Tuple( "1\tA" ) ) );
}
/**
 * Splits each line of the lower and upper text inputs into ("num", "char"), merges the two
 * branches with a Merge named "merge", and validates a result length of 10 that includes
 * both the lower-case and upper-case "1" rows.
 *
 * @throws Exception if copying the inputs or running the flow fails
 */
@Test
public void testSimpleMerge() throws Exception {
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );

  Tap sourceLower = getPlatform().getTextFile( inputFileLower );
  Tap sourceUpper = getPlatform().getTextFile( inputFileUpper );

  // parameterized instead of a raw Map so only name -> Tap bindings can be registered
  Map<String, Tap> sources = new HashMap<String, Tap>();

  sources.put( "lower", sourceLower );
  sources.put( "upper", sourceUpper );

  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "simplemerge" ), SinkMode.REPLACE );

  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
  Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );

  Pipe splice = new Merge( "merge", pipeLower, pipeUpper );

  Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );

  flow.complete();

  validateLength( flow, 10 );

  Collection<Tuple> results = getSinkAsList( flow );

  assertTrue( "missing value", results.contains( new Tuple( "1\ta" ) ) );
  assertTrue( "missing value", results.contains( new Tuple( "1\tA" ) ) );
}
/**
 * Splits each line of the lower and upper text inputs into ("num", "char"), merges the two
 * branches with a Merge named "merge", and validates a result length of 10 that includes
 * both the lower-case and upper-case "1" rows.
 *
 * @throws Exception if copying the inputs or running the flow fails
 */
@Test
public void testSimpleMerge() throws Exception {
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );

  Tap sourceLower = getPlatform().getTextFile( inputFileLower );
  Tap sourceUpper = getPlatform().getTextFile( inputFileUpper );

  // parameterized instead of a raw Map so only name -> Tap bindings can be registered
  Map<String, Tap> sources = new HashMap<String, Tap>();

  sources.put( "lower", sourceLower );
  sources.put( "upper", sourceUpper );

  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "simplemerge" ), SinkMode.REPLACE );

  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
  Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );

  Pipe splice = new Merge( "merge", pipeLower, pipeUpper );

  Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );

  flow.complete();

  validateLength( flow, 10 );

  Collection<Tuple> results = getSinkAsList( flow );

  assertTrue( "missing value", results.contains( new Tuple( "1\ta" ) ) );
  assertTrue( "missing value", results.contains( new Tuple( "1\tA" ) ) );
}
@Test public void testGroupByInsensitive() throws Exception { getPlatform().copyFromLocal( inputFileLower ); getPlatform().copyFromLocal( inputFileUpper ); Tap sourceLower = getPlatform().getDelimitedFile( new Fields( "num", "char" ), " ", inputFileLower ); Tap sourceUpper = getPlatform().getDelimitedFile( new Fields( "num", "char" ), " ", inputFileUpper ); Map sources = new HashMap(); sources.put( "lower", sourceLower ); sources.put( "upper", sourceUpper ); Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "insensitivegrouping" + NONDETERMINISTIC ), SinkMode.REPLACE ); Pipe pipeLower = new Pipe( "lower" ); Pipe pipeUpper = new Pipe( "upper" ); Pipe merge = new Merge( pipeLower, pipeUpper ); Fields charFields = new Fields( "char" ); charFields.setComparator( "char", new LowerComparator() ); Pipe splice = new GroupBy( "groupby", merge, charFields ); splice = new Every( splice, new Fields( "char" ), new Count() ); Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice ); flow.complete(); // we can't guarantee if the grouping key will be upper or lower validateLength( flow, 5, 1, Pattern.compile( "^\\w+\\s2$" ) ); } }
/**
 * Chains two Merges over three split text inputs (lower, upper, offset) and groups the
 * result on "num". On MapReduce platforms the flow must consist of exactly one step; the
 * sink is validated against a length of 14.
 *
 * @throws Exception if copying the inputs or running the flow fails
 */
@Test
public void testSimpleMergeThreeChainGroup() throws Exception {
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );
  getPlatform().copyFromLocal( inputFileLowerOffset );

  Tap sourceLower = getPlatform().getTextFile( inputFileLower );
  Tap sourceUpper = getPlatform().getTextFile( inputFileUpper );
  Tap sourceLowerOffset = getPlatform().getTextFile( inputFileLowerOffset );

  // parameterized instead of a raw Map so only name -> Tap bindings can be registered
  Map<String, Tap> sources = new HashMap<String, Tap>();

  sources.put( "lower", sourceLower );
  sources.put( "upper", sourceUpper );
  sources.put( "offset", sourceLowerOffset );

  Tap sink = getPlatform().getTextFile( getOutputPath( "simplemergethreechaingroup" ), SinkMode.REPLACE );

  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
  Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
  Pipe pipeOffset = new Each( new Pipe( "offset" ), new Fields( "line" ), splitter );

  Pipe splice = new Merge( "merge", pipeLower, pipeUpper );

  splice = new Merge( splice, pipeOffset );

  splice = new GroupBy( splice, new Fields( "num" ) );

  Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );

  if( getPlatform().isMapReduce() )
    assertEquals( "wrong num jobs", 1, flow.getFlowSteps().size() );

  flow.complete();

  validateLength( flow, 14 );
}
@Test public void testGroupByInsensitive() throws Exception { getPlatform().copyFromLocal( inputFileLower ); getPlatform().copyFromLocal( inputFileUpper ); Tap sourceLower = getPlatform().getDelimitedFile( new Fields( "num", "char" ), " ", inputFileLower ); Tap sourceUpper = getPlatform().getDelimitedFile( new Fields( "num", "char" ), " ", inputFileUpper ); Map sources = new HashMap(); sources.put( "lower", sourceLower ); sources.put( "upper", sourceUpper ); Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "insensitivegrouping" + NONDETERMINISTIC ), SinkMode.REPLACE ); Pipe pipeLower = new Pipe( "lower" ); Pipe pipeUpper = new Pipe( "upper" ); Pipe merge = new Merge( pipeLower, pipeUpper ); Fields charFields = new Fields( "char" ); charFields.setComparator( "char", new LowerComparator() ); Pipe splice = new GroupBy( "groupby", merge, charFields ); splice = new Every( splice, new Fields( "char" ), new Count() ); Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice ); flow.complete(); // we can't guarantee if the grouping key will be upper or lower validateLength( flow, 5, 1, Pattern.compile( "^\\w+\\s2$" ) ); } }
/**
 * Chains two Merges over three split text inputs (lower, upper, offset) and groups the
 * result on "num". On MapReduce platforms the flow must consist of exactly one step; the
 * sink is validated against a length of 14.
 *
 * @throws Exception if copying the inputs or running the flow fails
 */
@Test
public void testSimpleMergeThreeChainGroup() throws Exception {
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );
  getPlatform().copyFromLocal( inputFileLowerOffset );

  Tap sourceLower = getPlatform().getTextFile( inputFileLower );
  Tap sourceUpper = getPlatform().getTextFile( inputFileUpper );
  Tap sourceLowerOffset = getPlatform().getTextFile( inputFileLowerOffset );

  // parameterized instead of a raw Map so only name -> Tap bindings can be registered
  Map<String, Tap> sources = new HashMap<String, Tap>();

  sources.put( "lower", sourceLower );
  sources.put( "upper", sourceUpper );
  sources.put( "offset", sourceLowerOffset );

  Tap sink = getPlatform().getTextFile( getOutputPath( "simplemergethreechaingroup" ), SinkMode.REPLACE );

  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
  Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
  Pipe pipeOffset = new Each( new Pipe( "offset" ), new Fields( "line" ), splitter );

  Pipe splice = new Merge( "merge", pipeLower, pipeUpper );

  splice = new Merge( splice, pipeOffset );

  splice = new GroupBy( splice, new Fields( "num" ) );

  Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );

  if( getPlatform().isMapReduce() )
    assertEquals( "wrong num jobs", 1, flow.getFlowSteps().size() );

  flow.complete();

  validateLength( flow, 14 );
}
/**
 * Chains two Merges over three split text inputs (lower, upper, offset), passes the
 * result through an Identity on ("num", "char"), and validates the sink against a length
 * of 14.
 *
 * @throws Exception if copying the inputs or running the flow fails
 */
@Test
public void testSimpleMergeThreeChain() throws Exception {
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );
  getPlatform().copyFromLocal( inputFileLowerOffset );

  Tap sourceLower = getPlatform().getTextFile( inputFileLower );
  Tap sourceUpper = getPlatform().getTextFile( inputFileUpper );
  Tap sourceLowerOffset = getPlatform().getTextFile( inputFileLowerOffset );

  // parameterized instead of a raw Map so only name -> Tap bindings can be registered
  Map<String, Tap> sources = new HashMap<String, Tap>();

  sources.put( "lower", sourceLower );
  sources.put( "upper", sourceUpper );
  sources.put( "offset", sourceLowerOffset );

  Tap sink = getPlatform().getTextFile( getOutputPath( "simplemergethreechain" ), SinkMode.REPLACE );

  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
  Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
  Pipe pipeOffset = new Each( new Pipe( "offset" ), new Fields( "line" ), splitter );

  Pipe splice = new Merge( "merge", pipeLower, pipeUpper );

  splice = new Merge( splice, pipeOffset );

  splice = new Each( splice, new Fields( "num", "char" ), new Identity() );

  Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );

  flow.complete();

  validateLength( flow, 14 );
}
/**
 * Chains two Merges over three split text inputs (lower, upper, offset), passes the
 * result through an Identity on ("num", "char"), and validates the sink against a length
 * of 14.
 *
 * @throws Exception if copying the inputs or running the flow fails
 */
@Test
public void testSimpleMergeThreeChain() throws Exception {
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );
  getPlatform().copyFromLocal( inputFileLowerOffset );

  Tap sourceLower = getPlatform().getTextFile( inputFileLower );
  Tap sourceUpper = getPlatform().getTextFile( inputFileUpper );
  Tap sourceLowerOffset = getPlatform().getTextFile( inputFileLowerOffset );

  // parameterized instead of a raw Map so only name -> Tap bindings can be registered
  Map<String, Tap> sources = new HashMap<String, Tap>();

  sources.put( "lower", sourceLower );
  sources.put( "upper", sourceUpper );
  sources.put( "offset", sourceLowerOffset );

  Tap sink = getPlatform().getTextFile( getOutputPath( "simplemergethreechain" ), SinkMode.REPLACE );

  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
  Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
  Pipe pipeOffset = new Each( new Pipe( "offset" ), new Fields( "line" ), splitter );

  Pipe splice = new Merge( "merge", pipeLower, pipeUpper );

  splice = new Merge( splice, pipeOffset );

  splice = new Each( splice, new Fields( "num", "char" ), new Identity() );

  Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );

  flow.complete();

  validateLength( flow, 14 );
}
@Test public void testSameSourceMergeThreeChainGroup() throws Exception { getPlatform().copyFromLocal( inputFileLower ); Tap sourceLower = getPlatform().getTextFile( inputFileLower ); Map sources = new HashMap(); sources.put( "split", sourceLower ); Tap sink = getPlatform().getTextFile( getOutputPath( "samemergethreechaingroup" ), SinkMode.REPLACE ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); Pipe pipe = new Pipe( "split" ); Pipe pipeLower = new Each( new Pipe( "lower", pipe ), new Fields( "line" ), splitter ); Pipe pipeUpper = new Each( new Pipe( "upper", pipe ), new Fields( "line" ), splitter ); Pipe pipeOffset = new Each( new Pipe( "offset", pipe ), new Fields( "line" ), splitter ); Pipe splice = new Merge( "merge", pipeLower, pipeUpper ); //put group before merge to test path counts splice = new GroupBy( splice, new Fields( "num" ) ); splice = new Merge( splice, pipeOffset ); // this group has its incoming paths counted, gated by the previous group splice = new GroupBy( splice, new Fields( "num" ) ); Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice ); if( getPlatform().isMapReduce() ) assertEquals( "wrong num jobs", 2, flow.getFlowSteps().size() ); flow.complete(); validateLength( flow, 15 ); }
join = new CoGroup( pipeLower, numLHS, pipeUpper, numRHS, declaredFields, new InnerJoin() ); else if( isMerge && !isGroup ) join = new Merge( pipeLower, pipeUpper ); else join = new HashJoin( pipeLower, numLHS, pipeUpper, numRHS, declaredFields, new InnerJoin() );
@Test public void testSameSourceMergeThreeChainGroup() throws Exception { getPlatform().copyFromLocal( inputFileLower ); Tap sourceLower = getPlatform().getTextFile( inputFileLower ); Map sources = new HashMap(); sources.put( "split", sourceLower ); Tap sink = getPlatform().getTextFile( getOutputPath( "samemergethreechaingroup" ), SinkMode.REPLACE ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); Pipe pipe = new Pipe( "split" ); Pipe pipeLower = new Each( new Pipe( "lower", pipe ), new Fields( "line" ), splitter ); Pipe pipeUpper = new Each( new Pipe( "upper", pipe ), new Fields( "line" ), splitter ); Pipe pipeOffset = new Each( new Pipe( "offset", pipe ), new Fields( "line" ), splitter ); Pipe splice = new Merge( "merge", pipeLower, pipeUpper ); //put group before merge to test path counts splice = new GroupBy( splice, new Fields( "num" ) ); splice = new Merge( splice, pipeOffset ); // this group has its incoming paths counted, gated by the previous group splice = new GroupBy( splice, new Fields( "num" ) ); Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice ); if( getPlatform().isMapReduce() ) assertEquals( "wrong num jobs", 2, flow.getFlowSteps().size() ); flow.complete(); validateLength( flow, 15 ); }
join = new CoGroup( pipeLower, numLHS, pipeUpper, numRHS, declaredFields, new InnerJoin() ); else if( isMerge && !isGroup ) join = new Merge( pipeLower, pipeUpper ); else join = new HashJoin( pipeLower, numLHS, pipeUpper, numRHS, declaredFields, new InnerJoin() );