@Test(expected = InvalidProgramException.class)
public void testJoinKeyInvalidAtomic5() {
    // A "*" key selector on the left side must be rejected, because ArrayList
    // is not a valid atomic key type.
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<ArrayList<Integer>> listInput = env.fromElements(new ArrayList<Integer>());
    DataSet<Integer> intInput = env.fromElements(0, 0, 0);

    // should throw InvalidProgramException
    listInput.join(intInput).where("*").equalTo("*");
}
/**
 * Convert a bipartite graph into an undirected graph that contains only bottom vertices. An edge between two
 * vertices in the new graph will exist only if the original bipartite graph contains a top vertex they are both
 * connected to.
 *
 * <p>The simple projection performs a single join and returns edges containing the bipartite edge values.
 *
 * <p>Note: KB must override .equals(). This requirement may be removed in a future release.
 *
 * @return simple bottom projection of the bipartite graph
 */
public Graph<KB, VVB, Tuple2<EV, EV>> projectionBottomSimple() {
    // Self-join the edge set on the top vertex ID (field 0): every pair of
    // edges sharing a top vertex yields a projected bottom-to-bottom edge.
    DataSet<Edge<KB, Tuple2<EV, EV>>> projectedEdges = edges
        .join(edges)
        .where(0)
        .equalTo(0)
        .with(new ProjectionBottomSimple<>())
        .name("Simple bottom projection");

    return Graph.fromDataSet(bottomVertices, projectedEdges, context);
}
@Test
public void testJoinWithHuge() throws Exception {
    /*
     * Join with Huge
     */
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple3<Integer, Long, String>> small = CollectionDataSets.getSmall3TupleDataSet(env);
    DataSet<Tuple5<Integer, Long, Integer, String, Long>> huge = CollectionDataSets.get5TupleDataSet(env);

    // Hint that the second input is the large one; join on the Long field of each tuple.
    DataSet<Tuple2<String, String>> joined = small
        .joinWithHuge(huge)
        .where(1)
        .equalTo(1)
        .with(new T3T5FlatJoin());

    List<Tuple2<String, String>> result = joined.collect();

    String expected = "Hi,Hallo\n" + "Hello,Hallo Welt\n" + "Hello world,Hallo Welt\n";

    compareResultAsTuples(result, expected);
}
@Test
public void reuseSinglePartitioningJoin2() {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple3<Integer, Integer, Integer>> left =
        env.readCsvFile(IN_FILE).types(Integer.class, Integer.class, Integer.class);
    DataSet<Tuple3<Integer, Integer, Integer>> right =
        env.readCsvFile(IN_FILE).types(Integer.class, Integer.class, Integer.class);

    // Pre-partition the left input on (0,1); the mapper forwards both fields,
    // so the optimizer may reuse that partitioning for the repartition-hash join
    // keyed on (0,1) == (2,1).
    DataSet<Tuple3<Integer, Integer, Integer>> joined = left
        .partitionByHash(0,1)
        .map(new MockMapper()).withForwardedFields("0;1")
        .join(right, JoinOperatorBase.JoinHint.REPARTITION_HASH_FIRST)
        .where(0,1).equalTo(2,1).with(new MockJoin());

    joined.output(new DiscardingOutputFormat<Tuple3<Integer, Integer, Integer>>());

    Plan plan = env.createProgramPlan();
    OptimizedPlan oPlan = compileWithStats(plan);

    SinkPlanNode sink = oPlan.getDataSinks().iterator().next();
    DualInputPlanNode join = (DualInputPlanNode)sink.getInput().getSource();

    checkValidJoinInputProperties(join);
}
@Override
protected void testProgram() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // Generate one million records and self-join them on field 0 == field 1.
    DataSet<Tuple3<Integer, Integer, String>> generated =
        env.createInput(new LargeJoinDataGeneratorInputFormat(1000000));

    generated
        .join(generated)
        .where(0)
        .equalTo(1)
        .with(new Joiner())
        .writeAsText(resultPath);

    env.execute("Local Selfjoin Test Job");
}
public static void connectedComponentsWithCoGroup(String[] args) throws Exception {
    // args: [0]=parallelism, [1]=vertex file, [2]=edge file, [3]=output path, [4]=max iterations
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(Integer.parseInt(args[0]));

    DataSet<Tuple1<Long>> initialVertices =
        env.readCsvFile(args[1]).types(Long.class).name(VERTEX_SOURCE);
    DataSet<Tuple2<Long, Long>> edges =
        env.readCsvFile(args[2]).types(Long.class, Long.class).name(EDGES_SOURCE);

    // Each vertex starts with its own ID as component ID.
    DataSet<Tuple2<Long, Long>> verticesWithId = initialVertices.flatMap(new DummyMapFunction());

    DeltaIteration<Tuple2<Long, Long>, Tuple2<Long, Long>> iteration =
        verticesWithId.iterateDelta(verticesWithId, Integer.parseInt(args[4]), 0)
            .name(ITERATION_NAME);

    // Propagate component IDs along the edges of the workset vertices.
    DataSet<Tuple2<Long, Long>> candidates = iteration.getWorkset()
        .join(edges)
        .where(0).equalTo(0)
        .with(new DummyJoinFunction()).name(JOIN_NEIGHBORS_MATCH);

    // Compare candidates with the solution set and keep only improvements.
    DataSet<Tuple2<Long, Long>> updates = candidates
        .coGroup(iteration.getSolutionSet())
        .where(0).equalTo(0)
        .with(new DummyCoGroupFunction()).name(MIN_ID_AND_UPDATE);

    iteration.closeWith(updates, updates).writeAsCsv(args[3]).name(SINK);

    env.execute();
}
public static DataSet<Tuple2<Long, Long>> doBulkIteration(DataSet<Tuple2<Long, Long>> vertices, DataSet<Tuple2<Long, Long>> edges) { // open a bulk iteration IterativeDataSet<Tuple2<Long, Long>> iteration = vertices.iterate(20); DataSet<Tuple2<Long, Long>> changes = iteration .join(edges).where(0).equalTo(0).with(new Join222()) .groupBy(0).aggregate(Aggregations.MIN, 1) .join(iteration).where(0).equalTo(0) .flatMap(new FlatMapJoin()); // close the bulk iteration return iteration.closeWith(changes); }
public static DataSet<Tuple2<Long, Double>> constructPlan(DataSet<Tuple2<Long, Double>> initialData, int numIterations) {

    // Delta iteration keyed on field 0; initial data serves as both solution set and workset.
    DeltaIteration<Tuple2<Long, Double>, Tuple2<Long, Double>> iteration =
        initialData.iterateDelta(initialData, numIterations, 0);

    // Join the solution set with the duplicated workset, take the per-key minimum,
    // expand, and join against the solution set once more.
    DataSet<Tuple2<Long, Double>> delta = iteration.getSolutionSet()
        .join(iteration.getWorkset().flatMap(new Duplicator()))
            .where(0).equalTo(0).with(new SummingJoin()).name(JOIN_1)
        .groupBy(0).aggregate(Aggregations.MIN, 1).map(new Expander())
        .join(iteration.getSolutionSet())
            .where(0).equalTo(0).with(new SummingJoinProject()).name(JOIN_2);

    // Next workset: per-key sums of the delta.
    DataSet<Tuple2<Long, Double>> changes = delta.groupBy(0).aggregate(Aggregations.SUM, 1);

    return iteration.closeWith(delta, changes);
}
@Test(expected = InvalidProgramException.class)
public void testJoinKeyInvalidAtomic6() {
    // Mirror case of testJoinKeyInvalidAtomic5: here the RIGHT input has the
    // invalid atomic key type (ArrayList), which must be rejected for "*".
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Integer> intInput = env.fromElements(0, 0, 0);
    DataSet<ArrayList<Integer>> listInput = env.fromElements(new ArrayList<Integer>());

    // should throw InvalidProgramException
    intInput.join(listInput).where("*").equalTo("*");
}
@Test
public void reuseSinglePartitioningJoin4() {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple3<Integer, Integer, Integer>> left =
        env.readCsvFile(IN_FILE).types(Integer.class, Integer.class, Integer.class);
    DataSet<Tuple3<Integer, Integer, Integer>> right =
        env.readCsvFile(IN_FILE).types(Integer.class, Integer.class, Integer.class);

    // Left input is pre-partitioned on field 0 only, while the join is keyed
    // on (0,1): the partitioning covers a strict subset of the join keys.
    DataSet<Tuple3<Integer, Integer, Integer>> joined = left
        .partitionByHash(0)
        .map(new MockMapper()).withForwardedFields("0")
        .join(right, JoinOperatorBase.JoinHint.REPARTITION_HASH_FIRST)
        .where(0,1).equalTo(2,1).with(new MockJoin());

    joined.output(new DiscardingOutputFormat<Tuple3<Integer, Integer, Integer>>());

    Plan plan = env.createProgramPlan();
    OptimizedPlan oPlan = compileWithStats(plan);

    SinkPlanNode sink = oPlan.getDataSinks().iterator().next();
    DualInputPlanNode join = (DualInputPlanNode)sink.getInput().getSource();

    checkValidJoinInputProperties(join);
}
private void executeTask(JoinFunction<Tuple2<Integer, Integer>, Tuple2<Integer, Integer>, Tuple2<Integer, Integer>> joiner, boolean slow, int parallelism) throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // Two infinite sources feed a sort-merge join; the job is expected to be
    // canceled, never to finish on its own.
    DataSet<Tuple2<Integer, Integer>> left = env.createInput(new InfiniteIntegerTupleInputFormat(slow));
    DataSet<Tuple2<Integer, Integer>> right = env.createInput(new InfiniteIntegerTupleInputFormat(slow));

    left.join(right, JoinOperatorBase.JoinHint.REPARTITION_SORT_MERGE)
        .where(0)
        .equalTo(0)
        .with(joiner)
        .output(new DiscardingOutputFormat<Tuple2<Integer, Integer>>());

    env.setParallelism(parallelism);

    // cancel after 5s, give up waiting after 10s
    runAndCancelJob(env.createProgramPlan(), 5 * 1000, 10 * 1000);
}
/**
 * Convert a bipartite graph into an undirected graph that contains only top vertices. An edge between two vertices
 * in the new graph will exist only if the original bipartite graph contains a bottom vertex they are both
 * connected to.
 *
 * <p>The simple projection performs a single join and returns edges containing the bipartite edge values.
 *
 * <p>Note: KT must override .equals(). This requirement may be removed in a future release.
 *
 * @return simple top projection of the bipartite graph
 */
public Graph<KT, VVT, Tuple2<EV, EV>> projectionTopSimple() {
    // Self-join the edge set on the bottom vertex ID (field 1): every pair of
    // edges sharing a bottom vertex yields a projected top-to-top edge.
    DataSet<Edge<KT, Tuple2<EV, EV>>> projectedEdges = edges
        .join(edges)
        .where(1)
        .equalTo(1)
        .with(new ProjectionTopSimple<>())
        .name("Simple top projection");

    return Graph.fromDataSet(topVertices, projectedEdges, context);
}
private Plan getTestPlanRightStatic(String strategy) {

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);

    DataSet<Tuple3<Long, Long, Long>> bigInput =
        env.readCsvFile("file://bigFile").types(Long.class, Long.class, Long.class).name("bigFile");
    DataSet<Tuple3<Long, Long, Long>> smallInput =
        env.readCsvFile("file://smallFile").types(Long.class, Long.class, Long.class).name("smallFile");

    IterativeDataSet<Tuple3<Long, Long, Long>> iteration = bigInput.iterate(10);

    // Force a repartition-hash ship strategy; optionally pin the local strategy too.
    Configuration joinStrategy = new Configuration();
    joinStrategy.setString(Optimizer.HINT_SHIP_STRATEGY, Optimizer.HINT_SHIP_STRATEGY_REPARTITION_HASH);
    if (!strategy.equals("")) {
        joinStrategy.setString(Optimizer.HINT_LOCAL_STRATEGY, strategy);
    }

    // Join the iterative (dynamic) path with the static small input.
    DataSet<Tuple3<Long, Long, Long>> inner = iteration
        .join(smallInput)
        .where(0).equalTo(0)
        .with(new DummyJoiner()).name("DummyJoiner")
        .withParameters(joinStrategy);

    DataSet<Tuple3<Long, Long, Long>> output = iteration.closeWith(inner);

    output.output(new DiscardingOutputFormat<Tuple3<Long, Long, Long>>());

    return env.createProgramPlan();
}
@Test
public void testJoinKeyNestedTuples() {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple5<Tuple2<Integer, String>, Long, String, Long, Integer>> nested =
        env.fromCollection(emptyNestedTupleData, nestedTupleTypeInfo);
    DataSet<Tuple5<Integer, Long, String, Long, Integer>> flat =
        env.fromCollection(emptyTupleData, tupleTypeInfo);

    // A nested field expression ("f0.f0") joined against a flat position key
    // must be accepted without throwing.
    try {
        nested.join(flat).where("f0.f0").equalTo(4);
    } catch (Exception e) {
        e.printStackTrace();
        Assert.fail();
    }
}
@Test
public void reuseSinglePartitioningJoin1() {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple3<Integer, Integer, Integer>> left =
        env.readCsvFile(IN_FILE).types(Integer.class, Integer.class, Integer.class);
    DataSet<Tuple3<Integer, Integer, Integer>> right =
        env.readCsvFile(IN_FILE).types(Integer.class, Integer.class, Integer.class);

    // Left input is pre-partitioned on exactly the join keys (0,1) and the
    // mapper forwards them, so the optimizer can reuse the partitioning.
    DataSet<Tuple3<Integer, Integer, Integer>> joined = left
        .partitionByHash(0,1)
        .map(new MockMapper()).withForwardedFields("0;1")
        .join(right, JoinOperatorBase.JoinHint.REPARTITION_HASH_FIRST)
        .where(0,1).equalTo(0,1).with(new MockJoin());

    joined.output(new DiscardingOutputFormat<Tuple3<Integer, Integer, Integer>>());

    Plan plan = env.createProgramPlan();
    OptimizedPlan oPlan = compileWithStats(plan);

    SinkPlanNode sink = oPlan.getDataSinks().iterator().next();
    DualInputPlanNode join = (DualInputPlanNode)sink.getInput().getSource();

    checkValidJoinInputProperties(join);
}
private void executeTaskWithGenerator(
        JoinFunction<Tuple2<Integer, Integer>, Tuple2<Integer, Integer>, Tuple2<Integer, Integer>> joiner,
        int keys,
        int vals,
        int msecsTillCanceling,
        int maxTimeTillCanceled) throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // Two identically-distributed generated inputs feed a sort-merge join
    // that is canceled mid-flight by the runner.
    DataSet<Tuple2<Integer, Integer>> left = env.createInput(new UniformIntTupleGeneratorInputFormat(keys, vals));
    DataSet<Tuple2<Integer, Integer>> right = env.createInput(new UniformIntTupleGeneratorInputFormat(keys, vals));

    left.join(right, JoinOperatorBase.JoinHint.REPARTITION_SORT_MERGE)
        .where(0)
        .equalTo(0)
        .with(joiner)
        .output(new DiscardingOutputFormat<Tuple2<Integer, Integer>>());

    env.setParallelism(PARALLELISM);

    runAndCancelJob(env.createProgramPlan(), msecsTillCanceling, maxTimeTillCanceled);
}
/**
 * Convert a bipartite graph into a graph that contains only bottom vertices. An edge between two vertices in the
 * new graph will exist only if the original bipartite graph contains at least one top vertex they both connect to.
 *
 * <p>The full projection performs three joins and returns edges containing the connecting vertex ID and value,
 * both bottom vertex values, and both bipartite edge values.
 *
 * <p>Note: KB must override .equals(). This requirement may be removed in a future release.
 *
 * @return full bottom projection of the bipartite graph
 */
public Graph<KB, VVB, Projection<KT, VVT, VVB, EV>> projectionBottomFull() {
    // Enrich each edge with both endpoint vertex values, then self-join on
    // the top vertex ID (field 0) to produce the full projection edges.
    DataSet<Tuple5<KT, KB, EV, VVT, VVB>> edgesWithVertices = joinEdgeWithVertices();

    DataSet<Edge<KB, Projection<KT, VVT, VVB, EV>>> projectedEdges = edgesWithVertices
        .join(edgesWithVertices)
        .where(0)
        .equalTo(0)
        .with(new ProjectionBottomFull<>())
        .name("Full bottom projection");

    return Graph.fromDataSet(bottomVertices, projectedEdges, context);
}
@Override
protected void testProgram() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Long> data1 = env.generateSequence(1, 100);
    DataSet<Long> data2 = env.generateSequence(1, 100);

    // First iteration: identity-map until convergence cap (100 supersteps).
    IterativeDataSet<Long> firstIteration = data1.iterate(100);
    DataSet<Long> firstResult = firstIteration.closeWith(firstIteration.map(new IdMapper()));

    // Second iteration consumes the result of the first one inside its step function.
    IterativeDataSet<Long> mainIteration = data2.map(new IdMapper()).iterate(100);
    DataSet<Long> joined = mainIteration
        .join(firstResult)
        .where(new IdKeyExtractor()).equalTo(new IdKeyExtractor())
        .with(new Joiner());
    DataSet<Long> mainResult = mainIteration.closeWith(joined);

    mainResult.output(new DiscardingOutputFormat<Long>());

    env.execute();
}
@Test
public void testJoinKeyWithCustomContainingTuple0() {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<CustomTypeWithTuple> left = env.fromCollection(customTypeWithTupleData);
    DataSet<CustomTypeWithTuple> right = env.fromCollection(customTypeWithTupleData);

    // Joining a nested tuple field inside a POJO against a plain POJO field
    // must be accepted without throwing.
    try {
        left.join(right).where("intByString.f0").equalTo("myInt");
    } catch (Exception e) {
        e.printStackTrace();
        Assert.fail();
    }
}
public static void tcph3(String[] args) throws Exception { ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); env.setParallelism(Integer.parseInt(args[0])); //order id, order status, order data, order prio, ship prio DataSet<Tuple5<Long, String, String, String, Integer>> orders = env.readCsvFile(args[1]) .fieldDelimiter("|").lineDelimiter("\n") .includeFields("101011001").types(Long.class, String.class, String.class, String.class, Integer.class) .name(ORDERS); //order id, extended price DataSet<Tuple2<Long, Double>> lineItems = env.readCsvFile(args[2]) .fieldDelimiter("|").lineDelimiter("\n") .includeFields("100001").types(Long.class, Double.class) .name(LINEITEM); DataSet<Tuple2<Long, Integer>> filterO = orders.flatMap(new FilterO()).name(MAPPER_NAME); DataSet<Tuple3<Long, Integer, Double>> joinLiO = filterO.join(lineItems).where(0).equalTo(0).with(new JoinLiO()).name(JOIN_NAME); DataSet<Tuple3<Long, Integer, Double>> aggLiO = joinLiO.groupBy(0, 1).reduceGroup(new AggLiO()).name(REDUCE_NAME); aggLiO.writeAsCsv(args[3], "\n", "|").name(SINK); env.execute(); }