@Test public void testJoinKeyMixedKeySelectorTurned() { final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<CustomType> ds1 = env.fromCollection(customTypeData); DataSet<CustomType> ds2 = env.fromCollection(customTypeData); try { ds1.join(ds2).where(new KeySelector<CustomType, Integer>() { @Override public Integer getKey(CustomType value) throws Exception { return value.myInt; } }).equalTo("myInt"); } catch (Exception e) { e.printStackTrace(); Assert.fail(); } }
private Plan getTestPlanLeftStatic(String strategy) { ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); env.setParallelism(DEFAULT_PARALLELISM); @SuppressWarnings("unchecked") DataSet<Tuple3<Long, Long, Long>> bigInput = env.fromElements(new Tuple3<Long, Long, Long>(1L, 2L, 3L), new Tuple3<Long, Long, Long>(1L, 2L, 3L),new Tuple3<Long, Long, Long>(1L, 2L, 3L)).name("Big"); @SuppressWarnings("unchecked") DataSet<Tuple3<Long, Long, Long>> smallInput = env.fromElements(new Tuple3<Long, Long, Long>(1L, 2L, 3L)).name("Small"); IterativeDataSet<Tuple3<Long, Long, Long>> iteration = bigInput.iterate(10); Configuration joinStrategy = new Configuration(); joinStrategy.setString(Optimizer.HINT_LOCAL_STRATEGY, strategy); DataSet<Tuple3<Long, Long, Long>> inner = smallInput.join(iteration).where(0).equalTo(0).with(new DummyJoiner()).name("DummyJoiner").withParameters(joinStrategy); DataSet<Tuple3<Long, Long, Long>> output = iteration.closeWith(inner); output.output(new DiscardingOutputFormat<Tuple3<Long,Long,Long>>()); return env.createProgramPlan(); }
@Test public void testJoinNestedPojoAgainstTupleSelectedUsingInteger() throws Exception { /* * Join nested pojo against tuple (selected as an integer) */ final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<POJO> ds1 = CollectionDataSets.getSmallPojoDataSet(env); DataSet<Tuple7<Integer, String, Integer, Integer, Long, String, Long>> ds2 = CollectionDataSets.getSmallTuplebasedDataSet(env); DataSet<Tuple2<POJO, Tuple7<Integer, String, Integer, Integer, Long, String, Long>>> joinDs = ds1.join(ds2).where("nestedPojo.longNumber").equalTo(6); // <--- difference! List<Tuple2<POJO, Tuple7<Integer, String, Integer, Integer, Long, String, Long>>> result = joinDs.collect(); String expected = "1 First (10,100,1000,One) 10000,(1,First,10,100,1000,One,10000)\n" + "2 Second (20,200,2000,Two) 20000,(2,Second,20,200,2000,Two,20000)\n" + "3 Third (30,300,3000,Three) 30000,(3,Third,30,300,3000,Three,30000)\n"; compareResultAsTuples(result, expected); }
@Test public void testJoinNestedPojoAgainstTupleSelectedUsingString() throws Exception { /* * Join nested pojo against tuple (selected using a string) */ final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<POJO> ds1 = CollectionDataSets.getSmallPojoDataSet(env); DataSet<Tuple7<Integer, String, Integer, Integer, Long, String, Long>> ds2 = CollectionDataSets.getSmallTuplebasedDataSet(env); DataSet<Tuple2<POJO, Tuple7<Integer, String, Integer, Integer, Long, String, Long>>> joinDs = ds1.join(ds2).where("nestedPojo.longNumber").equalTo("f6"); List<Tuple2<POJO, Tuple7<Integer, String, Integer, Integer, Long, String, Long>>> result = joinDs.collect(); String expected = "1 First (10,100,1000,One) 10000,(1,First,10,100,1000,One,10000)\n" + "2 Second (20,200,2000,Two) 20000,(2,Second,20,200,2000,Two,20000)\n" + "3 Third (30,300,3000,Three) 30000,(3,Third,30,300,3000,Three,30000)\n"; compareResultAsTuples(result, expected); }
private Plan getTestPlanRightStatic(String strategy) { ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); env.setParallelism(DEFAULT_PARALLELISM); DataSet<Tuple3<Long, Long, Long>> bigInput = env.readCsvFile("file://bigFile").types(Long.class, Long.class, Long.class).name("bigFile"); DataSet<Tuple3<Long, Long, Long>> smallInput = env.readCsvFile("file://smallFile").types(Long.class, Long.class, Long.class).name("smallFile"); IterativeDataSet<Tuple3<Long, Long, Long>> iteration = bigInput.iterate(10); Configuration joinStrategy = new Configuration(); joinStrategy.setString(Optimizer.HINT_SHIP_STRATEGY, Optimizer.HINT_SHIP_STRATEGY_REPARTITION_HASH); if(!strategy.equals("")) { joinStrategy.setString(Optimizer.HINT_LOCAL_STRATEGY, strategy); } DataSet<Tuple3<Long, Long, Long>> inner = iteration.join(smallInput).where(0).equalTo(0).with(new DummyJoiner()).name("DummyJoiner").withParameters(joinStrategy); DataSet<Tuple3<Long, Long, Long>> output = iteration.closeWith(inner); output.output(new DiscardingOutputFormat<Tuple3<Long, Long, Long>>()); return env.createProgramPlan(); }
private DataSet<Tuple5<KT, KB, EV, VVT, VVB>> joinEdgeWithVertices() { return edges .join(topVertices, JoinHint.REPARTITION_HASH_SECOND) .where(0) .equalTo(0) .projectFirst(0, 1, 2) .<Tuple4<KT, KB, EV, VVT>>projectSecond(1) .name("Edge with vertex") .join(bottomVertices, JoinHint.REPARTITION_HASH_SECOND) .where(1) .equalTo(0) .projectFirst(0, 1, 2, 3) .<Tuple5<KT, KB, EV, VVT, VVB>>projectSecond(1) .name("Edge with vertices"); }
@Override protected void testProgram() throws Exception { ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<Long> data1 = env.generateSequence(1, 100); DataSet<Long> data2 = env.generateSequence(1, 100); IterativeDataSet<Long> firstIteration = data1.iterate(100); DataSet<Long> firstResult = firstIteration.closeWith(firstIteration.map(new IdMapper())); IterativeDataSet<Long> mainIteration = data2.map(new IdMapper()).iterate(100); DataSet<Long> joined = mainIteration.join(firstResult) .where(new IdKeyExtractor()).equalTo(new IdKeyExtractor()) .with(new Joiner()); DataSet<Long> mainResult = mainIteration.closeWith(joined); mainResult.output(new DiscardingOutputFormat<Long>()); env.execute(); }
@Test public void noPreviousPartitioningJoin1() { ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<Tuple3<Integer, Integer, Integer>> set1 = env.readCsvFile(IN_FILE).types(Integer.class, Integer.class, Integer.class); DataSet<Tuple3<Integer, Integer, Integer>> set2 = env.readCsvFile(IN_FILE).types(Integer.class, Integer.class, Integer.class); DataSet<Tuple3<Integer, Integer, Integer>> joined = set1 .join(set2, JoinOperatorBase.JoinHint.REPARTITION_HASH_FIRST) .where(0,1).equalTo(0,1).with(new MockJoin()); joined.output(new DiscardingOutputFormat<Tuple3<Integer, Integer, Integer>>()); Plan plan = env.createProgramPlan(); OptimizedPlan oPlan = compileWithStats(plan); SinkPlanNode sink = oPlan.getDataSinks().iterator().next(); DualInputPlanNode join = (DualInputPlanNode)sink.getInput().getSource(); checkValidJoinInputProperties(join); }
@Test public void testJoinProjection5() { final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds1 = env.fromCollection(emptyTupleData, tupleTypeInfo); DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds2 = env.fromCollection(emptyTupleData, tupleTypeInfo); // should work try { ds1.join(ds2).where(0).equalTo(0) .projectSecond(0, 2) .projectFirst(1, 4) .projectFirst(1); } catch (Exception e) { Assert.fail(); } }
@Test public void testJoinProjection4() { final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds1 = env.fromCollection(emptyTupleData, tupleTypeInfo); DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds2 = env.fromCollection(emptyTupleData, tupleTypeInfo); // should work try { ds1.join(ds2).where(0).equalTo(0) .projectFirst(0, 2) .projectSecond(1, 4) .projectFirst(1); } catch (Exception e) { Assert.fail(); } }
@Test public void noPreviousPartitioningJoin2() { ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<Tuple3<Integer, Integer, Integer>> set1 = env.readCsvFile(IN_FILE).types(Integer.class, Integer.class, Integer.class); DataSet<Tuple3<Integer, Integer, Integer>> set2 = env.readCsvFile(IN_FILE).types(Integer.class, Integer.class, Integer.class); DataSet<Tuple3<Integer, Integer, Integer>> joined = set1 .join(set2, JoinOperatorBase.JoinHint.REPARTITION_HASH_FIRST) .where(0,1).equalTo(2,1).with(new MockJoin()); joined.output(new DiscardingOutputFormat<Tuple3<Integer, Integer, Integer>>()); Plan plan = env.createProgramPlan(); OptimizedPlan oPlan = compileWithStats(plan); SinkPlanNode sink = oPlan.getDataSinks().iterator().next(); DualInputPlanNode join = (DualInputPlanNode)sink.getInput().getSource(); checkValidJoinInputProperties(join); }
/** * Apply a filtering function to the graph and return a sub-graph that * satisfies the predicates only for the vertices. * * @param vertexFilter the filter function for vertices. * @return the resulting sub-graph. */ public Graph<K, VV, EV> filterOnVertices(FilterFunction<Vertex<K, VV>> vertexFilter) { DataSet<Vertex<K, VV>> filteredVertices = this.vertices.filter(vertexFilter); DataSet<Edge<K, EV>> remainingEdges = this.edges.join(filteredVertices) .where(0).equalTo(0).with(new ProjectEdge<>()) .join(filteredVertices).where(1).equalTo(0) .with(new ProjectEdge<>()).name("Filter on vertices"); return new Graph<>(filteredVertices, remainingEdges, this.context); }
/** * This method allows access to the graph's edge values along with its source and target vertex values. * * @return a triplet DataSet consisting of (srcVertexId, trgVertexId, srcVertexValue, trgVertexValue, edgeValue) */ public DataSet<Triplet<K, VV, EV>> getTriplets() { return this.getVertices() .join(this.getEdges()).where(0).equalTo(0) .with(new ProjectEdgeWithSrcValue<>()) .name("Project edge with source value") .join(this.getVertices()).where(1).equalTo(0) .with(new ProjectEdgeWithVertexValues<>()) .name("Project edge with vertex values"); }
@Test public void testJoinProjection7() { final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds1 = env.fromCollection(emptyTupleData, tupleTypeInfo); DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds2 = env.fromCollection(emptyTupleData, tupleTypeInfo); // should work try { ds1.join(ds2).where(0).equalTo(0) .projectSecond() .projectFirst(1, 4); } catch (Exception e) { Assert.fail(); } }
@Test public void testJoinProjection3() { final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds1 = env.fromCollection(emptyTupleData, tupleTypeInfo); DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds2 = env.fromCollection(emptyTupleData, tupleTypeInfo); // should work try { ds1.join(ds2).where(0).equalTo(0) .projectFirst(0) .projectSecond(3); } catch (Exception e) { Assert.fail(); } }
@Test public void testJoinProjection27() { final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds1 = env.fromCollection(emptyTupleData, tupleTypeInfo); DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds2 = env.fromCollection(emptyTupleData, tupleTypeInfo); // should work try { ds1.join(ds2).where(0).equalTo(0) .projectSecond() .projectFirst(1, 4); } catch (Exception e) { Assert.fail(); } }
private void executeTaskWithGenerator( JoinFunction<Tuple2<Integer, Integer>, Tuple2<Integer, Integer>, Tuple2<Integer, Integer>> joiner, int keys, int vals, int msecsTillCanceling, int maxTimeTillCanceled) throws Exception { ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<Tuple2<Integer, Integer>> input1 = env.createInput(new UniformIntTupleGeneratorInputFormat(keys, vals)); DataSet<Tuple2<Integer, Integer>> input2 = env.createInput(new UniformIntTupleGeneratorInputFormat(keys, vals)); input1.join(input2, JoinOperatorBase.JoinHint.REPARTITION_SORT_MERGE) .where(0) .equalTo(0) .with(joiner) .output(new DiscardingOutputFormat<Tuple2<Integer, Integer>>()); env.setParallelism(PARALLELISM); runAndCancelJob(env.createProgramPlan(), msecsTillCanceling, maxTimeTillCanceled); }
@Test(expected = InvalidProgramException.class) public void testJoinKeyMixedWrong() { final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<CustomType> ds1 = env.fromCollection(customTypeData); DataSet<CustomType> ds2 = env.fromCollection(customTypeData); // wrongly mix String and Integer ds1.join(ds2).where("myString").equalTo(new KeySelector<CustomType, Integer>() { @Override public Integer getKey(CustomType value) throws Exception { return value.myInt; } }); }
@Test public void testJoinKeyNestedTuplesWithCustom() { final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<Tuple5<CustomType, Long, String, Long, Integer>> ds1 = env.fromCollection(emptyNestedCustomTupleData, nestedCustomTupleTypeInfo); DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds2 = env.fromCollection(emptyTupleData, tupleTypeInfo); try { TypeInformation<?> t = ds1.join(ds2).where("f0.myInt").equalTo(4).getType(); assertTrue("not a composite type", t instanceof CompositeType); } catch (Exception e) { e.printStackTrace(); Assert.fail(); } }
@Test public void testJoinProjection2() { final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds1 = env.fromCollection(emptyTupleData, tupleTypeInfo); DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds2 = env.fromCollection(emptyTupleData, tupleTypeInfo); // should work try { ds1.join(ds2).where(0).equalTo(0) .projectFirst(0, 3); } catch (Exception e) { Assert.fail(); } }