@Before
public void initiate() {
    ExecutionEnvironment.getExecutionEnvironment().setParallelism(5);
}
@Test
public void testReduceOnNonKeyedDataset() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(4);

    // creates the input data and distributes it evenly among the available downstream tasks
    DataSet<Tuple2<Integer, Boolean>> input = createNonKeyedInput(env);

    List<Tuple2<Integer, Boolean>> actual = input.reduceGroup(new NonKeyedCombReducer()).collect();
    String expected = "10,true\n";

    compareResultAsTuples(actual, expected);
}
/**
 * Ensure that the user can pass a custom configuration object to the LocalEnvironment.
 */
@Test
public void testLocalEnvironmentWithConfig() throws Exception {
    Configuration conf = new Configuration();
    conf.setInteger(TaskManagerOptions.NUM_TASK_SLOTS, PARALLELISM);

    final ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(conf);
    env.setParallelism(ExecutionConfig.PARALLELISM_AUTO_MAX);
    env.getConfig().disableSysoutLogging();

    DataSet<Integer> result = env.createInput(new ParallelismDependentInputFormat())
            .rebalance()
            .mapPartition(new RichMapPartitionFunction<Integer, Integer>() {
                @Override
                public void mapPartition(Iterable<Integer> values, Collector<Integer> out) throws Exception {
                    out.collect(getRuntimeContext().getIndexOfThisSubtask());
                }
            });

    List<Integer> resultCollection = result.collect();

    assertEquals(PARALLELISM, resultCollection.size());
}
@Test
public void testReduceOnKeyedDataset() throws Exception {
    // set up the execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(4);

    // creates the input data and distributes it evenly among the available downstream tasks
    DataSet<Tuple3<String, Integer, Boolean>> input = createKeyedInput(env);

    List<Tuple3<String, Integer, Boolean>> actual = input.groupBy(0).reduceGroup(new KeyedCombReducer()).collect();
    String expected = "k1,6,true\nk2,4,true\n";

    compareResultAsTuples(actual, expected);
}
@Test
public void checkSinglePartitionedSource4() {
    ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);

    DataSource<Tuple3<Long, SomePojo, String>> data = env.fromCollection(tuple3PojoData, tuple3PojoType);

    data.getSplitDataProperties()
            .splitsPartitionedBy("f1");

    data.output(new DiscardingOutputFormat<Tuple3<Long, SomePojo, String>>());

    Plan plan = env.createProgramPlan();

    // submit the plan to the compiler
    OptimizedPlan oPlan = compileNoStats(plan);

    // check the optimized plan
    SinkPlanNode sinkNode = oPlan.getDataSinks().iterator().next();
    SourcePlanNode sourceNode = (SourcePlanNode) sinkNode.getPredecessor();

    GlobalProperties gprops = sourceNode.getGlobalProperties();
    LocalProperties lprops = sourceNode.getLocalProperties();

    Assert.assertTrue((new FieldSet(gprops.getPartitioningFields().toArray())).equals(new FieldSet(1, 2, 3)));
    Assert.assertTrue(gprops.getPartitioning() == PartitioningProperty.ANY_PARTITIONING);
    Assert.assertTrue(lprops.getGroupedFields() == null);
    Assert.assertTrue(lprops.getOrdering() == null);
}
public void run() throws Exception {
    LOG.info("Random seed = {}", RANDOM_SEED);

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.getConfig().disableSysoutLogging();

    for (int parallelism = MAX_PARALLELISM; parallelism > 0; parallelism--) {
        LOG.info("Parallelism = {}", parallelism);

        env.setParallelism(parallelism);

        testReduce(env);
        testGroupedReduce(env);
        testJoin(env);
        testCross(env);
    }
}
@Test
public void testGroupingWithPojoContainingMultiplePojos() throws Exception {
    /*
     * Test grouping with a POJO containing multiple nested POJOs (was a bug).
     */
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);

    DataSet<CollectionDataSets.PojoWithMultiplePojos> ds = CollectionDataSets.getPojoWithMultiplePojos(env);

    // group by the nested field p2.a2
    DataSet<String> reduceDs = ds.groupBy("p2.a2")
            .reduceGroup(new GroupReducer6());

    List<String> result = reduceDs.collect();

    String expected = "b\nccc\nee\n";

    compareResultAsText(result, expected);
}
@Test
public void checkSinglePartitionedSource5() {
    ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);

    DataSource<Tuple3<Long, SomePojo, String>> data = env.fromCollection(tuple3PojoData, tuple3PojoType);

    data.getSplitDataProperties()
            .splitsPartitionedBy("f1.stringField");

    data.output(new DiscardingOutputFormat<Tuple3<Long, SomePojo, String>>());

    Plan plan = env.createProgramPlan();

    // submit the plan to the compiler
    OptimizedPlan oPlan = compileNoStats(plan);

    // check the optimized plan
    SinkPlanNode sinkNode = oPlan.getDataSinks().iterator().next();
    SourcePlanNode sourceNode = (SourcePlanNode) sinkNode.getPredecessor();

    GlobalProperties gprops = sourceNode.getGlobalProperties();
    LocalProperties lprops = sourceNode.getLocalProperties();

    Assert.assertTrue((new FieldSet(gprops.getPartitioningFields().toArray())).equals(new FieldSet(3)));
    Assert.assertTrue(gprops.getPartitioning() == PartitioningProperty.ANY_PARTITIONING);
    Assert.assertTrue(lprops.getGroupedFields() == null);
    Assert.assertTrue(lprops.getOrdering() == null);
}
private Plan getWordCountPlan(File inFile, File outFile, int parallelism) {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(parallelism);
    env.readTextFile(inFile.getAbsolutePath())
            .flatMap(new Tokenizer())
            .groupBy(0)
            .sum(1)
            .writeAsCsv(outFile.getAbsolutePath());
    return env.createProgramPlan();
}
@Test
public void testJavaCollectionsWithinPojos() throws Exception {
    /*
     * Test Java collections within POJOs (i.e., exercise Kryo serialization).
     */
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);

    DataSet<CollectionDataSets.PojoWithCollection> ds = CollectionDataSets.getPojoWithCollection(env);

    // group by the "key" field of the POJO
    DataSet<String> reduceDs = ds.groupBy("key")
            .reduceGroup(new GroupReducer7());

    List<String> result = reduceDs.collect();

    String expected = "callFor key 0 we got: pojo.a=apojo.a=bFor key 0 we got: pojo.a=a2pojo.a=b2\n";

    compareResultAsText(result, expected);
}
@Test
public void checkSinglePartitionedSource6() {
    ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);

    DataSource<Tuple3<Long, SomePojo, String>> data = env.fromCollection(tuple3PojoData, tuple3PojoType);

    data.getSplitDataProperties()
            .splitsPartitionedBy("f1.intField; f2");

    data.output(new DiscardingOutputFormat<Tuple3<Long, SomePojo, String>>());

    Plan plan = env.createProgramPlan();

    // submit the plan to the compiler
    OptimizedPlan oPlan = compileNoStats(plan);

    // check the optimized plan
    SinkPlanNode sinkNode = oPlan.getDataSinks().iterator().next();
    SourcePlanNode sourceNode = (SourcePlanNode) sinkNode.getPredecessor();

    GlobalProperties gprops = sourceNode.getGlobalProperties();
    LocalProperties lprops = sourceNode.getLocalProperties();

    Assert.assertTrue((new FieldSet(gprops.getPartitioningFields().toArray())).equals(new FieldSet(2, 4)));
    Assert.assertTrue(gprops.getPartitioning() == PartitioningProperty.ANY_PARTITIONING);
    Assert.assertTrue(lprops.getGroupedFields() == null);
    Assert.assertTrue(lprops.getOrdering() == null);
}
@Override
protected void testProgram() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(4);

    DataSet<Integer> data = env.fromElements(1, 2, 3, 4, 5, 6, 7, 8);

    IterativeDataSet<Integer> iteration = data.iterate(10);

    DataSet<Integer> result = data.reduceGroup(new PickOneAllReduce()).withBroadcastSet(iteration, "bc");

    final List<Integer> resultList = new ArrayList<Integer>();
    iteration.closeWith(result).output(new LocalCollectionOutputFormat<Integer>(resultList));

    env.execute();

    Assert.assertEquals(8, resultList.get(0).intValue());
}
@Test
public void testDisjointDataflows() {
    try {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(5);
        env.getConfig().disableSysoutLogging();

        // generate two different flows
        env.generateSequence(1, 10).output(new DiscardingOutputFormat<Long>());
        env.generateSequence(1, 10).output(new DiscardingOutputFormat<Long>());

        // run both disjoint flows as a single job; the test only verifies that this succeeds
        env.execute();
    }
    catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
@Test
public void checkSinglePartitionedOrderedSource6() {
    ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);

    DataSource<Tuple3<Long, SomePojo, String>> data = env.fromCollection(tuple3PojoData, tuple3PojoType);

    data.getSplitDataProperties()
            .splitsPartitionedBy("f1.intField")
            .splitsOrderedBy("f1", new Order[]{Order.DESCENDING});

    data.output(new DiscardingOutputFormat<Tuple3<Long, SomePojo, String>>());

    Plan plan = env.createProgramPlan();

    // submit the plan to the compiler
    OptimizedPlan oPlan = compileNoStats(plan);

    // check the optimized plan
    SinkPlanNode sinkNode = oPlan.getDataSinks().iterator().next();
    SourcePlanNode sourceNode = (SourcePlanNode) sinkNode.getPredecessor();

    GlobalProperties gprops = sourceNode.getGlobalProperties();
    LocalProperties lprops = sourceNode.getLocalProperties();

    Assert.assertTrue((new FieldSet(gprops.getPartitioningFields().toArray())).equals(new FieldSet(2)));
    Assert.assertTrue(gprops.getPartitioning() == PartitioningProperty.ANY_PARTITIONING);
    Assert.assertTrue(new FieldSet(lprops.getGroupedFields().toArray()).equals(new FieldSet(1, 2, 3)));
    Assert.assertTrue(lprops.getOrdering() == null);
}
public void executeTask(MapFunction<Integer, Integer> mapper) throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.createInput(new InfiniteIntegerInputFormat(false))
            .map(mapper)
            .output(new DiscardingOutputFormat<Integer>());
    env.setParallelism(PARALLELISM);
    runAndCancelJob(env.createProgramPlan(), 5 * 1000, 10 * 1000);
}
@Test
public void testPartitionPojoInvalidType() {
    try {
        final int parallelism = 4;
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(parallelism);

        DataSet<Pojo> data = env.fromElements(new Pojo())
                .rebalance();

        try {
            data.partitionCustom(new TestPartitionerLong(), "a");
            fail("Should throw an exception");
        }
        catch (InvalidProgramException e) {
            // expected
        }
    }
    catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
/**
 * Tests that the compiler fails for a join program with a replicated data source and changing parallelism.
 */
@Test(expected = CompilerException.class)
public void checkJoinWithReplicatedSourceInputChangingparallelism() {
    ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);

    TupleTypeInfo<Tuple1<String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class);
    ReplicatingInputFormat<Tuple1<String>, FileInputSplit> rif =
            new ReplicatingInputFormat<Tuple1<String>, FileInputSplit>(
                    new TupleCsvInputFormat<Tuple1<String>>(new Path("/some/path"), typeInfo));

    DataSet<Tuple1<String>> source1 = env.createInput(rif, new TupleTypeInfo<Tuple1<String>>(BasicTypeInfo.STRING_TYPE_INFO));
    DataSet<Tuple1<String>> source2 = env.readCsvFile("/some/otherpath").types(String.class);

    DataSink<Tuple2<Tuple1<String>, Tuple1<String>>> out = source1
            .join(source2).where("*").equalTo("*").setParallelism(DEFAULT_PARALLELISM + 2)
            .writeAsText("/some/newpath");

    Plan plan = env.createProgramPlan();

    // submit the plan to the compiler; this is expected to fail with a CompilerException
    OptimizedPlan oPlan = compileNoStats(plan);
}
@Override
protected void testProgram() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(4);

    DataSet<Long> input = env.generateSequence(1, 10);

    DataSet<Long> bc1 = env.generateSequence(1, 5);
    DataSet<Long> bc2 = env.generateSequence(6, 10);

    List<Long> result = input
            .map(new Mapper())
            .withBroadcastSet(bc1.union(bc2), BC_NAME)
            .reduce(new Reducer())
            .collect();

    Assert.assertEquals(Long.valueOf(3025), result.get(0));
}
@Test
public void testPartitionTuplesInvalidType() {
    try {
        final int parallelism = 4;
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(parallelism);

        DataSet<Tuple2<Integer, Integer>> data = env.fromElements(new Tuple2<Integer, Integer>(0, 0))
                .rebalance();

        try {
            data.partitionCustom(new TestPartitionerLong(), 0);
            fail("Should throw an exception");
        }
        catch (InvalidProgramException e) {
            // expected
        }
    }
    catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
/**
 * Tests that the compiler fails for a join program with a replicated data source behind a rebalance.
 */
@Test(expected = CompilerException.class)
public void checkJoinWithReplicatedSourceInputBehindRebalance() {
    ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);

    TupleTypeInfo<Tuple1<String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class);
    ReplicatingInputFormat<Tuple1<String>, FileInputSplit> rif =
            new ReplicatingInputFormat<Tuple1<String>, FileInputSplit>(
                    new TupleCsvInputFormat<Tuple1<String>>(new Path("/some/path"), typeInfo));

    DataSet<Tuple1<String>> source1 = env.createInput(rif, new TupleTypeInfo<Tuple1<String>>(BasicTypeInfo.STRING_TYPE_INFO));
    DataSet<Tuple1<String>> source2 = env.readCsvFile("/some/otherpath").types(String.class);

    DataSink<Tuple2<Tuple1<String>, Tuple1<String>>> out = source1
            .rebalance()
            .join(source2).where("*").equalTo("*")
            .writeAsText("/some/newpath");

    Plan plan = env.createProgramPlan();

    // submit the plan to the compiler; this is expected to fail with a CompilerException
    OptimizedPlan oPlan = compileNoStats(plan);
}