@Before
public void initiate() {
    ExecutionEnvironment.getExecutionEnvironment().setParallelism(5);
}
@Test
public void testReduceOnNonKeyedDataset() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(4);

    // creates the input data and distributes it evenly among the available downstream tasks
    DataSet<Tuple2<Integer, Boolean>> input = createNonKeyedInput(env);

    List<Tuple2<Integer, Boolean>> actual = input.reduceGroup(new NonKeyedCombReducer()).collect();
    String expected = "10,true\n";

    compareResultAsTuples(actual, expected);
}
/**
 * Ensure that the user can pass a custom configuration object to the LocalEnvironment.
 */
@Test
public void testLocalEnvironmentWithConfig() throws Exception {
    Configuration conf = new Configuration();
    conf.setInteger(TaskManagerOptions.NUM_TASK_SLOTS, PARALLELISM);

    final ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(conf);
    env.setParallelism(ExecutionConfig.PARALLELISM_AUTO_MAX);
    env.getConfig().disableSysoutLogging();

    DataSet<Integer> result = env.createInput(new ParallelismDependentInputFormat())
            .rebalance()
            .mapPartition(new RichMapPartitionFunction<Integer, Integer>() {
                @Override
                public void mapPartition(Iterable<Integer> values, Collector<Integer> out) throws Exception {
                    out.collect(getRuntimeContext().getIndexOfThisSubtask());
                }
            });

    List<Integer> resultCollection = result.collect();

    assertEquals(PARALLELISM, resultCollection.size());
}
@Test
public void testReduceOnKeyedDataset() throws Exception {
    // set up the execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(4);

    // creates the input data and distributes it evenly among the available downstream tasks
    DataSet<Tuple3<String, Integer, Boolean>> input = createKeyedInput(env);

    List<Tuple3<String, Integer, Boolean>> actual = input.groupBy(0).reduceGroup(new KeyedCombReducer()).collect();
    String expected = "k1,6,true\nk2,4,true\n";

    compareResultAsTuples(actual, expected);
}
@Test
public void checkSinglePartitionedSource4() {
    ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);

    DataSource<Tuple3<Long, SomePojo, String>> data = env.fromCollection(tuple3PojoData, tuple3PojoType);

    data.getSplitDataProperties()
            .splitsPartitionedBy("f1");

    data.output(new DiscardingOutputFormat<Tuple3<Long, SomePojo, String>>());

    Plan plan = env.createProgramPlan();

    // submit the plan to the compiler
    OptimizedPlan oPlan = compileNoStats(plan);

    // check the optimized plan
    SinkPlanNode sinkNode = oPlan.getDataSinks().iterator().next();
    SourcePlanNode sourceNode = (SourcePlanNode) sinkNode.getPredecessor();

    GlobalProperties gprops = sourceNode.getGlobalProperties();
    LocalProperties lprops = sourceNode.getLocalProperties();

    Assert.assertTrue((new FieldSet(gprops.getPartitioningFields().toArray())).equals(new FieldSet(1, 2, 3)));
    Assert.assertTrue(gprops.getPartitioning() == PartitioningProperty.ANY_PARTITIONING);
    Assert.assertTrue(lprops.getGroupedFields() == null);
    Assert.assertTrue(lprops.getOrdering() == null);
}
public void run() throws Exception {
    LOG.info("Random seed = {}", RANDOM_SEED);

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.getConfig().disableSysoutLogging();

    for (int parallelism = MAX_PARALLELISM; parallelism > 0; parallelism--) {
        LOG.info("Parallelism = {}", parallelism);

        env.setParallelism(parallelism);

        testReduce(env);
        testGroupedReduce(env);
        testJoin(env);
        testCross(env);
    }
}
@Test
public void testGroupingWithPojoContainingMultiplePojos() throws Exception {
    /*
     * Test grouping with a POJO containing multiple nested POJOs (was a bug).
     */
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);

    DataSet<CollectionDataSets.PojoWithMultiplePojos> ds = CollectionDataSets.getPojoWithMultiplePojos(env);

    // group by the nested field p2.a2
    DataSet<String> reduceDs = ds.groupBy("p2.a2")
            .reduceGroup(new GroupReducer6());

    List<String> result = reduceDs.collect();

    String expected = "b\nccc\nee\n";

    compareResultAsText(result, expected);
}
@Test
public void checkSinglePartitionedSource5() {
    ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);

    DataSource<Tuple3<Long, SomePojo, String>> data = env.fromCollection(tuple3PojoData, tuple3PojoType);

    data.getSplitDataProperties()
            .splitsPartitionedBy("f1.stringField");

    data.output(new DiscardingOutputFormat<Tuple3<Long, SomePojo, String>>());

    Plan plan = env.createProgramPlan();

    // submit the plan to the compiler
    OptimizedPlan oPlan = compileNoStats(plan);

    // check the optimized plan
    SinkPlanNode sinkNode = oPlan.getDataSinks().iterator().next();
    SourcePlanNode sourceNode = (SourcePlanNode) sinkNode.getPredecessor();

    GlobalProperties gprops = sourceNode.getGlobalProperties();
    LocalProperties lprops = sourceNode.getLocalProperties();

    Assert.assertTrue((new FieldSet(gprops.getPartitioningFields().toArray())).equals(new FieldSet(3)));
    Assert.assertTrue(gprops.getPartitioning() == PartitioningProperty.ANY_PARTITIONING);
    Assert.assertTrue(lprops.getGroupedFields() == null);
    Assert.assertTrue(lprops.getOrdering() == null);
}
private Plan getWordCountPlan(File inFile, File outFile, int parallelism) {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(parallelism);
    env.readTextFile(inFile.getAbsolutePath())
            .flatMap(new Tokenizer())
            .groupBy(0)
            .sum(1)
            .writeAsCsv(outFile.getAbsolutePath());
    return env.createProgramPlan();
}
@Test
public void testJavaCollectionsWithinPojos() throws Exception {
    /*
     * Test Java collections within POJOs (i.e., exercise Kryo serialization).
     */
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);

    DataSet<CollectionDataSets.PojoWithCollection> ds = CollectionDataSets.getPojoWithCollection(env);

    // group by the "key" field of the POJO
    DataSet<String> reduceDs = ds.groupBy("key")
            .reduceGroup(new GroupReducer7());

    List<String> result = reduceDs.collect();

    String expected = "callFor key 0 we got: pojo.a=apojo.a=bFor key 0 we got: pojo.a=a2pojo.a=b2\n";

    compareResultAsText(result, expected);
}
@Test
public void checkSinglePartitionedSource6() {
    ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);

    DataSource<Tuple3<Long, SomePojo, String>> data = env.fromCollection(tuple3PojoData, tuple3PojoType);

    data.getSplitDataProperties()
            .splitsPartitionedBy("f1.intField; f2");

    data.output(new DiscardingOutputFormat<Tuple3<Long, SomePojo, String>>());

    Plan plan = env.createProgramPlan();

    // submit the plan to the compiler
    OptimizedPlan oPlan = compileNoStats(plan);

    // check the optimized plan
    SinkPlanNode sinkNode = oPlan.getDataSinks().iterator().next();
    SourcePlanNode sourceNode = (SourcePlanNode) sinkNode.getPredecessor();

    GlobalProperties gprops = sourceNode.getGlobalProperties();
    LocalProperties lprops = sourceNode.getLocalProperties();

    Assert.assertTrue((new FieldSet(gprops.getPartitioningFields().toArray())).equals(new FieldSet(2, 4)));
    Assert.assertTrue(gprops.getPartitioning() == PartitioningProperty.ANY_PARTITIONING);
    Assert.assertTrue(lprops.getGroupedFields() == null);
    Assert.assertTrue(lprops.getOrdering() == null);
}
@Override
protected void testProgram() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(4);

    DataSet<Integer> data = env.fromElements(1, 2, 3, 4, 5, 6, 7, 8);

    IterativeDataSet<Integer> iteration = data.iterate(10);

    DataSet<Integer> result = data.reduceGroup(new PickOneAllReduce()).withBroadcastSet(iteration, "bc");

    final List<Integer> resultList = new ArrayList<Integer>();
    iteration.closeWith(result).output(new LocalCollectionOutputFormat<Integer>(resultList));

    env.execute();

    Assert.assertEquals(8, resultList.get(0).intValue());
}
@Test
public void testDisjointDataflows() {
    try {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(5);
        env.getConfig().disableSysoutLogging();

        // generate two different flows
        env.generateSequence(1, 10).output(new DiscardingOutputFormat<Long>());
        env.generateSequence(1, 10).output(new DiscardingOutputFormat<Long>());

        // run both disjoint flows as a single job; the test only verifies that this succeeds
        env.execute();
    }
    catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
@Test
public void checkSinglePartitionedOrderedSource6() {
    ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);

    DataSource<Tuple3<Long, SomePojo, String>> data = env.fromCollection(tuple3PojoData, tuple3PojoType);

    data.getSplitDataProperties()
            .splitsPartitionedBy("f1.intField")
            .splitsOrderedBy("f1", new Order[]{Order.DESCENDING});

    data.output(new DiscardingOutputFormat<Tuple3<Long, SomePojo, String>>());

    Plan plan = env.createProgramPlan();

    // submit the plan to the compiler
    OptimizedPlan oPlan = compileNoStats(plan);

    // check the optimized plan
    SinkPlanNode sinkNode = oPlan.getDataSinks().iterator().next();
    SourcePlanNode sourceNode = (SourcePlanNode) sinkNode.getPredecessor();

    GlobalProperties gprops = sourceNode.getGlobalProperties();
    LocalProperties lprops = sourceNode.getLocalProperties();

    Assert.assertTrue((new FieldSet(gprops.getPartitioningFields().toArray())).equals(new FieldSet(2)));
    Assert.assertTrue(gprops.getPartitioning() == PartitioningProperty.ANY_PARTITIONING);
    Assert.assertTrue(new FieldSet(lprops.getGroupedFields().toArray()).equals(new FieldSet(1, 2, 3)));
    Assert.assertTrue(lprops.getOrdering() == null);
}
public void executeTask(MapFunction<Integer, Integer> mapper) throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.createInput(new InfiniteIntegerInputFormat(false))
            .map(mapper)
            .output(new DiscardingOutputFormat<Integer>());
    env.setParallelism(PARALLELISM);
    runAndCancelJob(env.createProgramPlan(), 5 * 1000, 10 * 1000);
}
@Test
public void testPartitionPojoInvalidType() {
    try {
        final int parallelism = 4;
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(parallelism);

        DataSet<Pojo> data = env.fromElements(new Pojo())
                .rebalance();

        try {
            data.partitionCustom(new TestPartitionerLong(), "a");
            fail("Should throw an exception");
        }
        catch (InvalidProgramException e) {
            // expected
        }
    }
    catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
/**
 * Tests that the compiler fails for a join program with a replicated data source and changing parallelism.
 */
@Test(expected = CompilerException.class)
public void checkJoinWithReplicatedSourceInputChangingparallelism() {
    ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);

    TupleTypeInfo<Tuple1<String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class);
    ReplicatingInputFormat<Tuple1<String>, FileInputSplit> rif =
            new ReplicatingInputFormat<Tuple1<String>, FileInputSplit>(
                    new TupleCsvInputFormat<Tuple1<String>>(new Path("/some/path"), typeInfo));

    DataSet<Tuple1<String>> source1 = env.createInput(rif, new TupleTypeInfo<Tuple1<String>>(BasicTypeInfo.STRING_TYPE_INFO));
    DataSet<Tuple1<String>> source2 = env.readCsvFile("/some/otherpath").types(String.class);

    DataSink<Tuple2<Tuple1<String>, Tuple1<String>>> out = source1
            .join(source2).where("*").equalTo("*").setParallelism(DEFAULT_PARALLELISM + 2)
            .writeAsText("/some/newpath");

    Plan plan = env.createProgramPlan();

    // submit the plan to the compiler; this is expected to fail with a CompilerException
    OptimizedPlan oPlan = compileNoStats(plan);
}
@Override
protected void testProgram() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(4);

    DataSet<Long> input = env.generateSequence(1, 10);

    DataSet<Long> bc1 = env.generateSequence(1, 5);
    DataSet<Long> bc2 = env.generateSequence(6, 10);

    List<Long> result = input
            .map(new Mapper())
            .withBroadcastSet(bc1.union(bc2), BC_NAME)
            .reduce(new Reducer())
            .collect();

    Assert.assertEquals(Long.valueOf(3025), result.get(0));
}
@Test
public void testPartitionTuplesInvalidType() {
    try {
        final int parallelism = 4;
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(parallelism);

        DataSet<Tuple2<Integer, Integer>> data = env.fromElements(new Tuple2<Integer, Integer>(0, 0))
                .rebalance();

        try {
            data.partitionCustom(new TestPartitionerLong(), 0);
            fail("Should throw an exception");
        }
        catch (InvalidProgramException e) {
            // expected
        }
    }
    catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
/**
 * Tests that the compiler fails for a join program with a replicated data source behind a rebalance.
 */
@Test(expected = CompilerException.class)
public void checkJoinWithReplicatedSourceInputBehindRebalance() {
    ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);

    TupleTypeInfo<Tuple1<String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class);
    ReplicatingInputFormat<Tuple1<String>, FileInputSplit> rif =
            new ReplicatingInputFormat<Tuple1<String>, FileInputSplit>(
                    new TupleCsvInputFormat<Tuple1<String>>(new Path("/some/path"), typeInfo));

    DataSet<Tuple1<String>> source1 = env.createInput(rif, new TupleTypeInfo<Tuple1<String>>(BasicTypeInfo.STRING_TYPE_INFO));
    DataSet<Tuple1<String>> source2 = env.readCsvFile("/some/otherpath").types(String.class);

    DataSink<Tuple2<Tuple1<String>, Tuple1<String>>> out = source1
            .rebalance()
            .join(source2).where("*").equalTo("*")
            .writeAsText("/some/newpath");

    Plan plan = env.createProgramPlan();

    // submit the plan to the compiler; this is expected to fail with a CompilerException
    OptimizedPlan oPlan = compileNoStats(plan);
}