@Override public Graph<LongValue, NullValue, NullValue> generate() { int scale = Long.SIZE - Long.numberOfLeadingZeros(vertexCount - 1); // Edges int cyclesPerEdge = noiseEnabled ? 5 * scale : scale; List<BlockInfo<T>> generatorBlocks = randomGenerableFactory .getRandomGenerables(edgeCount, cyclesPerEdge); DataSet<Edge<LongValue, NullValue>> edges = env .fromCollection(generatorBlocks) .name("Random generators") .rebalance() .setParallelism(parallelism) .name("Rebalance") .flatMap(new GenerateEdges<>(vertexCount, scale, a, b, c, noiseEnabled, noise)) .setParallelism(parallelism) .name("RMat graph edges"); // Vertices DataSet<Vertex<LongValue, NullValue>> vertices = GraphGeneratorUtils.vertexSet(edges, parallelism); // Graph return Graph.fromDataSet(vertices, edges, env); }
/**
 * Registers a hash-partition operation for the Python plan.
 *
 * <p>The parent data set is hash-partitioned on the configured key fields and the
 * keys are then discarded, leaving only the serialized payload bytes.
 *
 * @param info operation descriptor carrying parent/target set IDs, key fields and parallelism
 */
private <K extends Tuple> void createHashPartitionOperation(PythonOperationInfo info) {
    DataSet<Tuple2<K, byte[]>> op1 = sets.getDataSet(info.parentID);
    // toArray(new String[0]) is the preferred idiom over a pre-sized array.
    DataSet<byte[]> result = op1
        .partitionByHash(info.keys.toArray(new String[0])).setParallelism(info.parallelism)
        .map(new KeyDiscarder<K>()).setParallelism(info.parallelism).name("HashPartitionPostStep");
    sets.add(info.setID, result);
}
/**
 * Enforces a re-balancing of the DataSet, i.e., the DataSet is evenly distributed over all
 * parallel instances of the following task. This can help to improve performance in case of
 * heavy data skew and compute intensive operations.
 *
 * <p><b>Important:</b> This operation shuffles the whole DataSet over the network and can take
 * significant amount of time.
 *
 * @return The re-balanced DataSet.
 */
public PartitionOperator<T> rebalance() {
    return new PartitionOperator<>(this, PartitionMethod.REBALANCE, Utils.getCallLocationName());
}
/**
 * Registers a rebalance operation for the Python plan: the parent data set is
 * redistributed evenly with the requested parallelism.
 *
 * @param info operation descriptor carrying parent/target set IDs and parallelism
 */
private void createRebalanceOperation(PythonOperationInfo info) {
    DataSet<?> parent = sets.getDataSet(info.parentID);
    DataSet<?> rebalanced = parent
        .rebalance()
        .setParallelism(info.parallelism)
        .name("Rebalance");
    sets.add(info.setID, rebalanced);
}
// NOTE(review): fragment of a partition-translation method — the enclosing method
// signature is outside this view, so the code is left unchanged.
// TODO confirm: the duplicate 'operatorInfo' declarations suggest these are
// alternative branches of the enclosing method, not sequential statements.

// Branch A: rebalance — no partition keys.
UnaryOperatorInformation<T, T> operatorInfo = new UnaryOperatorInformation<>(getType(), getType());
PartitionOperatorBase<T> rebalancedInput = new PartitionOperatorBase<>(operatorInfo, pMethod, name);
rebalancedInput.setInput(input);
rebalancedInput.setParallelism(getParallelism());

// Branch B: partition on logical key positions, carrying over the data
// distribution, custom partitioner and key ordering.
UnaryOperatorInformation<T, T> operatorInfo = new UnaryOperatorInformation<>(getType(), getType());
PartitionOperatorBase<T> partitionedInput = new PartitionOperatorBase<>(operatorInfo, pMethod, logicalKeyPositions, name);
partitionedInput.setInput(input);
partitionedInput.setParallelism(getParallelism());
partitionedInput.setDistribution(distribution);
partitionedInput.setCustomPartitioner(customPartitioner);
partitionedInput.setOrdering(computeOrdering(pKeys, orders));

// Branch C: keys given as a selector function — delegate to the selector-based translation.
return translateSelectorFunctionPartitioner(selectorKeys, pMethod, name, input, getParallelism(), customPartitioner, orders);
@Test
public void testCustomPartitioningTupleInvalidType() {
    try {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        DataSet<Tuple2<Integer, Integer>> data = env
            .fromElements(new Tuple2<Integer, Integer>(0, 0))
            .rebalance().setParallelism(4);

        try {
            // Partitioner operates on Long keys while the grouping field is Integer.
            data.groupBy(0).withPartitioner(new TestPartitionerLong());
            fail("Should throw an exception");
        } catch (InvalidProgramException ignored) {
            // expected
        }
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
@Test
public void testRangePartitionWithKeyExpression() throws Exception {
    /*
     * Test range partition with key expression
     */
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(3);

    DataSet<POJO> pojos = CollectionDataSets.getDuplicatePojoDataSet(env);

    // Range-partition on a nested key expression with a higher parallelism,
    // then collect the distinct nested long values per partition.
    List<Long> result = pojos
        .partitionByRange("nestedPojo.longNumber").setParallelism(4)
        .mapPartition(new UniqueNestedPojoLongMapper())
        .collect();

    String expected = "10000\n" + "20000\n" + "30000\n";
    compareResultAsText(result, expected);
}
@Test
public void reuseBothPartitioningCoGroup6() {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple3<Integer, Integer, Integer>> set1 =
        env.readCsvFile(IN_FILE).types(Integer.class, Integer.class, Integer.class);
    DataSet<Tuple3<Integer, Integer, Integer>> set2 =
        env.readCsvFile(IN_FILE).types(Integer.class, Integer.class, Integer.class);

    // Both inputs are pre-partitioned on field 2 and the mappers forward it.
    DataSet<Tuple3<Integer, Integer, Integer>> left = set1
        .partitionByHash(2)
        .map(new MockMapper()).withForwardedFields("2");
    DataSet<Tuple3<Integer, Integer, Integer>> right = set2
        .partitionByHash(2)
        .map(new MockMapper()).withForwardedFields("2");

    DataSet<Tuple3<Integer, Integer, Integer>> coGrouped = left
        .coGroup(right)
        .where(0, 2).equalTo(1, 2).with(new MockCoGroup());

    coGrouped.output(new DiscardingOutputFormat<Tuple3<Integer, Integer, Integer>>());

    Plan plan = env.createProgramPlan();
    OptimizedPlan optimizedPlan = compileWithStats(plan);

    SinkPlanNode sink = optimizedPlan.getDataSinks().iterator().next();
    DualInputPlanNode coGroupNode = (DualInputPlanNode) sink.getInput().getSource();

    checkValidCoGroupInputProperties(coGroupNode);
}
/**
 * Registers a rebalance operation: the parent data set is redistributed evenly
 * across the parallel instances of the following task.
 *
 * @param info operation descriptor carrying the parent and target set IDs
 * @throws IOException declared by the operation-creation contract of this binder
 */
private void createRebalanceOperation(OperationInfo info) throws IOException {
    // Use a wildcard instead of the raw DataSet type to avoid unchecked usage,
    // matching the typed variant of this method elsewhere in the project.
    DataSet<?> op = (DataSet<?>) sets.get(info.parentID);
    sets.put(info.setID, op.rebalance().name("Rebalance"));
}
.flatMap(new RichFlatMapFunction<Long, Long>() {
@Test
public void testIncorrectSerializer4() {
    try {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(PARLLELISM);
        env.getConfig().disableSysoutLogging();

        env.generateSequence(1, 10 * PARLLELISM)
            .map(new MapFunction<Long, ConsumesTooLittleSpanning>() {
                @Override
                public ConsumesTooLittleSpanning map(Long value) throws Exception {
                    return new ConsumesTooLittleSpanning();
                }
            })
            .rebalance()
            .output(new DiscardingOutputFormat<ConsumesTooLittleSpanning>());

        env.execute();
    } catch (ProgramInvocationException e) {
        // The faulty serializer is expected to surface as a doubly-wrapped IOException.
        Throwable cause = e.getCause().getCause();
        assertTrue(cause instanceof IOException);
        assertTrue(cause.getMessage().contains("broken serialization"));
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
@Test
public void CoGroupWithDifferentDistributionTest() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple3<Integer, Integer, Integer>> set1 =
        env.readCsvFile(IN_FILE).types(Integer.class, Integer.class, Integer.class);
    DataSet<Tuple3<Integer, Integer, Integer>> set2 =
        env.readCsvFile(IN_FILE).types(Integer.class, Integer.class, Integer.class);

    // Inputs are range-partitioned with two DIFFERENT distributions.
    TestDistribution testDistribution1 = new TestDistribution(3);
    TestDistribution testDistribution2 = new TestDistribution(2);

    DataSet<Tuple3<Integer, Integer, Integer>> coGrouped =
        DataSetUtils.partitionByRange(set1, testDistribution1, 0)
            .coGroup(DataSetUtils.partitionByRange(set2, testDistribution2, 0))
            .where(0).equalTo(0).with(new CoGroupFunc());

    coGrouped.output(new DiscardingOutputFormat<Tuple3<Integer, Integer, Integer>>());

    Plan plan = env.createProgramPlan();
    OptimizedPlan optimizedPlan = compileWithStats(plan);

    SinkPlanNode sink = optimizedPlan.getDataSinks().iterator().next();
    DualInputPlanNode coGroupNode = (DualInputPlanNode) sink.getInput().getSource();

    // Mismatched distributions cannot be reused: both inputs must be re-partitioned by hash.
    Channel input1 = coGroupNode.getInput1();
    Channel input2 = coGroupNode.getInput2();
    assertEquals(ShipStrategyType.PARTITION_HASH, input1.getShipStrategy());
    assertEquals(ShipStrategyType.PARTITION_HASH, input2.getShipStrategy());
}
@Test
public void testCustomPartitioningTupleInvalidType() {
    try {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        DataSet<Pojo2> data = env
            .fromElements(new Pojo2())
            .rebalance().setParallelism(4);

        try {
            // Partitioner operates on Long keys while the grouping field "a" does not.
            data.groupBy("a").withPartitioner(new TestPartitionerLong());
            fail("Should throw an exception");
        } catch (InvalidProgramException ignored) {
            // expected
        }
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
.setParallelism(dop) .name("reduce-" + node.getID());
@Test
public void testHashPartitionWithKeyExpression() throws Exception {
    /*
     * Test hash partition with key expression
     */
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(3);

    DataSet<POJO> pojos = CollectionDataSets.getDuplicatePojoDataSet(env);

    // Hash-partition on a nested key expression with a higher parallelism,
    // then collect the distinct nested long values per partition.
    List<Long> result = pojos
        .partitionByHash("nestedPojo.longNumber").setParallelism(4)
        .mapPartition(new UniqueNestedPojoLongMapper())
        .collect();

    String expected = "10000\n" + "20000\n" + "30000\n";
    compareResultAsText(result, expected);
}
@Test
public void reuseBothPartitioningCoGroup5() {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple3<Integer, Integer, Integer>> set1 =
        env.readCsvFile(IN_FILE).types(Integer.class, Integer.class, Integer.class);
    DataSet<Tuple3<Integer, Integer, Integer>> set2 =
        env.readCsvFile(IN_FILE).types(Integer.class, Integer.class, Integer.class);

    // Inputs are pre-partitioned on different fields (2 vs. 1), each forwarded by its mapper.
    DataSet<Tuple3<Integer, Integer, Integer>> left = set1
        .partitionByHash(2)
        .map(new MockMapper()).withForwardedFields("2");
    DataSet<Tuple3<Integer, Integer, Integer>> right = set2
        .partitionByHash(1)
        .map(new MockMapper()).withForwardedFields("1");

    DataSet<Tuple3<Integer, Integer, Integer>> coGrouped = left
        .coGroup(right)
        .where(0, 2).equalTo(2, 1).with(new MockCoGroup());

    coGrouped.output(new DiscardingOutputFormat<Tuple3<Integer, Integer, Integer>>());

    Plan plan = env.createProgramPlan();
    OptimizedPlan optimizedPlan = compileWithStats(plan);

    SinkPlanNode sink = optimizedPlan.getDataSinks().iterator().next();
    DualInputPlanNode coGroupNode = (DualInputPlanNode) sink.getInput().getSource();

    checkValidCoGroupInputProperties(coGroupNode);
}
// NOTE(review): fragment of a partition-translation method — the enclosing method
// signature is outside this view, so the code is left unchanged.
// TODO confirm: the duplicate 'operatorInfo' declarations suggest these are
// alternative branches of the enclosing method, not sequential statements.

// Branch A: rebalance — no partition keys.
UnaryOperatorInformation<T, T> operatorInfo = new UnaryOperatorInformation<>(getType(), getType());
PartitionOperatorBase<T> rebalancedInput = new PartitionOperatorBase<>(operatorInfo, pMethod, name);
rebalancedInput.setInput(input);
rebalancedInput.setParallelism(getParallelism());

// Branch B: partition on logical key positions, carrying over the data
// distribution, custom partitioner and key ordering.
UnaryOperatorInformation<T, T> operatorInfo = new UnaryOperatorInformation<>(getType(), getType());
PartitionOperatorBase<T> partitionedInput = new PartitionOperatorBase<>(operatorInfo, pMethod, logicalKeyPositions, name);
partitionedInput.setInput(input);
partitionedInput.setParallelism(getParallelism());
partitionedInput.setDistribution(distribution);
partitionedInput.setCustomPartitioner(customPartitioner);
partitionedInput.setOrdering(computeOrdering(pKeys, orders));

// Branch C: keys given as a selector function — delegate to the selector-based translation.
return translateSelectorFunctionPartitioner(selectorKeys, pMethod, name, input, getParallelism(), customPartitioner, orders);
/**
 * Creates the initial working set from the edge candidates.
 *
 * @return initial working set with the expand embeddings
 */
private DataSet<ExpandEmbedding> preProcess() {
    // For incoming expansion, traverse the candidate edges in reverse direction.
    if (direction == ExpandDirection.IN) {
        candidateEdges = candidateEdges
            .map(new ReverseEdgeEmbedding())
            .name(getName() + " - Reverse Edges");
    }

    // Key the candidate edges and hash-partition them for the subsequent join.
    this.candidateEdgeTuples = candidateEdges
        .map(new ExtractKeyedCandidateEdges())
        .name(getName() + " - Create candidate edge tuples")
        .partitionByHash(0)
        .name(getName() + " - Partition edge tuples");

    // Join the input embeddings with the keyed candidate edges to build the
    // initial expand embeddings.
    return input
        .join(candidateEdgeTuples, joinHint)
        .where(new ExtractExpandColumn(expandColumn)).equalTo(0)
        .with(new CreateExpandEmbedding(distinctVertexColumns, distinctEdgeColumns, closingColumn))
        .name(getName() + " - Initial expansion");
}
.flatMap(new ConnectedComponents.UndirectEdge());
@Test
public void testIncorrectSerializer3() {
    try {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(PARLLELISM);
        env.getConfig().disableSysoutLogging();

        env.generateSequence(1, 10 * PARLLELISM)
            .map(new MapFunction<Long, ConsumesTooLittle>() {
                @Override
                public ConsumesTooLittle map(Long value) throws Exception {
                    return new ConsumesTooLittle();
                }
            })
            .rebalance()
            .output(new DiscardingOutputFormat<ConsumesTooLittle>());

        env.execute();
    } catch (JobExecutionException e) {
        // The faulty serializer is expected to surface as a wrapped IOException.
        Throwable cause = e.getCause();
        assertTrue(cause instanceof IOException);
        assertTrue(cause.getMessage().contains("broken serialization"));
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}