/**
 * Returns a new set containing the first n elements in this grouped {@link DataSet}.
 *
 * @param n The desired number of elements for each group.
 * @return A GroupReduceOperator that represents the DataSet containing the elements.
 */
public GroupReduceOperator<T, T> first(int n) {
    if (n < 1) {
        throw new InvalidProgramException("Parameter n of first(n) must be at least 1.");
    }

    return reduceGroup(new FirstReducer<T>(n));
}
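// A minimal, hypothetical usage sketch of first(n) on a grouped DataSet. The element
// values, grouping key, and helper method name below are made up for illustration and
// are not part of this module.
private static void firstNUsageSketch(ExecutionEnvironment env) throws Exception {
    DataSet<Tuple2<String, Integer>> data = env.fromElements(
        Tuple2.of("a", 1), Tuple2.of("a", 2), Tuple2.of("a", 3), Tuple2.of("b", 4));

    // keeps at most two (arbitrary) elements per key group
    DataSet<Tuple2<String, Integer>> firstTwoPerGroup = data
        .groupBy(0)
        .first(2);

    firstTwoPerGroup.print();
}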
private <IN, OUT> DataSet<OUT> applyReduceOperation(UnsortedGrouping<IN> op1, PythonOperationInfo info, TypeInformation<OUT> type) {
    return op1
        .reduceGroup(new IdentityGroupReduce<IN>()).setCombinable(false).setParallelism(info.parallelism).name("PythonReducePreStep")
        .mapPartition(new PythonMapPartition<IN, OUT>(operatorConfig, info.envID, info.setID, type))
        .setParallelism(info.parallelism).name(info.name);
}
private <IN, OUT> DataSet<OUT> applyGroupReduceOperation(UnsortedGrouping<IN> op1, PythonOperationInfo info, TypeInformation<OUT> type) {
    return op1
        .reduceGroup(new IdentityGroupReduce<IN>()).setCombinable(false).setParallelism(info.parallelism).name("PythonGroupReducePreStep")
        .mapPartition(new PythonMapPartition<IN, OUT>(operatorConfig, info.envID, info.setID, type))
        .setParallelism(info.parallelism).name(info.name);
}
@Test
public void testForkingReduceOnKeyedDataset() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(4);

    // creates the input data and distributes them evenly among the available downstream tasks
    DataSet<Tuple3<String, Integer, Boolean>> input = createKeyedInput(env);

    UnsortedGrouping<Tuple3<String, Integer, Boolean>> counts = input.groupBy(0);

    DataSet<Tuple3<String, Integer, Boolean>> r1 = counts.reduceGroup(new KeyedCombReducer());
    DataSet<Tuple3<String, Integer, Boolean>> r2 = counts.reduceGroup(new KeyedGroupCombReducer());

    List<Tuple3<String, Integer, Boolean>> actual = r1.union(r2).collect();
    String expected = "k1,6,true\n" +
        "k2,4,true\n" +
        "k1,6,true\n" +
        "k2,4,true\n";

    compareResultAsTuples(actual, expected);
}
@Test
public void testForkingReduceOnKeyedDatasetWithSelection() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(4);

    // creates the input data and distributes them evenly among the available downstream tasks
    DataSet<Tuple3<String, Integer, Boolean>> input = createKeyedInput(env);

    UnsortedGrouping<Tuple3<String, Integer, Boolean>> counts = input.groupBy(new KeySelectorX());

    DataSet<Tuple3<String, Integer, Boolean>> r1 = counts.reduceGroup(new KeyedCombReducer());
    DataSet<Tuple3<String, Integer, Boolean>> r2 = counts.reduceGroup(new KeyedGroupCombReducer());

    List<Tuple3<String, Integer, Boolean>> actual = r1.union(r2).collect();
    String expected = "k1,6,true\n" +
        "k2,4,true\n" +
        "k1,6,true\n" +
        "k2,4,true\n";

    compareResultAsTuples(actual, expected);
}
@Test
public void testTupleContainingPojosAndRegularFields() throws Exception {
    /*
     * Test Tuple containing pojos and regular fields
     */
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple3<Integer, CrazyNested, POJO>> ds = CollectionDataSets.getTupleContainingPojos(env);

    DataSet<Integer> reduceDs = ds.groupBy("f0", "f1.*") // nested full tuple selection
        .reduceGroup(new GroupReducer4());
    List<Integer> result = reduceDs.collect();

    String expected = "3\n1\n";

    compareResultAsText(result, expected);
}
@Test
public void testGroupReduceWithAtomicValue() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Integer> ds = env.fromElements(1, 1, 2, 3, 4);
    DataSet<Integer> reduceDs = ds.groupBy("*").reduceGroup(new GroupReduceFunction<Integer, Integer>() {
        @Override
        public void reduce(Iterable<Integer> values, Collector<Integer> out) throws Exception {
            out.collect(values.iterator().next());
        }
    });

    List<Integer> result = reduceDs.collect();

    String expected = "1\n" +
        "2\n" +
        "3\n" +
        "4";

    compareResultAsText(result, expected);
}
@Test
public void testPojoContainigWritableAndTuples() throws Exception {
    /*
     * Test Pojo containing a Writable and Tuples
     */
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<PojoContainingTupleAndWritable> ds = CollectionDataSets.getPojoContainingTupleAndWritable(env);
    DataSet<Integer> reduceDs = ds.groupBy("hadoopFan", "theTuple.*") // full tuple selection
        .reduceGroup(new GroupReducer3());

    List<Integer> result = reduceDs.collect();
    String expected = "1\n5\n";

    compareResultAsText(result, expected);
}
@Test
public void testPojoExtendingFromTupleWithCustomFields() throws Exception {
    /*
     * Test Pojo extending from tuple WITH custom fields
     */
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<FromTupleWithCTor> ds = CollectionDataSets.getPojoExtendingFromTuple(env);
    DataSet<Integer> reduceDs = ds.groupBy("special", "f2")
        .reduceGroup(new GroupReducer2());

    List<Integer> result = reduceDs.collect();
    String expected = "3\n2\n";

    compareResultAsText(result, expected);
}
@Test
public void testCorrectnessofGroupReduceOnTupleContainingPrimitiveByteArrayWithKeyFieldSelectors() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple2<byte[], Integer>> ds = CollectionDataSets.getTuple2WithByteArrayDataSet(env);
    DataSet<Integer> reduceDs = ds.groupBy(0).reduceGroup(new ByteArrayGroupReduce());

    List<Integer> result = reduceDs.collect();

    String expected = "0\n" +
        "1\n" +
        "2\n" +
        "3\n" +
        "4\n";

    compareResultAsText(result, expected);
}
@Test
public void testDeepNesting() throws Exception {
    /*
     * Deep nesting test
     * + null value in pojo
     */
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<CrazyNested> ds = CollectionDataSets.getCrazyNestedDataSet(env);
    DataSet<Tuple2<String, Integer>> reduceDs = ds.groupBy("nestLvl1.nestLvl2.nestLvl3.nestLvl4.f1nal")
        .reduceGroup(new GroupReducer1());

    List<Tuple2<String, Integer>> result = reduceDs.collect();

    String expected = "aa,1\nbb,2\ncc,3\n";

    compareResultAsTuples(result, expected);
}
@Test
public void testReduceOnKeyedDataset() throws Exception {
    // set up the execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(4);

    // creates the input data and distributes them evenly among the available downstream tasks
    DataSet<Tuple3<String, Integer, Boolean>> input = createKeyedInput(env);

    List<Tuple3<String, Integer, Boolean>> actual = input.groupBy(0).reduceGroup(new KeyedCombReducer()).collect();
    String expected = "k1,6,true\nk2,4,true\n";

    compareResultAsTuples(actual, expected);
}
@Override
protected void testProgram() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<String> input = env.readTextFile(dataPath);

    input.flatMap(new TokenizeLine())
        .groupBy(0)
        .reduceGroup(new CountWords())
        .writeAsCsv(resultPath, "\n", " ");

    this.result = env.execute();
}
@Test
public void testJavaCollectionsWithinPojos() throws Exception {
    /*
     * Test Java collections within pojos ( == test kryo)
     */
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);

    DataSet<CollectionDataSets.PojoWithCollection> ds = CollectionDataSets.getPojoWithCollection(env);
    // f0.f0 is first integer
    DataSet<String> reduceDs = ds.groupBy("key")
        .reduceGroup(new GroupReducer7());
    List<String> result = reduceDs.collect();

    String expected = "callFor key 0 we got: pojo.a=apojo.a=bFor key 0 we got: pojo.a=a2pojo.a=b2\n";

    compareResultAsText(result, expected);
}
@Test
public void testGroupingWithPojoContainingMultiplePojos() throws Exception {
    /*
     * Test grouping with pojo containing multiple pojos (was a bug)
     */
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);

    DataSet<CollectionDataSets.PojoWithMultiplePojos> ds = CollectionDataSets.getPojoWithMultiplePojos(env);
    // f0.f0 is first integer
    DataSet<String> reduceDs = ds.groupBy("p2.a2")
        .reduceGroup(new GroupReducer6());
    List<String> result = reduceDs.collect();

    String expected = "b\nccc\nee\n";

    compareResultAsText(result, expected);
}
@Test
public void testStandardCountingWithCombiner() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple2<IntWritable, IntWritable>> ds = HadoopTestData.getKVPairDataSet(env)
        .map(new Mapper1());

    DataSet<Tuple2<IntWritable, IntWritable>> counts = ds
        .groupBy(0)
        .reduceGroup(new HadoopReduceCombineFunction<IntWritable, IntWritable, IntWritable, IntWritable>(
            new SumReducer(), new SumReducer()));

    String resultPath = tempFolder.newFile().toURI().toString();

    counts.writeAsText(resultPath);
    env.execute();

    String expected = "(0,5)\n" +
        "(1,6)\n" +
        "(2,6)\n" +
        "(3,4)\n";

    compareResultsByLinesInMemory(expected, resultPath);
}
@Test
public void testReduceOnKeyedDatasetWithSelector() throws Exception {
    // set up the execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(4);

    // creates the input data and distributes them evenly among the available downstream tasks
    DataSet<Tuple3<String, Integer, Boolean>> input = createKeyedInput(env);

    List<Tuple3<String, Integer, Boolean>> actual = input
        .groupBy(new KeySelectorX())
        .reduceGroup(new KeyedCombReducer())
        .collect();
    String expected = "k1,6,true\nk2,4,true\n";

    compareResultAsTuples(actual, expected);
}
@Test
public void testUnsortedGroupReduceWithTypeInformationTypeHint() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.getConfig().disableSysoutLogging();

    DataSet<Tuple3<Integer, Long, String>> ds = CollectionDataSets.getSmall3TupleDataSet(env);
    DataSet<Integer> resultDs = ds
        .groupBy(0)
        .reduceGroup(new GroupReducer<Tuple3<Integer, Long, String>, Integer>())
        .returns(BasicTypeInfo.INT_TYPE_INFO);
    List<Integer> result = resultDs.collect();

    String expectedResult = "2\n" +
        "3\n" +
        "1\n";

    compareResultAsText(result, expectedResult);
}
@Test
public void testSupportForDataAndEnumSerialization() throws Exception {
    /*
     * Test support for Date and enum serialization
     */
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<PojoWithDateAndEnum> ds = env.generateSequence(0, 2).map(new Mapper1());
    ds = ds.union(CollectionDataSets.getPojoWithDateAndEnum(env));

    DataSet<String> res = ds.groupBy("group").reduceGroup(new GroupReducer1());

    List<String> result = res.collect();
    String expected = "ok\nok";

    compareResultAsText(result, expected);
}
public static DataSet<Tuple2<Long, Long>> doDeltaIteration(DataSet<Tuple2<Long, Long>> vertices, DataSet<Tuple2<Long, Long>> edges) {

    DeltaIteration<Tuple2<Long, Long>, Tuple2<Long, Long>> depIteration = vertices.iterateDelta(vertices, 100, 0);

    DataSet<Tuple1<Long>> candidates = depIteration.getWorkset().join(edges).where(0).equalTo(0)
        .projectSecond(1);

    DataSet<Tuple1<Long>> grouped = candidates.groupBy(0).reduceGroup(new Reduce101());

    DataSet<Tuple2<Long, Long>> candidatesDependencies = grouped.join(edges).where(0).equalTo(1).projectSecond(0, 1);

    DataSet<Tuple2<Long, Long>> verticesWithNewComponents = candidatesDependencies.join(depIteration.getSolutionSet()).where(0).equalTo(0)
        .with(new Join222())
        .groupBy(0).aggregate(Aggregations.MIN, 1);

    DataSet<Tuple2<Long, Long>> updatedComponentId = verticesWithNewComponents.join(depIteration.getSolutionSet()).where(0).equalTo(0)
        .flatMap(new FlatMapJoin());

    DataSet<Tuple2<Long, Long>> depResult = depIteration.closeWith(updatedComponentId, updatedComponentId);

    return depResult;
}