/**
 * Returns a new set containing the first n elements of each group in this grouped and sorted {@link DataSet}.
 *
 * @param n The desired number of elements for each group. Must be at least 1.
 * @return A GroupReduceOperator that represents the DataSet containing the elements.
 * @throws InvalidProgramException if {@code n} is smaller than 1.
 */
public GroupReduceOperator<T, T> first(int n) {
    if (n >= 1) {
        // Valid request: delegate to a group reduce using a FirstReducer configured with n.
        return reduceGroup(new FirstReducer<T>(n));
    }
    throw new InvalidProgramException("Parameter n of first(n) must be at least 1.");
}
@Test public void testStringBasedDefinitionOnGroupSortForTwoGroupingKeysWithPojos() throws Exception { /* * Test string-based definition on group sort, for two grouping keys with Pojos */ final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); env.setParallelism(1); DataSet<PojoContainingTupleAndWritable> ds = CollectionDataSets.getGroupSortedPojoContainingTupleAndWritable(env); // f0.f0 is first integer DataSet<String> reduceDs = ds.groupBy("hadoopFan").sortGroup("theTuple.f0", Order.DESCENDING).sortGroup("theTuple.f1", Order.DESCENDING) .reduceGroup(new GroupReducer5()); List<String> result = reduceDs.collect(); String expected = "1---(10,100)-\n" + "2---(30,600)-(30,400)-(30,200)-(20,201)-(20,200)-\n"; compareResultAsText(result, expected); }
tupleDs.groupBy(2).sortGroup(4, Order.ASCENDING).first(1); } catch (Exception e) { Assert.fail(); tupleDs.groupBy(1, 3).sortGroup(4, Order.ASCENDING).first(10); } catch (Exception e) { Assert.fail(); tupleDs.groupBy(0).sortGroup(4, Order.ASCENDING).first(0); Assert.fail(); } catch (InvalidProgramException ipe) { tupleDs.groupBy(2).sortGroup(4, Order.ASCENDING).first(-1); Assert.fail(); } catch (InvalidProgramException ipe) {
@Test public void testIdentityWithGroupByAndSort() throws Exception { final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<Tuple3<Integer, Long, String>> ds = CollectionDataSets.get3TupleDataSet(env); DataSet<Tuple3<Integer, Long, String>> reduceDs = ds .groupBy(1) .sortGroup(1, Order.DESCENDING) // reduce partially .combineGroup(new IdentityFunction()) .groupBy(1) .sortGroup(1, Order.DESCENDING) // fully reduce .reduceGroup(new IdentityFunction()); List<Tuple3<Integer, Long, String>> result = reduceDs.collect(); compareResultAsTuples(result, identityResult); }
/**
 * Registers a group-sort operation described by {@code info}.
 *
 * <p>The parent set (looked up by {@code info.parentID}) must be a grouping: an unsorted or
 * an already sorted grouping gets {@code sortGroup(info.field, info.order)} applied and the
 * result is stored under {@code info.setID}. If the parent is neither a DataSet nor a
 * grouping, nothing is registered.
 *
 * @param info The operation description (parent id, target set id, sort field and order).
 * @throws IllegalArgumentException if the parent set is a plain DataSet.
 */
private void createSortOperation(PythonOperationInfo info) {
    // Sorting is only defined on groupings, never on a plain DataSet.
    if (sets.isDataSet(info.parentID)) {
        throw new IllegalArgumentException("sort() can not be applied on a DataSet.");
    }

    if (sets.isUnsortedGrouping(info.parentID)) {
        sets.add(info.setID, sets.getUnsortedGrouping(info.parentID).sortGroup(info.field, info.order));
        return;
    }

    if (sets.isSortedGrouping(info.parentID)) {
        sets.add(info.setID, sets.getSortedGrouping(info.parentID).sortGroup(info.field, info.order));
    }
}
/**
 * Sorts Tuple or Pojo elements within a group on the specified field expression in the
 * specified {@link Order}.
 *
 * <p><b>Note: Only groups of Tuple elements and Pojos can be sorted.</b>
 *
 * <p>Groups can be sorted by multiple fields by chaining {@link #sortGroup(String, Order)} calls.
 *
 * @param field The Tuple or Pojo field on which the group is sorted.
 * @param order The Order in which the specified field is sorted.
 * @return A SortedGrouping with specified order of group element.
 * @throws InvalidProgramException if the grouping keys of this grouping are KeySelector keys.
 *
 * @see Order
 */
public SortedGrouping<T> sortGroup(String field, Order order) {
    final boolean groupedByKeySelector = this.getKeys() instanceof Keys.SelectorFunctionKeys;
    if (groupedByKeySelector) {
        throw new InvalidProgramException("KeySelector grouping keys and field expression group-sorting keys cannot be used together.");
    }

    final SortedGrouping<T> sortedGrouping = new SortedGrouping<T>(this.inputDataSet, this.keys, field, order);
    // Carry the custom partitioner of this grouping over to the sorted grouping.
    sortedGrouping.customPartitioner = getCustomPartitioner();
    return sortedGrouping;
}
/**
 * Adds a further sort on the given Tuple field, in the given {@link Order}, to the
 * {@link org.apache.flink.api.java.tuple.Tuple} elements within a group.
 *
 * <p><b>Note: Only groups of Tuple or Pojo elements can be sorted.</b>
 *
 * <p>Groups can be sorted by multiple fields by chaining {@link #sortGroup(int, Order)} calls.
 *
 * @param field The Tuple field on which the group is sorted.
 * @param order The Order in which the specified Tuple field is sorted.
 * @return This SortedGrouping, with the additional sort key registered.
 * @throws InvalidProgramException if this grouping is already sorted by a KeySelector, or if
 *         the selected field is not a sortable type.
 *
 * @see org.apache.flink.api.java.tuple.Tuple
 * @see Order
 */
public SortedGrouping<T> sortGroup(int field, Order order) {
    if (groupSortSelectorFunctionKey != null) {
        throw new InvalidProgramException("Chaining sortGroup with KeySelector sorting is not supported");
    } else if (!Keys.ExpressionKeys.isSortKey(field, inputDataSet.getType())) {
        throw new InvalidProgramException("Selected sort key is not a sortable type");
    }

    final ExpressionKeys<T> sortKey = new ExpressionKeys<T>(field, inputDataSet.getType());
    addSortGroupInternal(sortKey, order);
    return this;
}
@Test(expected = InvalidProgramException.class) public void testGroupSortByKeyExpression5() { final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<Tuple4<Integer, Long, CustomType, Long[]>> tupleDs = env.fromCollection(tupleWithCustomData, tupleWithCustomInfo); // should not work tupleDs.groupBy("f0") .sortGroup("f1", Order.ASCENDING) .sortGroup("f2", Order.ASCENDING); }
/**
 * Sorts {@link org.apache.flink.api.java.tuple.Tuple} elements within a group on the field
 * at the given position, in the given {@link Order}.
 *
 * <p><b>Note: Only groups of Tuple elements and Pojos can be sorted.</b>
 *
 * <p>Groups can be sorted by multiple fields by chaining {@link #sortGroup(int, Order)} calls.
 *
 * @param field The Tuple field on which the group is sorted.
 * @param order The Order in which the specified Tuple field is sorted.
 * @return A SortedGrouping with specified order of group element.
 * @throws InvalidProgramException if the grouping keys of this grouping are KeySelector keys.
 *
 * @see org.apache.flink.api.java.tuple.Tuple
 * @see Order
 */
public SortedGrouping<T> sortGroup(int field, Order order) {
    final boolean groupedByKeySelector = this.getKeys() instanceof Keys.SelectorFunctionKeys;
    if (groupedByKeySelector) {
        throw new InvalidProgramException("KeySelector grouping keys and field index group-sorting keys cannot be used together.");
    }

    final SortedGrouping<T> sortedGrouping = new SortedGrouping<T>(this.inputDataSet, this.keys, field, order);
    // Carry the custom partitioner of this grouping over to the sorted grouping.
    sortedGrouping.customPartitioner = getCustomPartitioner();
    return sortedGrouping;
}
/**
 * Adds a further sort on the given field expression, in the given {@link Order}, to the
 * {@link org.apache.flink.api.java.tuple.Tuple} or POJO elements within a group.
 *
 * <p><b>Note: Only groups of Tuple or Pojo elements can be sorted.</b>
 *
 * <p>Groups can be sorted by multiple fields by chaining {@link #sortGroup(String, Order)} calls.
 *
 * @param field The Tuple or Pojo field on which the group is sorted.
 * @param order The Order in which the specified field is sorted.
 * @return This SortedGrouping, with the additional sort key registered.
 * @throws InvalidProgramException if this grouping is already sorted by a KeySelector, or if
 *         the selected field is not a sortable type.
 *
 * @see org.apache.flink.api.java.tuple.Tuple
 * @see Order
 */
public SortedGrouping<T> sortGroup(String field, Order order) {
    if (groupSortSelectorFunctionKey != null) {
        throw new InvalidProgramException("Chaining sortGroup with KeySelector sorting is not supported");
    } else if (!Keys.ExpressionKeys.isSortKey(field, inputDataSet.getType())) {
        throw new InvalidProgramException("Selected sort key is not a sortable type");
    }

    final ExpressionKeys<T> sortKey = new ExpressionKeys<T>(field, inputDataSet.getType());
    addSortGroupInternal(sortKey, order);
    return this;
}
/**
 * Applies a Python group-reduce on a sorted grouping in two steps: an identity group reduce
 * (named "PythonGroupReducePreStep", combining disabled) followed by a
 * {@code PythonMapPartition}. Both steps use {@code info.parallelism}; the final operator is
 * named {@code info.name}.
 *
 * @param op1 The sorted grouping to reduce.
 * @param info The operation description supplying parallelism, ids and name.
 * @param type The type information handed to the {@code PythonMapPartition}.
 * @return The DataSet produced by the map-partition step.
 */
private <IN, OUT> DataSet<OUT> applyGroupReduceOperation(SortedGrouping<IN> op1, PythonOperationInfo info, TypeInformation<OUT> type) {
    final DataSet<IN> preStep = op1
            .reduceGroup(new IdentityGroupReduce<IN>())
            .setCombinable(false)
            .setParallelism(info.parallelism)
            .name("PythonGroupReducePreStep");

    return preStep
            .mapPartition(new PythonMapPartition<IN, OUT>(operatorConfig, info.envID, info.setID, type))
            .setParallelism(info.parallelism)
            .name(info.name);
}
@Test public void testStringBasedDefinitionOnGroupSortForTwoGroupingKeys() throws Exception { /* * Test string-based definition on group sort, for two grouping keys */ final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); env.setParallelism(1); DataSet<Tuple2<Tuple2<Integer, Integer>, String>> ds = CollectionDataSets.getGroupSortedNestedTupleDataSet(env); // f0.f0 is first integer DataSet<String> reduceDs = ds.groupBy("f1").sortGroup("f0.f0", Order.DESCENDING).sortGroup("f0.f1", Order.DESCENDING).reduceGroup(new NestedTupleReducer()); List<String> result = reduceDs.collect(); String expected = "a--(2,1)-(1,3)-(1,2)-\n" + "b--(2,2)-\n" + "c--(4,9)-(3,6)-(3,3)-\n"; compareResultAsText(result, expected); }
@Test public void testChainedGroupSortKeyFields() { final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<Tuple5<Integer, Long, String, Long, Integer>> tupleDs = env.fromCollection(emptyTupleData, tupleTypeInfo); // should work try { tupleDs.groupBy(0).sortGroup(0, Order.ASCENDING).sortGroup(2, Order.DESCENDING); } catch (Exception e) { Assert.fail(); } }
/**
 * Sorts elements within a group on a key extracted by the specified
 * {@link org.apache.flink.api.java.functions.KeySelector} in the specified {@link Order}.
 *
 * <p>Chaining {@link #sortGroup(KeySelector, Order)} calls is not supported.
 *
 * @param keySelector The KeySelector with which the group is sorted.
 * @param order The Order in which the extracted key is sorted.
 * @return A SortedGrouping with specified order of group element.
 * @throws InvalidProgramException if the grouping keys of this grouping are not KeySelector keys.
 *
 * @see Order
 */
public <K> SortedGrouping<T> sortGroup(KeySelector<T, K> keySelector, Order order) {
    final boolean groupedByKeySelector = this.getKeys() instanceof Keys.SelectorFunctionKeys;
    if (!groupedByKeySelector) {
        throw new InvalidProgramException("KeySelector group-sorting keys can only be used with KeySelector grouping keys.");
    }

    // Determine the type of the extracted sort key from the selector and the input type.
    final TypeInformation<K> keyType = TypeExtractor.getKeySelectorTypes(keySelector, this.inputDataSet.getType());
    final Keys.SelectorFunctionKeys<T, K> sortKeys =
            new Keys.SelectorFunctionKeys<T, K>(keySelector, this.inputDataSet.getType(), keyType);

    final SortedGrouping<T> sortedGrouping = new SortedGrouping<T>(this.inputDataSet, this.keys, sortKeys, order);
    // Carry the custom partitioner of this grouping over to the sorted grouping.
    sortedGrouping.customPartitioner = getCustomPartitioner();
    return sortedGrouping;
}
/**
 * Test for FLINK-2135.
 *
 * <p>Builds {@code first(1)} on a grouping that uses KeySelectors for both the grouping key
 * and the group-sort key, then collects the input data set and checks its content.
 */
@Test
public void testFaultyCast() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    final DataSet<String> input = env.fromElements("a", "b");

    // Every element maps to the same grouping key.
    final KeySelector<String, Long> groupKey = new KeySelector<String, Long>() {
        @Override
        public Long getKey(String value) throws Exception {
            return 1L;
        }
    };

    // Every element maps to the same sort key.
    final KeySelector<String, Double> sortKey = new KeySelector<String, Double>() {
        @Override
        public Double getKey(String value) throws Exception {
            return 1.0;
        }
    };

    GroupReduceOperator<String, String> firstPerGroup =
            input.groupBy(groupKey).sortGroup(sortKey, Order.DESCENDING).first(1);

    // NOTE(review): the data set collected here is the raw input, not firstPerGroup, so the
    // assertion only checks the unmodified elements -- confirm this is intended.
    final String expected = "a\nb";
    final List<String> actual = input.collect();

    compareResultAsText(actual, expected);
}
/**
 * Sorts {@link org.apache.flink.api.java.tuple.Tuple} or POJO elements within a group on the
 * field named by the given expression, in the given {@link Order}.
 *
 * <p><b>Note: Only groups of Tuple or Pojo elements can be sorted.</b>
 *
 * <p>Groups can be sorted by multiple fields by chaining {@link #sortGroup(String, Order)} calls.
 *
 * @param field The Tuple or Pojo field on which the group is sorted.
 * @param order The Order in which the specified field is sorted.
 * @return This SortedGrouping, with the additional sort key registered.
 * @throws InvalidProgramException if this grouping is already sorted by a KeySelector, or if
 *         the selected field is not a sortable type.
 *
 * @see org.apache.flink.api.java.tuple.Tuple
 * @see Order
 */
public SortedGrouping<T> sortGroup(String field, Order order) {
    // A KeySelector-based sort cannot be extended with further sort keys.
    final boolean sortedByKeySelector = groupSortSelectorFunctionKey != null;
    if (sortedByKeySelector) {
        throw new InvalidProgramException("Chaining sortGroup with KeySelector sorting is not supported");
    }

    final boolean sortable = Keys.ExpressionKeys.isSortKey(field, inputDataSet.getType());
    if (!sortable) {
        throw new InvalidProgramException("Selected sort key is not a sortable type");
    }

    addSortGroupInternal(new ExpressionKeys<T>(field, inputDataSet.getType()), order);
    return this;
}
@Override public DataSet<Tuple3<K, K, K>> run(Graph<K, VV, EV> input) throws Exception { DataSet<Edge<K, EV>> edges = input.getEdges(); // annotate edges with degrees DataSet<EdgeWithDegrees<K>> edgesWithDegrees = edges.flatMap(new EdgeDuplicator<>()) .groupBy(0).sortGroup(1, Order.ASCENDING).reduceGroup(new DegreeCounter<>()) .groupBy(EdgeWithDegrees.V1, EdgeWithDegrees.V2).reduce(new DegreeJoiner<>()); // project edges by degrees DataSet<Edge<K, NullValue>> edgesByDegree = edgesWithDegrees.map(new EdgeByDegreeProjector<>()); // project edges by vertex id DataSet<Edge<K, NullValue>> edgesById = edgesByDegree.map(new EdgeByIdProjector<>()); DataSet<Tuple3<K, K, K>> triangles = edgesByDegree // build triads .groupBy(EdgeWithDegrees.V1).sortGroup(EdgeWithDegrees.V2, Order.ASCENDING) .reduceGroup(new TriadBuilder<>()) // filter triads .join(edgesById, JoinHint.REPARTITION_HASH_SECOND).where(Triad.V2, Triad.V3).equalTo(0, 1).with(new TriadFilter<>()); return triangles; }
@Test public void testIntBasedDefinitionOnGroupSortForPartialNestedTuple() throws Exception { /* * Test int-based definition on group sort, for (partial) nested Tuple ASC */ final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); env.setParallelism(1); DataSet<Tuple2<Tuple2<Integer, Integer>, String>> ds = CollectionDataSets.getGroupSortedNestedTupleDataSet(env); // f0.f0 is first integer DataSet<String> reduceDs = ds.groupBy("f1") .sortGroup("f0.f0", Order.ASCENDING) .sortGroup("f0.f1", Order.ASCENDING) .reduceGroup(new NestedTupleReducer()); List<String> result = reduceDs.collect(); String expected = "a--(1,2)-(1,3)-(2,1)-\n" + "b--(2,2)-\n" + "c--(3,3)-(3,6)-(4,9)-\n"; compareResultAsText(result, expected); }
@Test public void testGroupSortByKeyExpression3() { final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<Tuple4<Integer, Long, CustomType, Long[]>> tupleDs = env.fromCollection(tupleWithCustomData, tupleWithCustomInfo); // should work try { tupleDs.groupBy("f0") .sortGroup("f2.myString", Order.ASCENDING) .sortGroup("f1", Order.DESCENDING); } catch (Exception e) { Assert.fail(); } }
/**
 * Sorts the Tuple or Pojo elements of each group on the field named by the given expression,
 * in the given {@link Order}.
 *
 * <p><b>Note: Only groups of Tuple elements and Pojos can be sorted.</b>
 *
 * <p>Groups can be sorted by multiple fields by chaining {@link #sortGroup(String, Order)} calls.
 *
 * @param field The Tuple or Pojo field on which the group is sorted.
 * @param order The Order in which the specified field is sorted.
 * @return A SortedGrouping with specified order of group element.
 * @throws InvalidProgramException if the grouping keys of this grouping are KeySelector keys.
 *
 * @see Order
 */
public SortedGrouping<T> sortGroup(String field, Order order) {
    // Field-expression sort keys cannot be mixed with KeySelector grouping keys.
    if (!(this.getKeys() instanceof Keys.SelectorFunctionKeys)) {
        final SortedGrouping<T> result = new SortedGrouping<T>(this.inputDataSet, this.keys, field, order);
        result.customPartitioner = getCustomPartitioner();
        return result;
    }
    throw new InvalidProgramException("KeySelector grouping keys and field expression group-sorting keys cannot be used together.");
}