/**
 * Re-balances this DataSet so that it is evenly distributed over all parallel instances of the
 * following task. Useful when the data is heavily skewed and the downstream operation is
 * compute intensive.
 *
 * <p><b>Important:</b> This operation shuffles the whole DataSet over the network and can take
 * a significant amount of time.
 *
 * @return The re-balanced DataSet.
 */
public PartitionOperator<T> rebalance() {
    // NOTE(review): keep this call directly inside the method body; the helper presumably
    // inspects the call stack to name the caller - confirm before moving it into a helper.
    final String callLocation = Utils.getCallLocationName();
    return new PartitionOperator<>(this, PartitionMethod.REBALANCE, callLocation);
}
/**
 * Hash-partitions this DataSet on the key fields given by position.
 *
 * <p><b>Important:</b> This operation shuffles the whole DataSet over the network and can take
 * a significant amount of time.
 *
 * @param fields The field indexes on which the DataSet is hash-partitioned.
 * @return The partitioned DataSet.
 */
public PartitionOperator<T> partitionByHash(int... fields) {
    // Build the key description first, then record where the call came from.
    final Keys.ExpressionKeys<T> hashKeys = new Keys.ExpressionKeys<>(fields, getType());
    final String callLocation = Utils.getCallLocationName();
    return new PartitionOperator<>(this, PartitionMethod.HASH, hashKeys, callLocation);
}
/**
 * Hash-partitions this DataSet on the key fields given by field expressions.
 *
 * <p><b>Important:</b> This operation shuffles the whole DataSet over the network and can take
 * a significant amount of time.
 *
 * @param fields The field expressions on which the DataSet is hash-partitioned.
 * @return The partitioned DataSet.
 */
public PartitionOperator<T> partitionByHash(String... fields) {
    // Build the key description first, then record where the call came from.
    final Keys.ExpressionKeys<T> hashKeys = new Keys.ExpressionKeys<>(fields, getType());
    final String callLocation = Utils.getCallLocationName();
    return new PartitionOperator<>(this, PartitionMethod.HASH, hashKeys, callLocation);
}
/**
 * Range-partitions this DataSet on the key fields given by field expressions.
 *
 * <p><b>Important:</b> This operation requires an extra pass over the DataSet to compute the
 * range boundaries and shuffles the whole DataSet over the network. This can take a
 * significant amount of time.
 *
 * @param fields The field expressions on which the DataSet is range-partitioned.
 * @return The partitioned DataSet.
 */
public PartitionOperator<T> partitionByRange(String... fields) {
    // Build the key description first, then record where the call came from.
    final Keys.ExpressionKeys<T> rangeKeys = new Keys.ExpressionKeys<>(fields, getType());
    final String callLocation = Utils.getCallLocationName();
    return new PartitionOperator<>(this, PartitionMethod.RANGE, rangeKeys, callLocation);
}
/**
 * Range-partitions the given DataSet on the specified tuple field positions, handing the
 * supplied data distribution to the resulting partition operator.
 *
 * @param input The DataSet to partition.
 * @param distribution The data distribution passed on to the partition operator.
 * @param fields The field positions on which the DataSet is range-partitioned.
 * @param <T> The element type of the DataSet.
 * @return The partitioned DataSet.
 */
public static <T> PartitionOperator<T> partitionByRange(DataSet<T> input, DataDistribution distribution, int... fields) {
    // NOTE(review): the trailing 'false' presumably disallows an empty key set - confirm
    // against the Keys.ExpressionKeys constructor.
    final Keys.ExpressionKeys<T> rangeKeys = new Keys.ExpressionKeys<>(fields, input.getType(), false);
    final String callLocation = Utils.getCallLocationName();
    return new PartitionOperator<>(
            input,
            PartitionOperatorBase.PartitionMethod.RANGE,
            rangeKeys,
            distribution,
            callLocation);
}
/**
 * Range-partitions the given DataSet on the specified field expressions, handing the supplied
 * data distribution to the resulting partition operator.
 *
 * @param input The DataSet to partition.
 * @param distribution The data distribution passed on to the partition operator.
 * @param fields The field expressions on which the DataSet is range-partitioned.
 * @param <T> The element type of the DataSet.
 * @return The partitioned DataSet.
 */
public static <T> PartitionOperator<T> partitionByRange(DataSet<T> input, DataDistribution distribution, String... fields) {
    // Build the key description first, then record where the call came from.
    final Keys.ExpressionKeys<T> rangeKeys = new Keys.ExpressionKeys<>(fields, input.getType());
    final String callLocation = Utils.getCallLocationName();
    return new PartitionOperator<>(
            input,
            PartitionOperatorBase.PartitionMethod.RANGE,
            rangeKeys,
            distribution,
            callLocation);
}
/**
 * Range-partitions this DataSet on the key fields given by position.
 *
 * <p><b>Important:</b> This operation requires an extra pass over the DataSet to compute the
 * range boundaries and shuffles the whole DataSet over the network. This can take a
 * significant amount of time.
 *
 * @param fields The field indexes on which the DataSet is range-partitioned.
 * @return The partitioned DataSet.
 */
public PartitionOperator<T> partitionByRange(int... fields) {
    // Build the key description first, then record where the call came from.
    final Keys.ExpressionKeys<T> rangeKeys = new Keys.ExpressionKeys<>(fields, getType());
    final String callLocation = Utils.getCallLocationName();
    return new PartitionOperator<>(this, PartitionMethod.RANGE, rangeKeys, callLocation);
}
/**
 * Partitions a tuple DataSet on a single key field using a custom partitioner.
 * The key is identified by its field position, and the partitioner must accept the key type.
 *
 * <p>Note: This method works only on single field keys.
 *
 * @param partitioner The partitioner to assign partitions to keys.
 * @param field The field index on which the DataSet is to be partitioned.
 * @param <K> The type of the key the partitioner accepts.
 * @return The partitioned DataSet.
 */
public <K> PartitionOperator<T> partitionCustom(Partitioner<K> partitioner, int field) {
    // The single position is wrapped in a one-element array for the key description.
    final int[] keyPositions = {field};
    final Keys.ExpressionKeys<T> customKeys = new Keys.ExpressionKeys<>(keyPositions, getType());
    final Partitioner<K> cleanedPartitioner = clean(partitioner);
    final String callLocation = Utils.getCallLocationName();
    return new PartitionOperator<>(this, customKeys, cleanedPartitioner, callLocation);
}
/**
 * Partitions a POJO DataSet on a single key field using a custom partitioner.
 * The key is identified by a field expression, and the partitioner must accept the key type.
 *
 * <p>Note: This method works only on single field keys.
 *
 * @param partitioner The partitioner to assign partitions to keys.
 * @param field The field expression on which the DataSet is to be partitioned.
 * @param <K> The type of the key the partitioner accepts.
 * @return The partitioned DataSet.
 */
public <K> PartitionOperator<T> partitionCustom(Partitioner<K> partitioner, String field) {
    // The single expression is wrapped in a one-element array for the key description.
    final String[] keyExpressions = {field};
    final Keys.ExpressionKeys<T> customKeys = new Keys.ExpressionKeys<>(keyExpressions, getType());
    final Partitioner<K> cleanedPartitioner = clean(partitioner);
    final String callLocation = Utils.getCallLocationName();
    return new PartitionOperator<>(this, customKeys, cleanedPartitioner, callLocation);
}
/**
 * Range-partitions the given DataSet on the key produced by a key selector function, handing
 * the supplied data distribution to the resulting partition operator.
 *
 * @param input The DataSet to partition.
 * @param distribution The data distribution passed on to the partition operator.
 * @param keyExtractor The key selector that extracts the partitioning key from each element.
 * @param <T> The element type of the DataSet.
 * @param <K> The (comparable) key type returned by the selector.
 * @return The partitioned DataSet.
 */
public static <T, K extends Comparable<K>> PartitionOperator<T> partitionByRange(DataSet<T> input, DataDistribution distribution, KeySelector<T, K> keyExtractor) {
    // Determine the key type from the selector and the input's element type.
    final TypeInformation<K> keyType = TypeExtractor.getKeySelectorTypes(keyExtractor, input.getType());

    // The selector is passed through the input's clean(...) before it is stored in the keys.
    final Keys.SelectorFunctionKeys<T, K> rangeKeys =
            new Keys.SelectorFunctionKeys<>(input.clean(keyExtractor), input.getType(), keyType);

    final String callLocation = Utils.getCallLocationName();
    return new PartitionOperator<>(
            input,
            PartitionOperatorBase.PartitionMethod.RANGE,
            rangeKeys,
            distribution,
            callLocation);
}
/**
 * Hash-partitions this DataSet on the key produced by the given KeySelector.
 *
 * <p><b>Important:</b> This operation shuffles the whole DataSet over the network and can take
 * a significant amount of time.
 *
 * @param keyExtractor The KeyExtractor with which the DataSet is hash-partitioned.
 * @param <K> The (comparable) key type returned by the selector.
 * @return The partitioned DataSet.
 *
 * @see KeySelector
 */
public <K extends Comparable<K>> PartitionOperator<T> partitionByHash(KeySelector<T, K> keyExtractor) {
    // Determine the key type from the selector and this DataSet's element type.
    final TypeInformation<K> keyType = TypeExtractor.getKeySelectorTypes(keyExtractor, getType());

    // The selector is passed through clean(...) before it is stored in the keys.
    final Keys.SelectorFunctionKeys<T, K> hashKeys =
            new Keys.SelectorFunctionKeys<>(clean(keyExtractor), this.getType(), keyType);

    final String callLocation = Utils.getCallLocationName();
    return new PartitionOperator<>(this, PartitionMethod.HASH, hashKeys, callLocation);
}
/**
 * Range-partitions this DataSet on the key produced by the given KeySelector.
 *
 * <p><b>Important:</b> This operation requires an extra pass over the DataSet to compute the
 * range boundaries and shuffles the whole DataSet over the network. This can take a
 * significant amount of time.
 *
 * @param keyExtractor The KeyExtractor with which the DataSet is range-partitioned.
 * @param <K> The (comparable) key type returned by the selector.
 * @return The partitioned DataSet.
 *
 * @see KeySelector
 */
public <K extends Comparable<K>> PartitionOperator<T> partitionByRange(KeySelector<T, K> keyExtractor) {
    // Determine the key type from the selector and this DataSet's element type.
    final TypeInformation<K> keyType = TypeExtractor.getKeySelectorTypes(keyExtractor, getType());

    // The selector is passed through clean(...) before it is stored in the keys.
    final Keys.SelectorFunctionKeys<T, K> rangeKeys =
            new Keys.SelectorFunctionKeys<>(clean(keyExtractor), this.getType(), keyType);

    final String callLocation = Utils.getCallLocationName();
    return new PartitionOperator<>(this, PartitionMethod.RANGE, rangeKeys, callLocation);
}
/**
 * Partitions a DataSet on the key returned by the selector, using a custom partitioner.
 * This method takes the key selector to get the key to partition on, and a partitioner that
 * accepts the key type.
 *
 * <p>Note: This method works only on single field keys, i.e. the selector cannot return tuples
 * of fields.
 *
 * @param partitioner The partitioner to assign partitions to keys.
 * @param keyExtractor The KeyExtractor with which the DataSet is partitioned.
 * @param <K> The (comparable) key type returned by the selector and accepted by the partitioner.
 * @return The partitioned DataSet.
 *
 * @see KeySelector
 */
public <K extends Comparable<K>> PartitionOperator<T> partitionCustom(Partitioner<K> partitioner, KeySelector<T, K> keyExtractor) {
    // Determine the key type from the selector and this DataSet's element type.
    final TypeInformation<K> keyType = TypeExtractor.getKeySelectorTypes(keyExtractor, getType());

    // The key selector is passed through clean(...) just like the partitioner below, and just
    // like the key-selector variants of partitionByHash / partitionByRange do; previously it
    // was stored as-is. NOTE(review): clean(...) presumably performs closure cleaning and/or
    // a serializability check on user functions - confirm against its definition.
    return new PartitionOperator<>(
            this,
            new Keys.SelectorFunctionKeys<>(clean(keyExtractor), getType(), keyType),
            clean(partitioner),
            Utils.getCallLocationName());
}
/**
 * Re-balances this DataSet so that it is evenly distributed over all parallel instances of the
 * following task. Useful when the data is heavily skewed and the downstream operation is
 * compute intensive.
 *
 * <p><b>Important:</b> This operation shuffles the whole DataSet over the network and can take
 * a significant amount of time.
 *
 * @return The re-balanced DataSet.
 */
public PartitionOperator<T> rebalance() {
    // NOTE(review): keep this call directly inside the method body; the helper presumably
    // inspects the call stack to name the caller - confirm before moving it into a helper.
    final String callLocation = Utils.getCallLocationName();
    return new PartitionOperator<>(this, PartitionMethod.REBALANCE, callLocation);
}
/**
 * Range-partitions the given DataSet on the specified field expressions, handing the supplied
 * data distribution to the resulting partition operator.
 *
 * @param input The DataSet to partition.
 * @param distribution The data distribution passed on to the partition operator.
 * @param fields The field expressions on which the DataSet is range-partitioned.
 * @param <T> The element type of the DataSet.
 * @return The partitioned DataSet.
 */
public static <T> PartitionOperator<T> partitionByRange(DataSet<T> input, DataDistribution distribution, String... fields) {
    // Build the key description first, then record where the call came from.
    final Keys.ExpressionKeys<T> rangeKeys = new Keys.ExpressionKeys<>(fields, input.getType());
    final String callLocation = Utils.getCallLocationName();
    return new PartitionOperator<>(
            input,
            PartitionOperatorBase.PartitionMethod.RANGE,
            rangeKeys,
            distribution,
            callLocation);
}
/**
 * Range-partitions the given DataSet on the specified field expressions, handing the supplied
 * data distribution to the resulting partition operator.
 *
 * @param input The DataSet to partition.
 * @param distribution The data distribution passed on to the partition operator.
 * @param fields The field expressions on which the DataSet is range-partitioned.
 * @param <T> The element type of the DataSet.
 * @return The partitioned DataSet.
 */
public static <T> PartitionOperator<T> partitionByRange(DataSet<T> input, DataDistribution distribution, String... fields) {
    // Build the key description first, then record where the call came from.
    final Keys.ExpressionKeys<T> rangeKeys = new Keys.ExpressionKeys<>(fields, input.getType());
    final String callLocation = Utils.getCallLocationName();
    return new PartitionOperator<>(
            input,
            PartitionOperatorBase.PartitionMethod.RANGE,
            rangeKeys,
            distribution,
            callLocation);
}
/**
 * Range-partitions this DataSet on the key fields given by position.
 *
 * <p><b>Important:</b> This operation requires an extra pass over the DataSet to compute the
 * range boundaries and shuffles the whole DataSet over the network. This can take a
 * significant amount of time.
 *
 * @param fields The field indexes on which the DataSet is range-partitioned.
 * @return The partitioned DataSet.
 */
public PartitionOperator<T> partitionByRange(int... fields) {
    // Build the key description first, then record where the call came from.
    final Keys.ExpressionKeys<T> rangeKeys = new Keys.ExpressionKeys<>(fields, getType());
    final String callLocation = Utils.getCallLocationName();
    return new PartitionOperator<>(this, PartitionMethod.RANGE, rangeKeys, callLocation);
}
/**
 * Range-partitions this DataSet on the key fields given by field expressions.
 *
 * <p><b>Important:</b> This operation requires an extra pass over the DataSet to compute the
 * range boundaries and shuffles the whole DataSet over the network. This can take a
 * significant amount of time.
 *
 * @param fields The field expressions on which the DataSet is range-partitioned.
 * @return The partitioned DataSet.
 */
public PartitionOperator<T> partitionByRange(String... fields) {
    // Build the key description first, then record where the call came from.
    final Keys.ExpressionKeys<T> rangeKeys = new Keys.ExpressionKeys<>(fields, getType());
    final String callLocation = Utils.getCallLocationName();
    return new PartitionOperator<>(this, PartitionMethod.RANGE, rangeKeys, callLocation);
}
/**
 * Range-partitions the given DataSet on the specified tuple field positions, handing the
 * supplied data distribution to the resulting partition operator.
 *
 * @param input The DataSet to partition.
 * @param distribution The data distribution passed on to the partition operator.
 * @param fields The field positions on which the DataSet is range-partitioned.
 * @param <T> The element type of the DataSet.
 * @return The partitioned DataSet.
 */
public static <T> PartitionOperator<T> partitionByRange(DataSet<T> input, DataDistribution distribution, int... fields) {
    // NOTE(review): the trailing 'false' presumably disallows an empty key set - confirm
    // against the Keys.ExpressionKeys constructor.
    final Keys.ExpressionKeys<T> rangeKeys = new Keys.ExpressionKeys<>(fields, input.getType(), false);
    final String callLocation = Utils.getCallLocationName();
    return new PartitionOperator<>(
            input,
            PartitionOperatorBase.PartitionMethod.RANGE,
            rangeKeys,
            distribution,
            callLocation);
}
/**
 * Hash-partitions this DataSet on the key fields given by field expressions.
 *
 * <p><b>Important:</b> This operation shuffles the whole DataSet over the network and can take
 * a significant amount of time.
 *
 * @param fields The field expressions on which the DataSet is hash-partitioned.
 * @return The partitioned DataSet.
 */
public PartitionOperator<T> partitionByHash(String... fields) {
    // Build the key description first, then record where the call came from.
    final Keys.ExpressionKeys<T> hashKeys = new Keys.ExpressionKeys<>(fields, getType());
    final String callLocation = Utils.getCallLocationName();
    return new PartitionOperator<>(this, PartitionMethod.HASH, hashKeys, callLocation);
}