/**
 * Ungroup this LGroupedTable back into an {@link LTable}. This will still trigger a "reduce" operation, so it is
 * usually only used in special cases, such as producing a globally-ordered list by feeding everything through
 * a single reducer.
 *
 * @return the {@link LTable} produced by ungrouping {@code underlying()} and wrapping the result with
 *         {@code factory()}
 */
default LTable<K, V> ungroup() {
  // Delegate the ungroup to the underlying grouped table, then re-wrap the result via the factory.
  return factory().wrap(underlying().ungroup());
}
/**
 * Orders a {@code PTable} by the natural ordering of its keys, in the order
 * specified, using a caller-chosen number of reducers.
 *
 * @param table the table whose entries should be ordered by key
 * @param numReducers the reducer count forwarded to {@code buildGroupingOptions}
 * @param key the order in which the keys should be sorted
 * @return a {@code PTable} representing the sorted collection
 */
public static <K, V> PTable<K, V> sort(PTable<K, V> table, int numReducers, Order key) {
  // The grouping options are derived from the table, the pipeline configuration,
  // the reducer count and the requested order.
  GroupingOptions sortOptions =
      buildGroupingOptions(table, table.getPipeline().getConfiguration(), numReducers, key);

  // Group by key with those options and immediately flatten back into a table.
  return table.groupByKey(sortOptions).ungroup();
}
/**
 * Builds a table of the distinct items of the input collection together with
 * their counts, intended to be sorted descending by frequency.
 *
 * @param input input collection
 * @param <X> record type
 * @return global toplist
 */
public static <X> PTable<X, Long> globalToplist(PCollection<X> input) {
  PTable<X, Long> counts = input.count();

  // The counts pass through negateCounts once before and once after the shuffle;
  // groupByKey(1) asks for a single partition, and ungroup() flattens the result again.
  // NOTE(review): negateCounts is defined elsewhere -- presumably it flips the sign of
  // each count so that the shuffle order ends up descending; confirm.
  return negateCounts(negateCounts(counts).groupByKey(1).ungroup());
}
/**
 * Builds a table of the distinct items of the input collection together with
 * their counts, intended to be sorted descending by frequency.
 *
 * @param input input collection
 * @param <X> record type
 * @return global toplist
 */
public static <X> PTable<X, Long> globalToplist(PCollection<X> input) {
  PTable<X, Long> counts = input.count();

  // The counts pass through SPTables.negateCounts once before and once after the shuffle;
  // groupByKey(1) asks for a single partition, and ungroup() flattens the result again.
  // NOTE(review): SPTables.negateCounts is defined elsewhere -- presumably it flips the
  // sign of each count so that the shuffle order ends up descending; confirm.
  return SPTables.negateCounts(SPTables.negateCounts(counts).groupByKey(1).ungroup());
}
}
/**
 * Orders a {@link PTable} by the natural ordering of its keys, in the order
 * specified.
 *
 * @param table the table whose entries should be ordered by key
 * @param key the order in which the keys should be sorted
 * @return a {@link PTable} representing the sorted collection
 */
public static <K, V> PTable<K, V> sort(PTable<K, V> table, Order key) {
  PTypeFamily family = table.getTypeFamily();
  Configuration configuration = table.getPipeline().getConfiguration();

  // The grouping options are derived from the configuration, the type family,
  // the table's key type and the requested order.
  GroupingOptions sortOptions =
      buildGroupingOptions(configuration, family, table.getKeyType(), key);

  // Group by key with those options and immediately flatten back into a table.
  return table.groupByKey(sortOptions).ungroup();
}
/**
 * Returns the entries of a {@link PTable} arranged by the natural ordering of
 * their keys, in the order specified.
 *
 * @param table the table to sort by key
 * @param key the order to apply to the keys
 * @return a {@link PTable} representing the sorted collection
 */
public static <K, V> PTable<K, V> sort(PTable<K, V> table, Order key) {
  PTypeFamily typeFamily = table.getTypeFamily();
  Configuration pipelineConf = table.getPipeline().getConfiguration();

  // buildGroupingOptions receives the configuration, type family, key type and order;
  // the resulting options drive the groupByKey below.
  GroupingOptions groupingOptions =
      buildGroupingOptions(pipelineConf, typeFamily, table.getKeyType(), key);

  // Grouping followed directly by ungroup() hands the entries back as a flat table.
  return table.groupByKey(groupingOptions).ungroup();
}
/**
 * Creates a {@code PCollection<T>} that has the same contents as its input argument but will
 * be written to a fixed number of output files. This is useful for map-only jobs that process
 * lots of input files but only write out a small amount of data per task.
 *
 * @param pc The {@code PCollection<T>} to rebalance
 * @param numPartitions The number of output partitions to create
 * @param <T> The element type of the collection
 * @return A rebalanced {@code PCollection<T>} with the same contents as the input
 */
public static <T> PCollection<T> shard(PCollection<T> pc, int numPartitions) {
  // Key every element with an int produced by ShardFn, group into numPartitions partitions,
  // then flatten and drop the keys so that only the original elements remain.
  return pc.by(new ShardFn<T>(), pc.getTypeFamily().ints())
      .groupByKey(numPartitions)
      .ungroup()
      .values();
}
/**
 * Sends the elements of a collection through a group-by-key step and returns
 * them again as a plain collection.
 *
 * @param collection the elements to partition
 * @param numReducers partition count passed to {@code groupByKey} when positive; any
 *        other value falls back to the no-argument {@code groupByKey()}
 * @return the keys of the ungrouped table, i.e. the original elements
 */
private static <E> PCollection<E> partition(PCollection<E> collection, int numReducers) {
  // Each element becomes the key of a table entry (via AsKeyTable) with a null-typed value.
  PTableType<E, Void> keyedType = Avros.tableOf(collection.getPType(), Avros.nulls());
  PTable<E, Void> keyed = collection.parallelDo(new AsKeyTable<E>(), keyedType);

  PGroupedTable<E, Void> shuffled;
  if (numReducers > 0) {
    shuffled = keyed.groupByKey(numReducers);
  } else {
    // No usable reducer count supplied: use the default grouping.
    shuffled = keyed.groupByKey();
  }

  return shuffled.ungroup().keys();
}
Configuration conf = collection.getPipeline().getConfiguration(); GroupingOptions options = buildGroupingOptions(conf, tf, pType, columnOrders); PTable<Tuple4<V1, V2, V3, V4>, Void> sortedPt = pt.groupByKey(options).ungroup(); return sortedPt.parallelDo(new DoFn<Pair<Tuple4<V1, V2, V3, V4>,Void>, Tuple4<V1, V2, V3, V4>>() { @Override
/**
 * Orders a {@link PCollection} of {@link Tuple4}s according to the specified
 * column ordering.
 *
 * @param collection the quads to sort
 * @param columnOrders the column ordering forwarded to {@code buildGroupingOptions}
 * @return a {@link PCollection} representing the sorted collection
 */
public static <V1, V2, V3, V4> PCollection<Tuple4<V1, V2, V3, V4>> sortQuads(
    PCollection<Tuple4<V1, V2, V3, V4>> collection, ColumnOrder... columnOrders) {
  PTypeFamily family = collection.getTypeFamily();
  PType<Tuple4<V1, V2, V3, V4>> quadType = collection.getPType();

  // Table type keyed by the quad (rebuilt from its four component types) with a null-typed value.
  @SuppressWarnings("unchecked")
  PTableType<Tuple4<V1, V2, V3, V4>, Void> keyedType = family.tableOf(
      family.quads(
          quadType.getSubTypes().get(0),
          quadType.getSubTypes().get(1),
          quadType.getSubTypes().get(2),
          quadType.getSubTypes().get(3)),
      family.nulls());

  // Move every quad into key position, paired with a null value.
  PTable<Tuple4<V1, V2, V3, V4>, Void> keyed = collection.parallelDo(
      new DoFn<Tuple4<V1, V2, V3, V4>, Pair<Tuple4<V1, V2, V3, V4>, Void>>() {
        @Override
        public void process(Tuple4<V1, V2, V3, V4> quad,
            Emitter<Pair<Tuple4<V1, V2, V3, V4>, Void>> out) {
          out.emit(Pair.of(quad, (Void) null));
        }
      }, keyedType);

  Configuration configuration = collection.getPipeline().getConfiguration();
  GroupingOptions sortOptions =
      buildGroupingOptions(configuration, family, quadType, columnOrders);
  PTable<Tuple4<V1, V2, V3, V4>, Void> sorted = keyed.groupByKey(sortOptions).ungroup();

  // Drop the null value again so that only the quads are emitted.
  return sorted.parallelDo(
      new DoFn<Pair<Tuple4<V1, V2, V3, V4>, Void>, Tuple4<V1, V2, V3, V4>>() {
        @Override
        public void process(Pair<Tuple4<V1, V2, V3, V4>, Void> entry,
            Emitter<Tuple4<V1, V2, V3, V4>> out) {
          out.emit(entry.first());
        }
      }, collection.getPType());
}
/**
 * Orders a {@link PCollection} of {@link Tuple3}s according to the specified
 * column ordering.
 *
 * @param collection the triples to sort
 * @param columnOrders the column ordering forwarded to {@code buildGroupingOptions}
 * @return a {@link PCollection} representing the sorted collection
 */
public static <V1, V2, V3> PCollection<Tuple3<V1, V2, V3>> sortTriples(
    PCollection<Tuple3<V1, V2, V3>> collection, ColumnOrder... columnOrders) {
  PTypeFamily family = collection.getTypeFamily();
  PType<Tuple3<V1, V2, V3>> tripleType = collection.getPType();

  // Table type keyed by the triple (rebuilt from its three component types) with a null-typed value.
  @SuppressWarnings("unchecked")
  PTableType<Tuple3<V1, V2, V3>, Void> keyedType = family.tableOf(
      family.triples(
          tripleType.getSubTypes().get(0),
          tripleType.getSubTypes().get(1),
          tripleType.getSubTypes().get(2)),
      family.nulls());

  // Move every triple into key position, paired with a null value.
  PTable<Tuple3<V1, V2, V3>, Void> keyed = collection.parallelDo(
      new DoFn<Tuple3<V1, V2, V3>, Pair<Tuple3<V1, V2, V3>, Void>>() {
        @Override
        public void process(Tuple3<V1, V2, V3> triple,
            Emitter<Pair<Tuple3<V1, V2, V3>, Void>> out) {
          out.emit(Pair.of(triple, (Void) null));
        }
      }, keyedType);

  Configuration configuration = collection.getPipeline().getConfiguration();
  GroupingOptions sortOptions =
      buildGroupingOptions(configuration, family, tripleType, columnOrders);
  PTable<Tuple3<V1, V2, V3>, Void> sorted = keyed.groupByKey(sortOptions).ungroup();

  // Drop the null value again so that only the triples are emitted.
  return sorted.parallelDo(
      new DoFn<Pair<Tuple3<V1, V2, V3>, Void>, Tuple3<V1, V2, V3>>() {
        @Override
        public void process(Pair<Tuple3<V1, V2, V3>, Void> entry,
            Emitter<Tuple3<V1, V2, V3>> out) {
          out.emit(entry.first());
        }
      }, collection.getPType());
}
Configuration conf = collection.getPipeline().getConfiguration(); GroupingOptions options = buildGroupingOptions(conf, tf, pType, columnOrders); PTable<Tuple3<V1, V2, V3>, Void> sortedPt = pt.groupByKey(options).ungroup(); return sortedPt.parallelDo(new DoFn<Pair<Tuple3<V1, V2, V3>,Void>, Tuple3<V1, V2, V3>>() { @Override
Configuration conf = collection.getPipeline().getConfiguration(); GroupingOptions options = buildGroupingOptions(conf, tf, pType, columnOrders); PTable<Pair<U, V>, Void> sortedPt = pt.groupByKey(options).ungroup(); return sortedPt.parallelDo(new DoFn<Pair<Pair<U, V>, Void>, Pair<U, V>>() { @Override
/**
 * Orders a {@code PCollection} of tuples according to the specified column
 * ordering, using a client-specified number of reducers.
 *
 * @param collection the tuples to sort
 * @param numReducers the reducer count forwarded to {@code buildGroupingOptions}
 * @param columnOrders the column ordering used both for key extraction and for the grouping options
 * @return a {@code PCollection} representing the sorted collection
 */
public static <T extends Tuple> PCollection<T> sortTuples(PCollection<T> collection, int numReducers,
    ColumnOrder... columnOrders) {
  // KeyExtraction supplies both the function that derives a key from each tuple
  // and the PType of that key.
  SortFns.KeyExtraction<T> extraction =
      new SortFns.KeyExtraction<T>(collection.getPType(), columnOrders);
  PTable<Object, T> keyed = collection.by(extraction.getByFn(), extraction.getKeyType());

  GroupingOptions sortOptions = buildGroupingOptions(
      keyed, collection.getPipeline().getConfiguration(), numReducers, columnOrders);

  // Group on the extracted key, flatten, and keep only the original tuples.
  return keyed.groupByKey(sortOptions).ungroup().values();
}
Configuration conf = collection.getPipeline().getConfiguration(); GroupingOptions options = buildGroupingOptions(conf, tf, pType, columnOrders); PTable<Pair<U, V>, Void> sortedPt = pt.groupByKey(options).ungroup(); return sortedPt.parallelDo(new DoFn<Pair<Pair<U, V>,Void>, Pair<U, V>>() { @Override
/**
 * Orders a {@code PCollection} by the natural ordering of its elements, in the
 * order specified, using the given number of reducers.
 *
 * @param collection the elements to sort
 * @param numReducers the reducer count forwarded to {@code buildGroupingOptions}
 * @param order the order in which the elements should be sorted
 * @return a {@code PCollection} representing the sorted collection
 */
public static <T> PCollection<T> sort(PCollection<T> collection, int numReducers, Order order) {
  PTypeFamily family = collection.getTypeFamily();
  PTableType<T, Void> keyedType = family.tableOf(collection.getPType(), family.nulls());
  Configuration configuration = collection.getPipeline().getConfiguration();

  // Move every element into key position, paired with a null value.
  PTable<T, Void> keyed = collection.parallelDo("sort-pre",
      new DoFn<T, Pair<T, Void>>() {
        @Override
        public void process(T element, Emitter<Pair<T, Void>> out) {
          out.emit(Pair.of(element, (Void) null));
        }
      }, keyedType);

  GroupingOptions sortOptions = buildGroupingOptions(keyed, configuration, numReducers, order);

  // Group on the element itself, flatten, and keep only the keys.
  return keyed.groupByKey(sortOptions).ungroup().keys();
}
Configuration conf = collection.getPipeline().getConfiguration(); GroupingOptions options = buildGroupingOptions(conf, tf, pType, columnOrders); PTable<TupleN, Void> sortedPt = pt.groupByKey(options).ungroup(); return sortedPt.parallelDo(new DoFn<Pair<TupleN,Void>, TupleN>() { @Override
/**
 * Orders a {@link PCollection} of {@link TupleN}s according to the specified
 * column ordering.
 *
 * @param collection the tuples to sort
 * @param columnOrders the column ordering forwarded to {@code buildGroupingOptions}
 * @return a {@link PCollection} representing the sorted collection
 */
public static PCollection<TupleN> sortTuples(PCollection<TupleN> collection,
    ColumnOrder... columnOrders) {
  PTypeFamily family = collection.getTypeFamily();
  PType<TupleN> tupleType = collection.getPType();

  // Table type keyed by the tuple (rebuilt from its component types) with a null-typed value.
  PTableType<TupleN, Void> keyedType = family.tableOf(
      family.tuples(tupleType.getSubTypes().toArray(new PType[0])),
      family.nulls());

  // Move every tuple into key position, paired with a null value.
  PTable<TupleN, Void> keyed = collection.parallelDo(
      new DoFn<TupleN, Pair<TupleN, Void>>() {
        @Override
        public void process(TupleN tuple, Emitter<Pair<TupleN, Void>> out) {
          out.emit(Pair.of(tuple, (Void) null));
        }
      }, keyedType);

  Configuration configuration = collection.getPipeline().getConfiguration();
  GroupingOptions sortOptions =
      buildGroupingOptions(configuration, family, tupleType, columnOrders);
  PTable<TupleN, Void> sorted = keyed.groupByKey(sortOptions).ungroup();

  // Drop the null value again so that only the tuples are emitted.
  return sorted.parallelDo(
      new DoFn<Pair<TupleN, Void>, TupleN>() {
        @Override
        public void process(Pair<TupleN, Void> entry, Emitter<TupleN> out) {
          out.emit(entry.first());
        }
      }, collection.getPType());
}
/**
 * Orders a {@link PCollection} by the natural ordering of its elements, in the
 * order specified.
 *
 * @param collection the elements to sort
 * @param order the order in which the elements should be sorted
 * @return a {@link PCollection} representing the sorted collection
 */
public static <T> PCollection<T> sort(PCollection<T> collection, Order order) {
  PTypeFamily family = collection.getTypeFamily();
  PTableType<T, Void> keyedType = family.tableOf(collection.getPType(), family.nulls());
  Configuration configuration = collection.getPipeline().getConfiguration();
  GroupingOptions sortOptions =
      buildGroupingOptions(configuration, family, collection.getPType(), order);

  // Move every element into key position, paired with a null value.
  PTable<T, Void> keyed = collection.parallelDo("sort-pre",
      new DoFn<T, Pair<T, Void>>() {
        @Override
        public void process(T element, Emitter<Pair<T, Void>> out) {
          out.emit(Pair.of(element, (Void) null));
        }
      }, keyedType);

  PTable<T, Void> sorted = keyed.groupByKey(sortOptions).ungroup();

  // Drop the null value again so that only the elements are emitted.
  return sorted.parallelDo("sort-post",
      new DoFn<Pair<T, Void>, T>() {
        @Override
        public void process(Pair<T, Void> entry, Emitter<T> out) {
          out.emit(entry.first());
        }
      }, collection.getPType());
}
/**
 * Returns the elements of a {@link PCollection} arranged by their natural
 * ordering, in the order specified.
 *
 * @param collection the collection to sort
 * @param order the order to apply to the elements
 * @return a {@link PCollection} representing the sorted collection
 */
public static <T> PCollection<T> sort(PCollection<T> collection, Order order) {
  PTypeFamily typeFamily = collection.getTypeFamily();
  PTableType<T, Void> tableType = typeFamily.tableOf(collection.getPType(), typeFamily.nulls());
  Configuration pipelineConf = collection.getPipeline().getConfiguration();
  GroupingOptions groupingOptions =
      buildGroupingOptions(pipelineConf, typeFamily, collection.getPType(), order);

  // "sort-pre": wrap each element as the key of a pair whose value is null.
  PTable<T, Void> asKeys = collection.parallelDo("sort-pre",
      new DoFn<T, Pair<T, Void>>() {
        @Override
        public void process(T item, Emitter<Pair<T, Void>> sink) {
          sink.emit(Pair.of(item, (Void) null));
        }
      }, tableType);

  // Group by the element itself using the prepared options, then flatten again.
  PTable<T, Void> ordered = asKeys.groupByKey(groupingOptions).ungroup();

  // "sort-post": unwrap the pairs, emitting only the key of each.
  return ordered.parallelDo("sort-post",
      new DoFn<Pair<T, Void>, T>() {
        @Override
        public void process(Pair<T, Void> pair, Emitter<T> sink) {
          sink.emit(pair.first());
        }
      }, collection.getPType());
}