/**
 * Runs a Hadoop {@link Reducer} implementation over the contents of a grouped table,
 * producing a new {@code PTable} keyed and valued by the given {@link Writable} classes.
 *
 * @param input the grouped table whose entries are fed to the reducer
 * @param reducerClass the {@code Reducer} implementation to apply
 * @param keyClass the output key class
 * @param valueClass the output value class
 * @return a {@code PTable} of the reducer's output key/value pairs
 */
public static <K1, V1, K2 extends Writable, V2 extends Writable> PTable<K2, V2> reduce(
    PGroupedTable<K1, V1> input,
    Class<? extends Reducer<K1, V1, K2, V2>> reducerClass,
    Class<K2> keyClass,
    Class<V2> valueClass) {
  // Wrap the Reducer class in a DoFn so it can run inside a parallelDo.
  ReducerFn<K1, V1, K2, V2> fn = new ReducerFn<K1, V1, K2, V2>(reducerClass);
  return input.parallelDo(fn, tableOf(keyClass, valueClass));
}
/**
 * Performs a secondary sort on the given {@code PTable} and then applies {@code doFn}
 * to the sorted data, yielding an output {@code PCollection<T>}.
 *
 * @param input the table to secondary-sort
 * @param doFn the function applied to each key and its sorted values
 * @param ptype the element type of the resulting collection
 * @return the output {@code PCollection<T>}
 */
public static <K, V1, V2, T> PCollection<T> sortAndApply(PTable<K, Pair<V1, V2>> input,
    DoFn<Pair<K, Iterable<Pair<V1, V2>>>, T> doFn, PType<T> ptype) {
  // Wrap the user DoFn so it can consume the prepared (sorted) representation.
  SSWrapFn<K, V1, V2, T> wrapped = new SSWrapFn<K, V1, V2, T>(doFn);
  return prepare(input).parallelDo("SecondarySort.apply", wrapped, ptype);
}
/**
 * Applies the given Hadoop {@link Reducer} class to a grouped table and returns the
 * reducer's output as a {@code PTable<K2, V2>}.
 *
 * @param input the grouped table to reduce
 * @param reducerClass the {@code Reducer} implementation to run
 * @param keyClass the class of the output keys
 * @param valueClass the class of the output values
 * @return the reduced table
 */
public static <K1, V1, K2 extends Writable, V2 extends Writable> PTable<K2, V2> reduce(
    PGroupedTable<K1, V1> input, Class<? extends Reducer<K1, V1, K2, V2>> reducerClass,
    Class<K2> keyClass, Class<V2> valueClass) {
  // The output table type is derived from the supplied key/value classes.
  return input.parallelDo(
      new ReducerFn<K1, V1, K2, V2>(reducerClass),
      tableOf(keyClass, valueClass));
}
/**
 * Performs a secondary sort on the given {@code PTable} and then applies {@code doFn}
 * to the sorted data, yielding an output {@code PTable<U, V>}.
 *
 * @param input the table to secondary-sort
 * @param doFn the function applied to each key and its sorted values
 * @param ptype the table type of the result
 * @return the output {@code PTable<U, V>}
 */
public static <K, V1, V2, U, V> PTable<U, V> sortAndApply(PTable<K, Pair<V1, V2>> input,
    DoFn<Pair<K, Iterable<Pair<V1, V2>>>, Pair<U, V>> doFn, PTableType<U, V> ptype) {
  // Wrap the user DoFn so it can consume the prepared (sorted) representation.
  SSWrapFn<K, V1, V2, Pair<U, V>> wrapped = new SSWrapFn<K, V1, V2, Pair<U, V>>(doFn);
  return prepare(input).parallelDo("SecondarySort.apply", wrapped, ptype);
}
/**
 * Performs a secondary sort on the given {@code PTable} and then applies {@code doFn}
 * to the sorted data, yielding an output {@code PCollection<T>}, using the given
 * number of reducers.
 *
 * @param input the table to secondary-sort
 * @param doFn the function applied to each key and its sorted values
 * @param ptype the element type of the resulting collection
 * @param numReducers the number of reducers to use
 * @return the output {@code PCollection<T>}
 */
public static <K, V1, V2, T> PCollection<T> sortAndApply(
    PTable<K, Pair<V1, V2>> input,
    DoFn<Pair<K, Iterable<Pair<V1, V2>>>, T> doFn,
    PType<T> ptype,
    int numReducers) {
  // Wrap the user DoFn so it can consume the prepared (sorted) representation.
  SSWrapFn<K, V1, V2, T> wrapped = new SSWrapFn<K, V1, V2, T>(doFn);
  return prepare(input, numReducers).parallelDo("SecondarySort.apply", wrapped, ptype);
}
/**
 * Performs a secondary sort on the given {@code PTable} and then applies {@code doFn}
 * to the sorted data, yielding an output {@code PTable<U, V>}, using the given
 * number of reducers.
 *
 * @param input the table to secondary-sort
 * @param doFn the function applied to each key and its sorted values
 * @param ptype the table type of the result
 * @param numReducers the number of reducers to use
 * @return the output {@code PTable<U, V>}
 */
public static <K, V1, V2, U, V> PTable<U, V> sortAndApply(
    PTable<K, Pair<V1, V2>> input,
    DoFn<Pair<K, Iterable<Pair<V1, V2>>>, Pair<U, V>> doFn,
    PTableType<U, V> ptype,
    int numReducers) {
  // Wrap the user DoFn so it can consume the prepared (sorted) representation.
  SSWrapFn<K, V1, V2, Pair<U, V>> wrapped = new SSWrapFn<K, V1, V2, Pair<U, V>>(doFn);
  return prepare(input, numReducers).parallelDo("SecondarySort.apply", wrapped, ptype);
}
/**
 * One-to-many join with a user-specified number of reducers.
 *
 * @param left left-side table to join
 * @param right right-side table to join
 * @param postProcessFn DoFn that processes the results of the join
 * @param ptype type of the output of {@code postProcessFn}
 * @param numReducers the number of reducers to use
 * @return the post-processed output of the join
 */
public static <K, U, V, T> PCollection<T> oneToManyJoin(PTable<K, U> left,
    PTable<K, V> right, DoFn<Pair<U, Iterable<V>>, T> postProcessFn, PType<T> ptype,
    int numReducers) {
  PGroupedTable<Pair<K, Integer>, Pair<U, V>> grouped =
      DefaultJoinStrategy.preJoin(left, right, numReducers);
  // The join fn needs the left value type to reconstruct the left-side records.
  OneToManyJoinFn<K, U, V, T> joinFn =
      new OneToManyJoinFn<K, U, V, T>(left.getValueType(), postProcessFn);
  return grouped.parallelDo("One to many join " + grouped.getName(), joinFn, ptype);
}
/**
 * Groups the table by key and gathers each key's values into an in-memory
 * {@link Collection}, detaching every value via the value {@code PType} so the
 * collected copies are independent of runtime object reuse.
 *
 * @param collect the table whose values are gathered per key
 * @return a table mapping each key to the collection of its values
 */
public static <K, V> PTable<K, Collection<V>> collectValues(PTable<K, V> collect) {
  PTypeFamily tf = collect.getTypeFamily();
  final PType<V> valueType = collect.getValueType();
  return collect.groupByKey().parallelDo("collect",
      new MapValuesFn<K, Iterable<V>, Collection<V>>() {
        @Override
        public void initialize() {
          // The value PType must see the job Configuration before detaching values.
          valueType.initialize(getConfiguration());
        }

        @Override
        public Collection<V> map(Iterable<V> values) {
          List<V> copies = Lists.newArrayList();
          for (V value : values) {
            copies.add(valueType.getDetachedValue(value));
          }
          return copies;
        }
      }, tf.tableOf(collect.getKeyType(), tf.collections(collect.getValueType())));
}
}
/**
 * An analogue of {@code mapValues} for {@code PGroupedTable<K, U>} collections: maps
 * each key's iterable of values to a single value while leaving the key untouched.
 *
 * @param name the name of the operation
 * @param ptable the {@code PGroupedTable} to be mapped
 * @param mapFn the mapping function applied to each key's values
 * @param ptype the {@code PType} for the returned values
 * @return a new {@code PTable<K, V>} instance
 */
public static <K, U, V> PTable<K, V> mapValues(String name, PGroupedTable<K, U> ptable,
    MapFn<Iterable<U>, V> mapFn, PType<V> ptype) {
  PTypeFamily ptf = ptable.getTypeFamily();
  // The first sub-type of a grouped table's PType is its key type; the cast is unchecked.
  @SuppressWarnings("unchecked")
  PType<K> keyType = (PType<K>) ptable.getPType().getSubTypes().get(0);
  // Identity on keys, mapFn on the grouped values.
  PairMapFn<K, Iterable<U>, K, V> pairFn =
      new PairMapFn<K, Iterable<U>, K, V>(IdentityFn.<K>getInstance(), mapFn);
  return ptable.parallelDo(name, pairFn, ptf.tableOf(keyType, ptype));
}
/**
 * A {@code distinct} operation that lets the caller control how frequently elements
 * are flushed to disk, trading memory consumption against performance.
 *
 * @param input the input {@code PCollection}
 * @param flushEvery flush to disk after encountering this many unique values
 * @return a new {@code PCollection} containing the unique elements of the input
 * @throws IllegalArgumentException if {@code flushEvery} is not positive
 */
public static <S> PCollection<S> distinct(PCollection<S> input, int flushEvery) {
  Preconditions.checkArgument(flushEvery > 0);
  PType<S> elemType = input.getPType();
  PTypeFamily family = elemType.getFamily();
  // Map each element to (element, null), group, then emit one element per key.
  return input
      .parallelDo("pre-distinct", new PreDistinctFn<S>(flushEvery),
          family.tableOf(elemType, family.nulls()))
      .groupByKey()
      .parallelDo("post-distinct", new PostDistinctFn<S>(), elemType);
}
/**
 * A {@code distinct} operation giving the client control over how often unique
 * elements are flushed to disk, for tuning performance or memory consumption.
 *
 * @param input the input {@code PCollection}
 * @param flushEvery flush to disk after encountering this many unique values
 * @return a new {@code PCollection} containing the unique elements of the input
 * @throws IllegalArgumentException if {@code flushEvery} is not positive
 */
public static <S> PCollection<S> distinct(PCollection<S> input, int flushEvery) {
  Preconditions.checkArgument(flushEvery > 0);
  PType<S> pt = input.getPType();
  PTypeFamily ptf = pt.getFamily();
  // Stage 1: emit (element, null) pairs with periodic flushing; stage 2: group and
  // emit each unique element once.
  PTable<S, Void> keyed = input.parallelDo(
      "pre-distinct", new PreDistinctFn<S>(flushEvery, pt), ptf.tableOf(pt, ptf.nulls()));
  return keyed.groupByKey().parallelDo("post-distinct", new PostDistinctFn<S>(), pt);
}
/**
 * Groups the table by key and gathers each key's values into an in-memory
 * {@link Collection}, detaching every value via the value {@code PType} so the
 * collected copies are independent of runtime object reuse.
 *
 * @param collect the table whose values are gathered per key
 * @return a table mapping each key to the collection of its values
 */
public static <K, V> PTable<K, Collection<V>> collectValues(PTable<K, V> collect) {
  PTypeFamily tf = collect.getTypeFamily();
  final PType<V> valueType = collect.getValueType();
  return collect.groupByKey().parallelDo("collect",
      new MapValuesFn<K, Iterable<V>, Collection<V>>() {
        @Override
        public void initialize() {
          // Fix: the value PType must be initialized with the job Configuration
          // before getDetachedValue can be safely used.
          valueType.initialize(getConfiguration());
        }

        @Override
        public Collection<V> map(Iterable<V> values) {
          List<V> collected = Lists.newArrayList();
          for (V value : values) {
            collected.add(valueType.getDetachedValue(value));
          }
          return collected;
        }
      }, tf.tableOf(collect.getKeyType(), tf.collections(collect.getValueType())));
}
}
.by(new GetSessionKey(), Avros.strings()) .groupByKey() .parallelDo(new MakeSession(), Avros.specifics(Session.class));
/**
 * Joins two tables using the supplied {@code JoinFn} and returns the joined pairs
 * keyed by the shared key type.
 *
 * @param left left table to be joined
 * @param right right table to be joined
 * @param joinFn the join strategy to apply to the grouped records
 * @return a table of joined {@code Pair<U, V>} values per key
 */
public static <K, U, V> PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right,
    JoinFn<K, U, V> joinFn) {
  PTypeFamily ptf = left.getTypeFamily();
  PGroupedTable<Pair<K, Integer>, Pair<U, V>> grouped = preJoin(left, right);
  // Result: original key mapped to pairs of left and right values.
  PTableType<K, Pair<U, V>> outType = ptf.tableOf(
      left.getKeyType(), ptf.pairs(left.getValueType(), right.getValueType()));
  return grouped.parallelDo(joinFn.getJoinType() + grouped.getName(), joinFn, outType);
}
/**
 * Joins the left and right tables with the given {@code JoinFn}, producing a table
 * of left/right value pairs per key.
 *
 * @param left left table to be joined
 * @param right right table to be joined
 * @param joinFn the join strategy to apply to the grouped records
 * @return the joined table
 */
public static <K, U, V> PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right,
    JoinFn<K, U, V> joinFn) {
  PTypeFamily typeFamily = left.getTypeFamily();
  PGroupedTable<Pair<K, Integer>, Pair<U, V>> preJoined = preJoin(left, right);
  PTableType<K, Pair<U, V>> resultType = typeFamily.tableOf(
      left.getKeyType(),
      typeFamily.pairs(left.getValueType(), right.getValueType()));
  return preJoined.parallelDo(joinFn.getJoinType() + preJoined.getName(), joinFn, resultType);
}
/**
 * Performs a default join on the given {@code PTable} instances using a
 * user-specified {@code JoinFn}, with the configured number of reducers.
 *
 * @param left left table to be joined
 * @param right right table to be joined
 * @param joinFn the user-specified implementation of the {@code JoinFn} class
 * @return joined tables
 */
public PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right,
    JoinFn<K, U, V> joinFn) {
  PTypeFamily typeFamily = left.getTypeFamily();
  // numReducers is configured on this strategy instance.
  PGroupedTable<Pair<K, Integer>, Pair<U, V>> preJoined = preJoin(left, right, numReducers);
  PTableType<K, Pair<U, V>> resultType = typeFamily.tableOf(
      left.getKeyType(),
      typeFamily.pairs(left.getValueType(), right.getValueType()));
  return preJoined.parallelDo(joinFn.getJoinType() + preJoined.getName(), joinFn, resultType);
}
/**
 * Calculates a set of quantiles for each key in a numerically-valued table.
 *
 * Quantiles are computed per key by grouping, reading each key's data into memory,
 * then sorting and calculating. This is much faster than the distributed option, but
 * on the order of 10M+ values per key performance may degrade or OOMs may occur.
 *
 * Uses the "nearest rank" quantile definition:
 * http://en.wikipedia.org/wiki/Percentile#Definition
 *
 * @param table numerically-valued PTable
 * @param p1 first quantile (in the range 0.0 - 1.0)
 * @param pn additional quantiles (in the range 0.0 - 1.0)
 * @param <K> key type of the table
 * @param <V> value type of the table (must extend java.lang.Number)
 * @return PTable mapping each key to the requested quantiles and their results
 */
public static <K, V extends Comparable> PTable<K, Result<V>> inMemory(PTable<K, V> table,
    double p1, double... pn) {
  final List<Double> quantiles = createListFromVarargs(p1, pn);
  PTypeFamily typeFamily = table.getTypeFamily();
  PTableType<K, Result<V>> resultType =
      typeFamily.tableOf(table.getKeyType(), Result.pType(table.getValueType()));
  return table.groupByKey().parallelDo(new InMemoryQuantiles<K, V>(quantiles), resultType);
}
/**
 * Calculates a set of percentiles for each key in a numerically-valued table.
 *
 * Percentiles are computed per key by grouping, reading each key's data into memory,
 * then sorting and calculating. This is much faster than the distributed option, but
 * on the order of 10M+ values per key performance may degrade or OOMs may occur.
 *
 * Uses the "nearest rank" percentile definition:
 * http://en.wikipedia.org/wiki/Percentile#Definition
 *
 * @param table numerically-valued PTable
 * @param p1 first percentile (in the range 0.0 - 1.0)
 * @param pn additional percentiles (in the range 0.0 - 1.0)
 * @param <K> key type of the table
 * @param <V> value type of the table (must extend java.lang.Number)
 * @return PTable mapping each key to the requested percentiles and their results
 */
public static <K, V extends Comparable> PTable<K, Result<V>> inMemory(PTable<K, V> table,
    double p1, double... pn) {
  final List<Double> percentiles = createListFromVarargs(p1, pn);
  PTypeFamily typeFamily = table.getTypeFamily();
  PTableType<K, Result<V>> resultType =
      typeFamily.tableOf(table.getKeyType(), Result.pType(table.getValueType()));
  return table.groupByKey().parallelDo(new InMemoryPercentiles<K, V>(percentiles), resultType);
}
/**
 * Co-groups the two {@link PTable} arguments.
 *
 * @return a {@code PTable} representing the co-grouped tables.
 */
public static <K, U, V> PTable<K, Pair<Collection<U>, Collection<V>>> cogroup(
    PTable<K, U> left, PTable<K, V> right) {
  PTypeFamily typeFamily = left.getTypeFamily();
  PType<K> keyType = left.getPTableType().getKeyType();
  PType<U> leftValueType = left.getPTableType().getValueType();
  PType<V> rightValueType = right.getPTableType().getValueType();

  // Tag each side's values into a shared Pair<U, V> intermediate representation.
  PType<Pair<U, V>> taggedType = typeFamily.pairs(leftValueType, rightValueType);
  PTable<K, Pair<U, V>> taggedLeft = left.parallelDo("coGroupTag1",
      new CogroupFn1<K, U, V>(), typeFamily.tableOf(keyType, taggedType));
  PTable<K, Pair<U, V>> taggedRight = right.parallelDo("coGroupTag2",
      new CogroupFn2<K, U, V>(), typeFamily.tableOf(keyType, taggedType));

  // Union both sides, group by key, and separate the tags back into two collections.
  PType<Pair<Collection<U>, Collection<V>>> outType = typeFamily.pairs(
      typeFamily.collections(leftValueType), typeFamily.collections(rightValueType));
  return taggedLeft.union(taggedRight)
      .groupByKey()
      .parallelDo("cogroup", new PostGroupFn<K, U, V>(), typeFamily.tableOf(keyType, outType));
}
/**
 * Co-groups the two {@link PTable} arguments.
 *
 * @return a {@code PTable} representing the co-grouped tables.
 */
public static <K, U, V> PTable<K, Pair<Collection<U>, Collection<V>>> cogroup(
    PTable<K, U> left, PTable<K, V> right) {
  PTypeFamily ptf = left.getTypeFamily();
  PType<K> keyType = left.getPTableType().getKeyType();
  PType<U> leftType = left.getPTableType().getValueType();
  PType<V> rightType = right.getPTableType().getValueType();

  // Both inputs are mapped into a common Pair<U, V> type so they can be unioned.
  PType<Pair<U, V>> intermediateType = ptf.pairs(leftType, rightType);
  PTable<K, Pair<U, V>> leftSide = left.parallelDo(
      "coGroupTag1", new CogroupFn1<K, U, V>(), ptf.tableOf(keyType, intermediateType));
  PTable<K, Pair<U, V>> rightSide = right.parallelDo(
      "coGroupTag2", new CogroupFn2<K, U, V>(), ptf.tableOf(keyType, intermediateType));
  PTable<K, Pair<U, V>> combined = leftSide.union(rightSide);

  // After grouping, collect the tagged values back into per-side collections.
  PType<Pair<Collection<U>, Collection<V>>> outputType = ptf.pairs(
      ptf.collections(leftType),
      ptf.collections(rightType));
  return combined.groupByKey().parallelDo(
      "cogroup", new PostGroupFn<K, U, V>(), ptf.tableOf(keyType, outputType));
}