/**
 * Builds a table in which every entry's key and value have traded places.
 * The key and value PTypes of the input are reused, in the opposite order, for the result.
 *
 * @param table the PTable whose entries are flipped
 * @param <K> key type of the input (becomes the value type of the result)
 * @param <V> value type of the input (becomes the key type of the result)
 * @return a {@code PTable<V, K>} with one flipped entry per entry of {@code table}
 */
public static <K, V> PTable<V, K> swapKeyValue(PTable<K, V> table) {
  MapFn<Pair<K, V>, Pair<V, K>> flipFn = new MapFn<Pair<K, V>, Pair<V, K>>() {
    @Override
    public Pair<V, K> map(Pair<K, V> entry) {
      V newKey = entry.second();
      K newValue = entry.first();
      return Pair.of(newKey, newValue);
    }
  };
  PTypeFamily family = table.getTypeFamily();
  // The result type is the input type with key and value PTypes exchanged.
  return table.parallelDo(flipFn, family.tableOf(table.getValueType(), table.getKeyType()));
}
/**
 * Exchanges the key and the value of every entry of a table.
 * The result's table type is built from the input's value PType (as key) and key PType (as value).
 *
 * @param table the table to transform
 * @param <K> input key type, which is the output value type
 * @param <V> input value type, which is the output key type
 * @return a {@code PTable<V, K>} whose entries are the input entries with both parts exchanged
 */
public static <K, V> PTable<V, K> swapKeyValue(PTable<K, V> table) {
  MapFn<Pair<K, V>, Pair<V, K>> exchange = new MapFn<Pair<K, V>, Pair<V, K>>() {
    @Override
    public Pair<V, K> map(Pair<K, V> pair) {
      return Pair.of(pair.second(), pair.first());
    }
  };
  PTypeFamily typeFamily = table.getTypeFamily();
  return table.parallelDo(
      exchange,
      typeFamily.tableOf(table.getValueType(), table.getKeyType()));
}
} // closes the enclosing scope (its header is not visible in this chunk)
/**
 * Groups a table by key and gathers the values of each key into a single collection.
 *
 * @param collect the table whose values are gathered per key
 * @param <K> key type
 * @param <V> value type
 * @return a table mapping each key to a collection of detached copies of its values
 */
public static <K, V> PTable<K, Collection<V>> collectValues(PTable<K, V> collect) {
  final PType<V> elementType = collect.getValueType();
  MapValuesFn<K, Iterable<V>, Collection<V>> gatherFn =
      new MapValuesFn<K, Iterable<V>, Collection<V>>() {
        @Override
        public void initialize() {
          // The value PType is initialized with the job configuration before it is used in map().
          elementType.initialize(getConfiguration());
        }

        public Collection<V> map(Iterable<V> values) {
          List<V> gathered = Lists.newArrayList();
          for (V element : values) {
            // Store a detached copy of each element rather than the iterated instance.
            V detached = elementType.getDetachedValue(element);
            gathered.add(detached);
          }
          return gathered;
        }
      };
  PTypeFamily family = collect.getTypeFamily();
  return collect.groupByKey().parallelDo(
      "collect",
      gatherFn,
      family.tableOf(collect.getKeyType(), family.collections(collect.getValueType())));
}
} // closes the enclosing scope (its header is not visible in this chunk)
/**
 * Joins two tables by handing them to {@code mapsideJoinStrategy} with the sides swapped,
 * then flipping each joined value pair back into (left, right) order.
 *
 * <p>Because the sides are swapped, the join type is mirrored: an inner join stays an inner
 * join and a right outer join is run as a left outer join.
 *
 * @param left left table of the join
 * @param right right table of the join
 * @param joinType requested join type; only {@code INNER_JOIN} and {@code RIGHT_OUTER_JOIN} are handled
 * @return the joined table with values ordered as (left value, right value)
 * @throws UnsupportedOperationException for any other join type
 */
@Override
public PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right, JoinType joinType) {
  final JoinType mirroredType;
  switch (joinType) {
    case RIGHT_OUTER_JOIN:
      mirroredType = JoinType.LEFT_OUTER_JOIN;
      break;
    case INNER_JOIN:
      mirroredType = JoinType.INNER_JOIN;
      break;
    default:
      throw new UnsupportedOperationException("Join type " + joinType + " is not supported");
  }

  // Run the delegate join with right and left exchanged; its values come back as (right, left).
  PTable<K, Pair<V, U>> swappedJoin = mapsideJoinStrategy.join(right, left, mirroredType);

  return swappedJoin.mapValues(
      "Reverse order out output table values",
      new ReversePairOrderFn<V, U>(),
      left.getTypeFamily().pairs(left.getValueType(), right.getValueType()));
}
} // closes the enclosing scope (its header is not visible in this chunk)
/**
 * Sorts the {@link PTable} by the natural ordering of its keys, in the requested direction.
 *
 * <p>The table is grouped by key with grouping options obtained from
 * {@code buildGroupingOptions} and then ungrouped again.
 *
 * @param table the table to sort
 * @param key the order to apply to the keys
 * @return a {@link PTable} representing the sorted collection.
 */
public static <K, V> PTable<K, V> sort(PTable<K, V> table, Order key) {
  GroupingOptions sortOptions = buildGroupingOptions(
      table.getPipeline().getConfiguration(),
      table.getTypeFamily(),
      table.getKeyType(),
      key);
  return table.groupByKey(sortOptions).ungroup();
}
/**
 * Orders a {@link PTable} by the natural ordering of its keys, ascending or descending as
 * specified by {@code key}.
 *
 * @param table the table to order
 * @param key the direction in which the keys are ordered
 * @return a {@link PTable} representing the sorted collection.
 */
public static <K, V> PTable<K, V> sort(PTable<K, V> table, Order key) {
  Configuration pipelineConf = table.getPipeline().getConfiguration();
  PTypeFamily family = table.getTypeFamily();
  // Grouping with these options and ungrouping again yields the table in key order.
  GroupingOptions byKeyOrder =
      buildGroupingOptions(pipelineConf, family, table.getKeyType(), key);
  return table
      .groupByKey(byKeyOrder)
      .ungroup();
}
/**
 * Transforms the values of a {@code PTable<K, U>} with the given {@code MapFn<U, V>},
 * leaving the keys untouched, and returns the resulting {@code PTable<K, V>}.
 *
 * @param name The name of the transform
 * @param ptable The {@code PTable} whose values are mapped
 * @param mapFn The function applied to each value
 * @param ptype The PType for the returned values
 * @return A new {@code PTable<K, V>} instance
 */
public static <K, U, V> PTable<K, V> mapValues(String name, PTable<K, U> ptable,
    MapFn<U, V> mapFn, PType<V> ptype) {
  // Keys pass through the identity function; values go through the caller's function.
  PairMapFn<K, U, K, V> valueOnlyFn =
      new PairMapFn<K, U, K, V>(IdentityFn.<K>getInstance(), mapFn);
  PTypeFamily family = ptable.getTypeFamily();
  return ptable.parallelDo(name, valueOnlyFn, family.tableOf(ptable.getKeyType(), ptype));
}
/**
 * Transforms the keys of a {@code PTable<K1, V>} with the given {@code MapFn<K1, K2>},
 * leaving the values untouched, and returns the resulting {@code PTable<K2, V>}.
 *
 * @param name The name of the transform
 * @param ptable The {@code PTable} whose keys are mapped
 * @param mapFn The function applied to each key
 * @param ptype The PType for the returned keys
 * @return A new {@code PTable<K2, V>} instance
 */
public static <K1, K2, V> PTable<K2, V> mapKeys(String name, PTable<K1, V> ptable,
    MapFn<K1, K2> mapFn, PType<K2> ptype) {
  // Keys go through the caller's function; values pass through the identity function.
  PairMapFn<K1, V, K2, V> keyOnlyFn =
      new PairMapFn<K1, V, K2, V>(mapFn, IdentityFn.<V>getInstance());
  PTypeFamily family = ptable.getTypeFamily();
  return ptable.parallelDo(name, keyOnlyFn, family.tableOf(ptype, ptable.getValueType()));
}
/**
 * Groups a table by key and turns the values of each key into a single collection.
 *
 * @param collect the table whose values are gathered per key
 * @param <K> key type
 * @param <V> value type
 * @return a table mapping each key to a collection of detached copies of its values
 */
public static <K, V> PTable<K, Collection<V>> collectValues(PTable<K, V> collect) {
  final PType<V> elementType = collect.getValueType();
  MapFn<Iterable<V>, Collection<V>> toCollection = new MapFn<Iterable<V>, Collection<V>>() {
    @Override
    public void initialize() {
      // The value PType is initialized with the job configuration before it is used in map().
      elementType.initialize(getConfiguration());
    }

    public Collection<V> map(Iterable<V> values) {
      List<V> gathered = Lists.newArrayList();
      for (V element : values) {
        // Store a detached copy of each element rather than the iterated instance.
        gathered.add(elementType.getDetachedValue(element));
      }
      return gathered;
    }
  };
  PTypeFamily family = collect.getTypeFamily();
  return collect.groupByKey()
      .mapValues("collect", toCollection, family.collections(collect.getValueType()));
}
/**
 * Groups a table by key and gathers the values of each key into a single collection.
 *
 * <p>Each value is stored as a detached copy obtained from the table's value PType. The PType
 * is initialized with the function's configuration in {@code initialize()} so that it is ready
 * before {@code getDetachedValue} is called on it.
 *
 * @param collect the table whose values are gathered per key
 * @param <K> key type
 * @param <V> value type
 * @return a table mapping each key to a collection of detached copies of its values
 */
public static <K, V> PTable<K, Collection<V>> collectValues(PTable<K, V> collect) {
  PTypeFamily tf = collect.getTypeFamily();
  final PType<V> valueType = collect.getValueType();
  return collect.groupByKey().parallelDo("collect",
      new MapValuesFn<K, Iterable<V>, Collection<V>>() {
        @Override
        public void initialize() {
          // Without this the PType is used uninitialized by getDetachedValue below.
          valueType.initialize(getConfiguration());
        }

        public Collection<V> map(Iterable<V> values) {
          List<V> collected = Lists.newArrayList();
          for (V value : values) {
            // Keep a detached copy rather than the instance handed out by the iterable.
            collected.add(valueType.getDetachedValue(value));
          }
          return collected;
        }
      }, tf.tableOf(collect.getKeyType(), tf.collections(collect.getValueType())));
}
} // closes the enclosing scope (its header is not visible in this chunk)
/**
 * Joins two tables with a caller-supplied {@code JoinFn}.
 *
 * <p>The inputs are first combined and grouped by {@code preJoin}; the join function is then
 * run over the grouped table. The stage name is the join function's join type followed by
 * the grouped table's name.
 *
 * @param left left table to be joined
 * @param right right table to be joined
 * @param joinFn the join implementation to apply
 * @return a table keyed like {@code left} whose values pair a left value with a right value
 */
public static <K, U, V> PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right,
    JoinFn<K, U, V> joinFn) {
  PTypeFamily family = left.getTypeFamily();
  PGroupedTable<Pair<K, Integer>, Pair<U, V>> taggedGroups = preJoin(left, right);
  PTableType<K, Pair<U, V>> joinedType = family.tableOf(
      left.getKeyType(),
      family.pairs(left.getValueType(), right.getValueType()));
  String stageName = joinFn.getJoinType() + taggedGroups.getName();
  return taggedGroups.parallelDo(stageName, joinFn, joinedType);
}
/**
 * Performs a join of two tables using the supplied {@code JoinFn}.
 *
 * <p>{@code preJoin} prepares and groups the two inputs, and {@code joinFn} is applied to the
 * grouped result under a name built from the join type and the grouped table's name.
 *
 * @param left left table to be joined
 * @param right right table to be joined
 * @param joinFn the join implementation to apply
 * @return the joined table, with the key type of {@code left} and (left value, right value) pairs
 */
public static <K, U, V> PTable<K, Pair<U, V>> join(
    PTable<K, U> left,
    PTable<K, V> right,
    JoinFn<K, U, V> joinFn) {
  PTypeFamily typeFamily = left.getTypeFamily();
  PGroupedTable<Pair<K, Integer>, Pair<U, V>> prepared = preJoin(left, right);
  PTableType<K, Pair<U, V>> resultType =
      typeFamily.tableOf(
          left.getKeyType(),
          typeFamily.pairs(left.getValueType(), right.getValueType()));
  return prepared.parallelDo(
      joinFn.getJoinType() + prepared.getName(),
      joinFn,
      resultType);
}
/**
 * Joins the given {@code PTable} instances with a user-specified {@code JoinFn}.
 *
 * <p>The inputs are prepared and grouped by {@code preJoin}, which is also handed this
 * strategy's {@code numReducers}; the join function then runs over the grouped table.
 *
 * @param left left table to be joined
 * @param right right table to be joined
 * @param joinFn The user-specified implementation of the {@code JoinFn} class
 * @return joined tables
 */
public PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right,
    JoinFn<K, U, V> joinFn) {
  PTypeFamily family = left.getTypeFamily();
  PGroupedTable<Pair<K, Integer>, Pair<U, V>> taggedGroups = preJoin(left, right, numReducers);
  PTableType<K, Pair<U, V>> joinedType = family.tableOf(
      left.getKeyType(),
      family.pairs(left.getValueType(), right.getValueType()));
  // Stage name: join type followed by the name of the grouped table.
  String stageName = joinFn.getJoinType() + taggedGroups.getName();
  return taggedGroups.parallelDo(stageName, joinFn, joinedType);
}
/**
 * Re-keys a table of pairs for secondary sorting and groups it.
 *
 * <p>Each record is reformatted by {@code SSFormatFn} into a table keyed by a
 * (key, first pair element) pair, then grouped with sorted keys and the grouping comparator
 * and partitioner that {@code JoinUtils} supplies for the table's type family.
 *
 * @param input table whose values are pairs
 * @param numReducers reducer count to request; ignored unless greater than zero
 * @return the grouped, re-keyed table
 */
private static <K, V1, V2> PGroupedTable<Pair<K, V1>, Pair<V1, V2>> prepare(
    PTable<K, Pair<V1, V2>> input, int numReducers) {
  PTypeFamily family = input.getTypeFamily();
  PType<Pair<V1, V2>> pairType = input.getValueType();
  // Intermediate key = (original key, first element of the value pair).
  PTableType<Pair<K, V1>, Pair<V1, V2>> formattedType = family.tableOf(
      family.pairs(input.getKeyType(), pairType.getSubTypes().get(0)),
      pairType);

  GroupingOptions.Builder groupingBuilder = GroupingOptions.builder()
      .requireSortedKeys()
      .groupingComparatorClass(JoinUtils.getGroupingComparator(family))
      .partitionerClass(JoinUtils.getPartitionerClass(family));
  boolean reducersRequested = numReducers > 0;
  if (reducersRequested) {
    groupingBuilder.numReducers(numReducers);
  }

  PTable<Pair<K, V1>, Pair<V1, V2>> formatted =
      input.parallelDo("SecondarySort.format", new SSFormatFn<K, V1, V2>(), formattedType);
  return formatted.groupByKey(groupingBuilder.build());
}
/**
 * Selects up to {@code limit} entries of a table.
 *
 * <p>Entries are first passed through {@code TopKFn} into an integer-keyed intermediate table,
 * grouped with {@code groupByKey(1)}, combined with {@code TopKCombineFn}, and finally
 * unwrapped back into the original table type.
 *
 * @param ptable the table to select from
 * @param limit number of entries requested; passed to both top-k functions
 * @param maximize selection direction flag; passed to both top-k functions
 * @return a table of the original type holding the selected entries
 */
public static <K, V> PTable<K, V> top(PTable<K, V> ptable, int limit, boolean maximize) {
  PTypeFamily family = ptable.getTypeFamily();
  PTableType<K, V> originalType = ptable.getPTableType();
  PTableType<Integer, Pair<K, V>> intermediateType = family.tableOf(
      family.ints(),
      family.pairs(originalType.getKeyType(), originalType.getValueType()));

  PTable<Integer, Pair<K, V>> candidates = ptable.parallelDo(
      "top" + limit + "map", new TopKFn<K, V>(limit, maximize), intermediateType);
  PTable<Integer, Pair<K, V>> combined = candidates
      .groupByKey(1)
      .combineValues(new TopKCombineFn<K, V>(limit, maximize));

  // Drop the integer key and emit the wrapped (key, value) entry as-is.
  DoFn<Pair<Integer, Pair<K, V>>, Pair<K, V>> unwrapFn =
      new DoFn<Pair<Integer, Pair<K, V>>, Pair<K, V>>() {
        public void process(Pair<Integer, Pair<K, V>> input, Emitter<Pair<K, V>> emitter) {
          emitter.emit(input.second());
        }
      };
  return combined.parallelDo("top" + limit + "reduce", unwrapFn, originalType);
}
/**
 * Runs the map-side join of {@code left} against a readable form of {@code right}.
 *
 * <p>The right table is obtained through {@code asReadable(materialize)} and handed to a
 * {@code MapsideJoinDoFn}; its source targets are registered on the parallel-do options so
 * the "mapjoin" stage is declared to depend on them.
 *
 * @param left table the join function is applied to
 * @param right table made available to the join function as readable data
 * @param includeUnmatchedLeftValues forwarded to {@code MapsideJoinDoFn}
 * @return a table keyed like {@code left} whose values pair a left value with a right value
 */
private PTable<K, Pair<U,V>> joinInternal(PTable<K, U> left, PTable<K, V> right,
    boolean includeUnmatchedLeftValues) {
  ReadableData<Pair<K, V>> rightData = right.asReadable(materialize);
  MapsideJoinDoFn<K, U, V> joinFn = new MapsideJoinDoFn<K, U, V>(
      rightData, right.getPTableType(), includeUnmatchedLeftValues);
  ParallelDoOptions doOptions = ParallelDoOptions.builder()
      .sourceTargets(rightData.getSourceTargets())
      .build();

  PTypeFamily family = left.getTypeFamily();
  return left.parallelDo(
      "mapjoin",
      joinFn,
      family.tableOf(
          left.getKeyType(),
          family.pairs(left.getValueType(), right.getValueType())),
      doOptions);
}
/**
 * Picks up to {@code limit} entries from a table.
 *
 * <p>The pipeline is: {@code TopKFn} into an integer-keyed intermediate table, then
 * {@code groupByKey(1)}, then {@code TopKCombineFn} as the combiner, then a final step that
 * emits each wrapped entry under the table's original type.
 *
 * @param ptable the table to pick from
 * @param limit number of entries requested; passed to both top-k functions
 * @param maximize selection direction flag; passed to both top-k functions
 * @return a table of the original type holding the picked entries
 */
public static <K, V> PTable<K, V> top(PTable<K, V> ptable, int limit, boolean maximize) {
  PTypeFamily typeFamily = ptable.getTypeFamily();
  PTableType<K, V> resultType = ptable.getPTableType();
  PType<Pair<K, V>> entryType =
      typeFamily.pairs(resultType.getKeyType(), resultType.getValueType());
  PTableType<Integer, Pair<K, V>> wrappedType = typeFamily.tableOf(typeFamily.ints(), entryType);

  String mapStage = "top" + limit + "map";
  String reduceStage = "top" + limit + "reduce";

  PTable<Integer, Pair<K, V>> wrapped =
      ptable.parallelDo(mapStage, new TopKFn<K, V>(limit, maximize), wrappedType);
  PTable<Integer, Pair<K, V>> reduced =
      wrapped.groupByKey(1).combineValues(new TopKCombineFn<K, V>(limit, maximize));

  return reduced.parallelDo(
      reduceStage,
      new DoFn<Pair<Integer, Pair<K, V>>, Pair<K, V>>() {
        public void process(Pair<Integer, Pair<K, V>> input, Emitter<Pair<K, V>> emitter) {
          // Only the wrapped entry is forwarded; the integer key is discarded.
          Pair<K, V> entry = input.second();
          emitter.emit(entry);
        }
      },
      resultType);
}
/**
 * Re-keys a table of pairs for secondary sorting and groups it.
 *
 * <p>Each record is reformatted by {@code SSFormatFn} into a table keyed by a
 * (key, first pair element) pair, then grouped using the grouping comparator and partitioner
 * that {@code JoinUtils} supplies for the table's type family.
 *
 * @param input table whose values are pairs
 * @return the grouped, re-keyed table
 */
private static <K, V1, V2> PGroupedTable<Pair<K, V1>, Pair<V1, V2>> prepare(
    PTable<K, Pair<V1, V2>> input) {
  PTypeFamily ptf = input.getTypeFamily();
  PType<Pair<V1, V2>> valueType = input.getValueType();
  // Intermediate key = (original key, first element of the value pair).
  PTableType<Pair<K, V1>, Pair<V1, V2>> inter = ptf.tableOf(
      ptf.pairs(input.getKeyType(), valueType.getSubTypes().get(0)),
      valueType);
  return input.parallelDo("SecondarySort.format", new SSFormatFn<K, V1, V2>(), inter)
      .groupByKey(
          GroupingOptions.builder()
              .groupingComparatorClass(JoinUtils.getGroupingComparator(ptf))
              .partitionerClass(JoinUtils.getPartitionerClass(ptf))
              .build());
}
/**
 * Joins two tables by sharding their keys, delegating to the wrapped strategy, and
 * removing the shard component from the result.
 *
 * <p>Both inputs are re-keyed to (key, integer) pairs by the pre-shard functions, which
 * receive this strategy's {@code shardingStrategy}. The re-keyed tables are joined by
 * {@code wrappedJoinStrategy}, and {@code UnshardFn} maps the result back to the original
 * key type.
 *
 * @param left left table of the join
 * @param right right table of the join
 * @param joinType requested join type
 * @return the joined table keyed by the original key type
 * @throws UnsupportedOperationException if {@code joinType} is {@code FULL_OUTER_JOIN} or
 *     {@code LEFT_OUTER_JOIN}
 */
@Override
public PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right, JoinType joinType) {
  boolean unsupported =
      joinType == JoinType.LEFT_OUTER_JOIN || joinType == JoinType.FULL_OUTER_JOIN;
  if (unsupported) {
    throw new UnsupportedOperationException("Join type " + joinType + " not supported by ShardedJoinStrategy");
  }

  PTypeFamily family = left.getTypeFamily();

  // Table types for the two inputs once their keys carry the extra integer shard component.
  PTableType<Pair<K, Integer>, U> leftShardType =
      family.tableOf(family.pairs(left.getKeyType(), family.ints()), left.getValueType());
  PTableType<Pair<K, Integer>, V> rightShardType =
      family.tableOf(family.pairs(right.getKeyType(), family.ints()), right.getValueType());
  PTableType<K, Pair<U,V>> resultType =
      family.tableOf(left.getKeyType(), family.pairs(left.getValueType(), right.getValueType()));

  PTable<Pair<K,Integer>,U> leftSharded = left.parallelDo(
      "Pre-shard left", new PreShardLeftSideFn<K, U>(shardingStrategy), leftShardType);
  PTable<Pair<K,Integer>,V> rightSharded = right.parallelDo(
      "Pre-shard right", new PreShardRightSideFn<K, V>(shardingStrategy), rightShardType);

  PTable<Pair<K, Integer>, Pair<U, V>> joinedShards =
      wrappedJoinStrategy.join(leftSharded, rightSharded, joinType);

  return joinedShards.parallelDo("Unshard", new UnshardFn<K, U, V>(), resultType);
}
/**
 * Co-groups the two {@link PTable} arguments.
 *
 * <p>Each input is first tagged into a common {@code Pair<U, V>} value type by its own
 * function ({@code CogroupFn1} for the left, {@code CogroupFn2} for the right). The tagged
 * tables are unioned, grouped by key, and {@code PostGroupFn} turns each group into a pair
 * of collections.
 *
 * @param left the left table
 * @param right the right table
 * @return a {@code PTable} representing the co-grouped tables.
 */
public static <K, U, V> PTable<K, Pair<Collection<U>, Collection<V>>> cogroup(
    PTable<K, U> left, PTable<K, V> right) {
  PTypeFamily family = left.getTypeFamily();
  PType<K> sharedKeyType = left.getPTableType().getKeyType();
  PType<U> leftValueType = left.getPTableType().getValueType();
  PType<V> rightValueType = right.getPTableType().getValueType();

  // Common value type that both tagged inputs are mapped into before the union.
  PType<Pair<U, V>> taggedType = family.pairs(leftValueType, rightValueType);
  PTable<K, Pair<U, V>> taggedLeft = left.parallelDo(
      "coGroupTag1", new CogroupFn1<K, U, V>(), family.tableOf(sharedKeyType, taggedType));
  PTable<K, Pair<U, V>> taggedRight = right.parallelDo(
      "coGroupTag2", new CogroupFn2<K, U, V>(), family.tableOf(sharedKeyType, taggedType));
  PTable<K, Pair<U, V>> merged = taggedLeft.union(taggedRight);

  PType<Pair<Collection<U>, Collection<V>>> groupedType =
      family.pairs(family.collections(leftValueType), family.collections(rightValueType));
  return merged
      .groupByKey()
      .parallelDo("cogroup", new PostGroupFn<K, U, V>(), family.tableOf(sharedKeyType, groupedType));
}