/**
 * Projects a {@code PTable<K, V>} down to just its values.
 *
 * <p>The second element of every pair is emitted unchanged, and the result
 * is typed with the table's own value {@code PType}.
 *
 * @param ptable the {@code PTable} to read values from
 * @return a {@code PCollection<V>} holding the value of every entry
 */
public static <K, V> PCollection<V> values(PTable<K, V> ptable) {
  DoFn<Pair<K, V>, V> valueExtractor = new DoFn<Pair<K, V>, V>() {
    @Override
    public void process(Pair<K, V> entry, Emitter<V> out) {
      out.emit(entry.second());
    }
  };
  return ptable.parallelDo("PTables.values", valueExtractor, ptable.getValueType());
}
/**
 * Returns only the value half of each entry in the given table.
 *
 * @param ptable the table whose values are wanted
 * @return a collection containing {@code second()} of every pair, typed
 *         with {@code ptable.getValueType()}
 */
public static <K, V> PCollection<V> values(PTable<K, V> ptable) {
  DoFn<Pair<K, V>, V> emitSecond = new DoFn<Pair<K, V>, V>() {
    @Override
    public void process(Pair<K, V> pair, Emitter<V> sink) {
      V value = pair.second();
      sink.emit(value);
    }
  };
  return ptable.parallelDo("PTables.values", emitSecond, ptable.getValueType());
}
/**
 * Get a {@link PType} which can be used to serialize the value part of this table.
 *
 * @return the value type obtained from {@code underlying().getValueType()}
 */
default PType<V> valueType() { return underlying().getValueType(); }
/**
 * Drops the keys of a {@code PTable<K, V>} and keeps its values.
 *
 * @param ptable the source {@code PTable}
 * @return a {@code PCollection<V>} with one element per table entry, namely
 *         that entry's value
 */
public static <K, V> PCollection<V> values(PTable<K, V> ptable) {
  // Named function object instead of an inline anonymous argument.
  DoFn<Pair<K, V>, V> secondOnly = new DoFn<Pair<K, V>, V>() {
    @Override
    public void process(Pair<K, V> kv, Emitter<V> target) {
      target.emit(kv.second());
    }
  };
  return ptable.parallelDo("PTables.values", secondOnly, ptable.getValueType());
}
/**
 * Inner-joins two {@link PTable}s on their keys.
 *
 * @see <a href="http://en.wikipedia.org/wiki/Join_(SQL)#Inner_join">Inner Join</a>
 * @param left the left-hand table of the join
 * @param right the right-hand table of the join
 * @param <K> type of the keys
 * @param <U> type of the left table's values
 * @param <V> type of the right table's values
 * @return the joined table, pairing left and right values per key
 */
public static <K, U, V> PTable<K, Pair<U, V>> innerJoin(PTable<K, U> left, PTable<K, V> right) {
  InnerJoinFn<K, U, V> innerFn = new InnerJoinFn<K, U, V>(left.getValueType());
  return join(left, right, innerFn);
}
/**
 * Full-outer-joins two {@link PTable}s on their keys.
 *
 * @see <a href="http://en.wikipedia.org/wiki/Join_(SQL)#Full_outer_join">Full Join</a>
 * @param left the left-hand table of the full join
 * @param right the right-hand table of the full join
 * @param <K> type of the keys
 * @param <U> type of the left table's values
 * @param <V> type of the right table's values
 * @return the joined table
 */
public static <K, U, V> PTable<K, Pair<U, V>> fullJoin(PTable<K, U> left, PTable<K, V> right) {
  FullOuterJoinFn<K, U, V> fullOuterFn = new FullOuterJoinFn<K, U, V>(left.getValueType());
  return join(left, right, fullOuterFn);
}
/**
 * Left-outer-joins two {@link PTable}s on their keys.
 *
 * @see <a href="http://en.wikipedia.org/wiki/Join_(SQL)#Left_outer_join">Left Join</a>
 * @param left the left-hand table of the join; all of this table's entries
 *        will appear in the resulting table
 * @param right the right-hand table of the join
 * @param <K> type of the keys
 * @param <U> type of the left table's values
 * @param <V> type of the right table's values
 * @return the joined table
 */
public static <K, U, V> PTable<K, Pair<U, V>> leftJoin(PTable<K, U> left, PTable<K, V> right) {
  LeftOuterJoinFn<K, U, V> leftOuterFn = new LeftOuterJoinFn<K, U, V>(left.getValueType());
  return join(left, right, leftOuterFn);
}
/**
 * Right-outer-joins two {@link PTable}s on their keys.
 *
 * @see <a href="http://en.wikipedia.org/wiki/Join_(SQL)#Right_outer_join">Right Join</a>
 * @param left the left-hand table of the join
 * @param right the right-hand table of the join; all of this table's entries
 *        will appear in the resulting table
 * @param <K> type of the keys
 * @param <U> type of the left table's values
 * @param <V> type of the right table's values
 * @return the joined table
 */
public static <K, U, V> PTable<K, Pair<U, V>> rightJoin(PTable<K, U> left, PTable<K, V> right) {
  // The join function is constructed from the LEFT table's value type, as in the other join variants.
  RightOuterJoinFn<K, U, V> rightOuterFn = new RightOuterJoinFn<K, U, V>(left.getValueType());
  return join(left, right, rightOuterFn);
}
/**
 * Groups the table by key and gathers each key's values into a
 * {@code Collection}.
 *
 * <p>Every value is passed through {@code getDetachedValue} on the table's
 * value {@code PType} before being stored; that {@code PType} is initialized
 * with the function's configuration in {@code initialize()}.
 *
 * @param collect the table whose values are collected per key
 * @return a table mapping each key to the collection of its detached values
 */
public static <K, V> PTable<K, Collection<V>> collectValues(PTable<K, V> collect) {
  final PType<V> valueType = collect.getValueType();
  PTypeFamily family = collect.getTypeFamily();

  MapFn<Iterable<V>, Collection<V>> detachAll = new MapFn<Iterable<V>, Collection<V>>() {
    @Override
    public void initialize() {
      valueType.initialize(getConfiguration());
    }

    @Override
    public Collection<V> map(Iterable<V> grouped) {
      List<V> detached = Lists.newArrayList();
      for (V item : grouped) {
        V copy = valueType.getDetachedValue(item);
        detached.add(copy);
      }
      return detached;
    }
  };

  return collect.groupByKey().mapValues("collect", detachAll,
      family.collections(collect.getValueType()));
}
/**
 * Groups the table by key and gathers each key's values into a
 * {@code Collection}.
 *
 * <p>Every value is passed through {@code getDetachedValue} on the table's
 * value {@code PType} before being stored.
 *
 * @param collect the table whose values are collected per key
 * @return a table mapping each key to the collection of its detached values
 */
public static <K, V> PTable<K, Collection<V>> collectValues(PTable<K, V> collect) {
  PTypeFamily tf = collect.getTypeFamily();
  final PType<V> valueType = collect.getValueType();
  return collect.groupByKey().parallelDo("collect",
      new MapValuesFn<K, Iterable<V>, Collection<V>>() {
        @Override
        public void initialize() {
          // Initialize the captured PType with the function's configuration
          // before getDetachedValue is called on it in map().
          valueType.initialize(getConfiguration());
        }

        @Override
        public Collection<V> map(Iterable<V> values) {
          List<V> collected = Lists.newArrayList();
          for (V value : values) {
            collected.add(valueType.getDetachedValue(value));
          }
          return collected;
        }
      }, tf.tableOf(collect.getKeyType(), tf.collections(collect.getValueType())));
}
}
/**
 * Joins two tables using the join function that corresponds to
 * {@code joinType}.
 *
 * <p>Each supported type builds its join function from the left table's key
 * and value types and delegates to the {@code JoinFn}-based {@code join}.
 *
 * @param left left table to be joined
 * @param right right table to be joined
 * @param joinType which kind of join to perform
 * @return the joined table
 * @throws UnsupportedOperationException if {@code joinType} is none of the
 *         four handled constants
 */
@Override
public PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right, JoinType joinType) {
  switch (joinType) {
    case INNER_JOIN: {
      InnerJoinFn<K, U, V> inner =
          new InnerJoinFn<K, U, V>(left.getKeyType(), left.getValueType());
      return join(left, right, inner);
    }
    case LEFT_OUTER_JOIN: {
      LeftOuterJoinFn<K, U, V> leftOuter =
          new LeftOuterJoinFn<K, U, V>(left.getKeyType(), left.getValueType());
      return join(left, right, leftOuter);
    }
    case RIGHT_OUTER_JOIN: {
      RightOuterJoinFn<K, U, V> rightOuter =
          new RightOuterJoinFn<K, U, V>(left.getKeyType(), left.getValueType());
      return join(left, right, rightOuter);
    }
    case FULL_OUTER_JOIN: {
      FullOuterJoinFn<K, U, V> fullOuter =
          new FullOuterJoinFn<K, U, V>(left.getKeyType(), left.getValueType());
      return join(left, right, fullOuter);
    }
    default:
      throw new UnsupportedOperationException("Join type " + joinType + " is not supported");
  }
}
/**
 * Joins two tables with a caller-supplied {@code JoinFn}.
 *
 * <p>The inputs are first combined by {@code preJoin}; the join function is
 * then applied to the grouped result under a stage name made of the join
 * type followed by the grouped table's name.
 *
 * @param left left table to be joined
 * @param right right table to be joined
 * @param joinFn the join implementation to apply
 * @return a table keyed like {@code left} whose values pair the left and
 *         right value types
 */
public static <K, U, V> PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right,
    JoinFn<K, U, V> joinFn) {
  PTypeFamily family = left.getTypeFamily();
  PGroupedTable<Pair<K, Integer>, Pair<U, V>> preJoined = preJoin(left, right);
  PTableType<K, Pair<U, V>> resultType = family.tableOf(
      left.getKeyType(),
      family.pairs(left.getValueType(), right.getValueType()));
  String stageName = joinFn.getJoinType() + preJoined.getName();
  return preJoined.parallelDo(stageName, joinFn, resultType);
}
/**
 * Applies the given {@code JoinFn} to the pre-joined, grouped form of two
 * tables.
 *
 * @param left left table to be joined
 * @param right right table to be joined
 * @param joinFn the join implementation to run over the grouped data
 * @return the joined table; its key type is the left table's key type and
 *         its value type is a pair of the left and right value types
 */
public static <K, U, V> PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right,
    JoinFn<K, U, V> joinFn) {
  final PTypeFamily typeFamily = left.getTypeFamily();
  final PGroupedTable<Pair<K, Integer>, Pair<U, V>> groupedInput = preJoin(left, right);

  // Output is keyed by the left key type, with (left value, right value) pairs as values.
  final PTableType<K, Pair<U, V>> joinedType = typeFamily.tableOf(
      left.getKeyType(), typeFamily.pairs(left.getValueType(), right.getValueType()));

  final String label = joinFn.getJoinType() + groupedInput.getName();
  return groupedInput.parallelDo(label, joinFn, joinedType);
}
/**
 * One-to-many join variant that lets the caller choose the reducer count.
 *
 * <p>The tables are pre-joined through {@code DefaultJoinStrategy.preJoin}
 * with {@code numReducers}, and a {@code OneToManyJoinFn} wrapping
 * {@code postProcessFn} is applied to the grouped result.
 *
 * @param left left-side table to join
 * @param right right-side table to join
 * @param postProcessFn DoFn to process the results of the join
 * @param ptype type of the output of the postProcessFn
 * @param numReducers the number of reducers to use
 * @return the post-processed output of the join
 */
public static <K, U, V, T> PCollection<T> oneToManyJoin(PTable<K, U> left, PTable<K, V> right,
    DoFn<Pair<U, Iterable<V>>, T> postProcessFn, PType<T> ptype, int numReducers) {
  PGroupedTable<Pair<K, Integer>, Pair<U, V>> preJoined =
      DefaultJoinStrategy.preJoin(left, right, numReducers);
  String stageName = "One to many join " + preJoined.getName();
  OneToManyJoinFn<K, U, V, T> oneToMany =
      new OneToManyJoinFn<K, U, V, T>(left.getValueType(), postProcessFn);
  return preJoined.parallelDo(stageName, oneToMany, ptype);
}
/**
 * Runs a default join of two {@code PTable}s with a user-specified
 * {@code JoinFn}.
 *
 * <p>The inputs are pre-joined using this strategy's {@code numReducers}
 * setting, and the join function is then applied to the grouped result.
 *
 * @param left left table to be joined
 * @param right right table to be joined
 * @param joinFn the user-specified implementation of the {@code JoinFn} class
 * @return joined tables
 */
public PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right, JoinFn<K, U, V> joinFn) {
  PTypeFamily family = left.getTypeFamily();
  PGroupedTable<Pair<K, Integer>, Pair<U, V>> preJoined = preJoin(left, right, numReducers);
  PTableType<K, Pair<U, V>> joinedType = family.tableOf(
      left.getKeyType(),
      family.pairs(left.getValueType(), right.getValueType()));
  String stageName = joinFn.getJoinType() + preJoined.getName();
  return preJoined.parallelDo(stageName, joinFn, joinedType);
}
/**
 * Joins two tables by re-keying both sides with an added integer component,
 * delegating to the wrapped join strategy, and then restoring the original
 * keys.
 *
 * <p>The left side goes through {@code PreShardLeftSideFn} and the right side
 * through {@code PreShardRightSideFn}, both built from this strategy's
 * sharding strategy; the joined result is passed through {@code UnshardFn}.
 *
 * @param left left table to be joined
 * @param right right table to be joined
 * @param joinType the kind of join to perform
 * @return the joined table, keyed by the original key type
 * @throws UnsupportedOperationException if {@code joinType} is
 *         {@code FULL_OUTER_JOIN} or {@code LEFT_OUTER_JOIN}
 */
@Override
public PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right, JoinType joinType) {
  boolean rejected =
      joinType == JoinType.FULL_OUTER_JOIN || joinType == JoinType.LEFT_OUTER_JOIN;
  if (rejected) {
    throw new UnsupportedOperationException(
        "Join type " + joinType + " not supported by ShardedJoinStrategy");
  }

  PTypeFamily family = left.getTypeFamily();

  // Table types: both sharded inputs are keyed by (original key, int); the output uses the original key.
  PTableType<Pair<K, Integer>, U> leftShardType = family.tableOf(
      family.pairs(left.getKeyType(), family.ints()), left.getValueType());
  PTableType<Pair<K, Integer>, V> rightShardType = family.tableOf(
      family.pairs(right.getKeyType(), family.ints()), right.getValueType());
  PTableType<K, Pair<U, V>> resultType = family.tableOf(
      left.getKeyType(), family.pairs(left.getValueType(), right.getValueType()));

  PTable<Pair<K, Integer>, U> leftSharded = left.parallelDo(
      "Pre-shard left", new PreShardLeftSideFn<K, U>(shardingStrategy), leftShardType);
  PTable<Pair<K, Integer>, V> rightSharded = right.parallelDo(
      "Pre-shard right", new PreShardRightSideFn<K, V>(shardingStrategy), rightShardType);

  PTable<Pair<K, Integer>, Pair<U, V>> joinedShards =
      wrappedJoinStrategy.join(leftSharded, rightSharded, joinType);

  return joinedShards.parallelDo("Unshard", new UnshardFn<K, U, V>(), resultType);
}
/**
 * Swaps the key and value of every entry in a {@code PTable}. The table's
 * original key and value {@code PType}s are reused in the opposite roles.
 *
 * @param table the table to process
 * @param <K> key type of the input (becomes the value type of the result)
 * @param <V> value type of the input (becomes the key type of the result)
 * @return a {@code PTable<V, K>} containing the same data as the original
 */
public static <K, V> PTable<V, K> swapKeyValue(PTable<K, V> table) {
  PTypeFamily family = table.getTypeFamily();
  MapFn<Pair<K, V>, Pair<V, K>> flip = new MapFn<Pair<K, V>, Pair<V, K>>() {
    @Override
    public Pair<V, K> map(Pair<K, V> entry) {
      V newKey = entry.second();
      K newValue = entry.first();
      return Pair.of(newKey, newValue);
    }
  };
  return table.parallelDo(flip, family.tableOf(table.getValueType(), table.getKeyType()));
}
/**
 * Shared implementation of the map-side join.
 *
 * <p>The right table is obtained as {@code ReadableData} (using this
 * strategy's {@code materialize} setting) and handed to a
 * {@code MapsideJoinDoFn}, which is applied to the left table. The readable
 * data's source targets are registered through {@code ParallelDoOptions}.
 *
 * @param left the table the join function is run over
 * @param right the table read through {@code asReadable}
 * @param includeUnmatchedLeftValues flag forwarded unchanged to
 *        {@code MapsideJoinDoFn}
 * @return a table keyed like {@code left} whose values pair the left and
 *         right value types
 */
private PTable<K, Pair<U, V>> joinInternal(PTable<K, U> left, PTable<K, V> right,
    boolean includeUnmatchedLeftValues) {
  PTypeFamily family = left.getTypeFamily();
  ReadableData<Pair<K, V>> rightData = right.asReadable(materialize);

  MapsideJoinDoFn<K, U, V> joinFn = new MapsideJoinDoFn<K, U, V>(
      rightData, right.getPTableType(), includeUnmatchedLeftValues);

  ParallelDoOptions doOptions = ParallelDoOptions.builder()
      .sourceTargets(rightData.getSourceTargets())
      .build();

  return left.parallelDo(
      "mapjoin",
      joinFn,
      family.tableOf(left.getKeyType(), family.pairs(left.getValueType(), right.getValueType())),
      doOptions);
}
/**
 * Re-keys and groups the input for a secondary sort.
 *
 * <p>The input is run through {@code SSFormatFn} into an intermediate table
 * keyed by a pair of the original key type and the first sub-type of the
 * value pair. That table is grouped using the grouping comparator and
 * partitioner that {@code JoinUtils} provides for the type family.
 *
 * @param input the table of (key, (first, second)) entries to prepare
 * @return the grouped intermediate table
 */
private static <K, V1, V2> PGroupedTable<Pair<K, V1>, Pair<V1, V2>> prepare(
    PTable<K, Pair<V1, V2>> input) {
  PTypeFamily ptf = input.getTypeFamily();
  PType<Pair<V1, V2>> valueType = input.getValueType();
  PTableType<Pair<K, V1>, Pair<V1, V2>> inter = ptf.tableOf(
      ptf.pairs(input.getKeyType(), valueType.getSubTypes().get(0)),
      valueType);
  // The previously declared, never-read collection table type local has been removed.
  return input.parallelDo("SecondarySort.format", new SSFormatFn<K, V1, V2>(), inter)
      .groupByKey(
          GroupingOptions.builder()
              .groupingComparatorClass(JoinUtils.getGroupingComparator(ptf))
              .partitionerClass(JoinUtils.getPartitionerClass(ptf))
              .build());
}
/**
 * Re-keys and groups the input for a secondary sort, optionally with an
 * explicit reducer count.
 *
 * <p>The input is run through {@code SSFormatFn} into an intermediate table
 * keyed by a pair of the original key type and the first sub-type of the
 * value pair. Grouping requests sorted keys and uses the grouping comparator
 * and partitioner that {@code JoinUtils} provides for the type family.
 *
 * @param input the table of (key, (first, second)) entries to prepare
 * @param numReducers reducer count to request; only applied when greater
 *        than zero
 * @return the grouped intermediate table
 */
private static <K, V1, V2> PGroupedTable<Pair<K, V1>, Pair<V1, V2>> prepare(
    PTable<K, Pair<V1, V2>> input, int numReducers) {
  PTypeFamily family = input.getTypeFamily();
  PType<Pair<V1, V2>> pairType = input.getValueType();
  PTableType<Pair<K, V1>, Pair<V1, V2>> formattedType = family.tableOf(
      family.pairs(input.getKeyType(), pairType.getSubTypes().get(0)),
      pairType);

  GroupingOptions.Builder grouping = GroupingOptions.builder()
      .requireSortedKeys()
      .groupingComparatorClass(JoinUtils.getGroupingComparator(family))
      .partitionerClass(JoinUtils.getPartitionerClass(family));
  if (numReducers > 0) {
    grouping.numReducers(numReducers);
  }

  PTable<Pair<K, V1>, Pair<V1, V2>> formatted =
      input.parallelDo("SecondarySort.format", new SSFormatFn<K, V1, V2>(), formattedType);
  return formatted.groupByKey(grouping.build());
}