/**
 * Projects a {@code PTable<K, V>} onto its keys.
 *
 * @param ptable the table whose keys are extracted
 * @return a {@code PCollection<K>} holding the first element of every pair in {@code ptable},
 *         typed with the table's key type
 */
public static <K, V> PCollection<K> keys(PTable<K, V> ptable) {
  DoFn<Pair<K, V>, K> keyExtractor = new DoFn<Pair<K, V>, K>() {
    @Override
    public void process(Pair<K, V> entry, Emitter<K> out) {
      K key = entry.first();
      out.emit(key);
    }
  };
  return ptable.parallelDo("PTables.keys", keyExtractor, ptable.getKeyType());
}
/**
 * Returns the {@link PType} that can be used to serialize the key part of this table.
 *
 * @return the key type reported by the underlying table
 */
default PType<K> keyType() {
  return underlying().getKeyType();
}
/**
 * Builds a {@code PCollection<K>} made of the key of every entry in the given
 * {@code PTable<K, V>}.
 *
 * @param ptable the source {@code PTable}
 * @return a {@code PCollection<K>} of the table's keys, typed with the table's key type
 */
public static <K, V> PCollection<K> keys(PTable<K, V> ptable) {
  DoFn<Pair<K, V>, K> emitKey = new DoFn<Pair<K, V>, K>() {
    @Override
    public void process(Pair<K, V> pair, Emitter<K> sink) {
      sink.emit(pair.first());
    }
  };
  return ptable.parallelDo("PTables.keys", emitKey, ptable.getKeyType());
}
/**
 * Returns the keys of the given {@code PTable<K, V>} as a {@code PCollection<K>}.
 *
 * @param ptable the {@code PTable} to read keys from
 * @return a {@code PCollection<K>} with one element per table entry: that entry's key
 */
public static <K, V> PCollection<K> keys(PTable<K, V> ptable) {
  DoFn<Pair<K, V>, K> firstOfPair = new DoFn<Pair<K, V>, K>() {
    @Override
    public void process(Pair<K, V> record, Emitter<K> target) {
      K key = record.first();
      target.emit(key);
    }
  };
  return ptable.parallelDo("PTables.keys", firstOfPair, ptable.getKeyType());
}
/**
 * Joins two tables by picking the {@code JoinFn} that corresponds to {@code joinType} and
 * delegating to the three-argument {@code join} that accepts a {@code JoinFn}.
 *
 * <p>Every join function is constructed from the left table's key type and value type.
 *
 * @param left the left-hand table
 * @param right the right-hand table
 * @param joinType the kind of join to perform
 * @return the joined table
 * @throws UnsupportedOperationException if {@code joinType} is not one of the four handled
 *         constants
 */
@Override
public PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right, JoinType joinType) {
  final JoinFn<K, U, V> selected;
  // A switch (rather than an if-chain) is kept so a null joinType fails exactly as before.
  switch (joinType) {
  case INNER_JOIN:
    selected = new InnerJoinFn<K, U, V>(left.getKeyType(), left.getValueType());
    break;
  case LEFT_OUTER_JOIN:
    selected = new LeftOuterJoinFn<K, U, V>(left.getKeyType(), left.getValueType());
    break;
  case RIGHT_OUTER_JOIN:
    selected = new RightOuterJoinFn<K, U, V>(left.getKeyType(), left.getValueType());
    break;
  case FULL_OUTER_JOIN:
    selected = new FullOuterJoinFn<K, U, V>(left.getKeyType(), left.getValueType());
    break;
  default:
    throw new UnsupportedOperationException("Join type " + joinType + " is not supported");
  }
  return join(left, right, selected);
}
/**
 * Performs a full outer join on the specified {@link PTable}s.
 *
 * @see <a href="http://en.wikipedia.org/wiki/Join_(SQL)#Full_outer_join">Full Join</a>
 * @param left
 *          The left-hand PTable of the full join.
 * @param right
 *          The right-hand PTable of the full join.
 * @param <K>
 *          Type of the keys.
 * @param <U>
 *          Type of the first {@link PTable}'s values
 * @param <V>
 *          Type of the second {@link PTable}'s values
 * @return The joined result.
 */
public static <K, U, V> PTable<K, Pair<U, V>> fullJoin(PTable<K, U> left, PTable<K, V> right) {
  // The join function is built from the left table's key and value types.
  JoinFn<K, U, V> fullOuter =
      new FullOuterJoinFn<K, U, V>(left.getKeyType(), left.getValueType());
  return join(left, right, fullOuter);
}
/**
 * Produces a table in which every entry has its key and value exchanged. The result is typed
 * with the original value type as its key type and the original key type as its value type.
 *
 * @param table PTable to process
 * @param <K> Key type of the input (becomes the value type of the result)
 * @param <V> Value type of the input (becomes the key type of the result)
 * @return a {@code PTable<V, K>} with one swapped pair per input pair
 */
public static <K, V> PTable<V, K> swapKeyValue(PTable<K, V> table) {
  PTypeFamily family = table.getTypeFamily();
  MapFn<Pair<K, V>, Pair<V, K>> swapper = new MapFn<Pair<K, V>, Pair<V, K>>() {
    @Override
    public Pair<V, K> map(Pair<K, V> entry) {
      V newKey = entry.second();
      K newValue = entry.first();
      return Pair.of(newKey, newValue);
    }
  };
  return table.parallelDo(swapper, family.tableOf(table.getValueType(), table.getKeyType()));
}
/**
 * Exchanges the key and value of every entry in a table. The original PTypes are reused in
 * the opposite order to type the result.
 *
 * @param table PTable to process
 * @param <K> Key type of the input (becomes the value type of the result)
 * @param <V> Value type of the input (becomes the key type of the result)
 * @return a {@code PTable<V, K>} with one swapped pair per input pair
 */
public static <K, V> PTable<V, K> swapKeyValue(PTable<K, V> table) {
  PTypeFamily family = table.getTypeFamily();
  MapFn<Pair<K, V>, Pair<V, K>> flip = new MapFn<Pair<K, V>, Pair<V, K>>() {
    @Override
    public Pair<V, K> map(Pair<K, V> pair) {
      return Pair.of(pair.second(), pair.first());
    }
  };
  return table.parallelDo(flip, family.tableOf(table.getValueType(), table.getKeyType()));
}
} // closes the enclosing type, whose header lies outside this excerpt
/**
 * Sorts the {@link PTable} by key, using the natural ordering of the keys in the requested
 * direction. The sort is expressed as a {@code groupByKey} with grouping options built from
 * the pipeline configuration, the table's type family and its key type, followed by an
 * {@code ungroup}.
 *
 * @param table the table to sort
 * @param key the order to apply to the keys
 * @return a {@link PTable} representing the sorted collection.
 */
public static <K, V> PTable<K, V> sort(PTable<K, V> table, Order key) {
  PTypeFamily family = table.getTypeFamily();
  Configuration configuration = table.getPipeline().getConfiguration();
  GroupingOptions sortOptions =
      buildGroupingOptions(configuration, family, table.getKeyType(), key);
  return table.groupByKey(sortOptions).ungroup();
}
/**
 * Returns the given {@link PTable} sorted on its keys, following their natural ordering in
 * the specified direction.
 *
 * @param table the table to sort
 * @param key the order to apply to the keys
 * @return a {@link PTable} representing the sorted collection.
 */
public static <K, V> PTable<K, V> sort(PTable<K, V> table, Order key) {
  PTypeFamily typeFamily = table.getTypeFamily();
  Configuration pipelineConf = table.getPipeline().getConfiguration();
  // Grouping by key with these options and then ungrouping yields the ordered table.
  GroupingOptions ordering =
      buildGroupingOptions(pipelineConf, typeFamily, table.getKeyType(), key);
  return table.groupByKey(ordering).ungroup();
}
/**
 * Transforms the values of a {@code PTable<K, U>} with the given {@code MapFn<U, V>}, leaving
 * the keys untouched, and returns the result as a {@code PTable<K, V>}.
 *
 * @param name The name of the transform
 * @param ptable The {@code PTable} to be mapped
 * @param mapFn The mapping function applied to each value
 * @param ptype The PType for the returned values
 * @return A new {@code PTable<K, V>} instance
 */
public static <K, U, V> PTable<K, V> mapValues(String name, PTable<K, U> ptable,
    MapFn<U, V> mapFn, PType<V> ptype) {
  PTypeFamily family = ptable.getTypeFamily();
  // Keys pass through the identity function; only the value side is mapped.
  MapFn<K, K> keepKey = IdentityFn.<K>getInstance();
  PairMapFn<K, U, K, V> entryFn = new PairMapFn<K, U, K, V>(keepKey, mapFn);
  return ptable.parallelDo(name, entryFn, family.tableOf(ptable.getKeyType(), ptype));
}
/**
 * Joins two tables by first re-keying both sides as {@code Pair<K, Integer>} (the original key
 * plus an integer assigned by the pre-shard functions), delegating the join of the re-keyed
 * tables to the wrapped join strategy, and finally mapping the result back to the original
 * key type.
 *
 * @param left the left-hand table
 * @param right the right-hand table
 * @param joinType the kind of join to perform
 * @return the joined table, keyed by the original key type
 * @throws UnsupportedOperationException if {@code joinType} is {@code FULL_OUTER_JOIN} or
 *         {@code LEFT_OUTER_JOIN}
 */
@Override
public PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right, JoinType joinType) {
  if (joinType == JoinType.FULL_OUTER_JOIN || joinType == JoinType.LEFT_OUTER_JOIN) {
    throw new UnsupportedOperationException(
        "Join type " + joinType + " not supported by ShardedJoinStrategy");
  }

  PTypeFamily family = left.getTypeFamily();

  // Table types for the re-keyed inputs and for the final, un-sharded output.
  PTableType<Pair<K, Integer>, U> leftShardType =
      family.tableOf(family.pairs(left.getKeyType(), family.ints()), left.getValueType());
  PTableType<Pair<K, Integer>, V> rightShardType =
      family.tableOf(family.pairs(right.getKeyType(), family.ints()), right.getValueType());
  PTableType<K, Pair<U, V>> resultType =
      family.tableOf(left.getKeyType(), family.pairs(left.getValueType(), right.getValueType()));

  PTable<Pair<K, Integer>, U> leftSharded =
      left.parallelDo("Pre-shard left", new PreShardLeftSideFn<K, U>(shardingStrategy), leftShardType);
  PTable<Pair<K, Integer>, V> rightSharded =
      right.parallelDo("Pre-shard right", new PreShardRightSideFn<K, V>(shardingStrategy), rightShardType);

  PTable<Pair<K, Integer>, Pair<U, V>> joinedShards =
      wrappedJoinStrategy.join(leftSharded, rightSharded, joinType);

  return joinedShards.parallelDo("Unshard", new UnshardFn<K, U, V>(), resultType);
}
/**
 * Groups the table by key and gathers all values of each key into a single
 * {@code Collection<V>}.
 *
 * <p>Each value is passed through the value type's {@code getDetachedValue} before being
 * stored in the resulting list.
 *
 * @param collect the table whose values are gathered per key
 * @return a table mapping each key to the collection of its values
 */
public static <K, V> PTable<K, Collection<V>> collectValues(PTable<K, V> collect) {
  PTypeFamily family = collect.getTypeFamily();
  final PType<V> valueType = collect.getValueType();
  return collect.groupByKey().parallelDo(
      "collect",
      new MapValuesFn<K, Iterable<V>, Collection<V>>() {
        @Override
        public Collection<V> map(Iterable<V> values) {
          List<V> detached = Lists.newArrayList();
          for (V element : values) {
            V copy = valueType.getDetachedValue(element);
            detached.add(copy);
          }
          return detached;
        }
      },
      family.tableOf(collect.getKeyType(), family.collections(collect.getValueType())));
}
} // closes the enclosing type, whose header lies outside this excerpt
/**
 * Builds the {@link GroupingOptions} used to sort a table on its keys.
 *
 * <ul>
 * <li>For the Avro type family the key schema is always stored under {@code "crunch.schema"}.</li>
 * <li>For {@link Order#DESCENDING} a reverse sort comparator matching the type family is
 * installed; type families other than Writable and Avro are rejected.</li>
 * <li>Sorted keys are always required, and reducer settings are applied through
 * {@code configureReducers}.</li>
 * </ul>
 *
 * @param ptable the table being sorted
 * @param conf the configuration handed to {@code configureReducers}
 * @param numReducers the reducer count handed to {@code configureReducers}
 * @param order the requested key order
 * @return the assembled grouping options
 * @throws RuntimeException if a descending sort is requested for an unrecognized type family
 */
private static <K, V> GroupingOptions buildGroupingOptions(PTable<K, V> ptable, Configuration conf,
    int numReducers, Order order) {
  PType<K> keyType = ptable.getKeyType();
  PTypeFamily family = ptable.getTypeFamily();
  GroupingOptions.Builder options = GroupingOptions.builder();

  final boolean avro = family == AvroTypeFamily.getInstance();
  if (avro) {
    // The schema is registered for both sort directions.
    Schema keySchema = ((AvroType<K>) keyType).getSchema();
    options.conf("crunch.schema", keySchema.toString());
  }

  if (order == Order.DESCENDING) {
    if (family == WritableTypeFamily.getInstance()) {
      options.sortComparatorClass(ReverseWritableComparator.class);
    } else if (avro) {
      options.sortComparatorClass(ReverseAvroComparator.class);
    } else {
      throw new RuntimeException("Unrecognized type family: " + family);
    }
  }

  options.requireSortedKeys();
  configureReducers(options, ptable, conf, numReducers);
  return options.build();
}
/**
 * Re-keys the input for a secondary sort and groups it.
 *
 * <p>Each entry {@code (K, (V1, V2))} is reformatted by {@code SSFormatFn} into a table keyed
 * by {@code Pair<K, V1>} that keeps the full {@code Pair<V1, V2>} as its value. The result is
 * grouped with the grouping comparator and partitioner that {@code JoinUtils} supplies for the
 * input's type family.
 *
 * @param input the table to prepare
 * @return the grouped, re-keyed table
 */
private static <K, V1, V2> PGroupedTable<Pair<K, V1>, Pair<V1, V2>> prepare(
    PTable<K, Pair<V1, V2>> input) {
  PTypeFamily ptf = input.getTypeFamily();
  PType<Pair<V1, V2>> valueType = input.getValueType();
  // Intermediate type: the key is (original key, first element of the value pair).
  PTableType<Pair<K, V1>, Pair<V1, V2>> inter = ptf.tableOf(
      ptf.pairs(input.getKeyType(), valueType.getSubTypes().get(0)),
      valueType);
  // The former unused local for the output table type was removed; it was never read.
  return input.parallelDo("SecondarySort.format", new SSFormatFn<K, V1, V2>(), inter)
      .groupByKey(
          GroupingOptions.builder()
              .groupingComparatorClass(JoinUtils.getGroupingComparator(ptf))
              .partitionerClass(JoinUtils.getPartitionerClass(ptf))
              .build());
}
/**
 * Joins two tables with a caller-supplied {@code JoinFn}.
 *
 * <p>The inputs are combined and grouped by {@code preJoin}; the join function is then applied
 * to the grouped table. The transform is named by concatenating the join function's join type
 * with the grouped table's name.
 *
 * @param left the left-hand table
 * @param right the right-hand table
 * @param joinFn the join implementation to apply
 * @return the joined table, keyed by the left table's key type
 */
public static <K, U, V> PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right,
    JoinFn<K, U, V> joinFn) {
  PTypeFamily family = left.getTypeFamily();
  PGroupedTable<Pair<K, Integer>, Pair<U, V>> tagged = preJoin(left, right);
  PTableType<K, Pair<U, V>> joinedType = family.tableOf(
      left.getKeyType(),
      family.pairs(left.getValueType(), right.getValueType()));
  String transformName = joinFn.getJoinType() + tagged.getName();
  return tagged.parallelDo(transformName, joinFn, joinedType);
}
/**
 * Applies the given {@code JoinFn} to the grouped output of {@code preJoin(left, right)}.
 *
 * <p>The result type pairs the left and right value types under the left table's key type,
 * and the transform name is the join type followed by the grouped table's name.
 *
 * @param left the left-hand table
 * @param right the right-hand table
 * @param joinFn the join implementation to apply
 * @return the joined table
 */
public static <K, U, V> PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right,
    JoinFn<K, U, V> joinFn) {
  PTypeFamily typeFamily = left.getTypeFamily();
  PGroupedTable<Pair<K, Integer>, Pair<U, V>> groupedInputs = preJoin(left, right);
  PTableType<K, Pair<U, V>> outputType = typeFamily.tableOf(
      left.getKeyType(),
      typeFamily.pairs(left.getValueType(), right.getValueType()));
  String stageName = joinFn.getJoinType() + groupedInputs.getName();
  return groupedInputs.parallelDo(stageName, joinFn, outputType);
}
/**
 * Joins the given {@code PTable} instances using a user-specified {@code JoinFn}.
 *
 * <p>The inputs are grouped by {@code preJoin}, which also receives this instance's
 * {@code numReducers}; the join function is then run over the grouped table.
 *
 * @param left left table to be joined
 * @param right right table to be joined
 * @param joinFn The user-specified implementation of the {@code JoinFn} class
 * @return joined tables
 */
public PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right, JoinFn<K, U, V> joinFn) {
  PTypeFamily family = left.getTypeFamily();
  PGroupedTable<Pair<K, Integer>, Pair<U, V>> tagged = preJoin(left, right, numReducers);
  PTableType<K, Pair<U, V>> joinedType = family.tableOf(
      left.getKeyType(),
      family.pairs(left.getValueType(), right.getValueType()));
  // Name of the transform: join type followed by the grouped table's name.
  String transformName = joinFn.getJoinType() + tagged.getName();
  return tagged.parallelDo(transformName, joinFn, joinedType);
}
/**
 * Re-keys the input for a secondary sort and groups it.
 *
 * <p>Each entry {@code (K, (V1, V2))} is reformatted by {@code SSFormatFn} into a table keyed
 * by {@code Pair<K, V1>}. The grouping requires sorted keys and uses the grouping comparator
 * and partitioner that {@code JoinUtils} supplies for the input's type family.
 *
 * @param input the table to prepare
 * @param numReducers reducer count to request; only applied when greater than zero
 * @return the grouped, re-keyed table
 */
private static <K, V1, V2> PGroupedTable<Pair<K, V1>, Pair<V1, V2>> prepare(
    PTable<K, Pair<V1, V2>> input, int numReducers) {
  PTypeFamily family = input.getTypeFamily();
  PType<Pair<V1, V2>> pairType = input.getValueType();
  // Intermediate type: the key is (original key, first element of the value pair).
  PTableType<Pair<K, V1>, Pair<V1, V2>> rekeyedType = family.tableOf(
      family.pairs(input.getKeyType(), pairType.getSubTypes().get(0)),
      pairType);

  GroupingOptions.Builder grouping = GroupingOptions.builder()
      .requireSortedKeys()
      .groupingComparatorClass(JoinUtils.getGroupingComparator(family))
      .partitionerClass(JoinUtils.getPartitionerClass(family));
  boolean explicitReducers = numReducers > 0;
  if (explicitReducers) {
    grouping.numReducers(numReducers);
  }

  return input
      .parallelDo("SecondarySort.format", new SSFormatFn<K, V1, V2>(), rekeyedType)
      .groupByKey(grouping.build());
}
/**
 * Joins {@code left} against a readable view of {@code right} inside a single
 * {@code parallelDo} over the left table.
 *
 * <p>The right table is obtained through {@code asReadable(materialize)}, and its source
 * targets are registered on the {@code parallelDo} options.
 *
 * @param left the table that is processed by the join function
 * @param right the table that is made readable and handed to the join function
 * @param includeUnmatchedLeftValues forwarded to {@code MapsideJoinDoFn}; presumably controls
 *        whether left entries without a match on the right are emitted (confirm there)
 * @return the joined table, keyed by the left table's key type
 */
private PTable<K, Pair<U, V>> joinInternal(PTable<K, U> left, PTable<K, V> right,
    boolean includeUnmatchedLeftValues) {
  PTypeFamily family = left.getTypeFamily();
  ReadableData<Pair<K, V>> rightData = right.asReadable(materialize);
  MapsideJoinDoFn<K, U, V> joinFn = new MapsideJoinDoFn<K, U, V>(
      rightData, right.getPTableType(), includeUnmatchedLeftValues);
  ParallelDoOptions.Builder optionsBuilder = ParallelDoOptions.builder();
  ParallelDoOptions joinOptions =
      optionsBuilder.sourceTargets(rightData.getSourceTargets()).build();
  return left.parallelDo(
      "mapjoin",
      joinFn,
      family.tableOf(left.getKeyType(), family.pairs(left.getValueType(), right.getValueType())),
      joinOptions);
}