/**
 * Extract the keys from the given {@code PTable<K, V>} as a {@code PCollection<K>}.
 *
 * @param ptable The {@code PTable}
 * @return A {@code PCollection<K>} with one element per table entry
 */
public static <K, V> PCollection<K> keys(PTable<K, V> ptable) {
  return ptable.parallelDo("PTables.keys", new DoFn<Pair<K, V>, K>() {
    @Override
    public void process(Pair<K, V> input, Emitter<K> emitter) {
      // Emit only the key half of each (key, value) pair.
      emitter.emit(input.first());
    }
  }, ptable.getKeyType());
}
/**
 * Extract the values from the given {@code PTable<K, V>} as a {@code PCollection<V>}.
 *
 * @param ptable The {@code PTable}
 * @return A {@code PCollection<V>} with one element per table entry
 */
public static <K, V> PCollection<V> values(PTable<K, V> ptable) {
  return ptable.parallelDo("PTables.values", new DoFn<Pair<K, V>, V>() {
    @Override
    public void process(Pair<K, V> input, Emitter<V> emitter) {
      // Emit only the value half of each (key, value) pair.
      emitter.emit(input.second());
    }
  }, ptable.getValueType());
}
/**
 * Projects the value component of every entry in the given {@code PTable<K, V>}
 * into a {@code PCollection<V>}.
 *
 * @param ptable the table whose values are extracted
 * @return a {@code PCollection<V>} containing one value per table entry
 */
public static <K, V> PCollection<V> values(PTable<K, V> ptable) {
  DoFn<Pair<K, V>, V> extractValue = new DoFn<Pair<K, V>, V>() {
    @Override
    public void process(Pair<K, V> pair, Emitter<V> out) {
      out.emit(pair.second());
    }
  };
  return ptable.parallelDo("PTables.values", extractValue, ptable.getValueType());
}
/**
 * Projects the key component of every entry in the given {@code PTable<K, V>}
 * into a {@code PCollection<K>}.
 *
 * @param ptable the table whose keys are extracted
 * @return a {@code PCollection<K>} containing one key per table entry
 */
public static <K, V> PCollection<K> keys(PTable<K, V> ptable) {
  DoFn<Pair<K, V>, K> extractKey = new DoFn<Pair<K, V>, K>() {
    @Override
    public void process(Pair<K, V> pair, Emitter<K> out) {
      out.emit(pair.first());
    }
  };
  return ptable.parallelDo("PTables.keys", extractKey, ptable.getKeyType());
}
/**
 * Extracts the values of a {@code PTable<K, V>} as a {@code PCollection<V>},
 * discarding the keys.
 *
 * @param ptable the {@code PTable} to read from
 * @return a {@code PCollection<V>} of the table's values
 */
public static <K, V> PCollection<V> values(PTable<K, V> ptable) {
  return ptable.parallelDo("PTables.values", new DoFn<Pair<K, V>, V>() {
    @Override
    public void process(Pair<K, V> kv, Emitter<V> valueEmitter) {
      valueEmitter.emit(kv.second());
    }
  }, ptable.getValueType());
}
/**
 * Extracts the keys of a {@code PTable<K, V>} as a {@code PCollection<K>},
 * discarding the values.
 *
 * @param ptable the {@code PTable} to read from
 * @return a {@code PCollection<K>} of the table's keys
 */
public static <K, V> PCollection<K> keys(PTable<K, V> ptable) {
  return ptable.parallelDo("PTables.keys", new DoFn<Pair<K, V>, K>() {
    @Override
    public void process(Pair<K, V> kv, Emitter<K> keyEmitter) {
      keyEmitter.emit(kv.first());
    }
  }, ptable.getKeyType());
}
/**
 * Negates the {@code Long} value of each entry so that a natural-ordered
 * (ascending) sort produces counts in descending order. Useful for building
 * top-lists with sort implementations (such as SecondarySort) that do not
 * support an explicit sort order.
 *
 * @param table {@code PTable} to process
 * @param <K> key type
 * @return a {@code PTable} of the same format with each value negated
 */
public static <K> PTable<K, Long> negateCounts(PTable<K, Long> table) {
  MapFn<Pair<K, Long>, Pair<K, Long>> negate = new MapFn<Pair<K, Long>, Pair<K, Long>>() {
    @Override
    public Pair<K, Long> map(Pair<K, Long> entry) {
      return Pair.of(entry.first(), -entry.second());
    }
  };
  return table.parallelDo(negate, table.getPTableType());
}
}
/**
 * Flips the sign of every count in the table. Because some sorts (e.g.
 * SecondarySort) only offer natural ascending order, negating counts lets an
 * ascending sort yield a descending-by-count ordering for top-lists.
 *
 * @param table {@code PTable} to process
 * @param <K> key type
 * @return a {@code PTable} of the same format with the value negated
 */
public static <K> PTable<K, Long> negateCounts(PTable<K, Long> table) {
  return table.parallelDo(new MapFn<Pair<K, Long>, Pair<K, Long>>() {
    @Override
    public Pair<K, Long> map(Pair<K, Long> kv) {
      long negated = -kv.second();
      return Pair.of(kv.first(), negated);
    }
  }, table.getPTableType());
}
}
/**
 * Applies the given Hadoop {@code Mapper} class to each entry of {@code input},
 * producing a {@code PTable} of Writable key/value pairs.
 *
 * @param input the source table
 * @param mapperClass the {@code Mapper} implementation to run per entry
 * @param keyClass the output key class
 * @param valueClass the output value class
 * @return a {@code PTable<K2, V2>} of the mapper's output
 */
public static <K1, V1, K2 extends Writable, V2 extends Writable> PTable<K2, V2> map(
    PTable<K1, V1> input,
    Class<? extends Mapper<K1, V1, K2, V2>> mapperClass,
    Class<K2> keyClass,
    Class<V2> valueClass) {
  MapperFn<K1, V1, K2, V2> mapperFn = new MapperFn<K1, V1, K2, V2>(mapperClass);
  return input.parallelDo(mapperFn, tableOf(keyClass, valueClass));
}
/**
 * Runs a Hadoop {@code Mapper} over every entry of the input table and collects
 * its output as a new {@code PTable}.
 *
 * @param input the source table
 * @param mapperClass the {@code Mapper} implementation to apply
 * @param keyClass the class of the output keys
 * @param valueClass the class of the output values
 * @return a {@code PTable<K2, V2>} holding the mapper output
 */
public static <K1, V1, K2 extends Writable, V2 extends Writable> PTable<K2, V2> map(
    PTable<K1, V1> input,
    Class<? extends Mapper<K1, V1, K2, V2>> mapperClass,
    Class<K2> keyClass,
    Class<V2> valueClass) {
  PTableType<K2, V2> outputType = tableOf(keyClass, valueClass);
  return input.parallelDo(new MapperFn<K1, V1, K2, V2>(mapperClass), outputType);
}
/**
 * Compute the intersection of two sets of elements.
 *
 * <p>Cogroups the two collections (as tables keyed on the element) and keeps
 * only elements that appear on both sides of the cogroup.
 *
 * @return a collection containing elements common to both <code>coll1</code>
 *     and <code>coll2</code>
 */
public static <T> PCollection<T> intersection(PCollection<T> coll1, PCollection<T> coll2) {
  return Cogroup.cogroup(toTable(coll1), toTable(coll2)).parallelDo(
      new DoFn<Pair<T, Pair<Collection<Boolean>, Collection<Boolean>>>, T>() {
        @Override
        public void process(
            Pair<T, Pair<Collection<Boolean>, Collection<Boolean>>> cogrouped,
            Emitter<T> emitter) {
          Pair<Collection<Boolean>, Collection<Boolean>> sides = cogrouped.second();
          boolean presentInFirst = !sides.first().isEmpty();
          boolean presentInSecond = !sides.second().isEmpty();
          if (presentInFirst && presentInSecond) {
            emitter.emit(cogrouped.first());
          }
        }
      }, coll1.getPType());
}
/**
 * Swap the key and value part of a PTable. The original PTypes are used in the
 * opposite order.
 *
 * @param table PTable to process
 * @param <K> key type (will become the value type)
 * @param <V> value type (will become the key type)
 * @return a {@code PTable<V, K>} containing the same data as the original
 */
public static <K, V> PTable<V, K> swapKeyValue(PTable<K, V> table) {
  PTypeFamily typeFamily = table.getTypeFamily();
  MapFn<Pair<K, V>, Pair<V, K>> swap = new MapFn<Pair<K, V>, Pair<V, K>>() {
    @Override
    public Pair<V, K> map(Pair<K, V> kv) {
      return Pair.of(kv.second(), kv.first());
    }
  };
  return table.parallelDo(swap, typeFamily.tableOf(table.getValueType(), table.getKeyType()));
}
/**
 * Swap the key and value part of a table. The original PTypes are used in the
 * opposite order.
 *
 * @param table PTable to process
 * @param <K> key type (will become the value type)
 * @param <V> value type (will become the key type)
 * @return a {@code PTable<V, K>} containing the same data as the original
 */
public static <K, V> PTable<V, K> swapKeyValue(PTable<K, V> table) {
  PTypeFamily family = table.getTypeFamily();
  PTableType<V, K> swappedType = family.tableOf(table.getValueType(), table.getKeyType());
  return table.parallelDo(new MapFn<Pair<K, V>, Pair<V, K>>() {
    @Override
    public Pair<V, K> map(Pair<K, V> entry) {
      return Pair.of(entry.second(), entry.first());
    }
  }, swappedType);
}
}
/**
 * Selects the top {@code limit} entries of {@code ptable} ranked by value.
 *
 * <p>Maps every entry under a single synthetic integer key, combines per-key
 * with {@code TopKCombineFn} on one reducer, then unwraps the surviving pairs.
 *
 * @param ptable the table to select from
 * @param limit the number of entries to keep
 * @param maximize if {@code true} keep the largest values, otherwise the smallest
 * @return a {@code PTable<K, V>} containing at most {@code limit} entries
 */
public static <K, V> PTable<K, V> top(PTable<K, V> ptable, int limit, boolean maximize) {
  PTypeFamily ptf = ptable.getTypeFamily();
  PTableType<K, V> base = ptable.getPTableType();
  PType<Pair<K, V>> pairType = ptf.pairs(base.getKeyType(), base.getValueType());
  // Intermediate type: all candidates funneled under one int key so a single
  // reducer can pick the global top K.
  PTableType<Integer, Pair<K, V>> inter = ptf.tableOf(ptf.ints(), pairType);
  return ptable.parallelDo("top" + limit + "map", new TopKFn<K, V>(limit, maximize), inter)
      .groupByKey(1)
      .combineValues(new TopKCombineFn<K, V>(limit, maximize))
      .parallelDo("top" + limit + "reduce", new DoFn<Pair<Integer, Pair<K, V>>, Pair<K, V>>() {
        @Override // fix: annotation was missing, unlike every other DoFn override in this file
        public void process(Pair<Integer, Pair<K, V>> input, Emitter<Pair<K, V>> emitter) {
          // Strip the synthetic integer key; emit the original (key, value) pair.
          emitter.emit(input.second());
        }
      }, base);
}
/**
 * Maps a {@code PTable<K1, V>} to a {@code PTable<K2, V>} by applying the given
 * {@code MapFn<K1, K2>} to each key while leaving values untouched.
 *
 * @param name The name of the transform
 * @param ptable The {@code PTable} to be mapped
 * @param mapFn The mapping function applied to keys
 * @param ptype The PType for the returned keys
 * @return A new {@code PTable<K2, V>} instance
 */
public static <K1, K2, V> PTable<K2, V> mapKeys(String name, PTable<K1, V> ptable,
                                                MapFn<K1, K2> mapFn, PType<K2> ptype) {
  PTypeFamily family = ptable.getTypeFamily();
  // Values pass through unchanged via the identity function.
  PairMapFn<K1, V, K2, V> keyOnlyFn =
      new PairMapFn<K1, V, K2, V>(mapFn, IdentityFn.<V>getInstance());
  return ptable.parallelDo(name, keyOnlyFn, family.tableOf(ptype, ptable.getValueType()));
}
/**
 * Returns the top {@code limit} entries of {@code ptable} ordered by value.
 *
 * <p>All candidates are re-keyed under a single integer key, grouped onto one
 * reducer, combined with {@code TopKCombineFn}, and finally unwrapped back to
 * the original table type.
 *
 * @param ptable the input table
 * @param limit the maximum number of entries to return
 * @param maximize {@code true} to keep the largest values, {@code false} for the smallest
 * @return a {@code PTable<K, V>} with at most {@code limit} entries
 */
public static <K, V> PTable<K, V> top(PTable<K, V> ptable, int limit, boolean maximize) {
  PTypeFamily ptf = ptable.getTypeFamily();
  PTableType<K, V> base = ptable.getPTableType();
  PType<Pair<K, V>> pairType = ptf.pairs(base.getKeyType(), base.getValueType());
  // Single synthetic int key so one reducer sees every surviving candidate.
  PTableType<Integer, Pair<K, V>> inter = ptf.tableOf(ptf.ints(), pairType);
  return ptable.parallelDo("top" + limit + "map", new TopKFn<K, V>(limit, maximize), inter)
      .groupByKey(1)
      .combineValues(new TopKCombineFn<K, V>(limit, maximize))
      .parallelDo("top" + limit + "reduce", new DoFn<Pair<Integer, Pair<K, V>>, Pair<K, V>>() {
        @Override // fix: missing annotation on this override of DoFn.process
        public void process(Pair<Integer, Pair<K, V>> input, Emitter<Pair<K, V>> emitter) {
          // Drop the synthetic key and emit the original pair.
          emitter.emit(input.second());
        }
      }, base);
}
/**
 * Joins the two tables after sharding hot keys: each key is expanded to
 * (key, shardNumber) on the left and replicated across shards on the right,
 * the wrapped strategy performs the join on the sharded keys, and the shard
 * component is stripped from the result.
 *
 * <p>FULL_OUTER_JOIN and LEFT_OUTER_JOIN are rejected up front.
 *
 * @param left the left-side table
 * @param right the right-side table
 * @param joinType the join type to perform
 * @return the joined table keyed by the original (unsharded) key
 * @throws UnsupportedOperationException for full- and left-outer join types
 */
@Override
public PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right, JoinType joinType) {
  if (joinType == JoinType.FULL_OUTER_JOIN || joinType == JoinType.LEFT_OUTER_JOIN) {
    throw new UnsupportedOperationException("Join type " + joinType
        + " not supported by ShardedJoinStrategy");
  }
  PTypeFamily ptf = left.getTypeFamily();
  // Sharded key types: original key paired with an int shard number.
  PTableType<Pair<K, Integer>, U> shardedLeftType =
      ptf.tableOf(ptf.pairs(left.getKeyType(), ptf.ints()), left.getValueType());
  PTableType<Pair<K, Integer>, V> shardedRightType =
      ptf.tableOf(ptf.pairs(right.getKeyType(), ptf.ints()), right.getValueType());
  PTableType<K, Pair<U,V>> outputType =
      ptf.tableOf(left.getKeyType(), ptf.pairs(left.getValueType(), right.getValueType()));
  // NOTE(review): sharding behavior (one shard per left record vs. replication
  // on the right) is delegated to shardingStrategy — see PreShard*SideFn.
  PTable<Pair<K,Integer>,U> shardedLeft =
      left.parallelDo("Pre-shard left", new PreShardLeftSideFn<K, U>(shardingStrategy),
          shardedLeftType);
  PTable<Pair<K,Integer>,V> shardedRight =
      right.parallelDo("Pre-shard right", new PreShardRightSideFn<K, V>(shardingStrategy),
          shardedRightType);
  // Delegate the actual join to the wrapped strategy on the sharded keys.
  PTable<Pair<K, Integer>, Pair<U, V>> shardedJoined =
      wrappedJoinStrategy.join(shardedLeft, shardedRight, joinType);
  // Strip the shard number back off the keys.
  return shardedJoined.parallelDo("Unshard", new UnshardFn<K, U, V>(), outputType);
}
/**
 * Re-keys the input on (key, firstValueComponent) and groups it with a
 * grouping comparator/partitioner so that values arrive sorted within each
 * original key (secondary sort).
 *
 * @param input the table to prepare, keyed on K with (V1, V2) values
 * @param numReducers number of reducers to use; ignored when not positive
 * @return the grouped table ready for secondary-sorted consumption
 */
private static <K, V1, V2> PGroupedTable<Pair<K, V1>, Pair<V1, V2>> prepare(
    PTable<K, Pair<V1, V2>> input, int numReducers) {
  PTypeFamily family = input.getTypeFamily();
  PType<Pair<V1, V2>> valueType = input.getValueType();
  // Composite key: original key plus the V1 component used for the sort.
  PTableType<Pair<K, V1>, Pair<V1, V2>> intermediateType = family.tableOf(
      family.pairs(input.getKeyType(), valueType.getSubTypes().get(0)), valueType);
  GroupingOptions.Builder optionsBuilder = GroupingOptions.builder()
      .requireSortedKeys()
      .groupingComparatorClass(JoinUtils.getGroupingComparator(family))
      .partitionerClass(JoinUtils.getPartitionerClass(family));
  if (numReducers > 0) {
    optionsBuilder.numReducers(numReducers);
  }
  return input
      .parallelDo("SecondarySort.format", new SSFormatFn<K, V1, V2>(), intermediateType)
      .groupByKey(optionsBuilder.build());
}
/**
 * Performs a map-side join of {@code left} against {@code right} by making the
 * right side fully readable on each mapper.
 *
 * @param left the left-side table, streamed through the mappers
 * @param right the right-side table, loaded as readable side data
 * @param includeUnmatchedLeftValues whether left entries with no right match
 *     are emitted (left-outer semantics) or dropped (inner semantics)
 * @return a table pairing each left value with its matching right values
 */
private PTable<K, Pair<U,V>> joinInternal(PTable<K, U> left, PTable<K, V> right,
                                          boolean includeUnmatchedLeftValues) {
  PTypeFamily typeFamily = left.getTypeFamily();
  // Materialize the right side so every mapper can read it in full.
  ReadableData<Pair<K, V>> rightReadable = right.asReadable(materialize);
  MapsideJoinDoFn<K, U, V> joinFn = new MapsideJoinDoFn<K, U, V>(
      rightReadable, right.getPTableType(), includeUnmatchedLeftValues);
  // Declare the right side's sources as dependencies of this parallelDo.
  ParallelDoOptions doOptions = ParallelDoOptions.builder()
      .sourceTargets(rightReadable.getSourceTargets())
      .build();
  PTableType<K, Pair<U, V>> outputType = typeFamily.tableOf(
      left.getKeyType(), typeFamily.pairs(left.getValueType(), right.getValueType()));
  return left.parallelDo("mapjoin", joinFn, outputType, doOptions);
}
/**
 * Re-keys the input on (key, firstValueComponent) and groups it with a
 * grouping comparator/partitioner so that values arrive sorted within each
 * original key (secondary sort).
 *
 * @param input the table to prepare, keyed on K with (V1, V2) values
 * @return the grouped table ready for secondary-sorted consumption
 */
private static <K, V1, V2> PGroupedTable<Pair<K, V1>, Pair<V1, V2>> prepare(
    PTable<K, Pair<V1, V2>> input) {
  PTypeFamily ptf = input.getTypeFamily();
  PType<Pair<V1, V2>> valueType = input.getValueType();
  // Composite key: original key plus the V1 component used for the sort.
  PTableType<Pair<K, V1>, Pair<V1, V2>> inter = ptf.tableOf(
      ptf.pairs(input.getKeyType(), valueType.getSubTypes().get(0)), valueType);
  // fix: removed the unused local `out` (a PTableType<K, Collection<Pair<V1, V2>>>)
  // that was computed but never referenced.
  return input.parallelDo("SecondarySort.format", new SSFormatFn<K, V1, V2>(), inter)
      .groupByKey(
          GroupingOptions.builder()
              .groupingComparatorClass(JoinUtils.getGroupingComparator(ptf))
              .partitionerClass(JoinUtils.getPartitionerClass(ptf))
              .build());
}