/**
 * Returns the {@link PTableType} used to serialize key/value pairs in this table.
 *
 * <p>Simply delegates to the underlying table's {@code getPTableType()}; presumably
 * {@code underlying()} returns the wrapped PTable — confirm against the enclosing interface.
 *
 * @return the {@code PTableType} of the underlying table
 */
default PTableType<K, V> pType() {
  return underlying().getPTableType();
}
/**
 * Negates the count value of every pair so that a natural (ascending) sort produces a
 * descending order by count. This is useful when building top-lists with sort operations
 * (such as SecondarySort) that do not support an explicit sort order.
 *
 * @param table PTable to process
 * @param <K> key type
 * @return a PTable of the same format with each value negated
 */
public static <K> PTable<K, Long> negateCounts(PTable<K, Long> table) {
  // Extract the mapping function into a named local for readability.
  MapFn<Pair<K, Long>, Pair<K, Long>> negate = new MapFn<Pair<K, Long>, Pair<K, Long>>() {
    @Override
    public Pair<K, Long> map(Pair<K, Long> input) {
      return Pair.of(input.first(), -input.second());
    }
  };
  return table.parallelDo(negate, table.getPTableType());
}
}
/**
 * Concatenates the contents of the given tables into a single in-memory table.
 *
 * <p>All input tables are materialized eagerly; the resulting {@code MemTable} reuses the
 * {@code PTableType} of the first table in the list.
 *
 * @param tables the tables to union; must contain at least one table
 * @param <K> key type
 * @param <V> value type
 * @return a MemTable holding every pair from every input table
 * @throws IllegalArgumentException if {@code tables} is null or empty
 */
@Override
public <K, V> PTable<K, V> unionTables(List<PTable<K, V>> tables) {
  // Fail fast with a clear message instead of the obscure IndexOutOfBoundsException
  // that tables.get(0) below would otherwise throw for an empty list.
  if (tables == null || tables.isEmpty()) {
    throw new IllegalArgumentException("unionTables requires at least one input table");
  }
  List<Pair<K, V>> values = Lists.newArrayList();
  for (PTable<K, V> table : tables) {
    Iterables.addAll(values, table.materialize());
  }
  return new MemTable<K, V>(values, tables.get(0).getPTableType(), null);
}
/**
 * Produces a copy of the table with every count negated, so that sorting the result in
 * natural (ascending) order yields a descending order by count. Intended for top-list
 * construction with sorts (e.g. SecondarySort) that cannot be told to sort descending.
 *
 * @param table PTable to process
 * @param <K> key type
 * @return a PTable of the same format with the value negated
 */
public static <K> PTable<K, Long> negateCounts(PTable<K, Long> table) {
  return table.parallelDo(
      new MapFn<Pair<K, Long>, Pair<K, Long>>() {
        @Override
        public Pair<K, Long> map(Pair<K, Long> kv) {
          // -1L * x is identical to -x for all long values, including Long.MIN_VALUE.
          return Pair.of(kv.first(), -1L * kv.second());
        }
      },
      table.getPTableType());
}
}
/**
 * Returns a new in-memory table containing this table's pairs followed by the pairs of
 * each of the given tables, materialized eagerly.
 *
 * @param others additional tables to union with this one; may be empty
 * @return a MemTable with the combined contents of this table and {@code others}
 */
@Override
public PTable<K, V> union(PTable<K, V>... others) {
  List<Pair<K, V>> values = Lists.newArrayList();
  values.addAll(getCollection());
  for (PTable<K, V> ptable : others) {
    for (Pair<K, V> p : ptable.materialize()) {
      values.add(p);
    }
  }
  // Use this table's own PTableType: the original read others[0].getPTableType(),
  // which throws ArrayIndexOutOfBoundsException when union() is called with no
  // arguments, and this table's type is the natural choice for the result anyway.
  return new MemTable<K, V>(values, getPTableType(), null);
}
/**
 * Unions this table with the given tables by materializing everything into a single
 * in-memory table.
 *
 * @param others additional tables whose pairs are appended after this table's pairs;
 *     may be empty
 * @return a MemTable containing all pairs from this table and {@code others}
 */
@Override
public PTable<K, V> union(PTable<K, V>... others) {
  List<Pair<K, V>> values = Lists.newArrayList();
  values.addAll(getCollection());
  for (PTable<K, V> ptable : others) {
    for (Pair<K, V> p : ptable.materialize()) {
      values.add(p);
    }
  }
  // Fix: take the PTableType from this table rather than others[0], so that calling
  // union() with an empty varargs array no longer throws ArrayIndexOutOfBoundsException.
  return new MemTable<K, V>(values, getPTableType(), null);
}
/**
 * Co-groups the two {@link PTable} arguments.
 *
 * <p>Each side is tagged into a common intermediate {@code Pair<U, V>} representation,
 * the tagged tables are unioned and grouped by key, and the grouped values are then
 * separated back into one collection per side.
 *
 * @return a {@code PTable} mapping each key to the pair of collections of left-side and
 *     right-side values observed for that key
 */
public static <K, U, V> PTable<K, Pair<Collection<U>, Collection<V>>> cogroup(PTable<K, U> left, PTable<K, V> right) {
  PTypeFamily tf = left.getTypeFamily();
  PType<K> keyType = left.getPTableType().getKeyType();
  PType<U> uType = left.getPTableType().getValueType();
  PType<V> vType = right.getPTableType().getValueType();
  // Shared intermediate table type for both tagged sides.
  PTableType<K, Pair<U, V>> taggedType = tf.tableOf(keyType, tf.pairs(uType, vType));
  PTable<K, Pair<U, V>> taggedLeft =
      left.parallelDo("coGroupTag1", new CogroupFn1<K, U, V>(), taggedType);
  PTable<K, Pair<U, V>> taggedRight =
      right.parallelDo("coGroupTag2", new CogroupFn2<K, U, V>(), taggedType);
  PType<Pair<Collection<U>, Collection<V>>> outType =
      tf.pairs(tf.collections(uType), tf.collections(vType));
  return taggedLeft.union(taggedRight)
      .groupByKey()
      .parallelDo("cogroup", new PostGroupFn<K, U, V>(), tf.tableOf(keyType, outType));
}
/**
 * Co-groups the two {@link PTable} arguments.
 *
 * <p>Implementation: both inputs are rewritten into a shared {@code Pair<U, V>} value
 * type (via CogroupFn1/CogroupFn2), unioned, grouped by key, and finally unpacked into a
 * pair of collections per key by PostGroupFn.
 *
 * @return a {@code PTable} representing the co-grouped tables
 */
public static <K, U, V> PTable<K, Pair<Collection<U>, Collection<V>>> cogroup(
    PTable<K, U> left, PTable<K, V> right) {
  PTypeFamily ptf = left.getTypeFamily();
  PType<K> keyType = left.getPTableType().getKeyType();
  PType<U> leftValueType = left.getPTableType().getValueType();
  PType<V> rightValueType = right.getPTableType().getValueType();
  PType<Pair<U, V>> intermediateType = ptf.pairs(leftValueType, rightValueType);

  PTable<K, Pair<U, V>> taggedLeft = left.parallelDo(
      "coGroupTag1", new CogroupFn1<K, U, V>(), ptf.tableOf(keyType, intermediateType));
  PTable<K, Pair<U, V>> taggedRight = right.parallelDo(
      "coGroupTag2", new CogroupFn2<K, U, V>(), ptf.tableOf(keyType, intermediateType));

  PType<Pair<Collection<U>, Collection<V>>> outputType = ptf.pairs(
      ptf.collections(leftValueType), ptf.collections(rightValueType));
  PTable<K, Pair<U, V>> combined = taggedLeft.union(taggedRight);
  return combined.groupByKey().parallelDo(
      "cogroup", new PostGroupFn<K, U, V>(), ptf.tableOf(keyType, outputType));
}
"disable deep copy", new DeepCopyDisablerFn<Pair<K, V>>(), right.getPTableType()); PTable<K, V> filteredRightSide = right.parallelDo( "Filter right-side with BloomFilters", filterKeysFn, right.getPTableType(), options); .parallelDo( "Negatively filter right-side with BloomFilters", FilterFns.not(filterKeysFn), right.getPTableType(), options) .mapValues( "Right outer join: attach null as left-value",
/**
 * Selects the top {@code limit} pairs from the table, comparing on values.
 *
 * <p>Each mapper emits its local top-K keyed under a single shuffle key, the combiner
 * merges the candidate lists, and a final pass with a single reducer strips the shuffle
 * key to recover plain pairs.
 *
 * @param ptable table to select from
 * @param limit number of pairs to keep
 * @param maximize if true selects the largest values, otherwise the smallest
 * @return a table containing the selected pairs
 */
public static <K, V> PTable<K, V> top(PTable<K, V> ptable, int limit, boolean maximize) {
  PTypeFamily tf = ptable.getTypeFamily();
  PTableType<K, V> tableType = ptable.getPTableType();
  PTableType<Integer, Pair<K, V>> shuffleType =
      tf.tableOf(tf.ints(), tf.pairs(tableType.getKeyType(), tableType.getValueType()));
  // Drops the synthetic shuffle key, emitting the surviving (key, value) pairs.
  DoFn<Pair<Integer, Pair<K, V>>, Pair<K, V>> unwrap =
      new DoFn<Pair<Integer, Pair<K, V>>, Pair<K, V>>() {
        @Override
        public void process(Pair<Integer, Pair<K, V>> in, Emitter<Pair<K, V>> emitter) {
          emitter.emit(in.second());
        }
      };
  return ptable
      .parallelDo("top" + limit + "map", new TopKFn<K, V>(limit, maximize), shuffleType)
      .groupByKey(1)
      .combineValues(new TopKCombineFn<K, V>(limit, maximize))
      .parallelDo("top" + limit + "reduce", unwrap, tableType);
}
/**
 * Writes the materialized contents of the given table to a SequenceFile at {@code path}.
 *
 * <p>Key and value classes are taken from the table's converter, and each pair is run
 * through the key/value output map functions before being appended.
 *
 * @param fs the filesystem to write to
 * @param path destination path of the SequenceFile
 * @param table the table whose contents are written
 * @throws IOException if the writer cannot be created or an append fails
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
private void writeSequenceFileFromPTable(final FileSystem fs, final Path path, final PTable table)
    throws IOException {
  final PTableType pType = table.getPTableType();
  final Class<?> keyClass = pType.getConverter().getKeyClass();
  final Class<?> valueClass = pType.getConverter().getValueClass();
  final SequenceFile.Writer writer =
      new SequenceFile.Writer(fs, fs.getConf(), path, keyClass, valueClass);
  // Close in a finally block so the writer (and its underlying stream) is released
  // even when materialize/map/append throws; the original leaked it on failure.
  try {
    for (final Object o : table.materialize()) {
      final Pair<?, ?> p = (Pair) o;
      final Object key = pType.getKeyType().getOutputMapFn().map(p.first());
      final Object value = pType.getValueType().getOutputMapFn().map(p.second());
      writer.append(key, value);
    }
  } finally {
    writer.close();
  }
}
/**
 * Builds the map-side join of {@code left} against {@code right}.
 *
 * <p>The right side is exposed as {@link ReadableData} (honoring the enclosing class's
 * {@code materialize} setting — presumably a flag controlling materialization; confirm
 * against the field declaration) and handed to a {@code MapsideJoinDoFn} that joins each
 * left-side pair against it. The right side's source targets are declared as dependencies
 * via {@link ParallelDoOptions} so planning schedules them first.
 *
 * @param left the left-side table, streamed through the mappers
 * @param right the right-side table, loaded for in-memory lookup
 * @param includeUnmatchedLeftValues whether left pairs with no right match are emitted
 * @return a table of keys to pairs of (left value, right value)
 */
private PTable<K, Pair<U, V>> joinInternal(PTable<K, U> left, PTable<K, V> right,
    boolean includeUnmatchedLeftValues) {
  ReadableData<Pair<K, V>> rightData = right.asReadable(materialize);
  ParallelDoOptions opts = ParallelDoOptions.builder()
      .sourceTargets(rightData.getSourceTargets())
      .build();
  PTypeFamily tf = left.getTypeFamily();
  PTableType<K, Pair<U, V>> outputType =
      tf.tableOf(left.getKeyType(), tf.pairs(left.getValueType(), right.getValueType()));
  MapsideJoinDoFn<K, U, V> joinFn = new MapsideJoinDoFn<K, U, V>(
      rightData, right.getPTableType(), includeUnmatchedLeftValues);
  return left.parallelDo("mapjoin", joinFn, outputType, opts);
}
/**
 * Returns the top {@code limit} pairs of the table, selected by value.
 *
 * @param ptable the input table
 * @param limit how many pairs to select
 * @param maximize true to keep the largest values, false to keep the smallest
 * @return a table holding the selected pairs
 */
public static <K, V> PTable<K, V> top(PTable<K, V> ptable, int limit, boolean maximize) {
  PTypeFamily family = ptable.getTypeFamily();
  PTableType<K, V> resultType = ptable.getPTableType();
  PType<Pair<K, V>> entryType =
      family.pairs(resultType.getKeyType(), resultType.getValueType());
  PTableType<Integer, Pair<K, V>> intermediateType = family.tableOf(family.ints(), entryType);
  PTable<Integer, Pair<K, V>> candidates =
      ptable.parallelDo("top" + limit + "map", new TopKFn<K, V>(limit, maximize), intermediateType);
  return candidates
      .groupByKey(1)
      .combineValues(new TopKCombineFn<K, V>(limit, maximize))
      .parallelDo(
          "top" + limit + "reduce",
          new DoFn<Pair<Integer, Pair<K, V>>, Pair<K, V>>() {
            @Override
            public void process(Pair<Integer, Pair<K, V>> input, Emitter<Pair<K, V>> emitter) {
              // Strip the synthetic grouping key and emit the original pair.
              emitter.emit(input.second());
            }
          },
          resultType);
}
/**
 * Same as the other groupedWeightedReservoirSample method, but with an explicit seed for
 * testing purposes.
 *
 * @param input A {@code PTable} with the key a group ID and the value a weighted observation in that group
 * @param sampleSizes An array of length N, where each entry is the number of elements to include in that group
 * @param seed The test seed
 * @param <T> element type of the sampled observations
 * @param <N> numeric weight type
 * @return A {@code PCollection} of the sampled elements for each of the groups
 */
public static <T, N extends Number> PCollection<Pair<Integer, T>> groupedWeightedReservoirSample(
    PTable<Integer, Pair<T, N>> input, int[] sampleSizes, Long seed) {
  PTypeFamily ptf = input.getTypeFamily();
  // The table's value type is Pair<T, N>, so its first sub-type is the PType for T;
  // suppress the unavoidable unchecked cast at the smallest possible scope.
  @SuppressWarnings("unchecked")
  PType<T> ttype = (PType<T>) input.getPTableType().getValueType().getSubTypes().get(0);
  PTableType<Integer, Pair<Double, T>> ptt = ptf.tableOf(ptf.ints(), ptf.pairs(ptf.doubles(), ttype));
  return input
      .parallelDo("Initial reservoir sampling", new ReservoirSampleFn<T, N>(sampleSizes, seed, ttype), ptt)
      .groupByKey(1)
      .combineValues(new WRSCombineFn<T>(sampleSizes, ttype))
      .parallelDo("Extract sampled values",
          new MapFn<Pair<Integer, Pair<Double, T>>, Pair<Integer, T>>() {
            @Override
            public Pair<Integer, T> map(Pair<Integer, Pair<Double, T>> p) {
              // Drop the sampling weight, keeping (groupId, sampledValue).
              return Pair.of(p.first(), p.second().second());
            }
          }, ptf.pairs(ptf.ints(), ttype));
}
/**
 * Selects the top N pairs from the given table, with sorting being performed on the values
 * (i.e. the second value in the pair) of the table.
 *
 * @param ptable table containing the pairs from which the top N is to be selected
 * @param limit number of top elements to select
 * @param maximize if true, the maximum N values from the table will be selected, otherwise
 *     the minimal N values will be selected
 * @return table containing the top N values from the incoming table
 */
public static <K, V> PTable<K, V> top(PTable<K, V> ptable, int limit, boolean maximize) {
  PTypeFamily tf = ptable.getTypeFamily();
  PTableType<K, V> resultType = ptable.getPTableType();
  PType<Pair<K, V>> pairType = tf.pairs(resultType.getKeyType(), resultType.getValueType());
  PTableType<Integer, Pair<K, V>> shuffleType = tf.tableOf(tf.ints(), pairType);
  // Removes the synthetic shuffle key, passing through the surviving pairs.
  DoFn<Pair<Integer, Pair<K, V>>, Pair<K, V>> unwrap =
      new DoFn<Pair<Integer, Pair<K, V>>, Pair<K, V>>() {
        @Override
        public void process(Pair<Integer, Pair<K, V>> in, Emitter<Pair<K, V>> emitter) {
          emitter.emit(in.second());
        }
      };
  return ptable
      .parallelDo("top" + limit + "map", new TopKFn<K, V>(limit, maximize, pairType), shuffleType)
      .groupByKey(1)
      .combineValues(new TopKCombineFn<K, V>(limit, maximize, pairType))
      .parallelDo("top" + limit + "reduce", unwrap, resultType);
}