/**
 * Performs the join by delegating to the wrapped map-side strategy with the
 * left and right tables swapped, then swapping each output pair back so the
 * result matches the caller's requested (left, right) ordering.
 *
 * <p>Only {@code INNER_JOIN} and {@code RIGHT_OUTER_JOIN} are supported here;
 * any other join type results in an {@link UnsupportedOperationException}.
 *
 * @param left the left-side table of the join
 * @param right the right-side table of the join
 * @param joinType the type of join to perform
 * @return a table of (key, (leftValue, rightValue)) pairs
 * @throws UnsupportedOperationException if {@code joinType} is not supported
 */
@Override
public PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right, JoinType joinType) {
  JoinType reversedJoinType;
  switch (joinType) {
    case INNER_JOIN:
      reversedJoinType = JoinType.INNER_JOIN;
      break;
    case RIGHT_OUTER_JOIN:
      // A right outer join of (left, right) is a left outer join of (right, left).
      reversedJoinType = JoinType.LEFT_OUTER_JOIN;
      break;
    default:
      throw new UnsupportedOperationException("Join type " + joinType + " is not supported");
  }
  // Join with sides swapped, then reverse each value pair back to (U, V) order.
  // Fixed typo in the transform name: "out output" -> "of output".
  return mapsideJoinStrategy.join(right, left, reversedJoinType)
      .mapValues("Reverse order of output table values",
          new ReversePairOrderFn<V, U>(),
          left.getTypeFamily().pairs(left.getValueType(), right.getValueType()));
}
}
/**
 * Calculates the per-key arithmetic mean for a table with numeric values.
 *
 * @param table PTable of (key, value) pairs to operate on
 * @param <K> Key type, can be any type
 * @param <V> Value type, must be numeric (ie. extend java.lang.Number)
 * @return PTable<K, Double> of (key, mean(values)) pairs
 */
public static <K, V extends Number> PTable<K, Double> meanValue(PTable<K, V> table) {
  PTypeFamily tf = table.getTypeFamily();

  // Tag each value with a count of one so that a single combiner pass can
  // accumulate both the running sum and the running count.
  MapFn<V, Pair<Double, Long>> toSumAndCount = new MapFn<V, Pair<Double, Long>>() {
    @Override
    public Pair<Double, Long> map(V input) {
      return Pair.of(input.doubleValue(), 1L);
    }
  };

  // Collapse an aggregated (sum, count) pair into the mean.
  MapFn<Pair<Double, Long>, Double> toMean = new MapFn<Pair<Double, Long>, Double>() {
    @Override
    public Double map(Pair<Double, Long> sumAndCount) {
      return sumAndCount.first() / sumAndCount.second();
    }
  };

  PTable<K, Pair<Double, Long>> tagged =
      table.mapValues(toSumAndCount, tf.pairs(tf.doubles(), tf.longs()));
  return tagged
      .groupByKey()
      .combineValues(pairAggregator(SUM_DOUBLES(), SUM_LONGS()))
      .mapValues(toMean, tf.doubles());
}
/**
 * Computes, for every key, the mean of that key's numeric values.
 *
 * @param table PTable of (key, value) pairs to operate on
 * @param <K> Key type, can be any type
 * @param <V> Value type, must be numeric (ie. extend java.lang.Number)
 * @return PTable<K, Double> of (key, mean(values)) pairs
 */
public static <K, V extends Number> PTable<K, Double> meanValue(PTable<K, V> table) {
  final PTypeFamily family = table.getTypeFamily();
  // Pair each value with 1L so the downstream combiner can build up the
  // per-key sum and count together in one pass over the data.
  PTable<K, Pair<Double, Long>> sumsAndCounts = table.mapValues(
      new MapFn<V, Pair<Double, Long>>() {
        @Override
        public Pair<Double, Long> map(V value) {
          return Pair.of(value.doubleValue(), 1L);
        }
      },
      family.pairs(family.doubles(), family.longs()));
  // Aggregate (sum, count) per key, then divide to produce the mean.
  return sumsAndCounts
      .groupByKey()
      .combineValues(pairAggregator(SUM_DOUBLES(), SUM_LONGS()))
      .mapValues(
          new MapFn<Pair<Double, Long>, Double>() {
            @Override
            public Double map(Pair<Double, Long> totals) {
              return totals.first() / totals.second();
            }
          },
          family.doubles());
}
/**
 * Calculates a set of percentiles for each key in a numerically-valued table.
 *
 * <p>Percentiles are calculated on a per-key basis by counting, joining and sorting.
 * This is highly scalable, but takes 2 more map-reduce cycles than if you can
 * guarantee that the value set will fit into memory. Use inMemory if you have less
 * than the order of 10M values per key.
 *
 * <p>The percentile definition used here is the "nearest rank" defined at:
 * http://en.wikipedia.org/wiki/Percentile#Definition
 *
 * @param table numerically-valued PTable
 * @param p1 First percentile (in the range 0.0 - 1.0)
 * @param pn More percentiles (in the range 0.0 - 1.0)
 * @param <K> Key type of the table
 * @param <V> Value type of the table (must extend java.lang.Number)
 * @return PTable of each key with a collection of pairs of the percentile provided and its result.
 */
public static <K, V extends Number> PTable<K, Result<V>> distributed(PTable<K, V> table, double p1, double... pn) {
  final List<Double> percentiles = createListFromVarargs(p1, pn);
  final PTypeFamily tf = table.getTypeFamily();

  // Attach every key's total number of values to each of its records; the
  // count is needed downstream to turn a rank into a percentile position.
  PTable<K, Pair<Long, V>> countThenValue = table.keys().count().join(table);

  // Flip each pair to (value, count) so secondary sort orders records by value.
  PTable<K, Pair<V, Long>> valueThenCount = countThenValue.mapValues(
      new SwapPairComponents<Long, V>(), tf.pairs(table.getValueType(), tf.longs()));

  return SecondarySort.sortAndApply(
      valueThenCount,
      new DistributedPercentiles<K, V>(percentiles),
      tf.tableOf(table.getKeyType(), Result.pType(table.getValueType())));
}
/**
 * Calculates a set of quantiles for each key in a numerically-valued table.
 *
 * <p>Quantiles are calculated on a per-key basis by counting, joining and sorting.
 * This is highly scalable, but takes 2 more map-reduce cycles than if you can
 * guarantee that the value set will fit into memory. Use inMemory if you have less
 * than the order of 10M values per key.
 *
 * <p>The quantile definition used here is the "nearest rank" defined at:
 * http://en.wikipedia.org/wiki/Percentile#Definition
 *
 * @param table numerically-valued PTable
 * @param p1 First quantile (in the range 0.0 - 1.0)
 * @param pn More quantiles (in the range 0.0 - 1.0)
 * @param <K> Key type of the table
 * @param <V> Value type of the table (must extend java.lang.Number)
 * @return PTable of each key with a collection of pairs of the quantile provided and its result.
 */
public static <K, V extends Number> PTable<K, Result<V>> distributed(PTable<K, V> table, double p1, double... pn) {
  final List<Double> requestedQuantiles = createListFromVarargs(p1, pn);
  final PTypeFamily typeFamily = table.getTypeFamily();

  // Each record needs its key's total value count so that a quantile fraction
  // can be converted into a concrete rank within the sorted value sequence.
  PTable<K, Long> perKeyCounts = table.keys().count();
  PTable<K, Pair<Long, V>> withCounts = perKeyCounts.join(table);

  // Swap to (value, count) ordering: secondary sort then sorts by value per key.
  PTable<K, Pair<V, Long>> sortable = withCounts.mapValues(
      new SwapPairComponents<Long, V>(),
      typeFamily.pairs(table.getValueType(), typeFamily.longs()));

  return SecondarySort.sortAndApply(
      sortable,
      new DistributedQuantiles<K, V>(requestedQuantiles),
      typeFamily.tableOf(table.getKeyType(), Result.pType(table.getValueType())));
}
"Negatively filter right-side with BloomFilters", FilterFns.not(filterKeysFn), right.getPTableType(), options) .mapValues( "Right outer join: attach null as left-value", new NullKeyFn<U, V>(), leftJoinedWithFilteredRight.getValueType()));