/**
 * Computes a set of percentiles for every key of a numerically-valued table.
 *
 * The work is done per key by counting, joining and sorting. That approach scales well, but it
 * costs 2 more map-reduce cycles than the alternative that requires the value set to fit into
 * memory. Prefer inMemory when there are fewer than roughly 10M values per key.
 *
 * Percentiles follow the "nearest rank" definition described at:
 * http://en.wikipedia.org/wiki/Percentile#Definition
 *
 * @param table numerically-valued PTable
 * @param p1 First percentile (in the range 0.0 - 1.0)
 * @param pn More percentiles (in the range 0.0 - 1.0)
 * @param <K> Key type of the table
 * @param <V> Value type of the table (must extend java.lang.Number)
 * @return PTable of each key with a collection of pairs of the percentile provided and its result.
 */
public static <K, V extends Number> PTable<K, Result<V>> distributed(PTable<K, V> table,
    double p1, double... pn) {
  final List<Double> requestedPercentiles = createListFromVarargs(p1, pn);
  PTypeFamily typeFamily = table.getTypeFamily();

  // Per-key totals are joined back onto the values, giving (count, value) pairs, which are
  // then flipped to (value, count) before being handed to the secondary sort.
  PTable<K, Pair<V, Long>> valuesWithTotals = table.keys().count()
      .join(table)
      .mapValues(
          new SwapPairComponents<Long, V>(),
          typeFamily.pairs(table.getValueType(), typeFamily.longs()));

  DistributedPercentiles<K, V> percentileFn = new DistributedPercentiles<K, V>(requestedPercentiles);
  return SecondarySort.sortAndApply(
      valuesWithTotals,
      percentileFn,
      typeFamily.tableOf(table.getKeyType(), Result.pType(table.getValueType())));
}
/**
 * Computes a set of quantiles for every key of a numerically-valued table.
 *
 * The work is done per key by counting, joining and sorting. That approach scales well, but it
 * costs 2 more map-reduce cycles than the alternative that requires the value set to fit into
 * memory. Prefer inMemory when there are fewer than roughly 10M values per key.
 *
 * Quantiles follow the "nearest rank" definition described at:
 * http://en.wikipedia.org/wiki/Percentile#Definition
 *
 * @param table numerically-valued PTable
 * @param p1 First quantile (in the range 0.0 - 1.0)
 * @param pn More quantiles (in the range 0.0 - 1.0)
 * @param <K> Key type of the table
 * @param <V> Value type of the table (must extend java.lang.Number)
 * @return PTable of each key with a collection of pairs of the quantile provided and its result.
 */
public static <K, V extends Number> PTable<K, Result<V>> distributed(PTable<K, V> table,
    double p1, double... pn) {
  final List<Double> requestedQuantiles = createListFromVarargs(p1, pn);
  PTypeFamily typeFamily = table.getTypeFamily();

  // Per-key totals are joined back onto the values, giving (count, value) pairs, which are
  // then flipped to (value, count) before being handed to the secondary sort.
  PTable<K, Pair<V, Long>> valuesWithTotals = table.keys().count()
      .join(table)
      .mapValues(
          new SwapPairComponents<Long, V>(),
          typeFamily.pairs(table.getValueType(), typeFamily.longs()));

  DistributedQuantiles<K, V> quantileFn = new DistributedQuantiles<K, V>(requestedQuantiles);
  return SecondarySort.sortAndApply(
      valuesWithTotals,
      quantileFn,
      typeFamily.tableOf(table.getKeyType(), Result.pType(table.getValueType())));
}
rtf.tableOf(rtf.pairs(rtf.ints(), rtf.ints()), right.getPType())); PTable<Pair<Integer, Integer>, Pair<U, V>> cg = leftCross.join(rightCross);
rtf.tableOf(rtf.pairs(rtf.ints(), rtf.ints()), right.getPType())); PTable<Pair<Integer, Integer>, Pair<U, V>> cg = leftCross.join(rightCross);
/**
 * Builds the artificial keys used to cross two collections. This technique
 * was taken from Pig's CROSS.
 *
 * For each input, one random index in [0, parallelism) is drawn and the input
 * is emitted parallelism times, once per sweep index. The random index is
 * placed in the first key slot when constantField is 0 and in the second key
 * slot otherwise; the sweep index fills the other slot.
 */
private static class GFCross<V> extends DoFn<V, Pair<Pair<Integer, Integer>, V>> {

  private final int constantField;
  private final int parallelism;
  private final Random r;

  public GFCross(int constantField, int parallelism) {
    this.constantField = constantField;
    this.parallelism = parallelism;
    this.r = new Random();
  }

  public void process(V input, Emitter<Pair<Pair<Integer, Integer>, V>> emitter) {
    // Drawn once per input so every copy of this input shares the same fixed index.
    final int fixedIndex = r.nextInt(parallelism);
    final boolean fixedIndexFirst = (constantField == 0);

    for (int sweepIndex = 0; sweepIndex < parallelism; sweepIndex++) {
      Pair<Integer, Integer> crossKey = fixedIndexFirst
          ? Pair.of(fixedIndex, sweepIndex)
          : Pair.of(sweepIndex, fixedIndex);
      emitter.emit(Pair.of(crossKey, input));
    }
  }
}
/**
 * Builds the artificial keys used to cross two collections. This technique
 * was taken from Pig's CROSS.
 *
 * For each input, one random index in [0, parallelism) is drawn and the input
 * is emitted parallelism times, once per sweep index. The random index is
 * placed in the first key slot when constantField is 0 and in the second key
 * slot otherwise; the sweep index fills the other slot.
 */
private static class GFCross<V> extends DoFn<V, Pair<Pair<Integer, Integer>, V>> {

  private final int constantField;
  private final int parallelism;
  private final Random r;

  public GFCross(int constantField, int parallelism) {
    this.constantField = constantField;
    this.parallelism = parallelism;
    this.r = new Random();
  }

  public void process(V input, Emitter<Pair<Pair<Integer, Integer>, V>> emitter) {
    // Drawn once per input so every copy of this input shares the same fixed index.
    final int fixedIndex = r.nextInt(parallelism);
    final boolean fixedIndexFirst = (constantField == 0);

    for (int sweepIndex = 0; sweepIndex < parallelism; sweepIndex++) {
      Pair<Integer, Integer> crossKey = fixedIndexFirst
          ? Pair.of(fixedIndex, sweepIndex)
          : Pair.of(sweepIndex, fixedIndex);
      emitter.emit(Pair.of(crossKey, input));
    }
  }
}