/**
 * Combine the value part of the table using the provided Crunch {@link Aggregator}. This will be optimised into
 * both a combine and reduce in the MapReduce implementation, with similar optimisations available for other
 * implementations.
 *
 * @param aggregator the aggregator used to merge all values that share a key
 * @return a new {@code LTable} whose values have been combined per key
 */
default LTable<K, V> combineValues(Aggregator<V> aggregator) {
  // Delegate to the underlying Crunch table, then re-wrap in this API's table type.
  return factory().wrap(underlying().combineValues(aggregator));
}
public int run(String[] args) throws Exception { if (args.length != 2) { System.err.println(); System.err.println("Two and only two arguments are accepted."); System.err.println("Usage: " + this.getClass().getName() + " [generic options] input output"); System.err.println(); GenericOptionsParser.printGenericCommandUsage(System.err); return 1; } // Create an object to coordinate pipeline creation and execution. Pipeline pipeline = new MRPipeline(TotalBytesByIP.class, getConf()); // Reference a given text file as a collection of Strings. PCollection<String> lines = pipeline.readTextFile(args[0]); // Aggregator used for summing up response size Aggregator<Long> agg = Aggregators.SUM_LONGS(); // Table of (ip, sum(response size)) PTable<String, Long> ipAddrResponseSize = lines .parallelDo(extractIPResponseSize, Writables.tableOf(Writables.strings(), Writables.longs())).groupByKey() .combineValues(agg); pipeline.writeTextFile(ipAddrResponseSize, args[1]); // Execute the pipeline as a MapReduce. PipelineResult result = pipeline.done(); return result.succeeded() ? 0 : 1; }
.groupByKey(1).combineValues(new CombineFn<Boolean, S>() { public void process(Pair<Boolean, Iterable<S>> input, Emitter<Pair<Boolean, S>> emitter) {
/**
 * Applies the given {@link Aggregator} to every element of the collection and
 * returns the aggregated result(s) as a new collection.
 *
 * @param collect the collection whose elements are aggregated
 * @param aggregator the aggregator to apply across all elements
 * @param <S> element type of the collection
 * @return a collection containing the aggregator's output
 */
public static <S> PCollection<S> aggregate(PCollection<S> collect, Aggregator<S> aggregator) {
  PTypeFamily typeFamily = collect.getTypeFamily();
  return collect
      .parallelDo("Aggregate.aggregator", new MapFn<S, Pair<Boolean, S>>() {
        public Pair<Boolean, S> map(S element) {
          // Constant key: funnel every element into a single group.
          return Pair.of(false, element);
        }
      }, typeFamily.tableOf(typeFamily.booleans(), collect.getPType()))
      .groupByKey(1)              // one partition so the aggregator sees all values
      .combineValues(aggregator)
      .values();                  // discard the synthetic boolean key
}
}
.groupByKey().combineValues(new CombineFn<Boolean, S>() { public void process(Pair<Boolean, Iterable<S>> input, Emitter<Pair<Boolean, S>> emitter) {
/**
 * Returns a {@code PTable} that contains the unique elements of this collection mapped to a count
 * of their occurrences.
 *
 * @param collect the collection whose elements are counted
 * @param <S> element type of the collection
 * @return a table of (element, occurrence count) pairs
 */
public static <S> PTable<S, Long> count(PCollection<S> collect) {
  PTypeFamily typeFamily = collect.getTypeFamily();
  // Emit (element, 1) for every input, then sum the ones per distinct element.
  return collect
      .parallelDo("Aggregate.count", new MapFn<S, Pair<S, Long>>() {
        public Pair<S, Long> map(S element) {
          return Pair.of(element, 1L);
        }
      }, typeFamily.tableOf(collect.getPType(), typeFamily.longs()))
      .groupByKey()
      .combineValues(Aggregators.SUM_LONGS());
}
.combineValues(new CombineFn<Boolean, S>() { public void process(Pair<Boolean, Iterable<S>> input, Emitter<Pair<Boolean, S>> emitter) { S min = null;
/**
 * Builds a table of {@link BloomFilter}s computed over the whole collection.
 *
 * @param collection the input elements the filters are built from
 * @param filterFn emits (name, partial filter) pairs per input split
 * @param <T> element type of the input collection
 * @return a table mapping filter name to the merged filter for the collection
 */
private static <T> PTable<String, BloomFilter> createFilterTable(PCollection<T> collection,
    BloomFilterFn<T> filterFn) {
  PTypeFamily typeFamily = collection.getTypeFamily();
  PTable<String, BloomFilter> partialFilters = collection.parallelDo(filterFn,
      typeFamily.tableOf(typeFamily.strings(), Writables.writables(BloomFilter.class)));
  // Single partition so all partial filters per name are merged by the aggregator.
  return partialFilters.groupByKey(1).combineValues(new BloomFilterAggregator());
}
/**
 * Returns a {@code PTable} that contains the unique elements of this
 * collection mapped to a count of their occurrences.
 *
 * @param collect the collection whose elements are counted
 * @param <S> element type of the collection
 * @return a table of (element, occurrence count) pairs
 */
public static <S> PTable<S, Long> count(PCollection<S> collect) {
  PTypeFamily typeFamily = collect.getTypeFamily();
  // NOTE(review): sibling count() overloads use Aggregators.SUM_LONGS(); consider
  // migrating off CombineFn.SUM_LONGS for consistency — confirm it is available here.
  return collect
      .parallelDo("Aggregate.count", new MapFn<S, Pair<S, Long>>() {
        public Pair<S, Long> map(S element) {
          // One occurrence per input element.
          return Pair.of(element, 1L);
        }
      }, typeFamily.tableOf(collect.getPType(), typeFamily.longs()))
      .groupByKey()
      .combineValues(CombineFn.<S> SUM_LONGS());
}
.combineValues(new CombineFn<Boolean, S>() { public void process(Pair<Boolean, Iterable<S>> input, Emitter<Pair<Boolean, S>> emitter) { S max = null;
/**
 * Returns a {@code PTable} that contains the unique elements of this collection mapped to a count
 * of their occurrences.
 *
 * @param collect the collection whose elements are counted
 * @param numPartitions number of partitions to use for the group-by stage
 * @param <S> element type of the collection
 * @return a table of (element, occurrence count) pairs
 */
public static <S> PTable<S, Long> count(PCollection<S> collect, int numPartitions) {
  PTypeFamily typeFamily = collect.getTypeFamily();
  // Emit (element, 1) per input and sum per distinct element across the requested partitions.
  return collect
      .parallelDo("Aggregate.count", new MapFn<S, Pair<S, Long>>() {
        public Pair<S, Long> map(S element) {
          return Pair.of(element, 1L);
        }
      }, typeFamily.tableOf(collect.getPType(), typeFamily.longs()))
      .groupByKey(numPartitions)
      .combineValues(Aggregators.SUM_LONGS());
}
public int run(String[] args) throws Exception { if (args.length != 2) { System.err.println(); System.err.println("Two and only two arguments are accepted."); System.err.println("Usage: " + this.getClass().getName() + " [generic options] input output"); System.err.println(); GenericOptionsParser.printGenericCommandUsage(System.err); return 1; } // Create an object to coordinate pipeline creation and execution. Pipeline pipeline = new MRPipeline(AverageBytesByIP.class, getConf()); // Reference a given text file as a collection of Strings. PCollection<String> lines = pipeline.readTextFile(args[0]); // Aggregator used for summing up response size and count Aggregator<Pair<Long, Long>> agg = pairAggregator(SUM_LONGS(), SUM_LONGS()); // Table of (ip, sum(response size), count) PTable<String, Pair<Long, Long>> remoteAddrResponseSize = lines .parallelDo(extractResponseSize, Writables.tableOf(Writables.strings(), Writables.pairs(Writables.longs(), Writables.longs()))).groupByKey() .combineValues(agg); // Calculate average response size by ip address PTable<String, Double> avgs = remoteAddrResponseSize.parallelDo(calulateAverage, Writables.tableOf(Writables.strings(), Writables.doubles())); // write the result to a text file pipeline.writeTextFile(avgs, args[1]); // Execute the pipeline as a MapReduce. PipelineResult result = pipeline.done(); return result.succeeded() ? 0 : 1; }
/**
 * Selects the top (or bottom) {@code limit} pairs of the table, with candidates
 * funnelled through a single reducer that keeps only the winners.
 *
 * @param ptable table from which the top pairs are selected
 * @param limit number of pairs to keep
 * @param maximize if true keep the largest pairs, otherwise the smallest
 * @param <K> key type of the table
 * @param <V> value type of the table
 * @return table containing the selected pairs
 */
public static <K, V> PTable<K, V> top(PTable<K, V> ptable, int limit, boolean maximize) {
  PTypeFamily typeFamily = ptable.getTypeFamily();
  PTableType<K, V> tableType = ptable.getPTableType();
  PType<Pair<K, V>> entryType = typeFamily.pairs(tableType.getKeyType(), tableType.getValueType());
  // Intermediate shape: every candidate pair keyed under one integer so a single
  // reducer can merge the per-mapper candidate sets.
  PTableType<Integer, Pair<K, V>> intermediateType = typeFamily.tableOf(typeFamily.ints(), entryType);
  return ptable
      .parallelDo("top" + limit + "map", new TopKFn<K, V>(limit, maximize), intermediateType)
      .groupByKey(1)
      .combineValues(new TopKCombineFn<K, V>(limit, maximize))
      .parallelDo("top" + limit + "reduce", new DoFn<Pair<Integer, Pair<K, V>>, Pair<K, V>>() {
        public void process(Pair<Integer, Pair<K, V>> candidate, Emitter<Pair<K, V>> emitter) {
          // Strip the synthetic integer key, emitting only the surviving (key, value) pair.
          emitter.emit(candidate.second());
        }
      }, tableType);
}
/**
 * Selects the top (or bottom) {@code limit} pairs of the table. Per-mapper candidate
 * sets are merged by a single reducer that retains only the winners.
 *
 * @param ptable table from which the top pairs are selected
 * @param limit number of pairs to keep
 * @param maximize if true keep the largest pairs, otherwise the smallest
 * @param <K> key type of the table
 * @param <V> value type of the table
 * @return table containing the selected pairs
 */
public static <K, V> PTable<K, V> top(PTable<K, V> ptable, int limit, boolean maximize) {
  PTypeFamily ptf = ptable.getTypeFamily();
  PTableType<K, V> baseType = ptable.getPTableType();
  PType<Pair<K, V>> pairOfKv = ptf.pairs(baseType.getKeyType(), baseType.getValueType());
  // Candidates travel keyed under a single int so one reducer sees them all.
  PTableType<Integer, Pair<K, V>> shuffleType = ptf.tableOf(ptf.ints(), pairOfKv);
  DoFn<Pair<Integer, Pair<K, V>>, Pair<K, V>> unwrap =
      new DoFn<Pair<Integer, Pair<K, V>>, Pair<K, V>>() {
        public void process(Pair<Integer, Pair<K, V>> input, Emitter<Pair<K, V>> emitter) {
          // Drop the synthetic key; keep the (key, value) pair.
          emitter.emit(input.second());
        }
      };
  return ptable
      .parallelDo("top" + limit + "map", new TopKFn<K, V>(limit, maximize), shuffleType)
      .groupByKey(1)
      .combineValues(new TopKCombineFn<K, V>(limit, maximize))
      .parallelDo("top" + limit + "reduce", unwrap, baseType);
}
/**
 * Calculate the mean average value by key for a table with numeric values.
 *
 * @param table PTable of (key, value) pairs to operate on
 * @param <K> Key type, can be any type
 * @param <V> Value type, must be numeric (ie. extend java.lang.Number)
 * @return PTable<K, Double> of (key, mean(values)) pairs
 */
public static <K, V extends Number> PTable<K, Double> meanValue(PTable<K, V> table) {
  PTypeFamily typeFamily = table.getTypeFamily();
  // Pair every value with a count of one so sum and count can be combined together.
  PTable<K, Pair<Double, Long>> valueWithCount = table.mapValues(
      new MapFn<V, Pair<Double, Long>>() {
        @Override
        public Pair<Double, Long> map(V value) {
          return Pair.of(value.doubleValue(), 1L);
        }
      }, typeFamily.pairs(typeFamily.doubles(), typeFamily.longs()));
  PGroupedTable<K, Pair<Double, Long>> grouped = valueWithCount.groupByKey();
  // Sum values and counts per key, then divide to obtain the mean.
  return grouped
      .combineValues(pairAggregator(SUM_DOUBLES(), SUM_LONGS()))
      .mapValues(new MapFn<Pair<Double, Long>, Double>() {
        @Override
        public Double map(Pair<Double, Long> sumAndCount) {
          return sumAndCount.first() / sumAndCount.second();
        }
      }, typeFamily.doubles());
}
/**
 * Returns the number of elements in the provided PCollection.
 *
 * NOTE(review): if {@code collect} is empty, the map stage emits no (1, 1L) pairs,
 * so the resulting table has no rows and the returned PObject has no first element
 * to read. A later revision fixes this by also emitting a (1, 0L) sentinel from the
 * MapFn's cleanup() and passing a 0L default to FirstElementPObject — confirm and
 * align this version if empty inputs must yield 0.
 *
 * @param collect The PCollection whose elements should be counted.
 * @param <S> The type of the PCollection.
 * @return A {@code PObject} containing the number of elements in the {@code PCollection}.
 */
public static <S> PObject<Long> length(PCollection<S> collect) {
  PTypeFamily tf = collect.getTypeFamily();
  // Map every element to (1, 1L) under a single shared key, then sum in one reducer.
  PTable<Integer, Long> countTable = collect
      .parallelDo("Aggregate.count", new MapFn<S, Pair<Integer, Long>>() {
        public Pair<Integer, Long> map(S input) {
          return Pair.of(1, 1L);
        }
      }, tf.tableOf(tf.ints(), tf.longs()))
      .groupByKey(GroupingOptions.builder().numReducers(1).build())
      .combineValues(Aggregators.SUM_LONGS());
  // The single remaining value is the total element count.
  PCollection<Long> count = countTable.values();
  return new FirstElementPObject<Long>(count);
}
/**
 * Same as the other groupedWeightedReservoirSample method, but include a seed for testing
 * purposes.
 *
 * @param input A {@code PTable} with the key a group ID and the value a weighted observation in that group
 * @param sampleSizes An array of length N, with each entry is the number of elements to include in that group
 * @param seed The test seed
 * @return A {@code PCollection} of the sampled elements for each of the groups
 */
public static <T, N extends Number> PCollection<Pair<Integer, T>> groupedWeightedReservoirSample(
    PTable<Integer, Pair<T, N>> input, int[] sampleSizes, Long seed) {
  PTypeFamily typeFamily = input.getTypeFamily();
  // The observation type is the first component of the table's (value, weight) pair type.
  PType<T> observationType = (PType<T>) input.getPTableType().getValueType().getSubTypes().get(0);
  PTableType<Integer, Pair<Double, T>> scoredType =
      typeFamily.tableOf(typeFamily.ints(), typeFamily.pairs(typeFamily.doubles(), observationType));
  return input
      .parallelDo("Initial reservoir sampling",
          new ReservoirSampleFn<T, N>(sampleSizes, seed, observationType), scoredType)
      .groupByKey(1)  // single reducer merges the per-mapper reservoirs
      .combineValues(new WRSCombineFn<T>(sampleSizes, observationType))
      .parallelDo("Extract sampled values",
          new MapFn<Pair<Integer, Pair<Double, T>>, Pair<Integer, T>>() {
            @Override
            public Pair<Integer, T> map(Pair<Integer, Pair<Double, T>> scored) {
              // Drop the sampling score, keeping (group ID, sampled element).
              return Pair.of(scored.first(), scored.second().second());
            }
          }, typeFamily.pairs(typeFamily.ints(), observationType));
}
/**
 * Calculate the mean average value by key for a table with numeric values.
 *
 * @param table PTable of (key, value) pairs to operate on
 * @param <K> Key type, can be any type
 * @param <V> Value type, must be numeric (ie. extend java.lang.Number)
 * @return PTable<K, Double> of (key, mean(values)) pairs
 */
public static <K, V extends Number> PTable<K, Double> meanValue(PTable<K, V> table) {
  PTypeFamily ptf = table.getTypeFamily();
  // Attach a count of 1 to each value so (sum, count) can be aggregated in one pass.
  MapFn<V, Pair<Double, Long>> attachCount = new MapFn<V, Pair<Double, Long>>() {
    @Override
    public Pair<Double, Long> map(V v) {
      return Pair.of(v.doubleValue(), 1L);
    }
  };
  MapFn<Pair<Double, Long>, Double> divide = new MapFn<Pair<Double, Long>, Double>() {
    @Override
    public Double map(Pair<Double, Long> sumCount) {
      // mean = sum / count
      return sumCount.first() / sumCount.second();
    }
  };
  PTable<K, Pair<Double, Long>> counted =
      table.mapValues(attachCount, ptf.pairs(ptf.doubles(), ptf.longs()));
  PGroupedTable<K, Pair<Double, Long>> byKey = counted.groupByKey();
  return byKey
      .combineValues(pairAggregator(SUM_DOUBLES(), SUM_LONGS()))
      .mapValues(divide, ptf.doubles());
}
/**
 * Returns the number of elements in the provided PCollection.
 *
 * @param collect The PCollection whose elements should be counted.
 * @param <S> The type of the PCollection.
 * @return A {@code PObject} containing the number of elements in the {@code PCollection}.
 */
public static <S> PObject<Long> length(PCollection<S> collect) {
  PTypeFamily typeFamily = collect.getTypeFamily();
  // Map every element to (1, 1L) under a single shared key and sum in one reducer.
  // cleanup() also emits a (1, 0L) sentinel so an empty input still yields a row.
  PTable<Integer, Long> totals = collect
      .parallelDo("Aggregate.count", new MapFn<S, Pair<Integer, Long>>() {
        public Pair<Integer, Long> map(S element) {
          return Pair.of(1, 1L);
        }

        public void cleanup(Emitter<Pair<Integer, Long>> emitter) {
          emitter.emit(Pair.of(1, 0L));
        }
      }, typeFamily.tableOf(typeFamily.ints(), typeFamily.longs()))
      .groupByKey(GroupingOptions.builder().numReducers(1).build())
      .combineValues(Aggregators.SUM_LONGS());
  // The single remaining value is the total; fall back to 0L if nothing is present.
  PCollection<Long> total = totals.values();
  return new FirstElementPObject<Long>(total, 0L);
}
/**
 * Selects the top N pairs from the given table, with sorting being performed on the values (i.e. the second
 * value in the pair) of the table.
 *
 * @param ptable table containing the pairs from which the top N is to be selected
 * @param limit number of top elements to select
 * @param maximize if true, the maximum N values from the table will be selected, otherwise the minimal
 * N values will be selected
 * @return table containing the top N values from the incoming table
 */
public static <K, V> PTable<K, V> top(PTable<K, V> ptable, int limit, boolean maximize) {
  PTypeFamily typeFamily = ptable.getTypeFamily();
  PTableType<K, V> tableType = ptable.getPTableType();
  PType<Pair<K, V>> entryType = typeFamily.pairs(tableType.getKeyType(), tableType.getValueType());
  // All candidate pairs are keyed under a single integer so one reducer can merge them.
  PTableType<Integer, Pair<K, V>> intermediateType = typeFamily.tableOf(typeFamily.ints(), entryType);
  return ptable
      .parallelDo("top" + limit + "map", new TopKFn<K, V>(limit, maximize, entryType), intermediateType)
      .groupByKey(1)
      .combineValues(new TopKCombineFn<K, V>(limit, maximize, entryType))
      .parallelDo("top" + limit + "reduce", new DoFn<Pair<Integer, Pair<K, V>>, Pair<K, V>>() {
        public void process(Pair<Integer, Pair<K, V>> candidate, Emitter<Pair<K, V>> emitter) {
          // Drop the synthetic key; emit only the surviving (key, value) pairs.
          emitter.emit(candidate.second());
        }
      }, tableType);
}