/**
 * Combine the value part of the table using the provided Crunch {@link Aggregator}. This will be optimised into
 * both a combine and reduce in the MapReduce implementation, with similar optimisations available for other
 * implementations.
 *
 * @param aggregator the aggregator used to merge all values that share a key
 * @return a new {@code LTable} whose values have been combined per key
 */
default LTable<K, V> combineValues(Aggregator<V> aggregator) {
  // Delegate to the underlying Crunch table, then re-wrap in this API's table type.
  return factory().wrap(underlying().combineValues(aggregator));
}
public int run(String[] args) throws Exception { if (args.length != 2) { System.err.println(); System.err.println("Two and only two arguments are accepted."); System.err.println("Usage: " + this.getClass().getName() + " [generic options] input output"); System.err.println(); GenericOptionsParser.printGenericCommandUsage(System.err); return 1; } // Create an object to coordinate pipeline creation and execution. Pipeline pipeline = new MRPipeline(TotalBytesByIP.class, getConf()); // Reference a given text file as a collection of Strings. PCollection<String> lines = pipeline.readTextFile(args[0]); // Aggregator used for summing up response size Aggregator<Long> agg = Aggregators.SUM_LONGS(); // Table of (ip, sum(response size)) PTable<String, Long> ipAddrResponseSize = lines .parallelDo(extractIPResponseSize, Writables.tableOf(Writables.strings(), Writables.longs())).groupByKey() .combineValues(agg); pipeline.writeTextFile(ipAddrResponseSize, args[1]); // Execute the pipeline as a MapReduce. PipelineResult result = pipeline.done(); return result.succeeded() ? 0 : 1; }
.groupByKey(1).combineValues(new CombineFn<Boolean, S>() { public void process(Pair<Boolean, Iterable<S>> input, Emitter<Pair<Boolean, S>> emitter) {
/**
 * Applies the given {@link Aggregator} to every element of the collection and
 * returns the aggregated result(s) as a new collection.
 *
 * @param collect the collection whose elements are aggregated
 * @param aggregator the aggregator to apply across all elements
 * @param <S> element type of the collection
 * @return a collection containing the aggregator's output
 */
public static <S> PCollection<S> aggregate(PCollection<S> collect, Aggregator<S> aggregator) {
  PTypeFamily typeFamily = collect.getTypeFamily();
  return collect
      .parallelDo("Aggregate.aggregator", new MapFn<S, Pair<Boolean, S>>() {
        public Pair<Boolean, S> map(S element) {
          // Constant key: funnel every element into a single group.
          return Pair.of(false, element);
        }
      }, typeFamily.tableOf(typeFamily.booleans(), collect.getPType()))
      .groupByKey(1)              // one partition so the aggregator sees all values
      .combineValues(aggregator)
      .values();                  // discard the synthetic boolean key
}
}
.groupByKey().combineValues(new CombineFn<Boolean, S>() { public void process(Pair<Boolean, Iterable<S>> input, Emitter<Pair<Boolean, S>> emitter) {
/**
 * Returns a {@code PTable} that contains the unique elements of this collection mapped to a count
 * of their occurrences.
 *
 * @param collect the collection whose elements are counted
 * @param <S> element type of the collection
 * @return a table of (element, occurrence count) pairs
 */
public static <S> PTable<S, Long> count(PCollection<S> collect) {
  PTypeFamily typeFamily = collect.getTypeFamily();
  // Emit (element, 1) for every input, then sum the ones per distinct element.
  return collect
      .parallelDo("Aggregate.count", new MapFn<S, Pair<S, Long>>() {
        public Pair<S, Long> map(S element) {
          return Pair.of(element, 1L);
        }
      }, typeFamily.tableOf(collect.getPType(), typeFamily.longs()))
      .groupByKey()
      .combineValues(Aggregators.SUM_LONGS());
}
.combineValues(new CombineFn<Boolean, S>() { public void process(Pair<Boolean, Iterable<S>> input, Emitter<Pair<Boolean, S>> emitter) { S min = null;
/**
 * Builds a table of {@link BloomFilter}s computed over the whole collection.
 *
 * @param collection the input elements the filters are built from
 * @param filterFn emits (name, partial filter) pairs per input split
 * @param <T> element type of the input collection
 * @return a table mapping filter name to the merged filter for the collection
 */
private static <T> PTable<String, BloomFilter> createFilterTable(PCollection<T> collection,
    BloomFilterFn<T> filterFn) {
  PTypeFamily typeFamily = collection.getTypeFamily();
  PTable<String, BloomFilter> partialFilters = collection.parallelDo(filterFn,
      typeFamily.tableOf(typeFamily.strings(), Writables.writables(BloomFilter.class)));
  // Single partition so all partial filters per name are merged by the aggregator.
  return partialFilters.groupByKey(1).combineValues(new BloomFilterAggregator());
}
/**
 * Returns a {@code PTable} that contains the unique elements of this
 * collection mapped to a count of their occurrences.
 *
 * @param collect the collection whose elements are counted
 * @param <S> element type of the collection
 * @return a table of (element, occurrence count) pairs
 */
public static <S> PTable<S, Long> count(PCollection<S> collect) {
  PTypeFamily typeFamily = collect.getTypeFamily();
  // NOTE(review): sibling count() overloads use Aggregators.SUM_LONGS(); consider
  // migrating off CombineFn.SUM_LONGS for consistency — confirm it is available here.
  return collect
      .parallelDo("Aggregate.count", new MapFn<S, Pair<S, Long>>() {
        public Pair<S, Long> map(S element) {
          // One occurrence per input element.
          return Pair.of(element, 1L);
        }
      }, typeFamily.tableOf(collect.getPType(), typeFamily.longs()))
      .groupByKey()
      .combineValues(CombineFn.<S> SUM_LONGS());
}
.combineValues(new CombineFn<Boolean, S>() { public void process(Pair<Boolean, Iterable<S>> input, Emitter<Pair<Boolean, S>> emitter) { S max = null;
/**
 * Returns a {@code PTable} that contains the unique elements of this collection mapped to a count
 * of their occurrences.
 *
 * @param collect the collection whose elements are counted
 * @param numPartitions number of partitions to use for the group-by stage
 * @param <S> element type of the collection
 * @return a table of (element, occurrence count) pairs
 */
public static <S> PTable<S, Long> count(PCollection<S> collect, int numPartitions) {
  PTypeFamily typeFamily = collect.getTypeFamily();
  // Emit (element, 1) per input and sum per distinct element across the requested partitions.
  return collect
      .parallelDo("Aggregate.count", new MapFn<S, Pair<S, Long>>() {
        public Pair<S, Long> map(S element) {
          return Pair.of(element, 1L);
        }
      }, typeFamily.tableOf(collect.getPType(), typeFamily.longs()))
      .groupByKey(numPartitions)
      .combineValues(Aggregators.SUM_LONGS());
}
public int run(String[] args) throws Exception { if (args.length != 2) { System.err.println(); System.err.println("Two and only two arguments are accepted."); System.err.println("Usage: " + this.getClass().getName() + " [generic options] input output"); System.err.println(); GenericOptionsParser.printGenericCommandUsage(System.err); return 1; } // Create an object to coordinate pipeline creation and execution. Pipeline pipeline = new MRPipeline(AverageBytesByIP.class, getConf()); // Reference a given text file as a collection of Strings. PCollection<String> lines = pipeline.readTextFile(args[0]); // Aggregator used for summing up response size and count Aggregator<Pair<Long, Long>> agg = pairAggregator(SUM_LONGS(), SUM_LONGS()); // Table of (ip, sum(response size), count) PTable<String, Pair<Long, Long>> remoteAddrResponseSize = lines .parallelDo(extractResponseSize, Writables.tableOf(Writables.strings(), Writables.pairs(Writables.longs(), Writables.longs()))).groupByKey() .combineValues(agg); // Calculate average response size by ip address PTable<String, Double> avgs = remoteAddrResponseSize.parallelDo(calulateAverage, Writables.tableOf(Writables.strings(), Writables.doubles())); // write the result to a text file pipeline.writeTextFile(avgs, args[1]); // Execute the pipeline as a MapReduce. PipelineResult result = pipeline.done(); return result.succeeded() ? 0 : 1; }
/**
 * Selects the top (or bottom) {@code limit} pairs of the table, with candidates
 * funnelled through a single reducer that keeps only the winners.
 *
 * @param ptable table from which the top pairs are selected
 * @param limit number of pairs to keep
 * @param maximize if true keep the largest pairs, otherwise the smallest
 * @param <K> key type of the table
 * @param <V> value type of the table
 * @return table containing the selected pairs
 */
public static <K, V> PTable<K, V> top(PTable<K, V> ptable, int limit, boolean maximize) {
  PTypeFamily typeFamily = ptable.getTypeFamily();
  PTableType<K, V> tableType = ptable.getPTableType();
  PType<Pair<K, V>> entryType = typeFamily.pairs(tableType.getKeyType(), tableType.getValueType());
  // Intermediate shape: every candidate pair keyed under one integer so a single
  // reducer can merge the per-mapper candidate sets.
  PTableType<Integer, Pair<K, V>> intermediateType = typeFamily.tableOf(typeFamily.ints(), entryType);
  return ptable
      .parallelDo("top" + limit + "map", new TopKFn<K, V>(limit, maximize), intermediateType)
      .groupByKey(1)
      .combineValues(new TopKCombineFn<K, V>(limit, maximize))
      .parallelDo("top" + limit + "reduce", new DoFn<Pair<Integer, Pair<K, V>>, Pair<K, V>>() {
        public void process(Pair<Integer, Pair<K, V>> candidate, Emitter<Pair<K, V>> emitter) {
          // Strip the synthetic integer key, emitting only the surviving (key, value) pair.
          emitter.emit(candidate.second());
        }
      }, tableType);
}
/**
 * Selects the top (or bottom) {@code limit} pairs of the table. Per-mapper candidate
 * sets are merged by a single reducer that retains only the winners.
 *
 * @param ptable table from which the top pairs are selected
 * @param limit number of pairs to keep
 * @param maximize if true keep the largest pairs, otherwise the smallest
 * @param <K> key type of the table
 * @param <V> value type of the table
 * @return table containing the selected pairs
 */
public static <K, V> PTable<K, V> top(PTable<K, V> ptable, int limit, boolean maximize) {
  PTypeFamily ptf = ptable.getTypeFamily();
  PTableType<K, V> baseType = ptable.getPTableType();
  PType<Pair<K, V>> pairOfKv = ptf.pairs(baseType.getKeyType(), baseType.getValueType());
  // Candidates travel keyed under a single int so one reducer sees them all.
  PTableType<Integer, Pair<K, V>> shuffleType = ptf.tableOf(ptf.ints(), pairOfKv);
  DoFn<Pair<Integer, Pair<K, V>>, Pair<K, V>> unwrap =
      new DoFn<Pair<Integer, Pair<K, V>>, Pair<K, V>>() {
        public void process(Pair<Integer, Pair<K, V>> input, Emitter<Pair<K, V>> emitter) {
          // Drop the synthetic key; keep the (key, value) pair.
          emitter.emit(input.second());
        }
      };
  return ptable
      .parallelDo("top" + limit + "map", new TopKFn<K, V>(limit, maximize), shuffleType)
      .groupByKey(1)
      .combineValues(new TopKCombineFn<K, V>(limit, maximize))
      .parallelDo("top" + limit + "reduce", unwrap, baseType);
}
/**
 * Calculate the mean average value by key for a table with numeric values.
 *
 * @param table PTable of (key, value) pairs to operate on
 * @param <K> Key type, can be any type
 * @param <V> Value type, must be numeric (ie. extend java.lang.Number)
 * @return PTable<K, Double> of (key, mean(values)) pairs
 */
public static <K, V extends Number> PTable<K, Double> meanValue(PTable<K, V> table) {
  PTypeFamily typeFamily = table.getTypeFamily();
  // Pair every value with a count of one so sum and count can be combined together.
  PTable<K, Pair<Double, Long>> valueWithCount = table.mapValues(
      new MapFn<V, Pair<Double, Long>>() {
        @Override
        public Pair<Double, Long> map(V value) {
          return Pair.of(value.doubleValue(), 1L);
        }
      }, typeFamily.pairs(typeFamily.doubles(), typeFamily.longs()));
  PGroupedTable<K, Pair<Double, Long>> grouped = valueWithCount.groupByKey();
  // Sum values and counts per key, then divide to obtain the mean.
  return grouped
      .combineValues(pairAggregator(SUM_DOUBLES(), SUM_LONGS()))
      .mapValues(new MapFn<Pair<Double, Long>, Double>() {
        @Override
        public Double map(Pair<Double, Long> sumAndCount) {
          return sumAndCount.first() / sumAndCount.second();
        }
      }, typeFamily.doubles());
}
/**
 * Returns the number of elements in the provided PCollection.
 *
 * NOTE(review): if {@code collect} is empty, the map stage emits no (1, 1L) pairs,
 * so the resulting table has no rows and the returned PObject has no first element
 * to read. A later revision fixes this by also emitting a (1, 0L) sentinel from the
 * MapFn's cleanup() and passing a 0L default to FirstElementPObject — confirm and
 * align this version if empty inputs must yield 0.
 *
 * @param collect The PCollection whose elements should be counted.
 * @param <S> The type of the PCollection.
 * @return A {@code PObject} containing the number of elements in the {@code PCollection}.
 */
public static <S> PObject<Long> length(PCollection<S> collect) {
  PTypeFamily tf = collect.getTypeFamily();
  // Map every element to (1, 1L) under a single shared key, then sum in one reducer.
  PTable<Integer, Long> countTable = collect
      .parallelDo("Aggregate.count", new MapFn<S, Pair<Integer, Long>>() {
        public Pair<Integer, Long> map(S input) {
          return Pair.of(1, 1L);
        }
      }, tf.tableOf(tf.ints(), tf.longs()))
      .groupByKey(GroupingOptions.builder().numReducers(1).build())
      .combineValues(Aggregators.SUM_LONGS());
  // The single remaining value is the total element count.
  PCollection<Long> count = countTable.values();
  return new FirstElementPObject<Long>(count);
}
/**
 * Same as the other groupedWeightedReservoirSample method, but include a seed for testing
 * purposes.
 *
 * @param input A {@code PTable} with the key a group ID and the value a weighted observation in that group
 * @param sampleSizes An array of length N, with each entry is the number of elements to include in that group
 * @param seed The test seed
 * @return A {@code PCollection} of the sampled elements for each of the groups
 */
public static <T, N extends Number> PCollection<Pair<Integer, T>> groupedWeightedReservoirSample(
    PTable<Integer, Pair<T, N>> input, int[] sampleSizes, Long seed) {
  PTypeFamily typeFamily = input.getTypeFamily();
  // The observation type is the first component of the table's (value, weight) pair type.
  PType<T> observationType = (PType<T>) input.getPTableType().getValueType().getSubTypes().get(0);
  PTableType<Integer, Pair<Double, T>> scoredType =
      typeFamily.tableOf(typeFamily.ints(), typeFamily.pairs(typeFamily.doubles(), observationType));
  return input
      .parallelDo("Initial reservoir sampling",
          new ReservoirSampleFn<T, N>(sampleSizes, seed, observationType), scoredType)
      .groupByKey(1)  // single reducer merges the per-mapper reservoirs
      .combineValues(new WRSCombineFn<T>(sampleSizes, observationType))
      .parallelDo("Extract sampled values",
          new MapFn<Pair<Integer, Pair<Double, T>>, Pair<Integer, T>>() {
            @Override
            public Pair<Integer, T> map(Pair<Integer, Pair<Double, T>> scored) {
              // Drop the sampling score, keeping (group ID, sampled element).
              return Pair.of(scored.first(), scored.second().second());
            }
          }, typeFamily.pairs(typeFamily.ints(), observationType));
}
/**
 * Calculate the mean average value by key for a table with numeric values.
 *
 * @param table PTable of (key, value) pairs to operate on
 * @param <K> Key type, can be any type
 * @param <V> Value type, must be numeric (ie. extend java.lang.Number)
 * @return PTable<K, Double> of (key, mean(values)) pairs
 */
public static <K, V extends Number> PTable<K, Double> meanValue(PTable<K, V> table) {
  PTypeFamily ptf = table.getTypeFamily();
  // Attach a count of 1 to each value so (sum, count) can be aggregated in one pass.
  MapFn<V, Pair<Double, Long>> attachCount = new MapFn<V, Pair<Double, Long>>() {
    @Override
    public Pair<Double, Long> map(V v) {
      return Pair.of(v.doubleValue(), 1L);
    }
  };
  MapFn<Pair<Double, Long>, Double> divide = new MapFn<Pair<Double, Long>, Double>() {
    @Override
    public Double map(Pair<Double, Long> sumCount) {
      // mean = sum / count
      return sumCount.first() / sumCount.second();
    }
  };
  PTable<K, Pair<Double, Long>> counted =
      table.mapValues(attachCount, ptf.pairs(ptf.doubles(), ptf.longs()));
  PGroupedTable<K, Pair<Double, Long>> byKey = counted.groupByKey();
  return byKey
      .combineValues(pairAggregator(SUM_DOUBLES(), SUM_LONGS()))
      .mapValues(divide, ptf.doubles());
}
/**
 * Returns the number of elements in the provided PCollection.
 *
 * @param collect The PCollection whose elements should be counted.
 * @param <S> The type of the PCollection.
 * @return A {@code PObject} containing the number of elements in the {@code PCollection}.
 */
public static <S> PObject<Long> length(PCollection<S> collect) {
  PTypeFamily typeFamily = collect.getTypeFamily();
  // Map every element to (1, 1L) under a single shared key and sum in one reducer.
  // cleanup() also emits a (1, 0L) sentinel so an empty input still yields a row.
  PTable<Integer, Long> totals = collect
      .parallelDo("Aggregate.count", new MapFn<S, Pair<Integer, Long>>() {
        public Pair<Integer, Long> map(S element) {
          return Pair.of(1, 1L);
        }

        public void cleanup(Emitter<Pair<Integer, Long>> emitter) {
          emitter.emit(Pair.of(1, 0L));
        }
      }, typeFamily.tableOf(typeFamily.ints(), typeFamily.longs()))
      .groupByKey(GroupingOptions.builder().numReducers(1).build())
      .combineValues(Aggregators.SUM_LONGS());
  // The single remaining value is the total; fall back to 0L if nothing is present.
  PCollection<Long> total = totals.values();
  return new FirstElementPObject<Long>(total, 0L);
}
/**
 * Selects the top N pairs from the given table, with sorting being performed on the values (i.e. the second
 * value in the pair) of the table.
 *
 * @param ptable table containing the pairs from which the top N is to be selected
 * @param limit number of top elements to select
 * @param maximize if true, the maximum N values from the table will be selected, otherwise the minimal
 * N values will be selected
 * @return table containing the top N values from the incoming table
 */
public static <K, V> PTable<K, V> top(PTable<K, V> ptable, int limit, boolean maximize) {
  PTypeFamily typeFamily = ptable.getTypeFamily();
  PTableType<K, V> tableType = ptable.getPTableType();
  PType<Pair<K, V>> entryType = typeFamily.pairs(tableType.getKeyType(), tableType.getValueType());
  // All candidate pairs are keyed under a single integer so one reducer can merge them.
  PTableType<Integer, Pair<K, V>> intermediateType = typeFamily.tableOf(typeFamily.ints(), entryType);
  return ptable
      .parallelDo("top" + limit + "map", new TopKFn<K, V>(limit, maximize, entryType), intermediateType)
      .groupByKey(1)
      .combineValues(new TopKCombineFn<K, V>(limit, maximize, entryType))
      .parallelDo("top" + limit + "reduce", new DoFn<Pair<Integer, Pair<K, V>>, Pair<K, V>>() {
        public void process(Pair<Integer, Pair<K, V>> candidate, Emitter<Pair<K, V>> emitter) {
          // Drop the synthetic key; emit only the surviving (key, value) pairs.
          emitter.emit(candidate.second());
        }
      }, tableType);
}