writePartitionInfo(conf, partitionFile, splitPoints); GroupingOptions options = GroupingOptions.builder() .partitionerClass(TotalOrderPartitioner.class) .sortComparatorClass(KeyValueComparator.class) .conf(TotalOrderPartitioner.PARTITIONER_PATH, partitionFile.toString()) .numReducers(splitPoints.size() + 1) .build(); return t.groupByKey(options).ungroup().keys();
private static <K, V> GroupingOptions buildGroupingOptions(PTable<K, V> ptable, Configuration conf, int numReducers, ColumnOrder[] columnOrders) { PTypeFamily tf = ptable.getTypeFamily(); PType<K> keyType = ptable.getKeyType(); GroupingOptions.Builder builder = GroupingOptions.builder(); if (tf == WritableTypeFamily.getInstance()) { if (columnOrders.length == 1 && columnOrders[0].order == Order.DESCENDING) { builder.sortComparatorClass(ReverseWritableComparator.class); } else { WritableType[] wt = new WritableType[columnOrders.length]; for (int i = 0; i < wt.length; i++) { wt[i] = (WritableType) keyType.getSubTypes().get(i); } TupleWritableComparator.configureOrdering(conf, wt, columnOrders); builder.sortComparatorClass(TupleWritableComparator.class); } } else if (tf == AvroTypeFamily.getInstance()) { AvroType<K> avroType = (AvroType<K>) keyType; Schema schema = avroType.getSchema(); builder.conf("crunch.schema", schema.toString()); if (columnOrders.length == 1 && columnOrders[0].order == Order.DESCENDING) { builder.sortComparatorClass(ReverseAvroComparator.class); } } else { throw new RuntimeException("Unrecognized type family: " + tf); } builder.requireSortedKeys(); configureReducers(builder, ptable, conf, numReducers); return builder.build(); }
writePartitionInfo(conf, partitionFile, splitPoints); GroupingOptions options = GroupingOptions.builder() .partitionerClass(TotalOrderPartitioner.class) .sortComparatorClass(KeyValueComparator.class) .conf(TotalOrderPartitioner.PARTITIONER_PATH, partitionFile.toString()) .numReducers(splitPoints.size() + 1) .build(); return t.groupByKey(options).ungroup().keys();
public static PGroupedTable<TupleN, ByteBuffer> apply(PCollection<ByteBuffer> traces, List<String> keys) { Field[] fields = new Field[keys.size()]; boolean[] negate = new boolean[keys.size()]; for (int i = 0; i < keys.size(); i++) { String key = keys.get(i); if (key.charAt(0) == '-') { negate[i] = true; key = key.substring(1); } fields[i] = Fields.getSortField(key); if (fields[i] == null) { throw new IllegalArgumentException("Unrecognized susort key: " + keys.get(i)); } } PTypeFamily tf = traces.getTypeFamily(); PType[] headerTypes = new PType[keys.size()]; for (int i = 0; i < keys.size(); i++) { headerTypes[i] = tf.ints(); } GroupingOptions options = GroupingOptions.builder() .partitionerClass(JoinUtils.getPartitionerClass(tf)).build(); return traces.parallelDo("gethw", new HeaderExtractor(fields, negate), tf.tableOf(tf.tuples(headerTypes), tf.bytes())).groupByKey(options); } }
static <K, U, V> PGroupedTable<Pair<K, Integer>, Pair<U, V>> preJoin(PTable<K, U> left, PTable<K, V> right, int numReducers) { PTypeFamily ptf = left.getTypeFamily(); PTableType<Pair<K, Integer>, Pair<U, V>> ptt = ptf.tableOf(ptf.pairs(left.getKeyType(), ptf.ints()), ptf.pairs(left.getValueType(), right.getValueType())); PTable<Pair<K, Integer>, Pair<U, V>> tag1 = left.parallelDo("joinTagLeft", new MapFn<Pair<K, U>, Pair<Pair<K, Integer>, Pair<U, V>>>() { @Override public Pair<Pair<K, Integer>, Pair<U, V>> map(Pair<K, U> input) { return Pair.of(Pair.of(input.first(), 0), Pair.of(input.second(), (V) null)); } }, ptt); PTable<Pair<K, Integer>, Pair<U, V>> tag2 = right.parallelDo("joinTagRight", new MapFn<Pair<K, V>, Pair<Pair<K, Integer>, Pair<U, V>>>() { @Override public Pair<Pair<K, Integer>, Pair<U, V>> map(Pair<K, V> input) { return Pair.of(Pair.of(input.first(), 1), Pair.of((U) null, input.second())); } }, ptt); GroupingOptions.Builder optionsBuilder = GroupingOptions.builder(); optionsBuilder.requireSortedKeys(); optionsBuilder.partitionerClass(JoinUtils.getPartitionerClass(ptf)); if (numReducers > 0) { optionsBuilder.numReducers(numReducers); } return (tag1.union(tag2)).groupByKey(optionsBuilder.build()); }
private static <K, U, V> PGroupedTable<Pair<K, Integer>, Pair<U, V>> preJoin(PTable<K, U> left, PTable<K, V> right) { PTypeFamily ptf = left.getTypeFamily(); PTableType<Pair<K, Integer>, Pair<U, V>> ptt = ptf.tableOf(ptf.pairs(left.getKeyType(), ptf.ints()), ptf.pairs(left.getValueType(), right.getValueType())); PTable<Pair<K, Integer>, Pair<U, V>> tag1 = left.parallelDo("joinTagLeft", new MapFn<Pair<K, U>, Pair<Pair<K, Integer>, Pair<U, V>>>() { @Override public Pair<Pair<K, Integer>, Pair<U, V>> map(Pair<K, U> input) { return Pair.of(Pair.of(input.first(), 0), Pair.of(input.second(), (V) null)); } }, ptt); PTable<Pair<K, Integer>, Pair<U, V>> tag2 = right.parallelDo("joinTagRight", new MapFn<Pair<K, V>, Pair<Pair<K, Integer>, Pair<U, V>>>() { @Override public Pair<Pair<K, Integer>, Pair<U, V>> map(Pair<K, V> input) { return Pair.of(Pair.of(input.first(), 1), Pair.of((U) null, input.second())); } }, ptt); GroupingOptions.Builder optionsBuilder = GroupingOptions.builder(); optionsBuilder.partitionerClass(JoinUtils.getPartitionerClass(ptf)); return (tag1.union(tag2)).groupByKey(optionsBuilder.build()); } }
/** * Returns the number of elements in the provided PCollection. * * @param collect The PCollection whose elements should be counted. * @param <S> The type of the PCollection. * @return A {@code PObject} containing the number of elements in the {@code PCollection}. */ public static <S> PObject<Long> length(PCollection<S> collect) { PTypeFamily tf = collect.getTypeFamily(); PTable<Integer, Long> countTable = collect .parallelDo("Aggregate.count", new MapFn<S, Pair<Integer, Long>>() { public Pair<Integer, Long> map(S input) { return Pair.of(1, 1L); } public void cleanup(Emitter<Pair<Integer, Long>> e) { e.emit(Pair.of(1, 0L)); } }, tf.tableOf(tf.ints(), tf.longs())) .groupByKey(GroupingOptions.builder().numReducers(1).build()) .combineValues(Aggregators.SUM_LONGS()); PCollection<Long> count = countTable.values(); return new FirstElementPObject<Long>(count, 0L); }
private static <K, V> GroupingOptions buildGroupingOptions(PTable<K, V> ptable, Configuration conf, int numReducers, Order order) { PType<K> ptype = ptable.getKeyType(); PTypeFamily tf = ptable.getTypeFamily(); GroupingOptions.Builder builder = GroupingOptions.builder(); if (order == Order.DESCENDING) { if (tf == WritableTypeFamily.getInstance()) { builder.sortComparatorClass(ReverseWritableComparator.class); } else if (tf == AvroTypeFamily.getInstance()) { AvroType<K> avroType = (AvroType<K>) ptype; Schema schema = avroType.getSchema(); builder.conf("crunch.schema", schema.toString()); builder.sortComparatorClass(ReverseAvroComparator.class); } else { throw new RuntimeException("Unrecognized type family: " + tf); } } else if (tf == AvroTypeFamily.getInstance()) { builder.conf("crunch.schema", ((AvroType<K>) ptype).getSchema().toString()); } builder.requireSortedKeys(); configureReducers(builder, ptable, conf, numReducers); return builder.build(); }
private static <K, U, V> PGroupedTable<Pair<K, Integer>, Pair<U, V>> preJoin( PTable<K, U> left, PTable<K, V> right) { PTypeFamily ptf = left.getTypeFamily(); PTableType<Pair<K, Integer>, Pair<U, V>> ptt = ptf.tableOf(ptf.pairs(left.getKeyType(), ptf.ints()), ptf.pairs(left.getValueType(), right.getValueType())); PTable<Pair<K, Integer>, Pair<U, V>> tag1 = left.parallelDo("joinTagLeft", new MapFn<Pair<K, U>, Pair<Pair<K, Integer>, Pair<U, V>>>() { @Override public Pair<Pair<K, Integer>, Pair<U, V>> map(Pair<K, U> input) { return Pair.of(Pair.of(input.first(), 0), Pair.of(input.second(), (V) null)); } }, ptt); PTable<Pair<K, Integer>, Pair<U, V>> tag2 = right.parallelDo("joinTagRight", new MapFn<Pair<K, V>, Pair<Pair<K, Integer>, Pair<U, V>>>() { @Override public Pair<Pair<K, Integer>, Pair<U, V>> map(Pair<K, V> input) { return Pair.of(Pair.of(input.first(), 1), Pair.of((U) null, input.second())); } }, ptt); GroupingOptions.Builder optionsBuilder = GroupingOptions.builder(); optionsBuilder.partitionerClass(JoinUtils.getPartitionerClass(ptf)); return (tag1.union(tag2)).groupByKey(optionsBuilder.build()); } }
private static <K, V> void configureReducers(GroupingOptions.Builder builder, PTable<K, V> ptable, Configuration conf, int numReducers) { if (numReducers <= 0) { numReducers = PartitionUtils.getRecommendedPartitions(ptable, conf); if (numReducers < 5) { // Not worth the overhead, force it to 1 numReducers = 1; } } builder.numReducers(numReducers); if (numReducers > 1) { Iterable<K> iter = Sample.reservoirSample(ptable.keys(), numReducers - 1).materialize(); MaterializableIterable<K> mi = (MaterializableIterable<K>) iter; if (mi.isSourceTarget()) { builder.sourceTargets((SourceTarget) mi.getSource()); } builder.partitionerClass(TotalOrderPartitioner.class); builder.conf(TotalOrderPartitioner.PARTITIONER_PATH, mi.getPath().toString()); //TODO: distcache handling } }
/** * Returns the number of elements in the provided PCollection. * * @param collect The PCollection whose elements should be counted. * @param <S> The type of the PCollection. * @return A {@code PObject} containing the number of elements in the {@code PCollection}. */ public static <S> PObject<Long> length(PCollection<S> collect) { PTypeFamily tf = collect.getTypeFamily(); PTable<Integer, Long> countTable = collect .parallelDo("Aggregate.count", new MapFn<S, Pair<Integer, Long>>() { public Pair<Integer, Long> map(S input) { return Pair.of(1, 1L); } }, tf.tableOf(tf.ints(), tf.longs())) .groupByKey(GroupingOptions.builder().numReducers(1).build()) .combineValues(Aggregators.SUM_LONGS()); PCollection<Long> count = countTable.values(); return new FirstElementPObject<Long>(count); }
private static <T> GroupingOptions buildGroupingOptions(Configuration conf, PTypeFamily tf, PType<T> ptype, Order order) { Builder builder = GroupingOptions.builder(); if (order == Order.DESCENDING) { if (tf == WritableTypeFamily.getInstance()) { builder.sortComparatorClass(ReverseWritableComparator.class); } else if (tf == AvroTypeFamily.getInstance()) { AvroType<T> avroType = (AvroType<T>) ptype; Schema schema = avroType.getSchema(); conf.set("crunch.schema", schema.toString()); builder.sortComparatorClass(ReverseAvroComparator.class); } else { throw new RuntimeException("Unrecognized type family: " + tf); } } // TODO:CRUNCH-23: Intermediate Fix for release 1. More elaborate fix is // required check JIRA for details. builder.numReducers(1); return builder.build(); }
private static <T> GroupingOptions buildGroupingOptions(Configuration conf, PTypeFamily tf, PType<T> ptype, ColumnOrder[] columnOrders) { Builder builder = GroupingOptions.builder(); if (tf == WritableTypeFamily.getInstance()) { TupleWritableComparator.configureOrdering(conf, columnOrders); builder.sortComparatorClass(TupleWritableComparator.class); } else if (tf == AvroTypeFamily.getInstance()) { TupleAvroComparator.configureOrdering(conf, columnOrders, ptype); builder.sortComparatorClass(TupleAvroComparator.class); } else { throw new RuntimeException("Unrecognized type family: " + tf); } // TODO:CRUNCH-23: Intermediate Fix for release 1. More elaborate fix is // required check JIRA for details. builder.numReducers(1); return builder.build(); }
private static <K, V1, V2> PGroupedTable<Pair<K, V1>, Pair<V1, V2>> prepare( PTable<K, Pair<V1, V2>> input, int numReducers) { PTypeFamily ptf = input.getTypeFamily(); PType<Pair<V1, V2>> valueType = input.getValueType(); PTableType<Pair<K, V1>, Pair<V1, V2>> inter = ptf.tableOf( ptf.pairs(input.getKeyType(), valueType.getSubTypes().get(0)), valueType); GroupingOptions.Builder gob = GroupingOptions.builder() .requireSortedKeys() .groupingComparatorClass(JoinUtils.getGroupingComparator(ptf)) .partitionerClass(JoinUtils.getPartitionerClass(ptf)); if (numReducers > 0) { gob.numReducers(numReducers); } return input.parallelDo("SecondarySort.format", new SSFormatFn<K, V1, V2>(), inter) .groupByKey(gob.build()); }
private static <T> GroupingOptions buildGroupingOptions(Configuration conf, PTypeFamily tf, PType<T> ptype, Order order) { Builder builder = GroupingOptions.builder(); if (order == Order.DESCENDING) { if (tf == WritableTypeFamily.getInstance()) { builder.sortComparatorClass(ReverseWritableComparator.class); } else if (tf == AvroTypeFamily.getInstance()) { AvroType<T> avroType = (AvroType<T>) ptype; Schema schema = avroType.getSchema(); conf.set("crunch.schema", schema.toString()); builder.sortComparatorClass(ReverseAvroComparator.class); } else { throw new RuntimeException("Unrecognized type family: " + tf); } } return builder.build(); }
private static <K, V1, V2> PGroupedTable<Pair<K, V1>, Pair<V1, V2>> prepare( PTable<K, Pair<V1, V2>> input) { PTypeFamily ptf = input.getTypeFamily(); PType<Pair<V1, V2>> valueType = input.getValueType(); PTableType<Pair<K, V1>, Pair<V1, V2>> inter = ptf.tableOf( ptf.pairs(input.getKeyType(), valueType.getSubTypes().get(0)), valueType); PTableType<K, Collection<Pair<V1, V2>>> out = ptf.tableOf(input.getKeyType(), ptf.collections(input.getValueType())); return input.parallelDo("SecondarySort.format", new SSFormatFn<K, V1, V2>(), inter) .groupByKey( GroupingOptions.builder() .groupingComparatorClass(JoinUtils.getGroupingComparator(ptf)) .partitionerClass(JoinUtils.getPartitionerClass(ptf)) .build()); }
public BaseGroupedTable<K, V> groupByKey() { return pipeline.getFactory().createGroupedTable(this, GroupingOptions.builder().build()); }
public static Builder builder() { return new Builder(); }
public static Builder builder() { return new Builder(); }
public static Builder builder() { return new Builder(); }