private static <T> GroupingOptions buildGroupingOptions(Configuration conf, PTypeFamily tf, PType<T> ptype, Order order) { Builder builder = GroupingOptions.builder(); if (order == Order.DESCENDING) { if (tf == WritableTypeFamily.getInstance()) { builder.sortComparatorClass(ReverseWritableComparator.class); } else if (tf == AvroTypeFamily.getInstance()) { AvroType<T> avroType = (AvroType<T>) ptype; Schema schema = avroType.getSchema(); conf.set("crunch.schema", schema.toString()); builder.sortComparatorClass(ReverseAvroComparator.class); } else { throw new RuntimeException("Unrecognized type family: " + tf); } } // TODO:CRUNCH-23: Intermediate Fix for release 1. More elaborate fix is // required check JIRA for details. builder.numReducers(1); return builder.build(); }
.sortComparatorClass(KeyValueComparator.class) .conf(TotalOrderPartitioner.PARTITIONER_PATH, partitionFile.toString()) .numReducers(splitPoints.size() + 1) .build(); return t.groupByKey(options).ungroup().keys();
.sortComparatorClass(KeyValueComparator.class) .conf(TotalOrderPartitioner.PARTITIONER_PATH, partitionFile.toString()) .numReducers(splitPoints.size() + 1) .build(); return t.groupByKey(options).ungroup().keys();
/**
 * Tags both sides of a join and groups them together by a composite (key, tag) key.
 *
 * <p>Records from {@code left} are keyed by {@code (key, 0)} and carry {@code (value, null)};
 * records from {@code right} are keyed by {@code (key, 1)} and carry {@code (null, value)}.
 * The two tagged tables are unioned and grouped with sorted keys, using the partitioner that
 * {@code JoinUtils} supplies for the left table's type family.
 *
 * @param left left-hand table; its type family and key type drive the intermediate types
 * @param right right-hand table
 * @param numReducers number of reducers to request; ignored unless greater than zero
 * @return the grouped union of the tagged left and right tables
 */
static <K, U, V> PGroupedTable<Pair<K, Integer>, Pair<U, V>> preJoin(PTable<K, U> left,
    PTable<K, V> right, int numReducers) {
  PTypeFamily family = left.getTypeFamily();
  PTableType<Pair<K, Integer>, Pair<U, V>> taggedType = family.tableOf(
      family.pairs(left.getKeyType(), family.ints()),
      family.pairs(left.getValueType(), right.getValueType()));

  // Left records get tag 0 and leave the right-hand value slot empty.
  MapFn<Pair<K, U>, Pair<Pair<K, Integer>, Pair<U, V>>> tagLeft =
      new MapFn<Pair<K, U>, Pair<Pair<K, Integer>, Pair<U, V>>>() {
        @Override
        public Pair<Pair<K, Integer>, Pair<U, V>> map(Pair<K, U> input) {
          Pair<K, Integer> taggedKey = Pair.of(input.first(), 0);
          Pair<U, V> paddedValue = Pair.of(input.second(), (V) null);
          return Pair.of(taggedKey, paddedValue);
        }
      };

  // Right records get tag 1 and leave the left-hand value slot empty.
  MapFn<Pair<K, V>, Pair<Pair<K, Integer>, Pair<U, V>>> tagRight =
      new MapFn<Pair<K, V>, Pair<Pair<K, Integer>, Pair<U, V>>>() {
        @Override
        public Pair<Pair<K, Integer>, Pair<U, V>> map(Pair<K, V> input) {
          Pair<K, Integer> taggedKey = Pair.of(input.first(), 1);
          Pair<U, V> paddedValue = Pair.of((U) null, input.second());
          return Pair.of(taggedKey, paddedValue);
        }
      };

  PTable<Pair<K, Integer>, Pair<U, V>> taggedLeft = left.parallelDo("joinTagLeft", tagLeft, taggedType);
  PTable<Pair<K, Integer>, Pair<U, V>> taggedRight = right.parallelDo("joinTagRight", tagRight, taggedType);

  GroupingOptions.Builder grouping = GroupingOptions.builder()
      .requireSortedKeys()
      .partitionerClass(JoinUtils.getPartitionerClass(family));
  if (numReducers > 0) {
    grouping.numReducers(numReducers);
  }

  return (taggedLeft.union(taggedRight)).groupByKey(grouping.build());
}
/**
 * Counts the elements of the given PCollection.
 *
 * <p>Every element is mapped to the pair {@code (1, 1L)}, and the mapping function additionally
 * emits {@code (1, 0L)} from its cleanup hook. All pairs are grouped on a single reducer and
 * their values summed.
 *
 * @param collect The PCollection whose elements should be counted.
 * @param <S> The type of the PCollection.
 * @return A {@code PObject} containing the number of elements in the {@code PCollection}.
 */
public static <S> PObject<Long> length(PCollection<S> collect) {
  PTypeFamily family = collect.getTypeFamily();

  MapFn<S, Pair<Integer, Long>> tagWithOne = new MapFn<S, Pair<Integer, Long>>() {
    @Override
    public Pair<Integer, Long> map(S input) {
      return Pair.of(1, 1L);
    }

    // Emits a zero-valued pair under the same key; presumably so the sum still has a row
    // when no input element was seen -- confirm against the DoFn lifecycle.
    public void cleanup(Emitter<Pair<Integer, Long>> e) {
      e.emit(Pair.of(1, 0L));
    }
  };

  GroupingOptions singleReducer = GroupingOptions.builder().numReducers(1).build();
  PTable<Integer, Long> totals = collect
      .parallelDo("Aggregate.count", tagWithOne, family.tableOf(family.ints(), family.longs()))
      .groupByKey(singleReducer)
      .combineValues(Aggregators.SUM_LONGS());

  // Second argument is passed through unchanged; presumably the fallback value -- confirm.
  return new FirstElementPObject<Long>(totals.values(), 0L);
}
/**
 * Counts the elements of the given PCollection.
 *
 * <p>Every element is mapped to the pair {@code (1, 1L)}; the pairs are grouped on a single
 * reducer and their values summed. The result is exposed through a {@code FirstElementPObject}
 * over the summed values.
 *
 * @param collect The PCollection whose elements should be counted.
 * @param <S> The type of the PCollection.
 * @return A {@code PObject} containing the number of elements in the {@code PCollection}.
 */
public static <S> PObject<Long> length(PCollection<S> collect) {
  PTypeFamily family = collect.getTypeFamily();

  MapFn<S, Pair<Integer, Long>> tagWithOne = new MapFn<S, Pair<Integer, Long>>() {
    @Override
    public Pair<Integer, Long> map(S input) {
      return Pair.of(1, 1L);
    }
  };

  GroupingOptions singleReducer = GroupingOptions.builder().numReducers(1).build();
  PTable<Integer, Long> totals = collect
      .parallelDo("Aggregate.count", tagWithOne, family.tableOf(family.ints(), family.longs()))
      .groupByKey(singleReducer)
      .combineValues(Aggregators.SUM_LONGS());

  // NOTE(review): no explicit fallback is supplied here; check what this PObject yields
  // for an empty input collection.
  return new FirstElementPObject<Long>(totals.values());
}
private static <K, V> void configureReducers(GroupingOptions.Builder builder, PTable<K, V> ptable, Configuration conf, int numReducers) { if (numReducers <= 0) { numReducers = PartitionUtils.getRecommendedPartitions(ptable, conf); if (numReducers < 5) { // Not worth the overhead, force it to 1 numReducers = 1; } } builder.numReducers(numReducers); if (numReducers > 1) { Iterable<K> iter = Sample.reservoirSample(ptable.keys(), numReducers - 1).materialize(); MaterializableIterable<K> mi = (MaterializableIterable<K>) iter; if (mi.isSourceTarget()) { builder.sourceTargets((SourceTarget) mi.getSource()); } builder.partitionerClass(TotalOrderPartitioner.class); builder.conf(TotalOrderPartitioner.PARTITIONER_PATH, mi.getPath().toString()); //TODO: distcache handling } }
private static <T> GroupingOptions buildGroupingOptions(Configuration conf, PTypeFamily tf, PType<T> ptype, ColumnOrder[] columnOrders) { Builder builder = GroupingOptions.builder(); if (tf == WritableTypeFamily.getInstance()) { TupleWritableComparator.configureOrdering(conf, columnOrders); builder.sortComparatorClass(TupleWritableComparator.class); } else if (tf == AvroTypeFamily.getInstance()) { TupleAvroComparator.configureOrdering(conf, columnOrders, ptype); builder.sortComparatorClass(TupleAvroComparator.class); } else { throw new RuntimeException("Unrecognized type family: " + tf); } // TODO:CRUNCH-23: Intermediate Fix for release 1. More elaborate fix is // required check JIRA for details. builder.numReducers(1); return builder.build(); }
/**
 * Re-keys the input table for a secondary sort and groups it.
 *
 * <p>The input is run through {@code SSFormatFn} into a table keyed by {@code Pair<K, V1>},
 * whose key type is built from the input key type and the first sub-type of the value pair
 * type. The result is grouped with sorted keys, using the grouping comparator and partitioner
 * that {@code JoinUtils} supplies for the input's type family.
 *
 * @param input table of keys to value pairs
 * @param numReducers number of reducers to request; ignored unless greater than zero
 * @return the grouped, re-keyed table
 */
private static <K, V1, V2> PGroupedTable<Pair<K, V1>, Pair<V1, V2>> prepare(
    PTable<K, Pair<V1, V2>> input, int numReducers) {
  PTypeFamily family = input.getTypeFamily();
  PType<Pair<V1, V2>> pairType = input.getValueType();
  PTableType<Pair<K, V1>, Pair<V1, V2>> compositeKeyed = family.tableOf(
      family.pairs(input.getKeyType(), pairType.getSubTypes().get(0)),
      pairType);

  GroupingOptions.Builder grouping = GroupingOptions.builder()
      .requireSortedKeys()
      .groupingComparatorClass(JoinUtils.getGroupingComparator(family))
      .partitionerClass(JoinUtils.getPartitionerClass(family));
  if (numReducers > 0) {
    grouping.numReducers(numReducers);
  }

  PTable<Pair<K, V1>, Pair<V1, V2>> formatted =
      input.parallelDo("SecondarySort.format", new SSFormatFn<K, V1, V2>(), compositeKeyed);
  return formatted.groupByKey(grouping.build());
}
/**
 * Groups this table by key using the given number of reduce tasks.
 *
 * <p>The grouped table is created through the factory of this table's pipeline.
 *
 * @param numReduceTasks reducer count placed into the grouping options
 * @return the grouped table
 */
public BaseGroupedTable<K, V> groupByKey(int numReduceTasks) {
  GroupingOptions options = GroupingOptions.builder()
      .numReducers(numReduceTasks)
      .build();
  return pipeline.getFactory().createGroupedTable(this, options);
}
/**
 * Groups this table by key using the given number of reduce tasks.
 *
 * @param numReduceTasks reducer count placed into the grouping options
 * @return a new grouped table over this table
 */
public PGroupedTableImpl<K, V> groupByKey(int numReduceTasks) {
  GroupingOptions options = GroupingOptions.builder()
      .numReducers(numReduceTasks)
      .build();
  return new PGroupedTableImpl<K, V>(this, options);
}
/**
 * Groups this table by key using the given number of reduce tasks.
 *
 * @param numReduceTasks reducer count placed into the grouping options
 * @return a new grouped table over this table
 */
public PGroupedTableImpl<K, V> groupByKey(int numReduceTasks) {
  GroupingOptions options = GroupingOptions.builder()
      .numReducers(numReduceTasks)
      .build();
  return new PGroupedTableImpl<K, V>(this, options);
}