/**
 * Creates an in-memory {@code Shuffler} for the given key type, honoring any
 * caller-supplied {@code GroupingOptions}.
 *
 * <p>Pair keys combined with a grouping comparator get secondary-sort
 * semantics; a custom sort comparator yields a {@code TreeMap}-backed shuffler
 * ordered by that comparator; otherwise a plain map-backed shuffler is used.
 *
 * @param keyType  PType of the shuffle key
 * @param options  grouping options; may be null for default behavior
 * @param pipeline source of the Configuration used to build the comparator
 * @throws IllegalStateException if the intermediate Job cannot be created
 */
public static <S, T> Shuffler<S, T> create(PType<S> keyType, GroupingOptions options, Pipeline pipeline) {
  Map<Object, Collection<T>> map = getMapForKeyType(keyType);
  if (options != null) {
    Job job;
    try {
      // The Job exists only so options.configure(job) can populate a
      // Configuration for instantiating the sort comparator below.
      job = new Job(pipeline.getConfiguration());
    } catch (IOException e) {
      throw new IllegalStateException("Could not create Job instance", e);
    }
    options.configure(job);
    if (Pair.class.equals(keyType.getTypeClass()) && options.getGroupingComparatorClass() != null) {
      // Secondary sort: group by the first element of the Pair key.
      PType<?> pairKey = keyType.getSubTypes().get(0);
      return new SecondarySortShuffler(getMapForKeyType(pairKey));
    } else if (options.getSortComparatorClass() != null) {
      // Custom sort comparator: iterate keys in comparator order via TreeMap.
      RawComparator rc = ReflectionUtils.newInstance(
          options.getSortComparatorClass(), job.getConfiguration());
      map = new TreeMap<Object, Collection<T>>(rc);
      return new MapShuffler<S, T>(map, keyType);
    }
  }
  return new MapShuffler<S, T>(map);
}
/**
 * Reports whether the caller explicitly requested a reducer count.
 *
 * @return true iff grouping options are present and carry a positive
 *     number of reducers
 */
public boolean isNumReduceTasksSetByUser() {
  if (groupingOptions == null) {
    return false;
  }
  return groupingOptions.getNumReducers() > 0;
}
}
/**
 * Materializes an immutable {@code GroupingOptions} from the builder's
 * currently accumulated partitioner, comparator, and reducer settings.
 */
public GroupingOptions build() {
  return new GroupingOptions(partitionerClass, groupingComparatorClass, sortComparatorClass, numReducers);
}
}
/**
 * Builds an in-memory {@code Shuffler} for the given key type.
 *
 * <p>A Pair key with a grouping comparator selects secondary-sort shuffling;
 * a custom sort comparator selects a TreeMap ordered by that comparator;
 * anything else uses the default map for the key type.
 *
 * @param keyType  PType of the shuffle key
 * @param options  grouping options; may be null
 * @param pipeline supplies the Configuration for comparator instantiation
 */
public static <S, T> Shuffler<S, T> create(PType<S> keyType, GroupingOptions options, Pipeline pipeline) {
  Map<S, Collection<T>> backing = getMapForKeyType(keyType);
  if (options != null) {
    boolean secondarySort =
        Pair.class.equals(keyType.getTypeClass()) && options.getGroupingComparatorClass() != null;
    if (secondarySort) {
      // Group on the first element of the Pair key.
      PType<?> primaryKeyType = keyType.getSubTypes().get(0);
      return new SecondarySortShuffler(getMapForKeyType(primaryKeyType));
    }
    if (options.getSortComparatorClass() != null) {
      // Sorted iteration order requested: back the shuffler with a TreeMap.
      RawComparator<S> sortOrder =
          ReflectionUtils.newInstance(options.getSortComparatorClass(), pipeline.getConfiguration());
      backing = new TreeMap<S, Collection<T>>(sortOrder);
    }
  }
  return new MapShuffler<S, T>(backing);
}
/**
 * Lazily instantiates the user-configured Hadoop {@code Partitioner}.
 *
 * <p>Builds a throwaway Job so that the grouping options and the PType's
 * shuffle configuration can populate the Configuration the partitioner is
 * instantiated against. The call order (initialize, configure, then
 * configureShuffle) mirrors the MapReduce planner's setup sequence.
 *
 * @throws CrunchRuntimeException if the Job cannot be created
 */
private Partitioner getPartitioner() {
  if (partitioner == null) {
    try {
      ptype.initialize(runtimeContext.getConfiguration());
      Job job = new Job(runtimeContext.getConfiguration());
      options.configure(job);
      ptype.configureShuffle(job, options);
      partitioner = ReflectionUtils.newInstance(options.getPartitionerClass(), job.getConfiguration());
    } catch (IOException e) {
      throw new CrunchRuntimeException("Error configuring partitioner", e);
    }
  }
  return partitioner;
}
}
/**
 * Groups this table by key, requesting the given number of reduce tasks.
 *
 * @param numReduceTasks explicit reducer count to use for the shuffle
 */
public PGroupedTableImpl<K, V> groupByKey(int numReduceTasks) {
  GroupingOptions opts = GroupingOptions.builder().numReducers(numReduceTasks).build();
  return new PGroupedTableImpl<K, V>(this, opts);
}
// NOTE(review): this span appears to be a truncated excerpt from a larger
// Spark planning method -- the .map(...) chain below is unterminated and
// several braces never close. Annotated as-is; do not compile in isolation.
// Prefer the user-requested reducer count when positive; otherwise ask
// PartitionUtils for a recommendation from the pipeline configuration.
int numPartitions = (groupingOptions.getNumReducers() > 0) ? groupingOptions.getNumReducers()
    : PartitionUtils.getRecommendedPartitions(this, getPipeline().getConfiguration());
if (numPartitions <= 0) {
  if (groupingOptions.getPartitionerClass() != null) {
    // Map parent records into (key, value) pairs before partitioning.
    groupedRDD = parentRDD
        .map(new PairMapFunction(ptype.getOutputMapFn(), runtime.getRuntimeContext()))
    // Sort within partitions when sorted keys are required or a custom
    // sort comparator was supplied.
    if (groupingOptions.requireSortedKeys() || groupingOptions.getSortComparatorClass() != null) {
      SparkComparator scmp = new SparkComparator(groupingOptions, ptype, runtime.getRuntimeContext());
      groupedRDD = groupedRDD.sortByKey(scmp);
      // A grouping comparator additionally merges adjacent keys it deems
      // equal, emulating MapReduce secondary-sort grouping.
      if (groupingOptions.getGroupingComparatorClass() != null) {
        groupedRDD = groupedRDD.mapPartitionsToPair(
            new ReduceGroupingFunction(groupingOptions, ptype, runtime.getRuntimeContext()));
/**
 * Configures the shuffle for Writable-typed tables: applies any grouping
 * options, sets the map-output key/value classes, and installs the
 * TupleWritable comparator when no custom sort comparator was requested.
 */
@Override
public void configureShuffle(Job job, GroupingOptions options) {
  // Apply caller-supplied options first (partitioner, comparators, reducers).
  if (options != null) {
    options.configure(job);
  }
  WritableType shuffleKey = (WritableType) tableType.getKeyType();
  WritableType shuffleValue = (WritableType) tableType.getValueType();
  job.setMapOutputKeyClass(shuffleKey.getSerializationClass());
  job.setMapOutputValueClass(shuffleValue.getSerializationClass());
  // Default to the TupleWritable comparator for TupleWritable keys, but only
  // when the options did not already choose a sort comparator.
  boolean noCustomSort = options == null || options.getSortComparatorClass() == null;
  if (noCustomSort && TupleWritable.class.equals(shuffleKey.getSerializationClass())) {
    job.setSortComparatorClass(TupleWritable.Comparator.class);
  }
}
}
/**
 * Configures the shuffle for Avro-typed tables: publishes the map-output
 * schema, installs the Avro key comparator and Avro key/value wrapper
 * classes, applies any grouping options, and registers Crunch's safe Avro
 * serialization with Hadoop.
 */
@Override
public void configureShuffle(Job job, GroupingOptions options) {
  AvroTableType<K, V> att = (AvroTableType<K, V>) tableType;
  String schemaJson = att.getSchema().toString();
  Configuration conf = job.getConfiguration();
  if (!att.isSpecific()) {
    // Non-specific records must be handled via Avro reflection.
    conf.setBoolean(AvroJob.MAP_OUTPUT_IS_REFLECT, true);
  }
  conf.set(AvroJob.MAP_OUTPUT_SCHEMA, schemaJson);
  job.setSortComparatorClass(AvroKeyComparator.class);
  job.setMapOutputKeyClass(AvroKey.class);
  job.setMapOutputValueClass(AvroValue.class);
  // Options are applied last so caller-supplied comparators/partitioners can
  // override the Avro defaults set above.
  if (options != null) {
    options.configure(job);
  }
  Avros.configureReflectDataFactory(conf);
  // Register SafeAvroSerialization exactly once in io.serializations.
  Collection<String> serializations = job.getConfiguration().getStringCollection("io.serializations");
  if (!serializations.contains(SafeAvroSerialization.class.getName())) {
    serializations.add(SafeAvroSerialization.class.getName());
    job.getConfiguration().setStrings("io.serializations", serializations.toArray(new String[0]));
  }
}
/**
 * Chooses the backing map for an in-memory grouping of the given key type.
 *
 * <p>A custom sort comparator yields a TreeMap ordered by it; naturally
 * comparable keys yield a natural-order TreeMap; anything else gets an
 * unordered HashMap.
 *
 * @param keyType  PType of the grouping key; may be null
 * @param options  grouping options; may be null
 * @param pipeline supplies the Configuration for comparator instantiation
 */
private static <S, T> Map<S, Collection<T>> createMapFor(PType<S> keyType, GroupingOptions options, Pipeline pipeline) {
  boolean hasCustomSort = options != null && options.getSortComparatorClass() != null;
  if (hasCustomSort) {
    RawComparator<S> sortOrder =
        ReflectionUtils.newInstance(options.getSortComparatorClass(), pipeline.getConfiguration());
    return new TreeMap<S, Collection<T>>(sortOrder);
  }
  if (keyType != null && Comparable.class.isAssignableFrom(keyType.getTypeClass())) {
    return new TreeMap<S, Collection<T>>();
  }
  return Maps.newHashMap();
}
/**
 * Lazily instantiates the configured grouping comparator, building a
 * throwaway Job so the PType's shuffle configuration can populate the
 * Configuration the comparator is constructed against.
 *
 * @throws CrunchRuntimeException if the Job cannot be created
 */
private RawComparator<?> rawComparator() {
  if (cmp != null) {
    return cmp;
  }
  try {
    Job shuffleJob = new Job(ctxt.getConfiguration());
    ptype.configureShuffle(shuffleJob, options);
    cmp = ReflectionUtils.newInstance(options.getGroupingComparatorClass(), shuffleJob.getConfiguration());
  } catch (IOException e) {
    throw new CrunchRuntimeException("Error configuring grouping comparator", e);
  }
  return cmp;
}
/**
 * Groups this table by key with an explicit reducer count.
 *
 * @param numReduceTasks number of reduce tasks to request for the shuffle
 */
public PGroupedTableImpl<K, V> groupByKey(int numReduceTasks) {
  GroupingOptions reducerSetting = GroupingOptions.builder().numReducers(numReduceTasks).build();
  return new PGroupedTableImpl<K, V>(this, reducerSetting);
}
// NOTE(review): this span appears to be a truncated excerpt from a larger
// Spark planning method -- the .map(...) chain below is unterminated and
// several braces never close. Annotated as-is; do not compile in isolation.
// Use the explicit reducer count when positive, else derive a recommendation
// from the pipeline configuration.
int numPartitions = (groupingOptions.getNumReducers() > 0) ? groupingOptions.getNumReducers()
    : PartitionUtils.getRecommendedPartitions(this, getPipeline().getConfiguration());
if (numPartitions <= 0) {
  if (groupingOptions.getPartitionerClass() != null) {
    // Convert parent records to (key, value) pairs prior to partitioning.
    groupedRDD = parentRDD
        .map(new PairMapFunction(ptype.getOutputMapFn(), runtime.getRuntimeContext()))
    // Sort by key when sorting is required or a custom comparator was given.
    if (groupingOptions.requireSortedKeys() || groupingOptions.getSortComparatorClass() != null) {
      SparkComparator scmp = new SparkComparator(groupingOptions, ptype, runtime.getRuntimeContext());
      groupedRDD = groupedRDD.sortByKey(scmp);
      // Emulate MapReduce grouping-comparator semantics within partitions.
      if (groupingOptions.getGroupingComparatorClass() != null) {
        groupedRDD = groupedRDD.mapPartitionsToPair(
            new ReduceGroupingFunction(groupingOptions, ptype, runtime.getRuntimeContext()));
/**
 * Configures the shuffle for Writable-typed tables: applies any grouping
 * options and sets the serialized map-output key/value classes.
 */
@Override
public void configureShuffle(Job job, GroupingOptions options) {
  if (options != null) {
    options.configure(job);
  }
  WritableType shuffleKey = (WritableType) tableType.getKeyType();
  WritableType shuffleValue = (WritableType) tableType.getValueType();
  job.setMapOutputKeyClass(shuffleKey.getSerializationClass());
  job.setMapOutputValueClass(shuffleValue.getSerializationClass());
}
}
/**
 * Lazily instantiates the user-configured Hadoop {@code Partitioner}, using
 * a throwaway Job so the grouping options and the PType's shuffle setup can
 * populate the Configuration the partitioner is constructed against.
 *
 * @throws CrunchRuntimeException if the Job cannot be created
 */
private Partitioner getPartitioner() {
  if (partitioner != null) {
    return partitioner;
  }
  try {
    ptype.initialize(runtimeContext.getConfiguration());
    Job shuffleJob = new Job(runtimeContext.getConfiguration());
    options.configure(shuffleJob);
    ptype.configureShuffle(shuffleJob, options);
    partitioner = ReflectionUtils.newInstance(options.getPartitionerClass(), shuffleJob.getConfiguration());
  } catch (IOException e) {
    throw new CrunchRuntimeException("Error configuring partitioner", e);
  }
  return partitioner;
}
}
/**
 * Captures the grouping context and selects the comparator class for
 * Spark-side key sorting.
 */
public SparkComparator(GroupingOptions options, PGroupedTableType ptype, SparkRuntimeContext ctxt) {
  this.options = options;
  this.ptype = ptype;
  this.ctxt = ctxt;
  if (options.getSortComparatorClass() != null) {
    // An explicit user-supplied sort comparator always wins.
    this.cmpClass = options.getSortComparatorClass();
  } else if (AvroTypeFamily.getInstance().equals(ptype.getFamily())) {
    // Avro-typed keys default to the Avro key comparator.
    this.cmpClass = AvroKeyComparator.class;
  } else {
    // No comparator class: fall back to default ordering downstream.
    this.cmpClass = null;
  }
}
/**
 * Lazily instantiates the configured grouping comparator.
 *
 * <p>A throwaway Job is created so {@code ptype.configureShuffle} can
 * populate the Configuration the comparator is constructed against.
 *
 * @throws CrunchRuntimeException if the Job cannot be created
 */
private RawComparator<?> rawComparator() {
  if (cmp == null) {
    try {
      Job job = new Job(ctxt.getConfiguration());
      ptype.configureShuffle(job, options);
      cmp = ReflectionUtils.newInstance(options.getGroupingComparatorClass(), job.getConfiguration());
    } catch (IOException e) {
      throw new CrunchRuntimeException("Error configuring grouping comparator", e);
    }
  }
  return cmp;
}
/**
 * Groups this table by key using default grouping options, letting the
 * planner choose partitioning and sorting.
 */
public BaseGroupedTable<K, V> groupByKey() {
  GroupingOptions defaults = GroupingOptions.builder().build();
  return pipeline.getFactory().createGroupedTable(this, defaults);
}
/**
 * Configures the shuffle for Writable-typed tables: applies any grouping
 * options, then sets the map-output key/value classes to the key and value
 * types' serialization classes.
 */
@Override
public void configureShuffle(Job job, GroupingOptions options) {
  if (options != null) {
    options.configure(job);
  }
  WritableType keyType = (WritableType) tableType.getKeyType();
  WritableType valueType = (WritableType) tableType.getValueType();
  job.setMapOutputKeyClass(keyType.getSerializationClass());
  job.setMapOutputValueClass(valueType.getSerializationClass());
}
}
/**
 * Configures the shuffle for this grouped collection and, when the user did
 * not request an explicit reducer count, estimates one from the input size
 * (roughly one reducer per {@code crunch.bytes.per.reduce.task} bytes,
 * defaulting to 1 GB).
 */
public void configureShuffle(Job job) {
  ptype.configureShuffle(job, groupingOptions);
  if (groupingOptions == null || groupingOptions.getNumReducers() <= 0) {
    long bytesPerTask = job.getConfiguration().getLong("crunch.bytes.per.reduce.task", (1000L * 1000L * 1000L));
    // Compute in long and clamp before narrowing: the previous
    // 1 + (int)(getSize() / bytesPerTask) could overflow int and go negative
    // for very large inputs, silently skipping the reducer-count setting.
    long estimated = 1L + getSize() / bytesPerTask;
    int numReduceTasks = (int) Math.min(estimated, (long) Integer.MAX_VALUE);
    if (numReduceTasks > 0) {
      job.setNumReduceTasks(numReduceTasks);
      LOG.info(String.format("Setting num reduce tasks to %d", numReduceTasks));
    } else {
      // Only reachable if getSize() reports a negative size.
      LOG.warn("Attempted to set a negative number of reduce tasks");
    }
  }
}