public static SparkEdgeProperty getEdgeProperty(HiveConf conf, ReduceSinkOperator reduceSink,
    ReduceWork reduceWork) throws SemanticException {
  boolean useSparkGroupBy = conf.getBoolVar(HiveConf.ConfVars.SPARK_USE_GROUPBY_SHUFFLE);
  SparkEdgeProperty edgeProperty = new SparkEdgeProperty(SparkEdgeProperty.SHUFFLE_NONE);
  edgeProperty.setNumPartitions(reduceWork.getNumReduceTasks());
  String sortOrder = Strings.nullToEmpty(reduceSink.getConf().getOrder()).trim();

  // NOTE: the guard conditions below (hasGBYOperator, groupByNeedParLevelOrder,
  // getChildOperator) are GenSparkUtils helpers reconstructed from context; the
  // original snippet was truncated and lost the surrounding control flow.
  if (hasGBYOperator(reduceSink)) {
    edgeProperty.setShuffleGroup();
    // A group by that needs partition-level sort must use the MR-style shuffle;
    // SHUFFLE_SORT shouldn't be used for this purpose (see HIVE-8542).
    if (!useSparkGroupBy || (!sortOrder.isEmpty() && groupByNeedParLevelOrder(reduceSink))) {
      if (!useSparkGroupBy) {
        LOG.info("hive.spark.use.groupby.shuffle is off. Use repartition shuffle instead.");
      }
      edgeProperty.setMRShuffle();
    }
  }

  if (reduceWork.getReducer() instanceof JoinOperator) {
    // Reduce-side join: use MR-style shuffle.
    edgeProperty.setMRShuffle();
  }

  // A FileSink writing bucketed files also needs MR-style shuffle.
  FileSinkOperator fso = getChildOperator(reduceWork.getReducer(), FileSinkOperator.class);
  if (fso != null) {
    String bucketCount = fso.getConf().getTableInfo().getProperties().getProperty(
        hive_metastoreConstants.BUCKET_COUNT);
    if (bucketCount != null && Integer.parseInt(bucketCount) > 1) {
      edgeProperty.setMRShuffle();
    }
  }

  // SHUFFLE_SORT should only be used for global order.
  if (edgeProperty.isShuffleNone() && !sortOrder.isEmpty()) {
    if ((reduceSink.getConf().getPartitionCols() == null
        || reduceSink.getConf().getPartitionCols().isEmpty()
        || isSame(reduceSink.getConf().getPartitionCols(), reduceSink.getConf().getKeyCols()))
        && reduceSink.getConf().hasOrderBy()) {
      edgeProperty.setShuffleSort();
    } else {
      edgeProperty.setMRShuffle();
    }
  }

  // Simple distribute-by ends up here.
  if (edgeProperty.isShuffleNone()) {
    if (!useSparkGroupBy) {
      LOG.info("hive.spark.use.groupby.shuffle is off. Use repartition shuffle instead.");
      edgeProperty.setMRShuffle();
    } else {
      edgeProperty.setShuffleGroup();
    }
  }

  return edgeProperty;
}
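For context, the useSparkGroupBy flag consulted above maps to the hive.spark.use.groupby.shuffle setting. A minimal illustrative toggle; HiveConf.setBoolVar is the standard setter counterpart of the getBoolVar call above, but this standalone setup is hypothetical:

HiveConf conf = new HiveConf();
// With the flag off, plain distribute-by edges take setMRShuffle()
// instead of setShuffleGroup(), as the tail of getEdgeProperty shows.
conf.setBoolVar(HiveConf.ConfVars.SPARK_USE_GROUPBY_SHUFFLE, false);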
private ShuffleTran generate(SparkPlan sparkPlan, SparkEdgeProperty edge, boolean toCache,
    String name, BaseWork work) {
  Preconditions.checkArgument(!edge.isShuffleNone(),
      "AssertionError: SHUFFLE_NONE should only be used for UnionWork.");
  SparkShuffler shuffler;
  if (edge.isMRShuffle()) {
    shuffler = new SortByShuffler(false, sparkPlan, shuffleSerializer);
  } else if (edge.isShuffleSort()) {
    shuffler = new SortByShuffler(true, sparkPlan, shuffleSerializer);
  } else {
    shuffler = new GroupByShuffler();
  }
  return new ShuffleTran(sparkPlan, shuffler, edge.getNumPartitions(), toCache, name, edge, work);
}
@Explain(displayName = "Shuffle Type") public String getShuffleType() { return prop.getShuffleType(); }
@Explain(displayName = "Shuffle Type") public String getShuffleType() { if (isShuffleNone()) { return "NONE"; } StringBuilder sb = new StringBuilder(); if (isShuffleGroup()) { sb.append("GROUP"); } if (isMRShuffle()) { if (sb.length() != 0) { sb.append(" "); } sb.append("PARTITION-LEVEL SORT"); } if (isShuffleSort()) { if (sb.length() != 0) { sb.append(" "); } sb.append("SORT"); } return sb.toString(); }
mapJoinOp.getConf().getParentToInput().put(pos, parentWork.getName());
SparkEdgeProperty edgeProp = new SparkEdgeProperty(SparkEdgeProperty.SHUFFLE_NONE);
@Explain(displayName = "Number of Partitions") public String getNumPartitions() { return Integer.toString(prop.getNumPartitions()); }
@Explain(displayName = "Shuffle Type") public String getShuffleType() { if (isShuffleNone()) { return "NONE"; } StringBuilder sb = new StringBuilder(); if (isShuffleGroup()) { sb.append("GROUP"); } if (isMRShuffle()) { if (sb.length() != 0) { sb.append(" "); } sb.append("PARTITION-LEVEL SORT"); } if (isShuffleSort()) { if (sb.length() != 0) { sb.append(" "); } sb.append("SORT"); } return sb.toString(); }
mapJoinOp.getConf().getParentToInput().put(pos, parentWork.getName()); SparkEdgeProperty edgeProp = new SparkEdgeProperty(SparkEdgeProperty.SHUFFLE_NONE);
@Explain(displayName = "Number of Partitions") public String getNumPartitions() { return Integer.toString(prop.getNumPartitions()); }
public static SparkEdgeProperty getEdgeProperty(ReduceSinkOperator reduceSink,
    ReduceWork reduceWork) throws SemanticException {
  SparkEdgeProperty edgeProperty = new SparkEdgeProperty(SparkEdgeProperty.SHUFFLE_NONE);
  edgeProperty.setNumPartitions(reduceWork.getNumReduceTasks());
  String sortOrder = Strings.nullToEmpty(reduceSink.getConf().getOrder()).trim();

  // NOTE: as in the HiveConf variant above, the guard conditions are GenSparkUtils
  // helpers reconstructed from context; the original snippet was truncated.
  if (hasGBYOperator(reduceSink)) {
    edgeProperty.setShuffleGroup();
    // A group by needing partition-level sort falls back to MR-style shuffle (HIVE-8542).
    if (!sortOrder.isEmpty() && groupByNeedParLevelOrder(reduceSink)) {
      edgeProperty.setMRShuffle();
    }
  }

  if (reduceWork.getReducer() instanceof JoinOperator) {
    // Reduce-side join: use MR-style shuffle.
    edgeProperty.setMRShuffle();
  }

  // A FileSink writing bucketed files also needs MR-style shuffle.
  FileSinkOperator fso = getChildOperator(reduceWork.getReducer(), FileSinkOperator.class);
  if (fso != null) {
    String bucketCount = fso.getConf().getTableInfo().getProperties().getProperty(
        hive_metastoreConstants.BUCKET_COUNT);
    if (bucketCount != null && Integer.parseInt(bucketCount) > 1) {
      edgeProperty.setMRShuffle();
    }
  }

  // SHUFFLE_SORT should only be used for global order.
  if (edgeProperty.isShuffleNone() && !sortOrder.isEmpty()) {
    if ((reduceSink.getConf().getPartitionCols() == null
        || reduceSink.getConf().getPartitionCols().isEmpty()
        || isSame(reduceSink.getConf().getPartitionCols(), reduceSink.getConf().getKeyCols()))
        && reduceSink.getConf().hasOrderBy()) {
      edgeProperty.setShuffleSort();
    } else {
      edgeProperty.setMRShuffle();
    }
  }

  // Simple distribute-by defaults to group shuffle.
  if (edgeProperty.isShuffleNone()) {
    edgeProperty.setShuffleGroup();
  }

  return edgeProperty;
}
private ShuffleTran generate(SparkPlan sparkPlan, SparkEdgeProperty edge, boolean toCache) {
  Preconditions.checkArgument(!edge.isShuffleNone(),
      "AssertionError: SHUFFLE_NONE should only be used for UnionWork.");
  SparkShuffler shuffler;
  if (edge.isMRShuffle()) {
    shuffler = new SortByShuffler(false, sparkPlan);
  } else if (edge.isShuffleSort()) {
    shuffler = new SortByShuffler(true, sparkPlan);
  } else {
    boolean useSparkGroupBy = jobConf.getBoolean("hive.spark.use.groupby.shuffle", true);
    if (!useSparkGroupBy) {
      LOG.info("hive.spark.use.groupby.shuffle is off. Use repartition shuffle instead.");
    }
    shuffler = useSparkGroupBy ? new GroupByShuffler() : new RepartitionShuffler();
  }
  return new ShuffleTran(sparkPlan, shuffler, edge.getNumPartitions(), toCache);
}
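For reference, a minimal sketch of what a repartition-based shuffler could look like, assuming SparkShuffler exposes the shuffle(input, numPartitions) shape used by ShuffleTran.transform below; this is an illustrative assumption, not necessarily Hive's actual RepartitionShuffler:

import org.apache.hadoop.hive.ql.io.HiveKey;
import org.apache.hadoop.io.BytesWritable;
import org.apache.spark.HashPartitioner;
import org.apache.spark.api.java.JavaPairRDD;

public class RepartitionShuffler implements SparkShuffler {
  @Override
  public JavaPairRDD<HiveKey, BytesWritable> shuffle(
      JavaPairRDD<HiveKey, BytesWritable> input, int numOfPartitions) {
    // Redistribute rows by key hash without the per-key grouping that
    // GroupByShuffler performs. Sketch only: the real implementation may differ.
    return input.partitionBy(new HashPartitioner(Math.max(numOfPartitions, 1)));
  }
}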
@Explain(displayName = "Shuffle Type") public String getShuffleType() { if (isShuffleNone()) { return "NONE"; } StringBuilder sb = new StringBuilder(); if (isShuffleGroup()) { sb.append("GROUP"); } if (isMRShuffle()) { if (sb.length() != 0) { sb.append(" "); } sb.append("PARTITION-LEVEL SORT"); } if (isShuffleSort()) { if (sb.length() != 0) { sb.append(" "); } sb.append("SORT"); } return sb.toString(); }
if (work != bigMapWork) {
  sparkWork.connect(work, bigMapWork,
      new SparkEdgeProperty(SparkEdgeProperty.SHUFFLE_NONE));
}
@Explain(displayName = "Shuffle Type") public String getShuffleType() { return prop.getShuffleType(); }
@Explain(displayName = "Number of Partitions") public String getNumPartitions() { return Integer.toString(prop.getNumPartitions()); }
public static SparkEdgeProperty getEdgeProperty(ReduceSinkOperator reduceSink, ReduceWork reduceWork) throws SemanticException { SparkEdgeProperty edgeProperty = new SparkEdgeProperty(SparkEdgeProperty.SHUFFLE_NONE); edgeProperty.setNumPartitions(reduceWork.getNumReduceTasks()); String sortOrder = Strings.nullToEmpty(reduceSink.getConf().getOrder()).trim(); edgeProperty.setShuffleGroup(); edgeProperty.setMRShuffle(); edgeProperty.setMRShuffle(); hive_metastoreConstants.BUCKET_COUNT); if (bucketCount != null && Integer.valueOf(bucketCount) > 1) { edgeProperty.setMRShuffle(); if (edgeProperty.isShuffleNone() && !sortOrder.isEmpty()) { if ((reduceSink.getConf().getPartitionCols() == null || reduceSink.getConf().getPartitionCols().isEmpty() || isSame(reduceSink.getConf().getPartitionCols(), reduceSink.getConf().getKeyCols())) && reduceSink.getConf().hasOrderBy()) { edgeProperty.setShuffleSort(); } else { edgeProperty.setMRShuffle(); if (edgeProperty.isShuffleNone()) { edgeProperty.setShuffleGroup();
private ShuffleTran generate(SparkPlan sparkPlan, SparkEdgeProperty edge, boolean toCache) {
  Preconditions.checkArgument(!edge.isShuffleNone(),
      "AssertionError: SHUFFLE_NONE should only be used for UnionWork.");
  SparkShuffler shuffler;
  if (edge.isMRShuffle()) {
    shuffler = new SortByShuffler(false);
  } else if (edge.isShuffleSort()) {
    shuffler = new SortByShuffler(true);
  } else {
    shuffler = new GroupByShuffler();
  }
  return new ShuffleTran(sparkPlan, shuffler, edge.getNumPartitions(), toCache);
}
@Override
public JavaPairRDD<HiveKey, BytesWritable> transform(JavaPairRDD<HiveKey, BytesWritable> input) {
  JavaPairRDD<HiveKey, BytesWritable> result = shuffler.shuffle(input, numOfPartitions);
  if (toCache) {
    sparkPlan.addCachedRDDId(result.id());
    result = result.persist(StorageLevel.MEMORY_AND_DISK());
  }
  return result.setName(this.name + " (" + edge.getShuffleType() + ", " + numOfPartitions
      + (toCache ? ", cached)" : ")"));
}
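A brief design note on the caching path above: persisting with MEMORY_AND_DISK lets shuffle output consumed by multiple downstream works spill to disk rather than be recomputed under memory pressure, and registering the RDD id via sparkPlan.addCachedRDDId presumably lets the plan unpersist it once all consumers finish. The name assembled at the end (work name, shuffle type, partition count, cache marker) is what surfaces for the RDD in the Spark UI.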