private SparkTran generateParentTran(SparkPlan sparkPlan, SparkWork sparkWork,
    BaseWork work) throws Exception {
  // If this work is a clone, reuse the parent tran already built for its original
  // so that the cached shuffle output is shared instead of recomputed.
  if (cloneToWork.containsKey(work)) {
    BaseWork originalWork = cloneToWork.get(work);
    if (workToParentWorkTranMap.containsKey(originalWork)) {
      return workToParentWorkTranMap.get(originalWork);
    }
  }

  SparkTran result;
  if (work instanceof MapWork) {
    result = generateMapInput(sparkPlan, (MapWork) work);
    sparkPlan.addTran(result);
  } else if (work instanceof ReduceWork) {
    boolean toCache = cloneToWork.containsKey(work);
    List<BaseWork> parentWorks = sparkWork.getParents(work);
    SparkEdgeProperty sparkEdgeProperty = sparkWork.getEdgeProperty(parentWorks.get(0), work);
    result = generate(sparkPlan, sparkEdgeProperty, toCache, work.getName(), work);
    sparkPlan.addTran(result);
    for (BaseWork parentWork : parentWorks) {
      sparkPlan.connect(workToTranMap.get(parentWork), result);
    }
  } else {
    throw new IllegalStateException("AssertionError: expected either MapWork or ReduceWork, "
        + "but found " + work.getClass().getName());
  }

  // Register the tran under the original work so later clones can find it.
  if (cloneToWork.containsKey(work)) {
    workToParentWorkTranMap.put(cloneToWork.get(work), result);
  }
  return result;
}
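The reuse above is what lets several clones of one ReduceWork share a single cached shuffle output. A minimal standalone sketch of the same memoization, with hypothetical String nodes standing in for BaseWork/SparkTran:

import java.util.HashMap;
import java.util.Map;

final class SharedParentDemo {
  // clone -> original, mirroring cloneToWork
  private final Map<String, String> cloneToWork = new HashMap<>();
  // original -> tran already built for it, mirroring workToParentWorkTranMap
  private final Map<String, String> workToParentWorkTranMap = new HashMap<>();

  String generateParent(String work) {
    String original = cloneToWork.get(work);
    if (original != null && workToParentWorkTranMap.containsKey(original)) {
      return workToParentWorkTranMap.get(original);   // reuse the shared parent
    }
    String result = "tran:" + work;                    // stand-in for the real construction
    if (original != null) {
      workToParentWorkTranMap.put(original, result);   // first clone registers it
    }
    return result;
  }

  public static void main(String[] args) {
    SharedParentDemo d = new SharedParentDemo();
    d.cloneToWork.put("reduce1-clone1", "reduce1");
    d.cloneToWork.put("reduce1-clone2", "reduce1");
    System.out.println(d.generateParent("reduce1-clone1")); // builds tran:reduce1-clone1
    System.out.println(d.generateParent("reduce1-clone2")); // reuses it
  }
}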
Map<SparkTran, JavaPairRDD<HiveKey, BytesWritable>> tranToOutputRDDMap =
    new HashMap<SparkTran, JavaPairRDD<HiveKey, BytesWritable>>();
for (SparkTran tran : getAllTrans()) {
  JavaPairRDD<HiveKey, BytesWritable> rdd = null;
  List<SparkTran> parents = getParents(tran);
  if (parents.size() == 0) {
    // Root tran (a MapInput): no upstream RDD to feed in.
    rdd = tran.transform(null);
  } else {
    // Union the parents' outputs before applying this tran.
    for (SparkTran parent : parents) {
      JavaPairRDD<HiveKey, BytesWritable> prevRDD = tranToOutputRDDMap.get(parent);
      rdd = (rdd == null) ? prevRDD : rdd.union(prevRDD);
    }
    rdd = tran.transform(rdd);
  }
  tranToOutputRDDMap.put(tran, rdd);
}
logSparkPlan();
private void visit(SparkTran child, Set<SparkTran> seen, List<SparkTran> result) {
  if (seen.contains(child)) {
    // don't visit multiple times
    return;
  }
  seen.add(child);
  for (SparkTran parent : getParents(child)) {
    if (!seen.contains(parent)) {
      visit(parent, seen, result);
    }
  }
  result.add(child);
}
private void collectLeafTrans(SparkTran leaf, List<SparkTran> reduceTrans) {
  List<SparkTran> parents = getParents(leaf);
  if (parents.size() > 0) {
    SparkTran nextLeaf = null;
    for (SparkTran leafTran : parents) {
      if (leafTran instanceof ReduceTran) {
        reduceTrans.add(leafTran);
      } else if (getParents(leafTran).size() > 0) {
        nextLeaf = leafTran;
      }
    }
    if (nextLeaf != null) {
      collectLeafTrans(nextLeaf, reduceTrans);
    }
  }
}
private void logLeafTran(List<SparkTran> parent, StringBuilder sparkPlan) {
  sparkPlan.append(" <-- ");
  boolean isFirst = true;
  for (SparkTran sparkTran : parent) {
    SparkTran leaf = getParents(sparkTran).get(0);
    sparkPlan.append(isFirst ? "( " + leaf.getName() : "," + leaf.getName());
    if (leaf instanceof ShuffleTran) {
      logShuffleTranStatus((ShuffleTran) leaf, sparkPlan);
    } else {
      logCacheStatus(leaf, sparkPlan);
    }
    isFirst = false;
  }
  sparkPlan.append(" ) ");
}
// Execute the generated plan.
JavaPairRDD<HiveKey, BytesWritable> finalRDD = plan.generateGraph();
// Submit through an async action: it is the only way to get the job id now.
JavaFutureAction<Void> future = finalRDD.foreachAsync(HiveVoidFunction.getInstance());
// A single foreach action triggers exactly one Spark job.
int jobId = future.jobIds().get(0);
LocalSparkJobStatus sparkJobStatus = new LocalSparkJobStatus(
    sc, jobId, jobMetricsListener, sparkCounters, plan.getCachedRDDIds(), future);
return new LocalSparkJobRef(Integer.toString(jobId), hiveConf, sparkJobStatus, sc);
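The foreachAsync trick above is worth isolating: the async action is the only way to learn the Spark job id at submission time, and a single foreach triggers exactly one job. A minimal local-mode sketch with plain integers instead of HiveKey/BytesWritable:

import java.util.Arrays;
import org.apache.spark.api.java.JavaFutureAction;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public final class AsyncSubmitDemo {
  public static void main(String[] args) throws Exception {
    JavaSparkContext sc = new JavaSparkContext("local", "async-demo");
    JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3));
    JavaFutureAction<Void> future = rdd.foreachAsync(x -> { });
    int jobId = future.jobIds().get(0);  // available once the job is submitted
    future.get();                         // block until the job finishes
    System.out.println("job " + jobId + " done");
    sc.close();
  }
}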
public SparkPlan generate(SparkWork sparkWork) throws Exception {
  perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.SPARK_BUILD_PLAN);
  SparkPlan sparkPlan = new SparkPlan();
  cloneToWork = sparkWork.getCloneToWork();
  workToTranMap.clear();
  workToParentWorkTranMap.clear();

  try {
    for (BaseWork work : sparkWork.getAllWork()) {
      perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.SPARK_CREATE_TRAN + work.getName());
      SparkTran tran = generate(work, sparkWork);
      SparkTran parentTran = generateParentTran(sparkPlan, sparkWork, work);
      sparkPlan.addTran(tran);
      sparkPlan.connect(parentTran, tran);
      workToTranMap.put(work, tran);
      perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.SPARK_CREATE_TRAN + work.getName());
    }
  } finally {
    // Clear all ThreadLocal-cached MapWork/ReduceWork after plan generation,
    // as this may be executed in a pool thread.
    Utilities.clearWorkMap(jobConf);
  }

  perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.SPARK_BUILD_PLAN);
  return sparkPlan;
}
private void getSparkPlan(SparkTran tran, StringBuilder sparkPlan) {
  List<SparkTran> parents = getParents(tran);
  List<SparkTran> nextLeaf = new ArrayList<SparkTran>();
  if (parents.size() > 0) {
    sparkPlan.append(" <-- ");
    boolean isFirst = true;
    for (SparkTran leaf : parents) {
      sparkPlan.append(isFirst ? "( " + leaf.getName() : "," + leaf.getName());
      if (leaf instanceof ShuffleTran) {
        logShuffleTranStatus((ShuffleTran) leaf, sparkPlan);
      } else {
        logCacheStatus(leaf, sparkPlan);
      }
      isFirst = false;
      // remember parents that still have ancestors to walk
      if (getParents(leaf).size() > 0 && !(leaf instanceof ReduceTran)) {
        nextLeaf.add(leaf);
      }
    }
    sparkPlan.append(" ) ");
    if (nextLeaf.size() > 1) {
      logLeafTran(nextLeaf, sparkPlan);
    } else if (nextLeaf.size() != 0) {
      getSparkPlan(nextLeaf.get(0), sparkPlan);
    }
  }
}
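For orientation, the shape of the string these helpers assemble; the sample below is inferred from the append calls, not copied from Hive's logs:

public final class PlanStringDemo {
  public static void main(String[] args) {
    // a reduce with two map parents, rendered the way getSparkPlan appends it
    StringBuilder plan = new StringBuilder("Reduce 1");
    plan.append(" <-- ").append("( Map 1").append(",Map 2").append(" ) ");
    System.out.println(plan); // Reduce 1 <-- ( Map 1,Map 2 )
  }
}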
@Override
public JavaPairRDD<HiveKey, BytesWritable> transform(JavaPairRDD<HiveKey, BytesWritable> input) {
  JavaPairRDD<HiveKey, BytesWritable> result = shuffler.shuffle(input, numOfPartitions);
  if (toCache) {
    sparkPlan.addCachedRDDId(result.id());
    result = result.persist(StorageLevel.MEMORY_AND_DISK());
  }
  return result;
}
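The persist-and-remember-the-id pattern above, reduced to plain Spark; cachedIds here is a stand-in for SparkPlan's cached-RDD bookkeeping:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.storage.StorageLevel;

public final class PersistDemo {
  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext("local", "persist-demo");
    List<Integer> cachedIds = new ArrayList<>();

    JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3));
    cachedIds.add(rdd.id());                         // record the id, like addCachedRDDId
    rdd = rdd.persist(StorageLevel.MEMORY_AND_DISK());

    System.out.println(rdd.count());                 // first action materializes the cache
    System.out.println(rdd.count());                 // second action reads the cached blocks
    System.out.println("cached ids: " + cachedIds);  // handed to the monitor for later unpersist
    sc.close();
  }
}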
Map<SparkTran, JavaPairRDD<HiveKey, BytesWritable>> tranToOutputRDDMap =
    new HashMap<SparkTran, JavaPairRDD<HiveKey, BytesWritable>>();
for (SparkTran tran : getAllTrans()) {
  JavaPairRDD<HiveKey, BytesWritable> rdd = null;
  List<SparkTran> parents = getParents(tran);
  if (parents.size() == 0) {
    // Label the root so its stages are recognizable in the Spark UI.
    sc.setCallSite(CallSite.apply(tran.getName(), getLongFormCallSite(tran)));
    rdd = tran.transform(null);
  } else {
    for (SparkTran parent : parents) {
      JavaPairRDD<HiveKey, BytesWritable> prevRDD = tranToOutputRDDMap.get(parent);
      rdd = (rdd == null) ? prevRDD : rdd.union(prevRDD);
    }
    sc.setCallSite(CallSite.apply(tran.getName(), getLongFormCallSite(tran)));
    rdd = tran.transform(rdd);
  }
  tranToOutputRDDMap.put(tran, rdd);
}
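What the call-site labelling buys: RDDs created while a call site is set carry the tran's name, so stages show up in the Spark UI as Map 1, Reduce 2, and so on rather than anonymous mapPartitions lines. A local-mode sketch using the String overload the Java API exposes (CallSite.apply above builds the short/long pair for the Scala API):

import java.util.Arrays;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public final class CallSiteDemo {
  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext("local", "callsite-demo");
    sc.setCallSite("Map 1");                    // label the RDDs created next
    JavaRDD<Integer> mapped = sc.parallelize(Arrays.asList(1, 2, 3)).map(x -> x + 1);
    sc.clearCallSite();
    System.out.println(mapped.toDebugString()); // lineage is annotated with "Map 1"
    sc.close();
  }
}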
/**
 * This method returns a topologically sorted list of SparkTran.
 */
private List<SparkTran> getAllTrans() {
  List<SparkTran> result = new LinkedList<SparkTran>();
  Set<SparkTran> seen = new HashSet<SparkTran>();
  for (SparkTran leaf : leafTrans) {
    // make sure all leaves are visited at least once
    visit(leaf, seen, result);
  }
  return result;
}
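visit/getAllTrans is a plain post-order DFS from the leaves: because edges point from a child to its parents, a tran is appended only after everything upstream of it, which is exactly the order generateGraph consumes. A self-contained sketch with strings for nodes:

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

public final class TopoDemo {
  static final Map<String, List<String>> PARENTS = Map.of(
      "map1", List.of(),
      "map2", List.of(),
      "reduce1", List.of("map1", "map2"),
      "reduce2", List.of("reduce1"));

  static void visit(String node, Set<String> seen, List<String> out) {
    if (!seen.add(node)) {
      return;                        // already visited
    }
    for (String parent : PARENTS.get(node)) {
      visit(parent, seen, out);
    }
    out.add(node);                   // emitted only after all of its parents
  }

  public static void main(String[] args) {
    List<String> order = new ArrayList<>();
    visit("reduce2", new HashSet<>(), order);  // start from the leaf
    System.out.println(order);       // [map1, map2, reduce1, reduce2]
  }
}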
    null);
SparkPlan sparkPlan = sparkPlanGenerator.generate(sparkTask.getWork());
RDD<Tuple2<HiveKey, BytesWritable>> reducerRdd = sparkPlan.generateGraph().rdd();
public SparkPlan generate(SparkWork sparkWork) throws Exception {
  perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.SPARK_BUILD_PLAN);
  SparkPlan sparkPlan = new SparkPlan(this.jobConf, this.sc.sc());
  cloneToWork = sparkWork.getCloneToWork();
  workToTranMap.clear();
  workToParentWorkTranMap.clear();

  try {
    for (BaseWork work : sparkWork.getAllWork()) {
      // Run the SparkDynamicPartitionPruner here instead of inside the InputFormat,
      // so pruning doesn't have to run when creating a RecordReader.
      runDynamicPartitionPruner(work);
      perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.SPARK_CREATE_TRAN + work.getName());
      SparkTran tran = generate(work, sparkWork);
      SparkTran parentTran = generateParentTran(sparkPlan, sparkWork, work);
      sparkPlan.addTran(tran);
      sparkPlan.connect(parentTran, tran);
      workToTranMap.put(work, tran);
      perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.SPARK_CREATE_TRAN + work.getName());
    }
  } finally {
    // Clear all ThreadLocal-cached MapWork/ReduceWork after plan generation,
    // as this may be executed in a pool thread.
    Utilities.clearWorkMap(jobConf);
  }

  perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.SPARK_BUILD_PLAN);
  return sparkPlan;
}
@Override
public JavaPairRDD<WritableComparable, Writable> transform(
    JavaPairRDD<WritableComparable, Writable> input) {
  Preconditions.checkArgument(input == null,
      "AssertionError: MapInput doesn't take any input");
  JavaPairRDD<WritableComparable, Writable> result;
  if (toCache) {
    result = hadoopRDD.mapToPair(new CopyFunction());
    sparkPlan.addCachedRDDId(result.id());
    result = result.persist(StorageLevel.MEMORY_AND_DISK());
  } else {
    result = hadoopRDD;
  }
  result.setName(this.name);
  return result;
}
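The mapToPair(new CopyFunction()) on the cached path exists because Hadoop record readers reuse their Writable instances; persisting hadoopRDD directly would cache many references to one mutated object. A sketch of the same defensive copy with plain Spark types (CopyDemo and its COPY function are illustrative, not Hive's CopyFunction):

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.storage.StorageLevel;
import scala.Tuple2;

public final class CopyDemo {
  // deep-copies the reused Writables so each cached record owns its bytes
  static final PairFunction<Tuple2<Text, BytesWritable>, Text, BytesWritable> COPY =
      tuple -> new Tuple2<Text, BytesWritable>(
          new Text(tuple._1()),
          new BytesWritable(tuple._2().copyBytes()));

  static JavaPairRDD<Text, BytesWritable> cacheSafely(JavaPairRDD<Text, BytesWritable> hadoopRDD) {
    // copy first, then persist; persisting hadoopRDD directly would cache aliases
    return hadoopRDD.mapToPair(COPY).persist(StorageLevel.MEMORY_AND_DISK());
  }
}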
// Execute the generated plan.
JavaPairRDD<HiveKey, BytesWritable> finalRDD = plan.generateGraph();
// Submit through an async action so the monitor gets a handle on the job.
JavaFutureAction<Void> future = finalRDD.foreachAsync(HiveVoidFunction.getInstance());
jc.monitor(future, sparkCounters, plan.getCachedRDDIds());
return null;
public SparkPlan generate(SparkWork sparkWork) throws Exception {
  perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.SPARK_BUILD_PLAN);
  SparkPlan sparkPlan = new SparkPlan();
  cloneToWork = sparkWork.getCloneToWork();
  workToTranMap.clear();
  workToParentWorkTranMap.clear();

  try {
    for (BaseWork work : sparkWork.getAllWork()) {
      perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.SPARK_CREATE_TRAN + work.getName());
      SparkTran tran = generate(work);
      SparkTran parentTran = generateParentTran(sparkPlan, sparkWork, work);
      sparkPlan.addTran(tran);
      sparkPlan.connect(parentTran, tran);
      workToTranMap.put(work, tran);
      perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.SPARK_CREATE_TRAN + work.getName());
    }
  } finally {
    // Clear all ThreadLocal-cached MapWork/ReduceWork after plan generation,
    // as this may be executed in a pool thread.
    Utilities.clearWorkMap();
  }

  perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.SPARK_BUILD_PLAN);
  return sparkPlan;
}