@Override public DAG<IRVertex, IREdge> apply(final DAG<IRVertex, IREdge> dag) { // On every vertex, if ResourceSlotProperty is not set, put it as true. dag.getVertices().stream() .filter(v -> !v.getPropertyValue(ResourceSlotProperty.class).isPresent()) .forEach(v -> v.setProperty(ResourceSlotProperty.of(true))); return dag; } }
@Override public DAG<IRVertex, IREdge> apply(final DAG<IRVertex, IREdge> dag) { // On every vertex, if ResourceSlotProperty is not set, put it as true. dag.getVertices().stream() .filter(v -> !v.getPropertyValue(ResourceSlotProperty.class).isPresent()) .forEach(v -> v.setProperty(ResourceSlotProperty.of(true))); return dag; } }
@Override public DAG<IRVertex, IREdge> apply(final DAG<IRVertex, IREdge> dag) { // On every vertex, if ResourceLocalityProperty is not set, put it as true. dag.getVertices().stream() .filter(v -> !v.getPropertyValue(ResourceLocalityProperty.class).isPresent()) .forEach(v -> v.setProperty(ResourceLocalityProperty.of(true))); return dag; } }
@Override public DAG<IRVertex, IREdge> apply(final DAG<IRVertex, IREdge> dag) { // On every vertex, if ResourceLocalityProperty is not set, put it as true. dag.getVertices().stream() .filter(v -> !v.getPropertyValue(ResourceLocalityProperty.class).isPresent()) .forEach(v -> v.setProperty(ResourceLocalityProperty.of(true))); return dag; } }
@Override public DAG<IRVertex, IREdge> apply(final DAG<IRVertex, IREdge> dag) { // Speculative execution policy. final double fractionToWaitFor = 0.00000001; // Aggressive final double medianTimeMultiplier = 1.00000001; // Aggressive // Apply the policy to ALL vertices dag.getVertices().forEach(vertex -> vertex.setProperty(ClonedSchedulingProperty.of( new ClonedSchedulingProperty.CloneConf(fractionToWaitFor, medianTimeMultiplier)))); return dag; } }
@Override public DAG<IRVertex, IREdge> apply(final DAG<IRVertex, IREdge> dag) { // Speculative execution policy. final double fractionToWaitFor = 0.00000001; // Aggressive final double medianTimeMultiplier = 1.00000001; // Aggressive // Apply the policy to ALL vertices dag.getVertices().forEach(vertex -> vertex.setProperty(ClonedSchedulingProperty.of( new ClonedSchedulingProperty.CloneConf(fractionToWaitFor, medianTimeMultiplier)))); return dag; } }
@Override public DAG<IRVertex, IREdge> apply(final DAG<IRVertex, IREdge> dag) { dag.getVertices().stream() .filter(vertex -> dag.getIncomingEdgesOf(vertex.getId()) .stream() // TODO #198: Handle Un-cloneable Beam Sink Operators // only shuffle receivers (for now... as particular Beam sink operators fail when cloned) .anyMatch(edge -> edge.getPropertyValue(CommunicationPatternProperty.class) .orElseThrow(() -> new IllegalStateException()) .equals(CommunicationPatternProperty.Value.Shuffle)) ) .forEach(vertex -> vertex.setProperty( ClonedSchedulingProperty.of(new ClonedSchedulingProperty.CloneConf()))); // clone upfront, always return dag; } }
@Override
public DAG<IRVertex, IREdge> apply(final DAG<IRVertex, IREdge> dag) {
  if (bandwidthSpecificationString.isEmpty()) {
    // No bandwidth information available: assign an empty (no-preference) site map everywhere.
    dag.topologicalDo(vertex -> vertex.setProperty(ResourceSiteProperty.of(EMPTY_MAP)));
  } else {
    // Parse the JSON bandwidth spec and distribute node shares accordingly.
    final BandwidthSpecification spec = BandwidthSpecification.fromJsonString(bandwidthSpecificationString);
    assignNodeShares(dag, spec);
  }
  return dag;
}
/** * Recursively synchronize parallelism for vertices connected by one-to-one edges. * @param dag the original DAG. * @param vertex vertex to observe and update. * @param parallelism the parallelism of the most recently updated descendant. * @return the max value of parallelism among those observed. */ static Integer recursivelySynchronizeO2OParallelism(final DAG<IRVertex, IREdge> dag, final IRVertex vertex, final Integer parallelism) { final List<IREdge> inEdges = dag.getIncomingEdgesOf(vertex); final Integer ancestorParallelism = inEdges.stream() .filter(edge -> CommunicationPatternProperty.Value.OneToOne .equals(edge.getPropertyValue(CommunicationPatternProperty.class).get())) .map(IREdge::getSrc) .mapToInt(inVertex -> recursivelySynchronizeO2OParallelism(dag, inVertex, parallelism)) .max().orElse(1); final Integer maxParallelism = ancestorParallelism > parallelism ? ancestorParallelism : parallelism; final Integer myParallelism = vertex.getPropertyValue(ParallelismProperty.class).get(); // update the vertex with the max value. if (maxParallelism > myParallelism) { vertex.setProperty(ParallelismProperty.of(maxParallelism)); return maxParallelism; } return myParallelism; }
@Override public DAG<IRVertex, IREdge> apply(final DAG<IRVertex, IREdge> dag) { dag.getVertices().stream() .filter(vertex -> dag.getIncomingEdgesOf(vertex.getId()) .stream() // TODO #198: Handle Un-cloneable Beam Sink Operators // only shuffle receivers (for now... as particular Beam sink operators fail when cloned) .anyMatch(edge -> edge.getPropertyValue(CommunicationPatternProperty.class) .orElseThrow(() -> new IllegalStateException()) .equals(CommunicationPatternProperty.Value.Shuffle)) ) .forEach(vertex -> vertex.setProperty( ClonedSchedulingProperty.of(new ClonedSchedulingProperty.CloneConf()))); // clone upfront, always return dag; } }
@Override
public DAG<IRVertex, IREdge> apply(final DAG<IRVertex, IREdge> dag) {
  // With no bandwidth spec we cannot rank sites, so pin an empty share map on every vertex.
  if (bandwidthSpecificationString.isEmpty()) {
    dag.topologicalDo(irVertex -> irVertex.setProperty(ResourceSiteProperty.of(EMPTY_MAP)));
    return dag;
  }
  // Otherwise, compute per-node shares from the parsed bandwidth specification.
  assignNodeShares(dag, BandwidthSpecification.fromJsonString(bandwidthSpecificationString));
  return dag;
}
/** * Recursively synchronize parallelism for vertices connected by one-to-one edges. * @param dag the original DAG. * @param vertex vertex to observe and update. * @param parallelism the parallelism of the most recently updated descendant. * @return the max value of parallelism among those observed. */ static Integer recursivelySynchronizeO2OParallelism(final DAG<IRVertex, IREdge> dag, final IRVertex vertex, final Integer parallelism) { final List<IREdge> inEdges = dag.getIncomingEdgesOf(vertex); final Integer ancestorParallelism = inEdges.stream() .filter(edge -> CommunicationPatternProperty.Value.OneToOne .equals(edge.getPropertyValue(CommunicationPatternProperty.class).get())) .map(IREdge::getSrc) .mapToInt(inVertex -> recursivelySynchronizeO2OParallelism(dag, inVertex, parallelism)) .max().orElse(1); final Integer maxParallelism = ancestorParallelism > parallelism ? ancestorParallelism : parallelism; final Integer myParallelism = vertex.getPropertyValue(ParallelismProperty.class).get(); // update the vertex with the max value. if (maxParallelism > myParallelism) { vertex.setProperty(ParallelismProperty.of(maxParallelism)); return maxParallelism; } return myParallelism; }
@Override
public DAG<IRVertex, IREdge> apply(final DAG<IRVertex, IREdge> dag) {
  // For every metric-collecting edge, mark the destination vertex — and all of the
  // destination's descendants — as handling potentially skewed data.
  for (final IRVertex vertex : dag.getVertices()) {
    dag.getOutgoingEdgesOf(vertex).stream()
        .filter(edge -> edge.getPropertyValue(MetricCollectionProperty.class).isPresent())
        .forEach(skewEdge -> {
          final IRVertex dst = skewEdge.getDst();
          dst.setProperty(ResourceSkewedDataProperty.of(true));
          dag.getDescendants(dst.getId()).forEach(descendant ->
              descendant.getExecutionProperties().put(ResourceSkewedDataProperty.of(true)));
        });
  }
  return dag;
}
}
vertex.setProperty(ParallelismProperty.of(
    sourceVertex.getReadables(desiredSourceParallelism).size()));
// NOTE(review): the assignment below immediately overwrites the ParallelismProperty
// set just above (readable-count vs. the `parallelism` value) — confirm which of
// the two is intended to win; the enclosing method is not visible in this excerpt.
vertex.setProperty(ParallelismProperty.of(parallelism));
// Test fixture: configure parallelism and container-type priorities for v1–v3.
v1.setProperty(ParallelismProperty.of(3));
v1.setProperty(ResourcePriorityProperty.of(ResourcePriorityProperty.COMPUTE));
dagBuilder.addVertex(v1);
v2.setProperty(ParallelismProperty.of(2));
if (sameContainerType) {
  v2.setProperty(ResourcePriorityProperty.of(ResourcePriorityProperty.COMPUTE));
} else {
  v2.setProperty(ResourcePriorityProperty.of(ResourcePriorityProperty.TRANSIENT));
  // NOTE(review): no closing brace for this else-branch appears in this fragment;
  // presumably it closes (and v2/v3 are added to the builder) outside the excerpt — confirm.
  v3.setProperty(ParallelismProperty.of(2));
  if (sameContainerType) {
    v3.setProperty(ResourcePriorityProperty.of(ResourcePriorityProperty.COMPUTE));
  } else {
    v3.setProperty(ResourcePriorityProperty.of(ResourcePriorityProperty.TRANSIENT));
@Override
public DAG<IRVertex, IREdge> apply(final DAG<IRVertex, IREdge> dag) {
  // Step 1: vertices wrapping a MetricCollectTransform trigger the data-skew runtime pass.
  dag.getVertices().stream()
      .filter(v -> v instanceof OperatorVertex
          && ((OperatorVertex) v).getTransform() instanceof MetricCollectTransform)
      .forEach(v -> v.setProperty(DynamicOptimizationProperty
          .of(DynamicOptimizationProperty.Value.DataSkewRuntimePass)));

  // Step 2: children of metric-collecting vertices — and their descendants — are marked
  // as receivers of potentially skewed data, unless they already carry the property.
  dag.getVertices().stream()
      .filter(v -> hasParentWithMetricCollectTransform(dag, v)
          && !v.getExecutionProperties().containsKey(ResourceSkewedDataProperty.class))
      .forEach(child -> {
        child.getExecutionProperties().put(ResourceSkewedDataProperty.of(true));
        dag.getDescendants(child.getId()).forEach(descendant ->
            descendant.getExecutionProperties().put(ResourceSkewedDataProperty.of(true)));
      });
  return dag;
}
}
// Test fixture: five vertices, all COMPUTE priority, with alternating parallelism of 3/2.
v1.setProperty(ParallelismProperty.of(3));
v1.setProperty(ResourcePriorityProperty.of(ResourcePriorityProperty.COMPUTE));
dagBuilder.addVertex(v1);
v2.setProperty(ParallelismProperty.of(2));
v2.setProperty(ResourcePriorityProperty.of(ResourcePriorityProperty.COMPUTE));
dagBuilder.addVertex(v2);
v3.setProperty(ParallelismProperty.of(3));
v3.setProperty(ResourcePriorityProperty.of(ResourcePriorityProperty.COMPUTE));
dagBuilder.addVertex(v3);
v4.setProperty(ParallelismProperty.of(2));
v4.setProperty(ResourcePriorityProperty.of(ResourcePriorityProperty.COMPUTE));
dagBuilder.addVertex(v4);
v5.setProperty(ParallelismProperty.of(2));
v5.setProperty(ResourcePriorityProperty.of(ResourcePriorityProperty.COMPUTE));
dagBuilder.addVertex(v5);
/**
 * Creates a JavaRDD of strings by reading a text file through Spark.
 *
 * @param sparkContext the spark context containing configurations.
 * @param minPartitions the minimum number of partitions.
 * @param inputPath the path of the input text file.
 * @return the new JavaRDD object
 */
public static JavaRDD<String> of(final SparkContext sparkContext,
                                 final int minPartitions,
                                 final String inputPath) {
  // Let Spark determine the actual partitioning, then mirror it as the vertex parallelism.
  final org.apache.spark.rdd.RDD<String> textRdd = sparkContext.textFile(inputPath, minPartitions);
  final int numPartitions = textRdd.getNumPartitions();

  final IRVertex textSourceVertex =
      new SparkTextFileBoundedSourceVertex(sparkContext, inputPath, numPartitions);
  textSourceVertex.setProperty(ParallelismProperty.of(numPartitions));

  // Build a single-vertex IR DAG around the text source.
  final DAGBuilder<IRVertex, IREdge> builder = new DAGBuilder<>();
  builder.addVertex(textSourceVertex);
  return new JavaRDD<>(textRdd, sparkContext, builder.buildWithoutSourceSinkCheck(), textSourceVertex);
}
/**
 * Creates a JavaRDD backed by a Spark Dataset.
 *
 * @param sparkSession spark session containing configurations.
 * @param dataset dataset to read initial data from.
 * @param <T> type of the resulting object.
 * @return the new JavaRDD object.
 */
public static <T> JavaRDD<T> of(final SparkSession sparkSession, final Dataset<T> dataset) {
  // The dataset's underlying RDD dictates the source vertex's parallelism.
  final org.apache.spark.rdd.RDD<T> sparkRDD = dataset.sparkRDD();

  final IRVertex sparkBoundedSourceVertex = new SparkDatasetBoundedSourceVertex<>(sparkSession, dataset);
  sparkBoundedSourceVertex.setProperty(ParallelismProperty.of(sparkRDD.getNumPartitions()));

  // Build a single-vertex IR DAG around the dataset source.
  final DAGBuilder<IRVertex, IREdge> builder = new DAGBuilder<>();
  builder.addVertex(sparkBoundedSourceVertex);
  return new JavaRDD<>(
      sparkRDD,
      sparkSession.sparkContext(),
      builder.buildWithoutSourceSinkCheck(),
      sparkBoundedSourceVertex);
}
/**
 * Creates a JavaRDD from an in-memory iterable, with the given parallelism.
 *
 * @param sparkContext spark context containing configurations.
 * @param initialData initial data.
 * @param parallelism parallelism information.
 * @param <T> type of the resulting object.
 * @return the new JavaRDD object.
 */
public static <T> JavaRDD<T> of(final SparkContext sparkContext,
                                final Iterable<T> initialData,
                                final Integer parallelism) {
  // Wrap the iterable in an in-memory source vertex carrying the requested parallelism.
  final IRVertex initializedSourceVertex = new InMemorySourceVertex<>(initialData);
  initializedSourceVertex.setProperty(ParallelismProperty.of(parallelism));

  // Build a single-vertex IR DAG, then wrap it in a Nemo RDD (no Spark RDD backs this source).
  final DAGBuilder<IRVertex, IREdge> builder = new DAGBuilder<>();
  builder.addVertex(initializedSourceVertex);
  final RDD<T> nemoRdd = new RDD<>(sparkContext,
      builder.buildWithoutSourceSinkCheck(),
      initializedSourceVertex,
      Option.empty(),
      ClassTag$.MODULE$.apply(Object.class));
  return new JavaRDD<>(nemoRdd);
}