/**
 * Get the expected number of data read according to the communication pattern of the edge and
 * the parallelism of destination vertex.
 *
 * @return the expected number of data read.
 */
private int getExpectedRead() {
  // A duplicate edge group shares reads among its members: multiply by the group size.
  // (Optional.map/orElse replaces the isPresent()/get() anti-pattern.)
  final int duplicatedDataMultiplier =
      runtimeEdge.getPropertyValue(DuplicateEdgeGroupProperty.class)
          .map(DuplicateEdgeGroupPropertyValue::getGroupSize)
          .orElse(1);
  // One-to-one edges are read exactly once per block; for any other pattern every
  // destination task reads the block, i.e. the destination's parallelism.
  final int readForABlock = CommunicationPatternProperty.Value.OneToOne.equals(
      runtimeEdge.getPropertyValue(CommunicationPatternProperty.class).orElseThrow(
          () -> new RuntimeException("No communication pattern on this edge.")))
      ? 1
      : dstIrVertex.getPropertyValue(ParallelismProperty.class).orElseThrow(
          () -> new RuntimeException("No parallelism property on the destination vertex."));
  return readForABlock * duplicatedDataMultiplier;
}
}
@Override public DAG<IRVertex, IREdge> apply(final DAG<IRVertex, IREdge> dag) { dag.getVertices().forEach(vertex -> { // Find the merger vertex inserted by reshaping pass. if (dag.getIncomingEdgesOf(vertex).stream().anyMatch(irEdge -> CommunicationPatternProperty.Value.Shuffle .equals(irEdge.getPropertyValue(CommunicationPatternProperty.class).get()))) { dag.getIncomingEdgesOf(vertex).forEach(edgeToMerger -> { if (CommunicationPatternProperty.Value.Shuffle .equals(edgeToMerger.getPropertyValue(CommunicationPatternProperty.class).get())) { // Pass data through memory to the merger vertex. edgeToMerger.setPropertyPermanently(DataStoreProperty.of(DataStoreProperty.Value.SerializedMemoryStore)); } }); dag.getOutgoingEdgesOf(vertex).forEach(edgeFromMerger -> // Merge the input data and write it immediately to the remote disk. edgeFromMerger.setPropertyPermanently(DataStoreProperty.of(DataStoreProperty.Value.LocalFileStore))); } }); return dag; } }
/**
 * Chooses a data store per incoming edge: edges crossing transient/reserved resource
 * boundaries use the local file store, one-to-one edges stay in memory, and everything else
 * falls back to the local file store.
 *
 * @param dag the original DAG.
 * @return the annotated DAG.
 */
@Override
public DAG<IRVertex, IREdge> apply(final DAG<IRVertex, IREdge> dag) {
  // forEach on an empty edge list is a no-op, so no explicit isEmpty() guard is needed.
  dag.getVertices().forEach(vertex ->
      dag.getIncomingEdgesOf(vertex).forEach(edge -> {
        if (fromTransientToReserved(edge) || fromReservedToTransient(edge)) {
          // Crossing resource boundaries: persist to a local file.
          edge.setPropertyPermanently(DataStoreProperty.of(DataStoreProperty.Value.LocalFileStore));
        } else if (CommunicationPatternProperty.Value.OneToOne.equals(
            edge.getPropertyValue(CommunicationPatternProperty.class)
                .orElseThrow(() -> new IllegalStateException("No communication pattern on this edge.")))) {
          // One-to-one edges can stay in memory.
          edge.setPropertyPermanently(DataStoreProperty.of(DataStoreProperty.Value.MemoryStore));
        } else {
          edge.setPropertyPermanently(DataStoreProperty.of(DataStoreProperty.Value.LocalFileStore));
        }
      }));
  return dag;
}
/**
 * Chooses a data store per incoming edge: edges crossing transient/reserved resource
 * boundaries use the local file store, one-to-one edges stay in memory, and everything else
 * falls back to the local file store.
 *
 * @param dag the original DAG.
 * @return the annotated DAG.
 */
@Override
public DAG<IRVertex, IREdge> apply(final DAG<IRVertex, IREdge> dag) {
  // forEach on an empty edge list is a no-op, so no explicit isEmpty() guard is needed.
  dag.getVertices().forEach(vertex ->
      dag.getIncomingEdgesOf(vertex).forEach(edge -> {
        if (fromTransientToReserved(edge) || fromReservedToTransient(edge)) {
          // Crossing resource boundaries: persist to a local file.
          edge.setPropertyPermanently(DataStoreProperty.of(DataStoreProperty.Value.LocalFileStore));
        } else if (CommunicationPatternProperty.Value.OneToOne.equals(
            edge.getPropertyValue(CommunicationPatternProperty.class)
                .orElseThrow(() -> new IllegalStateException("No communication pattern on this edge.")))) {
          // One-to-one edges can stay in memory.
          edge.setPropertyPermanently(DataStoreProperty.of(DataStoreProperty.Value.MemoryStore));
        } else {
          edge.setPropertyPermanently(DataStoreProperty.of(DataStoreProperty.Value.LocalFileStore));
        }
      }));
  return dag;
}
@Override public DAG<IRVertex, IREdge> apply(final DAG<IRVertex, IREdge> dag) { dag.topologicalDo(v -> { // we only care about metric collection vertices. if (v instanceof OperatorVertex && ((OperatorVertex) v).getTransform() instanceof MetricCollectTransform) { dag.getOutgoingEdgesOf(v).forEach(edge -> { // double checking. if (edge.getPropertyValue(CommunicationPatternProperty.class).get() .equals(CommunicationPatternProperty.Value.Shuffle)) { edge.setPropertyPermanently(MetricCollectionProperty.of( MetricCollectionProperty.Value.DataSkewRuntimePass)); } }); } }); return dag; } }
/**
 * Reads input data depending on the communication pattern of the edge: one-to-one reads only
 * from the same-indexed source task, while broadcast and shuffle read from every source task.
 *
 * @return futures of the iterators for the data to read.
 */
@Override
public List<CompletableFuture<DataUtil.IteratorWithNumBytes>> read() {
  // Resolve the communication pattern once, failing fast if absent
  // (the original called Optional.get() unchecked, four times).
  final CommunicationPatternProperty.Value comValue =
      runtimeEdge.getPropertyValue(CommunicationPatternProperty.class)
          .orElseThrow(() -> new IllegalStateException("No communication pattern on this edge."));
  if (comValue.equals(CommunicationPatternProperty.Value.OneToOne)) {
    return Collections.singletonList(pipeManagerWorker.read(dstTaskIndex, runtimeEdge, dstTaskIndex));
  } else if (comValue.equals(CommunicationPatternProperty.Value.BroadCast)
      || comValue.equals(CommunicationPatternProperty.Value.Shuffle)) {
    final int numSrcTasks = InputReader.getSourceParallelism(this);
    // Presize: one future per source task.
    final List<CompletableFuture<DataUtil.IteratorWithNumBytes>> futures = new ArrayList<>(numSrcTasks);
    for (int srcTaskIdx = 0; srcTaskIdx < numSrcTasks; srcTaskIdx++) {
      futures.add(pipeManagerWorker.read(srcTaskIdx, runtimeEdge, dstTaskIndex));
    }
    return futures;
  } else {
    throw new UnsupportedCommPatternException(new Exception("Communication pattern not supported"));
  }
}
/**
 * Reads input data depending on the communication pattern of the edge: one-to-one reads only
 * from the same-indexed source task, while broadcast and shuffle read from every source task.
 *
 * @return futures of the iterators for the data to read.
 */
@Override
public List<CompletableFuture<DataUtil.IteratorWithNumBytes>> read() {
  // Resolve the communication pattern once, failing fast if absent
  // (the original called Optional.get() unchecked, four times).
  final CommunicationPatternProperty.Value comValue =
      runtimeEdge.getPropertyValue(CommunicationPatternProperty.class)
          .orElseThrow(() -> new IllegalStateException("No communication pattern on this edge."));
  if (comValue.equals(CommunicationPatternProperty.Value.OneToOne)) {
    return Collections.singletonList(pipeManagerWorker.read(dstTaskIndex, runtimeEdge, dstTaskIndex));
  } else if (comValue.equals(CommunicationPatternProperty.Value.BroadCast)
      || comValue.equals(CommunicationPatternProperty.Value.Shuffle)) {
    final int numSrcTasks = InputReader.getSourceParallelism(this);
    // Presize: one future per source task.
    final List<CompletableFuture<DataUtil.IteratorWithNumBytes>> futures = new ArrayList<>(numSrcTasks);
    for (int srcTaskIdx = 0; srcTaskIdx < numSrcTasks; srcTaskIdx++) {
      futures.add(pipeManagerWorker.read(srcTaskIdx, runtimeEdge, dstTaskIndex));
    }
    return futures;
  } else {
    throw new UnsupportedCommPatternException(new Exception("Communication pattern not supported"));
  }
}
/**
 * Skips decompression on shuffle edges into the relay vertex, and sets LZ4 decompression on
 * the edges leaving the relay.
 *
 * @param dag the original DAG.
 * @return the annotated DAG.
 */
@Override
public DAG<IRVertex, IREdge> apply(final DAG<IRVertex, IREdge> dag) {
  dag.getVertices().forEach(vertex ->
      dag.getIncomingEdgesOf(vertex).forEach(edge -> {
        if (CommunicationPatternProperty.Value.Shuffle.equals(
            edge.getPropertyValue(CommunicationPatternProperty.class)
                .orElseThrow(() -> new IllegalStateException("No communication pattern on this edge.")))) {
          // Do not decompress on the shuffle edge itself; decompress (LZ4) after the relay.
          edge.setPropertyPermanently(DecompressionProperty.of(CompressionProperty.Value.None));
          dag.getOutgoingEdgesOf(edge.getDst()).forEach(edgeFromRelay ->
              edgeFromRelay.setPropertyPermanently(
                  DecompressionProperty.of(CompressionProperty.Value.LZ4)));
        }
      }));
  return dag;
}
}
@Override public DAG<IRVertex, IREdge> apply(final DAG<IRVertex, IREdge> dag) { dag.getVertices().stream() .filter(vertex -> dag.getIncomingEdgesOf(vertex.getId()) .stream() // TODO #198: Handle Un-cloneable Beam Sink Operators // only shuffle receivers (for now... as particular Beam sink operators fail when cloned) .anyMatch(edge -> edge.getPropertyValue(CommunicationPatternProperty.class) .orElseThrow(() -> new IllegalStateException()) .equals(CommunicationPatternProperty.Value.Shuffle)) ) .forEach(vertex -> vertex.setProperty( ClonedSchedulingProperty.of(new ClonedSchedulingProperty.CloneConf()))); // clone upfront, always return dag; } }
/**
 * Get the expected number of data read according to the communication pattern of the edge and
 * the parallelism of destination vertex.
 *
 * @return the expected number of data read.
 */
private int getExpectedRead() {
  // A duplicate edge group shares reads among its members: multiply by the group size.
  // (Optional.map/orElse replaces the isPresent()/get() anti-pattern.)
  final int duplicatedDataMultiplier =
      runtimeEdge.getPropertyValue(DuplicateEdgeGroupProperty.class)
          .map(DuplicateEdgeGroupPropertyValue::getGroupSize)
          .orElse(1);
  // One-to-one edges are read exactly once per block; for any other pattern every
  // destination task reads the block, i.e. the destination's parallelism.
  final int readForABlock = CommunicationPatternProperty.Value.OneToOne.equals(
      runtimeEdge.getPropertyValue(CommunicationPatternProperty.class).orElseThrow(
          () -> new RuntimeException("No communication pattern on this edge.")))
      ? 1
      : dstIrVertex.getPropertyValue(ParallelismProperty.class).orElseThrow(
          () -> new RuntimeException("No parallelism property on the destination vertex."));
  return readForABlock * duplicatedDataMultiplier;
}
}
/**
 * Sets the dedicated-key-per-element partitioner on every edge leaving the destination of a
 * shuffle edge (the relay vertex).
 *
 * @param dag the original DAG.
 * @return the annotated DAG.
 */
@Override
public DAG<IRVertex, IREdge> apply(final DAG<IRVertex, IREdge> dag) {
  dag.getVertices().forEach(vertex ->
      dag.getIncomingEdgesOf(vertex).forEach(edge -> {
        if (CommunicationPatternProperty.Value.Shuffle.equals(
            edge.getPropertyValue(CommunicationPatternProperty.class)
                .orElseThrow(() -> new IllegalStateException("No communication pattern on this edge.")))) {
          dag.getOutgoingEdgesOf(edge.getDst()).forEach(edgeFromRelay ->
              edgeFromRelay.setPropertyPermanently(PartitionerProperty.of(
                  PartitionerProperty.Value.DedicatedKeyPerElementPartitioner)));
        }
      }));
  return dag;
}
}
/**
 * Skips decompression on shuffle edges into the relay vertex, and sets LZ4 decompression on
 * the edges leaving the relay.
 *
 * @param dag the original DAG.
 * @return the annotated DAG.
 */
@Override
public DAG<IRVertex, IREdge> apply(final DAG<IRVertex, IREdge> dag) {
  dag.getVertices().forEach(vertex ->
      dag.getIncomingEdgesOf(vertex).forEach(edge -> {
        if (CommunicationPatternProperty.Value.Shuffle.equals(
            edge.getPropertyValue(CommunicationPatternProperty.class)
                .orElseThrow(() -> new IllegalStateException("No communication pattern on this edge.")))) {
          // Do not decompress on the shuffle edge itself; decompress (LZ4) after the relay.
          edge.setPropertyPermanently(DecompressionProperty.of(CompressionProperty.Value.None));
          dag.getOutgoingEdgesOf(edge.getDst()).forEach(edgeFromRelay ->
              edgeFromRelay.setPropertyPermanently(
                  DecompressionProperty.of(CompressionProperty.Value.LZ4)));
        }
      }));
  return dag;
}
}
/**
 * Initializes the data-skew metric on every shuffle edge with an identity hash-range mapping:
 * destination task i handles hash bucket [i, i + 1).
 *
 * @param dag the original DAG.
 * @return the annotated DAG.
 */
@Override
public DAG<IRVertex, IREdge> apply(final DAG<IRVertex, IREdge> dag) {
  dag.topologicalDo(dst -> dag.getIncomingEdgesOf(dst).forEach(edge -> {
    if (CommunicationPatternProperty.Value.Shuffle.equals(
        edge.getPropertyValue(CommunicationPatternProperty.class)
            .orElseThrow(() -> new IllegalStateException("No communication pattern on this edge.")))) {
      final int parallelism = dst.getPropertyValue(ParallelismProperty.class)
          .orElseThrow(() -> new IllegalStateException("No parallelism property on the destination vertex."));
      final Map<Integer, KeyRange> metric = new HashMap<>();
      for (int i = 0; i < parallelism; i++) {
        metric.put(i, HashRange.of(i, i + 1, false));
      }
      edge.setProperty(DataSkewMetricProperty.of(new DataSkewMetricFactory(metric)));
    }
  }));
  return dag;
}
}
/**
 * Sets a raw-bytes encoder on every edge leaving the destination of a shuffle edge
 * (the relay vertex).
 *
 * @param dag the original DAG.
 * @return the annotated DAG.
 */
@Override
public DAG<IRVertex, IREdge> apply(final DAG<IRVertex, IREdge> dag) {
  dag.getVertices().forEach(vertex ->
      dag.getIncomingEdgesOf(vertex).forEach(edge -> {
        if (CommunicationPatternProperty.Value.Shuffle.equals(
            edge.getPropertyValue(CommunicationPatternProperty.class)
                .orElseThrow(() -> new IllegalStateException("No communication pattern on this edge.")))) {
          dag.getOutgoingEdgesOf(edge.getDst()).forEach(edgeFromRelay ->
              edgeFromRelay.setPropertyPermanently(EncoderProperty.of(BytesEncoderFactory.of())));
        }
      }));
  return dag;
}
}
@Override public DAG<IRVertex, IREdge> apply(final DAG<IRVertex, IREdge> dag) { dag.getVertices().forEach(vertex -> { final List<IREdge> inEdges = dag.getIncomingEdgesOf(vertex); inEdges.forEach(edge -> { if (edge.getPropertyValue(CommunicationPatternProperty.class).get() .equals(CommunicationPatternProperty.Value.Shuffle)) { edge.setPropertyPermanently(DataFlowProperty.of(DataFlowProperty.Value.Push)); // Push to the merger vertex. } else { edge.setPropertyPermanently(DataFlowProperty.of(DataFlowProperty.Value.Pull)); } }); }); return dag; } }
/**
 * Sets the push data-flow model on every shuffle edge.
 *
 * @param dag the original DAG.
 * @return the annotated DAG.
 */
@Override
public DAG<IRVertex, IREdge> apply(final DAG<IRVertex, IREdge> dag) {
  // forEach on an empty edge list is a no-op, so no explicit isEmpty() guard is needed.
  dag.getVertices().forEach(vertex ->
      dag.getIncomingEdgesOf(vertex).forEach(edge -> {
        if (CommunicationPatternProperty.Value.Shuffle.equals(
            edge.getPropertyValue(CommunicationPatternProperty.class)
                .orElseThrow(() -> new IllegalStateException("No communication pattern on this edge.")))) {
          // NOTE(review): uses setProperty (not setPropertyPermanently) — presumably a later
          // pass may still override this; confirm the intent.
          edge.setProperty(DataFlowProperty.of(DataFlowProperty.Value.Push));
        }
      }));
  return dag;
}
}
/**
 * Initializes the data-skew metric on every shuffle edge with an identity hash-range mapping:
 * destination task i handles hash bucket [i, i + 1).
 *
 * @param dag the original DAG.
 * @return the annotated DAG.
 */
@Override
public DAG<IRVertex, IREdge> apply(final DAG<IRVertex, IREdge> dag) {
  dag.topologicalDo(dst -> dag.getIncomingEdgesOf(dst).forEach(edge -> {
    if (CommunicationPatternProperty.Value.Shuffle.equals(
        edge.getPropertyValue(CommunicationPatternProperty.class)
            .orElseThrow(() -> new IllegalStateException("No communication pattern on this edge.")))) {
      final int parallelism = dst.getPropertyValue(ParallelismProperty.class)
          .orElseThrow(() -> new IllegalStateException("No parallelism property on the destination vertex."));
      final Map<Integer, KeyRange> metric = new HashMap<>();
      for (int i = 0; i < parallelism; i++) {
        metric.put(i, HashRange.of(i, i + 1, false));
      }
      edge.setProperty(DataSkewMetricProperty.of(new DataSkewMetricFactory(metric)));
    }
  }));
  return dag;
}
}
/**
 * Compresses shuffle edges with LZ4 and disables compression on the edges leaving the
 * destination of the shuffle edge (the relay vertex).
 *
 * @param dag the original DAG.
 * @return the annotated DAG.
 */
@Override
public DAG<IRVertex, IREdge> apply(final DAG<IRVertex, IREdge> dag) {
  dag.getVertices().forEach(vertex ->
      dag.getIncomingEdgesOf(vertex).forEach(edge -> {
        if (CommunicationPatternProperty.Value.Shuffle.equals(
            edge.getPropertyValue(CommunicationPatternProperty.class)
                .orElseThrow(() -> new IllegalStateException("No communication pattern on this edge.")))) {
          // Compress (LZ4) into the relay; no compression after the relay.
          edge.setPropertyPermanently(CompressionProperty.of(CompressionProperty.Value.LZ4));
          dag.getOutgoingEdgesOf(edge.getDst()).forEach(edgeFromRelay ->
              edgeFromRelay.setPropertyPermanently(
                  CompressionProperty.of(CompressionProperty.Value.None)));
        }
      }));
  return dag;
}
}
/**
 * Sets the dedicated-key-per-element partitioner on every edge leaving the destination of a
 * shuffle edge (the relay vertex).
 *
 * @param dag the original DAG.
 * @return the annotated DAG.
 */
@Override
public DAG<IRVertex, IREdge> apply(final DAG<IRVertex, IREdge> dag) {
  dag.getVertices().forEach(vertex ->
      dag.getIncomingEdgesOf(vertex).forEach(edge -> {
        if (CommunicationPatternProperty.Value.Shuffle.equals(
            edge.getPropertyValue(CommunicationPatternProperty.class)
                .orElseThrow(() -> new IllegalStateException("No communication pattern on this edge.")))) {
          dag.getOutgoingEdgesOf(edge.getDst()).forEach(edgeFromRelay ->
              edgeFromRelay.setPropertyPermanently(PartitionerProperty.of(
                  PartitionerProperty.Value.DedicatedKeyPerElementPartitioner)));
        }
      }));
  return dag;
}
}
/**
 * Checks whether any shuffle in-edge of the given task assigns this task a skewed hash range.
 *
 * @param task the task to inspect.
 * @return true if the task's key range on some shuffle in-edge is marked skewed.
 */
public boolean hasSkewedData(final Task task) {
  final int taskIdx = RuntimeIdManager.getIndexFromTaskId(task.getTaskId());
  for (final StageEdge inEdge : task.getTaskIncomingEdges()) {
    if (CommunicationPatternProperty.Value.Shuffle.equals(inEdge.getDataCommunicationPattern())) {
      // Fail fast with context instead of an unchecked Optional.get().
      final Map<Integer, KeyRange> taskIdxToKeyRange =
          inEdge.getPropertyValue(DataSkewMetricProperty.class)
              .orElseThrow(() -> new IllegalStateException("No data skew metric on this edge."))
              .getMetric();
      final KeyRange hashRange = taskIdxToKeyRange.get(taskIdx);
      if (((HashRange) hashRange).isSkewed()) {
        return true;
      }
    }
  }
  return false;
}