new BlobCacheService(mock(PermanentBlobCache.class), mock(TransientBlobCache.class)); final Task task = new Task( jobInformation, taskInformation, () -> task.run(), TestingUtils.defaultExecutor()); task.triggerCheckpointBarrier(checkpointId, checkpointTimestamp, CheckpointOptions.forCheckpointWithDefaultLocation()); if (task.getFailureCause() != null) { throw new Exception("Task failed", task.getFailureCause()); assertEquals(ExecutionState.FINISHED, task.getExecutionState());
private void runTestDeclineOnCheckpointError(AbstractStateBackend backend) throws Exception{ TestDeclinedCheckpointResponder checkpointResponder = new TestDeclinedCheckpointResponder(); Task task = createTask(new FilterOperator(), backend, checkpointResponder, false); // start the task and wait until it is in "restore" task.startTaskThread(); checkpointResponder.declinedLatch.await(); Assert.assertEquals(ExecutionState.RUNNING, task.getExecutionState()); task.cancelExecution(); task.getExecutingThread().join(); }
/** * This test checks that cancel calls that are issued before the operator is * instantiated still lead to proper canceling. */ @Test public void testEarlyCanceling() throws Exception { final StreamConfig cfg = new StreamConfig(new Configuration()); cfg.setOperatorID(new OperatorID(4711L, 42L)); cfg.setStreamOperator(new SlowlyDeserializingOperator()); cfg.setTimeCharacteristic(TimeCharacteristic.ProcessingTime); final TaskManagerActions taskManagerActions = spy(new NoOpTaskManagerActions()); final Task task = createTask(SourceStreamTask.class, cfg, new Configuration(), taskManagerActions); final TaskExecutionState state = new TaskExecutionState( task.getJobID(), task.getExecutionId(), ExecutionState.RUNNING); task.startTaskThread(); verify(taskManagerActions, timeout(2000L)).updateTaskExecutionState(eq(state)); // send a cancel. because the operator takes a long time to deserialize, this should // hit the task before the operator is deserialized task.cancelExecution(); task.getExecutingThread().join(); assertFalse("Task did not cancel", task.getExecutingThread().isAlive()); assertEquals(ExecutionState.CANCELED, task.getExecutionState()); }
private Throwable runTestTaskFailingOnCheckpointError(AbstractStateBackend backend) throws Exception { Task task = createTask(new FilterOperator(), backend, mock(CheckpointResponder.class), true); // start the task and wait until it is in "restore" task.startTaskThread(); task.getExecutingThread().join(); assertEquals(ExecutionState.FAILED, task.getExecutionState()); return task.getFailureCause(); }
@Test public void testLifeCycleFull() throws Exception { ACTUAL_ORDER_TRACKING.clear(); Configuration taskManagerConfig = new Configuration(); StreamConfig cfg = new StreamConfig(new Configuration()); MockSourceFunction srcFun = new MockSourceFunction(); cfg.setStreamOperator(new LifecycleTrackingStreamSource<>(srcFun, true)); cfg.setOperatorID(new OperatorID()); cfg.setTimeCharacteristic(TimeCharacteristic.ProcessingTime); Task task = StreamTaskTest.createTask(SourceStreamTask.class, cfg, taskManagerConfig); task.startTaskThread(); LifecycleTrackingStreamSource.runStarted.await(); // wait for clean termination task.getExecutingThread().join(); assertEquals(ExecutionState.FINISHED, task.getExecutionState()); assertEquals(EXPECTED_CALL_ORDER_FULL, ACTUAL_ORDER_TRACKING); }
/**
 * Removes the task with the given execution attempt id from the task slot table and
 * reports its final execution state through {@code updateTaskExecutionState}.
 *
 * <p>A task that is not yet in a terminal state is failed externally first. If no task
 * is registered under the given id, an error is logged and nothing is reported.
 *
 * @param jobMasterGateway gateway passed on to {@code updateTaskExecutionState}
 * @param executionAttemptID id of the task to unregister
 */
private void unregisterTaskAndNotifyFinalState(
        final JobMasterGateway jobMasterGateway,
        final ExecutionAttemptID executionAttemptID) {

    final Task removedTask = taskSlotTable.removeTask(executionAttemptID);

    if (removedTask == null) {
        log.error("Cannot find task with ID {} to unregister.", executionAttemptID);
        return;
    }

    final boolean alreadyTerminal = removedTask.getExecutionState().isTerminal();
    if (!alreadyTerminal) {
        try {
            removedTask.failExternally(new IllegalStateException("Task is being remove from TaskManager."));
        } catch (Exception e) {
            // failing the task is best effort; the state is still reported below
            log.error("Could not properly fail task.", e);
        }
    }

    log.info(
        "Un-registering task and sending final execution state {} to JobManager for task {} {}.",
        removedTask.getExecutionState(),
        removedTask.getTaskInfo().getTaskName(),
        removedTask.getExecutionId());

    final AccumulatorSnapshot accumulatorSnapshot = removedTask.getAccumulatorRegistry().getSnapshot();

    final TaskExecutionState finalState = new TaskExecutionState(
        removedTask.getJobID(),
        removedTask.getExecutionId(),
        removedTask.getExecutionState(),
        removedTask.getFailureCause(),
        accumulatorSnapshot,
        removedTask.getMetricGroup().getIOMetricGroup().createSnapshot());

    updateTaskExecutionState(jobMasterGateway, finalState);
}
ExecutionState current = this.executionState; if (current == ExecutionState.CREATED) { if (transitionState(ExecutionState.CREATED, ExecutionState.DEPLOYING)) { notifyFinalState(); if (metrics != null) { metrics.close(); if (transitionState(ExecutionState.CANCELING, ExecutionState.CANCELED)) { notifyFinalState(); if (metrics != null) { metrics.close(); userCodeClassLoader = createUserCodeClassloader(libraryCache); final ExecutionConfig executionConfig = serializedExecutionConfig.deserializeValue(userCodeClassLoader); invokable = loadAndInstantiateInvokable(userCodeClassLoader, nameOfInvokableClass); if (isCanceledOrFailed()) { throw new CancelTaskException(); if (isCanceledOrFailed()) { throw new CancelTaskException(); .createKvStateTaskRegistry(jobId, getJobVertexId()); if (!transitionState(ExecutionState.DEPLOYING, ExecutionState.RUNNING)) { throw new CancelTaskException();
public void unregisterTask(Task task) { LOG.debug("Unregister task {} from network environment (state: {}).", task.getTaskInfo().getTaskNameWithSubtasks(), task.getExecutionState()); final ExecutionAttemptID executionId = task.getExecutionId(); if (task.isCanceledOrFailed()) { resultPartitionManager.releasePartitionsProducedBy(executionId, task.getFailureCause()); ResultPartitionWriter[] writers = task.getAllWriters(); if (writers != null) { for (ResultPartitionWriter writer : writers) { ResultPartition[] partitions = task.getProducedPartitions(); if (partitions != null) { for (ResultPartition partition : partitions) { final SingleInputGate[] inputGates = task.getAllInputGates();
/**
 * Adds the given task to the task slot that belongs to the task's allocation id.
 *
 * <p>On success the task is also recorded in the execution-id-to-slot mappings.
 *
 * @param task the task to place into the slot of its allocation id
 * @return True if the slot accepted the task; otherwise false
 * @throws SlotNotFoundException if no slot exists for the task's allocation id
 * @throws SlotNotActiveException if the slot is not active for the task's job and allocation id
 */
public boolean addTask(Task task) throws SlotNotFoundException, SlotNotActiveException {
    Preconditions.checkNotNull(task);

    final TaskSlot slot = getTaskSlot(task.getAllocationId());

    if (slot == null) {
        throw new SlotNotFoundException(task.getAllocationId());
    }

    if (!slot.isActive(task.getJobID(), task.getAllocationId())) {
        throw new SlotNotActiveException(task.getJobID(), task.getAllocationId());
    }

    final boolean added = slot.add(task);
    if (added) {
        // remember which slot hosts this execution
        taskSlotMappings.put(task.getExecutionId(), new TaskSlotMapping(task, slot));
    }
    return added;
}
@Override public void run() { // set safety net from the task's context for checkpointing thread LOG.debug("Creating FileSystem stream leak safety net for {}", Thread.currentThread().getName()); FileSystemSafetyNet.setSafetyNetCloseableRegistryForThread(safetyNetCloseableRegistry); try { boolean success = invokable.triggerCheckpoint(checkpointMetaData, checkpointOptions); if (!success) { checkpointResponder.declineCheckpoint( getJobID(), getExecutionId(), checkpointID, new CheckpointDeclineTaskNotReadyException(taskName)); } } catch (Throwable t) { if (getExecutionState() == ExecutionState.RUNNING) { failExternally(new Exception( "Error while triggering checkpoint " + checkpointID + " for " + taskNameWithSubtask, t)); } else { LOG.debug("Encountered error while triggering checkpoint {} for " + "{} ({}) while being not in state running.", checkpointID, taskNameWithSubtask, executionId, t); } } finally { FileSystemSafetyNet.setSafetyNetCloseableRegistryForThread(null); } } };
PartitionProducerStateChecker partitionStateChecker = jobManagerConnection.getPartitionStateChecker(); Task task = new Task( jobInformation, taskInformation, getRpcService().getExecutor()); log.info("Received task {}.", task.getTaskInfo().getTaskNameWithSubtasks()); task.startTaskThread(); task.getExecutionId() + '.';
public void unregisterTask(Task task) { LOG.debug("Unregister task {} from network environment (state: {}).", task.getTaskInfo().getTaskNameWithSubtasks(), task.getExecutionState()); final ExecutionAttemptID executionId = task.getExecutionId(); if (task.isCanceledOrFailed()) { resultPartitionManager.releasePartitionsProducedBy(executionId, task.getFailureCause()); for (InternalResultPartition partition : task.getInternalPartitions()) { taskEventDispatcher.unregisterPartition(partition.getPartitionId()); partition.destroyBufferPool(); final SingleInputGate[] inputGates = task.getAllInputGates();
@Override public Void apply(ExecutionState executionState, Throwable throwable) { try { if (executionState != null) { onPartitionStateUpdate( intermediateDataSetId, resultPartitionId, executionState); } else if (throwable instanceof TimeoutException) { // our request timed out, assume we're still running and try again onPartitionStateUpdate( intermediateDataSetId, resultPartitionId, ExecutionState.RUNNING); } else if (throwable instanceof PartitionProducerDisposedException) { String msg = String.format("Producer %s of partition %s disposed. Cancelling execution.", resultPartitionId.getProducerId(), resultPartitionId.getPartitionId()); LOG.info(msg, throwable); cancelExecution(); } else { failExternally(throwable); } } catch (IOException | InterruptedException e) { failExternally(e); } return null; } }, executor);
Task currentTask = tasks.next(); int resultPartitionCount = currentTask.getProducedPartitions().length; ResultPartitionID[] resultPartitionIDs = new ResultPartitionID[resultPartitionCount]; boolean[] resultPartitionsConsumable = new boolean[resultPartitionCount]; for (int i = 0; i < resultPartitionCount; i++) { final ResultPartition resultPartition = currentTask.getProducedPartitions()[i]; resultPartitionIDs[i] = resultPartition.getPartitionId(); if (resultPartition instanceof InternalResultPartition) { currentTask.getExecutionState(), currentTask.getTaskInfo().getAttemptNumber(), currentTask.getCreateTimestamp(), currentTask.getJobVertexId(), currentTask.getExecutionId(), currentTask.getTaskInfo().getIndexOfThisSubtask(), resultPartitionIDs, resultPartitionsConsumable, currentTask.getInputSplitProvider().getAssignedInputSplits(), currentTaskSlot.generateSlotOffer()); allTaskExecutionStatus.add(taskExecutionStatus);
@Override public void run() { try { statefulTask.notifyCheckpointComplete(checkpointID); } catch (Throwable t) { if (getExecutionState() == ExecutionState.RUNNING) { // fail task if checkpoint confirmation failed. failExternally(new RuntimeException( "Error while confirming checkpoint", t)); } } } };
cancelExecution(); } else { producerState); failExternally(new IllegalStateException(msg)); failExternally(new IllegalStateException("Received partition producer state for " + "unknown input gate " + intermediateDataSetId + "."));
/**
 * Captures the stack trace of the executing thread of the given task.
 *
 * @param executionAttemptId id used to look the task up in the task slot table
 * @param maxStackTraceDepth maximum number of frames to return; values of zero or less
 *     return the complete stack trace
 * @return the (possibly truncated) stack trace, or an empty Optional if the task is
 *     unknown or not in state RUNNING
 */
private Optional<StackTraceElement[]> getStackTrace(
        final ExecutionAttemptID executionAttemptId,
        final int maxStackTraceDepth) {

    final Task task = taskSlotTable.getTask(executionAttemptId);

    if (task == null || task.getExecutionState() != ExecutionState.RUNNING) {
        return Optional.empty();
    }

    final StackTraceElement[] fullTrace = task.getExecutingThread().getStackTrace();

    if (maxStackTraceDepth <= 0) {
        return Optional.of(fullTrace);
    }

    // keep only the topmost frames, never more than are available
    final int depth = Math.min(maxStackTraceDepth, fullTrace.length);
    return Optional.of(Arrays.copyOf(fullTrace, depth));
}
/**
 * Requests cancellation of this task's execution.
 *
 * <p>Nothing happens if the task has already reached a terminal state
 * (FINISHED, CANCELED, FAILED) or is already canceling. Otherwise the state is
 * switched to CANCELING and, if the invokable code is running, an asynchronous
 * thread is started that aborts that code.
 *
 * <p>This method never blocks.</p>
 */
public void cancelExecution() {
    LOG.info("Attempting to cancel task {} ({}).", taskNameWithSubtask, executionId);

    // a plain cancellation carries no failure cause
    final ExecutionState targetState = ExecutionState.CANCELING;
    cancelOrFailAndCancelInvokable(targetState, null);
}
/**
 * Forwards a checkpoint trigger request to the task registered under the given
 * execution attempt id.
 *
 * @param executionAttemptID id used to look the task up in the task slot table
 * @param checkpointId id of the checkpoint to trigger
 * @param checkpointTimestamp timestamp of the checkpoint
 * @param checkpointOptions options passed on to the task
 * @return a completed acknowledge future if the task was found; otherwise a future
 *     completed exceptionally with a {@code CheckpointException}
 */
@Override
public CompletableFuture<Acknowledge> triggerCheckpoint(
        ExecutionAttemptID executionAttemptID,
        long checkpointId,
        long checkpointTimestamp,
        CheckpointOptions checkpointOptions) {
    log.debug("Trigger checkpoint {}@{} for {}.", checkpointId, checkpointTimestamp, executionAttemptID);

    final Task task = taskSlotTable.getTask(executionAttemptID);

    if (task == null) {
        final String message =
            "TaskManager received a checkpoint request for unknown task " + executionAttemptID + '.';

        log.debug(message);
        return FutureUtils.completedExceptionally(new CheckpointException(message));
    }

    task.triggerCheckpointBarrier(checkpointId, checkpointTimestamp, checkpointOptions);
    return CompletableFuture.completedFuture(Acknowledge.get());
}
/**
 * Cancels the task registered under the given execution attempt id.
 *
 * @param executionAttemptID id used to look the task up in the task slot table
 * @param timeout not read by this method body
 * @return a completed acknowledge future if the cancel call went through; otherwise a
 *     future completed exceptionally with a {@code TaskException}, either because the
 *     task is unknown or because cancelling it threw
 */
@Override
public CompletableFuture<Acknowledge> cancelTask(ExecutionAttemptID executionAttemptID, Time timeout) {
    final Task task = taskSlotTable.getTask(executionAttemptID);

    if (task == null) {
        // This is the cancel path, so the message names the cancel operation
        // (it previously said "stop", which misdescribed the failed request).
        final String message = "Cannot find task to cancel for execution " + executionAttemptID + '.';

        log.debug(message);
        return FutureUtils.completedExceptionally(new TaskException(message));
    }

    try {
        task.cancelExecution();
        return CompletableFuture.completedFuture(Acknowledge.get());
    } catch (Throwable t) {
        // surface any failure to the caller through the returned future
        return FutureUtils.completedExceptionally(
            new TaskException("Cannot cancel task for execution " + executionAttemptID + '.', t));
    }
}