@Override public void close() throws Exception { // Free cluster resources clusterClient.cancel(jobId); // cancel() is non-blocking so do this to make sure the job finished CompletableFuture<JobStatus> jobStatusFuture = FutureUtils.retrySuccessfulWithDelay( () -> clusterClient.getJobStatus(jobId), Time.milliseconds(50), deadline, (jobStatus) -> jobStatus.equals(JobStatus.CANCELED), TestingUtils.defaultScheduledExecutor()); assertEquals( JobStatus.CANCELED, jobStatusFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS)); } }
clusterClient.getJobStatus(closableJobGraph.getJobId()); while (deadline.hasTimeLeft() && !jobStatusFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS).equals(JobStatus.RUNNING)) { Thread.sleep(50); jobStatusFuture =
/**
 * Moves this region from {@code CANCELLING} to {@code CANCELED} and then resets it.
 *
 * <p>If the region is not {@code CANCELLING}, the current state is logged and nothing else
 * happens. If {@code transitionState} returns {@code false}, the state is read again and
 * the check is repeated.
 *
 * @param globalModVersionOfFailover version value that is passed on to {@code reset}
 */
private void allVerticesInTerminalState(long globalModVersionOfFailover) {
    for (;;) {
        final JobStatus observed = this.state;

        if (!observed.equals(JobStatus.CANCELLING)) {
            LOG.info("FailoverRegion {} is {} when allVerticesInTerminalState.", id, state);
            return;
        }

        if (transitionState(observed, JobStatus.CANCELED)) {
            reset(globalModVersionOfFailover);
            return;
        }
        // The transition did not go through: loop and re-evaluate the state.
    }
}
/**
 * Moves this region from {@code CANCELLING} to {@code CANCELED} and then resets it.
 *
 * <p>If the region is not {@code CANCELLING}, the current state is logged and nothing else
 * happens. If {@code transitionState} returns {@code false}, the state is read again and
 * the check is repeated.
 *
 * @param globalModVersionOfFailover version value that is passed on to {@code reset}
 */
private void allVerticesInTerminalState(long globalModVersionOfFailover) {
    for (;;) {
        final JobStatus observed = this.state;

        if (!observed.equals(JobStatus.CANCELLING)) {
            LOG.info("FailoverRegion {} is {} when allVerticesInTerminalState.", id, state);
            return;
        }

        if (transitionState(observed, JobStatus.CANCELED)) {
            reset(globalModVersionOfFailover);
            return;
        }
        // The transition did not go through: loop and re-evaluate the state.
    }
}
/**
 * Moves this region from {@code CANCELLING} to {@code CANCELED} and then resets it.
 *
 * <p>If the region is not {@code CANCELLING}, the current state is logged and nothing else
 * happens. If {@code transitionState} returns {@code false}, the state is read again and
 * the check is repeated.
 *
 * @param globalModVersionOfFailover version value that is passed on to {@code reset}
 */
private void allVerticesInTerminalState(long globalModVersionOfFailover) {
    for (;;) {
        final JobStatus observed = this.state;

        if (!observed.equals(JobStatus.CANCELLING)) {
            LOG.info("FailoverRegion {} is {} when allVerticesInTerminalState.", id, state);
            return;
        }

        if (transitionState(observed, JobStatus.CANCELED)) {
            reset(globalModVersionOfFailover);
            return;
        }
        // The transition did not go through: loop and re-evaluate the state.
    }
}
/**
 * Moves this region from {@code CANCELLING} to {@code CANCELED} and then resets it.
 *
 * <p>If the region is not {@code CANCELLING}, the current state is logged and nothing else
 * happens. If {@code transitionState} returns {@code false}, the state is read again and
 * the check is repeated.
 *
 * @param globalModVersionOfFailover version value that is passed on to {@code reset}
 */
private void allVerticesInTerminalState(long globalModVersionOfFailover) {
    for (;;) {
        final JobStatus observed = this.state;

        if (!observed.equals(JobStatus.CANCELLING)) {
            LOG.info("FailoverRegion {} is {} when allVerticesInTerminalState.", id, state);
            return;
        }

        if (transitionState(observed, JobStatus.CANCELED)) {
            reset(globalModVersionOfFailover);
            return;
        }
        // The transition did not go through: loop and re-evaluate the state.
    }
}
private void cancel(final long globalModVersionOfFailover) { while (true) { JobStatus curStatus = this.state; if (curStatus.equals(JobStatus.RUNNING)) { if (transitionState(curStatus, JobStatus.CANCELLING)) { // we build a future that is complete once all vertices have reached a terminal state final ArrayList<Future<?>> futures = new ArrayList<>(connectedExecutionVertexes.size()); // cancel all tasks (that still need cancelling) for (ExecutionVertex vertex : connectedExecutionVertexes) { futures.add(vertex.cancel()); } final FutureUtils.ConjunctFuture<Void> allTerminal = FutureUtils.waitForAll(futures); allTerminal.thenAcceptAsync(new AcceptFunction<Void>() { @Override public void accept(Void value) { allVerticesInTerminalState(globalModVersionOfFailover); } }, executor); break; } } else { LOG.info("FailoverRegion {} is {} when cancel.", id, state); break; } } }
/**
 * Reacts to a failover notification for this region.
 *
 * <p>If the execution graph's restart strategy cannot restart, the whole graph is failed
 * globally. Otherwise a {@code RUNNING} region is cancelled, a {@code CANCELED} region is
 * reset, and for any other state only a log message is written.
 *
 * @param globalModVersionOfFailover version value passed on to {@code cancel} or
 *     {@code reset}
 */
private void failover(long globalModVersionOfFailover) {
    if (!executionGraph.getRestartStrategy().canRestart()) {
        executionGraph.failGlobal(new FlinkException("RestartStrategy validate fail"));
        return;
    }

    final JobStatus observed = this.state;

    if (observed.equals(JobStatus.RUNNING)) {
        cancel(globalModVersionOfFailover);
        return;
    }

    if (observed.equals(JobStatus.CANCELED)) {
        reset(globalModVersionOfFailover);
        return;
    }

    LOG.info("FailoverRegion {} is {} when notified to failover.", id, state);
}
/**
 * Reacts to a failover notification for this region.
 *
 * <p>If the execution graph's restart strategy cannot restart, the whole graph is failed
 * globally. Otherwise a {@code RUNNING} region is cancelled, a {@code CANCELED} region is
 * reset, and for any other state only a log message is written.
 *
 * @param globalModVersionOfFailover version value passed on to {@code cancel} or
 *     {@code reset}
 */
private void failover(long globalModVersionOfFailover) {
    if (!executionGraph.getRestartStrategy().canRestart()) {
        executionGraph.failGlobal(new FlinkException("RestartStrategy validate fail"));
        return;
    }

    final JobStatus observed = this.state;

    if (observed.equals(JobStatus.RUNNING)) {
        cancel(globalModVersionOfFailover);
        return;
    }

    if (observed.equals(JobStatus.CANCELED)) {
        reset(globalModVersionOfFailover);
        return;
    }

    LOG.info("FailoverRegion {} is {} when notified to failover.", id, state);
}
/**
 * Reacts to a failover notification for this region.
 *
 * <p>If the execution graph's restart strategy cannot restart, the whole graph is failed
 * globally. Otherwise a {@code RUNNING} region is cancelled, a {@code CANCELED} region is
 * reset, and for any other state only a log message is written.
 *
 * @param globalModVersionOfFailover version value passed on to {@code cancel} or
 *     {@code reset}
 */
private void failover(long globalModVersionOfFailover) {
    if (!executionGraph.getRestartStrategy().canRestart()) {
        executionGraph.failGlobal(new FlinkException("RestartStrategy validate fail"));
        return;
    }

    final JobStatus observed = this.state;

    if (observed.equals(JobStatus.RUNNING)) {
        cancel(globalModVersionOfFailover);
        return;
    }

    if (observed.equals(JobStatus.CANCELED)) {
        reset(globalModVersionOfFailover);
        return;
    }

    LOG.info("FailoverRegion {} is {} when notified to failover.", id, state);
}
private void cancel(final long globalModVersionOfFailover) { while (true) { JobStatus curStatus = this.state; if (curStatus.equals(JobStatus.RUNNING)) { if (transitionState(curStatus, JobStatus.CANCELLING)) { // we build a future that is complete once all vertices have reached a terminal state final ArrayList<CompletableFuture<?>> futures = new ArrayList<>(connectedExecutionVertexes.size()); // cancel all tasks (that still need cancelling) for (ExecutionVertex vertex : connectedExecutionVertexes) { futures.add(vertex.cancel()); } final FutureUtils.ConjunctFuture<Void> allTerminal = FutureUtils.waitForAll(futures); allTerminal.thenAcceptAsync( (Void value) -> allVerticesInTerminalState(globalModVersionOfFailover), executor); break; } } else { LOG.info("FailoverRegion {} is {} when cancel.", id, state); break; } } }
private void cancel(final long globalModVersionOfFailover) { while (true) { JobStatus curStatus = this.state; if (curStatus.equals(JobStatus.RUNNING)) { if (transitionState(curStatus, JobStatus.CANCELLING)) { // we build a future that is complete once all vertices have reached a terminal state final ArrayList<CompletableFuture<?>> futures = new ArrayList<>(connectedExecutionVertexes.size()); // cancel all tasks (that still need cancelling) for (ExecutionVertex vertex : connectedExecutionVertexes) { futures.add(vertex.cancel()); } final FutureUtils.ConjunctFuture<Void> allTerminal = FutureUtils.waitForAll(futures); allTerminal.thenAcceptAsync( (Void value) -> allVerticesInTerminalState(globalModVersionOfFailover), executor); break; } } else { LOG.info("FailoverRegion {} is {} when cancel.", id, state); break; } } }
/**
 * Looks up the ID of a running or restarting job by its name.
 *
 * <p>Asks the given JobManager gateway for the status of the running jobs, waits up to
 * {@code askTimeout} for the answer and returns the ID of the first reported job whose state
 * is {@code RUNNING} or {@code RESTARTING} and whose name equals {@code jobName}.
 *
 * @param jobManagerGateway gateway used to send the request to the JobManager
 * @param jobName name of the job to look for
 * @return the ID of the first matching job
 * @throws Exception if the response cannot be obtained, if the response is not a
 *     {@code RunningJobsStatus}, or if no running or restarting job has the given name
 */
public JobID findJobId(ActorGateway jobManagerGateway, String jobName) throws Exception {
    Future<Object> response = jobManagerGateway.ask(
            JobManagerMessages.getRequestRunningJobsStatus(), askTimeout);

    Object result;
    try {
        result = Await.result(response, askTimeout);
    } catch (Exception e) {
        throw new Exception("Could not retrieve running jobs from the JobManager.", e);
    }

    // An unexpected response type is a different failure than "no such job"; report it as
    // such instead of falling through to the not-found error.
    if (!(result instanceof RunningJobsStatus)) {
        throw new Exception(
                "Unexpected response to the running jobs status request: " + result);
    }

    List<JobStatusMessage> jobs = ((RunningJobsStatus) result).getStatusMessages();
    for (JobStatusMessage rj : jobs) {
        JobStatus jobState = rj.getJobState();
        boolean runningOrRestarting =
                jobState.equals(JobStatus.RUNNING) || jobState.equals(JobStatus.RESTARTING);
        if (runningOrRestarting && rj.getJobName().equals(jobName)) {
            return rj.getJobId();
        }
    }

    throw new Exception(
            "Could not find a running or restarting job with name '" + jobName + "'.");
}
/**
 * Notifies the region to fail over.
 *
 * <p>Logs the cause and increments {@code regionFailCount}. The whole execution graph is
 * failed globally if the restart strategy cannot restart or if the count exceeds
 * {@code regionFailLimit}. Otherwise a {@code RUNNING} region is cancelled, a
 * {@code CANCELED} region is reset, and for any other state only a log message is written.
 *
 * @param globalModVersionOfFailover version value passed on to {@code cancel} or
 *     {@code reset}
 * @param cause the error that triggered the failover; attached to any global failure
 */
private void failover(long globalModVersionOfFailover, Throwable cause) {
    LOG.info("Try to fail and restart region due to error: ", cause);
    regionFailCount++;

    if (!executionGraph.getRestartStrategy().canRestart()) {
        executionGraph.failGlobal(new FlinkException("RestartStrategy validate fail", cause));
        return;
    }

    if (regionFailCount > regionFailLimit) {
        final String limitMessage =
                "FailoverRegion " + id + " exceeds max region restart limit";
        executionGraph.failGlobal(new FlinkException(limitMessage, cause));
        return;
    }

    final JobStatus observed = this.state;

    if (observed.equals(JobStatus.RUNNING)) {
        cancel(globalModVersionOfFailover);
        return;
    }

    if (observed.equals(JobStatus.CANCELED)) {
        reset(globalModVersionOfFailover);
        return;
    }

    LOG.info("FailoverRegion {} is {} when notified to failover.", id, state);
}
private void cancel(final long globalModVersionOfFailover) { while (true) { JobStatus curStatus = this.state; if (curStatus.equals(JobStatus.RUNNING)) { if (transitionState(curStatus, JobStatus.CANCELLING)) {
public CompletableFuture<Collection<ExecutionAttemptID>> reconcile() { JobStatus curStatus = state; checkState(JobStatus.CREATED.equals(curStatus), "Not allow reconcile in state " + curStatus);
if (JobStatus.CREATED.equals(executionGraph.getState()) && !graphManager.isReconciling()) { scheduleExecutionGraph(); } else { (Collection<ExecutionAttemptID> reconcileFailedExecutions) -> { graphManager.leaveReconcile(); if (JobStatus.RUNNING.equals(executionGraph.getState())) { for (ExecutionAttemptID executionAttemptId : reconcileFailedExecutions) { if (executionAttemptId != null) { else if (JobStatus.CREATED.equals(executionGraph.getState())) { scheduleExecutionGraph(); } else {
private void assignExecutionGraph( ExecutionGraph newExecutionGraph, JobManagerJobMetricGroup newJobManagerJobMetricGroup) { checkState(executionGraph == null || JobStatus.CREATED.equals(executionGraph.getState()) || executionGraph.getState().isTerminalState(), "The job state is " + (executionGraph == null ? null : executionGraph.getState())); checkState(jobManagerJobMetricGroup == null); executionGraph = newExecutionGraph; jobManagerJobMetricGroup = newJobManagerJobMetricGroup; checkState(jobStatusListener == null); // register self as job status change listener jobStatusListener = new JobManagerJobStatusListener(); executionGraph.registerJobStatusListener(jobStatusListener); setupGraphManager(); }
@Override public void replayOpLog(OperationLog opLog) { checkArgument(isReconciling && JobStatus.CREATED.equals(executionGraph.getState()), "Job is in " + executionGraph.getState() + " while replaying log.");