public void onExecutionFail(Execution taskExecution, Throwable cause) { // TODO: check if need to failover the preceding region failover(taskExecution.getGlobalModVersion(), cause); }
/** * Restart the region by notify the schedule plugin. */ private void restart(long globalModVersionOfFailover) { try { if (transitionState(JobStatus.CREATED, JobStatus.RUNNING)) { // Let the scheduler event to reschedule connected ExecutionVertices executionGraph.resetExecutionVerticesAndNotify(globalModVersionOfFailover, connectedExecutionVertices); } else { failover(globalModVersionOfFailover, new FlinkException("FailoverRegion " + id + " witch from CREATED to RUNNING fail.")); } } catch (GlobalModVersionMismatch e) { // happens when a global recovery happens concurrently to the regional recovery // should do nothing } catch (Exception e) { failover(globalModVersionOfFailover, new FlinkException("FailoverRegion " + id + " restart failed.", e)); } }
private void reset(long globalModVersionOfFailover) { if (transitionState(JobStatus.CANCELED, JobStatus.CREATED)) { // reset all connected ExecutionVertexes final Collection<CoLocationGroup> colGroups = new HashSet<>(); for (ExecutionVertex ev : connectedExecutionVertices) { CoLocationGroup cgroup = ev.getJobVertex().getCoLocationGroup(); if (cgroup != null && !colGroups.contains(cgroup)){ cgroup.resetConstraints(); colGroups.add(cgroup); } } restart(globalModVersionOfFailover); } else { failover(globalModVersionOfFailover, new FlinkException("FailoverRegion " + id + " switch from CANCELLED to CREATED fail.")); } }
(Void ignored, Throwable throwable) -> { if (throwable != null) { failover(globalModVersionOfFailover, new FlinkException("Could not cancel all execution job vertices properly.", throwable)); } else {