/**
 * Reacts to this runner's leadership being revoked.
 *
 * <p>When the runner has already been shut down, only that fact is logged. Otherwise the
 * revocation is logged and the job manager is asked to suspend execution of the job.
 * The whole method body runs while holding {@code lock}.
 */
@Override
public void revokeLeadership() {
    synchronized (lock) {
        if (!shutdown) {
            log.info("JobManager for job {} ({}) was revoked leadership at {}.",
                    jobGraph.getName(), jobGraph.getJobID(), getAddress());

            final Exception revocationCause = new Exception("JobManager is no longer the leader.");
            jobManager.getSelf().suspendExecution(revocationCause);
        } else {
            log.info("JobManagerRunner already shutdown.");
        }
    }
}
/**
 * Marks this runner's job as finished in the running jobs registry. Other JobManagers
 * will not recover the job after this call.
 *
 * <p>This method never throws an exception: any {@link Throwable} raised while updating
 * the registry is caught and logged instead.
 */
private void unregisterJobFromHighAvailability() {
    try {
        runningJobsRegistry.setJobFinished(jobGraph.getJobID());
    } catch (Throwable t) {
        // Deliberately best-effort: a failure here is logged, never propagated.
        // The first literal ends with a space so the two sentences do not run together.
        log.error("Could not un-register from high-availability services job {} ({}). " +
                "Other JobManager's may attempt to recover it and re-execute it.",
                jobGraph.getName(), jobGraph.getJobID(), t);
    }
}
/**
 * Marks this runner's job as finished in the running jobs registry. Other JobManagers
 * will not recover the job after this call.
 *
 * <p>This method never throws an exception: any {@link Throwable} raised while updating
 * the registry is caught and logged instead.
 */
private void unregisterJobFromHighAvailability() {
    try {
        runningJobsRegistry.setJobFinished(jobGraph.getJobID());
    } catch (Throwable t) {
        // Deliberately best-effort: a failure here is logged, never propagated.
        // The first literal ends with a space so the two sentences do not run together.
        log.error("Could not un-register from high-availability services job {} ({}). " +
                "Other JobManager's may attempt to recover it and re-execute it.",
                jobGraph.getName(), jobGraph.getJobID(), t);
    }
}
/**
 * Marks this runner's job as finished in the running jobs registry. Other JobManagers
 * will not recover the job after this call.
 *
 * <p>This method never throws an exception: any {@link Throwable} raised while updating
 * the registry is caught and logged instead.
 */
private void unregisterJobFromHighAvailability() {
    try {
        runningJobsRegistry.setJobFinished(jobGraph.getJobID());
    } catch (Throwable t) {
        // Deliberately best-effort: a failure here is logged, never propagated.
        // The first literal ends with a space so the two sentences do not run together.
        log.error("Could not un-register from high-availability services job {} ({}). " +
                "Other JobManager's may attempt to recover it and re-execute it.",
                jobGraph.getName(), jobGraph.getJobID(), t);
    }
}
public JobManagerJobMetricGroup addJob(JobGraph job) { JobID jobId = job.getJobID(); String jobName = job.getName(); // get or create a jobs metric group JobManagerJobMetricGroup currentJobGroup; synchronized (this) { if (!isClosed()) { currentJobGroup = jobs.get(jobId); if (currentJobGroup == null || currentJobGroup.isClosed()) { currentJobGroup = new JobManagerJobMetricGroup(registry, this, jobId, jobName); jobs.put(jobId, currentJobGroup); } return currentJobGroup; } else { return null; } } }
public JobManagerJobMetricGroup addJob(JobGraph job) { JobID jobId = job.getJobID(); String jobName = job.getName(); // get or create a jobs metric group JobManagerJobMetricGroup currentJobGroup; synchronized (this) { if (!isClosed()) { currentJobGroup = jobs.get(jobId); if (currentJobGroup == null || currentJobGroup.isClosed()) { currentJobGroup = new JobManagerJobMetricGroup(registry, this, jobId, jobName); jobs.put(jobId, currentJobGroup); } return currentJobGroup; } else { return null; } } }
public JobManagerJobMetricGroup addJob(JobGraph job) { JobID jobId = job.getJobID(); String jobName = job.getName(); // get or create a jobs metric group JobManagerJobMetricGroup currentJobGroup; synchronized (this) { if (!isClosed()) { currentJobGroup = jobs.get(jobId); if (currentJobGroup == null || currentJobGroup.isClosed()) { currentJobGroup = new JobManagerJobMetricGroup(registry, this, jobId, jobName); jobs.put(jobId, currentJobGroup); } return currentJobGroup; } else { return null; } } }
public JobManagerJobMetricGroup addJob(JobGraph job) { JobID jobId = job.getJobID(); String jobName = job.getName(); // get or create a jobs metric group JobManagerJobMetricGroup currentJobGroup; synchronized (this) { if (!isClosed()) { currentJobGroup = jobs.get(jobId); if (currentJobGroup == null || currentJobGroup.isClosed()) { currentJobGroup = new JobManagerJobMetricGroup(registry, this, jobId, jobName); jobs.put(jobId, currentJobGroup); } return currentJobGroup; } else { return null; } } }
/**
 * Submits the given job.
 *
 * <p>The submission is rejected with a {@code JobSubmissionException} when the running jobs
 * registry reports the job as {@code DONE} or a runner is already registered for its ID.
 * Otherwise the job is persisted and run, and the returned future is completed with an
 * acknowledgement. Any exception thrown along the way is wrapped in a
 * {@code FlinkException} and returned as an exceptionally completed future.
 *
 * @param jobGraph the job to submit
 * @param timeout not read by this method body
 * @return a future holding the acknowledgement or the failure
 */
@Override
public CompletableFuture<Acknowledge> submitJob(JobGraph jobGraph, Time timeout) {
    final JobID jobId = jobGraph.getJobID();

    log.info("Submitting job {} ({}).", jobId, jobGraph.getName());

    try {
        final RunningJobsRegistry.JobSchedulingStatus status =
                runningJobsRegistry.getJobSchedulingStatus(jobId);

        final boolean alreadySubmitted =
                status == RunningJobsRegistry.JobSchedulingStatus.DONE
                        || jobManagerRunners.containsKey(jobId);

        if (!alreadySubmitted) {
            persistAndRunJob(jobGraph);
            return CompletableFuture.completedFuture(Acknowledge.get());
        }

        final String rejection =
                String.format("Job has already been submitted and is in state %s.", status);
        return FutureUtils.completedExceptionally(new JobSubmissionException(jobId, rejection));
    } catch (Exception e) {
        return FutureUtils.completedExceptionally(
                new FlinkException(String.format("Failed to submit job %s.", jobId), e));
    }
}
/**
 * Starts the execution of the job under the given job master id.
 *
 * <p>After {@code validateRunsInMainThread()} and the null check, the method compares the
 * current fencing token with the given id. If they are equal it only logs that execution
 * was already started. Otherwise it installs the id as the new fencing token, starts the
 * job master services, and resets and schedules the execution graph.
 *
 * @param newJobMasterId the id to install as fencing token; must not be {@code null}
 * @return an acknowledgement, in both the already-started and the freshly-started case
 * @throws Exception as declared; the throwing callee is not visible from this method
 */
private Acknowledge startJobExecution(JobMasterId newJobMasterId) throws Exception {
    validateRunsInMainThread();
    checkNotNull(newJobMasterId, "The new JobMasterId must not be null.");

    final boolean alreadyStarted = Objects.equals(getFencingToken(), newJobMasterId);

    if (alreadyStarted) {
        log.info("Already started the job execution with JobMasterId {}.", newJobMasterId);
    } else {
        setNewFencingToken(newJobMasterId);
        startJobMasterServices();

        log.info("Starting execution of job {} ({})", jobGraph.getName(), jobGraph.getJobID());
        resetAndScheduleExecutionGraph();
    }

    return Acknowledge.get();
}
/** * Suspend the job and shutdown all other services including rpc. */ @Override public CompletableFuture<Void> postStop() { log.info("Stopping the JobMaster for job {}({}).", jobGraph.getName(), jobGraph.getJobID()); // disconnect from all registered TaskExecutors final Set<ResourceID> taskManagerResourceIds = new HashSet<>(registeredTaskManagers.keySet()); final FlinkException cause = new FlinkException("Stopping JobMaster for job " + jobGraph.getName() + '(' + jobGraph.getJobID() + ")."); for (ResourceID taskManagerResourceId : taskManagerResourceIds) { disconnectTaskManager(taskManagerResourceId, cause); } taskManagerHeartbeatManager.stop(); resourceManagerHeartbeatManager.stop(); // make sure there is a graceful exit suspendExecution(new FlinkException("JobManager is shutting down.")); // shut down will internally release all registered slots slotPool.shutDown(); final CompletableFuture<Void> disposeInternalSavepointFuture; if (lastInternalSavepoint != null) { disposeInternalSavepointFuture = CompletableFuture.runAsync(() -> disposeSavepoint(lastInternalSavepoint)); } else { disposeInternalSavepointFuture = CompletableFuture.completedFuture(null); } final CompletableFuture<Void> slotPoolTerminationFuture = slotPool.getTerminationFuture(); return FutureUtils.completeAll(Arrays.asList(disposeInternalSavepointFuture, slotPoolTerminationFuture)); }
/**
 * Starts the execution of the job under the given job master id.
 *
 * <p>After {@code validateRunsInMainThread()} and the null check, the method compares the
 * current fencing token with the given id. If they are equal it only logs that execution
 * was already started. Otherwise it installs the id as the new fencing token, starts the
 * job master services, and resets and schedules the execution graph.
 *
 * @param newJobMasterId the id to install as fencing token; must not be {@code null}
 * @return an acknowledgement, in both the already-started and the freshly-started case
 * @throws Exception as declared; the throwing callee is not visible from this method
 */
private Acknowledge startJobExecution(JobMasterId newJobMasterId) throws Exception {
    validateRunsInMainThread();
    checkNotNull(newJobMasterId, "The new JobMasterId must not be null.");

    final boolean alreadyStarted = Objects.equals(getFencingToken(), newJobMasterId);

    if (alreadyStarted) {
        log.info("Already started the job execution with JobMasterId {}.", newJobMasterId);
    } else {
        setNewFencingToken(newJobMasterId);
        startJobMasterServices();

        log.info("Starting execution of job {} ({})", jobGraph.getName(), jobGraph.getJobID());
        resetAndScheduleExecutionGraph();
    }

    return Acknowledge.get();
}
/** * Suspend the job and shutdown all other services including rpc. */ @Override public CompletableFuture<Void> postStop() { log.info("Stopping the JobMaster for job {}({}).", jobGraph.getName(), jobGraph.getJobID()); // disconnect from all registered TaskExecutors final Set<ResourceID> taskManagerResourceIds = new HashSet<>(registeredTaskManagers.keySet()); final FlinkException cause = new FlinkException("Stopping JobMaster for job " + jobGraph.getName() + '(' + jobGraph.getJobID() + ")."); for (ResourceID taskManagerResourceId : taskManagerResourceIds) { disconnectTaskManager(taskManagerResourceId, cause); } taskManagerHeartbeatManager.stop(); resourceManagerHeartbeatManager.stop(); // make sure there is a graceful exit suspendExecution(new FlinkException("JobManager is shutting down.")); // shut down will internally release all registered slots slotPool.shutDown(); final CompletableFuture<Void> disposeInternalSavepointFuture; if (lastInternalSavepoint != null) { disposeInternalSavepointFuture = CompletableFuture.runAsync(() -> disposeSavepoint(lastInternalSavepoint)); } else { disposeInternalSavepointFuture = CompletableFuture.completedFuture(null); } final CompletableFuture<Void> slotPoolTerminationFuture = slotPool.getTerminationFuture(); return FutureUtils.completeAll(Arrays.asList(disposeInternalSavepointFuture, slotPoolTerminationFuture)); }
/**
 * Submits the given job.
 *
 * <p>If the scheduling status cannot be read from the running jobs registry, a future failed
 * with a {@code FlinkException} is returned. If the job is reported as {@code DONE} or a
 * runner future is already registered for its ID, the submission is rejected with a
 * {@code JobSubmissionException}. Otherwise the job is persisted and run via
 * {@code waitForTerminatingJobManager}; a failure of that step is logged and rethrown as a
 * {@code JobSubmissionException} wrapped in a {@code CompletionException}.
 *
 * @param jobGraph the job to submit
 * @param timeout not read by this method body
 * @return a future holding the acknowledgement or the failure
 */
@Override
public CompletableFuture<Acknowledge> submitJob(JobGraph jobGraph, Time timeout) {
    final JobID jobId = jobGraph.getJobID();

    log.info("Submitting job {} ({}).", jobId, jobGraph.getName());

    final RunningJobsRegistry.JobSchedulingStatus status;
    try {
        status = runningJobsRegistry.getJobSchedulingStatus(jobId);
    } catch (IOException e) {
        return FutureUtils.completedExceptionally(
                new FlinkException(
                        String.format("Failed to retrieve job scheduling status for job %s.", jobId), e));
    }

    final boolean alreadySubmitted =
            status == RunningJobsRegistry.JobSchedulingStatus.DONE
                    || jobManagerRunnerFutures.containsKey(jobId);

    if (alreadySubmitted) {
        final String rejection =
                String.format("Job has already been submitted and is in state %s.", status);
        return FutureUtils.completedExceptionally(new JobSubmissionException(jobId, rejection));
    }

    final CompletableFuture<Acknowledge> acknowledged =
            waitForTerminatingJobManager(jobId, jobGraph, this::persistAndRunJob)
                    .thenApply(ignored -> Acknowledge.get());

    return acknowledged.exceptionally(failure -> {
        // Report the underlying cause rather than the CompletionException wrapper.
        final Throwable cause = ExceptionUtils.stripCompletionException(failure);
        log.error("Failed to submit job {}.", jobId, cause);
        throw new CompletionException(
                new JobSubmissionException(jobId, "Failed to submit job.", cause));
    });
}
/**
 * Submits the given job.
 *
 * <p>If the scheduling status cannot be read from the running jobs registry, a future failed
 * with a {@code FlinkException} is returned. If the job is reported as {@code DONE} or a
 * runner future is already registered for its ID, the submission is rejected with a
 * {@code JobSubmissionException}. Otherwise the job is persisted and run via
 * {@code waitForTerminatingJobManager}; a failure of that step is logged and rethrown as a
 * {@code JobSubmissionException} wrapped in a {@code CompletionException}.
 *
 * @param jobGraph the job to submit
 * @param timeout not read by this method body
 * @return a future holding the acknowledgement or the failure
 */
@Override
public CompletableFuture<Acknowledge> submitJob(JobGraph jobGraph, Time timeout) {
    final JobID jobId = jobGraph.getJobID();

    log.info("Submitting job {} ({}).", jobId, jobGraph.getName());

    final RunningJobsRegistry.JobSchedulingStatus status;
    try {
        status = runningJobsRegistry.getJobSchedulingStatus(jobId);
    } catch (IOException e) {
        return FutureUtils.completedExceptionally(
                new FlinkException(
                        String.format("Failed to retrieve job scheduling status for job %s.", jobId), e));
    }

    final boolean alreadySubmitted =
            status == RunningJobsRegistry.JobSchedulingStatus.DONE
                    || jobManagerRunnerFutures.containsKey(jobId);

    if (alreadySubmitted) {
        final String rejection =
                String.format("Job has already been submitted and is in state %s.", status);
        return FutureUtils.completedExceptionally(new JobSubmissionException(jobId, rejection));
    }

    final CompletableFuture<Acknowledge> acknowledged =
            waitForTerminatingJobManager(jobId, jobGraph, this::persistAndRunJob)
                    .thenApply(ignored -> Acknowledge.get());

    return acknowledged.exceptionally(failure -> {
        // Report the underlying cause rather than the CompletionException wrapper.
        final Throwable cause = ExceptionUtils.stripCompletionException(failure);
        log.error("Failed to submit job {}.", jobId, cause);
        throw new CompletionException(
                new JobSubmissionException(jobId, "Failed to submit job.", cause));
    });
}
/**
 * Reacts to this runner's leadership being revoked.
 *
 * <p>When the runner has already been shut down, only that fact is logged. Otherwise the
 * revocation is logged, a new leader gateway future is set up, and the job master is asked
 * to suspend. If the suspend future completes exceptionally, the failure is passed to
 * {@code handleJobManagerRunnerError}; that callback runs on the shared scheduled executor.
 * The whole method body runs while holding {@code lock}.
 */
@Override
public void revokeLeadership() {
    synchronized (lock) {
        if (!shutdown) {
            log.info("JobManager for job {} ({}) was revoked leadership at {}.",
                    jobGraph.getName(), jobGraph.getJobID(), getAddress());

            setNewLeaderGatewayFuture();

            jobMaster
                    .suspend(new FlinkException("JobManager is no longer the leader."), rpcTimeout)
                    .whenCompleteAsync(
                            (ignoredAck, failure) -> {
                                if (failure != null) {
                                    handleJobManagerRunnerError(
                                            new FlinkException("Could not suspend the job manager.", failure));
                                }
                            },
                            jobManagerSharedServices.getScheduledExecutorService());
        } else {
            log.info("JobManagerRunner already shutdown.");
        }
    }
}
/**
 * Reacts to this runner's leadership being revoked.
 *
 * <p>When the runner has already been shut down, only that fact is logged. Otherwise the
 * revocation is logged, a new leader gateway future is set up, and the job master is asked
 * to suspend. If the suspend future completes exceptionally, the failure is passed to
 * {@code handleJobManagerRunnerError}; that callback runs on the shared scheduled executor.
 * The whole method body runs while holding {@code lock}.
 */
@Override
public void revokeLeadership() {
    synchronized (lock) {
        if (!shutdown) {
            log.info("JobManager for job {} ({}) was revoked leadership at {}.",
                    jobGraph.getName(), jobGraph.getJobID(), getAddress());

            setNewLeaderGatewayFuture();

            jobMaster
                    .suspend(new FlinkException("JobManager is no longer the leader."), rpcTimeout)
                    .whenCompleteAsync(
                            (ignoredAck, failure) -> {
                                if (failure != null) {
                                    handleJobManagerRunnerError(
                                            new FlinkException("Could not suspend the job manager.", failure));
                                }
                            },
                            jobManagerSharedServices.getScheduledExecutorService());
        } else {
            log.info("JobManagerRunner already shutdown.");
        }
    }
}
/**
 * Executes a job in detached mode. The method returns once the job runners for the job
 * have been started; nothing in this method waits for a job result.
 *
 * <p>The mini cluster must not be shut down and must not already hold job runners,
 * otherwise the state checks fail.
 *
 * @param job The Flink job to execute
 *
 * @throws JobExecutionException Thrown if anything went amiss during initial job launch,
 *         or if the job terminally failed.
 */
public void runDetached(JobGraph job) throws JobExecutionException {
    checkNotNull(job);

    LOG.info("Received job for detached execution: {} ({})", job.getName(), job.getJobID());

    synchronized (lock) {
        checkState(!shutdown, "mini cluster is shut down");
        checkState(runners == null, "mini cluster can only execute one job at a time");

        // The same finalizer instance is handed to startJobRunners for both of its
        // trailing arguments.
        final DetachedFinalizer detachedFinalizer =
                new DetachedFinalizer(job.getJobID(), numJobManagers);

        this.runners = startJobRunners(job, detachedFinalizer, detachedFinalizer);
    }
}
/**
 * Checks the job's scheduling status and, unless the job is already done, starts the job
 * master for the given leader session.
 *
 * <p>If the registry reports the job as {@code DONE}, this is logged and
 * {@code jobFinishedByOther()} is called. Otherwise the job is marked as running in the
 * registry and the job master is started. When the start future completes, either the
 * failure is passed to {@code handleJobManagerRunnerError} or the leader session id is
 * confirmed against the leader gateway future captured right after the start call.
 *
 * @param leaderSessionId the session id of the granted leadership
 * @throws Exception as declared; the throwing callees are not visible from this method
 */
private void verifyJobSchedulingStatusAndStartJobManager(UUID leaderSessionId) throws Exception {
    final JobSchedulingStatus status =
            runningJobsRegistry.getJobSchedulingStatus(jobGraph.getJobID());

    if (status == JobSchedulingStatus.DONE) {
        log.info("Granted leader ship but job {} has been finished. ", jobGraph.getJobID());
        jobFinishedByOther();
        return;
    }

    log.info("JobManager runner for job {} ({}) was granted leadership with session id {} at {}.",
            jobGraph.getName(), jobGraph.getJobID(), leaderSessionId, getAddress());

    runningJobsRegistry.setJobRunning(jobGraph.getJobID());

    final CompletableFuture<Acknowledge> started =
            jobMaster.start(new JobMasterId(leaderSessionId), rpcTimeout);

    // Snapshot the field now; the callback below uses this snapshot, not the field.
    final CompletableFuture<JobMasterGateway> gatewayFutureAtStart = leaderGatewayFuture;

    started.whenCompleteAsync(
            (ignoredAck, failure) -> {
                if (failure == null) {
                    confirmLeaderSessionIdIfStillLeader(leaderSessionId, gatewayFutureAtStart);
                } else {
                    handleJobManagerRunnerError(
                            new FlinkException("Could not start the job manager.", failure));
                }
            },
            jobManagerSharedServices.getScheduledExecutorService());
}
/**
 * Checks the job's scheduling status and, unless the job is already done, starts the job
 * master for the given leader session.
 *
 * <p>If the registry reports the job as {@code DONE}, this is logged and
 * {@code jobFinishedByOther()} is called. Otherwise the job is marked as running in the
 * registry and the job master is started. When the start future completes, either the
 * failure is passed to {@code handleJobManagerRunnerError} or the leader session id is
 * confirmed against the leader gateway future captured right after the start call.
 *
 * @param leaderSessionId the session id of the granted leadership
 * @throws Exception as declared; the throwing callees are not visible from this method
 */
private void verifyJobSchedulingStatusAndStartJobManager(UUID leaderSessionId) throws Exception {
    final JobSchedulingStatus status =
            runningJobsRegistry.getJobSchedulingStatus(jobGraph.getJobID());

    if (status == JobSchedulingStatus.DONE) {
        log.info("Granted leader ship but job {} has been finished. ", jobGraph.getJobID());
        jobFinishedByOther();
        return;
    }

    log.info("JobManager runner for job {} ({}) was granted leadership with session id {} at {}.",
            jobGraph.getName(), jobGraph.getJobID(), leaderSessionId, getAddress());

    runningJobsRegistry.setJobRunning(jobGraph.getJobID());

    final CompletableFuture<Acknowledge> started =
            jobMaster.start(new JobMasterId(leaderSessionId), rpcTimeout);

    // Snapshot the field now; the callback below uses this snapshot, not the field.
    final CompletableFuture<JobMasterGateway> gatewayFutureAtStart = leaderGatewayFuture;

    started.whenCompleteAsync(
            (ignoredAck, failure) -> {
                if (failure == null) {
                    confirmLeaderSessionIdIfStillLeader(leaderSessionId, gatewayFutureAtStart);
                } else {
                    handleJobManagerRunnerError(
                            new FlinkException("Could not start the job manager.", failure));
                }
            },
            jobManagerSharedServices.getScheduledExecutorService());
}