/**
 * Builds a new {@link JobManagerRunner} from the supplied services.
 *
 * <p>This factory adds no logic of its own: every argument is handed to the
 * {@code JobManagerRunner} constructor unchanged and in the same order.
 *
 * @return the freshly constructed runner
 * @throws Exception if constructing the runner throws
 */
@Override
public JobManagerRunner createJobManagerRunner(
        ResourceID resourceId,
        JobGraph jobGraph,
        Configuration configuration,
        RpcService rpcService,
        HighAvailabilityServices highAvailabilityServices,
        HeartbeatServices heartbeatServices,
        BlobServer blobServer,
        JobManagerSharedServices jobManagerServices,
        JobManagerJobMetricGroupFactory jobManagerJobMetricGroupFactory,
        FatalErrorHandler fatalErrorHandler) throws Exception {

    final JobManagerRunner runner = new JobManagerRunner(
        resourceId,
        jobGraph,
        configuration,
        rpcService,
        highAvailabilityServices,
        heartbeatServices,
        blobServer,
        jobManagerServices,
        jobManagerJobMetricGroupFactory,
        fatalErrorHandler);

    return runner;
}
}
/**
 * Job completion notification triggered by JobManager.
 *
 * <p>Unregisters the job from high availability and shuts this runner down
 * internally. The completion listener {@code toNotifyOnComplete}, if one is
 * set, is notified from a {@code finally} block, so it receives the result
 * even when one of the cleanup steps throws.
 *
 * @param result the execution result forwarded to {@code toNotifyOnComplete}
 */
@Override public void jobFinished(JobExecutionResult result) { try { unregisterJobFromHighAvailability(); shutdownInternally(); } finally { if (toNotifyOnComplete != null) { toNotifyOnComplete.jobFinished(result); } } }
/**
 * Leadership callback: checks the job's scheduling status and starts the
 * job manager under the given leader session.
 *
 * <p>Runs entirely under {@code lock}. If the runner has already been shut
 * down the grant is only logged. Any exception raised while starting is
 * routed to {@code handleJobManagerRunnerError} instead of being rethrown.
 *
 * @param leaderSessionID session id of the newly granted leadership
 */
@Override
public void grantLeadership(final UUID leaderSessionID) {
    synchronized (lock) {
        if (!shutdown) {
            try {
                verifyJobSchedulingStatusAndStartJobManager(leaderSessionID);
            } catch (Exception startFailure) {
                handleJobManagerRunnerError(startFailure);
            }
        } else {
            log.info("JobManagerRunner already shutdown.");
        }
    }
}
private void verifyJobSchedulingStatusAndStartJobManager(UUID leaderSessionId) throws Exception { final JobSchedulingStatus jobSchedulingStatus = runningJobsRegistry.getJobSchedulingStatus(jobID); if (jobSchedulingStatus == JobSchedulingStatus.DONE) { log.info("Granted leader ship but job {} has been finished. ", jobID); jobFinishedByOther(); } else { log.info("JobManager runner for job {} ({}) was granted leadership with session id {} at {}.", jobName, jobID, leaderSessionId, getAddress()); if (jobSchedulingStatus == JobSchedulingStatus.RUNNING) { // If finding the job status is running, it means someone has already started the job, need recover. jobMaster.reconcile(); } else if (jobSchedulingStatus == JobSchedulingStatus.PENDING) { runningJobsRegistry.setJobRunning(jobID); } final CompletableFuture<Acknowledge> startFuture = jobMaster.start(new JobMasterId(leaderSessionId), rpcTimeout); final CompletableFuture<JobMasterGateway> currentLeaderGatewayFuture = leaderGatewayFuture; startFuture.whenCompleteAsync( (Acknowledge ack, Throwable throwable) -> { if (throwable != null) { handleJobManagerRunnerError(new FlinkException("Could not start the job manager.", throwable)); } else { confirmLeaderSessionIdIfStillLeader(leaderSessionId, currentLeaderGatewayFuture); } }, jobManagerSharedServices.getScheduledExecutorService()); } }
/**
 * Leadership-revoked callback: installs a fresh leader gateway future and
 * suspends the job master.
 *
 * <p>Runs under {@code lock}; if the runner is already shut down the call is
 * only logged. When the suspension completes successfully the
 * {@code leaderShipLostHandler} is informed; when it fails the error is
 * passed to {@code handleJobManagerRunnerError}.
 */
@Override
public void revokeLeadership() {
    synchronized (lock) {
        if (!shutdown) {
            log.info("JobManager for job {} ({}) was revoked leadership at {}.", jobName, jobID, getAddress());

            setNewLeaderGatewayFuture();

            final FlinkException suspensionCause = new FlinkException("JobManager is no longer the leader.");
            final CompletableFuture<Acknowledge> jobMasterSuspended = jobMaster.suspend(suspensionCause, rpcTimeout);

            jobMasterSuspended.whenCompleteAsync(
                (Acknowledge ack, Throwable failure) -> {
                    if (failure == null) {
                        leaderShipLostHandler.onLeaderShipLost(new Exception("Job manager runner was revoked leader ship."));
                    } else {
                        handleJobManagerRunnerError(new FlinkException("Could not suspend the job manager.", failure));
                    }
                },
                jobManagerSharedServices.getScheduledExecutorService());
        } else {
            log.info("JobManagerRunner already shutdown.");
        }
    }
}
private JobManagerRunner startJobManagerRunner(JobManagerRunner jobManagerRunner) throws Exception { final JobID jobId = jobManagerRunner.getJobGraph().getJobID(); jobManagerRunner.getResultFuture().whenCompleteAsync( (ArchivedExecutionGraph archivedExecutionGraph, Throwable throwable) -> { // check if we are still the active JobManagerRunner by checking the identity //noinspection ObjectEquality if (jobManagerRunner == jobManagerRunnerFutures.get(jobId).getNow(null)) { if (archivedExecutionGraph != null) { jobReachedGloballyTerminalState(archivedExecutionGraph); } else { final Throwable strippedThrowable = ExceptionUtils.stripCompletionException(throwable); if (strippedThrowable instanceof JobNotFinishedException) { jobNotFinished(jobId); } else { jobMasterFailed(jobId, strippedThrowable); } } } else { log.debug("There is a newer JobManagerRunner for the job {}.", jobId); } }, getMainThreadExecutor()); jobManagerRunner.start(); return jobManagerRunner; }
jobGraph.getName(), jobGraph.getJobID(), leaderSessionID, getAddress()); onFatalError(t); return; jobFinishedByOther(); return; onFatalError(new Exception("Could not start the job manager.", e));
for (int i = 0; i < numJobManagers; i++) { try { runners[i] = new JobManagerRunner( ResourceID.generate(), job, onCompletion, errorHandler); runners[i].start(); try { if (runners[i] != null) { runners[i].shutdown();
/**
 * Confirms the leader session id, but only while this runner still holds
 * leadership for it.
 *
 * <p>When leadership is still held, the supplied gateway future is completed
 * with the job master's self gateway first, and the session id is confirmed
 * with the leader election service afterwards. Otherwise the request is
 * dropped with a debug message.
 *
 * @param leaderSessionId session id to confirm
 * @param currentLeaderGatewayFuture gateway future to complete on confirmation
 */
private void confirmLeaderSessionIdIfStillLeader(UUID leaderSessionId, CompletableFuture<JobMasterGateway> currentLeaderGatewayFuture) {
    if (!leaderElectionService.hasLeadership(leaderSessionId)) {
        log.debug("Ignoring confirmation of leader session id because {} is no longer the leader.", getAddress());
        return;
    }

    final JobMasterGateway selfGateway = jobMaster.getSelfGateway(JobMasterGateway.class);
    currentLeaderGatewayFuture.complete(selfGateway);
    leaderElectionService.confirmLeaderSessionID(leaderSessionId);
}
/**
 * Job master failure notification.
 *
 * <p>Delegates directly to {@code handleJobManagerRunnerError}; no other
 * handling happens here.
 *
 * @param cause the failure reported for the job master
 */
@Override public void jobMasterFailed(Throwable cause) { handleJobManagerRunnerError(cause); }
/** * Job completion notification triggered by JobManager. */ @Override public void jobReachedGloballyTerminalState(ArchivedExecutionGraph executionGraph) { unregisterJobFromHighAvailability(); // complete the result future with the terminal execution graph resultFuture.complete(executionGraph); }
/**
 * Creates a {@code JobManagerRunner} for the given job graph, starts it and
 * registers it in {@code jobManagerRunners} under the job's id.
 *
 * <p>A state check first rejects the call when a runner is already registered
 * for that job id (presumably with an {@code IllegalStateException}; confirm
 * against {@code Preconditions.checkState}). The runner is put into the map
 * only after {@code start()} has returned, so a runner whose start throws is
 * never registered.
 *
 * @param jobGraph the job to run
 * @throws Exception if creating or starting the runner throws
 */
private void runJob(JobGraph jobGraph) throws Exception { Preconditions.checkState(!jobManagerRunners.containsKey(jobGraph.getJobID())); final JobManagerRunner jobManagerRunner = createJobManagerRunner(jobGraph); jobManagerRunner.start(); jobManagerRunners.put(jobGraph.getJobID(), jobManagerRunner); }
shutdown = true; setNewLeaderGatewayFuture(); leaderGatewayFuture.completeExceptionally(new FlinkException("JobMaster has been shut down."));
/**
 * Returns a future for the result of the given job.
 *
 * <p>If a runner is registered for the job, the returned future is derived
 * from the runner's result future. Otherwise the archived execution graph
 * store is consulted: an archived graph yields an already completed future,
 * and a missing one yields a future failed with
 * {@code FlinkJobNotFoundException}.
 *
 * @param jobId id of the job whose result is requested
 * @param timeout not used by this implementation
 * @return future holding the {@code JobResult}, or failed if the job is unknown
 */
@Override
public CompletableFuture<JobResult> requestJobResult(JobID jobId, Time timeout) {
    final JobManagerRunner activeRunner = jobManagerRunners.get(jobId);

    if (activeRunner != null) {
        // The job is still managed by a runner: wait for its result.
        return activeRunner.getResultFuture().thenApply(JobResult::createFrom);
    }

    final ArchivedExecutionGraph archivedGraph = archivedExecutionGraphStore.get(jobId);

    if (archivedGraph != null) {
        return CompletableFuture.completedFuture(JobResult.createFrom(archivedGraph));
    }

    return FutureUtils.completedExceptionally(new FlinkJobNotFoundException(jobId));
}
/**
 * Looks up the job's scheduling status in the running-jobs registry and,
 * unless the job is already done, marks it as running and starts the job
 * master for the given leader session.
 *
 * <p>A {@code DONE} status only logs and calls {@code jobFinishedByOther()}.
 * Otherwise, once the start future completes, the leader session id is
 * confirmed (if leadership is still held) or the failure is reported via
 * {@code handleJobManagerRunnerError}.
 *
 * @param leaderSessionId session id of the granted leadership
 * @throws Exception if the registry access or the start sequence throws
 */
private void verifyJobSchedulingStatusAndStartJobManager(UUID leaderSessionId) throws Exception {
    final JobSchedulingStatus status = runningJobsRegistry.getJobSchedulingStatus(jobGraph.getJobID());

    if (status == JobSchedulingStatus.DONE) {
        log.info("Granted leader ship but job {} has been finished. ", jobGraph.getJobID());
        jobFinishedByOther();
        return;
    }

    log.info("JobManager runner for job {} ({}) was granted leadership with session id {} at {}.",
        jobGraph.getName(), jobGraph.getJobID(), leaderSessionId, getAddress());

    runningJobsRegistry.setJobRunning(jobGraph.getJobID());

    final CompletableFuture<Acknowledge> jobMasterStarted =
        jobMaster.start(new JobMasterId(leaderSessionId), rpcTimeout);

    // Capture the gateway future that is current right now; the field may be
    // replaced before the asynchronous callback below runs.
    final CompletableFuture<JobMasterGateway> gatewayFutureAtGrant = leaderGatewayFuture;

    jobMasterStarted.whenCompleteAsync(
        (Acknowledge ack, Throwable failure) -> {
            if (failure == null) {
                confirmLeaderSessionIdIfStillLeader(leaderSessionId, gatewayFutureAtGrant);
            } else {
                handleJobManagerRunnerError(new FlinkException("Could not start the job manager.", failure));
            }
        },
        jobManagerSharedServices.getScheduledExecutorService());
}
/**
 * Leadership-revoked callback: installs a fresh leader gateway future and
 * suspends the job master.
 *
 * <p>Runs under {@code lock}; if the runner is already shut down the call is
 * only logged. A failed suspension is passed to
 * {@code handleJobManagerRunnerError}; a successful one needs no follow-up.
 */
@Override
public void revokeLeadership() {
    synchronized (lock) {
        if (!shutdown) {
            log.info("JobManager for job {} ({}) was revoked leadership at {}.",
                jobGraph.getName(), jobGraph.getJobID(), getAddress());

            setNewLeaderGatewayFuture();

            final FlinkException suspensionCause = new FlinkException("JobManager is no longer the leader.");
            final CompletableFuture<Acknowledge> jobMasterSuspended = jobMaster.suspend(suspensionCause, rpcTimeout);

            jobMasterSuspended.whenCompleteAsync(
                (Acknowledge ack, Throwable failure) -> {
                    if (failure == null) {
                        return;
                    }
                    handleJobManagerRunnerError(new FlinkException("Could not suspend the job manager.", failure));
                },
                jobManagerSharedServices.getScheduledExecutorService());
        } else {
            log.info("JobManagerRunner already shutdown.");
        }
    }
}
private JobManagerRunner startJobManagerRunner(JobManagerRunner jobManagerRunner) throws Exception { final JobID jobId = jobManagerRunner.getJobGraph().getJobID(); jobManagerRunner.getResultFuture().whenCompleteAsync( (ArchivedExecutionGraph archivedExecutionGraph, Throwable throwable) -> { // check if we are still the active JobManagerRunner by checking the identity //noinspection ObjectEquality if (jobManagerRunner == jobManagerRunnerFutures.get(jobId).getNow(null)) { if (archivedExecutionGraph != null) { jobReachedGloballyTerminalState(archivedExecutionGraph); } else { final Throwable strippedThrowable = ExceptionUtils.stripCompletionException(throwable); if (strippedThrowable instanceof JobNotFinishedException) { jobNotFinished(jobId); } else { jobMasterFailed(jobId, strippedThrowable); } } } else { log.debug("There is a newer JobManagerRunner for the job {}.", jobId); } }, getMainThreadExecutor()); jobManagerRunner.start(); return jobManagerRunner; }
/**
 * Confirms the leader session id, provided the leader election service
 * still reports leadership for it.
 *
 * <p>On confirmation the supplied gateway future is completed with the job
 * master's self gateway before the session id is confirmed. If leadership
 * was lost in the meantime, only a debug message is logged.
 *
 * @param leaderSessionId session id to confirm
 * @param currentLeaderGatewayFuture gateway future to complete on confirmation
 */
private void confirmLeaderSessionIdIfStillLeader(UUID leaderSessionId, CompletableFuture<JobMasterGateway> currentLeaderGatewayFuture) {
    final boolean stillLeader = leaderElectionService.hasLeadership(leaderSessionId);

    if (!stillLeader) {
        log.debug("Ignoring confirmation of leader session id because {} is no longer the leader.", getAddress());
        return;
    }

    currentLeaderGatewayFuture.complete(jobMaster.getSelfGateway(JobMasterGateway.class));
    leaderElectionService.confirmLeaderSessionID(leaderSessionId);
}
/**
 * Job master failure notification.
 *
 * <p>Delegates directly to {@code handleJobManagerRunnerError}; no other
 * handling happens here.
 *
 * @param cause the failure reported for the job master
 */
@Override public void jobMasterFailed(Throwable cause) { handleJobManagerRunnerError(cause); }
/** * Job completion notification triggered by JobManager. */ @Override public void jobReachedGloballyTerminalState(ArchivedExecutionGraph executionGraph) { unregisterJobFromHighAvailability(); // complete the result future with the terminal execution graph resultFuture.complete(executionGraph); }