/** * Fetches framework/worker information persisted by a prior incarnation of the RM. */ private CompletableFuture<List<MesosWorkerStore.Worker>> getWorkersAsync() { // if this resource manager is recovering from failure, // then some worker tasks are most likely still alive and we can re-obtain them return CompletableFuture.supplyAsync(() -> { try { final List<MesosWorkerStore.Worker> tasksFromPreviousAttempts = workerStore.recoverWorkers(); for (final MesosWorkerStore.Worker worker : tasksFromPreviousAttempts) { if (worker.state() == MesosWorkerStore.WorkerState.New) { // remove new workers because allocation requests are transient workerStore.removeWorker(worker.taskID()); } } return tasksFromPreviousAttempts; } catch (final Exception e) { throw new CompletionException(new ResourceManagerException(e)); } }, getRpcService().getExecutor()); }
this.workerStore = mesosServices.createMesosWorkerStore(flinkConfig, getRpcService().getExecutor()); workerStore.start(); } catch (Exception e) {
/**
 * Unregisters the job manager runner future of the given job, closes the runner
 * (if one was registered) and afterwards cleans up the job's data.
 *
 * @param jobId     id of the job to remove
 * @param cleanupHA forwarded unchanged to {@code cleanUpJobData}
 * @return future that completes once the clean-up has run on the RPC service's executor
 */
private CompletableFuture<Void> removeJob(JobID jobId, boolean cleanupHA) {
    final CompletableFuture<JobManagerRunner> runnerFuture = jobManagerRunnerFutures.remove(jobId);

    // Nothing to close when no runner future was registered for this job.
    final CompletableFuture<Void> terminationFuture = runnerFuture == null
        ? CompletableFuture.<Void>completedFuture(null)
        : runnerFuture.thenCompose(JobManagerRunner::closeAsync);

    return terminationFuture.thenRunAsync(
        () -> cleanUpJobData(jobId, cleanupHA),
        getRpcService().getExecutor());
}
/**
 * Unregisters the job manager runner future of the given job, closes the runner
 * (if one was registered) and afterwards cleans up the job's data.
 *
 * @param jobId     id of the job to remove
 * @param cleanupHA forwarded unchanged to {@code cleanUpJobData}
 * @return future that completes once the clean-up has run on the RPC service's executor
 */
private CompletableFuture<Void> removeJob(JobID jobId, boolean cleanupHA) {
    final CompletableFuture<JobManagerRunner> runnerFuture = jobManagerRunnerFutures.remove(jobId);

    // Nothing to close when no runner future was registered for this job.
    final CompletableFuture<Void> terminationFuture = runnerFuture == null
        ? CompletableFuture.<Void>completedFuture(null)
        : runnerFuture.thenCompose(JobManagerRunner::closeAsync);

    return terminationFuture.thenRunAsync(
        () -> cleanUpJobData(jobId, cleanupHA),
        getRpcService().getExecutor());
}
/**
 * Asynchronously recovers every job whose id is listed by the submitted job graph store.
 *
 * <p>The ids are read from the store and each one is passed to {@code recoverJob};
 * the results are collected in iteration order. The work runs on the RPC service's
 * executor.
 *
 * @return future holding the recovered job graphs
 */
@VisibleForTesting
CompletableFuture<Collection<JobGraph>> recoverJobs() {
    log.info("Recovering all persisted jobs.");

    return FutureUtils.supplyAsync(
        () -> {
            final Collection<JobID> persistedJobIds = submittedJobGraphStore.getJobIds();
            final List<JobGraph> recoveredGraphs = new ArrayList<>(persistedJobIds.size());

            for (JobID persistedJobId : persistedJobIds) {
                final JobGraph recoveredGraph = recoverJob(persistedJobId);
                recoveredGraphs.add(recoveredGraph);
            }

            return recoveredGraphs;
        },
        getRpcService().getExecutor());
}
register(result, 1, initialRegistrationTimeout); }, rpcService.getExecutor()); }, rpcService.getExecutor());
/**
 * Asynchronously creates a {@code JobManagerRunner} for the given job graph and
 * starts it once it has been created.
 *
 * <p>The runner is built by {@code jobManagerRunnerFactory} on the RPC service's
 * executor; the result is then handed to {@code startJobManagerRunner}.
 *
 * @param jobGraph job graph the runner is created for
 * @return future holding the value returned by {@code startJobManagerRunner}
 */
private CompletableFuture<JobManagerRunner> createJobManagerRunner(JobGraph jobGraph) {
    final RpcService rpc = getRpcService();

    final CompletableFuture<JobManagerRunner> createdRunnerFuture =
        CompletableFuture.supplyAsync(
            CheckedSupplier.unchecked(
                () -> jobManagerRunnerFactory.createJobManagerRunner(
                    ResourceID.generate(),
                    jobGraph,
                    configuration,
                    rpc,
                    highAvailabilityServices,
                    heartbeatServices,
                    blobServer,
                    jobManagerSharedServices,
                    new DefaultJobManagerJobMetricGroupFactory(jobManagerMetricGroup),
                    fatalErrorHandler)),
            rpc.getExecutor());

    return createdRunnerFuture.thenApply(
        FunctionUtils.uncheckedFunction(this::startJobManagerRunner));
}
/**
 * Unregisters the job manager runner of the given job, closes it (if one was
 * registered) and afterwards cleans up the job's data.
 *
 * <p>The clean-up runs on the RPC service's executor after the runner has terminated.
 * It removes the job from the job manager metric group and cleans up the job's blobs.
 * If {@code cleanupHA} is set, it also removes the job graph from the submitted job
 * graph store and clears the job from the running jobs registry. Failures of these two
 * HA clean-up steps are logged as warnings (including the causing exception) and do
 * not fail the returned future.
 *
 * @param jobId     id of the job to remove
 * @param cleanupHA whether the job should also be removed from the submitted job graph
 *                  store and the running jobs registry; also forwarded to the blob server
 * @return future that completes once the clean-up has run
 */
private CompletableFuture<Void> removeJob(JobID jobId, boolean cleanupHA) {
    JobManagerRunner jobManagerRunner = jobManagerRunners.remove(jobId);

    final CompletableFuture<Void> jobManagerRunnerTerminationFuture;
    if (jobManagerRunner != null) {
        jobManagerRunnerTerminationFuture = jobManagerRunner.closeAsync();
    } else {
        // no runner registered for this job, nothing to wait for
        jobManagerRunnerTerminationFuture = CompletableFuture.completedFuture(null);
    }

    return jobManagerRunnerTerminationFuture.thenRunAsync(
        () -> {
            jobManagerMetricGroup.removeJob(jobId);
            blobServer.cleanupJob(jobId, cleanupHA);

            if (cleanupHA) {
                try {
                    submittedJobGraphStore.removeJobGraph(jobId);
                } catch (Exception e) {
                    // best effort: keep going, but do not lose the cause of the failure
                    log.warn("Could not properly remove job {} from submitted job graph store.", jobId, e);
                }

                try {
                    runningJobsRegistry.clearJob(jobId);
                } catch (IOException e) {
                    // best effort: keep going, but do not lose the cause of the failure
                    log.warn("Could not properly remove job {} from the running jobs registry.", jobId, e);
                }
            }
        },
        getRpcService().getExecutor());
}
/**
 * Asynchronously creates a {@code JobManagerRunner} for the given job graph and
 * starts it once it has been created.
 *
 * <p>The runner is built by {@code jobManagerRunnerFactory} on the RPC service's
 * executor; the result is then handed to {@code startJobManagerRunner}.
 *
 * @param jobGraph job graph the runner is created for
 * @return future holding the value returned by {@code startJobManagerRunner}
 */
private CompletableFuture<JobManagerRunner> createJobManagerRunner(JobGraph jobGraph) {
    final RpcService rpc = getRpcService();

    final CompletableFuture<JobManagerRunner> createdRunnerFuture =
        CompletableFuture.supplyAsync(
            CheckedSupplier.unchecked(
                () -> jobManagerRunnerFactory.createJobManagerRunner(
                    ResourceID.generate(),
                    jobGraph,
                    configuration,
                    rpc,
                    highAvailabilityServices,
                    heartbeatServices,
                    blobServer,
                    jobManagerSharedServices,
                    new DefaultJobManagerJobMetricGroupFactory(jobManagerMetricGroup),
                    fatalErrorHandler)),
            rpc.getExecutor());

    return createdRunnerFuture.thenApply(
        FunctionUtils.uncheckedFunction(this::startJobManagerRunner));
}
/**
 * Registers an info message listener with this resource manager.
 *
 * <p>A listener whose address is already registered is only reported with a warning.
 * Otherwise a connection to the address is opened; once the gateway is available it
 * is stored under the address via the main thread executor. If that fails, a warning
 * is logged and nothing is registered.
 *
 * @param address address of infoMessage listener to register to this resource manager
 */
@RpcMethod
public void registerInfoMessageListener(final String address) {
    if (infoMessageListeners.containsKey(address)) {
        log.warn("Receive a duplicate registration from info message listener on ({})", address);
        return;
    }

    final Future<InfoMessageListenerRpcGateway> gatewayFuture =
        getRpcService().connect(address, InfoMessageListenerRpcGateway.class);

    // Store the gateway from the main thread executor once the connection is established.
    final Future<Void> registrationFuture = gatewayFuture.thenAcceptAsync(
        new AcceptFunction<InfoMessageListenerRpcGateway>() {
            @Override
            public void accept(InfoMessageListenerRpcGateway gateway) {
                log.info("Receive a registration from info message listener on ({})", address);
                infoMessageListeners.put(address, gateway);
            }
        },
        getMainThreadExecutor());

    // A failed connection or registration is only logged.
    registrationFuture.exceptionallyAsync(
        new ApplyFunction<Throwable, Void>() {
            @Override
            public Void apply(Throwable failure) {
                log.warn("Receive a registration from unreachable info message listener on ({})", address);
                return null;
            }
        },
        getRpcService().getExecutor());
}
getRpcService().getExecutor()); getRpcService().getExecutor()))) .orElse(CompletableFuture.completedFuture(null)), getUnfencedMainThreadExecutor());
/** * Callback method when current resourceManager is granted leadership. * * @param newLeaderSessionID unique leadershipID */ @Override public void grantLeadership(final UUID newLeaderSessionID) { final CompletableFuture<Boolean> acceptLeadershipFuture = clearStateFuture .thenComposeAsync((ignored) -> tryAcceptLeadership(newLeaderSessionID), getUnfencedMainThreadExecutor()); final CompletableFuture<Void> confirmationFuture = acceptLeadershipFuture.thenAcceptAsync( (acceptLeadership) -> { if (acceptLeadership) { // confirming the leader session ID might be blocking, leaderElectionService.confirmLeaderSessionID(newLeaderSessionID); } }, getRpcService().getExecutor()); confirmationFuture.whenComplete( (Void ignored, Throwable throwable) -> { if (throwable != null) { onFatalError(ExceptionUtils.stripCompletionException(throwable)); } }); }
/**
 * Callback invoked when this dispatcher is granted leadership.
 *
 * <p>The persisted jobs are recovered first. The recovered jobs and the new leader
 * session id are then passed to {@code tryAcceptLeadershipAndRunJobs} via the unfenced
 * main thread executor. If that call returns {@code true}, the leader session id is
 * confirmed on the RPC service's executor. Any failure along the way is reported through
 * {@code onFatalError}.
 *
 * @param newLeaderSessionID unique leadershipID
 */
@Override
public void grantLeadership(final UUID newLeaderSessionID) {
    log.info("Dispatcher {} was granted leadership with fencing token {}", getAddress(), newLeaderSessionID);

    final CompletableFuture<Collection<JobGraph>> recoveredJobsFuture = recoverJobs();

    final CompletableFuture<Boolean> leadershipAccepted = recoveredJobsFuture.thenApplyAsync(
        (Collection<JobGraph> jobsToRun) -> tryAcceptLeadershipAndRunJobs(newLeaderSessionID, jobsToRun),
        getUnfencedMainThreadExecutor());

    final CompletableFuture<Void> leadershipConfirmed = leadershipAccepted.thenAcceptAsync(
        (Boolean accepted) -> {
            if (!accepted) {
                return;
            }
            leaderElectionService.confirmLeaderSessionID(newLeaderSessionID);
        },
        getRpcService().getExecutor());

    leadershipConfirmed.whenComplete(
        (Void unused, Throwable failure) -> {
            if (failure == null) {
                return;
            }
            onFatalError(ExceptionUtils.stripCompletionException(failure));
        });
}
/** * Callback method when current resourceManager is granted leadership. * * @param newLeaderSessionID unique leadershipID */ @Override public void grantLeadership(final UUID newLeaderSessionID) { final CompletableFuture<Boolean> acceptLeadershipFuture = clearStateFuture .thenComposeAsync((ignored) -> tryAcceptLeadership(newLeaderSessionID), getUnfencedMainThreadExecutor()); final CompletableFuture<Void> confirmationFuture = acceptLeadershipFuture.thenAcceptAsync( (acceptLeadership) -> { if (acceptLeadership) { // confirming the leader session ID might be blocking, leaderElectionService.confirmLeaderSessionID(newLeaderSessionID); } }, getRpcService().getExecutor()); confirmationFuture.whenComplete( (Void ignored, Throwable throwable) -> { if (throwable != null) { onFatalError(ExceptionUtils.stripCompletionException(throwable)); } }); }
getRpcService().getExecutor()); getRpcService().getExecutor());
protected void initializeServices(Configuration configuration) throws Exception { LOG.info("Initializing cluster services."); synchronized (lock) { final String bindAddress = configuration.getString(JobManagerOptions.ADDRESS); final String portRange = getRPCPortRange(configuration); commonRpcService = createRpcService(configuration, bindAddress, portRange); // update the configuration used to create the high availability services configuration.setString(JobManagerOptions.ADDRESS, commonRpcService.getAddress()); configuration.setInteger(JobManagerOptions.PORT, commonRpcService.getPort()); haServices = createHaServices(configuration, commonRpcService.getExecutor()); blobServer = new BlobServer(configuration, haServices.createBlobStore()); blobServer.start(); heartbeatServices = createHeartbeatServices(configuration); metricRegistry = createMetricRegistry(configuration); // TODO: This is a temporary hack until we have ported the MetricQueryService to the new RpcEndpoint // Start actor system for metric query service on any available port metricQueryServiceActorSystem = MetricUtils.startMetricsActorSystem(configuration, bindAddress, LOG); metricRegistry.startQueryService(metricQueryServiceActorSystem, null); archivedExecutionGraphStore = createSerializableExecutionGraphStore(configuration, commonRpcService.getScheduledExecutor()); transientBlobCache = new TransientBlobCache( configuration, new InetSocketAddress( commonRpcService.getAddress(), blobServer.getPort())); } }
protected void initializeServices(Configuration configuration) throws Exception { LOG.info("Initializing cluster services."); synchronized (lock) { final String bindAddress = configuration.getString(JobManagerOptions.ADDRESS); final String portRange = getRPCPortRange(configuration); commonRpcService = createRpcService(configuration, bindAddress, portRange); // update the configuration used to create the high availability services configuration.setString(JobManagerOptions.ADDRESS, commonRpcService.getAddress()); configuration.setInteger(JobManagerOptions.PORT, commonRpcService.getPort()); haServices = createHaServices(configuration, commonRpcService.getExecutor()); blobServer = new BlobServer(configuration, haServices.createBlobStore()); blobServer.start(); heartbeatServices = createHeartbeatServices(configuration); metricRegistry = createMetricRegistry(configuration); // TODO: This is a temporary hack until we have ported the MetricQueryService to the new RpcEndpoint // Start actor system for metric query service on any available port metricQueryServiceActorSystem = MetricUtils.startMetricsActorSystem(configuration, bindAddress, LOG); metricRegistry.startQueryService(metricQueryServiceActorSystem, null); archivedExecutionGraphStore = createSerializableExecutionGraphStore(configuration, commonRpcService.getScheduledExecutor()); transientBlobCache = new TransientBlobCache( configuration, new InetSocketAddress( commonRpcService.getAddress(), blobServer.getPort())); } }
protected void initializeServices(Configuration configuration) throws Exception { LOG.info("Initializing cluster services."); synchronized (lock) { final String bindAddress = configuration.getString(JobManagerOptions.ADDRESS); final String portRange = getRPCPortRange(configuration); commonRpcService = createRpcService(configuration, bindAddress, portRange); // update the configuration used to create the high availability services configuration.setString(JobManagerOptions.ADDRESS, commonRpcService.getAddress()); configuration.setInteger(JobManagerOptions.PORT, commonRpcService.getPort()); haServices = createHaServices(configuration, commonRpcService.getExecutor()); blobServer = new BlobServer(configuration, haServices.createBlobStore()); blobServer.start(); heartbeatServices = createHeartbeatServices(configuration); metricRegistry = createMetricRegistry(configuration); // TODO: This is a temporary hack until we have ported the MetricQueryService to the new RpcEndpoint // start the MetricQueryService final ActorSystem actorSystem = ((AkkaRpcService) commonRpcService).getActorSystem(); metricRegistry.startQueryService(actorSystem, null); archivedExecutionGraphStore = createSerializableExecutionGraphStore(configuration, commonRpcService.getScheduledExecutor()); clusterInformation = new ClusterInformation( commonRpcService.getAddress(), blobServer.getPort()); transientBlobCache = new TransientBlobCache( configuration, new InetSocketAddress( clusterInformation.getBlobServerHostname(), clusterInformation.getBlobServerPort())); } }
/**
 * Builds the {@code JobManagerConnection} for the given job and job master gateway.
 *
 * <p>Creates the gateway-backed helpers (task manager actions, checkpoint responder,
 * result partition notifier, partition state checker) and a library cache manager,
 * registers queryable state for the job, and bundles everything into the connection.
 *
 * @param jobID            id of the job; must not be null
 * @param resourceID       resource id stored in the connection; must not be null
 * @param jobMasterGateway gateway to the job master; must not be null
 * @return the newly created connection
 */
private JobManagerConnection associateWithJobManager(
        JobID jobID,
        ResourceID resourceID,
        JobMasterGateway jobMasterGateway) {
    checkNotNull(jobID);
    checkNotNull(resourceID);
    checkNotNull(jobMasterGateway);

    final TaskManagerActions actions = new TaskManagerActionsImpl(jobMasterGateway);

    final CheckpointResponder responder = new RpcCheckpointResponder(jobMasterGateway);

    final LibraryCacheManager libraryCache = new BlobLibraryCacheManager(
        blobCacheService.getPermanentBlobService(),
        taskManagerConfiguration.getClassLoaderResolveOrder(),
        taskManagerConfiguration.getAlwaysParentFirstLoaderPatterns());

    final ResultPartitionConsumableNotifier consumableNotifier =
        new RpcResultPartitionConsumableNotifier(
            jobMasterGateway,
            getRpcService().getExecutor(),
            taskManagerConfiguration.getTimeout());

    final PartitionProducerStateChecker stateChecker = new RpcPartitionStateChecker(jobMasterGateway);

    registerQueryableState(jobID, jobMasterGateway);

    return new JobManagerConnection(
        jobID,
        resourceID,
        jobMasterGateway,
        actions,
        responder,
        libraryCache,
        consumableNotifier,
        stateChecker);
}
/**
 * Builds the {@code JobManagerConnection} for the given job and job master gateway.
 *
 * <p>Creates the gateway-backed helpers (task manager actions, checkpoint responder,
 * result partition notifier, partition state checker) and a library cache manager,
 * registers queryable state for the job, and bundles everything into the connection.
 *
 * @param jobID            id of the job; must not be null
 * @param resourceID       resource id stored in the connection; must not be null
 * @param jobMasterGateway gateway to the job master; must not be null
 * @return the newly created connection
 */
private JobManagerConnection associateWithJobManager(
        JobID jobID,
        ResourceID resourceID,
        JobMasterGateway jobMasterGateway) {
    checkNotNull(jobID);
    checkNotNull(resourceID);
    checkNotNull(jobMasterGateway);

    final TaskManagerActions actions = new TaskManagerActionsImpl(jobMasterGateway);

    final CheckpointResponder responder = new RpcCheckpointResponder(jobMasterGateway);

    final LibraryCacheManager libraryCache = new BlobLibraryCacheManager(
        blobCacheService.getPermanentBlobService(),
        taskManagerConfiguration.getClassLoaderResolveOrder(),
        taskManagerConfiguration.getAlwaysParentFirstLoaderPatterns());

    final ResultPartitionConsumableNotifier consumableNotifier =
        new RpcResultPartitionConsumableNotifier(
            jobMasterGateway,
            getRpcService().getExecutor(),
            taskManagerConfiguration.getTimeout());

    final PartitionProducerStateChecker stateChecker = new RpcPartitionStateChecker(jobMasterGateway);

    registerQueryableState(jobID, jobMasterGateway);

    return new JobManagerConnection(
        jobID,
        resourceID,
        jobMasterGateway,
        actions,
        responder,
        libraryCache,
        consumableNotifier,
        stateChecker);
}