private void establishResourceManagerConnection(ResourceID resourceManagerResourceId) { // monitor the resource manager as heartbeat target resourceManagerHeartbeatManager.monitorTarget(resourceManagerResourceId, new HeartbeatTarget<Void>() { @Override public void receiveHeartbeat(ResourceID resourceID, Void payload) { ResourceManagerGateway resourceManagerGateway = resourceManagerConnection.getTargetGateway(); resourceManagerGateway.heartbeatFromTaskManager(resourceID); } @Override public void requestHeartbeat(ResourceID resourceID, Void payload) { // the TaskManager won't send heartbeat requests to the ResourceManager } }); }
/**
 * Passes a heartbeat coming from a JobManager on to {@code jobManagerHeartbeatManager}.
 *
 * @param resourceID resource id handed to the heartbeat manager as the heartbeat origin
 */
@Override
public void heartbeatFromJobManager(final ResourceID resourceID) {
    // no payload accompanies this heartbeat, hence the null argument
    jobManagerHeartbeatManager.receiveHeartbeat(
        resourceID,
        null);
}
/**
 * Passes a heartbeat coming from a JobManager on to {@code jobManagerHeartbeatManager} as a
 * heartbeat request.
 *
 * @param resourceID resource id handed to the heartbeat manager as the requester
 */
@Override
public void heartbeatFromJobManager(ResourceID resourceID) {
    // no payload accompanies this request, hence the null argument
    jobManagerHeartbeatManager.requestHeartbeat(
        resourceID,
        null);
}
/**
 * Disconnects the TaskExecutor identified by the given resource id.
 *
 * <p>Heartbeat monitoring for the id is stopped, the slot pool is asked to release the
 * TaskExecutor, and, if the id was present in {@code registeredTaskManagers}, its entry is
 * removed and the associated gateway is told to disconnect from this job.
 *
 * @param resourceID resource id of the TaskExecutor to disconnect
 * @param cause reason for the disconnect; its message is logged and it is passed on to the
 *     slot pool and the TaskExecutor gateway
 * @return the future returned by the slot pool's {@code releaseTaskManager} call
 */
@Override
public CompletableFuture<Acknowledge> disconnectTaskManager(final ResourceID resourceID, final Exception cause) {
    log.info("Disconnect TaskExecutor {} because: {}", resourceID, cause.getMessage());

    taskManagerHeartbeatManager.unmonitorTarget(resourceID);

    final CompletableFuture<Acknowledge> slotReleaseFuture = slotPoolGateway.releaseTaskManager(resourceID, cause);

    final Tuple2<TaskManagerLocation, TaskExecutorGateway> removedConnection = registeredTaskManagers.remove(resourceID);
    if (removedConnection == null) {
        // the id was not registered, so there is no gateway to notify
        return slotReleaseFuture;
    }

    removedConnection.f1.disconnectJobManager(jobGraph.getJobID(), cause);
    return slotReleaseFuture;
}
/** * Suspend the job and shutdown all other services including rpc. */ @Override public void shutDown() throws Exception { taskManagerHeartbeatManager.stop(); resourceManagerHeartbeatManager.stop(); // make sure there is a graceful exit getSelf().suspendExecution(new Exception("JobManager is shutting down.")); super.shutDown(); }
WorkerRegistration<WorkerType> registration = taskExecutors.get(taskExecutorResourceId); if (registration != null && taskManagerHeartbeatManager.getLastHeartbeatFrom(taskExecutorResourceId) >= 0) { log.info("The TaskExecutor {} has already registered and kept heartbeat, so will ignore " + "and use original instance id {}", taskExecutorResourceId, registration.getInstanceID()); taskManagerHeartbeatManager.monitorTarget(taskExecutorResourceId, new HeartbeatTarget<Void>() { @Override public void receiveHeartbeat(ResourceID resourceID, Void payload) {
/**
 * Builds one {@link TaskManagerInfo} per entry of {@code taskExecutors}.
 *
 * <p>Each info combines data from the worker registration (gateway address, data port,
 * hardware description), the last heartbeat timestamp reported by
 * {@code taskManagerHeartbeatManager}, and the slot counts reported by {@code slotManager}.
 *
 * @param timeout not read by this method body
 * @return an already completed future holding the collected infos
 */
@Override
public CompletableFuture<Collection<TaskManagerInfo>> requestTaskManagerInfo(Time timeout) {
    final Collection<TaskManagerInfo> collectedInfos = new ArrayList<>(taskExecutors.size());

    taskExecutors.forEach((resourceId, taskExecutor) ->
        collectedInfos.add(
            new TaskManagerInfo(
                resourceId,
                taskExecutor.getTaskExecutorGateway().getAddress(),
                taskExecutor.getDataPort(),
                taskManagerHeartbeatManager.getLastHeartbeatFrom(resourceId),
                slotManager.getNumberRegisteredSlotsOf(taskExecutor.getInstanceID()),
                slotManager.getNumberFreeSlotsOf(taskExecutor.getInstanceID()),
                taskExecutor.getHardwareDescription())));

    return CompletableFuture.completedFuture(collectedInfos);
}
/**
 * Disconnects the TaskExecutor identified by the given resource id.
 *
 * <p>Heartbeat monitoring for the id is stopped, the slot pool is asked to release the
 * TaskExecutor, and, if the id was present in {@code registeredTaskManagers}, its entry is
 * removed and the associated gateway is told to disconnect from this job.
 *
 * @param resourceID resource id of the TaskExecutor to disconnect
 * @param cause reason for the disconnect; its message is logged at debug level and it is
 *     passed on to the slot pool and the TaskExecutor gateway
 * @return the future returned by the slot pool's {@code releaseTaskManager} call
 */
@Override
public CompletableFuture<Acknowledge> disconnectTaskManager(final ResourceID resourceID, final Exception cause) {
    log.debug("Disconnect TaskExecutor {} because: {}", resourceID, cause.getMessage());

    taskManagerHeartbeatManager.unmonitorTarget(resourceID);

    final CompletableFuture<Acknowledge> slotReleaseFuture = slotPoolGateway.releaseTaskManager(resourceID, cause);

    final Tuple2<TaskManagerLocation, TaskExecutorGateway> removedConnection = registeredTaskManagers.remove(resourceID);
    if (removedConnection == null) {
        // the id was not registered, so there is no gateway to notify
        return slotReleaseFuture;
    }

    removedConnection.f1.disconnectJobManager(jobGraph.getJobID(), cause);
    return slotReleaseFuture;
}
/** * Suspend the job and shutdown all other services including rpc. */ @Override public CompletableFuture<Void> postStop() { log.info("Stopping the JobMaster for job {}({}).", jobGraph.getName(), jobGraph.getJobID()); // disconnect from all registered TaskExecutors final Set<ResourceID> taskManagerResourceIds = new HashSet<>(registeredTaskManagers.keySet()); final FlinkException cause = new FlinkException("Stopping JobMaster for job " + jobGraph.getName() + '(' + jobGraph.getJobID() + ")."); for (ResourceID taskManagerResourceId : taskManagerResourceIds) { disconnectTaskManager(taskManagerResourceId, cause); } taskManagerHeartbeatManager.stop(); resourceManagerHeartbeatManager.stop(); // make sure there is a graceful exit suspendExecution(new FlinkException("JobManager is shutting down.")); // shut down will internally release all registered slots slotPool.shutDown(); final CompletableFuture<Void> disposeInternalSavepointFuture; if (lastInternalSavepoint != null) { disposeInternalSavepointFuture = CompletableFuture.runAsync(() -> disposeSavepoint(lastInternalSavepoint)); } else { disposeInternalSavepointFuture = CompletableFuture.completedFuture(null); } final CompletableFuture<Void> slotPoolTerminationFuture = slotPool.getTerminationFuture(); return FutureUtils.completeAll(Arrays.asList(disposeInternalSavepointFuture, slotPoolTerminationFuture)); }
/**
 * Builds one {@link TaskManagerInfo} per entry of {@code taskExecutors}.
 *
 * <p>Each info combines data from the worker registration (gateway address, data port,
 * hardware description), the last heartbeat timestamp reported by
 * {@code taskManagerHeartbeatManager}, and the slot counts reported by {@code slotManager}.
 *
 * @param timeout not read by this method body
 * @return an already completed future holding the collected infos
 */
@Override
public CompletableFuture<Collection<TaskManagerInfo>> requestTaskManagerInfo(Time timeout) {
    final Collection<TaskManagerInfo> result = new ArrayList<>(taskExecutors.size());

    taskExecutors.entrySet().forEach(entry -> {
        final ResourceID workerId = entry.getKey();
        final WorkerRegistration<WorkerType> worker = entry.getValue();

        final TaskManagerInfo info = new TaskManagerInfo(
            workerId,
            worker.getTaskExecutorGateway().getAddress(),
            worker.getDataPort(),
            taskManagerHeartbeatManager.getLastHeartbeatFrom(workerId),
            slotManager.getNumberRegisteredSlotsOf(worker.getInstanceID()),
            slotManager.getNumberFreeSlotsOf(worker.getInstanceID()),
            worker.getHardwareDescription());

        result.add(info);
    });

    return CompletableFuture.completedFuture(result);
}
/**
 * Disconnects the TaskExecutor identified by the given resource id.
 *
 * <p>Steps performed, in order:
 * <ol>
 *   <li>stop heartbeat monitoring of the id,</li>
 *   <li>ask the slot pool to release the TaskExecutor,</li>
 *   <li>remove the id from {@code registeredTaskManagers} and, when an entry existed, tell its
 *       gateway to disconnect from this job.</li>
 * </ol>
 *
 * @param resourceID resource id of the TaskExecutor to disconnect
 * @param cause reason for the disconnect; only its message is logged (debug level)
 * @return the future returned by the slot pool's {@code releaseTaskManager} call
 */
@Override
public CompletableFuture<Acknowledge> disconnectTaskManager(final ResourceID resourceID, final Exception cause) {
    log.debug("Disconnect TaskExecutor {} because: {}", resourceID, cause.getMessage());

    taskManagerHeartbeatManager.unmonitorTarget(resourceID);

    final CompletableFuture<Acknowledge> pendingRelease = slotPoolGateway.releaseTaskManager(resourceID, cause);

    final Tuple2<TaskManagerLocation, TaskExecutorGateway> formerConnection = registeredTaskManagers.remove(resourceID);
    final boolean wasRegistered = formerConnection != null;

    if (wasRegistered) {
        final TaskExecutorGateway taskExecutorGateway = formerConnection.f1;
        taskExecutorGateway.disconnectJobManager(jobGraph.getJobID(), cause);
    }

    return pendingRelease;
}
private void establishResourceManagerConnection(final JobMasterRegistrationSuccess success) { final UUID resourceManagerLeaderId = success.getResourceManagerLeaderId(); // verify the response with current connection if (resourceManagerConnection != null && resourceManagerConnection.getTargetLeaderId().equals(resourceManagerLeaderId)) { log.info("JobManager successfully registered at ResourceManager, leader id: {}.", resourceManagerLeaderId); final ResourceManagerGateway resourceManagerGateway = resourceManagerConnection.getTargetGateway(); slotPoolGateway.connectToResourceManager(resourceManagerLeaderId, resourceManagerGateway); resourceManagerHeartbeatManager.monitorTarget(success.getResourceManagerResourceId(), new HeartbeatTarget<Void>() { @Override public void receiveHeartbeat(ResourceID resourceID, Void payload) { resourceManagerGateway.heartbeatFromJobManager(resourceID); } @Override public void requestHeartbeat(ResourceID resourceID, Void payload) { // request heartbeat will never be called on the job manager side } }); } }
/**
 * Passes a heartbeat coming from the ResourceManager on to
 * {@code resourceManagerHeartbeatManager} as a heartbeat request.
 *
 * @param resourceID resource id handed to the heartbeat manager as the requester
 */
@Override
public void heartbeatFromResourceManager(final ResourceID resourceID) {
    // no payload accompanies this request, hence the null argument
    resourceManagerHeartbeatManager.requestHeartbeat(
        resourceID,
        null);
}
/**
 * Passes a heartbeat coming from a TaskManager, together with its accumulator report, on to
 * {@code taskManagerHeartbeatManager}.
 *
 * @param resourceID resource id handed to the heartbeat manager as the heartbeat origin
 * @param accumulatorReport payload forwarded unchanged to the heartbeat manager
 */
@Override
public void heartbeatFromTaskManager(final ResourceID resourceID, AccumulatorReport accumulatorReport) {
    taskManagerHeartbeatManager.receiveHeartbeat(
        resourceID,
        accumulatorReport);
}
/** * Suspend the job and shutdown all other services including rpc. */ @Override public CompletableFuture<Void> postStop() { log.info("Stopping the JobMaster for job {}({}).", jobGraph.getName(), jobGraph.getJobID()); // disconnect from all registered TaskExecutors final Set<ResourceID> taskManagerResourceIds = new HashSet<>(registeredTaskManagers.keySet()); final FlinkException cause = new FlinkException("Stopping JobMaster for job " + jobGraph.getName() + '(' + jobGraph.getJobID() + ")."); for (ResourceID taskManagerResourceId : taskManagerResourceIds) { disconnectTaskManager(taskManagerResourceId, cause); } taskManagerHeartbeatManager.stop(); resourceManagerHeartbeatManager.stop(); // make sure there is a graceful exit suspendExecution(new FlinkException("JobManager is shutting down.")); // shut down will internally release all registered slots slotPool.shutDown(); final CompletableFuture<Void> disposeInternalSavepointFuture; if (lastInternalSavepoint != null) { disposeInternalSavepointFuture = CompletableFuture.runAsync(() -> disposeSavepoint(lastInternalSavepoint)); } else { disposeInternalSavepointFuture = CompletableFuture.completedFuture(null); } final CompletableFuture<Void> slotPoolTerminationFuture = slotPool.getTerminationFuture(); return FutureUtils.completeAll(Arrays.asList(disposeInternalSavepointFuture, slotPoolTerminationFuture)); }
/**
 * Builds one {@link TaskManagerInfo} per entry of {@code taskExecutors}.
 *
 * <p>Each info combines data from the worker registration (gateway address, data port,
 * hardware description), the last heartbeat timestamp reported by
 * {@code taskManagerHeartbeatManager}, and the slot counts plus total and available resource
 * descriptions derived from {@code slotManager}.
 *
 * @param timeout not read by this method body
 * @return an already completed future holding the collected infos
 */
@Override
public CompletableFuture<Collection<TaskManagerInfo>> requestTaskManagerInfo(Time timeout) {
    final Collection<TaskManagerInfo> collectedInfos = new ArrayList<>(taskExecutors.size());

    for (Map.Entry<ResourceID, WorkerRegistration<WorkerType>> registrationEntry : taskExecutors.entrySet()) {
        final ResourceID workerId = registrationEntry.getKey();
        final WorkerRegistration<WorkerType> worker = registrationEntry.getValue();

        final TaskManagerInfo workerInfo = new TaskManagerInfo(
            workerId,
            worker.getTaskExecutorGateway().getAddress(),
            worker.getDataPort(),
            taskManagerHeartbeatManager.getLastHeartbeatFrom(workerId),
            slotManager.getNumberRegisteredSlotsOf(worker.getInstanceID()),
            slotManager.getNumberFreeSlotsOf(worker.getInstanceID()),
            worker.getHardwareDescription(),
            TaskManagerResourceDescription.fromResourceProfile(slotManager.getTotalResourceOf(workerId)),
            TaskManagerResourceDescription.fromResourceProfile(slotManager.getAvailableResourceOf(workerId)));

        collectedInfos.add(workerInfo);
    }

    return CompletableFuture.completedFuture(collectedInfos);
}
/**
 * Tears down an established ResourceManager connection.
 *
 * <p>The closing is logged, heartbeat monitoring of the ResourceManager is stopped, the
 * ResourceManager gateway is told that this job disconnects, and the slot pool is disconnected
 * from the ResourceManager.
 *
 * @param establishedResourceManagerConnection connection to dissolve
 * @param cause reason for closing the connection; forwarded to the ResourceManager gateway
 */
private void dissolveResourceManagerConnection(EstablishedResourceManagerConnection establishedResourceManagerConnection, Exception cause) {
    final ResourceID rmResourceId = establishedResourceManagerConnection.getResourceManagerResourceID();

    // with debug logging the exception itself is handed to the logger, otherwise only its message
    if (!log.isDebugEnabled()) {
        log.info("Close ResourceManager connection {}: {}.", rmResourceId, cause.getMessage());
    } else {
        log.debug("Close ResourceManager connection {}.", rmResourceId, cause);
    }

    resourceManagerHeartbeatManager.unmonitorTarget(rmResourceId);

    establishedResourceManagerConnection
        .getResourceManagerGateway()
        .disconnectJobManager(jobGraph.getJobID(), cause);

    slotPoolGateway.disconnectResourceManager();
}
private void establishResourceManagerConnection(final JobMasterRegistrationSuccess success) { final ResourceManagerId resourceManagerId = success.getResourceManagerId(); // verify the response with current connection if (resourceManagerConnection != null && Objects.equals(resourceManagerConnection.getTargetLeaderId(), resourceManagerId)) { log.info("JobManager successfully registered at ResourceManager, leader id: {}.", resourceManagerId); final ResourceManagerGateway resourceManagerGateway = resourceManagerConnection.getTargetGateway(); final ResourceID resourceManagerResourceId = success.getResourceManagerResourceId(); establishedResourceManagerConnection = new EstablishedResourceManagerConnection( resourceManagerGateway, resourceManagerResourceId); slotPoolGateway.connectToResourceManager(resourceManagerGateway); resourceManagerHeartbeatManager.monitorTarget(resourceManagerResourceId, new HeartbeatTarget<Void>() { @Override public void receiveHeartbeat(ResourceID resourceID, Void payload) { resourceManagerGateway.heartbeatFromJobManager(resourceID); } @Override public void requestHeartbeat(ResourceID resourceID, Void payload) { // request heartbeat will never be called on the job manager side } }); } else { log.debug("Ignoring resource manager connection to {} because its a duplicate or outdated.", resourceManagerId); } }
/**
 * Passes a heartbeat coming from the ResourceManager on to
 * {@code resourceManagerHeartbeatManager} as a heartbeat request.
 *
 * @param resourceID resource id handed to the heartbeat manager as the requester
 */
@Override
public void heartbeatFromResourceManager(ResourceID resourceID) {
    // no payload accompanies this request, hence the null argument
    resourceManagerHeartbeatManager.requestHeartbeat(
        resourceID,
        null);
}
/**
 * Passes a heartbeat coming from a TaskManager, together with its slot report, on to
 * {@code taskManagerHeartbeatManager}.
 *
 * @param resourceID resource id handed to the heartbeat manager as the heartbeat origin
 * @param slotReport payload forwarded unchanged to the heartbeat manager
 */
@Override
public void heartbeatFromTaskManager(final ResourceID resourceID, final SlotReport slotReport) {
    taskManagerHeartbeatManager.receiveHeartbeat(
        resourceID,
        slotReport);
}