private void setNewFencingToken(JobMasterId newJobMasterId) { if (getFencingToken() != null) { log.info("Restarting old job with JobMasterId {}. The new JobMasterId is {}.", getFencingToken(), newJobMasterId); } // set new leader id setFencingToken(newJobMasterId); }
private void setNewFencingToken(JobMasterId newJobMasterId) { if (getFencingToken() != null) { log.info("Restarting old job with JobMasterId {}. The new JobMasterId is {}.", getFencingToken(), newJobMasterId); // first we have to suspend the current execution suspendExecution(new FlinkException("Old job with JobMasterId " + getFencingToken() + " is restarted with a new JobMasterId " + newJobMasterId + '.')); } // set new leader id setFencingToken(newJobMasterId); }
private void setNewFencingToken(JobMasterId newJobMasterId) { if (getFencingToken() != null) { log.info("Restarting old job with JobMasterId {}. The new JobMasterId is {}.", getFencingToken(), newJobMasterId); // first we have to suspend the current execution suspendExecution(new FlinkException("Old job with JobMasterId " + getFencingToken() + " is restarted with a new JobMasterId " + newJobMasterId + '.')); } // set new leader id setFencingToken(newJobMasterId); }
@Override public CompletableFuture<Collection<SlotOffer>> offerSlots( final ResourceID taskManagerId, final Collection<SlotOffer> slots, final Time timeout) { Tuple2<TaskManagerLocation, TaskExecutorGateway> taskManager = registeredTaskManagers.get(taskManagerId); if (taskManager == null) { return FutureUtils.completedExceptionally(new Exception("Unknown TaskManager " + taskManagerId)); } final TaskManagerLocation taskManagerLocation = taskManager.f0; final TaskExecutorGateway taskExecutorGateway = taskManager.f1; final RpcTaskManagerGateway rpcTaskManagerGateway = new RpcTaskManagerGateway(taskExecutorGateway, getFencingToken()); return slotPoolGateway.offerSlots( taskManagerLocation, rpcTaskManagerGateway, slots); }
@Override public CompletableFuture<Collection<SlotOffer>> offerSlots( final ResourceID taskManagerId, final Collection<SlotOffer> slots, final Time timeout) { Tuple2<TaskManagerLocation, TaskExecutorGateway> taskManager = registeredTaskManagers.get(taskManagerId); if (taskManager == null) { return FutureUtils.completedExceptionally(new Exception("Unknown TaskManager " + taskManagerId)); } final TaskManagerLocation taskManagerLocation = taskManager.f0; final TaskExecutorGateway taskExecutorGateway = taskManager.f1; final RpcTaskManagerGateway rpcTaskManagerGateway = new RpcTaskManagerGateway(taskExecutorGateway, getFencingToken()); return slotPoolGateway.offerSlots( taskManagerLocation, rpcTaskManagerGateway, slots); }
@Override public CompletableFuture<Collection<SlotOffer>> offerSlots( final ResourceID taskManagerId, final Collection<SlotOffer> slots, final Time timeout) { Tuple2<TaskManagerLocation, TaskExecutorGateway> taskManager = registeredTaskManagers.get(taskManagerId); if (taskManager == null) { return FutureUtils.completedExceptionally(new Exception("Unknown TaskManager " + taskManagerId)); } final TaskManagerLocation taskManagerLocation = taskManager.f0; final TaskExecutorGateway taskExecutorGateway = taskManager.f1; final RpcTaskManagerGateway rpcTaskManagerGateway = new RpcTaskManagerGateway(taskExecutorGateway, getFencingToken()); return slotPoolGateway.offerSlots( taskManagerLocation, rpcTaskManagerGateway, slots); }
private Acknowledge startJobExecution(JobMasterId newJobMasterId) throws Exception { validateRunsInMainThread(); checkNotNull(newJobMasterId, "The new JobMasterId must not be null."); if (Objects.equals(getFencingToken(), newJobMasterId)) { log.info("Already started the job execution with JobMasterId {}.", newJobMasterId); return Acknowledge.get(); } setNewFencingToken(newJobMasterId); startJobMasterServices(); log.info("Starting execution of job {} ({})", jobGraph.getName(), jobGraph.getJobID()); resetAndScheduleExecutionGraph(); return Acknowledge.get(); }
private Acknowledge startJobExecution(JobMasterId newJobMasterId) throws Exception { validateRunsInMainThread(); checkNotNull(newJobMasterId, "The new JobMasterId must not be null."); if (Objects.equals(getFencingToken(), newJobMasterId)) { log.info("Already started the job execution with JobMasterId {}.", newJobMasterId); return Acknowledge.get(); } setNewFencingToken(newJobMasterId); startJobMasterServices(); log.info("Starting execution of job {} ({})", jobGraph.getName(), jobGraph.getJobID()); resetAndScheduleExecutionGraph(); return Acknowledge.get(); }
private void connectToResourceManager() { assert(resourceManagerAddress != null); assert(resourceManagerConnection == null); assert(establishedResourceManagerConnection == null); log.info("Connecting to ResourceManager {}", resourceManagerAddress); resourceManagerConnection = new ResourceManagerConnection( log, jobGraph.getJobID(), resourceId, getAddress(), getFencingToken(), resourceManagerAddress.getAddress(), resourceManagerAddress.getResourceManagerId(), scheduledExecutorService); resourceManagerConnection.start(); }
private void connectToResourceManager() { assert(resourceManagerAddress != null); assert(resourceManagerConnection == null); assert(establishedResourceManagerConnection == null); log.info("Connecting to ResourceManager {}", resourceManagerAddress); resourceManagerConnection = new ResourceManagerConnection( log, jobGraph.getJobID(), resourceId, getAddress(), getFencingToken(), resourceManagerAddress.getAddress(), resourceManagerAddress.getResourceManagerId(), scheduledExecutorService); resourceManagerConnection.start(); }
private void connectToResourceManager() { assert(resourceManagerAddress != null); assert(resourceManagerConnection == null); assert(establishedResourceManagerConnection == null); log.info("Connecting to ResourceManager {}", resourceManagerAddress); resourceManagerConnection = new ResourceManagerConnection( log, jobGraph.getJobID(), resourceId, getAddress(), getFencingToken(), resourceManagerAddress.getAddress(), resourceManagerAddress.getResourceManagerId(), scheduledExecutorService); resourceManagerConnection.start(); }
private void startJobMasterServices() throws Exception { // start the slot pool make sure the slot pool now accepts messages for this leader slotPool.start(getFencingToken(), getAddress()); //TODO: Remove once the ZooKeeperLeaderRetrieval returns the stored address upon start // try to reconnect to previously known leader reconnectToResourceManager(new FlinkException("Starting JobMaster component.")); // job is ready to go, try to establish connection with resource manager // - activate leader retrieval for the resource manager // - on notification of the leader, the connection will be established and // the slot pool will start requesting slots resourceManagerLeaderRetriever.start(new ResourceManagerLeaderListener()); }
private void startJobMasterServices() throws Exception { // start the slot pool make sure the slot pool now accepts messages for this leader slotPool.start(getFencingToken(), getAddress()); //TODO: Remove once the ZooKeeperLeaderRetrieval returns the stored address upon start // try to reconnect to previously known leader reconnectToResourceManager(new FlinkException("Starting JobMaster component.")); // job is ready to go, try to establish connection with resource manager // - activate leader retrieval for the resource manager // - on notification of the leader, the connection will be established and // the slot pool will start requesting slots resourceManagerLeaderRetriever.start(new ResourceManagerLeaderListener()); }
private void startJobMasterServices() throws Exception { // start the slot pool make sure the slot pool now accepts messages for this leader slotPool.start(getFencingToken(), getAddress()); //TODO: Remove once the ZooKeeperLeaderRetrieval returns the stored address upon start // try to reconnect to previously known leader reconnectToResourceManager(new FlinkException("Starting JobMaster component.")); // job is ready to go, try to establish connection with resource manager // - activate leader retrieval for the resource manager // - on notification of the leader, the connection will be established and // the slot pool will start requesting slots resourceManagerLeaderRetriever.start(new ResourceManagerLeaderListener()); }
/** * Suspending job, all the running tasks will be cancelled, and communication with other components * will be disposed. * * <p>Mostly job is suspended because of the leadership has been revoked, one can be restart this job by * calling the {@link #start(JobMasterId, Time)} method once we take the leadership back again. * * @param cause The reason of why this job been suspended. */ private Acknowledge suspendExecution(final Exception cause) { validateRunsInMainThread(); if (getFencingToken() == null) { log.debug("Job has already been suspended or shutdown."); return Acknowledge.get(); } // not leader anymore --> set the JobMasterId to null setFencingToken(null); try { resourceManagerLeaderRetriever.stop(); } catch (Throwable t) { log.warn("Failed to stop resource manager leader retriever when suspending.", t); } suspendAndClearExecutionGraphFields(cause); // the slot pool stops receiving messages and clears its pooled slots slotPoolGateway.suspend(); // disconnect from resource manager: closeResourceManagerConnection(cause); return Acknowledge.get(); }
/** * Suspending job, all the running tasks will be cancelled, and communication with other components * will be disposed. * * <p>Mostly job is suspended because of the leadership has been revoked, one can be restart this job by * calling the {@link #start(JobMasterId, Time)} method once we take the leadership back again. * * @param cause The reason of why this job been suspended. */ private Acknowledge suspendExecution(final Exception cause) { validateRunsInMainThread(); if (getFencingToken() == null) { log.debug("Job has already been suspended or shutdown."); return Acknowledge.get(); } // not leader anymore --> set the JobMasterId to null setFencingToken(null); try { resourceManagerLeaderRetriever.stop(); } catch (Throwable t) { log.warn("Failed to stop resource manager leader retriever when suspending.", t); } suspendAndClearExecutionGraphFields(cause); // the slot pool stops receiving messages and clears its pooled slots slotPoolGateway.suspend(); // disconnect from resource manager: closeResourceManagerConnection(cause); return Acknowledge.get(); }
/** * Suspending job, and communication with other components will be disposed. * * <p>Mostly job is suspended without cancelling running tasks because of the leadership has been revoked, * the one who takes the leadership can take over the control. * * @param cause The reason of why this job been suspended. */ private Acknowledge suspendExecution(final Exception cause) { validateRunsInMainThread(); if (getFencingToken() == null) { log.debug("Job has already been suspended or shutdown."); return Acknowledge.get(); } // not leader anymore --> set the JobMasterId to null setFencingToken(null); try { resourceManagerLeaderRetriever.stop(); } catch (Throwable t) { log.warn("Failed to stop resource manager leader retriever when suspending.", t); } suspendAndClearExecutionGraphFields(cause); // flush the operation logs. operationLogManager.stop(); // the slot pool stops receiving messages and clears its pooled slots. slotPoolGateway.suspend(); // disconnect from resource manager: closeResourceManagerConnection(cause); return Acknowledge.get(); }
final RpcTaskManagerGateway rpcTaskManagerGateway = new RpcTaskManagerGateway(taskExecutorGateway, this.getFencingToken());
if (Objects.equals(getFencingToken(), newJobMasterId)) { log.info("Already started the job execution with JobMasterId {}.", newJobMasterId);