// Hands the acknowledge message to the checkpoint coordinator. Failures are logged
// together with their cause and are not propagated out of this runnable.
@Override
public void run() {
    try {
        checkpointCoordinator.receiveAcknowledgeMessage(ackMessage);
    } catch (Throwable t) {
        // Pass the throwable as the last argument so the stack trace is not lost.
        log.warn("Error while processing checkpoint acknowledgement message", t);
    }
}
});
// Forwards the decline message to the checkpoint coordinator. Exceptions are logged
// together with the message that triggered them and are not rethrown.
@Override
public void run() {
    try {
        checkpointCoordinator.receiveDeclineMessage(decline);
    } catch (Exception declineFailure) {
        log.error("Error in CheckpointCoordinator while processing {}", decline, declineFailure);
    }
}
});
/** * Discards the given pending checkpoint because of the given cause. * * @param pendingCheckpoint to discard * @param cause for discarding the checkpoint */ private void discardCheckpoint(PendingCheckpoint pendingCheckpoint, @Nullable Throwable cause) { assert(Thread.holdsLock(lock)); Preconditions.checkNotNull(pendingCheckpoint); final long checkpointId = pendingCheckpoint.getCheckpointId(); final String reason = (cause != null) ? cause.getMessage() : ""; LOG.info("Discarding checkpoint {} of job {} because: {}", checkpointId, job, reason); pendingCheckpoint.abortDeclined(); rememberRecentCheckpointId(checkpointId); // we don't have to schedule another "dissolving" checkpoint any more because the // cancellation barriers take care of breaking downstream alignments // we only need to make sure that suspended queued requests are resumed boolean haveMoreRecentPending = false; for (PendingCheckpoint p : pendingCheckpoints.values()) { if (!p.isDiscarded() && p.getCheckpointId() >= pendingCheckpoint.getCheckpointId()) { haveMoreRecentPending = true; break; } } if (!haveMoreRecentPending) { triggerQueuedRequests(); } }
public void resetExecutionVerticesAndNotify(long modVersion, List<ExecutionVertex> executionVertices) throws Exception { final long resetTimestamp = System.currentTimeMillis(); List<ExecutionVertexID> evIds = new ArrayList<>(executionVertices.size()); for (ExecutionVertex ev : executionVertices) { ev.resetForNewExecution(resetTimestamp, modVersion); evIds.add(ev.getExecutionVertexID()); } // if we have checkpointed state, reload it into the executions // we restart scheduler to ensure EXACTLY_ONCE mechanism and // to trigger new checkpoint without having to wait for old checkpoint expired if (checkpointCoordinator != null) { checkpointCoordinator.stopCheckpointScheduler(); checkpointCoordinator.restoreLatestCheckpointedState(executionVertices, false, true); checkpointCoordinator.startCheckpointScheduler(); } graphManager.notifyExecutionVertexFailover(evIds); }
checkpointCoordinator = new CheckpointCoordinator( jobInformation.getJobId(), interval, if (!checkpointCoordinator.addMasterHook(hook)) { LOG.warn("Trying to register multiple checkpoint hooks with the name: {}", hook.getIdentifier()); checkpointCoordinator.setCheckpointStatsTracker(checkpointStatsTracker); registerJobStatusListener(checkpointCoordinator.createActivatorDeactivator());
pendingCheckpoints.remove(checkpointId); triggerQueuedRequests(); rememberRecentCheckpointId(checkpointId); dropSubsumedCheckpoints(checkpointId);
completePendingCheckpoint(checkpoint); message.getTaskExecutionId(), message.getJob()); discardSubtaskState(message.getJob(), message.getTaskExecutionId(), message.getCheckpointId(), message.getSubtaskState()); message.getCheckpointId(), message.getTaskExecutionId(), message.getJob()); discardSubtaskState(message.getJob(), message.getTaskExecutionId(), message.getCheckpointId(), message.getSubtaskState()); discardSubtaskState(message.getJob(), message.getTaskExecutionId(), message.getCheckpointId(), message.getSubtaskState());
/**
 * Restores the latest checkpointed state by delegating to the checkpoint coordinator,
 * passing it all vertices of this graph. Does nothing if no checkpoint coordinator is set.
 *
 * <p>Recovering checkpoints might block. Callers should run this method asynchronously
 * so that the job manager actor is not blocked.
 *
 * @param errorIfNoCheckpoint fail if there is no checkpoint available
 * @param allowNonRestoredState allow skipping checkpoint state that cannot be mapped to
 *     the ExecutionGraph vertices (i.e. the checkpoint contains state for a job vertex
 *     that is not part of this ExecutionGraph)
 * @throws Exception if the checkpoint coordinator fails to restore the state
 */
public void restoreLatestCheckpointedState(boolean errorIfNoCheckpoint, boolean allowNonRestoredState) throws Exception {
    synchronized (progressLock) {
        if (checkpointCoordinator == null) {
            return;
        }
        checkpointCoordinator.restoreLatestCheckpointedState(
            getAllVertices(),
            errorIfNoCheckpoint,
            allowNonRestoredState);
    }
}
private void dropSubsumedCheckpoints(long checkpointId) { Iterator<Map.Entry<Long, PendingCheckpoint>> entries = pendingCheckpoints.entrySet().iterator(); while (entries.hasNext()) { PendingCheckpoint p = entries.next().getValue(); // remove all pending checkpoints that are lesser than the current completed checkpoint if (p.getCheckpointId() < checkpointId && p.canBeSubsumed()) { rememberRecentCheckpointId(p.getCheckpointId()); p.abortSubsumed(); entries.remove(); } } }
checkpointCoordinator = new CheckpointCoordinator( jobInformation.getJobId(), interval, if (!checkpointCoordinator.addMasterHook(hook)) { LOG.warn("Trying to register multiple checkpoint hooks with the name: {}", hook.getIdentifier()); checkpointCoordinator.setCheckpointStatsTracker(checkpointStatsTracker); registerJobStatusListener(checkpointCoordinator.createActivatorDeactivator());
pendingCheckpoints.remove(checkpointId); triggerQueuedRequests(); rememberRecentCheckpointId(checkpointId); dropSubsumedCheckpoints(checkpointId);
completePendingCheckpoint(checkpoint); message.getTaskExecutionId(), message.getJob()); discardSubtaskState(message.getJob(), message.getTaskExecutionId(), message.getCheckpointId(), message.getSubtaskState()); message.getCheckpointId(), message.getTaskExecutionId(), message.getJob()); discardSubtaskState(message.getJob(), message.getTaskExecutionId(), message.getCheckpointId(), message.getSubtaskState()); discardSubtaskState(message.getJob(), message.getTaskExecutionId(), message.getCheckpointId(), message.getSubtaskState());
/**
 * Restores the latest checkpointed state by delegating to the checkpoint coordinator,
 * passing it all vertices of this graph. Does nothing if no checkpoint coordinator is set.
 *
 * <p>Recovering checkpoints might block. Callers should run this method asynchronously
 * so that the job manager actor is not blocked.
 *
 * @param errorIfNoCheckpoint fail if there is no checkpoint available
 * @param allowNonRestoredState allow skipping checkpoint state that cannot be mapped to
 *     the ExecutionGraph vertices (i.e. the checkpoint contains state for a job vertex
 *     that is not part of this ExecutionGraph)
 * @throws Exception if the checkpoint coordinator fails to restore the state
 */
public void restoreLatestCheckpointedState(boolean errorIfNoCheckpoint, boolean allowNonRestoredState) throws Exception {
    synchronized (progressLock) {
        if (checkpointCoordinator == null) {
            return;
        }
        checkpointCoordinator.restoreLatestCheckpointedState(
            getAllVertices(),
            errorIfNoCheckpoint,
            allowNonRestoredState);
    }
}
private void dropSubsumedCheckpoints(long checkpointId) { Iterator<Map.Entry<Long, PendingCheckpoint>> entries = pendingCheckpoints.entrySet().iterator(); while (entries.hasNext()) { PendingCheckpoint p = entries.next().getValue(); // remove all pending checkpoints that are lesser than the current completed checkpoint if (p.getCheckpointId() < checkpointId && p.canBeSubsumed()) { rememberRecentCheckpointId(p.getCheckpointId()); p.abortSubsumed(); entries.remove(); } } }
checkpointCoordinator = new CheckpointCoordinator( jobInformation.getJobId(), interval, if (!checkpointCoordinator.addMasterHook(hook)) { LOG.warn("Trying to register multiple checkpoint hooks with the name: {}", hook.getIdentifier()); checkpointCoordinator.setCheckpointStatsTracker(checkpointStatsTracker); registerJobStatusListener(checkpointCoordinator.createActivatorDeactivator());
dropSubsumedCheckpoints(checkpointId); triggerQueuedRequests(); rememberRecentCheckpointId(checkpointId);
rememberRecentCheckpointId(checkpointId); triggerQueuedRequests();
completePendingCheckpoint(checkpoint); message.getTaskExecutionId(), message.getJob()); discardSubtaskState(message.getJob(), message.getTaskExecutionId(), message.getCheckpointId(), message.getSubtaskState()); message.getCheckpointId(), message.getTaskExecutionId(), message.getJob()); discardSubtaskState(message.getJob(), message.getTaskExecutionId(), message.getCheckpointId(), message.getSubtaskState()); discardSubtaskState(message.getJob(), message.getTaskExecutionId(), message.getCheckpointId(), message.getSubtaskState());
/**
 * Restores the latest checkpointed state by delegating to the checkpoint coordinator,
 * passing it all vertices of this graph. Does nothing if no checkpoint coordinator is set.
 *
 * <p>Recovering checkpoints might block. Callers should run this method asynchronously
 * so that the job manager actor is not blocked.
 *
 * @param errorIfNoCheckpoint fail if there is no checkpoint available
 * @param allowNonRestoredState allow skipping checkpoint state that cannot be mapped to
 *     the ExecutionGraph vertices (i.e. the checkpoint contains state for a job vertex
 *     that is not part of this ExecutionGraph)
 * @throws Exception if the checkpoint coordinator fails to restore the state
 */
public void restoreLatestCheckpointedState(boolean errorIfNoCheckpoint, boolean allowNonRestoredState) throws Exception {
    synchronized (progressLock) {
        if (checkpointCoordinator == null) {
            return;
        }
        checkpointCoordinator.restoreLatestCheckpointedState(
            getAllVertices(),
            errorIfNoCheckpoint,
            allowNonRestoredState);
    }
}
private void dropSubsumedCheckpoints(long checkpointId) { Iterator<Map.Entry<Long, PendingCheckpoint>> entries = pendingCheckpoints.entrySet().iterator(); while (entries.hasNext()) { PendingCheckpoint p = entries.next().getValue(); // remove all pending checkpoints that are lesser than the current completed checkpoint if (p.getCheckpointId() < checkpointId && p.canBeSubsumed()) { rememberRecentCheckpointId(p.getCheckpointId()); p.abortSubsumed(); entries.remove(); } } }