/**
 * Acknowledges the checkpoint right away with a snapshot that contains a single, freshly
 * created {@code OperatorSubtaskState} for the operator ID derived from this task's job
 * vertex ID, and afterwards counts down {@code triggerCheckpointLatch}.
 *
 * @param checkpointMetaData meta data of the checkpoint; only its checkpoint ID is read
 * @param checkpointOptions options of the checkpoint; not read by this implementation
 * @return always {@code true}
 */
@Override
public boolean triggerCheckpoint(
        final CheckpointMetaData checkpointMetaData,
        final CheckpointOptions checkpointOptions) {

    final TaskStateSnapshot ackedSnapshot = new TaskStateSnapshot();

    // The snapshot's only entry is keyed by the operator ID derived from the job vertex ID.
    final OperatorID vertexOperatorId =
        OperatorID.fromJobVertexID(getEnvironment().getJobVertexId());
    ackedSnapshot.putSubtaskStateByOperatorID(vertexOperatorId, new OperatorSubtaskState());

    getEnvironment().acknowledgeCheckpoint(
        checkpointMetaData.getCheckpointId(),
        new CheckpointMetrics(),
        ackedSnapshot);

    // Count down the latch only after the acknowledgement has been sent.
    triggerCheckpointLatch.countDown();

    return true;
}
private void reportCompletedSnapshotStates( TaskStateSnapshot acknowledgedTaskStateSnapshot, TaskStateSnapshot localTaskStateSnapshot, long asyncDurationMillis) { TaskStateManager taskStateManager = owner.getEnvironment().getTaskStateManager(); boolean hasAckState = acknowledgedTaskStateSnapshot.hasState(); boolean hasLocalState = localTaskStateSnapshot.hasState(); Preconditions.checkState(hasAckState || !hasLocalState, "Found cached state but no corresponding primary state is reported to the job " + "manager. This indicates a problem."); // we signal stateless tasks by reporting null, so that there are no attempts to assign empty state // to stateless tasks on restore. This enables simple job modifications that only concern // stateless without the need to assign them uids to match their (always empty) states. taskStateManager.reportTaskStateSnapshots( checkpointMetaData, checkpointMetrics, hasAckState ? acknowledgedTaskStateSnapshot : null, hasLocalState ? localTaskStateSnapshot : null); LOG.debug("{} - finished asynchronous part of checkpoint {}. Asynchronous duration: {} ms", owner.getName(), checkpointMetaData.getCheckpointId(), asyncDurationMillis); LOG.trace("{} - reported the following states in snapshot for checkpoint {}: {}.", owner.getName(), checkpointMetaData.getCheckpointId(), acknowledgedTaskStateSnapshot); }
@Test public void testRestoreAfterScaleUp() throws Exception { OperatorID headOperatorID = new OperatorID(42L, 42L); OperatorID tailOperatorID = new OperatorID(44L, 44L); JobManagerTaskRestore restore = createRunAndCheckpointOperatorChain( headOperatorID, new CounterOperator(), tailOperatorID, new CounterOperator(), Optional.empty()); TaskStateSnapshot stateHandles = restore.getTaskStateSnapshot(); assertEquals(2, stateHandles.getSubtaskStateMappings().size()); // test empty state in case of scale up OperatorSubtaskState emptyHeadOperatorState = StateAssignmentOperation.operatorSubtaskStateFrom( new OperatorInstanceID(0, headOperatorID), Collections.emptyMap(), Collections.emptyMap(), Collections.emptyMap(), Collections.emptyMap()); stateHandles.putSubtaskStateByOperatorID(headOperatorID, emptyHeadOperatorState); createRunAndCheckpointOperatorChain( headOperatorID, new CounterOperator(), tailOperatorID, new CounterOperator(), Optional.of(restore)); assertEquals(new HashSet<>(Arrays.asList(headOperatorID, tailOperatorID)), RESTORED_OPERATORS); }
lastJobManagerTaskStateSnapshot.getSubtaskStateByOperatorID(operatorID);
// The same operatorID is looked up in lastTaskManagerTaskStateSnapshot as well.
// NOTE(review): neither lookup result is bound to a variable in the visible text; the
// left-hand sides of these statements were presumably lost from this excerpt -- confirm
// against the full file.
lastTaskManagerTaskStateSnapshot.getSubtaskStateByOperatorID(operatorID);
// Best-effort discard of the state object: whatever is thrown while discarding is logged
// as a warning (with checkpoint, task and job identifiers) and is not rethrown.
@Override
public void run() {
    try {
        subtaskState.discardState();
    } catch (Throwable discardFailure) {
        LOG.warn(
            "Could not properly discard state object of checkpoint {} " +
                "belonging to task {} of job {}.",
            checkpointId,
            executionAttemptID,
            jobId,
            discardFailure);
    }
}
});
// A fresh job vertex ID created with the no-argument constructor.
JobVertexID jobVertexID = new JobVertexID();
// Two empty task state snapshots.
// NOTE(review): "jm"/"tm" presumably stand for job manager / task manager -- confirm
// against the code that consumes these variables.
TaskStateSnapshot jmSnapshot = new TaskStateSnapshot();
TaskStateSnapshot tmSnapshot = new TaskStateSnapshot();
/**
 * Combines the superclass hash code with the hash code of {@code subtaskState}.
 *
 * @return {@code 31 * super.hashCode()} plus the hash of {@code subtaskState}, where a
 *         {@code null} {@code subtaskState} contributes {@code 0}
 */
@Override
public int hashCode() {
    final int superHash = super.hashCode();
    final int stateHash = (subtaskState == null) ? 0 : subtaskState.hashCode();
    return 31 * superHash + stateHash;
}
/**
 * Compares this message with another object.
 *
 * @param o the object to compare with
 * @return {@code true} if {@code o} is this instance, or if it is an
 *         {@code AcknowledgeCheckpoint} that the superclass considers equal and whose
 *         {@code subtaskState} equals this one's (both {@code null} counts as equal)
 */
@Override
public boolean equals(Object o) {
    if (this == o) {
        return true;
    }

    // Type check first, then the superclass comparison; either failing means "not equal".
    if (!(o instanceof AcknowledgeCheckpoint) || !super.equals(o)) {
        return false;
    }

    final AcknowledgeCheckpoint other = (AcknowledgeCheckpoint) o;
    if (subtaskState == null) {
        return other.subtaskState == null;
    }
    return subtaskState.equals(other.subtaskState);
}
jobManagerStateSnapshot.getSubtaskStateByOperatorID(operatorID);
// Fetch the entry for the same operatorID from localStateSnapshot.
// NOTE(review): the lookup on jobManagerStateSnapshot above is not assigned in the visible
// text; presumably its left-hand side precedes this excerpt -- confirm.
OperatorSubtaskState localSubtaskState = localStateSnapshot.getSubtaskStateByOperatorID(operatorID);
// Discards the state object on a best-effort basis: any Throwable raised by the discard is
// reported through a warning log entry and then dropped.
@Override
public void run() {
    try {
        subtaskState.discardState();
    } catch (Throwable discardError) {
        LOG.warn(
            "Could not properly discard state object of checkpoint {} " +
                "belonging to task {} of job {}.",
            checkpointId, executionAttemptID, jobId, discardError);
    }
}
});
/**
 * Hash code built from the superclass hash code and {@code subtaskState}.
 *
 * @return the superclass hash multiplied by 31, plus {@code subtaskState.hashCode()} when
 *         {@code subtaskState} is non-null (otherwise plus {@code 0})
 */
@Override
public int hashCode() {
    int hash = 31 * super.hashCode();
    if (subtaskState != null) {
        hash += subtaskState.hashCode();
    }
    return hash;
}
/**
 * Equality check covering identity, type, the superclass comparison and
 * {@code subtaskState}.
 *
 * @param o candidate object
 * @return {@code true} only for this very instance, or for an {@code AcknowledgeCheckpoint}
 *         accepted by {@code super.equals} whose {@code subtaskState} matches this one's
 *         (two {@code null} values match)
 */
@Override
public boolean equals(Object o) {
    if (this == o) {
        return true;
    }

    if (o instanceof AcknowledgeCheckpoint && super.equals(o)) {
        final AcknowledgeCheckpoint candidate = (AcknowledgeCheckpoint) o;

        // A null state on this side only matches a null state on the other side.
        return (subtaskState == null)
            ? candidate.subtaskState == null
            : subtaskState.equals(candidate.subtaskState);
    }

    return false;
}
// Derive the operator ID from jobVertexID and store it in the stream config.
OperatorID operatorID = OperatorID.fromJobVertexID(jobVertexID);
streamConfig.setOperatorID(operatorID);
// Create a new snapshot and put operatorSubtaskState into it under that operator ID.
TaskStateSnapshot stateSnapshot = new TaskStateSnapshot();
stateSnapshot.putSubtaskStateByOperatorID(operatorID, operatorSubtaskState);
jobManagerStateSnapshot.getSubtaskStateByOperatorID(operatorID);
// Look up the local counterpart for the same operatorID and keep it in localSubtaskState.
// NOTE(review): the result of the jobManagerStateSnapshot lookup above is not assigned in
// the visible text; its left-hand side presumably precedes this excerpt -- confirm.
OperatorSubtaskState localSubtaskState = localStateSnapshot.getSubtaskStateByOperatorID(operatorID);
// Tries to discard the state object. A failure of any kind does not propagate; it is
// recorded as a warning that names the checkpoint, the task and the job.
@Override
public void run() {
    try {
        subtaskState.discardState();
    } catch (Throwable cause) {
        LOG.warn(
            "Could not properly discard state object of checkpoint {} " +
                "belonging to task {} of job {}.",
            checkpointId,
            executionAttemptID,
            jobId,
            cause);
    }
}
});
/**
 * Derives the hash code from {@code super.hashCode()} and {@code subtaskState}.
 *
 * @return {@code 31 * super.hashCode() + h}, where {@code h} is
 *         {@code subtaskState.hashCode()} or {@code 0} if {@code subtaskState} is
 *         {@code null}
 */
@Override
public int hashCode() {
    final int base = super.hashCode();

    int subtaskStateHash = 0;
    if (subtaskState != null) {
        subtaskStateHash = subtaskState.hashCode();
    }

    return 31 * base + subtaskStateHash;
}
/**
 * Tests whether the given object is equal to this message.
 *
 * @param o object to test
 * @return {@code true} for the same instance; otherwise {@code true} only if {@code o} is
 *         an {@code AcknowledgeCheckpoint}, {@code super.equals(o)} holds, and both
 *         {@code subtaskState} fields are either equal or both {@code null}
 */
@Override
public boolean equals(Object o) {
    if (this == o) {
        return true;
    }
    if (!(o instanceof AcknowledgeCheckpoint)) {
        return false;
    }
    if (!super.equals(o)) {
        return false;
    }

    final AcknowledgeCheckpoint rhs = (AcknowledgeCheckpoint) o;

    final boolean sameState;
    if (subtaskState != null) {
        sameState = subtaskState.equals(rhs.subtaskState);
    } else {
        sameState = rhs.subtaskState == null;
    }
    return sameState;
}
private void reportCompletedSnapshotStates( TaskStateSnapshot acknowledgedTaskStateSnapshot, TaskStateSnapshot localTaskStateSnapshot, long asyncDurationMillis) { TaskStateManager taskStateManager = owner.getEnvironment().getTaskStateManager(); boolean hasAckState = acknowledgedTaskStateSnapshot.hasState(); boolean hasLocalState = localTaskStateSnapshot.hasState(); Preconditions.checkState(hasAckState || !hasLocalState, "Found cached state but no corresponding primary state is reported to the job " + "manager. This indicates a problem."); // we signal stateless tasks by reporting null, so that there are no attempts to assign empty state // to stateless tasks on restore. This enables simple job modifications that only concern // stateless without the need to assign them uids to match their (always empty) states. taskStateManager.reportTaskStateSnapshots( checkpointMetaData, checkpointMetrics, hasAckState ? acknowledgedTaskStateSnapshot : null, hasLocalState ? localTaskStateSnapshot : null); LOG.debug("{} - finished asynchronous part of checkpoint {}. Asynchronous duration: {} ms", owner.getName(), checkpointMetaData.getCheckpointId(), asyncDurationMillis); LOG.trace("{} - reported the following states in snapshot for checkpoint {}: {}.", owner.getName(), checkpointMetaData.getCheckpointId(), acknowledgedTaskStateSnapshot); }
new StateObjectCollection<>(nullToEmptyCollection(localRawKeyGroupState))); // tail of a statement that begins outside the visible excerpt

// First snapshot: holds processedJmOpSubtaskState, keyed by operator.getOperatorID().
TaskStateSnapshot jmTaskStateSnapshot = new TaskStateSnapshot();
jmTaskStateSnapshot.putSubtaskStateByOperatorID(operator.getOperatorID(), processedJmOpSubtaskState);

// Second snapshot: holds tmOperatorStateHandles, keyed by operator.getOperatorID().
TaskStateSnapshot tmTaskStateSnapshot = new TaskStateSnapshot();
tmTaskStateSnapshot.putSubtaskStateByOperatorID(operator.getOperatorID(), tmOperatorStateHandles);

// Hand the second snapshot to the task state manager in a single-entry map under key 0L
// (presumably the checkpoint ID, going by the setter's name -- confirm).
taskStateManager.setTaskManagerTaskStateSnapshotsByCheckpointId(
Collections.singletonMap(0L, tmTaskStateSnapshot));
jobManagerStateSnapshot.getSubtaskStateByOperatorID(operatorID);
// Retrieve the localStateSnapshot entry for the same operatorID.
// NOTE(review): the preceding jobManagerStateSnapshot lookup has no visible assignment
// target; the start of that statement presumably lies before this excerpt -- confirm.
OperatorSubtaskState localSubtaskState = localStateSnapshot.getSubtaskStateByOperatorID(operatorID);