@Override public void shardInitialized(ShardRouting unassignedShard, ShardRouting initializedShard) { // if we force an empty primary, we should also fail the restore entry if (unassignedShard.recoverySource().getType() == RecoverySource.Type.SNAPSHOT && initializedShard.recoverySource().getType() != RecoverySource.Type.SNAPSHOT) { changes(unassignedShard.recoverySource()).shards.put( unassignedShard.shardId(), new ShardRestoreStatus(null, RestoreInProgress.State.FAILURE, "recovery source type changed from snapshot to " + initializedShard.recoverySource()) ); } }
/** * Is the allocator responsible for allocating the given {@link ShardRouting}? */ private static boolean isResponsibleFor(final ShardRouting shard) { return shard.primary() // must be primary && shard.unassigned() // must be unassigned // only handle either an existing store or a snapshot recovery && (shard.recoverySource().getType() == RecoverySource.Type.EXISTING_STORE || shard.recoverySource().getType() == RecoverySource.Type.SNAPSHOT); }
/**
 * Finds the assigned primary shard copy for the given routing, but only when the routing
 * is peer-recovering (otherwise no source primary is relevant and {@code null} is returned).
 *
 * Preference order among the assigned primary copies of the same shard id:
 * an active primary is returned immediately; otherwise an inactive primary is tracked,
 * and a later candidate replaces it only when the tracked one is a relocation target
 * (i.e. has a non-null relocatingNodeId).
 *
 * @param routing the shard whose primary should be located
 * @return the chosen primary copy, or {@code null} if none is found or recovery is not peer-based
 */
@Nullable
private ShardRouting findAssignedPrimaryIfPeerRecovery(ShardRouting routing) {
    ShardRouting primary = null;
    if (routing.recoverySource() != null && routing.recoverySource().getType() == RecoverySource.Type.PEER) {
        List<ShardRouting> shardRoutings = assignedShards.get(routing.shardId());
        if (shardRoutings != null) {
            for (ShardRouting shardRouting : shardRoutings) {
                if (shardRouting.primary()) {
                    if (shardRouting.active()) {
                        // an active primary is the best possible source - stop looking
                        return shardRouting;
                    } else if (primary == null) {
                        // remember the first inactive primary as a fallback
                        primary = shardRouting;
                    } else if (primary.relocatingNodeId() != null) {
                        // prefer a candidate over a tracked copy that is relocating
                        primary = shardRouting;
                    }
                }
            }
        }
    }
    return primary;
}
/**
 * Moves the shard to unassigned state.
 *
 * An active shard gets a fresh recovery source (existing store for a primary, peer for a
 * replica); a shard that never became active keeps the recovery source it already had.
 */
public ShardRouting moveToUnassigned(UnassignedInfo unassignedInfo) {
    assert state != ShardRoutingState.UNASSIGNED : this;
    final RecoverySource recoverySource;
    if (active() == false) {
        recoverySource = recoverySource();
    } else if (primary()) {
        recoverySource = ExistingStoreRecoverySource.INSTANCE;
    } else {
        recoverySource = PeerRecoverySource.INSTANCE;
    }
    return new ShardRouting(shardId, null, null, primary, ShardRoutingState.UNASSIGNED,
        recoverySource, unassignedInfo, null, UNAVAILABLE_EXPECTED_SHARD_SIZE);
}
@Override
public void unassignedInfoUpdated(ShardRouting unassignedShard, UnassignedInfo newUnassignedInfo) {
    final RecoverySource recoverySource = unassignedShard.recoverySource();
    // when every decider rejected the allocation of a snapshot-recovering shard,
    // the restore of that shard cannot make progress - record it as failed
    if (recoverySource.getType() == RecoverySource.Type.SNAPSHOT
            && newUnassignedInfo.getLastAllocationStatus() == UnassignedInfo.AllocationStatus.DECIDERS_NO) {
        String reason = "shard could not be allocated to any of the nodes";
        changes(recoverySource).shards.put(
            unassignedShard.shardId(),
            new ShardRestoreStatus(unassignedShard.currentNodeId(), RestoreInProgress.State.FAILURE, reason));
    }
}
@Override public void shardStarted(ShardRouting initializingShard, ShardRouting startedShard) { // mark snapshot as completed if (initializingShard.primary()) { RecoverySource recoverySource = initializingShard.recoverySource(); if (recoverySource.getType() == RecoverySource.Type.SNAPSHOT) { changes(recoverySource).shards.put( initializingShard.shardId(), new ShardRestoreStatus(initializingShard.currentNodeId(), RestoreInProgress.State.SUCCESS)); } } }
/**
 * Checks if an inactive primary shard should cause the cluster health to go RED.
 *
 * An inactive primary shard in an index should cause the cluster health to be RED to make it visible that some of the
 * existing data is unavailable. In case of index creation, snapshot restore or index shrinking, which are unexceptional
 * events in the cluster lifecycle, cluster health should not turn RED for the time where primaries are still in the
 * initializing state but go to YELLOW instead. However, in case of exceptional events, for example when the primary
 * shard cannot be assigned to a node or initialization fails at some point, cluster health should still turn RED.
 *
 * NB: this method should *not* be called on active shards nor on non-primary shards.
 */
public static ClusterHealthStatus getInactivePrimaryHealth(final ShardRouting shardRouting) {
    assert shardRouting.primary() : "cannot invoke on a replica shard: " + shardRouting;
    assert shardRouting.active() == false : "cannot invoke on an active shard: " + shardRouting;
    assert shardRouting.unassignedInfo() != null : "cannot invoke on a shard with no UnassignedInfo: " + shardRouting;
    assert shardRouting.recoverySource() != null : "cannot invoke on a shard that has no recovery source" + shardRouting;
    final UnassignedInfo unassignedInfo = shardRouting.unassignedInfo();
    final RecoverySource.Type recoveryType = shardRouting.recoverySource().getType();
    // these recovery types are part of normal cluster lifecycle (creation, restore, shrink)
    final boolean lifecycleRecovery = recoveryType == RecoverySource.Type.EMPTY_STORE
        || recoveryType == RecoverySource.Type.LOCAL_SHARDS
        || recoveryType == RecoverySource.Type.SNAPSHOT;
    // deciders saying NO or previous allocation failures indicate an exceptional situation
    final boolean exceptional = unassignedInfo.getLastAllocationStatus() == AllocationStatus.DECIDERS_NO
        || unassignedInfo.getNumFailedAllocations() != 0;
    if (lifecycleRecovery && exceptional == false) {
        return ClusterHealthStatus.YELLOW;
    }
    return ClusterHealthStatus.RED;
}
@Override public void shardFailed(ShardRouting failedShard, UnassignedInfo unassignedInfo) { if (failedShard.primary() && failedShard.initializing()) { RecoverySource recoverySource = failedShard.recoverySource(); if (recoverySource.getType() == RecoverySource.Type.SNAPSHOT) { // mark restore entry for this shard as failed when it's due to a file corruption. There is no need wait on retries // to restore this shard on another node if the snapshot files are corrupt. In case where a node just left or crashed, // however, we only want to acknowledge the restore operation once it has been successfully restored on another node. if (unassignedInfo.getFailure() != null && Lucene.isCorruptionException(unassignedInfo.getFailure().getCause())) { changes(recoverySource).shards.put( failedShard.shardId(), new ShardRestoreStatus(failedShard.currentNodeId(), RestoreInProgress.State.FAILURE, unassignedInfo.getFailure().getCause().getMessage())); } } } }
@Override public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { if (shardRouting.unassigned()) { // only for unassigned - we filter allocation right after the index creation ie. for shard shrinking etc. to ensure // that once it has been allocated post API the replicas can be allocated elsewhere without user interaction // this is a setting that can only be set within the system! IndexMetaData indexMd = allocation.metaData().getIndexSafe(shardRouting.index()); DiscoveryNodeFilters initialRecoveryFilters = indexMd.getInitialRecoveryFilters(); if (initialRecoveryFilters != null && INITIAL_RECOVERY_TYPES.contains(shardRouting.recoverySource().getType()) && initialRecoveryFilters.match(node.node()) == false) { String explanation = (shardRouting.recoverySource().getType() == RecoverySource.Type.LOCAL_SHARDS) ? "initial allocation of the shrunken index is only allowed on nodes [%s] that hold a copy of every shard in the index" : "initial allocation of the index is only allowed on nodes [%s]"; return allocation.decision(Decision.NO, NAME, explanation, initialRecoveryFilters); } } return shouldFilter(shardRouting, node, allocation); }
@Override public void shardStarted(ShardRouting initializingShard, ShardRouting startedShard) { assert Objects.equals(initializingShard.allocationId().getId(), startedShard.allocationId().getId()) : "initializingShard.allocationId [" + initializingShard.allocationId().getId() + "] and startedShard.allocationId [" + startedShard.allocationId().getId() + "] have to have the same"; Updates updates = changes(startedShard.shardId()); updates.addedAllocationIds.add(startedShard.allocationId().getId()); if (startedShard.primary() // started shard has to have null recoverySource; have to pick up recoverySource from its initializing state && (initializingShard.recoverySource() == RecoverySource.ExistingStoreRecoverySource.FORCE_STALE_PRIMARY_INSTANCE)) { updates.removedAllocationIds.add(RecoverySource.ExistingStoreRecoverySource.FORCED_ALLOCATION_ID); } }
/**
 * Creates a shard on the local node for the given initializing routing entry; failures are
 * routed through {@code failAndRemoveShard} rather than propagated to the caller.
 */
private void createShard(DiscoveryNodes nodes, RoutingTable routingTable, ShardRouting shardRouting, ClusterState state) {
    assert shardRouting.initializing() : "only allow shard creation for initializing shard but was " + shardRouting;

    DiscoveryNode sourceNode = null;
    if (shardRouting.recoverySource().getType() == Type.PEER) {
        // peer recovery needs a source node to copy from; without one we skip and wait for a later cluster state
        sourceNode = findSourceNodeForPeerRecovery(logger, routingTable, nodes, shardRouting);
        if (sourceNode == null) {
            logger.trace("ignoring initializing shard {} - no source node can be found.", shardRouting.shardId());
            return;
        }
    }

    try {
        logger.debug("{} creating shard", shardRouting.shardId());
        final RecoveryState recoveryState = new RecoveryState(shardRouting, nodes.getLocalNode(), sourceNode);
        indicesService.createShard(shardRouting, recoveryState, recoveryTargetService, new RecoveryListener(shardRouting),
            repositoriesService, failedShardHandler, globalCheckpointSyncer);
    } catch (Exception e) {
        failAndRemoveShard(shardRouting, true, "failed to create shard", e, state);
    }
}
/**
 * Creates the recovery state for an initializing shard and starts the recovery timer.
 *
 * @param shardRouting the shard being recovered; must be in the initializing state
 * @param targetNode   the node the shard is being recovered onto
 * @param sourceNode   the node the shard recovers from; must be non-null if and only if
 *                     the shard's recovery source is of type PEER
 */
public RecoveryState(ShardRouting shardRouting, DiscoveryNode targetNode, @Nullable DiscoveryNode sourceNode) {
    assert shardRouting.initializing() : "only allow initializing shard routing to be recovered: " + shardRouting;
    RecoverySource recoverySource = shardRouting.recoverySource();
    // source node presence must exactly mirror peer-type recovery
    assert (recoverySource.getType() == RecoverySource.Type.PEER) == (sourceNode != null) :
        "peer recovery requires source node, recovery type: " + recoverySource.getType() + " source node: " + sourceNode;
    this.shardId = shardRouting.shardId();
    this.primary = shardRouting.primary();
    this.recoverySource = recoverySource;
    this.sourceNode = sourceNode;
    this.targetNode = targetNode;
    stage = Stage.INIT;
    // timing begins as soon as the state object exists, before any recovery work happens
    timer.start();
}
@Override public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { if (shardRouting.primary()) { if (shardRouting.currentNodeId() == null) { if (shardRouting.recoverySource() != null && shardRouting.recoverySource().getType() == RecoverySource.Type.SNAPSHOT) { // restoring from a snapshot - check that the node can handle the version return isVersionCompatible((SnapshotRecoverySource)shardRouting.recoverySource(), node, allocation); } else { // existing or fresh primary on the node return allocation.decision(Decision.YES, NAME, "the primary shard is new or already existed on the node"); } } else { // relocating primary, only migrate to newer host return isVersionCompatibleRelocatePrimary(allocation.routingNodes(), shardRouting.currentNodeId(), node, allocation); } } else { final ShardRouting primary = allocation.routingNodes().activePrimary(shardRouting.shardId()); // check that active primary has a newer version so that peer recovery works if (primary != null) { return isVersionCompatibleAllocatingReplica(allocation.routingNodes(), primary.currentNodeId(), node, allocation); } else { // ReplicaAfterPrimaryActiveAllocationDecider should prevent this case from occurring return allocation.decision(Decision.YES, NAME, "no active primary shard yet"); } } }
/**
 * Adjusts the per-node incoming/outgoing recovery counters for an initializing shard.
 *
 * @param routing   the initializing shard whose recovery is being counted
 * @param increment {@code true} to add the recovery, {@code false} to remove it
 * @param primary   the shard's primary copy, required (non-null) for peer recoveries
 */
private void updateRecoveryCounts(final ShardRouting routing, final boolean increment, @Nullable final ShardRouting primary) {
    final int howMany = increment ? 1 : -1;
    assert routing.initializing() : "routing must be initializing: " + routing;
    // TODO: check primary == null || primary.active() after all tests properly add ReplicaAfterPrimaryActiveAllocationDecider
    assert primary == null || primary.assignedToNode() : "shard is initializing but its primary is not assigned to a node";
    // the node the shard initializes on always gains/loses an incoming recovery
    Recoveries.getOrAdd(recoveriesPerNode, routing.currentNodeId()).addIncoming(howMany);

    if (routing.recoverySource().getType() == RecoverySource.Type.PEER) {
        // add/remove corresponding outgoing recovery on node with primary shard
        if (primary == null) {
            throw new IllegalStateException("shard is peer recovering but primary is unassigned");
        }
        Recoveries.getOrAdd(recoveriesPerNode, primary.currentNodeId()).addOutgoing(howMany);

        if (increment == false && routing.primary() && routing.relocatingNodeId() != null) {
            // primary is done relocating, move non-primary recoveries from old primary to new primary
            int numRecoveringReplicas = 0;
            for (ShardRouting assigned : assignedShards(routing.shardId())) {
                if (assigned.primary() == false && assigned.initializing()
                        && assigned.recoverySource().getType() == RecoverySource.Type.PEER) {
                    numRecoveringReplicas++;
                }
            }
            // shift the replica recovery count from the relocation source node to the new primary node
            recoveriesPerNode.get(routing.relocatingNodeId()).addOutgoing(-numRecoveringReplicas);
            recoveriesPerNode.get(routing.currentNodeId()).addOutgoing(numRecoveringReplicas);
        }
    }
}
/**
 * Marks a shard as temporarily ignored and adds it to the ignore unassigned list.
 * Should be used with caution, typically,
 * the correct usage is to removeAndIgnore from the iterator.
 * @see #ignored()
 * @see UnassignedIterator#removeAndIgnore(AllocationStatus, RoutingChangesObserver)
 * @see #isIgnoredEmpty()
 */
public void ignoreShard(ShardRouting shard, AllocationStatus allocationStatus, RoutingChangesObserver changes) {
    nodes.ensureMutable();
    if (shard.primary()) {
        ignoredPrimaries++;
        final UnassignedInfo currInfo = shard.unassignedInfo();
        assert currInfo != null;
        // only rebuild the unassigned info when the allocation status actually changed
        if (allocationStatus.equals(currInfo.getLastAllocationStatus()) == false) {
            final UnassignedInfo newInfo = new UnassignedInfo(currInfo.getReason(), currInfo.getMessage(),
                currInfo.getFailure(), currInfo.getNumFailedAllocations(), currInfo.getUnassignedTimeInNanos(),
                currInfo.getUnassignedTimeInMillis(), currInfo.isDelayed(), allocationStatus);
            final ShardRouting updatedShard = shard.updateUnassigned(newInfo, shard.recoverySource());
            changes.unassignedInfoUpdated(shard, newInfo);
            shard = updatedShard;
        }
    }
    ignored.add(shard);
}
/**
 * Reset failed allocation counter for unassigned shards
 */
private void resetFailedAllocationCounter(RoutingAllocation allocation) {
    final RoutingNodes.UnassignedShards.UnassignedIterator unassignedIterator =
        allocation.routingNodes().unassigned().iterator();
    while (unassignedIterator.hasNext()) {
        final ShardRouting shardRouting = unassignedIterator.next();
        final UnassignedInfo info = shardRouting.unassignedInfo();
        // shards with recorded failures are relabelled as manually allocated; others keep their reason
        final UnassignedInfo.Reason reason =
            info.getNumFailedAllocations() > 0 ? UnassignedInfo.Reason.MANUAL_ALLOCATION : info.getReason();
        unassignedIterator.updateUnassigned(
            new UnassignedInfo(reason, info.getMessage(), info.getFailure(), 0, info.getUnassignedTimeInNanos(),
                info.getUnassignedTimeInMillis(), info.isDelayed(), info.getLastAllocationStatus()),
            shardRouting.recoverySource(), allocation.changes());
    }
}
/** * Returns the expected shard size for the given shard or the default value provided if not enough information are available * to estimate the shards size. */ public static long getExpectedShardSize(ShardRouting shard, RoutingAllocation allocation, long defaultValue) { final IndexMetaData metaData = allocation.metaData().getIndexSafe(shard.index()); final ClusterInfo info = allocation.clusterInfo(); if (metaData.getResizeSourceIndex() != null && shard.active() == false && shard.recoverySource().getType() == RecoverySource.Type.LOCAL_SHARDS) { // in the shrink index case we sum up the source index shards since we basically make a copy of the shard in // the worst case long targetShardSize = 0; final Index mergeSourceIndex = metaData.getResizeSourceIndex(); final IndexMetaData sourceIndexMeta = allocation.metaData().index(mergeSourceIndex); if (sourceIndexMeta != null) { final Set<ShardId> shardIds = IndexMetaData.selectRecoverFromShards(shard.id(), sourceIndexMeta, metaData.getNumberOfShards()); for (IndexShardRoutingTable shardRoutingTable : allocation.routingTable().index(mergeSourceIndex.getName())) { if (shardIds.contains(shardRoutingTable.shardId())) { targetShardSize += info.getShardSize(shardRoutingTable.primaryShard(), 0); } } } return targetShardSize == 0 ? defaultValue : targetShardSize; } else { return info.getShardSize(shard, defaultValue); } } }
/**
 * Initializes an unassigned shard on a node and removes it from the unassigned
 *
 * @param allocation the allocation
 * @param routingNodes the routing nodes
 * @param routingNode the node to initialize it to
 * @param shardRouting the shard routing that is to be matched in unassigned shards
 * @param unassignedInfo unassigned info to override
 * @param recoverySource recovery source to override
 */
protected void initializeUnassignedShard(RoutingAllocation allocation, RoutingNodes routingNodes, RoutingNode routingNode,
                                         ShardRouting shardRouting, @Nullable UnassignedInfo unassignedInfo,
                                         @Nullable RecoverySource recoverySource) {
    final RoutingNodes.UnassignedShards.UnassignedIterator it = routingNodes.unassigned().iterator();
    while (it.hasNext()) {
        ShardRouting unassigned = it.next();
        if (unassigned.equalsIgnoringMetaData(shardRouting) == false) {
            continue;
        }
        // apply the overrides (if any) before initializing so observers see the final values
        if (unassignedInfo != null || recoverySource != null) {
            unassigned = it.updateUnassigned(
                unassignedInfo != null ? unassignedInfo : unassigned.unassignedInfo(),
                recoverySource != null ? recoverySource : unassigned.recoverySource(),
                allocation.changes());
        }
        it.initialize(routingNode.nodeId(), null,
            allocation.clusterInfo().getShardSize(unassigned, ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE),
            allocation.changes());
        return;
    }
    assert false : "shard to initialize not found in list of unassigned shards";
}
@Override
public Decision canAllocate(final ShardRouting shardRouting, final RoutingAllocation allocation) {
    final RecoverySource recoverySource = shardRouting.recoverySource();
    if (recoverySource == null || recoverySource.getType() != RecoverySource.Type.SNAPSHOT) {
        return allocation.decision(Decision.YES, NAME, "ignored as shard is not being recovered from a snapshot");
    }

    final RecoverySource.SnapshotRecoverySource source = (RecoverySource.SnapshotRecoverySource) recoverySource;
    final RestoreInProgress restoresInProgress = allocation.custom(RestoreInProgress.TYPE);
    if (restoresInProgress != null) {
        final RestoreInProgress.Entry restoreInProgress = restoresInProgress.get(source.restoreUUID());
        if (restoreInProgress != null) {
            final RestoreInProgress.ShardRestoreStatus shardRestoreStatus =
                restoreInProgress.shards().get(shardRouting.shardId());
            // an incomplete restore entry means the shard may still be restored; allow the allocation
            if (shardRestoreStatus != null && shardRestoreStatus.state().completed() == false) {
                assert shardRestoreStatus.state() != RestoreInProgress.State.SUCCESS : "expected shard [" + shardRouting
                    + "] to be in initializing state but got [" + shardRestoreStatus.state() + "]";
                return allocation.decision(Decision.YES, NAME, "shard is currently being restored");
            }
        }
    }
    // the restore either completed with a failure or its entry is gone - allocating would never succeed
    return allocation.decision(Decision.NO, NAME, "shard has failed to be restored from the snapshot [%s] because of [%s] - " +
        "manually close or delete the index [%s] in order to retry to restore the snapshot again or use the reroute API to force the " +
        "allocation of an empty primary shard", source.snapshot(), shardRouting.unassignedInfo().getDetails(),
        shardRouting.getIndexName());
}
/**
 * Removes delay markers from unassigned shards based on current time stamp.
 */
private void removeDelayMarkers(RoutingAllocation allocation) {
    final MetaData metaData = allocation.metaData();
    final RoutingNodes.UnassignedShards.UnassignedIterator it = allocation.routingNodes().unassigned().iterator();
    while (it.hasNext()) {
        final ShardRouting shardRouting = it.next();
        final UnassignedInfo info = shardRouting.unassignedInfo();
        if (info.isDelayed() == false) {
            continue;
        }
        final long remainingDelayNanos = info.getRemainingDelay(allocation.getCurrentNanoTime(),
            metaData.getIndexSafe(shardRouting.index()).getSettings());
        // the delay has fully elapsed - rewrite the info with the delayed flag cleared
        if (remainingDelayNanos == 0) {
            it.updateUnassigned(new UnassignedInfo(info.getReason(), info.getMessage(), info.getFailure(),
                info.getNumFailedAllocations(), info.getUnassignedTimeInNanos(), info.getUnassignedTimeInMillis(),
                false, info.getLastAllocationStatus()), shardRouting.recoverySource(), allocation.changes());
        }
    }
}