@Override protected AsyncShardFetch.FetchResult<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> fetchData(ShardRouting shard, RoutingAllocation allocation) { AsyncShardFetch<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> fetch = asyncFetchStarted.computeIfAbsent(shard.shardId(), shardId -> new InternalAsyncFetch<>(logger, "shard_started", shardId, startedAction)); AsyncShardFetch.FetchResult<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> shardState = fetch.fetchData(allocation.nodes(), allocation.getIgnoreNodes(shard.shardId())); if (shardState.hasData()) { shardState.processAllocation(allocation); } return shardState; } }
private void deassociateDeadNodes(RoutingAllocation allocation) { for (Iterator<RoutingNode> it = allocation.routingNodes().mutableIterator(); it.hasNext(); ) { RoutingNode node = it.next(); if (allocation.nodes().getDataNodes().containsKey(node.nodeId())) { // its a live node, continue continue; } // now, go over all the shards routing on the node, and fail them for (ShardRouting shardRouting : node.copyShards()) { final IndexMetaData indexMetaData = allocation.metaData().getIndexSafe(shardRouting.index()); boolean delayed = INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.get(indexMetaData.getSettings()).nanos() > 0; UnassignedInfo unassignedInfo = new UnassignedInfo(UnassignedInfo.Reason.NODE_LEFT, "node_left[" + node.nodeId() + "]", null, 0, allocation.getCurrentNanoTime(), System.currentTimeMillis(), delayed, AllocationStatus.NO_ATTEMPT); allocation.routingNodes().failShard(logger, shardRouting, unassignedInfo, indexMetaData, allocation.changes()); } // its a dead node, remove it, note, its important to remove it *after* we apply failed shard // since it relies on the fact that the RoutingNode exists in the list of nodes it.remove(); } }
/** * Reroutes the routing table based on the live nodes. * <p> * If the same instance of ClusterState is returned, then no change has been made. */ protected ClusterState reroute(ClusterState clusterState, String reason, boolean debug) { ClusterState fixedClusterState = adaptAutoExpandReplicas(clusterState); RoutingNodes routingNodes = getMutableRoutingNodes(fixedClusterState); // shuffle the unassigned nodes, just so we won't have things like poison failed shards routingNodes.unassigned().shuffle(); RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, routingNodes, fixedClusterState, clusterInfoService.getClusterInfo(), currentNanoTime()); allocation.debugDecision(debug); reroute(allocation); if (fixedClusterState == clusterState && allocation.routingNodesChanged() == false) { return clusterState; } return buildResultAndLogHealthChange(clusterState, allocation, reason); }
@Override public Decision canRebalance(ShardRouting shardRouting, RoutingAllocation allocation) { if (!allocation.routingNodes().allReplicasActive(shardRouting.shardId(), allocation.metaData())) { return allocation.decision(Decision.NO, NAME, "rebalancing is not allowed until all replicas in the cluster are active"); } return allocation.decision(Decision.YES, NAME, "rebalancing is allowed as all replicas are active in the cluster"); } }
@Override public Decision canAllocate(ShardRouting shardRouting, RoutingAllocation allocation) { if (shardRouting.primary()) { return allocation.decision(Decision.YES, NAME, "shard is primary and can be allocated"); } ShardRouting primary = allocation.routingNodes().activePrimary(shardRouting.shardId()); if (primary == null) { return allocation.decision(Decision.NO, NAME, "primary shard for this replica is not yet active"); } return allocation.decision(Decision.YES, NAME, "primary shard for this replica is already active"); } }
private boolean hasDeadNodes(RoutingAllocation allocation) { for (RoutingNode routingNode : allocation.routingNodes()) { if (allocation.nodes().getDataNodes().containsKey(routingNode.nodeId()) == false) { return true; } } return false; }
RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, routingNodes, tmpState, clusterInfoService.getClusterInfo(), currentNanoTime); IndexMetaData indexMetaData = allocation.metaData().getIndexSafe(shardToFail.shardId().getIndex()); allocation.addIgnoreShardForNode(shardToFail.shardId(), shardToFail.currentNodeId()); AllocationStatus.NO_ATTEMPT); if (failedShardEntry.markAsStale()) { allocation.removeAllocationId(failedShard); routingNodes.failShard(logger, failedShard, unassignedInfo, indexMetaData, allocation.changes()); } else { logger.trace("{} shard routing failed in an earlier iteration (routing: {})", shardToFail.shardId(), shardToFail);
@Override public RerouteExplanation execute(RoutingAllocation allocation, boolean explain) { DiscoveryNode fromDiscoNode = allocation.nodes().resolveNode(fromNode); DiscoveryNode toDiscoNode = allocation.nodes().resolveNode(toNode); Decision decision = null; RoutingNode fromRoutingNode = allocation.routingNodes().node(fromDiscoNode.getId()); if (fromRoutingNode == null && !fromDiscoNode.isDataNode()) { throw new IllegalArgumentException("[move_allocation] can't move [" + index + "][" + shardId + "] from " + "] is not a data node."); RoutingNode toRoutingNode = allocation.routingNodes().node(toDiscoNode.getId()); if (toRoutingNode == null && !toDiscoNode.isDataNode()) { throw new IllegalArgumentException("[move_allocation] can't move [" + index + "][" + shardId + "] from " return new RerouteExplanation(this, allocation.decision(Decision.NO, "move_allocation_command", "shard " + shardId + " has not been started")); decision = allocation.deciders().canAllocate(shardRouting, toRoutingNode, allocation); if (decision.type() == Decision.Type.NO) { if (explain) { allocation.routingNodes().relocateShard(shardRouting, toRoutingNode.nodeId(), allocation.clusterInfo().getShardSize(shardRouting, ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE), allocation.changes()); return new RerouteExplanation(this, allocation.decision(Decision.NO, "move_allocation_command", "shard " + shardId + " not found"));
@Override public RerouteExplanation execute(RoutingAllocation allocation, boolean explain) { DiscoveryNode discoNode = allocation.nodes().resolveNode(node); ShardRouting shardRouting = null; RoutingNodes routingNodes = allocation.routingNodes(); RoutingNode routingNode = routingNodes.node(discoNode.getId()); IndexMetaData indexMetaData = null; if (routingNode != null) { indexMetaData = allocation.metaData().index(index()); if (indexMetaData == null) { throw new IndexNotFoundException(index()); return new RerouteExplanation(this, allocation.decision(Decision.NO, "cancel_allocation_command", "can't cancel " + shardId + ", failed to find it on node " + discoNode)); return new RerouteExplanation(this, allocation.decision(Decision.NO, "cancel_allocation_command", "can't cancel " + shardId + " on node " + discoNode + ", shard is primary and " + shardRouting.state().name().toLowerCase(Locale.ROOT))); new UnassignedInfo(UnassignedInfo.Reason.REROUTE_CANCELLED, null), indexMetaData, allocation.changes()); allocation.removeAllocationId(shardRouting); return new RerouteExplanation(this, allocation.decision(Decision.YES, "cancel_allocation_command", "shard " + shardId + " on node " + discoNode + " can be cancelled"));
final DiscoveryNode discoNode; try { discoNode = allocation.nodes().resolveNode(node); } catch (IllegalArgumentException e) { return explainOrThrowRejectedCommand(explain, allocation, e); final RoutingNodes routingNodes = allocation.routingNodes(); RoutingNode routingNode = routingNodes.node(discoNode.getId()); if (routingNode == null) { shardRouting = allocation.routingTable().shardRoutingTable(index, shardId).primaryShard(); } catch (IndexNotFoundException | ShardNotFoundException e) { return explainOrThrowRejectedCommand(explain, allocation, e); return new RerouteExplanation(this, allocation.decision(Decision.YES, name() + " (allocation command)", "ignore deciders"));
final boolean explain = allocation.debugDecision(); final FetchResult<NodeGatewayStartedShards> shardState = fetchData(unassignedShard, allocation); if (shardState.hasData() == false) { allocation.setHasPendingAsyncFetch(); List<NodeAllocationResult> nodeDecisions = null; if (explain) { final IndexMetaData indexMetaData = allocation.metaData().getIndexSafe(unassignedShard.index()); final Set<String> inSyncAllocationIds = indexMetaData.inSyncAllocationIds(unassignedShard.id()); final boolean snapshotRestore = unassignedShard.recoverySource().getType() == RecoverySource.Type.SNAPSHOT; allocation.getIgnoreNodes(unassignedShard.shardId()), inSyncAllocationIds, shardState, logger); final boolean enoughAllocationsFound = nodeShardsResult.orderedAllocationCandidates.size() > 0; logger.debug("[{}][{}]: found {} allocation candidates of {} based on allocation ids: [{}]", unassignedShard.index(), nodeResults = buildNodeDecisions(nodesToAllocate, shardState, inSyncAllocationIds); if (allocation.hasPendingAsyncFetch()) { return AllocateUnassignedDecision.no(AllocationStatus.FETCHING_SHARD_DATA, nodeResults); } else if (node != null) {
final RoutingNodes routingNodes = allocation.routingNodes(); final boolean explain = allocation.debugDecision(); if (shardStores.hasData() == false) { logger.trace("{}: ignoring allocation, still fetching shard stores", unassignedShard); allocation.setHasPendingAsyncFetch(); List<NodeAllocationResult> nodeDecisions = null; if (explain) { return AllocateUnassignedDecision.no(UnassignedInfo.AllocationStatus.fromDecision(allocateDecision.type()), nodeDecisions); } else if (matchingNodes.getNodeWithHighestMatch() != null) { RoutingNode nodeWithHighestMatch = allocation.routingNodes().node(matchingNodes.getNodeWithHighestMatch().getId()); Decision decision = allocation.deciders().canAllocate(unassignedShard, nodeWithHighestMatch, allocation); if (decision.type() == Decision.Type.THROTTLE) { logger.debug("[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store", if (explain) { UnassignedInfo unassignedInfo = unassignedShard.unassignedInfo(); MetaData metadata = allocation.metaData(); IndexMetaData indexMetaData = metadata.index(unassignedShard.index()); totalDelayMillis = INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.get(indexMetaData.getSettings()).getMillis();
RoutingAllocation allocation) { Decision madeDecision = Decision.NO; final boolean explain = allocation.debugDecision(); Map<String, NodeAllocationResult> nodeDecisions = explain ? new HashMap<>() : null; for (ObjectCursor<DiscoveryNode> cursor : allocation.nodes().getDataNodes().values()) { RoutingNode node = allocation.routingNodes().node(cursor.value.getId()); if (node == null) { continue; Decision decision = allocation.deciders().canAllocate(shard, node, allocation); if (decision.type() == Decision.Type.YES && madeDecision.type() != Decision.Type.YES) { if (explain) {
final DiscoveryNode discoNode; try { discoNode = allocation.nodes().resolveNode(node); } catch (IllegalArgumentException e) { return explainOrThrowRejectedCommand(explain, allocation, e); final RoutingNodes routingNodes = allocation.routingNodes(); RoutingNode routingNode = routingNodes.node(discoNode.getId()); if (routingNode == null) { primaryShardRouting = allocation.routingTable().shardRoutingTable(index, shardId).primaryShard(); } catch (IndexNotFoundException | ShardNotFoundException e) { return explainOrThrowRejectedCommand(explain, allocation, e); allocation.routingTable().shardRoutingTable(index, shardId).replicaShardsWithState(ShardRoutingState.UNASSIGNED); ShardRouting shardRouting; if (replicaShardRoutings.isEmpty()) { Decision decision = allocation.deciders().canAllocate(shardRouting, routingNode, allocation); if (decision.type() == Decision.Type.NO) {
@Override protected AsyncShardFetch.FetchResult<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> fetchData(ShardRouting shard, RoutingAllocation allocation) { AsyncShardFetch<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> fetch = asyncFetchStarted.get(shard.shardId()); if (fetch == null) { fetch = new InternalAsyncFetch<>(logger, "shard_started", shard.shardId(), startedAction); asyncFetchStarted.put(shard.shardId(), fetch); } AsyncShardFetch.FetchResult<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> shardState = fetch.fetchData(allocation.nodes(), allocation.metaData(), allocation.getIgnoreNodes(shard.shardId())); if (shardState.hasData() == true) { shardState.processAllocation(allocation); } return shardState; } }
private ClusterState buildResult(ClusterState oldState, RoutingAllocation allocation) { final RoutingTable oldRoutingTable = oldState.routingTable(); final RoutingNodes newRoutingNodes = allocation.routingNodes(); final RoutingTable newRoutingTable = new RoutingTable.Builder().updateNodes(oldRoutingTable.version(), newRoutingNodes).build(); final MetaData newMetaData = allocation.updateMetaDataWithRoutingChanges(newRoutingTable); assert newRoutingTable.validate(newMetaData); // validates the routing table is coherent with the cluster state metadata final ClusterState.Builder newStateBuilder = ClusterState.builder(oldState) .routingTable(newRoutingTable) .metaData(newMetaData); final RestoreInProgress restoreInProgress = allocation.custom(RestoreInProgress.TYPE); if (restoreInProgress != null) { RestoreInProgress updatedRestoreInProgress = allocation.updateRestoreInfoWithRoutingChanges(restoreInProgress); if (updatedRestoreInProgress != restoreInProgress) { ImmutableOpenMap.Builder<String, ClusterState.Custom> customsBuilder = ImmutableOpenMap.builder(allocation.getCustoms()); customsBuilder.put(RestoreInProgress.TYPE, updatedRestoreInProgress); newStateBuilder.customs(customsBuilder.build()); } } return newStateBuilder.build(); }
@Override public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { if (allocation.shouldIgnoreShardForNode(shardRouting.shardId(), node.nodeId())) { return Decision.NO; } Decision.Multi ret = new Decision.Multi(); for (AllocationDecider allocationDecider : allocations) { Decision decision = allocationDecider.canAllocate(shardRouting, node, allocation); // short track if a NO is returned. if (decision == Decision.NO) { if (logger.isTraceEnabled()) { logger.trace("Can not allocate [{}] on node [{}] due to [{}]", shardRouting, node.node(), allocationDecider.getClass().getSimpleName()); } // short circuit only if debugging is not enabled if (!allocation.debugDecision()) { return decision; } else { ret.add(decision); } } else if (decision != Decision.ALWAYS && (allocation.getDebugMode() != EXCLUDE_YES_DECISIONS || decision.type() != Decision.Type.YES)) { // the assumption is that a decider that returns the static instance Decision#ALWAYS // does not really implements canAllocate ret.add(decision); } } return ret; }
/** * Allocate unassigned shards to nodes (if any) where valid copies of the shard already exist. * It is up to the individual implementations of {@link #makeAllocationDecision(ShardRouting, RoutingAllocation, Logger)} * to make decisions on assigning shards to nodes. * * @param allocation the allocation state container object */ public void allocateUnassigned(RoutingAllocation allocation) { final RoutingNodes routingNodes = allocation.routingNodes(); final RoutingNodes.UnassignedShards.UnassignedIterator unassignedIterator = routingNodes.unassigned().iterator(); while (unassignedIterator.hasNext()) { final ShardRouting shard = unassignedIterator.next(); final AllocateUnassignedDecision allocateUnassignedDecision = makeAllocationDecision(shard, allocation, logger); if (allocateUnassignedDecision.isDecisionTaken() == false) { // no decision was taken by this allocator continue; } if (allocateUnassignedDecision.getAllocationDecision() == AllocationDecision.YES) { unassignedIterator.initialize(allocateUnassignedDecision.getTargetNode().getId(), allocateUnassignedDecision.getAllocationId(), shard.primary() ? ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE : allocation.clusterInfo().getShardSize(shard, ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE), allocation.changes()); } else { unassignedIterator.removeAndIgnore(allocateUnassignedDecision.getAllocationStatus(), allocation.changes()); } } }
private void reroute(RoutingAllocation allocation) { assert hasDeadNodes(allocation) == false : "dead nodes should be explicitly cleaned up. See deassociateDeadNodes"; assert AutoExpandReplicas.getAutoExpandReplicaChanges(allocation.metaData(), allocation.nodes()).isEmpty() : "auto-expand replicas out of sync with number of nodes in the cluster"; // now allocate all the unassigned to available nodes if (allocation.routingNodes().unassigned().size() > 0) { removeDelayMarkers(allocation); gatewayAllocator.allocateUnassigned(allocation); } shardsAllocator.allocate(allocation); assert RoutingNodes.assertShardStats(allocation.routingNodes()); }
public CommandsResult reroute(final ClusterState clusterState, AllocationCommands commands, boolean explain, boolean retryFailed) { RoutingNodes routingNodes = getMutableRoutingNodes(clusterState); // we don't shuffle the unassigned shards here, to try and get as close as possible to // a consistent result of the effect the commands have on the routing // this allows systems to dry run the commands, see the resulting cluster state, and act on it RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, routingNodes, clusterState, clusterInfoService.getClusterInfo(), currentNanoTime()); // don't short circuit deciders, we want a full explanation allocation.debugDecision(true); // we ignore disable allocation, because commands are explicit allocation.ignoreDisable(true); RoutingExplanations explanations = commands.execute(allocation, explain); // we revert the ignore disable flag, since when rerouting, we want the original setting to take place allocation.ignoreDisable(false); // the assumption is that commands will move / act on shards (or fail through exceptions) // so, there will always be shard "movements", so no need to check on reroute if (retryFailed) { resetFailedAllocationCounter(allocation); } reroute(allocation); return new CommandsResult(explanations, buildResultAndLogHealthChange(clusterState, allocation, "reroute commands")); }