/** * Applies the started shards. Note, only initializing ShardRouting instances that exist in the routing table should be * provided as parameter and no duplicates should be contained. * <p> * If the same instance of the {@link ClusterState} is returned, then no change has been made.</p> */ public ClusterState applyStartedShards(ClusterState clusterState, List<ShardRouting> startedShards) { if (startedShards.isEmpty()) { return clusterState; } RoutingNodes routingNodes = getMutableRoutingNodes(clusterState); // shuffle the unassigned nodes, just so we won't have things like poison failed shards routingNodes.unassigned().shuffle(); RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, routingNodes, clusterState, clusterInfoService.getClusterInfo(), currentNanoTime()); // as starting a primary relocation target can reinitialize replica shards, start replicas first startedShards = new ArrayList<>(startedShards); Collections.sort(startedShards, Comparator.comparing(ShardRouting::primary)); applyStartedShards(allocation, startedShards); gatewayAllocator.applyStartedShards(allocation, startedShards); reroute(allocation); String startedShardsAsString = firstListElementsToCommaDelimitedString(startedShards, s -> s.shardId().toString()); return buildResultAndLogHealthChange(clusterState, allocation, "shards started [" + startedShardsAsString + "] ..."); }
ClusterState applyFailedShards(ClusterState currentState, List<FailedShard> failedShards, List<StaleShard> staleShards) { return allocationService.applyFailedShards(currentState, failedShards, staleShards); }
/** * unassigned an shards that are associated with nodes that are no longer part of the cluster, potentially promoting replicas * if needed. */ public ClusterState deassociateDeadNodes(ClusterState clusterState, boolean reroute, String reason) { RoutingNodes routingNodes = getMutableRoutingNodes(clusterState); // shuffle the unassigned nodes, just so we won't have things like poison failed shards routingNodes.unassigned().shuffle(); RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, routingNodes, clusterState, clusterInfoService.getClusterInfo(), currentNanoTime()); // first, clear from the shards any node id they used to belong to that is now dead deassociateDeadNodes(allocation); if (allocation.routingNodesChanged()) { clusterState = buildResult(clusterState, allocation); } if (reroute) { return reroute(clusterState, reason); } else { return clusterState; } }
protected ClusterState buildResultAndLogHealthChange(ClusterState oldState, RoutingAllocation allocation, String reason) { ClusterState newState = buildResult(oldState, allocation); logClusterHealthStateChange( new ClusterStateHealth(oldState), new ClusterStateHealth(newState), reason ); return newState; }
/** * Reroutes the routing table based on the live nodes. * <p> * If the same instance of ClusterState is returned, then no change has been made. */ protected ClusterState reroute(ClusterState clusterState, String reason, boolean debug) { ClusterState fixedClusterState = adaptAutoExpandReplicas(clusterState); RoutingNodes routingNodes = getMutableRoutingNodes(fixedClusterState); // shuffle the unassigned nodes, just so we won't have things like poison failed shards routingNodes.unassigned().shuffle(); RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, routingNodes, fixedClusterState, clusterInfoService.getClusterInfo(), currentNanoTime()); allocation.debugDecision(debug); reroute(allocation); if (fixedClusterState == clusterState && allocation.routingNodesChanged() == false) { return clusterState; } return buildResultAndLogHealthChange(clusterState, allocation, reason); }
public RoutingAllocation.Result reroute(ClusterState clusterState, AllocationCommands commands, boolean explain) { RoutingNodes routingNodes = getMutableRoutingNodes(clusterState); // we don't shuffle the unassigned shards here, to try and get as close as possible to // a consistent result of the effect the commands have on the routing // this allows systems to dry run the commands, see the resulting cluster state, and act on it RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, routingNodes, clusterState.nodes(), clusterInfoService.getClusterInfo(), currentNanoTime()); // don't short circuit deciders, we want a full explanation allocation.debugDecision(true); // we ignore disable allocation, because commands are explicit allocation.ignoreDisable(true); RoutingExplanations explanations = commands.execute(allocation, explain); // we revert the ignore disable flag, since when rerouting, we want the original setting to take place allocation.ignoreDisable(false); // the assumption is that commands will move / act on shards (or fail through exceptions) // so, there will always be shard "movements", so no need to check on reroute reroute(allocation); RoutingTable routingTable = new RoutingTable.Builder().updateNodes(routingNodes).build().validateRaiseException(clusterState.metaData()); RoutingAllocation.Result result = new RoutingAllocation.Result(true, routingTable, explanations); logClusterHealthStateChange( new ClusterStateHealth(clusterState), new ClusterStateHealth(clusterState.getMetaData(), routingTable), "reroute commands" ); return result; }
/** * Reroutes the routing table based on the live nodes. * <p> * If the same instance of ClusterState is returned, then no change has been made. */ public ClusterState reroute(ClusterState clusterState, String reason) { return reroute(clusterState, reason, false); }
public RoutingAllocation.Result applyStartedShards(ClusterState clusterState, List<? extends ShardRouting> startedShards, boolean withReroute) { RoutingNodes routingNodes = getMutableRoutingNodes(clusterState); // shuffle the unassigned nodes, just so we won't have things like poison failed shards routingNodes.unassigned().shuffle(); StartedRerouteAllocation allocation = new StartedRerouteAllocation(allocationDeciders, routingNodes, clusterState.nodes(), startedShards, clusterInfoService.getClusterInfo()); boolean changed = applyStartedShards(routingNodes, startedShards); if (!changed) { return new RoutingAllocation.Result(false, clusterState.routingTable()); } shardsAllocators.applyStartedShards(allocation); if (withReroute) { reroute(allocation); } RoutingTable routingTable = new RoutingTable.Builder().updateNodes(routingNodes).build().validateRaiseException(clusterState.metaData()); RoutingAllocation.Result result = new RoutingAllocation.Result(true, routingTable); String startedShardsAsString = firstListElementsToCommaDelimitedString(startedShards, new Function<ShardRouting, String>() { @Override public String apply(ShardRouting s) { return s.shardId().toString(); } }); logClusterHealthStateChange( new ClusterStateHealth(clusterState), new ClusterStateHealth(clusterState.metaData(), routingTable), "shards started [" + startedShardsAsString + "] ..." ); return result; }
RoutingNodes routingNodes = getMutableRoutingNodes(clusterState); boolean changed = false; for (FailedRerouteAllocation.FailedShard failedShard : failedShards) { changed |= applyFailedShard(allocation, failedShard.shard, true, new UnassignedInfo(UnassignedInfo.Reason.ALLOCATION_FAILED, failedShard.message, failedShard.failure, System.nanoTime(), System.currentTimeMillis())); reroute(allocation); RoutingTable routingTable = new RoutingTable.Builder().updateNodes(routingNodes).build().validateRaiseException(clusterState.metaData()); RoutingAllocation.Result result = new RoutingAllocation.Result(true, routingTable); String failedShardsAsString = firstListElementsToCommaDelimitedString(failedShards, new Function<FailedRerouteAllocation.FailedShard, String>() { @Override public String apply(FailedRerouteAllocation.FailedShard s) { logClusterHealthStateChange( new ClusterStateHealth(clusterState), new ClusterStateHealth(clusterState.getMetaData(), routingTable),
maybeUpdatedState = allocationService.applyStartedShards(currentState, shardRoutingsToBeApplied); builder.successes(tasksToBeApplied); } catch (Exception e) {
private void reroute(RoutingAllocation allocation) { assert hasDeadNodes(allocation) == false : "dead nodes should be explicitly cleaned up. See deassociateDeadNodes"; assert AutoExpandReplicas.getAutoExpandReplicaChanges(allocation.metaData(), allocation.nodes()).isEmpty() : "auto-expand replicas out of sync with number of nodes in the cluster"; // now allocate all the unassigned to available nodes if (allocation.routingNodes().unassigned().size() > 0) { removeDelayMarkers(allocation); gatewayAllocator.allocateUnassigned(allocation); } shardsAllocator.allocate(allocation); assert RoutingNodes.assertShardStats(allocation.routingNodes()); }
@Override public ClusterTasksResult<Task> execute(final ClusterState currentState, final List<Task> tasks) throws Exception { final DiscoveryNodes.Builder remainingNodesBuilder = DiscoveryNodes.builder(currentState.nodes()); boolean removed = false; for (final Task task : tasks) { if (currentState.nodes().nodeExists(task.node())) { remainingNodesBuilder.remove(task.node()); removed = true; } else { logger.debug("node [{}] does not exist in cluster state, ignoring", task); } } if (!removed) { // no nodes to remove, keep the current cluster state return ClusterTasksResult.<Task>builder().successes(tasks).build(currentState); } final ClusterState remainingNodesClusterState = remainingNodesClusterState(currentState, remainingNodesBuilder); final ClusterTasksResult.Builder<Task> resultBuilder = ClusterTasksResult.<Task>builder().successes(tasks); if (electMasterService.hasEnoughMasterNodes(remainingNodesClusterState.nodes()) == false) { final int masterNodes = electMasterService.countMasterNodes(remainingNodesClusterState.nodes()); rejoin.accept(LoggerMessageFormat.format("not enough master nodes (has [{}], but needed [{}])", masterNodes, electMasterService.minimumMasterNodes())); return resultBuilder.build(currentState); } else { return resultBuilder.build(allocationService.deassociateDeadNodes(remainingNodesClusterState, true, describeTasks(tasks))); } }
private boolean reroute(RoutingAllocation allocation) { boolean changed = false; // first, clear from the shards any node id they used to belong to that is now dead changed |= deassociateDeadNodes(allocation); // create a sorted list of from nodes with least number of shards to the maximum ones applyNewNodes(allocation); // elect primaries *before* allocating unassigned, so backups of primaries that failed // will be moved to primary state and not wait for primaries to be allocated and recovered (*from gateway*) changed |= electPrimariesAndUnassignedDanglingReplicas(allocation); // now allocate all the unassigned to available nodes if (allocation.routingNodes().unassigned().size() > 0) { updateLeftDelayOfUnassignedShards(allocation, settings); changed |= shardsAllocators.allocateUnassigned(allocation); } // move shards that no longer can be allocated changed |= shardsAllocators.moveShards(allocation); // rebalance changed |= shardsAllocators.rebalance(allocation); assert RoutingNodes.assertShardStats(allocation.routingNodes()); return changed; }
@Override protected long currentNanoTime() { return nanoTimeOverride == -1L ? super.currentNanoTime() : nanoTimeOverride; } }
protected ClusterState buildResultAndLogHealthChange(ClusterState oldState, RoutingAllocation allocation, String reason) { RoutingTable oldRoutingTable = oldState.routingTable(); RoutingNodes newRoutingNodes = allocation.routingNodes(); final RoutingTable newRoutingTable = new RoutingTable.Builder().updateNodes(oldRoutingTable.version(), newRoutingNodes).build(); MetaData newMetaData = allocation.updateMetaDataWithRoutingChanges(newRoutingTable); assert newRoutingTable.validate(newMetaData); // validates the routing table is coherent with the cluster state metadata final ClusterState.Builder newStateBuilder = ClusterState.builder(oldState) .routingTable(newRoutingTable) .metaData(newMetaData); final RestoreInProgress restoreInProgress = allocation.custom(RestoreInProgress.TYPE); if (restoreInProgress != null) { RestoreInProgress updatedRestoreInProgress = allocation.updateRestoreInfoWithRoutingChanges(restoreInProgress); if (updatedRestoreInProgress != restoreInProgress) { ImmutableOpenMap.Builder<String, ClusterState.Custom> customsBuilder = ImmutableOpenMap.builder(allocation.getCustoms()); customsBuilder.put(RestoreInProgress.TYPE, updatedRestoreInProgress); newStateBuilder.customs(customsBuilder.build()); } } final ClusterState newState = newStateBuilder.build(); logClusterHealthStateChange( new ClusterStateHealth(oldState), new ClusterStateHealth(newState), reason ); return newState; }
/** * Reroutes the routing table based on the live nodes. * <p> * If the same instance of the routing table is returned, then no change has been made. */ protected RoutingAllocation.Result reroute(ClusterState clusterState, String reason, boolean debug) { RoutingNodes routingNodes = getMutableRoutingNodes(clusterState); // shuffle the unassigned nodes, just so we won't have things like poison failed shards routingNodes.unassigned().shuffle(); RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, routingNodes, clusterState.nodes(), clusterInfoService.getClusterInfo(), currentNanoTime()); allocation.debugDecision(debug); if (!reroute(allocation)) { return new RoutingAllocation.Result(false, clusterState.routingTable()); } RoutingTable routingTable = new RoutingTable.Builder().updateNodes(routingNodes).build().validateRaiseException(clusterState.metaData()); RoutingAllocation.Result result = new RoutingAllocation.Result(true, routingTable); logClusterHealthStateChange( new ClusterStateHealth(clusterState), new ClusterStateHealth(clusterState.getMetaData(), routingTable), reason ); return result; }
@Override public ClusterState execute(ClusterState currentState) throws Exception { removeIfSameTask(this); return allocationService.reroute(currentState, "assign delayed unassigned shards"); }
/** * Applies the started shards. Note, shards can be called several times within this method. * <p> * If the same instance of the routing table is returned, then no change has been made.</p> */ public RoutingAllocation.Result applyStartedShards(ClusterState clusterState, List<? extends ShardRouting> startedShards) { return applyStartedShards(clusterState, startedShards, true); }
private void reroute(RoutingAllocation allocation) { assert hasDeadNodes(allocation) == false : "dead nodes should be explicitly cleaned up. See deassociateDeadNodes"; // now allocate all the unassigned to available nodes if (allocation.routingNodes().unassigned().size() > 0) { removeDelayMarkers(allocation); gatewayAllocator.allocateUnassigned(allocation); } shardsAllocator.allocate(allocation); assert RoutingNodes.assertShardStats(allocation.routingNodes()); }
private ClusterState.Builder becomeMasterAndTrimConflictingNodes(ClusterState currentState, List<DiscoveryNode> joiningNodes) { assert currentState.nodes().getMasterNodeId() == null : currentState; DiscoveryNodes currentNodes = currentState.nodes(); DiscoveryNodes.Builder nodesBuilder = DiscoveryNodes.builder(currentNodes); nodesBuilder.masterNodeId(currentState.nodes().getLocalNodeId()); for (final DiscoveryNode joiningNode : joiningNodes) { final DiscoveryNode nodeWithSameId = nodesBuilder.get(joiningNode.getId()); if (nodeWithSameId != null && nodeWithSameId.equals(joiningNode) == false) { logger.debug("removing existing node [{}], which conflicts with incoming join from [{}]", nodeWithSameId, joiningNode); nodesBuilder.remove(nodeWithSameId.getId()); } final DiscoveryNode nodeWithSameAddress = currentNodes.findByAddress(joiningNode.getAddress()); if (nodeWithSameAddress != null && nodeWithSameAddress.equals(joiningNode) == false) { logger.debug("removing existing node [{}], which conflicts with incoming join from [{}]", nodeWithSameAddress, joiningNode); nodesBuilder.remove(nodeWithSameAddress.getId()); } } // now trim any left over dead nodes - either left there when the previous master stepped down // or removed by us above ClusterState tmpState = ClusterState.builder(currentState).nodes(nodesBuilder).blocks(ClusterBlocks.builder() .blocks(currentState.blocks()) .removeGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID)).build(); return ClusterState.builder(allocationService.deassociateDeadNodes(tmpState, false, "removed dead nodes on election")); }