/**
 * Selects the tasks from {@code otherActiveTasks} that should be shut down.
 *
 * <p>The number selected is the amount by which {@code otherActiveTasks.size()} exceeds
 * {@code request.getInstancesSafe() - deployProgress.getTargetActiveInstances()}, floored at zero.
 * Candidates are ordered by {@code SingularityTaskId.INSTANCE_NO_COMPARATOR} and taken from the front.
 *
 * @param deployProgress progress whose target active instance count is subtracted from the request's instances
 * @param otherActiveTasks candidate tasks; copied, never modified
 * @param request request supplying the overall instance count
 * @return the (empty) sorted copy when there are no candidates, otherwise a {@code subList} view of the
 *         sorted copy holding the leading tasks to shut down
 */
private List<SingularityTaskId> tasksToShutDown(SingularityDeployProgress deployProgress, Collection<SingularityTaskId> otherActiveTasks, SingularityRequest request) {
  final int allowedToRemain = request.getInstancesSafe() - deployProgress.getTargetActiveInstances();
  final int excess = Math.max(otherActiveTasks.size() - allowedToRemain, 0);

  final List<SingularityTaskId> candidates = new ArrayList<>(otherActiveTasks);
  candidates.sort(SingularityTaskId.INSTANCE_NO_COMPARATOR);

  if (candidates.isEmpty()) {
    return candidates;
  }

  final int howMany = Math.min(excess, candidates.size());
  return candidates.subList(0, howMany);
}
/**
 * Computes the next target active instance count for a deploy.
 *
 * <p>When {@code updateRequest} is present its target is used; otherwise the current target is advanced by
 * {@code deployProgress.getDeployInstanceCountPerStep()}. Either way the result is capped at
 * {@code request.getInstancesSafe()}.
 *
 * @param deployProgress current deploy progress
 * @param request request supplying the upper bound
 * @param updateRequest optional explicit target
 * @return the capped target instance count
 */
private int getNewTargetInstances(SingularityDeployProgress deployProgress, SingularityRequest request, Optional<SingularityUpdatePendingDeployRequest> updateRequest) {
  final int uncappedTarget;
  if (updateRequest.isPresent()) {
    uncappedTarget = updateRequest.get().getTargetActiveInstances();
  } else {
    uncappedTarget = deployProgress.getTargetActiveInstances() + deployProgress.getDeployInstanceCountPerStep();
  }
  return Math.min(uncappedTarget, request.getInstancesSafe());
}
/**
 * Decides whether a task being cleaned up for an incremental deploy may be killed now.
 *
 * <p>The task is killed once the count returned by {@code getNumHealthyTasks} reaches
 * {@code request.getInstancesSafe()}. On a kill, one occurrence of {@code key} is removed from
 * {@code incrementalCleaningTasks}.
 *
 * <p>Note: the trace message reports {@code matchingTasks.size()}, not the healthy count used in the check.
 *
 * @param request request supplying the required instance count
 * @param taskCleanup cleanup under consideration (used for logging only)
 * @param matchingTasksDeployId deploy id passed through to {@code getNumHealthyTasks}
 * @param matchingTasks tasks passed through to {@code getNumHealthyTasks}
 * @param key deploy key counted in {@code incrementalCleaningTasks}
 * @param incrementalCleaningTasks multiset of in-progress incremental cleanups; mutated on a kill
 * @return {@code true} if the task should be killed
 */
private boolean shouldKillIncrementalDeployCleanupTask(SingularityRequest request, SingularityTaskCleanup taskCleanup, String matchingTasksDeployId, List<SingularityTaskId> matchingTasks, SingularityDeployKey key, Multiset<SingularityDeployKey> incrementalCleaningTasks) {
  final int healthyCount = getNumHealthyTasks(request, matchingTasksDeployId, matchingTasks);

  if (healthyCount >= request.getInstancesSafe()) {
    LOG.debug("Killing a task {}, {} active deploy tasks are healthy", taskCleanup, healthyCount);
    incrementalCleaningTasks.remove(key);
    return true;
  }

  LOG.trace("Not killing a task {} yet, only {} matching out of a required {}", taskCleanup, matchingTasks.size(), request.getInstancesSafe() - incrementalCleaningTasks.count(key));
  return false;
}
/**
 * Collects the failures to report for a deploy.
 *
 * <p>Starts from {@code deployHealthHelper.getTaskFailures(deploy, matchingTasks)}. When the deploy is
 * {@code OVERDUE}, no task failures were found, and fewer tasks exist than the target instance count,
 * a {@code TASK_COULD_NOT_BE_SCHEDULED} failure is added. The target is the deploy progress' target
 * active instances when progress is present, otherwise {@code request.getInstancesSafe()}.
 *
 * @param request request supplying the fallback instance count
 * @param deploy deploy passed through to the health helper
 * @param pendingDeploy pending deploy whose progress may define the target
 * @param state current deploy state
 * @param matchingTasks tasks belonging to the deploy
 * @return a new mutable list of failures, possibly empty
 */
private List<SingularityDeployFailure> getDeployFailures(SingularityRequest request, Optional<SingularityDeploy> deploy, SingularityPendingDeploy pendingDeploy, DeployState state, Collection<SingularityTaskId> matchingTasks) {
  final List<SingularityDeployFailure> failures = new ArrayList<>(deployHealthHelper.getTaskFailures(deploy, matchingTasks));

  if (state != DeployState.OVERDUE) {
    return failures;
  }

  final int targetInstances;
  if (pendingDeploy.getDeployProgress().isPresent()) {
    targetInstances = pendingDeploy.getDeployProgress().get().getTargetActiveInstances();
  } else {
    targetInstances = request.getInstancesSafe();
  }

  if (failures.isEmpty() && matchingTasks.size() < targetInstances) {
    final String explanation = String.format("Only %s of %s tasks could be launched for deploy, there may not be enough resources to launch the remaining tasks", matchingTasks.size(), targetInstances);
    failures.add(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_COULD_NOT_BE_SCHEDULED, Optional.<SingularityTaskId>absent(), Optional.of(explanation)));
  }

  return failures;
}
}
/**
 * Decides whether a task being cleaned up for an incremental bounce may be killed now.
 *
 * <p>The task is killed only when the healthy count from {@code getNumHealthyTasks} plus the number of
 * occurrences of {@code key} in {@code incrementalCleaningTasks} is strictly greater than
 * {@code request.getInstancesSafe()}. On a kill, one occurrence of {@code key} is removed from the multiset.
 *
 * @param request request supplying the required instance count
 * @param taskCleanup cleanup under consideration (used for logging only)
 * @param matchingTasksDeployId deploy id passed through to {@code getNumHealthyTasks}
 * @param matchingTasks tasks passed through to {@code getNumHealthyTasks}
 * @param key deploy key counted in {@code incrementalCleaningTasks}
 * @param incrementalCleaningTasks multiset of in-progress incremental cleanups; mutated on a kill
 * @return {@code true} if the task should be killed
 */
private boolean shouldKillIncrementalBounceTask(SingularityRequest request, SingularityTaskCleanup taskCleanup, String matchingTasksDeployId, List<SingularityTaskId> matchingTasks, SingularityDeployKey key, Multiset<SingularityDeployKey> incrementalCleaningTasks) {
  final int healthyCount = getNumHealthyTasks(request, matchingTasksDeployId, matchingTasks);

  if (healthyCount + incrementalCleaningTasks.count(key) > request.getInstancesSafe()) {
    LOG.debug("Killing a task {}, {} replacement tasks are healthy", taskCleanup, healthyCount);
    incrementalCleaningTasks.remove(key);
    return true;
  }

  LOG.trace("Not killing a task {} yet, only {} matching out of a required {}", taskCleanup, matchingTasks.size(), request.getInstancesSafe() - incrementalCleaningTasks.count(key));
  return false;
}
/**
 * Sends the {@code SCALED} mail for a request.
 *
 * <p>The template properties carry {@code newInstances} (the request's current instance count) and
 * {@code oldInstances} ({@code formerInstances}, or 1 when absent). When a scale request is present,
 * {@code setupExpireFormat} is applied with its duration and its message is forwarded with the mail.
 *
 * @param request the scaled request
 * @param newScaleRequest optional scale request supplying duration and message
 * @param formerInstances optional previous instance count
 * @param user optional user forwarded to {@code sendRequestMail}
 */
@Override
public void sendRequestScaledMail(SingularityRequest request, Optional<SingularityScaleRequest> newScaleRequest, Optional<Integer> formerInstances, Optional<String> user) {
  final Map<String, Object> templateProperties = new HashMap<>();

  if (newScaleRequest.isPresent()) {
    setupExpireFormat(templateProperties, newScaleRequest.get().getDurationMillis());
  }

  templateProperties.put("newInstances", request.getInstancesSafe());
  templateProperties.put("oldInstances", formerInstances.or(1));

  final Optional<String> message = newScaleRequest.isPresent() ? newScaleRequest.get().getMessage() : Optional.<String>absent();

  sendRequestMail(request, RequestMailType.SCALED, user, message, Optional.of(templateProperties));
}
/**
 * Reports whether a request update requires rescheduling.
 *
 * @param newRequest the updated request
 * @param oldRequest the previous request
 * @return {@code true} when the instance counts differ, or when both requests are scheduled and their
 *         quartz schedules are not equal
 */
private boolean shouldReschedule(SingularityRequest newRequest, SingularityRequest oldRequest) {
  final boolean instanceCountChanged = newRequest.getInstancesSafe() != oldRequest.getInstancesSafe();
  if (instanceCountChanged) {
    return true;
  }

  final boolean bothScheduled = newRequest.isScheduled() && oldRequest.isScheduled();
  return bothScheduled && !newRequest.getQuartzScheduleSafe().equals(oldRequest.getQuartzScheduleSafe());
}
/**
 * Validates that enough active slaves exist to bounce {@code request}.
 *
 * <p>The check applies when bouncing to the same host is allowed and placement is
 * {@code SEPARATE_BY_REQUEST}, or when it is not allowed and placement is neither {@code GREEDY} nor
 * {@code OPTIMISTIC}. An incremental bounce requires {@code instances + 1} active slaves; a full bounce
 * requires {@code instances * 2}.
 *
 * @param request request about to be bounced
 * @param isIncremental whether the bounce is incremental
 */
public void checkResourcesForBounce(SingularityRequest request, boolean isIncremental) {
  SlavePlacement placement = request.getSlavePlacement().or(defaultSlavePlacement);

  if ((isAllowBounceToSameHost(request) && placement == SlavePlacement.SEPARATE_BY_REQUEST)
    || (!isAllowBounceToSameHost(request) && placement != SlavePlacement.GREEDY && placement != SlavePlacement.OPTIMISTIC)) {
    int currentActiveSlaveCount = slaveManager.getNumObjectsAtState(MachineState.ACTIVE);
    int requiredSlaveCount = isIncremental ? request.getInstancesSafe() + 1 : request.getInstancesSafe() * 2;

    // The message describes a bounce (this method's purpose); previously it wrongly described a scale.
    checkBadRequest(currentActiveSlaveCount >= requiredSlaveCount,
      "Not enough active slaves to successfully complete a bounce of request %s (minimum required: %s, current: %s). Consider deploying, or changing the slave placement strategy instead.",
      request.getId(), requiredSlaveCount, currentActiveSlaveCount);
  }
}
/**
 * Creates {@code REBALANCE_RACKS} cleanups for tasks that sit on over-populated racks.
 *
 * <p>The per-rack share is {@code request.getInstancesSafe()} divided by the number of active racks.
 * Tasks are visited in list order while a running per-rack count is kept; a task is cleaned when its
 * rack's running count exceeds the share and fewer than {@code numActiveRacks / 2} (integer division)
 * tasks have been cleaned so far.
 *
 * @param request request supplying the instance count
 * @param remainingActiveTasks tasks to examine, in order
 * @param user optional user recorded on each cleanup
 * @return the tasks for which a cleanup was created
 */
public List<SingularityTaskId> rebalanceRacks(SingularityRequest request, List<SingularityTaskId> remainingActiveTasks, Optional<String> user) {
  final List<SingularityTaskId> extraCleanedTasks = new ArrayList<>();
  final int numActiveRacks = rackManager.getNumActive();
  final double perRack = request.getInstancesSafe() / (double) numActiveRacks;
  final int maxCleanups = numActiveRacks / 2;
  final Multiset<String> countPerRack = HashMultiset.create();

  for (SingularityTaskId taskId : remainingActiveTasks) {
    countPerRack.add(taskId.getRackId());
    LOG.info("{} - {} - {} - {}", countPerRack, perRack, extraCleanedTasks.size(), taskId);

    final boolean rackOverShare = countPerRack.count(taskId.getRackId()) > perRack;
    if (!rackOverShare || extraCleanedTasks.size() >= maxCleanups) {
      continue;
    }

    extraCleanedTasks.add(taskId);
    LOG.info("Cleaning up task {} to evenly distribute tasks among racks", taskId);
    taskManager.createTaskCleanup(new SingularityTaskCleanup(user, TaskCleanupType.REBALANCE_RACKS, System.currentTimeMillis(), taskId, Optional.absent(), Optional.absent(), Optional.absent()));
  }

  return extraCleanedTasks;
}
/**
 * Reports whether the final step of a deploy has finished.
 *
 * @param deployProgress progress to inspect
 * @param request request supplying the total instance count
 * @return {@code true} when the current step is complete and the target active instance count has
 *         reached (or exceeds) {@code request.getInstancesSafe()}
 */
private boolean isLastStepFinished(SingularityDeployProgress deployProgress, SingularityRequest request) {
  if (!deployProgress.isStepComplete()) {
    return false;
  }
  return deployProgress.getTargetActiveInstances() >= request.getInstancesSafe();
}
/**
 * Validates that enough active slaves exist to run {@code request} at its configured instance count.
 *
 * <p>The check is skipped for {@code GREEDY} and {@code OPTIMISTIC} placement. Otherwise the required
 * slave count is {@code request.getInstancesSafe()}, plus {@code previousScale} when it is present and
 * placement is {@code SEPARATE_BY_REQUEST}.
 *
 * @param request request being scaled (already carrying the new instance count)
 * @param previousScale optional instance count before the scale
 */
public void checkScale(SingularityRequest request, Optional<Integer> previousScale) {
  SlavePlacement placement = request.getSlavePlacement().or(defaultSlavePlacement);

  if (placement != SlavePlacement.GREEDY && placement != SlavePlacement.OPTIMISTIC) {
    int currentActiveSlaveCount = slaveManager.getNumObjectsAtState(MachineState.ACTIVE);
    int requiredSlaveCount = request.getInstancesSafe();

    if (previousScale.isPresent() && placement == SlavePlacement.SEPARATE_BY_REQUEST) {
      requiredSlaveCount += previousScale.get();
    }

    // The message describes a scale (this method's purpose); previously it wrongly described a bounce.
    checkBadRequest(currentActiveSlaveCount >= requiredSlaveCount,
      "Not enough active slaves to successfully scale request %s to %s instances (minimum required: %s, current: %s).",
      request.getId(), request.getInstancesSafe(), requiredSlaveCount, currentActiveSlaveCount);
  }
}
/**
 * Reports whether enough instances of a request have failed repeatedly inside the cooldown window.
 *
 * <p>For each instance number from 1 to {@code request.getInstancesSafe()}, the sequential failure
 * timestamps recorded in {@code deployStatistics} that satisfy {@code hasFailedInsideCooldown} are
 * counted. {@code recentFailureTimestamp} is counted as well for the instance matching
 * {@code instanceNo}, when both are present. An instance counts as failed when its total reaches
 * {@code configuration.getCooldownAfterFailures()}.
 *
 * @param request request supplying the instance count
 * @param deployStatistics statistics holding per-instance failure timestamps
 * @param instanceNo optional instance the extra failure belongs to
 * @param recentFailureTimestamp optional extra failure timestamp
 * @return {@code true} when the number of failed instances is at least
 *         {@code ceil(instances * configuration.getCooldownAfterPctOfInstancesFail())}
 */
private boolean hasFailedTooManyTimes(SingularityRequest request, SingularityDeployStatistics deployStatistics, Optional<Integer> instanceNo, Optional<Long> recentFailureTimestamp) {
  final long now = System.currentTimeMillis();
  final int failedInstanceThreshold = (int) Math.ceil(request.getInstancesSafe() * configuration.getCooldownAfterPctOfInstancesFail());

  int failedInstances = 0;

  for (int instance = 1; instance < request.getInstancesSafe() + 1; instance++) {
    int failuresInCooldown = 0;

    for (long recordedFailure : deployStatistics.getInstanceSequentialFailureTimestamps().get(instance)) {
      if (hasFailedInsideCooldown(now, recordedFailure)) {
        failuresInCooldown++;
      }
    }

    final boolean hasExtraFailure = instanceNo.isPresent() && instanceNo.get() == instance && recentFailureTimestamp.isPresent();
    if (hasExtraFailure && hasFailedInsideCooldown(now, recentFailureTimestamp.get())) {
      failuresInCooldown++;
    }

    if (failuresInCooldown >= configuration.getCooldownAfterFailures()) {
      failedInstances++;
    }
  }

  return failedInstances >= failedInstanceThreshold;
}
/**
 * Computes how many instances are expected for the deploy named by {@code pendingRequest}.
 *
 * <p>Without a pending deploy, with a {@code CANCELED} pending deploy, or without deploy progress, the
 * answer is {@code request.getInstancesSafe()}. Otherwise:
 * <ul>
 *   <li>if the pending request targets the pending deploy's id, the progress' target active instances;</li>
 *   <li>else, if the current step is complete, the request's instances minus that target (at least 0);</li>
 *   <li>else, the request's instances minus the target reduced by one step (that reduced target being
 *       at least 0).</li>
 * </ul>
 *
 * @param request request supplying the total instance count
 * @param pendingRequest pending request whose deploy id is compared with the pending deploy
 * @param maybePendingDeploy optional pending deploy
 * @return the expected instance count
 */
private int numInstancesExpected(SingularityRequest request, SingularityPendingRequest pendingRequest, Optional<SingularityPendingDeploy> maybePendingDeploy) {
  final boolean noUsableProgress = !maybePendingDeploy.isPresent()
    || (maybePendingDeploy.get().getCurrentDeployState() == DeployState.CANCELED)
    || !maybePendingDeploy.get().getDeployProgress().isPresent();

  if (noUsableProgress) {
    return request.getInstancesSafe();
  }

  final SingularityDeployProgress deployProgress = maybePendingDeploy.get().getDeployProgress().get();
  final boolean isForPendingDeploy = maybePendingDeploy.get().getDeployMarker().getDeployId().equals(pendingRequest.getDeployId());

  if (isForPendingDeploy) {
    return deployProgress.getTargetActiveInstances();
  }

  if (deployProgress.isStepComplete()) {
    return Math.max(request.getInstancesSafe() - deployProgress.getTargetActiveInstances(), 0);
  }

  final int previousStepTarget = Math.max(deployProgress.getTargetActiveInstances() - deployProgress.getDeployInstanceCountPerStep(), 0);
  return request.getInstancesSafe() - previousStepTarget;
}
private boolean areSlaveAttributeMinimumsFeasible(SingularityOfferHolder offerHolder, SingularityTaskRequest taskRequest, List<SingularityTaskId> activeTaskIdsForRequest) { if (!taskRequest.getRequest().getSlaveAttributeMinimums().isPresent()) { return true; } Map<String, String> offerAttributes = slaveManager.getSlave(offerHolder.getSlaveId()).get().getAttributes(); Integer numDesiredInstances = taskRequest.getRequest().getInstancesSafe(); Integer numActiveInstances = activeTaskIdsForRequest.size(); for (Entry<String, Map<String, Integer>> keyEntry : taskRequest.getRequest().getSlaveAttributeMinimums().get().entrySet()) { String attrKey = keyEntry.getKey(); for (Entry<String, Integer> valueEntry : keyEntry.getValue().entrySet()) { Integer percentInstancesWithAttr = valueEntry.getValue(); Integer minInstancesWithAttr = Math.max(1, (int) ((percentInstancesWithAttr / 100.0) * numDesiredInstances)); if (offerAttributes.containsKey(attrKey) && offerAttributes.get(attrKey).equals(valueEntry.getKey())) { // Accepting this offer would add an instance of the needed attribute, so it's okay. continue; } // Would accepting this offer prevent meeting the necessary attribute in the future? long numInstancesWithAttr = getNumInstancesWithAttribute(activeTaskIdsForRequest, attrKey, valueEntry.getKey()); long numInstancesWithoutAttr = numActiveInstances - numInstancesWithAttr + 1; long maxPotentialInstancesWithAttr = numDesiredInstances - numInstancesWithoutAttr; if (maxPotentialInstancesWithAttr < minInstancesWithAttr) { return false; } } } return true; }
/**
 * Applies an update-pending-deploy request to a deploy that has progress information.
 *
 * <p>Does nothing unless both {@code updatePendingDeployRequest} and the pending deploy's progress are
 * present. Otherwise the progress is given a new target (the requested target capped at
 * {@code request.getInstancesSafe()}), the pending deploy is updated to {@code WAITING}, and a
 * {@code NEXT_DEPLOY_STEP} pending request is queued.
 *
 * @param pendingDeploy pending deploy to update
 * @param deploy optional deploy supplying the skip-healthchecks flag
 * @param request request supplying the instance cap and id
 * @param updatePendingDeployRequest optional update carrying the requested target
 * @param lbUpdate optional load balancer update; falls back to the pending deploy's last update
 */
private void maybeUpdatePendingRequest(SingularityPendingDeploy pendingDeploy, Optional<SingularityDeploy> deploy, SingularityRequest request, Optional<SingularityUpdatePendingDeployRequest> updatePendingDeployRequest, Optional<SingularityLoadBalancerUpdate> lbUpdate) {
  if (!updatePendingDeployRequest.isPresent() || !pendingDeploy.getDeployProgress().isPresent()) {
    return;
  }

  final int cappedTarget = Math.min(updatePendingDeployRequest.get().getTargetActiveInstances(), request.getInstancesSafe());
  final SingularityDeployProgress newProgress = pendingDeploy.getDeployProgress().get().withNewTargetInstances(cappedTarget);

  updatePendingDeploy(pendingDeploy, lbUpdate.or(pendingDeploy.getLastLoadBalancerUpdate()), DeployState.WAITING, Optional.of(newProgress));

  final SingularityPendingRequest nextStep = new SingularityPendingRequest(
    request.getId(),
    pendingDeploy.getDeployMarker().getDeployId(),
    System.currentTimeMillis(),
    pendingDeploy.getDeployMarker().getUser(),
    PendingType.NEXT_DEPLOY_STEP,
    deploy.isPresent() ? deploy.get().getSkipHealthchecksOnDeploy() : Optional.<Boolean> absent(),
    pendingDeploy.getDeployMarker().getMessage());

  requestManager.addToPendingQueue(nextStep);
}
/**
 * Classifies a request as possibly under-provisioned or over-provisioned.
 *
 * <p>Only requests whose state is runnable and that are always-running are considered. The request id
 * goes into {@code possiblyUnderProvisionedRequestIds} when no count is recorded for it or the count is
 * below {@code request.getInstancesSafe()}, and into {@code overProvisionedRequestIds} when the count
 * is above it. An exact match adds nothing.
 *
 * @param requestWithState request and its state
 * @param numInstances actual instance counts keyed by request id
 * @param overProvisionedRequestIds output list, appended to
 * @param possiblyUnderProvisionedRequestIds output set, added to
 */
private void updatePossiblyUnderProvisionedAndOverProvisionedIds(SingularityRequestWithState requestWithState, Map<String, Long> numInstances, List<String> overProvisionedRequestIds, Set<String> possiblyUnderProvisionedRequestIds) {
  final boolean eligible = requestWithState.getState().isRunnable() && requestWithState.getRequest().isAlwaysRunning();
  if (!eligible) {
    return;
  }

  final SingularityRequest request = requestWithState.getRequest();
  final int expected = request.getInstancesSafe();
  final Long actual = numInstances.get(request.getId());

  if (actual == null || actual < expected) {
    possiblyUnderProvisionedRequestIds.add(request.getId());
    return;
  }

  if (actual > expected) {
    overProvisionedRequestIds.add(request.getId());
  }
}
private TaskCleanupType getCleanupType(SingularityPendingDeploy pendingDeploy, SingularityRequest request, SingularityDeployResult deployResult) { if (pendingDeploy.getDeployProgress().isPresent() && pendingDeploy.getDeployProgress().get().getDeployInstanceCountPerStep() != request.getInstancesSafe()) { // For incremental deploys, return a special cleanup type if (deployResult.getDeployState() == DeployState.FAILED) { return TaskCleanupType.INCREMENTAL_DEPLOY_FAILED; } else if (deployResult.getDeployState() == DeployState.CANCELED) { return TaskCleanupType.INCREMENTAL_DEPLOY_CANCELLED; } } return deployResult.getDeployState().getCleanupType(); }
/**
 * Keeps {@code SPREAD_ALL_SLAVES} requests scaled to the number of active slaves.
 *
 * <p>The active slave count is read once. Then, under the request lock, each active request whose
 * effective placement is {@code SPREAD_ALL_SLAVES} and whose instance count differs from that slave
 * count gets a scale request submitted for the slave count.
 */
@Override
public void runActionOnPoll() {
  final int currentActiveSlaveCount = slaveManager.getNumObjectsAtState(MachineState.ACTIVE);

  for (SingularityRequestWithState requestWithState : requestManager.getActiveRequests()) {
    lock.runWithRequestLock(() -> {
      final SingularityRequest request = requestWithState.getRequest();
      final SlavePlacement placement = request.getSlavePlacement().or(defaultSlavePlacement);

      if (placement == SlavePlacement.SPREAD_ALL_SLAVES) {
        final int requestInstanceCount = request.getInstancesSafe();

        if (requestInstanceCount != currentActiveSlaveCount) {
          LOG.info("Scaling request {} from {} instances to {} available slaves", request.getId(), requestInstanceCount, currentActiveSlaveCount);
          submitScaleRequest(requestWithState, currentActiveSlaveCount);
        } else {
          LOG.trace("Active Request {} is already spread to all {} available slaves", request.getId(), currentActiveSlaveCount);
        }
      }
    }, requestWithState.getRequest().getId(), getClass().getSimpleName());
  }
}
/**
 * Updates the target active instance count of a request's pending deploy.
 *
 * <p>The caller must hold {@code WRITE} authorization on the request. The call is rejected as a bad
 * request when the request has no pending deploy with the given deploy id, or when the requested
 * target is below 1 or above {@code getInstancesSafe()} of the request.
 *
 * @param user authenticated user performing the update
 * @param updateRequest request id, deploy id and new target active instance count
 * @return the result of {@code fillEntireRequest} for the fetched request
 */
@POST
@Path("/update")
@Operation(
  summary = "Update the target active instance count for a pending deploy",
  responses = {
    // Description corrected: the original text was garbled ("pending state pending or is not not present").
    @ApiResponse(responseCode = "400", description = "Deploy is not in the pending state or is not present")
  }
)
public SingularityRequestParent updatePendingDeploy(
  @Parameter(hidden = true) @Auth SingularityUser user,
  @RequestBody(required = true) SingularityUpdatePendingDeployRequest updateRequest) {
  SingularityRequestWithState requestWithState = fetchRequestWithState(updateRequest.getRequestId(), user);
  authorizationHelper.checkForAuthorization(requestWithState.getRequest(), user, SingularityAuthorizationScope.WRITE);

  Optional<SingularityRequestDeployState> deployState = deployManager.getRequestDeployState(requestWithState.getRequest().getId());

  // The update must name the deploy that is actually pending for this request.
  checkBadRequest(deployState.isPresent() && deployState.get().getPendingDeploy().isPresent() && deployState.get().getPendingDeploy().get().getDeployId().equals(updateRequest.getDeployId()),
    "Request %s does not have a pending deploy %s", updateRequest.getRequestId(), updateRequest.getDeployId());

  // The target must lie in [1, instances configured on the request].
  checkBadRequest(updateRequest.getTargetActiveInstances() > 0 && updateRequest.getTargetActiveInstances() <= requestWithState.getRequest().getInstancesSafe(),
    "Cannot update pending deploy to have more instances (%s) than instances set for request (%s), or less than 1 instance", updateRequest.getTargetActiveInstances(), requestWithState.getRequest().getInstancesSafe());

  deployManager.createUpdatePendingDeployRequest(updateRequest);

  return fillEntireRequest(requestWithState);
}
}
/**
 * Validates a deploy's per-instance and per-request resource usage against the configured limits.
 *
 * <p>Resources come from the deploy, or {@code defaultResources} when the deploy declares none. Cpus
 * and memory must be positive and disk non-negative. Each of cpus, memory and disk is then checked
 * against its per-instance maximum and, multiplied by {@code request.getInstancesSafe()}, against its
 * per-request maximum. Each violation is reported through {@code checkBadRequest}.
 *
 * @param request request supplying the instance count
 * @param deploy deploy supplying its id and resources
 */
private void checkForIllegalResources(SingularityRequest request, SingularityDeploy deploy) {
  final int instances = request.getInstancesSafe();
  final double cpus = deploy.getResources().or(defaultResources).getCpus();
  final double memoryMb = deploy.getResources().or(defaultResources).getMemoryMb();
  final double diskMb = deploy.getResources().or(defaultResources).getDiskMb();

  // Basic sanity of the declared amounts.
  checkBadRequest(cpus > 0, "Request must have more than 0 cpus");
  checkBadRequest(memoryMb > 0, "Request must have more than 0 memoryMb");
  checkBadRequest(diskMb >= 0, "Request must have non-negative diskMb");

  // Cpu limits.
  checkBadRequest(cpus <= maxCpusPerInstance,
    "Deploy %s uses too many cpus %s (maxCpusPerInstance %s in mesos configuration)", deploy.getId(), cpus, maxCpusPerInstance);
  checkBadRequest(cpus * instances <= maxCpusPerRequest,
    "Deploy %s uses too many cpus %s (%s*%s) (cpusPerRequest %s in mesos configuration)", deploy.getId(), cpus * instances, cpus, instances, maxCpusPerRequest);

  // Memory limits.
  checkBadRequest(memoryMb <= maxMemoryMbPerInstance,
    "Deploy %s uses too much memoryMb %s (maxMemoryMbPerInstance %s in mesos configuration)", deploy.getId(), memoryMb, maxMemoryMbPerInstance);
  checkBadRequest(memoryMb * instances <= maxMemoryMbPerRequest,
    "Deploy %s uses too much memoryMb %s (%s*%s) (maxMemoryMbPerRequest %s in mesos configuration)", deploy.getId(), memoryMb * instances, memoryMb, instances, maxMemoryMbPerRequest);

  // Disk limits.
  checkBadRequest(diskMb <= maxDiskMbPerInstance,
    "Deploy %s uses too much diskMb %s (maxDiskMbPerInstance %s in mesos configuration)", deploy.getId(), diskMb, maxDiskMbPerInstance);
  checkBadRequest(diskMb * instances <= maxDiskMbPerRequest,
    "Deploy %s uses too much diskMb %s (%s*%s) (maxDiskMbPerRequest %s in mesos configuration)", deploy.getId(), diskMb * instances, diskMb, instances, maxDiskMbPerRequest);
}