public SingularityTaskRequestHolder(SingularityTaskRequest taskRequest, Resources defaultResources, Resources defaultCustomExecutorResources) { this.taskRequest = taskRequest; this.executorResources = taskRequest.getDeploy().getCustomExecutorCmd().isPresent() ? taskRequest.getDeploy().getCustomExecutorResources().or(defaultCustomExecutorResources) : Resources.EMPTY_RESOURCES;; this.taskResources = taskRequest.getPendingTask().getResources().or(taskRequest.getDeploy().getResources()).or(defaultResources); this.totalResources = Resources.add(taskResources, executorResources); this.requestedPorts = new ArrayList<>(); if (taskRequest.getDeploy().getContainerInfo().isPresent() && taskRequest.getDeploy().getContainerInfo().get().getDocker().isPresent()) { requestedPorts.addAll(taskRequest.getDeploy().getContainerInfo().get().getDocker().get().getLiteralHostPorts()); } }
/**
 * Computes the CPU hard limit for a task, if a hard limit is configured.
 *
 * <p>The requested CPUs (pending task override first, then deploy) are multiplied by the configured
 * scale factor and rounded up; the result is the larger of that value and the configured hard limit.
 *
 * @param task the task request whose resources are inspected
 * @return the hard limit, or absent when no hard limit is configured or no resources are specified
 */
private Optional<Integer> getCpuHardLimit(SingularityTaskRequest task) {
  if (!configuration.getCpuHardLimit().isPresent()) {
    return Optional.absent();
  }

  final Optional<Resources> requested = task.getPendingTask().getResources().or(task.getDeploy().getResources());
  if (!requested.isPresent()) {
    return Optional.absent();
  }

  final double cpus = requested.get().getCpus();
  final int scaled = (int) Math.ceil(cpus * configuration.getCpuHardLimitScaleFactor());
  return Optional.of(Math.max(scaled, configuration.getCpuHardLimit().get()));
}
/**
 * Creates a handler for a single asynchronous healthcheck of {@code task}.
 *
 * <p>The failure status codes come from the deploy's healthcheck options when present, falling back
 * to the configured defaults. The creation time is recorded as the handler's start time.
 */
public SingularityHealthcheckAsyncHandler(SingularityExceptionNotifier exceptionNotifier, SingularityConfiguration configuration, SingularityHealthchecker healthchecker,
    SingularityNewTaskChecker newTaskChecker, TaskManager taskManager, SingularityTask task) {
  this.exceptionNotifier = exceptionNotifier;
  this.taskManager = taskManager;
  this.newTaskChecker = newTaskChecker;
  this.healthchecker = healthchecker;
  this.task = task;
  this.maxHealthcheckResponseBodyBytes = configuration.getMaxHealthcheckResponseBodyBytes();

  if (task.getTaskRequest().getDeploy().getHealthcheck().isPresent()) {
    // Deploy-level codes win; the configuration supplies the fallback.
    this.failureStatusCodes = task.getTaskRequest().getDeploy().getHealthcheck().get().getFailureStatusCodes()
        .or(configuration.getHealthcheckFailureStatusCodes());
  } else {
    this.failureStatusCodes = configuration.getHealthcheckFailureStatusCodes();
  }

  startTime = System.currentTimeMillis();
}
/**
 * Builds the full healthcheck URL ({@code protocol://host:port/uri}) for a task.
 *
 * @param task the task to build a healthcheck URL for
 * @return the URL, or absent when the deploy has no healthcheck options, no usable port
 *         (missing or less than 1), or no healthcheck uri
 */
private Optional<String> getHealthcheckUri(SingularityTask task) {
  final Optional<HealthcheckOptions> maybeOptions = task.getTaskRequest().getDeploy().getHealthcheck();
  if (!maybeOptions.isPresent()) {
    return Optional.absent();
  }
  final HealthcheckOptions options = maybeOptions.get();

  final String hostname = task.getHostname();

  // An explicit port number wins; otherwise pick the port at the configured index (default 0).
  final Optional<Long> portFromIndex =
      MesosUtils.getPortByIndex(mesosProtosUtils.toResourceList(task.getMesosTask().getResources()), options.getPortIndex().or(0));
  final Optional<Long> healthcheckPort = options.getPortNumber().or(portFromIndex);
  if (!healthcheckPort.isPresent() || healthcheckPort.get() < 1L) {
    return Optional.absent();
  }

  final Optional<String> maybeUri = options.getUri();
  if (!maybeUri.isPresent()) {
    return Optional.absent();
  }

  // The format string already supplies the separating slash, so drop one leading slash.
  final String rawUri = maybeUri.get();
  final String path = rawUri.startsWith("/") ? rawUri.substring(1) : rawUri;

  final HealthcheckProtocol protocol = options.getProtocol().or(DEFAULT_HEALTH_CHECK_SCHEME);

  return Optional.of(String.format("%s://%s:%d/%s", protocol.getProtocol(), hostname, healthcheckPort.get(), path));
}
/**
 * Derives a human-readable message for a task status update.
 *
 * <p>A non-empty message on the status is returned as-is. Otherwise a message is synthesized for the
 * memory and disk container-limitation reasons, including the deploy's configured limits when the
 * task and its deploy resources are available.
 *
 * @param status the status update
 * @param task the task the status belongs to, if known
 * @return the message, or absent when the status has no message and no handled reason
 */
private Optional<String> getStatusMessage(Protos.TaskStatus status, Optional<SingularityTask> task) {
  if (status.hasMessage() && !Strings.isNullOrEmpty(status.getMessage())) {
    return Optional.of(status.getMessage());
  }

  if (!status.hasReason()) {
    return Optional.absent();
  }

  if (status.getReason() == Reason.REASON_CONTAINER_LIMITATION_MEMORY) {
    final boolean limitsKnown = task.isPresent() && task.get().getTaskRequest().getDeploy().getResources().isPresent();
    if (!limitsKnown) {
      return Optional.of("Task exceeded memory limit.");
    }
    if (task.get().getTaskRequest().getDeploy().getResources().get().getDiskMb() > 0) {
      return Optional.of(String.format("Task exceeded one or more memory limits (%s MB mem, %s MB disk).",
          task.get().getTaskRequest().getDeploy().getResources().get().getMemoryMb(),
          task.get().getTaskRequest().getDeploy().getResources().get().getDiskMb()));
    }
    return Optional.of(String.format("Task exceeded memory limit (%s MB mem).",
        task.get().getTaskRequest().getDeploy().getResources().get().getMemoryMb()));
  }

  if (status.getReason() == Reason.REASON_CONTAINER_LIMITATION_DISK) {
    final boolean limitsKnown = task.isPresent() && task.get().getTaskRequest().getDeploy().getResources().isPresent();
    if (!limitsKnown) {
      return Optional.of("Task exceeded disk limit.");
    }
    return Optional.of(String.format("Task exceeded disk limit (%s MB disk).",
        task.get().getTaskRequest().getDeploy().getResources().get().getDiskMb()));
  }

  return Optional.absent();
}
/**
 * Tells whether the task request's deploy id matches the active or the pending deploy marker
 * of the given deploy state.
 *
 * @param requestDeployState the deploy state to compare against; {@code null} never matches
 * @param taskRequest the task request whose deploy id is checked
 * @return {@code true} when either marker matches
 */
private boolean matchesDeploy(SingularityRequestDeployState requestDeployState, SingularityTaskRequest taskRequest) {
  if (requestDeployState == null) {
    return false;
  }

  // Active deploy is checked first; the pending deploy is only consulted when it does not match.
  if (matchesDeployMarker(requestDeployState.getActiveDeploy(), taskRequest.getDeploy().getId())) {
    return true;
  }
  return matchesDeployMarker(requestDeployState.getPendingDeploy(), taskRequest.getDeploy().getId());
}
/**
 * Computes how many seconds to wait before checking a new task.
 *
 * <ul>
 *   <li>With a healthcheck: the startup delay (deploy value, else configured value) when one exists.</li>
 *   <li>Without a healthcheck but load balanced: the configured base delay only.</li>
 *   <li>Otherwise: the base delay plus the deploy health timeout (deploy value, else configured value).</li>
 * </ul>
 *
 * @param task the task being checked
 * @param requestWithState the owning request, if known; passed through to {@code hasHealthcheck}
 * @return the delay in seconds
 */
private int getDelaySeconds(SingularityTask task, Optional<SingularityRequestWithState> requestWithState) {
  final int baseDelaySeconds = configuration.getNewTaskCheckerBaseDelaySeconds();

  if (!hasHealthcheck(task, requestWithState)) {
    if (task.getTaskRequest().getRequest().isLoadBalanced()) {
      return baseDelaySeconds;
    }
  } else {
    final Optional<Integer> startupDelay = task.getTaskRequest().getDeploy().getHealthcheck().get().getStartupDelaySeconds()
        .or(configuration.getStartupDelaySeconds());
    if (startupDelay.isPresent()) {
      return startupDelay.get();
    }
  }

  // Compound assignment is kept deliberately so any implicit narrowing of the addend is unchanged.
  int totalDelaySeconds = baseDelaySeconds;
  totalDelaySeconds += task.getTaskRequest().getDeploy().getDeployHealthTimeoutSeconds().or(configuration.getDeployHealthyBySeconds());
  return totalDelaySeconds;
}
/**
 * Tells whether healthchecks apply to the given task.
 *
 * <p>Healthchecks apply only when the {@code RUN_HEALTH_CHECKS} action is not disabled, the deploy
 * defines healthcheck options, and neither the pending task nor the request (when present) asks to
 * skip healthchecks.
 *
 * @param task the task in question
 * @param requestWithState the owning request, if known
 * @return {@code true} when all of the above hold
 */
private boolean hasHealthcheck(SingularityTask task, Optional<SingularityRequestWithState> requestWithState) {
  // Short-circuit evaluation keeps the checks in the same order as a chain of guard clauses.
  return !disasterManager.isDisabled(SingularityAction.RUN_HEALTH_CHECKS)
      && task.getTaskRequest().getDeploy().getHealthcheck().isPresent()
      && !task.getTaskRequest().getPendingTask().getSkipHealthchecks().or(Boolean.FALSE)
      && !(requestWithState.isPresent() && requestWithState.get().getRequest().getSkipHealthchecks().or(Boolean.FALSE));
}
private boolean isPreemptibleTask(SingularityTaskRequest taskRequest) { // A long running task can be replaced + killed easily if (taskRequest.getRequest().getRequestType().isLongRunning()) { return true; } // A short, non-long-running task Optional<SingularityDeployStatistics> deployStatistics = deployManager.getDeployStatistics(taskRequest.getRequest().getId(), taskRequest.getDeploy().getId()); return deployStatistics.isPresent() && deployStatistics.get().getAverageRuntimeMillis().isPresent() && deployStatistics.get().getAverageRuntimeMillis().get() < configuration.getPreemptibleTaskMaxExpectedRuntimeMs(); }
/**
 * Converts tasks into load balancer upstream entries ({@code host:port}).
 *
 * <p>The port is taken from the task's mesos resources at the deploy's load balancer port index
 * (default 0). Tasks without such a port are skipped with a warning. When a task label key for the
 * upstream group is configured, the first matching label value overrides the supplied group.
 *
 * @param tasks the tasks to convert
 * @param requestId the request id attached to every upstream
 * @param loadBalancerUpstreamGroup the default upstream group
 * @return one upstream per task that has a load balancer port
 */
private List<UpstreamInfo> tasksToUpstreams(List<SingularityTask> tasks, String requestId, Optional<String> loadBalancerUpstreamGroup) {
  final List<UpstreamInfo> result = Lists.newArrayListWithCapacity(tasks.size());

  for (SingularityTask task : tasks) {
    final Optional<Long> port = MesosUtils.getPortByIndex(
        mesosProtosUtils.toResourceList(task.getMesosTask().getResources()),
        task.getTaskRequest().getDeploy().getLoadBalancerPortIndex().or(0));

    if (!port.isPresent()) {
      LOG.warn("Task {} is missing port but is being passed to LB ({})", task.getTaskId(), task);
      continue;
    }

    final String hostAndPort = String.format("%s:%d", task.getHostname(), port.get());

    Optional<String> upstreamGroup = loadBalancerUpstreamGroup;
    if (taskLabelForLoadBalancerUpstreamGroup.isPresent()) {
      for (MesosParameter label : task.getMesosTask().getLabels().getLabels()) {
        final boolean isGroupLabel = label.hasKey()
            && label.getKey().equals(taskLabelForLoadBalancerUpstreamGroup.get())
            && label.hasValue();
        if (isGroupLabel) {
          // First matching label wins.
          upstreamGroup = Optional.of(label.getValue());
          break;
        }
      }
    }

    result.add(new UpstreamInfo(hostAndPort, Optional.of(requestId), task.getRackId(), Optional.<String>absent(), upstreamGroup));
  }

  return result;
}
/**
 * Stubs the mocked task request so it exposes the request, deploy and pending task fixtures,
 * along with their ids.
 */
@Before
public void setup() {
  // Ids returned by the individual fixtures.
  Mockito.when(request.getId()).thenReturn("requestId");
  Mockito.when(deploy.getId()).thenReturn("deployId");
  Mockito.when(task.getPendingTaskId()).thenReturn(taskId);

  // Fixtures reachable from the task request.
  Mockito.when(taskRequest.getRequest()).thenReturn(request);
  Mockito.when(taskRequest.getDeploy()).thenReturn(deploy);
  Mockito.when(taskRequest.getPendingTask()).thenReturn(task);
}
/**
 * Schedules the next healthcheck for a task, replacing any healthcheck already scheduled for it.
 *
 * <p>Nothing is scheduled when the task is in startup and has been running longer than the startup
 * timeout, or when the number of non-startup healthchecks already exceeds the maximum retries.
 * The deploy is expected to define healthcheck options; they are read without a presence check.
 *
 * @param task the task to healthcheck
 * @param ignoreExisting when {@code true}, no warning is logged if a previously scheduled check is replaced
 * @param inStartup whether the task is still in its startup phase
 * @param isFirstCheck whether this is the first check for the task; used to pick the delay
 */
public void enqueueHealthcheck(SingularityTask task, boolean ignoreExisting, boolean inStartup, boolean isFirstCheck) {
  final HealthcheckOptions options = task.getTaskRequest().getDeploy().getHealthcheck().get();
  final Optional<Integer> healthcheckMaxRetries = options.getMaxRetries().or(configuration.getHealthcheckMaxRetries());

  final Optional<Long> runningSince = getRunningAt(taskManager.getTaskHistoryUpdates(task.getTaskId()));
  if (runningSince.isPresent()) {
    final long elapsedMillis = System.currentTimeMillis() - runningSince.get();
    final int startupTimeout = options.getStartupTimeoutSeconds().or(configuration.getStartupTimeoutSeconds());
    final boolean startupTimedOut = inStartup && elapsedMillis > TimeUnit.SECONDS.toMillis(startupTimeout);
    if (startupTimedOut) {
      LOG.debug("{} since running", elapsedMillis);
      LOG.info("Not enqueuing new healthcheck for {}, has not responded to healthchecks before startup timeout of {}s", task.getTaskId(), startupTimeout);
      return;
    }
  }

  final boolean retriesExhausted = healthcheckMaxRetries.isPresent()
      && taskManager.getNumNonstartupHealthchecks(task.getTaskId()) > healthcheckMaxRetries.get();
  if (retriesExhausted) {
    LOG.info("Not enqueuing new healthcheck for {}, it has already attempted {} times", task.getTaskId(), healthcheckMaxRetries.get());
    return;
  }

  final ScheduledFuture<?> scheduled = enqueueHealthcheckWithDelay(task, getDelaySeconds(task.getTaskId(), options, inStartup, isFirstCheck), inStartup);

  // Register the new future; a previously registered one is cancelled without interrupting it.
  final ScheduledFuture<?> previous = taskIdToHealthcheck.put(task.getTaskId().getId(), scheduled);
  if (previous == null) {
    return;
  }

  final boolean canceledExisting = previous.cancel(false);
  if (!ignoreExisting) {
    LOG.warn("Found existing overlapping healthcheck for task {} - cancel success: {}", task.getTaskId(), canceledExisting);
  }
}
/**
 * Decides whether a new healthcheck should be submitted for a task.
 *
 * <p>Returns {@code false} when healthchecks are disabled, the request is not long running, the
 * deploy has no healthcheck options or uses a healthcheck result file path, healthchecks are skipped
 * (by the pending task, by the deploy while it is the pending deploy, or by the request), or the
 * last recorded healthcheck did not fail.
 *
 * @param task the task in question
 * @param request the owning request, if known
 * @param pendingDeploy the pending deploy, if any
 * @return {@code true} when a healthcheck should be submitted
 */
private boolean shouldHealthcheck(final SingularityTask task, final Optional<SingularityRequestWithState> request, Optional<SingularityPendingDeploy> pendingDeploy) {
  if (disasterManager.isDisabled(SingularityAction.RUN_HEALTH_CHECKS)) {
    return false;
  }

  if (!task.getTaskRequest().getRequest().isLongRunning()) {
    return false;
  }
  if (!task.getTaskRequest().getDeploy().getHealthcheck().isPresent()) {
    return false;
  }
  if (task.getTaskRequest().getDeploy().getHealthcheck().get().getHealthcheckResultFilePath().isPresent()) {
    return false;
  }

  if (task.getTaskRequest().getPendingTask().getSkipHealthchecks().or(false)) {
    return false;
  }

  final boolean skippedForPendingDeploy = pendingDeploy.isPresent()
      && pendingDeploy.get().getDeployMarker().getDeployId().equals(task.getTaskId().getDeployId())
      && task.getTaskRequest().getDeploy().getSkipHealthchecksOnDeploy().or(false);
  if (skippedForPendingDeploy) {
    return false;
  }

  if (request.isPresent() && request.get().getRequest().getSkipHealthchecks().or(false)) {
    return false;
  }

  final Optional<SingularityTaskHealthcheckResult> previousResult = taskManager.getLastHealthcheck(task.getTaskId());
  if (previousResult.isPresent() && !previousResult.get().isFailed()) {
    LOG.debug("Not submitting a new healthcheck for {} because it already passed a healthcheck", task.getTaskId());
    return false;
  }

  return true;
}
/**
 * Issues an asynchronous HTTP GET healthcheck for a task.
 *
 * <p>A failure is saved immediately when no healthcheck URL can be built. Any throwable raised
 * while preparing or issuing the request is logged, reported to the exception notifier and saved
 * as a failure.
 *
 * @param task the task to check
 */
private void asyncHealthcheck(final SingularityTask task) {
  final SingularityHealthcheckAsyncHandler handler = new SingularityHealthcheckAsyncHandler(exceptionNotifier, configuration, this, newTaskChecker, taskManager, task);
  final Optional<String> uri = getHealthcheckUri(task);

  if (!uri.isPresent()) {
    saveFailure(handler, "Invalid healthcheck uri or ports not present");
    return;
  }

  // Deploy-level response timeout wins; the configured timeout is the fallback.
  final Integer timeoutSeconds;
  if (task.getTaskRequest().getDeploy().getHealthcheck().isPresent()) {
    timeoutSeconds = task.getTaskRequest().getDeploy().getHealthcheck().get().getResponseTimeoutSeconds().or(configuration.getHealthcheckTimeoutSeconds());
  } else {
    timeoutSeconds = configuration.getHealthcheckTimeoutSeconds();
  }

  try {
    final PerRequestConfig requestConfig = new PerRequestConfig();
    requestConfig.setRequestTimeoutInMs((int) TimeUnit.SECONDS.toMillis(timeoutSeconds));

    final RequestBuilder requestBuilder = new RequestBuilder("GET");
    requestBuilder.setFollowRedirects(true);
    requestBuilder.setUrl(uri.get());
    requestBuilder.setPerRequestConfig(requestConfig);

    LOG.trace("Issuing a healthcheck ({}) for task {} with timeout {}s", uri.get(), task.getTaskId(), timeoutSeconds);

    http.prepareRequest(requestBuilder.build()).execute(handler);
  } catch (Throwable t) {
    LOG.debug("Exception while preparing healthcheck ({}) for task ({})", uri, task.getTaskId(), t);
    exceptionNotifier.notify(String.format("Error preparing healthcheck (%s)", t.getMessage()), t, ImmutableMap.of("taskId", task.getTaskId().toString()));
    saveFailure(handler, String.format("Healthcheck failed due to exception: %s", t.getMessage()));
  }
}
/**
 * Kills a task through the mesos scheduler client and saves a killed-task record.
 *
 * <p>When the effective cleanup type is {@code USER_REQUESTED_DESTROY} or {@code REQUEST_DELETING}
 * and the task runs under a custom executor, a destroy framework message is sent to the executor
 * before the kill. The saved record carries {@code retries + 1}, where absent retries count as -1.
 *
 * @param taskId the task to kill
 * @param requestCleanupType the request-level cleanup type, if any
 * @param taskCleanupType the task-level cleanup type, if any
 * @param originalTimestamp timestamp of the first kill attempt; the current time is used when absent
 * @param retries number of kill attempts so far, if any
 * @param user the user on whose behalf the task is destroyed, if any
 * @throws IllegalStateException when {@code isRunning()} is {@code false}
 */
public void killAndRecord(SingularityTaskId taskId, Optional<RequestCleanupType> requestCleanupType, Optional<TaskCleanupType> taskCleanupType, Optional<Long> originalTimestamp, Optional<Integer> retries, Optional<String> user) {
  Preconditions.checkState(isRunning());

  final Optional<TaskCleanupType> effectiveCleanupType = getTaskCleanupType(requestCleanupType, taskCleanupType);
  final boolean destroyRequested = effectiveCleanupType.isPresent()
      && (effectiveCleanupType.get() == TaskCleanupType.USER_REQUESTED_DESTROY || effectiveCleanupType.get() == TaskCleanupType.REQUEST_DELETING);

  if (destroyRequested) {
    final Optional<SingularityTask> task = taskManager.getTask(taskId);
    if (!task.isPresent()) {
      final String message = String.format("No task data available to build kill task framework message for task %s", taskId);
      exceptionNotifier.notify(message);
      LOG.error(message);
    } else if (!task.get().getTaskRequest().getDeploy().getCustomExecutorCmd().isPresent()) {
      LOG.warn("Not using custom executor, will not send framework message to destroy task");
    } else {
      final byte[] messageBytes = transcoder.toBytes(new SingularityTaskDestroyFrameworkMessage(taskId, user));
      mesosSchedulerClient.frameworkMessage(
          MesosProtosUtils.toExecutorId(task.get().getMesosTask().getExecutor().getExecutorId()),
          MesosProtosUtils.toAgentId(task.get().getMesosTask().getAgentId()),
          messageBytes
      );
    }
  }

  // The kill is issued regardless of whether a destroy message was sent.
  mesosSchedulerClient.kill(TaskID.newBuilder().setValue(taskId.toString()).build());

  taskManager.saveKilledRecord(
      new SingularityKilledTaskIdRecord(
          taskId,
          System.currentTimeMillis(),
          originalTimestamp.or(System.currentTimeMillis()),
          requestCleanupType,
          taskCleanupType,
          retries.or(-1) + 1));
}
/**
 * Cleans up a finished task and decides whether its app directory may be removed.
 *
 * <p>The app directory is kept when the executor data asks to preserve the sandbox, or when the task
 * failed more recently than the configured retention period for failed tasks. Logs are cleaned up
 * once the last task update is in a done state and older than 15 minutes.
 *
 * @param taskDefinition the executor-side definition of the task being cleaned
 * @param taskHistory the task's history, if available; without it no log cleanup or failure-age
 *        check is done and the task is treated as non-docker
 * @return the result reported by the task cleanup
 */
private TaskCleanupResult cleanTask(SingularityExecutorTaskDefinition taskDefinition, Optional<SingularityTaskHistory> taskHistory) {
  SingularityExecutorTaskLogManager logManager = new SingularityExecutorTaskLogManager(taskDefinition, templateManager, baseConfiguration, executorConfiguration, LOG, jsonObjectFileHelper, false);

  SingularityExecutorTaskCleanup taskCleanup = new SingularityExecutorTaskCleanup(logManager, executorConfiguration, taskDefinition, LOG, dockerUtils);

  boolean cleanupTaskAppDirectory = !taskDefinition.getExecutorData().getPreserveTaskSandboxAfterFinish().or(Boolean.FALSE);

  if (taskDefinition.shouldLogrotateLogFile()) {
    checkForUncompressedLogrotatedFile(taskDefinition);
  }

  if (taskHistory.isPresent()) {
    final Optional<SingularityTaskHistoryUpdate> lastUpdate = JavaUtils.getLast(taskHistory.get().getTaskUpdates());

    if (lastUpdate.isPresent()) {
      if (lastUpdate.get().getTaskState().isDone() && System.currentTimeMillis() - lastUpdate.get().getTimestamp() > TimeUnit.MINUTES.toMillis(15)) {
        // The message has a "{}" placeholder, so the task id must be passed for it to be rendered.
        LOG.info("Task {} is done for > 15 minutes, removing logrotate files", taskDefinition.getTaskId());
        taskCleanup.cleanUpLogs();
      }
      if (lastUpdate.get().getTaskState().isFailed()) {
        final long delta = System.currentTimeMillis() - lastUpdate.get().getTimestamp();

        // Keep the app directory of recently failed tasks around for the configured period.
        if (delta < cleanupConfiguration.getCleanupAppDirectoryOfFailedTasksAfterMillis()) {
          LOG.info("Not cleaning up task app directory of {} because only {} has elapsed since it failed (will cleanup after {})", taskDefinition.getTaskId(),
              JavaUtils.durationFromMillis(delta), JavaUtils.durationFromMillis(cleanupConfiguration.getCleanupAppDirectoryOfFailedTasksAfterMillis()));
          cleanupTaskAppDirectory = false;
        }
      }
    }
  }

  boolean isDocker = (taskHistory.isPresent()
      && taskHistory.get().getTask().getTaskRequest().getDeploy().getContainerInfo().isPresent()
      && taskHistory.get().getTask().getTaskRequest().getDeploy().getContainerInfo().get().getType() == SingularityContainerType.DOCKER);

  return taskCleanup.cleanup(cleanupTaskAppDirectory, isDocker);
}
/**
 * Sums the maximum probable cpu, memory and disk usage of the active tasks on one host.
 *
 * <p>For tasks whose request has a recorded utilization, the estimated cpu usage and the maximum
 * observed memory/disk bytes are used. Otherwise the task's requested resources (pending task, then
 * deploy, then defaults) are used, with megabytes converted to bytes. Tasks that cannot be loaded
 * contribute nothing.
 *
 * @param activeTaskIds all active task ids; only those on {@code sanitizedHostname} are counted
 * @param requestUtilizations recorded utilization keyed by request id
 * @param sanitizedHostname the host to total usage for
 * @return the summed usage
 */
private MaxProbableUsage getMaxProbableUsageForSlave(List<SingularityTaskId> activeTaskIds, Map<String, RequestUtilization> requestUtilizations, String sanitizedHostname) {
  double cpuTotal = 0;
  double memBytesTotal = 0;
  double diskBytesTotal = 0;

  for (SingularityTaskId taskId : activeTaskIds) {
    if (!taskId.getSanitizedHost().equals(sanitizedHostname)) {
      continue;
    }

    if (requestUtilizations.containsKey(taskId.getRequestId())) {
      final RequestUtilization utilization = requestUtilizations.get(taskId.getRequestId());
      cpuTotal += getEstimatedCpuUsageForRequest(utilization);
      memBytesTotal += utilization.getMaxMemBytesUsed();
      diskBytesTotal += utilization.getMaxDiskBytesUsed();
      continue;
    }

    // No utilization recorded for this request: fall back to what the task asked for.
    final Optional<SingularityTask> maybeTask = taskManager.getTask(taskId);
    if (!maybeTask.isPresent()) {
      continue;
    }

    final Resources requested = maybeTask.get().getTaskRequest().getPendingTask().getResources()
        .or(maybeTask.get().getTaskRequest().getDeploy().getResources())
        .or(defaultResources);
    cpuTotal += requested.getCpus();
    memBytesTotal += requested.getMemoryMb() * SingularitySlaveUsage.BYTES_PER_MEGABYTE;
    diskBytesTotal += requested.getDiskMb() * SingularitySlaveUsage.BYTES_PER_MEGABYTE;
  }

  return new MaxProbableUsage(cpuTotal, memBytesTotal, diskBytesTotal);
}
/**
 * Produces a slimmed-down copy of a task.
 *
 * <p>When the configuration asks to store all mesos task info for debugging, the task is returned
 * untouched. Otherwise the mesos task data is cleared, each offer is replaced by its size-optimized
 * form, and the deploy's executor data (when present) is removed.
 *
 * @param taskHolder holder of the task and its mesos task info
 * @return the original task or its reduced copy
 */
SingularityTask getSizeOptimizedTask(SingularityMesosTaskHolder taskHolder) {
  final SingularityTask original = taskHolder.getTask();
  if (configuration.isStoreAllMesosTaskInfoForDebugging()) {
    return original;
  }

  final TaskInfo.Builder strippedMesosTask = taskHolder.getMesosTask().toBuilder();
  strippedMesosTask.clearData();

  final List<MesosOfferObject> slimOffers = original.getOffers()
      .stream()
      .map(offer -> offer.sizeOptimized())
      .collect(Collectors.toList());

  SingularityTaskRequest slimRequest = original.getTaskRequest();
  if (original.getTaskRequest().getDeploy().getExecutorData().isPresent()) {
    // Rebuild the task request around a deploy copy that has no executor data.
    final SingularityDeployBuilder deployCopy = original.getTaskRequest().getDeploy().toBuilder();
    deployCopy.setExecutorData(Optional.absent());

    slimRequest = new SingularityTaskRequest(original.getTaskRequest().getRequest(), deployCopy.build(), original.getTaskRequest().getPendingTask());
  }

  return new SingularityTask(slimRequest, original.getTaskId(), slimOffers, mesosProtosUtils.taskFromProtos(strippedMesosTask.build()), original.getRackId());
}
/**
 * Queues a {@code RETRY} pending request that mirrors the given task's original pending task.
 *
 * <p>The new request copies the user, run id, command line arguments, healthcheck skipping, message,
 * resources, S3 uploader files, run-as-user override, env overrides, extra artifacts and action id
 * from the pending task, keeps its next-run-at time, and is stamped with the current time.
 *
 * @param task the task to relaunch
 */
private void relaunchTask(SingularityTask task) {
  final SingularityPendingTask previousAttempt = task.getTaskRequest().getPendingTask();

  requestManager.addToPendingQueue(
      new SingularityPendingRequestBuilder()
          .setRequestId(task.getTaskRequest().getRequest().getId())
          .setDeployId(task.getTaskRequest().getDeploy().getId())
          .setPendingType(PendingType.RETRY)
          .setUser(previousAttempt.getUser())
          .setRunId(previousAttempt.getRunId())
          .setCmdLineArgsList(previousAttempt.getCmdLineArgsList())
          .setSkipHealthchecks(previousAttempt.getSkipHealthchecks())
          .setMessage(previousAttempt.getMessage())
          .setResources(previousAttempt.getResources())
          .setS3UploaderAdditionalFiles(previousAttempt.getS3UploaderAdditionalFiles())
          .setRunAsUserOverride(previousAttempt.getRunAsUserOverride())
          .setEnvOverrides(previousAttempt.getEnvOverrides())
          .setExtraArtifacts(previousAttempt.getExtraArtifacts())
          .setActionId(previousAttempt.getActionId())
          .setRunAt(previousAttempt.getPendingTaskId().getNextRunAt())
          .setTimestamp(System.currentTimeMillis())
          .build());
}