private boolean isRunningLongerThanThreshold(SingularityDeploy deploy, long durationSinceRunning) { long relevantTimeoutSeconds = deploy.getHealthcheck().isPresent() ? getMaxHealthcheckTimeoutSeconds(deploy.getHealthcheck().get()) : deploy.getDeployHealthTimeoutSeconds().or(configuration.getDeployHealthyBySeconds()); return durationSinceRunning > TimeUnit.SECONDS.toMillis(relevantTimeoutSeconds); }
private long getAllowedMillis(SingularityDeploy deploy) { long seconds = deploy.getDeployHealthTimeoutSeconds().or(configuration.getDeployHealthyBySeconds()); if (deploy.getHealthcheck().isPresent() && !deploy.getSkipHealthchecksOnDeploy().or(false)) { seconds += deployHealthHelper.getMaxHealthcheckTimeoutSeconds(deploy.getHealthcheck().get()); } else { seconds += deploy.getConsiderHealthyAfterRunningForSeconds().or(configuration.getConsiderTaskHealthyAfterRunningForSeconds()); } return TimeUnit.SECONDS.toMillis(seconds); }
public SingularityHealthcheckAsyncHandler(SingularityExceptionNotifier exceptionNotifier, SingularityConfiguration configuration, SingularityHealthchecker healthchecker, SingularityNewTaskChecker newTaskChecker, TaskManager taskManager, SingularityTask task) { this.exceptionNotifier = exceptionNotifier; this.taskManager = taskManager; this.newTaskChecker = newTaskChecker; this.healthchecker = healthchecker; this.task = task; this.maxHealthcheckResponseBodyBytes = configuration.getMaxHealthcheckResponseBodyBytes(); this.failureStatusCodes = task.getTaskRequest().getDeploy().getHealthcheck().isPresent() ? task.getTaskRequest().getDeploy().getHealthcheck().get().getFailureStatusCodes().or(configuration.getHealthcheckFailureStatusCodes()) : configuration.getHealthcheckFailureStatusCodes(); startTime = System.currentTimeMillis(); }
private Optional<String> getHealthcheckUri(SingularityTask task) { if (!task.getTaskRequest().getDeploy().getHealthcheck().isPresent()) { return Optional.absent(); } HealthcheckOptions options = task.getTaskRequest().getDeploy().getHealthcheck().get(); final String hostname = task.getHostname(); Optional<Long> healthcheckPort = options.getPortNumber().or(MesosUtils.getPortByIndex(mesosProtosUtils.toResourceList(task.getMesosTask().getResources()), options.getPortIndex().or(0))); if (!healthcheckPort.isPresent() || healthcheckPort.get() < 1L) { return Optional.absent(); } if (!task.getTaskRequest().getDeploy().getHealthcheck().get().getUri().isPresent()) { return Optional.absent(); } String uri = task.getTaskRequest().getDeploy().getHealthcheck().get().getUri().get(); if (uri.startsWith("/")) { uri = uri.substring(1); } HealthcheckProtocol protocol = options.getProtocol().or(DEFAULT_HEALTH_CHECK_SCHEME); return Optional.of(String.format("%s://%s:%d/%s", protocol.getProtocol(), hostname, healthcheckPort.get(), uri)); }
if (deploy.getHealthcheck().isPresent() && deploy.getHealthcheck().get().getHealthcheckResultFilePath().isPresent()) { if (taskManager.getTaskHistoryUpdate(taskId, ExtendedTaskState.TASK_RUNNING).isPresent()) { LOG.debug("Task {} has non-web healthcheck and is in running state, marking healthy.", taskId); LOG.debug("Found a failed healthcheck: {}", healthcheckResult); if (deploy.getHealthcheck().isPresent() && healthcheckResult.get().getStatusCode().isPresent() && deploy.getHealthcheck().get().getFailureStatusCodes().or(configuration.getHealthcheckFailureStatusCodes()).contains(healthcheckResult.get().getStatusCode().get())) { LOG.debug("Failed healthcheck had bad status code: {}", healthcheckResult.get().getStatusCode().get()); return DeployHealth.UNHEALTHY; final int startupTimeout = deploy.getHealthcheck().isPresent() ? deploy.getHealthcheck().get().getStartupTimeoutSeconds().or(configuration.getStartupTimeoutSeconds()) : configuration.getStartupTimeoutSeconds(); Collection<SingularityTaskHistoryUpdate> updates = taskManager.getTaskHistoryUpdates(taskId); Optional<Long> runningAt = getRunningAt(updates); final Optional<Integer> healthcheckMaxRetries = deploy.getHealthcheck().isPresent() ? deploy.getHealthcheck().get().getMaxRetries().or(configuration.getHealthcheckMaxRetries()) : Optional.<Integer>absent(); final Optional<Integer> healthcheckMaxTotalTimeoutSeconds = deploy.getHealthcheck().isPresent() ? Optional.of(getMaxHealthcheckTimeoutSeconds(deploy.getHealthcheck().get())) : Optional.<Integer>absent();
private boolean shouldCheckHealthchecks(final SingularityRequest request, final Optional<SingularityDeploy> deploy, final Collection<SingularityTaskId> activeTasks, final boolean isDeployPending) { if (disasterManager.isDisabled(SingularityAction.RUN_HEALTH_CHECKS)) { return false; } if (!deploy.isPresent()) { return false; } if (!deploy.get().getHealthcheck().isPresent()) { return false; } if (isDeployPending && deploy.get().getSkipHealthchecksOnDeploy().or(false)) { return false; } if (request.getSkipHealthchecks().or(Boolean.FALSE)) { return false; } for (SingularityTask task : taskManager.getTasks(activeTasks).values()) { if (task.getTaskRequest().getPendingTask().getSkipHealthchecks().or(Boolean.FALSE)) { return false; } } return true; }
private boolean hasHealthcheck(SingularityTask task, Optional<SingularityRequestWithState> requestWithState) { if (disasterManager.isDisabled(SingularityAction.RUN_HEALTH_CHECKS)) { return false; } if (!task.getTaskRequest().getDeploy().getHealthcheck().isPresent()) { return false; } if (task.getTaskRequest().getPendingTask().getSkipHealthchecks().or(Boolean.FALSE)) { return false; } if (requestWithState.isPresent() && requestWithState.get().getRequest().getSkipHealthchecks().or(Boolean.FALSE)) { return false; } return true; }
final Optional<Integer> healthcheckMaxRetries = deploy.getHealthcheck().isPresent() ? deploy.getHealthcheck().get().getMaxRetries().or(configuration.getHealthcheckMaxRetries()) : configuration.getHealthcheckMaxRetries(); if (healthcheckMaxRetries.isPresent() && taskManager.getNumNonstartupHealthchecks(taskId) > healthcheckMaxRetries.get()) { String message = String.format("Instance %s failed %s healthchecks, the max for the deploy.", taskId.getInstanceNo(), healthcheckMaxRetries.get() + 1); if (runningAt.isPresent()) { final long durationSinceRunning = System.currentTimeMillis() - runningAt.get(); if (healthcheckResult.isStartup() && deploy.getHealthcheck().isPresent() && durationSinceRunning > deploy.getHealthcheck().get().getStartupTimeoutSeconds() .or(configuration.getStartupTimeoutSeconds())) { String message = String.format("Instance %s has not responded to healthchecks after running for %s", taskId.getInstanceNo(), JavaUtils.durationFromMillis(durationSinceRunning));
public void enqueueHealthcheck(SingularityTask task, boolean ignoreExisting, boolean inStartup, boolean isFirstCheck) { HealthcheckOptions options = task.getTaskRequest().getDeploy().getHealthcheck().get(); final Optional<Integer> healthcheckMaxRetries = options.getMaxRetries().or(configuration.getHealthcheckMaxRetries()); Optional<Long> maybeRunningAt = getRunningAt(taskManager.getTaskHistoryUpdates(task.getTaskId())); if (maybeRunningAt.isPresent()) { final long durationSinceRunning = System.currentTimeMillis() - maybeRunningAt.get(); final int startupTimeout = options.getStartupTimeoutSeconds().or(configuration.getStartupTimeoutSeconds()); if (inStartup && durationSinceRunning > TimeUnit.SECONDS.toMillis(startupTimeout)) { LOG.debug("{} since running", durationSinceRunning); LOG.info("Not enqueuing new healthcheck for {}, has not responded to healthchecks before startup timeout of {}s", task.getTaskId(), startupTimeout); return; } } if (healthcheckMaxRetries.isPresent() && taskManager.getNumNonstartupHealthchecks(task.getTaskId()) > healthcheckMaxRetries.get()) { LOG.info("Not enqueuing new healthcheck for {}, it has already attempted {} times", task.getTaskId(), healthcheckMaxRetries.get()); return; } ScheduledFuture<?> future = enqueueHealthcheckWithDelay(task, getDelaySeconds(task.getTaskId(), options, inStartup, isFirstCheck), inStartup); ScheduledFuture<?> existing = taskIdToHealthcheck.put(task.getTaskId().getId(), future); if (existing != null) { boolean canceledExisting = existing.cancel(false); if (!ignoreExisting) { LOG.warn("Found existing overlapping healthcheck for task {} - cancel success: {}", task.getTaskId(), canceledExisting); } } }
private int getDelaySeconds(SingularityTask task, Optional<SingularityRequestWithState> requestWithState) { int delaySeconds = configuration.getNewTaskCheckerBaseDelaySeconds(); if (hasHealthcheck(task, requestWithState)) { Optional<Integer> maybeStartupDelay = task.getTaskRequest().getDeploy().getHealthcheck().get().getStartupDelaySeconds().or(configuration.getStartupDelaySeconds()); if (maybeStartupDelay.isPresent()) { return maybeStartupDelay.get(); } } else if (task.getTaskRequest().getRequest().isLoadBalanced()) { return delaySeconds; } delaySeconds += task.getTaskRequest().getDeploy().getDeployHealthTimeoutSeconds().or(configuration.getDeployHealthyBySeconds()); return delaySeconds; }
private boolean shouldHealthcheck(final SingularityTask task, final Optional<SingularityRequestWithState> request, Optional<SingularityPendingDeploy> pendingDeploy) { if (disasterManager.isDisabled(SingularityAction.RUN_HEALTH_CHECKS)) { return false; } if (!task.getTaskRequest().getRequest().isLongRunning() || !task.getTaskRequest().getDeploy().getHealthcheck().isPresent() || task.getTaskRequest().getDeploy().getHealthcheck().get().getHealthcheckResultFilePath().isPresent()) { return false; } if (task.getTaskRequest().getPendingTask().getSkipHealthchecks().or(false)) { return false; } if (pendingDeploy.isPresent() && pendingDeploy.get().getDeployMarker().getDeployId().equals(task.getTaskId().getDeployId()) && task.getTaskRequest().getDeploy().getSkipHealthchecksOnDeploy().or(false)) { return false; } if (request.isPresent() && request.get().getRequest().getSkipHealthchecks().or(false)) { return false; } Optional<SingularityTaskHealthcheckResult> lastHealthcheck = taskManager.getLastHealthcheck(task.getTaskId()); if (lastHealthcheck.isPresent() && !lastHealthcheck.get().isFailed()) { LOG.debug("Not submitting a new healthcheck for {} because it already passed a healthcheck", task.getTaskId()); return false; } return true; }
private void asyncHealthcheck(final SingularityTask task) { final SingularityHealthcheckAsyncHandler handler = new SingularityHealthcheckAsyncHandler(exceptionNotifier, configuration, this, newTaskChecker, taskManager, task); final Optional<String> uri = getHealthcheckUri(task); if (!uri.isPresent()) { saveFailure(handler, "Invalid healthcheck uri or ports not present"); return; } final Integer timeoutSeconds = task.getTaskRequest().getDeploy().getHealthcheck().isPresent() ? task.getTaskRequest().getDeploy().getHealthcheck().get().getResponseTimeoutSeconds().or(configuration.getHealthcheckTimeoutSeconds()) : configuration.getHealthcheckTimeoutSeconds(); try { PerRequestConfig prc = new PerRequestConfig(); prc.setRequestTimeoutInMs((int) TimeUnit.SECONDS.toMillis(timeoutSeconds)); RequestBuilder builder = new RequestBuilder("GET"); builder.setFollowRedirects(true); builder.setUrl(uri.get()); builder.setPerRequestConfig(prc); LOG.trace("Issuing a healthcheck ({}) for task {} with timeout {}s", uri.get(), task.getTaskId(), timeoutSeconds); http.prepareRequest(builder.build()).execute(handler); } catch (Throwable t) { LOG.debug("Exception while preparing healthcheck ({}) for task ({})", uri, task.getTaskId(), t); exceptionNotifier.notify(String.format("Error preparing healthcheck (%s)", t.getMessage()), t, ImmutableMap.of("taskId", task.getTaskId().toString())); saveFailure(handler, String.format("Healthcheck failed due to exception: %s", t.getMessage())); } }
if (deploy.getHealthcheck().isPresent()) { HealthcheckOptions healthcheck = deploy.getHealthcheck().get(); checkBadRequest(!(healthcheck.getPortIndex().isPresent() && healthcheck.getPortNumber().isPresent()), "Can only specify one of portIndex or portNumber for healthchecks"); if (deploy.getHealthcheck().isPresent()) { HealthcheckOptions healthcheckOptions = deploy.getHealthcheck().get(); boolean hasUri = healthcheckOptions.getUri().isPresent() && !Strings.isNullOrEmpty(healthcheckOptions.getUri().get()); boolean hasHealthCheckResultFilePath = healthcheckOptions.getHealthcheckResultFilePath().isPresent();
Optional<HealthcheckOptions> healthcheckOptions = task.getRequest().getSkipHealthchecks().or(false) ? Optional.absent() : task.getDeploy().getHealthcheck(); final SingularityTaskExecutorData executorData = new SingularityTaskExecutorData(executorDataBldr.build(), uploaderAdditionalFiles, defaultS3Bucket, s3UploaderKeyPattern, configuration.getCustomExecutorConfiguration().getServiceLog(), configuration.getCustomExecutorConfiguration().getServiceFinishedTailLog(), task.getRequest().getGroup(),
private boolean isRunningLongerThanThreshold(SingularityDeploy deploy, long durationSinceRunning) { long relevantTimeoutSeconds = deploy.getHealthcheck().isPresent() ? getMaxHealthcheckTimeoutSeconds(deploy.getHealthcheck().get()) : deploy.getDeployHealthTimeoutSeconds().or(configuration.getDeployHealthyBySeconds()); return durationSinceRunning > TimeUnit.SECONDS.toMillis(relevantTimeoutSeconds); }
private long getAllowedMillis(SingularityDeploy deploy) { long seconds = deploy.getDeployHealthTimeoutSeconds().or(configuration.getDeployHealthyBySeconds()); if (deploy.getHealthcheck().isPresent() && !deploy.getSkipHealthchecksOnDeploy().or(false)) { seconds += deployHealthHelper.getMaxHealthcheckTimeoutSeconds(deploy.getHealthcheck().get()); } else { seconds += deploy.getConsiderHealthyAfterRunningForSeconds().or(configuration.getConsiderTaskHealthyAfterRunningForSeconds()); } return TimeUnit.SECONDS.toMillis(seconds); }
public SingularityHealthcheckAsyncHandler(SingularityExceptionNotifier exceptionNotifier, SingularityConfiguration configuration, SingularityHealthchecker healthchecker, SingularityNewTaskChecker newTaskChecker, TaskManager taskManager, SingularityTask task) { this.exceptionNotifier = exceptionNotifier; this.taskManager = taskManager; this.newTaskChecker = newTaskChecker; this.healthchecker = healthchecker; this.task = task; this.maxHealthcheckResponseBodyBytes = configuration.getMaxHealthcheckResponseBodyBytes(); this.failureStatusCodes = task.getTaskRequest().getDeploy().getHealthcheck().isPresent() ? task.getTaskRequest().getDeploy().getHealthcheck().get().getFailureStatusCodes().or(configuration.getHealthcheckFailureStatusCodes()) : configuration.getHealthcheckFailureStatusCodes(); startTime = System.currentTimeMillis(); }
private Optional<String> getHealthcheckUri(SingularityTask task) { if (!task.getTaskRequest().getDeploy().getHealthcheck().isPresent()) { return Optional.absent(); } HealthcheckOptions options = task.getTaskRequest().getDeploy().getHealthcheck().get(); final String hostname = task.getHostname(); Optional<Long> healthcheckPort = options.getPortNumber().or(MesosUtils.getPortByIndex(mesosProtosUtils.toResourceList(task.getMesosTask().getResources()), options.getPortIndex().or(0))); if (!healthcheckPort.isPresent() || healthcheckPort.get() < 1L) { return Optional.absent(); } if (!task.getTaskRequest().getDeploy().getHealthcheck().get().getUri().isPresent()) { return Optional.absent(); } String uri = task.getTaskRequest().getDeploy().getHealthcheck().get().getUri().get(); if (uri.startsWith("/")) { uri = uri.substring(1); } HealthcheckProtocol protocol = options.getProtocol().or(DEFAULT_HEALTH_CHECK_SCHEME); return Optional.of(String.format("%s://%s:%d/%s", protocol.getProtocol(), hostname, healthcheckPort.get(), uri)); }
private boolean hasHealthcheck(SingularityTask task, Optional<SingularityRequestWithState> requestWithState) { if (disasterManager.isDisabled(SingularityAction.RUN_HEALTH_CHECKS)) { return false; } if (!task.getTaskRequest().getDeploy().getHealthcheck().isPresent()) { return false; } if (task.getTaskRequest().getPendingTask().getSkipHealthchecks().or(Boolean.FALSE)) { return false; } if (requestWithState.isPresent() && requestWithState.get().getRequest().getSkipHealthchecks().or(Boolean.FALSE)) { return false; } return true; }
private int getDelaySeconds(SingularityTask task, Optional<SingularityRequestWithState> requestWithState) { int delaySeconds = configuration.getNewTaskCheckerBaseDelaySeconds(); if (hasHealthcheck(task, requestWithState)) { Optional<Integer> maybeStartupDelay = task.getTaskRequest().getDeploy().getHealthcheck().get().getStartupDelaySeconds().or(configuration.getStartupDelaySeconds()); if (maybeStartupDelay.isPresent()) { return maybeStartupDelay.get(); } } else if (task.getTaskRequest().getRequest().isLoadBalanced()) { return delaySeconds; } delaySeconds += task.getTaskRequest().getDeploy().getDeployHealthTimeoutSeconds().or(configuration.getDeployHealthyBySeconds()); return delaySeconds; }