/**
 * Marks the slave referenced by a Mesos slave-lost callback as DEAD, updates the
 * disaster-detection counter when that subsystem is enabled, and re-checks the
 * slave's rack. Logs a warning if the slave was never known to us.
 */
void slaveLost(AgentID slaveIdObj) {
  final String slaveId = slaveIdObj.getValue();

  Optional<SingularitySlave> maybeSlave = slaveManager.getObject(slaveId);

  if (!maybeSlave.isPresent()) {
    LOG.warn("Lost a slave {}, but didn't know about it", slaveId);
    return;
  }

  SingularitySlave lostSlave = maybeSlave.get();
  // Capture the state before the transition; the disaster counter keys off it.
  MachineState stateBeforeLoss = lostSlave.getCurrentState().getState();

  slaveManager.changeState(lostSlave, MachineState.DEAD, Optional.absent(), Optional.absent());

  if (configuration.getDisasterDetection().isEnabled()) {
    updateDisasterCounter(stateBeforeLoss);
  }

  checkRackAfterSlaveLoss(lostSlave);
}
Assert.assertEquals(1, slaveManager.getHistory("slave1").size()); Assert.assertTrue(slaveManager.getNumObjectsAtState(MachineState.ACTIVE) == 2); Assert.assertTrue(rackManager.getNumObjectsAtState(MachineState.ACTIVE) == 2); Assert.assertTrue(slaveManager.getObject("slave1").get().getCurrentState().equals(slaveManager.getHistory("slave1").get(0))); Assert.assertTrue(slaveManager.getNumObjectsAtState(MachineState.ACTIVE) == 1); Assert.assertTrue(rackManager.getNumObjectsAtState(MachineState.ACTIVE) == 1); Assert.assertTrue(slaveManager.getNumObjectsAtState(MachineState.DEAD) == 1); Assert.assertTrue(rackManager.getNumObjectsAtState(MachineState.DEAD) == 1); Assert.assertTrue(slaveManager.getObject("slave1").get().getCurrentState().getState() == MachineState.DEAD); Assert.assertTrue(rackManager.getObject("rack1").get().getCurrentState().getState() == MachineState.DEAD); Assert.assertTrue(slaveManager.getNumObjectsAtState(MachineState.ACTIVE) == 2); Assert.assertEquals(2, rackManager.getNumObjectsAtState(MachineState.ACTIVE)); Assert.assertTrue(slaveManager.getNumObjectsAtState(MachineState.DEAD) == 1); Assert.assertTrue(slaveManager.getNumObjectsAtState(MachineState.ACTIVE) == 3); Assert.assertTrue(rackManager.getNumObjectsAtState(MachineState.ACTIVE) == 2); Assert.assertTrue(slaveManager.getNumObjectsAtState(MachineState.ACTIVE) == 2); Assert.assertTrue(rackManager.getNumObjectsAtState(MachineState.ACTIVE) == 2); Assert.assertTrue(slaveManager.getNumObjectsAtState(MachineState.DEAD) == 1); Assert.assertTrue(slaveManager.getHistory("slave1").size() == 4); Assert.assertTrue(slaveManager.getNumObjectsAtState(MachineState.ACTIVE) == 2); Assert.assertTrue(rackManager.getNumObjectsAtState(MachineState.ACTIVE) == 2);
/**
 * Seeds the leader cache with every slave currently persisted in the backing store.
 * Called when this instance becomes leader.
 */
public void activateLeaderCache() {
  leaderCache.cacheSlaves(getObjects());
}
/**
 * Deletes slaves that have been in the DEAD state longer than the configured
 * retention window ({@code deleteDeadSlavesAfterHours}).
 *
 * Fix: the original fetched the DEAD-slave list a second time in the loop header
 * instead of reusing the {@code deadSlaves} snapshot, causing a redundant store
 * query and allowing the emptiness check and the iteration to see different lists.
 */
private void checkDeadSlaves() {
  final long start = System.currentTimeMillis();

  // Single snapshot of dead slaves, reused for the emptiness check, the
  // iteration, and the summary log line.
  final List<SingularitySlave> deadSlaves = slaveManager.getObjectsFiltered(MachineState.DEAD);

  if (deadSlaves.isEmpty()) {
    LOG.trace("No dead slaves");
    return;
  }

  int deleted = 0;
  final long maxDuration = TimeUnit.HOURS.toMillis(configuration.getDeleteDeadSlavesAfterHours());

  for (SingularitySlave deadSlave : deadSlaves) {
    // Age is measured from the timestamp of the transition into DEAD.
    final long duration = System.currentTimeMillis() - deadSlave.getCurrentState().getTimestamp();

    if (duration > maxDuration) {
      SingularityDeleteResult result = slaveManager.deleteObject(deadSlave.getId());
      deleted++;
      LOG.info("Removing dead slave {} ({}) after {} (max {})", deadSlave.getId(), result, JavaUtils.durationFromMillis(duration), JavaUtils.durationFromMillis(maxDuration));
    }
  }

  LOG.debug("Checked {} dead slaves, deleted {} in {}", deadSlaves.size(), deleted, JavaUtils.duration(start));
}
/**
 * Processes expiring machine-state changes for slaves: when an expiring action is
 * due, reverts the slave via {@code handleExpiringObject} and then removes the
 * expiring entry (whether or not the revert succeeded, so it is not retried forever).
 *
 * Fix: the catch block logged the failure message but dropped the caught exception,
 * discarding the stack trace; it is now passed as the final SLF4J argument.
 */
@Override
protected void checkExpiringObjects() {
  for (SingularityExpiringMachineState expiringObject : slaveManager.getExpiringObjects()) {
    if (isExpiringDue(expiringObject)) {
      Optional<SingularitySlave> slave = slaveManager.getObject(expiringObject.getMachineId());

      if (!slave.isPresent()) {
        LOG.warn("Slave {} not present, discarding {}", expiringObject.getMachineId(), expiringObject);
      } else {
        try {
          handleExpiringObject(expiringObject, slave.get(), getMessage(expiringObject));
        } catch (Exception e) {
          // A trailing Throwable after the parameterized message makes SLF4J log the stack trace.
          LOG.error("Could not return slave {} to state {}", slave.get().getHost(), expiringObject.getRevertToState(), e);
        }
      }

      // Delete the entry even on failure so a broken revert does not loop indefinitely.
      slaveManager.deleteExpiringObject(expiringObject.getMachineId());
    }
  }
}
}
@Test public void testSlavePlacementSpread() { initRequest(); initFirstDeploy(); saveAndSchedule(request.toBuilder().setInstances(Optional.of(1)).setSlavePlacement(Optional.of(SlavePlacement.SPREAD_ALL_SLAVES))); sms.resourceOffers(Arrays.asList(createOffer(20, 20000, 50000, "slave1", "host1", Optional.of("rack1")))); // assert one Request on one slave. Assert.assertTrue(slaveManager.getNumObjectsAtState(MachineState.ACTIVE) == 1); Assert.assertTrue(taskManager.getPendingTaskIds().size() == 0); Assert.assertTrue(taskManager.getActiveTaskIds().size() == 1); sms.resourceOffers(Arrays.asList(createOffer(20, 20000, 50000, "slave2", "host2"))); Assert.assertTrue(slaveManager.getNumObjectsAtState(MachineState.ACTIVE) == 2); spreadAllPoller.runActionOnPoll(); scheduler.drainPendingQueue(); sms.resourceOffers(Arrays.asList(createOffer(20, 20000, 50000, "slave2", "host2"))); // assert Request is spread over the two slaves Assert.assertTrue(taskManager.getPendingTaskIds().size() == 0); Assert.assertTrue(taskManager.getActiveTaskIds().size() == 2); Assert.assertEquals(1, taskManager.getTasksOnSlave(taskManager.getActiveTaskIds(), slaveManager.getObject("slave1").get()).size()); Assert.assertEquals(1, taskManager.getTasksOnSlave(taskManager.getActiveTaskIds(), slaveManager.getObject("slave2").get()).size()); // decommission slave and kill task slaveManager.changeState("slave2", MachineState.FROZEN, Optional.<String>absent(), Optional.<String>absent()); slaveManager.changeState("slave2", MachineState.STARTING_DECOMMISSION, Optional.<String>absent(), Optional.<String>absent()); cleaner.drainCleanupQueue(); statusUpdate(taskManager.getTasksOnSlave(taskManager.getActiveTaskIds(), slaveManager.getObject("slave2").get()).get(0), TaskState.TASK_KILLED); spreadAllPoller.runActionOnPoll(); scheduler.drainPendingQueue(); Assert.assertTrue(taskManager.getPendingTaskIds().isEmpty()); Assert.assertTrue(taskManager.getActiveTaskIds().size() == 1); }
/**
 * Verifies that the usage cleaner drops per-task usage for killed tasks and
 * per-slave usage for dead slaves, while keeping usage for live entities.
 */
@Test
public void testUsageCleaner() {
  initRequest();
  initFirstDeploy();
  saveAndSchedule(request.toBuilder().setInstances(Optional.of(2)));
  resourceOffers(1);

  List<SingularityTaskId> taskIds = taskManager.getActiveTaskIds();
  String t1 = taskIds.get(0).getId();
  String t2 = taskIds.get(1).getId();
  String slaveId = slaveManager.getObjectIds().get(0);
  String host = slaveManager.getObjects().get(0).getHost();

  // Report usage for both tasks on the single slave (memory 100 + 1000 = 1100).
  MesosTaskMonitorObject t1u1 = getTaskMonitor(t1, 2, 5, 100);
  MesosTaskMonitorObject t2u1 = getTaskMonitor(t2, 10, 5, 1000);
  mesosClient.setSlaveResourceUsage(host, Arrays.asList(t1u1, t2u1));
  usagePoller.runActionOnPoll();
  cleaner.runActionOnPoll();

  Assert.assertEquals(2, usageManager.getTasksWithUsage().size());
  Assert.assertEquals(1, usageManager.getSlavesWithUsage().size());
  Assert.assertEquals(1100, usageManager.getAllCurrentSlaveUsage().get(0).getMemoryBytesUsed(), 0);

  // kill task one
  statusUpdate(taskManager.getActiveTasks().get(0), TaskState.TASK_KILLED);
  killKilledTasks();
  cleaner.runActionOnPoll();

  // The killed task's usage is cleaned; the slave still has usage for the survivor.
  Assert.assertEquals(1, usageManager.getTasksWithUsage().size());
  Assert.assertEquals(1, usageManager.getSlavesWithUsage().size());

  slaveManager.changeState(slaveId, MachineState.DEAD, Optional.absent(), Optional.absent());
  cleaner.runActionOnPoll();

  // Dead slave's usage is dropped; the surviving task's usage is kept.
  Assert.assertEquals(1, usageManager.getTasksWithUsage().size());
  Assert.assertEquals(0, usageManager.getSlavesWithUsage().size());
}
public void loadSlavesAndRacksFromMaster(MesosMasterStateObject state, boolean isStartup) { Map<String, SingularitySlave> activeSlavesById = slaveManager.getObjectsByIdForState(MachineState.ACTIVE); Map<String, SingularityRack> activeRacksById = rackManager.getObjectsByIdForState(MachineState.ACTIVE); if (slave != null && (!slave.getResources().isPresent() || !slave.getResources().get().equals(slaveJsonObject.getResources()))) { LOG.trace("Found updated resources ({}) for slave {}", slaveJsonObject.getResources(), slave); slaveManager.saveObject(slave.withResources(slaveJsonObject.getResources())); slaveManager.changeState(leftOverSlave, isStartup ? MachineState.MISSING_ON_STARTUP : MachineState.DEAD, Optional.absent(), Optional.absent());
/**
 * Looks up a slave by id, serving from the leader cache when it is active and
 * falling back to the backing store otherwise.
 */
public Optional<SingularitySlave> getSlave(String slaveId) {
  return leaderCache.active() ? leaderCache.getSlave(slaveId) : getObject(slaveId);
}
/**
 * Registers the slave and rack behind a Mesos offer. A slave seen for the first
 * time whose host was previously marked inactive is immediately put into
 * STARTING_DECOMMISSION; new racks are simply recorded. Returns the slave's
 * check result.
 */
public CheckResult checkOffer(Offer offer) {
  final String agentId = offer.getAgentId().getValue();
  final String rackId = slaveAndRackHelper.getRackIdOrDefault(offer);
  final String hostname = slaveAndRackHelper.getMaybeTruncatedHost(offer);
  final Map<String, String> attributes = slaveAndRackHelper.getTextAttributes(offer);

  final SingularitySlave offeredSlave = new SingularitySlave(agentId, hostname, rackId, attributes, Optional.absent());

  CheckResult slaveResult = check(offeredSlave, slaveManager);

  if (slaveResult == CheckResult.NEW) {
    if (inactiveSlaveManager.isInactive(offeredSlave.getHost())) {
      // Hosts flagged inactive are not allowed back in; decommission on sight.
      LOG.info("Slave {} on inactive host {} attempted to rejoin. Marking as decommissioned.", offeredSlave, hostname);
      slaveManager.changeState(
          offeredSlave,
          MachineState.STARTING_DECOMMISSION,
          Optional.of(String.format("Slave %s on inactive host %s attempted to rejoin cluster.", agentId, hostname)),
          Optional.absent());
    } else {
      LOG.info("Offer revealed a new slave {}", offeredSlave);
    }
  }

  final SingularityRack offeredRack = new SingularityRack(rackId);
  if (check(offeredRack, rackManager) == CheckResult.NEW) {
    LOG.info("Offer revealed a new rack {}", offeredRack);
  }

  return slaveResult;
}
/**
 * Rejects a scale change when there are not enough active slaves to place every
 * instance on its own slave. GREEDY and OPTIMISTIC placements have no such
 * requirement; SEPARATE_BY_REQUEST additionally reserves slaves for the previous
 * scale during a bounce.
 */
public void checkScale(SingularityRequest request, Optional<Integer> previousScale) {
  SlavePlacement placement = request.getSlavePlacement().or(defaultSlavePlacement);

  // These placements allow multiple instances per slave; nothing to validate.
  if (placement == SlavePlacement.GREEDY || placement == SlavePlacement.OPTIMISTIC) {
    return;
  }

  int activeSlaves = slaveManager.getNumObjectsAtState(MachineState.ACTIVE);
  int requiredSlaves = request.getInstancesSafe();

  if (placement == SlavePlacement.SEPARATE_BY_REQUEST && previousScale.isPresent()) {
    requiredSlaves += previousScale.get();
  }

  checkBadRequest(activeSlaves >= requiredSlaves, "Not enough active slaves to successfully complete a bounce of request %s (minimum required: %s, current: %s). Consider deploying, or changing the slave placement strategy instead.", request.getId(), requiredSlaves, activeSlaves);
}
/**
 * Verifies that the reconciliation poller deletes slaves that have been DEAD
 * longer than the configured retention, while leaving active slaves alone.
 */
@Test
public void testDeadSlavesArePurged() {
  SingularitySlave liveSlave = new SingularitySlave("1", "h1", "r1", ImmutableMap.of("uniqueAttribute", "1"), Optional.absent());
  SingularitySlave deadSlave = new SingularitySlave("2", "h1", "r1", ImmutableMap.of("uniqueAttribute", "2"), Optional.absent());

  final long now = System.currentTimeMillis();

  // One ACTIVE slave, and one slave that entered DEAD roughly 10 hours ago.
  liveSlave = liveSlave.changeState(new SingularityMachineStateHistoryUpdate("1", MachineState.ACTIVE, 100, Optional.absent(), Optional.absent()));
  deadSlave = deadSlave.changeState(new SingularityMachineStateHistoryUpdate("2", MachineState.DEAD, now - TimeUnit.HOURS.toMillis(10), Optional.absent(), Optional.absent()));

  slaveManager.saveObject(liveSlave);
  slaveManager.saveObject(deadSlave);

  slaveReconciliationPoller.runActionOnPoll();

  // Under the default retention the 10-hour-dead slave survives the poll.
  Assert.assertEquals(1, slaveManager.getObjectsFiltered(MachineState.ACTIVE).size());
  Assert.assertEquals(1, slaveManager.getObjectsFiltered(MachineState.DEAD).size());

  // With a 1-hour retention the dead slave is purged on the next poll.
  configuration.setDeleteDeadSlavesAfterHours(1);
  slaveReconciliationPoller.runActionOnPoll();

  Assert.assertEquals(1, slaveManager.getObjectsFiltered(MachineState.ACTIVE).size());
  Assert.assertEquals(0, slaveManager.getObjectsFiltered(MachineState.DEAD).size());
}
/**
 * Rejects a new decommission request when the number of slaves already
 * decommissioning (in progress or just starting) has reached the configured cap.
 */
public void validateDecommissioningCount() {
  int inProgress = slaveManager.getObjectsFiltered(MachineState.DECOMMISSIONING).size();
  int starting = slaveManager.getObjectsFiltered(MachineState.STARTING_DECOMMISSION).size();
  int decommissioning = inProgress + starting;

  checkBadRequest(decommissioning < maxDecommissioningSlaves, "%s slaves are already decommissioning state (%s allowed at once). Allow these slaves to finish before decommissioning another", decommissioning, maxDecommissioningSlaves);
}
/**
 * After a task finishes, completes the decommission of its slave and/or rack if
 * that machine was DECOMMISSIONING and this was its last remaining task.
 * Unknown slaves/racks are reported to the exception notifier and skipped.
 */
void checkStateAfterFinishedTask(SingularityTaskId taskId, String slaveId, SingularityLeaderCache leaderCache) {
  Optional<SingularitySlave> slave = slaveManager.getSlave(slaveId);

  if (!slave.isPresent()) {
    final String message = String.format("Couldn't find slave with id %s for task %s", slaveId, taskId);
    LOG.warn(message);
    exceptionNotifier.notify(message, ImmutableMap.of("slaveId", slaveId, "taskId", taskId.toString()));
    return;
  }

  // If the slave was draining and this was its last task, finish the decommission,
  // carrying forward the message/user recorded on the DECOMMISSIONING transition.
  if (slave.get().getCurrentState().getState() == MachineState.DECOMMISSIONING) {
    if (!hasTaskLeftOnSlave(taskId, slaveId, leaderCache)) {
      slaveManager.changeState(slave.get(), MachineState.DECOMMISSIONED, slave.get().getCurrentState().getMessage(), slave.get().getCurrentState().getUser());
    }
  }

  Optional<SingularityRack> rack = rackManager.getObject(slave.get().getRackId());

  if (!rack.isPresent()) {
    final String message = String.format("Couldn't find rack with id %s for task %s", slave.get().getRackId(), taskId);
    LOG.warn(message);
    exceptionNotifier.notify(message, ImmutableMap.of("rackId", slave.get().getRackId(), "taskId", taskId.toString()));
    return;
  }

  // Same completion check at rack granularity.
  if (rack.get().getCurrentState().getState() == MachineState.DECOMMISSIONING) {
    if (!hasTaskLeftOnRack(taskId, leaderCache)) {
      rackManager.changeState(rack.get(), MachineState.DECOMMISSIONED, rack.get().getCurrentState().getMessage(), rack.get().getCurrentState().getUser());
    }
  }
}
String t2 = taskIds.get(1).getId(); String slaveId = slaveManager.getObjectIds().get(0); String host = slaveManager.getObjects().get(0).getHost();
/**
 * Verifies that a later state change which invalidates a pending expiring
 * revert (here: decommissioning a slave whose expiring action would revert it
 * to FROZEN) clears that expiring entry.
 */
@Test
public void testSystemChangeClearsExpiringChangeIfInvalid() {
  SingularitySlave slave = getSingleSlave();

  // Freeze, then activate with an expiring action that reverts back to FROZEN after 1ms.
  slaveResource.freezeSlave(singularityUser, slave.getId(), null);
  slaveResource.activateSlave(singularityUser, slave.getId(), new SingularityMachineChangeRequest(Optional.of(1L), Optional.absent(), Optional.absent(), Optional.of(MachineState.FROZEN), Optional.absent()));

  Assert.assertTrue(slaveManager.getExpiringObject(slave.getId()).isPresent());

  // Decommissioning makes the pending revert-to-FROZEN invalid, so it is removed.
  slaveResource.decommissionSlave(singularityUser, slave.getId(), null);

  Assert.assertFalse(slaveManager.getExpiringObject(slave.getId()).isPresent());
}
/**
 * Populates every manager's portion of the leader cache, then marks the cache
 * active. leaderCache.activate() is deliberately last so readers never observe
 * a partially-filled cache.
 */
public void activateLeaderCache() {
  taskManager.activateLeaderCache();
  deployManager.activateLeaderCache();
  requestManager.activateLeaderCache();
  slaveManager.activateLeaderCache();
  rackManager.activateLeaderCache();
  usageManager.activateLeaderCache();
  leaderCache.activate();
}
for (SingularityTask task : taskManager.getTasksOnSlave(taskManager.getActiveTaskIds(), slaveManager.getObject("slave1").get())) { statusUpdate(task, TaskState.TASK_RUNNING); for (SingularityTask task : taskManager.getTasksOnSlave(taskManager.getActiveTaskIds(), slaveManager.getObject("slave2").get())) { statusUpdate(task, TaskState.TASK_RUNNING); Assert.assertTrue(slaveManager.getNumObjectsAtState(MachineState.ACTIVE) == 4); Assert.assertTrue(taskManager.getTasksOnSlave(taskManager.getActiveTaskIds(), slaveManager.getObject("slave1").get()).size() == 1); Assert.assertTrue(taskManager.getTasksOnSlave(taskManager.getActiveTaskIds(), slaveManager.getObject("slave2").get()).size() == 1); Assert.assertTrue(taskManager.getTasksOnSlave(taskManager.getActiveTaskIds(), slaveManager.getObject("slave3").get()).isEmpty()); Assert.assertTrue(taskManager.getTasksOnSlave(taskManager.getActiveTaskIds(), slaveManager.getObject("slave4").get()).isEmpty()); Assert.assertEquals(StateChangeResult.SUCCESS, slaveManager.changeState("slave1", MachineState.STARTING_DECOMMISSION, Optional.absent(), Optional.of("user1"))); Assert.assertEquals(StateChangeResult.FAILURE_ALREADY_AT_STATE, slaveManager.changeState("slave1", MachineState.STARTING_DECOMMISSION, Optional.absent(), Optional.of("user1"))); Assert.assertEquals(StateChangeResult.FAILURE_NOT_FOUND, slaveManager.changeState("slave9231", MachineState.STARTING_DECOMMISSION, Optional.absent(), Optional.of("user1"))); Assert.assertEquals(MachineState.STARTING_DECOMMISSION, slaveManager.getObject("slave1").get().getCurrentState().getState()); Assert.assertTrue(slaveManager.getObject("slave1").get().getCurrentState().getUser().get().equals("user1")); Assert.assertTrue(taskManager.getTasksOnSlave(taskManager.getActiveTaskIds(), slaveManager.getObject("slave1").get()).size() == 1); Assert.assertTrue(slaveManager.getObject("slave1").get().getCurrentState().getState() == MachineState.DECOMMISSIONING); 
Assert.assertTrue(slaveManager.getObject("slave1").get().getCurrentState().getUser().get().equals("user1")); Assert.assertTrue(slaveManager.getObject("slave1").get().getCurrentState().getState() == MachineState.DECOMMISSIONING); Assert.assertTrue(slaveManager.getObject("slave1").get().getCurrentState().getUser().get().equals("user1")); for (SingularityTask task : taskManager.getTasksOnSlave(taskManager.getActiveTaskIds(), slaveManager.getObject("slave4").get())) {
@Test public void testUsageCleaner() { initRequest(); initFirstDeploy(); saveAndSchedule(request.toBuilder().setInstances(Optional.of(2))); resourceOffers(1); List<SingularityTaskId> taskIds = taskManager.getActiveTaskIds(); String t1 = taskIds.get(0).getId(); String t2 = taskIds.get(1).getId(); String slaveId = slaveManager.getObjectIds().get(0); String host = slaveManager.getObjects().get(0).getHost(); MesosTaskMonitorObject t1u1 = getTaskMonitor(t1, 2, 5, 100); MesosTaskMonitorObject t2u1 = getTaskMonitor(t2, 10, 5, 1000); mesosClient.setSlaveResourceUsage(host, Arrays.asList(t1u1, t2u1)); usagePoller.runActionOnPoll(); cleaner.runActionOnPoll(); Assert.assertEquals(2, usageManager.getTasksWithUsage().size()); Assert.assertEquals(1, usageManager.getSlavesWithUsage().size()); Assert.assertEquals(1100, usageManager.getAllCurrentSlaveUsage().get(0).getMemoryBytesUsed(), 0); // kill task one statusUpdate(taskManager.getActiveTasks().get(0), TaskState.TASK_KILLED); killKilledTasks(); cleaner.runActionOnPoll(); Assert.assertEquals(1, usageManager.getTasksWithUsage().size()); Assert.assertEquals(1, usageManager.getSlavesWithUsage().size()); slaveManager.changeState(slaveId, MachineState.DEAD, Optional.absent(), Optional.absent()); cleaner.runActionOnPoll(); Assert.assertEquals(1, usageManager.getTasksWithUsage().size()); Assert.assertEquals(0, usageManager.getSlavesWithUsage().size()); }
public void loadSlavesAndRacksFromMaster(MesosMasterStateObject state, boolean isStartup) { Map<String, SingularitySlave> activeSlavesById = slaveManager.getObjectsByIdForState(MachineState.ACTIVE); Map<String, SingularityRack> activeRacksById = rackManager.getObjectsByIdForState(MachineState.ACTIVE); if (slave != null && (!slave.getResources().isPresent() || !slave.getResources().get().equals(slaveJsonObject.getResources()))) { LOG.trace("Found updated resources ({}) for slave {}", slaveJsonObject.getResources(), slave); slaveManager.saveObject(slave.withResources(slaveJsonObject.getResources())); slaveManager.changeState(leftOverSlave, isStartup ? MachineState.MISSING_ON_STARTUP : MachineState.DEAD, Optional.absent(), Optional.absent());