/**
 * Clean up a container.
 * PRECONDITION: All of the processes have died.
 * @param dynamicState current state
 * @param staticState static data
 * @param nextState the next MachineState to go to.
 * @return the next state.
 */
private static DynamicState cleanupCurrentContainer(DynamicState dynamicState, StaticState staticState,
                                                    MachineState nextState) throws Exception {
    assert (dynamicState.container != null);
    assert (dynamicState.currentAssignment != null);
    assert (dynamicState.container.areAllProcessesDead());

    dynamicState.container.cleanUp();
    staticState.localizer.releaseSlotFor(dynamicState.currentAssignment, staticState.port);
    DynamicState ret = dynamicState.withCurrentAssignment(null, null);
    if (nextState != null) {
        ret = ret.withState(nextState);
    }
    return ret;
}
/**
 * State Transitions for KILL state.
 * PRECONDITION: container.kill() was called
 * PRECONDITION: container != null && currentAssignment != null
 * @param dynamicState current state
 * @param staticState static data
 * @return the next state
 * @throws Exception on any error
 */
private static DynamicState handleKill(DynamicState dynamicState, StaticState staticState) throws Exception {
    assert (dynamicState.container != null);
    assert (dynamicState.currentAssignment != null);

    if (dynamicState.container.areAllProcessesDead()) {
        LOG.info("SLOT {} all processes are dead...", staticState.port);
        return cleanupCurrentContainer(dynamicState, staticState,
                                       dynamicState.pendingLocalization == null
                                           ? MachineState.EMPTY
                                           : MachineState.WAITING_FOR_BLOB_LOCALIZATION);
    }

    LOG.warn("SLOT {} force kill and wait...", staticState.port);
    dynamicState.container.forceKill();
    Time.sleep(staticState.killSleepMs);
    return dynamicState;
}
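handleKill is one arm of a larger state machine: a dispatcher looks at dynamicState.state, runs the matching handler, and loops on whatever DynamicState comes back. A minimal sketch of such a dispatcher follows; only handleKill is shown in this section, so the other state and handler names here are assumptions for illustration, not verified Storm code.

// Illustrative sketch of the dispatch step driving handlers like handleKill.
// State names other than KILL/EMPTY, and handler names other than handleKill,
// are assumed for illustration rather than taken from Storm's Slot.
static DynamicState stateMachineStep(DynamicState dynamicState, StaticState staticState) throws Exception {
    switch (dynamicState.state) {
        case KILL:
            return handleKill(dynamicState, staticState);
        case KILL_AND_RELAUNCH:
            return handleKillAndRelaunch(dynamicState, staticState);
        case EMPTY:
            return handleEmpty(dynamicState, staticState);
        default:
            throw new IllegalStateException("Unknown state " + dynamicState.state);
    }
}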
// Excerpt (likely from killContainerFor): if anything is still running, notify the
// ISupervisor when the kill was triggered by an assignment or blob change, then kill.
boolean isDead = dynamicState.container.areAllProcessesDead();
if (!isDead) {
    if (reason == KillReason.ASSIGNMENT_CHANGED || reason == KillReason.BLOB_CHANGED) {
        staticState.iSupervisor.killedWorker(staticState.port);
    }
    dynamicState.container.kill();
}
@Override
public void forceKill() throws IOException {
    LOG.info("Force Killing {}:{}", _supervisorId, _workerId);
    numForceKill.mark();
    try {
        Set<Long> pids = getAllPids();
        for (Long pid : pids) {
            forceKill(pid);
        }
    } catch (IOException e) {
        numForceKillExceptions.mark();
        throw e;
    }
}
@Override
public void kill() throws IOException {
    LOG.info("Killing {}:{}", _supervisorId, _workerId);
    if (shutdownTimer == null) {
        shutdownTimer = shutdownDuration.time();
    }
    try {
        Set<Long> pids = getAllPids();
        for (Long pid : pids) {
            kill(pid);
        }
    } catch (IOException e) {
        numKillExceptions.mark();
        throw e;
    }
}
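The per-pid kill(long) and forceKill(long) these loops call differ only in the signal: SIGTERM gives the worker JVM a chance to shut down cleanly, SIGKILL does not. A standalone sketch of that distinction, assuming a hypothetical signal helper that shells out to kill(1); Storm's real per-pid implementation is not shown in this section.

import java.io.IOException;

// Illustrative sketch: graceful vs. forced termination of a single pid.
// The signal(...) helper is hypothetical, not Storm's actual API.
final class SignalSketch {
    /** Graceful stop: SIGTERM (15) lets the JVM run shutdown hooks. */
    static void kill(long pid) throws IOException {
        signal(pid, 15);
    }

    /** Forced stop: SIGKILL (9) cannot be caught or ignored. */
    static void forceKill(long pid) throws IOException {
        signal(pid, 9);
    }

    private static void signal(long pid, int signum) throws IOException {
        // Shelling out to kill(1); a real implementation would also check the exit code.
        new ProcessBuilder("kill", "-" + signum, String.valueOf(pid)).start();
    }
}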
when(cContainer.readHeartbeat()).thenReturn(chb);
when(cContainer.areAllProcessesDead()).thenReturn(false, false, true);
// ...
verify(cContainer).kill();
verify(localizer, never()).requestDownloadTopologyBlobs(null, port, cb);
assertSame("pendingDownload not set properly", null, nextState.pendingDownload);
// ...
verify(cContainer).forceKill();
assertSame("pendingDownload not set properly", null, nextState.pendingDownload);
assertEquals(null, nextState.pendingLocalization);
// ...
verify(cContainer).cleanUp();
verify(localizer).releaseSlotFor(cAssignment, port);
assertEquals(null, nextState.container);
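thenReturn(false, false, true) is Mockito's consecutive stubbing: the scripted values are consumed one per call and the last value repeats forever, which is how these tests walk the slot through kill, forceKill, and finally dead across successive state-machine steps. A self-contained illustration of the idiom:

import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

import java.util.Iterator;

// Illustrative sketch of Mockito consecutive stubbing, the idiom these tests
// use to script areAllProcessesDead() across state-machine steps.
public class ConsecutiveStubbingSketch {
    @SuppressWarnings("unchecked")
    public static void main(String[] args) {
        Iterator<Boolean> it = mock(Iterator.class);
        when(it.hasNext()).thenReturn(false, false, true);
        System.out.println(it.hasNext()); // false
        System.out.println(it.hasNext()); // false
        System.out.println(it.hasNext()); // true
        System.out.println(it.hasNext()); // true (the last value repeats)
    }
}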
LSWorkerHeartbeat oldhb = mkWorkerHB(topoId, port, execList, Time.currentTimeSecs() - 10);
LSWorkerHeartbeat goodhb = mkWorkerHB(topoId, port, execList, Time.currentTimeSecs());
when(container.readHeartbeat()).thenReturn(oldhb, oldhb, goodhb, goodhb);
when(container.areAllProcessesDead()).thenReturn(false, false, true);
// ...
verify(container).kill();
assertTrue(Time.currentTimeMillis() > 1000);
// ...
verify(container).forceKill();
assertTrue(Time.currentTimeMillis() > 2000);
// ...
verify(container).relaunch();
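The bare assertions on Time.currentTimeMillis() only make sense under Storm's simulated clock, where time starts near zero and advances only on request (here, via the sleeps inside the state machine). A minimal sketch of that harness, assuming org.apache.storm.utils.Time's SimulatedTime and advanceTime, which these tests appear to rely on:

import org.apache.storm.utils.Time;

// Illustrative sketch: under SimulatedTime the clock starts near zero and only
// moves when the test advances it, so absolute-time assertions are deterministic.
public class SimulatedTimeSketch {
    public static void main(String[] args) throws Exception {
        try (Time.SimulatedTime t = new Time.SimulatedTime()) {
            long start = Time.currentTimeMillis();
            Time.advanceTime(1500); // pretend 1.5 seconds passed
            System.out.println(Time.currentTimeMillis() - start); // prints 1500
        }
    }
}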
/**
 * Set up the container to run. By default this creates the needed directories/links in the local file system.
 * PREREQUISITE: All needed blobs and topology jars/configs have been downloaded and placed in the appropriate locations.
 *
 * @throws IOException on any error
 */
protected void setup() throws IOException {
    _type.assertFull();
    if (!_ops.doRequiredTopoFilesExist(_conf, _topologyId)) {
        LOG.info("Missing topology storm code, so can't launch worker with assignment {} for this supervisor {} on port {} with id {}",
                 _assignment, _supervisorId, _port, _workerId);
        throw new IllegalStateException("Not all needed files are here!!!!");
    }

    LOG.info("Setting up {}:{}", _supervisorId, _workerId);
    _ops.forceMkdir(new File(ConfigUtils.workerPidsRoot(_conf, _workerId)));
    _ops.forceMkdir(new File(ConfigUtils.workerTmpRoot(_conf, _workerId)));
    _ops.forceMkdir(new File(ConfigUtils.workerHeartbeatsRoot(_conf, _workerId)));

    File workerArtifacts = new File(ConfigUtils.workerArtifactsRoot(_conf, _topologyId, _port));
    if (!_ops.fileExists(workerArtifacts)) {
        _ops.forceMkdir(workerArtifacts);
        _ops.setupWorkerArtifactsDir(_assignment.get_owner(), workerArtifacts);
    }

    String user = getWorkerUser();
    writeLogMetadata(user);
    saveWorkerUser(user);
    createArtifactsLink();
    createBlobstoreLinks();
}
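createArtifactsLink() and createBlobstoreLinks() give the worker fixed relative paths into shared per-topology storage. A minimal sketch of the underlying mechanism using plain java.nio symlinks; the method and parameter names below are illustrative, and the real implementation goes through _ops and handles pre-existing links and platform quirks.

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

// Illustrative sketch: link <workerRoot>/artifacts -> the shared per-topology
// artifacts dir, so the worker can log there under a fixed relative path.
final class ArtifactsLinkSketch {
    static void createArtifactsLink(Path workerRoot, Path workerArtifactsRoot) throws IOException {
        Path link = workerRoot.resolve("artifacts");
        if (!Files.exists(link)) {
            Files.createSymbolicLink(link, workerArtifactsRoot);
        }
    }
}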
// Excerpts, apparently from the RUNNING-state handler:
if (dynamicState.container.didMainProcessExit()) {
    LOG.warn("SLOT {}: main process has exited", staticState.port);
    return killContainerFor(KillReason.PROCESS_EXIT, dynamicState, staticState);
}
if (dynamicState.container.isMemoryLimitViolated(dynamicState.currentAssignment)) {
    LOG.warn("SLOT {}: violated memory limits", staticState.port);
    return killContainerFor(KillReason.MEMORY_VIOLATION, dynamicState, staticState);
}
LSWorkerHeartbeat hb = dynamicState.container.readHeartbeat();
if (hb == null) {
    LOG.warn("SLOT {}: HB returned as null", staticState.port);
    // ...
}
// ... per-action profiling loop: a stop request runs once its timestamp has passed;
// a start request is issued and tracked in modPending until its matching stop.
boolean isTimeForStop = Time.currentTimeMillis() > action.request.get_time_stamp();
if (isTimeForStop) {
    if (dynamicState.container.runProfiling(action.request, true)) {
        LOG.debug("Stopped {} action finished", action);
        iter.remove();
    }
}
// ...
if (dynamicState.container.runProfiling(action.request, false)) {
    modPending.add(action);
    LOG.debug("Started {} now: {}", action, Time.currentTimeMillis());
}
// ...
if (dynamicState.container.runProfiling(action.request, false)) {
    LOG.debug("Started {} action finished", action);
    iter.remove();
}
// ...
dynamicState.container.processMetrics(staticState.metricsExec, staticState.metricsProcessor);
/**
 * Clean up the container, partially preparing for a restart. By default this deletes all of the temp directories;
 * we are going to get a new worker_id anyway. POST CONDITION: the workerId will be set to null.
 *
 * @throws IOException on any error
 */
public void cleanUpForRestart() throws IOException {
    LOG.info("Cleaning up {}:{}", _supervisorId, _workerId);
    Set<Long> pids = getAllPids();
    String user = getWorkerUser();
    for (Long pid : pids) {
        File path = new File(ConfigUtils.workerPidPath(_conf, _workerId, pid));
        _ops.deleteIfExists(path, user, _workerId);
    }

    //clean up for resource isolation if enabled
    if (_resourceIsolationManager != null) {
        _resourceIsolationManager.releaseResourcesForWorker(_workerId);
    }

    //Always make sure to clean up everything else before the worker directory
    //is removed, since that is what is going to trigger the retry for cleanup
    _ops.deleteIfExists(new File(ConfigUtils.workerHeartbeatsRoot(_conf, _workerId)), user, _workerId);
    _ops.deleteIfExists(new File(ConfigUtils.workerPidsRoot(_conf, _workerId)), user, _workerId);
    _ops.deleteIfExists(new File(ConfigUtils.workerTmpRoot(_conf, _workerId)), user, _workerId);
    _ops.deleteIfExists(new File(ConfigUtils.workerRoot(_conf, _workerId)), user, _workerId);
    deleteSavedWorkerUser();
    _workerId = null;
}
// Excerpts from a similar handler that relaunches the worker in place rather than killing the slot:
if (dynamicState.container.didMainProcessExit()) {
    LOG.warn("SLOT {}: main process has exited", staticState.port);
    return killAndRelaunchContainer(dynamicState, staticState);
}
LSWorkerHeartbeat hb = dynamicState.container.readHeartbeat();
if (hb == null) {
    LOG.warn("SLOT {}: HB returned as null", staticState.port);
    // ...
}
// ... profiling actions, handled as in the RUNNING-state excerpts above:
boolean isTimeForStop = Time.currentTimeMillis() > action.request.get_time_stamp();
if (isTimeForStop) {
    if (dynamicState.container.runProfiling(action.request, true)) {
        LOG.debug("Stopped {} action finished", action);
        iter.remove();
    }
}
// ...
if (dynamicState.container.runProfiling(action.request, false)) {
    modPending.add(action);
    LOG.debug("Started {} now: {}", action, Time.currentTimeMillis());
}
// ...
if (dynamicState.container.runProfiling(action.request, false)) {
    LOG.debug("Started {} action finished", action);
    iter.remove();
}
when(cContainer.readHeartbeat()).thenReturn(chb, chb, chb, chb, chb, chb);
when(cContainer.runProfiling(any(ProfileRequest.class), anyBoolean())).thenReturn(true);
// ...
verify(cContainer).runProfiling(request, false);
assertEquals(expectedPending, nextState.pendingStopProfileActions);
assertEquals(expectedPending, nextState.profileActions);
// ...
verify(cContainer).runProfiling(request, true);
assertEquals(Collections.<TopoProfileAction>emptySet(), nextState.pendingStopProfileActions);
assertEquals(Collections.<TopoProfileAction>emptySet(), nextState.profileActions);
@Override
public void kill() throws IOException {
    ProcessSimulator.killProcess(_workerId);
    _isAlive = false;
    //Make sure the worker is down before we try to shoot any child processes
    super.kill();
}
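In local mode there is no OS process to signal, so LocalContainer first shuts down the in-JVM worker through ProcessSimulator and only then lets super.kill() sweep any real child processes. A sketch of the registry pattern ProcessSimulator embodies; this is a hypothetical shape for illustration, not Storm's exact class.

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

// Illustrative sketch of the in-process registry pattern LocalContainer relies
// on: "kill" looks up the simulated worker by id and runs its shutdown hook.
final class ProcessSimulatorSketch {
    private static final Map<String, Runnable> PROCESSES = new ConcurrentHashMap<>();

    static void registerProcess(String id, Runnable shutdownHook) {
        PROCESSES.put(id, shutdownHook);
    }

    static void killProcess(String id) {
        Runnable hook = PROCESSES.remove(id);
        if (hook != null) {
            hook.run(); // synchronously shut the simulated worker down
        }
    }
}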
assert (dynamicState.currentAssignment != null);
LSWorkerHeartbeat hb = dynamicState.container.readHeartbeat();
if (hb != null) {
    // heartbeat timestamps are in seconds; convert the age to milliseconds
    long hbAgeMs = (Time.currentTimeSecs() - hb.get_time_secs()) * 1000;
    // ...
}
@Override
public boolean areAllProcessesDead() throws IOException {
    return !_isAlive && super.areAllProcessesDead();
}
@Override
public void cleanUp() throws IOException {
    try (Timer.Context t = cleanupDuration.time()) {
        containerMemoryTracker.remove(_port);
        cleanUpForRestart();
    } catch (IOException e) {
        //This may or may not be reported depending on when the process exits
        numCleanupExceptions.mark();
        throw e;
    }
}
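cleanupDuration.time() is the Dropwizard Metrics idiom for timing a scoped block: the returned Timer.Context is closeable, so try-with-resources records the elapsed time even when cleanUpForRestart() throws. A self-contained example of the same idiom; the registry and metric name here are illustrative.

import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;

// Illustrative sketch: time a block with Dropwizard Metrics. The Context
// closes (and records a sample) whether the block returns or throws.
public class TimerSketch {
    private static final MetricRegistry REGISTRY = new MetricRegistry();
    private static final Timer CLEANUP_DURATION = REGISTRY.timer("supervisor:cleanup-duration");

    public static void main(String[] args) {
        try (Timer.Context t = CLEANUP_DURATION.time()) {
            doCleanup(); // stand-in for the real cleanup work
        }
        System.out.println("samples recorded: " + CLEANUP_DURATION.getCount());
    }

    private static void doCleanup() { /* ... */ }
}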
when(cContainer.readHeartbeat()).thenReturn(chb);
when(cContainer.areAllProcessesDead()).thenReturn(false, false, true);
when(containerLauncher.launchContainer(port, nAssignment, state)).thenReturn(nContainer);
LSWorkerHeartbeat nhb = mkWorkerHB(nTopoId, 100, nExecList, Time.currentTimeSecs());
when(nContainer.readHeartbeat()).thenReturn(nhb, nhb);
// ...
verify(cContainer).kill();
verify(localizer).requestDownloadTopologyBlobs(nAssignment, port, cb);
assertSame("pendingDownload not set properly", blobFuture, nextState.pendingDownload);
// ...
verify(cContainer).forceKill();
assertSame("pendingDownload not set properly", blobFuture, nextState.pendingDownload);
assertEquals(nAssignment, nextState.pendingLocalization);
// ...
verify(cContainer).cleanUp();
verify(localizer).releaseSlotFor(cAssignment, port);
assertTrue(Time.currentTimeMillis() > 2000);
/**
 * Clean up the container, partially preparing for a restart.
 * By default this deletes all of the temp directories; we are going
 * to get a new worker_id anyway.
 * POST CONDITION: the workerId will be set to null.
 * @throws IOException on any error
 */
public void cleanUpForRestart() throws IOException {
    LOG.info("Cleaning up {}:{}", _supervisorId, _workerId);
    Set<Long> pids = getAllPids();
    String user = getWorkerUser();
    for (Long pid : pids) {
        File path = new File(ConfigUtils.workerPidPath(_conf, _workerId, pid));
        _ops.deleteIfExists(path, user, _workerId);
    }

    //Always make sure to clean up everything else before the worker directory
    //is removed, since that is what is going to trigger the retry for cleanup
    _ops.deleteIfExists(new File(ConfigUtils.workerHeartbeatsRoot(_conf, _workerId)), user, _workerId);
    _ops.deleteIfExists(new File(ConfigUtils.workerPidsRoot(_conf, _workerId)), user, _workerId);
    _ops.deleteIfExists(new File(ConfigUtils.workerTmpRoot(_conf, _workerId)), user, _workerId);
    _ops.deleteIfExists(new File(ConfigUtils.workerRoot(_conf, _workerId)), user, _workerId);
    deleteSavedWorkerUser();
    _workerId = null;
}
@Override
public void forceKill() throws IOException {
    LOG.info("Force Killing {}:{}", _supervisorId, _workerId);
    Set<Long> pids = getAllPids();
    for (Long pid : pids) {
        forceKill(pid);
    }
}