/** * Write a provisioner, and queue jobs to rebalance workers for all tenants in the system. * * @param provisioner Provisioner to write * @throws IOException */ public void writeProvisioner(Provisioner provisioner) throws IOException { tenantLock.lock(); try { provisionerStore.writeProvisioner(provisioner); // rebalance tenants every time a provisioner registers itself for (Tenant tenant : tenantStore.getAllTenants()) { balanceQueue.add(new Element(tenant.getId())); } } finally { tenantLock.unlock(); } }
// Producer task for the concurrency test: every producer thread blocks on the shared
// barrier so all adds begin at the same instant, then pushes addsPerThread elements whose
// ids (producerNum * addsPerThread + j) are unique across producers, and finally counts
// down the completion latch so the test can join.
// NOTE(review): the result of Throwables.propagate(e) is not re-thrown with 'throw', and an
// InterruptedException from await() is not re-interrupted -- confirm this is acceptable here.
@Override public void run() { try { barrier.await(); } catch (Exception e) { Throwables.propagate(e); } for (int j = 0; j < addsPerThread; j++) { queues.add(queueName, new Element(String.valueOf(producerNum * addsPerThread + j))); } latch.countDown(); } });
/**
 * Delete and reassign workers for provisioners that have not sent a heartbeat since the given
 * timestamp in milliseconds.
 *
 * @param timeoutTs Timestamp in milliseconds to use as a cut off for deleting provisioners.
 * @throws IOException if there was an error reading or deleting provisioners.
 */
public void timeoutProvisioners(long timeoutTs) throws IOException {
  tenantLock.lock();
  try {
    // collect every tenant that was assigned to a deleted provisioner so each gets
    // queued for rebalance exactly once
    Set<String> tenantsToRebalance = Sets.newHashSet();
    for (Provisioner timedOut : provisionerStore.getTimedOutProvisioners(timeoutTs)) {
      String timedOutId = timedOut.getId();
      LOG.error("provisioner {} has not sent a heartbeat in over {} seconds, deleting it...",
                timedOutId, provisionerTimeoutSecs);
      provisionerStore.deleteProvisioner(timedOutId);
      tenantsToRebalance.addAll(timedOut.getAssignedTenants());
    }
    for (String tenantId : tenantsToRebalance) {
      balanceQueue.add(new Element(tenantId));
    }
  } finally {
    tenantLock.unlock();
  }
}
// Queues a worker rebalance for every tenant the provisioner was serving, then removes
// the provisioner itself from the store.
private void deleteProvisioner(Provisioner provisioner) throws IOException {
  for (String assignedTenant : provisioner.getAssignedTenants()) {
    balanceQueue.add(new Element(assignedTenant));
  }
  provisionerStore.deleteProvisioner(provisioner.getId());
}
/** * Sets the status of the given job to {@link ClusterJob.Status#RUNNING} and add it to the queue to be run. * * @param job Job to start. * @param cluster Cluster the job is for. * @throws IOException */ public void startJob(ClusterJob job, Cluster cluster) throws IOException { // TODO: wrap in a transaction LOG.debug("Starting job {} for cluster {}", job.getJobId(), cluster.getId()); job.setJobStatus(ClusterJob.Status.RUNNING); // Note: writing job status as RUNNING, will allow other operations on the job // (like cancel, etc.) to happen in parallel. clusterStore.writeClusterJob(job); callbackQueues.add(cluster.getAccount().getTenantId(), new Element(gson.toJson(new CallbackData(CallbackData.Type.START, cluster, job)))); }
@Test
public void testRemoveAllForOneQueue() {
  QueueGroup group = getQueueGroup(QueueType.PROVISIONER);
  // one element per tenant queue
  group.add("tenant1", new Element("id1", "val"));
  group.add("tenant2", new Element("id2", "val"));
  group.add("tenant3", new Element("id3", "val"));
  group.add("tenant4", new Element("id4", "val"));
  // wiping tenant3's queue must leave the other tenants' queues untouched
  Assert.assertTrue(group.removeAll("tenant3"));
  Assert.assertNull(group.take("tenant3", "consumer"));
  Assert.assertNotNull(group.take("tenant1", "consumer"));
  Assert.assertNotNull(group.take("tenant2", "consumer"));
  Assert.assertNotNull(group.take("tenant4", "consumer"));
}
@Test
public void testInstanceInitializedWithExistingData() throws Exception {
  // Fix: the original called stop() outside any finally, so a failure in
  // waitForQueueNames (or any add) leaked running ZKQueueGroup instances into
  // subsequent tests. Both instances are now always stopped.
  QueueGroup instance1 = new ZKQueueGroup(zkClient, QueueType.PROVISIONER);
  instance1.startAndWait();
  try {
    instance1.add("tenant1", new Element("val1"));
    instance1.add("tenant2", new Element("val2"));
    // a second instance over the same zkClient should discover the queues
    // that already exist
    QueueGroup instance2 = new ZKQueueGroup(zkClient, QueueType.PROVISIONER);
    instance2.startAndWait();
    try {
      waitForQueueNames(Sets.newHashSet("tenant1", "tenant2"), instance2);
    } finally {
      instance2.stop();
    }
  } finally {
    instance1.stop();
  }
}
@Test
public void testRemoveAll() {
  QueueGroup group = getQueueGroup(QueueType.PROVISIONER);
  group.add("tenant1", new Element("id1", "val"));
  group.add("tenant2", new Element("id2", "val"));
  group.add("tenant3", new Element("id3", "val"));
  group.add("tenant4", new Element("id4", "val"));
  // wiping the whole group empties every tenant queue at once
  Assert.assertTrue(group.removeAll());
  Assert.assertFalse(group.takeIterator("consumer").hasNext());
  Assert.assertNull(group.take("tenant1", "consumer"));
  Assert.assertNull(group.take("tenant2", "consumer"));
  Assert.assertNull(group.take("tenant3", "consumer"));
  Assert.assertNull(group.take("tenant4", "consumer"));
}
/**
 * Sets the status of the given job to {@link ClusterJob.Status#FAILED} and the status of the
 * cluster to some given status.
 *
 * @param job Job to fail.
 * @param cluster Cluster to set the status for.
 * @param status Status to set the cluster to.
 * @param message Error message; skipped when null.
 * @throws IOException if the cluster or job could not be written.
 * @throws IllegalAccessException
 */
public void failJobAndSetClusterStatus(ClusterJob job, Cluster cluster, Cluster.Status status,
                                       String message) throws IOException, IllegalAccessException {
  cluster.setStatus(status);
  clusterStore.writeCluster(cluster);
  job.setJobStatus(ClusterJob.Status.FAILED);
  if (message != null) {
    job.setStatusMessage(message);
  }
  clusterStore.writeClusterJob(job);
  serverStats.getFailedClusterStats().incrementStat(job.getClusterAction());
  CallbackData failureData = new CallbackData(CallbackData.Type.FAILURE, cluster, job);
  callbackQueues.add(cluster.getAccount().getTenantId(), new Element(gson.toJson(failureData)));
}
@Test
public void testGetQueueNames() {
  QueueGroup group = getQueueGroup(QueueType.PROVISIONER);
  // every kind of interaction -- add, take, removeAll, or a read -- should make the
  // touched queue's name visible through getQueueNames()
  group.add("tenant1", new Element("id", "val"));
  group.take("tenant2", "consumer.0");
  group.take("tenant3", "consumer.1");
  group.removeAll("tenant4");
  group.getBeingConsumed("tenant5");
  group.getQueued("tenant6");
  ImmutableSet<String> expectedNames =
    ImmutableSet.of("tenant1", "tenant2", "tenant3", "tenant4", "tenant5", "tenant6");
  ImmutableSet<String> actualNames = ImmutableSet.copyOf(group.getQueueNames());
  Assert.assertEquals(expectedNames, actualNames);
}
/** * Sets the status of the given job to {@link ClusterJob.Status#COMPLETE} and the status of the given cluster to * {@link co.cask.coopr.cluster.Cluster.Status#ACTIVE}. * * @param job Job to complete. * @param cluster Cluster the job was for. * @throws IOException */ public void completeJob(ClusterJob job, Cluster cluster) throws IOException, IllegalAccessException { job.setJobStatus(ClusterJob.Status.COMPLETE); clusterStore.writeClusterJob(job); LOG.debug("Job {} is complete", job.getJobId()); // Update cluster status if (job.getClusterAction() == ClusterAction.CLUSTER_DELETE) { cluster.setStatus(Cluster.Status.TERMINATED); } else { cluster.setStatus(Cluster.Status.ACTIVE); } clusterStore.writeCluster(cluster); serverStats.getSuccessfulClusterStats().incrementStat(job.getClusterAction()); if (job.getClusterAction() == ClusterAction.CLUSTER_DELETE) { wipeSensitiveFields(cluster); } callbackQueues.add(cluster.getAccount().getTenantId(), new Element(gson.toJson(new CallbackData(CallbackData.Type.SUCCESS, cluster, job)))); }
@Test public void testGetSize() { QueueGroup queues = getQueueGroup(QueueType.PROVISIONER); queues.add("tenant1", new Element("id1", "val")); queues.add("tenant1", new Element("id2", "val")); queues.add("tenant1", new Element("id3", "val")); queues.add("tenant1", new Element("id4", "val")); queues.add("tenant2", new Element("id1", "val")); queues.add("tenant2", new Element("id2", "val")); queues.add("tenant3", new Element("id1", "val")); Assert.assertEquals(4, queues.size("tenant1")); Assert.assertEquals(2, queues.size("tenant2")); Assert.assertEquals(1, queues.size("tenant3")); // size includes elements being consumed queues.take("tenant1", "consumer"); Assert.assertEquals(4, queues.size("tenant1")); Assert.assertEquals(2, queues.size("tenant2")); Assert.assertEquals(1, queues.size("tenant3")); // size does not include elements that are finished being consumed queues.recordProgress("consumer", "tenant1", "id1", TrackingQueue.ConsumingStatus.FINISHED_SUCCESSFULLY, "result"); Assert.assertEquals(3, queues.size("tenant1")); Assert.assertEquals(2, queues.size("tenant2")); Assert.assertEquals(1, queues.size("tenant3")); }
// Seeds the stores and the provisioner queue with a schedulable CREATE task for
// USER1_ACCOUNT's tenant, then builds a TakeTaskRequest for worker1 to consume it.
private TakeTaskRequest getRequest() throws IOException {
  String tenantId = USER1_ACCOUNT.getTenantId();
  ClusterTask clusterTask = new ClusterTask(
    ProvisionerAction.CREATE, TaskId.fromString("1-1-1"), "node_id", "service",
    ClusterAction.CLUSTER_CREATE, "test", USER1_ACCOUNT);
  clusterStore.writeClusterTask(clusterTask);
  ClusterJob clusterJob = new ClusterJob(JobId.fromString("1-1"), ClusterAction.CLUSTER_CREATE);
  clusterStore.writeClusterJob(clusterJob);
  TaskConfig taskConfig = new TaskConfig(
    NodeProperties.builder().build(),
    Entities.ProviderExample.JOYENT,
    ImmutableMap.<String, NodeProperties>of(),
    new TaskServiceAction("svcA", new ServiceAction("shell", ImmutableMap.<String, String>of())),
    new JsonObject(),
    new JsonObject());
  SchedulableTask schedulableTask = new SchedulableTask(clusterTask, taskConfig);
  provisionerQueues.add(tenantId,
                        new Element(clusterTask.getTaskId(), gson.toJson(schedulableTask)));
  return new TakeTaskRequest("worker1", PROVISIONER_ID, TENANT_ID);
}
@Test
public void testTakeTask() throws Exception {
  // seed the stores and the provisioner queue with one schedulable CREATE task
  String tenantId = USER1_ACCOUNT.getTenantId();
  ClusterTask clusterTask = new ClusterTask(
    ProvisionerAction.CREATE, TaskId.fromString("1-1-1"), "node_id", "service",
    ClusterAction.CLUSTER_CREATE, "test", USER1_ACCOUNT);
  clusterStore.writeClusterTask(clusterTask);
  ClusterJob clusterJob = new ClusterJob(JobId.fromString("1-1"), ClusterAction.CLUSTER_CREATE);
  clusterStore.writeClusterJob(clusterJob);
  TaskConfig taskConfig = new TaskConfig(
    NodeProperties.builder().build(),
    Entities.ProviderExample.JOYENT,
    ImmutableMap.<String, NodeProperties>of(),
    new TaskServiceAction("svcA", new ServiceAction("shell", ImmutableMap.<String, String>of())),
    new JsonObject(),
    new JsonObject());
  SchedulableTask schedulableTask = new SchedulableTask(clusterTask, taskConfig);
  provisionerQueues.add(tenantId,
                        new Element(clusterTask.getTaskId(), gson.toJson(schedulableTask)));
  // a worker taking a task over the internal API should receive the queued task's id
  TakeTaskRequest takeRequest = new TakeTaskRequest("worker1", PROVISIONER_ID, TENANT_ID);
  HttpResponse response = doPostInternalAPI("/tasks/take", gson.toJson(takeRequest));
  assertResponseStatus(response, HttpResponseStatus.OK);
  JsonObject responseJson = getResponseJson(response);
  Assert.assertEquals(clusterTask.getTaskId(), responseJson.get("taskId").getAsString());
}
@Test
public void testOneQueueAddTakeWithQueueName() {
  QueueGroup group = getQueueGroup(QueueType.PROVISIONER);
  String tenantQueue = "tenant1";
  String worker = "worker.0";
  // an element added to a named queue comes back intact when taken from that queue
  group.add(tenantQueue, new Element("id", "val"));
  Element taken = group.take(tenantQueue, worker);
  Assert.assertEquals("id", taken.getId());
  Assert.assertEquals("val", taken.getValue());
}
@Test(timeout = 20000) public void testFalseOnStartStopsJob() throws Exception { String tenantId = "q"; ClusterScheduler clusterScheduler = injector.getInstance(ClusterScheduler.class); clusterQueues.add(tenantId, new Element(cluster.getId(), ClusterAction.CLUSTER_CREATE.name())); clusterScheduler.run(); CallbackScheduler callbackScheduler = injector.getInstance(CallbackScheduler.class); // should be no job in the queue until the start callback runs Assert.assertEquals(0, jobQueues.size(tenantId)); // tell mock callback to return false for onStart callback mockClusterCallback.setReturnOnStart(false); // wait for start callback to finish waitForCallback(callbackScheduler); Assert.assertEquals(CallbackData.Type.START, mockClusterCallback.getReceivedCallbacks().get(0).getType()); // wait for fail callback to finish if (mockClusterCallback.getReceivedCallbacks().size() < 2) { waitForCallback(callbackScheduler); } Assert.assertEquals(CallbackData.Type.FAILURE, mockClusterCallback.getReceivedCallbacks().get(1).getType()); // there also should not be any jobs in the queue Assert.assertEquals(0, jobQueues.size(tenantId)); }
@Test
public void testOneQueueAddTakeWithoutQueueName() {
  QueueGroup group = getQueueGroup(QueueType.PROVISIONER);
  String tenantQueue = "tenant1";
  String worker = "worker.0";
  group.add(tenantQueue, new Element("id", "val"));
  // taking via the group-wide iterator reports which queue the element came from
  GroupElement taken = group.takeIterator(worker).next();
  Assert.assertEquals(tenantQueue, taken.getQueueName());
  Assert.assertEquals("id", taken.getElement().getId());
  Assert.assertEquals("val", taken.getElement().getValue());
}
@Test public void testOneQueueGetQueuedAndConsumed() { QueueGroup queues = getQueueGroup(QueueType.PROVISIONER); String queueName = "tenant1"; queues.add(queueName, new Element("id1", "val")); queues.add(queueName, new Element("id2", "val")); // check being consumed is correct Assert.assertEquals(ImmutableSet.<String>of(), getIds(queues.getBeingConsumed(queueName))); // check queued is correct. Assert.assertEquals(ImmutableSet.of("id1", "id2"), getIds(queues.getQueued(queueName))); // take one element queues.take(queueName, "consumer1"); // check being consumed is correct Assert.assertEquals(ImmutableSet.of("id1"), getIds(queues.getBeingConsumed(queueName))); // check queued is correct. Assert.assertEquals(ImmutableSet.of("id2"), getIds(queues.getQueued(queueName))); // take next element queues.take(queueName, "consumer2"); // check being consumed is correct Assert.assertEquals(ImmutableSet.of("id1", "id2"), getIds(queues.getBeingConsumed(queueName))); // check queued is correct. Assert.assertEquals(ImmutableSet.<String>of(), getIds(queues.getQueued(queueName))); // finish first element queues.recordProgress("consumer1", queueName, "id1", TrackingQueue.ConsumingStatus.FINISHED_SUCCESSFULLY, "result"); // check being consumed is correct Assert.assertEquals(ImmutableSet.of("id2"), getIds(queues.getBeingConsumed(queueName))); // check queued is correct. Assert.assertEquals(ImmutableSet.<String>of(), getIds(queues.getQueued(queueName))); }
@Test public void testQueuedTaskMissingFromStoreIsRemovedFromQueue() { ClusterCleanup clusterCleanup = new ClusterCleanup(clusterStore, clusterService, nodeService, taskService, jobQueues, provisionerQueues, -10, 1, 1); String queueName = account.getTenantId(); ClusterTask task = new ClusterTask(ProvisionerAction.CREATE, TaskId.fromString("3-1-1"), "node1", "service", ClusterAction.CLUSTER_CREATE, "test", account); task.setStatus(ClusterTask.Status.IN_PROGRESS); Cluster cluster = Entities.ClusterExample.createCluster(); TaskConfig taskConfig = TaskConfig.from(cluster, Entities.ClusterExample.NODE1, Entities.ServiceExample.NAMENODE, cluster.getConfig(), ProvisionerAction.START, null); SchedulableTask schedulableTask = new SchedulableTask(task, taskConfig); // add a task to the queue without storing it.x provisionerQueues.add(queueName, new Element(task.getTaskId(), gson.toJson(schedulableTask))); provisionerQueues.takeIterator("0").next(); clusterCleanup.run(); Assert.assertEquals(0, Iterators.size(provisionerQueues.getBeingConsumed(queueName))); }
private void testCallbacks(boolean failJob) throws Exception { ClusterScheduler clusterScheduler = injector.getInstance(ClusterScheduler.class); String tenantId = cluster.getAccount().getTenantId(); clusterQueues.add(tenantId, new Element(cluster.getId(), ClusterAction.CLUSTER_CREATE.name())); clusterScheduler.run(); CallbackScheduler callbackScheduler = injector.getInstance(CallbackScheduler.class); // should be no job in the queue until the start callback runs Assert.assertEquals(0, jobQueues.size(tenantId)); waitForCallback(callbackScheduler); Assert.assertEquals(CallbackData.Type.START, mockClusterCallback.getReceivedCallbacks().get(0).getType()); JobScheduler jobScheduler = injector.getInstance(JobScheduler.class); jobScheduler.run(); // take tasks until there are no more TakeTaskRequest takeRequest = new TakeTaskRequest("consumer1", PROVISIONER_ID, tenantId); SchedulableTask task = TestHelper.takeTask(getInternalServerUrl(), takeRequest); while (task != null) { FinishTaskRequest finishRequest = new FinishTaskRequest("consumer1", PROVISIONER_ID, tenantId, task.getTaskId(), null, null, failJob ? 1 : 0, null, null, null); TestHelper.finishTask(getInternalServerUrl(), finishRequest); jobScheduler.run(); jobScheduler.run(); task = TestHelper.takeTask(getInternalServerUrl(), takeRequest); } jobScheduler.run(); waitForCallback(callbackScheduler); // at this point, the failure callback should have run Assert.assertEquals(failJob ? CallbackData.Type.FAILURE : CallbackData.Type.SUCCESS, mockClusterCallback.getReceivedCallbacks().get(1).getType()); }