[autotest] Add AFE endpoints for shard management This adds endpoints to the AFE to create, delete and list shards. Especially sensitive is the delete operation: All hosts and jobs are claimed back from the shard to the master. Hosts will be rebooted and reverified using repair. This is to ensure no processes from the shards are still running on the DUTs, as they could potentially interfere with upcoming tests. BUG=None DEPLOY=apache TEST=Ran suites Change-Id: Idf887452d62f37ae62771d42b7661c09e72d9a08 Reviewed-on: https://chromium-review.googlesource.com/218294 Reviewed-by: Fang Deng <fdeng@chromium.org> Commit-Queue: Jakob Jülich <jakobjuelich@chromium.org> Tested-by: Jakob Jülich <jakobjuelich@chromium.org>

commit: 82b7d1c262ff19d01ed0bc8c02911d02fbc2a148 [log] [tgz]
author: Jakob Juelich <jakobjuelich@chromium.org> Mon Sep 15 16:10:57 2014 -0700
committer: chrome-internal-fetch <chrome-internal-fetch@google.com> Thu Sep 25 22:49:51 2014 +0000
tree: e88e36e48107f93e659bef3b5e9b790a63e30ae1
parent: 225dd2cccba21b0a9ebd40404a6630704a243d82 [diff] [blame]
diff --git a/frontend/afe/site_rpc_interface.py b/frontend/afe/site_rpc_interface.py
index 765748d..b7a945c 100644
--- a/frontend/afe/site_rpc_interface.py
+++ b/frontend/afe/site_rpc_interface.py

@@ -12,6 +12,7 @@
 import os
 import shutil
 
+from autotest_lib.frontend.afe import models
 from autotest_lib.client.common_lib import error
 from autotest_lib.client.common_lib import global_config
 from autotest_lib.client.common_lib import priorities
@@ -319,3 +320,72 @@
             'hosts': [host.serialize() for host in hosts],
             'jobs': [job.serialize() for job in jobs],
         }
+
+
+def get_shards(**filter_data):
+    """Return the shards queried with filter_data, plus their label names.
+
+    @returns A sequence of nested shard dicts, each with a 'labels' list.
+    """
+    shards = models.Shard.query_objects(filter_data)
+    serialized_shards = rpc_utils.prepare_rows_as_nested_dicts(shards, ())
+    for serialized, shard in zip(serialized_shards, shards):
+        serialized['labels'] = [label.name for label in shard.labels.all()]
+
+    return serialized_shards
+
+
+def add_shard(hostname, label):
+    """Add a shard and start running jobs on it.
+
+    @param hostname: The hostname of the shard to be added; needs to be unique.
+    @param label: Platform label whose jobs will be assigned to the shard.
+
+    @returns The id of the newly created shard.
+    @raises model_logic.ValidationError if a shard with the given hostname
+            already exists.
+    """
+    shard = models.Shard.add_object(hostname=hostname)
+    shard.labels.add(models.Label.smart_get(label))
+    return shard.id
+
+
+def delete_shard(hostname):
+    """Delete a shard and reclaim all resources from it.
+
+    This claims back all assigned hosts from the shard. To ensure all DUTs are
+    in a sane state, a Repair task is scheduled for them. This reboots the DUTs
+    and therefore clears all running processes that might be left.
+
+    The shard_id of jobs of that shard will be set to None.
+
+    The status of jobs that haven't been reported as finished yet will be
+    lost. The master scheduler will pick up the jobs and execute them.
+
+    @param hostname: Hostname of the shard to delete.
+    """
+    shard = rpc_utils.retrieve_shard(shard_hostname=hostname)
+
+    # TODO(beeps): Power off shard
+
+    # For ChromeOS hosts, repair reboots the DUT.
+    # Repair will escalate through multiple repair steps and will verify the
+    # success after each of them. In any case, it always runs at least the
+    # first one, which includes a reboot.
+    # After a reboot we can be sure no processes from prior tests that were run
+    # by a shard are still running on the DUT.
+    # Important: Don't just set the status to Repair Failed, as that would run
+    # Verify first, before doing any repair measures. Verify would probably
+    # succeed, so this wouldn't change anything on the DUT.
+    for host in models.Host.objects.filter(shard=shard):
+            models.SpecialTask.objects.create(
+                    task=models.SpecialTask.Task.REPAIR,
+                    host=host,
+                    requested_by=models.User.current_user())
+    models.Host.objects.filter(shard=shard).update(shard=None)
+
+    models.Job.objects.filter(shard=shard).update(shard=None)
+
+    shard.labels.clear()
+
+    shard.delete()
commit	82b7d1c262ff19d01ed0bc8c02911d02fbc2a148	[log] [tgz]
author	Jakob Juelich <jakobjuelich@chromium.org>	Mon Sep 15 16:10:57 2014 -0700
committer	chrome-internal-fetch <chrome-internal-fetch@google.com>	Thu Sep 25 22:49:51 2014 +0000
tree	e88e36e48107f93e659bef3b5e9b790a63e30ae1
parent	225dd2cccba21b0a9ebd40404a6630704a243d82 [diff] [blame]