Quarantine android bots instead of shutting off device
Leaving the device powered on should make it easier to diagnose and
remotely fix the devices/hosts.
This only works because of the infradata config change in
https://chrome-internal-review.googlesource.com/c/infradata/config/+/535542
Bug: skia:
Change-Id: Idf11a09da6c7367212a0ac0b0b959cbf49fed85b
Reviewed-on: https://skia-review.googlesource.com/89181
Reviewed-by: Ravi Mistry <rmistry@google.com>
Commit-Queue: Ravi Mistry <rmistry@google.com>
diff --git a/infra/bots/recipe_modules/flavor/examples/full.expected/failed_infra_step.json b/infra/bots/recipe_modules/flavor/examples/full.expected/failed_infra_step.json
index 55ffab7..a9c04d9 100644
--- a/infra/bots/recipe_modules/flavor/examples/full.expected/failed_infra_step.json
+++ b/infra/bots/recipe_modules/flavor/examples/full.expected/failed_infra_step.json
@@ -1023,20 +1023,17 @@
},
{
"cmd": [
- "/usr/bin/adb.1.0.35",
- "shell",
- "reboot",
- "-p"
+ "python",
+ "-u",
+ "RECIPE_MODULE[recipe_engine::file]/resources/fileutil.py",
+ "--json-output",
+ "/path/to/tmp/json",
+ "copy",
+ " ",
+ "~/force_quarantine"
],
- "cwd": "[START_DIR]/skia",
- "env": {
- "BUILDTYPE": "Debug",
- "CHROME_HEADLESS": "1",
- "PATH": "<PATH>:RECIPE_PACKAGE_REPO[depot_tools]",
- "SKIA_OUT": "[START_DIR]/out"
- },
"infra_step": true,
- "name": "shut down device to quarantine bot"
+ "name": "Quarantining Bot"
},
{
"cmd": [
diff --git a/infra/bots/recipe_modules/flavor/examples/full.expected/retry_adb_command_retries_exhausted.json b/infra/bots/recipe_modules/flavor/examples/full.expected/retry_adb_command_retries_exhausted.json
index 9146bfa..3fd7669 100644
--- a/infra/bots/recipe_modules/flavor/examples/full.expected/retry_adb_command_retries_exhausted.json
+++ b/infra/bots/recipe_modules/flavor/examples/full.expected/retry_adb_command_retries_exhausted.json
@@ -296,20 +296,17 @@
},
{
"cmd": [
- "/usr/bin/adb.1.0.35",
- "shell",
- "reboot",
- "-p"
+ "python",
+ "-u",
+ "RECIPE_MODULE[recipe_engine::file]/resources/fileutil.py",
+ "--json-output",
+ "/path/to/tmp/json",
+ "copy",
+ " ",
+ "~/force_quarantine"
],
- "cwd": "[START_DIR]/skia",
- "env": {
- "BUILDTYPE": "Debug",
- "CHROME_HEADLESS": "1",
- "PATH": "<PATH>:RECIPE_PACKAGE_REPO[depot_tools]",
- "SKIA_OUT": "[START_DIR]/out"
- },
"infra_step": true,
- "name": "shut down device to quarantine bot"
+ "name": "Quarantining Bot"
},
{
"cmd": [
diff --git a/infra/bots/recipe_modules/flavor/gn_android_flavor.py b/infra/bots/recipe_modules/flavor/gn_android_flavor.py
index be1f7e9..01fcc30 100644
--- a/infra/bots/recipe_modules/flavor/gn_android_flavor.py
+++ b/infra/bots/recipe_modules/flavor/gn_android_flavor.py
@@ -361,18 +361,15 @@
timeout=300,
abort_on_failure=False)
- # Only shutdown the device and quarantine the bot if the first failed step
+ # Only quarantine the bot if the first failed step
# is an infra step. If, instead, we did this for any infra failures, we
- # would shutdown too much. For example, if a Nexus 10 died during dm
+ # would do this too much. For example, if a Nexus 10 died during dm
# and the following pull step would also fail "device not found" - causing
# us to run the shutdown command when the device was probably not in a
# broken state; it was just rebooting.
- # Avoid doing this to machines in the Golo because they are harder to fix
- # than local devices.
if (self.m.run.failed_steps and
- isinstance(self.m.run.failed_steps[0], recipe_api.InfraFailure) and
- self.m.vars.builder_cfg.get('model') not in self._golo_devices):
- self._adb('shut down device to quarantine bot', 'shell', 'reboot', '-p')
+ isinstance(self.m.run.failed_steps[0], recipe_api.InfraFailure)):
+ self.m.file.write_text('Quarantining Bot', '~/force_quarantine', ' ')
if self._ever_ran_adb:
self._adb('kill adb server', 'kill-server')
diff --git a/infra/bots/recipes/perf.expected/cpu_scale_failed.json b/infra/bots/recipes/perf.expected/cpu_scale_failed.json
index ffdc7de..0350673 100644
--- a/infra/bots/recipes/perf.expected/cpu_scale_failed.json
+++ b/infra/bots/recipes/perf.expected/cpu_scale_failed.json
@@ -851,20 +851,17 @@
},
{
"cmd": [
- "/usr/bin/adb.1.0.35",
- "shell",
- "reboot",
- "-p"
+ "python",
+ "-u",
+ "RECIPE_MODULE[recipe_engine::file]/resources/fileutil.py",
+ "--json-output",
+ "/path/to/tmp/json",
+ "copy",
+ " ",
+ "~/force_quarantine"
],
- "cwd": "[START_DIR]/skia",
- "env": {
- "BUILDTYPE": "Debug",
- "CHROME_HEADLESS": "1",
- "PATH": "<PATH>:RECIPE_PACKAGE_REPO[depot_tools]",
- "SKIA_OUT": "[START_DIR]/out"
- },
"infra_step": true,
- "name": "shut down device to quarantine bot"
+ "name": "Quarantining Bot"
},
{
"cmd": [
diff --git a/infra/bots/recipes/perf.expected/cpu_scale_failed_golo.json b/infra/bots/recipes/perf.expected/cpu_scale_failed_golo.json
index 992125d..5843c40 100644
--- a/infra/bots/recipes/perf.expected/cpu_scale_failed_golo.json
+++ b/infra/bots/recipes/perf.expected/cpu_scale_failed_golo.json
@@ -919,6 +919,20 @@
},
{
"cmd": [
+ "python",
+ "-u",
+ "RECIPE_MODULE[recipe_engine::file]/resources/fileutil.py",
+ "--json-output",
+ "/path/to/tmp/json",
+ "copy",
+ " ",
+ "~/force_quarantine"
+ ],
+ "infra_step": true,
+ "name": "Quarantining Bot"
+ },
+ {
+ "cmd": [
"/opt/infra-android/tools/adb",
"kill-server"
],
diff --git a/infra/bots/recipes/perf.expected/failed_push.json b/infra/bots/recipes/perf.expected/failed_push.json
index 9dd0c76..c68e9b5 100644
--- a/infra/bots/recipes/perf.expected/failed_push.json
+++ b/infra/bots/recipes/perf.expected/failed_push.json
@@ -91,20 +91,17 @@
},
{
"cmd": [
- "/usr/bin/adb.1.0.35",
- "shell",
- "reboot",
- "-p"
+ "python",
+ "-u",
+ "RECIPE_MODULE[recipe_engine::file]/resources/fileutil.py",
+ "--json-output",
+ "/path/to/tmp/json",
+ "copy",
+ " ",
+ "~/force_quarantine"
],
- "cwd": "[START_DIR]/skia",
- "env": {
- "BUILDTYPE": "Debug",
- "CHROME_HEADLESS": "1",
- "PATH": "<PATH>:RECIPE_PACKAGE_REPO[depot_tools]",
- "SKIA_OUT": "[START_DIR]/out"
- },
"infra_step": true,
- "name": "shut down device to quarantine bot"
+ "name": "Quarantining Bot"
},
{
"cmd": [
diff --git a/infra/bots/recipes/test.expected/failed_push.json b/infra/bots/recipes/test.expected/failed_push.json
index 9dd0c76..c68e9b5 100644
--- a/infra/bots/recipes/test.expected/failed_push.json
+++ b/infra/bots/recipes/test.expected/failed_push.json
@@ -91,20 +91,17 @@
},
{
"cmd": [
- "/usr/bin/adb.1.0.35",
- "shell",
- "reboot",
- "-p"
+ "python",
+ "-u",
+ "RECIPE_MODULE[recipe_engine::file]/resources/fileutil.py",
+ "--json-output",
+ "/path/to/tmp/json",
+ "copy",
+ " ",
+ "~/force_quarantine"
],
- "cwd": "[START_DIR]/skia",
- "env": {
- "BUILDTYPE": "Debug",
- "CHROME_HEADLESS": "1",
- "PATH": "<PATH>:RECIPE_PACKAGE_REPO[depot_tools]",
- "SKIA_OUT": "[START_DIR]/out"
- },
"infra_step": true,
- "name": "shut down device to quarantine bot"
+ "name": "Quarantining Bot"
},
{
"cmd": [