Dan Shi | f4cb4da | 2015-04-23 15:55:29 -0700 | [diff] [blame] | 1 | #!/usr/bin/env python |
| 2 | # Copyright 2015 The Chromium Authors. All rights reserved. |
| 3 | # Use of this source code is governed by a BSD-style license that can be |
| 4 | # found in the LICENSE file. |
| 5 | |
"""Clean up orphaned containers.

If an autoserv process dies before its SIGTERM handler can run, the container
it used to run the test is left behind (orphaned). Orphaned containers add
overhead to the drone. This script cleans up such containers.

This module also checks whether the test job associated with a container has
finished. If so, it kills the autoserv process for the test job and destroys
the container. To avoid a race condition, this only applies to jobs that
finished at least 1 hour ago.

"""
| 18 | |
| 19 | import argparse |
| 20 | import datetime |
| 21 | import logging |
| 22 | import os |
Dan Shi | f4cb4da | 2015-04-23 15:55:29 -0700 | [diff] [blame] | 23 | import signal |
Dan Shi | f4cb4da | 2015-04-23 15:55:29 -0700 | [diff] [blame] | 24 | |
| 25 | import common |
| 26 | from autotest_lib.client.common_lib import logging_config |
| 27 | from autotest_lib.client.common_lib import time_utils |
| 28 | from autotest_lib.client.common_lib import utils |
Dan Shi | f4cb4da | 2015-04-23 15:55:29 -0700 | [diff] [blame] | 29 | from autotest_lib.server.cros.dynamic_suite import frontend_wrappers |
| 30 | from autotest_lib.site_utils import lxc |
| 31 | |
| 32 | |
# AFE client used for all host queue entry lookups in this script. It is
# created once, when the module is imported.
# NOTE(review): timeout_min/delay_sec presumably bound how long and how often
# failed RPCs are retried -- confirm against frontend_wrappers.RetryingAFE.
AFE = frontend_wrappers.RetryingAFE(timeout_min=0.1, delay_sec=10)
# The cutoff time used to declare that a test job is completed and its
# container is orphaned: jobs that finished after this time are left alone.
# This avoids a race in which the scheduler aborts a job while autoserv is
# still in the process of destroying the container it used.
# The value is computed once at import time (one hour before "now"), so it is
# fixed for the whole run of the script.
FINISHED_JOB_CUTOFF_TIME = datetime.datetime.now() - datetime.timedelta(hours=1)
| 38 | |
def is_container_orphaned(container):
    """Decide whether a container has lost its owner.

    A container is reported as orphaned when either of these holds:
    1. The autoserv process recorded in the container ID is no longer alive.
    2. Every host queue entry of the test job is complete and inactive, and
       none of them has a finish time later than FINISHED_JOB_CUTOFF_TIME.

    A container without an ID, a job without host queue entries, and a failed
    AFE lookup are all reported as not orphaned.

    @param container: A Container object.

    @return: True if the container is orphaned.

    """
    name = container.name
    logging.debug('Checking if container is orphaned: %s', name)

    container_id = container.id
    if container_id is None:
        logging.debug('Container %s is not created for test.', name)
        return False

    job_id, pid = container_id.job_id, container_id.pid

    # Condition 1: the owning process is known and is gone.
    if pid:
        if not utils.pid_is_alive(pid):
            logging.debug(
                    'Process with PID %s is not alive, container %s is orphaned.',
                    pid, name)
            return True

    # Condition 2: look at the job's host queue entries.
    try:
        entries = AFE.get_host_queue_entries(job_id=job_id)
    except Exception as e:
        # Be conservative: if the job state is unknown, keep the container.
        logging.error('Failed to get hqe for job %s. Error: %s.', job_id, e)
        return False

    if not entries:
        # The job has not run yet.
        return False

    for entry in entries:
        still_running = entry.active or not entry.complete
        if still_running:
            logging.debug(
                    'Test job %s is not completed yet, container %s is not '
                    'orphaned.', job_id, name)
            return False

        finished_on = entry.finished_on
        if finished_on:
            finished_at = time_utils.time_string_to_datetime(finished_on)
            if finished_at > FINISHED_JOB_CUTOFF_TIME:
                logging.debug(
                        'Test job %s was completed less than an hour ago.',
                        job_id)
                return False

    logging.debug('Test job %s was completed, container %s is orphaned.',
                  job_id, name)
    return True
| 88 | |
| 89 | |
def cleanup(container, options):
    """Kill the owning autoserv process and destroy an orphaned container.

    Nothing is changed unless options.execute is set; in dry-run mode the
    intended action is only logged and False is returned.

    @param container: A Container object to be cleaned up.
    @param options: Options to do cleanup.

    @return: True if cleanup is successful. False otherwise.

    """
    if not options.execute:
        logging.info('dryrun: Cleanup container %s', container.name)
        return False

    try:
        # Callers only get here after is_container_orphaned() accepted the
        # container, so it may be assumed to have a valid ID.
        autoserv_pid = container.id.pid

        # Stop the autoserv process first, if it is still around.
        if autoserv_pid and utils.pid_is_alive(autoserv_pid):
            logging.info('Stopping process %s...', autoserv_pid)
            utils.nuke_pid(int(autoserv_pid), (signal.SIGKILL,))

        # Then remove the container itself.
        logging.info('Destroying container %s...', container.name)
        container.destroy()
    except Exception as e:
        logging.error('Failed to cleanup container %s. Error: %s',
                      container.name, e)
        return False
    return True
| 120 | |
| 121 | |
def parse_options():
    """Build the command line parser and parse the process arguments.

    Supported flags are -v/--verbose, -x/--execute and -l/--logfile.

    @return: Options to run the script.
    """
    execute_help = ('Execute the actions to kill autoserv processes and '
                    'destroy containers. Default is False to do dry run')

    parser = argparse.ArgumentParser()
    parser.add_argument(
            '-v', '--verbose',
            action='store_true',
            default=False,
            help='Print out ALL entries.')
    parser.add_argument(
            '-x', '--execute',
            action='store_true',
            default=False,
            help=execute_help)
    # TODO(dshi): Consider to adopt the scheduler log model:
    # 1. Create one log per run.
    # 2. Create a symlink to the latest log.
    parser.add_argument(
            '-l', '--logfile',
            type=str,
            default=None,
            help='Path to the log file to save logs.')

    options = parser.parse_args()
    return options
| 143 | |
| 144 | |
def main(options):
    """Main script.

    Walks every container in the container bucket, cleans up the ones that
    is_container_orphaned() reports as orphaned, and logs a summary of the
    results.

    In dry-run mode (options.execute is False) cleanup() never destroys
    anything and always returns False, so the summary reports the number of
    orphaned containers found instead of successes and failures.

    @param options: Options to run the script.
    """
    config = logging_config.LoggingConfig()
    if options.logfile:
        config.add_file_handler(
                file_path=os.path.abspath(options.logfile),
                level=logging.DEBUG if options.verbose else logging.INFO)

    bucket = lxc.ContainerBucket()
    # Empty line to visually separate consecutive runs in a shared log file.
    logging.info('')
    logging.info('Cleaning container bucket %s', bucket.container_path)
    success_count = 0
    failure_count = 0
    for container in bucket.get_all().values():
        if not is_container_orphaned(container):
            continue
        if cleanup(container, options):
            success_count += 1
        else:
            failure_count += 1

    # Report the tallies; previously they were computed but never surfaced.
    if options.execute:
        logging.info('Cleanup finished. %d container(s) cleaned up, '
                     '%d failed.', success_count, failure_count)
    else:
        logging.info('Cleanup finished. dryrun: %d orphaned container(s) '
                     'found.', success_count + failure_count)
Dan Shi | f4cb4da | 2015-04-23 15:55:29 -0700 | [diff] [blame] | 168 | |
| 169 | |
if __name__ == '__main__':
    # Script entry point: parse the command line, then run the cleanup with
    # the parsed options.
    options = parse_options()
    main(options)