Alex Miller | b0b2d25 | 2014-06-25 17:17:01 -0700 | [diff] [blame] | 1 | #!/usr/bin/python |
| 2 | |
Don Garrett | 4003636 | 2014-12-08 15:52:44 -0800 | [diff] [blame] | 3 | from __future__ import print_function |
| 4 | |
| 5 | import argparse |
Shuqian Zhao | 9febd45 | 2017-01-31 15:36:40 -0800 | [diff] [blame] | 6 | import logging |
Shuqian Zhao | fad5067 | 2017-02-02 16:46:03 -0800 | [diff] [blame] | 7 | import multiprocessing |
J. Richard Barnette | 868cf64 | 2014-07-21 16:34:38 -0700 | [diff] [blame] | 8 | import subprocess |
| 9 | import sys |
Alex Miller | b0b2d25 | 2014-06-25 17:17:01 -0700 | [diff] [blame] | 10 | |
Don Garrett | 4003636 | 2014-12-08 15:52:44 -0800 | [diff] [blame] | 11 | import common |
Don Garrett | 5071346 | 2015-01-07 18:04:05 -0800 | [diff] [blame] | 12 | from autotest_lib.server import frontend |
Alex Miller | b0b2d25 | 2014-06-25 17:17:01 -0700 | [diff] [blame] | 13 | from autotest_lib.site_utils.lib import infra |
| 14 | |
# Path of the deploy script that is invoked on every server being updated.
DEPLOY_SERVER_LOCAL = '/usr/local/autotest/site_utils/deploy_server_local.py'
# Size of the thread pool used when updating servers in parallel.
POOL_SIZE = 124
Dan Shi | fb12d14 | 2015-06-09 23:30:11 -0700 | [diff] [blame] | 17 | |
Alex Miller | b0b2d25 | 2014-06-25 17:17:01 -0700 | [diff] [blame] | 18 | |
Aviv Keshet | 92281f7 | 2017-10-24 16:01:10 -0700 | [diff] [blame] | 19 | def _filter_servers(servers): |
| 20 | """Filter a set of servers to those that should be deployed to.""" |
| 21 | non_push_roles = {'devserver', 'crash_server', 'reserve'} |
| 22 | for s in servers: |
| 23 | if s['status'] == 'repair_required': |
| 24 | continue |
Aviv Keshet | 9ec24b5 | 2017-10-31 11:02:10 -0700 | [diff] [blame] | 25 | if s['status'] == 'backup': |
| 26 | continue |
Aviv Keshet | 92281f7 | 2017-10-24 16:01:10 -0700 | [diff] [blame] | 27 | if set(s['roles']) & non_push_roles: |
| 28 | continue |
| 29 | yield s |
| 30 | |
| 31 | |
def discover_servers(afe):
    """Look up the production servers that should receive an update.

    Asks the given AFE for its server list via the 'get_servers' RPC and
    keeps only the entries accepted by _filter_servers().

    @param afe: Server to contact with RPC requests.

    @returns: A set of server hostnames.
    """
    # Shape of one entry returned by the RPC, for example:
    # {
    #     'hostname': 'server1',
    #     'status': 'backup',
    #     'roles': ['drone', 'scheduler'],
    #     'attributes': {'max_processes': 300}
    # }
    all_servers = frontend.AFE(server=afe).run('get_servers')
    return set(entry['hostname'] for entry in _filter_servers(all_servers))
Alex Miller | b0b2d25 | 2014-06-25 17:17:01 -0700 | [diff] [blame] | 54 | |
J. Richard Barnette | f533b18 | 2014-09-04 18:24:42 -0700 | [diff] [blame] | 55 | |
Aviv Keshet | 92281f7 | 2017-10-24 16:01:10 -0700 | [diff] [blame] | 56 | def _parse_arguments(args): |
Don Garrett | 4003636 | 2014-12-08 15:52:44 -0800 | [diff] [blame] | 57 | """Parse command line arguments. |
| 58 | |
| 59 | @param args: The command line arguments to parse. (usually sys.argv[1:]) |
| 60 | |
Aviv Keshet | 92281f7 | 2017-10-24 16:01:10 -0700 | [diff] [blame] | 61 | @returns A tuple of (argparse.Namespace populated with argument values, |
| 62 | list of extra args to pass to deploy_server_local). |
Don Garrett | 4003636 | 2014-12-08 15:52:44 -0800 | [diff] [blame] | 63 | """ |
| 64 | parser = argparse.ArgumentParser( |
Don Garrett | 3f2b660 | 2014-12-16 18:19:16 -0800 | [diff] [blame] | 65 | formatter_class=argparse.RawDescriptionHelpFormatter, |
Aviv Keshet | 92281f7 | 2017-10-24 16:01:10 -0700 | [diff] [blame] | 66 | description='Run deploy_server_local on a bunch of servers. Extra ' |
| 67 | 'arguments will be passed through.', |
Don Garrett | 3f2b660 | 2014-12-16 18:19:16 -0800 | [diff] [blame] | 68 | epilog=('Update all servers:\n' |
Aviv Keshet | 92281f7 | 2017-10-24 16:01:10 -0700 | [diff] [blame] | 69 | ' deploy_server.py -x --afe cautotest\n' |
Don Garrett | 3f2b660 | 2014-12-16 18:19:16 -0800 | [diff] [blame] | 70 | '\n' |
| 71 | 'Update one server:\n' |
Aviv Keshet | 92281f7 | 2017-10-24 16:01:10 -0700 | [diff] [blame] | 72 | ' deploy_server.py <server> -x\n' |
| 73 | )) |
Don Garrett | 3f2b660 | 2014-12-16 18:19:16 -0800 | [diff] [blame] | 74 | |
Aviv Keshet | 92281f7 | 2017-10-24 16:01:10 -0700 | [diff] [blame] | 75 | parser.add_argument('-x', action='store_true', |
| 76 | help='Actually perform actions. If not supplied, ' |
| 77 | 'script does nothing.') |
| 78 | parser.add_argument('--afe', |
Shuqian Zhao | 6cf933b | 2017-09-27 15:07:56 -0700 | [diff] [blame] | 79 | help='The AFE server used to get servers from server_db,' |
Aviv Keshet | 92281f7 | 2017-10-24 16:01:10 -0700 | [diff] [blame] | 80 | 'e.g, cautotest. Used only if no SERVER specified.') |
| 81 | parser.add_argument('servers', action='store', nargs='*', metavar='SERVER') |
Don Garrett | 4003636 | 2014-12-08 15:52:44 -0800 | [diff] [blame] | 82 | |
Aviv Keshet | 92281f7 | 2017-10-24 16:01:10 -0700 | [diff] [blame] | 83 | return parser.parse_known_args() |
J. Richard Barnette | f533b18 | 2014-09-04 18:24:42 -0700 | [diff] [blame] | 84 | |
| 85 | |
def _update_server(server, extra_args=None):
    """Run deploy_server_local for given server.

    @param server: hostname to update.
    @param extra_args: optional list of args to be passed in to
                       deploy_server_local. None means no extra args.

    @return: A tuple of (server, success, output), where:
             server: Name of the server.
             success: True if update succeeds, False otherwise.
             output: A string of the deploy_server_local script output
                     including any errors.
    """
    # A None default replaces the former mutable `[]` default, which is one
    # object shared by every call.
    cmd = '%s %s' % (DEPLOY_SERVER_LOCAL, ' '.join(extra_args or ()))
    try:
        output = infra.execute_command(server, cmd)
    except subprocess.CalledProcessError as e:
        # A failing command is reported through the return value rather than
        # raised, so the caller can collect the output of every failure.
        return server, False, e.output
    return server, True, output
| 108 | |
def _update_in_parallel(servers, extra_args=None):
    """Update a group of servers in parallel.

    @param servers: A list of servers to update.
    @param extra_args: optional list of args to be passed in to
                       deploy_server_local for every server.

    @returns A dictionary from server names that failed to the output
             of the update script.
    """
    # Imported explicitly: a bare `import multiprocessing` does not guarantee
    # that the `multiprocessing.pool` submodule has been loaded.
    from multiprocessing.pool import ThreadPool

    servers = list(servers)
    if not servers:
        return {}
    # Always hand a real list to _update_server, whatever default it declares.
    extra_args = list(extra_args or [])

    # Records every server whose update has returned. The workers are threads
    # of this process, so a plain list is sufficient (list.append is atomic
    # in CPython).
    finished_servers = []

    def do_server(server):
        """Update one server and record that it is done, even on error."""
        try:
            return _update_server(server, extra_args)
        finally:
            finished_servers.append(server)

    # The update actions run in parallel. If any update failed, we should wait
    # for other running updates being finished. Abort in the middle of an
    # update may leave the server in a bad state.
    pool = ThreadPool(min(POOL_SIZE, len(servers)))
    try:
        results = pool.map_async(do_server, servers)
        pool.close()

        # Track the updating progress for current group of servers.
        while not results.ready():
            incomplete_servers = sorted(set(servers) - set(finished_servers))
            print('Not finished yet. %d servers in this group. '
                  '%d servers are still running:\n%s\n' %
                  (len(servers), len(incomplete_servers), incomplete_servers))
            # Check the progress every 20s
            results.wait(20)

        # After update finished, parse the result.
        failures = {}
        for server, success, output in results.get():
            if not success:
                failures[server] = output

        return failures

    finally:
        pool.terminate()
        pool.join()
Dan Shi | fb12d14 | 2015-06-09 23:30:11 -0700 | [diff] [blame] | 154 | |
| 155 | |
def main(args):
    """Entry point to deploy_server.py

    Resolves the servers to update (given explicitly, or discovered through
    --afe), then either reports what would be done (no -x) or runs the update
    on all of them in parallel and reports the failures.

    @param args: The command line arguments to parse. (usually sys.argv)

    @returns The system exit code.
    """
    options, extra_args = _parse_arguments(args[1:])
    # Remove all the handlers from the root logger to get rid of the handlers
    # introduced by the import packages.
    logging.getLogger().handlers = []
    logging.basicConfig(level=logging.DEBUG)

    servers = options.servers
    if not servers:
        if not options.afe:
            print('No servers or afe specified. Aborting')
            return 1
        print('Retrieving servers from %s..' % options.afe)
        servers = discover_servers(options.afe)
        print('Retrieved servers were: %s' % servers)

    if not options.x:
        print('Doing nothing because -x was not supplied.')
        # Report the resolved server set, not options.servers, which is empty
        # whenever the servers were discovered through --afe.
        print('servers: %s' % servers)
        print('extra args for deploy_server_local: %s' % extra_args)
        return 0

    failures = _update_in_parallel(servers, extra_args)

    if not failures:
        print('Completed all updates successfully.')
        return 0

    # sorted()/items() work on both Python 2 and 3 and give a stable order.
    failed_servers = sorted(failures)

    print('The following servers failed, with the following output:')
    for s in failed_servers:
        print('======== %s ========' % s)
        print(failures[s])

    print('The servers that failed were:')
    print('\n'.join(failed_servers))
    print('\n\nTo retry on failed servers, run the following command:')
    retry_cmd = [args[0], '-x'] + failed_servers + extra_args
    print(' '.join(retry_cmd))
    return 1
| 201 | |
J. Richard Barnette | f533b18 | 2014-09-04 18:24:42 -0700 | [diff] [blame] | 202 | |
| 203 | |
if __name__ == '__main__':
    # Use the value returned by main() as the process exit code.
    exit_status = main(sys.argv)
    sys.exit(exit_status)