Mike Frysinger | d03e6b5 | 2019-08-03 12:49:01 -0400 | [diff] [blame] | 1 | #!/usr/bin/python2 |
Alex Miller | b0b2d25 | 2014-06-25 17:17:01 -0700 | [diff] [blame] | 2 | |
Don Garrett | 4003636 | 2014-12-08 15:52:44 -0800 | [diff] [blame] | 3 | from __future__ import print_function |
| 4 | |
| 5 | import argparse |
Shuqian Zhao | 9febd45 | 2017-01-31 15:36:40 -0800 | [diff] [blame] | 6 | import logging |
Shuqian Zhao | fad5067 | 2017-02-02 16:46:03 -0800 | [diff] [blame] | 7 | import multiprocessing |
J. Richard Barnette | 868cf64 | 2014-07-21 16:34:38 -0700 | [diff] [blame] | 8 | import subprocess |
| 9 | import sys |
Ningning Xia | 9c0bcd2 | 2018-05-01 15:40:58 -0700 | [diff] [blame] | 10 | from multiprocessing.pool import ThreadPool |
Alex Miller | b0b2d25 | 2014-06-25 17:17:01 -0700 | [diff] [blame] | 11 | |
Don Garrett | 4003636 | 2014-12-08 15:52:44 -0800 | [diff] [blame] | 12 | import common |
Don Garrett | 5071346 | 2015-01-07 18:04:05 -0800 | [diff] [blame] | 13 | from autotest_lib.server import frontend |
Alex Miller | b0b2d25 | 2014-06-25 17:17:01 -0700 | [diff] [blame] | 14 | from autotest_lib.site_utils.lib import infra |
| 15 | |
# Path of the per-server deploy script; it is the command prefix that
# _update_server hands to infra.execute_command for each target host.
DEPLOY_SERVER_LOCAL = '/usr/local/autotest/site_utils/deploy_server_local.py'
# Number of worker threads in the pool that runs server updates in parallel.
POOL_SIZE = 124
Dan Shi | fb12d14 | 2015-06-09 23:30:11 -0700 | [diff] [blame] | 18 | |
Alex Miller | b0b2d25 | 2014-06-25 17:17:01 -0700 | [diff] [blame] | 19 | |
Aviv Keshet | 92281f7 | 2017-10-24 16:01:10 -0700 | [diff] [blame] | 20 | def _filter_servers(servers): |
| 21 | """Filter a set of servers to those that should be deployed to.""" |
| 22 | non_push_roles = {'devserver', 'crash_server', 'reserve'} |
| 23 | for s in servers: |
| 24 | if s['status'] == 'repair_required': |
| 25 | continue |
Aviv Keshet | 9ec24b5 | 2017-10-31 11:02:10 -0700 | [diff] [blame] | 26 | if s['status'] == 'backup': |
| 27 | continue |
Aviv Keshet | 92281f7 | 2017-10-24 16:01:10 -0700 | [diff] [blame] | 28 | if set(s['roles']) & non_push_roles: |
| 29 | continue |
| 30 | yield s |
| 31 | |
| 32 | |
def discover_servers(afe):
    """Discover the in-production servers to update.

    Asks the given AFE for its server list via the 'get_servers' RPC and
    keeps only the entries accepted by _filter_servers (i.e. drops servers
    that need repair, are backups, or have a role that is not pushed to).

    @param afe: Server to contact with RPC requests.

    @returns: A set of server hostnames.
    """
    # Each entry returned by 'get_servers' looks like:
    # {
    #     'hostname': 'server1',
    #     'status': 'backup',
    #     'roles': ['drone', 'scheduler'],
    #     'attributes': {'max_processes': 300}
    # }
    afe_client = frontend.AFE(server=afe)
    all_servers = afe_client.run('get_servers')
    deployable = _filter_servers(all_servers)
    return set(entry['hostname'] for entry in deployable)
Alex Miller | b0b2d25 | 2014-06-25 17:17:01 -0700 | [diff] [blame] | 55 | |
J. Richard Barnette | f533b18 | 2014-09-04 18:24:42 -0700 | [diff] [blame] | 56 | |
Aviv Keshet | 92281f7 | 2017-10-24 16:01:10 -0700 | [diff] [blame] | 57 | def _parse_arguments(args): |
Don Garrett | 4003636 | 2014-12-08 15:52:44 -0800 | [diff] [blame] | 58 | """Parse command line arguments. |
| 59 | |
| 60 | @param args: The command line arguments to parse. (usually sys.argv[1:]) |
| 61 | |
Aviv Keshet | 92281f7 | 2017-10-24 16:01:10 -0700 | [diff] [blame] | 62 | @returns A tuple of (argparse.Namespace populated with argument values, |
| 63 | list of extra args to pass to deploy_server_local). |
Don Garrett | 4003636 | 2014-12-08 15:52:44 -0800 | [diff] [blame] | 64 | """ |
| 65 | parser = argparse.ArgumentParser( |
Don Garrett | 3f2b660 | 2014-12-16 18:19:16 -0800 | [diff] [blame] | 66 | formatter_class=argparse.RawDescriptionHelpFormatter, |
Aviv Keshet | 92281f7 | 2017-10-24 16:01:10 -0700 | [diff] [blame] | 67 | description='Run deploy_server_local on a bunch of servers. Extra ' |
| 68 | 'arguments will be passed through.', |
Don Garrett | 3f2b660 | 2014-12-16 18:19:16 -0800 | [diff] [blame] | 69 | epilog=('Update all servers:\n' |
Aviv Keshet | 92281f7 | 2017-10-24 16:01:10 -0700 | [diff] [blame] | 70 | ' deploy_server.py -x --afe cautotest\n' |
Don Garrett | 3f2b660 | 2014-12-16 18:19:16 -0800 | [diff] [blame] | 71 | '\n' |
| 72 | 'Update one server:\n' |
Aviv Keshet | 92281f7 | 2017-10-24 16:01:10 -0700 | [diff] [blame] | 73 | ' deploy_server.py <server> -x\n' |
| 74 | )) |
Don Garrett | 3f2b660 | 2014-12-16 18:19:16 -0800 | [diff] [blame] | 75 | |
Aviv Keshet | 92281f7 | 2017-10-24 16:01:10 -0700 | [diff] [blame] | 76 | parser.add_argument('-x', action='store_true', |
| 77 | help='Actually perform actions. If not supplied, ' |
| 78 | 'script does nothing.') |
| 79 | parser.add_argument('--afe', |
Shuqian Zhao | 6cf933b | 2017-09-27 15:07:56 -0700 | [diff] [blame] | 80 | help='The AFE server used to get servers from server_db,' |
Aviv Keshet | 92281f7 | 2017-10-24 16:01:10 -0700 | [diff] [blame] | 81 | 'e.g, cautotest. Used only if no SERVER specified.') |
| 82 | parser.add_argument('servers', action='store', nargs='*', metavar='SERVER') |
Don Garrett | 4003636 | 2014-12-08 15:52:44 -0800 | [diff] [blame] | 83 | |
Aviv Keshet | 92281f7 | 2017-10-24 16:01:10 -0700 | [diff] [blame] | 84 | return parser.parse_known_args() |
J. Richard Barnette | f533b18 | 2014-09-04 18:24:42 -0700 | [diff] [blame] | 85 | |
| 86 | |
def _update_server(server, extra_args=()):
    """Run deploy_server_local for given server.

    @param server: hostname to update.
    @param extra_args: Sequence of argument strings appended, separated by
                       single spaces, to the deploy_server_local command.

    @return: A tuple of (server, success, output), where:
             server: Name of the server.
             success: True if update succeeds, False otherwise.
             output: On success, whatever infra.execute_command returned;
                     on failure, the `output` attribute of the raised
                     subprocess.CalledProcessError.
    """
    # NOTE(review): extra_args are joined without any shell quoting;
    # presumably callers only pass simple flags -- confirm before passing
    # values containing spaces or shell metacharacters.
    cmd = '%s %s' % (DEPLOY_SERVER_LOCAL, ' '.join(extra_args))
    try:
        output = infra.execute_command(server, cmd)
    except subprocess.CalledProcessError as e:
        # A failed command is reported to the caller, not raised, so that
        # one bad server does not abort the whole batch.
        return server, False, e.output

    return server, True, output
| 109 | |
def _update_in_parallel(servers, extra_args=()):
    """Update a group of servers in parallel.

    While the updates run, a progress message listing the servers that have
    not finished yet is printed roughly every 20 seconds.

    @param servers: A list of servers to update.
    @param extra_args: args to be passed in to deploy_server_local.

    @returns A dictionary from server names that failed to the output
             of the update script.
    """
    # Servers whose update has completed (successfully or not). The workers
    # are threads of this process (ThreadPool), so a plain list is shared
    # between them; list.append is atomic in CPython.
    finished_servers = []

    def do_server(server):
        """Update one server and record it as finished, pass or fail."""
        try:
            return _update_server(server, extra_args)
        finally:
            finished_servers.append(server)

    # The update actions run in parallel. If any update failed, we should wait
    # for other running updates being finished. Abort in the middle of an update
    # may leave the server in a bad state.
    pool = ThreadPool(POOL_SIZE)
    try:
        results = pool.map_async(do_server, servers)
        pool.close()

        # Track the updating progress for current group of servers.
        while not results.ready():
            incomplete_servers = sorted(set(servers) - set(finished_servers))
            print('Not finished yet. %d servers in this group. '
                  '%d servers are still running:\n%s\n' %
                  (len(servers), len(incomplete_servers), incomplete_servers))
            # Check the progress every 20s
            results.wait(20)

        # After update finished, keep only the servers that failed.
        # results.get() re-raises any exception a worker raised.
        failures = {}
        for server, success, output in results.get():
            if not success:
                failures[server] = output

        return failures

    finally:
        pool.terminate()
        pool.join()
Dan Shi | fb12d14 | 2015-06-09 23:30:11 -0700 | [diff] [blame] | 155 | |
| 156 | |
def main(args):
    """Entry point to deploy_server.py

    Resolves the servers to update (from the command line, or from the AFE
    given by --afe when none are listed), then runs the update on all of
    them in parallel. Without -x nothing is updated; the resolved servers
    and the pass-through arguments are only printed.

    @param args: The command line arguments to parse. (usually sys.argv)

    @returns The system exit code: 0 on success or dry run, 1 if no servers
             could be determined or if any server failed to update.
    """
    options, extra_args = _parse_arguments(args[1:])
    # Remove all the handlers from the root logger to get rid of the handlers
    # introduced by the import packages.
    logging.getLogger().handlers = []
    logging.basicConfig(level=logging.DEBUG)

    servers = options.servers
    if not servers:
        if not options.afe:
            print('No servers or afe specified. Aborting')
            return 1
        print('Retrieving servers from %s..' % options.afe)
        servers = discover_servers(options.afe)
        print('Retrieved servers were: %s' % servers)

    if not options.x:
        print('Doing nothing because -x was not supplied.')
        # Report the servers that would actually be updated, which may have
        # come from the AFE rather than from the command line.
        print('servers: %s' % servers)
        print('extra args for deploy_server_local: %s' % extra_args)
        return 0

    failures = _update_in_parallel(servers, extra_args)

    if not failures:
        print('Completed all updates successfully.')
        return 0

    # Sorted so the report and the retry command are deterministic.
    failed_servers = sorted(failures)

    print('The following servers failed, with the following output:')
    for s in failed_servers:
        print('======== %s ========' % s)
        print(failures[s])

    print('The servers that failed were:')
    print('\n'.join(failed_servers))
    print('\n\nTo retry on failed servers, run the following command:')
    retry_cmd = [args[0], '-x'] + failed_servers + list(extra_args)
    print(' '.join(retry_cmd))
    return 1
| 202 | |
J. Richard Barnette | f533b18 | 2014-09-04 18:24:42 -0700 | [diff] [blame] | 203 | |
| 204 | |
if __name__ == '__main__':
    # Run the deploy and hand main()'s return value to the shell as the
    # process exit status.
    exit_code = main(sys.argv)
    sys.exit(exit_code)