| #!/usr/bin/env python2.7 |
| # Copyright 2015-2016, Google Inc. |
| # All rights reserved. |
| # |
| # Redistribution and use in source and binary forms, with or without |
| # modification, are permitted provided that the following conditions are |
| # met: |
| # |
| # * Redistributions of source code must retain the above copyright |
| # notice, this list of conditions and the following disclaimer. |
| # * Redistributions in binary form must reproduce the above |
| # copyright notice, this list of conditions and the following disclaimer |
| # in the documentation and/or other materials provided with the |
| # distribution. |
| # * Neither the name of Google Inc. nor the names of its |
| # contributors may be used to endorse or promote products derived from |
| # this software without specific prior written permission. |
| # |
| # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| import argparse |
| import datetime |
| import os |
| import subprocess |
| import sys |
| import time |
| |
| stress_test_utils_dir = os.path.abspath(os.path.join( |
| os.path.dirname(__file__), '../../gcp/stress_test')) |
| sys.path.append(stress_test_utils_dir) |
| from stress_test_utils import BigQueryHelper |
| |
| kubernetes_api_dir = os.path.abspath(os.path.join( |
| os.path.dirname(__file__), '../../gcp/utils')) |
| sys.path.append(kubernetes_api_dir) |
| |
| import kubernetes_api |
| |
| _GRPC_ROOT = os.path.abspath(os.path.join( |
| os.path.dirname(sys.argv[0]), '../../..')) |
| os.chdir(_GRPC_ROOT) |
| |
| # num of seconds to wait for the GKE image to start and warmup |
| _GKE_IMAGE_WARMUP_WAIT_SECS = 60 |
| |
| _SERVER_POD_NAME = 'stress-server' |
| _CLIENT_POD_NAME_PREFIX = 'stress-client' |
| _DATASET_ID_PREFIX = 'stress_test' |
| _SUMMARY_TABLE_ID = 'summary' |
| _QPS_TABLE_ID = 'qps' |
| |
| _DEFAULT_DOCKER_IMAGE_NAME = 'grpc_stress_test' |
| |
| # The default port on which the kubernetes proxy server is started on localhost |
| # (i.e kubectl proxy --port=<port>) |
| _DEFAULT_KUBERNETES_PROXY_PORT = 8001 |
| |
| # How frequently should the stress client wrapper script (running inside a GKE |
| # container) poll the health of the stress client (also running inside the GKE |
| # container) and upload metrics to BigQuery |
| _DEFAULT_STRESS_CLIENT_POLL_INTERVAL_SECS = 60 |
| |
| # The default setting for stress test server and client |
| _DEFAULT_STRESS_SERVER_PORT = 8080 |
| _DEFAULT_METRICS_PORT = 8081 |
| _DEFAULT_TEST_CASES_STR = 'empty_unary:1,large_unary:1,client_streaming:1,server_streaming:1,empty_stream:1' |
| _DEFAULT_NUM_CHANNELS_PER_SERVER = 5 |
| _DEFAULT_NUM_STUBS_PER_CHANNEL = 10 |
| _DEFAULT_METRICS_COLLECTION_INTERVAL_SECS = 30 |
| |
| # Number of stress client instances to launch |
| _DEFAULT_NUM_CLIENTS = 3 |
| |
| # How frequently should this test monitor the health of Stress clients and |
| # Servers running in GKE |
| _DEFAULT_TEST_POLL_INTERVAL_SECS = 60 |
| |
| # Default run time for this test (2 hour) |
| _DEFAULT_TEST_DURATION_SECS = 7200 |
| |
| # The number of seconds it would take a GKE pod to warm up (i.e get to 'Running' |
| # state from the time of creation). Ideally this is something the test should |
| # automatically determine by using Kubernetes API to poll the pods status. |
| _DEFAULT_GKE_WARMUP_SECS = 60 |
| |
| |
| class KubernetesProxy: |
| """ Class to start a proxy on localhost to the Kubernetes API server """ |
| |
| def __init__(self, api_port): |
| self.port = api_port |
| self.p = None |
| self.started = False |
| |
| def start(self): |
| cmd = ['kubectl', 'proxy', '--port=%d' % self.port] |
| self.p = subprocess.Popen(args=cmd) |
| self.started = True |
| time.sleep(2) |
| print '..Started' |
| |
| def get_port(self): |
| return self.port |
| |
| def is_started(self): |
| return self.started |
| |
| def __del__(self): |
| if self.p is not None: |
| print 'Shutting down Kubernetes proxy..' |
| self.p.kill() |
| |
| |
| class TestSettings: |
| |
| def __init__(self, build_docker_image, test_poll_interval_secs, |
| test_duration_secs, kubernetes_proxy_port): |
| self.build_docker_image = build_docker_image |
| self.test_poll_interval_secs = test_poll_interval_secs |
| self.test_duration_secs = test_duration_secs |
| self.kubernetes_proxy_port = kubernetes_proxy_port |
| |
| |
| class GkeSettings: |
| |
| def __init__(self, project_id, docker_image_name): |
| self.project_id = project_id |
| self.docker_image_name = docker_image_name |
| self.tag_name = 'gcr.io/%s/%s' % (project_id, docker_image_name) |
| |
| |
| class BigQuerySettings: |
| |
| def __init__(self, run_id, dataset_id, summary_table_id, qps_table_id): |
| self.run_id = run_id |
| self.dataset_id = dataset_id |
| self.summary_table_id = summary_table_id |
| self.qps_table_id = qps_table_id |
| |
| |
| class StressServerSettings: |
| |
| def __init__(self, server_pod_name, server_port): |
| self.server_pod_name = server_pod_name |
| self.server_port = server_port |
| |
| |
| class StressClientSettings: |
| |
| def __init__(self, num_clients, client_pod_name_prefix, server_pod_name, |
| server_port, metrics_port, metrics_collection_interval_secs, |
| stress_client_poll_interval_secs, num_channels_per_server, |
| num_stubs_per_channel, test_cases_str): |
| self.num_clients = num_clients |
| self.client_pod_name_prefix = client_pod_name_prefix |
| self.server_pod_name = server_pod_name |
| self.server_port = server_port |
| self.metrics_port = metrics_port |
| self.metrics_collection_interval_secs = metrics_collection_interval_secs |
| self.stress_client_poll_interval_secs = stress_client_poll_interval_secs |
| self.num_channels_per_server = num_channels_per_server |
| self.num_stubs_per_channel = num_stubs_per_channel |
| self.test_cases_str = test_cases_str |
| |
| # == Derived properties == |
| # Note: Client can accept a list of server addresses (a comma separated list |
| # of 'server_name:server_port'). In this case, we only have one server |
| # address to pass |
| self.server_addresses = '%s.default.svc.cluster.local:%d' % ( |
| server_pod_name, server_port) |
| self.client_pod_names_list = ['%s-%d' % (client_pod_name_prefix, i) |
| for i in range(1, num_clients + 1)] |
| |
| |
| def _build_docker_image(image_name, tag_name): |
| """ Build the docker image and add tag it to the GKE repository """ |
| print 'Building docker image: %s' % image_name |
| os.environ['INTEROP_IMAGE'] = image_name |
| os.environ['INTEROP_IMAGE_REPOSITORY_TAG'] = tag_name |
| # Note that 'BASE_NAME' HAS to be 'grpc_interop_stress_cxx' since the script |
| # build_interop_stress_image.sh invokes the following script: |
| # tools/dockerfile/$BASE_NAME/build_interop_stress.sh |
| os.environ['BASE_NAME'] = 'grpc_interop_stress_cxx' |
| cmd = ['tools/jenkins/build_interop_stress_image.sh'] |
| retcode = subprocess.call(args=cmd) |
| if retcode != 0: |
| print 'Error in building docker image' |
| return False |
| return True |
| |
| |
| def _push_docker_image_to_gke_registry(docker_tag_name): |
| """Executes 'gcloud docker push <docker_tag_name>' to push the image to GKE registry""" |
| cmd = ['gcloud', 'docker', 'push', docker_tag_name] |
| print 'Pushing %s to GKE registry..' % docker_tag_name |
| retcode = subprocess.call(args=cmd) |
| if retcode != 0: |
| print 'Error in pushing docker image %s to the GKE registry' % docker_tag_name |
| return False |
| return True |
| |
| |
| def _launch_server(gke_settings, stress_server_settings, bq_settings, |
| kubernetes_proxy): |
| """ Launches a stress test server instance in GKE cluster """ |
| if not kubernetes_proxy.is_started: |
| print 'Kubernetes proxy must be started before calling this function' |
| return False |
| |
| # This is the wrapper script that is run in the container. This script runs |
| # the actual stress test server |
| server_cmd_list = ['/var/local/git/grpc/tools/gcp/stress_test/run_server.py'] |
| |
| # run_server.py does not take any args from the command line. The args are |
| # instead passed via environment variables (see server_env below) |
| server_arg_list = [] |
| |
| # The parameters to the script run_server.py are injected into the container |
| # via environment variables |
| server_env = { |
| 'STRESS_TEST_IMAGE_TYPE': 'SERVER', |
| 'STRESS_TEST_IMAGE': '/var/local/git/grpc/bins/opt/interop_server', |
| 'STRESS_TEST_ARGS_STR': '--port=%s' % stress_server_settings.server_port, |
| 'RUN_ID': bq_settings.run_id, |
| 'POD_NAME': stress_server_settings.server_pod_name, |
| 'GCP_PROJECT_ID': gke_settings.project_id, |
| 'DATASET_ID': bq_settings.dataset_id, |
| 'SUMMARY_TABLE_ID': bq_settings.summary_table_id, |
| 'QPS_TABLE_ID': bq_settings.qps_table_id |
| } |
| |
| # Launch Server |
| is_success = kubernetes_api.create_pod_and_service( |
| 'localhost', |
| kubernetes_proxy.get_port(), |
| 'default', # Use 'default' namespace |
| stress_server_settings.server_pod_name, |
| gke_settings.tag_name, |
| [stress_server_settings.server_port], # Port that should be exposed |
| server_cmd_list, |
| server_arg_list, |
| server_env, |
| True # Headless = True for server. Since we want DNS records to be created by GKE |
| ) |
| |
| return is_success |
| |
| |
| def _launch_client(gke_settings, stress_server_settings, stress_client_settings, |
| bq_settings, kubernetes_proxy): |
| """ Launches a configurable number of stress test clients on GKE cluster """ |
| if not kubernetes_proxy.is_started: |
| print 'Kubernetes proxy must be started before calling this function' |
| return False |
| |
| stress_client_arg_list = [ |
| '--server_addresses=%s' % stress_client_settings.server_addresses, |
| '--test_cases=%s' % stress_client_settings.test_cases_str, |
| '--num_stubs_per_channel=%d' % |
| stress_client_settings.num_stubs_per_channel |
| ] |
| |
| # This is the wrapper script that is run in the container. This script runs |
| # the actual stress client |
| client_cmd_list = ['/var/local/git/grpc/tools/gcp/stress_test/run_client.py'] |
| |
| # run_client.py takes no args. All args are passed as env variables (see |
| # client_env) |
| client_arg_list = [] |
| |
| metrics_server_address = 'localhost:%d' % stress_client_settings.metrics_port |
| metrics_client_arg_list = [ |
| '--metrics_server_address=%s' % metrics_server_address, |
| '--total_only=true' |
| ] |
| |
| # The parameters to the script run_client.py are injected into the container |
| # via environment variables |
| client_env = { |
| 'STRESS_TEST_IMAGE_TYPE': 'CLIENT', |
| 'STRESS_TEST_IMAGE': '/var/local/git/grpc/bins/opt/stress_test', |
| 'STRESS_TEST_ARGS_STR': ' '.join(stress_client_arg_list), |
| 'METRICS_CLIENT_IMAGE': '/var/local/git/grpc/bins/opt/metrics_client', |
| 'METRICS_CLIENT_ARGS_STR': ' '.join(metrics_client_arg_list), |
| 'RUN_ID': bq_settings.run_id, |
| 'POLL_INTERVAL_SECS': |
| str(stress_client_settings.stress_client_poll_interval_secs), |
| 'GCP_PROJECT_ID': gke_settings.project_id, |
| 'DATASET_ID': bq_settings.dataset_id, |
| 'SUMMARY_TABLE_ID': bq_settings.summary_table_id, |
| 'QPS_TABLE_ID': bq_settings.qps_table_id |
| } |
| |
| for pod_name in stress_client_settings.client_pod_names_list: |
| client_env['POD_NAME'] = pod_name |
| is_success = kubernetes_api.create_pod_and_service( |
| 'localhost', # Since proxy is running on localhost |
| kubernetes_proxy.get_port(), |
| 'default', # default namespace |
| pod_name, |
| gke_settings.tag_name, |
| [stress_client_settings.metrics_port |
| ], # Client pods expose metrics port |
| client_cmd_list, |
| client_arg_list, |
| client_env, |
| False # Client is not a headless service |
| ) |
| if not is_success: |
| print 'Error in launching client %s' % pod_name |
| return False |
| |
| return True |
| |
| |
| def _launch_server_and_client(gke_settings, stress_server_settings, |
| stress_client_settings, bq_settings, |
| kubernetes_proxy_port): |
| # Start kubernetes proxy |
| print 'Kubernetes proxy' |
| kubernetes_proxy = KubernetesProxy(kubernetes_proxy_port) |
| kubernetes_proxy.start() |
| |
| print 'Launching server..' |
| is_success = _launch_server(gke_settings, stress_server_settings, bq_settings, |
| kubernetes_proxy) |
| if not is_success: |
| print 'Error in launching server' |
| return False |
| |
| # Server takes a while to start. |
| # TODO(sree) Use Kubernetes API to query the status of the server instead of |
| # sleeping |
| print 'Waiting for %s seconds for the server to start...' % _GKE_IMAGE_WARMUP_WAIT_SECS |
| time.sleep(_GKE_IMAGE_WARMUP_WAIT_SECS) |
| |
| # Launch client |
| client_pod_name_prefix = 'stress-client' |
| is_success = _launch_client(gke_settings, stress_server_settings, |
| stress_client_settings, bq_settings, |
| kubernetes_proxy) |
| |
| if not is_success: |
| print 'Error in launching client(s)' |
| return False |
| |
| print 'Waiting for %s seconds for the client images to start...' % _GKE_IMAGE_WARMUP_WAIT_SECS |
| time.sleep(_GKE_IMAGE_WARMUP_WAIT_SECS) |
| return True |
| |
| |
| def _delete_server_and_client(stress_server_settings, stress_client_settings, |
| kubernetes_proxy_port): |
| kubernetes_proxy = KubernetesProxy(kubernetes_proxy_port) |
| kubernetes_proxy.start() |
| |
| # Delete clients first |
| is_success = True |
| for pod_name in stress_client_settings.client_pod_names_list: |
| is_success = kubernetes_api.delete_pod_and_service( |
| 'localhost', kubernetes_proxy_port, 'default', pod_name) |
| if not is_success: |
| return False |
| |
| # Delete server |
| is_success = kubernetes_api.delete_pod_and_service( |
| 'localhost', kubernetes_proxy_port, 'default', |
| stress_server_settings.server_pod_name) |
| return is_success |
| |
| |
| def run_test_main(test_settings, gke_settings, stress_server_settings, |
| stress_client_clients): |
| is_success = True |
| |
| if test_settings.build_docker_image: |
| is_success = _build_docker_image(gke_settings.docker_image_name, |
| gke_settings.tag_name) |
| if not is_success: |
| return False |
| |
| is_success = _push_docker_image_to_gke_registry(gke_settings.tag_name) |
| if not is_success: |
| return False |
| |
| # Create a unique id for this run (Note: Using timestamp instead of UUID to |
| # make it easier to deduce the date/time of the run just by looking at the run |
| # run id. This is useful in debugging when looking at records in Biq query) |
| run_id = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') |
| dataset_id = '%s_%s' % (_DATASET_ID_PREFIX, run_id) |
| |
| # Big Query settings (common for both Stress Server and Client) |
| bq_settings = BigQuerySettings(run_id, dataset_id, _SUMMARY_TABLE_ID, |
| _QPS_TABLE_ID) |
| |
| bq_helper = BigQueryHelper(run_id, '', '', args.project_id, dataset_id, |
| _SUMMARY_TABLE_ID, _QPS_TABLE_ID) |
| bq_helper.initialize() |
| |
| try: |
| is_success = _launch_server_and_client(gke_settings, stress_server_settings, |
| stress_client_settings, bq_settings, |
| test_settings.kubernetes_proxy_port) |
| if not is_success: |
| return False |
| |
| start_time = datetime.datetime.now() |
| end_time = start_time + datetime.timedelta( |
| seconds=test_settings.test_duration_secs) |
| print 'Running the test until %s' % end_time.isoformat() |
| |
| while True: |
| if datetime.datetime.now() > end_time: |
| print 'Test was run for %d seconds' % test_settings.test_duration_secs |
| break |
| |
| # Check if either stress server or clients have failed |
| if bq_helper.check_if_any_tests_failed(): |
| is_success = False |
| print 'Some tests failed.' |
| break |
| |
| # Things seem to be running fine. Wait until next poll time to check the |
| # status |
| print 'Sleeping for %d seconds..' % test_settings.test_poll_interval_secs |
| time.sleep(test_settings.test_poll_interval_secs) |
| |
| # Print BiqQuery tables |
| bq_helper.print_summary_records() |
| bq_helper.print_qps_records() |
| |
| finally: |
| # If is_success is False at this point, it means that the stress tests were |
| # started successfully but failed while running the tests. In this case we |
| # do should not delete the pods (since they contain all the failure |
| # information) |
| if is_success: |
| _delete_server_and_client(stress_server_settings, stress_client_settings, |
| test_settings.kubernetes_proxy_port) |
| |
| return is_success |
| |
| |
| argp = argparse.ArgumentParser( |
| description='Launch stress tests in GKE', |
| formatter_class=argparse.ArgumentDefaultsHelpFormatter) |
| argp.add_argument('--project_id', |
| required=True, |
| help='The Google Cloud Platform Project Id') |
| argp.add_argument('--num_clients', |
| default=1, |
| type=int, |
| help='Number of client instances to start') |
| argp.add_argument('--docker_image_name', |
| default=_DEFAULT_DOCKER_IMAGE_NAME, |
| help='The name of the docker image containing stress client ' |
| 'and stress servers') |
| argp.add_argument('--build_docker_image', |
| dest='build_docker_image', |
| action='store_true', |
| help='Build a docker image and push to Google Container ' |
| 'Registry') |
| argp.add_argument('--do_not_build_docker_image', |
| dest='build_docker_image', |
| action='store_false', |
| help='Do not build and push docker image to Google Container ' |
| 'Registry') |
| argp.set_defaults(build_docker_image=True) |
| |
| argp.add_argument('--test_poll_interval_secs', |
| default=_DEFAULT_TEST_POLL_INTERVAL_SECS, |
| type=int, |
| help='How frequently should this script should monitor the ' |
| 'health of stress clients and servers running in the GKE ' |
| 'cluster') |
| argp.add_argument('--test_duration_secs', |
| default=_DEFAULT_TEST_DURATION_SECS, |
| type=int, |
| help='How long should this test be run') |
| argp.add_argument('--kubernetes_proxy_port', |
| default=_DEFAULT_KUBERNETES_PROXY_PORT, |
| type=int, |
| help='The port on which the kubernetes proxy (on localhost)' |
| ' is started') |
| argp.add_argument('--stress_server_port', |
| default=_DEFAULT_STRESS_SERVER_PORT, |
| type=int, |
| help='The port on which the stress server (in GKE ' |
| 'containers) listens') |
| argp.add_argument('--stress_client_metrics_port', |
| default=_DEFAULT_METRICS_PORT, |
| type=int, |
| help='The port on which the stress clients (in GKE ' |
| 'containers) expose metrics') |
| argp.add_argument('--stress_client_poll_interval_secs', |
| default=_DEFAULT_STRESS_CLIENT_POLL_INTERVAL_SECS, |
| type=int, |
| help='How frequently should the stress client wrapper script' |
| ' running inside GKE should monitor health of the actual ' |
| ' stress client process and upload the metrics to BigQuery') |
| argp.add_argument('--stress_client_metrics_collection_interval_secs', |
| default=_DEFAULT_METRICS_COLLECTION_INTERVAL_SECS, |
| type=int, |
| help='How frequently should metrics be collected in-memory on' |
| ' the stress clients (running inside GKE containers). Note ' |
| 'that this is NOT the same as the upload-to-BigQuery ' |
| 'frequency. The metrics upload frequency is controlled by the' |
| ' --stress_client_poll_interval_secs flag') |
| argp.add_argument('--stress_client_num_channels_per_server', |
| default=_DEFAULT_NUM_CHANNELS_PER_SERVER, |
| type=int, |
| help='The number of channels created to each server from a ' |
| 'stress client') |
| argp.add_argument('--stress_client_num_stubs_per_channel', |
| default=_DEFAULT_NUM_STUBS_PER_CHANNEL, |
| type=int, |
| help='The number of stubs created per channel. This number ' |
| 'indicates the max number of RPCs that can be made in ' |
| 'parallel on each channel at any given time') |
| argp.add_argument('--stress_client_test_cases', |
| default=_DEFAULT_TEST_CASES_STR, |
| help='List of test cases (with weights) to be executed by the' |
| ' stress test client. The list is in the following format:\n' |
| ' <testcase_1:w_1,<test_case2:w_2>..<testcase_n:w_n>\n' |
| ' (Note: The weights do not have to add up to 100)') |
| |
| if __name__ == '__main__': |
| args = argp.parse_args() |
| |
| test_settings = TestSettings( |
| args.build_docker_image, args.test_poll_interval_secs, |
| args.test_duration_secs, args.kubernetes_proxy_port) |
| |
| gke_settings = GkeSettings(args.project_id, args.docker_image_name) |
| |
| stress_server_settings = StressServerSettings(_SERVER_POD_NAME, |
| args.stress_server_port) |
| stress_client_settings = StressClientSettings( |
| args.num_clients, _CLIENT_POD_NAME_PREFIX, _SERVER_POD_NAME, |
| args.stress_server_port, args.stress_client_metrics_port, |
| args.stress_client_metrics_collection_interval_secs, |
| args.stress_client_poll_interval_secs, |
| args.stress_client_num_channels_per_server, |
| args.stress_client_num_stubs_per_channel, args.stress_client_test_cases) |
| |
| run_test_main(test_settings, gke_settings, stress_server_settings, |
| stress_client_settings) |