tools/run_tests/stress_test/run_stress_tests_on_gke.py - platform/external/grpc-grpc - Gitiles

 #!/usr/bin/env python2.7
 # Copyright 2015-2016, Google Inc.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are
 # met:
 #
 #     * Redistributions of source code must retain the above copyright
 # notice, this list of conditions and the following disclaimer.
 #     * Redistributions in binary form must reproduce the above
 # copyright notice, this list of conditions and the following disclaimer
 # in the documentation and/or other materials provided with the
 # distribution.
 #     * Neither the name of Google Inc. nor the names of its
 # contributors may be used to endorse or promote products derived from
 # this software without specific prior written permission.
 #
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 import argparse
 import datetime
 import os
 import subprocess
 import sys
 import time

 stress_test_utils_dir = os.path.abspath(os.path.join(
     os.path.dirname(__file__), '../../gcp/stress_test'))
 sys.path.append(stress_test_utils_dir)
 from stress_test_utils import BigQueryHelper

 kubernetes_api_dir = os.path.abspath(os.path.join(
     os.path.dirname(__file__), '../../gcp/utils'))
 sys.path.append(kubernetes_api_dir)

 import kubernetes_api

 _GRPC_ROOT = os.path.abspath(os.path.join(
     os.path.dirname(sys.argv[0]), '../../..'))
 os.chdir(_GRPC_ROOT)

 # Number of seconds to wait for a freshly launched GKE image to start and warm
 # up before the test moves on to its next step.
 _GKE_IMAGE_WARMUP_WAIT_SECS = 60

 # Pod names and BigQuery dataset/table identifiers shared by every run.
 _SERVER_POD_NAME = 'stress-server'
 _CLIENT_POD_NAME_PREFIX = 'stress-client'
 _DATASET_ID_PREFIX = 'stress_test'
 _SUMMARY_TABLE_ID = 'summary'
 _QPS_TABLE_ID = 'qps'

 # Default value of the --docker_image_name flag
 _DEFAULT_DOCKER_IMAGE_NAME = 'grpc_stress_test'

 # The default port on which the kubernetes proxy server is started on localhost
 # (i.e. kubectl proxy --port=<port>)
 _DEFAULT_KUBERNETES_PROXY_PORT = 8001

 # How frequently the stress client wrapper script (running inside a GKE
 # container) should poll the health of the stress client (also running inside
 # the GKE container) and upload metrics to BigQuery
 _DEFAULT_STRESS_CLIENT_POLL_INTERVAL_SECS = 60

 # The default settings for the stress test server and client
 _DEFAULT_STRESS_SERVER_PORT = 8080
 _DEFAULT_METRICS_PORT = 8081
 _DEFAULT_TEST_CASES_STR = 'empty_unary:1,large_unary:1,client_streaming:1,server_streaming:1,empty_stream:1'
 _DEFAULT_NUM_CHANNELS_PER_SERVER = 5
 _DEFAULT_NUM_STUBS_PER_CHANNEL = 10
 _DEFAULT_METRICS_COLLECTION_INTERVAL_SECS = 30

 # Number of stress client instances to launch
 # NOTE(review): not referenced in the visible part of this file; the
 # --num_clients flag is declared with default=1 - confirm which is intended.
 _DEFAULT_NUM_CLIENTS = 3

 # How frequently this test should monitor the health of the stress clients and
 # servers running in GKE
 _DEFAULT_TEST_POLL_INTERVAL_SECS = 60

 # Default run time for this test (2 hours)
 _DEFAULT_TEST_DURATION_SECS = 7200

 # The number of seconds it would take a GKE pod to warm up (i.e. get to
 # 'Running' state from the time of creation). Ideally this is something the
 # test should automatically determine by using Kubernetes API to poll the pods
 # status.
 # NOTE(review): not referenced in the visible part of this file; the warm-up
 # waits use _GKE_IMAGE_WARMUP_WAIT_SECS instead - confirm before removing.
 _DEFAULT_GKE_WARMUP_SECS = 60


 class KubernetesProxy:
   """Runs `kubectl proxy` on localhost to reach the Kubernetes API server.

   The proxy child process is spawned by start() and shut down when this
   object is destroyed (see __del__).
   """

   def __init__(self, api_port):
     """Remembers the local port to proxy on; no process is started yet."""
     self.port = api_port
     self.p = None  # subprocess.Popen handle of the proxy, set by start()
     self.started = False

   def start(self):
     """Spawns `kubectl proxy --port=<port>` and then sleeps for two seconds.

     Raises:
       OSError: propagated from subprocess.Popen if kubectl cannot be run.
     """
     cmd = ['kubectl', 'proxy', '--port=%d' % self.port]
     self.p = subprocess.Popen(args=cmd)
     self.started = True
     # Fixed pause after spawning the proxy, before callers start using it.
     time.sleep(2)
     print('..Started')

   def get_port(self):
     """Returns the local port the proxy was configured with."""
     return self.port

   def is_started(self):
     """Returns True once start() has succeeded. This is a method: call it."""
     return self.started

   def __del__(self):
     """Stops the proxy process, if one was started."""
     # getattr guards against a partially constructed instance.
     proxy_process = getattr(self, 'p', None)
     if proxy_process is None:
       return
     print('Shutting down Kubernetes proxy..')
     # Only signal a child that is still running; signalling a pid that has
     # already exited can fail or hit an unrelated process.
     if proxy_process.poll() is None:
       proxy_process.kill()
       # Reap the child so it does not linger as a zombie.
       proxy_process.wait()
     self.p = None


 class TestSettings:
   """Options controlling how this test driver itself behaves."""

   def __init__(self, build_docker_image, test_poll_interval_secs,
                test_duration_secs, kubernetes_proxy_port):
     """Stores each argument unchanged under an attribute of the same name."""
     # Port passed to the local Kubernetes proxy.
     self.kubernetes_proxy_port = kubernetes_proxy_port
     # Total run time and the interval between health checks, in seconds.
     self.test_duration_secs, self.test_poll_interval_secs = (
         test_duration_secs, test_poll_interval_secs)
     # Whether the docker image is built and pushed before the run.
     self.build_docker_image = build_docker_image


 class GkeSettings:
   """GKE project and docker image identification."""

   def __init__(self, project_id, docker_image_name):
     """Stores both values and derives the gcr.io registry tag from them."""
     tag_fields = (project_id, docker_image_name)
     # Registry tag has the form gcr.io/<project_id>/<docker_image_name>.
     self.tag_name = 'gcr.io/%s/%s' % tag_fields
     self.docker_image_name = docker_image_name
     self.project_id = project_id


 class BigQuerySettings:
   """Identifiers of the BigQuery run, dataset and tables used by a test run."""

   def __init__(self, run_id, dataset_id, summary_table_id, qps_table_id):
     """Stores each argument unchanged under an attribute of the same name."""
     # Run and dataset identifiers.
     self.run_id, self.dataset_id = run_id, dataset_id
     # Table identifiers.
     self.qps_table_id = qps_table_id
     self.summary_table_id = summary_table_id


 class StressServerSettings:
   """Pod name and port of the stress test server."""

   def __init__(self, server_pod_name, server_port):
     """Stores both arguments unchanged under same-named attributes."""
     self.server_port = server_port
     self.server_pod_name = server_pod_name


 class StressClientSettings:
   """Settings for the stress test clients, plus values derived from them."""

   def __init__(self, num_clients, client_pod_name_prefix, server_pod_name,
                server_port, metrics_port, metrics_collection_interval_secs,
                stress_client_poll_interval_secs, num_channels_per_server,
                num_stubs_per_channel, test_cases_str):
     """Stores every argument and computes the derived attributes.

     Derived attributes:
       server_addresses: '<server_pod_name>.default.svc.cluster.local:<port>'.
       client_pod_names_list: '<prefix>-1' .. '<prefix>-<num_clients>'.
     """
     # Client pods.
     self.num_clients = num_clients
     self.client_pod_name_prefix = client_pod_name_prefix

     # Server the clients talk to.
     self.server_pod_name = server_pod_name
     self.server_port = server_port

     # Metrics and polling.
     self.metrics_port = metrics_port
     self.metrics_collection_interval_secs = metrics_collection_interval_secs
     self.stress_client_poll_interval_secs = stress_client_poll_interval_secs

     # Load shape.
     self.num_channels_per_server = num_channels_per_server
     self.num_stubs_per_channel = num_stubs_per_channel
     self.test_cases_str = test_cases_str

     # The client accepts a comma separated list of 'server_name:server_port'
     # entries; only a single server address is produced here.
     address_fields = (server_pod_name, server_port)
     self.server_addresses = '%s.default.svc.cluster.local:%d' % address_fields

     # Pod names are numbered starting at 1.
     pod_names = []
     for client_index in range(1, num_clients + 1):
       pod_names.append('%s-%d' % (client_pod_name_prefix, client_index))
     self.client_pod_names_list = pod_names


 def _build_docker_image(image_name, tag_name):
   """Builds the docker image and tags it for the GKE repository.

   Runs tools/jenkins/build_interop_stress_image.sh (relative to the current
   working directory), passing the settings through the child's environment.

   Args:
     image_name: Name of the docker image; exported as INTEROP_IMAGE.
     tag_name: Repository tag; exported as INTEROP_IMAGE_REPOSITORY_TAG.

   Returns:
     True if the build script exited with status 0. False if it exited with
     another status or could not be launched at all.
   """
   print('Building docker image: %s' % image_name)
   # Pass the settings through a private copy of the environment so that this
   # process's own os.environ is not modified as a side effect.
   build_env = dict(os.environ)
   build_env['INTEROP_IMAGE'] = image_name
   build_env['INTEROP_IMAGE_REPOSITORY_TAG'] = tag_name
   # Note that 'BASE_NAME' HAS to be 'grpc_interop_stress_cxx' since the script
   # build_interop_stress_image.sh invokes the following script:
   #   tools/dockerfile/$BASE_NAME/build_interop_stress.sh
   build_env['BASE_NAME'] = 'grpc_interop_stress_cxx'
   cmd = ['tools/jenkins/build_interop_stress_image.sh']
   try:
     retcode = subprocess.call(args=cmd, env=build_env)
   except OSError as e:
     # Script missing or not executable: report failure like a failed build.
     print('Error in building docker image: cannot run %s (%s)' % (cmd[0], e))
     return False
   if retcode != 0:
     print('Error in building docker image')
     return False
   return True


 def _push_docker_image_to_gke_registry(docker_tag_name):
   """Executes 'gcloud docker push <docker_tag_name>' to push the image.

   Args:
     docker_tag_name: The registry tag of the image to push.

   Returns:
     True if the gcloud command exited with status 0. False if it exited with
     another status or could not be launched at all.
   """
   cmd = ['gcloud', 'docker', 'push', docker_tag_name]
   print('Pushing %s to GKE registry..' % docker_tag_name)
   try:
     retcode = subprocess.call(args=cmd)
   except OSError as e:
     # gcloud is not installed or not on PATH: report it as a failed push.
     print('Error in pushing docker image %s to the GKE registry (%s)' %
           (docker_tag_name, e))
     return False
   if retcode != 0:
     print('Error in pushing docker image %s to the GKE registry' %
           docker_tag_name)
     return False
   return True


 def _launch_server(gke_settings, stress_server_settings, bq_settings,
                    kubernetes_proxy):
   """Launches a stress test server instance in the GKE cluster.

   Args:
     gke_settings: Supplies project_id and the docker image tag_name.
     stress_server_settings: Supplies server_pod_name and server_port.
     bq_settings: Supplies run_id, dataset_id and the two table ids that are
       handed to the pod.
     kubernetes_proxy: A started KubernetesProxy; its port is used to reach the
       Kubernetes API on localhost.

   Returns:
     False if the proxy has not been started; otherwise whatever
     kubernetes_api.create_pod_and_service() returns.
   """
   # is_started is a method and must be called: the bound method object itself
   # is always truthy, which would silently disable this guard.
   if not kubernetes_proxy.is_started():
     print('Kubernetes proxy must be started before calling this function')
     return False

   # This is the wrapper script that is run in the container. This script runs
   # the actual stress test server
   server_cmd_list = ['/var/local/git/grpc/tools/gcp/stress_test/run_server.py']

   # run_server.py does not take any args from the command line. The args are
   # instead passed via environment variables (see server_env below)
   server_arg_list = []

   # The parameters to the script run_server.py are injected into the container
   # via environment variables
   server_env = {
       'STRESS_TEST_IMAGE_TYPE': 'SERVER',
       'STRESS_TEST_IMAGE': '/var/local/git/grpc/bins/opt/interop_server',
       'STRESS_TEST_ARGS_STR': '--port=%s' % stress_server_settings.server_port,
       'RUN_ID': bq_settings.run_id,
       'POD_NAME': stress_server_settings.server_pod_name,
       'GCP_PROJECT_ID': gke_settings.project_id,
       'DATASET_ID': bq_settings.dataset_id,
       'SUMMARY_TABLE_ID': bq_settings.summary_table_id,
       'QPS_TABLE_ID': bq_settings.qps_table_id
   }

   # Launch Server
   is_success = kubernetes_api.create_pod_and_service(
       'localhost',
       kubernetes_proxy.get_port(),
       'default',  # Use 'default' namespace
       stress_server_settings.server_pod_name,
       gke_settings.tag_name,
       [stress_server_settings.server_port],  # Port that should be exposed
       server_cmd_list,
       server_arg_list,
       server_env,
       True  # Headless = True for server. Since we want DNS records to be created by GKE
   )

   return is_success


 def _launch_client(gke_settings, stress_server_settings, stress_client_settings,
                    bq_settings, kubernetes_proxy):
   """Launches a configurable number of stress test clients on the GKE cluster.

   One pod (with a service) is created for every name in
   stress_client_settings.client_pod_names_list; launching stops at the first
   failure.

   Args:
     gke_settings: Supplies project_id and the docker image tag_name.
     stress_server_settings: Not used by this function; kept for interface
       compatibility.
     stress_client_settings: Supplies the client pod names, server address,
       test cases, stubs-per-channel, metrics port and poll interval.
     bq_settings: Supplies run_id, dataset_id and the two table ids.
     kubernetes_proxy: A started KubernetesProxy; its port is used to reach the
       Kubernetes API on localhost.

   Returns:
     True if every client pod was created; False if the proxy has not been
     started or any pod creation failed.
   """
   # is_started is a method and must be called: the bound method object itself
   # is always truthy, which would silently disable this guard.
   if not kubernetes_proxy.is_started():
     print('Kubernetes proxy must be started before calling this function')
     return False

   # NOTE(review): num_channels_per_server and metrics_collection_interval_secs
   # from stress_client_settings are not forwarded to the stress client here -
   # confirm whether that is intended.
   stress_client_arg_list = [
       '--server_addresses=%s' % stress_client_settings.server_addresses,
       '--test_cases=%s' % stress_client_settings.test_cases_str,
       '--num_stubs_per_channel=%d' %
       stress_client_settings.num_stubs_per_channel
   ]

   # This is the wrapper script that is run in the container. This script runs
   # the actual stress client
   client_cmd_list = ['/var/local/git/grpc/tools/gcp/stress_test/run_client.py']

   # run_client.py takes no args. All args are passed as env variables (see
   # client_env)
   client_arg_list = []

   metrics_server_address = 'localhost:%d' % stress_client_settings.metrics_port
   metrics_client_arg_list = [
       '--metrics_server_address=%s' % metrics_server_address,
       '--total_only=true'
   ]

   # The parameters to the script run_client.py are injected into the container
   # via environment variables. POD_NAME is added per pod below.
   common_client_env = {
       'STRESS_TEST_IMAGE_TYPE': 'CLIENT',
       'STRESS_TEST_IMAGE': '/var/local/git/grpc/bins/opt/stress_test',
       'STRESS_TEST_ARGS_STR': ' '.join(stress_client_arg_list),
       'METRICS_CLIENT_IMAGE': '/var/local/git/grpc/bins/opt/metrics_client',
       'METRICS_CLIENT_ARGS_STR': ' '.join(metrics_client_arg_list),
       'RUN_ID': bq_settings.run_id,
       'POLL_INTERVAL_SECS':
           str(stress_client_settings.stress_client_poll_interval_secs),
       'GCP_PROJECT_ID': gke_settings.project_id,
       'DATASET_ID': bq_settings.dataset_id,
       'SUMMARY_TABLE_ID': bq_settings.summary_table_id,
       'QPS_TABLE_ID': bq_settings.qps_table_id
   }

   for pod_name in stress_client_settings.client_pod_names_list:
     # Give every pod its own environment dict instead of mutating one shared
     # dict that was already handed to an earlier create call.
     client_env = dict(common_client_env, POD_NAME=pod_name)
     is_success = kubernetes_api.create_pod_and_service(
         'localhost',  # Since proxy is running on localhost
         kubernetes_proxy.get_port(),
         'default',  # default namespace
         pod_name,
         gke_settings.tag_name,
         [stress_client_settings.metrics_port],  # Client pods expose metrics port
         client_cmd_list,
         client_arg_list,
         client_env,
         False  # Client is not a headless service
     )
     if not is_success:
       print('Error in launching client %s' % pod_name)
       return False

   return True


 def _launch_server_and_client(gke_settings, stress_server_settings,
                               stress_client_settings, bq_settings,
                               kubernetes_proxy_port):
   """Starts a Kubernetes proxy, then launches the server and the clients.

   After each launch the function sleeps for _GKE_IMAGE_WARMUP_WAIT_SECS
   seconds to give the pods time to start.

   Args:
     gke_settings: GKE project/image settings passed to the launch helpers.
     stress_server_settings: Server settings passed to the launch helpers.
     stress_client_settings: Client settings passed to _launch_client.
     bq_settings: BigQuery settings passed to the launch helpers.
     kubernetes_proxy_port: Local port on which the proxy is started.

   Returns:
     True if the server and all clients were launched; False otherwise.
   """
   # Start kubernetes proxy
   print('Kubernetes proxy')
   kubernetes_proxy = KubernetesProxy(kubernetes_proxy_port)
   kubernetes_proxy.start()

   print('Launching server..')
   is_success = _launch_server(gke_settings, stress_server_settings, bq_settings,
                               kubernetes_proxy)
   if not is_success:
     print('Error in launching server')
     return False

   # Server takes a while to start.
   # TODO(sree) Use Kubernetes API to query the status of the server instead of
   # sleeping
   print('Waiting for %s seconds for the server to start...' %
         _GKE_IMAGE_WARMUP_WAIT_SECS)
   time.sleep(_GKE_IMAGE_WARMUP_WAIT_SECS)

   # Launch client
   is_success = _launch_client(gke_settings, stress_server_settings,
                               stress_client_settings, bq_settings,
                               kubernetes_proxy)

   if not is_success:
     print('Error in launching client(s)')
     return False

   print('Waiting for %s seconds for the client images to start...' %
         _GKE_IMAGE_WARMUP_WAIT_SECS)
   time.sleep(_GKE_IMAGE_WARMUP_WAIT_SECS)
   return True


 def _delete_server_and_client(stress_server_settings, stress_client_settings,
                               kubernetes_proxy_port):
   """Deletes all client pods/services and then the server pod/service.

   Every deletion is attempted even if an earlier one fails, so that a single
   failure does not leave the remaining pods running.

   Args:
     stress_server_settings: Supplies server_pod_name.
     stress_client_settings: Supplies client_pod_names_list.
     kubernetes_proxy_port: Local port on which a Kubernetes proxy is started
       and through which the deletions are issued.

   Returns:
     True if every deletion succeeded; False if at least one failed.
   """
   kubernetes_proxy = KubernetesProxy(kubernetes_proxy_port)
   kubernetes_proxy.start()

   # Delete clients first, the server last
   pod_names = list(stress_client_settings.client_pod_names_list)
   pod_names.append(stress_server_settings.server_pod_name)

   is_success = True
   for pod_name in pod_names:
     deleted = kubernetes_api.delete_pod_and_service(
         'localhost', kubernetes_proxy_port, 'default', pod_name)
     if not deleted:
       # Remember the failure but keep cleaning up the remaining pods.
       is_success = False
   return is_success


 def run_test_main(test_settings, gke_settings, stress_server_settings,
                   stress_client_clients):
   """Optionally builds the image, runs the stress test and cleans up.

   Args:
     test_settings: Supplies build_docker_image, test_poll_interval_secs,
       test_duration_secs and kubernetes_proxy_port.
     gke_settings: Supplies project_id, docker_image_name and tag_name.
     stress_server_settings: Settings of the stress server pod.
     stress_client_clients: Settings of the stress client pods (the parameter
       name is kept as-is for existing callers).

   Returns:
     True if the test ran for its full duration without a failure being
     reported in BigQuery; False if building, pushing, launching or the test
     itself failed.
   """
   # Use the argument that was passed in. Previously the body read module-level
   # globals ('stress_client_settings' and 'args') that only exist when this
   # file is executed as a script.
   stress_client_settings = stress_client_clients
   is_success = True

   if test_settings.build_docker_image:
     is_success = _build_docker_image(gke_settings.docker_image_name,
                                      gke_settings.tag_name)
     if not is_success:
       return False

     is_success = _push_docker_image_to_gke_registry(gke_settings.tag_name)
     if not is_success:
       return False

   # Create a unique id for this run (Note: Using timestamp instead of UUID to
   # make it easier to deduce the date/time of the run just by looking at the
   # run id. This is useful in debugging when looking at records in BigQuery)
   run_id = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
   dataset_id = '%s_%s' % (_DATASET_ID_PREFIX, run_id)

   # BigQuery settings (common for both Stress Server and Client)
   bq_settings = BigQuerySettings(run_id, dataset_id, _SUMMARY_TABLE_ID,
                                  _QPS_TABLE_ID)

   bq_helper = BigQueryHelper(run_id, '', '', gke_settings.project_id,
                              dataset_id, _SUMMARY_TABLE_ID, _QPS_TABLE_ID)
   bq_helper.initialize()

   try:
     is_success = _launch_server_and_client(gke_settings, stress_server_settings,
                                            stress_client_settings, bq_settings,
                                            test_settings.kubernetes_proxy_port)
     if not is_success:
       return False

     start_time = datetime.datetime.now()
     end_time = start_time + datetime.timedelta(
         seconds=test_settings.test_duration_secs)
     print('Running the test until %s' % end_time.isoformat())

     while True:
       if datetime.datetime.now() > end_time:
         print('Test was run for %d seconds' % test_settings.test_duration_secs)
         break

       # Check if either stress server or clients have failed
       if bq_helper.check_if_any_tests_failed():
         is_success = False
         print('Some tests failed.')
         break

       # Things seem to be running fine. Wait until next poll time to check the
       # status
       print('Sleeping for %d seconds..' % test_settings.test_poll_interval_secs)
       time.sleep(test_settings.test_poll_interval_secs)

     # Print BigQuery tables
     bq_helper.print_summary_records()
     bq_helper.print_qps_records()

   finally:
     # If is_success is False at this point, the launch failed or the stress
     # tests failed while running. In that case we should not delete the pods
     # (since they contain all the failure information)
     if is_success:
       _delete_server_and_client(stress_server_settings, stress_client_settings,
                                 test_settings.kubernetes_proxy_port)

   return is_success


 # Command line interface of this script. ArgumentDefaultsHelpFormatter makes
 # --help show the default value of every flag.
 argp = argparse.ArgumentParser(
     description='Launch stress tests in GKE',
     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 # --- GKE project / image selection ---
 argp.add_argument('--project_id',
                   required=True,
                   help='The Google Cloud Platform Project Id')
 # NOTE(review): defaults to 1 although _DEFAULT_NUM_CLIENTS (3) is defined in
 # this file - confirm which default is intended.
 argp.add_argument('--num_clients',
                   default=1,
                   type=int,
                   help='Number of client instances to start')
 argp.add_argument('--docker_image_name',
                   default=_DEFAULT_DOCKER_IMAGE_NAME,
                   help='The name of the docker image containing stress client '
                   'and stress servers')
 # --build_docker_image / --do_not_build_docker_image are an on/off pair that
 # write to the same destination; building is the default (see set_defaults).
 argp.add_argument('--build_docker_image',
                   dest='build_docker_image',
                   action='store_true',
                   help='Build a docker image and push to Google Container '
                   'Registry')
 argp.add_argument('--do_not_build_docker_image',
                   dest='build_docker_image',
                   action='store_false',
                   help='Do not build and push docker image to Google Container '
                   'Registry')
 argp.set_defaults(build_docker_image=True)

 # --- Test driver timing and proxy ---
 argp.add_argument('--test_poll_interval_secs',
                   default=_DEFAULT_TEST_POLL_INTERVAL_SECS,
                   type=int,
                   help='How frequently should this script should monitor the '
                   'health of stress clients and servers running in the GKE '
                   'cluster')
 argp.add_argument('--test_duration_secs',
                   default=_DEFAULT_TEST_DURATION_SECS,
                   type=int,
                   help='How long should this test be run')
 argp.add_argument('--kubernetes_proxy_port',
                   default=_DEFAULT_KUBERNETES_PROXY_PORT,
                   type=int,
                   help='The port on which the kubernetes proxy (on localhost)'
                   ' is started')
 # --- Stress server / client settings ---
 argp.add_argument('--stress_server_port',
                   default=_DEFAULT_STRESS_SERVER_PORT,
                   type=int,
                   help='The port on which the stress server (in GKE '
                   'containers) listens')
 argp.add_argument('--stress_client_metrics_port',
                   default=_DEFAULT_METRICS_PORT,
                   type=int,
                   help='The port on which the stress clients (in GKE '
                   'containers) expose metrics')
 argp.add_argument('--stress_client_poll_interval_secs',
                   default=_DEFAULT_STRESS_CLIENT_POLL_INTERVAL_SECS,
                   type=int,
                   help='How frequently should the stress client wrapper script'
                   ' running inside GKE should monitor health of the actual '
                   ' stress client process and upload the metrics to BigQuery')
 argp.add_argument('--stress_client_metrics_collection_interval_secs',
                   default=_DEFAULT_METRICS_COLLECTION_INTERVAL_SECS,
                   type=int,
                   help='How frequently should metrics be collected in-memory on'
                   ' the stress clients (running inside GKE containers). Note '
                   'that this is NOT the same as the upload-to-BigQuery '
                   'frequency. The metrics upload frequency is controlled by the'
                   ' --stress_client_poll_interval_secs flag')
 argp.add_argument('--stress_client_num_channels_per_server',
                   default=_DEFAULT_NUM_CHANNELS_PER_SERVER,
                   type=int,
                   help='The number of channels created to each server from a '
                   'stress client')
 argp.add_argument('--stress_client_num_stubs_per_channel',
                   default=_DEFAULT_NUM_STUBS_PER_CHANNEL,
                   type=int,
                   help='The number of stubs created per channel. This number '
                   'indicates the max number of RPCs that can be made in '
                   'parallel on each channel at any given time')
 # Test cases are given as a comma separated list of <name>:<weight> pairs
 # (see _DEFAULT_TEST_CASES_STR for an example).
 argp.add_argument('--stress_client_test_cases',
                   default=_DEFAULT_TEST_CASES_STR,
                   help='List of test cases (with weights) to be executed by the'
                   ' stress test client. The list is in the following format:\n'
                   '  <testcase_1:w_1,<test_case2:w_2>..<testcase_n:w_n>\n'
                   ' (Note: The weights do not have to add up to 100)')

 if __name__ == '__main__':
   # Script entry point: turn the parsed flags into settings objects, run the
   # test and report the outcome through the process exit status.
   args = argp.parse_args()

   test_settings = TestSettings(
       args.build_docker_image, args.test_poll_interval_secs,
       args.test_duration_secs, args.kubernetes_proxy_port)

   gke_settings = GkeSettings(args.project_id, args.docker_image_name)

   stress_server_settings = StressServerSettings(_SERVER_POD_NAME,
                                                 args.stress_server_port)
   stress_client_settings = StressClientSettings(
       args.num_clients, _CLIENT_POD_NAME_PREFIX, _SERVER_POD_NAME,
       args.stress_server_port, args.stress_client_metrics_port,
       args.stress_client_metrics_collection_interval_secs,
       args.stress_client_poll_interval_secs,
       args.stress_client_num_channels_per_server,
       args.stress_client_num_stubs_per_channel, args.stress_client_test_cases)

   test_passed = run_test_main(test_settings, gke_settings,
                               stress_server_settings, stress_client_settings)
   # Exit non-zero on failure so that callers (e.g. CI jobs) can detect it;
   # previously the result was discarded and the script always exited with 0.
   sys.exit(0 if test_passed else 1)
	#!/usr/bin/env python2.7
	# Copyright 2015-2016, Google Inc.
	# All rights reserved.
	#
	# Redistribution and use in source and binary forms, with or without
	# modification, are permitted provided that the following conditions are
	# met:
	#
	# * Redistributions of source code must retain the above copyright
	# notice, this list of conditions and the following disclaimer.
	# * Redistributions in binary form must reproduce the above
	# copyright notice, this list of conditions and the following disclaimer
	# in the documentation and/or other materials provided with the
	# distribution.
	# * Neither the name of Google Inc. nor the names of its
	# contributors may be used to endorse or promote products derived from
	# this software without specific prior written permission.
	#
	# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	import argparse
	import datetime
	import os
	import subprocess
	import sys
	import time

	stress_test_utils_dir = os.path.abspath(os.path.join(
	os.path.dirname(__file__), '../../gcp/stress_test'))
	sys.path.append(stress_test_utils_dir)
	from stress_test_utils import BigQueryHelper

	kubernetes_api_dir = os.path.abspath(os.path.join(
	os.path.dirname(__file__), '../../gcp/utils'))
	sys.path.append(kubernetes_api_dir)

	import kubernetes_api

	_GRPC_ROOT = os.path.abspath(os.path.join(
	os.path.dirname(sys.argv[0]), '../../..'))
	os.chdir(_GRPC_ROOT)

	# Number of seconds to wait for a freshly launched GKE image to start and warm
	# up before the test moves on to its next step.
	_GKE_IMAGE_WARMUP_WAIT_SECS = 60

	# Pod names and BigQuery dataset/table identifiers shared by every run.
	_SERVER_POD_NAME = 'stress-server'
	_CLIENT_POD_NAME_PREFIX = 'stress-client'
	_DATASET_ID_PREFIX = 'stress_test'
	_SUMMARY_TABLE_ID = 'summary'
	_QPS_TABLE_ID = 'qps'

	# Default name of the docker image holding the stress client and server.
	_DEFAULT_DOCKER_IMAGE_NAME = 'grpc_stress_test'

	# The default port on which the kubernetes proxy server is started on localhost
	# (i.e. kubectl proxy --port=<port>)
	_DEFAULT_KUBERNETES_PROXY_PORT = 8001

	# How frequently the stress client wrapper script (running inside a GKE
	# container) should poll the health of the stress client (also running inside
	# the GKE container) and upload metrics to BigQuery
	_DEFAULT_STRESS_CLIENT_POLL_INTERVAL_SECS = 60

	# The default settings for the stress test server and client
	_DEFAULT_STRESS_SERVER_PORT = 8080
	_DEFAULT_METRICS_PORT = 8081
	_DEFAULT_TEST_CASES_STR = 'empty_unary:1,large_unary:1,client_streaming:1,server_streaming:1,empty_stream:1'
	_DEFAULT_NUM_CHANNELS_PER_SERVER = 5
	_DEFAULT_NUM_STUBS_PER_CHANNEL = 10
	_DEFAULT_METRICS_COLLECTION_INTERVAL_SECS = 30

	# Number of stress client instances to launch
	# NOTE(review): not referenced in the visible part of this copy - confirm
	# that it is still needed.
	_DEFAULT_NUM_CLIENTS = 3

	# How frequently this test should monitor the health of the stress clients and
	# servers running in GKE
	_DEFAULT_TEST_POLL_INTERVAL_SECS = 60

	# Default run time for this test (2 hours)
	_DEFAULT_TEST_DURATION_SECS = 7200

	# The number of seconds it would take a GKE pod to warm up (i.e. get to
	# 'Running' state from the time of creation). Ideally this is something the
	# test should automatically determine by using Kubernetes API to poll the pods
	# status.
	# NOTE(review): not referenced in the visible part of this copy - confirm
	# that it is still needed.
	_DEFAULT_GKE_WARMUP_SECS = 60


	class KubernetesProxy:
	  """Runs `kubectl proxy` on localhost to reach the Kubernetes API server.

	  The proxy child process is spawned by start() and shut down when this
	  object is destroyed (see __del__).
	  """

	  def __init__(self, api_port):
	    """Remembers the local port to proxy on; no process is started yet."""
	    self.port = api_port
	    self.p = None  # subprocess.Popen handle of the proxy, set by start()
	    self.started = False

	  def start(self):
	    """Spawns `kubectl proxy --port=<port>` and then sleeps for two seconds.

	    Raises:
	      OSError: propagated from subprocess.Popen if kubectl cannot be run.
	    """
	    cmd = ['kubectl', 'proxy', '--port=%d' % self.port]
	    self.p = subprocess.Popen(args=cmd)
	    self.started = True
	    # Fixed pause after spawning the proxy, before callers start using it.
	    time.sleep(2)
	    print('..Started')

	  def get_port(self):
	    """Returns the local port the proxy was configured with."""
	    return self.port

	  def is_started(self):
	    """Returns True once start() has succeeded. This is a method: call it."""
	    return self.started

	  def __del__(self):
	    """Stops the proxy process, if one was started."""
	    # getattr guards against a partially constructed instance.
	    proxy_process = getattr(self, 'p', None)
	    if proxy_process is None:
	      return
	    print('Shutting down Kubernetes proxy..')
	    # Only signal a child that is still running; signalling a pid that has
	    # already exited can fail or hit an unrelated process.
	    if proxy_process.poll() is None:
	      proxy_process.kill()
	      # Reap the child so it does not linger as a zombie.
	      proxy_process.wait()
	    self.p = None


	class TestSettings:
	  """Options controlling how this test driver itself behaves."""

	  def __init__(self, build_docker_image, test_poll_interval_secs,
	               test_duration_secs, kubernetes_proxy_port):
	    """Stores each argument unchanged under an attribute of the same name."""
	    self.build_docker_image = build_docker_image
	    self.test_poll_interval_secs = test_poll_interval_secs
	    self.test_duration_secs = test_duration_secs
	    self.kubernetes_proxy_port = kubernetes_proxy_port


	class GkeSettings:
	  """GKE project and docker image identification."""

	  def __init__(self, project_id, docker_image_name):
	    """Stores both values and derives the gcr.io registry tag from them."""
	    self.project_id = project_id
	    self.docker_image_name = docker_image_name
	    # Registry tag has the form gcr.io/<project_id>/<docker_image_name>.
	    self.tag_name = 'gcr.io/%s/%s' % (project_id, docker_image_name)


	class BigQuerySettings:
	  """Identifiers of the BigQuery run, dataset and tables used by a test run."""

	  def __init__(self, run_id, dataset_id, summary_table_id, qps_table_id):
	    """Stores each argument unchanged under an attribute of the same name."""
	    self.run_id = run_id
	    self.dataset_id = dataset_id
	    self.summary_table_id = summary_table_id
	    self.qps_table_id = qps_table_id


	class StressServerSettings:
	  """Pod name and port of the stress test server."""

	  def __init__(self, server_pod_name, server_port):
	    """Stores both arguments unchanged under same-named attributes."""
	    self.server_pod_name = server_pod_name
	    self.server_port = server_port


	class StressClientSettings:
	  """Settings for the stress test clients, plus values derived from them."""

	  def __init__(self, num_clients, client_pod_name_prefix, server_pod_name,
	               server_port, metrics_port, metrics_collection_interval_secs,
	               stress_client_poll_interval_secs, num_channels_per_server,
	               num_stubs_per_channel, test_cases_str):
	    """Stores every argument and computes the derived attributes.

	    Derived attributes:
	      server_addresses: '<server_pod_name>.default.svc.cluster.local:<port>'.
	      client_pod_names_list: '<prefix>-1' .. '<prefix>-<num_clients>'.
	    """
	    self.num_clients = num_clients
	    self.client_pod_name_prefix = client_pod_name_prefix
	    self.server_pod_name = server_pod_name
	    self.server_port = server_port
	    self.metrics_port = metrics_port
	    self.metrics_collection_interval_secs = metrics_collection_interval_secs
	    self.stress_client_poll_interval_secs = stress_client_poll_interval_secs
	    self.num_channels_per_server = num_channels_per_server
	    self.num_stubs_per_channel = num_stubs_per_channel
	    self.test_cases_str = test_cases_str

	    # == Derived properties ==
	    # Note: Client can accept a list of server addresses (a comma separated
	    # list of 'server_name:server_port'). In this case, we only have one
	    # server address to pass
	    self.server_addresses = '%s.default.svc.cluster.local:%d' % (
	        server_pod_name, server_port)
	    # Pod names are numbered starting at 1.
	    self.client_pod_names_list = ['%s-%d' % (client_pod_name_prefix, i)
	                                  for i in range(1, num_clients + 1)]


	def _build_docker_image(image_name, tag_name):
	  """Builds the docker image and tags it for the GKE repository.

	  Runs tools/jenkins/build_interop_stress_image.sh (relative to the current
	  working directory), passing the settings through the child's environment.

	  Args:
	    image_name: Name of the docker image; exported as INTEROP_IMAGE.
	    tag_name: Repository tag; exported as INTEROP_IMAGE_REPOSITORY_TAG.

	  Returns:
	    True if the build script exited with status 0. False if it exited with
	    another status or could not be launched at all.
	  """
	  print('Building docker image: %s' % image_name)
	  # Pass the settings through a private copy of the environment so that this
	  # process's own os.environ is not modified as a side effect.
	  build_env = dict(os.environ)
	  build_env['INTEROP_IMAGE'] = image_name
	  build_env['INTEROP_IMAGE_REPOSITORY_TAG'] = tag_name
	  # Note that 'BASE_NAME' HAS to be 'grpc_interop_stress_cxx' since the script
	  # build_interop_stress_image.sh invokes the following script:
	  # tools/dockerfile/$BASE_NAME/build_interop_stress.sh
	  build_env['BASE_NAME'] = 'grpc_interop_stress_cxx'
	  cmd = ['tools/jenkins/build_interop_stress_image.sh']
	  try:
	    retcode = subprocess.call(args=cmd, env=build_env)
	  except OSError as e:
	    # Script missing or not executable: report failure like a failed build.
	    print('Error in building docker image: cannot run %s (%s)' % (cmd[0], e))
	    return False
	  if retcode != 0:
	    print('Error in building docker image')
	    return False
	  return True


	def _push_docker_image_to_gke_registry(docker_tag_name):
	  """Executes 'gcloud docker push <docker_tag_name>' to push the image.

	  Args:
	    docker_tag_name: The registry tag of the image to push.

	  Returns:
	    True if the gcloud command exited with status 0. False if it exited with
	    another status or could not be launched at all.
	  """
	  cmd = ['gcloud', 'docker', 'push', docker_tag_name]
	  print('Pushing %s to GKE registry..' % docker_tag_name)
	  try:
	    retcode = subprocess.call(args=cmd)
	  except OSError as e:
	    # gcloud is not installed or not on PATH: report it as a failed push.
	    print('Error in pushing docker image %s to the GKE registry (%s)' %
	          (docker_tag_name, e))
	    return False
	  if retcode != 0:
	    print('Error in pushing docker image %s to the GKE registry' %
	          docker_tag_name)
	    return False
	  return True


	def _launch_server(gke_settings, stress_server_settings, bq_settings,
	                   kubernetes_proxy):
	  """Launches a stress test server instance in the GKE cluster.

	  Args:
	    gke_settings: Supplies project_id and the docker image tag_name.
	    stress_server_settings: Supplies server_pod_name and server_port.
	    bq_settings: Supplies run_id, dataset_id and the two table ids that are
	      handed to the pod.
	    kubernetes_proxy: A started KubernetesProxy; its port is used to reach
	      the Kubernetes API on localhost.

	  Returns:
	    False if the proxy has not been started; otherwise whatever
	    kubernetes_api.create_pod_and_service() returns.
	  """
	  # is_started is a method and must be called: the bound method object itself
	  # is always truthy, which would silently disable this guard.
	  if not kubernetes_proxy.is_started():
	    print('Kubernetes proxy must be started before calling this function')
	    return False

	  # This is the wrapper script that is run in the container. This script runs
	  # the actual stress test server
	  server_cmd_list = ['/var/local/git/grpc/tools/gcp/stress_test/run_server.py']

	  # run_server.py does not take any args from the command line. The args are
	  # instead passed via environment variables (see server_env below)
	  server_arg_list = []

	  # The parameters to the script run_server.py are injected into the container
	  # via environment variables
	  server_env = {
	      'STRESS_TEST_IMAGE_TYPE': 'SERVER',
	      'STRESS_TEST_IMAGE': '/var/local/git/grpc/bins/opt/interop_server',
	      'STRESS_TEST_ARGS_STR': '--port=%s' % stress_server_settings.server_port,
	      'RUN_ID': bq_settings.run_id,
	      'POD_NAME': stress_server_settings.server_pod_name,
	      'GCP_PROJECT_ID': gke_settings.project_id,
	      'DATASET_ID': bq_settings.dataset_id,
	      'SUMMARY_TABLE_ID': bq_settings.summary_table_id,
	      'QPS_TABLE_ID': bq_settings.qps_table_id
	  }

	  # Launch Server
	  is_success = kubernetes_api.create_pod_and_service(
	      'localhost',
	      kubernetes_proxy.get_port(),
	      'default',  # Use 'default' namespace
	      stress_server_settings.server_pod_name,
	      gke_settings.tag_name,
	      [stress_server_settings.server_port],  # Port that should be exposed
	      server_cmd_list,
	      server_arg_list,
	      server_env,
	      True  # Headless = True for server. Since we want DNS records to be created by GKE
	  )

	  return is_success


	def _launch_client(gke_settings, stress_server_settings, stress_client_settings,
	bq_settings, kubernetes_proxy):
	""" Launches a configurable number of stress test clients on GKE cluster """
	if not kubernetes_proxy.is_started:
	print 'Kubernetes proxy must be started before calling this function'
	return False

	stress_client_arg_list = [
	'--server_addresses=%s' % stress_client_settings.server_addresses,
	'--test_cases=%s' % stress_client_settings.test_cases_str,
	'--num_stubs_per_channel=%d' %
	stress_client_settings.num_stubs_per_channel
	]

	# This is the wrapper script that is run in the container. This script runs
	# the actual stress client
	client_cmd_list = ['/var/local/git/grpc/tools/gcp/stress_test/run_client.py']

	# run_client.py takes no args. All args are passed as env variables (see
	# client_env)
	client_arg_list = []

	metrics_server_address = 'localhost:%d' % stress_client_settings.metrics_port
	metrics_client_arg_list = [
	'--metrics_server_address=%s' % metrics_server_address,
	'--total_only=true'
	]

	# The parameters to the script run_client.py are injected into the container
	# via environment variables
	client_env = {
	'STRESS_TEST_IMAGE_TYPE': 'CLIENT',
	'STRESS_TEST_IMAGE': '/var/local/git/grpc/bins/opt/stress_test',
	'STRESS_TEST_ARGS_STR': ' '.join(stress_client_arg_list),
	'METRICS_CLIENT_IMAGE': '/var/local/git/grpc/bins/opt/metrics_client',
	'METRICS_CLIENT_ARGS_STR': ' '.join(metrics_client_arg_list),
	'RUN_ID': bq_settings.run_id,
	'POLL_INTERVAL_SECS':
	str(stress_client_settings.stress_client_poll_interval_secs),
	'GCP_PROJECT_ID': gke_settings.project_id,
	'DATASET_ID': bq_settings.dataset_id,
	'SUMMARY_TABLE_ID': bq_settings.summary_table_id,
	'QPS_TABLE_ID': bq_settings.qps_table_id
	}

	for pod_name in stress_client_settings.client_pod_names_list:
	client_env['POD_NAME'] = pod_name
	is_success = kubernetes_api.create_pod_and_service(
	'localhost', # Since proxy is running on localhost
	kubernetes_proxy.get_port(),
	'default', # default namespace
	pod_name,
	gke_settings.tag_name,
	[stress_client_settings.metrics_port
	], # Client pods expose metrics port
	client_cmd_list,
	client_arg_list,
	client_env,
	False # Client is not a headless service
	)
	if not is_success:
	print 'Error in launching client %s' % pod_name
	return False

	return True


	def _launch_server_and_client(gke_settings, stress_server_settings,
	stress_client_settings, bq_settings,
	kubernetes_proxy_port):
	# Start kubernetes proxy
	print 'Kubernetes proxy'
	kubernetes_proxy = KubernetesProxy(kubernetes_proxy_port)
	kubernetes_proxy.start()

	print 'Launching server..'
	is_success = _launch_server(gke_settings, stress_server_settings, bq_settings,
	kubernetes_proxy)
	if not is_success:
	print 'Error in launching server'
	return False

	# Server takes a while to start.
	# TODO(sree) Use Kubernetes API to query the status of the server instead of
	# sleeping
	print 'Waiting for %s seconds for the server to start...' % _GKE_IMAGE_WARMUP_WAIT_SECS
	time.sleep(_GKE_IMAGE_WARMUP_WAIT_SECS)

	# Launch client
	client_pod_name_prefix = 'stress-client'
	is_success = _launch_client(gke_settings, stress_server_settings,
	stress_client_settings, bq_settings,
	kubernetes_proxy)

	if not is_success:
	print 'Error in launching client(s)'
	return False

	print 'Waiting for %s seconds for the client images to start...' % _GKE_IMAGE_WARMUP_WAIT_SECS
	time.sleep(_GKE_IMAGE_WARMUP_WAIT_SECS)
	return True


	def _delete_server_and_client(stress_server_settings, stress_client_settings,
	kubernetes_proxy_port):
	kubernetes_proxy = KubernetesProxy(kubernetes_proxy_port)
	kubernetes_proxy.start()

	# Delete clients first
	is_success = True
	for pod_name in stress_client_settings.client_pod_names_list:
	is_success = kubernetes_api.delete_pod_and_service(
	'localhost', kubernetes_proxy_port, 'default', pod_name)
	if not is_success:
	return False

	# Delete server
	is_success = kubernetes_api.delete_pod_and_service(
	'localhost', kubernetes_proxy_port, 'default',
	stress_server_settings.server_pod_name)
	return is_success


	def run_test_main(test_settings, gke_settings, stress_server_settings,
	stress_client_clients):
	is_success = True

	if test_settings.build_docker_image:
	is_success = _build_docker_image(gke_settings.docker_image_name,
	gke_settings.tag_name)
	if not is_success:
	return False

	is_success = _push_docker_image_to_gke_registry(gke_settings.tag_name)
	if not is_success:
	return False

	# Create a unique id for this run (Note: Using timestamp instead of UUID to
	# make it easier to deduce the date/time of the run just by looking at the run
	# run id. This is useful in debugging when looking at records in Biq query)
	run_id = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
	dataset_id = '%s_%s' % (_DATASET_ID_PREFIX, run_id)

	# Big Query settings (common for both Stress Server and Client)
	bq_settings = BigQuerySettings(run_id, dataset_id, _SUMMARY_TABLE_ID,
	_QPS_TABLE_ID)

	bq_helper = BigQueryHelper(run_id, '', '', args.project_id, dataset_id,
	_SUMMARY_TABLE_ID, _QPS_TABLE_ID)
	bq_helper.initialize()

	try:
	is_success = _launch_server_and_client(gke_settings, stress_server_settings,
	stress_client_settings, bq_settings,
	test_settings.kubernetes_proxy_port)
	if not is_success:
	return False

	start_time = datetime.datetime.now()
	end_time = start_time + datetime.timedelta(
	seconds=test_settings.test_duration_secs)
	print 'Running the test until %s' % end_time.isoformat()

	while True:
	if datetime.datetime.now() > end_time:
	print 'Test was run for %d seconds' % test_settings.test_duration_secs
	break

	# Check if either stress server or clients have failed
	if bq_helper.check_if_any_tests_failed():
	is_success = False
	print 'Some tests failed.'
	break

	# Things seem to be running fine. Wait until next poll time to check the
	# status
	print 'Sleeping for %d seconds..' % test_settings.test_poll_interval_secs
	time.sleep(test_settings.test_poll_interval_secs)

	# Print BiqQuery tables
	bq_helper.print_summary_records()
	bq_helper.print_qps_records()

	finally:
	# If is_success is False at this point, it means that the stress tests were
	# started successfully but failed while running the tests. In this case we
	# do should not delete the pods (since they contain all the failure
	# information)
	if is_success:
	_delete_server_and_client(stress_server_settings, stress_client_settings,
	test_settings.kubernetes_proxy_port)

	return is_success


	# Command line interface of this script. All the flags other than --project_id
	# are optional; their defaults are shown in the --help output (courtesy of
	# ArgumentDefaultsHelpFormatter).
	argp = argparse.ArgumentParser(
	    description='Launch stress tests in GKE',
	    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
	argp.add_argument('--project_id',
	                  required=True,
	                  help='The Google Cloud Platform Project Id')
	argp.add_argument('--num_clients',
	                  default=1,
	                  type=int,
	                  help='Number of client instances to start')
	argp.add_argument('--docker_image_name',
	                  default=_DEFAULT_DOCKER_IMAGE_NAME,
	                  help='The name of the docker image containing stress client '
	                  'and stress servers')

	# --build_docker_image / --do_not_build_docker_image toggle the same
	# destination; building (and pushing) the image is the default.
	argp.add_argument('--build_docker_image',
	                  dest='build_docker_image',
	                  action='store_true',
	                  help='Build a docker image and push to Google Container '
	                  'Registry')
	argp.add_argument('--do_not_build_docker_image',
	                  dest='build_docker_image',
	                  action='store_false',
	                  help='Do not build and push docker image to Google Container '
	                  'Registry')
	argp.set_defaults(build_docker_image=True)

	argp.add_argument('--test_poll_interval_secs',
	                  default=_DEFAULT_TEST_POLL_INTERVAL_SECS,
	                  type=int,
	                  help='How frequently this script should monitor the '
	                  'health of stress clients and servers running in the GKE '
	                  'cluster')
	argp.add_argument('--test_duration_secs',
	                  default=_DEFAULT_TEST_DURATION_SECS,
	                  type=int,
	                  help='How long should this test be run')
	argp.add_argument('--kubernetes_proxy_port',
	                  default=_DEFAULT_KUBERNETES_PROXY_PORT,
	                  type=int,
	                  help='The port on which the kubernetes proxy (on localhost)'
	                  ' is started')
	argp.add_argument('--stress_server_port',
	                  default=_DEFAULT_STRESS_SERVER_PORT,
	                  type=int,
	                  help='The port on which the stress server (in GKE '
	                  'containers) listens')
	argp.add_argument('--stress_client_metrics_port',
	                  default=_DEFAULT_METRICS_PORT,
	                  type=int,
	                  help='The port on which the stress clients (in GKE '
	                  'containers) expose metrics')
	argp.add_argument('--stress_client_poll_interval_secs',
	                  default=_DEFAULT_STRESS_CLIENT_POLL_INTERVAL_SECS,
	                  type=int,
	                  help='How frequently the stress client wrapper script'
	                  ' running inside GKE should monitor the health of the actual'
	                  ' stress client process and upload the metrics to BigQuery')
	argp.add_argument('--stress_client_metrics_collection_interval_secs',
	                  default=_DEFAULT_METRICS_COLLECTION_INTERVAL_SECS,
	                  type=int,
	                  help='How frequently should metrics be collected in-memory on'
	                  ' the stress clients (running inside GKE containers). Note '
	                  'that this is NOT the same as the upload-to-BigQuery '
	                  'frequency. The metrics upload frequency is controlled by the'
	                  ' --stress_client_poll_interval_secs flag')
	argp.add_argument('--stress_client_num_channels_per_server',
	                  default=_DEFAULT_NUM_CHANNELS_PER_SERVER,
	                  type=int,
	                  help='The number of channels created to each server from a '
	                  'stress client')
	argp.add_argument('--stress_client_num_stubs_per_channel',
	                  default=_DEFAULT_NUM_STUBS_PER_CHANNEL,
	                  type=int,
	                  help='The number of stubs created per channel. This number '
	                  'indicates the max number of RPCs that can be made in '
	                  'parallel on each channel at any given time')
	argp.add_argument('--stress_client_test_cases',
	                  default=_DEFAULT_TEST_CASES_STR,
	                  help='List of test cases (with weights) to be executed by the'
	                  ' stress test client. The list is in the following format:\n'
	                  ' <testcase_1:w_1>,<testcase_2:w_2>..<testcase_n:w_n>\n'
	                  ' (Note: The weights do not have to add up to 100)')

	if __name__ == '__main__':
	  # Script entry point: parse the command line flags, bundle them into the
	  # settings objects used by the rest of this file and run the stress test.
	  args = argp.parse_args()

	  test_settings = TestSettings(
	      args.build_docker_image, args.test_poll_interval_secs,
	      args.test_duration_secs, args.kubernetes_proxy_port)

	  gke_settings = GkeSettings(args.project_id, args.docker_image_name)

	  stress_server_settings = StressServerSettings(_SERVER_POD_NAME,
	                                                args.stress_server_port)
	  stress_client_settings = StressClientSettings(
	      args.num_clients, _CLIENT_POD_NAME_PREFIX, _SERVER_POD_NAME,
	      args.stress_server_port, args.stress_client_metrics_port,
	      args.stress_client_metrics_collection_interval_secs,
	      args.stress_client_poll_interval_secs,
	      args.stress_client_num_channels_per_server,
	      args.stress_client_num_stubs_per_channel, args.stress_client_test_cases)

	  is_success = run_test_main(test_settings, gke_settings,
	                             stress_server_settings, stress_client_settings)

	  # Reflect the outcome in the exit status so that callers (shell scripts, CI
	  # jobs) can detect a failed run instead of always seeing exit code 0.
	  sys.exit(0 if is_success else 1)