blob: 03b684b318c38f2f179588db727844e8a08c5b84 [file] [log] [blame]
Siddharth Shukla8e64d902017-03-12 19:50:18 +01001#!/usr/bin/env python
Jan Tattermusch7897ae92017-06-07 22:57:36 +02002# Copyright 2016 gRPC authors.
Jan Tattermuschb2758442016-03-28 09:32:20 -07003#
Jan Tattermusch7897ae92017-06-07 22:57:36 +02004# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
Jan Tattermuschb2758442016-03-28 09:32:20 -07007#
Jan Tattermusch7897ae92017-06-07 22:57:36 +02008# http://www.apache.org/licenses/LICENSE-2.0
Jan Tattermuschb2758442016-03-28 09:32:20 -07009#
Jan Tattermusch7897ae92017-06-07 22:57:36 +020010# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
Jan Tattermuschb2758442016-03-28 09:32:20 -070015"""Run performance tests locally or remotely."""
16
siddharthshukla0589e532016-07-07 16:08:01 +020017from __future__ import print_function
18
Jan Tattermuschb2758442016-03-28 09:32:20 -070019import argparse
Craig Tilleraccf16b2016-09-15 09:08:32 -070020import collections
Jan Tattermuschbb1a4532016-03-30 18:04:01 -070021import itertools
Craig Tiller0bda0b32016-03-03 12:51:53 -080022import json
Jan Tattermuschb2758442016-03-28 09:32:20 -070023import multiprocessing
24import os
Craig Tiller0bda0b32016-03-03 12:51:53 -080025import pipes
Jan Tattermusch38becc22016-04-14 08:00:35 -070026import re
Jan Tattermuschb2758442016-03-28 09:32:20 -070027import subprocess
28import sys
29import tempfile
30import time
Jan Tattermuschee9032c2016-04-14 08:35:51 -070031import traceback
Jan Tattermuschb2758442016-03-28 09:32:20 -070032import uuid
Siddharth Shuklad194f592017-03-11 19:12:43 +010033import six
Jan Tattermusch5c79a312016-12-20 11:02:50 +010034
35import performance.scenario_config as scenario_config
36import python_utils.jobset as jobset
37import python_utils.report_utils as report_utils
Jan Tattermuschb2758442016-03-28 09:32:20 -070038
# Change the working directory to the repository root so that all relative
# paths used below (tools/run_tests/...) resolve correctly.
_ROOT = os.path.abspath(os.path.join(os.path.dirname(sys.argv[0]), '../..'))
os.chdir(_ROOT)

# Username used when SSH-ing into remote hosts; may be overridden at runtime
# by the -u/--remote_host_username flag parsed in main().
_REMOTE_HOST_USERNAME = 'jenkins'
43
44
class QpsWorkerJob:
    """Encapsulates a qps worker server job."""

    def __init__(self, spec, language, host_and_port, perf_file_base_name=None):
        # Public attributes consumed by the driver logic elsewhere.
        self.language = language
        self.host_and_port = host_and_port
        self.perf_file_base_name = perf_file_base_name
        # Internal state: the jobspec to launch and the running job handle.
        self._spec = spec
        self._job = None

    def start(self):
        """Launches the worker process described by the stored jobspec."""
        self._job = jobset.Job(
            self._spec, newline_on_success=True, travis=True, add_env={})

    def is_running(self):
        """Polls a job and returns True if given job is still running."""
        return self._job is not None and self._job.state() == jobset._RUNNING

    def kill(self):
        """Terminates the worker process, if one was started."""
        job, self._job = self._job, None
        if job:
            job.kill()
Jan Tattermuschb2758442016-03-28 09:32:20 -070067
68
def create_qpsworker_job(language,
                         shortname=None,
                         port=10000,
                         remote_host=None,
                         perf_cmd=None):
    """Builds a QpsWorkerJob for one worker, optionally under perf and/or SSH."""
    cmdline = language.worker_cmdline() + ['--driver_port=%s' % port]

    host_and_port = ('%s:%s' % (remote_host, port)
                     if remote_host else 'localhost:%s' % port)

    perf_file_base_name = None
    if perf_cmd:
        perf_file_base_name = '%s-%s' % (host_and_port, shortname)
        # specify -o output file so perf.data gets collected when worker stopped
        cmdline = perf_cmd + [
            '-o', '%s-perf.data' % perf_file_base_name
        ] + cmdline

    worker_timeout = 3 * 60
    if remote_host:
        user_at_host = '%s@%s' % (_REMOTE_HOST_USERNAME, remote_host)
        # Bound the remote process lifetime slightly beyond the job timeout,
        # then wrap everything in an ssh invocation.
        remote_cmdline = ['timeout', '%s' % (worker_timeout + 30)] + cmdline
        cmdline = [
            'ssh',
            str(user_at_host),
            'cd ~/performance_workspace/grpc/ && python tools/run_tests/start_port_server.py && %s'
            % ' '.join(remote_cmdline)
        ]

    jobspec = jobset.JobSpec(
        cmdline=cmdline,
        shortname=shortname,
        timeout_seconds=worker_timeout,  # workers get restarted after each scenario
        verbose_success=True)
    return QpsWorkerJob(jobspec, language, host_and_port, perf_file_base_name)
Jan Tattermuschb2758442016-03-28 09:32:20 -0700106
107
def create_scenario_jobspec(scenario_json,
                            workers,
                            remote_host=None,
                            bq_result_table=None,
                            server_cpu_load=0):
    """Runs one scenario using QPS driver."""
    # setting QPS_WORKERS env variable here makes sure it works with SSH too.
    parts = ['QPS_WORKERS="%s" ' % ','.join(workers)]
    if bq_result_table:
        parts.append('BQ_RESULT_TABLE="%s" ' % bq_result_table)
    parts.append('tools/run_tests/performance/run_qps_driver.sh ')
    parts.append('--scenarios_json=%s ' % pipes.quote(
        json.dumps({
            'scenarios': [scenario_json]
        })))
    parts.append('--scenario_result_file=scenario_result.json ')
    if server_cpu_load != 0:
        parts.append(
            '--search_param=offered_load --initial_search_value=1000 --targeted_cpu_load=%d --stride=500 --error_tolerance=0.01'
            % server_cpu_load)
    cmd = ''.join(parts)
    if remote_host:
        user_at_host = '%s@%s' % (_REMOTE_HOST_USERNAME, remote_host)
        cmd = 'ssh %s "cd ~/performance_workspace/grpc/ && "%s' % (
            user_at_host, pipes.quote(cmd))

    return jobset.JobSpec(
        cmdline=[cmd],
        shortname='qps_json_driver.%s' % scenario_json['name'],
        timeout_seconds=12 * 60,
        shell=True,
        verbose_success=True)
Craig Tiller0bda0b32016-03-03 12:51:53 -0800137
138
def create_quit_jobspec(workers, remote_host=None):
    """Runs quit using QPS driver."""
    # setting QPS_WORKERS env variable here makes sure it works with SSH too.
    worker_addresses = ','.join(w.host_and_port for w in workers)
    cmd = 'QPS_WORKERS="%s" bins/opt/qps_json_driver --quit' % worker_addresses
    if remote_host:
        user_at_host = '%s@%s' % (_REMOTE_HOST_USERNAME, remote_host)
        cmd = 'ssh %s "cd ~/performance_workspace/grpc/ && "%s' % (
            user_at_host, pipes.quote(cmd))

    return jobset.JobSpec(
        cmdline=[cmd],
        shortname='qps_json_driver.quit',
        timeout_seconds=3 * 60,
        shell=True,
        verbose_success=True)
Jan Tattermuschb2758442016-03-28 09:32:20 -0700155
156
def create_netperf_jobspec(server_host='localhost',
                           client_host=None,
                           bq_result_table=None):
    """Runs netperf benchmark."""
    env_parts = ['NETPERF_SERVER_HOST="%s" ' % server_host]
    if bq_result_table:
        env_parts.append('BQ_RESULT_TABLE="%s" ' % bq_result_table)
    if client_host:
        # If netperf is running remotely, the env variables populated by Jenkins
        # won't be available on the client, but we need them for uploading results
        # to BigQuery.
        for var_name in ('JOB_NAME', 'BUILD_NUMBER'):
            var_value = os.getenv(var_name)
            if var_value:
                env_parts.append('%s="%s" ' % (var_name, var_value))

    cmd = ''.join(env_parts) + 'tools/run_tests/performance/run_netperf.sh'
    if client_host:
        user_at_host = '%s@%s' % (_REMOTE_HOST_USERNAME, client_host)
        cmd = 'ssh %s "cd ~/performance_workspace/grpc/ && "%s' % (
            user_at_host, pipes.quote(cmd))

    return jobset.JobSpec(
        cmdline=[cmd],
        shortname='netperf',
        timeout_seconds=60,
        shell=True,
        verbose_success=True)
Jan Tattermusch4de2c322016-05-10 14:33:07 -0700187
188
def archive_repo(languages):
    """Archives local version of repo including submodules."""
    cmdline = ['tar', '-cf', '../grpc.tar', '../grpc/']
    # Java and Go workers live in sibling repositories; include them if needed.
    if 'java' in languages:
        cmdline.append('../grpc-java')
    if 'go' in languages:
        cmdline.append('../grpc-go')

    archive_job = jobset.JobSpec(
        cmdline=cmdline, shortname='archive_repo', timeout_seconds=3 * 60)

    jobset.message('START', 'Archiving local repository.', do_newline=True)
    num_failures, _ = jobset.run(
        [archive_job], newline_on_success=True, maxjobs=1)
    if num_failures != 0:
        jobset.message(
            'FAILED', 'Failed to archive local repository.', do_newline=True)
        sys.exit(1)
    jobset.message(
        'SUCCESS',
        'Archive with local repository created successfully.',
        do_newline=True)
Jan Tattermuschb2758442016-03-28 09:32:20 -0700212
213
def prepare_remote_hosts(hosts, prepare_local=False):
    """Prepares remote hosts (and maybe prepare localhost as well)."""
    prepare_timeout = 10 * 60
    prepare_jobs = [
        jobset.JobSpec(
            cmdline=['tools/run_tests/performance/remote_host_prepare.sh'],
            shortname='remote_host_prepare.%s' % host,
            environ={'USER_AT_HOST': '%s@%s' % (_REMOTE_HOST_USERNAME, host)},
            timeout_seconds=prepare_timeout) for host in hosts
    ]
    if prepare_local:
        # Prepare localhost as well
        prepare_jobs.append(
            jobset.JobSpec(
                cmdline=['tools/run_tests/performance/kill_workers.sh'],
                shortname='local_prepare',
                timeout_seconds=prepare_timeout))
    jobset.message('START', 'Preparing hosts.', do_newline=True)
    num_failures, _ = jobset.run(
        prepare_jobs, newline_on_success=True, maxjobs=10)
    if num_failures != 0:
        jobset.message(
            'FAILED', 'Failed to prepare remote hosts.', do_newline=True)
        sys.exit(1)
    jobset.message(
        'SUCCESS', 'Prepare step completed successfully.', do_newline=True)
Jan Tattermuschb2758442016-03-28 09:32:20 -0700243
244
def build_on_remote_hosts(hosts, languages=None, build_local=False):
    """Builds performance worker on remote hosts (and maybe also locally).

    Args:
      hosts: iterable of remote host names to build on.
      languages: list of language names to pass to the build scripts;
        defaults to all languages known to scenario_config.
      build_local: if True, additionally build the workers on localhost.

    Exits the process with status 1 if any build job fails.
    """
    if languages is None:
        # Evaluate the default at call time. The previous default,
        # scenario_config.LANGUAGES.keys(), was a dict view bound at import
        # time, and under Python 3 concatenating list + dict_keys below
        # raises TypeError.
        languages = list(scenario_config.LANGUAGES.keys())
    build_timeout = 15 * 60
    # Kokoro VMs (which are local only) do not have caching, so they need more time to build
    local_build_timeout = 30 * 60
    build_jobs = []
    for host in hosts:
        user_at_host = '%s@%s' % (_REMOTE_HOST_USERNAME, host)
        build_jobs.append(
            jobset.JobSpec(
                cmdline=['tools/run_tests/performance/remote_host_build.sh'] +
                languages,
                shortname='remote_host_build.%s' % host,
                environ={'USER_AT_HOST': user_at_host,
                         'CONFIG': 'opt'},
                timeout_seconds=build_timeout))
    if build_local:
        # Build locally as well
        build_jobs.append(
            jobset.JobSpec(
                cmdline=['tools/run_tests/performance/build_performance.sh'] +
                languages,
                shortname='local_build',
                environ={'CONFIG': 'opt'},
                timeout_seconds=local_build_timeout))
    jobset.message('START', 'Building.', do_newline=True)
    num_failures, _ = jobset.run(
        build_jobs, newline_on_success=True, maxjobs=10)
    if num_failures == 0:
        jobset.message('SUCCESS', 'Built successfully.', do_newline=True)
    else:
        jobset.message('FAILED', 'Build failed.', do_newline=True)
        sys.exit(1)
Jan Tattermuschb2758442016-03-28 09:32:20 -0700280
281
def create_qpsworkers(languages, worker_hosts, perf_cmd=None):
    """Creates QPS workers (but does not start them)."""
    if not worker_hosts:
        # run two workers locally (for each language)
        workers = [(None, 10000), (None, 10010)]
    elif len(worker_hosts) == 1:
        # run two workers on the remote host (for each language)
        workers = [(worker_hosts[0], 10000), (worker_hosts[0], 10010)]
    else:
        # run one worker per each remote host (for each language)
        workers = [(worker_host, 10000) for worker_host in worker_hosts]

    jobs = []
    for language in languages:
        for worker_idx, (host, base_port) in enumerate(workers):
            jobs.append(
                create_qpsworker_job(
                    language,
                    shortname='qps_worker_%s_%s' % (language, worker_idx),
                    port=base_port + language.worker_port_offset(),
                    remote_host=host,
                    perf_cmd=perf_cmd))
    return jobs
Jan Tattermuschb2758442016-03-28 09:32:20 -0700303
304
def perf_report_processor_job(worker_host, perf_base_name, output_filename,
                              flame_graph_reports):
    """Creates a job that fetches and processes perf data for one worker host.

    Args:
      worker_host: host the worker ran on; 'localhost' selects the local script.
      perf_base_name: base name of the perf.data file produced by the worker.
      output_filename: base name for the generated report/flame graph.
      flame_graph_reports: directory where reports are written.

    Returns:
      A jobset.JobSpec running the appropriate processing shell script.
    """
    print('Creating perf report collection job for %s' % worker_host)
    if worker_host != 'localhost':
        user_at_host = "%s@%s" % (_REMOTE_HOST_USERNAME, worker_host)
        # Note the space before the script path: without it the path would be
        # fused into the PERF_BASE_NAME value and the script never executed.
        cmd = "USER_AT_HOST=%s OUTPUT_FILENAME=%s OUTPUT_DIR=%s PERF_BASE_NAME=%s tools/run_tests/performance/process_remote_perf_flamegraphs.sh" % (
            user_at_host, output_filename, flame_graph_reports, perf_base_name)
    else:
        cmd = "OUTPUT_FILENAME=%s OUTPUT_DIR=%s PERF_BASE_NAME=%s tools/run_tests/performance/process_local_perf_flamegraphs.sh" % (
            output_filename, flame_graph_reports, perf_base_name)

    return jobset.JobSpec(
        cmdline=cmd,
        timeout_seconds=3 * 60,
        shell=True,
        verbose_success=True,
        shortname='process perf report')
Alexander Polcyn9f08d112016-10-24 12:25:02 -0700323
324
Craig Tiller677966a2016-09-26 07:37:28 -0700325Scenario = collections.namedtuple('Scenario', 'jobspec workers name')
Craig Tillerc1b54f22016-09-15 08:57:14 -0700326
327
def create_scenarios(languages,
                     workers_by_lang,
                     remote_host=None,
                     regex='.*',
                     category='all',
                     bq_result_table=None,
                     netperf=False,
                     netperf_hosts=None,
                     server_cpu_load=0):
    """Create jobspecs for scenarios to run.

    Args:
      languages: iterable of language objects whose scenarios to consider.
      workers_by_lang: dict mapping str(language) -> list of worker jobs.
      remote_host: host to run the QPS driver on (None = locally).
      regex: only scenarios whose name matches this regex are included.
      category: scenario category to run ('all' selects every category).
      bq_result_table: BigQuery "dataset.table" to upload results to.
      netperf: if True, prepend a netperf baseline scenario.
      netperf_hosts: hosts for the netperf server/client; defaults to [].
      server_cpu_load: targeted server cpu load (0 disables the search).

    Returns:
      List of Scenario namedtuples.
    """
    # Use a None sentinel instead of a mutable [] default argument, which
    # would be shared across calls.
    if netperf_hosts is None:
        netperf_hosts = []
    scenarios = []
    _NO_WORKERS = []

    if netperf:
        # Choose netperf endpoints: both local, both on the single remote
        # host, or the first two remote hosts.
        if not netperf_hosts:
            netperf_server = 'localhost'
            netperf_client = None
        elif len(netperf_hosts) == 1:
            netperf_server = netperf_hosts[0]
            netperf_client = netperf_hosts[0]
        else:
            netperf_server = netperf_hosts[0]
            netperf_client = netperf_hosts[1]
        scenarios.append(
            Scenario(
                create_netperf_jobspec(
                    server_host=netperf_server,
                    client_host=netperf_client,
                    bq_result_table=bq_result_table), _NO_WORKERS, 'netperf'))

    for language in languages:
        for scenario_json in language.scenarios():
            if re.search(regex, scenario_json['name']):
                categories = scenario_json.get('CATEGORIES',
                                               ['scalable', 'smoketest'])
                if category in categories or category == 'all':
                    workers = workers_by_lang[str(language)][:]
                    # 'SERVER_LANGUAGE' is an indicator for this script to pick
                    # a server in different language.
                    custom_server_lang = scenario_json.get('SERVER_LANGUAGE',
                                                           None)
                    custom_client_lang = scenario_json.get('CLIENT_LANGUAGE',
                                                           None)
                    scenario_json = scenario_config.remove_nonproto_fields(
                        scenario_json)
                    if custom_server_lang and custom_client_lang:
                        # Space added before 'in': implicit string literal
                        # concatenation previously produced "...SERVER_LANGUAGEin".
                        raise Exception(
                            'Cannot set both custom CLIENT_LANGUAGE and SERVER_LANGUAGE'
                            ' in the same scenario')
                    if custom_server_lang:
                        if not workers_by_lang.get(custom_server_lang, []):
                            print('Warning: Skipping scenario %s as' %
                                  scenario_json['name'])
                            print(
                                'SERVER_LANGUAGE is set to %s yet the language has '
                                'not been selected with -l' %
                                custom_server_lang)
                            continue
                        for idx in range(0, scenario_json['num_servers']):
                            # replace first X workers by workers of a different language
                            workers[idx] = workers_by_lang[custom_server_lang][
                                idx]
                    if custom_client_lang:
                        if not workers_by_lang.get(custom_client_lang, []):
                            print('Warning: Skipping scenario %s as' %
                                  scenario_json['name'])
                            print(
                                'CLIENT_LANGUAGE is set to %s yet the language has '
                                'not been selected with -l' %
                                custom_client_lang)
                            continue
                        for idx in range(scenario_json['num_servers'],
                                         len(workers)):
                            # replace all client workers by workers of a different language,
                            # leave num_server workers as they are server workers.
                            workers[idx] = workers_by_lang[custom_client_lang][
                                idx]
                    scenario = Scenario(
                        create_scenario_jobspec(
                            scenario_json, [w.host_and_port for w in workers],
                            remote_host=remote_host,
                            bq_result_table=bq_result_table,
                            server_cpu_load=server_cpu_load), workers,
                        scenario_json['name'])
                    scenarios.append(scenario)

    return scenarios
Jan Tattermuschb2758442016-03-28 09:32:20 -0700418
419
def finish_qps_workers(jobs, qpsworker_jobs):
    """Blocks until all given jobs stop running, force-killing them if needed.

    Returns the number of kill() calls issued (counted as failures upstream).
    """
    attempts = 0
    killed_count = 0
    while any(j.is_running() for j in jobs):
        for worker in (j for j in qpsworker_jobs if j.is_running()):
            print('QPS worker "%s" is still running.' % worker.host_and_port)
        if attempts > 10:
            print('Killing all QPS workers.')
            for j in jobs:
                j.kill()
                killed_count += 1
        attempts += 1
        time.sleep(3)
    print('All QPS workers finished.')
    return killed_count
437
Jan Tattermuschb2758442016-03-28 09:32:20 -0700438
Alexander Polcyn9f08d112016-10-24 12:25:02 -0700439profile_output_files = []
440
ncteisen888093c2017-12-11 18:00:40 -0800441
# Collect perf text reports and flamegraphs if perf_cmd was used
# Note the base names of perf text reports are used when creating and processing
# perf data. The scenario name uniquifies the output name in the final
# perf reports directory.
# Also, the perf profiles need to be fetched and processed after each scenario
# in order to avoid clobbering the output files.
def run_collect_perf_profile_jobs(hosts_and_base_names, scenario_name,
                                  flame_graph_reports):
    """Fetches and processes perf profiles for one scenario's workers."""
    global profile_output_files
    perf_report_jobs = []
    for host_and_port, perf_base_name in hosts_and_base_names.items():
        output_filename = '%s-%s' % (scenario_name, perf_base_name)
        # from the base filename, create .svg output filename
        profile_output_files.append('%s.svg' % output_filename)
        worker_host = host_and_port.split(':')[0]
        perf_report_jobs.append(
            perf_report_processor_job(worker_host, perf_base_name,
                                      output_filename, flame_graph_reports))

    jobset.message(
        'START', 'Collecting perf reports from qps workers', do_newline=True)
    failures, _ = jobset.run(
        perf_report_jobs, newline_on_success=True, maxjobs=1)
    jobset.message(
        'END', 'Collecting perf reports from qps workers', do_newline=True)
    return failures
469
Alexander Polcyn9f08d112016-10-24 12:25:02 -0700470
def main():
    """Parses arguments, prepares/builds hosts, runs scenarios, writes reports.

    High-level flow:
      1. Parse CLI flags and resolve the set of languages and remote hosts.
      2. Archive the repo and prepare/build on the hosts (unless --dry_run).
      3. Create worker jobs and scenario jobspecs.
      4. Run each scenario, killing leftover workers between scenarios.
      5. Optionally collect perf flame graphs; emit XML/HTML reports.
    Exits with status 1 on scenario, worker, or perf-collection failures.
    """
    argp = argparse.ArgumentParser(description='Run performance tests.')
    argp.add_argument(
        '-l',
        '--language',
        choices=['all'] + sorted(scenario_config.LANGUAGES.keys()),
        nargs='+',
        required=True,
        help='Languages to benchmark.')
    argp.add_argument(
        '--remote_driver_host',
        default=None,
        help='Run QPS driver on given host. By default, QPS driver is run locally.'
    )
    argp.add_argument(
        '--remote_worker_host',
        nargs='+',
        default=[],
        help='Worker hosts where to start QPS workers.')
    argp.add_argument(
        '--dry_run',
        default=False,
        action='store_const',
        const=True,
        help='Just list scenarios to be run, but don\'t run them.')
    argp.add_argument(
        '-r',
        '--regex',
        default='.*',
        type=str,
        help='Regex to select scenarios to run.')
    argp.add_argument(
        '--bq_result_table',
        default=None,
        type=str,
        help='Bigquery "dataset.table" to upload results to.')
    argp.add_argument(
        '--category',
        choices=['smoketest', 'all', 'scalable', 'sweep'],
        default='all',
        help='Select a category of tests to run.')
    argp.add_argument(
        '--netperf',
        default=False,
        action='store_const',
        const=True,
        help='Run netperf benchmark as one of the scenarios.')
    argp.add_argument(
        '--server_cpu_load',
        default=0,
        type=int,
        help='Select a targeted server cpu load to run. 0 means ignore this flag'
    )
    argp.add_argument(
        '-x',
        '--xml_report',
        default='report.xml',
        type=str,
        help='Name of XML report file to generate.')
    # NOTE(review): '--perg_args' inside the help text below looks like a typo
    # for '--perf_args'; left unchanged since help text is runtime output.
    argp.add_argument(
        '--perf_args',
        help=('Example usage: "--perf_args=record -F 99 -g". '
              'Wrap QPS workers in a perf command '
              'with the arguments to perf specified here. '
              '".svg" flame graph profiles will be '
              'created for each Qps Worker on each scenario. '
              'Files will output to "<repo_root>/<args.flame_graph_reports>" '
              'directory. Output files from running the worker '
              'under perf are saved in the repo root where its ran. '
              'Note that the perf "-g" flag is necessary for '
              'flame graphs generation to work (assuming the binary '
              'being profiled uses frame pointers, check out '
              '"--call-graph dwarf" option using libunwind otherwise.) '
              'Also note that the entire "--perf_args=<arg(s)>" must '
              'be wrapped in quotes as in the example usage. '
              'If the "--perg_args" is unspecified, "perf" will '
              'not be used at all. '
              'See http://www.brendangregg.com/perf.html '
              'for more general perf examples.'))
    argp.add_argument(
        '--skip_generate_flamegraphs',
        default=False,
        action='store_const',
        const=True,
        help=('Turn flame graph generation off. '
              'May be useful if "perf_args" arguments do not make sense for '
              'generating flamegraphs (e.g., "--perf_args=stat ...")'))
    argp.add_argument(
        '-f',
        '--flame_graph_reports',
        default='perf_reports',
        type=str,
        help='Name of directory to output flame graph profiles to, if any are created.'
    )
    argp.add_argument(
        '-u',
        '--remote_host_username',
        default='',
        type=str,
        help='Use a username that isn\'t "Jenkins" to SSH into remote workers.')

    args = argp.parse_args()

    # Override the module-level SSH username if one was supplied.
    global _REMOTE_HOST_USERNAME
    if args.remote_host_username:
        _REMOTE_HOST_USERNAME = args.remote_host_username

    # Expand 'all' into every known language; dedupe via set.
    languages = set(
        scenario_config.LANGUAGES[l]
        for l in itertools.chain.from_iterable(
            six.iterkeys(scenario_config.LANGUAGES) if x == 'all' else [x]
            for x in args.language))

    # Put together set of remote hosts where to run and build
    remote_hosts = set()
    if args.remote_worker_host:
        for host in args.remote_worker_host:
            remote_hosts.add(host)
    if args.remote_driver_host:
        remote_hosts.add(args.remote_driver_host)

    if not args.dry_run:
        if remote_hosts:
            archive_repo(languages=[str(l) for l in languages])
            prepare_remote_hosts(remote_hosts, prepare_local=True)
        else:
            # No remote hosts: still kill any stale local workers.
            prepare_remote_hosts([], prepare_local=True)

    # The driver binary is only needed locally when driving from this machine.
    build_local = False
    if not args.remote_driver_host:
        build_local = True
    if not args.dry_run:
        build_on_remote_hosts(
            remote_hosts,
            languages=[str(l) for l in languages],
            build_local=build_local)

    perf_cmd = None
    if args.perf_args:
        print('Running workers under perf profiler')
        # Expect /usr/bin/perf to be installed here, as is usual
        perf_cmd = ['/usr/bin/perf']
        perf_cmd.extend(re.split('\s+', args.perf_args))

    qpsworker_jobs = create_qpsworkers(
        languages, args.remote_worker_host, perf_cmd=perf_cmd)

    # get list of worker addresses for each language.
    workers_by_lang = dict([(str(language), []) for language in languages])
    for job in qpsworker_jobs:
        workers_by_lang[str(job.language)].append(job)

    scenarios = create_scenarios(
        languages,
        workers_by_lang=workers_by_lang,
        remote_host=args.remote_driver_host,
        regex=args.regex,
        category=args.category,
        bq_result_table=args.bq_result_table,
        netperf=args.netperf,
        netperf_hosts=args.remote_worker_host,
        server_cpu_load=args.server_cpu_load)

    if not scenarios:
        raise Exception('No scenarios to run')

    total_scenario_failures = 0
    qps_workers_killed = 0
    merged_resultset = {}
    perf_report_failures = 0

    for scenario in scenarios:
        if args.dry_run:
            print(scenario.name)
        else:
            scenario_failures = 0
            try:
                # Start this scenario's workers, run the driver, then ask the
                # workers to quit via a follow-up job.
                for worker in scenario.workers:
                    worker.start()
                jobs = [scenario.jobspec]
                if scenario.workers:
                    jobs.append(
                        create_quit_jobspec(
                            scenario.workers,
                            remote_host=args.remote_driver_host))
                scenario_failures, resultset = jobset.run(
                    jobs, newline_on_success=True, maxjobs=1)
                total_scenario_failures += scenario_failures
                merged_resultset = dict(
                    itertools.chain(
                        six.iteritems(merged_resultset),
                        six.iteritems(resultset)))
            finally:
                # Consider qps workers that need to be killed as failures
                qps_workers_killed += finish_qps_workers(scenario.workers,
                                                         qpsworker_jobs)

            # Only collect profiles for scenarios that passed.
            if perf_cmd and scenario_failures == 0 and not args.skip_generate_flamegraphs:
                workers_and_base_names = {}
                for worker in scenario.workers:
                    if not worker.perf_file_base_name:
                        raise Exception(
                            'using perf buf perf report filename is unspecified')
                    workers_and_base_names[
                        worker.host_and_port] = worker.perf_file_base_name
                perf_report_failures += run_collect_perf_profile_jobs(
                    workers_and_base_names, scenario.name,
                    args.flame_graph_reports)

    # Still write the index.html even if some scenarios failed.
    # 'profile_output_files' will only have names for scenarios that passed
    if perf_cmd and not args.skip_generate_flamegraphs:
        # write the index file to the output dir, with all profiles from all scenarios/workers
        report_utils.render_perf_profiling_results(
            '%s/index.html' % args.flame_graph_reports, profile_output_files)

    report_utils.render_junit_xml_report(
        merged_resultset, args.xml_report, suite_name='benchmarks')

    if total_scenario_failures > 0 or qps_workers_killed > 0:
        print('%s scenarios failed and %s qps worker jobs killed' %
              (total_scenario_failures, qps_workers_killed))
        sys.exit(1)

    if perf_report_failures > 0:
        print('%s perf profile collection jobs failed' % perf_report_failures)
        sys.exit(1)
Alexander Polcyn41fe5792017-02-02 10:46:51 -0800698
Michael Darakanandaf570fb22017-08-17 17:09:56 +1000699
# Script entry point.
if __name__ == "__main__":
    main()