blob: 9a9f74e9e5a3adde8aa9111b7703dd76cc6d1bc2 [file] [log] [blame]
Siddharth Shukla8e64d902017-03-12 19:50:18 +01001#!/usr/bin/env python
Jan Tattermusch7897ae92017-06-07 22:57:36 +02002# Copyright 2016 gRPC authors.
Jan Tattermuschb2758442016-03-28 09:32:20 -07003#
Jan Tattermusch7897ae92017-06-07 22:57:36 +02004# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
Jan Tattermuschb2758442016-03-28 09:32:20 -07007#
Jan Tattermusch7897ae92017-06-07 22:57:36 +02008# http://www.apache.org/licenses/LICENSE-2.0
Jan Tattermuschb2758442016-03-28 09:32:20 -07009#
Jan Tattermusch7897ae92017-06-07 22:57:36 +020010# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
Jan Tattermuschb2758442016-03-28 09:32:20 -070015"""Run performance tests locally or remotely."""
16
siddharthshukla0589e532016-07-07 16:08:01 +020017from __future__ import print_function
18
Jan Tattermuschb2758442016-03-28 09:32:20 -070019import argparse
Craig Tilleraccf16b2016-09-15 09:08:32 -070020import collections
Jan Tattermuschbb1a4532016-03-30 18:04:01 -070021import itertools
Craig Tiller0bda0b32016-03-03 12:51:53 -080022import json
Jan Tattermuschb2758442016-03-28 09:32:20 -070023import multiprocessing
24import os
Craig Tiller0bda0b32016-03-03 12:51:53 -080025import pipes
Jan Tattermusch38becc22016-04-14 08:00:35 -070026import re
Jan Tattermuschb2758442016-03-28 09:32:20 -070027import subprocess
28import sys
29import tempfile
30import time
Jan Tattermuschee9032c2016-04-14 08:35:51 -070031import traceback
Jan Tattermuschb2758442016-03-28 09:32:20 -070032import uuid
Siddharth Shuklad194f592017-03-11 19:12:43 +010033import six
Jan Tattermusch5c79a312016-12-20 11:02:50 +010034
35import performance.scenario_config as scenario_config
36import python_utils.jobset as jobset
37import python_utils.report_utils as report_utils
Jan Tattermuschb2758442016-03-28 09:32:20 -070038
# This script sits two directory levels below the repository root. Resolve
# the root from the script's own path and make it the working directory,
# because the commands built below use root-relative paths.
_SCRIPT_DIR = os.path.dirname(sys.argv[0])
_ROOT = os.path.abspath(os.path.join(_SCRIPT_DIR, '../..'))
os.chdir(_ROOT)

# User name used when SSH-ing into remote hosts; main() replaces it when
# --remote_host_username is given.
_REMOTE_HOST_USERNAME = 'jenkins'
43
44
class QpsWorkerJob(object):
    """Encapsulates a qps worker server job.

    Attributes:
      language: the language object this worker was created for.
      host_and_port: 'host:port' string under which the worker is reachable.
      perf_file_base_name: base name of the perf output files, or None when
        the worker is not run under perf.
    """

    def __init__(self, spec, language, host_and_port, perf_file_base_name=None):
        self._spec = spec
        self.language = language
        self.host_and_port = host_and_port
        # The underlying jobset.Job; only set between start() and kill().
        self._job = None
        self.perf_file_base_name = perf_file_base_name

    def start(self):
        """Starts the worker by creating a jobset.Job from the stored spec."""
        self._job = jobset.Job(
            self._spec, newline_on_success=True, travis=True, add_env={})

    def is_running(self):
        """Polls the job and returns True if it is still running.

        Always returns a real boolean: False when the worker has not been
        started yet or has already been killed.
        """
        if not self._job:
            return False
        return self._job.state() == jobset._RUNNING

    def kill(self):
        """Kills the underlying job (if any) and forgets about it."""
        if self._job:
            self._job.kill()
            self._job = None
Jan Tattermuschb2758442016-03-28 09:32:20 -070067
68
def create_qpsworker_job(language,
                         shortname=None,
                         port=10000,
                         remote_host=None,
                         perf_cmd=None):
    """Builds (but does not start) a QpsWorkerJob for one worker process.

    Args:
      language: language object providing worker_cmdline().
      shortname: short name given to the job spec.
      port: driver port the worker listens on.
      remote_host: host to run the worker on via ssh; falsy means run locally.
      perf_cmd: optional list with the perf command line to wrap the worker in.

    Returns:
      A QpsWorkerJob wrapping the job spec for the worker.
    """
    worker_timeout = 3 * 60
    worker_args = language.worker_cmdline() + ['--driver_port=%s' % port]
    host_and_port = '%s:%s' % (remote_host or 'localhost', port)

    perf_file_base_name = None
    if perf_cmd:
        perf_file_base_name = '%s-%s' % (host_and_port, shortname)
        # specify -o output file so perf.data gets collected when worker stopped
        perf_output = '%s-perf.data' % perf_file_base_name
        worker_args = perf_cmd + ['-o', perf_output] + worker_args

    if remote_host:
        # Wrap the worker in "timeout" with a limit slightly above the job
        # spec timeout, then run the whole thing remotely through ssh.
        guarded_args = ['timeout', '%s' % (worker_timeout + 30)] + worker_args
        remote_script = (
            'cd ~/performance_workspace/grpc/ && python tools/run_tests/start_port_server.py && %s'
            % ' '.join(guarded_args))
        user_at_host = '%s@%s' % (_REMOTE_HOST_USERNAME, remote_host)
        worker_args = ['ssh', str(user_at_host), remote_script]

    jobspec = jobset.JobSpec(
        cmdline=worker_args,
        shortname=shortname,
        # workers get restarted after each scenario
        timeout_seconds=worker_timeout,
        verbose_success=True)
    return QpsWorkerJob(jobspec, language, host_and_port, perf_file_base_name)
Jan Tattermuschb2758442016-03-28 09:32:20 -0700107
108
def create_scenario_jobspec(scenario_json,
                            workers,
                            remote_host=None,
                            bq_result_table=None,
                            server_cpu_load=0):
    """Creates the job spec that runs one scenario using the QPS driver.

    Args:
      scenario_json: dict describing the scenario; must contain 'name'.
      workers: list of 'host:port' strings of the workers to use.
      remote_host: host to run the driver on via ssh; falsy means locally.
      bq_result_table: optional value for the BQ_RESULT_TABLE variable.
      server_cpu_load: targeted server cpu load; 0 leaves the search flags off.

    Returns:
      A jobset.JobSpec running the driver through the shell.
    """
    # setting QPS_WORKERS env variable here makes sure it works with SSH too.
    pieces = ['QPS_WORKERS="%s" ' % ','.join(workers)]
    if bq_result_table:
        pieces.append('BQ_RESULT_TABLE="%s" ' % bq_result_table)
    pieces.append('tools/run_tests/performance/run_qps_driver.sh ')
    scenarios_payload = json.dumps({'scenarios': [scenario_json]})
    pieces.append('--scenarios_json=%s ' % pipes.quote(scenarios_payload))
    pieces.append('--scenario_result_file=scenario_result.json ')
    if server_cpu_load != 0:
        pieces.append(
            '--search_param=offered_load --initial_search_value=1000 --targeted_cpu_load=%d --stride=500 --error_tolerance=0.01'
            % server_cpu_load)
    command = ''.join(pieces)

    if remote_host:
        user_at_host = '%s@%s' % (_REMOTE_HOST_USERNAME, remote_host)
        command = 'ssh %s "cd ~/performance_workspace/grpc/ && "%s' % (
            user_at_host, pipes.quote(command))

    return jobset.JobSpec(
        cmdline=[command],
        shortname='qps_json_driver.%s' % scenario_json['name'],
        timeout_seconds=12 * 60,
        shell=True,
        verbose_success=True)
Craig Tiller0bda0b32016-03-03 12:51:53 -0800138
139
def create_quit_jobspec(workers, remote_host=None):
    """Creates the job spec that runs the QPS driver with --quit.

    Args:
      workers: worker jobs; their host_and_port values go into QPS_WORKERS.
      remote_host: host to run the driver on via ssh; falsy means locally.

    Returns:
      A jobset.JobSpec running the driver through the shell.
    """
    worker_addresses = ','.join(w.host_and_port for w in workers)
    # setting QPS_WORKERS env variable here makes sure it works with SSH too.
    command = ('QPS_WORKERS="%s" bins/opt/qps_json_driver --quit' %
               worker_addresses)
    if remote_host:
        user_at_host = '%s@%s' % (_REMOTE_HOST_USERNAME, remote_host)
        command = 'ssh %s "cd ~/performance_workspace/grpc/ && "%s' % (
            user_at_host, pipes.quote(command))

    return jobset.JobSpec(
        cmdline=[command],
        shortname='qps_json_driver.quit',
        timeout_seconds=3 * 60,
        shell=True,
        verbose_success=True)
Jan Tattermuschb2758442016-03-28 09:32:20 -0700156
157
def create_netperf_jobspec(server_host='localhost',
                           client_host=None,
                           bq_result_table=None):
    """Creates the job spec that runs the netperf benchmark.

    Args:
      server_host: value passed as NETPERF_SERVER_HOST.
      client_host: host to run netperf from via ssh; falsy means locally.
      bq_result_table: optional value for the BQ_RESULT_TABLE variable.

    Returns:
      A jobset.JobSpec running run_netperf.sh through the shell.
    """
    pieces = ['NETPERF_SERVER_HOST="%s" ' % server_host]
    if bq_result_table:
        pieces.append('BQ_RESULT_TABLE="%s" ' % bq_result_table)
    if client_host:
        # If netperf is running remotely, the env variables populated by Jenkins
        # won't be available on the client, but we need them for uploading results
        # to BigQuery.
        for jenkins_var in ('JOB_NAME', 'BUILD_NUMBER'):
            jenkins_value = os.getenv(jenkins_var)
            if jenkins_value:
                pieces.append('%s="%s" ' % (jenkins_var, jenkins_value))
    pieces.append('tools/run_tests/performance/run_netperf.sh')
    command = ''.join(pieces)

    if client_host:
        user_at_host = '%s@%s' % (_REMOTE_HOST_USERNAME, client_host)
        command = 'ssh %s "cd ~/performance_workspace/grpc/ && "%s' % (
            user_at_host, pipes.quote(command))

    return jobset.JobSpec(
        cmdline=[command],
        shortname='netperf',
        timeout_seconds=60,
        shell=True,
        verbose_success=True)
Jan Tattermusch4de2c322016-05-10 14:33:07 -0700188
189
def archive_repo(languages):
    """Archives the local repo checkout into ../grpc.tar.

    The sibling grpc-java / grpc-go checkouts are added to the archive when
    'java' / 'go' is among the requested languages. Exits the process with
    status 1 when the archive job fails.

    Args:
      languages: collection of language names being benchmarked.
    """
    sibling_checkouts = (('java', '../grpc-java'), ('go', '../grpc-go'))
    tar_cmdline = ['tar', '-cf', '../grpc.tar', '../grpc/']
    tar_cmdline.extend(
        path for lang, path in sibling_checkouts if lang in languages)

    archive_job = jobset.JobSpec(
        cmdline=tar_cmdline, shortname='archive_repo', timeout_seconds=3 * 60)

    jobset.message('START', 'Archiving local repository.', do_newline=True)
    num_failures, _ = jobset.run(
        [archive_job], newline_on_success=True, maxjobs=1)
    if num_failures != 0:
        jobset.message(
            'FAILED', 'Failed to archive local repository.', do_newline=True)
        sys.exit(1)
    jobset.message(
        'SUCCESS',
        'Archive with local repository created successfully.',
        do_newline=True)
Jan Tattermuschb2758442016-03-28 09:32:20 -0700213
214
def prepare_remote_hosts(hosts, prepare_local=False):
    """Prepares remote hosts (and maybe prepares localhost as well).

    Exits the process with status 1 when any prepare job fails.

    Args:
      hosts: iterable of remote host names to prepare.
      prepare_local: when true, also run kill_workers.sh locally.
    """
    prepare_timeout = 10 * 60
    prepare_jobs = [
        jobset.JobSpec(
            cmdline=['tools/run_tests/performance/remote_host_prepare.sh'],
            shortname='remote_host_prepare.%s' % host,
            environ={
                'USER_AT_HOST': '%s@%s' % (_REMOTE_HOST_USERNAME, host)
            },
            timeout_seconds=prepare_timeout) for host in hosts
    ]
    if prepare_local:
        # Prepare localhost as well
        local_job = jobset.JobSpec(
            cmdline=['tools/run_tests/performance/kill_workers.sh'],
            shortname='local_prepare',
            timeout_seconds=prepare_timeout)
        prepare_jobs.append(local_job)

    jobset.message('START', 'Preparing hosts.', do_newline=True)
    num_failures, _ = jobset.run(
        prepare_jobs, newline_on_success=True, maxjobs=10)
    if num_failures != 0:
        jobset.message(
            'FAILED', 'Failed to prepare remote hosts.', do_newline=True)
        sys.exit(1)
    jobset.message(
        'SUCCESS', 'Prepare step completed successfully.', do_newline=True)
Jan Tattermuschb2758442016-03-28 09:32:20 -0700244
245
def build_on_remote_hosts(hosts,
                          languages=scenario_config.LANGUAGES.keys(),
                          build_local=False):
    """Builds performance worker on remote hosts (and maybe also locally).

    Exits the process with status 1 when any build job fails.

    Args:
      hosts: iterable of remote host names to build on.
      languages: iterable of language names passed to the build scripts.
      build_local: when true, also run build_performance.sh locally.
    """
    # Materialize once: on Python 3 the default is a dict keys view (and
    # callers may pass any iterable), which cannot be concatenated to a list.
    language_args = list(languages)
    build_timeout = 15 * 60
    # Kokoro VMs (which are local only) do not have caching, so they need more time to build
    local_build_timeout = 30 * 60
    build_jobs = []
    for host in hosts:
        user_at_host = '%s@%s' % (_REMOTE_HOST_USERNAME, host)
        build_jobs.append(
            jobset.JobSpec(
                cmdline=['tools/run_tests/performance/remote_host_build.sh'] +
                language_args,
                shortname='remote_host_build.%s' % host,
                environ={'USER_AT_HOST': user_at_host,
                         'CONFIG': 'opt'},
                timeout_seconds=build_timeout))
    if build_local:
        # Build locally as well
        build_jobs.append(
            jobset.JobSpec(
                cmdline=['tools/run_tests/performance/build_performance.sh'] +
                language_args,
                shortname='local_build',
                environ={'CONFIG': 'opt'},
                timeout_seconds=local_build_timeout))
    jobset.message('START', 'Building.', do_newline=True)
    num_failures, _ = jobset.run(
        build_jobs, newline_on_success=True, maxjobs=10)
    if num_failures == 0:
        jobset.message('SUCCESS', 'Built successfully.', do_newline=True)
    else:
        jobset.message('FAILED', 'Build failed.', do_newline=True)
        sys.exit(1)
Jan Tattermuschb2758442016-03-28 09:32:20 -0700281
282
def create_qpsworkers(languages, worker_hosts, perf_cmd=None):
    """Creates QPS workers (but does not start them).

    Args:
      languages: iterable of language objects providing worker_port_offset().
      worker_hosts: list of remote worker hosts; empty means run locally.
      perf_cmd: optional perf command line passed on to each worker job.

    Returns:
      A list of QpsWorkerJob objects, grouped by language in input order.
    """
    if worker_hosts and len(worker_hosts) > 1:
        # run one worker per each remote host (for each language)
        endpoints = [(host, 10000) for host in worker_hosts]
    else:
        # run two workers (for each language), either locally or on the
        # single remote host that was given
        only_host = worker_hosts[0] if worker_hosts else None
        endpoints = [(only_host, 10000), (only_host, 10010)]

    qpsworkers = []
    for language in languages:
        for worker_idx, (host, base_port) in enumerate(endpoints):
            qpsworkers.append(
                create_qpsworker_job(
                    language,
                    shortname='qps_worker_%s_%s' % (language, worker_idx),
                    port=base_port + language.worker_port_offset(),
                    remote_host=host,
                    perf_cmd=perf_cmd))
    return qpsworkers
Jan Tattermuschb2758442016-03-28 09:32:20 -0700305
306
def perf_report_processor_job(worker_host, perf_base_name, output_filename,
                              flame_graph_reports):
    """Creates the job spec that turns one worker's perf data into a report.

    Args:
      worker_host: host the worker ran on; 'localhost' selects the local
        processing script, anything else the remote one.
      perf_base_name: value passed to the script as PERF_BASE_NAME.
      output_filename: value passed to the script as OUTPUT_FILENAME.
      flame_graph_reports: value passed to the script as OUTPUT_DIR.

    Returns:
      A jobset.JobSpec running the processing script through the shell.
    """
    print('Creating perf report collection job for %s' % worker_host)
    # NOTE: the space between the last VAR=value assignment and the script
    # path is required; without it the shell sees one long assignment and
    # never runs the script.
    if worker_host != 'localhost':
        user_at_host = "%s@%s" % (_REMOTE_HOST_USERNAME, worker_host)
        cmd = "USER_AT_HOST=%s OUTPUT_FILENAME=%s OUTPUT_DIR=%s PERF_BASE_NAME=%s tools/run_tests/performance/process_remote_perf_flamegraphs.sh" % (
            user_at_host, output_filename, flame_graph_reports, perf_base_name)
    else:
        cmd = "OUTPUT_FILENAME=%s OUTPUT_DIR=%s PERF_BASE_NAME=%s tools/run_tests/performance/process_local_perf_flamegraphs.sh" % (
            output_filename, flame_graph_reports, perf_base_name)

    return jobset.JobSpec(
        cmdline=cmd,
        timeout_seconds=3 * 60,
        shell=True,
        verbose_success=True,
        shortname='process perf report')
Alexander Polcyn9f08d112016-10-24 12:25:02 -0700325
326
# A runnable scenario: the driver job spec, the worker jobs it needs, and
# the scenario's name.
Scenario = collections.namedtuple('Scenario', ['jobspec', 'workers', 'name'])
Craig Tillerc1b54f22016-09-15 08:57:14 -0700328
329
def create_scenarios(languages,
                     workers_by_lang,
                     remote_host=None,
                     regex='.*',
                     category='all',
                     bq_result_table=None,
                     netperf=False,
                     netperf_hosts=[],
                     server_cpu_load=0):
    """Create jobspecs for scenarios to run.

    Args:
      languages: iterable of language objects providing scenarios().
      workers_by_lang: dict mapping str(language) to its list of worker jobs.
      remote_host: host to run the QPS driver on; falsy means locally.
      regex: only scenarios whose name matches (re.search) are selected.
      category: only scenarios in this category are selected; 'all' selects
        every category.
      bq_result_table: optional BigQuery table passed on to the job specs.
      netperf: when true, a netperf scenario is added first.
      netperf_hosts: hosts for netperf; the first is the server, the second
        (or the first again, if there is only one) the client. Empty means
        run netperf against localhost.
      server_cpu_load: targeted server cpu load passed on to the job specs.

    Returns:
      A list of Scenario tuples.

    Raises:
      Exception: if a scenario sets both CLIENT_LANGUAGE and SERVER_LANGUAGE.
    """
    scenarios = []
    _NO_WORKERS = []

    if netperf:
        if not netperf_hosts:
            netperf_server = 'localhost'
            netperf_client = None
        elif len(netperf_hosts) == 1:
            netperf_server = netperf_hosts[0]
            netperf_client = netperf_hosts[0]
        else:
            netperf_server = netperf_hosts[0]
            netperf_client = netperf_hosts[1]
        scenarios.append(
            Scenario(
                create_netperf_jobspec(
                    server_host=netperf_server,
                    client_host=netperf_client,
                    bq_result_table=bq_result_table), _NO_WORKERS, 'netperf'))

    for language in languages:
        for scenario_json in language.scenarios():
            if not re.search(regex, scenario_json['name']):
                continue
            categories = scenario_json.get('CATEGORIES',
                                           ['scalable', 'smoketest'])
            if category != 'all' and category not in categories:
                continue

            # Work on a copy so replacing workers below does not affect the
            # per-language worker list shared by other scenarios.
            workers = workers_by_lang[str(language)][:]
            # 'SERVER_LANGUAGE' / 'CLIENT_LANGUAGE' are indicators for this
            # script to pick servers / clients in a different language.
            custom_server_lang = scenario_json.get('SERVER_LANGUAGE', None)
            custom_client_lang = scenario_json.get('CLIENT_LANGUAGE', None)
            scenario_json = scenario_config.remove_nonproto_fields(
                scenario_json)
            if custom_server_lang and custom_client_lang:
                raise Exception(
                    'Cannot set both custom CLIENT_LANGUAGE and SERVER_LANGUAGE '
                    'in the same scenario')
            if custom_server_lang:
                if not workers_by_lang.get(custom_server_lang, []):
                    print('Warning: Skipping scenario %s as' %
                          scenario_json['name'])
                    print('SERVER_LANGUAGE is set to %s yet the language has '
                          'not been selected with -l' % custom_server_lang)
                    continue
                for idx in range(0, scenario_json['num_servers']):
                    # replace first X workers by workers of a different language
                    workers[idx] = workers_by_lang[custom_server_lang][idx]
            if custom_client_lang:
                if not workers_by_lang.get(custom_client_lang, []):
                    print('Warning: Skipping scenario %s as' %
                          scenario_json['name'])
                    print('CLIENT_LANGUAGE is set to %s yet the language has '
                          'not been selected with -l' % custom_client_lang)
                    continue
                for idx in range(scenario_json['num_servers'], len(workers)):
                    # replace all client workers by workers of a different language,
                    # leave num_server workers as they are server workers.
                    workers[idx] = workers_by_lang[custom_client_lang][idx]
            scenarios.append(
                Scenario(
                    create_scenario_jobspec(
                        scenario_json, [w.host_and_port for w in workers],
                        remote_host=remote_host,
                        bq_result_table=bq_result_table,
                        server_cpu_load=server_cpu_load), workers,
                    scenario_json['name']))

    return scenarios
Jan Tattermuschb2758442016-03-28 09:32:20 -0700420
421
def finish_qps_workers(jobs, qpsworker_jobs):
    """Waits for given jobs to finish and eventually kills them.

    Polls every 3 seconds. Once more than 10 polls have passed with a job
    still running, every job in `jobs` is killed.

    Args:
      jobs: worker jobs to wait for.
      qpsworker_jobs: worker jobs whose running state is reported while
        waiting.

    Returns:
      The number of kill() calls issued.
    """
    killed_count = 0
    for attempt in itertools.count():
        if not any(job.is_running() for job in jobs):
            break
        for worker in qpsworker_jobs:
            if worker.is_running():
                print('QPS worker "%s" is still running.' %
                      worker.host_and_port)
        if attempt > 10:
            print('Killing all QPS workers.')
            for job in jobs:
                job.kill()
                killed_count += 1
        time.sleep(3)
    print('All QPS workers finished.')
    return killed_count
439
Jan Tattermuschb2758442016-03-28 09:32:20 -0700440
# Names of the .svg profile files requested so far, across all scenarios.
profile_output_files = []


# Collect perf text reports and flamegraphs if perf_cmd was used.
# Note the base names of perf text reports are used when creating and processing
# perf data. The scenario name makes the output name unique in the final
# perf reports directory.
# Also, the perf profiles need to be fetched and processed after each scenario
# in order to avoid clobbering the output files.
def run_collect_perf_profile_jobs(hosts_and_base_names, scenario_name,
                                  flame_graph_reports):
    """Runs one perf report processing job per worker of a scenario.

    Args:
      hosts_and_base_names: dict mapping a worker's 'host:port' to its perf
        file base name.
      scenario_name: scenario name, used as prefix of the output file names.
      flame_graph_reports: output directory passed on to the processing jobs.

    Returns:
      The number of failed jobs as reported by jobset.run.
    """
    report_jobs = []
    for host_and_port, perf_base_name in hosts_and_base_names.items():
        output_filename = '%s-%s' % (scenario_name, perf_base_name)
        # from the base filename, create .svg output filename
        profile_output_files.append('%s.svg' % output_filename)
        worker_host = host_and_port.split(':')[0]
        report_jobs.append(
            perf_report_processor_job(worker_host, perf_base_name,
                                      output_filename, flame_graph_reports))

    jobset.message(
        'START', 'Collecting perf reports from qps workers', do_newline=True)
    # Jobs are run one at a time (maxjobs=1).
    failures, _ = jobset.run(
        report_jobs, newline_on_success=True, maxjobs=1)
    jobset.message(
        'END', 'Collecting perf reports from qps workers', do_newline=True)
    return failures
471
Alexander Polcyn9f08d112016-10-24 12:25:02 -0700472
def _create_argument_parser():
    """Builds the command line parser for this script."""
    argp = argparse.ArgumentParser(description='Run performance tests.')
    argp.add_argument(
        '-l',
        '--language',
        choices=['all'] + sorted(scenario_config.LANGUAGES.keys()),
        nargs='+',
        required=True,
        help='Languages to benchmark.')
    argp.add_argument(
        '--remote_driver_host',
        default=None,
        help=
        'Run QPS driver on given host. By default, QPS driver is run locally.')
    argp.add_argument(
        '--remote_worker_host',
        nargs='+',
        default=[],
        help='Worker hosts where to start QPS workers.')
    argp.add_argument(
        '--dry_run',
        default=False,
        action='store_true',
        help='Just list scenarios to be run, but don\'t run them.')
    argp.add_argument(
        '-r',
        '--regex',
        default='.*',
        type=str,
        help='Regex to select scenarios to run.')
    argp.add_argument(
        '--bq_result_table',
        default=None,
        type=str,
        help='Bigquery "dataset.table" to upload results to.')
    argp.add_argument(
        '--category',
        choices=['smoketest', 'all', 'scalable', 'sweep'],
        default='all',
        help='Select a category of tests to run.')
    argp.add_argument(
        '--netperf',
        default=False,
        action='store_true',
        help='Run netperf benchmark as one of the scenarios.')
    argp.add_argument(
        '--server_cpu_load',
        default=0,
        type=int,
        help='Select a targeted server cpu load to run. 0 means ignore this flag'
    )
    argp.add_argument(
        '-x',
        '--xml_report',
        default='report.xml',
        type=str,
        help='Name of XML report file to generate.')
    argp.add_argument(
        '--perf_args',
        help=('Example usage: "--perf_args=record -F 99 -g". '
              'Wrap QPS workers in a perf command '
              'with the arguments to perf specified here. '
              '".svg" flame graph profiles will be '
              'created for each Qps Worker on each scenario. '
              'Files will output to "<repo_root>/<args.flame_graph_reports>" '
              'directory. Output files from running the worker '
              'under perf are saved in the repo root where its ran. '
              'Note that the perf "-g" flag is necessary for '
              'flame graphs generation to work (assuming the binary '
              'being profiled uses frame pointers, check out '
              '"--call-graph dwarf" option using libunwind otherwise.) '
              'Also note that the entire "--perf_args=<arg(s)>" must '
              'be wrapped in quotes as in the example usage. '
              'If the "--perf_args" is unspecified, "perf" will '
              'not be used at all. '
              'See http://www.brendangregg.com/perf.html '
              'for more general perf examples.'))
    argp.add_argument(
        '--skip_generate_flamegraphs',
        default=False,
        action='store_true',
        help=('Turn flame graph generation off. '
              'May be useful if "perf_args" arguments do not make sense for '
              'generating flamegraphs (e.g., "--perf_args=stat ...")'))
    argp.add_argument(
        '-f',
        '--flame_graph_reports',
        default='perf_reports',
        type=str,
        help=
        'Name of directory to output flame graph profiles to, if any are created.'
    )
    argp.add_argument(
        '-u',
        '--remote_host_username',
        default='',
        type=str,
        help='Use a username that isn\'t "Jenkins" to SSH into remote workers.')
    return argp


def main():
    """Parses the command line, then prepares, builds and runs the scenarios.

    Exits with status 1 when scenarios failed, QPS workers had to be killed,
    or perf profile collection failed.

    Raises:
      Exception: if no scenario matches the selection, or a worker lacks a
        perf file base name although perf is in use.
    """
    global _REMOTE_HOST_USERNAME

    args = _create_argument_parser().parse_args()

    if args.remote_host_username:
        _REMOTE_HOST_USERNAME = args.remote_host_username

    # Expand 'all' to every known language and map names to language objects.
    languages = set(
        scenario_config.LANGUAGES[name]
        for name in itertools.chain.from_iterable(
            six.iterkeys(scenario_config.LANGUAGES) if x == 'all' else [x]
            for x in args.language))
    language_names = [str(language) for language in languages]

    # Put together set of remote hosts where to run and build
    remote_hosts = set(args.remote_worker_host)
    if args.remote_driver_host:
        remote_hosts.add(args.remote_driver_host)

    if not args.dry_run:
        if remote_hosts:
            archive_repo(languages=language_names)
        # The local host is always prepared, remote ones only if there are any.
        prepare_remote_hosts(remote_hosts, prepare_local=True)

        # A local build is only needed when the driver runs locally.
        build_on_remote_hosts(
            remote_hosts,
            languages=language_names,
            build_local=not args.remote_driver_host)

    perf_cmd = None
    if args.perf_args:
        print('Running workers under perf profiler')
        # Expect /usr/bin/perf to be installed here, as is usual.
        # str.split() splits on whitespace runs and, unlike re.split, does
        # not produce empty arguments for leading/trailing whitespace.
        perf_cmd = ['/usr/bin/perf'] + args.perf_args.split()

    qpsworker_jobs = create_qpsworkers(
        languages, args.remote_worker_host, perf_cmd=perf_cmd)

    # get list of worker addresses for each language.
    workers_by_lang = {str(language): [] for language in languages}
    for job in qpsworker_jobs:
        workers_by_lang[str(job.language)].append(job)

    scenarios = create_scenarios(
        languages,
        workers_by_lang=workers_by_lang,
        remote_host=args.remote_driver_host,
        regex=args.regex,
        category=args.category,
        bq_result_table=args.bq_result_table,
        netperf=args.netperf,
        netperf_hosts=args.remote_worker_host,
        server_cpu_load=args.server_cpu_load)

    if not scenarios:
        raise Exception('No scenarios to run')

    total_scenario_failures = 0
    qps_workers_killed = 0
    merged_resultset = {}
    perf_report_failures = 0

    for scenario in scenarios:
        if args.dry_run:
            print(scenario.name)
            continue

        scenario_failures = 0
        try:
            for worker in scenario.workers:
                worker.start()
            jobs = [scenario.jobspec]
            if scenario.workers:
                jobs.append(
                    create_quit_jobspec(
                        scenario.workers,
                        remote_host=args.remote_driver_host))
            scenario_failures, resultset = jobset.run(
                jobs, newline_on_success=True, maxjobs=1)
            total_scenario_failures += scenario_failures
            merged_resultset.update(resultset)
        finally:
            # Consider qps workers that need to be killed as failures
            qps_workers_killed += finish_qps_workers(scenario.workers,
                                                     qpsworker_jobs)

        if perf_cmd and scenario_failures == 0 and not args.skip_generate_flamegraphs:
            workers_and_base_names = {}
            for worker in scenario.workers:
                if not worker.perf_file_base_name:
                    raise Exception(
                        'using perf but perf report filename is unspecified')
                workers_and_base_names[
                    worker.host_and_port] = worker.perf_file_base_name
            perf_report_failures += run_collect_perf_profile_jobs(
                workers_and_base_names, scenario.name,
                args.flame_graph_reports)

    # Still write the index.html even if some scenarios failed.
    # 'profile_output_files' will only have names for scenarios that passed
    if perf_cmd and not args.skip_generate_flamegraphs:
        # write the index file to the output dir, with all profiles from all scenarios/workers
        report_utils.render_perf_profiling_results(
            '%s/index.html' % args.flame_graph_reports, profile_output_files)

    report_utils.render_junit_xml_report(
        merged_resultset, args.xml_report, suite_name='benchmarks')

    if total_scenario_failures > 0 or qps_workers_killed > 0:
        print('%s scenarios failed and %s qps worker jobs killed' %
              (total_scenario_failures, qps_workers_killed))
        sys.exit(1)

    if perf_report_failures > 0:
        print('%s perf profile collection jobs failed' % perf_report_failures)
        sys.exit(1)


if __name__ == "__main__":
    main()