Blame - tools/run_tests/run_performance_tests.py - platform/external/grpc-grpc

2017-06-07 22:57:36 +0200

[diff] [blame]

2

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

3

#

Jan Tattermusch

2017-06-07 22:57:36 +0200

[diff] [blame]

4

# Licensed under the Apache License, Version 2.0 (the "License");

5

# you may not use this file except in compliance with the License.

6

# You may obtain a copy of the License at

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

7

#

Jan Tattermusch

2017-06-07 22:57:36 +0200

[diff] [blame]

8

# http://www.apache.org/licenses/LICENSE-2.0

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

9

#

Jan Tattermusch

2017-06-07 22:57:36 +0200

[diff] [blame]

10

# Unless required by applicable law or agreed to in writing, software

11

# distributed under the License is distributed on an "AS IS" BASIS,

12

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

13

# See the License for the specific language governing permissions and

14

# limitations under the License.

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

15

16

"""Run performance tests locally or remotely."""

17

siddharthshukla

2016-07-07 16:08:01 +0200

[diff] [blame]

18

from __future__ import print_function

19

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

20

import argparse

Craig Tiller

accf16b

2016-09-15 09:08:32 -0700

[diff] [blame]

21

import collections

Jan Tattermusch

2016-03-30 18:04:01 -0700

[diff] [blame]

22

import itertools

Craig Tiller

2016-03-03 12:51:53 -0800

[diff] [blame]

23

import json

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

24

import multiprocessing

25

import os

Craig Tiller

2016-03-03 12:51:53 -0800

[diff] [blame]

26

import pipes

Jan Tattermusch

38becc2

2016-04-14 08:00:35 -0700

[diff] [blame]

27

import re

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

import subprocess

import sys

import tempfile

import time

Jan Tattermusch

ee9032c

2016-04-14 08:35:51 -0700

[diff] [blame]

32

import traceback

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

33

import uuid

Siddharth Shukla

d194f59

2017-03-11 19:12:43 +0100

[diff] [blame]

34

import six

Jan Tattermusch

5c79a31

2016-12-20 11:02:50 +0100

[diff] [blame]

35

36

import performance.scenario_config as scenario_config

37

import python_utils.jobset as jobset

38

import python_utils.report_utils as report_utils

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

39

40

41

# Repository root: two directory levels above the directory of the invoked script.
_ROOT = os.path.abspath(os.path.join(os.path.dirname(sys.argv[0]), '../..'))

# Work from the repository root; the commands built below use repo-relative paths.
os.chdir(_ROOT)

# User name used to build the 'user@host' strings for remote hosts.
_REMOTE_HOST_USERNAME = 'jenkins'

46

47

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

48

class QpsWorkerJob:
    """Encapsulates a qps worker server job.

    Attributes:
      language: the language object this worker was created for.
      host_and_port: 'host:port' string identifying the worker.
      perf_file_base_name: base name for perf output files, or None when the
        worker is not wrapped in a perf command.
    """

    def __init__(self, spec, language, host_and_port, perf_file_base_name=None):
        self._spec = spec
        self.language = language
        self.host_and_port = host_and_port
        # No underlying jobset.Job exists until start() is called.
        self._job = None
        self.perf_file_base_name = perf_file_base_name

    def start(self):
        """Creates the underlying jobset.Job from the stored job spec."""
        self._job = jobset.Job(self._spec, newline_on_success=True, travis=True, add_env={})

    def is_running(self):
        """Polls a job and returns True if given job is still running.

        Always returns a real bool: False (rather than None) when the job has
        not been started yet or has already been killed.
        """
        return bool(self._job and self._job.state() == jobset._RUNNING)

    def kill(self):
        """Kills the underlying job, if any, and forgets it."""
        if self._job:
            self._job.kill()
            self._job = None

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

69

70

Alexander Polcyn

2016-10-24 12:25:02 -0700

[diff] [blame]

71

def create_qpsworker_job(language, shortname=None, port=10000, remote_host=None, perf_cmd=None):
    """Builds a QpsWorkerJob (not yet started) for one worker.

    Args:
      language: object providing worker_cmdline().
      shortname: short name given to the job spec; also used in the perf
        output base name.
      port: driver port passed to the worker via --driver_port.
      remote_host: when set, the worker command is run on that host over ssh.
      perf_cmd: optional list of command tokens to prepend so the worker runs
        under perf.

    Returns:
      A QpsWorkerJob wrapping the resulting job spec.
    """
    worker_cmd = language.worker_cmdline() + ['--driver_port=%s' % port]

    host_and_port = ('%s:%s' % (remote_host, port)) if remote_host else ('localhost:%s' % port)

    perf_file_base_name = None
    if perf_cmd:
        perf_file_base_name = '%s-%s' % (host_and_port, shortname)
        # specify -o output file so perf.data gets collected when worker stopped
        worker_cmd = perf_cmd + ['-o', '%s-perf.data' % perf_file_base_name] + worker_cmd

    worker_timeout = 3 * 60
    if remote_host:
        user_at_host = '%s@%s' % (_REMOTE_HOST_USERNAME, remote_host)
        # The remote command gets a `timeout` 30 seconds longer than the
        # job spec's own timeout.
        worker_cmd = ['timeout', '%s' % (worker_timeout + 30)] + worker_cmd
        remote_script = ('cd ~/performance_workspace/grpc/ && '
                         'python tools/run_tests/start_port_server.py && %s'
                         % ' '.join(worker_cmd))
        worker_cmd = ['ssh', str(user_at_host), remote_script]

    spec = jobset.JobSpec(
        cmdline=worker_cmd,
        shortname=shortname,
        timeout_seconds=worker_timeout,  # workers get restarted after each scenario
        verbose_success=True)
    return QpsWorkerJob(spec, language, host_and_port, perf_file_base_name)

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

99

100

Jan Tattermusch

2016-04-14 17:42:54 -0700

[diff] [blame]

101

def create_scenario_jobspec(scenario_json, workers, remote_host=None,
                            bq_result_table=None, server_cpu_load=0):
    """Runs one scenario using QPS driver.

    Args:
      scenario_json: scenario dict; it is serialized to JSON for the driver
        and its 'name' is used in the job's shortname.
      workers: iterable of 'host:port' strings, joined into QPS_WORKERS.
      remote_host: when set, the command is run on that host over ssh.
      bq_result_table: when set, exported as BQ_RESULT_TABLE.
      server_cpu_load: when non-zero, adds the offered-load search flags with
        this value as the targeted cpu load.

    Returns:
      A jobset.JobSpec running the assembled shell command.
    """
    # setting QPS_WORKERS env variable here makes sure it works with SSH too.
    pieces = ['QPS_WORKERS="%s" ' % ','.join(workers)]
    if bq_result_table:
        pieces.append('BQ_RESULT_TABLE="%s" ' % bq_result_table)
    pieces.append('tools/run_tests/performance/run_qps_driver.sh ')
    pieces.append('--scenarios_json=%s ' % pipes.quote(json.dumps({'scenarios': [scenario_json]})))
    pieces.append('--scenario_result_file=scenario_result.json ')
    if server_cpu_load != 0:
        pieces.append('--search_param=offered_load --initial_search_value=1000 --targeted_cpu_load=%d --stride=500 --error_tolerance=0.01' % server_cpu_load)
    cmd = ''.join(pieces)

    if remote_host:
        user_at_host = '%s@%s' % (_REMOTE_HOST_USERNAME, remote_host)
        cmd = 'ssh %s "cd ~/performance_workspace/grpc/ && "%s' % (user_at_host, pipes.quote(cmd))

    return jobset.JobSpec(
        cmdline=[cmd],
        shortname='qps_json_driver.%s' % scenario_json['name'],
        timeout_seconds=12 * 60,
        shell=True,
        verbose_success=True)

123

124

125

def create_quit_jobspec(workers, remote_host=None):
    """Runs quit using QPS driver.

    Args:
      workers: iterable of objects exposing host_and_port.
      remote_host: when set, the command is run on that host over ssh.

    Returns:
      A jobset.JobSpec invoking qps_json_driver with --quit.
    """
    # setting QPS_WORKERS env variable here makes sure it works with SSH too.
    worker_addresses = ','.join(w.host_and_port for w in workers)
    cmd = 'QPS_WORKERS="%s" bins/opt/qps_json_driver --quit' % worker_addresses

    if remote_host:
        user_at_host = '%s@%s' % (_REMOTE_HOST_USERNAME, remote_host)
        cmd = 'ssh %s "cd ~/performance_workspace/grpc/ && "%s' % (user_at_host, pipes.quote(cmd))

    return jobset.JobSpec(
        cmdline=[cmd],
        shortname='qps_json_driver.quit',
        timeout_seconds=3 * 60,
        shell=True,
        verbose_success=True)

139

140

Jan Tattermusch

2016-05-10 14:33:07 -0700

[diff] [blame]

141

def create_netperf_jobspec(server_host='localhost', client_host=None,
                           bq_result_table=None):
    """Runs netperf benchmark.

    Args:
      server_host: value exported as NETPERF_SERVER_HOST.
      client_host: when set, run_netperf.sh is run on that host over ssh.
      bq_result_table: when set, exported as BQ_RESULT_TABLE.

    Returns:
      A jobset.JobSpec running the assembled shell command.
    """
    pieces = ['NETPERF_SERVER_HOST="%s" ' % server_host]
    if bq_result_table:
        pieces.append('BQ_RESULT_TABLE="%s" ' % bq_result_table)

    if client_host:
        # If netperf is running remotely, the env variables populated by Jenkins
        # won't be available on the client, but we need them for uploading results
        # to BigQuery.
        for env_name in ('JOB_NAME', 'BUILD_NUMBER'):
            env_value = os.getenv(env_name)
            if env_value:
                pieces.append('%s="%s" ' % (env_name, env_value))

    pieces.append('tools/run_tests/performance/run_netperf.sh')
    cmd = ''.join(pieces)

    if client_host:
        user_at_host = '%s@%s' % (_REMOTE_HOST_USERNAME, client_host)
        cmd = 'ssh %s "cd ~/performance_workspace/grpc/ && "%s' % (user_at_host, pipes.quote(cmd))

    return jobset.JobSpec(
        cmdline=[cmd],
        shortname='netperf',
        timeout_seconds=60,
        shell=True,
        verbose_success=True)

169

170

Jan Tattermusch

2016-04-18 09:21:37 -0700

[diff] [blame]

171

def archive_repo(languages):
    """Archives local version of repo including submodules.

    Creates ../grpc.tar from ../grpc/, adding ../grpc-java and ../grpc-go
    when 'java' / 'go' are among the given languages. Exits the process with
    status 1 if the archive job fails.
    """
    tar_cmd = ['tar', '-cf', '../grpc.tar', '../grpc/']
    for lang_name, sibling_repo in (('java', '../grpc-java'), ('go', '../grpc-go')):
        if lang_name in languages:
            tar_cmd.append(sibling_repo)

    spec = jobset.JobSpec(
        cmdline=tar_cmd,
        shortname='archive_repo',
        timeout_seconds=3 * 60)

    jobset.message('START', 'Archiving local repository.', do_newline=True)
    num_failures, _ = jobset.run([spec], newline_on_success=True, maxjobs=1)
    if num_failures != 0:
        jobset.message('FAILED', 'Failed to archive local repository.',
                       do_newline=True)
        sys.exit(1)
    jobset.message('SUCCESS',
                   'Archive with local repository created successfully.',
                   do_newline=True)

Jan Tattermusch

2016-04-27 17:55:27 -0700

[diff] [blame]

197

def prepare_remote_hosts(hosts, prepare_local=False):
    """Prepares remote hosts (and maybe prepare localhost as well).

    Runs remote_host_prepare.sh once per host (with USER_AT_HOST set) and,
    when prepare_local is true, kill_workers.sh locally. Exits the process
    with status 1 if any prepare job fails.
    """
    prepare_timeout = 10 * 60

    specs = [
        jobset.JobSpec(
            cmdline=['tools/run_tests/performance/remote_host_prepare.sh'],
            shortname='remote_host_prepare.%s' % host,
            environ={'USER_AT_HOST': '%s@%s' % (_REMOTE_HOST_USERNAME, host)},
            timeout_seconds=prepare_timeout)
        for host in hosts
    ]
    if prepare_local:
        # Prepare localhost as well
        specs.append(
            jobset.JobSpec(
                cmdline=['tools/run_tests/performance/kill_workers.sh'],
                shortname='local_prepare',
                timeout_seconds=prepare_timeout))

    jobset.message('START', 'Preparing hosts.', do_newline=True)
    num_failures, _ = jobset.run(specs, newline_on_success=True, maxjobs=10)
    if num_failures != 0:
        jobset.message('FAILED', 'Failed to prepare remote hosts.',
                       do_newline=True)
        sys.exit(1)
    jobset.message('SUCCESS',
                   'Prepare step completed successfully.',
                   do_newline=True)

Craig Tiller

2016-04-04 13:49:29 -0700

[diff] [blame]

229

def build_on_remote_hosts(hosts, languages=scenario_config.LANGUAGES.keys(), build_local=False):
    """Builds performance worker on remote hosts (and maybe also locally).

    Args:
      hosts: remote hosts to build on; remote_host_build.sh is run once per
        host with USER_AT_HOST and CONFIG=opt in the environment.
      languages: iterable of language names appended to the build script's
        command line. Defaults to every key of scenario_config.LANGUAGES.
      build_local: when true, also runs build_performance.sh locally.

    Exits the process with status 1 if any build job fails.
    """
    # The default is a dict keys view, which on Python 3 cannot be
    # concatenated to a list; normalise once so `[script] + languages` works
    # for any iterable.
    languages = list(languages)

    build_timeout = 15 * 60
    # Kokoro VMs (which are local only) do not have caching, so they need more time to build
    local_build_timeout = 30 * 60

    build_jobs = []
    for host in hosts:
        user_at_host = '%s@%s' % (_REMOTE_HOST_USERNAME, host)
        build_jobs.append(
            jobset.JobSpec(
                cmdline=['tools/run_tests/performance/remote_host_build.sh'] + languages,
                shortname='remote_host_build.%s' % host,
                environ={'USER_AT_HOST': user_at_host, 'CONFIG': 'opt'},
                timeout_seconds=build_timeout))
    if build_local:
        # Build locally as well
        build_jobs.append(
            jobset.JobSpec(
                cmdline=['tools/run_tests/performance/build_performance.sh'] + languages,
                shortname='local_build',
                environ={'CONFIG': 'opt'},
                timeout_seconds=local_build_timeout))

    jobset.message('START', 'Building.', do_newline=True)
    num_failures, _ = jobset.run(
        build_jobs, newline_on_success=True, maxjobs=10)
    if num_failures == 0:
        jobset.message('SUCCESS',
                       'Built successfully.',
                       do_newline=True)
    else:
        jobset.message('FAILED', 'Build failed.',
                       do_newline=True)
        sys.exit(1)

Alexander Polcyn

2016-10-24 12:25:02 -0700

[diff] [blame]

264

def create_qpsworkers(languages, worker_hosts, perf_cmd=None):
    """Creates QPS workers (but does not start them).

    Worker placement per language:
      * no worker hosts: two local workers (base ports 10000 and 10010);
      * one worker host: two workers on that host (same base ports);
      * several hosts: one worker per host on base port 10000.
    Each language's worker_port_offset() is added to the base port.

    Returns:
      A list of QpsWorkerJob objects, grouped by language.
    """
    if not worker_hosts:
        # run two workers locally (for each language)
        placements = [(None, 10000), (None, 10010)]
    elif len(worker_hosts) == 1:
        # run two workers on the remote host (for each language)
        placements = [(worker_hosts[0], 10000), (worker_hosts[0], 10010)]
    else:
        # run one worker per each remote host (for each language)
        placements = [(worker_host, 10000) for worker_host in worker_hosts]

    worker_jobs = []
    for language in languages:
        for worker_idx, (host, base_port) in enumerate(placements):
            worker_jobs.append(
                create_qpsworker_job(
                    language,
                    shortname='qps_worker_%s_%s' % (language, worker_idx),
                    port=base_port + language.worker_port_offset(),
                    remote_host=host,
                    perf_cmd=perf_cmd))
    return worker_jobs

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

284

285

Michael Darakananda

2017-08-17 17:09:56 +1000

[diff] [blame]

286

def perf_report_processor_job(worker_host, perf_base_name, output_filename, flame_graph_reports):
    """Creates a job spec that turns a worker's perf data into flame graphs.

    Args:
      worker_host: host the worker ran on; 'localhost' selects the local
        processing script, anything else the remote one.
      perf_base_name: exported as PERF_BASE_NAME.
      output_filename: exported as OUTPUT_FILENAME.
      flame_graph_reports: exported as OUTPUT_DIR.

    Returns:
      A jobset.JobSpec running the processing script through the shell.
    """
    print('Creating perf report collection job for %s' % worker_host)
    # The command is built from adjacent string literals with an explicit
    # single space before the script path, so the separator no longer depends
    # on how a backslash-continued line happens to be indented.
    if worker_host != 'localhost':
        user_at_host = "%s@%s" % (_REMOTE_HOST_USERNAME, worker_host)
        cmd = ("USER_AT_HOST=%s OUTPUT_FILENAME=%s OUTPUT_DIR=%s PERF_BASE_NAME=%s "
               "tools/run_tests/performance/process_remote_perf_flamegraphs.sh"
               % (user_at_host, output_filename, flame_graph_reports, perf_base_name))
    else:
        cmd = ("OUTPUT_FILENAME=%s OUTPUT_DIR=%s PERF_BASE_NAME=%s "
               "tools/run_tests/performance/process_local_perf_flamegraphs.sh"
               % (output_filename, flame_graph_reports, perf_base_name))

    return jobset.JobSpec(cmdline=cmd,
                          timeout_seconds=3 * 60,
                          shell=True,
                          verbose_success=True,
                          shortname='process perf report')

304

305

Craig Tiller

677966a

2016-09-26 07:37:28 -0700

[diff] [blame]

306

# A runnable scenario: its job spec, the workers it uses, and its name.
Scenario = collections.namedtuple('Scenario', ['jobspec', 'workers', 'name'])

Craig Tiller

2016-09-15 08:57:14 -0700

[diff] [blame]

307

308

Jan Tattermusch

2016-04-14 17:42:54 -0700

[diff] [blame]

309

def create_scenarios(languages, workers_by_lang, remote_host=None, regex='.*',
                     category='all', bq_result_table=None,
                     netperf=False, netperf_hosts=[], server_cpu_load=0):
    """Create jobspecs for scenarios to run.

    Args:
      languages: language objects providing scenarios(); str(language) is the
        key into workers_by_lang.
      workers_by_lang: dict mapping language name to its list of workers.
      remote_host: forwarded to create_scenario_jobspec.
      regex: only scenarios whose name matches (re.search) are kept.
      category: only scenarios listing this category are kept; 'all' keeps
        every scenario.
      bq_result_table: forwarded to the job spec builders.
      netperf: when true, a netperf scenario is added first.
      netperf_hosts: hosts for netperf (server first, then client). It is
        only read, never mutated.
      server_cpu_load: forwarded to create_scenario_jobspec.

    Returns:
      A list of Scenario tuples.

    Raises:
      Exception: if a scenario sets both CLIENT_LANGUAGE and SERVER_LANGUAGE.
    """
    scenarios = []
    _NO_WORKERS = []

    if netperf:
        if not netperf_hosts:
            netperf_server = 'localhost'
            netperf_client = None
        elif len(netperf_hosts) == 1:
            netperf_server = netperf_hosts[0]
            netperf_client = netperf_hosts[0]
        else:
            netperf_server = netperf_hosts[0]
            netperf_client = netperf_hosts[1]
        scenarios.append(Scenario(
            create_netperf_jobspec(server_host=netperf_server,
                                   client_host=netperf_client,
                                   bq_result_table=bq_result_table),
            _NO_WORKERS, 'netperf'))

    for language in languages:
        for scenario_json in language.scenarios():
            if not re.search(regex, scenario_json['name']):
                continue
            categories = scenario_json.get('CATEGORIES', ['scalable', 'smoketest'])
            if category not in categories and category != 'all':
                continue

            # Copy, so per-scenario replacements do not touch the shared list.
            workers = workers_by_lang[str(language)][:]
            # 'SERVER_LANGUAGE' is an indicator for this script to pick
            # a server in different language.
            custom_server_lang = scenario_json.get('SERVER_LANGUAGE', None)
            custom_client_lang = scenario_json.get('CLIENT_LANGUAGE', None)
            scenario_json = scenario_config.remove_nonproto_fields(scenario_json)
            if custom_server_lang and custom_client_lang:
                raise Exception('Cannot set both custom CLIENT_LANGUAGE and SERVER_LANGUAGE '
                                'in the same scenario')
            if custom_server_lang:
                if not workers_by_lang.get(custom_server_lang, []):
                    print('Warning: Skipping scenario %s as' % scenario_json['name'])
                    print('SERVER_LANGUAGE is set to %s yet the language has '
                          'not been selected with -l' % custom_server_lang)
                    continue
                for idx in range(0, scenario_json['num_servers']):
                    # replace first X workers by workers of a different language
                    workers[idx] = workers_by_lang[custom_server_lang][idx]
            if custom_client_lang:
                if not workers_by_lang.get(custom_client_lang, []):
                    print('Warning: Skipping scenario %s as' % scenario_json['name'])
                    print('CLIENT_LANGUAGE is set to %s yet the language has '
                          'not been selected with -l' % custom_client_lang)
                    continue
                for idx in range(scenario_json['num_servers'], len(workers)):
                    # replace all client workers by workers of a different language,
                    # leave num_server workers as they are server workers.
                    workers[idx] = workers_by_lang[custom_client_lang][idx]
            scenarios.append(Scenario(
                create_scenario_jobspec(scenario_json,
                                        [w.host_and_port for w in workers],
                                        remote_host=remote_host,
                                        bq_result_table=bq_result_table,
                                        server_cpu_load=server_cpu_load),
                workers,
                scenario_json['name']))

    return scenarios

Michael Darakananda

2017-08-17 17:09:56 +1000

[diff] [blame]

381

def finish_qps_workers(jobs, qpsworker_jobs):
    """Waits for given jobs to finish and eventually kills them.

    Polls every 3 seconds. On each pass it reports which of qpsworker_jobs are
    still running; once more than 10 passes have elapsed it kills every job
    in `jobs`.

    Returns:
      The number of kill() calls issued.
    """
    attempts = 0
    num_killed = 0
    while True:
        if not any(job.is_running() for job in jobs):
            break
        for worker in qpsworker_jobs:
            if worker.is_running():
                print('QPS worker "%s" is still running.' % worker.host_and_port)
        if attempts > 10:
            print('Killing all QPS workers.')
            for job in jobs:
                job.kill()
                num_killed += 1
        attempts += 1
        time.sleep(3)
    print('All QPS workers finished.')
    return num_killed

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

398

Alexander Polcyn

2016-10-24 12:25:02 -0700

[diff] [blame]

399

# Accumulates the '<scenario>-<perf base name>.svg' output file names
# appended by run_collect_perf_profile_jobs.
profile_output_files = []

400

401

# Collect perf text reports and flamegraphs if perf_cmd was used

402

# Note the base names of perf text reports are used when creating and processing

403

# perf data. The scenario name makes the output name unique in the final

Michael Darakananda

2017-08-17 17:09:56 +1000

[diff] [blame]

404

# perf reports directory.

Alexander Polcyn

2016-10-24 12:25:02 -0700

[diff] [blame]

405

# Also, the perf profiles need to be fetched and processed after each scenario

406

# in order to avoid clobbering the output files.

Michael Darakananda

2017-08-17 17:09:56 +1000

[diff] [blame]

407

def run_collect_perf_profile_jobs(hosts_and_base_names, scenario_name, flame_graph_reports):
    """Collects and processes perf profiles for the workers of one scenario.

    Args:
      hosts_and_base_names: mapping of worker 'host:port' to its perf base name.
      scenario_name: prefixed to each output file name.
      flame_graph_reports: passed through to perf_report_processor_job.

    Returns:
      The failure count reported by jobset.run.

    Side effect: appends each '<output name>.svg' to the module-level
    profile_output_files list.
    """
    global profile_output_files
    report_specs = []
    for host_and_port, perf_base_name in hosts_and_base_names.items():
        output_filename = '%s-%s' % (scenario_name, perf_base_name)
        # from the base filename, create .svg output filename
        host = host_and_port.split(':')[0]
        profile_output_files.append('%s.svg' % output_filename)
        report_specs.append(
            perf_report_processor_job(host, perf_base_name, output_filename, flame_graph_reports))

    jobset.message('START', 'Collecting perf reports from qps workers', do_newline=True)
    # Report jobs are run one at a time (maxjobs=1).
    failures, _ = jobset.run(report_specs, newline_on_success=True, maxjobs=1)
    jobset.message('END', 'Collecting perf reports from qps workers', do_newline=True)
    return failures

422

Michael Darakananda

2017-08-17 17:09:56 +1000

[diff] [blame]

423

def main():

424

argp = argparse.ArgumentParser(description='Run performance tests.')

425

argp.add_argument('-l', '--language',

426

choices=['all'] + sorted(scenario_config.LANGUAGES.keys()),

427

nargs='+',

428

required=True,

429

help='Languages to benchmark.')

430

argp.add_argument('--remote_driver_host',

431

default=None,

432

help='Run QPS driver on given host. By default, QPS driver is run locally.')

433

argp.add_argument('--remote_worker_host',

434

nargs='+',

435

default=[],

436

help='Worker hosts where to start QPS workers.')

437

argp.add_argument('--dry_run',

438

default=False,

439

action='store_const',

440

const=True,

441

help='Just list scenarios to be run, but don\'t run them.')

442

argp.add_argument('-r', '--regex', default='.*', type=str,

443

help='Regex to select scenarios to run.')

444

argp.add_argument('--bq_result_table', default=None, type=str,

445

help='Bigquery "dataset.table" to upload results to.')

446

argp.add_argument('--category',

447

choices=['smoketest','all','scalable','sweep'],

448

default='all',

449

help='Select a category of tests to run.')

450

argp.add_argument('--netperf',

451

default=False,

452

action='store_const',

453

const=True,

454

help='Run netperf benchmark as one of the scenarios.')

455

argp.add_argument('--server_cpu_load',

456

default=0, type=int,

457

help='Select a targeted server cpu load to run. 0 means ignore this flag')

458

argp.add_argument('-x', '--xml_report', default='report.xml', type=str,

459

help='Name of XML report file to generate.')

460

argp.add_argument('--perf_args',

461

help=('Example usage: "--perf_args=record -F 99 -g". '

462

'Wrap QPS workers in a perf command '

463

'with the arguments to perf specified here. '

464

'".svg" flame graph profiles will be '

465

'created for each Qps Worker on each scenario. '

466

'Files will output to "<repo_root>/<args.flame_graph_reports>" '

467

'directory. Output files from running the worker '

468

'under perf are saved in the repo root where its ran. '

469

'Note that the perf "-g" flag is necessary for '

470

'flame graphs generation to work (assuming the binary '

471

'being profiled uses frame pointers, check out '

472

'"--call-graph dwarf" option using libunwind otherwise.) '

473

'Also note that the entire "--perf_args=<arg(s)>" must '

474

'be wrapped in quotes as in the example usage. '

475

'If the "--perg_args" is unspecified, "perf" will '

476

'not be used at all. '

477

'See http://www.brendangregg.com/perf.html '

478

'for more general perf examples.'))

479

argp.add_argument('--skip_generate_flamegraphs',

480

default=False,

481

action='store_const',

482

const=True,

483

help=('Turn flame graph generation off. '

484

'May be useful if "perf_args" arguments do not make sense for '

485

'generating flamegraphs (e.g., "--perf_args=stat ...")'))

486

argp.add_argument('-f', '--flame_graph_reports', default='perf_reports', type=str,

487

help='Name of directory to output flame graph profiles to, if any are created.')

Matt Kwong

2017-10-09 17:53:05 -0700

[diff] [blame^]

488

argp.add_argument('-u', '--remote_host_username', default='', type=str,

489

help='Use a username that isn\'t "Jenkins" to SSH into remote workers.')

Alexander Polcyn

2016-10-24 12:25:02 -0700

[diff] [blame]

490

Michael Darakananda

2017-08-17 17:09:56 +1000

[diff] [blame]

491

args = argp.parse_args()

Alexander Polcyn

2016-10-24 12:25:02 -0700

[diff] [blame]

492

Matt Kwong

2017-10-09 17:53:05 -0700

[diff] [blame^]

493

global _REMOTE_HOST_USERNAME

494

if args.remote_host_username:

495

_REMOTE_HOST_USERNAME = args.remote_host_username

496

Michael Darakananda

2017-08-17 17:09:56 +1000

[diff] [blame]

497

languages = set(scenario_config.LANGUAGES[l]

498

for l in itertools.chain.from_iterable(

499

six.iterkeys(scenario_config.LANGUAGES) if x == 'all'

500

else [x] for x in args.language))

Jan Tattermusch

2016-03-30 18:04:01 -0700

[diff] [blame]

501

Jan Tattermusch

2016-04-14 17:42:54 -0700

[diff] [blame]

502

Michael Darakananda

2017-08-17 17:09:56 +1000

[diff] [blame]

503

# Put together set of remote hosts where to run and build

504

remote_hosts = set()

505

if args.remote_worker_host:

506

for host in args.remote_worker_host:

507

remote_hosts.add(host)

508

if args.remote_driver_host:

509

remote_hosts.add(args.remote_driver_host)

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

510

Michael Darakananda

2017-08-17 17:09:56 +1000

[diff] [blame]

511

if not args.dry_run:

512

if remote_hosts:

513

archive_repo(languages=[str(l) for l in languages])

514

prepare_remote_hosts(remote_hosts, prepare_local=True)

515

else:

516

prepare_remote_hosts([], prepare_local=True)

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

517

Michael Darakananda

2017-08-17 17:09:56 +1000

[diff] [blame]

518

build_local = False

519

if not args.remote_driver_host:

520

build_local = True

521

if not args.dry_run:

522

build_on_remote_hosts(remote_hosts, languages=[str(l) for l in languages], build_local=build_local)

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

523

Michael Darakananda

2017-08-17 17:09:56 +1000

[diff] [blame]

524

perf_cmd = None

525

if args.perf_args:

526

print('Running workers under perf profiler')

527

# Expect /usr/bin/perf to be installed here, as is usual

528

perf_cmd = ['/usr/bin/perf']

529

perf_cmd.extend(re.split('\s+', args.perf_args))

Alexander Polcyn

2016-10-24 12:25:02 -0700

[diff] [blame]

530

Michael Darakananda

2017-08-17 17:09:56 +1000

[diff] [blame]

531

qpsworker_jobs = create_qpsworkers(languages, args.remote_worker_host, perf_cmd=perf_cmd)

Jan Tattermusch

38becc2

2016-04-14 08:00:35 -0700

[diff] [blame]

532

Michael Darakananda

2017-08-17 17:09:56 +1000

[diff] [blame]

533

# get list of worker addresses for each language.

534

workers_by_lang = dict([(str(language), []) for language in languages])

535

for job in qpsworker_jobs:

536

workers_by_lang[str(job.language)].append(job)

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

537

Michael Darakananda

2017-08-17 17:09:56 +1000

[diff] [blame]

538

scenarios = create_scenarios(languages,

539

workers_by_lang=workers_by_lang,

540

remote_host=args.remote_driver_host,

541

regex=args.regex,

542

category=args.category,

543

bq_result_table=args.bq_result_table,

544

netperf=args.netperf,

545

netperf_hosts=args.remote_worker_host,

546

server_cpu_load=args.server_cpu_load)

Jan Tattermusch

2016-05-10 14:33:07 -0700

[diff] [blame]

547

Michael Darakananda

2017-08-17 17:09:56 +1000

[diff] [blame]

548

if not scenarios:

549

raise Exception('No scenarios to run')

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

550

Michael Darakananda

2017-08-17 17:09:56 +1000

[diff] [blame]

551

total_scenario_failures = 0

552

qps_workers_killed = 0

553

merged_resultset = {}

554

perf_report_failures = 0

Alexander Polcyn

2016-10-24 12:25:02 -0700

[diff] [blame]

555

Michael Darakananda

2017-08-17 17:09:56 +1000

[diff] [blame]

556

for scenario in scenarios:

if args.dry_run:

print(scenario.name)

else:

scenario_failures = 0

561

try:

562

for worker in scenario.workers:

563

worker.start()

564

jobs = [scenario.jobspec]

565

if scenario.workers:

566

jobs.append(create_quit_jobspec(scenario.workers, remote_host=args.remote_driver_host))

Matt Kwong

3c8f6db

2017-09-12 13:19:49 -0700

[diff] [blame]

567

scenario_failures, resultset = jobset.run(jobs, newline_on_success=True, maxjobs=1)

Michael Darakananda

2017-08-17 17:09:56 +1000

[diff] [blame]

568

total_scenario_failures += scenario_failures

569

merged_resultset = dict(itertools.chain(six.iteritems(merged_resultset),

570

six.iteritems(resultset)))

571

finally:

572

# Consider qps workers that need to be killed as failures

573

qps_workers_killed += finish_qps_workers(scenario.workers, qpsworker_jobs)

Alexander Polcyn

2016-10-17 10:01:37 -0700

[diff] [blame]

574

Michael Darakananda

2017-08-17 17:09:56 +1000

[diff] [blame]

575

if perf_cmd and scenario_failures == 0 and not args.skip_generate_flamegraphs:

576

workers_and_base_names = {}

577

for worker in scenario.workers:

578

if not worker.perf_file_base_name:

579

raise Exception('using perf buf perf report filename is unspecified')

580

workers_and_base_names[worker.host_and_port] = worker.perf_file_base_name

581

perf_report_failures += run_collect_perf_profile_jobs(workers_and_base_names, scenario.name, args.flame_graph_reports)

Alexander Polcyn

2016-10-24 12:25:02 -0700

[diff] [blame]

582

583

Michael Darakananda

2017-08-17 17:09:56 +1000

[diff] [blame]

584

# Still write the index.html even if some scenarios failed.

585

# 'profile_output_files' will only have names for scenarios that passed

586

if perf_cmd and not args.skip_generate_flamegraphs:

587

# write the index file to the output dir, with all profiles from all scenarios/workers

588

report_utils.render_perf_profiling_results('%s/index.html' % args.flame_graph_reports, profile_output_files)

Alexander Polcyn

2016-10-24 12:25:02 -0700

[diff] [blame]

589

Michael Darakananda

2017-08-17 17:09:56 +1000

[diff] [blame]

590

report_utils.render_junit_xml_report(merged_resultset, args.xml_report,

591

suite_name='benchmarks')

Alexander Polcyn

41fe579

2017-02-02 10:46:51 -0800

[diff] [blame]

592

Michael Darakananda

2017-08-17 17:09:56 +1000

[diff] [blame]

593

if total_scenario_failures > 0 or qps_workers_killed > 0:

594

print('%s scenarios failed and %s qps worker jobs killed' % (total_scenario_failures, qps_workers_killed))

595

sys.exit(1)

Jan Tattermusch

94d40cb

2016-10-24 21:06:40 +0200

[diff] [blame]

596

Michael Darakananda