Siddharth Shukla | 8e64d90 | 2017-03-12 19:50:18 +0100 | [diff] [blame] | 1 | #!/usr/bin/env python |
Jan Tattermusch | 7897ae9 | 2017-06-07 22:57:36 +0200 | [diff] [blame] | 2 | # Copyright 2016 gRPC authors. |
Jan Tattermusch | b275844 | 2016-03-28 09:32:20 -0700 | [diff] [blame] | 3 | # |
Jan Tattermusch | 7897ae9 | 2017-06-07 22:57:36 +0200 | [diff] [blame] | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | # you may not use this file except in compliance with the License. |
| 6 | # You may obtain a copy of the License at |
Jan Tattermusch | b275844 | 2016-03-28 09:32:20 -0700 | [diff] [blame] | 7 | # |
Jan Tattermusch | 7897ae9 | 2017-06-07 22:57:36 +0200 | [diff] [blame] | 8 | # http://www.apache.org/licenses/LICENSE-2.0 |
Jan Tattermusch | b275844 | 2016-03-28 09:32:20 -0700 | [diff] [blame] | 9 | # |
Jan Tattermusch | 7897ae9 | 2017-06-07 22:57:36 +0200 | [diff] [blame] | 10 | # Unless required by applicable law or agreed to in writing, software |
| 11 | # distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | # See the License for the specific language governing permissions and |
| 14 | # limitations under the License. |
Jan Tattermusch | b275844 | 2016-03-28 09:32:20 -0700 | [diff] [blame] | 15 | """Run performance tests locally or remotely.""" |
| 16 | |
siddharthshukla | 0589e53 | 2016-07-07 16:08:01 +0200 | [diff] [blame] | 17 | from __future__ import print_function |
| 18 | |
Jan Tattermusch | b275844 | 2016-03-28 09:32:20 -0700 | [diff] [blame] | 19 | import argparse |
Craig Tiller | accf16b | 2016-09-15 09:08:32 -0700 | [diff] [blame] | 20 | import collections |
Jan Tattermusch | bb1a453 | 2016-03-30 18:04:01 -0700 | [diff] [blame] | 21 | import itertools |
Craig Tiller | 0bda0b3 | 2016-03-03 12:51:53 -0800 | [diff] [blame] | 22 | import json |
Jan Tattermusch | b275844 | 2016-03-28 09:32:20 -0700 | [diff] [blame] | 23 | import multiprocessing |
| 24 | import os |
Craig Tiller | 0bda0b3 | 2016-03-03 12:51:53 -0800 | [diff] [blame] | 25 | import pipes |
Jan Tattermusch | 38becc2 | 2016-04-14 08:00:35 -0700 | [diff] [blame] | 26 | import re |
Jan Tattermusch | b275844 | 2016-03-28 09:32:20 -0700 | [diff] [blame] | 27 | import subprocess |
| 28 | import sys |
| 29 | import tempfile |
| 30 | import time |
Jan Tattermusch | ee9032c | 2016-04-14 08:35:51 -0700 | [diff] [blame] | 31 | import traceback |
Jan Tattermusch | b275844 | 2016-03-28 09:32:20 -0700 | [diff] [blame] | 32 | import uuid |
Siddharth Shukla | d194f59 | 2017-03-11 19:12:43 +0100 | [diff] [blame] | 33 | import six |
Jan Tattermusch | 5c79a31 | 2016-12-20 11:02:50 +0100 | [diff] [blame] | 34 | |
| 35 | import performance.scenario_config as scenario_config |
| 36 | import python_utils.jobset as jobset |
| 37 | import python_utils.report_utils as report_utils |
Jan Tattermusch | b275844 | 2016-03-28 09:32:20 -0700 | [diff] [blame] | 38 | |
# Make the gRPC repository root the working directory so that the relative
# script paths used throughout this file resolve consistently.
_ROOT = os.path.abspath(os.path.join(os.path.dirname(sys.argv[0]), '../..'))
os.chdir(_ROOT)

# Default SSH username for remote performance hosts; overridden in main()
# when --remote_host_username is passed.
_REMOTE_HOST_USERNAME = 'jenkins'
| 43 | |
| 44 | |
class QpsWorkerJob:
    """Encapsulates a qps worker server job."""

    def __init__(self, spec, language, host_and_port, perf_file_base_name=None):
        self._spec = spec
        self.language = language
        self.host_and_port = host_and_port
        self.perf_file_base_name = perf_file_base_name
        # Underlying jobset.Job; stays None until start() is called.
        self._job = None

    def start(self):
        """Launches the worker process described by the jobspec."""
        self._job = jobset.Job(
            self._spec, newline_on_success=True, travis=True, add_env={})

    def is_running(self):
        """Polls a job and returns True if given job is still running."""
        job = self._job
        return job and job.state() == jobset._RUNNING

    def kill(self):
        """Terminates the worker process, if one was started."""
        job, self._job = self._job, None
        if job:
            job.kill()
Jan Tattermusch | b275844 | 2016-03-28 09:32:20 -0700 | [diff] [blame] | 67 | |
| 68 | |
def create_qpsworker_job(language,
                         shortname=None,
                         port=10000,
                         remote_host=None,
                         perf_cmd=None):
    """Builds a QpsWorkerJob for a single worker process.

    Args:
      language: scenario_config language object providing worker_cmdline().
      shortname: short job name used in logs and perf file names.
      port: driver port the worker will listen on.
      remote_host: if set, the worker is launched over SSH on that host.
      perf_cmd: optional perf command prefix used to profile the worker.

    Returns:
      A QpsWorkerJob (not yet started).
    """
    cmdline = language.worker_cmdline() + ['--driver_port=%s' % port]

    host_and_port = ('%s:%s' % (remote_host, port)
                     if remote_host else 'localhost:%s' % port)

    perf_file_base_name = None
    if perf_cmd:
        perf_file_base_name = '%s-%s' % (host_and_port, shortname)
        # specify -o output file so perf.data gets collected when worker stopped
        cmdline = perf_cmd + ['-o',
                              '%s-perf.data' % perf_file_base_name] + cmdline

    worker_timeout = 3 * 60
    if remote_host:
        user_at_host = '%s@%s' % (_REMOTE_HOST_USERNAME, remote_host)
        # Give the remote 'timeout' wrapper slack beyond the jobspec timeout so
        # the jobspec timeout normally fires first.
        wrapped_cmdline = ['timeout', '%s' % (worker_timeout + 30)] + cmdline
        remote_shell_cmd = (
            'cd ~/performance_workspace/grpc/ && python tools/run_tests/start_port_server.py && %s'
            % ' '.join(wrapped_cmdline))
        cmdline = ['ssh', str(user_at_host), remote_shell_cmd]

    jobspec = jobset.JobSpec(
        cmdline=cmdline,
        shortname=shortname,
        timeout_seconds=worker_timeout,  # workers get restarted after each scenario
        verbose_success=True)
    return QpsWorkerJob(jobspec, language, host_and_port, perf_file_base_name)
Jan Tattermusch | b275844 | 2016-03-28 09:32:20 -0700 | [diff] [blame] | 106 | |
| 107 | |
def create_scenario_jobspec(scenario_json,
                            workers,
                            remote_host=None,
                            bq_result_table=None,
                            server_cpu_load=0):
    """Runs one scenario using QPS driver.

    Args:
      scenario_json: scenario description (dict) passed to the driver.
      workers: list of 'host:port' worker addresses.
      remote_host: if set, the driver is invoked over SSH on that host.
      bq_result_table: BigQuery 'dataset.table' for result upload, if any.
      server_cpu_load: non-zero enables a targeted server CPU load search.

    Returns:
      A jobset.JobSpec that runs the driver in a shell.
    """
    # setting QPS_WORKERS env variable here makes sure it works with SSH too.
    fragments = ['QPS_WORKERS="%s" ' % ','.join(workers)]
    if bq_result_table:
        fragments.append('BQ_RESULT_TABLE="%s" ' % bq_result_table)
    fragments.append('tools/run_tests/performance/run_qps_driver.sh ')
    scenarios_arg = pipes.quote(json.dumps({'scenarios': [scenario_json]}))
    fragments.append('--scenarios_json=%s ' % scenarios_arg)
    fragments.append('--scenario_result_file=scenario_result.json ')
    if server_cpu_load != 0:
        fragments.append(
            '--search_param=offered_load --initial_search_value=1000 '
            '--targeted_cpu_load=%d --stride=500 --error_tolerance=0.01' %
            server_cpu_load)
    cmd = ''.join(fragments)
    if remote_host:
        user_at_host = '%s@%s' % (_REMOTE_HOST_USERNAME, remote_host)
        cmd = 'ssh %s "cd ~/performance_workspace/grpc/ && "%s' % (
            user_at_host, pipes.quote(cmd))

    return jobset.JobSpec(
        cmdline=[cmd],
        shortname='qps_json_driver.%s' % scenario_json['name'],
        timeout_seconds=12 * 60,
        shell=True,
        verbose_success=True)
Craig Tiller | 0bda0b3 | 2016-03-03 12:51:53 -0800 | [diff] [blame] | 137 | |
| 138 | |
def create_quit_jobspec(workers, remote_host=None):
    """Runs quit using QPS driver.

    Tells all given workers (QpsWorkerJob objects) to shut down cleanly.
    """
    # setting QPS_WORKERS env variable here makes sure it works with SSH too.
    worker_addresses = ','.join(w.host_and_port for w in workers)
    cmd = 'QPS_WORKERS="%s" bins/opt/qps_json_driver --quit' % worker_addresses
    if remote_host:
        cmd = 'ssh %s@%s "cd ~/performance_workspace/grpc/ && "%s' % (
            _REMOTE_HOST_USERNAME, remote_host, pipes.quote(cmd))

    return jobset.JobSpec(
        cmdline=[cmd],
        shortname='qps_json_driver.quit',
        timeout_seconds=3 * 60,
        shell=True,
        verbose_success=True)
Jan Tattermusch | b275844 | 2016-03-28 09:32:20 -0700 | [diff] [blame] | 155 | |
| 156 | |
def create_netperf_jobspec(server_host='localhost',
                           client_host=None,
                           bq_result_table=None):
    """Runs netperf benchmark.

    Args:
      server_host: host running the netperf server.
      client_host: if set, the benchmark runs over SSH from that client host.
      bq_result_table: BigQuery table for result upload, if any.

    Returns:
      A jobset.JobSpec that runs the netperf shell script.
    """
    cmd = 'NETPERF_SERVER_HOST="%s" ' % server_host
    if bq_result_table:
        cmd += 'BQ_RESULT_TABLE="%s" ' % bq_result_table
    if client_host:
        # If netperf is running remotely, the env variables populated by Jenkins
        # won't be available on the client, but we need them for uploading results
        # to BigQuery.
        for env_name in ('JOB_NAME', 'BUILD_NUMBER'):
            env_value = os.getenv(env_name)
            if env_value:
                cmd += '%s="%s" ' % (env_name, env_value)

    cmd += 'tools/run_tests/performance/run_netperf.sh'
    if client_host:
        user_at_host = '%s@%s' % (_REMOTE_HOST_USERNAME, client_host)
        cmd = 'ssh %s "cd ~/performance_workspace/grpc/ && "%s' % (
            user_at_host, pipes.quote(cmd))

    return jobset.JobSpec(
        cmdline=[cmd],
        shortname='netperf',
        timeout_seconds=60,
        shell=True,
        verbose_success=True)
Jan Tattermusch | 4de2c32 | 2016-05-10 14:33:07 -0700 | [diff] [blame] | 187 | |
| 188 | |
def archive_repo(languages):
    """Archives local version of repo including submodules.

    Exits the process with status 1 if the archive job fails.
    """
    cmdline = ['tar', '-cf', '../grpc.tar', '../grpc/']
    # Sibling checkouts are only archived for the languages that need them.
    for lang, sibling_repo in (('java', '../grpc-java'), ('go', '../grpc-go')):
        if lang in languages:
            cmdline.append(sibling_repo)

    archive_job = jobset.JobSpec(
        cmdline=cmdline, shortname='archive_repo', timeout_seconds=3 * 60)

    jobset.message('START', 'Archiving local repository.', do_newline=True)
    num_failures, _ = jobset.run(
        [archive_job], newline_on_success=True, maxjobs=1)
    if num_failures != 0:
        jobset.message(
            'FAILED', 'Failed to archive local repository.', do_newline=True)
        sys.exit(1)
    jobset.message(
        'SUCCESS',
        'Archive with local repository created successfully.',
        do_newline=True)
Jan Tattermusch | b275844 | 2016-03-28 09:32:20 -0700 | [diff] [blame] | 212 | |
| 213 | |
def prepare_remote_hosts(hosts, prepare_local=False):
    """Prepares remote hosts (and maybe prepare localhost as well).

    Exits the process with status 1 if any prepare job fails.
    """
    prepare_timeout = 10 * 60
    prepare_jobs = [
        jobset.JobSpec(
            cmdline=['tools/run_tests/performance/remote_host_prepare.sh'],
            shortname='remote_host_prepare.%s' % host,
            environ={
                'USER_AT_HOST': '%s@%s' % (_REMOTE_HOST_USERNAME, host)
            },
            timeout_seconds=prepare_timeout) for host in hosts
    ]
    if prepare_local:
        # Prepare localhost as well
        prepare_jobs.append(
            jobset.JobSpec(
                cmdline=['tools/run_tests/performance/kill_workers.sh'],
                shortname='local_prepare',
                timeout_seconds=prepare_timeout))
    jobset.message('START', 'Preparing hosts.', do_newline=True)
    num_failures, _ = jobset.run(
        prepare_jobs, newline_on_success=True, maxjobs=10)
    if num_failures != 0:
        jobset.message(
            'FAILED', 'Failed to prepare remote hosts.', do_newline=True)
        sys.exit(1)
    jobset.message(
        'SUCCESS', 'Prepare step completed successfully.', do_newline=True)
Jan Tattermusch | b275844 | 2016-03-28 09:32:20 -0700 | [diff] [blame] | 243 | |
| 244 | |
def build_on_remote_hosts(hosts,
                          languages=scenario_config.LANGUAGES.keys(),
                          build_local=False):
    """Builds performance worker on remote hosts (and maybe also locally).

    Exits the process with status 1 if any build job fails.
    """
    build_timeout = 15 * 60
    # Kokoro VMs (which are local only) do not have caching, so they need more time to build
    local_build_timeout = 30 * 60
    build_jobs = [
        jobset.JobSpec(
            cmdline=['tools/run_tests/performance/remote_host_build.sh'] +
            languages,
            shortname='remote_host_build.%s' % host,
            environ={
                'USER_AT_HOST': '%s@%s' % (_REMOTE_HOST_USERNAME, host),
                'CONFIG': 'opt'
            },
            timeout_seconds=build_timeout) for host in hosts
    ]
    if build_local:
        # Build locally as well
        build_jobs.append(
            jobset.JobSpec(
                cmdline=['tools/run_tests/performance/build_performance.sh'] +
                languages,
                shortname='local_build',
                environ={'CONFIG': 'opt'},
                timeout_seconds=local_build_timeout))
    jobset.message('START', 'Building.', do_newline=True)
    num_failures, _ = jobset.run(
        build_jobs, newline_on_success=True, maxjobs=10)
    if num_failures != 0:
        jobset.message('FAILED', 'Build failed.', do_newline=True)
        sys.exit(1)
    jobset.message('SUCCESS', 'Built successfully.', do_newline=True)
Jan Tattermusch | b275844 | 2016-03-28 09:32:20 -0700 | [diff] [blame] | 280 | |
| 281 | |
def create_qpsworkers(languages, worker_hosts, perf_cmd=None):
    """Creates QPS workers (but does not start them).

    Args:
      languages: iterable of scenario_config language objects.
      worker_hosts: list of remote host names; empty means run locally.
      perf_cmd: optional perf command prefix passed through to each worker.

    Returns:
      A list of QpsWorkerJob objects.
    """
    if not worker_hosts:
        # run two workers locally (for each language)
        host_port_pairs = [(None, 10000), (None, 10010)]
    elif len(worker_hosts) == 1:
        # run two workers on the remote host (for each language)
        host_port_pairs = [(worker_hosts[0], 10000), (worker_hosts[0], 10010)]
    else:
        # run one worker per each remote host (for each language)
        host_port_pairs = [(host, 10000) for host in worker_hosts]

    qpsworkers = []
    for language in languages:
        for worker_idx, (host, base_port) in enumerate(host_port_pairs):
            qpsworkers.append(
                create_qpsworker_job(
                    language,
                    shortname='qps_worker_%s_%s' % (language, worker_idx),
                    port=base_port + language.worker_port_offset(),
                    remote_host=host,
                    perf_cmd=perf_cmd))
    return qpsworkers
Jan Tattermusch | b275844 | 2016-03-28 09:32:20 -0700 | [diff] [blame] | 303 | |
| 304 | |
def perf_report_processor_job(worker_host, perf_base_name, output_filename,
                              flame_graph_reports):
    """Creates a jobspec that collects and processes perf data for one worker.

    Args:
      worker_host: host the worker ran on ('localhost' for local workers).
      perf_base_name: base name of the perf.data file to process.
      output_filename: base name (without extension) for the .svg report.
      flame_graph_reports: directory to write flame graph output into.

    Returns:
      A jobset.JobSpec that runs the appropriate processing shell script.
    """
    print('Creating perf report collection job for %s' % worker_host)
    if worker_host != 'localhost':
        user_at_host = "%s@%s" % (_REMOTE_HOST_USERNAME, worker_host)
        # BUG FIX: a space is required between the PERF_BASE_NAME value and the
        # script path; without it the shell sees a single huge environment
        # assignment and no command to execute.
        cmd = "USER_AT_HOST=%s OUTPUT_FILENAME=%s OUTPUT_DIR=%s PERF_BASE_NAME=%s tools/run_tests/performance/process_remote_perf_flamegraphs.sh" % (
            user_at_host, output_filename, flame_graph_reports, perf_base_name)
    else:
        # Same fix for the local variant of the processing script.
        cmd = "OUTPUT_FILENAME=%s OUTPUT_DIR=%s PERF_BASE_NAME=%s tools/run_tests/performance/process_local_perf_flamegraphs.sh" % (
            output_filename, flame_graph_reports, perf_base_name)

    return jobset.JobSpec(
        cmdline=cmd,
        timeout_seconds=3 * 60,
        shell=True,
        verbose_success=True,
        shortname='process perf report')
Alexander Polcyn | 9f08d11 | 2016-10-24 12:25:02 -0700 | [diff] [blame] | 323 | |
| 324 | |
# A single benchmark to run: the driver jobspec, the list of QpsWorkerJob
# objects it depends on (empty for netperf), and a human-readable name.
Scenario = collections.namedtuple('Scenario', 'jobspec workers name')
Craig Tiller | c1b54f2 | 2016-09-15 08:57:14 -0700 | [diff] [blame] | 326 | |
| 327 | |
def create_scenarios(languages,
                     workers_by_lang,
                     remote_host=None,
                     regex='.*',
                     category='all',
                     bq_result_table=None,
                     netperf=False,
                     netperf_hosts=[],
                     server_cpu_load=0):
    """Create jobspecs for scenarios to run.

    Args:
      languages: iterable of scenario_config language objects to benchmark.
      workers_by_lang: dict mapping str(language) -> list of QpsWorkerJob.
      remote_host: host to run the QPS driver on (None = locally).
      regex: only scenarios whose name matches this regex are included.
      category: scenario category filter; 'all' disables the filter.
      bq_result_table: BigQuery 'dataset.table' for result upload, if any.
      netperf: if True, prepend a netperf baseline scenario.
      netperf_hosts: hosts used for the netperf server/client pair.
        NOTE(review): mutable default [] — harmless here since it is only
        read, never mutated.
      server_cpu_load: non-zero enables a targeted server CPU load search.

    Returns:
      A list of Scenario namedtuples.
    """
    # NOTE(review): all_workers appears unused below; kept as-is.
    all_workers = [
        worker for workers in workers_by_lang.values() for worker in workers
    ]
    scenarios = []
    _NO_WORKERS = []

    if netperf:
        # Pick netperf server/client from the given hosts: none -> local,
        # one host -> same host for both, two+ -> first two hosts.
        if not netperf_hosts:
            netperf_server = 'localhost'
            netperf_client = None
        elif len(netperf_hosts) == 1:
            netperf_server = netperf_hosts[0]
            netperf_client = netperf_hosts[0]
        else:
            netperf_server = netperf_hosts[0]
            netperf_client = netperf_hosts[1]
        scenarios.append(
            Scenario(
                create_netperf_jobspec(
                    server_host=netperf_server,
                    client_host=netperf_client,
                    bq_result_table=bq_result_table), _NO_WORKERS, 'netperf'))

    for language in languages:
        for scenario_json in language.scenarios():
            if re.search(regex, scenario_json['name']):
                # Scenarios without an explicit CATEGORIES entry default to
                # both 'scalable' and 'smoketest'.
                categories = scenario_json.get('CATEGORIES',
                                               ['scalable', 'smoketest'])
                if category in categories or category == 'all':
                    # Copy so per-scenario worker substitutions below do not
                    # alter the shared workers_by_lang lists.
                    workers = workers_by_lang[str(language)][:]
                    # 'SERVER_LANGUAGE' is an indicator for this script to pick
                    # a server in different language.
                    custom_server_lang = scenario_json.get('SERVER_LANGUAGE',
                                                           None)
                    custom_client_lang = scenario_json.get('CLIENT_LANGUAGE',
                                                           None)
                    scenario_json = scenario_config.remove_nonproto_fields(
                        scenario_json)
                    if custom_server_lang and custom_client_lang:
                        # NOTE(review): the two adjacent string literals
                        # concatenate without a space, producing
                        # '...SERVER_LANGUAGEin the same scenario'.
                        raise Exception(
                            'Cannot set both custom CLIENT_LANGUAGE and SERVER_LANGUAGE'
                            'in the same scenario')
                    if custom_server_lang:
                        if not workers_by_lang.get(custom_server_lang, []):
                            print('Warning: Skipping scenario %s as' %
                                  scenario_json['name'])
                            print(
                                'SERVER_LANGUAGE is set to %s yet the language has '
                                'not been selected with -l' %
                                custom_server_lang)
                            continue
                        for idx in range(0, scenario_json['num_servers']):
                            # replace first X workers by workers of a different language
                            workers[idx] = workers_by_lang[custom_server_lang][
                                idx]
                    if custom_client_lang:
                        if not workers_by_lang.get(custom_client_lang, []):
                            print('Warning: Skipping scenario %s as' %
                                  scenario_json['name'])
                            print(
                                'CLIENT_LANGUAGE is set to %s yet the language has '
                                'not been selected with -l' %
                                custom_client_lang)
                            continue
                        for idx in range(scenario_json['num_servers'],
                                         len(workers)):
                            # replace all client workers by workers of a different language,
                            # leave num_server workers as they are server workers.
                            workers[idx] = workers_by_lang[custom_client_lang][
                                idx]
                    scenario = Scenario(
                        create_scenario_jobspec(
                            scenario_json, [w.host_and_port for w in workers],
                            remote_host=remote_host,
                            bq_result_table=bq_result_table,
                            server_cpu_load=server_cpu_load), workers,
                        scenario_json['name'])
                    scenarios.append(scenario)

    return scenarios
Jan Tattermusch | b275844 | 2016-03-28 09:32:20 -0700 | [diff] [blame] | 418 | |
| 419 | |
def finish_qps_workers(jobs, qpsworker_jobs):
    """Waits for given jobs to finish and eventually kills them.

    Polls every 3 seconds; after more than 10 polls, still-running jobs are
    force-killed. Returns the number of kill() calls issued.
    """
    poll_count = 0
    killed_total = 0
    while any(j.is_running() for j in jobs):
        for j in (w for w in qpsworker_jobs if w.is_running()):
            print('QPS worker "%s" is still running.' % j.host_and_port)
        if poll_count > 10:
            # Workers did not shut down on their own; force-kill them all.
            print('Killing all QPS workers.')
            for j in jobs:
                j.kill()
                killed_total += 1
        poll_count += 1
        time.sleep(3)
    print('All QPS workers finished.')
    return killed_total
| 437 | |
Jan Tattermusch | b275844 | 2016-03-28 09:32:20 -0700 | [diff] [blame] | 438 | |
# Accumulates the '.svg' flame graph file names produced across all scenarios
# (appended to by run_collect_perf_profile_jobs below).
profile_output_files = []
| 440 | |
ncteisen | 888093c | 2017-12-11 18:00:40 -0800 | [diff] [blame^] | 441 | |
# Collect perf text reports and flamegraphs if perf_cmd was used
# Note the base names of perf text reports are used when creating and processing
# perf data. The scenario name uniquifies the output name in the final
# perf reports directory.
# Also, the perf profiles need to be fetched and processed after each scenario
# in order to avoid clobbering the output files.
def run_collect_perf_profile_jobs(hosts_and_base_names, scenario_name,
                                  flame_graph_reports):
    """Collects and processes perf profiles for one scenario's workers.

    Args:
      hosts_and_base_names: dict mapping 'host:port' -> perf file base name.
      scenario_name: name of the scenario the profiles belong to.
      flame_graph_reports: directory to write flame graph output into.

    Returns:
      The number of failed perf-report jobs.
    """
    global profile_output_files
    report_jobs = []
    for host_and_port, base_name in hosts_and_base_names.items():
        out_name = '%s-%s' % (scenario_name, base_name)
        # from the base filename, create .svg output filename
        profile_output_files.append('%s.svg' % out_name)
        worker_host = host_and_port.split(':')[0]
        report_jobs.append(
            perf_report_processor_job(worker_host, base_name, out_name,
                                      flame_graph_reports))

    jobset.message(
        'START', 'Collecting perf reports from qps workers', do_newline=True)
    failures, _ = jobset.run(report_jobs, newline_on_success=True, maxjobs=1)
    jobset.message(
        'END', 'Collecting perf reports from qps workers', do_newline=True)
    return failures
| 469 | |
Alexander Polcyn | 9f08d11 | 2016-10-24 12:25:02 -0700 | [diff] [blame] | 470 | |
Michael Darakananda | f570fb2 | 2017-08-17 17:09:56 +1000 | [diff] [blame] | 471 | def main(): |
ncteisen | 888093c | 2017-12-11 18:00:40 -0800 | [diff] [blame^] | 472 | argp = argparse.ArgumentParser(description='Run performance tests.') |
| 473 | argp.add_argument( |
| 474 | '-l', |
| 475 | '--language', |
| 476 | choices=['all'] + sorted(scenario_config.LANGUAGES.keys()), |
| 477 | nargs='+', |
| 478 | required=True, |
| 479 | help='Languages to benchmark.') |
| 480 | argp.add_argument( |
| 481 | '--remote_driver_host', |
| 482 | default=None, |
| 483 | help='Run QPS driver on given host. By default, QPS driver is run locally.' |
| 484 | ) |
| 485 | argp.add_argument( |
| 486 | '--remote_worker_host', |
| 487 | nargs='+', |
| 488 | default=[], |
| 489 | help='Worker hosts where to start QPS workers.') |
| 490 | argp.add_argument( |
| 491 | '--dry_run', |
| 492 | default=False, |
| 493 | action='store_const', |
| 494 | const=True, |
| 495 | help='Just list scenarios to be run, but don\'t run them.') |
| 496 | argp.add_argument( |
| 497 | '-r', |
| 498 | '--regex', |
| 499 | default='.*', |
| 500 | type=str, |
| 501 | help='Regex to select scenarios to run.') |
| 502 | argp.add_argument( |
| 503 | '--bq_result_table', |
| 504 | default=None, |
| 505 | type=str, |
| 506 | help='Bigquery "dataset.table" to upload results to.') |
| 507 | argp.add_argument( |
| 508 | '--category', |
| 509 | choices=['smoketest', 'all', 'scalable', 'sweep'], |
| 510 | default='all', |
| 511 | help='Select a category of tests to run.') |
| 512 | argp.add_argument( |
| 513 | '--netperf', |
| 514 | default=False, |
| 515 | action='store_const', |
| 516 | const=True, |
| 517 | help='Run netperf benchmark as one of the scenarios.') |
| 518 | argp.add_argument( |
| 519 | '--server_cpu_load', |
| 520 | default=0, |
| 521 | type=int, |
| 522 | help='Select a targeted server cpu load to run. 0 means ignore this flag' |
| 523 | ) |
| 524 | argp.add_argument( |
| 525 | '-x', |
| 526 | '--xml_report', |
| 527 | default='report.xml', |
| 528 | type=str, |
| 529 | help='Name of XML report file to generate.') |
| 530 | argp.add_argument( |
| 531 | '--perf_args', |
| 532 | help=('Example usage: "--perf_args=record -F 99 -g". ' |
| 533 | 'Wrap QPS workers in a perf command ' |
| 534 | 'with the arguments to perf specified here. ' |
| 535 | '".svg" flame graph profiles will be ' |
| 536 | 'created for each Qps Worker on each scenario. ' |
| 537 | 'Files will output to "<repo_root>/<args.flame_graph_reports>" ' |
| 538 | 'directory. Output files from running the worker ' |
| 539 | 'under perf are saved in the repo root where its ran. ' |
| 540 | 'Note that the perf "-g" flag is necessary for ' |
| 541 | 'flame graphs generation to work (assuming the binary ' |
| 542 | 'being profiled uses frame pointers, check out ' |
| 543 | '"--call-graph dwarf" option using libunwind otherwise.) ' |
| 544 | 'Also note that the entire "--perf_args=<arg(s)>" must ' |
| 545 | 'be wrapped in quotes as in the example usage. ' |
| 546 | 'If the "--perg_args" is unspecified, "perf" will ' |
| 547 | 'not be used at all. ' |
| 548 | 'See http://www.brendangregg.com/perf.html ' |
| 549 | 'for more general perf examples.')) |
| 550 | argp.add_argument( |
| 551 | '--skip_generate_flamegraphs', |
| 552 | default=False, |
| 553 | action='store_const', |
| 554 | const=True, |
| 555 | help=('Turn flame graph generation off. ' |
| 556 | 'May be useful if "perf_args" arguments do not make sense for ' |
| 557 | 'generating flamegraphs (e.g., "--perf_args=stat ...")')) |
| 558 | argp.add_argument( |
| 559 | '-f', |
| 560 | '--flame_graph_reports', |
| 561 | default='perf_reports', |
| 562 | type=str, |
| 563 | help='Name of directory to output flame graph profiles to, if any are created.' |
| 564 | ) |
| 565 | argp.add_argument( |
| 566 | '-u', |
| 567 | '--remote_host_username', |
| 568 | default='', |
| 569 | type=str, |
| 570 | help='Use a username that isn\'t "Jenkins" to SSH into remote workers.') |
Alexander Polcyn | 9f08d11 | 2016-10-24 12:25:02 -0700 | [diff] [blame] | 571 | |
ncteisen | 888093c | 2017-12-11 18:00:40 -0800 | [diff] [blame^] | 572 | args = argp.parse_args() |
Alexander Polcyn | 9f08d11 | 2016-10-24 12:25:02 -0700 | [diff] [blame] | 573 | |
ncteisen | 888093c | 2017-12-11 18:00:40 -0800 | [diff] [blame^] | 574 | global _REMOTE_HOST_USERNAME |
| 575 | if args.remote_host_username: |
| 576 | _REMOTE_HOST_USERNAME = args.remote_host_username |
Matt Kwong | b75db42 | 2017-10-09 17:53:05 -0700 | [diff] [blame] | 577 | |
ncteisen | 888093c | 2017-12-11 18:00:40 -0800 | [diff] [blame^] | 578 | languages = set( |
| 579 | scenario_config.LANGUAGES[l] |
| 580 | for l in itertools.chain.from_iterable( |
| 581 | six.iterkeys(scenario_config.LANGUAGES) if x == 'all' else [x] |
| 582 | for x in args.language)) |
Jan Tattermusch | bb1a453 | 2016-03-30 18:04:01 -0700 | [diff] [blame] | 583 | |
ncteisen | 888093c | 2017-12-11 18:00:40 -0800 | [diff] [blame^] | 584 | # Put together set of remote hosts where to run and build |
| 585 | remote_hosts = set() |
| 586 | if args.remote_worker_host: |
| 587 | for host in args.remote_worker_host: |
| 588 | remote_hosts.add(host) |
| 589 | if args.remote_driver_host: |
| 590 | remote_hosts.add(args.remote_driver_host) |
Jan Tattermusch | 6d7fa55 | 2016-04-14 17:42:54 -0700 | [diff] [blame] | 591 | |
ncteisen | 888093c | 2017-12-11 18:00:40 -0800 | [diff] [blame^] | 592 | if not args.dry_run: |
| 593 | if remote_hosts: |
| 594 | archive_repo(languages=[str(l) for l in languages]) |
| 595 | prepare_remote_hosts(remote_hosts, prepare_local=True) |
| 596 | else: |
| 597 | prepare_remote_hosts([], prepare_local=True) |
Jan Tattermusch | b275844 | 2016-03-28 09:32:20 -0700 | [diff] [blame] | 598 | |
ncteisen | 888093c | 2017-12-11 18:00:40 -0800 | [diff] [blame^] | 599 | build_local = False |
| 600 | if not args.remote_driver_host: |
| 601 | build_local = True |
| 602 | if not args.dry_run: |
| 603 | build_on_remote_hosts( |
| 604 | remote_hosts, |
| 605 | languages=[str(l) for l in languages], |
| 606 | build_local=build_local) |
Jan Tattermusch | b275844 | 2016-03-28 09:32:20 -0700 | [diff] [blame] | 607 | |
ncteisen | 888093c | 2017-12-11 18:00:40 -0800 | [diff] [blame^] | 608 | perf_cmd = None |
| 609 | if args.perf_args: |
| 610 | print('Running workers under perf profiler') |
| 611 | # Expect /usr/bin/perf to be installed here, as is usual |
| 612 | perf_cmd = ['/usr/bin/perf'] |
| 613 | perf_cmd.extend(re.split('\s+', args.perf_args)) |
Jan Tattermusch | b275844 | 2016-03-28 09:32:20 -0700 | [diff] [blame] | 614 | |
ncteisen | 888093c | 2017-12-11 18:00:40 -0800 | [diff] [blame^] | 615 | qpsworker_jobs = create_qpsworkers( |
| 616 | languages, args.remote_worker_host, perf_cmd=perf_cmd) |
Alexander Polcyn | 9f08d11 | 2016-10-24 12:25:02 -0700 | [diff] [blame] | 617 | |
ncteisen | 888093c | 2017-12-11 18:00:40 -0800 | [diff] [blame^] | 618 | # get list of worker addresses for each language. |
| 619 | workers_by_lang = dict([(str(language), []) for language in languages]) |
| 620 | for job in qpsworker_jobs: |
| 621 | workers_by_lang[str(job.language)].append(job) |
Jan Tattermusch | 38becc2 | 2016-04-14 08:00:35 -0700 | [diff] [blame] | 622 | |
ncteisen | 888093c | 2017-12-11 18:00:40 -0800 | [diff] [blame^] | 623 | scenarios = create_scenarios( |
| 624 | languages, |
| 625 | workers_by_lang=workers_by_lang, |
| 626 | remote_host=args.remote_driver_host, |
| 627 | regex=args.regex, |
| 628 | category=args.category, |
| 629 | bq_result_table=args.bq_result_table, |
| 630 | netperf=args.netperf, |
| 631 | netperf_hosts=args.remote_worker_host, |
| 632 | server_cpu_load=args.server_cpu_load) |
Jan Tattermusch | b275844 | 2016-03-28 09:32:20 -0700 | [diff] [blame] | 633 | |
ncteisen | 888093c | 2017-12-11 18:00:40 -0800 | [diff] [blame^] | 634 | if not scenarios: |
| 635 | raise Exception('No scenarios to run') |
Jan Tattermusch | 4de2c32 | 2016-05-10 14:33:07 -0700 | [diff] [blame] | 636 | |
ncteisen | 888093c | 2017-12-11 18:00:40 -0800 | [diff] [blame^] | 637 | total_scenario_failures = 0 |
| 638 | qps_workers_killed = 0 |
| 639 | merged_resultset = {} |
| 640 | perf_report_failures = 0 |
Jan Tattermusch | b275844 | 2016-03-28 09:32:20 -0700 | [diff] [blame] | 641 | |
ncteisen | 888093c | 2017-12-11 18:00:40 -0800 | [diff] [blame^] | 642 | for scenario in scenarios: |
| 643 | if args.dry_run: |
| 644 | print(scenario.name) |
| 645 | else: |
| 646 | scenario_failures = 0 |
| 647 | try: |
| 648 | for worker in scenario.workers: |
| 649 | worker.start() |
| 650 | jobs = [scenario.jobspec] |
| 651 | if scenario.workers: |
| 652 | jobs.append( |
| 653 | create_quit_jobspec( |
| 654 | scenario.workers, |
| 655 | remote_host=args.remote_driver_host)) |
| 656 | scenario_failures, resultset = jobset.run( |
| 657 | jobs, newline_on_success=True, maxjobs=1) |
| 658 | total_scenario_failures += scenario_failures |
| 659 | merged_resultset = dict( |
| 660 | itertools.chain( |
| 661 | six.iteritems(merged_resultset), |
| 662 | six.iteritems(resultset))) |
| 663 | finally: |
| 664 | # Consider qps workers that need to be killed as failures |
| 665 | qps_workers_killed += finish_qps_workers(scenario.workers, |
| 666 | qpsworker_jobs) |
Alexander Polcyn | 9f08d11 | 2016-10-24 12:25:02 -0700 | [diff] [blame] | 667 | |
ncteisen | 888093c | 2017-12-11 18:00:40 -0800 | [diff] [blame^] | 668 | if perf_cmd and scenario_failures == 0 and not args.skip_generate_flamegraphs: |
| 669 | workers_and_base_names = {} |
| 670 | for worker in scenario.workers: |
| 671 | if not worker.perf_file_base_name: |
| 672 | raise Exception( |
| 673 | 'using perf buf perf report filename is unspecified') |
| 674 | workers_and_base_names[ |
| 675 | worker.host_and_port] = worker.perf_file_base_name |
| 676 | perf_report_failures += run_collect_perf_profile_jobs( |
| 677 | workers_and_base_names, scenario.name, |
| 678 | args.flame_graph_reports) |
Alexander Polcyn | 4979667 | 2016-10-17 10:01:37 -0700 | [diff] [blame] | 679 | |
ncteisen | 888093c | 2017-12-11 18:00:40 -0800 | [diff] [blame^] | 680 | # Still write the index.html even if some scenarios failed. |
| 681 | # 'profile_output_files' will only have names for scenarios that passed |
| 682 | if perf_cmd and not args.skip_generate_flamegraphs: |
| 683 | # write the index fil to the output dir, with all profiles from all scenarios/workers |
| 684 | report_utils.render_perf_profiling_results( |
| 685 | '%s/index.html' % args.flame_graph_reports, profile_output_files) |
Alexander Polcyn | 9f08d11 | 2016-10-24 12:25:02 -0700 | [diff] [blame] | 686 | |
ncteisen | 888093c | 2017-12-11 18:00:40 -0800 | [diff] [blame^] | 687 | report_utils.render_junit_xml_report( |
| 688 | merged_resultset, args.xml_report, suite_name='benchmarks') |
Alexander Polcyn | 9f08d11 | 2016-10-24 12:25:02 -0700 | [diff] [blame] | 689 | |
ncteisen | 888093c | 2017-12-11 18:00:40 -0800 | [diff] [blame^] | 690 | if total_scenario_failures > 0 or qps_workers_killed > 0: |
| 691 | print('%s scenarios failed and %s qps worker jobs killed' % |
| 692 | (total_scenario_failures, qps_workers_killed)) |
| 693 | sys.exit(1) |
Alexander Polcyn | 9f08d11 | 2016-10-24 12:25:02 -0700 | [diff] [blame] | 694 | |
ncteisen | 888093c | 2017-12-11 18:00:40 -0800 | [diff] [blame^] | 695 | if perf_report_failures > 0: |
| 696 | print('%s perf profile collection jobs failed' % perf_report_failures) |
| 697 | sys.exit(1) |
Alexander Polcyn | 41fe579 | 2017-02-02 10:46:51 -0800 | [diff] [blame] | 698 | |
Michael Darakananda | f570fb2 | 2017-08-17 17:09:56 +1000 | [diff] [blame] | 699 | |
if __name__ == "__main__":
    # Entry point when executed as a script (no-op on import).
    main()