Blame - tools/run_tests/run_performance_tests.py - platform/external/grpc-grpc

2017-06-07 22:57:36 +0200

[diff] [blame]

2

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

3

#

Jan Tattermusch

2017-06-07 22:57:36 +0200

[diff] [blame]

4

# Licensed under the Apache License, Version 2.0 (the "License");

5

# you may not use this file except in compliance with the License.

6

# You may obtain a copy of the License at

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

7

#

Jan Tattermusch

2017-06-07 22:57:36 +0200

[diff] [blame]

8

# http://www.apache.org/licenses/LICENSE-2.0

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

9

#

Jan Tattermusch

2017-06-07 22:57:36 +0200

[diff] [blame]

10

# Unless required by applicable law or agreed to in writing, software

11

# distributed under the License is distributed on an "AS IS" BASIS,

12

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

13

# See the License for the specific language governing permissions and

14

# limitations under the License.

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

15

"""Run performance tests locally or remotely."""

16

siddharthshukla

0589e53

2016-07-07 16:08:01 +0200

[diff] [blame]

17

from __future__ import print_function

18

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

19

import argparse

Craig Tiller

accf16b

2016-09-15 09:08:32 -0700

[diff] [blame]

20

import collections

Jan Tattermusch

bb1a453

2016-03-30 18:04:01 -0700

[diff] [blame]

21

import itertools

Craig Tiller

2016-03-03 12:51:53 -0800

[diff] [blame]

22

import json

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

23

import multiprocessing

24

import os

Craig Tiller

2016-03-03 12:51:53 -0800

[diff] [blame]

25

import pipes

Jan Tattermusch

38becc2

2016-04-14 08:00:35 -0700

[diff] [blame]

26

import re

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

import subprocess

import sys

import tempfile

import time

Jan Tattermusch

ee9032c

2016-04-14 08:35:51 -0700

[diff] [blame]

31

import traceback

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

32

import uuid

Siddharth Shukla

d194f59

2017-03-11 19:12:43 +0100

[diff] [blame]

33

import six

Jan Tattermusch

5c79a31

2016-12-20 11:02:50 +0100

[diff] [blame]

34

35

import performance.scenario_config as scenario_config

36

import python_utils.jobset as jobset

37

import python_utils.report_utils as report_utils

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

38

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

39

# Repository root: two directory levels above the directory of the invoked
# script path. The process switches into it so that the relative tool paths
# used throughout this file resolve.
_ROOT = os.path.abspath(
    os.path.join(os.path.dirname(sys.argv[0]), '../..'))
os.chdir(_ROOT)

# User name used to build "user@host" SSH targets; main() replaces it when
# --remote_host_username is given.
_REMOTE_HOST_USERNAME = 'jenkins'

43

44

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

45

class QpsWorkerJob(object):
    """Encapsulates a qps worker server job.

    Attributes:
      language: the language object this worker was created for.
      host_and_port: 'host:port' string identifying the worker.
      perf_file_base_name: base name of the perf output files, or None when
        the worker is not wrapped in a perf command.
    """

    def __init__(self, spec, language, host_and_port, perf_file_base_name=None):
        self._spec = spec
        self.language = language
        self.host_and_port = host_and_port
        # Set by start() and cleared again by kill().
        self._job = None
        self.perf_file_base_name = perf_file_base_name

    def start(self):
        """Creates a jobset.Job from the stored spec and keeps a handle to it."""
        self._job = jobset.Job(
            self._spec, newline_on_success=True, travis=True, add_env={})

    def is_running(self):
        """Polls a job and returns True if given job is still running.

        Always returns a real bool: False when the worker has not been
        started or has already been killed.
        """
        if not self._job:
            return False
        return self._job.state() == jobset._RUNNING

    def kill(self):
        """Kills the underlying job, if any, and forgets it."""
        if self._job:
            self._job.kill()
            self._job = None

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

67

68

ncteisen

2017-12-11 18:00:40 -0800

[diff] [blame]

69

def create_qpsworker_job(language,
                         shortname=None,
                         port=10000,
                         remote_host=None,
                         perf_cmd=None):
    """Builds a (not yet started) QpsWorkerJob for one worker process.

    Args:
      language: language object; must provide worker_cmdline().
      shortname: short name given to the job spec.
      port: driver port the worker is told to use.
      remote_host: host to start the worker on over SSH; None runs it locally.
      perf_cmd: optional list with a perf command to prefix the worker with.

    Returns:
      A QpsWorkerJob wrapping the job spec.
    """
    worker_timeout = 3 * 60
    worker_args = language.worker_cmdline() + ['--driver_port=%s' % port]
    host_and_port = '%s:%s' % (remote_host if remote_host else 'localhost',
                               port)

    perf_file_base_name = None
    if perf_cmd:
        perf_file_base_name = '%s-%s' % (host_and_port, shortname)
        # specify -o output file so perf.data gets collected when worker stopped
        perf_output = '%s-perf.data' % perf_file_base_name
        worker_args = perf_cmd + ['-o', perf_output] + worker_args

    if remote_host:
        # The remote command gets a slightly longer "timeout" than the job.
        remote_args = ['timeout', '%s' % (worker_timeout + 30)] + worker_args
        remote_shell_cmd = (
            'cd ~/performance_workspace/grpc/ && '
            'python tools/run_tests/start_port_server.py && %s' %
            ' '.join(remote_args))
        cmdline = [
            'ssh',
            str('%s@%s' % (_REMOTE_HOST_USERNAME, remote_host)),
            remote_shell_cmd,
        ]
    else:
        cmdline = worker_args

    jobspec = jobset.JobSpec(
        cmdline=cmdline,
        shortname=shortname,
        timeout_seconds=worker_timeout,  # workers get restarted after each scenario
        verbose_success=True)
    return QpsWorkerJob(jobspec, language, host_and_port, perf_file_base_name)

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

107

108

ncteisen

2017-12-11 18:00:40 -0800

[diff] [blame]

109

def create_scenario_jobspec(scenario_json,
                            workers,
                            remote_host=None,
                            bq_result_table=None,
                            server_cpu_load=0):
    """Builds the job spec that runs one scenario through the QPS driver.

    Args:
      scenario_json: dict describing the scenario; its 'name' is used for the
        job's short name.
      workers: iterable of 'host:port' strings passed via QPS_WORKERS.
      remote_host: when set, the command is wrapped in an SSH invocation.
      bq_result_table: optional value exported as BQ_RESULT_TABLE.
      server_cpu_load: when non-zero, adds the offered-load search flags.
    """
    # setting QPS_WORKERS env variable here makes sure it works with SSH too.
    pieces = ['QPS_WORKERS="%s" ' % ','.join(workers)]
    if bq_result_table:
        pieces.append('BQ_RESULT_TABLE="%s" ' % bq_result_table)
    pieces.append('tools/run_tests/performance/run_qps_driver.sh ')
    pieces.append('--scenarios_json=%s ' % pipes.quote(
        json.dumps({
            'scenarios': [scenario_json]
        })))
    pieces.append('--scenario_result_file=scenario_result.json ')
    if server_cpu_load != 0:
        pieces.append(
            '--search_param=offered_load --initial_search_value=1000 --targeted_cpu_load=%d --stride=500 --error_tolerance=0.01'
            % server_cpu_load)
    cmd = ''.join(pieces)

    if remote_host:
        ssh_target = '%s@%s' % (_REMOTE_HOST_USERNAME, remote_host)
        cmd = 'ssh %s "cd ~/performance_workspace/grpc/ && "%s' % (
            ssh_target, pipes.quote(cmd))

    return jobset.JobSpec(
        cmdline=[cmd],
        shortname='qps_json_driver.%s' % scenario_json['name'],
        timeout_seconds=12 * 60,
        shell=True,
        verbose_success=True)

Craig Tiller

2016-03-03 12:51:53 -0800

[diff] [blame]

138

139

140

def create_quit_jobspec(workers, remote_host=None):
    """Builds the job spec that runs the QPS driver with --quit.

    Args:
      workers: worker objects; each one's host_and_port goes into QPS_WORKERS.
      remote_host: when set, the command is wrapped in an SSH invocation.
    """
    # setting QPS_WORKERS env variable here makes sure it works with SSH too.
    worker_addresses = ','.join(w.host_and_port for w in workers)
    cmd = 'QPS_WORKERS="%s" bins/opt/qps_json_driver --quit' % worker_addresses

    if remote_host:
        ssh_target = '%s@%s' % (_REMOTE_HOST_USERNAME, remote_host)
        cmd = 'ssh %s "cd ~/performance_workspace/grpc/ && "%s' % (
            ssh_target, pipes.quote(cmd))

    return jobset.JobSpec(
        cmdline=[cmd],
        shortname='qps_json_driver.quit',
        timeout_seconds=3 * 60,
        shell=True,
        verbose_success=True)

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

156

157

ncteisen

2017-12-11 18:00:40 -0800

[diff] [blame]

158

def create_netperf_jobspec(server_host='localhost',
                           client_host=None,
                           bq_result_table=None):
    """Builds the job spec that runs the netperf benchmark script.

    Args:
      server_host: value exported as NETPERF_SERVER_HOST.
      client_host: when set, the script is run on that host over SSH.
      bq_result_table: optional value exported as BQ_RESULT_TABLE.
    """
    env_prefix = ['NETPERF_SERVER_HOST="%s" ' % server_host]
    if bq_result_table:
        env_prefix.append('BQ_RESULT_TABLE="%s" ' % bq_result_table)
    if client_host:
        # If netperf is running remotely, the env variables populated by Jenkins
        # won't be available on the client, but we need them for uploading results
        # to BigQuery.
        for var_name in ('JOB_NAME', 'BUILD_NUMBER'):
            var_value = os.getenv(var_name)
            if var_value:
                env_prefix.append('%s="%s" ' % (var_name, var_value))

    cmd = ''.join(env_prefix) + 'tools/run_tests/performance/run_netperf.sh'
    if client_host:
        ssh_target = '%s@%s' % (_REMOTE_HOST_USERNAME, client_host)
        cmd = 'ssh %s "cd ~/performance_workspace/grpc/ && "%s' % (
            ssh_target, pipes.quote(cmd))

    return jobset.JobSpec(
        cmdline=[cmd],
        shortname='netperf',
        timeout_seconds=60,
        shell=True,
        verbose_success=True)

Jan Tattermusch

2016-05-10 14:33:07 -0700

[diff] [blame]

188

189

Jan Tattermusch

de874a1

2016-04-18 09:21:37 -0700

[diff] [blame]

190

def archive_repo(languages):
    """Archives local version of repo including submodules.

    Creates ../grpc.tar from ../grpc/, adding ../grpc-java and ../grpc-go
    when 'java' / 'go' are among the given languages. Exits the process with
    status 1 if the archive job fails.
    """
    archived_paths = ['../grpc/']
    for lang_name, sibling_repo in (('java', '../grpc-java'),
                                    ('go', '../grpc-go')):
        if lang_name in languages:
            archived_paths.append(sibling_repo)

    archive_job = jobset.JobSpec(
        cmdline=['tar', '-cf', '../grpc.tar'] + archived_paths,
        shortname='archive_repo',
        timeout_seconds=3 * 60)

    jobset.message('START', 'Archiving local repository.', do_newline=True)
    num_failures, _ = jobset.run(
        [archive_job], newline_on_success=True, maxjobs=1)
    if num_failures != 0:
        jobset.message(
            'FAILED', 'Failed to archive local repository.', do_newline=True)
        sys.exit(1)
    jobset.message(
        'SUCCESS',
        'Archive with local repository created successfully.',
        do_newline=True)

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

213

214

Jan Tattermusch

1408920

2016-04-27 17:55:27 -0700

[diff] [blame]

215

def prepare_remote_hosts(hosts, prepare_local=False):
    """Prepares remote hosts (and maybe prepare localhost as well).

    Runs remote_host_prepare.sh once per host and, when prepare_local is
    true, kill_workers.sh locally. Exits the process with status 1 if any
    of those jobs fails.
    """
    prepare_timeout = 10 * 60
    prepare_jobs = [
        jobset.JobSpec(
            cmdline=['tools/run_tests/performance/remote_host_prepare.sh'],
            shortname='remote_host_prepare.%s' % host,
            environ={'USER_AT_HOST': '%s@%s' % (_REMOTE_HOST_USERNAME, host)},
            timeout_seconds=prepare_timeout) for host in hosts
    ]
    if prepare_local:
        # Prepare localhost as well
        local_job = jobset.JobSpec(
            cmdline=['tools/run_tests/performance/kill_workers.sh'],
            shortname='local_prepare',
            timeout_seconds=prepare_timeout)
        prepare_jobs.append(local_job)

    jobset.message('START', 'Preparing hosts.', do_newline=True)
    num_failures, _ = jobset.run(
        prepare_jobs, newline_on_success=True, maxjobs=10)
    if num_failures != 0:
        jobset.message(
            'FAILED', 'Failed to prepare remote hosts.', do_newline=True)
        sys.exit(1)
    jobset.message(
        'SUCCESS', 'Prepare step completed successfully.', do_newline=True)

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

244

245

ncteisen

2017-12-11 18:00:40 -0800

[diff] [blame]

246

def build_on_remote_hosts(hosts,
                          languages=scenario_config.LANGUAGES.keys(),
                          build_local=False):
    """Builds performance worker on remote hosts (and maybe also locally).

    Args:
      hosts: iterable of remote host names to build on.
      languages: iterable of language names passed as arguments to the build
        scripts; defaults to every language in scenario_config.LANGUAGES.
      build_local: when true, also runs build_performance.sh locally.

    Exits the process with status 1 if any build job fails.
    """
    # On Python 3 dict.keys() is a view, and "list + view" raises TypeError,
    # so normalize whatever iterable was passed into a list first.
    languages = list(languages)
    build_timeout = 15 * 60
    # Kokoro VMs (which are local only) do not have caching, so they need more time to build
    local_build_timeout = 30 * 60
    build_jobs = []
    for host in hosts:
        user_at_host = '%s@%s' % (_REMOTE_HOST_USERNAME, host)
        build_jobs.append(
            jobset.JobSpec(
                cmdline=['tools/run_tests/performance/remote_host_build.sh'] +
                languages,
                shortname='remote_host_build.%s' % host,
                environ={'USER_AT_HOST': user_at_host,
                         'CONFIG': 'opt'},
                timeout_seconds=build_timeout))
    if build_local:
        # Build locally as well
        build_jobs.append(
            jobset.JobSpec(
                cmdline=['tools/run_tests/performance/build_performance.sh'] +
                languages,
                shortname='local_build',
                environ={'CONFIG': 'opt'},
                timeout_seconds=local_build_timeout))
    jobset.message('START', 'Building.', do_newline=True)
    num_failures, _ = jobset.run(
        build_jobs, newline_on_success=True, maxjobs=10)
    if num_failures == 0:
        jobset.message('SUCCESS', 'Built successfully.', do_newline=True)
    else:
        jobset.message('FAILED', 'Build failed.', do_newline=True)
        sys.exit(1)

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

281

282

Alexander Polcyn

2016-10-24 12:25:02 -0700

[diff] [blame]

283

def create_qpsworkers(languages, worker_hosts, perf_cmd=None):
    """Creates QPS workers (but does not start them).

    With no worker hosts, or exactly one, two workers per language are
    created on that host (None meaning local) at base ports 10000 and 10010.
    With several hosts, one worker per host per language is created at base
    port 10000. Each language's worker_port_offset() is added to the base.
    """
    if worker_hosts and len(worker_hosts) > 1:
        # run one worker per each remote host (for each language)
        host_port_pairs = [(host, 10000) for host in worker_hosts]
    else:
        # run two workers locally, or on the single remote host (for each language)
        only_host = worker_hosts[0] if worker_hosts else None
        host_port_pairs = [(only_host, 10000), (only_host, 10010)]

    worker_jobs = []
    for language in languages:
        for worker_idx, (host, base_port) in enumerate(host_port_pairs):
            worker_jobs.append(
                create_qpsworker_job(
                    language,
                    shortname='qps_worker_%s_%s' % (language, worker_idx),
                    port=base_port + language.worker_port_offset(),
                    remote_host=host,
                    perf_cmd=perf_cmd))
    return worker_jobs

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

305

306

ncteisen

2017-12-11 18:00:40 -0800

[diff] [blame]

307

def perf_report_processor_job(worker_host, perf_base_name, output_filename,
                              flame_graph_reports):
    """Builds the job spec that turns a worker's perf data into flame graphs.

    Args:
      worker_host: host the worker ran on; 'localhost' selects the local
        processing script, anything else the remote one.
      perf_base_name: value exported as PERF_BASE_NAME.
      output_filename: value exported as OUTPUT_FILENAME.
      flame_graph_reports: value exported as OUTPUT_DIR.
    """
    print('Creating perf report collection job for %s' % worker_host)
    # A space must separate the last VAR=value assignment from the script
    # path; without it the path becomes part of PERF_BASE_NAME and the shell
    # runs no command at all.
    if worker_host != 'localhost':
        user_at_host = "%s@%s" % (_REMOTE_HOST_USERNAME, worker_host)
        cmd = ("USER_AT_HOST=%s OUTPUT_FILENAME=%s OUTPUT_DIR=%s "
               "PERF_BASE_NAME=%s "
               "tools/run_tests/performance/process_remote_perf_flamegraphs.sh"
               % (user_at_host, output_filename, flame_graph_reports,
                  perf_base_name))
    else:
        cmd = ("OUTPUT_FILENAME=%s OUTPUT_DIR=%s PERF_BASE_NAME=%s "
               "tools/run_tests/performance/process_local_perf_flamegraphs.sh"
               % (output_filename, flame_graph_reports, perf_base_name))

    # Pass the shell command as a one-element list, like every other
    # shell=True job spec in this file.
    return jobset.JobSpec(
        cmdline=[cmd],
        timeout_seconds=3 * 60,
        shell=True,
        verbose_success=True,
        shortname='process perf report')

Alexander Polcyn

2016-10-24 12:25:02 -0700

[diff] [blame]

325

326

Craig Tiller

677966a

2016-09-26 07:37:28 -0700

[diff] [blame]

327

# One runnable scenario: the job spec to execute, the worker jobs it needs,
# and the scenario's name.
Scenario = collections.namedtuple('Scenario', ['jobspec', 'workers', 'name'])

Craig Tiller

c1b54f2

2016-09-15 08:57:14 -0700

[diff] [blame]

328

329

ncteisen

2017-12-11 18:00:40 -0800

[diff] [blame]

330

def _pick_netperf_hosts(netperf_hosts):
    """Returns the (server_host, client_host) pair for the netperf scenario."""
    if not netperf_hosts:
        return 'localhost', None
    if len(netperf_hosts) == 1:
        return netperf_hosts[0], netperf_hosts[0]
    return netperf_hosts[0], netperf_hosts[1]


def create_scenarios(languages,
                     workers_by_lang,
                     remote_host=None,
                     regex='.*',
                     category='all',
                     bq_result_table=None,
                     netperf=False,
                     netperf_hosts=None,
                     server_cpu_load=0):
    """Create jobspecs for scenarios to run.

    Args:
      languages: iterable of language objects providing scenarios().
      workers_by_lang: dict mapping str(language) to that language's list of
        worker jobs.
      remote_host: host to run the QPS driver on; None runs it locally.
      regex: only scenarios whose name matches (re.search) are kept.
      category: only scenarios in this category are kept; 'all' keeps all.
      bq_result_table: passed through to the created job specs.
      netperf: when true, a netperf scenario is added first.
      netperf_hosts: hosts for netperf; None or empty means localhost.
      server_cpu_load: passed through to create_scenario_jobspec.

    Returns:
      A list of Scenario tuples.

    Raises:
      Exception: if a scenario sets both CLIENT_LANGUAGE and SERVER_LANGUAGE.
    """
    scenarios = []

    if netperf:
        netperf_server, netperf_client = _pick_netperf_hosts(netperf_hosts)
        scenarios.append(
            Scenario(
                create_netperf_jobspec(
                    server_host=netperf_server,
                    client_host=netperf_client,
                    bq_result_table=bq_result_table), [], 'netperf'))

    for language in languages:
        for scenario_json in language.scenarios():
            if not re.search(regex, scenario_json['name']):
                continue
            categories = scenario_json.get('CATEGORIES',
                                           ['scalable', 'smoketest'])
            if category != 'all' and category not in categories:
                continue

            # Work on a copy so per-scenario substitutions don't leak.
            workers = workers_by_lang[str(language)][:]
            # 'SERVER_LANGUAGE' is an indicator for this script to pick
            # a server in different language.
            custom_server_lang = scenario_json.get('SERVER_LANGUAGE', None)
            custom_client_lang = scenario_json.get('CLIENT_LANGUAGE', None)
            scenario_json = scenario_config.remove_nonproto_fields(
                scenario_json)
            if custom_server_lang and custom_client_lang:
                raise Exception(
                    'Cannot set both custom CLIENT_LANGUAGE and SERVER_LANGUAGE '
                    'in the same scenario')

            if custom_server_lang:
                if not workers_by_lang.get(custom_server_lang, []):
                    print('Warning: Skipping scenario %s as' %
                          scenario_json['name'])
                    print('SERVER_LANGUAGE is set to %s yet the language has '
                          'not been selected with -l' % custom_server_lang)
                    continue
                for idx in range(0, scenario_json['num_servers']):
                    # replace first X workers by workers of a different language
                    workers[idx] = workers_by_lang[custom_server_lang][idx]

            if custom_client_lang:
                if not workers_by_lang.get(custom_client_lang, []):
                    print('Warning: Skipping scenario %s as' %
                          scenario_json['name'])
                    print('CLIENT_LANGUAGE is set to %s yet the language has '
                          'not been selected with -l' % custom_client_lang)
                    continue
                for idx in range(scenario_json['num_servers'], len(workers)):
                    # replace all client workers by workers of a different language,
                    # leave num_server workers as they are server workers.
                    workers[idx] = workers_by_lang[custom_client_lang][idx]

            scenarios.append(
                Scenario(
                    create_scenario_jobspec(
                        scenario_json, [w.host_and_port for w in workers],
                        remote_host=remote_host,
                        bq_result_table=bq_result_table,
                        server_cpu_load=server_cpu_load), workers,
                    scenario_json['name']))

    return scenarios

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

420

421

Michael Darakananda

2017-08-17 17:09:56 +1000

[diff] [blame]

422

def finish_qps_workers(jobs,
                       qpsworker_jobs,
                       max_retries=10,
                       poll_interval_seconds=3):
    """Waits for given jobs to finish and eventually kills them.

    Args:
      jobs: worker jobs to wait for; each provides is_running() and kill().
      qpsworker_jobs: worker jobs whose still-running state is reported on
        every poll.
      max_retries: once the poll count exceeds this, all jobs are killed.
      poll_interval_seconds: sleep between polls.

    Returns:
      The number of workers that were still running when they were killed.
    """
    retries = 0
    num_killed = 0
    while any(job.is_running() for job in jobs):
        for job in qpsworker_jobs:
            if job.is_running():
                print('QPS worker "%s" is still running.' % job.host_and_port)
        if retries > max_retries:
            print('Killing all QPS workers.')
            for job in jobs:
                # kill() is still called on every job, but only workers that
                # actually had to be stopped count as killed.
                was_running = job.is_running()
                job.kill()
                if was_running:
                    num_killed += 1
        retries += 1
        time.sleep(poll_interval_seconds)
    print('All QPS workers finished.')
    return num_killed

439

Jan Tattermusch

2016-03-28 09:32:20 -0700

[diff] [blame]

440

Alexander Polcyn

2016-10-24 12:25:02 -0700

[diff] [blame]

441

# Names of the .svg flame graph files recorded so far. Appended to by
# run_collect_perf_profile_jobs() and passed by main() to the index renderer.
profile_output_files = []

442

ncteisen

2017-12-11 18:00:40 -0800

[diff] [blame]

443

Alexander Polcyn

2016-10-24 12:25:02 -0700

[diff] [blame]

444

# Collect perf text reports and flamegraphs if perf_cmd was used

445

# Note the base names of perf text reports are used when creating and processing

446

# perf data. The scenario name uniqifies the output name in the final

Michael Darakananda

2017-08-17 17:09:56 +1000

[diff] [blame]

447

# perf reports directory.

Alexander Polcyn

2016-10-24 12:25:02 -0700

[diff] [blame]

448

# Also, the perf profiles need to be fetched and processed after each scenario

449

# in order to avoid clobbering the output files.

ncteisen

2017-12-11 18:00:40 -0800

[diff] [blame]

450

def run_collect_perf_profile_jobs(hosts_and_base_names, scenario_name,
                                  flame_graph_reports):
    """Runs one perf report processing job per worker, sequentially.

    Args:
      hosts_and_base_names: dict mapping a worker's 'host:port' string to its
        perf file base name.
      scenario_name: prefix used for each output file name.
      flame_graph_reports: output directory passed to the processing jobs.

    Returns:
      The failure count reported by jobset.run.

    Side effect: appends each '<output>.svg' name to the module-level
    profile_output_files list.
    """
    report_jobs = []
    for worker_address in hosts_and_base_names:
        base_name = hosts_and_base_names[worker_address]
        report_name = '%s-%s' % (scenario_name, base_name)
        # from the base filename, create .svg output filename
        worker_host = worker_address.split(':')[0]
        profile_output_files.append('%s.svg' % report_name)
        report_jobs.append(
            perf_report_processor_job(worker_host, base_name, report_name,
                                      flame_graph_reports))

    jobset.message(
        'START', 'Collecting perf reports from qps workers', do_newline=True)
    failures, _ = jobset.run(
        report_jobs, newline_on_success=True, maxjobs=1)
    jobset.message(
        'END', 'Collecting perf reports from qps workers', do_newline=True)
    return failures

471

Alexander Polcyn

2016-10-24 12:25:02 -0700

[diff] [blame]

472

Michael Darakananda

2017-08-17 17:09:56 +1000

[diff] [blame]

473

def _create_argument_parser():
    """Builds the command-line argument parser used by main()."""
    argp = argparse.ArgumentParser(description='Run performance tests.')
    argp.add_argument(
        '-l',
        '--language',
        choices=['all'] + sorted(scenario_config.LANGUAGES.keys()),
        nargs='+',
        required=True,
        help='Languages to benchmark.')
    argp.add_argument(
        '--remote_driver_host',
        default=None,
        help=
        'Run QPS driver on given host. By default, QPS driver is run locally.')
    argp.add_argument(
        '--remote_worker_host',
        nargs='+',
        default=[],
        help='Worker hosts where to start QPS workers.')
    argp.add_argument(
        '--dry_run',
        default=False,
        action='store_const',
        const=True,
        help='Just list scenarios to be run, but don\'t run them.')
    argp.add_argument(
        '-r',
        '--regex',
        default='.*',
        type=str,
        help='Regex to select scenarios to run.')
    argp.add_argument(
        '--bq_result_table',
        default=None,
        type=str,
        help='Bigquery "dataset.table" to upload results to.')
    argp.add_argument(
        '--category',
        choices=['smoketest', 'all', 'scalable', 'sweep'],
        default='all',
        help='Select a category of tests to run.')
    argp.add_argument(
        '--netperf',
        default=False,
        action='store_const',
        const=True,
        help='Run netperf benchmark as one of the scenarios.')
    argp.add_argument(
        '--server_cpu_load',
        default=0,
        type=int,
        help='Select a targeted server cpu load to run. 0 means ignore this flag'
    )
    argp.add_argument(
        '-x',
        '--xml_report',
        default='report.xml',
        type=str,
        help='Name of XML report file to generate.')
    argp.add_argument(
        '--perf_args',
        help=('Example usage: "--perf_args=record -F 99 -g". '
              'Wrap QPS workers in a perf command '
              'with the arguments to perf specified here. '
              '".svg" flame graph profiles will be '
              'created for each Qps Worker on each scenario. '
              'Files will output to "<repo_root>/<args.flame_graph_reports>" '
              'directory. Output files from running the worker '
              'under perf are saved in the repo root where its ran. '
              'Note that the perf "-g" flag is necessary for '
              'flame graphs generation to work (assuming the binary '
              'being profiled uses frame pointers, check out '
              '"--call-graph dwarf" option using libunwind otherwise.) '
              'Also note that the entire "--perf_args=<arg(s)>" must '
              'be wrapped in quotes as in the example usage. '
              'If the "--perf_args" is unspecified, "perf" will '
              'not be used at all. '
              'See http://www.brendangregg.com/perf.html '
              'for more general perf examples.'))
    argp.add_argument(
        '--skip_generate_flamegraphs',
        default=False,
        action='store_const',
        const=True,
        help=('Turn flame graph generation off. '
              'May be useful if "perf_args" arguments do not make sense for '
              'generating flamegraphs (e.g., "--perf_args=stat ...")'))
    argp.add_argument(
        '-f',
        '--flame_graph_reports',
        default='perf_reports',
        type=str,
        help=
        'Name of directory to output flame graph profiles to, if any are created.'
    )
    argp.add_argument(
        '-u',
        '--remote_host_username',
        default='',
        type=str,
        help='Use a username that isn\'t "Jenkins" to SSH into remote workers.')
    return argp


def main():
    """Parses arguments, prepares and builds hosts, then runs the scenarios.

    Exits with status 1 when any scenario fails, any QPS worker had to be
    killed, or any perf report collection job fails.

    Raises:
      Exception: if no scenario matches the selection, or a worker has no
        perf file base name although perf is in use.
    """
    args = _create_argument_parser().parse_args()

    global _REMOTE_HOST_USERNAME
    if args.remote_host_username:
        _REMOTE_HOST_USERNAME = args.remote_host_username

    # 'all' expands to every known language; duplicates collapse in the set.
    languages = set(
        scenario_config.LANGUAGES[l]
        for l in itertools.chain.from_iterable(
            six.iterkeys(scenario_config.LANGUAGES) if x == 'all' else [x]
            for x in args.language))
    language_names = [str(l) for l in languages]

    # Put together set of remote hosts where to run and build
    remote_hosts = set(args.remote_worker_host or [])
    if args.remote_driver_host:
        remote_hosts.add(args.remote_driver_host)

    if not args.dry_run:
        if remote_hosts:
            archive_repo(languages=language_names)
            prepare_remote_hosts(remote_hosts, prepare_local=True)
        else:
            prepare_remote_hosts([], prepare_local=True)

    # A local build is only needed when the driver runs locally.
    build_local = not args.remote_driver_host
    if not args.dry_run:
        build_on_remote_hosts(
            remote_hosts, languages=language_names, build_local=build_local)

    perf_cmd = None
    if args.perf_args:
        print('Running workers under perf profiler')
        # Expect /usr/bin/perf to be installed here, as is usual
        # str.split() splits on whitespace runs and, unlike re.split, never
        # yields empty arguments for leading/trailing whitespace.
        perf_cmd = ['/usr/bin/perf'] + args.perf_args.split()

    qpsworker_jobs = create_qpsworkers(
        languages, args.remote_worker_host, perf_cmd=perf_cmd)

    # get list of worker addresses for each language.
    workers_by_lang = {str(language): [] for language in languages}
    for job in qpsworker_jobs:
        workers_by_lang[str(job.language)].append(job)

    scenarios = create_scenarios(
        languages,
        workers_by_lang=workers_by_lang,
        remote_host=args.remote_driver_host,
        regex=args.regex,
        category=args.category,
        bq_result_table=args.bq_result_table,
        netperf=args.netperf,
        netperf_hosts=args.remote_worker_host,
        server_cpu_load=args.server_cpu_load)

    if not scenarios:
        raise Exception('No scenarios to run')

    total_scenario_failures = 0
    qps_workers_killed = 0
    merged_resultset = {}
    perf_report_failures = 0
    collect_flamegraphs = perf_cmd and not args.skip_generate_flamegraphs

    for scenario in scenarios:
        if args.dry_run:
            print(scenario.name)
            continue

        scenario_failures = 0
        try:
            for worker in scenario.workers:
                worker.start()
            jobs = [scenario.jobspec]
            if scenario.workers:
                jobs.append(
                    create_quit_jobspec(
                        scenario.workers,
                        remote_host=args.remote_driver_host))
            scenario_failures, resultset = jobset.run(
                jobs, newline_on_success=True, maxjobs=1)
            total_scenario_failures += scenario_failures
            merged_resultset.update(resultset)
        finally:
            # Consider qps workers that need to be killed as failures
            qps_workers_killed += finish_qps_workers(
                scenario.workers, qpsworker_jobs)

        if collect_flamegraphs and scenario_failures == 0:
            workers_and_base_names = {}
            for worker in scenario.workers:
                if not worker.perf_file_base_name:
                    raise Exception(
                        'using perf but perf report filename is unspecified')
                workers_and_base_names[
                    worker.host_and_port] = worker.perf_file_base_name
            perf_report_failures += run_collect_perf_profile_jobs(
                workers_and_base_names, scenario.name,
                args.flame_graph_reports)

    # Still write the index.html even if some scenarios failed.
    # 'profile_output_files' will only have names for scenarios that passed
    if collect_flamegraphs:
        # write the index file to the output dir, with all profiles from all scenarios/workers
        report_utils.render_perf_profiling_results(
            '%s/index.html' % args.flame_graph_reports, profile_output_files)

    report_utils.render_junit_xml_report(
        merged_resultset, args.xml_report, suite_name='benchmarks')

    if total_scenario_failures > 0 or qps_workers_killed > 0:
        print('%s scenarios failed and %s qps worker jobs killed' %
              (total_scenario_failures, qps_workers_killed))
        sys.exit(1)

    if perf_report_failures > 0:
        print('%s perf profile collection jobs failed' % perf_report_failures)
        sys.exit(1)

Alexander Polcyn

41fe579

2017-02-02 10:46:51 -0800

[diff] [blame]

702

Michael Darakananda

2017-08-17 17:09:56 +1000

[diff] [blame]

703

704

if __name__ == "__main__":

ncteisen