Blame - site_utils/balance_pools.py - platform/external/autotest

2015-04-21 10:22:31 -0700

[diff] [blame]

1

#!/usr/bin/env python

2

3

# Use of this source code is governed by a BSD-style license that can be

4

# found in the LICENSE file.

5

6

"""Adjust pool balances to cover DUT shortfalls.

7

8

This command takes all broken DUTs in a specific pool for specific

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

9

models and swaps them with working DUTs taken from a selected pool

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

10

of spares. The command is meant primarily for replacing broken DUTs

11

in critical pools like BVT or CQ, but it can also be used to adjust

12

pool sizes, or to create or remove pools.

13

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

14

usage:  balance_pools.py [ options ] POOL [ MODEL ... ]

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

15

16

positional arguments:

17

POOL Name of the pool to balance

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

18

MODEL Names of models to balance

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

19

20

optional arguments:

21

-h, --help show this help message and exit

22

-t COUNT, --total COUNT

23

Set the number of DUTs in the pool to the specified

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

24

count for every MODEL

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

25

-a COUNT, --grow COUNT

26

Add the specified number of DUTs to the pool for every

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

27

MODEL

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

28

-d COUNT, --shrink COUNT

29

Remove the specified number of DUTs from the pool for

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

30

every MODEL

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

31

-s POOL, --spare POOL

32

Pool from which to draw replacement spares (default:

33

pool:suites)

Chung-yih Wang

cc1d9cb

2017-11-30 11:20:45 +0800

[diff] [blame]

34

--sku SKU The specific SKU we intend to swap with

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

35

-n, --dry-run Report actions to take in the form of shell commands

36

37

38

The command attempts to remove all broken DUTs from the target POOL

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

39

for every MODEL, and replace them with enough working DUTs taken

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

40

from the spare pool to bring the strength of POOL to the requested

41

total COUNT.

42

43

If no COUNT options are supplied (i.e. there are no --total, --grow,

44

or --shrink options), the command will maintain the current totals of

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

45

DUTs for every MODEL in the target POOL.

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

46

47

If not enough working spares are available, broken DUTs may be left

48

in the pool to keep the pool at the target COUNT.

49

50

When reducing pool size, working DUTs will be returned after broken

51

DUTs, if it's necessary to achieve the target COUNT.

"""

import argparse

Richard Barnette

2018-06-14 14:34:34 -0700

[diff] [blame]

57

import os

58

import re

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

import sys

import time

import common

Xixuan Wu

93e646c

2017-12-07 18:36:10 -0800

[diff] [blame]

63

from autotest_lib.server import constants

Jacob Kopczynski

2017-08-25 17:28:35 -0700

[diff] [blame]

64

from autotest_lib.server import site_utils

Aviv Keshet

a883432

2018-05-07 13:28:32 -0700

[diff] [blame]

65

from autotest_lib.server.cros.dynamic_suite import frontend_wrappers

Aviv Keshet

7ee9586

2016-08-30 15:18:27 -0700

[diff] [blame]

66

from autotest_lib.server.lib import status_history

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

67

from autotest_lib.site_utils import lab_inventory

Prathmesh Prabhu

68acc40

2017-11-09 15:24:15 -0800

[diff] [blame]

68

from autotest_lib.utils import labellib

Jacob Kopczynski

2017-08-25 17:28:35 -0700

[diff] [blame]

69

from chromite.lib import metrics

David James

2a3cb54

2015-05-05 17:13:43 -0700

[diff] [blame]

70

from chromite.lib import parallel

71

Jacob Kopczynski

2017-10-10 14:37:33 -0700

[diff] [blame]

72

#This must be imported after chromite.lib.metrics

73

from infra_libs import ts_mon

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

74

75

# Prefix that marks an AFE label as a pool label.  It is prepended to a
# pool name to build that pool's label, and is used to pick the pool
# labels out of a host's label list.
_POOL_PREFIX = constants.Labels.POOL_PREFIX
# This is the ratio of all models we should calculate the default max
# number of broken models against.  It seemed like the best choice that
# was neither too strict nor lax.
_MAX_BROKEN_DEFAULT_RATIO = 3.0 / 8.0

# Special value of the POOL command line argument that requests
# balancing of all critical pools.
_ALL_CRITICAL_POOLS = 'all_critical_pools'
# Default value of the `--spare` option:  the pool from which
# replacement DUTs are drawn.
_SPARE_DEFAULT = lab_inventory.SPARE_POOL

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

83

84

Richard Barnette

2018-06-14 14:34:34 -0700

[diff] [blame]

85

# _VALID_POOL_PATTERN - Regular expression matching pool names that will
# be accepted on the command line.
#
# Note: This pattern was selected merely to recognize all existing pool
# names; there's no underlying technical restriction motivating this
# pattern.  No reasonable request to add more special characters to the
# allowed set should be refused.
#
# The upper-case range must be spelled `A-Z`.  Spelling it `A-z` also
# admits the ASCII punctuation that sits between 'Z' and 'a', namely
# '[', '\', ']', '^' and '`' ('_' is in that range too, but it is
# allowed explicitly).

_VALID_POOL_PATTERN = re.compile(r'^[a-zA-Z0-9_\-]+$')

94

95

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

96

def _log_message(message, *args):
    """Write one unadorned line to stdout.

    When format arguments are given, `message` is treated as a
    %-style format string and expanded with them; otherwise it is
    written exactly as supplied.  A newline is always appended.

    @param message  Text to log, or a format string if `args` is
                    non-empty.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    text = message % args if args else message
    sys.stdout.write('%s\n' % text)

113

114

115

def _log_info(dry_run, message, *args):
    """Write one informational line to stdout, honoring dry-run mode.

    For a dry run the line is emitted as a shell comment (prefixed
    with '# '); otherwise it is emitted as plain text.  The prefix
    is added before any formatting with `args` takes place.

    @param dry_run  Whether the logging is for a dry run.
    @param message  Text to log, or a format string if `args` is
                    non-empty.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    text = '# ' + message if dry_run else message
    _log_message(text, *args)

133

134

135

def _log_error(message, *args):
    """Write one error line to stderr.

    The line is prefixed with 'ERROR: ' so that it stands out from
    ordinary output.  When format arguments are given, `message` is
    treated as a %-style format string and expanded with them.

    @param message  Text to log, or a format string if `args` is
                    non-empty.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    text = message % args if args else message
    sys.stderr.write('ERROR: %s\n' % text)

152

153

154

class _DUTPool(object):
    """Snapshot of the DUTs in one pool that match a set of labels.

    On construction, the job histories of every DUT carrying the
    pool's label and the given labels are fetched, and each DUT is
    placed in exactly one of three lists:
      + Working - the DUT is diagnosed as working, and is not locked.
      + Broken - the DUT is not diagnosed as working, or it is locked.
      + Ineligible - the DUT is not available to be removed from this
            pool.  The DUT may be either working or broken.

    DUTs with more than one `pool:` label are ineligible for exchange
    during balancing.  This is done for the sake of chameleon hosts,
    which must always be assigned to pool:suites.  These DUTs are
    always marked with pool:chameleon to prevent their reassignment.

    TODO(jrbarnette):  The use of `pool:chameleon` (instead of just
    the `chameleon` label) is a hack that should be eliminated.

    _DUTPool instances are used to track both main pools that need
    to be resupplied with working DUTs and spare pools that supply
    those DUTs.

    @property pool              Name of the pool associated with
                                this pool of DUTs.
    @property labels            Labels that constrain the DUTs to
                                consider.
    @property working_hosts     The list of this pool's working DUTs.
    @property broken_hosts      The list of this pool's broken DUTs.
    @property ineligible_hosts  The list of this pool's ineligible
                                DUTs.
    @property pool_labels       A list of labels that identify a DUT
                                as part of this pool.
    @property total_hosts       The total number of hosts in pool.

    """

    def __init__(self, afe, pool, labels, start_time, end_time):
        """Collect and classify the DUTs of one pool.

        @param afe         AFE object used to look up host histories.
        @param pool        Name of the pool (without the pool prefix).
        @param labels      Labels restricting which DUTs are considered.
        @param start_time  Start time for the host history query.
        @param end_time    End time for the host history query.

        """
        self.pool = pool
        # The pool itself becomes part of the label set used for the
        # host query.
        label_map = labellib.LabelsMapping(labels)
        label_map['pool'] = pool
        self.labels = label_map
        self._pool_labels = [_POOL_PREFIX + self.pool]

        self.working_hosts = []
        self.broken_hosts = []
        self.ineligible_hosts = []
        self.total_hosts = self._get_hosts(afe, start_time, end_time)

    def _get_hosts(self, afe, start_time, end_time):
        """Fetch host histories and sort the hosts into categories.

        Fills in `working_hosts`, `broken_hosts` and
        `ineligible_hosts` as a side effect.

        @param afe         AFE object used to look up host histories.
        @param start_time  Start time for the host history query.
        @param end_time    End time for the host history query.

        @return The number of host histories found.

        """
        histories = status_history.HostJobHistory.get_multiple_histories(
                afe, start_time, end_time, self.labels.getlabels())
        for history in histories:
            dut = history.host
            num_pool_labels = sum(
                    1 for label in dut.labels
                    if label.startswith(_POOL_PREFIX))
            # Only a DUT belonging to exactly one pool may be exchanged.
            if num_pool_labels != 1:
                self.ineligible_hosts.append(dut)
                continue
            diagnosis = history.last_diagnosis()[0]
            is_working = (diagnosis == status_history.WORKING
                          and not dut.locked)
            if is_working:
                self.working_hosts.append(dut)
            else:
                self.broken_hosts.append(dut)
        return len(histories)

    @property
    def pool_labels(self):
        """Return the AFE labels that identify this pool.

        These are the labels to remove from a DUT to take it out of
        the pool, or to add to a DUT to put it into the pool.

        @return A list of AFE labels suitable for AFE.add_labels()
                or AFE.remove_labels().

        """
        return self._pool_labels

    def calculate_spares_needed(self, target_total):
        """Calculate and log the spares needed to achieve a target.

        Return how many working spares are needed to achieve the
        given `target_total` with all DUTs working.

        The result may be positive or negative.  A positive value is
        the number of spares needed to replace broken DUTs in order to
        reach the target; a negative value means no spares are needed,
        and that many working DUTs can be returned.

        If the target is smaller than the number of ineligible DUTs,
        an error is logged and the target is raised to that number,
        so that ineligible DUTs are never exchanged.  A boolean metric
        records whether that adjustment was necessary.

        @param target_total  The new target pool size.

        @return The number of spares needed.

        """
        floor = len(self.ineligible_hosts)
        target_reachable = target_total >= floor
        model = self.labels.get('model', '')
        exhausted_metric = metrics.Boolean(
                'chromeos/autotest/balance_pools/exhausted_pools',
                'True for each pool/model which requests more DUTs than supplied',
                # TODO(jrbarnette) The 'board' field is a legacy.  We need
                # to leave it here until we do the extra work Monarch
                # requires to delete a field.
                field_spec=[
                        ts_mon.StringField('pool'),
                        ts_mon.StringField('board'),
                        ts_mon.StringField('model'),
                ])
        exhausted_metric.set(
                not target_reachable,
                fields={
                        'pool': self.pool,
                        'board': model,
                        'model': model,
                },
        )
        if target_reachable:
            _log_message('%s %s pool: Target of %d is above minimum.',
                         model, self.pool, target_total)
        else:
            _log_error(
                    '%s pool (%s): Target of %d is below minimum of %d DUTs.',
                    self.pool, self.labels, target_total, floor,
            )
            _log_error('Adjusting target to %d DUTs.', floor)
            target_total = floor
        size_change = target_total - self.total_hosts
        return len(self.broken_hosts) + size_change

    def allocate_surplus(self, num_broken):
        """Allocate a list of DUTs that can be returned as surplus.

        Return a list of devices that can be given up in order to
        reduce this pool's supply.  Broken DUTs are preferred over
        working ones.

        The `num_broken` parameter indicates the number of broken
        DUTs to be left in the pool.  If this number exceeds the
        number of broken DUTs actually in the pool, the returned
        list will be empty.  If this number is negative, it
        indicates a number of working DUTs to be returned in
        addition to all broken ones.

        @param num_broken    Total number of broken DUTs to be left in
                             this pool.

        @return A list of DUTs to be returned as surplus.

        """
        if num_broken < 0:
            extra_working = self.working_hosts[:-num_broken]
            return self.broken_hosts + extra_working
        return self.broken_hosts[num_broken:]

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

312

313

314

def _exchange_labels(dry_run, hosts, target_pool, spare_pool):
    """Reassign a list of DUTs from one pool to another.

    Every host in `hosts` has the pool labels of `spare_pool`
    removed, and the pool labels of `target_pool` added.

    If `dry_run` is true, perform no changes, but log the `atest`
    commands needed to accomplish the necessary label changes.

    The transfer is logged, and the `duts_moved` counter metric is
    incremented by the number of hosts, even when `hosts` is empty.

    @param dry_run       Whether the logging is for a dry run or
                         for actual execution.
    @param hosts         List of DUTs (AFE hosts) to be reassigned.
    @param target_pool   The `_DUTPool` object whose labels are added
                         to the hosts, i.e. the destination pool.
    @param spare_pool    The `_DUTPool` object whose labels are removed
                         from the hosts, i.e. the source pool.

    """
    num_hosts = len(hosts)
    _log_info(dry_run, 'Transferring %d DUTs from %s to %s.',
              num_hosts, spare_pool.pool, target_pool.pool)
    model = target_pool.labels.get('model', '')
    moved_counter = metrics.Counter(
            'chromeos/autotest/balance_pools/duts_moved',
            'DUTs transferred between pools',
            # TODO(jrbarnette) The 'board' field is a legacy.  We need to
            # leave it here until we do the extra work Monarch requires to
            # delete a field.
            field_spec=[
                    ts_mon.StringField('board'),
                    ts_mon.StringField('model'),
                    ts_mon.StringField('source_pool'),
                    ts_mon.StringField('target_pool'),
            ]
    )
    moved_counter.increment_by(
            num_hosts,
            fields={
                    'board': model,
                    'model': model,
                    'source_pool': spare_pool.pool,
                    'target_pool': target_pool.pool,
            },
    )
    if not hosts:
        return

    additions = target_pool.pool_labels
    removals = spare_pool.pool_labels
    if dry_run:
        # Only report the equivalent shell commands.
        removed_text = ' '.join(removals)
        added_text = ' '.join(additions)
        for host in hosts:
            _log_message('atest label remove -m %s %s',
                         host.hostname, removed_text)
            _log_message('atest label add -m %s %s',
                         host.hostname, added_text)
        return

    for host in hosts:
        _log_message('Updating host: %s.', host.hostname)
        host.remove_labels(removals)
        host.add_labels(additions)

370

371

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

372

def _balance_model(arguments, afe, pool, labels, start_time, end_time):
    """Balance one model as requested by command line arguments.

    Builds the spare and main `_DUTPool` objects, works out the
    target size from `--total`, `--grow` or `--shrink` (defaulting
    to the main pool's current size), selects the spares to bring in
    and the surplus DUTs to send back, logs the plan, and finally
    exchanges the pool labels.  If the main pool has more broken DUTs
    than `--max-broken` allows and `--force-rebalance` was not given,
    errors are logged and no DUTs are exchanged.

    @param arguments     Parsed command line arguments.
    @param afe           AFE object to be used for the changes.
    @param pool          Pool of the model to be balanced.
    @param labels        Restrict the balancing operation within DUTs
                         that have these labels.
    @param start_time    Start time for HostJobHistory objects in
                         the DUT pools.
    @param end_time      End time for HostJobHistory objects in the
                         DUT pools.

    """
    spare_pool = _DUTPool(afe, arguments.spare, labels, start_time, end_time)
    main_pool = _DUTPool(afe, pool, labels, start_time, end_time)

    if arguments.total is not None:
        target_total = arguments.total
    elif arguments.grow:
        target_total = main_pool.total_hosts + arguments.grow
    elif arguments.shrink:
        target_total = main_pool.total_hosts - arguments.shrink
    else:
        target_total = main_pool.total_hosts

    spares_needed = main_pool.calculate_spares_needed(target_total)
    spare_duts = []
    shortfall = spares_needed
    if spares_needed > 0:
        spare_duts = spare_pool.working_hosts[:spares_needed]
        # Broken DUTs that must stay because no spare can replace them.
        shortfall = spares_needed - len(spare_duts)

    surplus_duts = main_pool.allocate_surplus(shortfall)

    if spares_needed or surplus_duts or arguments.verbose:
        _log_balance_plan(arguments.dry_run, labels, main_pool, spare_pool,
                          target_total, spares_needed, spare_duts,
                          surplus_duts, shortfall)

    if (len(main_pool.broken_hosts) > arguments.max_broken
            and not arguments.force_rebalance):
        _log_error('%s %s pool: Refusing to act on pool with %d broken DUTs.',
                   labels, main_pool.pool, len(main_pool.broken_hosts))
        _log_error('Please investigate this model to for a bug ')
        _log_error('that is bricking devices. Once you have finished your ')
        _log_error('investigation, you can force a rebalance with ')
        _log_error('--force-rebalance')
        spare_duts = []
        surplus_duts = []

    if arguments.verbose and not (spare_duts or surplus_duts):
        _log_info(arguments.dry_run, 'No exchange required.')

    # Both exchanges run even with empty lists, so that the transfer
    # is always logged and counted.
    _exchange_labels(arguments.dry_run, surplus_duts,
                     spare_pool, main_pool)
    _exchange_labels(arguments.dry_run, spare_duts,
                     main_pool, spare_pool)


def _log_balance_plan(dry_run, labels, main_pool, spare_pool, target_total,
                      spares_needed, spare_duts, surplus_duts, shortfall):
    """Log the planned exchange for one model; helper of `_balance_model`.

    @param dry_run        Whether the logging is for a dry run.
    @param labels         Labels identifying the DUTs being balanced.
    @param main_pool      The `_DUTPool` being balanced.
    @param spare_pool     The `_DUTPool` supplying spares.
    @param target_total   Requested size of the main pool.
    @param spares_needed  Result of `calculate_spares_needed()`.
    @param spare_duts     Spares selected to move into the main pool.
    @param surplus_duts   DUTs selected to move out of the main pool.
    @param shortfall      Count passed to `allocate_surplus()`.

    """
    num_broken = len(main_pool.broken_hosts)
    _log_message('')

    _log_info(dry_run, 'Balancing %s %s pool:', labels, main_pool.pool)
    _log_info(dry_run,
              'Total %d DUTs, %d working, %d broken, %d reserved.',
              main_pool.total_hosts, len(main_pool.working_hosts),
              num_broken, len(main_pool.ineligible_hosts))

    if spares_needed == 0:
        add_msg = 'no change to pool size'
    elif spares_needed > 0:
        add_msg = 'grow pool by %d DUTs' % spares_needed
    else:
        add_msg = 'shrink pool by %d DUTs' % -spares_needed
    _log_info(dry_run, 'Target is %d working DUTs; %s.',
              target_total, add_msg)

    _log_info(dry_run,
              '%s %s pool has %d spares available for balancing pool %s',
              labels, spare_pool.pool, len(spare_pool.working_hosts),
              main_pool.pool)

    num_spares = len(spare_duts)
    if spares_needed > num_spares:
        _log_error('Not enough spares: need %d, only have %d.',
                   spares_needed, num_spares)
    elif shortfall >= 0:
        num_surplus = len(surplus_duts)
        _log_info(dry_run,
                  '%s %s pool will return %d broken DUTs, '
                  'leaving %d still in the pool.',
                  labels, main_pool.pool,
                  num_surplus,
                  num_broken - num_surplus)
    else:
        _log_info(dry_run,
                  '%s %s pool will return %d surplus DUTs, '
                  'including %d working DUTs.',
                  labels, main_pool.pool,
                  num_broken - shortfall,
                  -shortfall)

468

469

Prathmesh Prabhu

2017-11-09 16:42:48 -0800

[diff] [blame]

470

def _too_many_broken(inventory, pool, args):
    """
    Check whether too many models in a pool have broken DUTs.

    The check is skipped (and reports False) when `--force-rebalance`
    was given, or when `--all-models` was given together with
    `--max-broken-models 0`.  When no `--max-broken-models` value was
    supplied, the threshold defaults to a fixed fraction of the number
    of models in the pool, rounded down.

    @param inventory: _LabInventory object.
    @param pool: The pool to check.
    @param args: Parsed command line arguments.

    @return True if the number of models with 1 or more broken duts
            exceed max_broken_models, False otherwise.
    """
    # Were we asked to skip this check?
    if args.force_rebalance:
        return False
    if args.all_models and args.max_broken_models == 0:
        return False

    threshold = args.max_broken_models
    if threshold is None:
        num_models = len(inventory.get_pool_models(pool))
        threshold = int(_MAX_BROKEN_DEFAULT_RATIO * num_models)
    _log_info(args.dry_run,
              'Max broken models for pool %s: %d',
              pool, threshold)

    broken_models = []
    for model, counts in inventory.iteritems():
        if counts.get_broken(pool) != 0:
            broken_models.append(model)
    _log_message('There are %d models in the %s pool with at least 1 '
                 'broken DUT (max threshold %d)',
                 len(broken_models), pool, threshold)
    for name in sorted(broken_models):
        _log_message(name)
    return len(broken_models) > threshold

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

502

503

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

504

def _parse_command(argv):
    """Parse the command line arguments.

    Create an argument parser for this command's syntax, parse the
    command line, and return the result of the `ArgumentParser`
    `parse_args()` method.

    After parsing, the arguments are checked for consistency:
      + individual models may not be combined with `--all-models`;
      + a non-default `--spare` pool may not be combined with
        balancing all critical pools;
      + both the spare and the target pool names must match
        `_VALID_POOL_PATTERN`.
    Any violation is reported through `parser.error()`, which prints
    a usage message and exits.

    @param argv Standard command line argument vector; `argv[0]` is
                assumed to be the command name.

    @return Result returned by `ArgumentParser.parse_args()`.

    """
    parser = argparse.ArgumentParser(
            prog=os.path.basename(argv[0]),
            description='Balance pool shortages from spares on reserve')

    parser.add_argument(
        '-w', '--web', type=str, default=None,
        help='AFE host to use. Default comes from shadow_config.',
    )
    # At most one of the pool size adjustments may be requested.
    count_group = parser.add_mutually_exclusive_group()
    count_group.add_argument('-t', '--total', type=int,
                             metavar='COUNT', default=None,
                             help='Set the number of DUTs in the '
                                  'pool to the specified count for '
                                  'every MODEL')
    count_group.add_argument('-a', '--grow', type=int,
                             metavar='COUNT', default=None,
                             help='Add the specified number of DUTs '
                                  'to the pool for every MODEL')
    count_group.add_argument('-d', '--shrink', type=int,
                             metavar='COUNT', default=None,
                             help='Remove the specified number of DUTs '
                                  'from the pool for every MODEL')

    parser.add_argument('-s', '--spare', default=_SPARE_DEFAULT,
                        metavar='POOL',
                        help='Pool from which to draw replacement '
                             'spares (default: pool:%s)' % _SPARE_DEFAULT)
    parser.add_argument('-n', '--dry-run', action='store_true',
                        help='Report actions to take in the form of '
                             'shell commands')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Print more detail about calculations for debug '
                             'purposes.')

    parser.add_argument('-m', '--max-broken', default=2, type=int,
                        metavar='COUNT',
                        help='Only rebalance a pool if it has at most '
                             'COUNT broken DUTs.')
    parser.add_argument('-f', '--force-rebalance', action='store_true',
                        help='Forcefully rebalance all DUTs in a pool, even '
                             'if it has a large number of broken DUTs. '
                             'Before doing this, please investigate whether '
                             'there is a bug that is bricking devices in the '
                             'lab.')
    parser.add_argument('--production', action='store_true',
                        help='Treat this as a production run. This will '
                             'collect metrics.')

    parser.add_argument(
            '--all-models',
            action='store_true',
            help='Rebalance all managed models. This will do a very expensive '
                 'check to see how many models have at least one broken DUT. '
                 'To bypass that check, set --max-broken-models to 0.',
    )
    # The check made with this value only refuses to balance when the
    # number of models with broken DUTs is strictly greater than COUNT,
    # so the help text says "at most".
    parser.add_argument(
            '--max-broken-models', default=None, type=int, metavar='COUNT',
            help='Only rebalance all models if number of models with broken '
                 'DUTs in the specified pool is at most COUNT.',
    )

    parser.add_argument('pool',
                        metavar='POOL',
                        help='Name of the pool to balance. Use %s to balance '
                             'all critical pools' % _ALL_CRITICAL_POOLS)
    parser.add_argument('models', nargs='*', metavar='MODEL',
                        help='Names of models to balance.')

    parser.add_argument('--sku', type=str,
                        help='Optional name of sku to restrict to.')

    arguments = parser.parse_args(argv[1:])

    # Error-check arguments.
    if arguments.models and arguments.all_models:
        parser.error('Cannot specify individual models on the command line '
                     'when using --all-models.')
    if (arguments.pool == _ALL_CRITICAL_POOLS and
            arguments.spare != _SPARE_DEFAULT):
        # This fires when the spare pool is NOT the default, so the
        # message must say that only the default is allowed.
        parser.error('Cannot specify a --spare pool other than %s when '
                     'balancing all critical pools.' % _SPARE_DEFAULT)
    for p in (arguments.spare, arguments.pool):
        if not _VALID_POOL_PATTERN.match(p):
            parser.error('Invalid pool name: %s' % p)
    return arguments

Prathmesh Prabhu

2017-11-09 16:42:48 -0800

[diff] [blame]

604

def infer_balancer_targets(afe, arguments, pools):
    """Translate command line arguments into a list of balancing targets.

    When `arguments.all_models` is set, every model that the lab
    inventory reports for a pool becomes a target, unless
    `_too_many_broken()` flags the pool; in that case an error is
    logged and the pool contributes no targets.  Otherwise, each pool
    gets one target per entry in `arguments.models`, additionally
    restricted by `arguments.sku` when that is set.

    @param afe        AFE object to be used for taking inventory.
    @param arguments  Parsed command line arguments.
    @param pools      The list of pools to balance.

    @returns  A list of `(pool, labels)` tuples to be balanced, where
              `labels` is the value returned by
              `LabelsMapping.getlabels()`.

    """
    def _make_labels(model, sku=None):
        """Return the labels selecting `model`, plus `sku` if given."""
        mapping = labellib.LabelsMapping()
        mapping['model'] = model
        if sku:
            mapping['sku'] = sku
        return mapping.getlabels()

    targets = []
    for pool in pools:
        if not arguments.all_models:
            # Models were named explicitly on the command line.
            for model in arguments.models:
                targets.append((pool, _make_labels(model, arguments.sku)))
            continue

        # A fresh inventory is taken for each pool.
        inventory = lab_inventory.get_inventory(afe)
        quarantine = _too_many_broken(inventory, pool, arguments)
        if not quarantine:
            targets.extend((pool, _make_labels(model))
                           for model in inventory.get_pool_models(pool))
        else:
            _log_error('Refusing to balance all models for %s pool, '
                       'too many models with at least 1 broken DUT '
                       'detected.', pool)
        # Record the quarantine verdict for this pool in the
        # 'unchanged_pools' metric, whichever branch was taken.
        unchanged = metrics.Boolean(
                'chromeos/autotest/balance_pools/unchanged_pools')
        unchanged.set(quarantine, fields={'pool': pool})
        _log_message('Pool %s quarantine status: %s', pool, quarantine)
    return targets

Jacob Kopczynski

2017-08-25 17:28:35 -0700

[diff] [blame]

def main(argv):
    """Standard main routine.

    Parses the command line, selects a metrics context manager,
    works out which (pool, labels) targets to balance, and runs
    `_balance_model()` for each target in a process pool.

    @param argv  Command line arguments including `sys.argv[0]`.

    """
    arguments = _parse_command(argv)
    # Only production runs set up ts-mon global state; every other
    # run gets a trivial context manager instead.
    metrics_manager = (
            site_utils.SetupTsMonGlobalState('balance_pools', indirect=True)
            if arguments.production
            else site_utils.TrivialContextManager())

    with metrics_manager, metrics.SuccessCounter(
            'chromeos/autotest/balance_pools/runs'):
        # Time window handed to _balance_model(): the 24 hours
        # ending now.
        seconds_per_day = 24 * 60 * 60
        end_time = time.time()
        start_time = end_time - seconds_per_day
        afe = frontend_wrappers.RetryingAFE(server=arguments.web)

        def balancer(pool, labels):
            """Balance the specified model.

            @param pool: The pool to rebalance for the model.
            @param labels: labels to restrict to balancing operations
                    within.
            """
            _balance_model(arguments, afe, pool, labels,
                           start_time, end_time)
            _log_message('')

        if arguments.pool == _ALL_CRITICAL_POOLS:
            pools = lab_inventory.CRITICAL_POOLS
        else:
            pools = [arguments.pool]
        balancer_targets = infer_balancer_targets(afe, arguments, pools)
        try:
            parallel.RunTasksInProcessPool(
                    balancer,
                    balancer_targets,
                    processes=8,
            )
        except KeyboardInterrupt:
            # A KeyboardInterrupt raised while the tasks run is caught
            # and ignored.
            pass

J. Richard Barnette