Blame - site_utils/balance_pools.py - platform/external/autotest

2015-04-21 10:22:31 -0700

[diff] [blame]

1

#!/usr/bin/env python

2

3

# Use of this source code is governed by a BSD-style license that can be

4

# found in the LICENSE file.

5

6

"""Adjust pool balances to cover DUT shortfalls.

7

8

This command takes all broken DUTs in a specific pool for specific

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

9

models and swaps them with working DUTs taken from a selected pool

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

10

of spares. The command is meant primarily for replacing broken DUTs

11

in critical pools like BVT or CQ, but it can also be used to adjust

12

pool sizes, or to create or remove pools.

13

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

14

usage: balance_pools.py [ options ] POOL MODEL [ MODEL ... ]

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

15

16

positional arguments:

17

POOL Name of the pool to balance

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

18

MODEL Names of models to balance

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

19

20

optional arguments:

21

-h, --help show this help message and exit

22

-t COUNT, --total COUNT

23

Set the number of DUTs in the pool to the specified

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

24

count for every MODEL

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

25

-a COUNT, --grow COUNT

26

Add the specified number of DUTs to the pool for every

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

27

MODEL

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

28

-d COUNT, --shrink COUNT

29

Remove the specified number of DUTs from the pool for

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

30

every MODEL

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

31

-s POOL, --spare POOL

32

Pool from which to draw replacement spares (default:

33

pool:suites)

Chung-yih Wang

cc1d9cb

2017-11-30 11:20:45 +0800

[diff] [blame]

34

--sku SKU The specific SKU we intend to swap with

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

35

-n, --dry-run Report actions to take in the form of shell commands

36

37

38

The command attempts to remove all broken DUTs from the target POOL

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

39

for every MODEL, and replace them with enough working DUTs taken

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

40

from the spare pool to bring the strength of POOL to the requested

41

total COUNT.

42

43

If no COUNT options are supplied (i.e. there are no --total, --grow,

44

or --shrink options), the command will maintain the current totals of

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

45

DUTs for every MODEL in the target POOL.

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

46

47

If not enough working spares are available, broken DUTs may be left

48

in the pool to keep the pool at the target COUNT.

49

50

When reducing pool size, working DUTs will be returned after broken

51

DUTs, if it's necessary to achieve the target COUNT.

"""

import argparse

import sys

import time

import common

Xixuan Wu

93e646c

2017-12-07 18:36:10 -0800

[diff] [blame]

61

from autotest_lib.server import constants

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

62

from autotest_lib.server import frontend

Jacob Kopczynski

2017-08-25 17:28:35 -0700

[diff] [blame]

63

from autotest_lib.server import site_utils

Aviv Keshet

7ee9586

2016-08-30 15:18:27 -0700

[diff] [blame]

64

from autotest_lib.server.lib import status_history

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

65

from autotest_lib.site_utils import lab_inventory

Prathmesh Prabhu

68acc40

2017-11-09 15:24:15 -0800

[diff] [blame]

66

from autotest_lib.utils import labellib

Jacob Kopczynski

2017-08-25 17:28:35 -0700

[diff] [blame]

67

from chromite.lib import metrics

David James

2a3cb54

2015-05-05 17:13:43 -0700

[diff] [blame]

68

from chromite.lib import parallel

69

Jacob Kopczynski

2017-10-10 14:37:33 -0700

[diff] [blame]

70

#This must be imported after chromite.lib.metrics

71

from infra_libs import ts_mon

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

72

73

_POOL_PREFIX = constants.Labels.POOL_PREFIX

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

74

# This is the ratio of all models we should calculate the default max

75

# number of broken models against. It seemed like the best choice that

76

# was neither too strict nor lax.

Prathmesh Prabhu

2017-11-09 16:42:48 -0800

[diff] [blame]

77

_MAX_BROKEN_DEFAULT_RATIO = 3.0 / 8.0

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

78

79

_ALL_CRITICAL_POOLS = 'all_critical_pools'

80

_SPARE_DEFAULT = lab_inventory.SPARE_POOL

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

81

82

83

def _log_message(message, *args):
    """Write a single, unadorned line to stdout.

    When format arguments are supplied, `message` is treated as a
    %-style format string and expanded with them before being
    written.  With no arguments, `message` is written verbatim (so
    a literal '%' in it is safe).

    @param message  Text, or format string, to be logged.
    @param args     Optional format arguments.  If empty, no
                    formatting is applied.

    """
    text = message % args if args else message
    sys.stdout.write('%s\n' % text)

100

101

102

def _log_info(dry_run, message, *args):
    """Write an informational line to stdout, dry-run aware.

    For a dry run the line is emitted as a shell comment (that is,
    prefixed with '# ') rather than as plain text.  The prefix is
    attached to `message` before any formatting takes place.

    Formatting and output are delegated to `_log_message()`, so
    `message` is expanded with `args` only when `args` is non-empty.

    @param dry_run  Whether the logging is for a dry run.
    @param message  Text, or format string, to be logged.
    @param args     Optional format arguments.  If empty, no
                    formatting is applied.

    """
    line = '# ' + message if dry_run else message
    _log_message(line, *args)

120

121

122

def _log_error(message, *args):
    """Write a single error line to stderr.

    The line is prefixed with 'ERROR: ' so that it stands out as an
    error.  When format arguments are supplied, `message` is
    treated as a %-style format string and expanded with them
    first; otherwise it is used verbatim.

    @param message  Text, or format string, to be logged.
    @param args     Optional format arguments.  If empty, no
                    formatting is applied.

    """
    text = message % args if args else message
    sys.stderr.write('ERROR: %s\n' % text)

139

140

141

class _DUTPool(object):
    """Information about a pool of DUTs matching given labels.

    An instance gathers the job histories of every DUT in a given
    pool that also matches the given labels, and sorts the DUTs into
    three categories:
      + Working - the DUT is working for testing, and not locked.
      + Broken - the DUT is unable to run tests, or it is locked.
      + Ineligible - the DUT is not available to be removed from
        this pool.  The DUT may be either working or broken.

    A DUT is ineligible for exchange during balancing unless it
    carries exactly one `pool:` label.  This is done for the sake of
    chameleon hosts, which must always be assigned to pool:suites.
    These DUTs are always marked with pool:chameleon to prevent
    their reassignment.

    TODO(jrbarnette):  The use of `pool:chameleon` (instead of just
    the `chameleon` label) is a hack that should be eliminated.

    _DUTPool instances are used to track both main pools that need
    to be resupplied with working DUTs and spare pools that supply
    those DUTs.

    @property pool              Name of the pool associated with
                                this pool of DUTs.
    @property labels            Labels that constrain the DUTs to
                                consider.
    @property working_hosts     The list of this pool's working DUTs.
    @property broken_hosts      The list of this pool's broken DUTs.
    @property ineligible_hosts  The list of this pool's ineligible
                                DUTs.
    @property pool_labels       A list of labels that identify a DUT
                                as part of this pool.
    @property total_hosts       The total number of hosts in pool.

    """

    def __init__(self, afe, pool, labels, start_time, end_time):
        self.pool = pool
        # The pool itself becomes part of the label constraints used
        # to select this pool's DUTs.
        self.labels = labellib.LabelsMapping(labels)
        self.labels['pool'] = pool
        self._pool_labels = [_POOL_PREFIX + self.pool]

        # The category lists must exist before _get_hosts() runs,
        # because that call fills them in.
        self.ineligible_hosts = []
        self.broken_hosts = []
        self.working_hosts = []
        self.total_hosts = self._get_hosts(afe, start_time, end_time)

    def _get_hosts(self, afe, start_time, end_time):
        """Fetch this pool's DUTs and sort them into categories.

        Appends every matching DUT to exactly one of
        `ineligible_hosts`, `working_hosts` or `broken_hosts`.

        @param afe         AFE object used to look up the DUTs.
        @param start_time  Start time for the HostJobHistory objects.
        @param end_time    End time for the HostJobHistory objects.

        @return The number of DUT histories found.

        """
        histories = status_history.HostJobHistory.get_multiple_histories(
                afe, start_time, end_time, self.labels.getlabels())
        for history in histories:
            dut = history.host
            pool_label_count = sum(
                    1 for label in dut.labels
                    if label.startswith(_POOL_PREFIX))
            if pool_label_count != 1:
                self.ineligible_hosts.append(dut)
                continue
            # Only the diagnosis itself (the first element of the
            # result) matters here.
            status = history.last_diagnosis()[0]
            if status == status_history.WORKING and not dut.locked:
                self.working_hosts.append(dut)
            else:
                self.broken_hosts.append(dut)
        return len(histories)

    @property
    def pool_labels(self):
        """Return the AFE labels that identify this pool.

        The returned labels are the labels that must be removed
        to remove a DUT from the pool, or added to add a DUT.

        @return A list of AFE labels suitable for AFE.add_labels()
                or AFE.remove_labels().

        """
        return self._pool_labels

    def calculate_spares_needed(self, target_total):
        """Calculate and log the spares needed to achieve a target.

        Return how many working spares are needed to achieve the
        given `target_total` with all DUTs working.

        The spares count may be positive or negative.  Positive
        values indicate spares are needed to replace broken DUTs in
        order to reach the target; negative numbers indicate that
        no spares are needed, and that a corresponding number of
        working devices can be returned.

        If the new target total would require returning ineligible
        DUTs, an error is logged, and the target total is adjusted
        so that those DUTs are not exchanged.

        As a side effect, the `exhausted_pools` metric is set for
        this pool/model: True when the target had to be adjusted,
        False otherwise.

        @param target_total  The new target pool size.

        @return The number of spares needed.

        """
        # Ineligible DUTs can never leave, so they set a floor on
        # the size of the pool.
        min_total = len(self.ineligible_hosts)
        target_ok = target_total >= min_total
        model = self.labels.get('model', '')
        exhausted_metric = metrics.Boolean(
                'chromeos/autotest/balance_pools/exhausted_pools',
                'True for each pool/model which requests more DUTs than supplied',
                # TODO(jrbarnette) The 'board' field is a legacy.  We need
                # to leave it here until we do the extra work Monarch
                # requires to delete a field.
                field_spec=[
                    ts_mon.StringField('pool'),
                    ts_mon.StringField('board'),
                    ts_mon.StringField('model'),
                ])
        exhausted_metric.set(
                not target_ok,
                fields={
                    'pool': self.pool,
                    'board': model,
                    'model': model,
                },
        )
        if target_ok:
            _log_message('%s %s pool: Target of %d is above minimum.',
                         model, self.pool, target_total)
        else:
            _log_error(
                    '%s pool (%s): Target of %d is below minimum of %d DUTs.',
                    self.pool, self.labels, target_total, min_total,
            )
            _log_error('Adjusting target to %d DUTs.', min_total)
            target_total = min_total
        size_change = target_total - self.total_hosts
        return len(self.broken_hosts) + size_change

    def allocate_surplus(self, num_broken):
        """Allocate a list of DUTs that can be returned as surplus.

        Return a list of devices that can be returned in order to
        reduce this pool's supply.  Broken DUTs will be preferred
        over working ones.

        The `num_broken` parameter indicates the number of broken
        DUTs to be left in the pool.  If this number exceeds the
        number of broken DUTs actually in the pool, the returned
        list will be empty.  If this number is negative, it
        indicates a number of working DUTs to be returned in
        addition to all broken ones.

        @param num_broken  Total number of broken DUTs to be left in
                           this pool.

        @return A list of DUTs to be returned as surplus.

        """
        if num_broken < 0:
            extra_working = -num_broken
            return self.broken_hosts + self.working_hosts[:extra_working]
        return self.broken_hosts[num_broken:]

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

299

300

301

def _exchange_labels(dry_run, hosts, target_pool, spare_pool):
    """Reassign a list of DUTs from one pool to another.

    For all the given hosts, remove all labels associated with
    `spare_pool`, and add the labels for `target_pool`.  In other
    words, the hosts move out of `spare_pool` and into
    `target_pool`.

    If `dry_run` is true, perform no changes, but log the `atest`
    commands needed to accomplish the necessary label changes.

    The transfer is logged and counted in the `duts_moved` metric
    before any label is touched; this happens even when `hosts` is
    empty.

    @param dry_run      Whether the logging is for a dry run or
                        for actual execution.
    @param hosts        List of DUTs (AFE hosts) to be reassigned.
    @param target_pool  The `_DUTPool` object to which the hosts
                        will be added.
    @param spare_pool   The `_DUTPool` object from which the hosts
                        are drawn.

    """
    _log_info(dry_run, 'Transferring %d DUTs from %s to %s.',
              len(hosts), spare_pool.pool, target_pool.pool)
    moved_counter = metrics.Counter(
            'chromeos/autotest/balance_pools/duts_moved',
            'DUTs transferred between pools',
            # TODO(jrbarnette) The 'board' field is a legacy.  We need to
            # leave it here until we do the extra work Monarch requires to
            # delete a field.
            field_spec=[
                ts_mon.StringField('board'),
                ts_mon.StringField('model'),
                ts_mon.StringField('source_pool'),
                ts_mon.StringField('target_pool'),
            ]
    )
    moved_counter.increment_by(
            len(hosts),
            fields={
                'board': target_pool.labels.get('model', ''),
                'model': target_pool.labels.get('model', ''),
                'source_pool': spare_pool.pool,
                'target_pool': target_pool.pool,
            },
    )
    if not hosts:
        return

    labels_to_add = target_pool.pool_labels
    labels_to_remove = spare_pool.pool_labels
    for dut in hosts:
        if dry_run:
            # Only report the equivalent shell commands.
            _log_message('atest label remove -m %s %s',
                         dut.hostname, ' '.join(labels_to_remove))
            _log_message('atest label add -m %s %s',
                         dut.hostname, ' '.join(labels_to_add))
            continue
        _log_message('Updating host: %s.', dut.hostname)
        dut.remove_labels(labels_to_remove)
        dut.add_labels(labels_to_add)

357

358

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

359

def _balance_model(arguments, afe, pool, labels, start_time, end_time):
    """Balance one model as requested by command line arguments.

    Builds `_DUTPool` objects for the spare pool and for `pool`,
    determines the target pool size from the `--total`, `--grow` or
    `--shrink` options (defaulting to the current size), and then
    exchanges pool labels so that broken DUTs leave the main pool
    and working spares replace them.

    If the main pool has more broken DUTs than `--max-broken`
    allows, and `--force-rebalance` was not given, an error is
    logged and no DUTs are exchanged.

    @param arguments   Parsed command line arguments.
    @param afe         AFE object to be used for the changes.
    @param pool        Pool of the model to be balanced.
    @param labels      Restrict the balancing operation within DUTs
                       that have these labels.
    @param start_time  Start time for HostJobHistory objects in
                       the DUT pools.
    @param end_time    End time for HostJobHistory objects in the
                       DUT pools.

    """
    spare_pool = _DUTPool(afe, arguments.spare, labels, start_time, end_time)
    main_pool = _DUTPool(afe, pool, labels, start_time, end_time)

    # With no COUNT option, the pool keeps its current size.
    target_total = main_pool.total_hosts
    if arguments.total is not None:
        target_total = arguments.total
    elif arguments.grow:
        target_total += arguments.grow
    elif arguments.shrink:
        target_total -= arguments.shrink

    spares_needed = main_pool.calculate_spares_needed(target_total)
    if spares_needed > 0:
        # Take as many working spares as are available; whatever
        # cannot be covered is the number of broken DUTs that must
        # stay behind in the main pool.
        spare_duts = spare_pool.working_hosts[:spares_needed]
        shortfall = spares_needed - len(spare_duts)
    else:
        # Zero or negative: nothing to take from the spare pool.  A
        # negative value asks allocate_surplus() to also return
        # that many working DUTs.
        spare_duts = []
        shortfall = spares_needed

    surplus_duts = main_pool.allocate_surplus(shortfall)

    if spares_needed or surplus_duts or arguments.verbose:
        dry_run = arguments.dry_run
        _log_message('')

        _log_info(dry_run, 'Balancing %s %s pool:', labels, main_pool.pool)
        _log_info(dry_run,
                  'Total %d DUTs, %d working, %d broken, %d reserved.',
                  main_pool.total_hosts, len(main_pool.working_hosts),
                  len(main_pool.broken_hosts), len(main_pool.ineligible_hosts))

        if spares_needed > 0:
            add_msg = 'grow pool by %d DUTs' % spares_needed
        elif spares_needed < 0:
            add_msg = 'shrink pool by %d DUTs' % -spares_needed
        else:
            add_msg = 'no change to pool size'
        _log_info(dry_run, 'Target is %d working DUTs; %s.',
                  target_total, add_msg)

        _log_info(dry_run,
                  '%s %s pool has %d spares available for balancing pool %s',
                  labels, spare_pool.pool, len(spare_pool.working_hosts),
                  main_pool.pool)

        if spares_needed > len(spare_duts):
            _log_error('Not enough spares: need %d, only have %d.',
                       spares_needed, len(spare_duts))
        elif shortfall >= 0:
            _log_info(dry_run,
                      '%s %s pool will return %d broken DUTs, '
                      'leaving %d still in the pool.',
                      labels, main_pool.pool,
                      len(surplus_duts),
                      len(main_pool.broken_hosts) - len(surplus_duts))
        else:
            _log_info(dry_run,
                      '%s %s pool will return %d surplus DUTs, '
                      'including %d working DUTs.',
                      labels, main_pool.pool,
                      len(main_pool.broken_hosts) - shortfall,
                      -shortfall)

    if (len(main_pool.broken_hosts) > arguments.max_broken and
            not arguments.force_rebalance):
        _log_error('%s %s pool: Refusing to act on pool with %d broken DUTs.',
                   labels, main_pool.pool, len(main_pool.broken_hosts))
        _log_error('Please investigate this model to see if there is a bug ')
        _log_error('that is bricking devices. Once you have finished your ')
        _log_error('investigation, you can force a rebalance with ')
        _log_error('--force-rebalance')
        # Cancel the exchange in both directions.
        spare_duts = []
        surplus_duts = []

    if not spare_duts and not surplus_duts:
        if arguments.verbose:
            _log_info(arguments.dry_run, 'No exchange required.')

    # First return the surplus from the main pool to the spare pool,
    # then move the selected spares into the main pool.
    _exchange_labels(arguments.dry_run, surplus_duts,
                     spare_pool, main_pool)
    _exchange_labels(arguments.dry_run, spare_duts,
                     main_pool, spare_pool)

455

456

Prathmesh Prabhu

2017-11-09 16:42:48 -0800

[diff] [blame]

457

def _too_many_broken(inventory, pool, args):
    """
    Check whether too many models in a pool have broken DUTs.

    The check is skipped (and reports False) when `--force-rebalance`
    was given, or when `--all-models` was given together with a
    `--max-broken-models` of 0.

    When no `--max-broken-models` value was given, the threshold
    defaults to `_MAX_BROKEN_DEFAULT_RATIO` times the number of
    models in the pool, truncated to an integer.

    The threshold and the sorted names of the models with broken
    DUTs are logged to stdout.

    @param inventory: _LabInventory object.
    @param pool: The pool to check.
    @param args: Parsed command line arguments.

    @return True if the number of models with 1 or more broken duts
            exceed max_broken_models, False otherwise.
    """
    # Were we asked to skip this check?
    if args.force_rebalance:
        return False
    if args.all_models and args.max_broken_models == 0:
        return False

    threshold = args.max_broken_models
    if threshold is None:
        model_count = len(inventory.get_pool_models(pool))
        threshold = int(_MAX_BROKEN_DEFAULT_RATIO * model_count)
    _log_info(args.dry_run,
              'Max broken models for pool %s: %d',
              pool, threshold)

    broken_models = []
    for model, counts in inventory.iteritems():
        if counts.get_broken(pool) != 0:
            broken_models.append(model)
    _log_message('There are %d models in the %s pool with at least 1 '
                 'broken DUT (max threshold %d)',
                 len(broken_models), pool, threshold)
    for name in sorted(broken_models):
        _log_message(name)
    return len(broken_models) > threshold

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

489

490

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

491

def _parse_command(argv):
    """Parse the command line arguments.

    Create an argument parser for this command's syntax, parse the
    command line, and return the result of the `ArgumentParser`
    `parse_args()` method.

    After parsing, two combinations are rejected via
    `ArgumentParser.error()`:
      + naming individual models together with `--all-models`;
      + choosing a non-default `--spare` pool while balancing all
        critical pools.

    @param argv Standard command line argument vector; `argv[0]` is
                assumed to be the command name.

    @return Result returned by `ArgumentParser.parse_args()`.

    """
    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Balance pool shortages from spares on reserve')

    parser.add_argument(
            '-w', '--web', type=str, default=None,
            help='AFE host to use. Default comes from shadow_config.',
    )

    # At most one of the pool size options may be given.
    count_group = parser.add_mutually_exclusive_group()
    count_group.add_argument('-t', '--total', type=int,
                             metavar='COUNT', default=None,
                             help='Set the number of DUTs in the '
                                  'pool to the specified count for '
                                  'every MODEL')
    count_group.add_argument('-a', '--grow', type=int,
                             metavar='COUNT', default=None,
                             help='Add the specified number of DUTs '
                                  'to the pool for every MODEL')
    count_group.add_argument('-d', '--shrink', type=int,
                             metavar='COUNT', default=None,
                             help='Remove the specified number of DUTs '
                                  'from the pool for every MODEL')

    parser.add_argument('-s', '--spare', default=_SPARE_DEFAULT,
                        metavar='POOL',
                        help='Pool from which to draw replacement '
                             'spares (default: pool:%s)' % _SPARE_DEFAULT)
    parser.add_argument('-n', '--dry-run', action='store_true',
                        help='Report actions to take in the form of '
                             'shell commands')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Print more detail about calculations for debug '
                             'purposes.')

    parser.add_argument('-m', '--max-broken', default=2, type=int,
                        metavar='COUNT',
                        help='Only rebalance a pool if it has at most '
                             'COUNT broken DUTs.')
    parser.add_argument('-f', '--force-rebalance', action='store_true',
                        help='Forcefully rebalance all DUTs in a pool, even '
                             'if it has a large number of broken DUTs. '
                             'Before doing this, please investigate whether '
                             'there is a bug that is bricking devices in the '
                             'lab.')
    parser.add_argument('--production', action='store_true',
                        help='Treat this as a production run. This will '
                             'collect metrics.')

    parser.add_argument(
            '--all-models',
            action='store_true',
            help='Rebalance all managed models. This will do a very expensive '
                 'check to see how many models have at least one broken DUT. '
                 'To bypass that check, set --max-broken-models to 0.',
    )
    # The check made against this value refuses to balance only when
    # the number of models with broken DUTs is strictly greater than
    # COUNT, hence "at most".
    parser.add_argument(
            '--max-broken-models', default=None, type=int, metavar='COUNT',
            help='Only rebalance all models if number of models with broken '
                 'DUTs in the specified pool is at most COUNT.',
    )

    parser.add_argument('pool',
                        metavar='POOL',
                        help='Name of the pool to balance. Use %s to balance '
                             'all critical pools' % _ALL_CRITICAL_POOLS)
    parser.add_argument('models', nargs='*', metavar='MODEL',
                        help='Names of models to balance.')

    parser.add_argument('--sku', type=str,
                        help='Optional name of sku to restrict to.')

    arguments = parser.parse_args(argv[1:])

    # Error-check arguments.
    if arguments.models and arguments.all_models:
        parser.error('Cannot specify individual models on the command line '
                     'when using --all-models.')
    if (arguments.pool == _ALL_CRITICAL_POOLS and
            arguments.spare != _SPARE_DEFAULT):
        # Report the pool the user actually asked for; the default
        # pool is the only one that is accepted here.
        parser.error('Cannot specify --spare pool to be %s when balancing all '
                     'critical pools.' % arguments.spare)
    return arguments

Prathmesh Prabhu

2017-11-09 16:42:48 -0800

[diff] [blame]

588

def infer_balancer_targets(afe, arguments, pools):
    """Translate parsed arguments into a list of balancing targets.

    When `arguments.all_models` is set, every model reported by the
    inventory for a pool becomes a target for that pool, unless
    `_too_many_broken()` says the pool should be left alone; in that
    case an error is logged and the pool contributes no targets.  The
    outcome for each pool is also recorded in the `unchanged_pools`
    metric and logged.

    Otherwise, the targets are the models named on the command line.
    The optional `arguments.sku` restriction is applied only in this
    case; it is not applied to `--all-models` targets.

    @param afe        AFE object to be used for taking inventory.
    @param arguments  Parsed command line arguments.
    @param pools      The list of pools to balance.

    @returns a list of (pool, labels) tuples to be balanced, where
             `labels` is the value of `LabelsMapping.getlabels()`.

    """
    balancer_targets = []
    # The inventory does not depend on the pool, so it is fetched at
    # most once (on first use) instead of once per pool.
    inventory = None

    for pool in pools:
        if arguments.all_models:
            if inventory is None:
                inventory = lab_inventory.get_inventory(afe)
            quarantine = _too_many_broken(inventory, pool, arguments)
            if quarantine:
                _log_error('Refusing to balance all models for %s pool, '
                           'too many models with at least 1 broken DUT '
                           'detected.', pool)
            else:
                for model in inventory.get_pool_models(pool):
                    labels = labellib.LabelsMapping()
                    labels['model'] = model
                    balancer_targets.append((pool, labels.getlabels()))
            # Reported for every pool, whether or not it was skipped.
            metrics.Boolean(
                    'chromeos/autotest/balance_pools/unchanged_pools').set(
                            quarantine, fields={'pool': pool})
            _log_message('Pool %s quarantine status: %s', pool, quarantine)
        else:
            for model in arguments.models:
                labels = labellib.LabelsMapping()
                labels['model'] = model
                if arguments.sku:
                    labels['sku'] = arguments.sku
                balancer_targets.append((pool, labels.getlabels()))
    return balancer_targets

Jacob Kopczynski

2017-08-25 17:28:35 -0700

[diff] [blame]

def main(argv):
    """Standard main routine.

    Parses the command line, works out the (pool, labels) targets to
    balance, and hands them to a pool of worker processes.

    @param argv  Command line arguments including `sys.argv[0]`.

    """
    arguments = _parse_command(argv)
    # Only production runs set up ts_mon state; other runs get a
    # context manager from `site_utils.TrivialContextManager()`.
    metrics_manager = (
            site_utils.SetupTsMonGlobalState('balance_pools', indirect=True)
            if arguments.production
            else site_utils.TrivialContextManager())

    with metrics_manager, metrics.SuccessCounter(
            'chromeos/autotest/balance_pools/runs'):
        # Time window handed to `_balance_model()`: the 24 hours
        # ending now.
        end_time = time.time()
        start_time = end_time - 24 * 60 * 60
        afe = frontend.AFE(server=arguments.web)

        def balancer(pool, labels):
            """Balance the specified model.

            @param pool: The pool to rebalance for the model.
            @param labels: labels to restrict to balancing operations
                    within.
            """
            _balance_model(arguments, afe, pool, labels,
                           start_time, end_time)
            _log_message('')

        if arguments.pool == _ALL_CRITICAL_POOLS:
            pools = lab_inventory.CRITICAL_POOLS
        else:
            pools = [arguments.pool]
        balancer_targets = infer_balancer_targets(afe, arguments, pools)

        try:
            parallel.RunTasksInProcessPool(
                    balancer,
                    balancer_targets,
                    processes=8,
            )
        except KeyboardInterrupt:
            # An interrupt from the keyboard ends the run without a
            # traceback.
            pass

J. Richard Barnette