Blame - site_utils/balance_pools.py - platform/external/autotest

2015-04-21 10:22:31 -0700

[diff] [blame]

2

3

# Use of this source code is governed by a BSD-style license that can be

4

# found in the LICENSE file.

5

6

"""Adjust pool balances to cover DUT shortfalls.

7

8

This command takes all broken DUTs in a specific pool for specific

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

9

models and swaps them with working DUTs taken from a selected pool

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

10

of spares. The command is meant primarily for replacing broken DUTs

11

in critical pools like BVT or CQ, but it can also be used to adjust

12

pool sizes, or to create or remove pools.

13

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

14

usage:  balance_pools.py [ options ] POOL MODEL [ MODEL ... ]

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

15

16

positional arguments:

17

POOL Name of the pool to balance

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

18

MODEL Names of models to balance

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

19

20

optional arguments:

21

-h, --help show this help message and exit

22

-t COUNT, --total COUNT

23

Set the number of DUTs in the pool to the specified

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

24

count for every MODEL

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

25

-a COUNT, --grow COUNT

26

Add the specified number of DUTs to the pool for every

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

27

MODEL

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

28

-d COUNT, --shrink COUNT

29

Remove the specified number of DUTs from the pool for

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

30

every MODEL

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

31

-s POOL, --spare POOL

32

Pool from which to draw replacement spares (default:

33

pool:suites)

Matthew Leszczenski

2018-11-13 14:50:01 -0800

[diff] [blame]

34

-p PHASE, --phase PHASE

35

Phase to restrict the balance pool operation to

Chung-yih Wang

cc1d9cb

2017-11-30 11:20:45 +0800

[diff] [blame]

36

--sku SKU The specific SKU we intend to swap with

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

37

-n, --dry-run Report actions to take in the form of shell commands

38

39

40

The command attempts to remove all broken DUTs from the target POOL

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

41

for every MODEL, and replace them with enough working DUTs taken

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

42

from the spare pool to bring the strength of POOL to the requested

43

total COUNT.

44

45

If no COUNT options are supplied (i.e. there are no --total, --grow,

46

or --shrink options), the command will maintain the current totals of

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

47

DUTs for every MODEL in the target POOL.

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

48

49

If not enough working spares are available, broken DUTs may be left

50

in the pool to keep the pool at the target COUNT.

51

52

When reducing pool size, working DUTs will be returned after broken

53

DUTs, if it's necessary to achieve the target COUNT.

"""

import argparse

Richard Barnette

2018-06-14 14:34:34 -0700

[diff] [blame]

59

import os

60

import re

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

import sys

import time

import common

Xixuan Wu

93e646c

2017-12-07 18:36:10 -0800

[diff] [blame]

65

from autotest_lib.server import constants

Jacob Kopczynski

2017-08-25 17:28:35 -0700

[diff] [blame]

66

from autotest_lib.server import site_utils

Aviv Keshet

a883432

2018-05-07 13:28:32 -0700

[diff] [blame]

67

from autotest_lib.server.cros.dynamic_suite import frontend_wrappers

Aviv Keshet

7ee9586

2016-08-30 15:18:27 -0700

[diff] [blame]

68

from autotest_lib.server.lib import status_history

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

69

from autotest_lib.site_utils import lab_inventory

Prathmesh Prabhu

68acc40

2017-11-09 15:24:15 -0800

[diff] [blame]

70

from autotest_lib.utils import labellib

Jacob Kopczynski

2017-08-25 17:28:35 -0700

[diff] [blame]

71

from chromite.lib import metrics

David James

2a3cb54

2015-05-05 17:13:43 -0700

[diff] [blame]

72

from chromite.lib import parallel

73

Jacob Kopczynski

2017-10-10 14:37:33 -0700

[diff] [blame]

74

#This must be imported after chromite.lib.metrics

75

from infra_libs import ts_mon

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

76

77

# Prefix that marks an AFE label as a pool label.
_POOL_PREFIX = constants.Labels.POOL_PREFIX

# This is the ratio of all models we should calculate the default max
# number of broken models against.  It seemed like the best choice that
# was neither too strict nor lax.
_MAX_BROKEN_DEFAULT_RATIO = 3.0 / 8.0

# Pseudo pool name accepted on the command line to request balancing of
# all critical pools.
_ALL_CRITICAL_POOLS = 'all_critical_pools'

# Default pool from which replacement spares are drawn.
_SPARE_DEFAULT = lab_inventory.SPARE_POOL

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

85

86

Richard Barnette

2018-06-14 14:34:34 -0700

[diff] [blame]

87

# _VALID_POOL_PATTERN - Regular expression matching pool names that will

88

# be accepted on the command line.

89

#

90

# Note: This pattern was selected merely to recognize all existing pool

91

# names; there's no underlying technical restriction motivating this

92

# pattern. No reasonable request to add more special characters to the

93

# allowed set should be refused.

94

95

# The upper-case range must be `A-Z`; `A-z` would also admit the ASCII
# punctuation that sits between 'Z' and 'a' ('[', '\', ']', '^', '`').
_VALID_POOL_PATTERN = re.compile(r'^[a-zA-Z0-9_\-]+$')

96

97

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

98

def _log_message(message, *args):
    """Log a message with optional format arguments to stdout.

    This function logs a single line to stdout, with formatting
    if necessary, and without adornments.

    If `*args` are supplied, the message will be formatted using
    the arguments.

    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    # Only apply %-formatting when arguments were given, so that a
    # plain message containing a literal '%' is written unchanged.
    if args:
        message = message % args
    sys.stdout.write('%s\n' % message)

115

116

117

def _log_info(dry_run, message, *args):
    """Log information in a dry-run dependent fashion.

    This function logs a single line to stdout, with formatting
    if necessary.  When logging for a dry run, the message is
    printed as a shell comment, rather than as unadorned text.

    If `*args` are supplied, the message will be formatted using
    the arguments.

    @param dry_run  Whether the logging is for a dry run; if true,
                    the line is prefixed with '# '.
    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    # Dry-run output is meant to be usable as a shell script, so
    # informational text must be emitted as a comment.
    if dry_run:
        message = '# ' + message
    _log_message(message, *args)

135

136

137

def _log_error(message, *args):
    """Log an error to stderr, with optional format arguments.

    This function logs a single line to stderr, prefixed to indicate
    that it is an error message.

    If `*args` are supplied, the message will be formatted using
    the arguments.

    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    # Only apply %-formatting when arguments were given, so that a
    # plain message containing a literal '%' is written unchanged.
    if args:
        message = message % args
    sys.stderr.write('ERROR: %s\n' % message)

154

155

156

class _DUTPool(object):
    """Information about a pool of DUTs matching given labels.

    This class collects information about all DUTs for a given pool and
    matching the given labels, and divides them into three categories:
      + Working - the DUT is working for testing, and not locked.
      + Broken - the DUT is unable to run tests, or it is locked.
      + Ineligible - the DUT is not available to be removed from this
            pool.  The DUT may be either working or broken.

    DUTs with more than one pool: label are ineligible for exchange
    during balancing.  This is done for the sake of chameleon hosts,
    which must always be assigned to pool:suites.  These DUTs are
    always marked with pool:chameleon to prevent their reassignment.

    TODO(jrbarnette):  The use of `pool:chameleon` (instead of just
    the `chameleon` label) is a hack that should be eliminated.

    _DUTPool instances are used to track both main pools that need
    to be resupplied with working DUTs and spare pools that supply
    those DUTs.

    @property pool                Name of the pool associated with
                                  this pool of DUTs.
    @property labels              Labels that constrain the DUTs to
                                  consider.
    @property working_hosts       The list of this pool's working DUTs.
    @property broken_hosts        The list of this pool's broken DUTs.
    @property ineligible_hosts    The list of this pool's ineligible
                                  DUTs.
    @property pool_labels         A list of labels that identify a DUT
                                  as part of this pool.
    @property total_hosts         The total number of hosts in pool.

    """

    def __init__(self, afe, pool, labels, start_time, end_time):
        """Query the AFE and classify every DUT in the pool.

        @param afe         AFE object used to look up the DUTs.
        @param pool        Name of the pool (without the pool prefix).
        @param labels      Labels restricting which DUTs are considered.
        @param start_time  Start time for the HostJobHistory queries.
        @param end_time    End time for the HostJobHistory queries.

        """
        self.pool = pool
        self.labels = labellib.LabelsMapping(labels)
        # The pool itself is part of the label query.
        self.labels['pool'] = pool
        self._pool_labels = [_POOL_PREFIX + self.pool]

        self.working_hosts = []
        self.broken_hosts = []
        self.ineligible_hosts = []
        self.total_hosts = self._get_hosts(afe, start_time, end_time)

    def _get_hosts(self, afe, start_time, end_time):
        """Fill in the working/broken/ineligible host lists.

        @param afe         AFE object used to look up the DUTs.
        @param start_time  Start time for the HostJobHistory queries.
        @param end_time    End time for the HostJobHistory queries.

        @return The total number of host histories found.

        """
        all_histories = status_history.HostJobHistory.get_multiple_histories(
                afe, start_time, end_time, self.labels.getlabels())
        for h in all_histories:
            host = h.host
            host_pools = [l for l in host.labels
                          if l.startswith(_POOL_PREFIX)]
            # Anything other than exactly one pool label means the DUT
            # must not be moved between pools.
            if len(host_pools) != 1:
                self.ineligible_hosts.append(host)
            else:
                diag = h.last_diagnosis()[0]
                # A locked DUT counts as broken even if it diagnoses
                # as working.
                if (diag == status_history.WORKING and
                        not host.locked):
                    self.working_hosts.append(host)
                else:
                    self.broken_hosts.append(host)
        return len(all_histories)

    @property
    def pool_labels(self):
        """Return the AFE labels that identify this pool.

        The returned labels are the labels that must be removed
        to remove a DUT from the pool, or added to add a DUT.

        @return A list of AFE labels suitable for AFE.add_labels()
                or AFE.remove_labels().

        """
        return self._pool_labels

    def calculate_spares_needed(self, target_total):
        """Calculate and log the spares needed to achieve a target.

        Return how many working spares are needed to achieve the
        given `target_total` with all DUTs working.

        The spares count may be positive or negative.  Positive
        values indicate spares are needed to replace broken DUTs in
        order to reach the target; negative numbers indicate that
        no spares are needed, and that a corresponding number of
        working devices can be returned.

        If the new target total would require returning ineligible
        DUTs, an error is logged, and the target total is adjusted
        so that those DUTs are not exchanged.

        @param target_total  The new target pool size.

        @return The number of spares needed.

        """
        num_ineligible = len(self.ineligible_hosts)
        # The pool can never shrink below its ineligible DUTs, because
        # those cannot be handed back.
        target_ok = target_total >= num_ineligible
        metrics.Boolean(
            'chromeos/autotest/balance_pools/exhausted_pools',
            'True for each pool/model which requests more DUTs than supplied',
            # TODO(jrbarnette) The 'board' field is a legacy.  We need
            # to leave it here until we do the extra work Monarch
            # requires to delete a field.
            field_spec=[
                    ts_mon.StringField('pool'),
                    ts_mon.StringField('board'),
                    ts_mon.StringField('model'),
            ]).set(
                    not target_ok,
                    fields={
                            'pool': self.pool,
                            'board': self.labels.get('model', ''),
                            'model': self.labels.get('model', ''),
                    },
        )
        if not target_ok:
            _log_error(
                    '%s pool (%s): Target of %d is below minimum of %d DUTs.',
                    self.pool, self.labels, target_total, num_ineligible,
            )
            _log_error('Adjusting target to %d DUTs.', num_ineligible)
            target_total = num_ineligible
        else:
            _log_message('%s %s pool: Target of %d is above minimum.',
                         self.labels.get('model', ''), self.pool, target_total)
        # Every broken DUT needs a replacement, plus or minus the
        # requested change in pool size.
        adjustment = target_total - self.total_hosts
        return len(self.broken_hosts) + adjustment

    def allocate_surplus(self, num_broken):
        """Allocate a list of DUTs that can be returned as surplus.

        Return a list of devices that can be returned in order to
        reduce this pool's supply.  Broken DUTs will be preferred
        over working ones.

        The `num_broken` parameter indicates the number of broken
        DUTs to be left in the pool.  If this number exceeds the
        number of broken DUTs actually in the pool, the returned
        list will be empty.  If this number is negative, it
        indicates a number of working DUTs to be returned in
        addition to all broken ones.

        @param num_broken    Total number of broken DUTs to be left in
                             this pool.

        @return A list of DUTs to be returned as surplus.

        """
        if num_broken >= 0:
            # Keep the first `num_broken` broken DUTs; return the rest.
            surplus = self.broken_hosts[num_broken:]
            return surplus
        else:
            # `-num_broken` is the count of working DUTs to give back.
            return (self.broken_hosts +
                    self.working_hosts[:-num_broken])

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

314

315

316

def _exchange_labels(dry_run, hosts, target_pool, spare_pool):
    """Reassign a list of DUTs from one pool to another.

    For all the given hosts, remove all labels associated with
    `spare_pool`, and add the labels for `target_pool`.

    If `dry_run` is true, perform no changes, but log the `atest`
    commands needed to accomplish the necessary label changes.

    The transfer metric is reported even when `hosts` is empty (the
    counter is incremented by zero).

    @param dry_run       Whether the logging is for a dry run or
                         for actual execution.
    @param hosts         List of DUTs (AFE hosts) to be reassigned.
    @param target_pool   The `_DUTPool` object to which the hosts
                         will be added.
    @param spare_pool    The `_DUTPool` object from which the hosts
                         are drawn.

    """
    _log_info(dry_run, 'Transferring %d DUTs from %s to %s.',
              len(hosts), spare_pool.pool, target_pool.pool)
    metrics.Counter(
        'chromeos/autotest/balance_pools/duts_moved',
        'DUTs transferred between pools',
        # TODO(jrbarnette) The 'board' field is a legacy.  We need to
        # leave it here until we do the extra work Monarch requires to
        # delete a field.
        field_spec=[
                ts_mon.StringField('board'),
                ts_mon.StringField('model'),
                ts_mon.StringField('source_pool'),
                ts_mon.StringField('target_pool'),
        ]
    ).increment_by(
            len(hosts),
            fields={
                    'board': target_pool.labels.get('model', ''),
                    'model': target_pool.labels.get('model', ''),
                    'source_pool': spare_pool.pool,
                    'target_pool': target_pool.pool,
            },
    )
    if not hosts:
        return

    additions = target_pool.pool_labels
    removals = spare_pool.pool_labels
    for host in hosts:
        if not dry_run:
            _log_message('Updating host: %s.', host.hostname)
            host.remove_labels(removals)
            host.add_labels(additions)
        else:
            # Emit the equivalent shell commands instead of acting.
            _log_message('atest label remove -m %s %s',
                         host.hostname, ' '.join(removals))
            _log_message('atest label add -m %s %s',
                         host.hostname, ' '.join(additions))

372

373

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

374

def _balance_model(arguments, afe, pool, labels, start_time, end_time):
    """Balance one model as requested by command line arguments.

    Surplus DUTs are moved from the main pool to the spare pool, and
    working spares are moved from the spare pool to the main pool.  If
    the main pool has more than `arguments.max_broken` broken DUTs and
    `arguments.force_rebalance` is not set, no DUTs are exchanged.

    @param arguments     Parsed command line arguments.
    @param afe           AFE object to be used for the changes.
    @param pool          Pool of the model to be balanced.
    @param labels        Restrict the balancing operation within DUTs
                         that have these labels.
    @param start_time    Start time for HostJobHistory objects in
                         the DUT pools.
    @param end_time      End time for HostJobHistory objects in the
                         DUT pools.

    """
    spare_pool = _DUTPool(afe, arguments.spare, labels, start_time, end_time)
    main_pool = _DUTPool(afe, pool, labels, start_time, end_time)

    # With no COUNT option, the current pool size is the target.
    target_total = main_pool.total_hosts
    if arguments.total is not None:
        target_total = arguments.total
    elif arguments.grow:
        target_total += arguments.grow
    elif arguments.shrink:
        target_total -= arguments.shrink

    spares_needed = main_pool.calculate_spares_needed(target_total)
    if spares_needed > 0:
        spare_duts = spare_pool.working_hosts[:spares_needed]
        # Number of broken DUTs that must stay because no spare is
        # available to replace them.
        shortfall = spares_needed - len(spare_duts)
    else:
        spare_duts = []
        # Zero or negative: the magnitude is the number of working
        # DUTs to hand back in addition to all broken ones.
        shortfall = spares_needed

    surplus_duts = main_pool.allocate_surplus(shortfall)

    if spares_needed or surplus_duts or arguments.verbose:
        dry_run = arguments.dry_run
        _log_message('')

        _log_info(dry_run, 'Balancing %s %s pool:', labels, main_pool.pool)
        _log_info(dry_run,
                  'Total %d DUTs, %d working, %d broken, %d reserved.',
                  main_pool.total_hosts, len(main_pool.working_hosts),
                  len(main_pool.broken_hosts), len(main_pool.ineligible_hosts))

        if spares_needed > 0:
            add_msg = 'grow pool by %d DUTs' % spares_needed
        elif spares_needed < 0:
            add_msg = 'shrink pool by %d DUTs' % -spares_needed
        else:
            add_msg = 'no change to pool size'
        _log_info(dry_run, 'Target is %d working DUTs; %s.',
                  target_total, add_msg)

        _log_info(dry_run,
                  '%s %s pool has %d spares available for balancing pool %s',
                  labels, spare_pool.pool, len(spare_pool.working_hosts),
                  main_pool.pool)

        if spares_needed > len(spare_duts):
            _log_error('Not enough spares: need %d, only have %d.',
                       spares_needed, len(spare_duts))
        elif shortfall >= 0:
            _log_info(dry_run,
                      '%s %s pool will return %d broken DUTs, '
                      'leaving %d still in the pool.',
                      labels, main_pool.pool,
                      len(surplus_duts),
                      len(main_pool.broken_hosts) - len(surplus_duts))
        else:
            _log_info(dry_run,
                      '%s %s pool will return %d surplus DUTs, '
                      'including %d working DUTs.',
                      labels, main_pool.pool,
                      len(main_pool.broken_hosts) - shortfall,
                      -shortfall)

    # Safety valve: too many broken DUTs suggests something is
    # bricking devices, so don't feed it more spares unless forced.
    if (len(main_pool.broken_hosts) > arguments.max_broken and
            not arguments.force_rebalance):
        _log_error('%s %s pool: Refusing to act on pool with %d broken DUTs.',
                   labels, main_pool.pool, len(main_pool.broken_hosts))
        _log_error('Please investigate this model to for a bug ')
        _log_error('that is bricking devices. Once you have finished your ')
        _log_error('investigation, you can force a rebalance with ')
        _log_error('--force-rebalance')
        spare_duts = []
        surplus_duts = []

    if not spare_duts and not surplus_duts:
        if arguments.verbose:
            _log_info(arguments.dry_run, 'No exchange required.')

    # Both calls are made even with empty lists; _exchange_labels()
    # still logs and reports its metric in that case.
    _exchange_labels(arguments.dry_run, surplus_duts,
                     spare_pool, main_pool)
    _exchange_labels(arguments.dry_run, spare_duts,
                     main_pool, spare_pool)

470

471

Prathmesh Prabhu

2017-11-09 16:42:48 -0800

[diff] [blame]

472

def _too_many_broken(inventory, pool, args):
    """
    Get the inventory of models and check if too many are broken.

    The check is skipped (returning False) when `args.force_rebalance`
    is set, or when `args.all_models` is set and
    `args.max_broken_models` is 0.  When `args.max_broken_models` is
    None, the threshold defaults to `_MAX_BROKEN_DEFAULT_RATIO` times
    the number of models in the pool, truncated to an integer.

    @param inventory: _LabInventory object.
    @param pool: The pool to check.
    @param args: Parsed command line arguments.

    @return True if the number of models with 1 or more broken duts
            exceed max_broken_models, False otherwise.
    """
    # Were we asked to skip this check?
    if (args.force_rebalance or
            (args.all_models and args.max_broken_models == 0)):
        return False

    max_broken = args.max_broken_models
    if max_broken is None:
        total_num = len(inventory.get_pool_models(pool))
        max_broken = int(_MAX_BROKEN_DEFAULT_RATIO * total_num)
    _log_info(args.dry_run,
              'Max broken models for pool %s: %d',
              pool, max_broken)

    broken = [model for model, counts in inventory.iteritems()
              if counts.get_broken(pool) != 0]
    _log_message('There are %d models in the %s pool with at least 1 '
                 'broken DUT (max threshold %d)',
                 len(broken), pool, max_broken)
    for b in sorted(broken):
        _log_message(b)
    return len(broken) > max_broken

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

504

505

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

506

def _parse_command(argv):
    """Parse the command line arguments.

    Create an argument parser for this command's syntax, parse the
    command line, and return the result of the `ArgumentParser`
    `parse_args()` method.

    After parsing, the arguments are checked for consistency; any
    violation is reported through `parser.error()`:
      + MODEL arguments may not be combined with --all-models.
      + --spare may not differ from the default when balancing
        all critical pools.
      + Both pool names must match `_VALID_POOL_PATTERN`.

    @param argv Standard command line argument vector; `argv[0]` is
                assumed to be the command name.

    @return Result returned by `ArgumentParser.parse_args()`.

    """
    parser = argparse.ArgumentParser(
            prog=os.path.basename(argv[0]),
            description='Balance pool shortages from spares on reserve')

    parser.add_argument(
        '-w', '--web', type=str, default=None,
        help='AFE host to use. Default comes from shadow_config.',
    )
    # At most one of --total, --grow and --shrink may be given.
    count_group = parser.add_mutually_exclusive_group()
    count_group.add_argument('-t', '--total', type=int,
                             metavar='COUNT', default=None,
                             help='Set the number of DUTs in the '
                                  'pool to the specified count for '
                                  'every MODEL')
    count_group.add_argument('-a', '--grow', type=int,
                             metavar='COUNT', default=None,
                             help='Add the specified number of DUTs '
                                  'to the pool for every MODEL')
    count_group.add_argument('-d', '--shrink', type=int,
                             metavar='COUNT', default=None,
                             help='Remove the specified number of DUTs '
                                  'from the pool for every MODEL')

    parser.add_argument('-s', '--spare', default=_SPARE_DEFAULT,
                        metavar='POOL',
                        help='Pool from which to draw replacement '
                             'spares (default: pool:%s)' % _SPARE_DEFAULT)
    parser.add_argument('-n', '--dry-run', action='store_true',
                        help='Report actions to take in the form of '
                             'shell commands')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Print more detail about calculations for debug '
                             'purposes.')

    parser.add_argument('-m', '--max-broken', default=2, type=int,
                        metavar='COUNT',
                        help='Only rebalance a pool if it has at most '
                             'COUNT broken DUTs.')
    parser.add_argument('-f', '--force-rebalance', action='store_true',
                        help='Forcefully rebalance all DUTs in a pool, even '
                             'if it has a large number of broken DUTs. '
                             'Before doing this, please investigate whether '
                             'there is a bug that is bricking devices in the '
                             'lab.')
    parser.add_argument('--production', action='store_true',
                        help='Treat this as a production run. This will '
                             'collect metrics.')

    parser.add_argument(
            '--all-models',
            action='store_true',
            help='Rebalance all managed models.  This will do a very expensive '
                 'check to see how many models have at least one broken DUT. '
                 'To bypass that check, set --max-broken-models to 0.',
    )
    # The check in _too_many_broken() refuses only when the count of
    # affected models is strictly greater than COUNT, hence "at most".
    parser.add_argument(
            '--max-broken-models', default=None, type=int, metavar='COUNT',
            help='Only rebalance all models if number of models with broken '
                 'DUTs in the specified pool is at most COUNT.',
    )

    parser.add_argument('pool',
                        metavar='POOL',
                        help='Name of the pool to balance.  Use %s to balance '
                             'all critical pools' % _ALL_CRITICAL_POOLS)
    parser.add_argument('models', nargs='*', metavar='MODEL',
                        help='Names of models to balance.')

    parser.add_argument('-p', '--phase', metavar='PHASE',
                        help='Optional phase label to restrict balance '
                             'operation to.')

    parser.add_argument('--sku', type=str,
                        help='Optional name of sku to restrict to.')

    arguments = parser.parse_args(argv[1:])

    # Error-check arguments.
    if arguments.models and arguments.all_models:
        parser.error('Cannot specify individual models on the command line '
                     'when using --all-models.')
    if (arguments.pool == _ALL_CRITICAL_POOLS and
            arguments.spare != _SPARE_DEFAULT):
        # This branch fires when --spare is NOT the default, so the
        # message must say so.
        parser.error('Cannot specify a --spare pool other than %s when '
                     'balancing all critical pools.' % _SPARE_DEFAULT)
    for p in (arguments.spare, arguments.pool):
        if not _VALID_POOL_PATTERN.match(p):
            parser.error('Invalid pool name: %s' % p)
    return arguments

Prathmesh Prabhu

2017-11-09 16:42:48 -0800

[diff] [blame]

610

def _make_target_labels(model, sku=None, phase=None):
    """Build the label list describing one model to be balanced.

    @param model  Model name, stored under the 'model' label.
    @param sku    Optional SKU name; a 'sku' label is added only when
                  this is truthy.
    @param phase  Optional phase name; a 'phase' label is added only
                  when this is truthy.

    @returns  The value of `getlabels()` for a `labellib.LabelsMapping`
              populated in the order model, sku, phase.

    """
    labels = labellib.LabelsMapping()
    labels['model'] = model
    if sku:
        labels['sku'] = sku
    if phase:
        labels['phase'] = phase
    return labels.getlabels()


def infer_balancer_targets(afe, arguments, pools):
    """Translate the command line arguments into a list of balance targets.

    With `arguments.all_models` set, every model the inventory reports
    for a pool becomes a target (restricted by `arguments.phase` when
    given), unless `_too_many_broken()` flags the pool, in which case
    the pool contributes no targets and an error is logged.  The
    quarantine decision for each pool is recorded in the
    'unchanged_pools' metric and logged.

    Without `arguments.all_models`, each model in `arguments.models`
    becomes a target for every pool, restricted by `arguments.sku` and
    `arguments.phase` when given.

    @param afe           AFE object to be used for taking inventory.
    @param arguments     Parsed command line arguments.
    @param pools         The list of pools to balance.

    @returns    a list of (pool, labels) tuples to be balanced

    """
    balancer_targets = []
    # Fetched at most once, and only when some pool actually needs it.
    # NOTE(review): assumes repeated get_inventory(afe) calls would
    # return equivalent snapshots - confirm against lab_inventory.
    inventory = None

    for pool in pools:
        if not arguments.all_models:
            for model in arguments.models:
                balancer_targets.append(
                        (pool, _make_target_labels(model,
                                                   sku=arguments.sku,
                                                   phase=arguments.phase)))
            continue

        if inventory is None:
            inventory = lab_inventory.get_inventory(afe)
        quarantine = _too_many_broken(inventory, pool, arguments)
        if quarantine:
            _log_error('Refusing to balance all models for %s pool, '
                       'too many models with at least 1 broken DUT '
                       'detected.', pool)
        else:
            # The SKU restriction is intentionally not applied here,
            # matching the explicit-model branch's sibling behavior in
            # the original code.
            for model in inventory.get_pool_models(pool):
                balancer_targets.append(
                        (pool, _make_target_labels(model,
                                                   phase=arguments.phase)))
        metrics.Boolean(
            'chromeos/autotest/balance_pools/unchanged_pools').set(
                quarantine, fields={'pool': pool})
        _log_message('Pool %s quarantine status: %s', pool, quarantine)
    return balancer_targets

Jacob Kopczynski

2017-08-25 17:28:35 -0700

[diff] [blame]

def main(argv):
    """Parse the command line and balance every requested target.

    Sets up metrics reporting (real when `--production` was given,
    a trivial stand-in otherwise), works out the (pool, labels) targets
    via `infer_balancer_targets()`, and runs `_balance_model()` for each
    target in a process pool.

    @param argv  Command line arguments including `sys.argv[0]`.

    """
    arguments = _parse_command(argv)
    metrics_manager = (
            site_utils.SetupTsMonGlobalState('balance_pools', indirect=True)
            if arguments.production
            else site_utils.TrivialContextManager())

    with metrics_manager, metrics.SuccessCounter(
            'chromeos/autotest/balance_pools/runs'):
        # Time window handed to _balance_model(): the 24 hours ending now.
        window_seconds = 24 * 60 * 60
        end_time = time.time()
        start_time = end_time - window_seconds
        afe = frontend_wrappers.RetryingAFE(server=arguments.web)

        def balancer(pool, labels):
            """Balance a single target and emit a blank separator line.

            @param pool: The pool to rebalance for the model.
            @param labels: labels to restrict to balancing operations
                    within.
            """
            _balance_model(arguments, afe, pool, labels,
                           start_time, end_time)
            _log_message('')

        if arguments.pool == _ALL_CRITICAL_POOLS:
            pools = lab_inventory.CRITICAL_POOLS
        else:
            pools = [arguments.pool]
        balancer_targets = infer_balancer_targets(afe, arguments, pools)
        try:
            parallel.RunTasksInProcessPool(
                    balancer,
                    balancer_targets,
                    processes=8,
            )
        except KeyboardInterrupt:
            # A user interrupt ends the run quietly.
            pass

J. Richard Barnette