Blame - site_utils/balance_pools.py - platform/external/autotest

2015-04-21 10:22:31 -0700

[diff] [blame]

2

3

# Use of this source code is governed by a BSD-style license that can be

4

# found in the LICENSE file.

5

6

"""Adjust pool balances to cover DUT shortfalls.

7

8

This command takes all broken DUTs in a specific pool for specific

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

9

models and swaps them with working DUTs taken from a selected pool

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

10

of spares. The command is meant primarily for replacing broken DUTs

11

in critical pools like BVT or CQ, but it can also be used to adjust

12

pool sizes, or to create or remove pools.

13

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

14

usage: balance_pools.py [ options ] POOL MODEL [ MODEL ... ]

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

15

16

positional arguments:

17

POOL Name of the pool to balance

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

18

MODEL Names of models to balance

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

19

20

optional arguments:

21

-h, --help show this help message and exit

22

-t COUNT, --total COUNT

23

Set the number of DUTs in the pool to the specified

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

24

count for every MODEL

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

25

-a COUNT, --grow COUNT

26

Add the specified number of DUTs to the pool for every

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

27

MODEL

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

28

-d COUNT, --shrink COUNT

29

Remove the specified number of DUTs from the pool for

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

30

every MODEL

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

31

-s POOL, --spare POOL

32

Pool from which to draw replacement spares (default:

33

pool:suites)

Matthew Leszczenski

df2c3d7

2018-11-13 14:50:01 -0800

[diff] [blame]

34

-p PHASE, --phase PHASE

35

Phase to restrict the balance pool operation to

Chung-yih Wang

cc1d9cb

2017-11-30 11:20:45 +0800

[diff] [blame]

36

--sku SKU The specific SKU we intend to swap with

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

37

-n, --dry-run Report actions to take in the form of shell commands

38

39

40

The command attempts to remove all broken DUTs from the target POOL

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

41

for every MODEL, and replace them with enough working DUTs taken

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

42

from the spare pool to bring the strength of POOL to the requested

43

total COUNT.

44

45

If no COUNT options are supplied (i.e. there are no --total, --grow,

46

or --shrink options), the command will maintain the current totals of

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

47

DUTs for every MODEL in the target POOL.

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

48

49

If not enough working spares are available, broken DUTs may be left

50

in the pool to keep the pool at the target COUNT.

51

52

When reducing pool size, working DUTs will be returned after broken

53

DUTs, if it's necessary to achieve the target COUNT.

"""

import argparse

Richard Barnette

2018-06-14 14:34:34 -0700

[diff] [blame]

59

import os

60

import re

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

import sys

import time

import common

Xixuan Wu

93e646c

2017-12-07 18:36:10 -0800

[diff] [blame]

65

from autotest_lib.server import constants

Jacob Kopczynski

2017-08-25 17:28:35 -0700

[diff] [blame]

66

from autotest_lib.server import site_utils

Aviv Keshet

a883432

2018-05-07 13:28:32 -0700

[diff] [blame]

67

from autotest_lib.server.cros.dynamic_suite import frontend_wrappers

Aviv Keshet

7ee9586

2016-08-30 15:18:27 -0700

[diff] [blame]

68

from autotest_lib.server.lib import status_history

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

69

from autotest_lib.site_utils import lab_inventory

Prathmesh Prabhu

68acc40

2017-11-09 15:24:15 -0800

[diff] [blame]

70

from autotest_lib.utils import labellib

Jacob Kopczynski

2017-08-25 17:28:35 -0700

[diff] [blame]

71

from chromite.lib import metrics

David James

2a3cb54

2015-05-05 17:13:43 -0700

[diff] [blame]

72

from chromite.lib import parallel

73

Jacob Kopczynski

2017-10-10 14:37:33 -0700

[diff] [blame]

74

# This must be imported after chromite.lib.metrics

75

from infra_libs import ts_mon

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

76

77

_POOL_PREFIX = constants.Labels.POOL_PREFIX

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

78

# This is the ratio of all models we should calculate the default max

79

# number of broken models against. It seemed like the best choice that

80

# was neither too strict nor lax.

Prathmesh Prabhu

2017-11-09 16:42:48 -0800

[diff] [blame]

81

_MAX_BROKEN_DEFAULT_RATIO = 3.0 / 8.0

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

82

83

_ALL_CRITICAL_POOLS = 'all_critical_pools'

84

_SPARE_DEFAULT = lab_inventory.SPARE_POOL

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

85

86

Richard Barnette

2018-06-14 14:34:34 -0700

[diff] [blame]

87

# _VALID_POOL_PATTERN - Regular expression matching pool names that will

88

# be accepted on the command line.

89

#

90

# Note: This pattern was selected merely to recognize all existing pool

91

# names; there's no underlying technical restriction motivating this

92

# pattern. No reasonable request to add more special characters to the

93

# allowed set should be refused.

94

95

# Accepts ASCII letters, digits, '_' and '-' only.
_VALID_POOL_PATTERN = re.compile(r'^[a-zA-Z0-9_\-]+$')

96

97

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

98

def _log_message(message, *args):
    """Write one unadorned line to stdout.

    When `*args` are supplied, `message` is treated as a `%`-style
    format string and is interpolated with them; otherwise it is
    written exactly as given.  A newline is always appended.

    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    text = message % args if args else message
    sys.stdout.write('%s\n' % text)

115

116

117

def _log_info(dry_run, message, *args):
    """Write an informational line to stdout, dry-run aware.

    For a dry run the line is prefixed with `# `, so that it reads as
    a shell comment; otherwise it is written as unadorned text.  The
    formatting and output are delegated to `_log_message()`.

    @param dry_run  Whether the logging is for a dry run.
    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    prefix = '# ' if dry_run else ''
    _log_message(prefix + message, *args)

135

136

137

def _log_error(message, *args):
    """Write one error line to stderr.

    The line is prefixed with `ERROR: ` to mark it as an error.  When
    `*args` are supplied, `message` is treated as a `%`-style format
    string and is interpolated with them first.

    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    text = message % args if args else message
    sys.stderr.write('ERROR: %s\n' % text)

154

155

156

class _DUTPool(object):
    """Information about a pool of DUTs matching given labels.

    This class collects the DUTs found for a given pool and set of
    labels, and sorts each of them into one of three categories:
      + Working - the DUT is working for testing, and not locked.
      + Broken - the DUT is unable to run tests, or it is locked.
      + Ineligible - the DUT is not available to be removed from this
        pool.  The DUT may be either working or broken.

    A DUT that does not carry exactly one `pool:` label is ineligible
    for exchange during balancing.  This is done for the sake of
    chameleon hosts, which must always be assigned to pool:suites.
    These DUTs are always marked with pool:chameleon to prevent their
    reassignment.

    TODO(jrbarnette):  The use of `pool:chameleon` (instead of just
    the `chameleon` label) is a hack that should be eliminated.

    _DUTPool instances are used to track both main pools that need
    to be resupplied with working DUTs and spare pools that supply
    those DUTs.

    @property pool              Name of the pool associated with
                                this pool of DUTs.
    @property labels            Labels that constrain the DUTs to
                                consider.
    @property working_hosts     The list of this pool's working DUTs.
    @property broken_hosts      The list of this pool's broken DUTs.
    @property ineligible_hosts  The list of this pool's ineligible
                                DUTs.
    @property pool_labels       A list of labels that identify a DUT
                                as part of this pool.
    @property total_hosts       The total number of hosts in pool.

    """

    def __init__(self, afe, pool, labels, start_time, end_time):
        self.pool = pool
        self.labels = labellib.LabelsMapping(labels)
        self.labels['pool'] = pool
        self._pool_labels = [_POOL_PREFIX + pool]

        # The three category lists must exist before `_get_hosts()`
        # runs, because that method appends to them.
        self.working_hosts = []
        self.broken_hosts = []
        self.ineligible_hosts = []
        self.total_hosts = self._get_hosts(afe, start_time, end_time)

    def _get_hosts(self, afe, start_time, end_time):
        """Fetch host histories and sort every DUT into a category.

        @param afe         AFE object used to query the histories.
        @param start_time  Start time for the HostJobHistory objects.
        @param end_time    End time for the HostJobHistory objects.

        @return The number of histories (and hence DUTs) found.

        """
        histories = status_history.HostJobHistory.get_multiple_histories(
                afe, start_time, end_time, self.labels.getlabels())
        for history in histories:
            dut = history.host
            pool_label_count = sum(
                    1 for label in dut.labels
                    if label.startswith(_POOL_PREFIX))
            if pool_label_count != 1:
                self.ineligible_hosts.append(dut)
                continue
            # A locked DUT counts as broken even if it was last
            # diagnosed as working.
            is_working = (
                    history.last_diagnosis()[0] == status_history.WORKING
                    and not dut.locked)
            bucket = self.working_hosts if is_working else self.broken_hosts
            bucket.append(dut)
        return len(histories)

    @property
    def pool_labels(self):
        """Return the AFE labels that identify this pool.

        The returned labels are the labels that must be removed
        to remove a DUT from the pool, or added to add a DUT.

        @return A list of AFE labels suitable for AFE.add_labels()
                or AFE.remove_labels().

        """
        return self._pool_labels

    def calculate_spares_needed(self, target_total):
        """Calculate and log the spares needed to achieve a target.

        Return how many working spares are needed to achieve the
        given `target_total` with all DUTs working.

        The spares count may be positive or negative.  Positive
        values indicate spares are needed to replace broken DUTs in
        order to reach the target; negative numbers indicate that
        no spares are needed, and that a corresponding number of
        working devices can be returned.

        If the new target total would require returning ineligible
        DUTs, an error is logged, and the target total is adjusted
        so that those DUTs are not exchanged.

        @param target_total  The new target pool size.

        @return The number of spares needed.

        """
        reserved = len(self.ineligible_hosts)
        target_reachable = target_total >= reserved
        exhausted_metric = metrics.Boolean(
                'chromeos/autotest/balance_pools/exhausted_pools',
                'True for each pool/model which requests more DUTs than supplied',
                # TODO(jrbarnette) The 'board' field is a legacy.  We need
                # to leave it here until we do the extra work Monarch
                # requires to delete a field.
                field_spec=[
                        ts_mon.StringField('pool'),
                        ts_mon.StringField('board'),
                        ts_mon.StringField('model'),
                ])
        exhausted_metric.set(
                not target_reachable,
                fields={
                        'pool': self.pool,
                        'board': self.labels.get('model', ''),
                        'model': self.labels.get('model', ''),
                })
        if target_reachable:
            _log_message('%s %s pool: Target of %d is above minimum.',
                         self.labels.get('model', ''), self.pool,
                         target_total)
        else:
            _log_error(
                    '%s pool (%s): Target of %d is below minimum of %d DUTs.',
                    self.pool, self.labels, target_total, reserved)
            _log_error('Adjusting target to %d DUTs.', reserved)
            target_total = reserved
        return len(self.broken_hosts) + (target_total - self.total_hosts)

    def allocate_surplus(self, num_broken):
        """Allocate a list of DUTs that can be returned as surplus.

        Return a list of devices that can be returned in order to
        reduce this pool's supply.  Broken DUTs will be preferred
        over working ones.

        The `num_broken` parameter indicates the number of broken
        DUTs to be left in the pool.  If this number exceeds the
        number of broken DUTs actually in the pool, the returned
        list will be empty.  If this number is negative, it
        indicates a number of working DUTs to be returned in
        addition to all broken ones.

        @param num_broken  Total number of broken DUTs to be left in
                           this pool.

        @return A list of DUTs to be returned as surplus.

        """
        if num_broken < 0:
            extra_working = -num_broken
            return self.broken_hosts + self.working_hosts[:extra_working]
        return self.broken_hosts[num_broken:]

J. Richard Barnette

2015-04-21 10:22:31 -0700

[diff] [blame]

314

315

316

def _exchange_labels(dry_run, hosts, target_pool, spare_pool):
    """Reassign a list of DUTs from one pool to another.

    For all the given hosts, remove the pool labels of `spare_pool`,
    and add the pool labels of `target_pool`.  The transfer is logged
    and counted in the `duts_moved` metric before any label is
    changed, including when `hosts` is empty.

    If `dry_run` is true, perform no changes, but log the `atest`
    commands needed to accomplish the necessary label changes.

    @param dry_run      Whether the logging is for a dry run or
                        for actual execution.
    @param hosts        List of DUTs (AFE hosts) to be reassigned.
    @param target_pool  The `_DUTPool` object whose labels are added
                        to the hosts, i.e. the pool they move into.
    @param spare_pool   The `_DUTPool` object whose labels are removed
                        from the hosts, i.e. the pool they move out
                        of.

    """
    host_count = len(hosts)
    _log_info(dry_run, 'Transferring %d DUTs from %s to %s.',
              host_count, spare_pool.pool, target_pool.pool)
    moved_metric = metrics.Counter(
            'chromeos/autotest/balance_pools/duts_moved',
            'DUTs transferred between pools',
            # TODO(jrbarnette) The 'board' field is a legacy.  We need to
            # leave it here until we do the extra work Monarch requires to
            # delete a field.
            field_spec=[
                    ts_mon.StringField('board'),
                    ts_mon.StringField('model'),
                    ts_mon.StringField('source_pool'),
                    ts_mon.StringField('target_pool'),
            ])
    moved_metric.increment_by(
            host_count,
            fields={
                    'board': target_pool.labels.get('model', ''),
                    'model': target_pool.labels.get('model', ''),
                    'source_pool': spare_pool.pool,
                    'target_pool': target_pool.pool,
            })
    if not hosts:
        return

    additions = target_pool.pool_labels
    removals = spare_pool.pool_labels
    if dry_run:
        # Only print the equivalent shell commands; touch nothing.
        removal_args = ' '.join(removals)
        addition_args = ' '.join(additions)
        for dut in hosts:
            _log_message('atest label remove -m %s %s',
                         dut.hostname, removal_args)
            _log_message('atest label add -m %s %s',
                         dut.hostname, addition_args)
        return

    for dut in hosts:
        _log_message('Updating host: %s.', dut.hostname)
        dut.remove_labels(removals)
        dut.add_labels(additions)

372

373

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

374

def _balance_model(arguments, afe, pool, labels, start_time, end_time):
    """Balance one model as requested by command line arguments.

    Takes inventory of the spare pool and the main pool, works out the
    target size of the main pool from `--total`, `--grow` or
    `--shrink` (defaulting to the current size), and then exchanges
    DUTs between the two pools.  No DUTs are exchanged when the main
    pool has more than `arguments.max_broken` broken DUTs, unless
    `arguments.force_rebalance` is set.

    @param arguments   Parsed command line arguments.
    @param afe         AFE object to be used for the changes.
    @param pool        Pool of the model to be balanced.
    @param labels      Restrict the balancing operation within DUTs
                       that have these labels.
    @param start_time  Start time for HostJobHistory objects in
                       the DUT pools.
    @param end_time    End time for HostJobHistory objects in the
                       DUT pools.

    """
    spare_pool = _DUTPool(afe, arguments.spare, labels, start_time, end_time)
    main_pool = _DUTPool(afe, pool, labels, start_time, end_time)

    if arguments.total is not None:
        target_total = arguments.total
    elif arguments.grow:
        target_total = main_pool.total_hosts + arguments.grow
    elif arguments.shrink:
        target_total = main_pool.total_hosts - arguments.shrink
    else:
        target_total = main_pool.total_hosts

    spares_needed = main_pool.calculate_spares_needed(target_total)
    # `shortfall` is the number of broken DUTs that must stay in the
    # main pool; when negative, that many working DUTs are surplus.
    spare_duts = []
    shortfall = spares_needed
    if spares_needed > 0:
        spare_duts = spare_pool.working_hosts[:spares_needed]
        shortfall -= len(spare_duts)

    surplus_duts = main_pool.allocate_surplus(shortfall)
    broken_count = len(main_pool.broken_hosts)
    dry_run = arguments.dry_run

    if spares_needed or surplus_duts or arguments.verbose:
        _log_message('')

        _log_info(dry_run, 'Balancing %s %s pool:', labels, main_pool.pool)
        _log_info(dry_run,
                  'Total %d DUTs, %d working, %d broken, %d reserved.',
                  main_pool.total_hosts, len(main_pool.working_hosts),
                  broken_count, len(main_pool.ineligible_hosts))

        if spares_needed == 0:
            size_change = 'no change to pool size'
        elif spares_needed > 0:
            size_change = 'grow pool by %d DUTs' % spares_needed
        else:
            size_change = 'shrink pool by %d DUTs' % -spares_needed
        _log_info(dry_run, 'Target is %d working DUTs; %s.',
                  target_total, size_change)

        _log_info(dry_run,
                  '%s %s pool has %d spares available for balancing pool %s',
                  labels, spare_pool.pool, len(spare_pool.working_hosts),
                  main_pool.pool)

        if spares_needed > len(spare_duts):
            _log_error('Not enough spares: need %d, only have %d.',
                       spares_needed, len(spare_duts))
        elif shortfall >= 0:
            _log_info(dry_run,
                      '%s %s pool will return %d broken DUTs, '
                      'leaving %d still in the pool.',
                      labels, main_pool.pool,
                      len(surplus_duts),
                      broken_count - len(surplus_duts))
        else:
            _log_info(dry_run,
                      '%s %s pool will return %d surplus DUTs, '
                      'including %d working DUTs.',
                      labels, main_pool.pool,
                      broken_count - shortfall,
                      -shortfall)

    too_many_broken = broken_count > arguments.max_broken
    if too_many_broken and not arguments.force_rebalance:
        _log_error('%s %s pool: Refusing to act on pool with %d broken DUTs.',
                   labels, main_pool.pool, broken_count)
        for error_line in (
                'Please investigate this model to for a bug ',
                'that is bricking devices. Once you have finished your ',
                'investigation, you can force a rebalance with ',
                '--force-rebalance'):
            _log_error(error_line)
        # Cancel both transfers; the calls below still run with empty
        # lists, so the log line and metric are still emitted.
        spare_duts = []
        surplus_duts = []

    if arguments.verbose and not (spare_duts or surplus_duts):
        _log_info(dry_run, 'No exchange required.')

    # Return surplus DUTs to the spare pool first, then draw the
    # replacements from it.
    _exchange_labels(dry_run, surplus_duts, spare_pool, main_pool)
    _exchange_labels(dry_run, spare_duts, main_pool, spare_pool)

470

471

472

def _parse_command(argv):
    """Parse the command line arguments.

    Create an argument parser for this command's syntax, parse the
    command line, and return the result of the `ArgumentParser`
    `parse_args()` method.  After parsing, the combination of options
    is checked; any violation is reported through `parser.error()`:
      + Individual MODEL names may not be combined with --all-models.
      + A non-default --spare pool may not be used when balancing
        all critical pools.
      + Both the spare pool and the target pool must have names
        matching `_VALID_POOL_PATTERN`.

    @param argv Standard command line argument vector; `argv[0]` is
                assumed to be the command name.

    @return Result returned by `ArgumentParser.parse_args()`.

    """
    parser = argparse.ArgumentParser(
            prog=os.path.basename(argv[0]),
            description='Balance pool shortages from spares on reserve')

    parser.add_argument(
        '-w', '--web', type=str, default=None,
        help='AFE host to use. Default comes from shadow_config.',
    )
    # At most one of the pool-size options may be given.
    count_group = parser.add_mutually_exclusive_group()
    count_group.add_argument('-t', '--total', type=int,
                             metavar='COUNT', default=None,
                             help='Set the number of DUTs in the '
                                  'pool to the specified count for '
                                  'every MODEL')
    count_group.add_argument('-a', '--grow', type=int,
                             metavar='COUNT', default=None,
                             help='Add the specified number of DUTs '
                                  'to the pool for every MODEL')
    count_group.add_argument('-d', '--shrink', type=int,
                             metavar='COUNT', default=None,
                             help='Remove the specified number of DUTs '
                                  'from the pool for every MODEL')

    parser.add_argument('-s', '--spare', default=_SPARE_DEFAULT,
                        metavar='POOL',
                        help='Pool from which to draw replacement '
                             'spares (default: pool:%s)' % _SPARE_DEFAULT)
    parser.add_argument('-n', '--dry-run', action='store_true',
                        help='Report actions to take in the form of '
                             'shell commands')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Print more detail about calculations for debug '
                             'purposes.')

    parser.add_argument('-m', '--max-broken', default=2, type=int,
                        metavar='COUNT',
                        help='Only rebalance a pool if it has at most '
                             'COUNT broken DUTs.')
    parser.add_argument('-f', '--force-rebalance', action='store_true',
                        help='Forcefully rebalance all DUTs in a pool, even '
                             'if it has a large number of broken DUTs. '
                             'Before doing this, please investigate whether '
                             'there is a bug that is bricking devices in the '
                             'lab.')
    parser.add_argument('--production', action='store_true',
                        help='Treat this as a production run. This will '
                             'collect metrics.')

    parser.add_argument(
        '--all-models',
        action='store_true',
        help='Rebalance all managed models.  This will do a very expensive '
             'check to see how many models have at least one broken DUT. '
             'To bypass that check, set --max-broken-models to 0.',
    )
    parser.add_argument(
        '--max-broken-models', default=None, type=int, metavar='COUNT',
        help='Only rebalance all models if number of models with broken '
             'DUTs in the specified pool is less than COUNT.',
    )

    parser.add_argument('pool',
                        metavar='POOL',
                        help='Name of the pool to balance.  Use %s to balance '
                             'all critical pools' % _ALL_CRITICAL_POOLS)
    parser.add_argument('models', nargs='*', metavar='MODEL',
                        help='Names of models to balance.')

    parser.add_argument('-p', '--phase', metavar='PHASE',
                        help='Optional phase label to restrict balance '
                             'operation to.')

    parser.add_argument('--sku', type=str,
                        help='Optional name of sku to restrict to.')

    arguments = parser.parse_args(argv[1:])

    # Error-check arguments.
    if arguments.models and arguments.all_models:
        parser.error('Cannot specify individual models on the command line '
                     'when using --all-models.')
    if (arguments.pool == _ALL_CRITICAL_POOLS and
            arguments.spare != _SPARE_DEFAULT):
        # This fires when the spare pool is NOT the default, so the
        # message names the default as the only allowed value.
        parser.error('Cannot specify a --spare pool other than %s when '
                     'balancing all critical pools.' % _SPARE_DEFAULT)
    for pool_name in (arguments.spare, arguments.pool):
        if not _VALID_POOL_PATTERN.match(pool_name):
            parser.error('Invalid pool name: %s' % pool_name)
    return arguments

Prathmesh Prabhu

2017-11-09 16:42:48 -0800

[diff] [blame]

576

def infer_balancer_targets(afe, arguments, pools):
    """Translate command line arguments into a list of balancing targets.

    When `arguments.all_models` is set, every model the lab inventory
    reports for a pool becomes a target for that pool.  Otherwise, the
    models named in `arguments.models` are used for every pool.

    Each target carries a 'model' label, plus a 'phase' label when
    `arguments.phase` is set.  A 'sku' label is added only for models
    named explicitly on the command line; `arguments.sku` is not applied
    when `arguments.all_models` is set.

    @param afe        AFE object to be used for taking inventory.
    @param arguments  Parsed command line arguments.
    @param pools      The list of pools to balance.

    @returns a list of (pool, labels) tuples to be balanced, where
             `labels` is the value returned by
             `labellib.LabelsMapping.getlabels()`.

    """
    # The inventory request takes only `afe`, not the pool, so fetch it
    # once up front instead of once per pool.
    inventory = (lab_inventory.get_inventory(afe)
                 if arguments.all_models else None)

    balancer_targets = []
    for pool in pools:
        if arguments.all_models:
            models = inventory.get_pool_models(pool)
            sku = None
        else:
            models = arguments.models
            sku = arguments.sku
        for model in models:
            # Labels are inserted in a fixed order: model, sku, phase.
            labels = labellib.LabelsMapping()
            labels['model'] = model
            if sku:
                labels['sku'] = sku
            if arguments.phase:
                labels['phase'] = arguments.phase
            balancer_targets.append((pool, labels.getlabels()))
    return balancer_targets

Jacob Kopczynski

2017-08-25 17:28:35 -0700

[diff] [blame]

def main(argv):
    """Entry point for the command: balance the requested pools.

    Parses the command line, selects a metrics context manager based on
    `arguments.production`, and then hands every target produced by
    `infer_balancer_targets()` to `parallel.RunTasksInProcessPool()`
    with `processes=8`.  A `KeyboardInterrupt` raised while those tasks
    run is deliberately ignored.

    @param argv  Command line arguments including `sys.argv[0]`.

    """
    arguments = _parse_command(argv)
    metrics_manager = (
            site_utils.SetupTsMonGlobalState('balance_pools', indirect=True)
            if arguments.production
            else site_utils.TrivialContextManager())

    with metrics_manager, metrics.SuccessCounter(
            'chromeos/autotest/balance_pools/runs'):
        # Time window passed to _balance_model(): the 24 hours ending now.
        end_time = time.time()
        start_time = end_time - 24 * 60 * 60
        afe = frontend_wrappers.RetryingAFE(server=arguments.web)

        def balancer(pool, labels):
            """Balance one (pool, labels) target.

            @param pool: The pool to rebalance for the model.
            @param labels: labels to restrict to balancing operations
                    within.
            """
            _balance_model(arguments, afe, pool, labels,
                           start_time, end_time)
            # Emit an empty log message after each target.
            _log_message('')

        if arguments.pool == _ALL_CRITICAL_POOLS:
            pools = lab_inventory.CRITICAL_POOLS
        else:
            pools = [arguments.pool]

        # Computed outside the `try` so that only the parallel run itself
        # has its KeyboardInterrupt suppressed.
        targets = infer_balancer_targets(afe, arguments, pools)
        try:
            parallel.RunTasksInProcessPool(balancer, targets, processes=8)
        except KeyboardInterrupt:
            pass

J. Richard Barnette