Blame - site_utils/lab_inventory.py - platform/external/autotest

2015-03-27 17:23:52 -0700

[diff] [blame]

1

#!/usr/bin/env python

2

3

# Use of this source code is governed by a BSD-style license that can be

4

# found in the LICENSE file.

5

6

"""Create e-mail reports of the Lab's DUT inventory.

7

8

Gathers a list of all DUTs of interest in the Lab, segregated by

9

board and pool, and determines whether each DUT is working or

10

broken. Then, send one or more e-mail reports summarizing the

11

status to e-mail addresses provided on the command line.

12

13

usage: lab_inventory.py [ options ] [ board ... ]

14

15

Options:

16

--duration / -d <hours>

17

How far back in time to search job history to determine DUT

18

status.

19

20

--board-notify <address>[,<address>]

21

Send the "board status" e-mail to all the specified e-mail

22

addresses.

23

24

--pool-notify <address>[,<address>]

25

Send the "pool status" e-mail to all the specified e-mail

26

addresses.

27

J. Richard Barnette

2015-06-09 10:06:17 -0700

[diff] [blame]

28

--recommend <number>

29

When generating the "board status" e-mail, include a list of

30

<number> specific DUTs to be recommended for repair.

31

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

32

--logdir <directory>

33

Log progress and actions in a file under this directory. Text

34

of any e-mail sent will also be logged in a timestamped file in

35

this directory.

36

J. Richard Barnette

2015-10-13 16:02:47 -0700

[diff] [blame]

37

--debug

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

38

Suppress all logging and sending e-mail. Instead, write the

39

output that would be generated onto stdout.

40

41

<board> arguments:

42

With no arguments, gathers the status for all boards in the lab.

43

With one or more named boards on the command line, restricts

44

reporting to just those boards.

"""

import argparse

import logging

import logging.handlers

52

import os

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

53

import re

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

import sys

import time

import common

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

58

from autotest_lib.client.bin import utils

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

59

from autotest_lib.client.common_lib import time_utils

J. Richard Barnette

a7c514e

2015-09-15 11:13:23 -0700

[diff] [blame]

60

from autotest_lib.server.cros.dynamic_suite import frontend_wrappers

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

61

from autotest_lib.server.hosts import servo_host

Aviv Keshet

7ee9586

2016-08-30 15:18:27 -0700

[diff] [blame]

62

from autotest_lib.server.lib import status_history

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

63

from autotest_lib.site_utils import gmail_lib

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

64

from autotest_lib.site_utils.suite_scheduler import constants

65

66

Richard Barnette

673573b

2016-12-12 09:46:39 -0800

[diff] [blame]

67

# Short aliases for the pool definitions in the suite scheduler's
# `constants.Pools`.
CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS
SPARE_POOL = constants.Pools.SPARE_POOL
MANAGED_POOLS = constants.Pools.MANAGED_POOLS

# _EXCLUDED_LABELS - A set of labels that disqualify a DUT from
#   monitoring by this script.  Currently, we're excluding any
#   'adb' host, because we're not ready to monitor Android or
#   Brillo hosts.

_EXCLUDED_LABELS = set(['adb'])

# _DEFAULT_DURATION:
#     Default value used for the --duration command line option
#     (expressed in hours, per the usage text at the top of this
#     file).  Specifies how far back in time to search in order to
#     determine DUT status.

_DEFAULT_DURATION = 24

# _LOGDIR:
#     Relative path used in the calculation of the default setting
#     for the --logdir option.  The full path is relative to
#     the root of the autotest directory, as determined from
#     sys.argv[0].
# _LOGFILE:
#     Basename of a file to which general log information will be
#     written.
# _LOG_FORMAT:
#     Format string for log messages.

_LOGDIR = os.path.join('logs', 'dut-data')
_LOGFILE = 'lab-inventory.log'
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

# Pattern describing location-based host names in the Chrome OS test
# labs.  Each DUT hostname designates the DUT's location:
#   * A lab (room) that's physically separated from other labs
#     (i.e. there's a door).
#   * A row (or aisle) of DUTs within the lab.
#   * A vertical rack of shelves on the row.
#   * A specific host on one shelf of the rack.
#
# Capture groups, in order: lab name, row number, rack number,
# host number.

_HOSTNAME_PATTERN = re.compile(
        r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')

# Default entry for managed pools.  Used as the default `pool`
# argument of `_LabInventory.get_managed_boards()`, where it selects
# the "all pools" calculation rather than naming a real pool.

_MANAGED_POOL_DEFAULT = 'all_pools'

114

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

115

116

class _PoolCounts(object):
    """Maintains a set of `HostJobHistory` objects for a pool.

    The collected history objects are nominally all part of a single
    scheduling pool of DUTs.  The collection can report its working
    DUTs, its broken DUTs, its idle DUTs, and the total number of
    DUTs recorded.

    Performance note:  Certain methods in this class are potentially
    expensive:
      * `get_working()`
      * `get_working_list()`
      * `get_broken()`
      * `get_broken_list()`
      * `get_idle()`
      * `get_idle_list()`
    The first time any one of these methods is called, it causes
    multiple RPC calls with a relatively expensive set of database
    queries.  However, the results of the queries are cached in the
    individual `HostJobHistory` objects, so only the first call
    actually pays the full cost.

    Additionally, the three `get_*_list()` methods cache their return
    values so the lists aren't recalculated at every call; this
    caching is separate from the caching of RPC results described
    above.  Recording a new host discards those cached lists.

    This class is deliberately constructed to delay the RPC cost
    until the accessor methods are called (rather than to query in
    `record_host()`) so that it's possible to construct a complete
    `_LabInventory` without making the expensive queries at creation
    time.  `_populate_board_counts()`, below, assumes this behavior.

    """

    def __init__(self):
        self._histories = []
        # Each cached list is `None` until first requested.
        self._working_list = None
        self._broken_list = None
        self._idle_list = None


    def _diagnosed_as(self, *statuses):
        """Return the recorded histories with a given last diagnosis.

        @param statuses  One or more diagnosis values to select.

        @return A new list of the `HostJobHistory` objects whose
                `last_diagnosis()` status is one of `statuses`.

        """
        return [history for history in self._histories
                if history.last_diagnosis()[0] in statuses]


    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        @param host_history The `HostJobHistory` object to be
                            remembered.

        """
        # The cached lists are stale as soon as the membership changes.
        self._working_list = None
        self._broken_list = None
        self._idle_list = None
        self._histories.append(host_history)


    def get_working_list(self):
        """Return a list of all working DUTs in the pool.

        Selects the histories whose last diagnosis is `WORKING`.
        The result is cached, so it is calculated only once per
        change in membership.

        @return A list of HostJobHistory objects.

        """
        if self._working_list is None:
            self._working_list = self._diagnosed_as(
                    status_history.WORKING)
        return self._working_list


    def get_working(self):
        """Return the number of working DUTs in the pool."""
        return len(self.get_working_list())


    def get_broken_list(self):
        """Return a list of all broken DUTs in the pool.

        Selects the histories whose last diagnosis is `BROKEN`.
        The result is cached, so it is calculated only once per
        change in membership.

        @return A list of HostJobHistory objects.

        """
        if self._broken_list is None:
            self._broken_list = self._diagnosed_as(
                    status_history.BROKEN)
        return self._broken_list


    def get_broken(self):
        """Return the number of broken DUTs in the pool."""
        return len(self.get_broken_list())


    def get_idle_list(self):
        """Return a list of all idle DUTs in the pool.

        Selects the histories whose last diagnosis is `UNUSED` or
        `UNKNOWN`.  The result is cached, so it is calculated only
        once per change in membership.

        @return A list of HostJobHistory objects.

        """
        if self._idle_list is None:
            self._idle_list = self._diagnosed_as(
                    status_history.UNUSED, status_history.UNKNOWN)
        return self._idle_list


    def get_idle(self):
        """Return the number of idle DUTs in the pool."""
        return len(self.get_idle_list())


    def get_total(self):
        """Return the total number of DUTs in the pool."""
        return len(self._histories)

240

241

242

class _BoardCounts(object):
    """Maintains a set of `HostJobHistory` objects for a board.

    The collected history objects are nominally all of the same
    board.  Internally the histories are kept in one `_PoolCounts`
    object per pool in `MANAGED_POOLS`.  Counts and lists of working,
    broken and idle DUTs can be obtained either for a single pool or
    combined across all pools.

    DUTs in the collection must be assigned to one of the pools
    in `MANAGED_POOLS`; recording a host from any other pool raises
    `KeyError`.

    The accessor methods rely on the methods of the same name in
    `_PoolCounts`, so the performance note in `_PoolCounts` applies
    here as well.

    """

    def __init__(self):
        self._pools = dict(
                (name, _PoolCounts()) for name in MANAGED_POOLS)


    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        The history is filed under the pool named by its `host_pool`
        attribute.

        @param host_history The `HostJobHistory` object to be
                            remembered.

        """
        self._pools[host_history.host_pool].record_host(host_history)


    def _count_pool(self, get_pool_count, pool=None):
        """Internal helper to count hosts in a given pool.

        @param get_pool_count  Function that takes a `_PoolCounts`
                               object and returns the count of
                               interest for it.
        @param pool            The pool to be counted.  If `None`,
                               return the total across all pools.

        @return The count for the selected pool(s).

        """
        if pool is not None:
            return get_pool_count(self._pools[pool])
        return sum(get_pool_count(pool_counts)
                   for pool_counts in self._pools.values())


    def _gather(self, get_pool_list):
        """Internal helper to combine per-pool lists across all pools.

        @param get_pool_list  Function that takes a `_PoolCounts`
                              object and returns a list of
                              `HostJobHistory` objects.

        @return A new list holding the entries from every pool.

        """
        gathered = []
        for pool_counts in self._pools.values():
            gathered.extend(get_pool_list(pool_counts))
        return gathered


    def get_working_list(self):
        """Return a list of all working DUTs for the board.

        Collects, from every pool of the board, the histories whose
        last diagnosis is `WORKING`.

        @return A list of HostJobHistory objects.

        """
        return self._gather(_PoolCounts.get_working_list)


    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of working DUTs in the selected
                pool(s).

        """
        return self._count_pool(_PoolCounts.get_working, pool)


    def get_broken_list(self):
        """Return a list of all broken DUTs for the board.

        Collects, from every pool of the board, the histories whose
        last diagnosis is `BROKEN`.

        @return A list of HostJobHistory objects.

        """
        return self._gather(_PoolCounts.get_broken_list)


    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of broken DUTs in the selected
                pool(s).

        """
        return self._count_pool(_PoolCounts.get_broken, pool)


    def get_idle_list(self, pool=None):
        """Return a list of all idle DUTs for the board.

        Selects the histories whose last diagnosis is `UNUSED` or
        `UNKNOWN`.

        @param pool: The pool to be listed.  If `None`, return the
                     combined list across all pools.

        @return A list of HostJobHistory objects.

        """
        if pool is not None:
            return self._pools[pool].get_idle_list()
        return self._gather(_PoolCounts.get_idle_list)


    def get_idle(self, pool=None):
        """Return the number of idle DUTs in a pool.

        @param pool: The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of idle DUTs in the selected pool(s).

        """
        return self._count_pool(_PoolCounts.get_idle, pool)


    def get_spares_buffer(self):
        """Return the nominal number of working spares.

        Calculates and returns how many working spares there would
        be in the spares pool if all broken DUTs were in the spares
        pool.  This number may be negative, indicating a shortfall
        in the critical pools.

        @return The total number of DUTs in the spares pool, less the
                total number of broken DUTs in all pools.

        """
        spare_total = self.get_total(SPARE_POOL)
        return spare_total - self.get_broken()


    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of DUTs in the selected pool(s).

        """
        return self._count_pool(_PoolCounts.get_total, pool)

403

404

405

class _LabInventory(dict):
    """Collection of `HostJobHistory` objects for the Lab's inventory.

    The collection is indexed by board.  Indexing returns the
    _BoardCounts object associated with the board.

    The collection is also iterable.  The iterator returns all the
    boards in the inventory, in unspecified order.

    """

    @staticmethod
    def _eligible_host(afehost):
        """Return whether this host is eligible for monitoring.

        Hosts with any label that's in `_EXCLUDED_LABELS` aren't
        eligible.

        @param afehost  The host to be tested for eligibility.

        @return `True` if none of the host's labels is excluded.

        """
        return _EXCLUDED_LABELS.isdisjoint(afehost.labels)


    @classmethod
    def create_inventory(cls, afe, start_time, end_time, boardlist=None):
        """Return a Lab inventory with specified parameters.

        By default, gathers inventory from `HostJobHistory` objects
        for all DUTs in the `MANAGED_POOLS` list.  If `boardlist`
        is supplied, the inventory will be restricted to only the
        given boards.

        @param afe         AFE object for constructing the
                           `HostJobHistory` objects.
        @param start_time  Start time for the `HostJobHistory`
                           objects.
        @param end_time    End time for the `HostJobHistory`
                           objects.
        @param boardlist   List of boards to include.  If empty or
                           `None`, include all available boards.
        @return A `_LabInventory` object for the specified boards.

        """
        label_list = [constants.Labels.POOL_PREFIX + pool
                      for pool in MANAGED_POOLS]
        afehosts = afe.get_hosts(labels__name__in=label_list)
        if boardlist:
            # We're deliberately not checking host eligibility in this
            # code path.  This is a debug path, not used in production;
            # it may be useful to include ineligible hosts here.
            #
            # Each host is tested once against the whole set of board
            # labels, so a board name repeated in `boardlist` can't
            # cause the same DUT to be counted more than once.
            board_labels = set(constants.Labels.BOARD_PREFIX + board
                               for board in boardlist)
            afehosts = [h for h in afehosts
                        if not board_labels.isdisjoint(h.labels)]
        else:
            afehosts = [h for h in afehosts if cls._eligible_host(h)]
        return cls([status_history.HostJobHistory(afe, host,
                                                  start_time, end_time)
                    for host in afehosts])


    def __init__(self, histories):
        """Build the inventory from a list of `HostJobHistory` objects.

        @param histories  The `HostJobHistory` objects to be recorded.
                          Entries whose `host_board` is `None` are
                          ignored.

        """
        # N.B. The query that finds our hosts is restricted to those
        # with a valid pool: label, but doesn't check for a valid
        # board: label.  In some (insufficiently) rare cases, the
        # AFE hosts table has been known to (incorrectly) have DUTs
        # with a pool: but no board: label.  We explicitly exclude
        # those here.
        histories = [h for h in histories
                     if h.host_board is not None]
        boards = set(h.host_board for h in histories)
        initval = {board: _BoardCounts() for board in boards}
        super(_LabInventory, self).__init__(initval)
        self._dut_count = len(histories)
        # Cache of `get_managed_boards()` results, keyed by pool.
        self._managed_boards = {}
        for h in histories:
            self[h.host_board].record_host(h)


    def get_managed_boards(self, pool=_MANAGED_POOL_DEFAULT):
        """Return the set of "managed" boards.

        Operationally, saying a board is "managed" means that the
        board will be included in the "board" and "repair
        recommendations" reports.  That is, if there are failures in
        the board's inventory then lab techs will be asked to fix
        them without a separate ticket.

        For purposes of implementation, a board is "managed" if it
        has DUTs in both the spare and a non-spare (i.e. critical)
        pool.

        The result for each `pool` value is calculated once and then
        cached.

        @param pool: The specified pool for managed boards.
        @return A set of all the boards that have both spare and
                non-spare pools, unless the pool is specified,
                then the set of boards in that pool.

        """
        if pool not in self._managed_boards:
            managed = set()
            for board, counts in self.items():
                # Get the counts for all pools, otherwise get it for
                # the specified pool.
                if pool == _MANAGED_POOL_DEFAULT:
                    spares = counts.get_total(SPARE_POOL)
                    total = counts.get_total()
                    if spares != 0 and spares != total:
                        managed.add(board)
                elif counts.get_total(pool) != 0:
                    managed.add(board)
            # Cache only after the set is complete, so a failure part
            # way through can't leave a partial answer behind.
            self._managed_boards[pool] = managed
        return self._managed_boards[pool]


    def get_num_duts(self):
        """Return the total number of DUTs in the inventory."""
        return self._dut_count


    def get_num_boards(self):
        """Return the total number of boards in the inventory."""
        return len(self)

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

532

def _sort_by_location(inventory_list):
    """Return a list of DUTs, organized by location.

    Take the given list of `HostJobHistory` objects, separate it
    into a list per lab, and sort each lab's list by location.  The
    order of sorting within a lab is
      * By row number within the lab,
      * then by rack number within the row,
      * then by host shelf number within the rack.

    Return a list of the sorted lists.

    DUTs whose hostname doesn't start with the location-based form
    described by `_HOSTNAME_PATTERN` are omitted from the result.

    Implementation note: host locations are compared as
    `(row, rack, host)` tuples of integers, so the ordering is correct
    for numbers of any size.

    @param inventory_list  List of `HostJobHistory` objects to be
                           organized.

    @return A list of sorted lists of DUTs, one list per lab.  The
            order of the labs is unspecified.

    """
    lab_lists = {}
    for history in inventory_list:
        location = _HOSTNAME_PATTERN.match(history.host.hostname)
        if not location:
            continue
        lab = location.group(1)
        key = tuple(int(idx) for idx in location.group(2, 3, 4))
        lab_lists.setdefault(lab, []).append((key, history))
    return_list = []
    for dut_list in lab_lists.values():
        # Sort on the location only; the sort is stable, so DUTs at
        # the same location keep their original relative order.
        dut_list.sort(key=lambda t: t[0])
        return_list.append([t[1] for t in dut_list])
    return return_list

def _score_repair_set(buffer_counts, repair_list):
    """Return a numeric score rating a set of DUTs to be repaired.

    This function calculates the new set of buffer counts that would
    result from the proposed repairs, and scores the new set using
    two numbers:
      * Worst case buffer count for any board (higher is better).
        This is the more significant number for comparison.
      * Number of boards at the worst case (lower is better).  This
        is the less significant number.

    Implementation note:  The score could fail to reflect the
    intended criteria if there are more than 1000 boards in the
    inventory.

    @param buffer_counts  A dictionary mapping board names to the
                          size of the board's spares buffer.
    @param repair_list    A list of DUTs (`HostJobHistory` objects)
                          to be repaired.
    @return A numeric score.

    """
    _NBOARDS = 1000
    repair_inventory = _LabInventory(repair_list)
    # Work out each board's buffer count after the repairs.  Board
    # names are dropped, as they don't contribute to the final score.
    new_counts = []
    for board, spares in buffer_counts.items():
        if board in repair_inventory:
            repaired = repair_inventory[board].get_total()
        else:
            repaired = 0
        new_counts.append(spares + repaired)
    # In ascending order the worst (smallest) count comes first; then
    # count how many boards share that worst case.
    ranked = sorted(new_counts)
    worst_count = ranked[0]
    num_worst = ranked.count(worst_count)
    return _NBOARDS * worst_count - num_worst

619

620

621

def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Returns a message recommending a list of broken DUTs to be
    repaired.  The list of DUTs is selected based on these
    criteria:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical
        proximity.
      * DUTs for boards with a low spares buffer are more important
        than DUTs with larger buffers.

    The algorithm used will guarantee that at least one DUT from a
    board with the smallest spares buffer will be recommended.  If
    the worst spares buffer number is shared by more than one board,
    the algorithm will tend to prefer repair sets that include more
    of those boards over sets that cover fewer boards.

    If no broken DUT can be placed by location (for example, because
    no managed board has broken DUTs), the message contains only the
    header lines.

    @param inventory      Inventory for generating recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.

    @return String with the recommendation message.

    """
    logging.debug('Creating DUT repair recommendations')
    board_buffer_counts = {}
    broken_list = []
    for board in inventory.get_managed_boards():
        logging.debug('Listing failed DUTs for %s', board)
        counts = inventory[board]
        if counts.get_broken() != 0:
            board_buffer_counts[board] = counts.get_spares_buffer()
            broken_list.extend(counts.get_broken_list())
    # N.B. The logic inside this loop may seem complicated, but
    # simplification is hard:
    #   * Calculating an initial recommendation outside of
    #     the loop likely would make things more complicated,
    #     not less.
    #   * It's necessary to calculate an initial lab slice once per
    #     lab _before_ the while loop, in case the number of broken
    #     DUTs in a lab is less than `num_recommend`.
    recommendation = None
    best_score = None
    for lab_duts in _sort_by_location(broken_list):
        # Slide a window of `num_recommend` neighboring DUTs along
        # the lab's sorted list, keeping the best-scoring window.
        start = 0
        end = num_recommend
        lab_slice = lab_duts[start : end]
        lab_score = _score_repair_set(board_buffer_counts,
                                      lab_slice)
        while end < len(lab_duts):
            start += 1
            end += 1
            new_slice = lab_duts[start : end]
            new_score = _score_repair_set(board_buffer_counts,
                                          new_slice)
            if new_score > lab_score:
                lab_slice = new_slice
                lab_score = new_score
        if recommendation is None or lab_score > best_score:
            recommendation = lab_slice
            best_score = lab_score
    # With nothing to sort by location the loop above never runs;
    # fall back to an empty recommendation instead of failing on
    # `None` below.
    if recommendation is None:
        recommendation = []
    # N.B. The trailing space here is mandatory:  Without it, Gmail
    # will parse the URL wrong.  Don't ask.  If you simply _must_
    # know more, go try it yourself...
    line_fmt = '%-30s %-16s %-6s\n %s '
    message = ['Repair recommendations:\n',
               line_fmt % ( 'Hostname', 'Board', 'Servo?', 'Logs URL')]
    for h in recommendation:
        servo_name = servo_host.make_servo_hostname(h.host.hostname)
        servo_present = utils.host_is_in_lab_zone(servo_name)
        _, event = h.last_diagnosis()
        line = line_fmt % (
                h.host.hostname, h.host_board,
                'Yes' if servo_present else 'No', event.job_url)
        message.append(line)
    return '\n'.join(message)

696

697

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

698

def _generate_board_inventory_message(inventory):
    """Generate the "board inventory" e-mail message.

    The board inventory is a list by board summarizing the number
    of working, idle and broken DUTs, and the total shortfall or
    surplus of working devices relative to the minimum critical pool
    requirement.

    Only boards returned by `inventory.get_managed_boards()` are
    counted.  The "Full board inventory" table lists just the boards
    that have at least one broken DUT, ordered by ascending spares
    buffer and then by descending broken count.

    An inventory with no DUTs at all is reported with every
    percentage shown as 0, rather than failing on a division by
    zero.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating board inventory')
    nworking = 0
    nbroken = 0
    nidle = 0
    nbroken_boards = 0
    ntotal_boards = 0
    summaries = []
    for board in inventory.get_managed_boards():
        counts = inventory[board]
        logging.debug('Counting %2d DUTS for board %s',
                      counts.get_total(), board)
        # Summary elements laid out in the same order as the text
        # headers:
        #     Board Avail   Bad  Idle  Good  Spare Total
        #      e[0]  e[1]  e[2]  e[3]  e[4]  e[5]  e[6]
        element = (board,
                   counts.get_spares_buffer(),
                   counts.get_broken(),
                   counts.get_idle(),
                   counts.get_working(),
                   counts.get_total(SPARE_POOL),
                   counts.get_total())
        # Only boards with broken DUTs appear in the detailed table,
        # but every board contributes to the overall totals.
        if element[2]:
            summaries.append(element)
            nbroken_boards += 1
        ntotal_boards += 1
        nbroken += element[2]
        nidle += element[3]
        nworking += element[4]
    ntotal = nworking + nbroken + nidle
    summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
    if ntotal:
        broken_percent = int(round(100.0 * nbroken / ntotal))
        idle_percent = int(round(100.0 * nidle / ntotal))
        # Derive the last figure by subtraction so that the three
        # percentages always add up to exactly 100.
        working_percent = 100 - broken_percent - idle_percent
    else:
        # Nothing to report on; avoid dividing by zero.
        broken_percent = 0
        idle_percent = 0
        working_percent = 0
    message = ['Summary of DUTs in inventory:',
               '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
               '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
                   nbroken, broken_percent,
                   nidle, idle_percent,
                   nworking, working_percent,
                   ntotal),
               '',
               'Boards with failures: %d' % nbroken_boards,
               'Boards in inventory: %d' % ntotal_boards,
               '', '',
               'Full board inventory:\n',
               '%-22s %5s %5s %5s %5s %5s %5s' % (
                   'Board', 'Avail', 'Bad', 'Idle', 'Good',
                   'Spare', 'Total')]
    message.extend(
            ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
    return '\n'.join(message)

769

770

J. Richard Barnette

4845fcf

2015-04-20 14:26:25 -0700

[diff] [blame]

771

_POOL_INVENTORY_HEADER = '''\

Aviv Keshet

056d74c

2015-07-14 09:18:43 -0700

[diff] [blame]

772

Notice to Infrastructure deputies: All boards shown below are at

J. Richard Barnette

c9a143c

2015-06-04 11:11:19 -0700

[diff] [blame]

773

less than full strength, please take action to resolve the issues.

774

Once you're satisified that failures won't recur, failed DUTs can

775

be replaced with spares by running `balance_pool`. Detailed

776

instructions can be found here:

J. Richard Barnette

4845fcf

2015-04-20 14:26:25 -0700

[diff] [blame]

777

http://go/cros-manage-duts

'''

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

781

def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    For each pool in `CRITICAL_POOLS`, the message contains a table
    by board of the broken, idle, working and total DUT counts in
    that pool.  A board is listed only if it has at least one broken
    or idle DUT in the pool; rows are ordered by descending broken
    count.  A pool with no such boards gets a one-line note instead
    of a table.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating pool inventory')
    lines = [_POOL_INVENTORY_HEADER]
    column_titles = '%-20s %5s %5s %5s %5s' % (
            'Board', 'Bad', 'Idle', 'Good', 'Total')
    # Every pool section after the first is preceded by a blank line.
    separator = ''
    for pool in CRITICAL_POOLS:
        lines.append(
                '%sStatus for pool:%s, by board:' % (separator, pool))
        lines.append(column_titles)
        rows = []
        for board, counts in inventory.items():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), board, pool)
            broken = counts.get_broken(pool)
            idle = counts.get_idle(pool)
            # Boards at full strength are not reported.
            if broken != 0 or idle != 0:
                working = counts.get_working(pool)
                total = counts.get_total(pool)
                rows.append((board, broken, idle, working, total))
        if not rows:
            lines.append('(All boards at full strength)')
        else:
            rows.sort(key=lambda row: -row[1])
            for row in rows:
                lines.append('%-20s %5d %5d %5d %5d' % row)
        separator = '\n'
    return '\n'.join(lines)

825

826

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

827

_IDLE_INVENTORY_HEADER = '''\

828

Notice to Infrastructure deputies: The hosts shown below haven't

829

run any jobs for at least 24 hours. Please check each host; locked

830

hosts should normally be unlocked; stuck jobs should normally be

aborted.

'''

def _generate_idle_inventory_message(inventory):
    """Generate the "idle inventory" e-mail message.

    The idle inventory is a table of hostnames with their board and
    pool, one row for every DUT returned by `get_idle_list()` for
    each pool in `MANAGED_POOLS`.  If there are no such DUTs, a
    one-line note is emitted in place of the rows.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating idle inventory')
    lines = [
        _IDLE_INVENTORY_HEADER,
        'Idle Host List:',
        '%-30s %-20s %s' % ('Hostname', 'Board', 'Pool'),
    ]
    idle_rows = []
    for pool in MANAGED_POOLS:
        for board, counts in inventory.items():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), board, pool)
            for dut in counts.get_idle_list(pool):
                idle_rows.append((dut.host.hostname, board, pool))
    if not idle_rows:
        lines.append('(No idle DUTs)')
    else:
        for row in idle_rows:
            lines.append('%-30s %-20s %s' % row)
    return '\n'.join(lines)

865

866

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

867

def _send_email(arguments, tag, subject, recipients, body):
    """Send an inventory e-mail message.

    The message is logged in the selected log directory using `tag`
    for the file name, and then sent to the recipients.  A failure
    to write the log copy is logged and does not prevent the e-mail
    from being sent; a failure to send is logged as well.

    If the --debug option was requested, the message is neither
    logged nor sent, but merely printed on stdout.

    @param arguments   Parsed command-line options.
    @param tag         Tag identifying the inventory for logging
                       purposes.
    @param subject     E-mail Subject: header line.
    @param recipients  E-mail addresses for the To: header line.
    @param body        E-mail message body.

    """
    logging.debug('Generating email: "%s"', subject)
    all_recipients = ', '.join(recipients)
    report_body = '\n'.join([
            'To: %s' % all_recipients,
            'Subject: %s' % subject,
            '', body, ''])
    if arguments.debug:
        # Parenthesized form is valid as both a Python 2 print
        # statement and a Python 3 function call.
        print(report_body)
    else:
        filename = os.path.join(arguments.logdir, tag)
        try:
            # `with` guarantees the file is closed even if the write
            # fails part way through.
            with open(filename, 'w') as report_file:
                report_file.write(report_body)
        except EnvironmentError as e:
            logging.error('Failed to write %s: %s', filename, e)
        try:
            gmail_lib.send_email(all_recipients, subject, body)
        except Exception as e:
            # Best effort: a mail failure must not abort the run.
            logging.error('Failed to send e-mail to %s: %s',
                          all_recipients, e)

def _separate_email_addresses(address_list):
    """Flatten a list of comma-separated e-mail address strings.

    Each input string is split on commas, and surrounding whitespace
    is stripped from every resulting piece.

    @param address_list  A list of strings, each containing one or
                         more comma separated e-mail addresses.
    @return A single flat list of the individual e-mail addresses,
            in the order they were given.

    """
    return [piece.strip()
            for joined in address_list
            for piece in joined.split(',')]

def _verify_arguments(arguments):
    """Validate command-line arguments.

    The `--board-notify` and `--pool-notify` values, each a list of
    comma separated address strings, are replaced in place with flat
    lists of individual addresses.

    For non-debug uses, notification must be requested for at least
    one report.  For debug, if no notification is specified, treat
    it as "run all the reports."

    The return value indicates success or failure; in the case of
    failure, an error message is also written to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.

    """
    arguments.board_notify = _separate_email_addresses(
            arguments.board_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)
    if arguments.board_notify or arguments.pool_notify:
        return True
    if arguments.debug:
        # We want to run all the reports.  An empty notify list
        # will cause a report to be skipped, so make sure the
        # lists are non-empty.
        arguments.board_notify = ['']
        arguments.pool_notify = ['']
        return True
    sys.stderr.write('Must specify at least one of '
                     '--board-notify or --pool-notify\n')
    return False

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

956

957

958

def _get_logdir(script):
    """Get the default directory for the `--logdir` option.

    The default is the `_LOGDIR` subdirectory of the parent of the
    directory that contains this script.

    @param script  Path to this script file.
    @return A path to a directory.

    """
    script_dir = os.path.dirname(os.path.abspath(script))
    parent_dir = os.path.dirname(script_dir)
    return os.path.join(parent_dir, _LOGDIR)

971

972

973

def _parse_command(argv):
    """Parse the command line arguments.

    Build an argument parser for this command's syntax, parse the
    command line with it, and then validate the result with
    `_verify_arguments()`.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args(), or None
            if the parsed arguments fail validation.

    """
    cmd_parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Gather and report lab inventory statistics')
    duration_help = ('number of hours back to search for status'
                     ' (default: %d)' % _DEFAULT_DURATION)
    cmd_parser.add_argument('-d', '--duration', type=int,
                            default=_DEFAULT_DURATION, metavar='HOURS',
                            help=duration_help)
    board_notify_help = ('Generate board inventory message, '
                         'and send it to the given e-mail address(es)')
    cmd_parser.add_argument('--board-notify', action='append',
                            default=[], metavar='ADDRESS',
                            help=board_notify_help)
    pool_notify_help = ('Generate pool inventory message, '
                        'and send it to the given address(es)')
    cmd_parser.add_argument('--pool-notify', action='append',
                            default=[], metavar='ADDRESS',
                            help=pool_notify_help)
    recommend_help = ('Specify how many DUTs should be '
                      'recommended for repair (default: no '
                      'recommendation)')
    cmd_parser.add_argument('-r', '--recommend', type=int, default=None,
                            help=recommend_help)
    debug_help = ('Print e-mail messages on stdout '
                  'without sending them.')
    cmd_parser.add_argument('--debug', action='store_true',
                            help=debug_help)
    cmd_parser.add_argument('--logdir', default=_get_logdir(argv[0]),
                            help='Directory where logs will be written.')
    boards_help = ('names of boards to report on '
                   '(default: all boards)')
    cmd_parser.add_argument('boardnames', nargs='*',
                            metavar='BOARD',
                            help=boards_help)
    parsed = cmd_parser.parse_args(argv[1:])
    if _verify_arguments(parsed):
        return parsed
    return None

def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--debug` option was
    provided on the command line.  Without the option, we log all
    messages at DEBUG level or above, and write them to a file in
    the directory specified by the `--logdir` option (the directory
    is created if it doesn't exist).  With the option, we write log
    messages to stdout; messages below INFO level are discarded.

    The log file is configured to rotate once a week (`when='W4'`,
    i.e. keyed to Friday), keeping 13 backups, or ~3 months worth
    of history.

    Any handlers already installed on the root logger are removed,
    so that the handler configured here is the only one.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`

    """
    root_logger = logging.getLogger()
    if arguments.debug:
        root_logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter())
    else:
        if not os.path.exists(arguments.logdir):
            os.mkdir(arguments.logdir)
        root_logger.setLevel(logging.DEBUG)
        logfile = os.path.join(arguments.logdir, _LOGFILE)
        handler = logging.handlers.TimedRotatingFileHandler(
                logfile, when='W4', backupCount=13)
        formatter = logging.Formatter(_LOG_FORMAT,
                                      time_utils.TIME_FMT)
        handler.setFormatter(formatter)
    # TODO(jrbarnette) This is gross.  Importing client.bin.utils
    # implicitly imported logging_config, which calls
    # logging.basicConfig() *at module level*.  That gives us an
    # extra logging handler that we don't want.  So, clear out all
    # the handlers here.
    #
    # Iterate over a copy: removeHandler() mutates the underlying
    # list, and iterating the live list would skip every other
    # handler.
    for h in list(root_logger.handlers):
        root_logger.removeHandler(h)
    root_logger.addHandler(handler)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1059

1060

1061

def _populate_board_counts(inventory):
    """Gather board counts while providing interactive feedback.

    Gathering the status of all individual DUTs in the lab can take
    considerable time (~30 minutes at the time of this writing).

    Normally, we pay that cost by querying as we go.  However, with
    the `--debug` option, a human being may be watching the
    progress.  So, we force the first (expensive) queries to happen
    up front, and write a small ASCII progress bar to stdout: one
    character per board, with '+' marking every board whose index
    ends in 5 and a digit marking every tenth board.  A final line
    reports the total number of broken DUTs found.

    @param inventory  _LabInventory object with the inventory to
                      be gathered.

    """
    broken_so_far = 0
    for index, counts in enumerate(inventory.values(), 1):
        position = index % 10
        if position == 5:
            marker = '+'
        elif position == 0:
            # Last digit of the number of tens processed so far.
            marker = '%d' % ((index // 10) % 10)
        else:
            marker = '.'
        sys.stdout.write(marker)
        sys.stdout.flush()
        # This next call is where all the time goes - it forces all
        # of a board's HostJobHistory objects to query the database
        # and cache their results.
        broken_so_far += counts.get_broken()
    sys.stdout.write('\n')
    sys.stdout.write('Found %d broken DUTs\n' % broken_so_far)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

def main(argv):
    """Standard main routine.

    Parses the command line, configures logging, builds the lab
    inventory for the requested time window, and then generates and
    sends whichever of the board and pool reports were requested.
    Exits with status 1 if the command line is rejected.  A keyboard
    interrupt ends the run quietly; any other exception is logged.

    @param argv  Command line arguments including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)
    try:
        # The reporting window ends now and reaches back
        # `--duration` hours.
        end_time = int(time.time())
        start_time = end_time - arguments.duration * 60 * 60
        timestamp = time.strftime('%Y-%m-%d.%H',
                                  time.localtime(end_time))
        logging.debug('Starting lab inventory for %s', timestamp)
        if arguments.board_notify:
            if arguments.recommend:
                logging.debug('Will include repair recommendations')
            logging.debug('Will include board inventory')
        if arguments.pool_notify:
            logging.debug('Will include pool inventory')

        afe = frontend_wrappers.RetryingAFE(server=None)
        inventory = _LabInventory.create_inventory(
                afe, start_time, end_time, arguments.boardnames)
        logging.info('Found %d hosts across %d boards',
                     inventory.get_num_duts(),
                     inventory.get_num_boards())

        if arguments.debug:
            _populate_board_counts(inventory)

        if arguments.board_notify:
            recommend_message = ''
            if arguments.recommend:
                recommend_message = _generate_repair_recommendation(
                        inventory, arguments.recommend) + '\n\n\n'
            board_message = _generate_board_inventory_message(inventory)
            board_tag = 'boards-%s.txt' % timestamp
            board_subject = 'DUT board inventory %s' % timestamp
            _send_email(arguments, board_tag, board_subject,
                        arguments.board_notify,
                        recommend_message + board_message)

        if arguments.pool_notify:
            pool_message = _generate_pool_inventory_message(inventory)
            idle_message = _generate_idle_inventory_message(inventory)
            pool_tag = 'pools-%s.txt' % timestamp
            pool_subject = 'DUT pool inventory %s' % timestamp
            _send_email(arguments, pool_tag, pool_subject,
                        arguments.pool_notify,
                        pool_message + '\n\n\n' + idle_message)
    except KeyboardInterrupt:
        pass
    except EnvironmentError as e:
        logging.exception('Unexpected OS error: %s', e)
    except Exception as e:
        logging.exception('Unexpected exception: %s', e)

1155

1156

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

1157

def get_inventory(afe):
    """Create a lab inventory covering the most recent 24 hours.

    @param afe  AFE object passed through to
                `_LabInventory.create_inventory()`.
    @return The object returned by
            `_LabInventory.create_inventory()`.

    """
    now = int(time.time())
    one_day_ago = now - 24 * 60 * 60
    return _LabInventory.create_inventory(afe, one_day_ago, now)

1161

1162

1163

def get_managed_boards(afe):
    """Return the managed boards of a freshly created inventory.

    @param afe  AFE object passed through to `get_inventory()`.
    @return Whatever `get_managed_boards()` returns for the
            inventory created by `get_inventory(afe)`.

    """
    inventory = get_inventory(afe)
    return inventory.get_managed_boards()

J. Richard Barnette

aa86893

2015-10-23 13:28:59 -0700

[diff] [blame]

1165

1166

J. Richard Barnette