Blame - site_utils/lab_inventory.py - platform/external/autotest

2015-03-27 17:23:52 -0700

[diff] [blame]

1

#!/usr/bin/env python

2

3

# Use of this source code is governed by a BSD-style license that can be

4

# found in the LICENSE file.

5

6

"""Create e-mail reports of the Lab's DUT inventory.

7

8

Gathers a list of all DUTs of interest in the Lab, segregated by

9

board and pool, and determines whether each DUT is working or

10

broken. Then, send one or more e-mail reports summarizing the

11

status to e-mail addresses provided on the command line.

12

13

usage: lab_inventory.py [ options ] [ board ... ]

14

15

Options:

16

--duration / -d <hours>

17

How far back in time to search job history to determine DUT

18

status.

19

20

--board-notify <address>[,<address>]

21

Send the "board status" e-mail to all the specified e-mail

22

addresses.

23

24

--pool-notify <address>[,<address>]

25

Send the "pool status" e-mail to all the specified e-mail

26

addresses.

27

J. Richard Barnette

2015-06-09 10:06:17 -0700

[diff] [blame]

28

--recommend <number>

29

When generating the "board status" e-mail, include a list of

30

<number> specific DUTs to be recommended for repair.

31

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

32

--logdir <directory>

33

Log progress and actions in a file under this directory. Text

34

of any e-mail sent will also be logged in a timestamped file in

35

this directory.

36

J. Richard Barnette

2015-10-13 16:02:47 -0700

[diff] [blame]

37

--debug

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

38

Suppress all logging and sending e-mail. Instead, write the

39

output that would be generated onto stdout.

40

41

<board> arguments:

42

With no arguments, gathers the status for all boards in the lab.

43

With one or more named boards on the command line, restricts

44

reporting to just those boards.

"""

import argparse

import logging

import logging.handlers

52

import os

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

53

import re

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

import sys

import time

import common

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

58

from autotest_lib.client.bin import utils

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

59

from autotest_lib.client.common_lib import time_utils

J. Richard Barnette

a7c514e

2015-09-15 11:13:23 -0700

[diff] [blame]

60

from autotest_lib.server.cros.dynamic_suite import frontend_wrappers

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

61

from autotest_lib.server.hosts import servo_host

Aviv Keshet

7ee9586

2016-08-30 15:18:27 -0700

[diff] [blame]

62

from autotest_lib.server.lib import status_history

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

63

from autotest_lib.site_utils import gmail_lib

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

64

from autotest_lib.site_utils.suite_scheduler import constants

65

66

Richard Barnette

673573b

2016-12-12 09:46:39 -0800

[diff] [blame]

67

# Pool definitions are taken from the suite scheduler's `constants`
# module and aliased here as module-level names.
CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS

68

SPARE_POOL = constants.Pools.SPARE_POOL

69

MANAGED_POOLS = constants.Pools.MANAGED_POOLS

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

70

J. Richard Barnette

2016-03-17 17:03:57 -0700

[diff] [blame]

71

# _EXCLUDED_LABELS - A set of labels that disqualify a DUT from

Richard Barnette

eabcf39

2017-09-01 15:10:54 -0700

[diff] [blame]

72

# monitoring by this script. Currently, we're excluding these:

73

# + 'adb' - We're not ready to monitor Android or Brillo hosts.

74

# + 'board:guado_moblab' - These are maintained by a separate

75

# process that doesn't use this script.

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

76

Richard Barnette

eabcf39

2017-09-01 15:10:54 -0700

[diff] [blame]

77

_EXCLUDED_LABELS = {'adb', 'board:guado_moblab'}

J. Richard Barnette

2016-03-17 17:03:57 -0700

[diff] [blame]

78

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

79

# _DEFAULT_DURATION:

80

# Default value used for the --duration command line option.

81

# Specifies how far back in time to search in order to determine

82

# DUT status.

83

84

_DEFAULT_DURATION = 24

85

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

86

# _LOGDIR:

87

# Relative path used in the calculation of the default setting

88

# for the --logdir option. The full path is relative to

89

# the root of the autotest directory, as determined from

90

# sys.argv[0].

91

# _LOGFILE:

92

# Basename of a file to which general log information will be

93

# written.

94

# _LOG_FORMAT:

95

# Format string for log messages.

96

97

_LOGDIR = os.path.join('logs', 'dut-data')

98

_LOGFILE = 'lab-inventory.log'

99

_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

100

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

101

# Pattern describing location-based host names in the Chrome OS test

102

# labs. Each DUT hostname designates the DUT's location:

103

# * A lab (room) that's physically separated from other labs

104

# (i.e. there's a door).

105

# * A row (or aisle) of DUTs within the lab.

106

# * A vertical rack of shelves on the row.

107

# * A specific host on one shelf of the rack.

108

109

_HOSTNAME_PATTERN = re.compile(

110

r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')

111

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

112

# Default entry for managed pools.

113

114

_MANAGED_POOL_DEFAULT = 'all_pools'

115

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

116

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame^]

117

class _CachedHostJobHistories(object):

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

118

"""Maintains a set of `HostJobHistory` objects for a pool.

119

120

The collected history objects are nominally all part of a single

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

121

scheduling pool of DUTs. The collection maintains a list of

122

working DUTs, a list of broken DUTs, and a list of all DUTs.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

123

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

124

Performance note: Certain methods in this class are potentially

125

expensive:

126

* `get_working()`

127

* `get_working_list()`

128

* `get_broken()`

129

* `get_broken_list()`

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

130

* `get_idle()`

131

* `get_idle_list()`

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

132

The first time any one of these methods is called, it causes

133

multiple RPC calls with a relatively expensive set of database

134

queries. However, the results of the queries are cached in the

135

individual `HostJobHistory` objects, so only the first call

136

actually pays the full cost.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

137

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

138

Additionally, `get_working_list()`, `get_broken_list()` and

139

`get_idle_list()` cache their return values to avoid recalculating

140

lists at every call; this caching is separate from the caching of RPC

141

results described above.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

142

143

This class is deliberately constructed to delay the RPC cost

144

until the accessor methods are called (rather than to query in

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

145

`record_host()`) so that it's possible to construct a complete

146

`_LabInventory` without making the expensive queries at creation

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

147

time. `_populate_board_counts()`, below, assumes this behavior.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

"""

def __init__(self):

self._histories = []

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

153

self._working_list = None

154

self._broken_list = None

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

155

self._idle_list = None

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

156

157

158

def record_host(self, host_history):

159

"""Add one `HostJobHistory` object to the collection.

160

161

@param host_history The `HostJobHistory` object to be

162

remembered.

163

164

"""

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

165

self._working_list = None

166

self._broken_list = None

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

167

self._idle_list = None

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

168

self._histories.append(host_history)

169

170

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

171

def get_working_list(self):

172

"""Return a list of all working DUTs in the pool.

173

174

Filter `self._histories` for histories where the last

175

diagnosis is `WORKING`.

176

177

Cache the result so that we only cacluate it once.

178

179

@return A list of HostJobHistory objects.

180

181

"""

182

if self._working_list is None:

183

self._working_list = [h for h in self._histories

184

if h.last_diagnosis()[0] == status_history.WORKING]

185

return self._working_list

186

187

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

188

def get_working(self):

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

189

"""Return the number of working DUTs in the pool."""

190

return len(self.get_working_list())

191

192

193

def get_broken_list(self):

194

"""Return a list of all broken DUTs in the pool.

195

196

Filter `self._histories` for histories where the last

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

197

diagnosis is `BROKEN`.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

198

199

Cache the result so that we only cacluate it once.

200

201

@return A list of HostJobHistory objects.

202

203

"""

204

if self._broken_list is None:

205

self._broken_list = [h for h in self._histories

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

206

if h.last_diagnosis()[0] == status_history.BROKEN]

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

207

return self._broken_list

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

208

209

210

def get_broken(self):

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

211

"""Return the number of broken DUTs in the pool."""

212

return len(self.get_broken_list())

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

213

214

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

215

def get_idle_list(self):

216

"""Return a list of all idle DUTs in the pool.

217

218

Filter `self._histories` for histories where the last

219

diagnosis is `UNUSED` or `UNKNOWN`.

220

221

Cache the result so that we only cacluate it once.

222

223

@return A list of HostJobHistory objects.

224

225

"""

226

idle_list = [status_history.UNUSED, status_history.UNKNOWN]

227

if self._idle_list is None:

228

self._idle_list = [h for h in self._histories

229

if h.last_diagnosis()[0] in idle_list]

230

return self._idle_list

def get_idle(self):

"""Return the number of idle DUTs in the pool."""

235

return len(self.get_idle_list())

236

237

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

238

def get_total(self):

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

239

"""Return the total number of DUTs in the pool."""

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

240

return len(self._histories)

241

242

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame^]

243

class _ManagedPoolsHostJobHistories(object):

244

"""Maintains a set of `HostJobHistory`s per managed pool.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

245

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame^]

246

The collection maintains a count of working DUTs, a count of broken DUTs,

247

and a total count. The counts can be obtained either for a single pool, or

248

as a total across all pools.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

249

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame^]

250

DUTs in the collection must be assigned to one of the pools in

251

`_MANAGED_POOLS`.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

252

253

The `get_working()` and `get_broken()` methods rely on the

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame^]

254

methods of the same name in _CachedHostJobHistories, so the performance

255

note in _CachedHostJobHistories applies here as well.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

"""

def __init__(self):

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame^]

260

self._histories_by_pool = {

261

pool: _CachedHostJobHistories() for pool in MANAGED_POOLS

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

262

}

263

264

def record_host(self, host_history):

265

"""Add one `HostJobHistory` object to the collection.

266

267

@param host_history The `HostJobHistory` object to be

268

remembered.

269

270

"""

J. Richard Barnette

3d0590a

2015-04-29 12:56:12 -0700

[diff] [blame]

271

pool = host_history.host_pool

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame^]

272

self._histories_by_pool[pool].record_host(host_history)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

273

274

275

def _count_pool(self, get_pool_count, pool=None):

276

"""Internal helper to count hosts in a given pool.

277

278

The `get_pool_count` parameter is a function to calculate

279

the exact count of interest for the pool.

280

281

@param get_pool_count Function to return a count from a

282

_PoolCount object.

283

@param pool The pool to be counted. If `None`,

284

return the total across all pools.

285

286

"""

287

if pool is None:

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame^]

288

return sum([get_pool_count(cached_history) for cached_history in

289

self._histories_by_pool.values()])

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

290

else:

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame^]

291

return get_pool_count(self._histories_by_pool[pool])

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

292

293

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

294

def get_working_list(self):

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame^]

295

"""Return a list of all working DUTs (across all pools).

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

296

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame^]

297

Go through all HostJobHistory objects across all pools, selecting the

298

ones where the last diagnosis is `WORKING`.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

299

300

@return A list of HostJobHistory objects.

301

302

"""

303

l = []

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame^]

304

for p in self._histories_by_pool.values():

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

305

l.extend(p.get_working_list())

return l

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

309

def get_working(self, pool=None):

310

"""Return the number of working DUTs in a pool.

311

312

@param pool The pool to be counted. If `None`, return the

313

total across all pools.

314

J. Richard Barnette

2015-10-20 17:58:30 -0700

[diff] [blame]

315

@return The total number of working DUTs in the selected

316

pool(s).

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

317

"""

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame^]

318

return self._count_pool(_CachedHostJobHistories.get_working, pool)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

319

320

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

321

def get_broken_list(self):

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame^]

322

"""Return a list of all broken DUTs (across all pools).

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

323

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame^]

324

Go through all HostJobHistory objects in the across all pools,

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

325

selecting the ones where the last diagnosis is `BROKEN`.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

326

327

@return A list of HostJobHistory objects.

328

329

"""

330

l = []

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame^]

331

for p in self._histories_by_pool.values():

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

332

l.extend(p.get_broken_list())

return l

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

336

def get_broken(self, pool=None):

337

"""Return the number of broken DUTs in a pool.

338

339

@param pool The pool to be counted. If `None`, return the

340

total across all pools.

341

J. Richard Barnette

2015-10-20 17:58:30 -0700

[diff] [blame]

342

@return The total number of broken DUTs in the selected pool(s).

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

343

"""

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame^]

344

return self._count_pool(_CachedHostJobHistories.get_broken, pool)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

345

346

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

347

def get_idle_list(self, pool=None):

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame^]

348

"""Return a list of all idle DUTs in the given pool.

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

349

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame^]

350

Go through all HostJobHistory objects in the given pool, selecting the

351

ones where the last diagnosis is `UNUSED` or `UNKNOWN`.

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

352

353

@param pool: The pool to be counted. If `None`, return the total list

354

across all pools.

355

356

@return A list of HostJobHistory objects.

"""

if pool is None:

l = []

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame^]

361

for p in self._histories_by_pool.values():

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

362

l.extend(p.get_idle_list())

363

return l

364

else:

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame^]

365

return _CachedHostJobHistories.get_idle_list(

366

self._histories_by_pool[pool])

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

367

368

369

def get_idle(self, pool=None):

370

"""Return the number of idle DUTs in a pool.

371

372

@param pool: The pool to be counted. If `None`, return the total

373

across all pools.

374

375

@return The total number of idle DUTs in the selected pool(s).

376

"""

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame^]

377

return self._count_pool(_CachedHostJobHistories.get_idle, pool)

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

378

379

J. Richard Barnette

2015-10-20 17:58:30 -0700

[diff] [blame]

380

def get_spares_buffer(self):

381

"""Return the the nominal number of working spares.

382

383

Calculates and returns how many working spares there would

384

be in the spares pool if all broken DUTs were in the spares

385

pool. This number may be negative, indicating a shortfall

386

in the critical pools.

387

388

@return The total number DUTs in the spares pool, less the total

389

number of broken DUTs in all pools.

390

"""

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

391

return self.get_total(SPARE_POOL) - self.get_broken()

J. Richard Barnette

2015-10-20 17:58:30 -0700

[diff] [blame]

392

393

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

394

def get_total(self, pool=None):

395

"""Return the total number of DUTs in a pool.

396

397

@param pool The pool to be counted. If `None`, return the

398

total across all pools.

399

J. Richard Barnette

2015-10-20 17:58:30 -0700

[diff] [blame]

400

@return The total number of DUTs in the selected pool(s).

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

401

"""

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame^]

402

return self._count_pool(_CachedHostJobHistories.get_total, pool)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

403

404

405

class _LabInventory(dict):
    """Collection of `HostJobHistory` objects for the Lab's inventory.

    The collection is indexed by board.  Indexing returns the
    _ManagedPoolsHostJobHistories object associated with the board.

    The collection is also iterable.  The iterator returns all the
    boards in the inventory, in unspecified order.

    """

    @staticmethod
    def _eligible_host(afehost):
        """Return whether this host is eligible for monitoring.

        A host is eligible if it's in exactly one pool and it has no
        labels from the `_EXCLUDED_LABELS` set.

        @param afehost  The host to be tested for eligibility.

        @return True if the host should be monitored.

        """
        pools = [l for l in afehost.labels
                 if l.startswith(constants.Labels.POOL_PREFIX)]
        excluded = _EXCLUDED_LABELS.intersection(afehost.labels)
        return len(pools) == 1 and not excluded


    @classmethod
    def create_inventory(cls, afe, start_time, end_time, boardlist=()):
        """Return a Lab inventory with specified parameters.

        By default, gathers inventory from `HostJobHistory` objects
        for all DUTs in the `MANAGED_POOLS` list.  If `boardlist`
        is supplied, the inventory will be restricted to only the
        given boards.

        @param afe         AFE object for constructing the
                           `HostJobHistory` objects.
        @param start_time  Start time for the `HostJobHistory`
                           objects.
        @param end_time    End time for the `HostJobHistory`
                           objects.
        @param boardlist   List of boards to include.  If empty,
                           include all available boards.

        @return A `_LabInventory` object for the specified boards.

        """
        label_list = [constants.Labels.POOL_PREFIX + l
                      for l in MANAGED_POOLS]
        afehosts = afe.get_hosts(labels__name__in=label_list)
        if boardlist:
            # We're deliberately not checking host eligibility in this
            # code path.  This is a debug path, not used in production;
            # it may be useful to include ineligible hosts here.
            #
            # Each host is tested once against the whole set of wanted
            # labels, so a board named twice on the command line can't
            # cause the same host to be recorded (and counted) twice.
            wanted_labels = {constants.Labels.BOARD_PREFIX + board
                             for board in boardlist}
            afehosts = [h for h in afehosts
                        if wanted_labels.intersection(h.labels)]
        else:
            afehosts = [h for h in afehosts if cls._eligible_host(h)]
        return cls([status_history.HostJobHistory(afe, host,
                                                  start_time, end_time)
                    for host in afehosts])


    def __init__(self, histories):
        """Build the inventory from a list of histories.

        @param histories  Iterable of `HostJobHistory` objects.  Those
                          whose `host_board` is `None` are ignored.

        """
        # N.B. The query that finds our hosts is restricted to those
        # with a valid pool: label, but doesn't check for a valid
        # board: label.  In some (insufficiently) rare cases, the
        # AFE hosts table has been known to (incorrectly) have DUTs
        # with a pool: but no board: label.  We explicitly exclude
        # those here.
        histories = [h for h in histories
                     if h.host_board is not None]
        boards = {h.host_board for h in histories}
        initval = {board: _ManagedPoolsHostJobHistories()
                   for board in boards}
        super(_LabInventory, self).__init__(initval)
        self._dut_count = len(histories)
        # Cache for get_managed_boards(), keyed by pool name.
        self._managed_boards = {}
        for h in histories:
            self[h.host_board].record_host(h)


    def get_managed_boards(self, pool=_MANAGED_POOL_DEFAULT):
        """Return the set of "managed" boards.

        Operationally, saying a board is "managed" means that the
        board will be included in the "board" and "repair
        recommendations" reports.  That is, if there are failures in
        the board's inventory then lab techs will be asked to fix
        them without a separate ticket.

        For purposes of implementation, a board is "managed" if it
        has DUTs in both the spare and a non-spare (i.e. critical)
        pool.

        The result for each `pool` value is computed once and cached.

        @param pool: The specified pool for managed boards.

        @return A set of all the boards that have both spare and
                non-spare pools, unless the pool is specified,
                then the set of boards in that pool.

        """
        if pool not in self._managed_boards:
            # Build the set locally and publish it only once it is
            # complete; if a count lookup raises, nothing partial is
            # left behind in the cache for later calls to return.
            managed = set()
            for board, counts in self.items():
                # Get the counts for all pools, otherwise get it for
                # the specified pool.
                if pool == _MANAGED_POOL_DEFAULT:
                    spares = counts.get_total(SPARE_POOL)
                    if spares != 0 and spares != counts.get_total():
                        managed.add(board)
                elif counts.get_total(pool) != 0:
                    managed.add(board)
            self._managed_boards[pool] = managed
        return self._managed_boards[pool]


    def get_num_duts(self):
        """Return the total number of DUTs in the inventory."""
        return self._dut_count


    def get_num_boards(self):
        """Return the total number of boards in the inventory."""
        return len(self)

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

535

def _sort_by_location(inventory_list):
    """Return a list of DUTs, organized by location.

    Take the given list of `HostJobHistory` objects, separate it
    into a list per lab, and sort each lab's list by location.  The
    order of sorting within a lab is
      * By row number within the lab,
      * then by rack number within the row,
      * then by host shelf number within the rack.

    Return a list of the sorted lists.  The per-lab lists appear in
    whatever order the underlying dictionary yields them.

    DUTs whose hostname doesn't match `_HOSTNAME_PATTERN` carry no
    location information and are left out of the result.

    Implementation note: host locations are compared as
    `(row, rack, host)` tuples of integers, so the ordering is
    correct no matter how large the individual numbers are.

    @param inventory_list  List of `HostJobHistory` objects to be
                           organized.

    @return A list of sorted lists of DUTs.

    """
    lab_lists = {}
    for history in inventory_list:
        location = _HOSTNAME_PATTERN.match(history.host.hostname)
        if not location:
            continue
        lab = location.group(1)
        key = tuple(int(idx) for idx in location.group(2, 3, 4))
        lab_lists.setdefault(lab, []).append((key, history))
    return_list = []
    for dut_list in lab_lists.values():
        # Sort on the location key only; the history objects
        # themselves are never compared.
        dut_list.sort(key=lambda t: t[0])
        return_list.append([t[1] for t in dut_list])
    return return_list

def _score_repair_set(buffer_counts, repair_list):
    """Return a numeric score rating a set of DUTs to be repaired.

    `buffer_counts` is a dictionary mapping board names to the
    size of the board's spares buffer.

    `repair_list` is a list of DUTs to be repaired.

    This function calculates the new set of buffer counts that would
    result from the proposed repairs, and scores the new set using
    two numbers:
      * Worst case buffer count for any board (higher is better).
        This is the more significant number for comparison.
      * Number of boards at the worst case (lower is better).  This
        is the less significant number.

    Implementation note:  The score could fail to reflect the
    intended criteria if there are more than 1000 boards in the
    inventory.

    @param buffer_counts  A dictionary mapping boards to buffer
                          counts.  Must not be empty.
    @param repair_list    A list of `HostJobHistory` objects for the
                          DUTs to be repaired.

    @return A numeric score.

    """
    _NBOARDS = 1000
    repair_inventory = _LabInventory(repair_list)
    # Record the buffer count each board would have after the
    # repairs.  Board names are discarded, as they don't contribute
    # to the final score.
    new_counts = []
    for board, count in buffer_counts.items():
        repaired = (repair_inventory[board].get_total()
                    if board in repair_inventory else 0)
        new_counts.append(count + repaired)
    # Find the worst available spares count, and how many boards
    # share that worst case.
    worst_count = min(new_counts)
    num_worst = new_counts.count(worst_count)
    return _NBOARDS * worst_count - num_worst

622

623

624

def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Returns a message recommending a list of broken DUTs to be
    repaired.  The list of DUTs is selected based on these
    criteria:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical
        proximity.
      * DUTs for boards with a low spares buffer are more important
        than DUTs with larger buffers.

    The algorithm used will guarantee that at least one DUT from a
    board with the smallest spares buffer will be recommended.  If
    the worst spares buffer number is shared by more than one board,
    the algorithm will tend to prefer repair sets that include more
    of those boards over sets that cover fewer boards.

    If no broken DUTs are found, the message consists of the usual
    header followed by a note that there is nothing to repair.

    @param inventory      Inventory for generating recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation message.

    """
    logging.debug('Creating DUT repair recommendations')
    board_buffer_counts = {}
    broken_list = []
    for board in inventory.get_managed_boards():
        logging.debug('Listing failed DUTs for %s', board)
        counts = inventory[board]
        if counts.get_broken() != 0:
            board_buffer_counts[board] = counts.get_spares_buffer()
            broken_list.extend(counts.get_broken_list())
    # N.B. The logic inside this loop may seem complicated, but
    # simplification is hard:
    #   * Calculating an initial recommendation outside of
    #     the loop likely would make things more complicated,
    #     not less.
    #   * It's necessary to calculate an initial lab slice once per
    #     lab _before_ the while loop, in case the number of broken
    #     DUTs in a lab is less than `num_recommend`.
    recommendation = None
    best_score = None
    # With nothing broken there is nothing to group or score, so
    # don't bother sorting by location at all.
    lab_groups = _sort_by_location(broken_list) if broken_list else []
    for lab_duts in lab_groups:
        start = 0
        end = num_recommend
        lab_slice = lab_duts[start : end]
        lab_score = _score_repair_set(board_buffer_counts,
                                      lab_slice)
        # Slide a window of `num_recommend` DUTs across the lab's
        # list, keeping the best scoring window.
        while end < len(lab_duts):
            start += 1
            end += 1
            new_slice = lab_duts[start : end]
            new_score = _score_repair_set(board_buffer_counts,
                                          new_slice)
            if new_score > lab_score:
                lab_slice = new_slice
                lab_score = new_score
        if recommendation is None or lab_score > best_score:
            recommendation = lab_slice
            best_score = lab_score
    # N.B. The trailing space here is mandatory:  Without it, Gmail
    # will parse the URL wrong.  Don't ask.  If you simply _must_
    # know more, go try it yourself...
    line_fmt = '%-30s %-16s %-6s\n %s '
    message = ['Repair recommendations:\n',
               line_fmt % ( 'Hostname', 'Board', 'Servo?', 'Logs URL')]
    if not recommendation:
        # No broken DUTs anywhere; iterating over `None` here would
        # raise TypeError, so say so explicitly instead.
        message.append('(No DUTs to repair)')
        return '\n'.join(message)
    for h in recommendation:
        servo_name = servo_host.make_servo_hostname(h.host.hostname)
        servo_present = utils.host_is_in_lab_zone(servo_name)
        _, event = h.last_diagnosis()
        line = line_fmt % (
                h.host.hostname, h.host_board,
                'Yes' if servo_present else 'No', event.job_url)
        message.append(line)
    return '\n'.join(message)

699

700

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

701

def _generate_board_inventory_message(inventory):
    """Generate the "board inventory" e-mail message.

    The board inventory is a list by board summarizing the number
    of working, idle and broken DUTs, along with the spares buffer,
    spare pool size and total DUT count for the board.

    The overall totals cover every board returned by
    `inventory.get_managed_boards()`; the per-board table lists only
    boards that have at least one broken DUT.

    An inventory with no DUTs at all produces a report with all
    counts and percentages at zero.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating board inventory')
    nworking = 0
    nbroken = 0
    nidle = 0
    nbroken_boards = 0
    ntotal_boards = 0
    summaries = []
    for board in inventory.get_managed_boards():
        counts = inventory[board]
        logging.debug('Counting %2d DUTS for board %s',
                      counts.get_total(), board)
        # Summary elements laid out in the same order as the text
        # headers:
        #     Board Avail   Bad  Idle  Good Spare Total
        #      e[0]  e[1]  e[2]  e[3]  e[4]  e[5]  e[6]
        element = (board,
                   counts.get_spares_buffer(),
                   counts.get_broken(),
                   counts.get_idle(),
                   counts.get_working(),
                   counts.get_total(SPARE_POOL),
                   counts.get_total())
        if element[2]:
            summaries.append(element)
            nbroken_boards += 1
        ntotal_boards += 1
        nbroken += element[2]
        nidle += element[3]
        nworking += element[4]
    ntotal = nworking + nbroken + nidle
    # Smallest spares buffer first; among equals, most broken first.
    summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
    if ntotal:
        broken_percent = int(round(100.0 * nbroken / ntotal))
        idle_percent = int(round(100.0 * nidle / ntotal))
        # Derived by subtraction so the three figures sum to 100.
        working_percent = 100 - broken_percent - idle_percent
    else:
        # Empty inventory: avoid dividing by zero.
        broken_percent = 0
        idle_percent = 0
        working_percent = 0
    message = ['Summary of DUTs in inventory:',
               '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
               '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
                   nbroken, broken_percent,
                   nidle, idle_percent,
                   nworking, working_percent,
                   ntotal),
               '',
               'Boards with failures: %d' % nbroken_boards,
               'Boards in inventory: %d' % ntotal_boards,
               '', '',
               'Full board inventory:\n',
               '%-22s %5s %5s %5s %5s %5s %5s' % (
                   'Board', 'Avail', 'Bad', 'Idle', 'Good',
                   'Spare', 'Total')]
    message.extend(
            ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
    return '\n'.join(message)

772

773

J. Richard Barnette

4845fcf

2015-04-20 14:26:25 -0700

[diff] [blame]

774

_POOL_INVENTORY_HEADER = '''\

Aviv Keshet

056d74c

2015-07-14 09:18:43 -0700

[diff] [blame]

775

Notice to Infrastructure deputies: All boards shown below are at

J. Richard Barnette

c9a143c

2015-06-04 11:11:19 -0700

[diff] [blame]

776

less than full strength, please take action to resolve the issues.

777

Once you're satisified that failures won't recur, failed DUTs can

778

be replaced with spares by running `balance_pool`. Detailed

779

instructions can be found here:

J. Richard Barnette

4845fcf

2015-04-20 14:26:25 -0700

[diff] [blame]

780

http://go/cros-manage-duts

'''

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

784

def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    The pool inventory has one section per pool in `CRITICAL_POOLS`.
    Each section lists, by board, the number of broken, idle and
    working DUTs in that pool, plus the pool total.  Only boards
    with at least one broken or idle DUT in the pool are listed,
    ordered from most to fewest broken DUTs.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating pool inventory')
    report_lines = [_POOL_INVENTORY_HEADER]
    for index, pool in enumerate(CRITICAL_POOLS):
        # Every section after the first is set off by a blank line.
        separator = '\n' if index else ''
        report_lines.append(
                '%sStatus for pool:%s, by board:' % (separator, pool))
        report_lines.append(
                '%-20s %5s %5s %5s %5s' % (
                    'Board', 'Bad', 'Idle', 'Good', 'Total'))
        rows = []
        for board, counts in inventory.items():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), board, pool)
            broken = counts.get_broken(pool)
            idle = counts.get_idle(pool)
            # Boards at full strength are not reported.
            if broken != 0 or idle != 0:
                working = counts.get_working(pool)
                total = counts.get_total(pool)
                rows.append((board, broken, idle, working, total))
        if not rows:
            report_lines.append('(All boards at full strength)')
            continue
        # Stable sort, largest broken count first.
        rows.sort(key=lambda row: -row[1])
        report_lines.extend(
                ['%-20s %5d %5d %5d %5d' % row for row in rows])
    return '\n'.join(report_lines)

828

829

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

830

_IDLE_INVENTORY_HEADER = '''\

831

Notice to Infrastructure deputies: The hosts shown below haven't

832

run any jobs for at least 24 hours. Please check each host; locked

833

hosts should normally be unlocked; stuck jobs should normally be

aborted.

'''

def _generate_idle_inventory_message(inventory):
    """Generate the "idle inventory" e-mail message.

    The idle inventory is a list of idle hosts, each shown with its
    board and pool.  Hosts are gathered pool by pool over
    `MANAGED_POOLS`, using each board's `get_idle_list()`.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating idle inventory')
    row_fmt = '%-30s %-20s %s'
    report_lines = [_IDLE_INVENTORY_HEADER,
                    'Idle Host List:',
                    row_fmt % ('Hostname', 'Board', 'Pool')]
    idle_rows = []
    for pool in MANAGED_POOLS:
        for board, counts in inventory.items():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), board, pool)
            for dut in counts.get_idle_list(pool):
                idle_rows.append((dut.host.hostname, board, pool))
    if not idle_rows:
        report_lines.append('(No idle DUTs)')
    else:
        report_lines.extend([row_fmt % row for row in idle_rows])
    return '\n'.join(report_lines)

868

869

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

870

def _send_email(arguments, tag, subject, recipients, body):
    """Send an inventory e-mail message.

    The message is logged in the selected log directory using `tag`
    for the file name, and then sent to the recipients.  A failure
    to write the log file or to send the e-mail is logged as an
    error, but isn't raised to the caller.

    If the `--debug` option was requested, the message is neither
    logged nor sent, but merely printed on stdout.

    @param arguments   Parsed command-line options.
    @param tag         Tag identifying the inventory for logging
                       purposes.
    @param subject     E-mail Subject: header line.
    @param recipients  E-mail addresses for the To: header line.
    @param body        E-mail message body.

    """
    logging.debug('Generating email: "%s"', subject)
    all_recipients = ', '.join(recipients)
    report_body = '\n'.join([
            'To: %s' % all_recipients,
            'Subject: %s' % subject,
            '', body, ''])
    if arguments.debug:
        # A single parenthesized argument prints the same text under
        # both the Python 2 print statement and the Python 3 print
        # function.
        print(report_body)
    else:
        filename = os.path.join(arguments.logdir, tag)
        try:
            # `with` guarantees the file is closed even if the write
            # fails part way through.
            with open(filename, 'w') as report_file:
                report_file.write(report_body)
        except EnvironmentError as e:
            logging.error('Failed to write %s: %s', filename, e)
        try:
            gmail_lib.send_email(all_recipients, subject, body)
        except Exception as e:
            # Best effort: a failed send must not abort the run.
            logging.error('Failed to send e-mail to %s: %s',
                          all_recipients, e)

def _separate_email_addresses(address_list):
    """Parse a list of comma-separated lists of e-mail addresses.

    Surrounding whitespace is stripped from every address.  Empty
    entries (as produced by a trailing or doubled comma, or by an
    empty string) are dropped rather than returned as addresses.

    @param address_list  A list of strings containing comma
                         separated e-mail addresses.
    @return A list of the individual e-mail addresses.

    """
    newlist = []
    for arg in address_list:
        stripped = [email.strip() for email in arg.split(',')]
        newlist.extend([email for email in stripped if email])
    return newlist

def _verify_arguments(arguments):
    """Validate command-line arguments.

    The `--board-notify` and `--pool-notify` option values, which
    may each be given several times and may each hold comma
    separated addresses, are flattened in place into single lists.

    For non-debug uses, notification must be requested for at least
    one report.  For debug, if notification isn't specified at all,
    that's treated as "run all the reports."

    The return value indicates success or failure; in the case of
    failure, an error message is also written to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.

    """
    arguments.board_notify = _separate_email_addresses(
            arguments.board_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)
    if arguments.board_notify or arguments.pool_notify:
        return True
    if arguments.debug:
        # We want to run all the reports.  An empty notify list
        # will cause a report to be skipped, so make sure the
        # lists are non-empty.
        arguments.board_notify = ['']
        arguments.pool_notify = ['']
        return True
    sys.stderr.write('Must specify at least one of '
                     '--board-notify or --pool-notify\n')
    return False

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

959

960

961

def _get_logdir(script):
    """Get the default directory for the `--logdir` option.

    The result is the `_LOGDIR` path joined onto the parent of the
    directory that contains `script`.

    @param script  Path to this script file.
    @return A path to a directory.

    """
    script_dir = os.path.dirname(os.path.abspath(script))
    parent_dir = os.path.dirname(script_dir)
    return os.path.join(parent_dir, _LOGDIR)

974

975

976

def _parse_command(argv):
    """Parse the command line arguments.

    Builds the argument parser for this command, parses `argv`, and
    then validates the result with `_verify_arguments()`.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args(), or
            `None` if the arguments failed validation.

    """
    duration_help = ('number of hours back to search for status'
                     ' (default: %d)' % _DEFAULT_DURATION)
    board_notify_help = ('Generate board inventory message, '
                         'and send it to the given e-mail address(es)')
    pool_notify_help = ('Generate pool inventory message, '
                        'and send it to the given address(es)')
    recommend_help = ('Specify how many DUTs should be '
                      'recommended for repair (default: no '
                      'recommendation)')
    debug_help = ('Print e-mail messages on stdout '
                  'without sending them.')
    boards_help = ('names of boards to report on '
                   '(default: all boards)')

    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Gather and report lab inventory statistics')
    parser.add_argument('-d', '--duration', type=int,
                        default=_DEFAULT_DURATION, metavar='HOURS',
                        help=duration_help)
    parser.add_argument('--board-notify', action='append',
                        default=[], metavar='ADDRESS',
                        help=board_notify_help)
    parser.add_argument('--pool-notify', action='append',
                        default=[], metavar='ADDRESS',
                        help=pool_notify_help)
    parser.add_argument('-r', '--recommend', type=int, default=None,
                        help=recommend_help)
    parser.add_argument('--debug', action='store_true',
                        help=debug_help)
    parser.add_argument('--logdir', default=_get_logdir(argv[0]),
                        help='Directory where logs will be written.')
    parser.add_argument('boardnames', nargs='*',
                        metavar='BOARD',
                        help=boards_help)
    parsed = parser.parse_args(argv[1:])
    return parsed if _verify_arguments(parsed) else None

def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--debug` option was
    provided on the command line.  Without the option, we log all
    messages at DEBUG level or above, and write them to a file in
    the directory specified by the `--logdir` option (creating the
    directory if it doesn't exist).  With the option, we write log
    messages to stdout; messages below INFO level are discarded.

    The log file is configured to rotate weekly (`when='W4'`),
    keeping 13 backups, or ~3 months worth of history.

    Any handlers already attached to the root logger are removed,
    so that the handler installed here is the only one.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`

    """
    root_logger = logging.getLogger()
    if arguments.debug:
        root_logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter())
    else:
        if not os.path.exists(arguments.logdir):
            os.mkdir(arguments.logdir)
        root_logger.setLevel(logging.DEBUG)
        logfile = os.path.join(arguments.logdir, _LOGFILE)
        handler = logging.handlers.TimedRotatingFileHandler(
                logfile, when='W4', backupCount=13)
        formatter = logging.Formatter(_LOG_FORMAT,
                                      time_utils.TIME_FMT)
        handler.setFormatter(formatter)
    # TODO(jrbarnette) This is gross.  Importing client.bin.utils
    # implicitly imported logging_config, which calls
    # logging.basicConfig() *at module level*.  That gives us an
    # extra logging handler that we don't want.  So, clear out all
    # the handlers here.
    #
    # Iterate over a copy:  removeHandler() mutates
    # `root_logger.handlers`, and removing from a list while
    # iterating over it skips every other entry.
    for h in list(root_logger.handlers):
        root_logger.removeHandler(h)
    root_logger.addHandler(handler)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1062

1063

1064

def _populate_board_counts(inventory):
    """Gather board counts while providing interactive feedback.

    Gathering the status of all individual DUTs in the lab can take
    considerable time (~30 minutes at the time of this writing).

    Normally, we pay that cost by querying as we go.  However, when
    output goes to a terminal, a human being may be watching the
    progress.  So, we force the first (expensive) queries to happen
    up front, and provide a small ASCII progress bar to give an
    indicator of how many boards have been processed:  one character
    per board, with `+` for every 5th and a digit for every 10th.

    When done, the total number of broken DUTs is written to stdout.

    @param inventory  _LabInventory object with the inventory to
                      be gathered.

    """
    total_broken = 0
    for n, counts in enumerate(inventory.values(), 1):
        last_digit = n % 10
        if last_digit == 5:
            marker = '+'
        elif last_digit == 0:
            # Tens digit of the running count, wrapping after 9.
            marker = '%d' % ((n // 10) % 10)
        else:
            marker = '.'
        sys.stdout.write(marker)
        sys.stdout.flush()
        # This next call is where all the time goes - it forces all
        # of a board's HostJobHistory objects to query the database
        # and cache their results.
        total_broken += counts.get_broken()
    sys.stdout.write('\n')
    sys.stdout.write('Found %d broken DUTs\n' % total_broken)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

def main(argv):
    """Standard main routine.

    Parses the command line, configures logging, creates the lab
    inventory, and then generates and sends each requested report.
    Exits with status 1 if the command line can't be parsed into
    valid arguments.  Exceptions raised while gathering or
    reporting are logged rather than propagated.

    @param argv  Command line arguments including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)
    try:
        # The reporting interval ends now, and reaches back
        # `--duration` hours.
        end_time = int(time.time())
        start_time = end_time - arguments.duration * 60 * 60
        # The timestamp (to the hour) tags both the log file names
        # and the e-mail subject lines.
        timestamp = time.strftime('%Y-%m-%d.%H',
                                  time.localtime(end_time))
        logging.debug('Starting lab inventory for %s', timestamp)
        if arguments.board_notify:
            if arguments.recommend:
                logging.debug('Will include repair recommendations')
            logging.debug('Will include board inventory')
        if arguments.pool_notify:
            logging.debug('Will include pool inventory')

        afe = frontend_wrappers.RetryingAFE(server=None)
        inventory = _LabInventory.create_inventory(
                afe, start_time, end_time, arguments.boardnames)
        logging.info('Found %d hosts across %d boards',
                     inventory.get_num_duts(),
                     inventory.get_num_boards())

        # In debug mode, do the expensive queries up front with a
        # progress indicator on stdout.
        if arguments.debug:
            _populate_board_counts(inventory)

        # Board report, optionally preceded by repair
        # recommendations.
        if arguments.board_notify:
            if arguments.recommend:
                recommend_message = _generate_repair_recommendation(
                        inventory, arguments.recommend) + '\n\n\n'
            else:
                recommend_message = ''
            board_message = _generate_board_inventory_message(inventory)
            _send_email(arguments,
                        'boards-%s.txt' % timestamp,
                        'DUT board inventory %s' % timestamp,
                        arguments.board_notify,
                        recommend_message + board_message)

        # Pool report, followed by the list of idle hosts.
        if arguments.pool_notify:
            pool_message = _generate_pool_inventory_message(inventory)
            idle_message = _generate_idle_inventory_message(inventory)
            _send_email(arguments,
                        'pools-%s.txt' % timestamp,
                        'DUT pool inventory %s' % timestamp,
                        arguments.pool_notify,
                        pool_message + '\n\n\n' + idle_message)
    except KeyboardInterrupt:
        # An interrupt from the keyboard ends the run quietly.
        pass
    except EnvironmentError as e:
        logging.exception('Unexpected OS error: %s', e)
    except Exception as e:
        logging.exception('Unexpected exception: %s', e)

1158

1159

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

1160

def get_inventory(afe):
    """Create a lab inventory covering the most recent 24 hours.

    @param afe  AFE object, passed through to
                `_LabInventory.create_inventory()`.
    @return The inventory returned by
            `_LabInventory.create_inventory()`.

    """
    seconds_per_day = 24 * 60 * 60
    now = int(time.time())
    return _LabInventory.create_inventory(
            afe, now - seconds_per_day, now)

1164

1165

1166

def get_managed_boards(afe):
    """Return the managed boards of a freshly created inventory.

    @param afe  AFE object, passed through to `get_inventory()`.
    @return The result of calling `get_managed_boards()` on the
            inventory returned by `get_inventory()`.

    """
    inventory = get_inventory(afe)
    return inventory.get_managed_boards()

J. Richard Barnette

aa86893

2015-10-23 13:28:59 -0700

[diff] [blame]

1168

1169

J. Richard Barnette