Blame - site_utils/lab_inventory.py - platform/external/autotest

2015-03-27 17:23:52 -0700

[diff] [blame]

1

#!/usr/bin/env python

2

3

# Use of this source code is governed by a BSD-style license that can be

4

# found in the LICENSE file.

5

6

"""Create e-mail reports of the Lab's DUT inventory.

7

8

Gathers a list of all DUTs of interest in the Lab, segregated by

9

board and pool, and determines whether each DUT is working or

10

broken. Then, send one or more e-mail reports summarizing the

11

status to e-mail addresses provided on the command line.

12

13

usage: lab_inventory.py [ options ] [ board ... ]

14

15

Options:

16

--duration / -d <hours>

17

How far back in time to search job history to determine DUT

18

status.

19

20

--board-notify <address>[,<address>]

21

Send the "board status" e-mail to all the specified e-mail

22

addresses.

23

24

--pool-notify <address>[,<address>]

25

Send the "pool status" e-mail to all the specified e-mail

26

addresses.

27

J. Richard Barnette

2015-06-09 10:06:17 -0700

[diff] [blame]

28

--recommend <number>

29

When generating the "board status" e-mail, include a list of

30

<number> specific DUTs to be recommended for repair.

31

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

32

--logdir <directory>

33

Log progress and actions in a file under this directory. Text

34

of any e-mail sent will also be logged in a timestamped file in

35

this directory.

36

J. Richard Barnette

2015-10-13 16:02:47 -0700

[diff] [blame]

37

--debug

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

38

Suppress all logging and sending e-mail. Instead, write the

39

output that would be generated onto stdout.

40

41

<board> arguments:

42

With no arguments, gathers the status for all boards in the lab.

43

With one or more named boards on the command line, restricts

44

reporting to just those boards.

"""

import argparse

Prathmesh Prabhu

2017-11-08 18:05:45 -0800

[diff] [blame^]

50

import collections

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

51

import logging

52

import logging.handlers

53

import os

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

54

import re

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

import sys

import time

import common

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

59

from autotest_lib.client.bin import utils

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

60

from autotest_lib.client.common_lib import time_utils

J. Richard Barnette

a7c514e

2015-09-15 11:13:23 -0700

[diff] [blame]

61

from autotest_lib.server.cros.dynamic_suite import frontend_wrappers

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

62

from autotest_lib.server.hosts import servo_host

Aviv Keshet

7ee9586

2016-08-30 15:18:27 -0700

[diff] [blame]

63

from autotest_lib.server.lib import status_history

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

64

from autotest_lib.site_utils import gmail_lib

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

65

from autotest_lib.site_utils.suite_scheduler import constants

Prathmesh Prabhu

2017-11-08 18:05:45 -0800

[diff] [blame^]

66

from autotest_lib.utils import labellib

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

67

68

Richard Barnette

673573b

2016-12-12 09:46:39 -0800

[diff] [blame]

69

CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS

70

SPARE_POOL = constants.Pools.SPARE_POOL

71

MANAGED_POOLS = constants.Pools.MANAGED_POOLS

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

72

J. Richard Barnette

2016-03-17 17:03:57 -0700

[diff] [blame]

73

# _EXCLUDED_LABELS - A set of labels that disqualify a DUT from

Richard Barnette

eabcf39

2017-09-01 15:10:54 -0700

[diff] [blame]

74

# monitoring by this script. Currently, we're excluding these:

75

# + 'adb' - We're not ready to monitor Android or Brillo hosts.

76

# + 'board:guado_moblab' - These are maintained by a separate

77

# process that doesn't use this script.

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

78

Richard Barnette

eabcf39

2017-09-01 15:10:54 -0700

[diff] [blame]

79

_EXCLUDED_LABELS = {'adb', 'board:guado_moblab'}

J. Richard Barnette

2016-03-17 17:03:57 -0700

[diff] [blame]

80

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

81

# _DEFAULT_DURATION:

82

# Default value used for the --duration command line option.

83

# Specifies how far back in time to search in order to determine

84

# DUT status.

85

86

_DEFAULT_DURATION = 24

87

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

88

# _LOGDIR:

89

# Relative path used in the calculation of the default setting

90

# for the --logdir option. The full path is relative to

91

# the root of the autotest directory, as determined from

92

# sys.argv[0].

93

# _LOGFILE:

94

# Basename of a file to which general log information will be

95

# written.

96

# _LOG_FORMAT:

97

# Format string for log messages.

98

99

_LOGDIR = os.path.join('logs', 'dut-data')

100

_LOGFILE = 'lab-inventory.log'

101

_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

102

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

103

# Pattern describing location-based host names in the Chrome OS test

104

# labs. Each DUT hostname designates the DUT's location:

105

# * A lab (room) that's physically separated from other labs

106

# (i.e. there's a door).

107

# * A row (or aisle) of DUTs within the lab.

108

# * A vertical rack of shelves on the row.

109

# * A specific host on one shelf of the rack.

110

111

_HOSTNAME_PATTERN = re.compile(

112

r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')

113

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

114

# Default entry for managed pools.

115

116

_MANAGED_POOL_DEFAULT = 'all_pools'

117

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

118

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame]

119

class _CachedHostJobHistories(object):
    """Collection of `HostJobHistory` objects for one pool of DUTs.

    The histories recorded here nominally all belong to a single
    scheduling pool.  From them, the collection can report the
    working DUTs, the broken DUTs, the idle DUTs, and the total.

    Performance note:  The following methods are potentially
    expensive:
      * `get_working()`
      * `get_working_list()`
      * `get_broken()`
      * `get_broken_list()`
      * `get_idle()`
      * `get_idle_list()`
    The first call to any of them triggers multiple RPC calls backed
    by relatively expensive database queries.  The query results are
    cached inside the individual `HostJobHistory` objects, so only
    the first call pays the full cost.

    Separately from that RPC caching, `get_working_list()`,
    `get_broken_list()` and `get_idle_list()` remember the list they
    computed, so the filtering is not redone on every call.  Those
    remembered lists are discarded whenever `record_host()` is
    called.

    The RPC cost is deliberately deferred to the accessor methods
    (rather than being paid in `record_host()`), so that a complete
    `_LabInventory` can be built without making the expensive
    queries at creation time.  `_populate_board_counts()` assumes
    this behavior.

    """

    def __init__(self):
        self._histories = []
        self._reset_caches()


    def _reset_caches(self):
        """Forget the cached working, broken and idle lists."""
        self._working_list = None
        self._broken_list = None
        self._idle_list = None


    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        Any previously cached list is invalidated, since the new
        host may belong in it.

        @param host_history The `HostJobHistory` object to be
                            remembered.

        """
        self._reset_caches()
        self._histories.append(host_history)


    def get_working_list(self):
        """Return a list of all working DUTs in the pool.

        Selects the entries of `self._histories` whose last
        diagnosis is `WORKING`.  The result is cached so that it is
        only calculated once.

        @return A list of HostJobHistory objects.

        """
        cached = self._working_list
        if cached is None:
            cached = []
            for history in self._histories:
                if history.last_diagnosis()[0] == status_history.WORKING:
                    cached.append(history)
            self._working_list = cached
        return cached


    def get_working(self):
        """Return the number of working DUTs in the pool."""
        working = self.get_working_list()
        return len(working)


    def get_broken_list(self):
        """Return a list of all broken DUTs in the pool.

        Selects the entries of `self._histories` whose last
        diagnosis is `BROKEN`.  The result is cached so that it is
        only calculated once.

        @return A list of HostJobHistory objects.

        """
        cached = self._broken_list
        if cached is None:
            cached = []
            for history in self._histories:
                if history.last_diagnosis()[0] == status_history.BROKEN:
                    cached.append(history)
            self._broken_list = cached
        return cached


    def get_broken(self):
        """Return the number of broken DUTs in the pool."""
        broken = self.get_broken_list()
        return len(broken)


    def get_idle_list(self):
        """Return a list of all idle DUTs in the pool.

        Selects the entries of `self._histories` whose last
        diagnosis is `UNUSED` or `UNKNOWN`.  The result is cached so
        that it is only calculated once.

        @return A list of HostJobHistory objects.

        """
        idle_states = [status_history.UNUSED, status_history.UNKNOWN]
        cached = self._idle_list
        if cached is None:
            cached = []
            for history in self._histories:
                if history.last_diagnosis()[0] in idle_states:
                    cached.append(history)
            self._idle_list = cached
        return cached


    def get_idle(self):
        """Return the number of idle DUTs in the pool."""
        idle = self.get_idle_list()
        return len(idle)


    def get_total(self):
        """Return the total number of DUTs in the pool."""
        return len(self._histories)

243

244

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame]

245

class _ManagedPoolsHostJobHistories(object):
    """Maintains a set of `HostJobHistory`s per managed pool.

    The collection maintains a count of working DUTs, a count of
    broken DUTs, a count of idle DUTs, and a total count.  The counts
    (and the corresponding lists) can be obtained either for a single
    pool, or as a total across all pools.

    DUTs in the collection must be assigned to one of the pools in
    `MANAGED_POOLS`; `record_host()` raises `KeyError` otherwise.

    The accessor methods delegate to the methods of the same name in
    `_CachedHostJobHistories`, so the performance note in that class
    applies here as well.

    """

    def __init__(self):
        # One (initially empty) per-pool collection for every managed
        # pool, so lookups by pool name never miss for a valid pool.
        self._histories_by_pool = {
            pool: _CachedHostJobHistories() for pool in MANAGED_POOLS
        }


    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        @param host_history The `HostJobHistory` object to be
                            remembered.  Its `host_pool` selects the
                            per-pool collection it is recorded in.

        """
        pool = host_history.host_pool
        self._histories_by_pool[pool].record_host(host_history)


    def _count_pool(self, get_pool_count, pool=None):
        """Internal helper to count hosts in a given pool.

        The `get_pool_count` parameter is a function to calculate
        the exact count of interest for the pool.

        @param get_pool_count  Function to return a count from a
                               per-pool history collection (a
                               `_CachedHostJobHistories` object).
        @param pool            The pool to be counted.  If `None`,
                               return the total across all pools.

        """
        if pool is None:
            return sum(get_pool_count(cached_history)
                       for cached_history in
                       self._histories_by_pool.values())
        return get_pool_count(self._histories_by_pool[pool])


    def _list_pool(self, get_pool_list, pool=None):
        """Internal helper to list hosts in a given pool.

        Always returns a new list, so that callers cannot alter the
        lists cached inside the per-pool collections.

        @param get_pool_list  Function to return a list of
                              `HostJobHistory` objects from a
                              per-pool history collection.
        @param pool           The pool to be listed.  If `None`,
                              return the combined list across all
                              pools.

        """
        if pool is not None:
            return list(get_pool_list(self._histories_by_pool[pool]))
        combined = []
        for cached_history in self._histories_by_pool.values():
            combined.extend(get_pool_list(cached_history))
        return combined


    def get_working_list(self, pool=None):
        """Return a list of all working DUTs in the selected pool(s).

        Go through the HostJobHistory objects in the selected
        pool(s), selecting the ones where the last diagnosis is
        `WORKING`.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.

        """
        return self._list_pool(
                lambda histories: histories.get_working_list(), pool)


    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of working DUTs in the selected
                pool(s).
        """
        return self._count_pool(
                lambda histories: histories.get_working(), pool)


    def get_broken_list(self, pool=None):
        """Return a list of all broken DUTs in the selected pool(s).

        Go through the HostJobHistory objects in the selected
        pool(s), selecting the ones where the last diagnosis is
        `BROKEN`.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.

        """
        return self._list_pool(
                lambda histories: histories.get_broken_list(), pool)


    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of broken DUTs in the selected pool(s).
        """
        return self._count_pool(
                lambda histories: histories.get_broken(), pool)


    def get_idle_list(self, pool=None):
        """Return a list of all idle DUTs in the selected pool(s).

        Go through the HostJobHistory objects in the selected
        pool(s), selecting the ones where the last diagnosis is
        `UNUSED` or `UNKNOWN`.

        @param pool: The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.

        """
        return self._list_pool(
                lambda histories: histories.get_idle_list(), pool)


    def get_idle(self, pool=None):
        """Return the number of idle DUTs in a pool.

        @param pool: The pool to be counted.  If `None`, return the total
                     across all pools.

        @return The total number of idle DUTs in the selected pool(s).
        """
        return self._count_pool(
                lambda histories: histories.get_idle(), pool)


    def get_spares_buffer(self):
        """Return the nominal number of working spares.

        Calculates and returns how many working spares there would
        be in the spares pool if all broken DUTs were in the spares
        pool.  This number may be negative, indicating a shortfall
        in the critical pools.

        @return The total number DUTs in the spares pool, less the total
                number of broken DUTs in all pools.
        """
        return self.get_total(SPARE_POOL) - self.get_broken()


    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of DUTs in the selected pool(s).
        """
        return self._count_pool(
                lambda histories: histories.get_total(), pool)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

405

406

Prathmesh Prabhu

2017-11-08 17:36:51 -0800

[diff] [blame]

407

class _LabInventory(object):
    """Collection of `HostJobHistory` objects for the Lab's inventory.

    Important attributes:
      histories: The list of `HostJobHistory` objects in the
                 inventory (only those with a board).
      by_board:  A dict mapping board to the
                 `_ManagedPoolsHostJobHistories` for that board.

    """

    @staticmethod
    def _eligible_host(afehost):
        """Return whether this host is eligible for monitoring.

        A host is eligible if it's in exactly one pool and it has no
        labels from the `_EXCLUDED_LABELS` set.

        @param afehost  The host to be tested for eligibility.
        """
        pools = [l for l in afehost.labels
                 if l.startswith(constants.Labels.POOL_PREFIX)]
        excluded = _EXCLUDED_LABELS.intersection(afehost.labels)
        return len(pools) == 1 and not excluded


    @classmethod
    def create_inventory(cls, afe, start_time, end_time, boardlist=None):
        """Return a Lab inventory with specified parameters.

        By default, gathers inventory from `HostJobHistory` objects
        for all DUTs in the `MANAGED_POOLS` list.  If `boardlist`
        is supplied, the inventory will be restricted to only the
        given boards.

        @param afe         AFE object for constructing the
                           `HostJobHistory` objects.
        @param start_time  Start time for the `HostJobHistory`
                           objects.
        @param end_time    End time for the `HostJobHistory`
                           objects.
        @param boardlist   List of boards to include.  If empty or
                           `None`, include all available boards.
        @return A `_LabInventory` object for the specified boards.

        """
        label_list = [constants.Labels.POOL_PREFIX + l
                      for l in MANAGED_POOLS]
        afehosts = afe.get_hosts(labels__name__in=label_list)
        if boardlist:
            # We're deliberately not checking host eligibility in this
            # code path.  This is a debug path, not used in production;
            # it may be useful to include ineligible hosts here.
            #
            # Select with a single pass over the hosts, so that a host
            # is included at most once even if a board is named more
            # than once in `boardlist`.
            board_labels = {constants.Labels.BOARD_PREFIX + board
                            for board in boardlist}
            afehosts = [h for h in afehosts
                        if board_labels.intersection(h.labels)]
        else:
            afehosts = [h for h in afehosts if cls._eligible_host(h)]
        return cls([status_history.HostJobHistory(afe, host,
                                                  start_time, end_time)
                    for host in afehosts])


    def __init__(self, histories):
        """Build the inventory from a list of `HostJobHistory` objects.

        @param histories  The `HostJobHistory` objects to classify.
                          Entries whose `host_board` is `None` are
                          dropped.
        """
        # N.B. The query that finds our hosts is restricted to those
        # with a valid pool: label, but doesn't check for a valid
        # board: label.  In some (insufficiently) rare cases, the
        # AFE hosts table has been known to (incorrectly) have DUTs
        # with a pool: but no board: label.  We explicitly exclude
        # those here.
        histories = [h for h in histories
                     if h.host_board is not None]
        self.histories = histories
        self._dut_count = len(histories)
        # Cache for `get_managed_boards()`, keyed by pool name.
        self._managed_boards = {}
        self.by_board = self._classify_by_label_type('board')


    def _classify_by_label_type(self, label_key):
        """Classify histories by labels with the given key.

        Histories whose host has no label with the given key are
        left out of the result.

        @param label_key  The label key to classify by, e.g. 'board'.

        @returns a dict mapping labels with the given key to
                 _ManagedPoolsHostJobHistories for DUTs with that label.
        """
        classified = collections.defaultdict(_ManagedPoolsHostJobHistories)
        for h in self.histories:
            labels = labellib.LabelsMapping(h.host.labels)
            if label_key in labels:
                classified[labels[label_key]].record_host(h)
        return dict(classified)


    def get_managed_boards(self, pool=_MANAGED_POOL_DEFAULT):
        """Return the set of "managed" boards.

        Operationally, saying a board is "managed" means that the
        board will be included in the "board" and "repair
        recommendations" reports.  That is, if there are failures in
        the board's inventory then lab techs will be asked to fix
        them without a separate ticket.

        For purposes of implementation, a board is "managed" if it
        has DUTs in both the spare and a non-spare (i.e. critical)
        pool.

        The result for each pool is computed once and cached.

        @param pool: The specified pool for managed boards.
        @return A set of all the boards that have both spare and
                non-spare pools, unless the pool is specified,
                then the set of boards in that pool.
        """
        if pool not in self._managed_boards:
            # Build the set locally and store it only once complete,
            # so a failure part way through cannot leave a partial
            # answer in the cache.
            boards = set()
            # `items()` works under both Python 2 and Python 3.
            for board, counts in self.by_board.items():
                # Get the counts for all pools, otherwise get it for the
                # specified pool.
                if pool == _MANAGED_POOL_DEFAULT:
                    spares = counts.get_total(SPARE_POOL)
                    total = counts.get_total()
                    if spares != 0 and spares != total:
                        boards.add(board)
                elif counts.get_total(pool) != 0:
                    boards.add(board)
            self._managed_boards[pool] = boards
        return self._managed_boards[pool]


    def get_num_duts(self):
        """Return the total number of DUTs in the inventory."""
        return self._dut_count


    def get_num_boards(self):
        """Return the total number of boards in the inventory."""
        return len(self.by_board)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

543

544

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

545

def _sort_by_location(inventory_list):
    """Return a list of DUTs, organized by location.

    Take the given list of `HostJobHistory` objects, separate it
    into a list per lab, and sort each lab's list by location.  The
    order of sorting within a lab is
      * By row number within the lab,
      * then by rack number within the row,
      * then by host shelf number within the rack.

    Return a list of the sorted lists.

    DUTs whose hostname doesn't match `_HOSTNAME_PATTERN` have no
    known location, and are left out of the result.

    Implementation note: host locations are compared as
    `(row, rack, host)` tuples of integers, so the ordering is
    correct for row, rack and host numbers of any size.

    @param inventory_list  A list of `HostJobHistory` objects.
    @return A list of sorted lists of DUTs.

    """
    lab_lists = {}
    for history in inventory_list:
        location = _HOSTNAME_PATTERN.match(history.host.hostname)
        if location:
            lab = location.group(1)
            # Tuples compare element by element, which gives the
            # row/rack/host ordering directly.
            key = tuple(int(idx) for idx in location.group(2, 3, 4))
            lab_lists.setdefault(lab, []).append((key, history))
    return_list = []
    for dut_list in lab_lists.values():
        # Sort on the location key only; the history objects
        # themselves are never compared.
        dut_list.sort(key=lambda t: t[0])
        return_list.append([t[1] for t in dut_list])
    return return_list

def _score_repair_set(buffer_counts, repair_list):
    """Return a numeric score rating a set of DUTs to be repaired.

    `buffer_counts` is a dictionary mapping board names to the
    size of the board's spares buffer.

    `repair_list` is a list of DUTs to be repaired.

    This function calculates the new set of buffer counts that would
    result from the proposed repairs, and scores the new set using
    two numbers:
      * Worst case buffer count for any board (higher is better).
        This is the more significant number for comparison.
      * Number of boards at the worst case (lower is better).  This
        is the less significant number.

    Implementation note:  The score could fail to reflect the
    intended criteria if there are more than 1000 boards in the
    inventory.

    @param buffer_counts  A dictionary mapping boards to buffer counts.
    @param repair_list    A list of `HostJobHistory` objects for the
                          DUTs to be repaired.
    @return A numeric score.  If `buffer_counts` is empty there is
            no worst case to rate, and the score is 0.

    """
    if not buffer_counts:
        return 0
    _NBOARDS = 1000
    # Classify the DUTs to be repaired by board, so that the number
    # of repairs per board can be looked up.
    repaired_by_board = _LabInventory(repair_list).by_board
    # Create a list of new counts that records the buffer count for
    # each board after repair.  The new list of counts discards the
    # board names, as they don't contribute to the final score.
    new_counts = []
    for board, count in buffer_counts.items():
        if board in repaired_by_board:
            count += repaired_by_board[board].get_total()
        new_counts.append(count)
    # Find the worst available spares count, and count how many
    # times that worst case occurs.
    worst_count = min(new_counts)
    num_worst = new_counts.count(worst_count)
    # Return the calculated score
    return _NBOARDS * worst_count - num_worst

632

633

634

def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Returns a message recommending a list of broken DUTs to be
    repaired.  The list of DUTs is selected based on these
    criteria:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical
        proximity.
      * DUTs for boards with a low spares buffer are more important
        than DUTs with larger buffers.

    The algorithm used will guarantee that at least one DUT from a
    board with the smallest spares buffer will be recommended.  If
    the worst spares buffer number is shared by more than one board,
    the algorithm will tend to prefer repair sets that include more
    of those boards over sets that cover fewer boards.

    Candidate repair sets are contiguous slices of each list yielded
    by `_sort_by_location()`; each slice is scored with
    `_score_repair_set()`, and the highest scoring slice wins.

    If no candidate slice was produced at all (for instance, because
    no managed board has a broken DUT), the message contains the
    heading followed by a note that there is nothing to repair.

    @param inventory      Inventory for generating recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation message.

    """
    logging.debug('Creating DUT repair recommendations')
    board_buffer_counts = {}
    broken_list = []
    for board in inventory.get_managed_boards():
        logging.debug('Listing failed DUTs for %s', board)
        counts = inventory.by_board[board]
        # Only boards that actually have broken DUTs take part in
        # the scoring.
        if counts.get_broken() != 0:
            board_buffer_counts[board] = counts.get_spares_buffer()
            broken_list.extend(counts.get_broken_list())
    # N.B. The logic inside this loop may seem complicated, but
    # simplification is hard:
    #   * Calculating an initial recommendation outside of
    #     the loop likely would make things more complicated,
    #     not less.
    #   * It's necessary to calculate an initial lab slice once per
    #     lab _before_ the while loop, in case the number of broken
    #     DUTs in a lab is less than `num_recommend`.
    recommendation = None
    best_score = None
    for lab_duts in _sort_by_location(broken_list):
        start = 0
        end = num_recommend
        lab_slice = lab_duts[start : end]
        lab_score = _score_repair_set(board_buffer_counts,
                                      lab_slice)
        # Slide a window of `num_recommend` DUTs across the lab's
        # list, remembering the best scoring window.
        while end < len(lab_duts):
            start += 1
            end += 1
            new_slice = lab_duts[start : end]
            new_score = _score_repair_set(board_buffer_counts,
                                          new_slice)
            if new_score > lab_score:
                lab_slice = new_slice
                lab_score = new_score
        if recommendation is None or lab_score > best_score:
            recommendation = lab_slice
            best_score = lab_score
    # N.B. The trailing space here is mandatory:  Without it, Gmail
    # will parse the URL wrong.  Don't ask.  If you simply _must_
    # know more, go try it yourself...
    line_fmt = '%-30s %-16s %-6s\n %s '
    message = ['Repair recommendations:\n',
               line_fmt % ( 'Hostname', 'Board', 'Servo?', 'Logs URL')]
    if recommendation is None:
        # The loop above never ran, so there is nothing to iterate
        # over; say so instead of failing on `None`.
        message.append('(No DUTs to repair)')
    else:
        for h in recommendation:
            servo_name = servo_host.make_servo_hostname(h.host.hostname)
            servo_present = utils.host_is_in_lab_zone(servo_name)
            _, event = h.last_diagnosis()
            line = line_fmt % (
                    h.host.hostname, h.host_board,
                    'Yes' if servo_present else 'No', event.job_url)
            message.append(line)
    return '\n'.join(message)

709

710

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

711

def _generate_board_inventory_message(inventory):
    """Generate the "board inventory" e-mail message.

    The board inventory starts with lab-wide totals of broken, idle
    and working DUTs (with percentages), followed by a per-board
    table.  Totals are accumulated over every board returned by
    `inventory.get_managed_boards()`; the per-board table lists only
    boards that have at least one broken DUT, ordered by ascending
    spares buffer and then by descending broken count.

    When the inventory contains no DUTs at all, every percentage is
    reported as 0 rather than dividing by zero.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating board inventory')
    nworking = 0
    nbroken = 0
    nidle = 0
    nbroken_boards = 0
    ntotal_boards = 0
    summaries = []
    for board in inventory.get_managed_boards():
        counts = inventory.by_board[board]
        logging.debug('Counting %2d DUTS for board %s',
                      counts.get_total(), board)
        # Summary elements laid out in the same order as the text
        # headers:
        # Board Avail Bad Idle Good Spare Total
        # e[0] e[1] e[2] e[3] e[4] e[5] e[6]
        element = (board,
                   counts.get_spares_buffer(),
                   counts.get_broken(),
                   counts.get_idle(),
                   counts.get_working(),
                   counts.get_total(SPARE_POOL),
                   counts.get_total())
        # Only boards with broken DUTs get a row in the table, but
        # every board contributes to the totals.
        if element[2]:
            summaries.append(element)
            nbroken_boards += 1
        ntotal_boards += 1
        nbroken += element[2]
        nidle += element[3]
        nworking += element[4]
    ntotal = nworking + nbroken + nidle
    summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
    if ntotal:
        broken_percent = int(round(100.0 * nbroken / ntotal))
        idle_percent = int(round(100.0 * nidle / ntotal))
        # Derive the last percentage by subtraction so that the
        # three always add up to exactly 100.
        working_percent = 100 - broken_percent - idle_percent
    else:
        # Empty inventory:  there is nothing to take a percentage
        # of, and dividing would raise ZeroDivisionError.
        broken_percent = 0
        idle_percent = 0
        working_percent = 0
    message = ['Summary of DUTs in inventory:',
               '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
               '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
                   nbroken, broken_percent,
                   nidle, idle_percent,
                   nworking, working_percent,
                   ntotal),
               '',
               'Boards with failures: %d' % nbroken_boards,
               'Boards in inventory: %d' % ntotal_boards,
               '', '',
               'Full board inventory:\n',
               '%-22s %5s %5s %5s %5s %5s %5s' % (
                   'Board', 'Avail', 'Bad', 'Idle', 'Good',
                   'Spare', 'Total')]
    message.extend(
            ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
    return '\n'.join(message)

782

783

J. Richard Barnette

4845fcf

2015-04-20 14:26:25 -0700

[diff] [blame]

784

_POOL_INVENTORY_HEADER = '''\

Aviv Keshet

056d74c

2015-07-14 09:18:43 -0700

[diff] [blame]

785

Notice to Infrastructure deputies: All boards shown below are at

J. Richard Barnette

c9a143c

2015-06-04 11:11:19 -0700

[diff] [blame]

786

less than full strength, please take action to resolve the issues.

787

Once you're satisified that failures won't recur, failed DUTs can

788

be replaced with spares by running `balance_pool`. Detailed

789

instructions can be found here:

J. Richard Barnette

4845fcf

2015-04-20 14:26:25 -0700

[diff] [blame]

790

http://go/cros-manage-duts

'''

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

794

def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    The message opens with `_POOL_INVENTORY_HEADER`, then has one
    section per pool in `CRITICAL_POOLS`.  Each section is a table
    of boards with their broken, idle, working and total DUT counts
    for that pool.  A board appears in a section only if it has at
    least one broken or idle DUT in the pool; rows are ordered by
    descending broken count.  A section with no rows carries a note
    that all boards are at full strength.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating pool inventory')
    report = [_POOL_INVENTORY_HEADER]
    for index, pool in enumerate(CRITICAL_POOLS):
        # Every section after the first is set off by a blank line.
        separator = '\n' if index else ''
        report.append(
            '%sStatus for pool:%s, by board:' % (separator, pool))
        report.append(
            '%-20s %5s %5s %5s %5s' % (
                'Board', 'Bad', 'Idle', 'Good', 'Total'))
        rows = []
        for board, counts in inventory.by_board.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), board, pool)
            broken = counts.get_broken(pool)
            idle = counts.get_idle(pool)
            # boards at full strength are not reported
            if broken != 0 or idle != 0:
                working = counts.get_working(pool)
                total = counts.get_total(pool)
                rows.append((board, broken, idle, working, total))
        if not rows:
            report.append('(All boards at full strength)')
            continue
        # Worst boards (most broken DUTs) first.
        rows.sort(key=lambda row: -row[1])
        for row in rows:
            report.append('%-20s %5d %5d %5d %5d' % row)
    return '\n'.join(report)

838

839

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

840

_IDLE_INVENTORY_HEADER = '''\

841

Notice to Infrastructure deputies: The hosts shown below haven't

842

run any jobs for at least 24 hours. Please check each host; locked

843

hosts should normally be unlocked; stuck jobs should normally be

aborted.

'''

def _generate_idle_inventory_message(inventory):
    """Generate the "idle inventory" e-mail message.

    The message opens with `_IDLE_INVENTORY_HEADER` and a column
    heading, then lists one line per idle host giving its hostname,
    board and pool.  Hosts are gathered pool by pool over
    `MANAGED_POOLS`, using each board's `get_idle_list(pool)`.  If
    no idle host is found, a note saying so replaces the list.

    N.B. For sample output text format as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating idle inventory')
    row_fmt = '%-30s %-20s %s'
    report = [_IDLE_INVENTORY_HEADER,
              'Idle Host List:',
              row_fmt % ('Hostname', 'Board', 'Pool')]
    idle_rows = []
    for pool in MANAGED_POOLS:
        for board, counts in inventory.by_board.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), board, pool)
            for dut in counts.get_idle_list(pool):
                idle_rows.append((dut.host.hostname, board, pool))
    if not idle_rows:
        report.append('(No idle DUTs)')
    for row in idle_rows:
        report.append(row_fmt % row)
    return '\n'.join(report)

878

879

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

880

def _send_email(arguments, tag, subject, recipients, body):

881

"""Send an inventory e-mail message.

882

883

The message is logged in the selected log directory using `tag`

884

for the file name.

885

886

If the --print option was requested, the message is neither

887

logged nor sent, but merely printed on stdout.

888

889

@param arguments Parsed command-line options.

890

@param tag Tag identifying the inventory for logging

891

purposes.

892

@param subject E-mail Subject: header line.

893

@param recipients E-mail addresses for the To: header line.

894

@param body E-mail message body.

895

896

"""

897

logging.debug('Generating email: "%s"', subject)

898

all_recipients = ', '.join(recipients)

899

report_body = '\n'.join([

900

'To: %s' % all_recipients,

901

'Subject: %s' % subject,

902

'', body, ''])

J. Richard Barnette

2015-10-13 16:02:47 -0700

[diff] [blame]

903

if arguments.debug:

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

904

print report_body

905

else:

906

filename = os.path.join(arguments.logdir, tag)

907

try:

908

report_file = open(filename, 'w')

909

report_file.write(report_body)

910

report_file.close()

911

except EnvironmentError as e:

912

logging.error('Failed to write %s: %s', filename, e)

913

try:

914

gmail_lib.send_email(all_recipients, subject, body)

915

except Exception as e:

916

logging.error('Failed to send e-mail to %s: %s',

all_recipients, e)

def _separate_email_addresses(address_list):

921

"""Parse a list of comma-separated lists of e-mail addresses.

922

923

@param address_list A list of strings containing comma

924

separate e-mail addresses.

925

@return A list of the individual e-mail addresses.

"""

newlist = []

for arg in address_list:

930

newlist.extend([email.strip() for email in arg.split(',')])

return newlist

def _verify_arguments(arguments):
    """Validate command-line arguments.

    Replaces `arguments.board_notify` and `arguments.pool_notify`
    with flat lists of individual addresses, splitting any
    comma-separated option values.

    For non-debug uses, at least one of the two lists must end up
    non-empty; otherwise an error is written to stderr and the
    arguments are rejected.  For debug, if neither list has an
    address, both are set to a list holding one empty string, which
    is treated as "run all the reports."

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.

    """
    arguments.board_notify = _separate_email_addresses(
            arguments.board_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)
    if arguments.board_notify or arguments.pool_notify:
        return True
    if arguments.debug:
        # We want to run all the reports.  An empty notify list
        # will cause a report to be skipped, so make sure the
        # lists are non-empty.
        arguments.board_notify = ['']
        arguments.pool_notify = ['']
        return True
    sys.stderr.write('Must specify at least one of '
                     '--board-notify or --pool-notify\n')
    return False

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

969

970

971

def _get_logdir(script):
    """Get the default directory for the `--logdir` option.

    The result is `_LOGDIR` joined onto the parent of the directory
    that holds `script` (after making the script path absolute).

    @param script  Path to this script file.
    @return A path to a directory.

    """
    script_dir = os.path.dirname(os.path.abspath(script))
    parent_dir = os.path.dirname(script_dir)
    return os.path.join(parent_dir, _LOGDIR)

984

985

986

def _parse_command(argv):
    """Parse the command line arguments.

    Builds the argument parser for this command, parses
    `argv[1:]`, and then runs the result through
    `_verify_arguments()`.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args(), or
            `None` if `_verify_arguments()` rejects the arguments.

    """
    command_name = argv[0]
    cmd_parser = argparse.ArgumentParser(
            prog=command_name,
            description='Gather and report lab inventory statistics')
    cmd_parser.add_argument(
            '-d', '--duration', type=int,
            default=_DEFAULT_DURATION, metavar='HOURS',
            help='number of hours back to search for status'
                 ' (default: %d)' % _DEFAULT_DURATION)
    cmd_parser.add_argument(
            '--board-notify', action='append',
            default=[], metavar='ADDRESS',
            help='Generate board inventory message, '
                 'and send it to the given e-mail address(es)')
    cmd_parser.add_argument(
            '--pool-notify', action='append',
            default=[], metavar='ADDRESS',
            help='Generate pool inventory message, '
                 'and send it to the given address(es)')
    cmd_parser.add_argument(
            '-r', '--recommend', type=int, default=None,
            help=('Specify how many DUTs should be '
                  'recommended for repair (default: no '
                  'recommendation)'))
    cmd_parser.add_argument(
            '--debug', action='store_true',
            help='Print e-mail messages on stdout '
                 'without sending them.')
    cmd_parser.add_argument(
            '--logdir', default=_get_logdir(command_name),
            help='Directory where logs will be written.')
    cmd_parser.add_argument(
            'boardnames', nargs='*',
            metavar='BOARD',
            help='names of boards to report on '
                 '(default: all boards)')
    parsed = cmd_parser.parse_args(argv[1:])
    if _verify_arguments(parsed):
        return parsed
    return None

def _configure_logging(arguments):

1033

"""Configure the `logging` module for our needs.

1034

1035

How we log depends on whether the `--print` option was

1036

provided on the command line. Without the option, we log all

1037

messages at DEBUG level or above, and write them to a file in

1038

the directory specified by the `--logdir` option. With the

1039

option, we write log messages to stdout; messages below INFO

1040

level are discarded.

1041

1042

The log file is configured to rotate once a week on Friday

1043

evening, preserving ~3 months worth of history.

1044

1045

@param arguments Command-line arguments as returned by

1046

`ArgumentParser`

1047

1048

"""

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

1049

root_logger = logging.getLogger()

J. Richard Barnette

2015-10-13 16:02:47 -0700

[diff] [blame]

1050

if arguments.debug:

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

1051

root_logger.setLevel(logging.INFO)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1052

handler = logging.StreamHandler(sys.stdout)

1053

handler.setFormatter(logging.Formatter())

1054

else:

Richard Barnette

5af9740

2016-04-18 11:00:26 -0700

[diff] [blame]

1055

if not os.path.exists(arguments.logdir):

1056

os.mkdir(arguments.logdir)

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

1057

root_logger.setLevel(logging.DEBUG)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1058

logfile = os.path.join(arguments.logdir, _LOGFILE)

1059

handler = logging.handlers.TimedRotatingFileHandler(

1060

logfile, when='W4', backupCount=13)

1061

formatter = logging.Formatter(_LOG_FORMAT,

1062

time_utils.TIME_FMT)

1063

handler.setFormatter(formatter)

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

1064

# TODO(jrbarnette) This is gross. Importing client.bin.utils

1065

# implicitly imported logging_config, which calls

1066

# logging.basicConfig() *at module level*. That gives us an

1067

# extra logging handler that we don't want. So, clear out all

1068

# the handlers here.

1069

for h in root_logger.handlers:

1070

root_logger.removeHandler(h)

1071

root_logger.addHandler(handler)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1072

1073

1074

def _populate_board_counts(inventory):

1075

"""Gather board counts while providing interactive feedback.

1076

1077

Gathering the status of all individual DUTs in the lab can take

1078

considerable time (~30 minutes at the time of this writing).

1079

1080

Normally, we pay that cost by querying as we go. However, with

1081

the `--print` option, a human being may be watching the

1082

progress. So, we force the first (expensive) queries to happen

1083

up front, and provide a small ASCII progress bar to give an

1084

indicator of how many boards have been processed.

1085

1086

@param inventory _LabInventory object with the inventory to

be gathered.

"""

n = 0

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

1091

total_broken = 0

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1092

for counts in inventory.values():

n += 1

if n % 10 == 5:

c = '+'

elif n % 10 == 0:

c = '%d' % ((n / 10) % 10)

else:

c = '.'

sys.stdout.write(c)

sys.stdout.flush()

# This next call is where all the time goes - it forces all

1103

# of a board's HostJobHistory objects to query the database

1104

# and cache their results.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

1105

total_broken += counts.get_broken()

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1106

sys.stdout.write('\n')

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

1107

sys.stdout.write('Found %d broken DUTs\n' % total_broken)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

def main(argv):
    """Standard main routine.

    Parses the command line (exiting with status 1 if it is
    rejected), configures logging, creates the lab inventory for
    the requested time span, and then generates and delivers the
    board and/or pool reports according to which notification
    lists are non-empty.

    A keyboard interrupt ends the run quietly; any other exception
    raised while gathering or reporting is logged, not propagated.

    @param argv  Command line arguments including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)
    try:
        end_time = int(time.time())
        # `--duration` is given in hours; timestamps are in seconds.
        start_time = end_time - arguments.duration * 60 * 60
        timestamp = time.strftime('%Y-%m-%d.%H',
                                  time.localtime(end_time))
        logging.debug('Starting lab inventory for %s', timestamp)
        if arguments.board_notify:
            if arguments.recommend:
                logging.debug('Will include repair recommendations')
            logging.debug('Will include board inventory')
        if arguments.pool_notify:
            logging.debug('Will include pool inventory')

        afe = frontend_wrappers.RetryingAFE(server=None)
        inventory = _LabInventory.create_inventory(
                afe, start_time, end_time, arguments.boardnames)
        logging.info('Found %d hosts across %d boards',
                     inventory.get_num_duts(),
                     inventory.get_num_boards())

        if arguments.debug:
            _populate_board_counts(inventory)

        if arguments.board_notify:
            # The optional recommendation goes ahead of the board
            # inventory in the same message.
            board_report = []
            if arguments.recommend:
                board_report.append(
                        _generate_repair_recommendation(
                                inventory, arguments.recommend) + '\n\n\n')
            board_report.append(
                    _generate_board_inventory_message(inventory))
            _send_email(arguments,
                        'boards-%s.txt' % timestamp,
                        'DUT board inventory %s' % timestamp,
                        arguments.board_notify,
                        ''.join(board_report))

        if arguments.pool_notify:
            pool_report = [
                    _generate_pool_inventory_message(inventory),
                    _generate_idle_inventory_message(inventory)]
            _send_email(arguments,
                        'pools-%s.txt' % timestamp,
                        'DUT pool inventory %s' % timestamp,
                        arguments.pool_notify,
                        '\n\n\n'.join(pool_report))
    except KeyboardInterrupt:
        pass
    except EnvironmentError as e:
        logging.exception('Unexpected OS error: %s', e)
    except Exception as e:
        logging.exception('Unexpected exception: %s', e)

1168

1169

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

1170

def get_inventory(afe):
    """Create a lab inventory covering the most recent 24 hours.

    @param afe  AFE object handed to
                `_LabInventory.create_inventory()`.
    @return The inventory object returned by
            `_LabInventory.create_inventory()`.
    """
    now = int(time.time())
    one_day_ago = now - 24 * 60 * 60
    return _LabInventory.create_inventory(afe, one_day_ago, now)

1174

1175

1176

def get_managed_boards(afe):
    """Return the managed boards from a fresh 24 hour inventory.

    @param afe  AFE object handed to `get_inventory()`.
    @return Whatever `get_managed_boards()` returns for the
            inventory created by `get_inventory()`.
    """
    inventory = get_inventory(afe)
    return inventory.get_managed_boards()

J. Richard Barnette

aa86893

2015-10-23 13:28:59 -0700

[diff] [blame]

1178

1179

J. Richard Barnette