Blame - site_utils/lab_inventory.py - platform/external/autotest

2015-03-27 17:23:52 -0700

[diff] [blame]

1

#!/usr/bin/env python

2

3

# Use of this source code is governed by a BSD-style license that can be

4

# found in the LICENSE file.

5

6

"""Create e-mail reports of the Lab's DUT inventory.

7

8

Gathers a list of all DUTs of interest in the Lab, segregated by

9

board and pool, and determines whether each DUT is working or

10

broken. Then, send one or more e-mail reports summarizing the

11

status to e-mail addresses provided on the command line.

12

13

usage: lab_inventory.py [ options ] [ board ... ]

14

15

Options:

16

--duration / -d <hours>

17

How far back in time to search job history to determine DUT

18

status.

19

20

--board-notify <address>[,<address>]

21

Send the "board status" e-mail to all the specified e-mail

22

addresses.

23

24

--pool-notify <address>[,<address>]

25

Send the "pool status" e-mail to all the specified e-mail

26

addresses.

27

J. Richard Barnette

2015-06-09 10:06:17 -0700

[diff] [blame]

28

--recommend <number>

29

When generating the "board status" e-mail, include a list of

30

<number> specific DUTs to be recommended for repair.

31

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame^]

32

--repair-loops

33

Scan the inventory for DUTs stuck in repair loops, and report them

34

via a Monarch presence metric.

35

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

36

--logdir <directory>

37

Log progress and actions in a file under this directory. Text

38

of any e-mail sent will also be logged in a timestamped file in

39

this directory.

40

J. Richard Barnette

2015-10-13 16:02:47 -0700

[diff] [blame]

41

--debug

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame^]

42

Suppress all logging, metrics reporting, and sending e-mail.

43

Instead, write the output that would be generated onto stdout.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

44

45

<board> arguments:

46

With no arguments, gathers the status for all boards in the lab.

47

With one or more named boards on the command line, restricts

48

reporting to just those boards.

"""

import argparse

Prathmesh Prabhu

2017-11-08 18:05:45 -0800

[diff] [blame]

54

import collections

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

55

import logging

56

import logging.handlers

57

import os

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

58

import re

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

import sys

import time

import common

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

63

from autotest_lib.client.bin import utils

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

64

from autotest_lib.client.common_lib import time_utils

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame^]

65

from autotest_lib.server import site_utils

J. Richard Barnette

a7c514e

2015-09-15 11:13:23 -0700

[diff] [blame]

66

from autotest_lib.server.cros.dynamic_suite import frontend_wrappers

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

67

from autotest_lib.server.hosts import servo_host

Aviv Keshet

7ee9586

2016-08-30 15:18:27 -0700

[diff] [blame]

68

from autotest_lib.server.lib import status_history

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

69

from autotest_lib.site_utils import gmail_lib

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

70

from autotest_lib.site_utils.suite_scheduler import constants

Prathmesh Prabhu

2017-11-08 18:05:45 -0800

[diff] [blame]

71

from autotest_lib.utils import labellib

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame^]

72

from chromite.lib import metrics

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

73

74

Richard Barnette

673573b

2016-12-12 09:46:39 -0800

[diff] [blame]

75

CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS

76

SPARE_POOL = constants.Pools.SPARE_POOL

77

MANAGED_POOLS = constants.Pools.MANAGED_POOLS

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

78

J. Richard Barnette

2016-03-17 17:03:57 -0700

[diff] [blame]

79

# _EXCLUDED_LABELS - A set of labels that disqualify a DUT from

Richard Barnette

eabcf39

2017-09-01 15:10:54 -0700

[diff] [blame]

80

# monitoring by this script. Currently, we're excluding these:

81

# + 'adb' - We're not ready to monitor Android or Brillo hosts.

82

# + 'board:guado_moblab' - These are maintained by a separate

83

# process that doesn't use this script.

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

84

Richard Barnette

eabcf39

2017-09-01 15:10:54 -0700

[diff] [blame]

85

_EXCLUDED_LABELS = {'adb', 'board:guado_moblab'}

J. Richard Barnette

2016-03-17 17:03:57 -0700

[diff] [blame]

86

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

87

# _DEFAULT_DURATION:

88

# Default value used for the --duration command line option.

89

# Specifies how far back in time to search in order to determine

90

# DUT status.

91

92

_DEFAULT_DURATION = 24

93

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

94

# _LOGDIR:

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame^]

95

# Relative path used in the calculation of the default setting for

96

# the --logdir option. The full path is relative to the root of the

97

# autotest directory, as determined from sys.argv[0].

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

98

# _LOGFILE:

99

# Basename of a file to which general log information will be

100

# written.

101

# _LOG_FORMAT:

102

# Format string for log messages.

103

104

_LOGDIR = os.path.join('logs', 'dut-data')

105

_LOGFILE = 'lab-inventory.log'

106

_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

107

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

108

# Pattern describing location-based host names in the Chrome OS test

109

# labs. Each DUT hostname designates the DUT's location:

110

# * A lab (room) that's physically separated from other labs

111

# (i.e. there's a door).

112

# * A row (or aisle) of DUTs within the lab.

113

# * A vertical rack of shelves on the row.

114

# * A specific host on one shelf of the rack.

115

116

_HOSTNAME_PATTERN = re.compile(

117

r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')

118

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

119

# Default entry for managed pools.

120

121

_MANAGED_POOL_DEFAULT = 'all_pools'

122

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame^]

123

# _REPAIR_LOOP_THRESHOLD:

124

# The number of repeated Repair tasks that must be seen to declare

125

# that a DUT is stuck in a repair loop.

126

127

_REPAIR_LOOP_THRESHOLD = 4

128

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

129

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame]

130

class _CachedHostJobHistories(object):

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

131

"""Maintains a set of `HostJobHistory` objects for a pool.

132

133

The collected history objects are nominally all part of a single

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

134

scheduling pool of DUTs. The collection maintains a list of

135

working DUTs, a list of broken DUTs, and a list of all DUTs.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

136

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

137

Performance note: Certain methods in this class are potentially

138

expensive:

139

* `get_working()`

140

* `get_working_list()`

141

* `get_broken()`

142

* `get_broken_list()`

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

143

* `get_idle()`

144

* `get_idle_list()`

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

145

The first time any one of these methods is called, it causes

146

multiple RPC calls with a relatively expensive set of database

147

queries. However, the results of the queries are cached in the

148

individual `HostJobHistory` objects, so only the first call

149

actually pays the full cost.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

150

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

151

Additionally, `get_working_list()`, `get_broken_list()` and

152

`get_idle_list()` cache their return values to avoid recalculating

153

lists at every call; this caching is separate from the caching of RPC

154

results described above.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

155

156

This class is deliberately constructed to delay the RPC cost

157

until the accessor methods are called (rather than to query in

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

158

`record_host()`) so that it's possible to construct a complete

159

`_LabInventory` without making the expensive queries at creation

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

160

time. `_populate_board_counts()`, below, assumes this behavior.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

"""

def __init__(self):

self._histories = []

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

166

self._working_list = None

167

self._broken_list = None

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

168

self._idle_list = None

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

169

170

171

def record_host(self, host_history):

172

"""Add one `HostJobHistory` object to the collection.

173

174

@param host_history The `HostJobHistory` object to be

175

remembered.

176

177

"""

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

178

self._working_list = None

179

self._broken_list = None

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

180

self._idle_list = None

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

181

self._histories.append(host_history)

182

183

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

184

def get_working_list(self):

185

"""Return a list of all working DUTs in the pool.

186

187

Filter `self._histories` for histories where the last

188

diagnosis is `WORKING`.

189

190

Cache the result so that we only cacluate it once.

191

192

@return A list of HostJobHistory objects.

193

194

"""

195

if self._working_list is None:

196

self._working_list = [h for h in self._histories

197

if h.last_diagnosis()[0] == status_history.WORKING]

198

return self._working_list

199

200

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

201

def get_working(self):

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

202

"""Return the number of working DUTs in the pool."""

203

return len(self.get_working_list())

204

205

206

def get_broken_list(self):

207

"""Return a list of all broken DUTs in the pool.

208

209

Filter `self._histories` for histories where the last

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

210

diagnosis is `BROKEN`.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

211

212

Cache the result so that we only cacluate it once.

213

214

@return A list of HostJobHistory objects.

215

216

"""

217

if self._broken_list is None:

218

self._broken_list = [h for h in self._histories

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

219

if h.last_diagnosis()[0] == status_history.BROKEN]

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

220

return self._broken_list

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

221

222

223

def get_broken(self):

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

224

"""Return the number of broken DUTs in the pool."""

225

return len(self.get_broken_list())

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

226

227

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

228

def get_idle_list(self):

229

"""Return a list of all idle DUTs in the pool.

230

231

Filter `self._histories` for histories where the last

232

diagnosis is `UNUSED` or `UNKNOWN`.

233

234

Cache the result so that we only cacluate it once.

235

236

@return A list of HostJobHistory objects.

237

238

"""

239

idle_list = [status_history.UNUSED, status_history.UNKNOWN]

240

if self._idle_list is None:

241

self._idle_list = [h for h in self._histories

242

if h.last_diagnosis()[0] in idle_list]

243

return self._idle_list

def get_idle(self):

"""Return the number of idle DUTs in the pool."""

248

return len(self.get_idle_list())

249

250

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

251

def get_total(self):

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

252

"""Return the total number of DUTs in the pool."""

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

253

return len(self._histories)

254

255

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame]

256

class _ManagedPoolsHostJobHistories(object):

257

"""Maintains a set of `HostJobHistory`s per managed pool.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

258

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame]

259

The collection maintains a count of working DUTs, a count of broken DUTs,

260

and a total count. The counts can be obtained either for a single pool, or

261

as a total across all pools.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

262

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame]

263

DUTs in the collection must be assigned to one of the pools in

264

`_MANAGED_POOLS`.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

265

266

The `get_working()` and `get_broken()` methods rely on the

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame]

267

methods of the same name in _CachedHostJobHistories, so the performance

268

note in _CachedHostJobHistories applies here as well.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

"""

def __init__(self):

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame]

273

self._histories_by_pool = {

274

pool: _CachedHostJobHistories() for pool in MANAGED_POOLS

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

275

}

276

277

def record_host(self, host_history):

278

"""Add one `HostJobHistory` object to the collection.

279

280

@param host_history The `HostJobHistory` object to be

281

remembered.

282

283

"""

J. Richard Barnette

3d0590a

2015-04-29 12:56:12 -0700

[diff] [blame]

284

pool = host_history.host_pool

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame]

285

self._histories_by_pool[pool].record_host(host_history)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

286

287

288

def _count_pool(self, get_pool_count, pool=None):

289

"""Internal helper to count hosts in a given pool.

290

291

The `get_pool_count` parameter is a function to calculate

292

the exact count of interest for the pool.

293

294

@param get_pool_count Function to return a count from a

295

_PoolCount object.

296

@param pool The pool to be counted. If `None`,

297

return the total across all pools.

298

299

"""

300

if pool is None:

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame]

301

return sum([get_pool_count(cached_history) for cached_history in

302

self._histories_by_pool.values()])

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

303

else:

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame]

304

return get_pool_count(self._histories_by_pool[pool])

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

305

306

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

307

def get_working_list(self):

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame]

308

"""Return a list of all working DUTs (across all pools).

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

309

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame]

310

Go through all HostJobHistory objects across all pools, selecting the

311

ones where the last diagnosis is `WORKING`.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

312

313

@return A list of HostJobHistory objects.

314

315

"""

316

l = []

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame]

317

for p in self._histories_by_pool.values():

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

318

l.extend(p.get_working_list())

return l

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

322

def get_working(self, pool=None):

323

"""Return the number of working DUTs in a pool.

324

325

@param pool The pool to be counted. If `None`, return the

326

total across all pools.

327

J. Richard Barnette

2015-10-20 17:58:30 -0700

[diff] [blame]

328

@return The total number of working DUTs in the selected

329

pool(s).

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

330

"""

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame]

331

return self._count_pool(_CachedHostJobHistories.get_working, pool)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

332

333

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

334

def get_broken_list(self):

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame]

335

"""Return a list of all broken DUTs (across all pools).

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

336

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame]

337

Go through all HostJobHistory objects in the across all pools,

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

338

selecting the ones where the last diagnosis is `BROKEN`.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

339

340

@return A list of HostJobHistory objects.

341

342

"""

343

l = []

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame]

344

for p in self._histories_by_pool.values():

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

345

l.extend(p.get_broken_list())

return l

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

349

def get_broken(self, pool=None):

350

"""Return the number of broken DUTs in a pool.

351

352

@param pool The pool to be counted. If `None`, return the

353

total across all pools.

354

J. Richard Barnette

2015-10-20 17:58:30 -0700

[diff] [blame]

355

@return The total number of broken DUTs in the selected pool(s).

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

356

"""

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame]

357

return self._count_pool(_CachedHostJobHistories.get_broken, pool)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

358

359

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

360

def get_idle_list(self, pool=None):

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame]

361

"""Return a list of all idle DUTs in the given pool.

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

362

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame]

363

Go through all HostJobHistory objects in the given pool, selecting the

364

ones where the last diagnosis is `UNUSED` or `UNKNOWN`.

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

365

366

@param pool: The pool to be counted. If `None`, return the total list

367

across all pools.

368

369

@return A list of HostJobHistory objects.

"""

if pool is None:

l = []

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame]

374

for p in self._histories_by_pool.values():

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

375

l.extend(p.get_idle_list())

376

return l

377

else:

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame]

378

return _CachedHostJobHistories.get_idle_list(

379

self._histories_by_pool[pool])

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

380

381

382

def get_idle(self, pool=None):

383

"""Return the number of idle DUTs in a pool.

384

385

@param pool: The pool to be counted. If `None`, return the total

386

across all pools.

387

388

@return The total number of idle DUTs in the selected pool(s).

389

"""

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame]

390

return self._count_pool(_CachedHostJobHistories.get_idle, pool)

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

391

392

J. Richard Barnette

2015-10-20 17:58:30 -0700

[diff] [blame]

393

def get_spares_buffer(self):

394

"""Return the the nominal number of working spares.

395

396

Calculates and returns how many working spares there would

397

be in the spares pool if all broken DUTs were in the spares

398

pool. This number may be negative, indicating a shortfall

399

in the critical pools.

400

401

@return The total number DUTs in the spares pool, less the total

402

number of broken DUTs in all pools.

403

"""

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

404

return self.get_total(SPARE_POOL) - self.get_broken()

J. Richard Barnette

2015-10-20 17:58:30 -0700

[diff] [blame]

405

406

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

407

def get_total(self, pool=None):

408

"""Return the total number of DUTs in a pool.

409

410

@param pool The pool to be counted. If `None`, return the

411

total across all pools.

412

J. Richard Barnette

2015-10-20 17:58:30 -0700

[diff] [blame]

413

@return The total number of DUTs in the selected pool(s).

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

414

"""

Prathmesh Prabhu

2017-11-08 17:04:24 -0800

[diff] [blame]

415

return self._count_pool(_CachedHostJobHistories.get_total, pool)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

416

417

Prathmesh Prabhu

2017-11-08 17:36:51 -0800

[diff] [blame]

418

class _LabInventory(object):
    """Collection of `HostJobHistory` objects for the Lab's inventory.

    Important attributes:
      histories: The list of `HostJobHistory` objects in the
                 inventory (hosts without a board are dropped).
      by_board:  A dict mapping board to
                 _ManagedPoolsHostJobHistories.
      by_model:  A dict mapping model to
                 _ManagedPoolsHostJobHistories.

    """

    @staticmethod
    def _eligible_host(afehost):
        """Return whether this host is eligible for monitoring.

        A host is eligible if it's in exactly one pool and it has no
        labels from the `_EXCLUDED_LABELS` set.

        @param afehost  The host to be tested for eligibility.
        """
        pools = [l for l in afehost.labels
                 if l.startswith(constants.Labels.POOL_PREFIX)]
        excluded = _EXCLUDED_LABELS.intersection(afehost.labels)
        return len(pools) == 1 and not excluded


    @classmethod
    def create_inventory(cls, afe, start_time, end_time, boardlist=None):
        """Return a Lab inventory with specified parameters.

        By default, gathers inventory from `HostJobHistory` objects
        for all DUTs in the `MANAGED_POOLS` list.  If `boardlist`
        is supplied, the inventory will be restricted to only the
        given boards.

        @param afe         AFE object for constructing the
                           `HostJobHistory` objects.
        @param start_time  Start time for the `HostJobHistory`
                           objects.
        @param end_time    End time for the `HostJobHistory`
                           objects.
        @param boardlist   List of boards to include.  If empty or
                           `None`, include all available boards.
        @return A `_LabInventory` object for the specified boards.

        """
        label_list = [constants.Labels.POOL_PREFIX + l
                      for l in MANAGED_POOLS]
        afehosts = afe.get_hosts(labels__name__in=label_list)
        if boardlist:
            # We're deliberately not checking host eligibility in this
            # code path.  This is a debug path, not used in production;
            # it may be useful to include ineligible hosts here.
            #
            # Filtering in a single pass against a set of labels
            # guarantees each host is included at most once, even if
            # a board name is repeated in `boardlist`.
            board_labels = {constants.Labels.BOARD_PREFIX + board
                            for board in boardlist}
            afehosts = [h for h in afehosts
                        if board_labels.intersection(h.labels)]
        else:
            afehosts = [h for h in afehosts if cls._eligible_host(h)]
        return cls([status_history.HostJobHistory(afe, host,
                                                  start_time, end_time)
                    for host in afehosts])


    def __init__(self, histories):
        # N.B. The query that finds our hosts is restricted to those
        # with a valid pool: label, but doesn't check for a valid
        # board: label.  In some (insufficiently) rare cases, the
        # AFE hosts table has been known to (incorrectly) have DUTs
        # with a pool: but no board: label.  We explicitly exclude
        # those here.
        histories = [h for h in histories
                     if h.host_board is not None]
        self.histories = histories
        self._dut_count = len(histories)
        # Per-pool caches for get_managed_boards() and
        # get_managed_models(), keyed by pool name.
        self._managed_boards = {}
        self._managed_models = {}
        self.by_board = self._classify_by_label_type('board')
        self.by_model = self._classify_by_label_type('model')


    def _classify_by_label_type(self, label_key):
        """Classify histories by labels with the given key.

        Histories whose host has no label with the given key are
        left out of the result.

        @returns a dict mapping labels with the given key to
                 _ManagedPoolsHostJobHistories for DUTs with that label.
        """
        classified = collections.defaultdict(_ManagedPoolsHostJobHistories)
        for h in self.histories:
            labels = labellib.LabelsMapping(h.host.labels)
            if label_key in labels:
                classified[labels[label_key]].record_host(h)
        return dict(classified)


    def _get_managed(self, cache, classified, pool):
        """Return the cached set of managed keys of `classified`.

        The set is stored in `cache` only once it has been fully
        calculated, so an exception part way through never leaves a
        partial result behind for later calls.

        @param cache       Dict mapping pool name to the set already
                           calculated for that pool.
        @param classified  Dict mapping a board or model to its
                           histories (`by_board` or `by_model`).
        @param pool        The specified pool.
        @return The set of keys of `classified` for which
                `_is_managed()` is true.
        """
        if cache.get(pool) is None:
            cache[pool] = set(
                    name for name, histories in classified.items()
                    if self._is_managed(pool, histories))
        return cache[pool]


    def get_managed_boards(self, pool=_MANAGED_POOL_DEFAULT):
        """Return the set of "managed" boards.

        @param pool: The specified pool for managed boards.
        @return A set of all the boards that have both spare and
                non-spare pools, unless the pool is specified,
                then the set of boards in that pool.
        """
        return self._get_managed(self._managed_boards, self.by_board, pool)


    def get_managed_models(self, pool=_MANAGED_POOL_DEFAULT):
        """Return the set of "managed" models.

        @param pool: The specified pool for managed models.
        @return A set of all the models that have both spare and
                non-spare pools, unless the pool is specified,
                then the set of models in that pool.
        """
        return self._get_managed(self._managed_models, self.by_model, pool)


    def _is_managed(self, pool, histories):
        """Determine if the given histories contain DUTs to be managed for pool.

        Operationally, saying a board is "managed" means that the
        board will be included in the "board" and "repair
        recommendations" reports.  That is, if there are failures in
        the board's inventory then lab techs will be asked to fix
        them without a separate ticket.

        For purposes of implementation, a board is "managed" if it
        has DUTs in both the spare and a non-spare (i.e. critical)
        pool.

        @param pool       The pool of interest; `_MANAGED_POOL_DEFAULT`
                          selects the spare/non-spare rule above.
        @param histories  The `_ManagedPoolsHostJobHistories` to be
                          tested.
        """
        # Get the counts for all pools, otherwise get it for the
        # specified pool.
        if pool == _MANAGED_POOL_DEFAULT:
            spares = histories.get_total(SPARE_POOL)
            total = histories.get_total()
            return spares != 0 and spares != total
        else:
            return histories.get_total(pool) != 0


    def get_num_duts(self):
        """Return the total number of DUTs in the inventory."""
        return self._dut_count


    def get_num_boards(self):
        """Return the total number of boards in the inventory."""
        return len(self.by_board)


    def get_num_models(self):
        """Return the total number of models in the inventory."""
        return len(self.by_model)

583

584

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

585

def _sort_by_location(inventory_list):

586

"""Return a list of DUTs, organized by location.

587

588

Take the given list of `HostJobHistory` objects, separate it

589

into a list per lab, and sort each lab's list by location. The

590

order of sorting within a lab is

591

* By row number within the lab,

592

* then by rack number within the row,

593

* then by host shelf number within the rack.

594

595

Return a list of the sorted lists.

596

597

Implementation note: host locations are sorted by converting

598

each location into a base 100 number. If row, rack or

599

host numbers exceed the range [0..99], then sorting will

600

break down.

601

602

@return A list of sorted lists of DUTs.

"""

BASE = 100

lab_lists = {}

for history in inventory_list:

608

location = _HOSTNAME_PATTERN.match(history.host.hostname)

609

if location:

610

lab = location.group(1)

611

key = 0

612

for idx in location.group(2, 3, 4):

613

key = BASE * key + int(idx)

614

lab_lists.setdefault(lab, []).append((key, history))

615

return_list = []

616

for dut_list in lab_lists.values():

617

dut_list.sort(key=lambda t: t[0])

618

return_list.append([t[1] for t in dut_list])

return return_list

def _score_repair_set(buffer_counts, repair_list):
    """Rate a proposed set of DUT repairs with a single number.

    The function works out what each board's spares buffer would be
    once the DUTs in `repair_list` were repaired, and then scores
    that outcome from two numbers:
      * The worst-case buffer count over all boards (higher is
        better).  This is the more significant number.
      * The number of boards sitting at that worst case (lower is
        better).  This is the less significant number.

    Implementation note:  The score could fail to reflect the
    intended criteria if there are more than 1000 boards in the
    inventory.

    @param buffer_counts  A dictionary mapping board names to the
                          size of the board's spares buffer.
    @param repair_list    A list of DUTs to be repaired.
    @return A numeric score.
    """
    _NBOARDS = 1000
    # Only the per-board totals of the repair set matter here; board
    # names are dropped because they don't contribute to the score.
    repaired = _LabInventory(repair_list).by_board
    new_counts = [
        count + (repaired[board].get_total() if board in repaired else 0)
        for board, count in buffer_counts.items()]
    # Single pass to find the smallest post-repair buffer and how many
    # boards share it.
    lowest = new_counts[0]
    ties = 0
    for count in new_counts:
        if count < lowest:
            lowest = count
            ties = 1
        elif count == lowest:
            ties += 1
    return _NBOARDS * lowest - ties

672

673

674

def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Returns a message recommending a list of broken DUTs to be
    repaired.  The list of DUTs is selected based on these
    criteria:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical
        proximity.
      * DUTs for boards with a low spares buffer are more important
        than DUTs with larger buffers.

    The algorithm used will guarantee that at least one DUT from a
    board with the smallest spares buffer will be recommended.  If
    the worst spares buffer number is shared by more than one board,
    the algorithm will tend to prefer repair sets that include more
    of those boards over sets that cover fewer boards.

    If no broken DUT can be placed in a lab (there are no broken
    DUTs, or `_sort_by_location` returns nothing for them), the
    message contains the header and a placeholder line instead of a
    DUT list.

    @param inventory      Inventory for generating recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation message.
    """
    logging.debug('Creating DUT repair recommendations')
    board_buffer_counts = {}
    broken_list = []
    for board in inventory.get_managed_boards():
        logging.debug('Listing failed DUTs for %s', board)
        counts = inventory.by_board[board]
        if counts.get_broken() != 0:
            board_buffer_counts[board] = counts.get_spares_buffer()
            broken_list.extend(counts.get_broken_list())
    # N.B. The logic inside this loop may seem complicated, but
    # simplification is hard:
    #   * Calculating an initial recommendation outside of
    #     the loop likely would make things more complicated,
    #     not less.
    #   * It's necessary to calculate an initial lab slice once per
    #     lab _before_ the while loop, in case the number of broken
    #     DUTs in a lab is less than `num_recommend`.
    recommendation = None
    best_score = None
    for lab_duts in _sort_by_location(broken_list):
        # Slide a window of `num_recommend` DUTs across the lab's
        # location-sorted list, keeping the best-scoring window.
        start = 0
        end = num_recommend
        lab_slice = lab_duts[start:end]
        lab_score = _score_repair_set(board_buffer_counts,
                                      lab_slice)
        while end < len(lab_duts):
            start += 1
            end += 1
            new_slice = lab_duts[start:end]
            new_score = _score_repair_set(board_buffer_counts,
                                          new_slice)
            if new_score > lab_score:
                lab_slice = new_slice
                lab_score = new_score
        if recommendation is None or lab_score > best_score:
            recommendation = lab_slice
            best_score = lab_score
    # N.B. The trailing space here is mandatory:  Without it, Gmail
    # will parse the URL wrong.  Don't ask.  If you simply _must_
    # know more, go try it yourself...
    line_fmt = '%-30s %-16s %-6s\n %s '
    message = ['Repair recommendations:\n',
               line_fmt % ('Hostname', 'Board', 'Servo?', 'Logs URL')]
    if not recommendation:
        # `recommendation` is still None when the loop above never
        # ran; iterating over it would raise TypeError.
        message.append('(No DUTs to repair)')
        return '\n'.join(message)
    for h in recommendation:
        servo_name = servo_host.make_servo_hostname(h.host.hostname)
        servo_present = utils.host_is_in_lab_zone(servo_name)
        _, event = h.last_diagnosis()
        line = line_fmt % (
                h.host.hostname, h.host_board,
                'Yes' if servo_present else 'No', event.job_url)
        message.append(line)
    return '\n'.join(message)

749

750

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

751

def _generate_board_inventory_message(inventory):

752

"""Generate the "board inventory" e-mail message.

753

754

The board inventory is a list by board summarizing the number

755

of working and broken DUTs, and the total shortfall or surplus

756

of working devices relative to the minimum critical pool

757

requirement.

758

759

The report omits boards with no DUTs in the spare pool or with

760

no DUTs in a critical pool.

761

762

N.B. For sample output text formattted as users can expect to

763

see it in e-mail and log files, refer to the unit tests.

764

765

@param inventory _LabInventory object with the inventory to

766

be reported on.

767

@return String with the inventory message to be sent.

768

769

"""

770

logging.debug('Creating board inventory')

J. Richard Barnette

2015-10-14 11:20:49 -0700

[diff] [blame]

771

nworking = 0

772

nbroken = 0

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

773

nidle = 0

J. Richard Barnette

2015-10-14 11:20:49 -0700

[diff] [blame]

774

nbroken_boards = 0

J. Richard Barnette

ea5a4ba

2016-02-18 16:34:50 -0800

[diff] [blame]

775

ntotal_boards = 0

J. Richard Barnette

2015-10-20 17:58:30 -0700

[diff] [blame]

776

summaries = []

777

for board in inventory.get_managed_boards():

Prathmesh Prabhu

2017-11-08 17:36:51 -0800

[diff] [blame]

778

counts = inventory.by_board[board]

Richard Barnette

254d5b4

2016-07-06 19:13:23 -0700

[diff] [blame]

779

logging.debug('Counting %2d DUTS for board %s',

780

counts.get_total(), board)

J. Richard Barnette

2015-10-20 17:58:30 -0700

[diff] [blame]

781

# Summary elements laid out in the same order as the text

782

# headers:

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

783

# Board Avail Bad Idle Good Spare Total

784

# e[0] e[1] e[2] e[3] e[4] e[5] e[6]

J. Richard Barnette

2015-10-20 17:58:30 -0700

[diff] [blame]

785

element = (board,

786

counts.get_spares_buffer(),

787

counts.get_broken(),

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

788

counts.get_idle(),

J. Richard Barnette

2015-10-20 17:58:30 -0700

[diff] [blame]

789

counts.get_working(),

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

790

counts.get_total(SPARE_POOL),

J. Richard Barnette

2015-10-20 17:58:30 -0700

[diff] [blame]

791

counts.get_total())

J. Richard Barnette

ea5a4ba

2016-02-18 16:34:50 -0800

[diff] [blame]

792

if element[2]:

793

summaries.append(element)

794

nbroken_boards += 1

795

ntotal_boards += 1

J. Richard Barnette

2015-10-20 17:58:30 -0700

[diff] [blame]

796

nbroken += element[2]

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

797

nidle += element[3]

798

nworking += element[4]

799

ntotal = nworking + nbroken + nidle

J. Richard Barnette

2015-10-20 17:58:30 -0700

[diff] [blame]

800

summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))

J. Richard Barnette

2015-10-14 11:20:49 -0700

[diff] [blame]

801

broken_percent = int(round(100.0 * nbroken / ntotal))

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

802

idle_percent = int(round(100.0 * nidle / ntotal))

803

working_percent = 100 - broken_percent - idle_percent

J. Richard Barnette

2015-10-14 11:20:49 -0700

[diff] [blame]

804

message = ['Summary of DUTs in inventory:',

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

805

'%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),

806

'%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (

J. Richard Barnette

2015-10-14 11:20:49 -0700

[diff] [blame]

807

nbroken, broken_percent,

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

808

nidle, idle_percent,

J. Richard Barnette

2015-10-14 11:20:49 -0700

[diff] [blame]

809

nworking, working_percent,

810

ntotal),

811

'',

812

'Boards with failures: %d' % nbroken_boards,

J. Richard Barnette

ea5a4ba

2016-02-18 16:34:50 -0800

[diff] [blame]

813

'Boards in inventory: %d' % ntotal_boards,

J. Richard Barnette

2015-10-14 11:20:49 -0700

[diff] [blame]

814

'', '',

815

'Full board inventory:\n',

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

816

'%-22s %5s %5s %5s %5s %5s %5s' % (

817

'Board', 'Avail', 'Bad', 'Idle', 'Good',

J. Richard Barnette

2015-10-20 17:58:30 -0700

[diff] [blame]

818

'Spare', 'Total')]

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

819

message.extend(

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

820

['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

821

return '\n'.join(message)

822

823

J. Richard Barnette

4845fcf

2015-04-20 14:26:25 -0700

[diff] [blame]

824

_POOL_INVENTORY_HEADER = '''\

Aviv Keshet

056d74c

2015-07-14 09:18:43 -0700

[diff] [blame]

825

Notice to Infrastructure deputies: All boards shown below are at

J. Richard Barnette

c9a143c

2015-06-04 11:11:19 -0700

[diff] [blame]

826

less than full strength, please take action to resolve the issues.

827

Once you're satisified that failures won't recur, failed DUTs can

828

be replaced with spares by running `balance_pool`. Detailed

829

instructions can be found here:

J. Richard Barnette

4845fcf

2015-04-20 14:26:25 -0700

[diff] [blame]

830

http://go/cros-manage-duts

'''

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

834

def _generate_pool_inventory_message(inventory):
    """Build the text of the "pool inventory" e-mail.

    For each pool in `CRITICAL_POOLS` the message carries a table,
    one row per board, of broken, idle, working and total DUT
    counts in that pool.  Boards with neither broken nor idle DUTs
    in the pool are omitted; a pool with no remaining rows gets a
    placeholder line instead.  Rows are ordered by descending
    broken count.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating pool inventory')
    lines = [_POOL_INVENTORY_HEADER]
    for index, pool in enumerate(CRITICAL_POOLS):
        # Every pool section after the first is preceded by an
        # extra newline.
        separator = '\n' if index else ''
        lines.append(
            '%sStatus for pool:%s, by board:' % (separator, pool))
        lines.append(
            '%-20s %5s %5s %5s %5s' % (
                'Board', 'Bad', 'Idle', 'Good', 'Total'))
        rows = []
        for board, counts in inventory.by_board.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), board, pool)
            broken = counts.get_broken(pool)
            idle = counts.get_idle(pool)
            # boards at full strength are not reported
            if broken != 0 or idle != 0:
                rows.append((board, broken, idle,
                             counts.get_working(pool),
                             counts.get_total(pool)))
        if not rows:
            lines.append('(All boards at full strength)')
        else:
            rows.sort(key=lambda row: -row[1])
            lines.extend('%-20s %5d %5d %5d %5d' % row for row in rows)
    return '\n'.join(lines)

878

879

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

880

_IDLE_INVENTORY_HEADER = '''\

881

Notice to Infrastructure deputies: The hosts shown below haven't

882

run any jobs for at least 24 hours. Please check each host; locked

883

hosts should normally be unlocked; stuck jobs should normally be

aborted.

'''

def _generate_idle_inventory_message(inventory):
    """Build the text of the "idle inventory" e-mail.

    The message lists, for every pool in `MANAGED_POOLS` and every
    board in the inventory, the hostname, board and pool of each DUT
    returned by the board's `get_idle_list(pool)`.  If there are no
    such DUTs, a placeholder line is emitted instead.

    N.B. For sample output text format as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating idle inventory')
    row_fmt = '%-30s %-20s %s'
    report = [_IDLE_INVENTORY_HEADER,
              'Idle Host List:',
              row_fmt % ('Hostname', 'Board', 'Pool')]
    idle_rows = []
    for pool in MANAGED_POOLS:
        for board, counts in inventory.by_board.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), board, pool)
            for dut in counts.get_idle_list(pool):
                idle_rows.append(
                    row_fmt % (dut.host.hostname, board, pool))
    report.extend(idle_rows or ['(No idle DUTs)'])
    return '\n'.join(report)

918

919

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

920

def _send_email(arguments, tag, subject, recipients, body):

921

"""Send an inventory e-mail message.

922

923

The message is logged in the selected log directory using `tag`

924

for the file name.

925

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

926

If the --debug option was requested, the message is neither

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

927

logged nor sent, but merely printed on stdout.

928

929

@param arguments Parsed command-line options.

930

@param tag Tag identifying the inventory for logging

931

purposes.

932

@param subject E-mail Subject: header line.

933

@param recipients E-mail addresses for the To: header line.

934

@param body E-mail message body.

935

936

"""

937

logging.debug('Generating email: "%s"', subject)

938

all_recipients = ', '.join(recipients)

939

report_body = '\n'.join([

940

'To: %s' % all_recipients,

941

'Subject: %s' % subject,

942

'', body, ''])

J. Richard Barnette

2015-10-13 16:02:47 -0700

[diff] [blame]

943

if arguments.debug:

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

944

print report_body

945

else:

946

filename = os.path.join(arguments.logdir, tag)

947

try:

948

report_file = open(filename, 'w')

949

report_file.write(report_body)

950

report_file.close()

951

except EnvironmentError as e:

952

logging.error('Failed to write %s: %s', filename, e)

953

try:

954

gmail_lib.send_email(all_recipients, subject, body)

955

except Exception as e:

956

logging.error('Failed to send e-mail to %s: %s',

all_recipients, e)

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

960

def _populate_board_counts(inventory):

961

"""Gather board counts while providing interactive feedback.

962

963

Gathering the status of all individual DUTs in the lab can take

964

considerable time (~30 minutes at the time of this writing).

965

966

Normally, we pay that cost by querying as we go. However, with

967

the `--debug` option, we expect a human being to be watching the

968

progress in real time. So, we force the first (expensive)

969

queries to happen up front, and provide simple ASCII output

970

(without using logging) to show a progress bar and results.

971

972

@param inventory _LabInventory object with the inventory to

be gathered.

"""

n = 0

total_broken = 0

for counts in inventory.by_board.itervalues():

n += 1

if n % 10 == 5:

c = '+'

elif n % 10 == 0:

c = '%d' % ((n / 10) % 10)

else:

c = '.'

sys.stdout.write(c)

sys.stdout.flush()

# This next call is where all the time goes - it forces all

989

# of a board's HostJobHistory objects to query the database

990

# and cache their results.

991

total_broken += counts.get_broken()

992

sys.stdout.write('\n')

993

sys.stdout.write('Found %d broken DUTs\n' % total_broken)

994

995

996

def _perform_board_inventory(arguments, inventory, timestamp):
    """Produce and deliver the board inventory report.

    The report body is made up of:
      * A list of DUTs that are recommended to be repaired.
        This part is optional, and only appears if the `--recommend`
        option is present.
      * A list of all boards that have failed DUTs, with counts
        of working, broken, and spare DUTs, among others.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  _LabInventory object with the inventory to
                      be reported.
    @param timestamp  A string used to identify this run's timestamp
                      in logs and email output.
    """
    recommend_message = ''
    if arguments.recommend:
        recommend_message = '%s\n\n\n' % _generate_repair_recommendation(
                inventory, arguments.recommend)
    report = recommend_message + _generate_board_inventory_message(
            inventory)
    _send_email(arguments,
                'boards-%s.txt' % timestamp,
                'DUT board inventory %s' % timestamp,
                arguments.board_notify,
                report)

1024

1025

1026

def _perform_pool_inventory(arguments, inventory, timestamp):
    """Produce and deliver the pool inventory report.

    The report body is made up of:
      * A list of all critical pools that have failed DUTs, with
        counts of working, broken, and idle DUTs.
      * A list of all idle DUTs by hostname including the board and
        pool.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  _LabInventory object with the inventory to be
                      reported.
    @param timestamp  A string used to identify this run's timestamp in
                      logs and email output.
    """
    sections = [_generate_pool_inventory_message(inventory),
                _generate_idle_inventory_message(inventory)]
    _send_email(arguments,
                'pools-%s.txt' % timestamp,
                'DUT pool inventory %s' % timestamp,
                arguments.pool_notify,
                '\n\n\n'.join(sections))

1049

1050

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame^]

1051

def _dut_in_repair_loop(history):

1052

"""Return whether a DUT's history indicates a repair loop.

1053

1054

A DUT is considered looping if it runs no tests, and no tasks pass

1055

other than repair tasks.

1056

1057

@param history An instance of `status_history.HostJobHistory` to be

1058

scanned for a repair loop. The caller guarantees

1059

that this history corresponds to a working DUT.

1060

@returns Return a true value if the DUT's most recent history

1061

indicates a repair loop.

1062

"""

1063

# Our caller passes only histories for working DUTs; that means

1064

# we've already paid the cost of fetching the diagnosis task, and

1065

# we know that the task was successful. The diagnosis task will be

1066

# one of the tasks we must scan to find a loop, so if the task isn't

1067

# a repair task, then our history includes a successful non-repair

1068

# task, and we're not looping.

1069

#

1070

# The for loop below is very expensive, because it must fetch the

1071

# full history, regardless of how many tasks we examine. At the

1072

# time of this writing, this check against the diagnosis task

1073

# reduces the cost of finding loops in the full inventory from hours

1074

# to minutes.

1075

if history.last_diagnosis()[1].name != 'Repair':

return False

repair_ok_count = 0

for task in history:

if not task.is_special:

1080

# This is a test, so we're not looping.

1081

return False

1082

if task.diagnosis == status_history.BROKEN:

1083

# Failed a repair, so we're not looping.

1084

return False

1085

if (task.diagnosis == status_history.WORKING

1086

and task.name != 'Repair'):

1087

# Non-repair task succeeded, so we're not looping.

1088

return False

1089

# At this point, we have either a failed non-repair task, or

1090

# a successful repair.

1091

if task.name == 'Repair':

1092

repair_ok_count += 1

1093

if repair_ok_count >= _REPAIR_LOOP_THRESHOLD:

return True

def _perform_repair_loop_report(arguments, inventory):
    """Report every working DUT that looks stuck in a repair loop.

    This routine walks the working DUTs of every board in the given
    inventory.  For each one that `_dut_in_repair_loop()` flags, it
    logs the DUT and sets a boolean presence metric carrying the
    DUT's hostname, board and pool as fields.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  _LabInventory object with the inventory to be
                      reported.
    """
    loop_presence = metrics.BooleanMetric(
            'chromeos/autotest/inventory/repair_loops',
            'DUTs stuck in repair loops')
    logging.info('Scanning for DUTs in repair loops.')
    for counts in inventory.by_board.itervalues():
        for history in counts.get_working_list():
            # Managed DUTs with names that don't match
            # _HOSTNAME_PATTERN shouldn't be possible.  However, we
            # don't want arbitrary strings being attached to the
            # 'dut_hostname' field, so for safety, we exclude all
            # anomalies before doing the (expensive) loop check.
            if not (_HOSTNAME_PATTERN.match(history.hostname)
                    and _dut_in_repair_loop(history)):
                continue
            fields = {
                'dut_hostname': history.hostname,
                'board': history.host_board,
                'pool': history.host_pool,
            }
            logging.info('Looping DUT: %(dut_hostname)s, '
                         'board: %(board)s, pool: %(pool)s',
                         fields)
            loop_presence.set(True, fields=fields)

1130

1131

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

1132

def _log_startup(arguments, startup_time):

1133

"""Log the start of this inventory run.

1134

1135

Print various log messages indicating the start of the run. Return

1136

a string based on `startup_time` that will be used to identify this

1137

run in log files and e-mail messages.

1138

1139

@param startup_time A UNIX timestamp marking the moment when

1140

this inventory run began.

1141

@returns A timestamp string that will be used to identify this run

1142

in logs and email output.

1143

"""

1144

timestamp = time.strftime('%Y-%m-%d.%H',

1145

time.localtime(startup_time))

1146

logging.debug('Starting lab inventory for %s', timestamp)

1147

if arguments.board_notify:

1148

if arguments.recommend:

1149

logging.debug('Will include repair recommendations')

1150

logging.debug('Will include board inventory')

1151

if arguments.pool_notify:

1152

logging.debug('Will include pool inventory')

return timestamp

def _create_inventory(arguments, end_time):
    """Build the `_LabInventory` instance used for all reports.

    The inventory covers the `arguments.duration` hours leading up
    to `end_time`, restricted to `arguments.boardnames`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    @param end_time   A UNIX timestamp for the end of the time range
                      to be searched in this inventory run.
    @return The newly created `_LabInventory` object.
    """
    seconds_back = arguments.duration * 60 * 60
    start_time = end_time - seconds_back
    afe = frontend_wrappers.RetryingAFE(server=None)
    lab_inventory = _LabInventory.create_inventory(
            afe, start_time, end_time, arguments.boardnames)
    logging.info('Found %d hosts across %d boards',
                 lab_inventory.get_num_duts(),
                 lab_inventory.get_num_boards())
    return lab_inventory

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame^]

1172

def _perform_inventory_reports(arguments):
    """Run every inventory check requested on the command line.

    Creates the inventory once, then runs the board report, the pool
    report and the repair loop report, in that order, each only if
    the parsed arguments call for it.  With `--debug`, the board
    counts are populated up front with progress output.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    """
    started = time.time()
    run_id = _log_startup(arguments, started)
    lab_inventory = _create_inventory(arguments, started)
    if arguments.debug:
        _populate_board_counts(lab_inventory)
    if arguments.board_notify:
        _perform_board_inventory(arguments, lab_inventory, run_id)
    if arguments.pool_notify:
        _perform_pool_inventory(arguments, lab_inventory, run_id)
    if arguments.repair_loops:
        _perform_repair_loop_report(arguments, lab_inventory)

1192

1193

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1194

def _separate_email_addresses(address_list):

1195

"""Parse a list of comma-separated lists of e-mail addresses.

1196

1197

@param address_list A list of strings containing comma

1198

separate e-mail addresses.

1199

@return A list of the individual e-mail addresses.

"""

newlist = []

for arg in address_list:

1204

newlist.extend([email.strip() for email in arg.split(',')])

return newlist

def _verify_arguments(arguments):
    """Check the parsed command-line arguments for consistency.

    The comma separated e-mail addresses given to `--board-notify`
    and `--pool-notify`, possibly spread over several option
    arguments, are first merged into one flat list per option.

    For non-debug uses, notification must be requested for at least
    one report.  For debug, if notification isn't specified, it is
    treated as "run all the reports."

    The return value indicates success or failure; in the case of
    failure, an error message is also written to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.
    """
    arguments.board_notify = _separate_email_addresses(
            arguments.board_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)
    if arguments.board_notify or arguments.pool_notify:
        return True
    if arguments.debug:
        # We want to run all the reports.  An empty notify list
        # will cause a report to be skipped, so make sure the
        # lists are non-empty.
        arguments.board_notify = ['']
        arguments.pool_notify = ['']
        return True
    sys.stderr.write('Must specify at least one of '
                     '--board-notify or --pool-notify\n')
    return False

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1243

1244

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

1245

def _get_default_logdir(script):
    """Work out the default directory for the `--logdir` option.

    The default is the `_LOGDIR` entry under the parent of the
    directory that contains this script.

    @param script  Path to this script file.
    @return A path to a directory.
    """
    script_dir = os.path.dirname(os.path.abspath(script))
    return os.path.join(os.path.dirname(script_dir), _LOGDIR)

1258

1259

1260

def _parse_command(argv):
    """Parse and validate the command line.

    Builds an argument parser for this command's syntax, parses
    `argv[1:]` with it, and checks the result with
    `_verify_arguments()`.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args(), or None
            if the arguments fail verification.
    """
    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Gather and report lab inventory statistics')
    add_option = parser.add_argument
    add_option('-d', '--duration', type=int, metavar='HOURS',
               default=_DEFAULT_DURATION,
               help=('number of hours back to search for status'
                     ' (default: %d)' % _DEFAULT_DURATION))
    add_option('--board-notify', action='append', metavar='ADDRESS',
               default=[],
               help=('Generate board inventory message, '
                     'and send it to the given e-mail address(es)'))
    add_option('--pool-notify', action='append', metavar='ADDRESS',
               default=[],
               help=('Generate pool inventory message, '
                     'and send it to the given address(es)'))
    add_option('-r', '--recommend', type=int, default=None,
               help=('Specify how many DUTs should be '
                     'recommended for repair (default: no '
                     'recommendation)'))
    add_option('--repair-loops', action='store_true',
               help='Check for devices stuck in repair loops.')
    add_option('--debug', action='store_true',
               help=('Print e-mail messages on stdout '
                     'without sending them.'))
    add_option('--logdir', default=_get_default_logdir(argv[0]),
               help='Directory where logs will be written.')
    add_option('boardnames', nargs='*', metavar='BOARD',
               help=('names of boards to report on '
                     '(default: all boards)'))
    arguments = parser.parse_args(argv[1:])
    return arguments if _verify_arguments(arguments) else None

def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--debug` option was provided on
    the command line.
      * Without the option, we configure the logging to capture all
        potentially relevant events in a log file.  The log file is
        configured to rotate once a week (`when='W4'`), keeping 13
        backups, i.e. ~3 months worth of history.  The log directory
        is created if it doesn't already exist.
      * With the option, we expect stdout to contain other
        human-readable output (including the contents of the e-mail
        messages), so we restrict the output to INFO level.

    For convenience, when `--debug` is on, the logging format has
    no adornments, so that a call like `logging.info(msg)` simply writes
    `msg` to stdout, plus a trailing newline.

    On return, the root logger has exactly one handler:  the one
    created here.  Any handlers previously installed on the root
    logger are removed.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    """
    root_logger = logging.getLogger()
    if arguments.debug:
        root_logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter())
    else:
        if not os.path.exists(arguments.logdir):
            os.mkdir(arguments.logdir)
        root_logger.setLevel(logging.DEBUG)
        logfile = os.path.join(arguments.logdir, _LOGFILE)
        handler = logging.handlers.TimedRotatingFileHandler(
                logfile, when='W4', backupCount=13)
        formatter = logging.Formatter(_LOG_FORMAT,
                                      time_utils.TIME_FMT)
        handler.setFormatter(formatter)
    # TODO(jrbarnette) This is gross.  Importing client.bin.utils
    # implicitly imported logging_config, which calls
    # logging.basicConfig() *at module level*.  That gives us an
    # extra logging handler that we don't want.  So, clear out all
    # the handlers here.
    #
    # Iterate over a snapshot of the list:  removeHandler() mutates
    # `root_logger.handlers` in place, and removing items from a list
    # while iterating over it skips every other entry.
    for h in list(root_logger.handlers):
        root_logger.removeHandler(h)
    root_logger.addHandler(handler)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1351

1352

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1353

def main(argv):
    """Standard main routine.

    Parses the command line, sets up logging, and then runs the
    inventory reports.  Exits with status 1 if command line parsing
    yields no usable arguments.  Unless `--debug` was given, the
    reports run inside a `site_utils.SetupTsMonGlobalState` context.

    A keyboard interrupt ends the run silently; any other exception
    raised while reporting is logged rather than propagated.

    @param argv Command line arguments, including `sys.argv[0]`.
    """
    options = _parse_command(argv)
    if not options:
        sys.exit(1)
    _configure_logging(options)
    try:
        if arguments_debug(options) if False else options.debug:
            # Debug runs skip metrics setup entirely.
            _perform_inventory_reports(options)
        else:
            with site_utils.SetupTsMonGlobalState(
                    'repair_loops', short_lived=True, auto_flush=False):
                _perform_inventory_reports(options)
    except KeyboardInterrupt:
        pass
    except EnvironmentError as err:
        logging.exception('Unexpected OS error: %s', err)
    except Exception as err:
        logging.exception('Unexpected exception: %s', err)

1375

1376

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

1377

def get_inventory(afe):
    """Create a lab inventory covering the most recent 24 hours.

    The time window ends at the current time (truncated to whole
    seconds) and starts exactly one day earlier.

    @param afe  Passed through unchanged to
                `_LabInventory.create_inventory()`; presumably an AFE
                RPC client object -- confirm against callers.
    @return Whatever `_LabInventory.create_inventory()` returns for
            the given `afe` and time window.
    """
    seconds_per_day = 24 * 60 * 60
    now = int(time.time())
    return _LabInventory.create_inventory(
            afe, now - seconds_per_day, now)

1381

1382

1383

def get_managed_boards(afe):
    """Return the managed boards from a freshly created inventory.

    Builds an inventory with `get_inventory()` and returns the result
    of calling `get_managed_boards()` on it.

    @param afe  Passed through unchanged to `get_inventory()`.
    @return The value of the inventory's `get_managed_boards()`.
    """
    inventory = get_inventory(afe)
    return inventory.get_managed_boards()

J. Richard Barnette

aa86893

2015-10-23 13:28:59 -0700

[diff] [blame]

1385

1386

J. Richard Barnette