Blame - site_utils/lab_inventory.py - platform/external/autotest

2015-03-27 17:23:52 -0700

[diff] [blame]

1

#!/usr/bin/env python

2

3

# Use of this source code is governed by a BSD-style license that can be

4

# found in the LICENSE file.

5

6

"""Create e-mail reports of the Lab's DUT inventory.

7

8

Gathers a list of all DUTs of interest in the Lab, segregated by

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

9

model and pool, and determines whether each DUT is working or

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

10

broken. Then, send one or more e-mail reports summarizing the

11

status to e-mail addresses provided on the command line.

12

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

13

usage: lab_inventory.py [ options ] [ model ... ]

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

14

15

Options:

16

--duration / -d <hours>

17

How far back in time to search job history to determine DUT

18

status.

19

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

20

--model-notify <address>[,<address>]

21

Send the "model status" e-mail to all the specified e-mail

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

22

addresses.

23

24

--pool-notify <address>[,<address>]

25

Send the "pool status" e-mail to all the specified e-mail

26

addresses.

27

J. Richard Barnette

2015-06-09 10:06:17 -0700

[diff] [blame]

28

--recommend <number>

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

29

When generating the "model status" e-mail, include a list of

J. Richard Barnette

2015-06-09 10:06:17 -0700

[diff] [blame]

30

<number> specific DUTs to be recommended for repair.

31

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

32

--report-untestable

33

Scan the inventory for DUTs that can't test because they're stuck in

34

repair loops, or because the scheduler can't give them work.

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

35

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

36

--logdir <directory>

37

Log progress and actions in a file under this directory. Text

38

of any e-mail sent will also be logged in a timestamped file in

39

this directory.

40

J. Richard Barnette

2015-10-13 16:02:47 -0700

[diff] [blame]

41

--debug

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

42

Suppress all logging, metrics reporting, and sending e-mail.

43

Instead, write the output that would be generated onto stdout.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

44

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

45

<model> arguments:

46

With no arguments, gathers the status for all models in the lab.

47

With one or more named models on the command line, restricts

48

reporting to just those models.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

"""

import argparse

Prathmesh Prabhu

021e784

2017-11-08 18:05:45 -0800

[diff] [blame]

54

import collections

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

55

import logging

56

import logging.handlers

57

import os

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

58

import re

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

import sys

import time

import common

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

63

from autotest_lib.client.bin import utils

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

64

from autotest_lib.client.common_lib import time_utils

Xixuan Wu

93e646c

2017-12-07 18:36:10 -0800

[diff] [blame]

65

from autotest_lib.server import constants

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

66

from autotest_lib.server import site_utils

J. Richard Barnette

a7c514e

2015-09-15 11:13:23 -0700

[diff] [blame]

67

from autotest_lib.server.cros.dynamic_suite import frontend_wrappers

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

68

from autotest_lib.server.hosts import servo_host

Aviv Keshet

7ee9586

2016-08-30 15:18:27 -0700

[diff] [blame]

69

from autotest_lib.server.lib import status_history

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

70

from autotest_lib.site_utils import gmail_lib

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

71

from chromite.lib import metrics

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

72

73

Richard Barnette

673573b

2016-12-12 09:46:39 -0800

[diff] [blame]

74

CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS

75

SPARE_POOL = constants.Pools.SPARE_POOL

76

MANAGED_POOLS = constants.Pools.MANAGED_POOLS

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

77

J. Richard Barnette

2016-03-17 17:03:57 -0700

[diff] [blame]

78

# _EXCLUDED_LABELS - A set of labels that disqualify a DUT from

Richard Barnette

eabcf39

2017-09-01 15:10:54 -0700

[diff] [blame]

79

# monitoring by this script. Currently, we're excluding these:

80

# + 'adb' - We're not ready to monitor Android or Brillo hosts.

81

# + 'board:guado_moblab' - These are maintained by a separate

82

# process that doesn't use this script.

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

83

Richard Barnette

eabcf39

2017-09-01 15:10:54 -0700

[diff] [blame]

84

_EXCLUDED_LABELS = {'adb', 'board:guado_moblab'}

J. Richard Barnette

2016-03-17 17:03:57 -0700

[diff] [blame]

85

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

86

# _DEFAULT_DURATION:

87

# Default value used for the --duration command line option.

88

# Specifies how far back in time to search in order to determine

89

# DUT status.

90

91

_DEFAULT_DURATION = 24

92

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

93

# _LOGDIR:

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

94

# Relative path used in the calculation of the default setting for

95

# the --logdir option. The full path is relative to the root of the

96

# autotest directory, as determined from sys.argv[0].

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

97

# _LOGFILE:

98

# Basename of a file to which general log information will be

99

# written.

100

# _LOG_FORMAT:

101

# Format string for log messages.

102

103

_LOGDIR = os.path.join('logs', 'dut-data')

104

_LOGFILE = 'lab-inventory.log'

105

_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

106

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

107

# Pattern describing location-based host names in the Chrome OS test

108

# labs. Each DUT hostname designates the DUT's location:

109

# * A lab (room) that's physically separated from other labs

110

# (i.e. there's a door).

111

# * A row (or aisle) of DUTs within the lab.

112

# * A vertical rack of shelves on the row.

113

# * A specific host on one shelf of the rack.

114

115

_HOSTNAME_PATTERN = re.compile(

116

r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')

117

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

118

# _REPAIR_LOOP_THRESHOLD:

119

# The number of repeated Repair tasks that must be seen to declare

120

# that a DUT is stuck in a repair loop.

121

122

_REPAIR_LOOP_THRESHOLD = 4

123

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

124

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

125

_UNTESTABLE_PRESENCE_METRIC = metrics.BooleanMetric(

126

'chromeos/autotest/inventory/untestable',

127

'DUTs that cannot be scheduled for testing')

128

129

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

130

class _HostSetInventory(object):
    """Collection of related `HostJobHistory` objects, split by status.

    Every recorded DUT belongs to exactly one of three categories:
    "working", "broken", or "idle".  For each category there is one
    accessor returning the list of DUTs, and one returning the count.

    Performance note:  The following methods are potentially
    expensive:
      * `get_working()`
      * `get_working_list()`
      * `get_broken()`
      * `get_broken_list()`
      * `get_idle()`
      * `get_idle_list()`
    The first call to any of them causes multiple RPC calls with a
    relatively expensive set of database queries.  The query results
    are cached in the individual `HostJobHistory` objects, so only
    the first call pays the full cost.

    On top of that, the three `get_*_list()` methods cache the lists
    they return, so the lists aren't recalculated at every call.
    That cache is independent of the RPC result caching above, and
    is discarded whenever `record_host()` adds a new history.

    The RPC cost is deliberately deferred to the accessor methods
    (rather than paid in `record_host()`), so that a complete
    `_LabInventory` can be constructed without making the expensive
    queries at creation time.  `_populate_model_counts()` is
    documented as relying on this.

    In current usage all the DUTs in one instance belong to a single
    scheduling pool; however, this class makes no assumptions about
    the actual relationship among the DUTs.

    """

    def __init__(self):
        self._histories = []
        # Cached category lists; `None` means "not calculated yet".
        self._working_list = self._broken_list = self._idle_list = None


    def _select(self, statuses):
        """Return the recorded histories diagnosed as one of `statuses`.

        @param statuses  Container of `status_history` diagnosis
                         values to be matched.

        @return A new list of `HostJobHistory` objects.

        """
        selected = []
        for history in self._histories:
            if history.last_diagnosis()[0] in statuses:
                selected.append(history)
        return selected


    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        @param host_history  The `HostJobHistory` object to be
                             remembered.

        """
        self._histories.append(host_history)
        # The cached lists no longer cover every history; drop them.
        self._working_list = self._broken_list = self._idle_list = None


    def get_working_list(self):
        """Return a list of all working DUTs in the pool.

        Selects the histories in `self._histories` for which the
        last diagnosis is `WORKING`.  The result is cached, so it is
        only calculated once.

        @return A list of HostJobHistory objects.

        """
        if self._working_list is not None:
            return self._working_list
        self._working_list = self._select((status_history.WORKING,))
        return self._working_list


    def get_working(self):
        """Return the number of working DUTs in the pool."""
        working = self.get_working_list()
        return len(working)


    def get_broken_list(self):
        """Return a list of all broken DUTs in the pool.

        Selects the histories in `self._histories` for which the
        last diagnosis is `BROKEN`.  The result is cached, so it is
        only calculated once.

        @return A list of HostJobHistory objects.

        """
        if self._broken_list is not None:
            return self._broken_list
        self._broken_list = self._select((status_history.BROKEN,))
        return self._broken_list


    def get_broken(self):
        """Return the number of broken DUTs in the pool."""
        broken = self.get_broken_list()
        return len(broken)


    def get_idle_list(self):
        """Return a list of all idle DUTs in the pool.

        Selects the histories in `self._histories` for which the
        last diagnosis is `UNUSED` or `UNKNOWN`.  The result is
        cached, so it is only calculated once.

        @return A list of HostJobHistory objects.

        """
        if self._idle_list is not None:
            return self._idle_list
        self._idle_list = self._select(
                {status_history.UNUSED, status_history.UNKNOWN})
        return self._idle_list


    def get_idle(self):
        """Return the number of idle DUTs in the pool."""
        idle = self.get_idle_list()
        return len(idle)


    def get_total(self):
        """Return the total number of DUTs in the pool."""
        return len(self._histories)

257

258

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

259

class _PoolSetInventory(object):
    """Maintains a set of `HostJobHistory`s for a set of pools.

    The collection is segregated into disjoint categories of
    "working", "broken", and "idle" DUTs.  Accessor methods allow
    finding both the list of DUTs in each category, as well as counts
    of each category.  Accessor queries can be for an individual
    pool, or against all pools.

    Performance note:  This class relies on `_HostSetInventory`.
    Public methods in this class generally rely on methods of the
    same name in the underlying class, and so will have the same
    underlying performance characteristics.

    """

    def __init__(self, pools):
        """Create an empty inventory covering `pools`.

        @param pools  Iterable of pool names.  One empty
                      `_HostSetInventory` is created per pool.

        """
        self._histories_by_pool = {
            pool: _HostSetInventory() for pool in pools
        }


    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        The history is filed under its `host_pool`.  That pool must
        be one of the pools passed to the constructor; any other
        value raises `KeyError` from the dictionary lookup.

        @param host_history  The `HostJobHistory` object to be
                             remembered.

        """
        pool = host_history.host_pool
        self._histories_by_pool[pool].record_host(host_history)


    def _count_pool(self, get_pool_count, pool=None):
        """Internal helper to count hosts in a given pool.

        The `get_pool_count` parameter is a function to calculate
        the exact count of interest for the pool.

        @param get_pool_count  Function to return a count from a
                               `_HostSetInventory` object.
        @param pool            The pool to be counted.  If `None`,
                               return the total across all pools.

        @return The count for the selected pool(s).

        """
        if pool is not None:
            return get_pool_count(self._histories_by_pool[pool])
        return sum(get_pool_count(host_set)
                   for host_set in self._histories_by_pool.values())


    def _list_pool(self, get_pool_list, pool=None):
        """Internal helper to list hosts in a given pool.

        This is the list-returning counterpart of `_count_pool()`.

        @param get_pool_list  Function to return a list of
                              `HostJobHistory` objects from a
                              `_HostSetInventory` object.
        @param pool           The pool to be listed.  If `None`,
                              return the concatenation of the lists
                              for all pools.

        @return A list of HostJobHistory objects.  For a specific
                pool this is the list returned by `get_pool_list`
                itself; for all pools it is a newly built list.

        """
        if pool is not None:
            return get_pool_list(self._histories_by_pool[pool])
        combined = []
        for host_set in self._histories_by_pool.values():
            combined.extend(get_pool_list(host_set))
        return combined


    def get_working_list(self, pool=None):
        """Return a list of all working DUTs in the given pool.

        Go through all HostJobHistory objects in the given pool,
        selecting the ones where the last diagnosis is `WORKING`.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.

        """
        return self._list_pool(_HostSetInventory.get_working_list, pool)


    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of working DUTs in the selected
                pool(s).

        """
        return self._count_pool(_HostSetInventory.get_working, pool)


    def get_broken_list(self, pool=None):
        """Return a list of all broken DUTs in the given pool.

        Go through all HostJobHistory objects in the given pool,
        selecting the ones where the last diagnosis is `BROKEN`.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.

        """
        return self._list_pool(_HostSetInventory.get_broken_list, pool)


    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of broken DUTs in the selected
                pool(s).

        """
        return self._count_pool(_HostSetInventory.get_broken, pool)


    def get_idle_list(self, pool=None):
        """Return a list of all idle DUTs in the given pool.

        Go through all HostJobHistory objects in the given pool,
        selecting the ones where the last diagnosis is `UNUSED` or
        `UNKNOWN`.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.

        """
        return self._list_pool(_HostSetInventory.get_idle_list, pool)


    def get_idle(self, pool=None):
        """Return the number of idle DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of idle DUTs in the selected
                pool(s).

        """
        return self._count_pool(_HostSetInventory.get_idle, pool)


    def get_spares_buffer(self, spare_pool=SPARE_POOL):
        """Return the nominal number of working spares.

        Calculates and returns how many working spares there would
        be in the spares pool if all broken DUTs were in the spares
        pool.  This number may be negative, indicating a shortfall
        in the critical pools.

        @param spare_pool  Name of the pool that holds the spares.

        @return The total number DUTs in the spares pool, less the
                total number of broken DUTs in all pools.

        """
        return self.get_total(spare_pool) - self.get_broken()


    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of DUTs in the selected pool(s).

        """
        return self._count_pool(_HostSetInventory.get_total, pool)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

417

418

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

419

def _eligible_host(afehost):
    """Return whether this host is eligible for monitoring.

    To be eligible, a host must carry exactly one 'model' label and
    exactly one 'pool' label, and must carry none of the labels in
    the `_EXCLUDED_LABELS` set.

    @param afehost  The host to be tested for eligibility.

    @return `True` if the host should be monitored, else `False`.

    """
    # A DUT lacking a unique 'model' or 'pool' label doesn't belong
    # in the managed inventory; finding one generally means there's
    # an error in the database.  Such errors do turn up from time to
    # time.
    #
    # The _LabInventory constructor depends on hosts obeying the
    # label restrictions, and may fail on one that doesn't.  One bad
    # entry shouldn't fail a whole inventory run, so the offenders
    # are filtered out here, before they reach the inventory.
    def count_labels(prefix):
        """Return how many of the host's labels start with `prefix`."""
        return sum(1 for label in afehost.labels
                   if label.startswith(prefix))

    model_count = count_labels(constants.Labels.MODEL_PREFIX)
    pool_count = count_labels(constants.Labels.POOL_PREFIX)
    has_excluded = bool(_EXCLUDED_LABELS.intersection(afehost.labels))
    if has_excluded:
        return False
    return model_count == 1 and pool_count == 1

444

445

446

# NOTE(review): `collections.Mapping` and the `iteritems()` mixin used
# below are Python 2 spellings; a Python 3 port would need
# `collections.abc.Mapping` and `items()` instead.
class _LabInventory(collections.Mapping):
    """Collection of `HostJobHistory` objects for the Lab's inventory.

    This is a dict-like collection indexed by model.  Indexing returns
    the _PoolSetInventory object associated with the model.

    """

    @classmethod
    def create_inventory(cls, afe, start_time, end_time, modellist=None):
        """Return a Lab inventory with specified parameters.

        By default, gathers inventory from `HostJobHistory` objects for
        all DUTs in the `MANAGED_POOLS` list.  If `modellist` is
        supplied, the inventory will be restricted to only the given
        models.

        @param afe          AFE object for constructing the
                            `HostJobHistory` objects.
        @param start_time   Start time for the `HostJobHistory` objects.
        @param end_time     End time for the `HostJobHistory` objects.
        @param modellist    List of models to include.  If empty or
                            `None`, include all available models.
        @return A `_LabInventory` object for the specified models.

        """
        target_pools = MANAGED_POOLS
        label_list = [constants.Labels.POOL_PREFIX + l for l in target_pools]
        afehosts = afe.get_hosts(labels__name__in=label_list)
        if modellist:
            # We're deliberately not checking host eligibility in this
            # code path.  This is a debug path, not used in production;
            # it may be useful to include ineligible hosts here.
            modelhosts = []
            for model in modellist:
                model_label = constants.Labels.MODEL_PREFIX + model
                modelhosts.extend(h for h in afehosts
                                  if model_label in h.labels)
            afehosts = modelhosts
        else:
            afehosts = [h for h in afehosts if _eligible_host(h)]
        histories = [
            status_history.HostJobHistory(afe, host, start_time, end_time)
            for host in afehosts
        ]
        return cls(histories, target_pools)


    def __init__(self, histories, pools):
        """Build the inventory from a list of histories.

        @param histories  List of `HostJobHistory` objects to be
                          recorded.  It is traversed more than once and
                          its length is taken, so it must be a
                          sequence, not a one-shot iterator.
        @param pools      Pool names handed to every per-model
                          `_PoolSetInventory`.

        """
        models = {h.host_model for h in histories}
        self._modeldata = {model: _PoolSetInventory(pools) for model in models}
        self._dut_count = len(histories)
        for h in histories:
            self[h.host_model].record_host(h)
        self._boards = {h.host_board for h in histories}


    def __getitem__(self, key):
        return self._modeldata[key]


    def __len__(self):
        return len(self._modeldata)


    def __iter__(self):
        return iter(self._modeldata)


    def reportable_items(self, spare_pool=SPARE_POOL):
        """Iterate over all items subject to reporting.

        Yields the contents of `self.iteritems()` filtered to include
        only reportable models.  A model is reportable if it has DUTs in
        both `spare_pool` and at least one other pool.

        @param spare_pool  The spare pool to be tested for reporting.
        """
        for model, histories in self.iteritems():
            spares = histories.get_total(spare_pool)
            total = histories.get_total()
            # spares == 0: nothing in the spare pool.
            # spares == total: nothing outside the spare pool.
            if spares != 0 and spares != total:
                yield model, histories


    def get_num_duts(self):
        """Return the total number of DUTs in the inventory."""
        return self._dut_count


    def get_num_models(self):
        """Return the total number of models in the inventory."""
        return len(self)


    def get_pool_models(self, pool):
        """Return all models in `pool`.

        @param pool  The pool to be inventoried for models.

        @return The set of models with at least one DUT in `pool`.
        """
        return {m for m, h in self.iteritems() if h.get_total(pool)}


    def get_boards(self):
        """Return the set of `host_board` values of all recorded DUTs."""
        return self._boards

Prathmesh Prabhu

a5a0e3d

2017-11-09 08:53:53 -0800

[diff] [blame]

550

551

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

552

def _sort_by_location(inventory_list):
    """Return a list of DUTs, organized by location.

    Take the given list of `HostJobHistory` objects, separate it
    into a list per lab, and sort each lab's list by location.  The
    order of sorting within a lab is
      * By row number within the lab,
      * then by rack number within the row,
      * then by host shelf number within the rack.

    Return a list of the sorted lists.  Histories whose hostname
    doesn't match `_HOSTNAME_PATTERN` are left out of the result.

    Implementation note: host locations are compared as
    `(row, rack, host)` tuples of integers, so the ordering is
    correct for numbers of any size.

    @param inventory_list  Iterable of `HostJobHistory` objects.

    @return A list of sorted lists of DUTs.

    """
    lab_lists = {}
    for history in inventory_list:
        location = _HOSTNAME_PATTERN.match(history.host.hostname)
        if not location:
            continue
        lab = location.group(1)
        # Compare numerically, not as strings: host10 follows host9.
        key = tuple(int(idx) for idx in location.group(2, 3, 4))
        lab_lists.setdefault(lab, []).append((key, history))
    return_list = []
    for dut_list in lab_lists.values():
        # Sort on the location only; the sort is stable, so DUTs at
        # the same location keep their input order.
        dut_list.sort(key=lambda t: t[0])
        return_list.append([t[1] for t in dut_list])
    return return_list

def _score_repair_set(buffer_counts, repair_list):
    """Rate a proposed set of repairs with a single number.

    The function works out what every model's spares buffer would be
    once the DUTs in `repair_list` were repaired, and then condenses
    the outcome into a score built from two quantities:
      * The worst buffer count over all models (higher is better).
        This is the dominant term.
      * The number of models sitting at that worst count (lower is
        better).  This only breaks ties in the first term.

    Implementation note:  the two terms are packed into one integer
    with a multiplier of 1000, so the score may not reflect the
    intended ordering if the inventory has more than 1000 models.

    @param buffer_counts  A dictionary mapping model names to the
                          size of the model's spares buffer.
    @param repair_list    A list of `HostJobHistory` objects for the
                          DUTs to be repaired.
    @return A numeric score; a larger value is a better repair set.
    """
    score_radix = 1000
    repaired_pools = set()
    for dut in repair_list:
        repaired_pools.add(dut.host_pool)
    repairs = _LabInventory(repair_list, repaired_pools)
    # Only the counts matter for the score, so the model names are
    # dropped once the repaired DUTs have been added in.
    projected = []
    for model_name, spares in buffer_counts.iteritems():
        if model_name in repairs:
            gained = repairs[model_name].get_total()
        else:
            gained = 0
        projected.append(spares + gained)
    lowest = sorted(projected)[0]
    ties = projected.count(lowest)
    return score_radix * lowest - ties

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

640

641

642

def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Returns a message recommending a list of broken DUTs to be repaired.
    The list of DUTs is selected based on these criteria:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical proximity.
      * DUTs for models with a low spares buffer are more important than
        DUTs with larger buffers.

    The candidates are every window of `num_recommend` consecutive
    DUTs in each lab's location-sorted list of broken DUTs; the window
    with the best `_score_repair_set()` score wins.  If the worst
    spares buffer number is shared by more than one model, that score
    tends to prefer repair sets that include more of those models over
    sets that cover fewer models.

    If there is nothing to recommend (no broken DUTs, or none that
    `_sort_by_location()` could place in a lab), the message holds
    only the heading lines.

    @param inventory      `_LabInventory` object from which to generate
                          recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation message.
    """
    logging.debug('Creating DUT repair recommendations')
    model_buffer_counts = {}
    broken_list = []
    for model, counts in inventory.reportable_items():
        logging.debug('Listing failed DUTs for %s', model)
        if counts.get_broken() != 0:
            model_buffer_counts[model] = counts.get_spares_buffer()
            broken_list.extend(counts.get_broken_list())
    # N.B. The logic inside this loop may seem complicated, but
    # simplification is hard:
    #   * Calculating an initial recommendation outside of
    #     the loop likely would make things more complicated,
    #     not less.
    #   * It's necessary to calculate an initial lab slice once per
    #     lab _before_ the while loop, in case the number of broken
    #     DUTs in a lab is less than `num_recommend`.
    recommendation = None
    best_score = None
    for lab_duts in _sort_by_location(broken_list):
        start = 0
        end = num_recommend
        lab_slice = lab_duts[start : end]
        lab_score = _score_repair_set(model_buffer_counts, lab_slice)
        # Slide a window of `num_recommend` DUTs along the lab's list,
        # keeping the best-scoring window seen so far.
        while end < len(lab_duts):
            start += 1
            end += 1
            new_slice = lab_duts[start : end]
            new_score = _score_repair_set(model_buffer_counts, new_slice)
            if new_score > lab_score:
                lab_slice = new_slice
                lab_score = new_score
        if recommendation is None or lab_score > best_score:
            recommendation = lab_slice
            best_score = lab_score
    # N.B. The trailing space in `line_fmt` is mandatory:  Without it,
    # Gmail will parse the URL wrong.  Don't ask.  If you simply _must_
    # know more, go try it yourself...
    line_fmt = '%-30s %-16s %-6s\n %s '
    message = ['Repair recommendations:\n',
               line_fmt % ( 'Hostname', 'Model', 'Servo?', 'Logs URL')]
    # `recommendation` is still None when no lab had any broken DUTs;
    # iterate over an empty list in that case instead of failing with
    # a TypeError.
    for h in recommendation or []:
        servo_name = servo_host.make_servo_hostname(h.host.hostname)
        servo_present = utils.host_is_in_lab_zone(servo_name)
        _, event = h.last_diagnosis()
        line = line_fmt % (
                h.host.hostname, h.host_model,
                'Yes' if servo_present else 'No', event.job_url)
        message.append(line)
    return '\n'.join(message)

713

714

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

715

def _generate_model_inventory_message(inventory):

716

"""Generate the "model inventory" e-mail message.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

717

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

718

The model inventory is a list by model summarizing the number of

719

working, broken, and idle DUTs, and the total shortfall or surplus

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

720

of working devices relative to the minimum critical pool

721

requirement.

722

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

723

The report omits models with no DUTs in the spare pool or with no

724

DUTs in a critical pool.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

725

726

N.B. For sample output text formattted as users can expect to

727

see it in e-mail and log files, refer to the unit tests.

728

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

729

@param inventory `_LabInventory` object to be reported on.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

730

@return String with the inventory message to be sent.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

731

"""

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

732

logging.debug('Creating model inventory')

J. Richard Barnette

2015-10-14 11:20:49 -0700

[diff] [blame]

733

nworking = 0

734

nbroken = 0

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

735

nidle = 0

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

736

nbroken_models = 0

737

ntotal_models = 0

J. Richard Barnette

2015-10-20 17:58:30 -0700

[diff] [blame]

738

summaries = []

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

739

column_names = (

740

'Model', 'Avail', 'Bad', 'Idle', 'Good', 'Spare', 'Total')

741

for model, counts in inventory.reportable_items():

742

logging.debug('Counting %2d DUTS for model %s',

743

counts.get_total(), model)

744

# Summary elements laid out in the same order as the column

J. Richard Barnette

2015-10-20 17:58:30 -0700

[diff] [blame]

745

# headers:

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

746

# Model Avail Bad Idle Good Spare Total

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

747

# e[0] e[1] e[2] e[3] e[4] e[5] e[6]

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

748

element = (model,

J. Richard Barnette

2015-10-20 17:58:30 -0700

[diff] [blame]

749

counts.get_spares_buffer(),

750

counts.get_broken(),

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

751

counts.get_idle(),

J. Richard Barnette

2015-10-20 17:58:30 -0700

[diff] [blame]

752

counts.get_working(),

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

753

counts.get_total(SPARE_POOL),

J. Richard Barnette

2015-10-20 17:58:30 -0700

[diff] [blame]

754

counts.get_total())

J. Richard Barnette

ea5a4ba

2016-02-18 16:34:50 -0800

[diff] [blame]

755

if element[2]:

756

summaries.append(element)

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

757

nbroken_models += 1

758

ntotal_models += 1

J. Richard Barnette

2015-10-20 17:58:30 -0700

[diff] [blame]

759

nbroken += element[2]

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

760

nidle += element[3]

761

nworking += element[4]

762

ntotal = nworking + nbroken + nidle

J. Richard Barnette

2015-10-20 17:58:30 -0700

[diff] [blame]

763

summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))

J. Richard Barnette

2015-10-14 11:20:49 -0700

[diff] [blame]

764

broken_percent = int(round(100.0 * nbroken / ntotal))

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

765

idle_percent = int(round(100.0 * nidle / ntotal))

766

working_percent = 100 - broken_percent - idle_percent

J. Richard Barnette

2015-10-14 11:20:49 -0700

[diff] [blame]

767

message = ['Summary of DUTs in inventory:',

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

768

'%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),

769

'%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (

J. Richard Barnette

2015-10-14 11:20:49 -0700

[diff] [blame]

770

nbroken, broken_percent,

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

771

nidle, idle_percent,

J. Richard Barnette

2015-10-14 11:20:49 -0700

[diff] [blame]

772

nworking, working_percent,

773

ntotal),

774

'',

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

775

'Models with failures: %d' % nbroken_models,

776

'Models in inventory: %d' % ntotal_models,

J. Richard Barnette

2015-10-14 11:20:49 -0700

[diff] [blame]

777

'', '',

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

778

'Full model inventory:\n',

779

'%-22s %5s %5s %5s %5s %5s %5s' % column_names]

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

780

message.extend(

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

781

['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

782

return '\n'.join(message)

783

784

J. Richard Barnette

4845fcf

2015-04-20 14:26:25 -0700

[diff] [blame]

785

_POOL_INVENTORY_HEADER = '''\

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

786

Notice to Infrastructure deputies: All models shown below are at

J. Richard Barnette

c9a143c

2015-06-04 11:11:19 -0700

[diff] [blame]

787

less than full strength, please take action to resolve the issues.

788

Once you're satisified that failures won't recur, failed DUTs can

789

be replaced with spares by running `balance_pool`. Detailed

790

instructions can be found here:

J. Richard Barnette

4845fcf

2015-04-20 14:26:25 -0700

[diff] [blame]

791

http://go/cros-manage-duts

'''

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

795

def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    The message starts with `_POOL_INVENTORY_HEADER`, followed by one
    section per pool in `CRITICAL_POOLS`.  Each section is a table by
    model of the bad, idle, good and total DUT counts in that pool.
    A model is listed only if it has at least one broken or idle DUT
    in the pool; rows are ordered by descending number of broken DUTs.
    A pool with no such model gets a one-line "full strength" note
    instead of table rows.

    N.B. For sample output text formatted as users can expect to see
    it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating pool inventory')
    lines = [_POOL_INVENTORY_HEADER]
    for pool_index, pool in enumerate(CRITICAL_POOLS):
        # Every section after the first is set off by a blank line.
        separator = '\n' if pool_index else ''
        lines.append(
            '%sStatus for pool:%s, by model:' % (separator, pool))
        lines.append(
            '%-20s %5s %5s %5s %5s' % (
                'Model', 'Bad', 'Idle', 'Good', 'Total'))
        rows = []
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            broken = counts.get_broken(pool)
            idle = counts.get_idle(pool)
            # Models at full strength are not reported.
            if broken or idle:
                working = counts.get_working(pool)
                total = counts.get_total(pool)
                rows.append((model, broken, idle, working, total))
        if not rows:
            lines.append('(All models at full strength)')
        else:
            rows.sort(key=lambda row: -row[1])
            for row in rows:
                lines.append('%-20s %5d %5d %5d %5d' % row)
    return '\n'.join(lines)

837

838

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

839

_IDLE_INVENTORY_HEADER = '''\

840

Notice to Infrastructure deputies: The hosts shown below haven't

841

run any jobs for at least 24 hours. Please check each host; locked

842

hosts should normally be unlocked; stuck jobs should normally be

aborted.

'''

def _generate_idle_inventory_message(inventory):
    """Generate the "idle inventory" e-mail message.

    The message starts with `_IDLE_INVENTORY_HEADER`, followed by a
    table with one row per idle host giving its hostname, model and
    pool.  Hosts are gathered pool by pool over `MANAGED_POOLS`, using
    each model's `get_idle_list(pool)`.  If no idle host is found, a
    one-line note replaces the table rows.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating idle inventory')
    lines = [
        _IDLE_INVENTORY_HEADER,
        'Idle Host List:',
        '%-30s %-20s %s' % ('Hostname', 'Model', 'Pool'),
    ]
    idle_rows = []
    for pool in MANAGED_POOLS:
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            for dut in counts.get_idle_list(pool):
                idle_rows.append((dut.host.hostname, model, pool))
    if not idle_rows:
        lines.append('(No idle DUTs)')
    else:
        for row in idle_rows:
            lines.append('%-30s %-20s %s' % row)
    return '\n'.join(lines)

876

877

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

878

def _send_email(arguments, tag, subject, recipients, body):

879

"""Send an inventory e-mail message.

880

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

881

The message is logged in the selected log directory using `tag` for

882

the file name.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

883

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

884

If the --debug option was requested, the message is neither logged

885

nor sent, but merely printed on stdout.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

886

887

@param arguments Parsed command-line options.

888

@param tag Tag identifying the inventory for logging

889

purposes.

890

@param subject E-mail Subject: header line.

891

@param recipients E-mail addresses for the To: header line.

892

@param body E-mail message body.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

893

"""

894

logging.debug('Generating email: "%s"', subject)

895

all_recipients = ', '.join(recipients)

896

report_body = '\n'.join([

897

'To: %s' % all_recipients,

898

'Subject: %s' % subject,

899

'', body, ''])

J. Richard Barnette

2015-10-13 16:02:47 -0700

[diff] [blame]

900

if arguments.debug:

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

901

print report_body

902

else:

903

filename = os.path.join(arguments.logdir, tag)

904

try:

905

report_file = open(filename, 'w')

906

report_file.write(report_body)

907

report_file.close()

908

except EnvironmentError as e:

909

logging.error('Failed to write %s: %s', filename, e)

910

try:

911

gmail_lib.send_email(all_recipients, subject, body)

912

except Exception as e:

913

logging.error('Failed to send e-mail to %s: %s',

all_recipients, e)

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

917

def _populate_model_counts(inventory):

918

"""Gather model counts while providing interactive feedback.

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

919

920

Gathering the status of all individual DUTs in the lab can take

921

considerable time (~30 minutes at the time of this writing).

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

922

Normally, we pay that cost by querying as we go. However, with

923

the `--debug` option, we expect a human being to be watching the

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

924

progress in real time. So, we force the first (expensive) queries

925

to happen up front, and provide simple ASCII output on sys.stdout

926

to show a progress bar and results.

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

927

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

928

@param inventory `_LabInventory` object from which to gather

929

counts.

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

930

"""

931

n = 0

932

total_broken = 0

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

933

for counts in inventory.itervalues():

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

n += 1

if n % 10 == 5:

c = '+'

elif n % 10 == 0:

c = '%d' % ((n / 10) % 10)

else:

c = '.'

sys.stdout.write(c)

sys.stdout.flush()

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

943

# This next call is where all the time goes - it forces all of a

944

# model's `HostJobHistory` objects to query the database and

945

# cache their results.

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

946

total_broken += counts.get_broken()

947

sys.stdout.write('\n')

948

sys.stdout.write('Found %d broken DUTs\n' % total_broken)

949

950

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

951

def _perform_model_inventory(arguments, inventory, timestamp):
    """Build and send the model inventory report.

    The report is made of these parts, in order:
      * A list of DUTs that are recommended to be repaired.  This
        part is optional, and only appears if the `--recommend`
        option is present.
      * The model inventory message produced by
        `_generate_model_inventory_message()`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp
                      in logs and email output.
    """
    sections = []
    if arguments.recommend:
        recommendation = _generate_repair_recommendation(
                inventory, arguments.recommend)
        sections.append(recommendation + '\n\n\n')
    sections.append(_generate_model_inventory_message(inventory))
    log_tag = 'models-%s.txt' % timestamp
    subject = 'DUT model inventory %s' % timestamp
    _send_email(arguments, log_tag, subject,
                arguments.model_notify, ''.join(sections))

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

978

979

980

def _perform_pool_inventory(arguments, inventory, timestamp):
    """Build and send the pool inventory report.

    The report is made of these parts, in order:
      * The pool inventory message produced by
        `_generate_pool_inventory_message()`.
      * The idle inventory message produced by
        `_generate_idle_inventory_message()`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp in
                      logs and email output.
    """
    report_parts = [
        _generate_pool_inventory_message(inventory),
        _generate_idle_inventory_message(inventory),
    ]
    log_tag = 'pools-%s.txt' % timestamp
    subject = 'DUT pool inventory %s' % timestamp
    _send_email(arguments, log_tag, subject,
                arguments.pool_notify, '\n\n\n'.join(report_parts))

1002

1003

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1004

def _dut_in_repair_loop(history):

1005

"""Return whether a DUT's history indicates a repair loop.

1006

1007

A DUT is considered looping if it runs no tests, and no tasks pass

1008

other than repair tasks.

1009

1010

@param history An instance of `status_history.HostJobHistory` to be

1011

scanned for a repair loop. The caller guarantees

1012

that this history corresponds to a working DUT.

1013

@returns Return a true value if the DUT's most recent history

1014

indicates a repair loop.

1015

"""

1016

# Our caller passes only histories for working DUTs; that means

1017

# we've already paid the cost of fetching the diagnosis task, and

1018

# we know that the task was successful. The diagnosis task will be

1019

# one of the tasks we must scan to find a loop, so if the task isn't

1020

# a repair task, then our history includes a successful non-repair

1021

# task, and we're not looping.

1022

#

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

1023

# The for loop below is very expensive, because it must fetch the

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1024

# full history, regardless of how many tasks we examine. At the

1025

# time of this writing, this check against the diagnosis task

1026

# reduces the cost of finding loops in the full inventory from hours

1027

# to minutes.

1028

if history.last_diagnosis()[1].name != 'Repair':

return False

repair_ok_count = 0

for task in history:

if not task.is_special:

1033

# This is a test, so we're not looping.

1034

return False

1035

if task.diagnosis == status_history.BROKEN:

1036

# Failed a repair, so we're not looping.

1037

return False

1038

if (task.diagnosis == status_history.WORKING

1039

and task.name != 'Repair'):

1040

# Non-repair task succeeded, so we're not looping.

1041

return False

1042

# At this point, we have either a failed non-repair task, or

1043

# a successful repair.

1044

if task.name == 'Repair':

1045

repair_ok_count += 1

1046

if repair_ok_count >= _REPAIR_LOOP_THRESHOLD:

return True

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

1050

def _report_untestable_dut(history, state):
    """Log one untestable DUT and record it in the presence metric.

    Writes an info-level log line naming the DUT, then sets
    `_UNTESTABLE_PRESENCE_METRIC` to True with fields identifying the
    DUT's hostname, model, pool and the given `state`.

    @param history  History object for the DUT; its `hostname`,
                    `host_model` and `host_pool` attributes are read.
    @param state    Value reported in the metric's 'state' field.
    """
    fields = dict(
        dut_hostname=history.hostname,
        model=history.host_model,
        pool=history.host_pool,
        state=state,
    )
    logging.info('Untestable DUT: %(dut_hostname)s, model: %(model)s, '
                 'pool: %(pool)s', fields)
    _UNTESTABLE_PRESENCE_METRIC.set(True, fields=fields)

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1060

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1061

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

1062

def _report_repair_loop_metrics(inventory):

1063

"""Find and report DUTs stuck in a repair loop.

1064

1065

Go through `inventory`, and find and report any DUT identified as

1066

being in a repair loop.

1067

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

1068

@param inventory `_LabInventory` object to be reported on.

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1069

"""

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1070

logging.info('Scanning for DUTs in repair loops.')

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

1071

for counts in inventory.itervalues():

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1072

for history in counts.get_working_list():

1073

# Managed DUTs with names that don't match

1074

# _HOSTNAME_PATTERN shouldn't be possible. However, we

1075

# don't want arbitrary strings being attached to the

1076

# 'dut_hostname' field, so for safety, we exclude all

1077

# anomalies.

1078

if not _HOSTNAME_PATTERN.match(history.hostname):

1079

continue

1080

if _dut_in_repair_loop(history):

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

1081

_report_untestable_dut(history, 'repair_loop')

1082

1083

1084

def _report_idle_dut_metrics(inventory):

1085

"""Find and report idle, unlocked DUTs.

1086

1087

Go through `inventory`, and find and report any DUT identified as

1088

"idle" that is not also locked.

1089

1090

@param inventory `_LabInventory` object to be reported on.

1091

"""

1092

logging.info('Scanning for idle, unlocked DUTs.')

1093

for counts in inventory.itervalues():

1094

for history in counts.get_idle_list():

1095

# Managed DUTs with names that don't match

1096

# _HOSTNAME_PATTERN shouldn't be possible. However, we

1097

# don't want arbitrary strings being attached to the

1098

# 'dut_hostname' field, so for safety, we exclude all

1099

# anomalies.

1100

if not _HOSTNAME_PATTERN.match(history.hostname):

1101

continue

1102

if not history.host.locked:

1103

_report_untestable_dut(history, 'idle_unlocked')

1104

1105

1106

def _report_untestable_dut_metrics(inventory):
    """Report all DUTs in the inventory that cannot run tests.

    A DUT counts as "untestable" when either of these holds:
      * It is stuck in a repair loop:  it regularly passes repair,
        but never passes any other operation.
      * It runs no tasks at all, yet it is not locked.

    The inventory is scanned once for each condition, repair loops
    first and idle DUTs second.  Results are reported via a Monarch
    presence metric.

    Note:  So that a DUT isn't flagged "idle" merely for lack of
    work, a separate job runs ahead of regular inventory runs and
    schedules trivial work on any DUT that appears idle.

    @param inventory  `_LabInventory` object to be reported on.
    """
    scanners = (_report_repair_loop_metrics, _report_idle_dut_metrics)
    for scan in scanners:
        scan(inventory)

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1128

1129

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

1130

def _log_startup(arguments, startup_time):
    """Announce the start of an inventory run in the log.

    Logs, at debug level, the identifying timestamp of the run plus
    one line for each report feature that the command-line options
    enable.

    @param arguments     Command-line arguments as returned by
                         `ArgumentParser`.
    @param startup_time  A UNIX timestamp marking the moment when
                         this inventory run began.
    @returns  A timestamp string (local time, resolution of one hour)
              that will be used to identify this run in logs and
              email output.
    """
    local_start = time.localtime(startup_time)
    run_id = time.strftime('%Y-%m-%d.%H', local_start)
    logging.debug('Starting lab inventory for %s', run_id)

    # Collect the feature notes first, then emit them in order.
    notes = []
    if arguments.model_notify:
        # Repair recommendations are only mentioned alongside the
        # model inventory.
        if arguments.recommend:
            notes.append('Will include repair recommendations')
        notes.append('Will include model inventory')
    if arguments.pool_notify:
        notes.append('Will include pool inventory')
    for note in notes:
        logging.debug(note)
    return run_id

def _create_inventory(arguments, end_time):
    """Build the `_LabInventory` instance used for reporting.

    The searched time range ends at `end_time` and reaches back
    `arguments.duration` hours.  The inventory is limited to the
    models named in `arguments.modelnames`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    @param end_time   A UNIX timestamp for the end of the time range
                      to be searched in this inventory run.
    @return  The newly created inventory object.
    """
    # `duration` is expressed in hours; timestamps are in seconds.
    range_start = end_time - arguments.duration * 60 * 60
    afe_client = frontend_wrappers.RetryingAFE(server=None)
    lab_inventory = _LabInventory.create_inventory(
            afe_client, range_start, end_time, arguments.modelnames)
    host_count = lab_inventory.get_num_duts()
    model_count = lab_inventory.get_num_models()
    logging.info('Found %d hosts across %d models',
                 host_count, model_count)
    return lab_inventory

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1170

def _perform_inventory_reports(arguments):
    """Run every inventory check requested on the command line.

    Logs the start of the run, creates the inventory, and then runs
    each report whose option is set in `arguments`:  the model
    report, the pool report, and the scan for untestable DUTs, in
    that order.  With `--debug`, the model counts are populated
    before any report runs.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    """
    began_at = time.time()
    run_id = _log_startup(arguments, began_at)
    lab_inventory = _create_inventory(arguments, began_at)
    if arguments.debug:
        _populate_model_counts(lab_inventory)

    # E-mail reports share one calling convention; each one runs only
    # when its notification list is non-empty.
    email_reports = (
        ('model_notify', _perform_model_inventory),
        ('pool_notify', _perform_pool_inventory),
    )
    for option_name, run_report in email_reports:
        if getattr(arguments, option_name):
            run_report(arguments, lab_inventory, run_id)

    if arguments.report_untestable:
        _report_untestable_dut_metrics(lab_inventory)

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1190

1191

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1192

def _separate_email_addresses(address_list):
    """Flatten comma-separated address strings into one list.

    Each string in `address_list` is split at every comma, and each
    resulting piece has surrounding whitespace stripped.  Pieces are
    kept in their original order; empty pieces are kept as empty
    strings.

    @param address_list  A list of strings containing comma
                         separated e-mail addresses.
    @return  A new list of the individual e-mail addresses.
    """
    return [address.strip()
            for group in address_list
            for address in group.split(',')]

def _verify_arguments(arguments):
    """Check the parsed command line, normalizing it in place.

    The comma-separated address strings gathered for `--model-notify`
    and `--pool-notify` are replaced on `arguments` with flat lists
    of individual addresses.

    Outside of debug mode, at least one report must be requested.
    In debug mode, requesting no report means "run all the e-mail
    reports".

    On failure, an error message is written to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @return  True if the arguments are semantically good, or False
             if the arguments don't meet requirements.
    """
    arguments.model_notify = _separate_email_addresses(
            arguments.model_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)

    requested = (arguments.model_notify,
                 arguments.pool_notify,
                 arguments.report_untestable)
    if any(requested):
        return True

    if arguments.debug:
        # Run all the e-mail reports.  A report with an empty notify
        # list gets skipped, so give each list a placeholder entry.
        arguments.model_notify = ['']
        arguments.pool_notify = ['']
        return True

    sys.stderr.write('Must request at least one report via '
                     '--model-notify, --pool-notify, or '
                     '--report-untestable\n')
    return False

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1243

1244

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

1245

def _get_default_logdir(script):
    """Compute the default value of the `--logdir` option.

    The result is the `_LOGDIR` path joined onto the parent of the
    directory that contains `script`.

    @param script  Path to this script file.
    @return  A path to a directory.
    """
    script_dir = os.path.dirname(os.path.abspath(script))
    return os.path.join(os.path.dirname(script_dir), _LOGDIR)

1258

1259

1260

def _parse_command(argv):
    """Parse and validate the command line.

    Builds the argument parser for this command, parses `argv`, and
    then checks the result with `_verify_arguments()`.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return  Result returned by ArgumentParser.parse_args(), or
             `None` if the parsed arguments fail verification.
    """
    cmdline = argparse.ArgumentParser(
            prog=argv[0],
            description='Gather and report lab inventory statistics')

    # One entry per option, in the order they appear in the usage
    # message:  (names, keyword arguments for `add_argument()`).
    option_table = [
        (('-d', '--duration'),
         dict(type=int, default=_DEFAULT_DURATION, metavar='HOURS',
              help=('number of hours back to search for status'
                    ' (default: %d)' % _DEFAULT_DURATION))),
        (('--model-notify',),
         dict(action='append', default=[], metavar='ADDRESS',
              help=('Generate model inventory message, '
                    'and send it to the given e-mail address(es)'))),
        (('--pool-notify',),
         dict(action='append', default=[], metavar='ADDRESS',
              help=('Generate pool inventory message, '
                    'and send it to the given address(es)'))),
        (('-r', '--recommend'),
         dict(type=int, default=None,
              help=('Specify how many DUTs should be '
                    'recommended for repair (default: no '
                    'recommendation)'))),
        (('--report-untestable',),
         dict(action='store_true',
              help='Check for devices unable to run tests.')),
        (('--debug-metrics',),
         dict(action='store_true',
              help=('Include debug information about the metrics '
                    'that would be reported '))),
        (('--debug',),
         dict(action='store_true',
              help=('Print e-mail messages on stdout '
                    'without sending them.'))),
        (('--logdir',),
         dict(default=_get_default_logdir(argv[0]),
              help='Directory where logs will be written.')),
        (('modelnames',),
         dict(nargs='*', metavar='MODEL',
              help=('names of models to report on '
                    '(default: all models)'))),
    ]
    for names, settings in option_table:
        cmdline.add_argument(*names, **settings)

    parsed = cmdline.parse_args(argv[1:])
    if _verify_arguments(parsed):
        return parsed
    return None

def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--debug` option was provided on
    the command line.
      * Without the option, we configure the logging to capture all
        potentially relevant events in a log file.  The log file is
        configured to rotate once a week (`when='W4'`), preserving
        13 backups, i.e. ~3 months worth of history.
      * With the option, we expect stdout to contain other
        human-readable output (including the contents of the e-mail
        messages), so we restrict the output to INFO level.

    For convenience, when `--debug` is on, the logging format has
    no adornments, so that a call like `logging.info(msg)` simply writes
    `msg` to stdout, plus a trailing newline.

    On return, the handler created here is the only handler attached
    to the root logger.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    """
    root_logger = logging.getLogger()
    if arguments.debug:
        root_logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter())
    else:
        if not os.path.exists(arguments.logdir):
            os.mkdir(arguments.logdir)
        root_logger.setLevel(logging.DEBUG)
        logfile = os.path.join(arguments.logdir, _LOGFILE)
        handler = logging.handlers.TimedRotatingFileHandler(
                logfile, when='W4', backupCount=13)
        formatter = logging.Formatter(_LOG_FORMAT,
                                      time_utils.TIME_FMT)
        handler.setFormatter(formatter)
    # TODO(jrbarnette) This is gross.  Importing client.bin.utils
    # implicitly imported logging_config, which calls
    # logging.basicConfig() *at module level*.  That gives us an
    # extra logging handler that we don't want.  So, clear out all
    # the handlers here.
    #
    # Iterate over a snapshot:  removeHandler() mutates
    # `root_logger.handlers`, and removing from the list while
    # iterating over it directly would skip every other handler.
    for stale_handler in list(root_logger.handlers):
        root_logger.removeHandler(stale_handler)
    root_logger.addHandler(handler)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1354

1355

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1356

def main(argv):
    """Standard main routine.

    Parses the command line (exiting with status 1 if that fails),
    configures logging, and runs the requested inventory reports.
    Unless `--debug` is given without `--debug-metrics`, the reports
    run inside a ts_mon metrics context and the metrics are flushed
    afterwards.

    A keyboard interrupt ends the run quietly; any other exception
    raised by the reports is logged rather than propagated.

    @param argv  Command line arguments, including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)
    try:
        if not arguments.debug_metrics and arguments.debug:
            # Plain debug run:  no metrics are set up at all.
            _perform_inventory_reports(arguments)
        else:
            # With --debug-metrics, the metrics debug output is
            # directed to /dev/null.
            if arguments.debug_metrics:
                metrics_file = '/dev/null'
            else:
                metrics_file = None
            with site_utils.SetupTsMonGlobalState(
                    'lab_inventory', debug_file=metrics_file,
                    auto_flush=False):
                _perform_inventory_reports(arguments)
            metrics.Flush()
    except KeyboardInterrupt:
        pass
    except EnvironmentError as os_err:
        logging.exception('Unexpected OS error: %s', os_err)
    except Exception as err:
        logging.exception('Unexpected exception: %s', err)

1381

1382

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

1383

def get_inventory(afe):
    """Create an inventory covering the most recent 24 hours.

    @param afe  AFE object passed through to
                `_LabInventory.create_inventory()`.
    @return  The object returned by
             `_LabInventory.create_inventory()`.
    """
    one_day = 24 * 60 * 60
    now = int(time.time())
    return _LabInventory.create_inventory(afe, now - one_day, now)

1387

1388

1389

def get_managed_boards(afe):
    """Return the boards found in a fresh inventory.

    @param afe  AFE object passed through to `get_inventory()`.
    @return  The result of calling `get_boards()` on the inventory
             returned by `get_inventory()`.
    """
    inventory = get_inventory(afe)
    return inventory.get_boards()

J. Richard Barnette

aa86893

2015-10-23 13:28:59 -0700

[diff] [blame]

1391

1392

J. Richard Barnette