Blame - site_utils/lab_inventory.py - platform/external/autotest

2015-03-27 17:23:52 -0700

[diff] [blame]

1

#!/usr/bin/env python

2

3

# Use of this source code is governed by a BSD-style license that can be

4

# found in the LICENSE file.

5

6

"""Create e-mail reports of the Lab's DUT inventory.

Gathers a list of all DUTs of interest in the Lab, segregated by
model and pool, and determines whether each DUT is working or
broken. Then, send one or more e-mail reports summarizing the
status to e-mail addresses provided on the command line.

usage: lab_inventory.py [ options ] [ model ... ]

Options:
--duration / -d <hours>
    How far back in time to search job history to determine DUT
    status.

--model-notify <address>[,<address>]
    Send the "model status" e-mail to all the specified e-mail
    addresses.

--pool-notify <address>[,<address>]
    Send the "pool status" e-mail to all the specified e-mail
    addresses.

--recommend <number>
    When generating the "model status" e-mail, include a list of
    <number> specific DUTs to be recommended for repair.

--report-untestable
    Scan the inventory for DUTs that can't test because they're stuck in
    repair loops, or because the scheduler can't give them work.

--logdir <directory>
    Log progress and actions in a file under this directory. Text
    of any e-mail sent will also be logged in a timestamped file in
    this directory.

--debug
    Suppress all logging, metrics reporting, and sending e-mail.
    Instead, write the output that would be generated onto stdout.

<model> arguments:
    With no arguments, gathers the status for all models in the lab.
    With one or more named models on the command line, restricts
    reporting to just those models.

"""

import argparse

Prathmesh Prabhu

021e784

2017-11-08 18:05:45 -0800

[diff] [blame]

53

import collections

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

54

import logging

55

import logging.handlers

56

import os

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

57

import re

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

import sys

import time

import common

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

62

from autotest_lib.client.bin import utils

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

63

from autotest_lib.client.common_lib import time_utils

Xixuan Wu

93e646c

2017-12-07 18:36:10 -0800

[diff] [blame]

64

from autotest_lib.server import constants

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

65

from autotest_lib.server import site_utils

J. Richard Barnette

a7c514e

2015-09-15 11:13:23 -0700

[diff] [blame]

66

from autotest_lib.server.cros.dynamic_suite import frontend_wrappers

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

67

from autotest_lib.server.hosts import servo_host

Aviv Keshet

7ee9586

2016-08-30 15:18:27 -0700

[diff] [blame]

68

from autotest_lib.server.lib import status_history

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

69

from autotest_lib.site_utils import gmail_lib

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

70

from chromite.lib import metrics

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

71

72

Richard Barnette

673573b

2016-12-12 09:46:39 -0800

[diff] [blame]

73

# Pool definitions, aliased from `constants.Pools` so the rest of this
# module can refer to them by short name.
CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS
SPARE_POOL = constants.Pools.SPARE_POOL
MANAGED_POOLS = constants.Pools.MANAGED_POOLS

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

76

J. Richard Barnette

2016-03-17 17:03:57 -0700

[diff] [blame]

77

# _EXCLUDED_LABELS - A set of labels that disqualify a DUT from

Richard Barnette

eabcf39

2017-09-01 15:10:54 -0700

[diff] [blame]

78

# monitoring by this script. Currently, we're excluding these:

79

# + 'adb' - We're not ready to monitor Android or Brillo hosts.

80

# + 'board:guado_moblab' - These are maintained by a separate

81

# process that doesn't use this script.

Aviv Keshet

1ba0dec

2018-07-12 17:14:08 -0700

[diff] [blame^]

82

# + 'board:scarlet' due to crbug.com/846012 and other issues discussed at

83

# https://bugs.chromium.org/p/chromium/issues/detail?id=861806#c2

84

# + 'board:veyron_rialto' due to crbug.com/854404

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

85

Aviv Keshet

1ba0dec

2018-07-12 17:14:08 -0700

[diff] [blame^]

86

_EXCLUDED_LABELS = {'adb', 'board:guado_moblab', 'board:scarlet',

87

'board:veyron_rialto'}

J. Richard Barnette

2016-03-17 17:03:57 -0700

[diff] [blame]

88

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

89

# _DEFAULT_DURATION:

90

# Default value used for the --duration command line option.

91

# Specifies how far back in time to search in order to determine

92

# DUT status.

93

94

_DEFAULT_DURATION = 24

95

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

96

# _LOGDIR:

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

97

# Relative path used in the calculation of the default setting for

98

# the --logdir option. The full path is relative to the root of the

99

# autotest directory, as determined from sys.argv[0].

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

100

# _LOGFILE:

101

# Basename of a file to which general log information will be

102

# written.

103

# _LOG_FORMAT:

104

# Format string for log messages.

105

106

_LOGDIR = os.path.join('logs', 'dut-data')

107

_LOGFILE = 'lab-inventory.log'

108

_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

109

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

110

# Pattern describing location-based host names in the Chrome OS test

111

# labs. Each DUT hostname designates the DUT's location:

112

# * A lab (room) that's physically separated from other labs

113

# (i.e. there's a door).

114

# * A row (or aisle) of DUTs within the lab.

115

# * A vertical rack of shelves on the row.

116

# * A specific host on one shelf of the rack.

117

118

_HOSTNAME_PATTERN = re.compile(

119

r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')

120

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

121

# _REPAIR_LOOP_THRESHOLD:

122

# The number of repeated Repair tasks that must be seen to declare

123

# that a DUT is stuck in a repair loop.

124

125

_REPAIR_LOOP_THRESHOLD = 4

126

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

127

Prathmesh Prabhu

2018-05-07 14:49:33 -0700

[diff] [blame]

128

# Common path prefix for the metrics reported by this script.
_METRICS_PREFIX = 'chromeos/autotest/inventory'

# Boolean metric published under `<prefix>/untestable`.
# NOTE(review): the second argument looks like the human-readable
# metric description - confirm against chromite.lib.metrics.
_UNTESTABLE_PRESENCE_METRIC = metrics.BooleanMetric(
    '%s/untestable' % _METRICS_PREFIX,
    'DUTs that cannot be scheduled for testing')

132

133

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

134

def _host_is_working(history):
    """Return whether the DUT for `history` is diagnosed as working.

    @param history  Object (a `HostJobHistory` in this module's usage)
                    whose `last_diagnosis()` result has the diagnosed
                    status as its first element.

    @return True iff that status equals `status_history.WORKING`.
    """
    return history.last_diagnosis()[0] == status_history.WORKING

136

137

138

def _host_is_broken(history):
    """Return whether the DUT for `history` is diagnosed as broken.

    @param history  Object (a `HostJobHistory` in this module's usage)
                    whose `last_diagnosis()` result has the diagnosed
                    status as its first element.

    @return True iff that status equals `status_history.BROKEN`.
    """
    return history.last_diagnosis()[0] == status_history.BROKEN

140

141

142

def _host_is_idle(history):
    """Return whether the DUT for `history` is diagnosed as idle.

    A DUT counts as idle when its diagnosed status is either
    `status_history.UNUSED` or `status_history.UNKNOWN`.

    @param history  Object (a `HostJobHistory` in this module's usage)
                    whose `last_diagnosis()` result has the diagnosed
                    status as its first element.

    @return True iff the status is one of the idle statuses.
    """
    idle_statuses = {status_history.UNUSED, status_history.UNKNOWN}
    return history.last_diagnosis()[0] in idle_statuses

145

146

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

147

class _HostSetInventory(object):

148

"""Maintains a set of related `HostJobHistory` objects.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

149

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

150

Current usage of this class is that all DUTs are part of a single

151

scheduling pool of DUTs for a single model; however, this class make

152

no assumptions about the actual relationship among the DUTs.

153

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

154

The collection is segregated into disjoint categories of "working",

155

"broken", and "idle" DUTs. Accessor methods allow finding both the

156

list of DUTs in each category, as well as counts of each category.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

157

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

158

Performance note: Certain methods in this class are potentially

159

expensive:

160

* `get_working()`

161

* `get_working_list()`

162

* `get_broken()`

163

* `get_broken_list()`

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

164

* `get_idle()`

165

* `get_idle_list()`

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

166

The first time any one of these methods is called, it causes

167

multiple RPC calls with a relatively expensive set of database

168

queries. However, the results of the queries are cached in the

169

individual `HostJobHistory` objects, so only the first call

170

actually pays the full cost.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

171

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

172

Additionally, `get_working_list()`, `get_broken_list()` and

173

`get_idle_list()` cache their return values to avoid recalculating

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

174

lists at every call; this caching is separate from the caching of

175

RPC results described above.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

176

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

177

This class is deliberately constructed to delay the RPC cost until

178

the accessor methods are called (rather than to query in

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

179

`record_host()`) so that it's possible to construct a complete

180

`_LabInventory` without making the expensive queries at creation

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

181

time. `_populate_model_counts()`, below, assumes this behavior.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

"""

def __init__(self):

self._histories = []

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

186

self._working_list = None

187

self._broken_list = None

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

188

self._idle_list = None

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

189

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

190

def record_host(self, host_history):

191

"""Add one `HostJobHistory` object to the collection.

192

193

@param host_history The `HostJobHistory` object to be

194

remembered.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

195

"""

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

196

self._working_list = None

197

self._broken_list = None

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

198

self._idle_list = None

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

199

self._histories.append(host_history)

200

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

201

def get_working_list(self):

202

"""Return a list of all working DUTs in the pool.

203

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

204

Filter `self._histories` for histories where the DUT is

205

diagnosed as working.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

206

207

Cache the result so that we only cacluate it once.

208

209

@return A list of HostJobHistory objects.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

210

"""

211

if self._working_list is None:

212

self._working_list = [h for h in self._histories

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

213

if _host_is_working(h)]

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

214

return self._working_list

215

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

216

def get_working(self):

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

217

"""Return the number of working DUTs in the pool."""

218

return len(self.get_working_list())

219

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

220

def get_broken_list(self):

221

"""Return a list of all broken DUTs in the pool.

222

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

223

Filter `self._histories` for histories where the DUT is

224

diagnosed as broken.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

225

226

Cache the result so that we only cacluate it once.

227

228

@return A list of HostJobHistory objects.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

229

"""

230

if self._broken_list is None:

231

self._broken_list = [h for h in self._histories

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

232

if _host_is_broken(h)]

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

233

return self._broken_list

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

234

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

235

def get_broken(self):

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

236

"""Return the number of broken DUTs in the pool."""

237

return len(self.get_broken_list())

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

238

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

239

def get_idle_list(self):

240

"""Return a list of all idle DUTs in the pool.

241

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

242

Filter `self._histories` for histories where the DUT is

243

diagnosed as idle.

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

244

245

Cache the result so that we only cacluate it once.

246

247

@return A list of HostJobHistory objects.

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

248

"""

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

249

if self._idle_list is None:

250

self._idle_list = [h for h in self._histories

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

251

if _host_is_idle(h)]

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

252

return self._idle_list

253

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

254

def get_idle(self):

255

"""Return the number of idle DUTs in the pool."""

256

return len(self.get_idle_list())

257

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

258

def get_total(self):

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

259

"""Return the total number of DUTs in the pool."""

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

260

return len(self._histories)

261

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

262

def get_all_histories(self):

263

return self._histories

264

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

265

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

266

class _PoolSetInventory(object):
    """Maintains a set of `HostJobHistory`s for a set of pools.

    The collection is segregated into disjoint categories of "working",
    "broken", and "idle" DUTs. Accessor methods allow finding both the
    list of DUTs in each category, as well as counts of each category.
    Accessor queries can be for an individual pool, or against all
    pools.

    Performance note: This class relies on `_HostSetInventory`. Public
    methods in this class generally rely on methods of the same name in
    the underlying class, and so will have the same underlying
    performance characteristics.
    """

    def __init__(self, pools):
        """Create one empty `_HostSetInventory` per pool.

        @param pools  Iterable of pool names to be tracked. Hosts
                      recorded later must belong to one of these.
        """
        self._histories_by_pool = {
            pool: _HostSetInventory() for pool in pools
        }

    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        The host is filed under `host_history.host_pool`; a pool that
        was not passed to the constructor raises `KeyError`.

        @param host_history The `HostJobHistory` object to be
                            remembered.
        """
        pool = host_history.host_pool
        self._histories_by_pool[pool].record_host(host_history)

    def _count_pool(self, get_pool_count, pool=None):
        """Internal helper to count hosts in a given pool.

        The `get_pool_count` parameter is a function to calculate
        the exact count of interest for the pool.

        @param get_pool_count  Function to return a count from a
                               `_HostSetInventory` object.
        @param pool            The pool to be counted. If `None`,
                               return the total across all pools.
        """
        if pool is None:
            return sum(get_pool_count(host_set)
                       for host_set in self._histories_by_pool.values())
        else:
            return get_pool_count(self._histories_by_pool[pool])

    def get_working_list(self):
        """Return a list of all working DUTs (across all pools).

        Go through all HostJobHistory objects across all pools,
        selecting all DUTs identified as working.

        @return A list of HostJobHistory objects.
        """
        l = []
        for p in self._histories_by_pool.values():
            l.extend(p.get_working_list())
        return l

    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool  The pool to be counted. If `None`, return the
                     total across all pools.

        @return The total number of working DUTs in the selected
                pool(s).
        """
        return self._count_pool(_HostSetInventory.get_working, pool)

    def get_broken_list(self):
        """Return a list of all broken DUTs (across all pools).

        Go through all HostJobHistory objects across all pools,
        selecting all DUTs identified as broken.

        @return A list of HostJobHistory objects.
        """
        l = []
        for p in self._histories_by_pool.values():
            l.extend(p.get_broken_list())
        return l

    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool  The pool to be counted. If `None`, return the
                     total across all pools.

        @return The total number of broken DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_broken, pool)

    def get_idle_list(self, pool=None):
        """Return a list of all idle DUTs in the given pool.

        Go through all HostJobHistory objects in the selected pool(s),
        selecting all DUTs identified as idle.

        @param pool: The pool to be listed. If `None`, return the list
                     across all pools.

        @return A list of HostJobHistory objects.
        """
        if pool is None:
            l = []
            # `.values()` (rather than `.itervalues()`) matches the
            # other list accessors in this class.
            for p in self._histories_by_pool.values():
                l.extend(p.get_idle_list())
            return l
        else:
            return self._histories_by_pool[pool].get_idle_list()

    def get_idle(self, pool=None):
        """Return the number of idle DUTs in a pool.

        @param pool: The pool to be counted. If `None`, return the total
                     across all pools.

        @return The total number of idle DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_idle, pool)

    def get_spares_buffer(self, spare_pool=SPARE_POOL):
        """Return the nominal number of working spares.

        Calculates and returns how many working spares there would
        be in the spares pool if all broken DUTs were in the spares
        pool. This number may be negative, indicating a shortfall
        in the critical pools.

        @param spare_pool  Name of the pool to treat as the spares pool.

        @return The total number DUTs in the spares pool, less the total
                number of broken DUTs in all pools.
        """
        return self.get_total(spare_pool) - self.get_broken()

    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool  The pool to be counted. If `None`, return the
                     total across all pools.

        @return The total number of DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_total, pool)

    def get_all_histories(self, pool=None):
        """Generate every recorded `HostJobHistory` in the pool(s).

        @param pool  The pool to be enumerated. If `None`, yield the
                     histories of all pools, one pool after another.

        @return A generator of HostJobHistory objects.
        """
        if pool is None:
            for p in self._histories_by_pool.values():
                for h in p.get_all_histories():
                    yield h
        else:
            for h in self._histories_by_pool[pool].get_all_histories():
                yield h

419

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

420

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

421

def _eligible_host(afehost):
    """Return whether this host is eligible for monitoring.

    A host is eligible if it has a (unique) 'model' label, it's in
    exactly one pool, and it has no labels from the
    `_EXCLUDED_LABELS` set.

    @param afehost  The host to be tested for eligibility. Only its
                    `labels` attribute (an iterable of label strings)
                    is consulted.

    @return True if the host should be monitored, False otherwise.
    """
    # DUTs without an existing, unique 'model' or 'pool' label
    # aren't meant to exist in the managed inventory; their presence
    # generally indicates an error in the database. Unfortunately
    # such errors have been seen to occur from time to time.
    #
    # The _LabInventory constructor requires hosts to conform to the
    # label restrictions, and may fail if they don't. Failing an
    # inventory run for a single bad entry is the wrong thing, so we
    # ignore the problem children here, to keep them out of the
    # inventory.
    models = [l for l in afehost.labels
              if l.startswith(constants.Labels.MODEL_PREFIX)]
    pools = [l for l in afehost.labels
             if l.startswith(constants.Labels.POOL_PREFIX)]
    excluded = _EXCLUDED_LABELS.intersection(afehost.labels)
    return len(models) == 1 and len(pools) == 1 and not excluded

446

447

448

class _LabInventory(collections.Mapping):

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

449

"""Collection of `HostJobHistory` objects for the Lab's inventory.

450

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

451

This is a dict-like collection indexed by model. Indexing returns

452

the _PoolSetInventory object associated with the model.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

453

"""

454

455

@classmethod

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

456

def create_inventory(cls, afe, start_time, end_time, modellist=[]):

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

457

"""Return a Lab inventory with specified parameters.

458

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

459

By default, gathers inventory from `HostJobHistory` objects for

460

all DUTs in the `MANAGED_POOLS` list. If `modellist` is

461

supplied, the inventory will be restricted to only the given

462

models.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

463

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

464

@param afe AFE object for constructing the

465

`HostJobHistory` objects.

466

@param start_time Start time for the `HostJobHistory` objects.

467

@param end_time End time for the `HostJobHistory` objects.

468

@param modellist List of models to include. If empty,

469

include all available models.

470

@return A `_LabInventory` object for the specified models.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

471

"""

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

472

target_pools = MANAGED_POOLS

473

label_list = [constants.Labels.POOL_PREFIX + l for l in target_pools]

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

474

afehosts = afe.get_hosts(labels__name__in=label_list)

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

475

if modellist:

J. Richard Barnette

2016-03-17 17:03:57 -0700

[diff] [blame]

476

# We're deliberately not checking host eligibility in this

477

# code path. This is a debug path, not used in production;

478

# it may be useful to include ineligible hosts here.

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

479

modelhosts = []

480

for model in modellist:

481

model_label = constants.Labels.MODEL_PREFIX + model

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

482

host_list = [h for h in afehosts

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

483

if model_label in h.labels]

484

modelhosts.extend(host_list)

485

afehosts = modelhosts

J. Richard Barnette

2016-03-17 17:03:57 -0700

[diff] [blame]

486

else:

Richard Barnette

3a40449

2018-02-08 13:57:01 -0800

[diff] [blame]

487

afehosts = [h for h in afehosts if _eligible_host(h)]

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

488

create = lambda host: (

489

status_history.HostJobHistory(afe, host,

490

start_time, end_time))

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

491

return cls([create(host) for host in afehosts], target_pools)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

492

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

493

def __init__(self, histories, pools):

494

models = {h.host_model for h in histories}

495

self._modeldata = {model: _PoolSetInventory(pools) for model in models}

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

496

self._dut_count = len(histories)

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

497

for h in histories:

498

self[h.host_model].record_host(h)

499

self._boards = {h.host_board for h in histories}

Prathmesh Prabhu

154cb2b

2017-11-08 17:36:51 -0800

[diff] [blame]

500

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

501

def __getitem__(self, key):

502

return self._modeldata.__getitem__(key)

Prathmesh Prabhu

021e784

2017-11-08 18:05:45 -0800

[diff] [blame]

503

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

504

def __len__(self):

505

return self._modeldata.__len__()

506

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

507

def __iter__(self):

508

return self._modeldata.__iter__()

509

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

510

def get_num_duts(self):

511

"""Return the total number of DUTs in the inventory."""

512

return self._dut_count

513

Prathmesh Prabhu

a5a0e3d

2017-11-09 08:53:53 -0800

[diff] [blame]

514

def get_num_models(self):

515

"""Return the total number of models in the inventory."""

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

516

return len(self)

517

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

518

def get_pool_models(self, pool):
    """Return all models in `pool`.

    @param pool  The pool to be inventoried for models.
    @return The set of model names whose `get_total(pool)` count
            is non-zero.
    """
    return {m for m, h in self.iteritems() if h.get_total(pool)}

524

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

525

def get_boards(self):
    """Return the collection of boards recorded for this inventory."""
    return self._boards

Prathmesh Prabhu

a5a0e3d

2017-11-09 08:53:53 -0800

[diff] [blame]

527

528

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

529

def _reportable_models(inventory, spare_pool=SPARE_POOL):
    """Iterate over all models subject to reporting.

    Yields the contents of `inventory.iteritems()` filtered to include
    only reportable models.  A model is reportable if it has DUTs in
    both `spare_pool` and at least one other pool.

    @param inventory   Mapping-like object providing `iteritems()`
                       that yields `(model, poolset)` pairs.
    @param spare_pool  The spare pool to be tested for reporting.
    """
    for model, poolset in inventory.iteritems():
        spares = poolset.get_total(spare_pool)
        total = poolset.get_total()
        # Skip models with no spares at all, and models that consist
        # of nothing but spares.
        if spares != 0 and spares != total:
            yield model, poolset

def _all_dut_histories(inventory):
    """Iterate over every DUT history held in `inventory`.

    Walks each value of `inventory.itervalues()` and yields, in
    order, every item of that value's `get_all_histories()`.

    @param inventory  Mapping-like object providing `itervalues()`.
    """
    for poolset in inventory.itervalues():
        for h in poolset.get_all_histories():
            yield h

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

551

def _sort_by_location(inventory_list):
    """Return a list of DUTs, organized by location.

    Take the given list of `HostJobHistory` objects, separate it
    into a list per lab, and sort each lab's list by location.  The
    order of sorting within a lab is
      * By row number within the lab,
      * then by rack number within the row,
      * then by host shelf number within the rack.

    Return a list of the sorted lists.

    Histories whose hostname does not match `_HOSTNAME_PATTERN` are
    left out of the result.

    Implementation note: host locations are sorted by converting
    each location into a base 100 number.  If row, rack or
    host numbers exceed the range [0..99], then sorting will
    break down.

    @param inventory_list  Iterable of `HostJobHistory` objects.
    @return A list of sorted lists of DUTs.
    """
    BASE = 100
    lab_lists = {}
    for history in inventory_list:
        location = _HOSTNAME_PATTERN.match(history.host.hostname)
        if location:
            lab = location.group(1)
            # Fold the three location numbers into a single sort key.
            key = 0
            for idx in location.group(2, 3, 4):
                key = BASE * key + int(idx)
            lab_lists.setdefault(lab, []).append((key, history))
    return_list = []
    for dut_list in lab_lists.values():
        # Sort on the numeric key only; histories are never compared.
        dut_list.sort(key=lambda t: t[0])
        return_list.append([t[1] for t in dut_list])
    return return_list

def _score_repair_set(buffer_counts, repair_list):
    """Return a numeric score rating a set of DUTs to be repaired.

    `buffer_counts` is a dictionary mapping model names to the size of
    the model's spares buffer.

    `repair_list` is a list of `HostJobHistory` objects for the DUTs to
    be repaired.

    This function calculates the new set of buffer counts that would
    result from the proposed repairs, and scores the new set using two
    numbers:
      * Worst case buffer count for any model (higher is better).  This
        is the more significant number for comparison.
      * Number of models at the worst case (lower is better).  This is
        the less significant number.

    Implementation note:  The score could fail to reflect the intended
    criteria if there are more than 1000 models in the inventory.

    @param buffer_counts  A dictionary mapping models to buffer counts.
                          Must not be empty.
    @param repair_list    A list of `HostJobHistory` objects for the
                          DUTs to be repaired.
    @return A numeric score.
    """
    # Go through `buffer_counts`, and create a list of new counts
    # that records the buffer count for each model after repair.
    # The new list of counts discards the model names, as they don't
    # contribute to the final score.
    _NMODELS = 1000
    pools = {h.host_pool for h in repair_list}
    repair_inventory = _LabInventory(repair_list, pools)
    new_counts = []
    for m, c in buffer_counts.iteritems():
        if m in repair_inventory:
            newcount = repair_inventory[m].get_total()
        else:
            newcount = 0
        new_counts.append(c + newcount)
    # Find the worst available spares count, and count how many
    # times that worst case occurs.
    worst_count = min(new_counts)
    num_worst = new_counts.count(worst_count)
    # Return the calculated score
    return _NMODELS * worst_count - num_worst

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

638

639

640

def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Returns a message recommending a list of broken DUTs to be repaired.
    The list of DUTs is selected based on these criteria:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical proximity.
      * DUTs for models with a low spares buffer are more important than
        DUTs with larger buffers.

    The algorithm used will guarantee that at least one DUT from a model
    with the lowest spares buffer will be recommended.  If the worst
    spares buffer number is shared by more than one model, the algorithm
    will tend to prefer repair sets that include more of those models
    over sets that cover fewer models.

    If no lab has any broken DUT to recommend, the message contains
    only the heading lines.

    @param inventory      `_LabInventory` object from which to generate
                          recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation message.
    """
    logging.debug('Creating DUT repair recommendations')
    model_buffer_counts = {}
    broken_list = []
    for model, counts in _reportable_models(inventory):
        logging.debug('Listing failed DUTs for %s', model)
        if counts.get_broken() != 0:
            model_buffer_counts[model] = counts.get_spares_buffer()
            broken_list.extend(counts.get_broken_list())
    # N.B. The logic inside this loop may seem complicated, but
    # simplification is hard:
    #   * Calculating an initial recommendation outside of
    #     the loop likely would make things more complicated,
    #     not less.
    #   * It's necessary to calculate an initial lab slice once per
    #     lab _before_ the while loop, in case the number of broken
    #     DUTs in a lab is less than `num_recommend`.
    recommendation = None
    best_score = None
    for lab_duts in _sort_by_location(broken_list):
        start = 0
        end = num_recommend
        lab_slice = lab_duts[start:end]
        lab_score = _score_repair_set(model_buffer_counts, lab_slice)
        # Slide a window of `num_recommend` DUTs across the lab's
        # location-sorted list, keeping the best-scoring window.
        while end < len(lab_duts):
            start += 1
            end += 1
            new_slice = lab_duts[start:end]
            new_score = _score_repair_set(model_buffer_counts, new_slice)
            if new_score > lab_score:
                lab_slice = new_slice
                lab_score = new_score
        if recommendation is None or lab_score > best_score:
            recommendation = lab_slice
            best_score = lab_score
    # N.B. The trailing space in `line_fmt` is mandatory:  Without it,
    # Gmail will parse the URL wrong.  Don't ask.  If you simply _must_
    # know more, go try it yourself...
    line_fmt = '%-30s %-16s %-6s\n %s '
    message = ['Repair recommendations:\n',
               line_fmt % ('Hostname', 'Model', 'Servo?', 'Logs URL')]
    # `recommendation` is still None when no lab produced any broken
    # DUTs; treat that as an empty list rather than iterating None.
    for h in recommendation or []:
        servo_name = servo_host.make_servo_hostname(h.host.hostname)
        servo_present = utils.host_is_in_lab_zone(servo_name)
        _, event = h.last_diagnosis()
        line = line_fmt % (
                h.host.hostname, h.host_model,
                'Yes' if servo_present else 'No', event.job_url)
        message.append(line)
    return '\n'.join(message)

710

711

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

712

def _generate_model_inventory_message(inventory):
    """Generate the "model inventory" e-mail message.

    The model inventory is a list by model summarizing the number of
    working, broken, and idle DUTs, and the total shortfall or surplus
    of working devices relative to the minimum critical pool
    requirement.

    The report omits models with no DUTs in the spare pool or with no
    DUTs in a critical pool.  Only models with at least one broken
    DUT appear in the per-model table; all reportable models are
    included in the overall totals.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating model inventory')
    nworking = 0
    nbroken = 0
    nidle = 0
    nbroken_models = 0
    ntotal_models = 0
    summaries = []
    column_names = (
        'Model', 'Avail', 'Bad', 'Idle', 'Good', 'Spare', 'Total')
    for model, counts in _reportable_models(inventory):
        logging.debug('Counting %2d DUTS for model %s',
                      counts.get_total(), model)
        # Summary elements laid out in the same order as the column
        # headers:
        #     Model  Avail   Bad  Idle  Good  Spare Total
        #     e[0]   e[1]    e[2] e[3]  e[4]  e[5]  e[6]
        element = (model,
                   counts.get_spares_buffer(),
                   counts.get_broken(),
                   counts.get_idle(),
                   counts.get_working(),
                   counts.get_total(SPARE_POOL),
                   counts.get_total())
        if element[2]:
            summaries.append(element)
            nbroken_models += 1
        ntotal_models += 1
        nbroken += element[2]
        nidle += element[3]
        nworking += element[4]
    ntotal = nworking + nbroken + nidle
    # Smallest spares buffer first; ties broken by most broken DUTs.
    summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
    if ntotal:
        broken_percent = int(round(100.0 * nbroken / ntotal))
        idle_percent = int(round(100.0 * nidle / ntotal))
        # Derived by subtraction so the three figures total 100.
        working_percent = 100 - broken_percent - idle_percent
    else:
        # Nothing reportable: avoid dividing by zero.
        broken_percent = idle_percent = working_percent = 0
    message = ['Summary of DUTs in inventory:',
               '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
               '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
                   nbroken, broken_percent,
                   nidle, idle_percent,
                   nworking, working_percent,
                   ntotal),
               '',
               'Models with failures: %d' % nbroken_models,
               'Models in inventory: %d' % ntotal_models,
               '', '',
               'Full model inventory:\n',
               '%-22s %5s %5s %5s %5s %5s %5s' % column_names]
    message.extend(
            ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
    return '\n'.join(message)

780

781

J. Richard Barnette

4845fcf

2015-04-20 14:26:25 -0700

[diff] [blame]

782

# Introductory text placed at the top of the "pool inventory" message.
_POOL_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: All models shown below are at
less than full strength, please take action to resolve the issues.
Once you're satisified that failures won't recur, failed DUTs can
be replaced with spares by running `balance_pool`. Detailed
instructions can be found here:
http://go/cros-manage-duts
'''

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

792

def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    The pool inventory is a list by pool and model summarizing the
    number of working, idle and broken DUTs in the pool.  Only models
    with at least one broken or idle DUT in the pool are included in
    the list.

    N.B. For sample output text formatted as users can expect to see it
    in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating pool inventory')
    message = [_POOL_INVENTORY_HEADER]
    # Separator before each pool's section; empty for the first pool.
    newline = ''
    for pool in CRITICAL_POOLS:
        message.append(
            '%sStatus for pool:%s, by model:' % (newline, pool))
        message.append(
            '%-20s %5s %5s %5s %5s' % (
                'Model', 'Bad', 'Idle', 'Good', 'Total'))
        data_list = []
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            broken = counts.get_broken(pool)
            idle = counts.get_idle(pool)
            # models at full strength are not reported
            if not broken and not idle:
                continue
            working = counts.get_working(pool)
            total = counts.get_total(pool)
            data_list.append((model, broken, idle, working, total))
        if data_list:
            # Models with the most broken DUTs come first.
            data_list = sorted(data_list, key=lambda d: -d[1])
            message.extend(
                ['%-20s %5d %5d %5d %5d' % t for t in data_list])
        else:
            message.append('(All models at full strength)')
        newline = '\n'
    return '\n'.join(message)

834

835

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

836

# Introductory text placed at the top of the "idle inventory" message.
_IDLE_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: The hosts shown below haven't
run any jobs for at least 24 hours. Please check each host; locked
hosts should normally be unlocked; stuck jobs should normally be
aborted.
'''

def _generate_idle_inventory_message(inventory):
    """Generate the "idle inventory" e-mail message.

    The idle inventory is a host list with corresponding pool and model,
    where the hosts are identified as idle.  Hosts are gathered for
    every pool in `MANAGED_POOLS`.

    N.B. For sample output text format as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating idle inventory')
    message = [_IDLE_INVENTORY_HEADER]
    message.append('Idle Host List:')
    message.append('%-30s %-20s %s' % ('Hostname', 'Model', 'Pool'))
    data_list = []
    for pool in MANAGED_POOLS:
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            data_list.extend([(dut.host.hostname, model, pool)
                              for dut in counts.get_idle_list(pool)])
    if data_list:
        message.extend(['%-30s %-20s %s' % t for t in data_list])
    else:
        message.append('(No idle DUTs)')
    return '\n'.join(message)

872

873

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

874

def _send_email(arguments, tag, subject, recipients, body):
    """Send an inventory e-mail message.

    The message is logged in the selected log directory using `tag` for
    the file name.

    If the --debug option was requested, the message is neither logged
    nor sent, but merely printed on stdout.

    Failures to write the log file or to send the e-mail are logged
    as errors and otherwise ignored; a failed log write does not
    prevent the send attempt.

    @param arguments   Parsed command-line options.
    @param tag         Tag identifying the inventory for logging
                       purposes.
    @param subject     E-mail Subject: header line.
    @param recipients  E-mail addresses for the To: header line.
    @param body        E-mail message body.
    """
    logging.debug('Generating email: "%s"', subject)
    all_recipients = ', '.join(recipients)
    report_body = '\n'.join([
            'To: %s' % all_recipients,
            'Subject: %s' % subject,
            '', body, ''])
    if arguments.debug:
        # Parenthesized form behaves the same whether `print` is a
        # statement or a function.
        print(report_body)
    else:
        filename = os.path.join(arguments.logdir, tag)
        try:
            # `with` closes the file even if the write fails.
            with open(filename, 'w') as report_file:
                report_file.write(report_body)
        except EnvironmentError as e:
            logging.error('Failed to write %s: %s', filename, e)
        try:
            gmail_lib.send_email(all_recipients, subject, body)
        except Exception as e:
            logging.error('Failed to send e-mail to %s: %s',
                          all_recipients, e)

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

913

def _populate_model_counts(inventory):
    """Gather model counts while providing interactive feedback.

    Gathering the status of all individual DUTs in the lab can take
    considerable time (~30 minutes at the time of this writing).
    Normally, we pay that cost by querying as we go.  However, with
    the `--debug` option, we expect a human being to be watching the
    progress in real time.  So, we force the first (expensive) queries
    to happen up front, and provide simple ASCII output on sys.stdout
    to show a progress bar and results.

    One character is written per model: '+' for the 5th, 15th, ...
    model, the tens digit for every 10th model, and '.' otherwise.

    @param inventory  `_LabInventory` object from which to gather
                      counts.
    """
    total_broken = 0
    for n, counts in enumerate(inventory.itervalues(), 1):
        if n % 10 == 5:
            c = '+'
        elif n % 10 == 0:
            c = '%d' % ((n // 10) % 10)
        else:
            c = '.'
        sys.stdout.write(c)
        sys.stdout.flush()
        # This next call is where all the time goes - it forces all of a
        # model's `HostJobHistory` objects to query the database and
        # cache their results.
        total_broken += counts.get_broken()
    sys.stdout.write('\n')
    sys.stdout.write('Found %d broken DUTs\n' % total_broken)

945

946

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

947

def _perform_model_inventory(arguments, inventory, timestamp):
    """Perform the model inventory report.

    The model inventory report consists of the following:
      * A list of DUTs that are recommended to be repaired.  This list
        is optional, and only appears if the `--recommend` option is
        present.
      * A list of all models that have failed DUTs, with counts
        of working, broken, and spare DUTs, among others.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp
                      in logs and email output.
    """
    if arguments.recommend:
        recommend_message = _generate_repair_recommendation(
                inventory, arguments.recommend) + '\n\n\n'
    else:
        recommend_message = ''
    model_message = _generate_model_inventory_message(inventory)
    _send_email(arguments,
                'models-%s.txt' % timestamp,
                'DUT model inventory %s' % timestamp,
                arguments.model_notify,
                recommend_message + model_message)

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

974

975

976

def _perform_pool_inventory(arguments, inventory, timestamp):
    """Perform the pool inventory report.

    The pool inventory report consists of the following:
      * A list of all critical pools that have failed DUTs, with counts
        of working, broken, and idle DUTs.
      * A list of all idle DUTs by hostname including the model and
        pool.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp in
                      logs and email output.
    """
    pool_message = _generate_pool_inventory_message(inventory)
    idle_message = _generate_idle_inventory_message(inventory)
    _send_email(arguments,
                'pools-%s.txt' % timestamp,
                'DUT pool inventory %s' % timestamp,
                arguments.pool_notify,
                pool_message + '\n\n\n' + idle_message)

998

999

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1000

def _dut_in_repair_loop(history):
    """Return whether a DUT's history indicates a repair loop.

    A DUT is considered looping if it runs no tests, and no tasks pass
    other than repair tasks.

    @param history  An instance of `status_history.HostJobHistory` to be
                    scanned for a repair loop.  The caller guarantees
                    that this history corresponds to a working DUT.
    @returns  `True` if the DUT's most recent history indicates a
              repair loop, `False` otherwise.
    """
    # Our caller passes only histories for working DUTs; that means
    # we've already paid the cost of fetching the diagnosis task, and
    # we know that the task was successful.  The diagnosis task will be
    # one of the tasks we must scan to find a loop, so if the task isn't
    # a repair task, then our history includes a successful non-repair
    # task, and we're not looping.
    #
    # The for loop below is very expensive, because it must fetch the
    # full history, regardless of how many tasks we examine.  At the
    # time of this writing, this check against the diagnosis task
    # reduces the cost of finding loops in the full inventory from hours
    # to minutes.
    if history.last_diagnosis()[1].name != 'Repair':
        return False
    repair_ok_count = 0
    for task in history:
        if not task.is_special:
            # This is a test, so we're not looping.
            return False
        if task.diagnosis == status_history.BROKEN:
            # Failed a repair, so we're not looping.
            return False
        if (task.diagnosis == status_history.WORKING
                and task.name != 'Repair'):
            # Non-repair task succeeded, so we're not looping.
            return False
        # At this point, we have either a failed non-repair task, or
        # a successful repair.
        if task.name == 'Repair':
            repair_ok_count += 1
            if repair_ok_count >= _REPAIR_LOOP_THRESHOLD:
                return True
    # History exhausted before reaching the threshold: not a loop.
    return False

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

1046

def _report_untestable_dut(history, state):
    """Log one untestable DUT and record it in the presence metric.

    The same set of fields is used both for the log message and for
    the call to `_UNTESTABLE_PRESENCE_METRIC.set()`.

    @param history  History object for the DUT.  Its `hostname`,
                    `host_model`, and `host_pool` attributes are
                    read to populate the fields.
    @param state    String naming the untestable state; reported in
                    the 'state' field.
    """
    metric_fields = dict(dut_hostname=history.hostname,
                         model=history.host_model,
                         pool=history.host_pool,
                         state=state)
    message = ('DUT in state %(state)s: %(dut_hostname)s, '
               'model: %(model)s, pool: %(pool)s')
    # A single dict argument lets `logging` fill in the named
    # `%(key)s` placeholders.
    logging.info(message, metric_fields)
    _UNTESTABLE_PRESENCE_METRIC.set(True, fields=metric_fields)

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1056

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1057

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

1058

def _report_untestable_dut_metrics(inventory):
    """Find and report DUTs in the inventory that can't run tests.

    A DUT is treated as "untestable" when either of these holds:
      * The DUT is working, but is stuck in a repair loop; that is,
        it regularly passes repair, but never passes other
        operations.
      * The DUT is idle (it runs no tasks at all), yet it is not
        locked.

    Every DUT history in the given inventory is examined, and each
    DUT found in one of these states is reported through
    `_report_untestable_dut()`, which records a Monarch presence
    metric.

    Note:  So that DUTs aren't flagged as "idle" merely because
    there's no work, a separate job runs prior to regular inventory
    runs and schedules trivial work on any DUT that appears idle.

    @param inventory  `_LabInventory` object to be reported on.
    """
    logging.info('Scanning for untestable DUTs.')
    for history in _all_dut_histories(inventory):
        # Managed DUTs whose names fail to match _HOSTNAME_PATTERN
        # shouldn't exist.  Still, we don't want arbitrary strings
        # ending up in the 'dut_hostname' metric field, so any such
        # anomaly is skipped outright.
        if not _HOSTNAME_PATTERN.match(history.hostname):
            continue
        untestable_state = None
        if _host_is_working(history):
            # The repair loop scan is only valid (and only done)
            # for working DUTs.
            if _dut_in_repair_loop(history):
                untestable_state = 'repair_loop'
        elif _host_is_idle(history) and not history.host.locked:
            untestable_state = 'idle_unlocked'
        if untestable_state is not None:
            _report_untestable_dut(history, untestable_state)

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1093

1094

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

1095

def _log_startup(arguments, startup_time):
    """Log the start of this inventory run.

    Emit debug-level log messages announcing the run and the
    reports it will include.  Return a string derived from
    `startup_time` that identifies this run in log files and e-mail
    messages.

    @param arguments     Command-line arguments as returned by
                         `ArgumentParser`.  The `model_notify`,
                         `pool_notify`, and `recommend` attributes
                         select which messages are logged.
    @param startup_time  A UNIX timestamp marking the moment when
                         this inventory run began.
    @returns A timestamp string (local time, hour resolution) that
             will be used to identify this run in logs and email
             output.
    """
    run_id = time.strftime('%Y-%m-%d.%H', time.localtime(startup_time))
    logging.debug('Starting lab inventory for %s', run_id)
    notes = []
    if arguments.model_notify:
        # Repair recommendations are only part of the model report.
        if arguments.recommend:
            notes.append('Will include repair recommendations')
        notes.append('Will include model inventory')
    if arguments.pool_notify:
        notes.append('Will include pool inventory')
    for note in notes:
        logging.debug(note)
    return run_id

def _create_inventory(arguments, end_time):
    """Build the `_LabInventory` instance used for reporting.

    The inventory covers the `arguments.duration` hours leading up
    to `end_time`, and is restricted to the models named in
    `arguments.modelnames`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    @param end_time   A UNIX timestamp for the end of the time range
                      to be searched in this inventory run.
    @returns The newly created `_LabInventory` object.
    """
    # `duration` is expressed in hours; the time range is in seconds.
    seconds_to_search = arguments.duration * 60 * 60
    begin_time = end_time - seconds_to_search
    afe = frontend_wrappers.RetryingAFE(server=None)
    lab_inventory = _LabInventory.create_inventory(
            afe, begin_time, end_time, arguments.modelnames)
    dut_count = lab_inventory.get_num_duts()
    model_count = lab_inventory.get_num_models()
    logging.info('Found %d hosts across %d models',
                 dut_count, model_count)
    return lab_inventory

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1135

def _perform_inventory_reports(arguments):
    """Run every inventory check requested on the command line.

    Log the start of the run, create the inventory, and then perform
    each report selected by the parsed command-line arguments:
      * `debug`:  populate the model counts.
      * `model_notify`:  the model inventory report.
      * `pool_notify`:  the pool inventory report.
      * `report_untestable`:  the untestable DUT metrics.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    """
    # One timestamp serves as both the end of the search range and
    # the source of the run's identifying string.
    now = time.time()
    run_id = _log_startup(arguments, now)
    lab_inventory = _create_inventory(arguments, now)
    if arguments.debug:
        _populate_model_counts(lab_inventory)
    if arguments.model_notify:
        _perform_model_inventory(arguments, lab_inventory, run_id)
    if arguments.pool_notify:
        _perform_pool_inventory(arguments, lab_inventory, run_id)
    if arguments.report_untestable:
        _report_untestable_dut_metrics(lab_inventory)

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1155

1156

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1157

def _separate_email_addresses(address_list):
    """Flatten a list of comma-separated e-mail address lists.

    Each string in `address_list` is split at commas, and
    surrounding whitespace is stripped from every piece.  Pieces
    are not otherwise filtered, so an empty piece (e.g. from a
    trailing comma) yields an empty string in the result.

    @param address_list  A list of strings containing
                         comma-separated e-mail addresses.
    @return A new list of the individual e-mail addresses, in order.
    """
    return [address.strip()
            for group in address_list
            for address in group.split(',')]

def _verify_arguments(arguments):
    """Validate command-line arguments.

    The comma-separated e-mail addresses given for `--model-notify`
    and `--pool-notify` in separate option arguments are merged
    into a single list per option, replacing the attributes on
    `arguments`.

    Outside of debug mode, at least one inventory report must be
    requested.  In debug mode, when no report is specified, both
    e-mail reports are enabled instead.

    The return value indicates success or failure; on failure, an
    error message is also written to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.
    """
    arguments.model_notify = _separate_email_addresses(
            arguments.model_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)
    report_requested = (arguments.model_notify
                        or arguments.pool_notify
                        or arguments.report_untestable)
    if report_requested:
        return True
    if arguments.debug:
        # We want to run all the e-mail reports.  A report with an
        # empty notify list is skipped, so give each list a single
        # (empty) entry to keep it non-empty.
        arguments.model_notify = ['']
        arguments.pool_notify = ['']
        return True
    sys.stderr.write('Must request at least one report via '
                     '--model-notify, --pool-notify, or '
                     '--report-untestable\n')
    return False

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1206

1207

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

1208

def _get_default_logdir(script):
    """Compute the default directory for the `--logdir` option.

    The result is the `_LOGDIR` entry under the parent of the
    directory that contains this script.

    @param script  Path to this script file.
    @return A path to a directory.
    """
    script_dir = os.path.dirname(os.path.abspath(script))
    parent_dir = os.path.dirname(script_dir)
    return os.path.join(parent_dir, _LOGDIR)

1220

1221

1222

def _parse_command(argv):
    """Parse and validate the command line arguments.

    Build an argument parser for this command's syntax, parse the
    command line, and check the result with `_verify_arguments()`.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args(), or
            `None` if `_verify_arguments()` rejects the arguments.
    """
    duration_help = ('number of hours back to search for status'
                     ' (default: %d)' % _DEFAULT_DURATION)
    model_notify_help = ('Generate model inventory message, '
                         'and send it to the given e-mail address(es)')
    pool_notify_help = ('Generate pool inventory message, '
                        'and send it to the given address(es)')
    recommend_help = ('Specify how many DUTs should be '
                      'recommended for repair (default: no '
                      'recommendation)')
    debug_help = ('Print e-mail, metrics messages on stdout '
                  'without sending them.')
    modelnames_help = ('names of models to report on '
                       '(default: all models)')

    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Gather and report lab inventory statistics')
    parser.add_argument('-d', '--duration', type=int,
                        default=_DEFAULT_DURATION, metavar='HOURS',
                        help=duration_help)
    # The notify options may be repeated; each use appends one
    # (possibly comma-separated) string to the option's list.
    parser.add_argument('--model-notify', action='append',
                        default=[], metavar='ADDRESS',
                        help=model_notify_help)
    parser.add_argument('--pool-notify', action='append',
                        default=[], metavar='ADDRESS',
                        help=pool_notify_help)
    parser.add_argument('-r', '--recommend', type=int, default=None,
                        help=recommend_help)
    parser.add_argument('--report-untestable', action='store_true',
                        help='Check for devices unable to run tests.')
    parser.add_argument('--debug', action='store_true',
                        help=debug_help)
    parser.add_argument('--logdir', default=_get_default_logdir(argv[0]),
                        help='Directory where logs will be written.')
    parser.add_argument('modelnames', nargs='*',
                        metavar='MODEL',
                        help=modelnames_help)
    parsed = parser.parse_args(argv[1:])
    return parsed if _verify_arguments(parsed) else None

def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--debug` option was provided
    on the command line.
      * Without the option, we configure the logging to capture all
        potentially relevant events in a log file.  The log file is
        configured to rotate once a week on Friday evening,
        preserving ~3 months worth of history.
      * With the option, we expect stdout to contain other
        human-readable output (including the contents of the e-mail
        messages), so we restrict the output to INFO level.

    For convenience, when `--debug` is on, the logging format has
    no adornments, so that a call like `logging.info(msg)` simply
    writes `msg` to stdout, plus a trailing newline.

    On return, the handler configured here is the only handler
    installed on the root logger.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    """
    root_logger = logging.getLogger()
    if arguments.debug:
        root_logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter())
    else:
        if not os.path.exists(arguments.logdir):
            os.mkdir(arguments.logdir)
        root_logger.setLevel(logging.DEBUG)
        logfile = os.path.join(arguments.logdir, _LOGFILE)
        handler = logging.handlers.TimedRotatingFileHandler(
                logfile, when='W4', backupCount=13)
        formatter = logging.Formatter(_LOG_FORMAT,
                                      time_utils.TIME_FMT)
        handler.setFormatter(formatter)
    # TODO(jrbarnette) This is gross.  Importing client.bin.utils
    # implicitly imported logging_config, which calls
    # logging.basicConfig() *at module level*.  That gives us an
    # extra logging handler that we don't want.  So, clear out all
    # the handlers here.
    #
    # Iterate over a copy of the list:  removeHandler() deletes from
    # `root_logger.handlers` in place, and deleting from a list
    # while iterating over it skips every other entry, which would
    # leave some unwanted handlers installed.
    for h in list(root_logger.handlers):
        root_logger.removeHandler(h)
    root_logger.addHandler(handler)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1312

1313

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1314

def main(argv):
    """Standard main routine.

    Parse the command line, set up logging, and run the requested
    inventory reports with metrics reporting set up around them.
    Exits with status 1 if the command line is rejected.  A
    `KeyboardInterrupt` during the reports is swallowed; any other
    exception is logged and re-raised.  In every case, a tick
    counter carrying the success status is incremented and metrics
    are flushed before returning.

    @param argv  Command line arguments, including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)

    metrics_file = None
    if arguments.debug:
        logging.info('--debug mode: Will not report metrics to monarch')
        metrics_file = '/dev/null'

    ts_mon_state = site_utils.SetupTsMonGlobalState(
            'lab_inventory', debug_file=metrics_file, auto_flush=False)
    with ts_mon_state:
        succeeded = False
        try:
            with metrics.SecondsTimer('%s/duration' % _METRICS_PREFIX):
                _perform_inventory_reports(arguments)
            succeeded = True
        except KeyboardInterrupt:
            pass
        except Exception:
            # Our cron setup doesn't preserve stderr, so drop extra
            # breadcrumbs in the log before propagating the error.
            logging.exception('Error escaped main')
            raise
        finally:
            tick_counter = metrics.Counter('%s/tick' % _METRICS_PREFIX)
            tick_counter.increment(fields={'success': succeeded})
            metrics.Flush()

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1348

1349

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

1350

def get_inventory(afe):
    """Create an inventory covering the most recent 24 hours.

    @param afe  AFE object passed through to
                `_LabInventory.create_inventory()`.
    @return The result of `_LabInventory.create_inventory()` for
            the time range ending now and starting one day earlier.
    """
    now = int(time.time())
    one_day_ago = now - 24 * 60 * 60
    return _LabInventory.create_inventory(afe, one_day_ago, now)

1354

1355

1356

def get_managed_boards(afe):
    """Return the boards reported by the last day's inventory.

    @param afe  AFE object passed through to `get_inventory()`.
    @return Whatever `get_boards()` returns for the inventory
            created by `get_inventory()`.
    """
    inventory = get_inventory(afe)
    return inventory.get_boards()

J. Richard Barnette

aa86893

2015-10-23 13:28:59 -0700

[diff] [blame]

1358

1359

J. Richard Barnette