Blame - site_utils/lab_inventory.py - platform/external/autotest

2015-03-27 17:23:52 -0700

[diff] [blame]

2

3

# Use of this source code is governed by a BSD-style license that can be

4

# found in the LICENSE file.

5

6

"""Create e-mail reports of the Lab's DUT inventory.

7

8

Gathers a list of all DUTs of interest in the Lab, segregated by

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

9

model and pool, and determines whether each DUT is working or

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

10

broken. Then, send one or more e-mail reports summarizing the

11

status to e-mail addresses provided on the command line.

12

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

13

usage: lab_inventory.py [ options ] [ model ... ]

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

14

15

Options:

16

--duration / -d <hours>

17

How far back in time to search job history to determine DUT

18

status.

19

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

20

--model-notify <address>[,<address>]

21

Send the "model status" e-mail to all the specified e-mail

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

22

addresses.

23

24

--pool-notify <address>[,<address>]

25

Send the "pool status" e-mail to all the specified e-mail

26

addresses.

27

J. Richard Barnette

2015-06-09 10:06:17 -0700

[diff] [blame]

28

--recommend <number>

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

29

When generating the "model status" e-mail, include a list of

J. Richard Barnette

2015-06-09 10:06:17 -0700

[diff] [blame]

30

<number> specific DUTs to be recommended for repair.

31

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

32

--report-untestable

33

Scan the inventory for DUTs that can't test because they're stuck in

34

repair loops, or because the scheduler can't give them work.

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

35

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

36

--logdir <directory>

37

Log progress and actions in a file under this directory. Text

38

of any e-mail sent will also be logged in a timestamped file in

39

this directory.

40

J. Richard Barnette

2015-10-13 16:02:47 -0700

[diff] [blame]

41

--debug

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

42

Suppress all logging, metrics reporting, and sending e-mail.

43

Instead, write the output that would be generated onto stdout.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

44

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

45

<model> arguments:

46

With no arguments, gathers the status for all models in the lab.

47

With one or more named models on the command line, restricts

48

reporting to just those models.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

"""

import argparse

Prathmesh Prabhu

021e784

2017-11-08 18:05:45 -0800

[diff] [blame]

53

import collections

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

54

import logging

55

import logging.handlers

56

import os

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

57

import re

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

import sys

import time

import common

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

62

from autotest_lib.client.bin import utils

Richard Barnette

6f6ce32

2018-09-07 16:23:20 +0000

[diff] [blame]

63

from autotest_lib.client.common_lib import time_utils

Jacob Kopczynski

2018-08-13 17:24:41 -0700

[diff] [blame]

64

from autotest_lib.frontend.afe.json_rpc import proxy

Xixuan Wu

93e646c

2017-12-07 18:36:10 -0800

[diff] [blame]

65

from autotest_lib.server import constants

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

66

from autotest_lib.server import site_utils

J. Richard Barnette

a7c514e

2015-09-15 11:13:23 -0700

[diff] [blame]

67

from autotest_lib.server.cros.dynamic_suite import frontend_wrappers

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

68

from autotest_lib.server.hosts import servo_host

Aviv Keshet

7ee9586

2016-08-30 15:18:27 -0700

[diff] [blame]

69

from autotest_lib.server.lib import status_history

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

70

from autotest_lib.site_utils import gmail_lib

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

71

from chromite.lib import metrics

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

72

73

Richard Barnette

673573b

2016-12-12 09:46:39 -0800

[diff] [blame]

74

# Pool names taken from `constants.Pools`, aliased here for brevity.
CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS
SPARE_POOL = constants.Pools.SPARE_POOL
MANAGED_POOLS = constants.Pools.MANAGED_POOLS

# _EXCLUDED_LABELS - A set of labels that disqualify a DUT from
#     monitoring by this script.  Currently, we're excluding these:
#   + 'adb' - We're not ready to monitor Android or Brillo hosts.
#   + 'board:guado_moblab' - These are maintained by a separate
#     process that doesn't use this script.
#   + 'board:veyron_rialto' due to crbug.com/854404

_EXCLUDED_LABELS = {'adb', 'board:guado_moblab',
                    'board:veyron_rialto'}

# _DEFAULT_DURATION:
#     Default value used for the --duration command line option
#     (a number of hours).  Specifies how far back in time to search
#     in order to determine DUT status.

_DEFAULT_DURATION = 24

# _LOGDIR:
#     Relative path used in the calculation of the default setting for
#     the --logdir option.  The full path is relative to the root of the
#     autotest directory, as determined from sys.argv[0].
# _LOGFILE:
#     Basename of a file to which general log information will be
#     written.
# _LOG_FORMAT:
#     Format string for log messages.

_LOGDIR = os.path.join('logs', 'dut-data')
_LOGFILE = 'lab-inventory.log'
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

# Pattern describing location-based host names in the Chrome OS test
# labs.  Each DUT hostname designates the DUT's location:
#   * A lab (room) that's physically separated from other labs
#     (i.e. there's a door).
#   * A row (or aisle) of DUTs within the lab.
#   * A vertical rack of shelves on the row.
#   * A specific host on one shelf of the rack.
# The four capture groups are, in order: lab, row, rack, and host.

_HOSTNAME_PATTERN = re.compile(
        r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')

# _REPAIR_LOOP_THRESHOLD:
#    The number of repeated Repair tasks that must be seen to declare
#    that a DUT is stuck in a repair loop.

_REPAIR_LOOP_THRESHOLD = 4

# _METRICS_PREFIX:
#     Common name prefix shared by the metrics defined below.
# _UNTESTABLE_PRESENCE_METRIC:
#     Boolean metric named '<prefix>/untestable'.
# _MISSING_DUT_METRIC:
#     Counter metric named '<prefix>/missing'.

_METRICS_PREFIX = 'chromeos/autotest/inventory'
_UNTESTABLE_PRESENCE_METRIC = metrics.BooleanMetric(
    _METRICS_PREFIX + '/untestable',
    'DUTs that cannot be scheduled for testing')

_MISSING_DUT_METRIC = metrics.Counter(
    _METRICS_PREFIX + '/missing', 'DUTs which cannot be found by lookup queries'
    ' because they are invalid or deleted')

# _Diagnosis - namedtuple corresponding to the return value from
#     `HostHistory.last_diagnosis()`; fields are `status` and `task`.
_Diagnosis = collections.namedtuple('_Diagnosis', ['status', 'task'])

139

Richard Barnette

2018-09-14 15:25:30 -0700

[diff] [blame]

140

def _get_diagnosis(history):
    """Return the most recent diagnosis for one DUT.

    Wraps the result of `history.last_diagnosis()` in a `_Diagnosis`
    tuple.  If the diagnosis is `status_history.BROKEN` but the
    diagnosing task ended before `history.start_time`, the status is
    reported as `status_history.UNUSED` instead (with the same task).

    Every call increments `_MISSING_DUT_METRIC`, tagged with the host
    name and with whether the lookup succeeded.

    @param history  Object providing `last_diagnosis()`, `start_time`
                    and `hostname`.

    @return A `_Diagnosis` tuple.  If the lookup raises
            `proxy.JSONRPCException`, the exception is logged and
            `_Diagnosis(None, None)` is returned.
    """
    dut_present = True
    try:
        diagnosis = _Diagnosis(*history.last_diagnosis())
        if (diagnosis.status == status_history.BROKEN
                and diagnosis.task.end_time < history.start_time):
            return _Diagnosis(status_history.UNUSED, diagnosis.task)
        else:
            return diagnosis
    except proxy.JSONRPCException as e:
        # `logging.warn` is a deprecated alias; use `logging.warning`.
        logging.warning(e)
        dut_present = False
    finally:
        # Runs on every exit path, including the early returns above.
        _MISSING_DUT_METRIC.increment(
            fields={'host': history.hostname, 'presence': dut_present})
    # Only reached when the RPC lookup failed.
    return _Diagnosis(None, None)

156

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

157

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

158

def _host_is_working(history):
    """Return whether the DUT's latest diagnosis is "working".

    @param history  History object passed through to `_get_diagnosis()`.

    @return True iff the diagnosed status equals
            `status_history.WORKING`.
    """
    status = _get_diagnosis(history).status
    return status == status_history.WORKING

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

160

161

162

def _host_is_broken(history):
    """Return whether the DUT's latest diagnosis is "broken".

    @param history  History object passed through to `_get_diagnosis()`.

    @return True iff the diagnosed status equals
            `status_history.BROKEN`.
    """
    status = _get_diagnosis(history).status
    return status == status_history.BROKEN

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

164

165

166

def _host_is_idle(history):
    """Return whether the DUT's latest diagnosis counts as "idle".

    A DUT is idle when its diagnosed status is either
    `status_history.UNUSED` or `status_history.UNKNOWN`.

    @param history  History object passed through to `_get_diagnosis()`.

    @return True iff the diagnosed status is one of the idle statuses.
    """
    status = _get_diagnosis(history).status
    return status in {status_history.UNUSED, status_history.UNKNOWN}

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

169

170

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

171

class _HostSetInventory(object):

172

"""Maintains a set of related `HostJobHistory` objects.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

173

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

174

Current usage of this class is that all DUTs are part of a single

175

scheduling pool of DUTs for a single model; however, this class make

176

no assumptions about the actual relationship among the DUTs.

177

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

178

The collection is segregated into disjoint categories of "working",

179

"broken", and "idle" DUTs. Accessor methods allow finding both the

180

list of DUTs in each category, as well as counts of each category.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

181

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

182

Performance note: Certain methods in this class are potentially

183

expensive:

184

* `get_working()`

185

* `get_working_list()`

186

* `get_broken()`

187

* `get_broken_list()`

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

188

* `get_idle()`

189

* `get_idle_list()`

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

190

The first time any one of these methods is called, it causes

191

multiple RPC calls with a relatively expensive set of database

192

queries. However, the results of the queries are cached in the

193

individual `HostJobHistory` objects, so only the first call

194

actually pays the full cost.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

195

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

196

Additionally, `get_working_list()`, `get_broken_list()` and

197

`get_idle_list()` cache their return values to avoid recalculating

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

198

lists at every call; this caching is separate from the caching of

199

RPC results described above.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

200

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

201

This class is deliberately constructed to delay the RPC cost until

202

the accessor methods are called (rather than to query in

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

203

`record_host()`) so that it's possible to construct a complete

204

`_LabInventory` without making the expensive queries at creation

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

205

time. `_populate_model_counts()`, below, assumes this behavior.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

"""

def __init__(self):

self._histories = []

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

210

self._working_list = None

211

self._broken_list = None

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

212

self._idle_list = None

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

213

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

214

def record_host(self, host_history):

215

"""Add one `HostJobHistory` object to the collection.

216

217

@param host_history The `HostJobHistory` object to be

218

remembered.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

219

"""

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

220

self._working_list = None

221

self._broken_list = None

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

222

self._idle_list = None

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

223

self._histories.append(host_history)

224

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

225

def get_working_list(self):

226

"""Return a list of all working DUTs in the pool.

227

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

228

Filter `self._histories` for histories where the DUT is

229

diagnosed as working.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

230

231

Cache the result so that we only cacluate it once.

232

233

@return A list of HostJobHistory objects.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

234

"""

235

if self._working_list is None:

236

self._working_list = [h for h in self._histories

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

237

if _host_is_working(h)]

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

238

return self._working_list

239

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

240

def get_working(self):

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

241

"""Return the number of working DUTs in the pool."""

242

return len(self.get_working_list())

243

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

244

def get_broken_list(self):

245

"""Return a list of all broken DUTs in the pool.

246

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

247

Filter `self._histories` for histories where the DUT is

248

diagnosed as broken.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

249

250

Cache the result so that we only cacluate it once.

251

252

@return A list of HostJobHistory objects.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

253

"""

254

if self._broken_list is None:

255

self._broken_list = [h for h in self._histories

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

256

if _host_is_broken(h)]

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

257

return self._broken_list

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

258

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

259

def get_broken(self):

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

260

"""Return the number of broken DUTs in the pool."""

261

return len(self.get_broken_list())

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

262

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

263

def get_idle_list(self):

264

"""Return a list of all idle DUTs in the pool.

265

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

266

Filter `self._histories` for histories where the DUT is

267

diagnosed as idle.

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

268

269

Cache the result so that we only cacluate it once.

270

271

@return A list of HostJobHistory objects.

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

272

"""

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

273

if self._idle_list is None:

274

self._idle_list = [h for h in self._histories

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

275

if _host_is_idle(h)]

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

276

return self._idle_list

277

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

278

def get_idle(self):

279

"""Return the number of idle DUTs in the pool."""

280

return len(self.get_idle_list())

281

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

282

def get_total(self):

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

283

"""Return the total number of DUTs in the pool."""

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

284

return len(self._histories)

285

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

286

def get_all_histories(self):

287

return self._histories

288

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

289

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

290

class _PoolSetInventory(object):
    """Maintains a set of `HostJobHistory`s for a set of pools.

    The collection is segregated into disjoint categories of "working",
    "broken", and "idle" DUTs.  Accessor methods allow finding both the
    list of DUTs in each category, as well as counts of each category.
    Accessor queries can be for an individual pool, or against all
    pools.

    Performance note:  This class relies on `_HostSetInventory`.  Public
    methods in this class generally rely on methods of the same name in
    the underlying class, and so will have the same underlying
    performance characteristics.
    """

    def __init__(self, pools):
        """Create one empty `_HostSetInventory` per pool.

        @param pools  Iterable of pool names to be tracked.
        """
        self._histories_by_pool = {
            pool: _HostSetInventory() for pool in pools
        }

    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        The history is filed under its `host_pool`; a `KeyError` is
        raised if that pool was not passed to the constructor.

        @param host_history The `HostJobHistory` object to be
                            remembered.
        """
        pool = host_history.host_pool
        self._histories_by_pool[pool].record_host(host_history)

    def _count_pool(self, get_pool_count, pool=None):
        """Internal helper to count hosts in a given pool.

        The `get_pool_count` parameter is a function to calculate
        the exact count of interest for the pool.

        @param get_pool_count  Function to return a count from a
                               `_HostSetInventory` object.
        @param pool            The pool to be counted.  If `None`,
                               return the total across all pools.
        """
        if pool is None:
            return sum(get_pool_count(inventory)
                       for inventory in self._histories_by_pool.values())
        return get_pool_count(self._histories_by_pool[pool])

    def _list_pool(self, get_pool_list, pool=None):
        """Internal helper to list hosts in a given pool.

        @param get_pool_list  Function to return a list of histories
                              from a `_HostSetInventory` object.
        @param pool           The pool to be listed.  If `None`,
                              return a new list concatenating the
                              results for all pools.
        """
        if pool is None:
            combined = []
            for inventory in self._histories_by_pool.values():
                combined.extend(get_pool_list(inventory))
            return combined
        return get_pool_list(self._histories_by_pool[pool])

    def get_working_list(self, pool=None):
        """Return a list of all working DUTs in the selected pool(s).

        Go through the HostJobHistory objects of the selected
        pool(s), selecting all DUTs identified as working.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.
        """
        return self._list_pool(_HostSetInventory.get_working_list, pool)

    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of working DUTs in the selected
                pool(s).
        """
        return self._count_pool(_HostSetInventory.get_working, pool)

    def get_broken_list(self, pool=None):
        """Return a list of all broken DUTs in the selected pool(s).

        Go through the HostJobHistory objects of the selected
        pool(s), selecting all DUTs identified as broken.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.
        """
        return self._list_pool(_HostSetInventory.get_broken_list, pool)

    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of broken DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_broken, pool)

    def get_idle_list(self, pool=None):
        """Return a list of all idle DUTs in the selected pool(s).

        Go through the HostJobHistory objects of the selected
        pool(s), selecting all DUTs identified as idle.

        @param pool: The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.
        """
        return self._list_pool(_HostSetInventory.get_idle_list, pool)

    def get_idle(self, pool=None):
        """Return the number of idle DUTs in a pool.

        @param pool: The pool to be counted.  If `None`, return the total
                     across all pools.

        @return The total number of idle DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_idle, pool)

    def get_spares_buffer(self, spare_pool=SPARE_POOL):
        """Return the nominal number of working spares.

        Calculates and returns how many working spares there would
        be in the spares pool if all broken DUTs were in the spares
        pool.  This number may be negative, indicating a shortfall
        in the critical pools.

        @param spare_pool  Name of the pool holding the spares.

        @return The total number of DUTs in the spares pool, less the
                total number of broken DUTs in all pools.
        """
        return self.get_total(spare_pool) - self.get_broken()

    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_total, pool)

    def get_all_histories(self, pool=None):
        """Generate every recorded history in the selected pool(s).

        This is a generator: nothing is looked up (and no `KeyError`
        for an unknown pool is raised) until iteration starts.

        @param pool  The pool to be traversed.  If `None`, traverse
                     all pools.

        @return An iterator over HostJobHistory objects.
        """
        if pool is None:
            for inventory in self._histories_by_pool.values():
                for history in inventory.get_all_histories():
                    yield history
        else:
            for history in self._histories_by_pool[pool].get_all_histories():
                yield history

443

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

444

Prathmesh Prabhu

2637cd4

2018-11-08 08:18:21 -0800

[diff] [blame]

445

def _is_migrated_to_skylab(afehost):

446

"""Return True if the provided frontend.Host has been migrated to skylab."""

447

return afehost.hostname.endswith('-migrated-do-not-use')

448

449

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

450

def _eligible_host(afehost):
    """Decide whether a host should be monitored by this script.

    A host is eligible when it has not been migrated to skylab, has
    exactly one 'model' label and exactly one 'pool' label, and
    carries none of the labels in `_EXCLUDED_LABELS`.

    @param afehost  The host to be tested for eligibility.

    @return True if the host is eligible, False otherwise.
    """
    if _is_migrated_to_skylab(afehost):
        return False

    # A DUT lacking an existing, unique 'model' or 'pool' label isn't
    # supposed to be in the managed inventory; finding one generally
    # points at an error in the database.  The _LabInventory
    # constructor requires hosts to conform to the label restrictions.
    # Failing a whole inventory run over a single bad entry would be
    # wrong, so such hosts are simply ignored.
    model_prefix = constants.Labels.MODEL_PREFIX
    pool_prefix = constants.Labels.POOL_PREFIX
    model_count = sum(1 for label in afehost.labels
                      if label.startswith(model_prefix))
    pool_count = sum(1 for label in afehost.labels
                     if label.startswith(pool_prefix))
    has_excluded = bool(_EXCLUDED_LABELS.intersection(afehost.labels))
    if model_count != 1 or pool_count != 1:
        return False
    return not has_excluded

469

470

471

class _LabInventory(collections.Mapping):

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

472

"""Collection of `HostJobHistory` objects for the Lab's inventory.

473

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

474

This is a dict-like collection indexed by model. Indexing returns

475

the _PoolSetInventory object associated with the model.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

476

"""

477

478

@classmethod

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

479

def create_inventory(cls, afe, start_time, end_time, modellist=[]):

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

480

"""Return a Lab inventory with specified parameters.

481

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

482

By default, gathers inventory from `HostJobHistory` objects for

483

all DUTs in the `MANAGED_POOLS` list. If `modellist` is

484

supplied, the inventory will be restricted to only the given

485

models.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

486

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

487

@param afe AFE object for constructing the

488

`HostJobHistory` objects.

489

@param start_time Start time for the `HostJobHistory` objects.

490

@param end_time End time for the `HostJobHistory` objects.

491

@param modellist List of models to include. If empty,

492

include all available models.

493

@return A `_LabInventory` object for the specified models.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

494

"""

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

495

target_pools = MANAGED_POOLS

496

label_list = [constants.Labels.POOL_PREFIX + l for l in target_pools]

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

497

afehosts = afe.get_hosts(labels__name__in=label_list)

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

498

if modellist:

J. Richard Barnette

2016-03-17 17:03:57 -0700

[diff] [blame]

499

# We're deliberately not checking host eligibility in this

500

# code path. This is a debug path, not used in production;

501

# it may be useful to include ineligible hosts here.

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

502

modelhosts = []

503

for model in modellist:

504

model_label = constants.Labels.MODEL_PREFIX + model

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

505

host_list = [h for h in afehosts

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

506

if model_label in h.labels]

507

modelhosts.extend(host_list)

508

afehosts = modelhosts

J. Richard Barnette

2016-03-17 17:03:57 -0700

[diff] [blame]

509

else:

Richard Barnette

3a40449

2018-02-08 13:57:01 -0800

[diff] [blame]

510

afehosts = [h for h in afehosts if _eligible_host(h)]

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

511

create = lambda host: (

512

status_history.HostJobHistory(afe, host,

513

start_time, end_time))

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

514

return cls([create(host) for host in afehosts], target_pools)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

515

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

516

def __init__(self, histories, pools):
    """Build the inventory from a collection of DUT histories.

    Each history is filed under its model; every distinct model
    gets its own `_PoolSetInventory` created from `pools`.  The set
    of boards seen and the total number of DUTs are recorded too.

    @param histories  Sized collection (it is passed to `len()`) of
                      history objects providing `host_model` and
                      `host_board`.
    @param pools      Handed to `_PoolSetInventory` for each model.
    """
    self._modeldata = {}
    self._boards = set()
    for history in histories:
        model = history.host_model
        if model not in self._modeldata:
            self._modeldata[model] = _PoolSetInventory(pools)
        self._modeldata[model].record_host(history)
        self._boards.add(history.host_board)
    self._dut_count = len(histories)

Prathmesh Prabhu

154cb2b

2017-11-08 17:36:51 -0800

[diff] [blame]

523

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

524

def __getitem__(self, key):

525

return self._modeldata.__getitem__(key)

Prathmesh Prabhu

021e784

2017-11-08 18:05:45 -0800

[diff] [blame]

526

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

527

def __len__(self):

528

return self._modeldata.__len__()

529

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

530

def __iter__(self):

531

return self._modeldata.__iter__()

532

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

533

def get_num_duts(self):
    """Return the DUT count recorded when the inventory was built."""
    return self._dut_count

536

Prathmesh Prabhu

a5a0e3d

2017-11-09 08:53:53 -0800

[diff] [blame]

537

def get_num_models(self):
    """Return how many distinct models the inventory holds."""
    return len(self._modeldata)

540

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

541

def get_pool_models(self, pool):
    """Return the set of models that have DUTs in `pool`.

    A model is included when its `get_total(pool)` is non-zero.

    @param pool  The pool to be inventoried for models.
    @return A set of model names.
    """
    models = set()
    for model, poolset in self.iteritems():
        if poolset.get_total(pool):
            models.add(model)
    return models

547

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

548

def get_boards(self):
    """Return the set of boards recorded for the inventory's DUTs.

    The internal set itself is returned, not a copy.
    """
    return self._boards

Prathmesh Prabhu

a5a0e3d

2017-11-09 08:53:53 -0800

[diff] [blame]

550

551

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

552

def _reportable_models(inventory, spare_pool=SPARE_POOL):
    """Generate the (model, poolset) pairs that are subject to reporting.

    Walks `inventory.iteritems()` and yields only the reportable
    entries.  A model is reportable when it has DUTs in `spare_pool`
    and also has DUTs in at least one other pool.

    @param inventory   Object providing `iteritems()` over
                       (model, poolset) pairs.
    @param spare_pool  The spare pool to be tested for reporting.
    """
    for model, poolset in inventory.iteritems():
        num_spares = poolset.get_total(spare_pool)
        num_total = poolset.get_total()
        # Not reportable: no spares at all, or nothing but spares.
        if num_spares == 0 or num_spares == num_total:
            continue
        yield model, poolset

def _all_dut_histories(inventory):

569

for poolset in inventory.itervalues():

570

for h in poolset.get_all_histories():

yield h

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

574

def _sort_by_location(inventory_list):
    """Return a list of DUTs, organized by location.

    Take the given list of `HostJobHistory` objects, separate it
    into a list per lab, and sort each lab's list by location.  The
    order of sorting within a lab is
      * By row number within the lab,
      * then by rack number within the row,
      * then by host shelf number within the rack.

    Return a list of the sorted lists.

    Histories whose hostname does not match `_HOSTNAME_PATTERN` have
    no parseable location and are left out of the result.

    Implementation note: locations are compared as tuples of
    integers, so the sort is correct for any size of row, rack or
    host number.

    @param inventory_list  Iterable of `HostJobHistory` objects.
    @return A list of sorted lists of DUTs.
    """
    lab_lists = {}
    for history in inventory_list:
        location = _HOSTNAME_PATTERN.match(history.host.hostname)
        if not location:
            continue
        lab = location.group(1)
        # Groups 2, 3 and 4 are the numeric location fields, most
        # significant first; a tuple of ints orders them correctly
        # without assuming any upper bound on the values.
        key = tuple(int(idx) for idx in location.group(2, 3, 4))
        lab_lists.setdefault(lab, []).append((key, history))
    return_list = []
    for dut_list in lab_lists.values():
        # Sort on the key alone so that DUTs with equal locations
        # keep their input order and histories are never compared.
        dut_list.sort(key=lambda t: t[0])
        return_list.append([history for _, history in dut_list])
    return return_list

def _score_repair_set(buffer_counts, repair_list):
    """Return a numeric score rating a set of DUTs to be repaired.

    `buffer_counts` is a dictionary mapping model names to the size of
    the model's spares buffer.

    `repair_list` is a list of `HostJobHistory` objects for the DUTs to
    be repaired.

    This function calculates the new set of buffer counts that would
    result from the proposed repairs, and scores the new set using two
    numbers:
      * Worst case buffer count for any model (higher is better).  This
        is the more significant number for comparison.
      * Number of models at the worst case (lower is better).  This is
        the less significant number.

    Implementation note:  The score could fail to reflect the intended
    criteria if there are more than 1000 models in the inventory.

    @param buffer_counts  A dictionary mapping models to buffer counts.
                          Must not be empty.
    @param repair_list    A list of `HostJobHistory` objects for the
                          DUTs to be repaired.
    @return A numeric score.
    """
    _NMODELS = 1000
    pools = {h.host_pool for h in repair_list}
    repair_inventory = _LabInventory(repair_list, pools)
    # Buffer count for each model after the proposed repairs.  Model
    # names are discarded, as they don't contribute to the score.
    new_counts = []
    for model, spares in buffer_counts.iteritems():
        if model in repair_inventory:
            repaired = repair_inventory[model].get_total()
        else:
            repaired = 0
        new_counts.append(spares + repaired)
    # Find the worst available spares count, and count how many
    # times that worst case occurs.
    worst_count = min(new_counts)
    num_worst = new_counts.count(worst_count)
    return _NMODELS * worst_count - num_worst

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

661

662

663

def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Returns a message recommending a list of broken DUTs to be
    repaired.  The list of DUTs is selected based on these criteria:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical proximity.
      * DUTs for models with a low spares buffer are more important
        than DUTs with larger buffers.

    The selection slides a window of `num_recommend` DUTs over each
    lab's location-sorted list of broken DUTs, scores every window
    with `_score_repair_set()`, and keeps the best-scoring window
    across all labs.  On equal scores the earlier window wins.

    @param inventory      `_LabInventory` object from which to generate
                          recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    """
    logging.debug('Creating DUT repair recommendations')
    model_buffer_counts = {}
    broken_list = []
    for model, counts in _reportable_models(inventory):
        logging.debug('Listing failed DUTs for %s', model)
        if counts.get_broken() == 0:
            continue
        model_buffer_counts[model] = counts.get_spares_buffer()
        broken_list.extend(counts.get_broken_list())

    recommendation = None
    best_score = None
    for lab_duts in _sort_by_location(broken_list):
        # The first window is scored unconditionally, so that a lab
        # with fewer than `num_recommend` broken DUTs still produces
        # a candidate.
        lab_slice = lab_duts[0:num_recommend]
        lab_score = _score_repair_set(model_buffer_counts, lab_slice)
        # Slide the window one DUT at a time until its end reaches
        # the end of the lab's list.
        for start in range(1, len(lab_duts) - num_recommend + 1):
            candidate = lab_duts[start:start + num_recommend]
            candidate_score = _score_repair_set(model_buffer_counts,
                                                candidate)
            if candidate_score > lab_score:
                lab_slice = candidate
                lab_score = candidate_score
        if recommendation is None or lab_score > best_score:
            recommendation = lab_slice
            best_score = lab_score

    # N.B. The trailing space in `line_fmt` is mandatory:  Without it,
    # Gmail will parse the URL wrong.  Don't ask.  If you simply _must_
    # know more, go try it yourself...
    line_fmt = '%-30s %-16s %-6s\n %s '
    message = ['Repair recommendations:\n',
               line_fmt % ('Hostname', 'Model', 'Servo?', 'Logs URL')]
    if not recommendation:
        message.append('(No DUTs to repair)')
        return '\n'.join(message)
    for h in recommendation:
        servo_name = servo_host.make_servo_hostname(h.host.hostname)
        servo_present = utils.host_is_in_lab_zone(servo_name)
        event = _get_diagnosis(h).task
        servo_text = 'Yes' if servo_present else 'No'
        message.append(line_fmt % (h.host.hostname, h.host_model,
                                   servo_text, event.job_url))
    return '\n'.join(message)

736

737

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

738

def _generate_model_inventory_message(inventory):
    """Generate the "model inventory" e-mail message.

    The model inventory is a list by model summarizing the number of
    working, broken, and idle DUTs, and the total shortfall or surplus
    of working devices relative to the minimum critical pool
    requirement.

    The report omits models with no DUTs in the spare pool or with no
    DUTs in a critical pool.  The per-model table lists only models
    that have at least one broken DUT; the summary totals cover every
    reportable model.

    If there are no reportable DUTs at all, every percentage in the
    summary is reported as 0.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating model inventory')
    nworking = 0
    nbroken = 0
    nidle = 0
    nbroken_models = 0
    ntotal_models = 0
    summaries = []
    column_names = (
        'Model', 'Avail', 'Bad', 'Idle', 'Good', 'Spare', 'Total')
    for model, counts in _reportable_models(inventory):
        logging.debug('Counting %2d DUTS for model %s',
                      counts.get_total(), model)
        # Summary elements laid out in the same order as the column
        # headers:
        # Model Avail Bad Idle Good Spare Total
        # e[0] e[1] e[2] e[3] e[4] e[5] e[6]
        element = (model,
                   counts.get_spares_buffer(),
                   counts.get_broken(),
                   counts.get_idle(),
                   counts.get_working(),
                   counts.get_total(SPARE_POOL),
                   counts.get_total())
        # Only models with broken DUTs get a row in the table.
        if element[2]:
            summaries.append(element)
            nbroken_models += 1
        ntotal_models += 1
        nbroken += element[2]
        nidle += element[3]
        nworking += element[4]
    ntotal = nworking + nbroken + nidle
    # Lowest spares buffer first; most broken DUTs first on ties.
    summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
    if ntotal:
        broken_percent = int(round(100.0 * nbroken / ntotal))
        idle_percent = int(round(100.0 * nidle / ntotal))
        # Derived by subtraction so the three always sum to 100.
        working_percent = 100 - broken_percent - idle_percent
    else:
        # Nothing to report on: avoid dividing by zero.
        broken_percent = 0
        idle_percent = 0
        working_percent = 0
    message = ['Summary of DUTs in inventory:',
               '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
               '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
                   nbroken, broken_percent,
                   nidle, idle_percent,
                   nworking, working_percent,
                   ntotal),
               '',
               'Models with failures: %d' % nbroken_models,
               'Models in inventory: %d' % ntotal_models,
               '', '',
               'Full model inventory:\n',
               '%-22s %5s %5s %5s %5s %5s %5s' % column_names]
    message.extend(
        ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
    return '\n'.join(message)

806

807

J. Richard Barnette

4845fcf

2015-04-20 14:26:25 -0700

[diff] [blame]

808

_POOL_INVENTORY_HEADER = '''\

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

809

Notice to Infrastructure deputies: All models shown below are at

J. Richard Barnette

c9a143c

2015-06-04 11:11:19 -0700

[diff] [blame]

810

less than full strength, please take action to resolve the issues.

811

Once you're satisified that failures won't recur, failed DUTs can

812

be replaced with spares by running `balance_pool`. Detailed

813

instructions can be found here:

J. Richard Barnette

4845fcf

2015-04-20 14:26:25 -0700

[diff] [blame]

814

http://go/cros-manage-duts

'''

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

818

def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    The pool inventory is a list by pool and model summarizing the
    number of broken, idle, and working DUTs in each critical pool.
    A model is listed for a pool only if it has at least one broken
    or idle DUT there; rows are ordered by descending broken count.

    N.B. For sample output text formatted as users can expect to see
    it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating pool inventory')
    message = [_POOL_INVENTORY_HEADER]
    for index, pool in enumerate(CRITICAL_POOLS):
        # Every pool section after the first is set off by an extra
        # blank line.
        separator = '\n' if index else ''
        message.append(
            '%sStatus for pool:%s, by model:' % (separator, pool))
        message.append(
            '%-20s %5s %5s %5s %5s' % (
                'Model', 'Bad', 'Idle', 'Good', 'Total'))
        rows = []
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            broken = counts.get_broken(pool)
            idle = counts.get_idle(pool)
            # models at full strength are not reported
            if broken or idle:
                working = counts.get_working(pool)
                total = counts.get_total(pool)
                rows.append((model, broken, idle, working, total))
        if not rows:
            message.append('(All models at full strength)')
            continue
        # Stable sort: models with equal broken counts keep the
        # order in which the inventory produced them.
        rows.sort(key=lambda row: row[1], reverse=True)
        for row in rows:
            message.append('%-20s %5d %5d %5d %5d' % row)
    return '\n'.join(message)

860

861

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

862

_IDLE_INVENTORY_HEADER = '''\

863

Notice to Infrastructure deputies: The hosts shown below haven't

864

run any jobs for at least 24 hours. Please check each host; locked

865

hosts should normally be unlocked; stuck jobs should normally be

aborted.

'''

def _generate_idle_inventory_message(inventory):
    """Generate the "idle inventory" e-mail message.

    The idle inventory is a host list with corresponding model and
    pool, where the hosts are identified as idle.  Hosts are
    gathered pool by pool over `MANAGED_POOLS`, and within a pool
    model by model.

    N.B. For sample output text format as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating idle inventory')
    row_fmt = '%-30s %-20s %s'
    message = [_IDLE_INVENTORY_HEADER,
               'Idle Host List:',
               row_fmt % ('Hostname', 'Model', 'Pool')]
    idle_rows = []
    for pool in MANAGED_POOLS:
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            for dut in counts.get_idle_list(pool):
                idle_rows.append((dut.host.hostname, model, pool))
    if idle_rows:
        for row in idle_rows:
            message.append(row_fmt % row)
    else:
        message.append('(No idle DUTs)')
    return '\n'.join(message)

898

899

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

900

def _send_email(arguments, tag, subject, recipients, body):

901

"""Send an inventory e-mail message.

902

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

903

The message is logged in the selected log directory using `tag` for

904

the file name.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

905

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

906

If the --debug option was requested, the message is neither logged

907

nor sent, but merely printed on stdout.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

908

909

@param arguments Parsed command-line options.

910

@param tag Tag identifying the inventory for logging

911

purposes.

912

@param subject E-mail Subject: header line.

913

@param recipients E-mail addresses for the To: header line.

914

@param body E-mail message body.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

915

"""

916

logging.debug('Generating email: "%s"', subject)

917

all_recipients = ', '.join(recipients)

918

report_body = '\n'.join([

919

'To: %s' % all_recipients,

920

'Subject: %s' % subject,

921

'', body, ''])

J. Richard Barnette

2015-10-13 16:02:47 -0700

[diff] [blame]

922

if arguments.debug:

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

923

print report_body

924

else:

925

filename = os.path.join(arguments.logdir, tag)

926

try:

927

report_file = open(filename, 'w')

928

report_file.write(report_body)

929

report_file.close()

930

except EnvironmentError as e:

931

logging.error('Failed to write %s: %s', filename, e)

932

try:

933

gmail_lib.send_email(all_recipients, subject, body)

934

except Exception as e:

935

logging.error('Failed to send e-mail to %s: %s',

all_recipients, e)

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

939

def _populate_model_counts(inventory):

940

"""Gather model counts while providing interactive feedback.

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

941

942

Gathering the status of all individual DUTs in the lab can take

943

considerable time (~30 minutes at the time of this writing).

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

944

Normally, we pay that cost by querying as we go. However, with

945

the `--debug` option, we expect a human being to be watching the

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

946

progress in real time. So, we force the first (expensive) queries

947

to happen up front, and provide simple ASCII output on sys.stdout

948

to show a progress bar and results.

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

949

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

950

@param inventory `_LabInventory` object from which to gather

951

counts.

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

952

"""

953

n = 0

954

total_broken = 0

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

955

for counts in inventory.itervalues():

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

n += 1

if n % 10 == 5:

c = '+'

elif n % 10 == 0:

c = '%d' % ((n / 10) % 10)

else:

c = '.'

sys.stdout.write(c)

sys.stdout.flush()

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

965

# This next call is where all the time goes - it forces all of a

966

# model's `HostJobHistory` objects to query the database and

967

# cache their results.

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

968

total_broken += counts.get_broken()

969

sys.stdout.write('\n')

970

sys.stdout.write('Found %d broken DUTs\n' % total_broken)

971

972

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

973

def _perform_model_inventory(arguments, inventory, timestamp):
    """Perform the model inventory report.

    The model inventory report consists of the following:
      * A list of DUTs that are recommended to be repaired.  This list
        is optional, and only appears if the `--recommend` option is
        present.
      * A list of all models that have failed DUTs, with counts
        of working, broken, and spare DUTs, among others.

    The combined text is handed to `_send_email()` addressed to
    `arguments.model_notify`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp
                      in logs and email output.
    """
    recommend_message = ''
    if arguments.recommend:
        recommendation = _generate_repair_recommendation(
                inventory, arguments.recommend)
        recommend_message = recommendation + '\n\n\n'
    model_message = _generate_model_inventory_message(inventory)
    tag = 'models-%s.txt' % timestamp
    subject = 'DUT model inventory %s' % timestamp
    _send_email(arguments, tag, subject, arguments.model_notify,
                recommend_message + model_message)

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

1000

1001

1002

def _perform_pool_inventory(arguments, inventory, timestamp):
    """Perform the pool inventory report.

    The pool inventory report consists of the following:
      * A list of all critical pools that have failed DUTs, with counts
        of working, broken, and idle DUTs.
      * A list of all idle DUTs by hostname including the model and
        pool.

    The two parts are joined with blank lines and handed to
    `_send_email()` addressed to `arguments.pool_notify`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp in
                      logs and email output.
    """
    pool_message = _generate_pool_inventory_message(inventory)
    idle_message = _generate_idle_inventory_message(inventory)
    tag = 'pools-%s.txt' % timestamp
    subject = 'DUT pool inventory %s' % timestamp
    body = '\n\n\n'.join([pool_message, idle_message])
    _send_email(arguments, tag, subject, arguments.pool_notify, body)

1024

1025

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1026

def _dut_in_repair_loop(history):
    """Return whether a DUT's history indicates a repair loop.

    A DUT is considered looping if it runs no tests, and no tasks pass
    other than repair tasks.

    @param history  An instance of `status_history.HostJobHistory` to be
                    scanned for a repair loop.  The caller guarantees
                    that this history corresponds to a working DUT.
    @returns Return a true value if the DUT's most recent history
             indicates a repair loop.  The result is `False` when a
             disqualifying task is found, and `None` when the history
             runs out before enough successful repairs are seen.
    """
    # Our caller passes only histories for working DUTs; that means
    # we've already paid the cost of fetching the diagnosis task, and
    # we know that the task was successful.  The diagnosis task will be
    # one of the tasks we must scan to find a loop, so if the task isn't
    # a repair task, then our history includes a successful non-repair
    # task, and we're not looping.
    #
    # The for loop below is very expensive, because it must fetch the
    # full history, regardless of how many tasks we examine.  At the
    # time of this writing, this check against the diagnosis task
    # reduces the cost of finding loops in the full inventory from hours
    # to minutes.
    diagnosis_task = _get_diagnosis(history).task
    if diagnosis_task.name != 'Repair':
        return False
    successful_repairs = 0
    for task in history:
        if not task.is_special:
            # This is a test, so we're not looping.
            return False
        if task.diagnosis == status_history.BROKEN:
            # Failed a repair, so we're not looping.
            return False
        is_repair = task.name == 'Repair'
        if task.diagnosis == status_history.WORKING and not is_repair:
            # Non-repair task succeeded, so we're not looping.
            return False
        # At this point, we have either a failed non-repair task, or
        # a successful repair.
        if is_repair:
            successful_repairs += 1
            if successful_repairs >= _REPAIR_LOOP_THRESHOLD:
                return True
    # History exhausted below the threshold: fall through and return
    # None, which callers see as a false value.

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

1072

def _report_untestable_dut(history, state):
    """Log one untestable DUT and set the presence metric for it.

    @param history  History object for the DUT; its `hostname`,
                    `host_model` and `host_pool` attributes are
                    reported.
    @param state    String naming why the DUT is untestable; callers
                    in this file pass 'repair_loop' or
                    'idle_unlocked'.
    """
    # The same dictionary feeds both the log message and the metric
    # fields.
    metric_fields = dict(
        dut_hostname=history.hostname,
        model=history.host_model,
        pool=history.host_pool,
        state=state,
    )
    log_format = ('DUT in state %(state)s: %(dut_hostname)s, '
                  'model: %(model)s, pool: %(pool)s')
    logging.info(log_format, metric_fields)
    _UNTESTABLE_PRESENCE_METRIC.set(True, fields=metric_fields)

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1082

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1083

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

1084

def _report_untestable_dut_metrics(inventory):
    """Scan the inventory for DUTs unable to run tests.

    DUTs in the inventory are judged "untestable" if they meet one of
    two criteria:
      * The DUT is stuck in a repair loop; that is, it regularly
        passes repair, but never passes other operations.
      * The DUT runs no tasks at all, but is not locked.

    This routine walks through the given inventory looking for DUTs
    in either of these states.  Results are reported via a Monarch
    presence metric.

    Note:  To make sure that DUTs aren't flagged as "idle" merely
    because there's no work, a separate job runs prior to regular
    inventory runs which schedules trivial work on any DUT that
    appears idle.

    @param inventory  `_LabInventory` object to be reported on.
    """
    logging.info('Scanning for untestable DUTs.')
    for dut_history in _all_dut_histories(inventory):
        # Managed DUTs with names that don't match _HOSTNAME_PATTERN
        # shouldn't be possible.  However, we don't want arbitrary
        # strings being attached to the 'dut_hostname' field, so for
        # safety, only DUTs with matching names are considered.
        if _HOSTNAME_PATTERN.match(dut_history.hostname):
            state = None
            if _host_is_working(dut_history):
                if _dut_in_repair_loop(dut_history):
                    state = 'repair_loop'
            elif _host_is_idle(dut_history):
                if not dut_history.host.locked:
                    state = 'idle_unlocked'
            if state is not None:
                _report_untestable_dut(dut_history, state)

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1119

1120

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

1121

def _log_startup(arguments, startup_time):
    """Log the start of this inventory run.

    Emit debug log messages announcing the run and which reports it
    will include.  Return a string based on `startup_time` that will
    be used to identify this run in log files and e-mail messages.

    @param arguments     Command-line arguments as returned by
                         `ArgumentParser`; the `model_notify`,
                         `recommend` and `pool_notify` attributes
                         are consulted.
    @param startup_time  A UNIX timestamp marking the moment when
                         this inventory run began.
    @returns A timestamp string (local time, formatted as
             '%Y-%m-%d.%H') that will be used to identify this run
             in logs and email output.
    """
    local_start = time.localtime(startup_time)
    timestamp = time.strftime('%Y-%m-%d.%H', local_start)
    logging.debug('Starting lab inventory for %s', timestamp)

    # Collect the announcements in the order they should be logged.
    planned = []
    if arguments.model_notify:
        # Repair recommendations are only announced as part of the
        # model inventory.
        if arguments.recommend:
            planned.append('Will include repair recommendations')
        planned.append('Will include model inventory')
    if arguments.pool_notify:
        planned.append('Will include pool inventory')
    for note in planned:
        logging.debug(note)
    return timestamp

def _create_inventory(arguments, end_time):
    """Create the `_LabInventory` instance to use for reporting.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`; `duration` (in hours) sets
                      how far back the search reaches, and
                      `modelnames` is passed through to
                      `_LabInventory.create_inventory()`.
    @param end_time   A UNIX timestamp for the end of the time range
                      to be searched in this inventory run.
    @returns The newly created inventory object.
    """
    # `duration` is in hours; the time range is in seconds.
    seconds_back = arguments.duration * 60 * 60
    start_time = end_time - seconds_back
    afe = frontend_wrappers.RetryingAFE(server=None)
    inventory = _LabInventory.create_inventory(
            afe, start_time, end_time, arguments.modelnames)
    dut_count = inventory.get_num_duts()
    model_count = inventory.get_num_models()
    logging.info('Found %d hosts across %d models',
                 dut_count, model_count)
    return inventory

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1161

def _perform_inventory_reports(arguments):
    """Run every inventory check requested on the command line.

    Log the start of the run, create the inventory, and then step
    through the reports selected by the parsed command-line
    arguments.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    """
    # A single clock reading both names the run and bounds the
    # inventory's time range.
    run_start = time.time()
    timestamp = _log_startup(arguments, run_start)
    inventory = _create_inventory(arguments, run_start)

    if arguments.debug:
        _populate_model_counts(inventory)

    # The two e-mail reports run only when their notify lists are
    # non-empty.
    if arguments.model_notify:
        _perform_model_inventory(arguments, inventory, timestamp)
    if arguments.pool_notify:
        _perform_pool_inventory(arguments, inventory, timestamp)

    if arguments.report_untestable:
        _report_untestable_dut_metrics(inventory)

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1181

1182

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1183

def _separate_email_addresses(address_list):
    """Flatten a list of comma-separated e-mail address strings.

    Each string is split on commas, and surrounding whitespace is
    stripped from every piece.  Empty pieces (for example, from a
    doubled or trailing comma) are kept as empty strings.

    @param address_list  A list of strings containing
                         comma-separated e-mail addresses.
    @return A new list of the individual e-mail addresses, in their
            original order.
    """
    return [piece.strip()
            for entry in address_list
            for piece in entry.split(',')]

def _verify_arguments(arguments):
    """Validate and normalize the parsed command-line arguments.

    The `--model-notify` and `--pool-notify` values, which may hold
    comma-separated addresses spread across several option
    arguments, are each flattened into a single list of addresses.

    For non-debug uses, at least one inventory report must be
    requested.  For debug, if no report is specified, treat it as
    "run all the e-mail reports."

    The return value indicates success or failure; in the case of
    failure, an error message is also written to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`; `model_notify` and
                      `pool_notify` are updated in place.
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.
    """
    arguments.model_notify = _separate_email_addresses(
            arguments.model_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)

    requested = (arguments.model_notify,
                 arguments.pool_notify,
                 arguments.report_untestable)
    if any(requested):
        return True

    if arguments.debug:
        # We want to run all the e-mail reports.  An empty notify
        # list will cause a report to be skipped, so make sure the
        # lists are non-empty.
        arguments.model_notify = ['']
        arguments.pool_notify = ['']
        return True

    sys.stderr.write('Must request at least one report via '
                     '--model-notify, --pool-notify, or '
                     '--report-untestable\n')
    return False

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1232

1233

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

1234

def _get_default_logdir(script):
    """Compute the default directory for the `--logdir` option.

    The default is the `_LOGDIR` path joined onto the parent of the
    directory that contains this script.

    @param script  Path to this script file.
    @return A path to a directory.
    """
    script_dir = os.path.dirname(os.path.abspath(script))
    parent_dir = os.path.dirname(script_dir)
    return os.path.join(parent_dir, _LOGDIR)

1246

1247

1248

def _parse_command(argv):
    """Parse and validate the command line arguments.

    Build an argument parser for this command's syntax, parse the
    command line, and check the result with `_verify_arguments()`.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args(), or
            `None` if `_verify_arguments()` rejects the arguments.
    """
    cli = argparse.ArgumentParser(
            prog=argv[0],
            description='Gather and report lab inventory statistics')

    duration_help = ('number of hours back to search for status'
                     ' (default: %d)' % _DEFAULT_DURATION)
    cli.add_argument('-d', '--duration', type=int,
                     default=_DEFAULT_DURATION, metavar='HOURS',
                     help=duration_help)

    # The two notify options differ only in name and help text.  A
    # fresh default list is created for each option.
    notify_options = (
        ('--model-notify',
         'Generate model inventory message, '
         'and send it to the given e-mail address(es)'),
        ('--pool-notify',
         'Generate pool inventory message, '
         'and send it to the given address(es)'),
    )
    for option_name, option_help in notify_options:
        cli.add_argument(option_name, action='append',
                         default=[], metavar='ADDRESS',
                         help=option_help)

    recommend_help = ('Specify how many DUTs should be '
                      'recommended for repair (default: no '
                      'recommendation)')
    cli.add_argument('-r', '--recommend', type=int, default=None,
                     help=recommend_help)
    cli.add_argument('--report-untestable', action='store_true',
                     help='Check for devices unable to run tests.')
    cli.add_argument('--debug', action='store_true',
                     help='Print e-mail, metrics messages on stdout '
                          'without sending them.')
    # `--no-metrics` stores False into `use_metrics`.
    cli.add_argument('--no-metrics', action='store_false',
                     dest='use_metrics',
                     help='Suppress generation of Monarch metrics.')
    cli.add_argument('--logdir', default=_get_default_logdir(argv[0]),
                     help='Directory where logs will be written.')
    cli.add_argument('modelnames', nargs='*',
                     metavar='MODEL',
                     help='names of models to report on '
                          '(default: all models)')

    parsed = cli.parse_args(argv[1:])
    if _verify_arguments(parsed):
        return parsed
    return None

def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--debug` option was provided
    on the command line.
      * Without the option, we configure the logging to capture all
        potentially relevant events (DEBUG level and up) in a log
        file under `arguments.logdir`.  The log file is configured
        to rotate weekly (`when='W4'`), keeping 13 backups, i.e.
        ~3 months worth of history.
      * With the option, we expect stdout to contain other
        human-readable output (including the contents of the e-mail
        messages), so we restrict the output to INFO level.

    For convenience, when `--debug` is on, the logging format has
    no adornments, so that a call like `logging.info(msg)` simply
    writes `msg` to stdout, plus a trailing newline.

    On return, the root logger has exactly one handler: the one
    created here.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    """
    root_logger = logging.getLogger()
    if arguments.debug:
        root_logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter())
    else:
        if not os.path.exists(arguments.logdir):
            os.mkdir(arguments.logdir)
        root_logger.setLevel(logging.DEBUG)
        logfile = os.path.join(arguments.logdir, _LOGFILE)
        handler = logging.handlers.TimedRotatingFileHandler(
                logfile, when='W4', backupCount=13)
        formatter = logging.Formatter(_LOG_FORMAT,
                                      time_utils.TIME_FMT)
        handler.setFormatter(formatter)
    # TODO(jrbarnette) This is gross.  Importing client.bin.utils
    # implicitly imported logging_config, which calls
    # logging.basicConfig() *at module level*.  That gives us an
    # extra logging handler that we don't want.  So, clear out all
    # the handlers here.
    #
    # Iterate over a copy:  removeHandler() mutates
    # `root_logger.handlers`, and removing entries from the list
    # being iterated would skip every other handler.
    for h in list(root_logger.handlers):
        root_logger.removeHandler(h)
    root_logger.addHandler(handler)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1341

1342

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1343

def main(argv):
    """Standard main routine.

    Parse the command line (exiting with status 1 if it is
    rejected), configure logging, and run the requested inventory
    reports.  Unless `--no-metrics` was given, the run happens
    inside a ts_mon session that records a duration timer and a
    success/failure tick counter.

    @param argv  Command line arguments, including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)

    try:
        if not arguments.use_metrics:
            _perform_inventory_reports(arguments)
        else:
            # In debug mode, metrics are written to /dev/null instead
            # of being reported.
            metrics_file = None
            if arguments.debug:
                logging.info('Debug mode: Will not report metrics to monarch.')
                metrics_file = '/dev/null'
            with site_utils.SetupTsMonGlobalState(
                    'lab_inventory', debug_file=metrics_file,
                    auto_flush=False):
                succeeded = False
                try:
                    with metrics.SecondsTimer('%s/duration' % _METRICS_PREFIX):
                        _perform_inventory_reports(arguments)
                    succeeded = True
                finally:
                    # Record the tick and flush whether or not the
                    # reports raised; auto_flush is off, so the flush
                    # must be explicit.
                    tick = metrics.Counter('%s/tick' % _METRICS_PREFIX)
                    tick.increment(fields={'success': succeeded})
                    metrics.Flush()
    except KeyboardInterrupt:
        # An interrupted run exits quietly.
        pass
    except Exception:
        # Our cron setup doesn't preserve stderr, so drop extra breadcrumbs.
        logging.exception('Error escaped main')
        raise

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1380

1381

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

1382

def get_inventory(afe):
    """Create an inventory covering the 24 hours ending now.

    @param afe  AFE object passed through to
                `_LabInventory.create_inventory()`.
    @return The result of `_LabInventory.create_inventory()` for the
            time range from 24 hours ago until the current time (in
            whole seconds).
    """
    now = int(time.time())
    one_day = 24 * 60 * 60
    return _LabInventory.create_inventory(afe, now - one_day, now)

1386

1387

1388

def get_managed_boards(afe):
    """Return the boards reported by a fresh inventory.

    @param afe  AFE object passed through to `get_inventory()`.
    @return The result of calling `get_boards()` on the inventory
            returned by `get_inventory(afe)`.
    """
    inventory = get_inventory(afe)
    return inventory.get_boards()

J. Richard Barnette

aa86893

2015-10-23 13:28:59 -0700

[diff] [blame]

1390

1391

J. Richard Barnette