Blame - site_utils/lab_inventory.py - platform/external/autotest

2015-03-27 17:23:52 -0700

[diff] [blame]

1

#!/usr/bin/env python

2

3

# Use of this source code is governed by a BSD-style license that can be

4

# found in the LICENSE file.

5

6

"""Create e-mail reports of the Lab's DUT inventory.

7

8

Gathers a list of all DUTs of interest in the Lab, segregated by

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

9

model and pool, and determines whether each DUT is working or

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

10

broken. Then, send one or more e-mail reports summarizing the

11

status to e-mail addresses provided on the command line.

12

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

13

usage: lab_inventory.py [ options ] [ model ... ]

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

14

15

Options:

16

--duration / -d <hours>

17

How far back in time to search job history to determine DUT

18

status.

19

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

20

--model-notify <address>[,<address>]

21

Send the "model status" e-mail to all the specified e-mail

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

22

addresses.

23

24

--pool-notify <address>[,<address>]

25

Send the "pool status" e-mail to all the specified e-mail

26

addresses.

27

J. Richard Barnette

2015-06-09 10:06:17 -0700

[diff] [blame]

28

--recommend <number>

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

29

When generating the "model status" e-mail, include a list of

J. Richard Barnette

2015-06-09 10:06:17 -0700

[diff] [blame]

30

<number> specific DUTs to be recommended for repair.

31

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

32

--report-untestable

33

Scan the inventory for DUTs that can't test because they're stuck in

34

repair loops, or because the scheduler can't give them work.

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

35

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

36

--logdir <directory>

37

Log progress and actions in a file under this directory. Text

38

of any e-mail sent will also be logged in a timestamped file in

39

this directory.

40

J. Richard Barnette

2015-10-13 16:02:47 -0700

[diff] [blame]

41

--debug

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

42

Suppress all logging, metrics reporting, and sending e-mail.

43

Instead, write the output that would be generated onto stdout.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

44

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

45

<model> arguments:

46

With no arguments, gathers the status for all models in the lab.

47

With one or more named models on the command line, restricts

48

reporting to just those models.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

"""

import argparse

Prathmesh Prabhu

021e784

2017-11-08 18:05:45 -0800

[diff] [blame]

53

import collections

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

54

import logging

55

import logging.handlers

56

import os

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

57

import re

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

import sys

import time

import common

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

62

from autotest_lib.client.bin import utils

Richard Barnette

6f6ce32

2018-09-07 16:23:20 +0000

[diff] [blame]

63

from autotest_lib.client.common_lib import time_utils

Jacob Kopczynski

2018-08-13 17:24:41 -0700

[diff] [blame]

64

from autotest_lib.frontend.afe.json_rpc import proxy

Xixuan Wu

93e646c

2017-12-07 18:36:10 -0800

[diff] [blame]

65

from autotest_lib.server import constants

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

66

from autotest_lib.server import site_utils

J. Richard Barnette

a7c514e

2015-09-15 11:13:23 -0700

[diff] [blame]

67

from autotest_lib.server.cros.dynamic_suite import frontend_wrappers

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

68

from autotest_lib.server.hosts import servo_host

Aviv Keshet

7ee9586

2016-08-30 15:18:27 -0700

[diff] [blame]

69

from autotest_lib.server.lib import status_history

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

70

from autotest_lib.site_utils import gmail_lib

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

71

from chromite.lib import metrics

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

72

73

Richard Barnette

673573b

2016-12-12 09:46:39 -0800

[diff] [blame]

74

# Pool name constants, re-exported from `constants.Pools` so the rest
# of this module can refer to them by short names.
CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS
SPARE_POOL = constants.Pools.SPARE_POOL
MANAGED_POOLS = constants.Pools.MANAGED_POOLS

# _EXCLUDED_LABELS - A set of labels that disqualify a DUT from
#   monitoring by this script.  Currently, we're excluding these:
#   + 'adb' - We're not ready to monitor Android or Brillo hosts.
#   + 'board:guado_moblab' - These are maintained by a separate
#     process that doesn't use this script.
#   + 'board:veyron_rialto' due to crbug.com/854404

_EXCLUDED_LABELS = {'adb', 'board:guado_moblab',
                    'board:veyron_rialto'}

# _DEFAULT_DURATION:
#     Default value used for the --duration command line option
#     (documented in the module docstring as a number of hours).
#     Specifies how far back in time to search in order to determine
#     DUT status.

_DEFAULT_DURATION = 24

# _LOGDIR:
#     Relative path used in the calculation of the default setting for
#     the --logdir option.  The full path is relative to the root of the
#     autotest directory, as determined from sys.argv[0].
# _LOGFILE:
#     Basename of a file to which general log information will be
#     written.
# _LOG_FORMAT:
#     Format string for log messages.

_LOGDIR = os.path.join('logs', 'dut-data')
_LOGFILE = 'lab-inventory.log'
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

# Pattern describing location-based host names in the Chrome OS test
# labs.  Each DUT hostname designates the DUT's location:
#   * A lab (room) that's physically separated from other labs
#     (i.e. there's a door).
#   * A row (or aisle) of DUTs within the lab.
#   * A vertical rack of shelves on the row.
#   * A specific host on one shelf of the rack.
# The four capture groups are, in order: lab, row, rack, and host.

_HOSTNAME_PATTERN = re.compile(
        r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')

# _REPAIR_LOOP_THRESHOLD:
#    The number of repeated Repair tasks that must be seen to declare
#    that a DUT is stuck in a repair loop.

_REPAIR_LOOP_THRESHOLD = 4

# Metrics reported by this script.  All metric names share the
# `_METRICS_PREFIX` path prefix.
_METRICS_PREFIX = 'chromeos/autotest/inventory'
_UNTESTABLE_PRESENCE_METRIC = metrics.BooleanMetric(
    _METRICS_PREFIX + '/untestable',
    'DUTs that cannot be scheduled for testing')

_MISSING_DUT_METRIC = metrics.Counter(
    _METRICS_PREFIX + '/missing', 'DUTs which cannot be found by lookup queries'
    ' because they are invalid or deleted')

# _Diagnosis - namedtuple corresponding to the return value from
# `HostHistory.last_diagnosis()`; the fields are `status` and `task`,
# in that order.
_Diagnosis = collections.namedtuple('_Diagnosis', ['status', 'task'])

139

Richard Barnette

2018-09-14 15:25:30 -0700

[diff] [blame]

140

def _get_diagnosis(history):
    """Return the most recent diagnosis for a DUT as a `_Diagnosis`.

    Calls `history.last_diagnosis()` and wraps the result in a
    `_Diagnosis` tuple.  A `BROKEN` diagnosis whose task ended before
    `history.start_time` is reported as `UNUSED` instead (presumably
    because the failure predates the period being reported on).

    If the lookup raises `proxy.JSONRPCException`, the exception is
    logged as a warning and `_Diagnosis(None, None)` is returned.

    Every call increments `_MISSING_DUT_METRIC` for the host; the
    'presence' field is False only when the lookup raised.

    @param history  Object providing `last_diagnosis()`, `start_time`
                    and `hostname` for one DUT.

    @return A `_Diagnosis` tuple of (status, task).
    """
    dut_present = True
    try:
        diagnosis = _Diagnosis(*history.last_diagnosis())
        if (diagnosis.status == status_history.BROKEN
                and diagnosis.task.end_time < history.start_time):
            return _Diagnosis(status_history.UNUSED, diagnosis.task)
        return diagnosis
    except proxy.JSONRPCException as e:
        # `logging.warn` is a deprecated alias; use `logging.warning`.
        logging.warning(e)
        dut_present = False
    finally:
        # Runs on both the success and the failure path, so the metric
        # is recorded exactly once per call.
        _MISSING_DUT_METRIC.increment(
            fields={'host': history.hostname, 'presence': dut_present})
    # Only reached when the lookup failed.
    return _Diagnosis(None, None)

156

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

157

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

158

def _host_is_working(history):
    """Return whether the DUT's diagnosis status is `WORKING`.

    @param history  The history object handed to `_get_diagnosis()`.
    """
    diagnosis = _get_diagnosis(history)
    return diagnosis.status == status_history.WORKING

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

160

161

162

def _host_is_broken(history):
    """Return whether the DUT's diagnosis status is `BROKEN`.

    @param history  The history object handed to `_get_diagnosis()`.
    """
    diagnosis = _get_diagnosis(history)
    return diagnosis.status == status_history.BROKEN

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

164

165

166

def _host_is_idle(history):
    """Return whether the DUT's diagnosis status counts as idle.

    A DUT is idle when its diagnosis status is either `UNUSED` or
    `UNKNOWN`.

    @param history  The history object handed to `_get_diagnosis()`.
    """
    idle_statuses = {status_history.UNUSED, status_history.UNKNOWN}
    status = _get_diagnosis(history).status
    return status in idle_statuses

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

169

170

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

171

class _HostSetInventory(object):

172

"""Maintains a set of related `HostJobHistory` objects.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

173

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

174

Current usage of this class is that all DUTs are part of a single

175

scheduling pool of DUTs for a single model; however, this class make

176

no assumptions about the actual relationship among the DUTs.

177

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

178

The collection is segregated into disjoint categories of "working",

179

"broken", and "idle" DUTs. Accessor methods allow finding both the

180

list of DUTs in each category, as well as counts of each category.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

181

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

182

Performance note: Certain methods in this class are potentially

183

expensive:

184

* `get_working()`

185

* `get_working_list()`

186

* `get_broken()`

187

* `get_broken_list()`

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

188

* `get_idle()`

189

* `get_idle_list()`

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

190

The first time any one of these methods is called, it causes

191

multiple RPC calls with a relatively expensive set of database

192

queries. However, the results of the queries are cached in the

193

individual `HostJobHistory` objects, so only the first call

194

actually pays the full cost.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

195

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

196

Additionally, `get_working_list()`, `get_broken_list()` and

197

`get_idle_list()` cache their return values to avoid recalculating

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

198

lists at every call; this caching is separate from the caching of

199

RPC results described above.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

200

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

201

This class is deliberately constructed to delay the RPC cost until

202

the accessor methods are called (rather than to query in

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

203

`record_host()`) so that it's possible to construct a complete

204

`_LabInventory` without making the expensive queries at creation

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

205

time. `_populate_model_counts()`, below, assumes this behavior.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

"""

def __init__(self):

self._histories = []

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

210

self._working_list = None

211

self._broken_list = None

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

212

self._idle_list = None

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

213

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

214

def record_host(self, host_history):

215

"""Add one `HostJobHistory` object to the collection.

216

217

@param host_history The `HostJobHistory` object to be

218

remembered.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

219

"""

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

220

self._working_list = None

221

self._broken_list = None

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

222

self._idle_list = None

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

223

self._histories.append(host_history)

224

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

225

def get_working_list(self):

226

"""Return a list of all working DUTs in the pool.

227

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

228

Filter `self._histories` for histories where the DUT is

229

diagnosed as working.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

230

231

Cache the result so that we only cacluate it once.

232

233

@return A list of HostJobHistory objects.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

234

"""

235

if self._working_list is None:

236

self._working_list = [h for h in self._histories

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

237

if _host_is_working(h)]

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

238

return self._working_list

239

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

240

def get_working(self):

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

241

"""Return the number of working DUTs in the pool."""

242

return len(self.get_working_list())

243

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

244

def get_broken_list(self):

245

"""Return a list of all broken DUTs in the pool.

246

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

247

Filter `self._histories` for histories where the DUT is

248

diagnosed as broken.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

249

250

Cache the result so that we only cacluate it once.

251

252

@return A list of HostJobHistory objects.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

253

"""

254

if self._broken_list is None:

255

self._broken_list = [h for h in self._histories

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

256

if _host_is_broken(h)]

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

257

return self._broken_list

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

258

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

259

def get_broken(self):

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

260

"""Return the number of broken DUTs in the pool."""

261

return len(self.get_broken_list())

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

262

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

263

def get_idle_list(self):

264

"""Return a list of all idle DUTs in the pool.

265

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

266

Filter `self._histories` for histories where the DUT is

267

diagnosed as idle.

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

268

269

Cache the result so that we only cacluate it once.

270

271

@return A list of HostJobHistory objects.

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

272

"""

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

273

if self._idle_list is None:

274

self._idle_list = [h for h in self._histories

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

275

if _host_is_idle(h)]

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

276

return self._idle_list

277

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

278

def get_idle(self):

279

"""Return the number of idle DUTs in the pool."""

280

return len(self.get_idle_list())

281

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

282

def get_total(self):

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

283

"""Return the total number of DUTs in the pool."""

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

284

return len(self._histories)

285

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

286

def get_all_histories(self):

287

return self._histories

288

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

289

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

290

class _PoolSetInventory(object):
    """Maintains a set of `HostJobHistory`s for a set of pools.

    The collection is segregated into disjoint categories of "working",
    "broken", and "idle" DUTs.  Accessor methods allow finding both the
    list of DUTs in each category, as well as counts of each category.
    Accessor queries can be for an individual pool, or against all
    pools.

    Performance note:  This class relies on `_HostSetInventory`.  Public
    methods in this class generally rely on methods of the same name in
    the underlying class, and so will have the same underlying
    performance characteristics.
    """

    def __init__(self, pools):
        """Create an empty inventory with one entry per pool.

        @param pools  Iterable of the pool names to be tracked.
        """
        self._histories_by_pool = {
            pool: _HostSetInventory() for pool in pools
        }

    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        The host is filed under its `host_pool`; a `KeyError` is
        raised if that pool was not given to the constructor.

        @param host_history The `HostJobHistory` object to be
                            remembered.
        """
        pool = host_history.host_pool
        self._histories_by_pool[pool].record_host(host_history)

    def _count_pool(self, get_pool_count, pool=None):
        """Internal helper to count hosts in a given pool.

        The `get_pool_count` parameter is a function to calculate
        the exact count of interest for the pool.

        @param get_pool_count  Function to return a count from a
                               `_HostSetInventory` object.
        @param pool            The pool to be counted.  If `None`,
                               return the total across all pools.
        """
        if pool is None:
            return sum(get_pool_count(inventory)
                       for inventory in self._histories_by_pool.values())
        return get_pool_count(self._histories_by_pool[pool])

    def get_working_list(self):
        """Return a list of all working DUTs (across all pools).

        Go through all HostJobHistory objects across all pools,
        selecting all DUTs identified as working.

        @return A list of HostJobHistory objects.
        """
        working = []
        for inventory in self._histories_by_pool.values():
            working.extend(inventory.get_working_list())
        return working

    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of working DUTs in the selected
                pool(s).
        """
        return self._count_pool(_HostSetInventory.get_working, pool)

    def get_broken_list(self):
        """Return a list of all broken DUTs (across all pools).

        Go through all HostJobHistory objects across all pools,
        selecting all DUTs identified as broken.

        @return A list of HostJobHistory objects.
        """
        broken = []
        for inventory in self._histories_by_pool.values():
            broken.extend(inventory.get_broken_list())
        return broken

    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of broken DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_broken, pool)

    def get_idle_list(self, pool=None):
        """Return a list of all idle DUTs in the given pool.

        Go through the HostJobHistory objects of the selected pool(s),
        selecting all DUTs identified as idle.

        @param pool: The pool to be listed.  If `None`, return the
                     combined list across all pools.

        @return A list of HostJobHistory objects.
        """
        if pool is None:
            idle = []
            # `values()` rather than `itervalues()`, matching the other
            # accessors in this class.
            for inventory in self._histories_by_pool.values():
                idle.extend(inventory.get_idle_list())
            return idle
        return self._histories_by_pool[pool].get_idle_list()

    def get_idle(self, pool=None):
        """Return the number of idle DUTs in a pool.

        @param pool: The pool to be counted.  If `None`, return the total
                     across all pools.

        @return The total number of idle DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_idle, pool)

    def get_spares_buffer(self, spare_pool=SPARE_POOL):
        """Return the nominal number of working spares.

        Calculates and returns how many working spares there would
        be in the spares pool if all broken DUTs were in the spares
        pool.  This number may be negative, indicating a shortfall
        in the critical pools.

        @param spare_pool  Name of the pool holding the spares.

        @return The total number DUTs in the spares pool, less the total
                number of broken DUTs in all pools.
        """
        return self.get_total(spare_pool) - self.get_broken()

    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_total, pool)

    def get_all_histories(self, pool=None):
        """Generate every `HostJobHistory` in the selected pool(s).

        This is a generator; nothing is looked up until it is
        iterated.

        @param pool  The pool to be listed.  If `None`, yield the
                     histories of all pools.
        """
        if pool is None:
            for inventory in self._histories_by_pool.values():
                for history in inventory.get_all_histories():
                    yield history
        else:
            for history in self._histories_by_pool[pool].get_all_histories():
                yield history

443

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

444

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

445

def _eligible_host(afehost):
    """Return whether this host is eligible for monitoring.

    To be eligible a host must carry exactly one 'model' label and
    exactly one 'pool' label, and none of the labels listed in
    `_EXCLUDED_LABELS`.

    @param afehost  The host to be tested for eligibility.
    """
    # A DUT lacking a unique 'model' or 'pool' label isn't supposed to
    # be in the managed inventory at all; finding one generally means
    # there's an error in the database.  Unfortunately, such errors
    # have been seen to occur from time to time.
    #
    # The _LabInventory constructor requires hosts to conform to the
    # label restrictions, and may fail if they don't.  Failing a whole
    # inventory run because of one bad entry is the wrong thing, so
    # the problem children are filtered out here to keep them out of
    # the inventory.
    model_prefix = constants.Labels.MODEL_PREFIX
    pool_prefix = constants.Labels.POOL_PREFIX
    model_count = sum(1 for label in afehost.labels
                      if label.startswith(model_prefix))
    pool_count = sum(1 for label in afehost.labels
                     if label.startswith(pool_prefix))
    has_excluded = bool(_EXCLUDED_LABELS.intersection(afehost.labels))
    if model_count != 1 or pool_count != 1:
        return False
    return not has_excluded

470

471

472

class _LabInventory(collections.Mapping):

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

473

"""Collection of `HostJobHistory` objects for the Lab's inventory.

474

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

475

This is a dict-like collection indexed by model. Indexing returns

476

the _PoolSetInventory object associated with the model.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

477

"""

478

479

@classmethod

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

480

def create_inventory(cls, afe, start_time, end_time, modellist=[]):

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

481

"""Return a Lab inventory with specified parameters.

482

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

483

By default, gathers inventory from `HostJobHistory` objects for

484

all DUTs in the `MANAGED_POOLS` list. If `modellist` is

485

supplied, the inventory will be restricted to only the given

486

models.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

487

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

488

@param afe AFE object for constructing the

489

`HostJobHistory` objects.

490

@param start_time Start time for the `HostJobHistory` objects.

491

@param end_time End time for the `HostJobHistory` objects.

492

@param modellist List of models to include. If empty,

493

include all available models.

494

@return A `_LabInventory` object for the specified models.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

495

"""

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

496

target_pools = MANAGED_POOLS

497

label_list = [constants.Labels.POOL_PREFIX + l for l in target_pools]

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

498

afehosts = afe.get_hosts(labels__name__in=label_list)

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

499

if modellist:

J. Richard Barnette

2016-03-17 17:03:57 -0700

[diff] [blame]

500

# We're deliberately not checking host eligibility in this

501

# code path. This is a debug path, not used in production;

502

# it may be useful to include ineligible hosts here.

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

503

modelhosts = []

504

for model in modellist:

505

model_label = constants.Labels.MODEL_PREFIX + model

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

506

host_list = [h for h in afehosts

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

507

if model_label in h.labels]

508

modelhosts.extend(host_list)

509

afehosts = modelhosts

J. Richard Barnette

2016-03-17 17:03:57 -0700

[diff] [blame]

510

else:

Richard Barnette

3a40449

2018-02-08 13:57:01 -0800

[diff] [blame]

511

afehosts = [h for h in afehosts if _eligible_host(h)]

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

512

create = lambda host: (

513

status_history.HostJobHistory(afe, host,

514

start_time, end_time))

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

515

return cls([create(host) for host in afehosts], target_pools)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

516

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

517

def __init__(self, histories, pools):
    """Build the inventory from a collection of DUT histories.

    One `_PoolSetInventory` is created for every distinct
    `host_model` found in `histories`, and each history is then
    recorded under its model.  The number of histories and the set
    of distinct `host_board` values are saved for later queries.

    @param histories  Collection of history objects; it is iterated
                      more than once and must support `len()`.
    @param pools      Passed unchanged to each `_PoolSetInventory`.
    """
    model_names = set(history.host_model for history in histories)
    self._modeldata = dict(
        (name, _PoolSetInventory(pools)) for name in model_names)
    self._dut_count = len(histories)
    for history in histories:
        self[history.host_model].record_host(history)
    self._boards = set(history.host_board for history in histories)

Prathmesh Prabhu

154cb2b

2017-11-08 17:36:51 -0800

[diff] [blame]

524

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

525

def __getitem__(self, key):

526

return self._modeldata.__getitem__(key)

Prathmesh Prabhu

021e784

2017-11-08 18:05:45 -0800

[diff] [blame]

527

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

528

def __len__(self):

529

return self._modeldata.__len__()

530

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

531

def __iter__(self):

532

return self._modeldata.__iter__()

533

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

534

def get_num_duts(self):
    """Return the total number of DUTs in the inventory.

    The value is the DUT count saved on the instance as
    `self._dut_count`.
    """
    dut_count = self._dut_count
    return dut_count

537

Prathmesh Prabhu

a5a0e3d

2017-11-09 08:53:53 -0800

[diff] [blame]

538

def get_num_models(self):
    """Return the total number of models in the inventory.

    This is simply `len()` applied to the inventory itself.
    """
    model_count = len(self)
    return model_count

541

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

542

def get_pool_models(self, pool):
    """Return all models in `pool`.

    A model is included when its pool data reports a non-zero
    `get_total(pool)`.

    @param pool  The pool to be inventoried for models.
    @return A set of model names.
    """
    models_in_pool = set()
    for model, poolset in self.iteritems():
        if poolset.get_total(pool):
            models_in_pool.add(model)
    return models_in_pool

548

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

549

def get_boards(self):
    """Return the set of boards saved for this inventory.

    The same set object held by the instance is returned, not a
    copy.
    """
    boards = self._boards
    return boards

Prathmesh Prabhu

a5a0e3d

2017-11-09 08:53:53 -0800

[diff] [blame]

551

552

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

553

def _reportable_models(inventory, spare_pool=SPARE_POOL):
    """Generate the `(model, poolset)` pairs that qualify for reports.

    Walks `inventory.iteritems()` and passes through only the
    reportable models.  A model is reportable when it has DUTs in
    `spare_pool` and also has DUTs in at least one other pool, i.e.
    its spare total is neither zero nor equal to its overall total.

    @param inventory   Object whose `iteritems()` yields pairs of a
                       model and that model's pool data.
    @param spare_pool  The spare pool to be tested for reporting.
    """
    for model, poolset in inventory.iteritems():
        spare_total = poolset.get_total(spare_pool)
        overall_total = poolset.get_total()
        if spare_total == 0 or spare_total == overall_total:
            continue
        yield model, poolset

def _all_dut_histories(inventory):

570

for poolset in inventory.itervalues():

571

for h in poolset.get_all_histories():

yield h

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

575

def _sort_by_location(inventory_list):
    """Return a list of DUTs, organized by location.

    The given `HostJobHistory` objects are grouped into one list per
    lab, and each lab's list is sorted by location:
      * first by row number within the lab,
      * then by rack number within the row,
      * then by host shelf number within the rack.

    Histories whose hostname does not match `_HOSTNAME_PATTERN` are
    left out of the result.

    Implementation note: a location is turned into a single sort key
    by treating row, rack and host as the digits of a base 100
    number.  If any of them falls outside the range [0..99], the
    sort order breaks down.

    @param inventory_list  List of `HostJobHistory` objects to sort.
    @return A list of sorted lists of DUTs, one list per lab.
    """
    BASE = 100
    duts_by_lab = {}
    for history in inventory_list:
        match = _HOSTNAME_PATTERN.match(history.host.hostname)
        if not match:
            continue
        lab, row, rack, shelf = match.group(1, 2, 3, 4)
        sort_key = (int(row) * BASE + int(rack)) * BASE + int(shelf)
        duts_by_lab.setdefault(lab, []).append((sort_key, history))
    sorted_labs = []
    for entries in duts_by_lab.values():
        # Sort on the numeric key only; ties keep their input order.
        ordered = sorted(entries, key=lambda entry: entry[0])
        sorted_labs.append([entry[1] for entry in ordered])
    return sorted_labs

def _score_repair_set(buffer_counts, repair_list):
    """Return a numeric score rating a set of DUTs to be repaired.

    `buffer_counts` is a dictionary mapping model names to the size of
    the model's spares buffer.

    `repair_list` is a list of `HostJobHistory` objects for the DUTs to
    be repaired.

    This function calculates the new set of buffer counts that would
    result from the proposed repairs, and scores the new set using two
    numbers:
      * Worst case buffer count for any model (higher is better).  This
        is the more significant number for comparison.
      * Number of models at the worst case (lower is better).  This is
        the less significant number.

    Implementation note:  The score could fail to reflect the intended
    criteria if there are more than 1000 models in the inventory.

    @param buffer_counts  A dictionary mapping models to buffer counts.
                          Must not be empty.
    @param repair_list    A list of `HostJobHistory` objects for the
                          DUTs to be repaired.
    @return A numeric score.
    @raises ValueError if `buffer_counts` is empty.
    """
    _NMODELS = 1000
    pools = {h.host_pool for h in repair_list}
    repair_inventory = _LabInventory(repair_list, pools)
    # Record the buffer count each model would have after the repairs.
    # The model names are discarded, as they don't contribute to the
    # final score.
    new_counts = []
    for model, count in buffer_counts.iteritems():
        if model in repair_inventory:
            repaired = repair_inventory[model].get_total()
        else:
            repaired = 0
        new_counts.append(count + repaired)
    # Find the worst available spares count, and count how many
    # models share that worst case.
    worst_count = min(new_counts)
    num_worst = new_counts.count(worst_count)
    return _NMODELS * worst_count - num_worst

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

662

663

664

def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Returns a message recommending a list of broken DUTs to be repaired.
    The list of DUTs is selected based on these criteria:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical proximity.
      * DUTs for models with a low spares buffer are more important than
        DUTs with larger buffers.

    The algorithm used will guarantee that at least one DUT from a model
    with the lowest spares buffer will be recommended.  If the worst
    spares buffer number is shared by more than one model, the algorithm
    will tend to prefer repair sets that include more of those models
    over sets that cover fewer models.

    @param inventory      `_LabInventory` object from which to generate
                          recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation message.
    """
    logging.debug('Creating DUT repair recommendations')
    model_buffer_counts = {}
    broken_list = []
    # Only reportable models that currently have broken DUTs take part
    # in the scoring.
    for model, counts in _reportable_models(inventory):
        logging.debug('Listing failed DUTs for %s', model)
        if counts.get_broken() != 0:
            model_buffer_counts[model] = counts.get_spares_buffer()
            broken_list.extend(counts.get_broken_list())
    # N.B. The logic inside this loop may seem complicated, but
    # simplification is hard:
    #   * Calculating an initial recommendation outside of
    #     the loop likely would make things more complicated,
    #     not less.
    #   * It's necessary to calculate an initial lab slice once per
    #     lab _before_ the while loop, in case the number of broken
    #     DUTs in a lab is less than `num_recommend`.
    recommendation = None
    best_score = None
    for lab_duts in _sort_by_location(broken_list):
        # Slide a window of `num_recommend` DUTs along the lab's
        # location-sorted list, keeping the best-scoring window.
        start = 0
        end = num_recommend
        lab_slice = lab_duts[start : end]
        lab_score = _score_repair_set(model_buffer_counts, lab_slice)
        while end < len(lab_duts):
            start += 1
            end += 1
            new_slice = lab_duts[start : end]
            new_score = _score_repair_set(model_buffer_counts, new_slice)
            if new_score > lab_score:
                lab_slice = new_slice
                lab_score = new_score
        # Keep the best window seen across all labs.
        if recommendation is None or lab_score > best_score:
            recommendation = lab_slice
            best_score = lab_score
    # N.B. The trailing space in `line_fmt` is mandatory:  Without it,
    # Gmail will parse the URL wrong.  Don't ask.  If you simply _must_
    # know more, go try it yourself...
    line_fmt = '%-30s %-16s %-6s\n %s '
    message = ['Repair recommendations:\n',
               line_fmt % ( 'Hostname', 'Model', 'Servo?', 'Logs URL')]
    if recommendation:
        for h in recommendation:
            servo_name = servo_host.make_servo_hostname(h.host.hostname)
            servo_present = utils.host_is_in_lab_zone(servo_name)
            event = _get_diagnosis(h).task
            line = line_fmt % (
                    h.host.hostname, h.host_model,
                    'Yes' if servo_present else 'No', event.job_url)
            message.append(line)
    else:
        message.append('(No DUTs to repair)')
    return '\n'.join(message)

737

738

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

739

def _generate_model_inventory_message(inventory):
    """Generate the "model inventory" e-mail message.

    The model inventory is a list by model summarizing the number of
    working, broken, and idle DUTs, and the total shortfall or surplus
    of working devices relative to the minimum critical pool
    requirement.

    The report omits models with no DUTs in the spare pool or with no
    DUTs in a critical pool.

    If no DUTs at all are counted, every percentage in the summary is
    reported as 0 rather than dividing by zero.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating model inventory')
    nworking = 0
    nbroken = 0
    nidle = 0
    nbroken_models = 0
    ntotal_models = 0
    summaries = []
    column_names = (
        'Model', 'Avail', 'Bad', 'Idle', 'Good', 'Spare', 'Total')
    for model, counts in _reportable_models(inventory):
        logging.debug('Counting %2d DUTS for model %s',
                      counts.get_total(), model)
        # Summary elements laid out in the same order as the column
        # headers:
        #     Model Avail   Bad  Idle  Good  Spare Total
        #      e[0]  e[1]  e[2]  e[3]  e[4]  e[5]  e[6]
        element = (model,
                   counts.get_spares_buffer(),
                   counts.get_broken(),
                   counts.get_idle(),
                   counts.get_working(),
                   counts.get_total(SPARE_POOL),
                   counts.get_total())
        # Only models with broken DUTs get a row in the detail table,
        # but every reportable model feeds the summary totals.
        if element[2]:
            summaries.append(element)
            nbroken_models += 1
        ntotal_models += 1
        nbroken += element[2]
        nidle += element[3]
        nworking += element[4]
    ntotal = nworking + nbroken + nidle
    # Smallest spares buffer first; ties put the most broken first.
    summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
    if ntotal:
        broken_percent = int(round(100.0 * nbroken / ntotal))
        idle_percent = int(round(100.0 * nidle / ntotal))
        # Derived by subtraction so the three figures sum to 100.
        working_percent = 100 - broken_percent - idle_percent
    else:
        # Nothing was counted; avoid a ZeroDivisionError.
        broken_percent = 0
        idle_percent = 0
        working_percent = 0
    message = ['Summary of DUTs in inventory:',
               '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
               '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
                   nbroken, broken_percent,
                   nidle, idle_percent,
                   nworking, working_percent,
                   ntotal),
               '',
               'Models with failures: %d' % nbroken_models,
               'Models in inventory: %d' % ntotal_models,
               '', '',
               'Full model inventory:\n',
               '%-22s %5s %5s %5s %5s %5s %5s' % column_names]
    message.extend(
            ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
    return '\n'.join(message)

807

808

J. Richard Barnette

4845fcf

2015-04-20 14:26:25 -0700

[diff] [blame]

809

_POOL_INVENTORY_HEADER = '''\

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

810

Notice to Infrastructure deputies: All models shown below are at

J. Richard Barnette

c9a143c

2015-06-04 11:11:19 -0700

[diff] [blame]

811

less than full strength, please take action to resolve the issues.

812

Once you're satisified that failures won't recur, failed DUTs can

813

be replaced with spares by running `balance_pool`. Detailed

814

instructions can be found here:

J. Richard Barnette

4845fcf

2015-04-20 14:26:25 -0700

[diff] [blame]

815

http://go/cros-manage-duts

'''

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

819

def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    For each pool in `CRITICAL_POOLS` the message lists, by model, the
    number of broken, idle and working DUTs and the pool total.  A
    model appears only if it has at least one broken or idle DUT in
    that pool; rows are ordered with the most broken DUTs first.

    N.B. For sample output text formatted as users can expect to see it
    in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating pool inventory')
    row_fmt = '%-20s %5d %5d %5d %5d'
    report_lines = [_POOL_INVENTORY_HEADER]
    for index, pool in enumerate(CRITICAL_POOLS):
        # Every pool section after the first is set off by a blank line.
        separator = '\n' if index else ''
        report_lines.append(
            '%sStatus for pool:%s, by model:' % (separator, pool))
        report_lines.append(
            '%-20s %5s %5s %5s %5s' % (
                'Model', 'Bad', 'Idle', 'Good', 'Total'))
        rows = []
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            broken = counts.get_broken(pool)
            idle = counts.get_idle(pool)
            # models at full strength are not reported
            if broken or idle:
                working = counts.get_working(pool)
                total = counts.get_total(pool)
                rows.append((model, broken, idle, working, total))
        if not rows:
            report_lines.append('(All models at full strength)')
            continue
        # Stable sort, largest broken count first.
        rows.sort(key=lambda row: row[1], reverse=True)
        report_lines.extend([row_fmt % row for row in rows])
    return '\n'.join(report_lines)

861

862

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

863

_IDLE_INVENTORY_HEADER = '''\

864

Notice to Infrastructure deputies: The hosts shown below haven't

865

run any jobs for at least 24 hours. Please check each host; locked

866

hosts should normally be unlocked; stuck jobs should normally be

aborted.

'''

def _generate_idle_inventory_message(inventory):
    """Generate the "idle inventory" e-mail message.

    The idle inventory lists, for every pool in `MANAGED_POOLS`, each
    host reported idle together with its model and pool.

    N.B. For sample output text format as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating idle inventory')
    row_fmt = '%-30s %-20s %s'
    report_lines = [
        _IDLE_INVENTORY_HEADER,
        'Idle Host List:',
        row_fmt % ('Hostname', 'Model', 'Pool'),
    ]
    idle_rows = []
    for pool in MANAGED_POOLS:
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            for dut in counts.get_idle_list(pool):
                idle_rows.append((dut.host.hostname, model, pool))
    if not idle_rows:
        report_lines.append('(No idle DUTs)')
    else:
        report_lines.extend([row_fmt % row for row in idle_rows])
    return '\n'.join(report_lines)

899

900

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

901

def _send_email(arguments, tag, subject, recipients, body):

902

"""Send an inventory e-mail message.

903

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

904

The message is logged in the selected log directory using `tag` for

905

the file name.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

906

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

907

If the --debug option was requested, the message is neither logged

908

nor sent, but merely printed on stdout.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

909

910

@param arguments Parsed command-line options.

911

@param tag Tag identifying the inventory for logging

912

purposes.

913

@param subject E-mail Subject: header line.

914

@param recipients E-mail addresses for the To: header line.

915

@param body E-mail message body.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

916

"""

917

logging.debug('Generating email: "%s"', subject)

918

all_recipients = ', '.join(recipients)

919

report_body = '\n'.join([

920

'To: %s' % all_recipients,

921

'Subject: %s' % subject,

922

'', body, ''])

J. Richard Barnette

2015-10-13 16:02:47 -0700

[diff] [blame]

923

if arguments.debug:

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

924

print report_body

925

else:

926

filename = os.path.join(arguments.logdir, tag)

927

try:

928

report_file = open(filename, 'w')

929

report_file.write(report_body)

930

report_file.close()

931

except EnvironmentError as e:

932

logging.error('Failed to write %s: %s', filename, e)

933

try:

934

gmail_lib.send_email(all_recipients, subject, body)

935

except Exception as e:

936

logging.error('Failed to send e-mail to %s: %s',

all_recipients, e)

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

940

def _populate_model_counts(inventory):

941

"""Gather model counts while providing interactive feedback.

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

942

943

Gathering the status of all individual DUTs in the lab can take

944

considerable time (~30 minutes at the time of this writing).

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

945

Normally, we pay that cost by querying as we go. However, with

946

the `--debug` option, we expect a human being to be watching the

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

947

progress in real time. So, we force the first (expensive) queries

948

to happen up front, and provide simple ASCII output on sys.stdout

949

to show a progress bar and results.

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

950

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

951

@param inventory `_LabInventory` object from which to gather

952

counts.

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

953

"""

954

n = 0

955

total_broken = 0

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

956

for counts in inventory.itervalues():

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

n += 1

if n % 10 == 5:

c = '+'

elif n % 10 == 0:

c = '%d' % ((n / 10) % 10)

else:

c = '.'

sys.stdout.write(c)

sys.stdout.flush()

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

966

# This next call is where all the time goes - it forces all of a

967

# model's `HostJobHistory` objects to query the database and

968

# cache their results.

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

969

total_broken += counts.get_broken()

970

sys.stdout.write('\n')

971

sys.stdout.write('Found %d broken DUTs\n' % total_broken)

972

973

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

974

def _perform_model_inventory(arguments, inventory, timestamp):
    """Perform the model inventory report.

    The model inventory report consists of the following:
      * A list of DUTs that are recommended to be repaired.  This list
        is optional, and only appears if the `--recommend` option is
        present.
      * A list of all models that have failed DUTs, with counts
        of working, broken, and spare DUTs, among others.

    The combined text is handed to `_send_email`, addressed to
    `arguments.model_notify`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp
                      in logs and email output.
    """
    recommend_message = ''
    if arguments.recommend:
        recommendation = _generate_repair_recommendation(
                inventory, arguments.recommend)
        recommend_message = recommendation + '\n\n\n'
    model_message = _generate_model_inventory_message(inventory)
    log_tag = 'models-%s.txt' % timestamp
    subject = 'DUT model inventory %s' % timestamp
    _send_email(arguments, log_tag, subject, arguments.model_notify,
                recommend_message + model_message)

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

1001

1002

1003

def _perform_pool_inventory(arguments, inventory, timestamp):
    """Perform the pool inventory report.

    The pool inventory report consists of the following:
      * A list of all critical pools that have failed DUTs, with counts
        of working, broken, and idle DUTs.
      * A list of all idle DUTs by hostname including the model and
        pool.

    The two parts are separated by blank lines and handed to
    `_send_email`, addressed to `arguments.pool_notify`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp in
                      logs and email output.
    """
    sections = [
        _generate_pool_inventory_message(inventory),
        _generate_idle_inventory_message(inventory),
    ]
    log_tag = 'pools-%s.txt' % timestamp
    subject = 'DUT pool inventory %s' % timestamp
    _send_email(arguments, log_tag, subject, arguments.pool_notify,
                '\n\n\n'.join(sections))

1025

1026

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1027

def _dut_in_repair_loop(history):
    """Return whether a DUT's history indicates a repair loop.

    A DUT is considered looping if it runs no tests, and no tasks pass
    other than repair tasks.

    @param history  An instance of `status_history.HostJobHistory` to be
                    scanned for a repair loop.  The caller guarantees
                    that this history corresponds to a working DUT.
    @returns Return a true value if the DUT's most recent history
             indicates a repair loop.
    """
    # Our caller passes only histories for working DUTs; that means
    # we've already paid the cost of fetching the diagnosis task, and
    # we know that the task was successful.  The diagnosis task will be
    # one of the tasks we must scan to find a loop, so if the task isn't
    # a repair task, then our history includes a successful non-repair
    # task, and we're not looping.
    #
    # The for loop below is very expensive, because it must fetch the
    # full history, regardless of how many tasks we examine.  At the
    # time of this writing, this check against the diagnosis task
    # reduces the cost of finding loops in the full inventory from hours
    # to minutes.
    if _get_diagnosis(history).task.name != 'Repair':
        return False
    successful_repairs = 0
    for task in history:
        # A test ran, or a repair failed: either way, not a loop.
        if not task.is_special or task.diagnosis == status_history.BROKEN:
            return False
        if task.name == 'Repair':
            # Not BROKEN, so this repair counts toward the threshold.
            successful_repairs += 1
            if successful_repairs >= _REPAIR_LOOP_THRESHOLD:
                return True
        elif task.diagnosis == status_history.WORKING:
            # Non-repair task succeeded, so we're not looping.
            return False
        # Otherwise this is a failed non-repair task; keep scanning.

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

1073

def _report_untestable_dut(history, state):
    """Log one untestable DUT and record it in the presence metric.

    @param history  History object for the DUT; its `hostname`,
                    `host_model`, and `host_pool` attributes are
                    included in the report.
    @param state    String naming the untestable state being reported.
    """
    metric_fields = dict(
        dut_hostname=history.hostname,
        model=history.host_model,
        pool=history.host_pool,
        state=state,
    )
    logging.info(
        'DUT in state %(state)s: %(dut_hostname)s, '
        'model: %(model)s, pool: %(pool)s',
        metric_fields)
    _UNTESTABLE_PRESENCE_METRIC.set(True, fields=metric_fields)

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1083

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1084

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

1085

def _report_untestable_dut_metrics(inventory):
    """Scan the inventory for DUTs unable to run tests.

    DUTs in the inventory are judged "untestable" if they meet one of
    two criteria:
      * The DUT is stuck in a repair loop; that is, it regularly passes
        repair, but never passes other operations.
      * The DUT runs no tasks at all, but is not locked.

    This routine walks through the given inventory looking for DUTs in
    either of these states.  Results are reported via a Monarch presence
    metric.

    Note:  To make sure that DUTs aren't flagged as "idle" merely
    because there's no work, a separate job runs prior to regular
    inventory runs which schedules trivial work on any DUT that appears
    idle.

    @param inventory  `_LabInventory` object to be reported on.
    """
    logging.info('Scanning for untestable DUTs.')
    for dut_history in _all_dut_histories(inventory):
        # Managed DUTs with names that don't match
        # _HOSTNAME_PATTERN shouldn't be possible.  However, we
        # don't want arbitrary strings being attached to the
        # 'dut_hostname' field, so for safety, we exclude all
        # anomalies.
        if not _HOSTNAME_PATTERN.match(dut_history.hostname):
            continue
        # Classify the DUT first, then report once at the bottom.
        untestable_state = None
        if _host_is_working(dut_history):
            if _dut_in_repair_loop(dut_history):
                untestable_state = 'repair_loop'
        elif _host_is_idle(dut_history) and not dut_history.host.locked:
            untestable_state = 'idle_unlocked'
        if untestable_state is not None:
            _report_untestable_dut(dut_history, untestable_state)

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1120

1121

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

1122

def _log_startup(arguments, startup_time):
    """Log the start of this inventory run.

    Emit debug log messages announcing the run and which reports the
    command line has asked for.  Return a string based on
    `startup_time` that will be used to identify this run in log files
    and e-mail messages.

    @param arguments     Command-line arguments as returned by
                         `ArgumentParser`.
    @param startup_time  A UNIX timestamp marking the moment when
                         this inventory run began.
    @returns A timestamp string (local time, to the hour) that will be
             used to identify this run in logs and email output.
    """
    local_start = time.localtime(startup_time)
    run_label = time.strftime('%Y-%m-%d.%H', local_start)
    logging.debug('Starting lab inventory for %s', run_label)
    # Collect the announcements first, then log them in order.
    announcements = []
    if arguments.model_notify:
        if arguments.recommend:
            announcements.append('Will include repair recommendations')
        announcements.append('Will include model inventory')
    if arguments.pool_notify:
        announcements.append('Will include pool inventory')
    for announcement in announcements:
        logging.debug(announcement)
    return run_label

def _create_inventory(arguments, end_time):
    """Create the `_LabInventory` instance to use for reporting.

    The searched time range ends at `end_time` and reaches back
    `arguments.duration` hours.  The number of hosts and models found
    is logged at INFO level.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    @param end_time   A UNIX timestamp for the end of the time range
                      to be searched in this inventory run.
    @returns The newly created inventory object.
    """
    seconds_per_hour = 60 * 60
    window_start = end_time - arguments.duration * seconds_per_hour
    afe_client = frontend_wrappers.RetryingAFE(server=None)
    lab_inventory = _LabInventory.create_inventory(
        afe_client, window_start, end_time, arguments.modelnames)
    logging.info(
        'Found %d hosts across %d models',
        lab_inventory.get_num_duts(),
        lab_inventory.get_num_models())
    return lab_inventory

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1162

def _perform_inventory_reports(arguments):
    """Perform all inventory checks requested on the command line.

    Create the initial inventory, then run each report step whose
    controlling command-line attribute is true.  Steps run in a fixed
    order: model counts (debug only), model inventory, pool inventory,
    and finally the untestable-DUT scan.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    """
    run_started = time.time()
    timestamp = _log_startup(arguments, run_started)
    inventory = _create_inventory(arguments, run_started)
    # Each entry pairs the controlling attribute of `arguments` with
    # the step it enables.  Attributes are read lazily, one at a time,
    # immediately before the corresponding step would run.
    report_steps = (
        ('debug',
         lambda: _populate_model_counts(inventory)),
        ('model_notify',
         lambda: _perform_model_inventory(arguments, inventory, timestamp)),
        ('pool_notify',
         lambda: _perform_pool_inventory(arguments, inventory, timestamp)),
        ('report_untestable',
         lambda: _report_untestable_dut_metrics(inventory)),
    )
    for option_name, run_step in report_steps:
        if getattr(arguments, option_name):
            run_step()

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1182

1183

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1184

def _separate_email_addresses(address_list):
    """Flatten comma-separated e-mail address strings into one list.

    Every string in `address_list` is split on commas, and leading and
    trailing whitespace is stripped from each resulting piece.  Pieces
    are returned in the order they appear; empty pieces are kept.

    @param address_list  A list of strings containing comma
                         separated e-mail addresses.
    @return A list of the individual e-mail addresses.
    """
    return [address.strip()
            for group in address_list
            for address in group.split(',')]

def _verify_arguments(arguments):
    """Validate command-line arguments.

    Join comma separated e-mail addresses for `--model-notify` and
    `--pool-notify` in separate option arguments into a single list;
    the two attributes on `arguments` are replaced in place.

    For non-debug uses, require that at least one inventory report be
    requested.  For debug, if a report isn't specified, treat it as "run
    all the reports."

    The return value indicates success or failure; in the case of
    failure, we also write an error message to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.
    """
    arguments.model_notify = _separate_email_addresses(
        arguments.model_notify)
    arguments.pool_notify = _separate_email_addresses(
        arguments.pool_notify)
    if (arguments.model_notify
            or arguments.pool_notify
            or arguments.report_untestable):
        return True
    if arguments.debug:
        # We want to run all the e-mail reports.  An empty notify
        # list will cause a report to be skipped, so make sure the
        # lists are non-empty.
        arguments.model_notify = ['']
        arguments.pool_notify = ['']
        return True
    sys.stderr.write('Must request at least one report via '
                     '--model-notify, --pool-notify, or '
                     '--report-untestable\n')
    return False

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1233

1234

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

1235

def _get_default_logdir(script):
    """Get the default directory for the `--logdir` option.

    The default is the `_LOGDIR` entry under the parent of the
    directory that contains this script.

    @param script  Path to this script file.
    @return A path to a directory.
    """
    script_dir = os.path.dirname(os.path.abspath(script))
    parent_dir = os.path.dirname(script_dir)
    return os.path.join(parent_dir, _LOGDIR)

1247

1248

1249

def _parse_command(argv):
    """Parse and validate the command line arguments.

    Create an argument parser for this command's syntax, parse
    `argv[1:]` with it, and check the result with
    `_verify_arguments()`.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args(), or `None`
            if the parsed arguments fail verification.
    """
    command_name = argv[0]
    cmd_parser = argparse.ArgumentParser(
        prog=command_name,
        description='Gather and report lab inventory statistics')
    add_option = cmd_parser.add_argument

    add_option('-d', '--duration', type=int,
               default=_DEFAULT_DURATION, metavar='HOURS',
               help='number of hours back to search for status'
                    ' (default: %d)' % _DEFAULT_DURATION)
    add_option('--model-notify', action='append',
               default=[], metavar='ADDRESS',
               help='Generate model inventory message, '
                    'and send it to the given e-mail address(es)')
    add_option('--pool-notify', action='append',
               default=[], metavar='ADDRESS',
               help='Generate pool inventory message, '
                    'and send it to the given address(es)')
    add_option('-r', '--recommend', type=int, default=None,
               help=('Specify how many DUTs should be '
                     'recommended for repair (default: no '
                     'recommendation)'))
    add_option('--report-untestable', action='store_true',
               help='Check for devices unable to run tests.')
    add_option('--debug', action='store_true',
               help='Print e-mail, metrics messages on stdout '
                    'without sending them.')
    add_option('--no-metrics', action='store_false',
               dest='use_metrics',
               help='Suppress generation of Monarch metrics.')
    add_option('--logdir', default=_get_default_logdir(command_name),
               help='Directory where logs will be written.')
    add_option('modelnames', nargs='*',
               metavar='MODEL',
               help='names of models to report on '
                    '(default: all models)')

    parsed = cmd_parser.parse_args(argv[1:])
    if _verify_arguments(parsed):
        return parsed
    return None

def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--debug` option was provided on
    the command line.
      * Without the option, we configure the logging to capture all
        potentially relevant events in a log file.  The log file is
        configured to rotate once a week on Friday evening, preserving
        ~3 months worth of history.
      * With the option, we expect stdout to contain other
        human-readable output (including the contents of the e-mail
        messages), so we restrict the output to INFO level.

    For convenience, when `--debug` is on, the logging format has
    no adornments, so that a call like `logging.info(msg)` simply writes
    `msg` to stdout, plus a trailing newline.

    On return, the root logger has exactly one handler: the one
    created here.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    """
    root_logger = logging.getLogger()
    if arguments.debug:
        root_logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter())
    else:
        if not os.path.exists(arguments.logdir):
            os.mkdir(arguments.logdir)
        root_logger.setLevel(logging.DEBUG)
        logfile = os.path.join(arguments.logdir, _LOGFILE)
        handler = logging.handlers.TimedRotatingFileHandler(
                logfile, when='W4', backupCount=13)
        formatter = logging.Formatter(_LOG_FORMAT,
                                      time_utils.TIME_FMT)
        handler.setFormatter(formatter)
    # TODO(jrbarnette) This is gross.  Importing client.bin.utils
    # implicitly imported logging_config, which calls
    # logging.basicConfig() *at module level*.  That gives us an
    # extra logging handler that we don't want.  So, clear out all
    # the handlers here.
    #
    # Iterate over a copy:  removeHandler() mutates the live
    # `handlers` list, and removing while iterating that list
    # directly would skip every other handler.
    for stale_handler in list(root_logger.handlers):
        root_logger.removeHandler(stale_handler)
    root_logger.addHandler(handler)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1342

1343

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1344

def main(argv):
    """Standard main routine.

    Parse the command line (exiting with status 1 if that fails),
    configure logging, and run the requested inventory reports.  Unless
    metrics are disabled, the reports run inside a ts_mon session that
    records a duration timer and a success/failure tick, and is flushed
    explicitly when the reports finish.

    `KeyboardInterrupt` is swallowed; any other exception is logged and
    re-raised.

    @param argv  Command line arguments, including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)

    try:
        if not arguments.use_metrics:
            _perform_inventory_reports(arguments)
            return

        # In debug mode the metrics are sent to /dev/null instead of
        # being reported.
        metrics_file = None
        if arguments.debug:
            logging.info('Debug mode: Will not report metrics to monarch.')
            metrics_file = '/dev/null'
        with site_utils.SetupTsMonGlobalState(
                'lab_inventory', debug_file=metrics_file,
                auto_flush=False):
            succeeded = False
            try:
                with metrics.SecondsTimer('%s/duration' % _METRICS_PREFIX):
                    _perform_inventory_reports(arguments)
                succeeded = True
            finally:
                tick_counter = metrics.Counter('%s/tick' % _METRICS_PREFIX)
                tick_counter.increment(fields={'success': succeeded})
                metrics.Flush()
    except KeyboardInterrupt:
        pass
    except Exception:
        # Our cron setup doesn't preserve stderr, so drop extra breadcrumbs.
        logging.exception('Error escaped main')
        raise

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1381

1382

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

1383

def get_inventory(afe):
    """Create a `_LabInventory` covering the most recent 24 hours.

    The time range ends at the current time, truncated to whole
    seconds.

    @param afe  AFE object passed through to
                `_LabInventory.create_inventory()`.
    @returns The inventory created for that time range.
    """
    seconds_per_day = 24 * 60 * 60
    now = int(time.time())
    return _LabInventory.create_inventory(afe, now - seconds_per_day, now)

1387

1388

1389

def get_managed_boards(afe):
    """Return the boards reported by the inventory for `afe`.

    @param afe  AFE object passed through to `get_inventory()`.
    @returns The result of calling `get_boards()` on that inventory.
    """
    inventory = get_inventory(afe)
    return inventory.get_boards()

J. Richard Barnette

aa86893

2015-10-23 13:28:59 -0700

[diff] [blame]

1391

1392

J. Richard Barnette