Blame - site_utils/lab_inventory.py - platform/external/autotest

2015-03-27 17:23:52 -0700

[diff] [blame]

1

#!/usr/bin/env python

2

3

# Use of this source code is governed by a BSD-style license that can be

4

# found in the LICENSE file.

5

6

"""Create e-mail reports of the Lab's DUT inventory.

7

8

Gathers a list of all DUTs of interest in the Lab, segregated by

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

9

model and pool, and determines whether each DUT is working or

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

10

broken. Then, send one or more e-mail reports summarizing the

11

status to e-mail addresses provided on the command line.

12

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

13

usage: lab_inventory.py [ options ] [ model ... ]

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

14

15

Options:

16

--duration / -d <hours>

17

How far back in time to search job history to determine DUT

18

status.

19

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

20

--model-notify <address>[,<address>]

21

Send the "model status" e-mail to all the specified e-mail

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

22

addresses.

23

24

--pool-notify <address>[,<address>]

25

Send the "pool status" e-mail to all the specified e-mail

26

addresses.

27

J. Richard Barnette

2015-06-09 10:06:17 -0700

[diff] [blame]

28

--recommend <number>

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

29

When generating the "model status" e-mail, include a list of

J. Richard Barnette

2015-06-09 10:06:17 -0700

[diff] [blame]

30

<number> specific DUTs to be recommended for repair.

31

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

32

--report-untestable

33

Scan the inventory for DUTs that can't test because they're stuck in

34

repair loops, or because the scheduler can't give them work.

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

35

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

36

--logdir <directory>

37

Log progress and actions in a file under this directory. Text

38

of any e-mail sent will also be logged in a timestamped file in

39

this directory.

40

J. Richard Barnette

2015-10-13 16:02:47 -0700

[diff] [blame]

41

--debug

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

42

Suppress all logging, metrics reporting, and sending e-mail.

43

Instead, write the output that would be generated onto stdout.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

44

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

45

<model> arguments:

46

With no arguments, gathers the status for all models in the lab.

47

With one or more named models on the command line, restricts

48

reporting to just those models.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

"""

import argparse

Prathmesh Prabhu

021e784

2017-11-08 18:05:45 -0800

[diff] [blame]

53

import collections

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

54

import logging

55

import logging.handlers

56

import os

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

57

import re

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

import sys

import time

import common

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

62

from autotest_lib.client.bin import utils

Richard Barnette

6f6ce32

2018-09-07 16:23:20 +0000

[diff] [blame]

63

from autotest_lib.client.common_lib import time_utils

Jacob Kopczynski

2018-08-13 17:24:41 -0700

[diff] [blame]

64

from autotest_lib.frontend.afe.json_rpc import proxy

Xixuan Wu

93e646c

2017-12-07 18:36:10 -0800

[diff] [blame]

65

from autotest_lib.server import constants

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

66

from autotest_lib.server import site_utils

J. Richard Barnette

a7c514e

2015-09-15 11:13:23 -0700

[diff] [blame]

67

from autotest_lib.server.cros.dynamic_suite import frontend_wrappers

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

68

from autotest_lib.server.hosts import servo_host

Aviv Keshet

7ee9586

2016-08-30 15:18:27 -0700

[diff] [blame]

69

from autotest_lib.server.lib import status_history

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

70

from autotest_lib.site_utils import gmail_lib

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

71

from chromite.lib import metrics

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

72

73

Richard Barnette

673573b

2016-12-12 09:46:39 -0800

[diff] [blame]

74

CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS
SPARE_POOL = constants.Pools.SPARE_POOL
MANAGED_POOLS = constants.Pools.MANAGED_POOLS

# _EXCLUDED_LABELS - A set of labels that disqualify a DUT from
#     monitoring by this script.  Currently, we're excluding these:
#   + 'adb' - We're not ready to monitor Android or Brillo hosts.
#   + 'board:guado_moblab' - These are maintained by a separate
#     process that doesn't use this script.
#   + 'board:veyron_rialto' due to crbug.com/854404

_EXCLUDED_LABELS = {'adb', 'board:guado_moblab',
                    'board:veyron_rialto'}

# _DEFAULT_DURATION:
#     Default value used for the --duration command line option.
#     Specifies how far back in time to search in order to determine
#     DUT status.

_DEFAULT_DURATION = 24

# _LOGDIR:
#     Relative path used in the calculation of the default setting for
#     the --logdir option.  The full path is relative to the root of the
#     autotest directory, as determined from sys.argv[0].
# _LOGFILE:
#     Basename of a file to which general log information will be
#     written.
# _LOG_FORMAT:
#     Format string for log messages.

_LOGDIR = os.path.join('logs', 'dut-data')
_LOGFILE = 'lab-inventory.log'
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

# Pattern describing location-based host names in the Chrome OS test
# labs.  Each DUT hostname designates the DUT's location:
#   * A lab (room) that's physically separated from other labs
#     (i.e. there's a door).
#   * A row (or aisle) of DUTs within the lab.
#   * A vertical rack of shelves on the row.
#   * A specific host on one shelf of the rack.

_HOSTNAME_PATTERN = re.compile(
        r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')

# _REPAIR_LOOP_THRESHOLD:
#    The number of repeated Repair tasks that must be seen to declare
#    that a DUT is stuck in a repair loop.

_REPAIR_LOOP_THRESHOLD = 4

# Metrics reported by this script all share _METRICS_PREFIX.
_METRICS_PREFIX = 'chromeos/autotest/inventory'
_UNTESTABLE_PRESENCE_METRIC = metrics.BooleanMetric(
        _METRICS_PREFIX + '/untestable',
        'DUTs that cannot be scheduled for testing')

_MISSING_DUT_METRIC = metrics.Counter(
        _METRICS_PREFIX + '/missing',
        'DUTs which cannot be found by lookup queries'
        ' because they are invalid or deleted')

# _Diagnosis - namedtuple corresponding to the return value from
#     `HostHistory.last_diagnosis()`
_Diagnosis = collections.namedtuple('_Diagnosis', ['status', 'task'])

139

Richard Barnette

2018-09-14 15:25:30 -0700

[diff] [blame]

140

def _get_diagnosis(history):
    """Return the effective diagnosis for one DUT's job history.

    Wraps the result of `history.last_diagnosis()` in a `_Diagnosis`
    tuple.  A `BROKEN` diagnosis whose task ended before
    `history.start_time` is reported as `UNUSED` instead, keeping the
    same task.

    If the lookup raises `proxy.JSONRPCException`, the error is logged
    and a placeholder `_Diagnosis(None, None)` is returned, so callers
    can always read `.status` on the result.

    Every call increments `_MISSING_DUT_METRIC`, tagged with the
    hostname and whether the lookup succeeded.

    @param history  The `HostJobHistory` object to be diagnosed.

    @return A `_Diagnosis` tuple; both fields are `None` when the
            lookup failed.
    """
    dut_present = True
    try:
        diagnosis = _Diagnosis(*history.last_diagnosis())
        if (diagnosis.status == status_history.BROKEN
                and diagnosis.task.end_time < history.start_time):
            return _Diagnosis(status_history.UNUSED, diagnosis.task)
        else:
            return diagnosis
    except proxy.JSONRPCException as e:
        logging.warning(e)
        dut_present = False
    finally:
        # Runs on every exit path, including the early returns above.
        _MISSING_DUT_METRIC.increment(
                fields={'host': history.hostname, 'presence': dut_present})
    # Only reached after a failed lookup.  Previously this fell through
    # and returned a bare `None`, which made callers fail on `.status`.
    return _Diagnosis(None, None)

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

155

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

156

def _host_is_working(history):
    """Return whether the DUT for `history` is diagnosed as working.

    @param history  The `HostJobHistory` object to be checked.

    @return `True` if the diagnosis status is `status_history.WORKING`;
            `False` otherwise, including when no diagnosis could be
            obtained.
    """
    diagnosis = _get_diagnosis(history)
    # Guard against a missing diagnosis rather than failing on `.status`.
    return (diagnosis is not None
            and diagnosis.status == status_history.WORKING)

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

158

159

160

def _host_is_broken(history):
    """Return whether the DUT for `history` is diagnosed as broken.

    @param history  The `HostJobHistory` object to be checked.

    @return `True` if the diagnosis status is `status_history.BROKEN`;
            `False` otherwise, including when no diagnosis could be
            obtained.
    """
    diagnosis = _get_diagnosis(history)
    # Guard against a missing diagnosis rather than failing on `.status`.
    return (diagnosis is not None
            and diagnosis.status == status_history.BROKEN)

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

162

163

164

def _host_is_idle(history):
    """Return whether the DUT for `history` is diagnosed as idle.

    A DUT counts as idle when its diagnosis status is either
    `status_history.UNUSED` or `status_history.UNKNOWN`.

    @param history  The `HostJobHistory` object to be checked.

    @return `True` if the diagnosis status is one of the idle statuses;
            `False` otherwise, including when no diagnosis could be
            obtained.
    """
    idle_statuses = {status_history.UNUSED, status_history.UNKNOWN}
    diagnosis = _get_diagnosis(history)
    # Guard against a missing diagnosis rather than failing on `.status`.
    return diagnosis is not None and diagnosis.status in idle_statuses

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

167

168

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

169

class _HostSetInventory(object):

170

"""Maintains a set of related `HostJobHistory` objects.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

171

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

172

Current usage of this class is that all DUTs are part of a single

173

scheduling pool of DUTs for a single model; however, this class make

174

no assumptions about the actual relationship among the DUTs.

175

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

176

The collection is segregated into disjoint categories of "working",

177

"broken", and "idle" DUTs. Accessor methods allow finding both the

178

list of DUTs in each category, as well as counts of each category.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

179

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

180

Performance note: Certain methods in this class are potentially

181

expensive:

182

* `get_working()`

183

* `get_working_list()`

184

* `get_broken()`

185

* `get_broken_list()`

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

186

* `get_idle()`

187

* `get_idle_list()`

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

188

The first time any one of these methods is called, it causes

189

multiple RPC calls with a relatively expensive set of database

190

queries. However, the results of the queries are cached in the

191

individual `HostJobHistory` objects, so only the first call

192

actually pays the full cost.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

193

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

194

Additionally, `get_working_list()`, `get_broken_list()` and

195

`get_idle_list()` cache their return values to avoid recalculating

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

196

lists at every call; this caching is separate from the caching of

197

RPC results described above.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

198

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

199

This class is deliberately constructed to delay the RPC cost until

200

the accessor methods are called (rather than to query in

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

201

`record_host()`) so that it's possible to construct a complete

202

`_LabInventory` without making the expensive queries at creation

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

203

time. `_populate_model_counts()`, below, assumes this behavior.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

"""

def __init__(self):

self._histories = []

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

208

self._working_list = None

209

self._broken_list = None

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

210

self._idle_list = None

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

211

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

212

def record_host(self, host_history):

213

"""Add one `HostJobHistory` object to the collection.

214

215

@param host_history The `HostJobHistory` object to be

216

remembered.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

217

"""

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

218

self._working_list = None

219

self._broken_list = None

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

220

self._idle_list = None

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

221

self._histories.append(host_history)

222

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

223

def get_working_list(self):

224

"""Return a list of all working DUTs in the pool.

225

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

226

Filter `self._histories` for histories where the DUT is

227

diagnosed as working.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

228

229

Cache the result so that we only cacluate it once.

230

231

@return A list of HostJobHistory objects.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

232

"""

233

if self._working_list is None:

234

self._working_list = [h for h in self._histories

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

235

if _host_is_working(h)]

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

236

return self._working_list

237

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

238

def get_working(self):

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

239

"""Return the number of working DUTs in the pool."""

240

return len(self.get_working_list())

241

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

242

def get_broken_list(self):

243

"""Return a list of all broken DUTs in the pool.

244

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

245

Filter `self._histories` for histories where the DUT is

246

diagnosed as broken.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

247

248

Cache the result so that we only cacluate it once.

249

250

@return A list of HostJobHistory objects.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

251

"""

252

if self._broken_list is None:

253

self._broken_list = [h for h in self._histories

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

254

if _host_is_broken(h)]

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

255

return self._broken_list

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

256

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

257

def get_broken(self):

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

258

"""Return the number of broken DUTs in the pool."""

259

return len(self.get_broken_list())

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

260

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

261

def get_idle_list(self):

262

"""Return a list of all idle DUTs in the pool.

263

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

264

Filter `self._histories` for histories where the DUT is

265

diagnosed as idle.

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

266

267

Cache the result so that we only cacluate it once.

268

269

@return A list of HostJobHistory objects.

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

270

"""

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

271

if self._idle_list is None:

272

self._idle_list = [h for h in self._histories

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

273

if _host_is_idle(h)]

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

274

return self._idle_list

275

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

276

def get_idle(self):

277

"""Return the number of idle DUTs in the pool."""

278

return len(self.get_idle_list())

279

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

280

def get_total(self):

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

281

"""Return the total number of DUTs in the pool."""

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

282

return len(self._histories)

283

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

284

def get_all_histories(self):

285

return self._histories

286

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

287

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

288

class _PoolSetInventory(object):
    """Maintains a set of `HostJobHistory`s for a set of pools.

    The collection is segregated into disjoint categories of "working",
    "broken", and "idle" DUTs.  Accessor methods allow finding both the
    list of DUTs in each category, as well as counts of each category.
    Accessor queries can be for an individual pool, or against all
    pools.

    Performance note:  This class relies on `_HostSetInventory`.  Public
    methods in this class generally rely on methods of the same name in
    the underlying class, and so will have the same underlying
    performance characteristics.
    """

    def __init__(self, pools):
        """Create an empty `_HostSetInventory` for each pool.

        @param pools  Iterable of the pool names to be tracked.
        """
        self._histories_by_pool = {
            pool: _HostSetInventory() for pool in pools
        }

    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        The host is filed under `host_history.host_pool`; a pool that
        was not passed to the constructor raises `KeyError`.

        @param host_history The `HostJobHistory` object to be
                            remembered.
        """
        pool = host_history.host_pool
        self._histories_by_pool[pool].record_host(host_history)

    def _count_pool(self, get_pool_count, pool=None):
        """Internal helper to count hosts in a given pool.

        The `get_pool_count` parameter is a function to calculate
        the exact count of interest for the pool.

        @param get_pool_count  Function to return a count from a
                               `_HostSetInventory` object.
        @param pool            The pool to be counted.  If `None`,
                               return the total across all pools.
        """
        if pool is None:
            return sum(get_pool_count(host_set)
                       for host_set in self._histories_by_pool.values())
        else:
            return get_pool_count(self._histories_by_pool[pool])

    def get_working_list(self):
        """Return a list of all working DUTs (across all pools).

        Go through all HostJobHistory objects across all pools,
        selecting all DUTs identified as working.

        @return A list of HostJobHistory objects.
        """
        l = []
        for p in self._histories_by_pool.values():
            l.extend(p.get_working_list())
        return l

    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of working DUTs in the selected
                pool(s).
        """
        return self._count_pool(_HostSetInventory.get_working, pool)

    def get_broken_list(self):
        """Return a list of all broken DUTs (across all pools).

        Go through all HostJobHistory objects across all pools,
        selecting all DUTs identified as broken.

        @return A list of HostJobHistory objects.
        """
        l = []
        for p in self._histories_by_pool.values():
            l.extend(p.get_broken_list())
        return l

    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of broken DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_broken, pool)

    def get_idle_list(self, pool=None):
        """Return a list of all idle DUTs in the given pool.

        Go through all HostJobHistory objects in the selected pool(s),
        selecting all DUTs identified as idle.

        @param pool: The pool to be listed.  If `None`, return the
                     combined list across all pools.

        @return A list of HostJobHistory objects.
        """
        if pool is None:
            l = []
            # `.values()` matches the other list accessors in this class.
            for p in self._histories_by_pool.values():
                l.extend(p.get_idle_list())
            return l
        else:
            return self._histories_by_pool[pool].get_idle_list()

    def get_idle(self, pool=None):
        """Return the number of idle DUTs in a pool.

        @param pool: The pool to be counted.  If `None`, return the total
                     across all pools.

        @return The total number of idle DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_idle, pool)

    def get_spares_buffer(self, spare_pool=SPARE_POOL):
        """Return the nominal number of working spares.

        Calculates and returns how many working spares there would
        be in the spares pool if all broken DUTs were in the spares
        pool.  This number may be negative, indicating a shortfall
        in the critical pools.

        @param spare_pool  Name of the pool holding the spares.

        @return The total number DUTs in the spares pool, less the total
                number of broken DUTs in all pools.
        """
        return self.get_total(spare_pool) - self.get_broken()

    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_total, pool)

    def get_all_histories(self, pool=None):
        """Generate every recorded `HostJobHistory` object.

        This is a generator; nothing is looked up until it is iterated.

        @param pool  The pool to be listed.  If `None`, yield the
                     histories of all pools.

        @return A generator of HostJobHistory objects.
        """
        if pool is None:
            for p in self._histories_by_pool.values():
                for h in p.get_all_histories():
                    yield h
        else:
            for h in self._histories_by_pool[pool].get_all_histories():
                yield h

441

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

442

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

443

def _eligible_host(afehost):
    """Return whether this host is eligible for monitoring.

    To be eligible, a host must carry exactly one 'model' label and
    exactly one 'pool' label, and none of its labels may appear in
    the `_EXCLUDED_LABELS` set.

    @param afehost  The host to be tested for eligibility.

    @return `True` if the host is eligible, `False` otherwise.
    """
    # A managed DUT is expected to have one 'model' and one 'pool'
    # label.  Anything else generally means the database entry is
    # wrong, which unfortunately does happen from time to time.
    #
    # The _LabInventory constructor depends on these label
    # restrictions and may fail without them.  One bad entry shouldn't
    # fail a whole inventory run, so such hosts are filtered out here
    # and never reach the inventory.
    labels = afehost.labels
    model_prefix = constants.Labels.MODEL_PREFIX
    pool_prefix = constants.Labels.POOL_PREFIX
    model_count = sum(1 for label in labels
                      if label.startswith(model_prefix))
    pool_count = sum(1 for label in labels
                     if label.startswith(pool_prefix))
    if model_count != 1 or pool_count != 1:
        return False
    return not _EXCLUDED_LABELS.intersection(labels)

468

469

470

class _LabInventory(collections.Mapping):

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

471

"""Collection of `HostJobHistory` objects for the Lab's inventory.

472

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

473

This is a dict-like collection indexed by model. Indexing returns

474

the _PoolSetInventory object associated with the model.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

475

"""

476

477

@classmethod

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

478

def create_inventory(cls, afe, start_time, end_time, modellist=[]):

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

479

"""Return a Lab inventory with specified parameters.

480

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

481

By default, gathers inventory from `HostJobHistory` objects for

482

all DUTs in the `MANAGED_POOLS` list. If `modellist` is

483

supplied, the inventory will be restricted to only the given

484

models.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

485

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

486

@param afe AFE object for constructing the

487

`HostJobHistory` objects.

488

@param start_time Start time for the `HostJobHistory` objects.

489

@param end_time End time for the `HostJobHistory` objects.

490

@param modellist List of models to include. If empty,

491

include all available models.

492

@return A `_LabInventory` object for the specified models.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

493

"""

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

494

target_pools = MANAGED_POOLS

495

label_list = [constants.Labels.POOL_PREFIX + l for l in target_pools]

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

496

afehosts = afe.get_hosts(labels__name__in=label_list)

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

497

if modellist:

J. Richard Barnette

2016-03-17 17:03:57 -0700

[diff] [blame]

498

# We're deliberately not checking host eligibility in this

499

# code path. This is a debug path, not used in production;

500

# it may be useful to include ineligible hosts here.

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

501

modelhosts = []

502

for model in modellist:

503

model_label = constants.Labels.MODEL_PREFIX + model

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

504

host_list = [h for h in afehosts

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

505

if model_label in h.labels]

506

modelhosts.extend(host_list)

507

afehosts = modelhosts

J. Richard Barnette

2016-03-17 17:03:57 -0700

[diff] [blame]

508

else:

Richard Barnette

3a40449

2018-02-08 13:57:01 -0800

[diff] [blame]

509

afehosts = [h for h in afehosts if _eligible_host(h)]

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

510

create = lambda host: (

511

status_history.HostJobHistory(afe, host,

512

start_time, end_time))

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

513

return cls([create(host) for host in afehosts], target_pools)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

514

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

515

def __init__(self, histories, pools):
    """Build the per-model inventory from a collection of histories.

    Every distinct `host_model` found in `histories` gets its own
    `_PoolSetInventory`, and each history is then recorded against
    the inventory for its model.

    @param histories  Sized, re-iterable collection of DUT history
                      objects; each must provide `host_model` and
                      `host_board`.
    @param pools      Pools handed to every `_PoolSetInventory`
                      created here.
    """
    model_names = set(history.host_model for history in histories)
    self._modeldata = dict(
        (name, _PoolSetInventory(pools)) for name in model_names)
    self._dut_count = len(histories)
    for history in histories:
        # Go through `self[...]` so that item lookup stays the single
        # access path to the per-model data.
        self[history.host_model].record_host(history)
    self._boards = set(history.host_board for history in histories)

Prathmesh Prabhu

154cb2b

2017-11-08 17:36:51 -0800

[diff] [blame]

522

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

523

def __getitem__(self, key):

524

return self._modeldata.__getitem__(key)

Prathmesh Prabhu

021e784

2017-11-08 18:05:45 -0800

[diff] [blame]

525

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

526

def __len__(self):

527

return self._modeldata.__len__()

528

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

529

def __iter__(self):

530

return self._modeldata.__iter__()

531

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

532

def get_num_duts(self):
    """Return how many DUT histories this inventory was built from."""
    return self._dut_count

535

Prathmesh Prabhu

a5a0e3d

2017-11-09 08:53:53 -0800

[diff] [blame]

536

def get_num_models(self):
    """Return how many distinct models the inventory contains.

    This is simply `len()` of the inventory itself.
    """
    return len(self)

539

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

540

def get_pool_models(self, pool):
    """Return the set of models having at least one DUT in `pool`.

    @param pool  The pool to be inventoried for models.
    @return A set of the model names whose total for `pool` is
            non-zero.
    """
    models_in_pool = set()
    for model, poolset in self.iteritems():
        if poolset.get_total(pool):
            models_in_pool.add(model)
    return models_in_pool

546

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

547

def get_boards(self):
    """Return the set of boards recorded for the inventory's DUTs."""
    return self._boards

Prathmesh Prabhu

a5a0e3d

2017-11-09 08:53:53 -0800

[diff] [blame]

549

550

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

551

def _reportable_models(inventory, spare_pool=SPARE_POOL):
    """Generate the (model, poolset) pairs that are subject to reporting.

    Walks `inventory.iteritems()` and passes through only those
    models that have at least one DUT in `spare_pool` and at least
    one DUT outside of it.

    @param inventory   Inventory to be filtered.
    @param spare_pool  The spare pool to be tested for reporting.
    """
    for model, poolset in inventory.iteritems():
        spare_count = poolset.get_total(spare_pool)
        overall_count = poolset.get_total()
        # Either no spares at all, or nothing but spares: skip.
        if spare_count in (0, overall_count):
            continue
        yield model, poolset

def _all_dut_histories(inventory):

568

for poolset in inventory.itervalues():

569

for h in poolset.get_all_histories():

yield h

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

573

def _sort_by_location(inventory_list):
    """Group DUTs by lab, and order each group by physical location.

    Each `HostJobHistory` in `inventory_list` is assigned to a lab
    based on its hostname.  Within a lab, DUTs are ordered
      * first by row number,
      * then by rack number within the row,
      * then by host shelf number within the rack.

    Histories whose hostname doesn't match `_HOSTNAME_PATTERN` are
    left out of the result.

    Implementation note: the (row, rack, host) triple is folded
    into a single base 100 number for sorting.  If any of the three
    values falls outside the range [0..99], the ordering will break
    down.

    @param inventory_list  List of `HostJobHistory` objects.
    @return A list of sorted lists of DUTs, one list per lab.
    """
    BASE = 100
    duts_by_lab = {}
    for history in inventory_list:
        match = _HOSTNAME_PATTERN.match(history.host.hostname)
        if not match:
            continue
        row, rack, shelf = match.group(2, 3, 4)
        sort_key = (int(row) * BASE + int(rack)) * BASE + int(shelf)
        duts_by_lab.setdefault(match.group(1), []).append(
            (sort_key, history))
    sorted_labs = []
    for keyed_duts in duts_by_lab.values():
        ordered = sorted(keyed_duts, key=lambda pair: pair[0])
        sorted_labs.append([history for _, history in ordered])
    return sorted_labs

def _score_repair_set(buffer_counts, repair_list):
    """Return a numeric score rating a set of DUTs to be repaired.

    The score is computed from the buffer counts that would result
    if every DUT in `repair_list` were repaired.  Two numbers go
    into it:
      * The worst (smallest) resulting buffer count for any model;
        higher is better.  This is the more significant number.
      * The number of models sharing that worst count; lower is
        better.  This is the less significant number.

    Implementation note: The score could fail to reflect the intended
    criteria if there are more than 1000 models in the inventory.

    @param buffer_counts  A dictionary mapping model names to the
                          size of the model's spares buffer.  Must
                          not be empty.
    @param repair_list    A list of `HostJobHistory` objects for the
                          DUTs to be repaired.
    @return A numeric score.
    """
    _NMODELS = 1000
    repair_pools = {history.host_pool for history in repair_list}
    repair_inventory = _LabInventory(repair_list, repair_pools)
    # Project the buffer count of each model after the repairs.  The
    # model names are dropped, as they don't contribute to the score.
    projected_counts = []
    for model, buffer_count in buffer_counts.iteritems():
        repaired = (repair_inventory[model].get_total()
                    if model in repair_inventory else 0)
        projected_counts.append(buffer_count + repaired)
    # The smallest projected count is the worst case; also count how
    # many models are stuck at that worst case.
    ordered = sorted(projected_counts)
    worst_count = ordered[0]
    num_worst = ordered.count(worst_count)
    return _NMODELS * worst_count - num_worst

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

660

661

662

def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Builds a message recommending a list of broken DUTs to repair.
    The DUTs are chosen according to these rules:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical proximity.
      * DUTs for models with a low spares buffer are more important
        than DUTs with larger buffers.

    Candidates are runs of `num_recommend` DUTs that are adjacent in
    the location-sorted list of a single lab; each run is rated with
    `_score_repair_set()`, and the best-scoring run overall wins.

    @param inventory      `_LabInventory` object from which to
                          generate recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation message.
    """
    logging.debug('Creating DUT repair recommendations')
    model_buffer_counts = {}
    broken_duts = []
    for model, counts in _reportable_models(inventory):
        logging.debug('Listing failed DUTs for %s', model)
        if counts.get_broken() == 0:
            continue
        model_buffer_counts[model] = counts.get_spares_buffer()
        broken_duts.extend(counts.get_broken_list())
    recommendation = None
    best_score = None
    for lab_duts in _sort_by_location(broken_duts):
        # The first window is scored unconditionally; that covers
        # labs that have fewer than `num_recommend` broken DUTs.
        lab_choice = lab_duts[0:num_recommend]
        lab_score = _score_repair_set(model_buffer_counts, lab_choice)
        # Slide the window one DUT at a time across the rest of the
        # lab, keeping the first window to reach the best score.
        for start in range(1, len(lab_duts) - num_recommend + 1):
            candidate = lab_duts[start:start + num_recommend]
            candidate_score = _score_repair_set(model_buffer_counts,
                                                candidate)
            if candidate_score > lab_score:
                lab_choice = candidate
                lab_score = candidate_score
        if recommendation is None or lab_score > best_score:
            recommendation = lab_choice
            best_score = lab_score
    # N.B. The trailing space in `line_fmt` is mandatory:  Without it,
    # Gmail will parse the URL wrong.  Don't ask.  If you simply _must_
    # know more, go try it yourself...
    line_fmt = '%-30s %-16s %-6s\n %s '
    message = ['Repair recommendations:\n',
               line_fmt % ('Hostname', 'Model', 'Servo?', 'Logs URL')]
    if not recommendation:
        message.append('(No DUTs to repair)')
    else:
        for history in recommendation:
            servo_name = servo_host.make_servo_hostname(
                history.host.hostname)
            servo_present = utils.host_is_in_lab_zone(servo_name)
            event = _get_diagnosis(history).task
            if servo_present:
                servo_text = 'Yes'
            else:
                servo_text = 'No'
            message.append(line_fmt % (
                history.host.hostname, history.host_model,
                servo_text, event.job_url))
    return '\n'.join(message)

735

736

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

737

def _generate_model_inventory_message(inventory):
    """Generate the "model inventory" e-mail message.

    The model inventory is a list by model summarizing the number of
    working, broken, and idle DUTs, and the total shortfall or surplus
    of working devices relative to the minimum critical pool
    requirement.

    Only models produced by `_reportable_models()` are counted, and
    only those with at least one broken DUT get a row in the
    per-model table.

    If there are no DUTs to count at all, every percentage in the
    summary is reported as 0 rather than dividing by zero.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating model inventory')
    nworking = 0
    nbroken = 0
    nidle = 0
    nbroken_models = 0
    ntotal_models = 0
    summaries = []
    column_names = (
        'Model', 'Avail', 'Bad', 'Idle', 'Good', 'Spare', 'Total')
    for model, counts in _reportable_models(inventory):
        logging.debug('Counting %2d DUTS for model %s',
                      counts.get_total(), model)
        # Summary elements laid out in the same order as the column
        # headers:
        #     Model Avail   Bad  Idle  Good  Spare Total
        #     e[0]  e[1]  e[2]  e[3]  e[4]  e[5]  e[6]
        element = (model,
                   counts.get_spares_buffer(),
                   counts.get_broken(),
                   counts.get_idle(),
                   counts.get_working(),
                   counts.get_total(SPARE_POOL),
                   counts.get_total())
        if element[2]:
            summaries.append(element)
            nbroken_models += 1
        ntotal_models += 1
        nbroken += element[2]
        nidle += element[3]
        nworking += element[4]
    ntotal = nworking + nbroken + nidle
    # Smallest spares buffer first; ties go to the model with the
    # most broken DUTs.
    summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
    if ntotal:
        broken_percent = int(round(100.0 * nbroken / ntotal))
        idle_percent = int(round(100.0 * nidle / ntotal))
        # Derived by subtraction so the three figures add up to 100.
        working_percent = 100 - broken_percent - idle_percent
    else:
        # No reportable DUTs:  there's nothing to take a percentage
        # of, so avoid the division altogether.
        broken_percent = 0
        idle_percent = 0
        working_percent = 0
    message = ['Summary of DUTs in inventory:',
               '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
               '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
                   nbroken, broken_percent,
                   nidle, idle_percent,
                   nworking, working_percent,
                   ntotal),
               '',
               'Models with failures: %d' % nbroken_models,
               'Models in inventory: %d' % ntotal_models,
               '', '',
               'Full model inventory:\n',
               '%-22s %5s %5s %5s %5s %5s %5s' % column_names]
    message.extend(
        ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
    return '\n'.join(message)

805

806

J. Richard Barnette

4845fcf

2015-04-20 14:26:25 -0700

[diff] [blame]

807

_POOL_INVENTORY_HEADER = '''\

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

808

Notice to Infrastructure deputies: All models shown below are at

J. Richard Barnette

c9a143c

2015-06-04 11:11:19 -0700

[diff] [blame]

809

less than full strength, please take action to resolve the issues.

810

Once you're satisified that failures won't recur, failed DUTs can

811

be replaced with spares by running `balance_pool`. Detailed

812

instructions can be found here:

J. Richard Barnette

4845fcf

2015-04-20 14:26:25 -0700

[diff] [blame]

813

http://go/cros-manage-duts

'''

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

817

def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    The pool inventory has one section for each pool in
    `CRITICAL_POOLS`.  A section lists, by model, the number of
    broken, idle, and working DUTs in that pool, with the models
    having the most broken DUTs first.  Models with neither broken
    nor idle DUTs in the pool are left out.

    N.B. For sample output text formatted as users can expect to see
    it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating pool inventory')
    heading_fmt = '%-20s %5s %5s %5s %5s'
    row_fmt = '%-20s %5d %5d %5d %5d'
    lines = [_POOL_INVENTORY_HEADER]
    # Every section after the first is set off by an extra newline.
    separator = ''
    for pool in CRITICAL_POOLS:
        lines.append(
            '%sStatus for pool:%s, by model:' % (separator, pool))
        lines.append(
            heading_fmt % ('Model', 'Bad', 'Idle', 'Good', 'Total'))
        rows = []
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            broken = counts.get_broken(pool)
            idle = counts.get_idle(pool)
            # models at full strength are not reported
            if broken or idle:
                rows.append((model, broken, idle,
                             counts.get_working(pool),
                             counts.get_total(pool)))
        if rows:
            rows.sort(key=lambda row: -row[1])
            lines.extend(row_fmt % row for row in rows)
        else:
            lines.append('(All models at full strength)')
        separator = '\n'
    return '\n'.join(lines)

859

860

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

861

_IDLE_INVENTORY_HEADER = '''\

862

Notice to Infrastructure deputies: The hosts shown below haven't

863

run any jobs for at least 24 hours. Please check each host; locked

864

hosts should normally be unlocked; stuck jobs should normally be

aborted.

'''

def _generate_idle_inventory_message(inventory):
    """Generate the "idle inventory" e-mail message.

    The idle inventory is a list of the hosts identified as idle,
    each shown with its model and pool.  Hosts are gathered pool by
    pool over `MANAGED_POOLS`, and model by model within a pool.

    N.B. For sample output text format as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating idle inventory')
    row_fmt = '%-30s %-20s %s'
    lines = [_IDLE_INVENTORY_HEADER,
             'Idle Host List:',
             row_fmt % ('Hostname', 'Model', 'Pool')]
    idle_rows = []
    for pool in MANAGED_POOLS:
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            for dut in counts.get_idle_list(pool):
                idle_rows.append((dut.host.hostname, model, pool))
    if not idle_rows:
        lines.append('(No idle DUTs)')
    else:
        lines.extend(row_fmt % row for row in idle_rows)
    return '\n'.join(lines)

897

898

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

899

def _send_email(arguments, tag, subject, recipients, body):

900

"""Send an inventory e-mail message.

901

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

902

The message is logged in the selected log directory using `tag` for

903

the file name.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

904

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

905

If the --debug option was requested, the message is neither logged

906

nor sent, but merely printed on stdout.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

907

908

@param arguments Parsed command-line options.

909

@param tag Tag identifying the inventory for logging

910

purposes.

911

@param subject E-mail Subject: header line.

912

@param recipients E-mail addresses for the To: header line.

913

@param body E-mail message body.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

914

"""

915

logging.debug('Generating email: "%s"', subject)

916

all_recipients = ', '.join(recipients)

917

report_body = '\n'.join([

918

'To: %s' % all_recipients,

919

'Subject: %s' % subject,

920

'', body, ''])

J. Richard Barnette

2015-10-13 16:02:47 -0700

[diff] [blame]

921

if arguments.debug:

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

922

print report_body

923

else:

924

filename = os.path.join(arguments.logdir, tag)

925

try:

926

report_file = open(filename, 'w')

927

report_file.write(report_body)

928

report_file.close()

929

except EnvironmentError as e:

930

logging.error('Failed to write %s: %s', filename, e)

931

try:

932

gmail_lib.send_email(all_recipients, subject, body)

933

except Exception as e:

934

logging.error('Failed to send e-mail to %s: %s',

all_recipients, e)

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

938

def _populate_model_counts(inventory):

939

"""Gather model counts while providing interactive feedback.

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

940

941

Gathering the status of all individual DUTs in the lab can take

942

considerable time (~30 minutes at the time of this writing).

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

943

Normally, we pay that cost by querying as we go. However, with

944

the `--debug` option, we expect a human being to be watching the

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

945

progress in real time. So, we force the first (expensive) queries

946

to happen up front, and provide simple ASCII output on sys.stdout

947

to show a progress bar and results.

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

948

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

949

@param inventory `_LabInventory` object from which to gather

950

counts.

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

951

"""

952

n = 0

953

total_broken = 0

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

954

for counts in inventory.itervalues():

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

n += 1

if n % 10 == 5:

c = '+'

elif n % 10 == 0:

c = '%d' % ((n / 10) % 10)

else:

c = '.'

sys.stdout.write(c)

sys.stdout.flush()

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

964

# This next call is where all the time goes - it forces all of a

965

# model's `HostJobHistory` objects to query the database and

966

# cache their results.

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

967

total_broken += counts.get_broken()

968

sys.stdout.write('\n')

969

sys.stdout.write('Found %d broken DUTs\n' % total_broken)

970

971

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

972

def _perform_model_inventory(arguments, inventory, timestamp):
    """Perform the model inventory report.

    The report that gets sent is made up of these parts:
      * A list of DUTs that are recommended to be repaired.  This
        part is optional, and only appears if the `--recommend`
        option is present.
      * A list of all models that have failed DUTs, with counts
        of working, broken, and spare DUTs, among others.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp
                      in logs and email output.
    """
    recommend_message = ''
    if arguments.recommend:
        recommendation = _generate_repair_recommendation(
            inventory, arguments.recommend)
        recommend_message = recommendation + '\n\n\n'
    model_message = _generate_model_inventory_message(inventory)
    log_tag = 'models-%s.txt' % timestamp
    subject = 'DUT model inventory %s' % timestamp
    _send_email(arguments, log_tag, subject,
                arguments.model_notify,
                recommend_message + model_message)

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

999

1000

1001

def _perform_pool_inventory(arguments, inventory, timestamp):
    """Perform the pool inventory report.

    The report that gets sent is made up of these parts:
      * A list of all critical pools that have failed DUTs, with
        counts of working, broken, and idle DUTs.
      * A list of all idle DUTs by hostname including the model and
        pool.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp in
                      logs and email output.
    """
    sections = [_generate_pool_inventory_message(inventory),
                _generate_idle_inventory_message(inventory)]
    log_tag = 'pools-%s.txt' % timestamp
    subject = 'DUT pool inventory %s' % timestamp
    _send_email(arguments, log_tag, subject,
                arguments.pool_notify,
                '\n\n\n'.join(sections))

1023

1024

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1025

def _dut_in_repair_loop(history):
    """Return whether a DUT's history indicates a repair loop.

    A DUT is considered looping if it runs no tests, and no tasks pass
    other than repair tasks.

    @param history  An instance of `status_history.HostJobHistory` to be
                    scanned for a repair loop.  The caller guarantees
                    that this history corresponds to a working DUT.
    @returns Return a true value if the DUT's most recent history
             indicates a repair loop.  The result is `False` when the
             scan finds evidence against a loop, and `None` when the
             history runs out before `_REPAIR_LOOP_THRESHOLD`
             successful repairs have been seen.
    """
    # Cheap pre-check.  Our caller passes only histories for working
    # DUTs, so the diagnosis task has already been fetched, and is
    # known to have succeeded.  That task is among the tasks the scan
    # below would examine; if it isn't a repair, then the history holds
    # a successful non-repair task, and there's no loop.
    #
    # Iterating over the history is very expensive, because it must
    # fetch the full history, regardless of how many tasks we examine.
    # At the time of this writing, this check against the diagnosis
    # task reduces the cost of finding loops in the full inventory from
    # hours to minutes.
    if _get_diagnosis(history).task.name != 'Repair':
        return False
    successful_repairs = 0
    for task in history:
        if not task.is_special:
            # This is a test, so we're not looping.
            return False
        diagnosis = task.diagnosis
        if diagnosis == status_history.BROKEN:
            # Failed a repair, so we're not looping.
            return False
        is_repair = task.name == 'Repair'
        if diagnosis == status_history.WORKING and not is_repair:
            # Non-repair task succeeded, so we're not looping.
            return False
        # Anything that gets this far is either a repair task that
        # wasn't diagnosed as broken, or a non-repair task that
        # wasn't diagnosed as working.  Only the repairs count.
        if is_repair:
            successful_repairs += 1
            if successful_repairs >= _REPAIR_LOOP_THRESHOLD:
                return True

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

1071

def _report_untestable_dut(history, state):
    """Log a single untestable DUT and set its presence metric.

    The DUT's hostname, model and pool are read from `history`, and
    together with `state` they form both the log message and the
    fields of `_UNTESTABLE_PRESENCE_METRIC`.

    @param history  History object for the DUT; its `hostname`,
                    `host_model`, and `host_pool` attributes are read.
    @param state    String naming the reason the DUT is untestable.
    """
    metric_fields = dict(
        dut_hostname=history.hostname,
        model=history.host_model,
        pool=history.host_pool,
        state=state,
    )
    logging.info('DUT in state %(state)s: %(dut_hostname)s, '
                 'model: %(model)s, pool: %(pool)s', metric_fields)
    _UNTESTABLE_PRESENCE_METRIC.set(True, fields=metric_fields)

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1081

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1082

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

1083

def _report_untestable_dut_metrics(inventory):
    """Scan the inventory for DUTs unable to run tests.

    DUTs in the inventory are judged "untestable" if they meet one of
    two criteria:
      * The DUT is stuck in a repair loop; that is, it regularly passes
        repair, but never passes other operations.
      * The DUT runs no tasks at all, but is not locked.

    This routine walks through the given inventory looking for DUTs in
    either of these states.  Results are reported via a Monarch presence
    metric.

    Note:  To make sure that DUTs aren't flagged as "idle" merely
    because there's no work, a separate job runs prior to regular
    inventory runs which schedules trivial work on any DUT that appears
    idle.

    @param inventory  `_LabInventory` object to be reported on.
    """
    logging.info('Scanning for untestable DUTs.')
    for history in _all_dut_histories(inventory):
        # Managed DUTs with names that don't match
        # _HOSTNAME_PATTERN shouldn't be possible.  However, we
        # don't want arbitrary strings being attached to the
        # 'dut_hostname' field, so for safety, we exclude all
        # anomalies.
        if not _HOSTNAME_PATTERN.match(history.hostname):
            continue
        # Classify the DUT first, then report at most once.  A working
        # DUT is only ever checked for a repair loop; the idle check
        # applies solely to DUTs that aren't working.
        untestable_state = None
        if _host_is_working(history):
            if _dut_in_repair_loop(history):
                untestable_state = 'repair_loop'
        elif _host_is_idle(history) and not history.host.locked:
            untestable_state = 'idle_unlocked'
        if untestable_state is not None:
            _report_untestable_dut(history, untestable_state)

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1118

1119

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

1120

def _log_startup(arguments, startup_time):
    """Log the start of this inventory run.

    Emit debug messages announcing the run and which reports the
    command line asked for.  Return a string based on `startup_time`
    that will be used to identify this run in log files and e-mail
    messages.

    @param arguments     Command-line arguments as returned by
                         `ArgumentParser`.
    @param startup_time  A UNIX timestamp marking the moment when
                         this inventory run began.
    @returns A timestamp string (local time, hour resolution) that
             will be used to identify this run in logs and email
             output.
    """
    local_start = time.localtime(startup_time)
    timestamp = time.strftime('%Y-%m-%d.%H', local_start)
    logging.debug('Starting lab inventory for %s', timestamp)

    # Collect the announcements first, then log them in order.
    announcements = []
    if arguments.model_notify:
        if arguments.recommend:
            announcements.append('Will include repair recommendations')
        announcements.append('Will include model inventory')
    if arguments.pool_notify:
        announcements.append('Will include pool inventory')
    for announcement in announcements:
        logging.debug(announcement)
    return timestamp

def _create_inventory(arguments, end_time):
    """Create the `_LabInventory` instance to use for reporting.

    The inventory covers the `arguments.duration` hours leading up to
    `end_time`, restricted to the models named in
    `arguments.modelnames`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    @param end_time   A UNIX timestamp for the end of the time range
                      to be searched in this inventory run.
    @returns The newly created `_LabInventory` object.
    """
    # `--duration` is given in hours; the time range is in seconds.
    search_seconds = arguments.duration * 60 * 60
    start_time = end_time - search_seconds
    afe = frontend_wrappers.RetryingAFE(server=None)
    inventory = _LabInventory.create_inventory(
            afe, start_time, end_time, arguments.modelnames)
    num_duts = inventory.get_num_duts()
    num_models = inventory.get_num_models()
    logging.info('Found %d hosts across %d models', num_duts, num_models)
    return inventory

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1160

def _perform_inventory_reports(arguments):
    """Perform all inventory checks requested on the command line.

    Create the initial inventory, then run each report that the
    parsed command-line arguments call for: the model report, the
    pool report, and the untestable-DUT scan, in that order.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    """
    # One clock reading serves as both the run's identifying timestamp
    # and the end of the history search range.
    run_started = time.time()
    timestamp = _log_startup(arguments, run_started)
    inventory = _create_inventory(arguments, run_started)

    if arguments.debug:
        _populate_model_counts(inventory)

    if arguments.model_notify:
        _perform_model_inventory(arguments, inventory, timestamp)

    if arguments.pool_notify:
        _perform_pool_inventory(arguments, inventory, timestamp)

    if arguments.report_untestable:
        _report_untestable_dut_metrics(inventory)

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1180

1181

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1182

def _separate_email_addresses(address_list):
    """Parse a list of comma-separated lists of e-mail addresses.

    Each string is split on commas, and surrounding whitespace is
    stripped from every resulting piece.  Pieces are returned in the
    order they appear.

    @param address_list  A list of strings containing comma
                         separated e-mail addresses.
    @return A list of the individual e-mail addresses.
    """
    return [address.strip()
            for address_group in address_list
            for address in address_group.split(',')]

def _verify_arguments(arguments):
    """Validate command-line arguments.

    Join comma separated e-mail addresses for `--model-notify` and
    `--pool-notify` in separate option arguments into a single list.
    The lists are stored back into `arguments`.

    For non-debug uses, require that at least one inventory report be
    requested.  For debug, if a report isn't specified, treat it as "run
    all the reports."

    The return value indicates success or failure; in the case of
    failure, we also write an error message to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.
    """
    arguments.model_notify = _separate_email_addresses(
            arguments.model_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)

    requested_reports = (
        arguments.model_notify,
        arguments.pool_notify,
        arguments.report_untestable,
    )
    if any(requested_reports):
        return True

    if arguments.debug:
        # We want to run all the e-mail reports.  An empty notify
        # list will cause a report to be skipped, so make sure the
        # lists are non-empty.
        arguments.model_notify = ['']
        arguments.pool_notify = ['']
        return True

    sys.stderr.write('Must request at least one report via '
                     '--model-notify, --pool-notify, or '
                     '--report-untestable\n')
    return False

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1231

1232

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

1233

def _get_default_logdir(script):
    """Get the default directory for the `--logdir` option.

    The default is the `_LOGDIR` path joined onto the parent of the
    directory containing this script.

    @param script  Path to this script file.
    @return A path to a directory.
    """
    script_dir = os.path.dirname(os.path.abspath(script))
    parent_dir = os.path.dirname(script_dir)
    return os.path.join(parent_dir, _LOGDIR)

1245

1246

1247

def _parse_command(argv):
    """Parse the command line arguments.

    Create an argument parser for this command's syntax, parse the
    command line, and validate the result with `_verify_arguments()`.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args(), or `None`
            if the parsed arguments fail validation.
    """
    duration_help = ('number of hours back to search for status'
                     ' (default: %d)' % _DEFAULT_DURATION)
    model_notify_help = ('Generate model inventory message, '
                         'and send it to the given e-mail address(es)')
    pool_notify_help = ('Generate pool inventory message, '
                        'and send it to the given address(es)')
    recommend_help = ('Specify how many DUTs should be '
                      'recommended for repair (default: no '
                      'recommendation)')
    debug_help = ('Print e-mail, metrics messages on stdout '
                  'without sending them.')
    modelnames_help = ('names of models to report on '
                       '(default: all models)')

    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Gather and report lab inventory statistics')
    add_option = parser.add_argument
    add_option('-d', '--duration', type=int,
               default=_DEFAULT_DURATION, metavar='HOURS',
               help=duration_help)
    add_option('--model-notify', action='append',
               default=[], metavar='ADDRESS',
               help=model_notify_help)
    add_option('--pool-notify', action='append',
               default=[], metavar='ADDRESS',
               help=pool_notify_help)
    add_option('-r', '--recommend', type=int, default=None,
               help=recommend_help)
    add_option('--report-untestable', action='store_true',
               help='Check for devices unable to run tests.')
    add_option('--debug', action='store_true',
               help=debug_help)
    # `--no-metrics` clears `use_metrics`, which is otherwise true.
    add_option('--no-metrics', action='store_false',
               dest='use_metrics',
               help='Suppress generation of Monarch metrics.')
    add_option('--logdir', default=_get_default_logdir(argv[0]),
               help='Directory where logs will be written.')
    add_option('modelnames', nargs='*',
               metavar='MODEL',
               help=modelnames_help)

    arguments = parser.parse_args(argv[1:])
    if _verify_arguments(arguments):
        return arguments
    return None

def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--debug` option was provided on
    the command line.
      * Without the option, we configure the logging to capture all
        potentially relevant events in a log file.  The log file is
        configured to rotate once a week on Friday evening, preserving
        ~3 months worth of history.
      * With the option, we expect stdout to contain other
        human-readable output (including the contents of the e-mail
        messages), so we restrict the output to INFO level.

    For convenience, when `--debug` is on, the logging format has
    no adornments, so that a call like `logging.info(msg)` simply writes
    `msg` to stdout, plus a trailing newline.

    On return, the root logger has exactly one handler: the one
    created here.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    """
    root_logger = logging.getLogger()
    if arguments.debug:
        root_logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter())
    else:
        if not os.path.exists(arguments.logdir):
            os.mkdir(arguments.logdir)
        root_logger.setLevel(logging.DEBUG)
        logfile = os.path.join(arguments.logdir, _LOGFILE)
        handler = logging.handlers.TimedRotatingFileHandler(
                logfile, when='W4', backupCount=13)
        formatter = logging.Formatter(_LOG_FORMAT,
                                      time_utils.TIME_FMT)
        handler.setFormatter(formatter)
    # TODO(jrbarnette) This is gross.  Importing client.bin.utils
    # implicitly imported logging_config, which calls
    # logging.basicConfig() *at module level*.  That gives us an
    # extra logging handler that we don't want.  So, clear out all
    # the handlers here.
    #
    # Iterate over a copy:  removeHandler() mutates
    # `root_logger.handlers`, and removing from a list while iterating
    # over it skips every other entry, leaving stale handlers behind.
    for stale_handler in list(root_logger.handlers):
        root_logger.removeHandler(stale_handler)
    root_logger.addHandler(handler)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1340

1341

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1342

def main(argv):
    """Standard main routine.

    Parse and validate the command line, configure logging, and run
    the requested inventory reports.  Unless `--no-metrics` was given,
    the reports run with ts_mon set up, and a duration timer and a
    success/failure tick counter are recorded and flushed.

    Exits with status 1 if the command line fails validation.  A
    `KeyboardInterrupt` ends the run quietly; any other exception is
    logged and re-raised.

    @param argv  Command line arguments, including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)

    try:
        if not arguments.use_metrics:
            _perform_inventory_reports(arguments)
        else:
            # In debug mode the metrics go to /dev/null rather than
            # being reported.
            metrics_file = None
            if arguments.debug:
                logging.info('Debug mode: Will not report metrics to monarch.')
                metrics_file = '/dev/null'
            ts_mon_state = site_utils.SetupTsMonGlobalState(
                    'lab_inventory', debug_file=metrics_file,
                    auto_flush=False)
            with ts_mon_state:
                success = False
                try:
                    duration_timer = metrics.SecondsTimer(
                            '%s/duration' % _METRICS_PREFIX)
                    with duration_timer:
                        _perform_inventory_reports(arguments)
                    success = True
                finally:
                    # Count the run and flush whether or not the
                    # reports succeeded.
                    tick_counter = metrics.Counter(
                            '%s/tick' % _METRICS_PREFIX)
                    tick_counter.increment(fields={'success': success})
                    metrics.Flush()
    except KeyboardInterrupt:
        pass
    except Exception:
        # Our cron setup doesn't preserve stderr, so drop extra breadcrumbs.
        logging.exception('Error escaped main')
        raise

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1379

1380

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

1381

def get_inventory(afe):
    """Create an inventory covering the 24 hours up to now.

    @param afe  AFE object passed through to
                `_LabInventory.create_inventory()`.
    @return The result of `_LabInventory.create_inventory()` for the
            time range from 24 hours ago until the current time.
    """
    now = int(time.time())
    one_day_ago = now - 24 * 60 * 60
    return _LabInventory.create_inventory(afe, one_day_ago, now)

1385

1386

1387

def get_managed_boards(afe):
    """Return the boards found in a fresh inventory.

    @param afe  AFE object passed through to `get_inventory()`.
    @return The result of calling `get_boards()` on the inventory
            returned by `get_inventory()`.
    """
    inventory = get_inventory(afe)
    return inventory.get_boards()

J. Richard Barnette

aa86893

2015-10-23 13:28:59 -0700

[diff] [blame]

1389

1390

J. Richard Barnette