Blame - site_utils/lab_inventory.py - platform/external/autotest

2015-03-27 17:23:52 -0700

[diff] [blame]

1

#!/usr/bin/env python

2

3

# Use of this source code is governed by a BSD-style license that can be

4

# found in the LICENSE file.

5

6

"""Create e-mail reports of the Lab's DUT inventory.

7

8

Gathers a list of all DUTs of interest in the Lab, segregated by

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

9

model and pool, and determines whether each DUT is working or

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

10

broken. Then, send one or more e-mail reports summarizing the

11

status to e-mail addresses provided on the command line.

12

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

13

usage: lab_inventory.py [ options ] [ model ... ]

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

14

15

Options:

16

--duration / -d <hours>

17

How far back in time to search job history to determine DUT

18

status.

19

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

20

--model-notify <address>[,<address>]

21

Send the "model status" e-mail to all the specified e-mail

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

22

addresses.

23

24

--pool-notify <address>[,<address>]

25

Send the "pool status" e-mail to all the specified e-mail

26

addresses.

27

J. Richard Barnette

2015-06-09 10:06:17 -0700

[diff] [blame]

28

--recommend <number>

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

29

When generating the "model status" e-mail, include a list of

J. Richard Barnette

2015-06-09 10:06:17 -0700

[diff] [blame]

30

<number> specific DUTs to be recommended for repair.

31

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

32

--report-untestable

33

Scan the inventory for DUTs that can't test because they're stuck in

34

repair loops, or because the scheduler can't give them work.

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

35

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

36

--logdir <directory>

37

Log progress and actions in a file under this directory. Text

38

of any e-mail sent will also be logged in a timestamped file in

39

this directory.

40

J. Richard Barnette

2015-10-13 16:02:47 -0700

[diff] [blame]

41

--debug

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

42

Suppress all logging, metrics reporting, and sending e-mail.

43

Instead, write the output that would be generated onto stdout.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

44

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

45

<model> arguments:

46

With no arguments, gathers the status for all models in the lab.

47

With one or more named models on the command line, restricts

48

reporting to just those models.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

"""

import argparse

Prathmesh Prabhu

021e784

2017-11-08 18:05:45 -0800

[diff] [blame]

53

import collections

Jacob Kopczynski

2018-08-09 11:05:37 -0700

[diff] [blame]

54

import datetime

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

55

import logging

56

import logging.handlers

57

import os

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

58

import re

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

import sys

import time

import common

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

63

from autotest_lib.client.bin import utils

Richard Barnette

6f6ce32

2018-09-07 16:23:20 +0000

[diff] [blame]

64

from autotest_lib.client.common_lib import time_utils

Jacob Kopczynski

2018-08-13 17:24:41 -0700

[diff] [blame]

65

from autotest_lib.frontend.afe.json_rpc import proxy

Xixuan Wu

93e646c

2017-12-07 18:36:10 -0800

[diff] [blame]

66

from autotest_lib.server import constants

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

67

from autotest_lib.server import site_utils

J. Richard Barnette

a7c514e

2015-09-15 11:13:23 -0700

[diff] [blame]

68

from autotest_lib.server.cros.dynamic_suite import frontend_wrappers

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

69

from autotest_lib.server.hosts import servo_host

Aviv Keshet

7ee9586

2016-08-30 15:18:27 -0700

[diff] [blame]

70

from autotest_lib.server.lib import status_history

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

71

from autotest_lib.site_utils import gmail_lib

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

72

from chromite.lib import metrics

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

73

74

Richard Barnette

673573b

2016-12-12 09:46:39 -0800

[diff] [blame]

75

CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS

76

SPARE_POOL = constants.Pools.SPARE_POOL

77

MANAGED_POOLS = constants.Pools.MANAGED_POOLS

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

78

J. Richard Barnette

2016-03-17 17:03:57 -0700

[diff] [blame]

79

# _EXCLUDED_LABELS - A set of labels that disqualify a DUT from

Richard Barnette

eabcf39

2017-09-01 15:10:54 -0700

[diff] [blame]

80

# monitoring by this script. Currently, we're excluding these:

81

# + 'adb' - We're not ready to monitor Android or Brillo hosts.

82

# + 'board:guado_moblab' - These are maintained by a separate

83

# process that doesn't use this script.

Aviv Keshet

1ba0dec

2018-07-12 17:14:08 -0700

[diff] [blame]

84

# + 'board:veyron_rialto' due to crbug.com/854404

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

85

C Shapiro

7de0442

2018-08-29 14:46:11 -0600

[diff] [blame]

86

_EXCLUDED_LABELS = {'adb', 'board:guado_moblab',

Aviv Keshet

1ba0dec

2018-07-12 17:14:08 -0700

[diff] [blame]

87

'board:veyron_rialto'}

J. Richard Barnette

2016-03-17 17:03:57 -0700

[diff] [blame]

88

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

89

# _DEFAULT_DURATION:

90

# Default value used for the --duration command line option.

91

# Specifies how far back in time to search in order to determine

92

# DUT status.

93

94

_DEFAULT_DURATION = 24

95

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

96

# _LOGDIR:

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

97

# Relative path used in the calculation of the default setting for

98

# the --logdir option. The full path is relative to the root of the

99

# autotest directory, as determined from sys.argv[0].

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

100

# _LOGFILE:

101

# Basename of a file to which general log information will be

102

# written.

103

# _LOG_FORMAT:

104

# Format string for log messages.

105

106

_LOGDIR = os.path.join('logs', 'dut-data')

107

_LOGFILE = 'lab-inventory.log'

108

_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

109

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

110

# Pattern describing location-based host names in the Chrome OS test

111

# labs. Each DUT hostname designates the DUT's location:

112

# * A lab (room) that's physically separated from other labs

113

# (i.e. there's a door).

114

# * A row (or aisle) of DUTs within the lab.

115

# * A vertical rack of shelves on the row.

116

# * A specific host on one shelf of the rack.

117

118

_HOSTNAME_PATTERN = re.compile(

119

r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')

120

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

121

# _REPAIR_LOOP_THRESHOLD:

122

# The number of repeated Repair tasks that must be seen to declare

123

# that a DUT is stuck in a repair loop.

124

125

_REPAIR_LOOP_THRESHOLD = 4

126

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

127

Prathmesh Prabhu

b69a6cc

2018-05-07 14:49:33 -0700

[diff] [blame]

128

_METRICS_PREFIX = 'chromeos/autotest/inventory'

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

129

_UNTESTABLE_PRESENCE_METRIC = metrics.BooleanMetric(

Jacob Kopczynski

2018-08-13 17:24:41 -0700

[diff] [blame]

130

_METRICS_PREFIX + '/untestable',

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

131

'DUTs that cannot be scheduled for testing')

132

Jacob Kopczynski

2018-08-13 17:24:41 -0700

[diff] [blame]

133

_MISSING_DUT_METRIC = metrics.Counter(

134

_METRICS_PREFIX + '/missing', 'DUTs which cannot be found by lookup queries'

135

' because they are invalid or deleted')

136

Jacob Kopczynski

2018-08-09 11:05:37 -0700

[diff] [blame]

137

_TIMESTAMP_FORMAT = '%Y-%m-%d.%H'

138

Jacob Kopczynski

2018-08-13 17:24:41 -0700

[diff] [blame]

139

def _get_diagnosis_safely(history, prop='diagnosis'):
    """Return one element of a DUT's last diagnosis, tolerating RPC errors.

    Calls `history.last_diagnosis()` and returns the element selected
    by `prop`: index 0 for 'diagnosis', index 1 for 'task'.  If the
    call raises `proxy.JSONRPCException`, the exception is logged as a
    warning and `None` is returned instead of propagating the error.

    Every call that gets past argument validation increments
    `_MISSING_DUT_METRIC` exactly once, tagged with the host name and
    with whether the lookup succeeded.

    @param history  The history object to query; must provide
                    `last_diagnosis()` and `hostname`.
    @param prop     Name of the element to return; either 'diagnosis'
                    or 'task'.

    @return The selected element of the last diagnosis, or `None` if
            the lookup failed with a `JSONRPCException`.
    @raises KeyError if `prop` is not one of the recognized names.
    """
    # Validate `prop` before doing any RPC work; a bad name raises
    # KeyError here and does not touch the metric.
    return_prop = {'diagnosis': 0, 'task': 1}[prop]
    dut_present = True
    try:
        return history.last_diagnosis()[return_prop]
    except proxy.JSONRPCException as e:
        # `logging.warn` is a deprecated alias of `logging.warning`.
        logging.warning(e)
        dut_present = False
        # Make the "lookup failed" result explicit for callers.
        return None
    finally:
        # Runs on both the success and the failure path.
        _MISSING_DUT_METRIC.increment(
                fields={'host': history.hostname, 'presence': dut_present})

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

150

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

151

def _host_is_working(history):
    """Return whether the last diagnosis for `history` is WORKING.

    @param history  The history object to be checked.

    @return `True` if `_get_diagnosis_safely()` reports
            `status_history.WORKING`, `False` otherwise.
    """
    diagnosis = _get_diagnosis_safely(history)
    return diagnosis == status_history.WORKING

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

153

154

155

def _host_is_broken(history):
    """Return whether the last diagnosis for `history` is BROKEN.

    @param history  The history object to be checked.

    @return `True` if `_get_diagnosis_safely()` reports
            `status_history.BROKEN`, `False` otherwise.
    """
    diagnosis = _get_diagnosis_safely(history)
    return diagnosis == status_history.BROKEN

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

157

158

159

def _host_is_idle(history):
    """Return whether the last diagnosis for `history` counts as idle.

    A host is treated as idle when its diagnosis is either
    `status_history.UNUSED` or `status_history.UNKNOWN`.

    @param history  The history object to be checked.

    @return `True` if the diagnosis is one of the idle statuses,
            `False` otherwise.
    """
    unused_or_unknown = {status_history.UNUSED, status_history.UNKNOWN}
    diagnosis = _get_diagnosis_safely(history)
    return diagnosis in unused_or_unknown

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

162

163

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

164

class _HostSetInventory(object):

165

"""Maintains a set of related `HostJobHistory` objects.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

166

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

167

Current usage of this class is that all DUTs are part of a single

168

scheduling pool of DUTs for a single model; however, this class make

169

no assumptions about the actual relationship among the DUTs.

170

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

171

The collection is segregated into disjoint categories of "working",

172

"broken", and "idle" DUTs. Accessor methods allow finding both the

173

list of DUTs in each category, as well as counts of each category.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

174

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

175

Performance note: Certain methods in this class are potentially

176

expensive:

177

* `get_working()`

178

* `get_working_list()`

179

* `get_broken()`

180

* `get_broken_list()`

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

181

* `get_idle()`

182

* `get_idle_list()`

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

183

The first time any one of these methods is called, it causes

184

multiple RPC calls with a relatively expensive set of database

185

queries. However, the results of the queries are cached in the

186

individual `HostJobHistory` objects, so only the first call

187

actually pays the full cost.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

188

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

189

Additionally, `get_working_list()`, `get_broken_list()` and

190

`get_idle_list()` cache their return values to avoid recalculating

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

191

lists at every call; this caching is separate from the caching of

192

RPC results described above.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

193

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

194

This class is deliberately constructed to delay the RPC cost until

195

the accessor methods are called (rather than to query in

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

196

`record_host()`) so that it's possible to construct a complete

197

`_LabInventory` without making the expensive queries at creation

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

198

time. `_populate_model_counts()`, below, assumes this behavior.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

"""

def __init__(self):

self._histories = []

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

203

self._working_list = None

204

self._broken_list = None

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

205

self._idle_list = None

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

206

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

207

def record_host(self, host_history):

208

"""Add one `HostJobHistory` object to the collection.

209

210

@param host_history The `HostJobHistory` object to be

211

remembered.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

212

"""

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

213

self._working_list = None

214

self._broken_list = None

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

215

self._idle_list = None

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

216

self._histories.append(host_history)

217

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

218

def get_working_list(self):

219

"""Return a list of all working DUTs in the pool.

220

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

221

Filter `self._histories` for histories where the DUT is

222

diagnosed as working.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

223

224

Cache the result so that we only cacluate it once.

225

226

@return A list of HostJobHistory objects.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

227

"""

228

if self._working_list is None:

229

self._working_list = [h for h in self._histories

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

230

if _host_is_working(h)]

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

231

return self._working_list

232

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

233

def get_working(self):

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

234

"""Return the number of working DUTs in the pool."""

235

return len(self.get_working_list())

236

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

237

def get_broken_list(self):

238

"""Return a list of all broken DUTs in the pool.

239

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

240

Filter `self._histories` for histories where the DUT is

241

diagnosed as broken.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

242

243

Cache the result so that we only cacluate it once.

244

245

@return A list of HostJobHistory objects.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

246

"""

247

if self._broken_list is None:

248

self._broken_list = [h for h in self._histories

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

249

if _host_is_broken(h)]

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

250

return self._broken_list

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

251

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

252

def get_broken(self):

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

253

"""Return the number of broken DUTs in the pool."""

254

return len(self.get_broken_list())

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

255

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

256

def get_idle_list(self):

257

"""Return a list of all idle DUTs in the pool.

258

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

259

Filter `self._histories` for histories where the DUT is

260

diagnosed as idle.

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

261

262

Cache the result so that we only cacluate it once.

263

264

@return A list of HostJobHistory objects.

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

265

"""

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

266

if self._idle_list is None:

267

self._idle_list = [h for h in self._histories

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

268

if _host_is_idle(h)]

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

269

return self._idle_list

270

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

271

def get_idle(self):

272

"""Return the number of idle DUTs in the pool."""

273

return len(self.get_idle_list())

274

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

275

def get_total(self):

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

276

"""Return the total number of DUTs in the pool."""

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

277

return len(self._histories)

278

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

279

def get_all_histories(self):

280

return self._histories

281

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

282

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

283

class _PoolSetInventory(object):
    """Maintains a set of `HostJobHistory`s for a set of pools.

    The collection is segregated into disjoint categories of "working",
    "broken", and "idle" DUTs.  Accessor methods allow finding both the
    list of DUTs in each category, as well as counts of each category.
    Accessor queries can be for an individual pool, or against all
    pools.

    Performance note:  This class relies on `_HostSetInventory`.  Public
    methods in this class generally rely on methods of the same name in
    the underlying class, and so will have the same underlying
    performance characteristics.
    """

    def __init__(self, pools):
        """Create one empty `_HostSetInventory` per pool.

        @param pools  Iterable of the pool names to be tracked.
        """
        self._histories_by_pool = {
            pool: _HostSetInventory() for pool in pools
        }

    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        The object is filed under its `host_pool` attribute.

        @param host_history The `HostJobHistory` object to be
                            remembered.
        @raises KeyError if the host's pool is not one of the pools
                         given to the constructor.
        """
        pool = host_history.host_pool
        self._histories_by_pool[pool].record_host(host_history)

    def _count_pool(self, get_pool_count, pool=None):
        """Internal helper to count hosts in a given pool.

        The `get_pool_count` parameter is a function to calculate
        the exact count of interest for the pool.

        @param get_pool_count  Function to return a count from a
                               `_HostSetInventory` object.
        @param pool            The pool to be counted.  If `None`,
                               return the total across all pools.

        @return The count for the selected pool(s).
        """
        if pool is None:
            return sum(get_pool_count(host_set)
                       for host_set in self._histories_by_pool.values())
        return get_pool_count(self._histories_by_pool[pool])

    def get_working_list(self):
        """Return a list of all working DUTs (across all pools).

        Go through all HostJobHistory objects across all pools,
        selecting all DUTs identified as working.

        @return A list of HostJobHistory objects.
        """
        working = []
        for host_set in self._histories_by_pool.values():
            working.extend(host_set.get_working_list())
        return working

    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of working DUTs in the selected
                pool(s).
        """
        return self._count_pool(_HostSetInventory.get_working, pool)

    def get_broken_list(self):
        """Return a list of all broken DUTs (across all pools).

        Go through all HostJobHistory objects across all pools,
        selecting all DUTs identified as broken.

        @return A list of HostJobHistory objects.
        """
        broken = []
        for host_set in self._histories_by_pool.values():
            broken.extend(host_set.get_broken_list())
        return broken

    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of broken DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_broken, pool)

    def get_idle_list(self, pool=None):
        """Return a list of all idle DUTs in the given pool.

        Go through the HostJobHistory objects of the selected pool(s),
        selecting all DUTs identified as idle.

        @param pool: The pool to be listed.  If `None`, return the
                     combined list across all pools.

        @return A list of HostJobHistory objects.
        """
        if pool is not None:
            return self._histories_by_pool[pool].get_idle_list()
        idle = []
        # `.values()` matches the other list accessors in this class.
        for host_set in self._histories_by_pool.values():
            idle.extend(host_set.get_idle_list())
        return idle

    def get_idle(self, pool=None):
        """Return the number of idle DUTs in a pool.

        @param pool: The pool to be counted.  If `None`, return the total
                     across all pools.

        @return The total number of idle DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_idle, pool)

    def get_spares_buffer(self, spare_pool=SPARE_POOL):
        """Return the nominal number of working spares.

        Calculates and returns how many working spares there would
        be in the spares pool if all broken DUTs were in the spares
        pool.  This number may be negative, indicating a shortfall
        in the critical pools.

        @param spare_pool  The pool whose total is used as the number
                           of spares.

        @return The total number DUTs in the spares pool, less the total
                number of broken DUTs in all pools.
        """
        return self.get_total(spare_pool) - self.get_broken()

    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_total, pool)

    def get_all_histories(self, pool=None):
        """Iterate over every recorded `HostJobHistory` object.

        This is a generator, so nothing (including the lookup of
        `pool`) happens until the result is iterated.

        @param pool  The pool whose histories are wanted.  If `None`,
                     yield the histories of all pools.

        @return A generator yielding `HostJobHistory` objects.
        """
        if pool is None:
            for host_set in self._histories_by_pool.values():
                for history in host_set.get_all_histories():
                    yield history
        else:
            for history in self._histories_by_pool[pool].get_all_histories():
                yield history

436

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

437

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

438

def _eligible_host(afehost):
    """Return whether this host is eligible for monitoring.

    To be eligible a host must carry exactly one 'model' label and
    exactly one 'pool' label, and none of its labels may appear in
    `_EXCLUDED_LABELS`.

    @param afehost  The host to be tested for eligibility.

    @return `True` if the host is eligible, `False` otherwise.
    """
    # A DUT lacking a unique 'model' or 'pool' label isn't supposed to
    # be in the managed inventory at all; finding one usually means
    # the database contains an error.  Such errors do turn up from
    # time to time.
    #
    # The _LabInventory constructor expects hosts to satisfy the label
    # restrictions and may fail otherwise.  Aborting a whole inventory
    # run because of one bad entry would be the wrong outcome, so the
    # offending hosts are filtered out here and never reach the
    # inventory.
    def _count_with_prefix(prefix):
        return sum(1 for label in afehost.labels
                   if label.startswith(prefix))

    model_count = _count_with_prefix(constants.Labels.MODEL_PREFIX)
    pool_count = _count_with_prefix(constants.Labels.POOL_PREFIX)
    if _EXCLUDED_LABELS.intersection(afehost.labels):
        return False
    return model_count == 1 and pool_count == 1

463

464

465

class _LabInventory(collections.Mapping):

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

466

"""Collection of `HostJobHistory` objects for the Lab's inventory.

467

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

468

This is a dict-like collection indexed by model. Indexing returns

469

the _PoolSetInventory object associated with the model.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

470

"""

471

472

@classmethod

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

473

def create_inventory(cls, afe, start_time, end_time, modellist=[]):

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

474

"""Return a Lab inventory with specified parameters.

475

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

476

By default, gathers inventory from `HostJobHistory` objects for

477

all DUTs in the `MANAGED_POOLS` list. If `modellist` is

478

supplied, the inventory will be restricted to only the given

479

models.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

480

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

481

@param afe AFE object for constructing the

482

`HostJobHistory` objects.

483

@param start_time Start time for the `HostJobHistory` objects.

484

@param end_time End time for the `HostJobHistory` objects.

485

@param modellist List of models to include. If empty,

486

include all available models.

487

@return A `_LabInventory` object for the specified models.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

488

"""

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

489

target_pools = MANAGED_POOLS

490

label_list = [constants.Labels.POOL_PREFIX + l for l in target_pools]

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

491

afehosts = afe.get_hosts(labels__name__in=label_list)

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

492

if modellist:

J. Richard Barnette

2016-03-17 17:03:57 -0700

[diff] [blame]

493

# We're deliberately not checking host eligibility in this

494

# code path. This is a debug path, not used in production;

495

# it may be useful to include ineligible hosts here.

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

496

modelhosts = []

497

for model in modellist:

498

model_label = constants.Labels.MODEL_PREFIX + model

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

499

host_list = [h for h in afehosts

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

500

if model_label in h.labels]

501

modelhosts.extend(host_list)

502

afehosts = modelhosts

J. Richard Barnette

2016-03-17 17:03:57 -0700

[diff] [blame]

503

else:

Richard Barnette

3a40449

2018-02-08 13:57:01 -0800

[diff] [blame]

504

afehosts = [h for h in afehosts if _eligible_host(h)]

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

505

create = lambda host: (

506

status_history.HostJobHistory(afe, host,

507

start_time, end_time))

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

508

return cls([create(host) for host in afehosts], target_pools)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

509

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

510

def __init__(self, histories, pools):

511

models = {h.host_model for h in histories}

512

self._modeldata = {model: _PoolSetInventory(pools) for model in models}

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

513

self._dut_count = len(histories)

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

514

for h in histories:

515

self[h.host_model].record_host(h)

516

self._boards = {h.host_board for h in histories}

Prathmesh Prabhu

154cb2b

2017-11-08 17:36:51 -0800

[diff] [blame]

517

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

518

def __getitem__(self, key):

519

return self._modeldata.__getitem__(key)

Prathmesh Prabhu

021e784

2017-11-08 18:05:45 -0800

[diff] [blame]

520

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

521

def __len__(self):

522

return self._modeldata.__len__()

523

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

524

def __iter__(self):

525

return self._modeldata.__iter__()

526

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

527

def get_num_duts(self):
    """Return the number of DUT histories the inventory was built from."""
    dut_count = self._dut_count
    return dut_count

530

Prathmesh Prabhu

a5a0e3d

2017-11-09 08:53:53 -0800

[diff] [blame]

531

def get_num_models(self):
    """Return how many distinct models the inventory contains."""
    model_count = len(self)
    return model_count

534

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

535

def get_pool_models(self, pool):
    """Return the set of models that have at least one DUT in `pool`.

    @param pool  The pool to be inventoried for models.
    @return A set of the names of models whose total for `pool` is
            non-zero.
    """
    models = set()
    for model, poolset in self.iteritems():
        if poolset.get_total(pool):
            models.add(model)
    return models

541

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

542

def get_boards(self):
    """Return the set of boards collected when the inventory was built."""
    return self._boards

Prathmesh Prabhu

a5a0e3d

2017-11-09 08:53:53 -0800

[diff] [blame]

544

545

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

546

def _reportable_models(inventory, spare_pool=SPARE_POOL):
    """Generate the (model, poolset) pairs that are subject to reporting.

    Walks `inventory.iteritems()` and passes through only those entries
    whose model has some DUTs in `spare_pool` and some DUTs outside it.

    @param inventory   Mapping-like object providing `iteritems()`, with
                       values providing `get_total()`.
    @param spare_pool  The spare pool to be tested for reporting.
    """
    for model, poolset in inventory.iteritems():
        spare_total = poolset.get_total(spare_pool)
        overall_total = poolset.get_total()
        # Skip models with no spares, and models made up only of spares.
        if spare_total == 0 or spare_total == overall_total:
            continue
        yield model, poolset

def _all_dut_histories(inventory):

563

for poolset in inventory.itervalues():

564

for h in poolset.get_all_histories():

yield h

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

568

def _sort_by_location(inventory_list):

569

"""Return a list of DUTs, organized by location.

570

571

Take the given list of `HostJobHistory` objects, separate it

572

into a list per lab, and sort each lab's list by location. The

573

order of sorting within a lab is

574

* By row number within the lab,

575

* then by rack number within the row,

576

* then by host shelf number within the rack.

577

578

Return a list of the sorted lists.

579

580

Implementation note: host locations are sorted by converting

581

each location into a base 100 number. If row, rack or

582

host numbers exceed the range [0..99], then sorting will

583

break down.

584

585

@return A list of sorted lists of DUTs.

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

"""

BASE = 100

lab_lists = {}

for history in inventory_list:

590

location = _HOSTNAME_PATTERN.match(history.host.hostname)

591

if location:

592

lab = location.group(1)

593

key = 0

594

for idx in location.group(2, 3, 4):

595

key = BASE * key + int(idx)

596

lab_lists.setdefault(lab, []).append((key, history))

597

return_list = []

598

for dut_list in lab_lists.values():

599

dut_list.sort(key=lambda t: t[0])

600

return_list.append([t[1] for t in dut_list])

return return_list

def _score_repair_set(buffer_counts, repair_list):
    """Rate a proposed set of DUT repairs with a single number.

    For every model in `buffer_counts`, the number of DUTs of that
    model in `repair_list` is added to the model's current buffer
    count.  The resulting counts are then scored from two numbers:
      * The worst (lowest) buffer count for any model; higher is
        better.  This is the more significant number.
      * The number of models sitting at that worst count; lower is
        better.  This is the less significant number.

    The two are combined as `1000 * worst - models_at_worst`, so the
    score could fail to reflect the intended criteria if there are
    more than 1000 models in the inventory.

    @param buffer_counts  A dictionary mapping model names to the size
                          of the model's spares buffer.
    @param repair_list    A list of `HostJobHistory` objects for the
                          DUTs to be repaired.
    @return A numeric score; higher is better.
    """
    _NMODELS = 1000
    pools = {h.host_pool for h in repair_list}
    repair_inventory = _LabInventory(repair_list, pools)
    # Model names are dropped here; only the post-repair counts
    # contribute to the score.
    new_counts = []
    for model, spares in buffer_counts.iteritems():
        if model in repair_inventory:
            repaired = repair_inventory[model].get_total()
        else:
            repaired = 0
        new_counts.append(spares + repaired)
    # After sorting, the worst count is the first element; then count
    # how many models share it.
    ordered = sorted(new_counts)
    worst_count = ordered[0]
    num_worst = ordered.count(worst_count)
    return _NMODELS * worst_count - num_worst

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

655

656

657

def _generate_repair_recommendation(inventory, num_recommend):
    """Return a message recommending specific broken DUTs for repair.

    The DUTs are chosen as follows:
      * Only broken DUTs of reportable models are candidates.
      * Candidates are grouped per lab and sorted by location; the
        recommendation is a run of up to `num_recommend` consecutive
        DUTs from one lab's sorted list, so the DUTs are in the same
        lab and tend to be physically close together.
      * Every such run is scored with `_score_repair_set()`, and the
        highest scoring run wins.  Ties keep the earliest run found.

    @param inventory      `_LabInventory` object from which to generate
                          recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation text.
    """
    logging.debug('Creating DUT repair recommendations')
    model_buffer_counts = {}
    broken_list = []
    for model, counts in _reportable_models(inventory):
        logging.debug('Listing failed DUTs for %s', model)
        if counts.get_broken() == 0:
            continue
        model_buffer_counts[model] = counts.get_spares_buffer()
        broken_list.extend(counts.get_broken_list())
    recommendation = None
    best_score = None
    for lab_duts in _sort_by_location(broken_list):
        # The first window is scored unconditionally, so a lab with
        # fewer than `num_recommend` broken DUTs still gets a score.
        lab_slice = lab_duts[:num_recommend]
        lab_score = _score_repair_set(model_buffer_counts, lab_slice)
        # Slide the window one DUT at a time until its end reaches the
        # end of the lab's list.
        for start in range(1, len(lab_duts) - num_recommend + 1):
            candidate = lab_duts[start:start + num_recommend]
            candidate_score = _score_repair_set(model_buffer_counts,
                                                candidate)
            if candidate_score > lab_score:
                lab_slice, lab_score = candidate, candidate_score
        if recommendation is None or lab_score > best_score:
            recommendation, best_score = lab_slice, lab_score
    # N.B. The trailing space in `line_fmt` is mandatory: Without it,
    # Gmail will parse the URL wrong.
    line_fmt = '%-30s %-16s %-6s\n %s '
    message = ['Repair recommendations:\n',
               line_fmt % ('Hostname', 'Model', 'Servo?', 'Logs URL')]
    if not recommendation:
        message.append('(No DUTs to repair)')
        return '\n'.join(message)
    for history in recommendation:
        hostname = history.host.hostname
        servo_name = servo_host.make_servo_hostname(hostname)
        servo_present = utils.host_is_in_lab_zone(servo_name)
        event = _get_diagnosis_safely(history, 'task')
        servo_text = 'Yes' if servo_present else 'No'
        message.append(line_fmt % (
            history.host.hostname, history.host_model,
            servo_text, event.job_url))
    return '\n'.join(message)

730

731

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

732

def _generate_model_inventory_message(inventory):
    """Generate the "model inventory" e-mail message.

    The model inventory is a list by model summarizing the number of
    working, broken, and idle DUTs, and the total shortfall or surplus
    of working devices relative to the minimum critical pool
    requirement.

    Only reportable models (see `_reportable_models()`) contribute to
    the totals, and only models with at least one broken DUT are listed
    in the per-model table.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating model inventory')
    nworking = 0
    nbroken = 0
    nidle = 0
    nbroken_models = 0
    ntotal_models = 0
    summaries = []
    column_names = (
        'Model', 'Avail', 'Bad', 'Idle', 'Good', 'Spare', 'Total')
    for model, counts in _reportable_models(inventory):
        logging.debug('Counting %2d DUTS for model %s',
                      counts.get_total(), model)
        # Summary elements laid out in the same order as the column
        # headers:
        # Model Avail Bad Idle Good Spare Total
        # e[0] e[1] e[2] e[3] e[4] e[5] e[6]
        element = (model,
                   counts.get_spares_buffer(),
                   counts.get_broken(),
                   counts.get_idle(),
                   counts.get_working(),
                   counts.get_total(SPARE_POOL),
                   counts.get_total())
        if element[2]:
            summaries.append(element)
            nbroken_models += 1
        ntotal_models += 1
        nbroken += element[2]
        nidle += element[3]
        nworking += element[4]
    ntotal = nworking + nbroken + nidle
    # Lowest spares buffer first; among equals, most broken DUTs first.
    summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
    if ntotal:
        broken_percent = int(round(100.0 * nbroken / ntotal))
        idle_percent = int(round(100.0 * nidle / ntotal))
        # Derived by subtraction so the three figures add up to 100.
        working_percent = 100 - broken_percent - idle_percent
    else:
        # No reportable DUTs at all: report zeros rather than failing
        # with ZeroDivisionError before any message is produced.
        broken_percent = idle_percent = working_percent = 0
    message = ['Summary of DUTs in inventory:',
               '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
               '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
                   nbroken, broken_percent,
                   nidle, idle_percent,
                   nworking, working_percent,
                   ntotal),
               '',
               'Models with failures: %d' % nbroken_models,
               'Models in inventory: %d' % ntotal_models,
               '', '',
               'Full model inventory:\n',
               '%-22s %5s %5s %5s %5s %5s %5s' % column_names]
    message.extend(
        ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
    return '\n'.join(message)

800

801

J. Richard Barnette

4845fcf

2015-04-20 14:26:25 -0700

[diff] [blame]

802

_POOL_INVENTORY_HEADER = '''\

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

803

Notice to Infrastructure deputies: All models shown below are at

J. Richard Barnette

c9a143c

2015-06-04 11:11:19 -0700

[diff] [blame]

804

less than full strength, please take action to resolve the issues.

805

Once you're satisified that failures won't recur, failed DUTs can

806

be replaced with spares by running `balance_pool`. Detailed

807

instructions can be found here:

J. Richard Barnette

4845fcf

2015-04-20 14:26:25 -0700

[diff] [blame]

808

http://go/cros-manage-duts

'''

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

812

def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    For each pool in `CRITICAL_POOLS`, the message has a table by
    model giving the number of broken, idle, and working DUTs and the
    total for that pool.  Models with neither broken nor idle DUTs in
    the pool are left out; rows are ordered with the most broken DUTs
    first.

    N.B. For sample output text formatted as users can expect to see it
    in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating pool inventory')
    message = [_POOL_INVENTORY_HEADER]
    # Empty for the first pool; afterwards it puts a blank line ahead
    # of each pool's heading.
    separator = ''
    for pool in CRITICAL_POOLS:
        message.append(
            '%sStatus for pool:%s, by model:' % (separator, pool))
        message.append(
            '%-20s %5s %5s %5s %5s' % (
                'Model', 'Bad', 'Idle', 'Good', 'Total'))
        rows = []
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            broken = counts.get_broken(pool)
            idle = counts.get_idle(pool)
            # models at full strength are not reported
            if broken or idle:
                rows.append((model, broken, idle,
                             counts.get_working(pool),
                             counts.get_total(pool)))
        if not rows:
            message.append('(All models at full strength)')
        else:
            rows.sort(key=lambda row: -row[1])
            message.extend('%-20s %5d %5d %5d %5d' % row for row in rows)
        separator = '\n'
    return '\n'.join(message)

854

855

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

856

_IDLE_INVENTORY_HEADER = '''\

857

Notice to Infrastructure deputies: The hosts shown below haven't

858

run any jobs for at least 24 hours. Please check each host; locked

859

hosts should normally be unlocked; stuck jobs should normally be

aborted.

'''

def _generate_idle_inventory_message(inventory):
    """Generate the "idle inventory" e-mail message.

    The idle inventory lists, for every pool in `MANAGED_POOLS`, the
    hostname of each DUT reported idle, along with its model and pool.

    N.B. For sample output text format as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating idle inventory')
    message = [_IDLE_INVENTORY_HEADER,
               'Idle Host List:',
               '%-30s %-20s %s' % ('Hostname', 'Model', 'Pool')]
    idle_rows = []
    for pool in MANAGED_POOLS:
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            for dut in counts.get_idle_list(pool):
                idle_rows.append((dut.host.hostname, model, pool))
    if not idle_rows:
        message.append('(No idle DUTs)')
    else:
        message.extend('%-30s %-20s %s' % row for row in idle_rows)
    return '\n'.join(message)

892

893

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

894

def _send_email(arguments, tag, subject, recipients, body):

895

"""Send an inventory e-mail message.

896

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

897

The message is logged in the selected log directory using `tag` for

898

the file name.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

899

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

900

If the --debug option was requested, the message is neither logged

901

nor sent, but merely printed on stdout.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

902

903

@param arguments Parsed command-line options.

904

@param tag Tag identifying the inventory for logging

905

purposes.

906

@param subject E-mail Subject: header line.

907

@param recipients E-mail addresses for the To: header line.

908

@param body E-mail message body.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

909

"""

910

logging.debug('Generating email: "%s"', subject)

911

all_recipients = ', '.join(recipients)

912

report_body = '\n'.join([

913

'To: %s' % all_recipients,

914

'Subject: %s' % subject,

915

'', body, ''])

J. Richard Barnette

2015-10-13 16:02:47 -0700

[diff] [blame]

916

if arguments.debug:

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

917

print report_body

918

else:

919

filename = os.path.join(arguments.logdir, tag)

920

try:

921

report_file = open(filename, 'w')

922

report_file.write(report_body)

923

report_file.close()

924

except EnvironmentError as e:

925

logging.error('Failed to write %s: %s', filename, e)

926

try:

927

gmail_lib.send_email(all_recipients, subject, body)

928

except Exception as e:

929

logging.error('Failed to send e-mail to %s: %s',

all_recipients, e)

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

933

def _populate_model_counts(inventory):

934

"""Gather model counts while providing interactive feedback.

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

935

936

Gathering the status of all individual DUTs in the lab can take

937

considerable time (~30 minutes at the time of this writing).

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

938

Normally, we pay that cost by querying as we go. However, with

939

the `--debug` option, we expect a human being to be watching the

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

940

progress in real time. So, we force the first (expensive) queries

941

to happen up front, and provide simple ASCII output on sys.stdout

942

to show a progress bar and results.

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

943

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

944

@param inventory `_LabInventory` object from which to gather

945

counts.

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

946

"""

947

n = 0

948

total_broken = 0

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

949

for counts in inventory.itervalues():

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

n += 1

if n % 10 == 5:

c = '+'

elif n % 10 == 0:

c = '%d' % ((n / 10) % 10)

else:

c = '.'

sys.stdout.write(c)

sys.stdout.flush()

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

959

# This next call is where all the time goes - it forces all of a

960

# model's `HostJobHistory` objects to query the database and

961

# cache their results.

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

962

total_broken += counts.get_broken()

963

sys.stdout.write('\n')

964

sys.stdout.write('Found %d broken DUTs\n' % total_broken)

965

966

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

967

def _perform_model_inventory(arguments, inventory, timestamp):
    """Generate and send the model inventory report.

    The report body consists of the following:
      * A list of DUTs that are recommended to be repaired.  This list
        is optional, and only appears if the `--recommend` option is
        present.
      * The model inventory message produced by
        `_generate_model_inventory_message()`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp
                      in logs and email output.
    """
    recommend_message = ''
    if arguments.recommend:
        recommendation = _generate_repair_recommendation(
            inventory, arguments.recommend)
        recommend_message = recommendation + '\n\n\n'
    model_message = _generate_model_inventory_message(inventory)
    tag = 'models-%s.txt' % timestamp
    subject = 'DUT model inventory %s' % timestamp
    _send_email(arguments, tag, subject, arguments.model_notify,
                recommend_message + model_message)

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

994

995

996

def _perform_pool_inventory(arguments, inventory, timestamp):
    """Generate and send the pool inventory report.

    The report body consists of the following, in this order:
      * The pool inventory message produced by
        `_generate_pool_inventory_message()`.
      * The idle inventory message produced by
        `_generate_idle_inventory_message()`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp in
                      logs and email output.
    """
    sections = [_generate_pool_inventory_message(inventory),
                _generate_idle_inventory_message(inventory)]
    tag = 'pools-%s.txt' % timestamp
    subject = 'DUT pool inventory %s' % timestamp
    _send_email(arguments, tag, subject, arguments.pool_notify,
                '\n\n\n'.join(sections))

1018

1019

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1020

def _dut_in_repair_loop(history):
    """Return whether a DUT's history indicates a repair loop.

    A DUT is considered looping if it runs no tests, and no tasks pass
    other than repair tasks.

    @param history  An instance of `status_history.HostJobHistory` to be
                    scanned for a repair loop.  The caller guarantees
                    that this history corresponds to a working DUT.
    @returns A true value if the scan reaches `_REPAIR_LOOP_THRESHOLD`
             'Repair' tasks before meeting anything that rules out a
             loop; otherwise a false value (`False`, or `None` when
             the history runs out first).
    """
    # Cheap early exit.  The caller passes only working DUTs, so the
    # diagnosis task was already fetched and is known to have
    # succeeded.  If it isn't a repair, the history includes a
    # successful non-repair task, and the DUT isn't looping.
    #
    # The scan below is very expensive, because iterating must fetch
    # the full history regardless of how many tasks get examined.
    diagnosis_task = _get_diagnosis_safely(history, 'task')
    if diagnosis_task.name != 'Repair':
        return False
    successful_repairs = 0
    for task in history:
        is_repair = task.name == 'Repair'
        # Any of these rules out a loop: a test ran, a repair failed,
        # or a non-repair task succeeded.
        if (not task.is_special
                or task.diagnosis == status_history.BROKEN
                or (task.diagnosis == status_history.WORKING
                    and not is_repair)):
            return False
        # What remains is either a failed non-repair task, or a
        # successful repair; only repairs count toward the threshold.
        if is_repair:
            successful_repairs += 1
            if successful_repairs >= _REPAIR_LOOP_THRESHOLD:
                return True

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

1066

def _report_untestable_dut(history, state):
    """Log one untestable DUT and record it in the presence metric.

    @param history  History object for the DUT; its `hostname`,
                    `host_model` and `host_pool` attributes are
                    reported.
    @param state    String naming the untestable state the DUT was
                    found in.
    """
    fields = dict(
        dut_hostname=history.hostname,
        model=history.host_model,
        pool=history.host_pool,
        state=state,
    )
    message = ('DUT in state %(state)s: %(dut_hostname)s, '
               'model: %(model)s, pool: %(pool)s')
    logging.info(message, fields)
    _UNTESTABLE_PRESENCE_METRIC.set(True, fields=fields)

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1076

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1077

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

1078

def _report_untestable_dut_metrics(inventory):
    """Find and report DUTs in the inventory that cannot run tests.

    A DUT is treated as "untestable" when either of these holds:
      * It is working but stuck in a repair loop: it keeps passing
        repair, yet never passes any other operation.
      * It is idle (runs no tasks at all) and is not locked.

    Every DUT found in one of these states is reported through
    `_report_untestable_dut()`.

    Note:  To make sure that DUTs aren't flagged as "idle" merely
    because there's no work, a separate job runs prior to regular
    inventory runs which schedules trivial work on any DUT that
    appears idle.

    @param inventory  `_LabInventory` object to be reported on.
    """
    logging.info('Scanning for untestable DUTs.')
    for dut_history in _all_dut_histories(inventory):
        # Managed DUTs whose names fail _HOSTNAME_PATTERN should not
        # exist.  Still, the hostname ends up in the 'dut_hostname'
        # metric field, and arbitrary strings must not get there, so
        # anything anomalous is skipped.
        if not _HOSTNAME_PATTERN.match(dut_history.hostname):
            continue

        untestable_state = None
        if _host_is_working(dut_history):
            if _dut_in_repair_loop(dut_history):
                untestable_state = 'repair_loop'
        elif _host_is_idle(dut_history):
            if not dut_history.host.locked:
                untestable_state = 'idle_unlocked'

        if untestable_state is not None:
            _report_untestable_dut(dut_history, untestable_state)

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1113

1114

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

1115

def _log_startup(arguments, startup_time):
    """Announce the start of an inventory run in the log.

    Emits debug-level messages describing which reports this run will
    produce, and builds the string (derived from `startup_time`) that
    identifies this run in log files and e-mail messages.

    @param arguments     Parsed command-line arguments; the
                         `model_notify`, `recommend` and `pool_notify`
                         attributes are consulted.
    @param startup_time  A UNIX timestamp marking the moment when
                         this inventory run began.
    @returns  A timestamp string that will be used to identify this
              run in logs and email output.
    """
    local_start = time.localtime(startup_time)
    timestamp = time.strftime(_TIMESTAMP_FORMAT, local_start)
    logging.debug('Starting lab inventory for %s', timestamp)

    planned = []
    if arguments.model_notify:
        # Recommendations are only part of the model report.
        if arguments.recommend:
            planned.append('Will include repair recommendations')
        planned.append('Will include model inventory')
    if arguments.pool_notify:
        planned.append('Will include pool inventory')
    for note in planned:
        logging.debug(note)
    return timestamp

def _create_inventory(arguments, end_time):
    """Build the `_LabInventory` instance used for reporting.

    The search window ends at `end_time` and reaches back
    `arguments.duration` hours.

    @param arguments  Parsed command-line arguments; `duration`
                      (in hours) and `modelnames` are used.
    @param end_time   A UNIX timestamp for the end of the time range
                      to be searched in this inventory run.
    @returns  The newly created `_LabInventory` object.
    """
    seconds_per_hour = 60 * 60
    window_start = end_time - arguments.duration * seconds_per_hour
    afe_client = frontend_wrappers.RetryingAFE(server=None)
    lab_inventory = _LabInventory.create_inventory(
            afe_client, window_start, end_time, arguments.modelnames)
    dut_count = lab_inventory.get_num_duts()
    model_count = lab_inventory.get_num_models()
    logging.info('Found %d hosts across %d models',
                 dut_count, model_count)
    return lab_inventory

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1155

def _perform_inventory_reports(arguments):
    """Run every inventory check selected on the command line.

    Logs the start of the run, creates the inventory, and then runs
    each report whose corresponding argument is set.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    """
    run_started = time.time()
    run_label = _log_startup(arguments, run_started)
    lab_inventory = _create_inventory(arguments, run_started)

    if arguments.debug:
        _populate_model_counts(lab_inventory)

    # The reports run in a fixed order: model, pool, untestable.
    if arguments.model_notify:
        _perform_model_inventory(arguments, lab_inventory, run_label)
    if arguments.pool_notify:
        _perform_pool_inventory(arguments, lab_inventory, run_label)
    if arguments.report_untestable:
        _report_untestable_dut_metrics(lab_inventory)

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1175

1176

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1177

def _separate_email_addresses(address_list):

1178

"""Parse a list of comma-separated lists of e-mail addresses.

1179

1180

@param address_list A list of strings containing comma

1181

separate e-mail addresses.

1182

@return A list of the individual e-mail addresses.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1183

"""

1184

newlist = []

1185

for arg in address_list:

1186

newlist.extend([email.strip() for email in arg.split(',')])

return newlist

def _verify_arguments(arguments):
    """Check and normalize the parsed command-line arguments.

    The `--model-notify` and `--pool-notify` values, which may be
    given several times and may each hold comma separated addresses,
    are replaced in place by flat lists of individual addresses.

    Outside of debug mode at least one report must have been
    requested.  In debug mode, requesting no report at all is taken
    to mean "run all the e-mail reports".

    On failure an error message is written to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.
    """
    arguments.model_notify = _separate_email_addresses(
            arguments.model_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)

    report_requested = (arguments.model_notify
                        or arguments.pool_notify
                        or arguments.report_untestable)
    if report_requested:
        return True

    if arguments.debug:
        # Run all the e-mail reports.  A report is skipped when its
        # notify list is empty, so give each list a placeholder entry.
        arguments.model_notify = ['']
        arguments.pool_notify = ['']
        return True

    sys.stderr.write('Must request at least one report via '
                     '--model-notify, --pool-notify, or '
                     '--report-untestable\n')
    return False

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1226

1227

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

1228

def _get_default_logdir(script):
    """Compute the default value for the `--logdir` option.

    The result is `_LOGDIR` joined onto the parent of the directory
    that contains `script`.

    @param script  Path to this script file.
    @return A path to a directory.
    """
    script_dir = os.path.dirname(os.path.abspath(script))
    parent_dir = os.path.dirname(script_dir)
    return os.path.join(parent_dir, _LOGDIR)

1240

1241

1242

def _parse_command(argv):
    """Parse and validate the command line.

    Builds the argument parser for this command, parses `argv`, and
    validates the result with `_verify_arguments()`.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args(), or `None`
            if the arguments fail validation.
    """
    cmd_parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Gather and report lab inventory statistics')
    add_option = cmd_parser.add_argument

    duration_help = ('number of hours back to search for status'
                     ' (default: %d)' % _DEFAULT_DURATION)
    add_option('-d', '--duration', type=int,
               default=_DEFAULT_DURATION, metavar='HOURS',
               help=duration_help)
    add_option('--model-notify', action='append',
               default=[], metavar='ADDRESS',
               help='Generate model inventory message, '
                    'and send it to the given e-mail address(es)')
    add_option('--pool-notify', action='append',
               default=[], metavar='ADDRESS',
               help='Generate pool inventory message, '
                    'and send it to the given address(es)')
    add_option('-r', '--recommend', type=int, default=None,
               help='Specify how many DUTs should be '
                    'recommended for repair (default: no '
                    'recommendation)')
    add_option('--report-untestable', action='store_true',
               help='Check for devices unable to run tests.')
    add_option('--debug', action='store_true',
               help='Print e-mail, metrics messages on stdout '
                    'without sending them.')
    # The option disables metrics, so it stores False into
    # `use_metrics`, which is otherwise True.
    add_option('--no-metrics', action='store_false',
               dest='use_metrics',
               help='Suppress generation of Monarch metrics.')
    add_option('--logdir', default=_get_default_logdir(argv[0]),
               help='Directory where logs will be written.')
    add_option('modelnames', nargs='*',
               metavar='MODEL',
               help='names of models to report on '
                    '(default: all models)')

    parsed = cmd_parser.parse_args(argv[1:])
    return parsed if _verify_arguments(parsed) else None

def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--debug` option was provided on
    the command line.
      * Without the option, the root logger captures everything at
        DEBUG level and above in a log file.  The file lives in
        `arguments.logdir` (created if missing) and is named from
        `_LOGFILE` plus the current time formatted with
        `_TIMESTAMP_FORMAT`.
      * With the option, we expect stdout to contain other
        human-readable output (including the contents of the e-mail
        messages), so log output goes to stdout and is restricted to
        INFO level and above.

    In both modes messages are formatted with `_LOG_FORMAT` and
    `time_utils.TIME_FMT`, and any handlers already installed on the
    root logger are removed so that ours is the only one.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    """
    root_logger = logging.getLogger()
    if arguments.debug:
        root_logger.setLevel(logging.INFO)
        logfile = sys.stdout
    else:
        root_logger.setLevel(logging.DEBUG)
        # The directory has to exist before the log file can be
        # created inside it, so create it first.
        if not os.path.exists(arguments.logdir):
            os.mkdir(arguments.logdir)
        logname = (_LOGFILE
                   + datetime.datetime.today().strftime(_TIMESTAMP_FORMAT))
        # Open for appending:  the default read-only mode can neither
        # create the file nor accept the handler's writes, and append
        # mode never truncates an existing log.
        logfile = open(os.path.join(arguments.logdir, logname), 'a')
    handler = logging.StreamHandler(logfile)
    formatter = logging.Formatter(
            _LOG_FORMAT, time_utils.TIME_FMT)
    handler.setFormatter(formatter)
    # TODO(jrbarnette) This is gross.  Importing client.bin.utils
    # implicitly imported logging_config, which calls
    # logging.basicConfig() *at module level*.  That gives us an
    # extra logging handler that we don't want.  So, clear out all
    # the handlers here.
    #
    # Iterate over a copy:  removeHandler() mutates the handler list,
    # and removing while iterating the live list skips entries.
    for h in list(root_logger.handlers):
        root_logger.removeHandler(h)
    root_logger.addHandler(handler)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1336

1337

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1338

def main(argv):
    """Standard main routine.

    Parses the command line, configures logging, and runs the
    requested inventory reports, optionally wrapped in metrics
    reporting.  Exits with status 1 if the arguments are rejected.

    @param argv  Command line arguments, including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    # _parse_command() returns None when validation fails.
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)

    try:
        if arguments.use_metrics:
            if arguments.debug:
                logging.info('Debug mode: Will not report metrics to monarch.')
                # In debug mode, metrics output is pointed at /dev/null.
                metrics_file = '/dev/null'
            else:
                metrics_file = None
            with site_utils.SetupTsMonGlobalState(
                    'lab_inventory', debug_file=metrics_file,
                    auto_flush=False):
                # `success` stays False unless the reports complete;
                # the `finally` clause reports it either way.
                success = False
                try:
                    with metrics.SecondsTimer('%s/duration' % _METRICS_PREFIX):
                        _perform_inventory_reports(arguments)
                    success = True
                finally:
                    metrics.Counter('%s/tick' % _METRICS_PREFIX).increment(
                            fields={'success': success})
                    # auto_flush is off above, so flush explicitly.
                    metrics.Flush()
        else:
            _perform_inventory_reports(arguments)
    except KeyboardInterrupt:
        # An interrupt ends the run without logging or re-raising.
        pass
    except Exception:
        # Our cron setup doesn't preserve stderr, so drop extra breadcrumbs.
        logging.exception('Error escaped main')
        raise

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1375

1376

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

1377

def get_inventory(afe):
    """Create an inventory covering the 24 hours up to now.

    @param afe  AFE object passed through to
                `_LabInventory.create_inventory()`.
    @return The object returned by `_LabInventory.create_inventory()`.
    """
    seconds_per_day = 24 * 60 * 60
    window_end = int(time.time())
    window_start = window_end - seconds_per_day
    return _LabInventory.create_inventory(afe, window_start, window_end)

1381

1382

1383

def get_managed_boards(afe):
    """Return the boards found in a fresh inventory.

    @param afe  AFE object passed through to `get_inventory()`.
    @return The result of calling `get_boards()` on that inventory.
    """
    current_inventory = get_inventory(afe)
    return current_inventory.get_boards()

J. Richard Barnette

aa86893

2015-10-23 13:28:59 -0700

[diff] [blame]

1385

1386

J. Richard Barnette