Blame - site_utils/lab_inventory.py - platform/external/autotest

2015-03-27 17:23:52 -0700

[diff] [blame]

1

#!/usr/bin/env python

2

3

# Use of this source code is governed by a BSD-style license that can be

4

# found in the LICENSE file.

5

6

"""Create e-mail reports of the Lab's DUT inventory.

7

8

Gathers a list of all DUTs of interest in the Lab, segregated by

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

9

model and pool, and determines whether each DUT is working or

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

10

broken. Then, send one or more e-mail reports summarizing the

11

status to e-mail addresses provided on the command line.

12

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

13

usage: lab_inventory.py [ options ] [ model ... ]

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

14

15

Options:

16

--duration / -d <hours>

17

How far back in time to search job history to determine DUT

18

status.

19

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

20

--model-notify <address>[,<address>]

21

Send the "model status" e-mail to all the specified e-mail

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

22

addresses.

23

24

--pool-notify <address>[,<address>]

25

Send the "pool status" e-mail to all the specified e-mail

26

addresses.

27

J. Richard Barnette

2015-06-09 10:06:17 -0700

[diff] [blame]

28

--recommend <number>

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

29

When generating the "model status" e-mail, include a list of

J. Richard Barnette

2015-06-09 10:06:17 -0700

[diff] [blame]

30

<number> specific DUTs to be recommended for repair.

31

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

32

--report-untestable

33

Scan the inventory for DUTs that can't test because they're stuck in

34

repair loops, or because the scheduler can't give them work.

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

35

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

36

--logdir <directory>

37

Log progress and actions in a file under this directory. Text

38

of any e-mail sent will also be logged in a timestamped file in

39

this directory.

40

J. Richard Barnette

2015-10-13 16:02:47 -0700

[diff] [blame]

41

--debug

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

42

Suppress all logging, metrics reporting, and sending e-mail.

43

Instead, write the output that would be generated onto stdout.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

44

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

45

<model> arguments:

46

With no arguments, gathers the status for all models in the lab.

47

With one or more named models on the command line, restricts

48

reporting to just those models.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

"""

import argparse

Prathmesh Prabhu

021e784

2017-11-08 18:05:45 -0800

[diff] [blame]

53

import collections

Jacob Kopczynski

2018-08-09 11:05:37 -0700

[diff] [blame]

54

import datetime

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

55

import logging

56

import logging.handlers

57

import os

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

58

import re

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

import sys

import time

import common

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

63

from autotest_lib.client.bin import utils

Richard Barnette

6f6ce32

2018-09-07 16:23:20 +0000

[diff] [blame]

64

from autotest_lib.client.common_lib import time_utils

Jacob Kopczynski

2018-08-13 17:24:41 -0700

[diff] [blame]

65

from autotest_lib.frontend.afe.json_rpc import proxy

Xixuan Wu

93e646c

2017-12-07 18:36:10 -0800

[diff] [blame]

66

from autotest_lib.server import constants

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

67

from autotest_lib.server import site_utils

J. Richard Barnette

a7c514e

2015-09-15 11:13:23 -0700

[diff] [blame]

68

from autotest_lib.server.cros.dynamic_suite import frontend_wrappers

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

69

from autotest_lib.server.hosts import servo_host

Aviv Keshet

7ee9586

2016-08-30 15:18:27 -0700

[diff] [blame]

70

from autotest_lib.server.lib import status_history

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

71

from autotest_lib.site_utils import gmail_lib

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

72

from chromite.lib import metrics

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

73

74

Richard Barnette

673573b

2016-12-12 09:46:39 -0800

[diff] [blame]

75

CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS

76

SPARE_POOL = constants.Pools.SPARE_POOL

77

MANAGED_POOLS = constants.Pools.MANAGED_POOLS

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

78

J. Richard Barnette

2016-03-17 17:03:57 -0700

[diff] [blame]

79

# _EXCLUDED_LABELS - A set of labels that disqualify a DUT from

Richard Barnette

eabcf39

2017-09-01 15:10:54 -0700

[diff] [blame]

80

# monitoring by this script. Currently, we're excluding these:

81

# + 'adb' - We're not ready to monitor Android or Brillo hosts.

82

# + 'board:guado_moblab' - These are maintained by a separate

83

# process that doesn't use this script.

Aviv Keshet

1ba0dec

2018-07-12 17:14:08 -0700

[diff] [blame]

84

# + 'board:veyron_rialto' due to crbug.com/854404

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

85

C Shapiro

7de0442

2018-08-29 14:46:11 -0600

[diff] [blame]

86

_EXCLUDED_LABELS = {'adb', 'board:guado_moblab',

Aviv Keshet

1ba0dec

2018-07-12 17:14:08 -0700

[diff] [blame]

87

'board:veyron_rialto'}

J. Richard Barnette

2016-03-17 17:03:57 -0700

[diff] [blame]

88

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

89

# _DEFAULT_DURATION:

90

# Default value used for the --duration command line option.

91

# Specifies how far back in time to search in order to determine

92

# DUT status.

93

94

_DEFAULT_DURATION = 24

95

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

96

# _LOGDIR:

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

97

# Relative path used in the calculation of the default setting for

98

# the --logdir option. The full path is relative to the root of the

99

# autotest directory, as determined from sys.argv[0].

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

100

# _LOGFILE:

101

# Basename of a file to which general log information will be

102

# written.

103

# _LOG_FORMAT:

104

# Format string for log messages.

105

106

_LOGDIR = os.path.join('logs', 'dut-data')

107

_LOGFILE = 'lab-inventory.log'

108

_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

109

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

110

# Pattern describing location-based host names in the Chrome OS test

111

# labs. Each DUT hostname designates the DUT's location:

112

# * A lab (room) that's physically separated from other labs

113

# (i.e. there's a door).

114

# * A row (or aisle) of DUTs within the lab.

115

# * A vertical rack of shelves on the row.

116

# * A specific host on one shelf of the rack.

117

118

_HOSTNAME_PATTERN = re.compile(

119

r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')

120

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

121

# _REPAIR_LOOP_THRESHOLD:

122

# The number of repeated Repair tasks that must be seen to declare

123

# that a DUT is stuck in a repair loop.

124

125

_REPAIR_LOOP_THRESHOLD = 4

126

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

127

Prathmesh Prabhu

b69a6cc

2018-05-07 14:49:33 -0700

[diff] [blame]

128

_METRICS_PREFIX = 'chromeos/autotest/inventory'

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

129

_UNTESTABLE_PRESENCE_METRIC = metrics.BooleanMetric(

Jacob Kopczynski

2018-08-13 17:24:41 -0700

[diff] [blame]

130

_METRICS_PREFIX + '/untestable',

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

131

'DUTs that cannot be scheduled for testing')

132

Jacob Kopczynski

2018-08-13 17:24:41 -0700

[diff] [blame]

133

_MISSING_DUT_METRIC = metrics.Counter(

134

_METRICS_PREFIX + '/missing', 'DUTs which cannot be found by lookup queries'

135

' because they are invalid or deleted')

136

Jacob Kopczynski

2018-08-09 11:05:37 -0700

[diff] [blame]

137

_TIMESTAMP_FORMAT = '%Y-%m-%d.%H'

138

Richard Barnette

2018-09-14 15:25:30 -0700

[diff] [blame^]

139

# _Diagnosis - namedtuple corresponding to the return value from

140

# `HostHistory.last_diagnosis()`

141

_Diagnosis = collections.namedtuple('_Diagnosis', ['status', 'task'])

142

143

144

def _get_diagnosis(history):
    """Return the diagnosis of a DUT, based on its job history.

    Wraps the result of `history.last_diagnosis()` in a `_Diagnosis`
    tuple.  If the DUT is diagnosed as broken, but the diagnosing
    task ended before `history.start_time`, the status is reported
    as unused instead.

    Every call increments `_MISSING_DUT_METRIC`, with a 'presence'
    field recording whether the lookup succeeded.

    If the lookup raises `proxy.JSONRPCException`, the error is
    logged and a placeholder `_Diagnosis(None, None)` is returned.
    Returning a tuple (rather than falling off the end and returning
    `None`) lets callers read `.status` without first checking for a
    failed lookup.

    @param history  The `HostJobHistory` object to be diagnosed.

    @return A `_Diagnosis` tuple of `(status, task)`; both fields are
            `None` if the lookup failed.
    """
    dut_present = True
    try:
        diagnosis = _Diagnosis(*history.last_diagnosis())
        if (diagnosis.status == status_history.BROKEN
                and diagnosis.task.end_time < history.start_time):
            return _Diagnosis(status_history.UNUSED, diagnosis.task)
        return diagnosis
    except proxy.JSONRPCException as e:
        logging.warning(e)
        dut_present = False
        return _Diagnosis(None, None)
    finally:
        # Runs on every exit path, including the returns above.
        _MISSING_DUT_METRIC.increment(
            fields={'host': history.hostname, 'presence': dut_present})

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

159

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

160

def _host_is_working(history):
    """Return whether the DUT for `history` is diagnosed as working.

    @param history  The `HostJobHistory` object to be checked.
    """
    status = _get_diagnosis(history).status
    return status == status_history.WORKING

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

162

163

164

def _host_is_broken(history):
    """Return whether the DUT for `history` is diagnosed as broken.

    @param history  The `HostJobHistory` object to be checked.
    """
    status = _get_diagnosis(history).status
    return status == status_history.BROKEN

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

166

167

168

def _host_is_idle(history):
    """Return whether the DUT for `history` is diagnosed as idle.

    A DUT counts as idle when its diagnosed status is either unused
    or unknown.

    @param history  The `HostJobHistory` object to be checked.
    """
    statuses_meaning_idle = {status_history.UNUSED,
                             status_history.UNKNOWN}
    current = _get_diagnosis(history).status
    return current in statuses_meaning_idle

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

171

172

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

173

class _HostSetInventory(object):
    """A collection of related `HostJobHistory` objects.

    As currently used, every DUT recorded in one instance belongs to
    the same scheduling pool and the same model.  The class itself
    makes no assumptions about how the DUTs are related.

    The DUTs are divided into the categories "working", "broken",
    and "idle".  For each category there is one accessor returning
    the list of DUTs, and one returning the count.

    Performance note:  These methods are potentially expensive:
      * `get_working()`
      * `get_working_list()`
      * `get_broken()`
      * `get_broken_list()`
      * `get_idle()`
      * `get_idle_list()`
    The first call to any of them triggers multiple RPC calls backed
    by relatively expensive database queries.  The query results are
    cached inside the individual `HostJobHistory` objects, so only
    the first call pays the full price.

    Separately from that RPC caching, `get_working_list()`,
    `get_broken_list()` and `get_idle_list()` remember the lists they
    return, so the lists aren't rebuilt on every call.

    The RPC cost is deliberately deferred until an accessor is called
    (instead of being paid in `record_host()`), so that a complete
    `_LabInventory` can be built without making the expensive
    queries.  `_populate_model_counts()`, below, assumes this
    behavior.
    """

    def __init__(self):
        self._histories = []
        self._forget_cached_lists()

    def _forget_cached_lists(self):
        """Discard the cached per-category lists."""
        self._working_list = None
        self._broken_list = None
        self._idle_list = None

    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        Adding a host invalidates the cached category lists.

        @param host_history The `HostJobHistory` object to be
                            remembered.
        """
        self._forget_cached_lists()
        self._histories.append(host_history)

    def get_working_list(self):
        """Return a list of all working DUTs in the pool.

        Selects the entries of `self._histories` whose DUT is
        diagnosed as working.  The list is calculated once, and
        then cached.

        @return A list of HostJobHistory objects.
        """
        if self._working_list is not None:
            return self._working_list
        selected = []
        for history in self._histories:
            if _host_is_working(history):
                selected.append(history)
        self._working_list = selected
        return self._working_list

    def get_working(self):
        """Return the number of working DUTs in the pool."""
        working = self.get_working_list()
        return len(working)

    def get_broken_list(self):
        """Return a list of all broken DUTs in the pool.

        Selects the entries of `self._histories` whose DUT is
        diagnosed as broken.  The list is calculated once, and
        then cached.

        @return A list of HostJobHistory objects.
        """
        if self._broken_list is not None:
            return self._broken_list
        selected = []
        for history in self._histories:
            if _host_is_broken(history):
                selected.append(history)
        self._broken_list = selected
        return self._broken_list

    def get_broken(self):
        """Return the number of broken DUTs in the pool."""
        broken = self.get_broken_list()
        return len(broken)

    def get_idle_list(self):
        """Return a list of all idle DUTs in the pool.

        Selects the entries of `self._histories` whose DUT is
        diagnosed as idle.  The list is calculated once, and then
        cached.

        @return A list of HostJobHistory objects.
        """
        if self._idle_list is not None:
            return self._idle_list
        selected = []
        for history in self._histories:
            if _host_is_idle(history):
                selected.append(history)
        self._idle_list = selected
        return self._idle_list

    def get_idle(self):
        """Return the number of idle DUTs in the pool."""
        idle = self.get_idle_list()
        return len(idle)

    def get_total(self):
        """Return the total number of DUTs in the pool."""
        everything = self._histories
        return len(everything)

    def get_all_histories(self):
        """Return the list of every recorded `HostJobHistory` object.

        The list returned is the internal list, not a copy.
        """
        return self._histories

290

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

291

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

292

class _PoolSetInventory(object):
    """Maintains a set of `HostJobHistory`s for a set of pools.

    The collection is segregated into disjoint categories of "working",
    "broken", and "idle" DUTs.  Accessor methods allow finding both the
    list of DUTs in each category, as well as counts of each category.
    Accessor queries can be for an individual pool, or against all
    pools.

    Performance note:  This class relies on `_HostSetInventory`.  Public
    methods in this class generally rely on methods of the same name in
    the underlying class, and so will have the same underlying
    performance characteristics.
    """

    def __init__(self, pools):
        """Create an empty `_HostSetInventory` for each pool.

        @param pools  Iterable of the pool names to be tracked.
        """
        self._histories_by_pool = {
            pool: _HostSetInventory() for pool in pools
        }

    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        The host is filed under the pool named by its `host_pool`
        attribute; a pool that wasn't passed to the constructor
        raises `KeyError`.

        @param host_history The `HostJobHistory` object to be
                            remembered.
        """
        pool = host_history.host_pool
        self._histories_by_pool[pool].record_host(host_history)

    def _count_pool(self, get_pool_count, pool=None):
        """Internal helper to count hosts in a given pool.

        The `get_pool_count` parameter is a function to calculate
        the exact count of interest for the pool.

        @param get_pool_count  Function to return a count from a
                               `_HostSetInventory` object.
        @param pool            The pool to be counted.  If `None`,
                               return the total across all pools.
        """
        if pool is None:
            return sum(get_pool_count(inventory)
                       for inventory in self._histories_by_pool.values())
        return get_pool_count(self._histories_by_pool[pool])

    def _list_pool(self, get_pool_list, pool=None):
        """Internal helper to list hosts in a given pool.

        @param get_pool_list  Function to return a list of histories
                              from a `_HostSetInventory` object.
        @param pool           The pool to be listed.  If `None`,
                              return a new list combining all pools.

        @return A list of HostJobHistory objects.
        """
        if pool is not None:
            # For a single pool, hand back the underlying object's
            # own list.
            return get_pool_list(self._histories_by_pool[pool])
        combined = []
        for inventory in self._histories_by_pool.values():
            combined.extend(get_pool_list(inventory))
        return combined

    def get_working_list(self, pool=None):
        """Return a list of all working DUTs in the given pool.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.
        """
        return self._list_pool(_HostSetInventory.get_working_list, pool)

    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of working DUTs in the selected
                pool(s).
        """
        return self._count_pool(_HostSetInventory.get_working, pool)

    def get_broken_list(self, pool=None):
        """Return a list of all broken DUTs in the given pool.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.
        """
        return self._list_pool(_HostSetInventory.get_broken_list, pool)

    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of broken DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_broken, pool)

    def get_idle_list(self, pool=None):
        """Return a list of all idle DUTs in the given pool.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.
        """
        return self._list_pool(_HostSetInventory.get_idle_list, pool)

    def get_idle(self, pool=None):
        """Return the number of idle DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of idle DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_idle, pool)

    def get_spares_buffer(self, spare_pool=SPARE_POOL):
        """Return the nominal number of working spares.

        Calculates and returns how many working spares there would
        be in the spares pool if all broken DUTs were in the spares
        pool.  This number may be negative, indicating a shortfall
        in the critical pools.

        @param spare_pool  Name of the pool that holds the spares.

        @return The total number DUTs in the spares pool, less the total
                number of broken DUTs in all pools.
        """
        return self.get_total(spare_pool) - self.get_broken()

    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_total, pool)

    def get_all_histories(self, pool=None):
        """Generate every recorded `HostJobHistory` in a pool.

        This is a generator, so an unknown `pool` only raises
        `KeyError` once iteration starts.

        @param pool  The pool to be traversed.  If `None`, traverse
                     all pools.
        """
        if pool is None:
            inventories = self._histories_by_pool.values()
        else:
            inventories = [self._histories_by_pool[pool]]
        for inventory in inventories:
            for history in inventory.get_all_histories():
                yield history

445

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

446

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

447

def _eligible_host(afehost):
    """Return whether this host is eligible for monitoring.

    To be eligible, a host must carry exactly one 'model' label and
    exactly one 'pool' label, and none of its labels may appear in
    the `_EXCLUDED_LABELS` set.

    @param afehost  The host to be tested for eligibility.
    """
    # A DUT lacking a single, unique 'model' or 'pool' label isn't
    # supposed to be in the managed inventory at all; finding one
    # generally means there's an error in the database.  Regrettably,
    # errors of that kind do turn up from time to time.
    #
    # The _LabInventory constructor expects its hosts to satisfy the
    # label restrictions, and may fail otherwise.  One bad entry
    # shouldn't fail a whole inventory run, so such hosts are
    # filtered out here to keep them away from the inventory.
    model_count = 0
    for label in afehost.labels:
        if label.startswith(constants.Labels.MODEL_PREFIX):
            model_count += 1
    pool_count = 0
    for label in afehost.labels:
        if label.startswith(constants.Labels.POOL_PREFIX):
            pool_count += 1
    forbidden = _EXCLUDED_LABELS.intersection(afehost.labels)
    if model_count != 1:
        return False
    if pool_count != 1:
        return False
    return not forbidden

472

473

474

class _LabInventory(collections.Mapping):

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

475

"""Collection of `HostJobHistory` objects for the Lab's inventory.

476

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

477

This is a dict-like collection indexed by model. Indexing returns

478

the _PoolSetInventory object associated with the model.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

479

"""

480

481

@classmethod

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

482

def create_inventory(cls, afe, start_time, end_time, modellist=[]):

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

483

"""Return a Lab inventory with specified parameters.

484

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

485

By default, gathers inventory from `HostJobHistory` objects for

486

all DUTs in the `MANAGED_POOLS` list. If `modellist` is

487

supplied, the inventory will be restricted to only the given

488

models.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

489

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

490

@param afe AFE object for constructing the

491

`HostJobHistory` objects.

492

@param start_time Start time for the `HostJobHistory` objects.

493

@param end_time End time for the `HostJobHistory` objects.

494

@param modellist List of models to include. If empty,

495

include all available models.

496

@return A `_LabInventory` object for the specified models.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

497

"""

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

498

target_pools = MANAGED_POOLS

499

label_list = [constants.Labels.POOL_PREFIX + l for l in target_pools]

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

500

afehosts = afe.get_hosts(labels__name__in=label_list)

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

501

if modellist:

J. Richard Barnette

2016-03-17 17:03:57 -0700

[diff] [blame]

502

# We're deliberately not checking host eligibility in this

503

# code path. This is a debug path, not used in production;

504

# it may be useful to include ineligible hosts here.

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

505

modelhosts = []

506

for model in modellist:

507

model_label = constants.Labels.MODEL_PREFIX + model

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

508

host_list = [h for h in afehosts

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

509

if model_label in h.labels]

510

modelhosts.extend(host_list)

511

afehosts = modelhosts

J. Richard Barnette

2016-03-17 17:03:57 -0700

[diff] [blame]

512

else:

Richard Barnette

3a40449

2018-02-08 13:57:01 -0800

[diff] [blame]

513

afehosts = [h for h in afehosts if _eligible_host(h)]

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

514

create = lambda host: (

515

status_history.HostJobHistory(afe, host,

516

start_time, end_time))

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

517

return cls([create(host) for host in afehosts], target_pools)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

518

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

519

def __init__(self, histories, pools):
    """Build the per-model inventory from a set of DUT histories.

    Each distinct `host_model` found in `histories` gets its own
    `_PoolSetInventory`, and every history is then recorded with the
    inventory for its model.  The distinct `host_board` values are
    kept as well.

    @param histories  Collection of DUT history objects.  It must
                      support `len()` and repeated iteration.
    @param pools      Pools passed to each `_PoolSetInventory`.
    """
    model_names = set(history.host_model for history in histories)
    self._modeldata = dict(
        (name, _PoolSetInventory(pools)) for name in model_names)
    self._dut_count = len(histories)
    for history in histories:
        self[history.host_model].record_host(history)
    self._boards = set(history.host_board for history in histories)

Prathmesh Prabhu

154cb2b

2017-11-08 17:36:51 -0800

[diff] [blame]

526

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

527

def __getitem__(self, key):

528

return self._modeldata.__getitem__(key)

Prathmesh Prabhu

021e784

2017-11-08 18:05:45 -0800

[diff] [blame]

529

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

530

def __len__(self):

531

return self._modeldata.__len__()

532

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

533

def __iter__(self):

534

return self._modeldata.__iter__()

535

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

536

def get_num_duts(self):
    """Return the DUT count stored in `self._dut_count`.

    @return The total number of DUTs in the inventory.
    """
    return self._dut_count

539

Prathmesh Prabhu

a5a0e3d

2017-11-09 08:53:53 -0800

[diff] [blame]

540

def get_num_models(self):
    """Return how many models the inventory holds.

    @return The length of this inventory, i.e. `len(self)`.
    """
    num_models = len(self)
    return num_models

543

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

544

def get_pool_models(self, pool):
    """Return the set of models that have DUTs in `pool`.

    A model is included when its entry reports a non-zero
    `get_total(pool)`.

    @param pool  The pool to be inventoried for models.
    @return A set of model names.
    """
    models = set()
    for model, poolset in self.iteritems():
        if poolset.get_total(pool):
            models.add(model)
    return models

550

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

551

def get_boards(self):
    """Return the boards stored in `self._boards`.

    @return The stored collection of boards; the object itself is
            returned, not a copy.
    """
    return self._boards

Prathmesh Prabhu

a5a0e3d

2017-11-09 08:53:53 -0800

[diff] [blame]

553

554

Richard Barnette

2018-04-27 13:12:04 -0400

[diff] [blame]

555

def _reportable_models(inventory, spare_pool=SPARE_POOL):
    """Iterate over all models subject to reporting.

    Yields the `(model, poolset)` pairs of `inventory.iteritems()`
    for reportable models only.  A model is reportable when its total
    for `spare_pool` is neither zero nor equal to its overall total,
    i.e. it has DUTs both in `spare_pool` and in some other pool.

    @param inventory   Inventory whose `iteritems()` is filtered.
    @param spare_pool  The spare pool to be tested for reporting.
    """
    for entry in inventory.iteritems():
        poolset = entry[1]
        num_spares = poolset.get_total(spare_pool)
        num_total = poolset.get_total()
        if num_spares == 0 or num_spares == num_total:
            continue
        yield entry

def _all_dut_histories(inventory):

572

for poolset in inventory.itervalues():

573

for h in poolset.get_all_histories():

yield h

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

577

def _sort_by_location(inventory_list):
    """Return a list of DUTs, organized by location.

    The given `HostJobHistory` objects are split into one list per
    lab, and each lab's list is ordered by location:
      * first by row number within the lab,
      * then by rack number within the row,
      * then by host shelf number within the rack.

    Histories whose hostname does not match `_HOSTNAME_PATTERN` are
    left out of the result.

    Implementation note: a location is turned into a single base 100
    number for sorting.  If a row, rack or host number falls outside
    the range [0..99], the ordering is no longer reliable.

    @param inventory_list  Iterable of `HostJobHistory` objects.
    @return A list of sorted lists of DUTs, one list per lab.
    """
    radix = 100
    by_lab = {}
    for history in inventory_list:
        match = _HOSTNAME_PATTERN.match(history.host.hostname)
        if not match:
            continue
        lab, row, rack, shelf = match.group(1, 2, 3, 4)
        # Fold (row, rack, shelf) into one integer, most significant
        # digit first.
        sort_key = (int(row) * radix + int(rack)) * radix + int(shelf)
        by_lab.setdefault(lab, []).append((sort_key, history))
    return [
        [pair[1] for pair in sorted(entries, key=lambda pair: pair[0])]
        for entries in by_lab.values()
    ]

def _score_repair_set(buffer_counts, repair_list):
    """Return a numeric score rating a set of DUTs to be repaired.

    `buffer_counts` is a dictionary mapping model names to the size of
    the model's spares buffer.

    `repair_list` is a list of `HostJobHistory` objects for the DUTs to
    be repaired.

    The function works out the buffer counts that would result from
    the proposed repairs, and rates them with two numbers:
      * The worst case buffer count for any model (higher is better).
        This is the more significant number.
      * The number of models at the worst case (lower is better).
        This is the less significant number.

    Implementation note: The score could fail to reflect the intended
    criteria if there are more than 1000 models in the inventory.

    @param buffer_counts  A dictionary mapping models to buffer counts.
    @param repair_list    A list of `HostJobHistory` objects for the
                          DUTs to be repaired.
    @return A numeric score.
    """
    weight = 1000
    repair_pools = set(h.host_pool for h in repair_list)
    repaired = _LabInventory(repair_list, repair_pools)
    # Buffer count per model after the repairs.  Model names are
    # dropped because they do not contribute to the score; sorting
    # puts the worst (smallest) count first.
    projected = sorted(
        count + (repaired[model].get_total() if model in repaired else 0)
        for model, count in buffer_counts.iteritems())
    worst_count = projected[0]
    num_worst = projected.count(worst_count)
    return weight * worst_count - num_worst

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

664

665

666

def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Broken DUTs are collected from every reportable model that has at
    least one broken DUT, and grouped per lab in location order.
    Within a lab, each run of up to `num_recommend` consecutive DUTs
    is rated with `_score_repair_set()`.  The best rated run over all
    labs becomes the recommendation; on equal scores the candidate
    seen first is kept.  As a consequence:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs will be in the same lab.
      * The DUTs are neighbors in the lab's location ordering.

    @param inventory      `_LabInventory` object from which to generate
                          recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return The recommendation message as a single string.
    """
    logging.debug('Creating DUT repair recommendations')
    buffers = {}
    broken_duts = []
    for model, counts in _reportable_models(inventory):
        logging.debug('Listing failed DUTs for %s', model)
        if counts.get_broken() == 0:
            continue
        buffers[model] = counts.get_spares_buffer()
        broken_duts.extend(counts.get_broken_list())
    recommendation = None
    best_score = None
    for lab_duts in _sort_by_location(broken_duts):
        # The first window is always scored before sliding, so that a
        # lab with fewer than `num_recommend` broken DUTs still
        # produces a candidate.
        offset = 0
        lab_slice = lab_duts[offset : offset + num_recommend]
        lab_score = _score_repair_set(buffers, lab_slice)
        while offset + num_recommend < len(lab_duts):
            offset += 1
            candidate = lab_duts[offset : offset + num_recommend]
            candidate_score = _score_repair_set(buffers, candidate)
            if candidate_score > lab_score:
                lab_slice, lab_score = candidate, candidate_score
        if recommendation is None or lab_score > best_score:
            recommendation, best_score = lab_slice, lab_score
    # N.B. The trailing space in `line_fmt` is mandatory:  Without it,
    # Gmail will parse the URL wrong.
    line_fmt = '%-30s %-16s %-6s\n %s '
    message = ['Repair recommendations:\n',
               line_fmt % ('Hostname', 'Model', 'Servo?', 'Logs URL')]
    if not recommendation:
        message.append('(No DUTs to repair)')
        return '\n'.join(message)
    for dut in recommendation:
        hostname = dut.host.hostname
        servo_name = servo_host.make_servo_hostname(hostname)
        has_servo = utils.host_is_in_lab_zone(servo_name)
        task = _get_diagnosis(dut).task
        message.append(line_fmt % (
            hostname, dut.host_model,
            'Yes' if has_servo else 'No', task.job_url))
    return '\n'.join(message)

739

740

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

741

def _generate_model_inventory_message(inventory):
    """Generate the "model inventory" e-mail message.

    The model inventory is a list by model summarizing the number of
    working, broken, and idle DUTs, and the total shortfall or surplus
    of working devices relative to the minimum critical pool
    requirement.

    Only models yielded by `_reportable_models()` are counted, and
    only models with at least one broken DUT get a row in the
    per-model table.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating model inventory')
    nworking = 0
    nbroken = 0
    nidle = 0
    nbroken_models = 0
    ntotal_models = 0
    summaries = []
    column_names = (
        'Model', 'Avail', 'Bad', 'Idle', 'Good', 'Spare', 'Total')
    for model, counts in _reportable_models(inventory):
        logging.debug('Counting %2d DUTS for model %s',
                      counts.get_total(), model)
        # Summary elements laid out in the same order as the column
        # headers:
        #     Model Avail Bad  Idle Good Spare Total
        #     e[0]  e[1]  e[2] e[3] e[4] e[5]  e[6]
        element = (model,
                   counts.get_spares_buffer(),
                   counts.get_broken(),
                   counts.get_idle(),
                   counts.get_working(),
                   counts.get_total(SPARE_POOL),
                   counts.get_total())
        if element[2]:
            summaries.append(element)
            nbroken_models += 1
        ntotal_models += 1
        nbroken += element[2]
        nidle += element[3]
        nworking += element[4]
    ntotal = nworking + nbroken + nidle
    # Smallest spares buffer first; ties go to the model with more
    # broken DUTs.
    summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
    if ntotal:
        broken_percent = int(round(100.0 * nbroken / ntotal))
        idle_percent = int(round(100.0 * nidle / ntotal))
        # Derived by subtraction so the three figures add up to 100.
        working_percent = 100 - broken_percent - idle_percent
    else:
        # Nothing was counted (e.g. no reportable models); report
        # zeros rather than dividing by zero.
        broken_percent = 0
        idle_percent = 0
        working_percent = 0
    message = ['Summary of DUTs in inventory:',
               '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
               '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
                   nbroken, broken_percent,
                   nidle, idle_percent,
                   nworking, working_percent,
                   ntotal),
               '',
               'Models with failures: %d' % nbroken_models,
               'Models in inventory: %d' % ntotal_models,
               '', '',
               'Full model inventory:\n',
               '%-22s %5s %5s %5s %5s %5s %5s' % column_names]
    message.extend(
        ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
    return '\n'.join(message)

809

810

J. Richard Barnette

4845fcf

2015-04-20 14:26:25 -0700

[diff] [blame]

811

_POOL_INVENTORY_HEADER = '''\

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

812

Notice to Infrastructure deputies: All models shown below are at

J. Richard Barnette

c9a143c

2015-06-04 11:11:19 -0700

[diff] [blame]

813

less than full strength, please take action to resolve the issues.

814

Once you're satisified that failures won't recur, failed DUTs can

815

be replaced with spares by running `balance_pool`. Detailed

816

instructions can be found here:

J. Richard Barnette

4845fcf

2015-04-20 14:26:25 -0700

[diff] [blame]

817

http://go/cros-manage-duts

'''

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

821

def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    The pool inventory has one section per pool in `CRITICAL_POOLS`,
    listing by model the number of broken, idle and working DUTs in
    that pool.  A model appears only if it has at least one broken or
    idle DUT in the pool; rows are ordered with the most broken DUTs
    first.

    N.B. For sample output text formatted as users can expect to see it
    in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating pool inventory')
    row_fmt = '%-20s %5d %5d %5d %5d'
    message = [_POOL_INVENTORY_HEADER]
    for index, pool in enumerate(CRITICAL_POOLS):
        # Every section after the first is preceded by a blank line.
        separator = '\n' if index else ''
        message.append(
            '%sStatus for pool:%s, by model:' % (separator, pool))
        message.append(
            '%-20s %5s %5s %5s %5s' % (
                'Model', 'Bad', 'Idle', 'Good', 'Total'))
        rows = []
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            broken = counts.get_broken(pool)
            idle = counts.get_idle(pool)
            # Models at full strength are not reported.
            if broken or idle:
                working = counts.get_working(pool)
                total = counts.get_total(pool)
                rows.append((model, broken, idle, working, total))
        if not rows:
            message.append('(All models at full strength)')
            continue
        rows.sort(key=lambda row: -row[1])
        for row in rows:
            message.append(row_fmt % row)
    return '\n'.join(message)

863

864

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

865

_IDLE_INVENTORY_HEADER = '''\

866

Notice to Infrastructure deputies: The hosts shown below haven't

867

run any jobs for at least 24 hours. Please check each host; locked

868

hosts should normally be unlocked; stuck jobs should normally be

aborted.

'''

def _generate_idle_inventory_message(inventory):
    """Generate the "idle inventory" e-mail message.

    The idle inventory is a list of idle hosts with their model and
    pool.  Hosts are gathered pool by pool over `MANAGED_POOLS`, from
    each model's `get_idle_list(pool)`.

    N.B. For sample output text format as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating idle inventory')
    row_fmt = '%-30s %-20s %s'
    message = [
        _IDLE_INVENTORY_HEADER,
        'Idle Host List:',
        row_fmt % ('Hostname', 'Model', 'Pool'),
    ]
    rows = []
    for pool in MANAGED_POOLS:
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            for dut in counts.get_idle_list(pool):
                rows.append(row_fmt % (dut.host.hostname, model, pool))
    if rows:
        message.extend(rows)
    else:
        message.append('(No idle DUTs)')
    return '\n'.join(message)

901

902

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

903

def _send_email(arguments, tag, subject, recipients, body):

904

"""Send an inventory e-mail message.

905

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

906

The message is logged in the selected log directory using `tag` for

907

the file name.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

908

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

909

If the --debug option was requested, the message is neither logged

910

nor sent, but merely printed on stdout.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

911

912

@param arguments Parsed command-line options.

913

@param tag Tag identifying the inventory for logging

914

purposes.

915

@param subject E-mail Subject: header line.

916

@param recipients E-mail addresses for the To: header line.

917

@param body E-mail message body.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

918

"""

919

logging.debug('Generating email: "%s"', subject)

920

all_recipients = ', '.join(recipients)

921

report_body = '\n'.join([

922

'To: %s' % all_recipients,

923

'Subject: %s' % subject,

924

'', body, ''])

J. Richard Barnette

2015-10-13 16:02:47 -0700

[diff] [blame]

925

if arguments.debug:

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

926

print report_body

927

else:

928

filename = os.path.join(arguments.logdir, tag)

929

try:

930

report_file = open(filename, 'w')

931

report_file.write(report_body)

932

report_file.close()

933

except EnvironmentError as e:

934

logging.error('Failed to write %s: %s', filename, e)

935

try:

936

gmail_lib.send_email(all_recipients, subject, body)

937

except Exception as e:

938

logging.error('Failed to send e-mail to %s: %s',

all_recipients, e)

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

942

def _populate_model_counts(inventory):

943

"""Gather model counts while providing interactive feedback.

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

944

945

Gathering the status of all individual DUTs in the lab can take

946

considerable time (~30 minutes at the time of this writing).

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

947

Normally, we pay that cost by querying as we go. However, with

948

the `--debug` option, we expect a human being to be watching the

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

949

progress in real time. So, we force the first (expensive) queries

950

to happen up front, and provide simple ASCII output on sys.stdout

951

to show a progress bar and results.

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

952

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

953

@param inventory `_LabInventory` object from which to gather

954

counts.

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

955

"""

956

n = 0

957

total_broken = 0

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

958

for counts in inventory.itervalues():

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

n += 1

if n % 10 == 5:

c = '+'

elif n % 10 == 0:

c = '%d' % ((n / 10) % 10)

else:

c = '.'

sys.stdout.write(c)

sys.stdout.flush()

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

968

# This next call is where all the time goes - it forces all of a

969

# model's `HostJobHistory` objects to query the database and

970

# cache their results.

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

971

total_broken += counts.get_broken()

972

sys.stdout.write('\n')

973

sys.stdout.write('Found %d broken DUTs\n' % total_broken)

974

975

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

976

def _perform_model_inventory(arguments, inventory, timestamp):
    """Perform the model inventory report.

    The report that is sent consists of the following:
      * A list of DUTs that are recommended to be repaired.  This
        part is optional, and only appears if the `--recommend`
        option is present.
      * The model inventory message produced by
        `_generate_model_inventory_message()`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp
                      in logs and email output.
    """
    sections = []
    if arguments.recommend:
        recommendation = _generate_repair_recommendation(
            inventory, arguments.recommend)
        sections.append(recommendation + '\n\n\n')
    sections.append(_generate_model_inventory_message(inventory))
    _send_email(arguments,
                'models-%s.txt' % timestamp,
                'DUT model inventory %s' % timestamp,
                arguments.model_notify,
                ''.join(sections))

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

1003

1004

1005

def _perform_pool_inventory(arguments, inventory, timestamp):
    """Perform the pool inventory report.

    The report that is sent consists of the following, separated by
    blank lines:
      * The pool inventory message produced by
        `_generate_pool_inventory_message()`.
      * The idle inventory message produced by
        `_generate_idle_inventory_message()`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp in
                      logs and email output.
    """
    sections = [
        _generate_pool_inventory_message(inventory),
        _generate_idle_inventory_message(inventory),
    ]
    _send_email(arguments,
                'pools-%s.txt' % timestamp,
                'DUT pool inventory %s' % timestamp,
                arguments.pool_notify,
                '\n\n\n'.join(sections))

1027

1028

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1029

def _dut_in_repair_loop(history):
    """Return whether a DUT's history indicates a repair loop.

    A DUT is considered looping if it runs no tests, and no tasks pass
    other than repair tasks.

    @param history  An instance of `status_history.HostJobHistory` to be
                    scanned for a repair loop.  The caller guarantees
                    that this history corresponds to a working DUT.
    @returns Return a true value if the DUT's most recent history
             indicates a repair loop.  `False` is returned as soon as
             a task rules a loop out; if the history ends before
             `_REPAIR_LOOP_THRESHOLD` repairs are counted, the result
             is `None`.
    """
    # Our caller passes only histories for working DUTs; that means
    # we've already paid the cost of fetching the diagnosis task, and
    # we know that the task was successful.  The diagnosis task will be
    # one of the tasks we must scan to find a loop, so if the task isn't
    # a repair task, then our history includes a successful non-repair
    # task, and we're not looping.
    #
    # The for loop below is very expensive, because it must fetch the
    # full history, regardless of how many tasks we examine.  At the
    # time of this writing, this check against the diagnosis task
    # reduces the cost of finding loops in the full inventory from hours
    # to minutes.
    if _get_diagnosis(history).task.name != 'Repair':
        return False
    successful_repairs = 0
    for task in history:
        if not task.is_special:
            # This is a test, so we're not looping.
            return False
        outcome = task.diagnosis
        if outcome == status_history.BROKEN:
            # Failed a repair, so we're not looping.
            return False
        if task.name != 'Repair':
            if outcome == status_history.WORKING:
                # Non-repair task succeeded, so we're not looping.
                return False
            # A non-repair task that did not succeed neither counts
            # toward the loop nor rules it out.
            continue
        # A repair task that was not diagnosed as broken.
        successful_repairs += 1
        if successful_repairs >= _REPAIR_LOOP_THRESHOLD:
            return True

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

1075

def _report_untestable_dut(history, state):
    """Log one untestable DUT and record it in the presence metric.

    @param history  History object for the DUT; its `hostname`,
                    `host_model` and `host_pool` attributes supply the
                    report fields.
    @param state    String naming why the DUT is considered
                    untestable; reported as the 'state' field.
    """
    fields = dict(
        dut_hostname=history.hostname,
        model=history.host_model,
        pool=history.host_pool,
        state=state,
    )
    # The same mapping feeds both the log message and the metric.
    log_format = ('DUT in state %(state)s: %(dut_hostname)s, '
                  'model: %(model)s, pool: %(pool)s')
    logging.info(log_format, fields)
    _UNTESTABLE_PRESENCE_METRIC.set(True, fields=fields)

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1085

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1086

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

1087

def _report_untestable_dut_metrics(inventory):
    """Find and report every DUT in the inventory that can't run tests.

    A DUT is judged "untestable" when either of these holds:
      * It is stuck in a repair loop; that is, it regularly passes
        repair, but never passes other operations.
      * It runs no tasks at all, but is not locked.

    Each DUT found in one of these states is reported via a Monarch
    presence metric.

    Note:  To make sure that DUTs aren't flagged as "idle" merely
    because there's no work, a separate job runs prior to regular
    inventory runs which schedules trivial work on any DUT that
    appears idle.

    @param inventory  `_LabInventory` object to be reported on.
    """
    logging.info('Scanning for untestable DUTs.')
    for history in _all_dut_histories(inventory):
        # Managed DUTs with names that don't match _HOSTNAME_PATTERN
        # shouldn't be possible.  However, we don't want arbitrary
        # strings being attached to the 'dut_hostname' field, so for
        # safety, we exclude all anomalies.
        if not _HOSTNAME_PATTERN.match(history.hostname):
            continue

        # Classify the DUT; `None` means there is nothing to report.
        untestable_state = None
        if _host_is_working(history):
            if _dut_in_repair_loop(history):
                untestable_state = 'repair_loop'
        elif _host_is_idle(history) and not history.host.locked:
            untestable_state = 'idle_unlocked'

        if untestable_state is not None:
            _report_untestable_dut(history, untestable_state)

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1122

1123

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

1124

def _log_startup(arguments, startup_time):
    """Log the start of this inventory run.

    Emit debug log messages announcing the run and which reports it
    will include.  Return a string derived from `startup_time` that
    identifies this run in log files and e-mail messages.

    @param arguments     Command-line arguments as returned by
                         `ArgumentParser`; `model_notify`,
                         `recommend` and `pool_notify` select which
                         messages are logged.
    @param startup_time  A UNIX timestamp marking the moment when
                         this inventory run began.
    @returns A timestamp string that will be used to identify this run
             in logs and email output.
    """
    run_stamp = time.strftime(_TIMESTAMP_FORMAT,
                              time.localtime(startup_time))
    logging.debug('Starting lab inventory for %s', run_stamp)

    # Collect the announcements first, then log them in order.
    announcements = []
    if arguments.model_notify:
        # Recommendations are only part of the model report.
        if arguments.recommend:
            announcements.append('Will include repair recommendations')
        announcements.append('Will include model inventory')
    if arguments.pool_notify:
        announcements.append('Will include pool inventory')
    for announcement in announcements:
        logging.debug(announcement)
    return run_stamp

def _create_inventory(arguments, end_time):
    """Create the `_LabInventory` instance to use for reporting.

    The inventory covers the `arguments.duration` hours ending at
    `end_time`, restricted to `arguments.modelnames`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    @param end_time   A UNIX timestamp for the end of the time range
                      to be searched in this inventory run.
    @returns The inventory object produced by
             `_LabInventory.create_inventory()`.
    """
    seconds_per_hour = 60 * 60
    start_time = end_time - arguments.duration * seconds_per_hour
    inventory = _LabInventory.create_inventory(
            frontend_wrappers.RetryingAFE(server=None),
            start_time,
            end_time,
            arguments.modelnames)
    host_count = inventory.get_num_duts()
    model_count = inventory.get_num_models()
    logging.info('Found %d hosts across %d models',
                 host_count, model_count)
    return inventory

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1164

def _perform_inventory_reports(arguments):
    """Run every inventory check requested on the command line.

    Create the initial inventory, then run each report whose
    command-line option is set in the parsed arguments.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    """
    began_at = time.time()
    timestamp = _log_startup(arguments, began_at)
    inventory = _create_inventory(arguments, began_at)

    # Each entry pairs an option name with the step it enables.  The
    # option is read immediately before its step runs, and the steps
    # run in the order listed.
    steps = (
        ('debug',
         lambda: _populate_model_counts(inventory)),
        ('model_notify',
         lambda: _perform_model_inventory(arguments, inventory, timestamp)),
        ('pool_notify',
         lambda: _perform_pool_inventory(arguments, inventory, timestamp)),
        ('report_untestable',
         lambda: _report_untestable_dut_metrics(inventory)),
    )
    for option_name, run_step in steps:
        if getattr(arguments, option_name):
            run_step()

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1184

1185

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1186

def _separate_email_addresses(address_list):

1187

"""Parse a list of comma-separated lists of e-mail addresses.

1188

1189

@param address_list A list of strings containing comma

1190

separate e-mail addresses.

1191

@return A list of the individual e-mail addresses.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1192

"""

1193

newlist = []

1194

for arg in address_list:

1195

newlist.extend([email.strip() for email in arg.split(',')])

return newlist

def _verify_arguments(arguments):
    """Validate command-line arguments.

    Merge the comma separated e-mail addresses given in separate
    `--model-notify` and `--pool-notify` option arguments into a
    single list per option.

    For non-debug uses, require that at least one inventory report be
    requested.  For debug, if no report is specified, both e-mail
    reports are enabled.

    The return value indicates success or failure; in the case of
    failure, an error message is also written to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`; `model_notify` and
                      `pool_notify` are updated in place.
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.
    """
    arguments.model_notify = _separate_email_addresses(
            arguments.model_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)

    report_requested = (arguments.model_notify
                        or arguments.pool_notify
                        or arguments.report_untestable)
    if report_requested:
        return True

    if arguments.debug:
        # We want to run all the e-mail reports.  An empty notify
        # list will cause a report to be skipped, so make sure the
        # lists are non-empty.
        arguments.model_notify = ['']
        arguments.pool_notify = ['']
        return True

    sys.stderr.write('Must request at least one report via '
                     '--model-notify, --pool-notify, or '
                     '--report-untestable\n')
    return False

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1235

1236

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

1237

def _get_default_logdir(script):
    """Compute the default directory for the `--logdir` option.

    The default is the `_LOGDIR` entry under the parent of the
    directory that contains this script.

    @param script  Path to this script file.
    @return A path to a directory.
    """
    script_dir = os.path.dirname(os.path.abspath(script))
    parent_dir = os.path.dirname(script_dir)
    return os.path.join(parent_dir, _LOGDIR)

1249

1250

1251

def _parse_command(argv):
    """Parse and validate the command line arguments.

    Build an argument parser for this command's syntax, parse the
    command line, and check the result with `_verify_arguments()`.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args(), or `None`
            if `_verify_arguments()` rejects the arguments.
    """
    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Gather and report lab inventory statistics')
    add_option = parser.add_argument

    duration_help = ('number of hours back to search for status'
                     ' (default: %d)' % _DEFAULT_DURATION)
    add_option('-d', '--duration', type=int,
               default=_DEFAULT_DURATION, metavar='HOURS',
               help=duration_help)

    # Both notify options may be repeated; each use appends one more
    # (possibly comma-separated) string to the option's list.
    add_option('--model-notify', action='append',
               default=[], metavar='ADDRESS',
               help='Generate model inventory message, '
                    'and send it to the given e-mail address(es)')
    add_option('--pool-notify', action='append',
               default=[], metavar='ADDRESS',
               help='Generate pool inventory message, '
                    'and send it to the given address(es)')

    add_option('-r', '--recommend', type=int, default=None,
               help='Specify how many DUTs should be '
                    'recommended for repair (default: no '
                    'recommendation)')
    add_option('--report-untestable', action='store_true',
               help='Check for devices unable to run tests.')
    add_option('--debug', action='store_true',
               help='Print e-mail, metrics messages on stdout '
                    'without sending them.')
    # `store_false` makes `use_metrics` default to True.
    add_option('--no-metrics', action='store_false',
               dest='use_metrics',
               help='Suppress generation of Monarch metrics.')
    add_option('--logdir', default=_get_default_logdir(argv[0]),
               help='Directory where logs will be written.')
    add_option('modelnames', nargs='*',
               metavar='MODEL',
               help='names of models to report on '
                    '(default: all models)')

    arguments = parser.parse_args(argv[1:])
    return arguments if _verify_arguments(arguments) else None

def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--debug` option was provided on
    the command line.
      * Without the option, the root logger captures everything at
        DEBUG level and above in a log file.  Each run writes its own
        file in `arguments.logdir`, named from `_LOGFILE` plus the
        current time formatted with `_TIMESTAMP_FORMAT`.  The
        directory is created if it doesn't exist yet.
      * With the option, we expect stdout to contain other
        human-readable output (including the contents of the e-mail
        messages), so log output goes to stdout and is restricted to
        INFO level.

    In both modes, messages are formatted with `_LOG_FORMAT` and
    `time_utils.TIME_FMT`, and any handlers already installed on the
    root logger are removed.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    """
    root_logger = logging.getLogger()
    if arguments.debug:
        root_logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
    else:
        root_logger.setLevel(logging.DEBUG)
        # The directory must exist before the log file can be created
        # inside it.
        if not os.path.exists(arguments.logdir):
            os.mkdir(arguments.logdir)
        logpath = os.path.join(
                arguments.logdir,
                _LOGFILE + datetime.datetime.today().strftime(
                        _TIMESTAMP_FORMAT))
        # FileHandler opens the file for appending, creating it when
        # necessary; a plain `open(logpath)` would be read-only.
        handler = logging.FileHandler(logpath)
    handler.setFormatter(
            logging.Formatter(_LOG_FORMAT, time_utils.TIME_FMT))
    # TODO(jrbarnette) This is gross.  Importing client.bin.utils
    # implicitly imported logging_config, which calls
    # logging.basicConfig() *at module level*.  That gives us an
    # extra logging handler that we don't want.  So, clear out all
    # the handlers here.
    #
    # Iterate over a copy:  removeHandler() mutates the list, and
    # iterating the live list would skip every other handler.
    for stale_handler in list(root_logger.handlers):
        root_logger.removeHandler(stale_handler)
    root_logger.addHandler(handler)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1345

1346

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1347

def _perform_inventory_reports_with_metrics(arguments):
    """Run the inventory reports inside a ts_mon metrics session.

    The run is timed with a `SecondsTimer`.  Whether or not the
    reports raise, a 'tick' counter is incremented with a 'success'
    field and the metrics are flushed.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    """
    if arguments.debug:
        logging.info('Debug mode: Will not report metrics to monarch.')
        metrics_file = '/dev/null'
    else:
        metrics_file = None
    with site_utils.SetupTsMonGlobalState(
            'lab_inventory', debug_file=metrics_file,
            auto_flush=False):
        success = False
        try:
            with metrics.SecondsTimer('%s/duration' % _METRICS_PREFIX):
                _perform_inventory_reports(arguments)
            success = True
        finally:
            tick = metrics.Counter('%s/tick' % _METRICS_PREFIX)
            tick.increment(fields={'success': success})
            metrics.Flush()


def main(argv):
    """Standard main routine.

    Parse the command line, configure logging, and run the requested
    inventory reports, with or without metrics as selected by
    `arguments.use_metrics`.  Exits with status 1 if the command line
    is rejected.

    @param argv  Command line arguments, including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)

    try:
        if arguments.use_metrics:
            _perform_inventory_reports_with_metrics(arguments)
        else:
            _perform_inventory_reports(arguments)
    except KeyboardInterrupt:
        # An interrupt ends the run without an error report.
        pass
    except Exception:
        # Our cron setup doesn't preserve stderr, so drop extra breadcrumbs.
        logging.exception('Error escaped main')
        raise

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1384

1385

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

1386

def get_inventory(afe):
    """Create an inventory covering the 24 hours ending now.

    @param afe  AFE object passed through to
                `_LabInventory.create_inventory()`.
    @return The inventory object produced by
            `_LabInventory.create_inventory()`.
    """
    seconds_per_day = 24 * 60 * 60
    now = int(time.time())
    return _LabInventory.create_inventory(
            afe, now - seconds_per_day, now)

1390

1391

1392

def get_managed_boards(afe):
    """Return the boards reported by a fresh 24-hour inventory.

    @param afe  AFE object passed through to `get_inventory()`.
    @return The result of calling `get_boards()` on the inventory
            returned by `get_inventory()`.
    """
    inventory = get_inventory(afe)
    return inventory.get_boards()

J. Richard Barnette

aa86893

2015-10-23 13:28:59 -0700

[diff] [blame]

1394

1395

J. Richard Barnette