Blame - site_utils/lab_inventory.py - platform/external/autotest

2015-03-27 17:23:52 -0700

[diff] [blame]

1

#!/usr/bin/env python

2

3

# Use of this source code is governed by a BSD-style license that can be

4

# found in the LICENSE file.

5

6

"""Create e-mail reports of the Lab's DUT inventory.

7

8

Gathers a list of all DUTs of interest in the Lab, segregated by

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

9

model and pool, and determines whether each DUT is working or

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

10

broken. Then, send one or more e-mail reports summarizing the

11

status to e-mail addresses provided on the command line.

12

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

13

usage: lab_inventory.py [ options ] [ model ... ]

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

14

15

Options:

16

--duration / -d <hours>

17

How far back in time to search job history to determine DUT

18

status.

19

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

20

--model-notify <address>[,<address>]

21

Send the "model status" e-mail to all the specified e-mail

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

22

addresses.

23

24

--pool-notify <address>[,<address>]

25

Send the "pool status" e-mail to all the specified e-mail

26

addresses.

27

J. Richard Barnette

2015-06-09 10:06:17 -0700

[diff] [blame]

28

--recommend <number>

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

29

When generating the "model status" e-mail, include a list of

J. Richard Barnette

2015-06-09 10:06:17 -0700

[diff] [blame]

30

<number> specific DUTs to be recommended for repair.

31

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

32

--report-untestable

33

Scan the inventory for DUTs that can't test because they're stuck in

34

repair loops, or because the scheduler can't give them work.

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

35

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

36

--logdir <directory>

37

Log progress and actions in a file under this directory. Text

38

of any e-mail sent will also be logged in a timestamped file in

39

this directory.

40

J. Richard Barnette

2015-10-13 16:02:47 -0700

[diff] [blame]

41

--debug

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

42

Suppress all logging, metrics reporting, and sending e-mail.

43

Instead, write the output that would be generated onto stdout.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

44

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

45

<model> arguments:

46

With no arguments, gathers the status for all models in the lab.

47

With one or more named models on the command line, restricts

48

reporting to just those models.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

"""

import argparse

Prathmesh Prabhu

021e784

2017-11-08 18:05:45 -0800

[diff] [blame]

53

import collections

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

54

import logging

55

import logging.handlers

56

import os

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

57

import re

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

import sys

import time

import common

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

62

from autotest_lib.client.bin import utils

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

63

from autotest_lib.client.common_lib import time_utils

Xixuan Wu

93e646c

2017-12-07 18:36:10 -0800

[diff] [blame]

64

from autotest_lib.server import constants

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

65

from autotest_lib.server import site_utils

J. Richard Barnette

a7c514e

2015-09-15 11:13:23 -0700

[diff] [blame]

66

from autotest_lib.server.cros.dynamic_suite import frontend_wrappers

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

67

from autotest_lib.server.hosts import servo_host

Aviv Keshet

7ee9586

2016-08-30 15:18:27 -0700

[diff] [blame]

68

from autotest_lib.server.lib import status_history

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

69

from autotest_lib.site_utils import gmail_lib

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

70

from chromite.lib import metrics

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

71

72

Richard Barnette

673573b

2016-12-12 09:46:39 -0800

[diff] [blame]

73

CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS

74

SPARE_POOL = constants.Pools.SPARE_POOL

75

MANAGED_POOLS = constants.Pools.MANAGED_POOLS

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

76

J. Richard Barnette

2016-03-17 17:03:57 -0700

[diff] [blame]

77

# _EXCLUDED_LABELS - A set of labels that disqualify a DUT from

Richard Barnette

eabcf39

2017-09-01 15:10:54 -0700

[diff] [blame]

78

# monitoring by this script. Currently, we're excluding these:

79

# + 'adb' - We're not ready to monitor Android or Brillo hosts.

80

# + 'board:guado_moblab' - These are maintained by a separate

81

# process that doesn't use this script.

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

82

Richard Barnette

eabcf39

2017-09-01 15:10:54 -0700

[diff] [blame]

83

_EXCLUDED_LABELS = {'adb', 'board:guado_moblab'}

J. Richard Barnette

2016-03-17 17:03:57 -0700

[diff] [blame]

84

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

85

# _DEFAULT_DURATION:

86

# Default value used for the --duration command line option.

87

# Specifies how far back in time to search in order to determine

88

# DUT status.

89

90

_DEFAULT_DURATION = 24

91

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

92

# _LOGDIR:

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

93

# Relative path used in the calculation of the default setting for

94

# the --logdir option. The full path is relative to the root of the

95

# autotest directory, as determined from sys.argv[0].

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

96

# _LOGFILE:

97

# Basename of a file to which general log information will be

98

# written.

99

# _LOG_FORMAT:

100

# Format string for log messages.

101

102

_LOGDIR = os.path.join('logs', 'dut-data')

103

_LOGFILE = 'lab-inventory.log'

104

_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

105

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

106

# Pattern describing location-based host names in the Chrome OS test

107

# labs. Each DUT hostname designates the DUT's location:

108

# * A lab (room) that's physically separated from other labs

109

# (i.e. there's a door).

110

# * A row (or aisle) of DUTs within the lab.

111

# * A vertical rack of shelves on the row.

112

# * A specific host on one shelf of the rack.

113

114

_HOSTNAME_PATTERN = re.compile(

115

r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')

116

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

117

# _REPAIR_LOOP_THRESHOLD:

118

# The number of repeated Repair tasks that must be seen to declare

119

# that a DUT is stuck in a repair loop.

120

121

_REPAIR_LOOP_THRESHOLD = 4

122

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

123

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

124

_UNTESTABLE_PRESENCE_METRIC = metrics.BooleanMetric(

125

'chromeos/autotest/inventory/untestable',

126

'DUTs that cannot be scheduled for testing')

127

128

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

129

class _HostSetInventory(object):
    """Maintains a set of related `HostJobHistory` objects.

    The collection is segregated into disjoint categories of "working",
    "broken", and "idle" DUTs.  Accessor methods allow finding both the
    list of DUTs in each category, as well as counts of each category.

    Performance note:  Certain methods in this class are potentially
    expensive:
      * `get_working()`
      * `get_working_list()`
      * `get_broken()`
      * `get_broken_list()`
      * `get_idle()`
      * `get_idle_list()`
    The first time any one of these methods is called, it causes
    multiple RPC calls with a relatively expensive set of database
    queries.  However, the results of the queries are cached in the
    individual `HostJobHistory` objects, so only the first call
    actually pays the full cost.

    Additionally, `get_working_list()`, `get_broken_list()` and
    `get_idle_list()` cache their return values to avoid recalculating
    lists at every call; this caching is separate from the caching of
    RPC results described above.

    This class is deliberately constructed to delay the RPC cost until
    the accessor methods are called (rather than to query in
    `record_host()`) so that it's possible to construct a complete
    `_LabInventory` without making the expensive queries at creation
    time.  `_populate_model_counts()`, below, assumes this behavior.

    Current usage of this class is that all DUTs are part of a single
    scheduling pool of DUTs; however, this class makes no assumptions
    about the actual relationship among the DUTs.
    """

    def __init__(self):
        self._histories = []
        # Cached results of the list accessors; `None` means "not yet
        # calculated (or invalidated by `record_host()`)".
        self._working_list = None
        self._broken_list = None
        self._idle_list = None

    def _filter_by_status(self, statuses):
        """Return the recorded histories whose last diagnosis matches.

        @param statuses  Collection of `status_history` diagnosis
                         values to be selected.

        @return A new list of `HostJobHistory` objects, in the order
                they were recorded.
        """
        # `last_diagnosis()` returns a sequence whose first element is
        # the diagnosis; that's the only part needed here.
        return [h for h in self._histories
                if h.last_diagnosis()[0] in statuses]

    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        @param host_history The `HostJobHistory` object to be
                            remembered.
        """
        # Any previously cached list is stale once a host is added.
        self._working_list = None
        self._broken_list = None
        self._idle_list = None
        self._histories.append(host_history)

    def get_working_list(self):
        """Return a list of all working DUTs in the pool.

        Filter `self._histories` for histories where the last
        diagnosis is `WORKING`.

        Cache the result so that we only calculate it once.

        @return A list of HostJobHistory objects.
        """
        if self._working_list is None:
            self._working_list = self._filter_by_status(
                    (status_history.WORKING,))
        return self._working_list

    def get_working(self):
        """Return the number of working DUTs in the pool."""
        return len(self.get_working_list())

    def get_broken_list(self):
        """Return a list of all broken DUTs in the pool.

        Filter `self._histories` for histories where the last
        diagnosis is `BROKEN`.

        Cache the result so that we only calculate it once.

        @return A list of HostJobHistory objects.
        """
        if self._broken_list is None:
            self._broken_list = self._filter_by_status(
                    (status_history.BROKEN,))
        return self._broken_list

    def get_broken(self):
        """Return the number of broken DUTs in the pool."""
        return len(self.get_broken_list())

    def get_idle_list(self):
        """Return a list of all idle DUTs in the pool.

        Filter `self._histories` for histories where the last
        diagnosis is `UNUSED` or `UNKNOWN`.

        Cache the result so that we only calculate it once.

        @return A list of HostJobHistory objects.
        """
        if self._idle_list is None:
            # Only build the status collection on a cache miss.
            self._idle_list = self._filter_by_status(
                    (status_history.UNUSED, status_history.UNKNOWN))
        return self._idle_list

    def get_idle(self):
        """Return the number of idle DUTs in the pool."""
        return len(self.get_idle_list())

    def get_total(self):
        """Return the total number of DUTs in the pool."""
        return len(self._histories)

244

245

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

246

class _PoolSetInventory(object):
    """Maintains a set of `HostJobHistory`s for a set of pools.

    The collection is segregated into disjoint categories of "working",
    "broken", and "idle" DUTs.  Accessor methods allow finding both the
    list of DUTs in each category, as well as counts of each category.
    Accessor queries can be for an individual pool, or against all
    pools.

    Performance note:  This class relies on `_HostSetInventory`.  Public
    methods in this class generally rely on methods of the same name in
    the underlying class, and so will have the same underlying
    performance characteristics.
    """

    def __init__(self, pools):
        """Create an empty inventory for the given pools.

        @param pools  Iterable of pool names; one empty
                      `_HostSetInventory` is created for each.
        """
        self._histories_by_pool = {
            pool: _HostSetInventory() for pool in pools
        }

    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        The history is filed under its `host_pool`; a pool that was not
        passed to the constructor raises `KeyError`.

        @param host_history The `HostJobHistory` object to be
                            remembered.
        """
        pool = host_history.host_pool
        self._histories_by_pool[pool].record_host(host_history)

    def _count_pool(self, get_pool_count, pool=None):
        """Internal helper to count hosts in a given pool.

        The `get_pool_count` parameter is a function to calculate
        the exact count of interest for the pool.

        @param get_pool_count  Function to return a count from a
                               `_HostSetInventory` object.
        @param pool            The pool to be counted.  If `None`,
                               return the total across all pools.
        """
        if pool is not None:
            return get_pool_count(self._histories_by_pool[pool])
        return sum(get_pool_count(host_set)
                   for host_set in self._histories_by_pool.values())

    def _list_pool(self, get_pool_list, pool=None):
        """Internal helper to list hosts in a given pool.

        @param get_pool_list  Function to return a list of histories
                              from a `_HostSetInventory` object.
        @param pool           The pool to be listed.  If `None`,
                              return the combined list across all
                              pools.

        @return A list of HostJobHistory objects.
        """
        if pool is not None:
            return get_pool_list(self._histories_by_pool[pool])
        combined = []
        for host_set in self._histories_by_pool.values():
            combined.extend(get_pool_list(host_set))
        return combined

    def get_working_list(self, pool=None):
        """Return a list of all working DUTs in the given pool.

        Go through all HostJobHistory objects in the given pool,
        selecting the ones where the last diagnosis is `WORKING`.

        @param pool  The pool to be listed.  If `None`, return the
                     combined list across all pools.

        @return A list of HostJobHistory objects.
        """
        return self._list_pool(_HostSetInventory.get_working_list, pool)

    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of working DUTs in the selected
                pool(s).
        """
        return self._count_pool(_HostSetInventory.get_working, pool)

    def get_broken_list(self, pool=None):
        """Return a list of all broken DUTs in the given pool.

        Go through all HostJobHistory objects in the given pool,
        selecting the ones where the last diagnosis is `BROKEN`.

        @param pool  The pool to be listed.  If `None`, return the
                     combined list across all pools.

        @return A list of HostJobHistory objects.
        """
        return self._list_pool(_HostSetInventory.get_broken_list, pool)

    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of broken DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_broken, pool)

    def get_idle_list(self, pool=None):
        """Return a list of all idle DUTs in the given pool.

        Go through all HostJobHistory objects in the given pool,
        selecting the ones where the last diagnosis is `UNUSED` or
        `UNKNOWN`.

        @param pool  The pool to be listed.  If `None`, return the
                     combined list across all pools.

        @return A list of HostJobHistory objects.
        """
        return self._list_pool(_HostSetInventory.get_idle_list, pool)

    def get_idle(self, pool=None):
        """Return the number of idle DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of idle DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_idle, pool)

    def get_spares_buffer(self, spare_pool=SPARE_POOL):
        """Return the nominal number of working spares.

        Calculates and returns how many working spares there would
        be in the spares pool if all broken DUTs were in the spares
        pool.  This number may be negative, indicating a shortfall
        in the critical pools.

        @param spare_pool  The pool to be treated as the spares pool.

        @return The total number DUTs in the spares pool, less the total
                number of broken DUTs in all pools.
        """
        return self.get_total(spare_pool) - self.get_broken()

    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_total, pool)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

390

391

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

392

def _eligible_host(afehost):
    """Return whether this host is eligible for monitoring.

    To be eligible, a host must carry exactly one 'model' label and
    exactly one 'pool' label, and must carry none of the labels in
    the `_EXCLUDED_LABELS` set.

    @param afehost  The host to be tested for eligibility.
    """
    # A DUT lacking a unique 'model' or 'pool' label doesn't belong in
    # the managed inventory; it generally means there's an error in the
    # database, and such errors do show up from time to time.
    #
    # The _LabInventory constructor requires hosts to conform to the
    # label restrictions, and may fail if they don't.  Rather than
    # fail a whole inventory run over one bad entry, the offending
    # hosts are filtered out here so they never reach the inventory.
    model_labels = []
    pool_labels = []
    for label in afehost.labels:
        if label.startswith(constants.Labels.MODEL_PREFIX):
            model_labels.append(label)
    for label in afehost.labels:
        if label.startswith(constants.Labels.POOL_PREFIX):
            pool_labels.append(label)
    disqualifying = _EXCLUDED_LABELS.intersection(afehost.labels)
    return not (len(model_labels) != 1
                or len(pool_labels) != 1
                or disqualifying)

417

418

419

# The `Mapping` ABC moved to `collections.abc` in Python 3.3, and the
# old `collections.Mapping` alias was removed in Python 3.10.  Fall
# back to the old location so the module still loads under Python 2.
try:
    from collections.abc import Mapping as _Mapping
except ImportError:
    from collections import Mapping as _Mapping


class _LabInventory(_Mapping):
    """Collection of `HostJobHistory` objects for the Lab's inventory.

    This is a dict-like collection indexed by model.  Indexing returns
    the _PoolSetInventory object associated with the model.
    """

    @classmethod
    def create_inventory(cls, afe, start_time, end_time, modellist=None):
        """Return a Lab inventory with specified parameters.

        By default, gathers inventory from `HostJobHistory` objects for
        all DUTs in the `MANAGED_POOLS` list.  If `modellist` is
        supplied, the inventory will be restricted to only the given
        models.

        @param afe          AFE object for constructing the
                            `HostJobHistory` objects.
        @param start_time   Start time for the `HostJobHistory` objects.
        @param end_time     End time for the `HostJobHistory` objects.
        @param modellist    List of models to include.  If empty or
                            `None`, include all available models.
        @return A `_LabInventory` object for the specified models.
        """
        target_pools = MANAGED_POOLS
        label_list = [constants.Labels.POOL_PREFIX + pool
                      for pool in target_pools]
        afehosts = afe.get_hosts(labels__name__in=label_list)
        if modellist:
            # We're deliberately not checking host eligibility in this
            # code path.  This is a debug path, not used in production;
            # it may be useful to include ineligible hosts here.
            modelhosts = []
            for model in modellist:
                model_label = constants.Labels.MODEL_PREFIX + model
                modelhosts.extend(h for h in afehosts
                                  if model_label in h.labels)
            afehosts = modelhosts
        else:
            afehosts = [h for h in afehosts if _eligible_host(h)]
        histories = [
            status_history.HostJobHistory(afe, host, start_time, end_time)
            for host in afehosts
        ]
        return cls(histories, target_pools)

    def __init__(self, histories, pools):
        """Build the inventory from a list of histories.

        @param histories  List of `HostJobHistory` objects to be
                          recorded, grouped by their `host_model`.
        @param pools      The pools passed to each model's
                          `_PoolSetInventory`.
        """
        models = {h.host_model for h in histories}
        self._modeldata = {model: _PoolSetInventory(pools)
                           for model in models}
        self._dut_count = len(histories)
        for h in histories:
            self[h.host_model].record_host(h)
        self._boards = {h.host_board for h in histories}

    def __getitem__(self, key):
        return self._modeldata.__getitem__(key)

    def __len__(self):
        return self._modeldata.__len__()

    def __iter__(self):
        return self._modeldata.__iter__()

    def reportable_items(self, spare_pool=SPARE_POOL):
        """Iterate over all items subject to reporting.

        Yields the contents of `self.items()` filtered to include
        only reportable models.  A model is reportable if it has DUTs in
        both `spare_pool` and at least one other pool.

        @param spare_pool  The spare pool to be tested for reporting.
        """
        # `items()` (rather than the Python 2 only `iteritems()`) is
        # provided by the `Mapping` ABC on both Python 2 and Python 3.
        for model, histories in self.items():
            spares = histories.get_total(spare_pool)
            total = histories.get_total()
            if spares != 0 and spares != total:
                yield model, histories

    def get_num_duts(self):
        """Return the total number of DUTs in the inventory."""
        return self._dut_count

    def get_num_models(self):
        """Return the total number of models in the inventory."""
        return len(self)

    def get_pool_models(self, pool):
        """Return all models in `pool`.

        @param pool  The pool to be inventoried for models.

        @return The set of models with at least one DUT in `pool`.
        """
        return {model for model, histories in self.items()
                if histories.get_total(pool)}

    def get_boards(self):
        """Return the set of `host_board` values seen in the inventory."""
        return self._boards

Prathmesh Prabhu

a5a0e3d

2017-11-09 08:53:53 -0800

[diff] [blame]

513

514

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

515

def _sort_by_location(inventory_list):
    """Group DUTs by lab, and order each group by physical location.

    The given `HostJobHistory` objects are split into one list per
    lab.  Within a lab, DUTs are ordered
      * first by row number within the lab,
      * then by rack number within the row,
      * then by host shelf number within the rack.

    DUTs whose hostname doesn't match `_HOSTNAME_PATTERN` are left
    out of the result.

    Implementation note: the row, rack and host numbers are folded
    into a single base 100 number used as the sort key.  If any of
    them falls outside the range [0..99], the ordering will break
    down.

    @param inventory_list  List of `HostJobHistory` objects to sort.
    @return A list of sorted lists of DUTs, one inner list per lab.
    """
    BASE = 100
    duts_by_lab = {}
    for dut_history in inventory_list:
        matched = _HOSTNAME_PATTERN.match(dut_history.host.hostname)
        if not matched:
            continue
        # Fold (row, rack, host) into one integer, most significant
        # field first.
        sort_key = 0
        for field in matched.group(2, 3, 4):
            sort_key = sort_key * BASE + int(field)
        lab_name = matched.group(1)
        if lab_name not in duts_by_lab:
            duts_by_lab[lab_name] = []
        duts_by_lab[lab_name].append((sort_key, dut_history))
    sorted_labs = []
    for keyed_duts in duts_by_lab.values():
        # Sort on the numeric key only; the history objects
        # themselves are never compared.
        ordered = sorted(keyed_duts, key=lambda pair: pair[0])
        sorted_labs.append([pair[1] for pair in ordered])
    return sorted_labs

def _score_repair_set(buffer_counts, repair_list):
    """Rate a proposed set of DUT repairs with a single number.

    The function works out what each model's spares buffer would be
    once every DUT in `repair_list` has been repaired, and then
    scores that outcome using two quantities:
      * The worst case buffer count for any model (higher is
        better).  This is the more significant quantity.
      * The number of models sitting at that worst case (lower is
        better).  This is the less significant quantity.

    Implementation note:  The score could fail to reflect the
    intended criteria if there are more than 1000 models in the
    inventory.

    @param buffer_counts  A dictionary mapping model names to the
                          size of the model's spares buffer.
    @param repair_list    A list of `HostJobHistory` objects for the
                          DUTs to be repaired.
    @return A numeric score; larger scores are better.
    """
    _NMODELS = 1000
    repair_pools = {h.host_pool for h in repair_list}
    repair_inventory = _LabInventory(repair_list, repair_pools)
    # Post-repair buffer count for every model.  Model names are
    # dropped, because only the counts matter to the score.
    new_counts = []
    for model, current in buffer_counts.iteritems():
        repaired = 0
        if model in repair_inventory:
            repaired = repair_inventory[model].get_total()
        new_counts.append(current + repaired)
    # Locate the smallest post-repair count, then tally how many
    # models share it.
    worst_count = new_counts[0]
    for candidate in new_counts[1:]:
        if candidate < worst_count:
            worst_count = candidate
    num_worst = sum(1 for candidate in new_counts
                    if candidate == worst_count)
    return _NMODELS * worst_count - num_worst

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

602

603

604

def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Returns a message recommending a list of broken DUTs to be
    repaired.  The list of DUTs is selected based on these criteria:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical
        proximity.
      * DUTs for models with a low spares buffer are more important
        than DUTs with larger buffers.

    The algorithm used will guarantee that at least one DUT from a
    model with the lowest spares buffer will be recommended.  If the
    worst spares buffer number is shared by more than one model, the
    algorithm will tend to prefer repair sets that include more of
    those models over sets that cover fewer models.

    If there are no broken DUTs to choose from, the message contains
    the header followed by a single placeholder line rather than a
    list of hosts.

    @param inventory      `_LabInventory` object from which to
                          generate recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation message.
    """
    logging.debug('Creating DUT repair recommendations')
    model_buffer_counts = {}
    broken_list = []
    for model, counts in inventory.reportable_items():
        logging.debug('Listing failed DUTs for %s', model)
        if counts.get_broken() != 0:
            model_buffer_counts[model] = counts.get_spares_buffer()
            broken_list.extend(counts.get_broken_list())
    # N.B. The logic inside this loop may seem complicated, but
    # simplification is hard:
    #   * Calculating an initial recommendation outside of
    #     the loop likely would make things more complicated,
    #     not less.
    #   * It's necessary to calculate an initial lab slice once per
    #     lab _before_ the while loop, in case the number of broken
    #     DUTs in a lab is less than `num_recommend`.
    recommendation = None
    best_score = None
    for lab_duts in _sort_by_location(broken_list):
        start = 0
        end = num_recommend
        lab_slice = lab_duts[start : end]
        lab_score = _score_repair_set(model_buffer_counts, lab_slice)
        # Slide a window of `num_recommend` adjacent DUTs across the
        # lab, keeping the best-scoring window.
        while end < len(lab_duts):
            start += 1
            end += 1
            new_slice = lab_duts[start : end]
            new_score = _score_repair_set(model_buffer_counts, new_slice)
            if new_score > lab_score:
                lab_slice = new_slice
                lab_score = new_score
        if recommendation is None or lab_score > best_score:
            recommendation = lab_slice
            best_score = lab_score
    # N.B. The trailing space in `line_fmt` is mandatory:  Without it,
    # Gmail will parse the URL wrong.  Don't ask.  If you simply _must_
    # know more, go try it yourself...
    line_fmt = '%-30s %-16s %-6s\n %s '
    message = ['Repair recommendations:\n',
               line_fmt % ('Hostname', 'Model', 'Servo?', 'Logs URL')]
    if not recommendation:
        # No lab produced any candidates (e.g. nothing is broken), so
        # `recommendation` is still None; iterating it would raise
        # TypeError.
        message.append('(No DUTs to repair)')
        return '\n'.join(message)
    for h in recommendation:
        servo_name = servo_host.make_servo_hostname(h.host.hostname)
        servo_present = utils.host_is_in_lab_zone(servo_name)
        _, event = h.last_diagnosis()
        line = line_fmt % (
                h.host.hostname, h.host_model,
                'Yes' if servo_present else 'No', event.job_url)
        message.append(line)
    return '\n'.join(message)

674

675

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

676

def _generate_model_inventory_message(inventory):
    """Generate the "model inventory" e-mail message.

    The model inventory is a list by model summarizing the number of
    working, broken, and idle DUTs, and the total shortfall or
    surplus of working devices relative to the minimum critical pool
    requirement.

    Only models produced by `inventory.reportable_items()` are
    considered.  The overall totals cover every such model, while
    the per-model table lists only the models that have at least one
    broken DUT.

    If the inventory has no reportable DUTs at all, every percentage
    in the summary is reported as 0.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating model inventory')
    nworking = 0
    nbroken = 0
    nidle = 0
    nbroken_models = 0
    ntotal_models = 0
    summaries = []
    column_names = (
        'Model', 'Avail', 'Bad', 'Idle', 'Good', 'Spare', 'Total')
    for model, counts in inventory.reportable_items():
        logging.debug('Counting %2d DUTS for model %s',
                      counts.get_total(), model)
        # Summary elements laid out in the same order as the column
        # headers:
        #     Model Avail Bad Idle Good Spare Total
        #     e[0]  e[1]  e[2] e[3] e[4] e[5]  e[6]
        element = (model,
                   counts.get_spares_buffer(),
                   counts.get_broken(),
                   counts.get_idle(),
                   counts.get_working(),
                   counts.get_total(SPARE_POOL),
                   counts.get_total())
        if element[2]:
            summaries.append(element)
            nbroken_models += 1
        ntotal_models += 1
        nbroken += element[2]
        nidle += element[3]
        nworking += element[4]
    ntotal = nworking + nbroken + nidle
    # Smallest spares buffer first; ties broken by most broken DUTs.
    summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
    if ntotal:
        broken_percent = int(round(100.0 * nbroken / ntotal))
        idle_percent = int(round(100.0 * nidle / ntotal))
        # Derived by subtraction so that the three figures always
        # add up to exactly 100.
        working_percent = 100 - broken_percent - idle_percent
    else:
        # Nothing to report on; avoid dividing by zero.
        broken_percent = idle_percent = working_percent = 0
    message = ['Summary of DUTs in inventory:',
               '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
               '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
                   nbroken, broken_percent,
                   nidle, idle_percent,
                   nworking, working_percent,
                   ntotal),
               '',
               'Models with failures: %d' % nbroken_models,
               'Models in inventory: %d' % ntotal_models,
               '', '',
               'Full model inventory:\n',
               '%-22s %5s %5s %5s %5s %5s %5s' % column_names]
    message.extend(
            ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
    return '\n'.join(message)

744

745

J. Richard Barnette

4845fcf

2015-04-20 14:26:25 -0700

[diff] [blame]

746

# Preamble placed at the top of the "pool inventory" e-mail message.
_POOL_INVENTORY_HEADER = (
    "Notice to Infrastructure deputies: All models shown below are at\n"
    "less than full strength, please take action to resolve the issues.\n"
    "Once you're satisified that failures won't recur, failed DUTs can\n"
    "be replaced with spares by running `balance_pool`. Detailed\n"
    "instructions can be found here:\n"
    "http://go/cros-manage-duts\n"
)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

756

def _generate_pool_inventory_message(inventory):
    """Build the text of the "pool inventory" e-mail message.

    For each pool in `CRITICAL_POOLS`, the message holds a table by
    model of the number of broken, idle and working DUTs in that
    pool.  A model appears in a pool's table only if it has at least
    one broken or idle DUT in the pool; rows are ordered with the
    most broken DUTs first.

    N.B. For sample output text formatted as users can expect to see
    it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating pool inventory')
    header_fmt = '%-20s %5s %5s %5s %5s'
    row_fmt = '%-20s %5d %5d %5d %5d'
    message = [_POOL_INVENTORY_HEADER]
    for index, pool in enumerate(CRITICAL_POOLS):
        # Every pool section after the first is set off by a blank
        # line.
        separator = '\n' if index else ''
        message.append(
                '%sStatus for pool:%s, by model:' % (separator, pool))
        message.append(
                header_fmt % ('Model', 'Bad', 'Idle', 'Good', 'Total'))
        rows = []
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            broken = counts.get_broken(pool)
            idle = counts.get_idle(pool)
            # models at full strength are not reported
            if broken or idle:
                rows.append((model, broken, idle,
                             counts.get_working(pool),
                             counts.get_total(pool)))
        if not rows:
            message.append('(All models at full strength)')
            continue
        rows.sort(key=lambda row: -row[1])
        message.extend([row_fmt % row for row in rows])
    return '\n'.join(message)

798

799

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

800

# Preamble placed at the top of the "idle inventory" e-mail message.
_IDLE_INVENTORY_HEADER = (
    "Notice to Infrastructure deputies: The hosts shown below haven't\n"
    "run any jobs for at least 24 hours. Please check each host; locked\n"
    "hosts should normally be unlocked; stuck jobs should normally be\n"
    "aborted.\n"
)

def _generate_idle_inventory_message(inventory):
    """Build the text of the "idle inventory" e-mail message.

    The idle inventory is a list of hosts with their corresponding
    model and pool, where the hosts are idle (`UNKNOWN` or `UNUSED`).
    Hosts are gathered for every pool in `MANAGED_POOLS`.

    N.B. For sample output text format as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating idle inventory')
    row_fmt = '%-30s %-20s %s'
    message = [_IDLE_INVENTORY_HEADER,
               'Idle Host List:',
               row_fmt % ('Hostname', 'Model', 'Pool')]
    idle_rows = []
    for pool in MANAGED_POOLS:
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            for dut in counts.get_idle_list(pool):
                idle_rows.append((dut.host.hostname, model, pool))
    if not idle_rows:
        message.append('(No idle DUTs)')
    else:
        message.extend([row_fmt % row for row in idle_rows])
    return '\n'.join(message)

836

837

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

838

def _send_email(arguments, tag, subject, recipients, body):
    """Send an inventory e-mail message.

    The message is logged in the selected log directory using `tag`
    for the file name.

    If the --debug option was requested, the message is neither
    logged nor sent, but merely printed on stdout.

    Failures to write the log file or to send the e-mail are logged
    as errors, and do not raise.

    @param arguments   Parsed command-line options.
    @param tag         Tag identifying the inventory for logging
                       purposes.
    @param subject     E-mail Subject: header line.
    @param recipients  E-mail addresses for the To: header line.
    @param body        E-mail message body.
    """
    logging.debug('Generating email: "%s"', subject)
    all_recipients = ', '.join(recipients)
    report_body = '\n'.join([
            'To: %s' % all_recipients,
            'Subject: %s' % subject,
            '', body, ''])
    if arguments.debug:
        # Parenthesized single argument: prints identically whether
        # `print` is a statement or a function.
        print(report_body)
        return
    filename = os.path.join(arguments.logdir, tag)
    try:
        # The context manager closes the file even when the write
        # fails part way through.
        with open(filename, 'w') as report_file:
            report_file.write(report_body)
    except EnvironmentError as e:
        logging.error('Failed to write %s: %s', filename, e)
    # Sending is attempted even if the local log copy couldn't be
    # written.
    try:
        gmail_lib.send_email(all_recipients, subject, body)
    except Exception as e:
        logging.error('Failed to send e-mail to %s: %s',
                      all_recipients, e)

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

877

def _populate_model_counts(inventory):
    """Gather model counts while providing interactive feedback.

    Gathering the status of all individual DUTs in the lab can take
    considerable time (~30 minutes at the time of this writing).
    Normally, we pay that cost by querying as we go.  However, with
    the `--debug` option, we expect a human being to be watching the
    progress in real time.  So, we force the first (expensive)
    queries to happen up front, and provide simple ASCII output on
    sys.stdout to show a progress bar and results.

    One character is written per model:  '+' for the 5th, 15th, ...
    model, the tens digit of the running count for every 10th model,
    and '.' otherwise.

    @param inventory  `_LabInventory` object from which to gather
                      counts.
    """
    total_broken = 0
    for index, counts in enumerate(inventory.itervalues(), 1):
        remainder = index % 10
        if remainder == 0:
            mark = '%d' % ((index // 10) % 10)
        elif remainder == 5:
            mark = '+'
        else:
            mark = '.'
        sys.stdout.write(mark)
        sys.stdout.flush()
        # This next call is where all the time goes - it forces all
        # of a model's `HostJobHistory` objects to query the database
        # and cache their results.
        total_broken += counts.get_broken()
    sys.stdout.write('\n')
    sys.stdout.write('Found %d broken DUTs\n' % total_broken)

909

910

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame]

911

def _perform_model_inventory(arguments, inventory, timestamp):
    """Produce and send the model inventory report.

    The model inventory report consists of the following:
      * A list of DUTs that are recommended to be repaired.  This
        list is optional, and only appears if the `--recommend`
        option is present.
      * A list of all models that have failed DUTs, with counts
        of working, broken, and spare DUTs, among others.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp
                      in logs and email output.
    """
    report = ''
    if arguments.recommend:
        # The recommendation section, when requested, precedes the
        # inventory and is set off by blank lines.
        report = _generate_repair_recommendation(
                inventory, arguments.recommend) + '\n\n\n'
    report = report + _generate_model_inventory_message(inventory)
    log_tag = 'models-%s.txt' % timestamp
    subject = 'DUT model inventory %s' % timestamp
    _send_email(arguments, log_tag, subject,
                arguments.model_notify, report)

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

938

939

940

def _perform_pool_inventory(arguments, inventory, timestamp):
    """Produce and send the pool inventory report.

    The pool inventory report consists of the following:
      * A list of all critical pools that have failed DUTs, with
        counts of working, broken, and idle DUTs.
      * A list of all idle DUTs by hostname including the model and
        pool.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp
                      in logs and email output.
    """
    sections = [_generate_pool_inventory_message(inventory),
                _generate_idle_inventory_message(inventory)]
    log_tag = 'pools-%s.txt' % timestamp
    subject = 'DUT pool inventory %s' % timestamp
    _send_email(arguments, log_tag, subject,
                arguments.pool_notify, '\n\n\n'.join(sections))

962

963

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

964

def _dut_in_repair_loop(history):
    """Return whether a DUT's history indicates a repair loop.

    A DUT is considered looping if it runs no tests, and no tasks
    pass other than repair tasks.

    @param history  An instance of `status_history.HostJobHistory` to
                    be scanned for a repair loop.  The caller
                    guarantees that this history corresponds to a
                    working DUT.
    @returns `True` if the DUT's most recent history indicates a
             repair loop, `False` otherwise.
    """
    # Our caller passes only histories for working DUTs; that means
    # we've already paid the cost of fetching the diagnosis task, and
    # we know that the task was successful.  The diagnosis task will
    # be one of the tasks we must scan to find a loop, so if the task
    # isn't a repair task, then our history includes a successful
    # non-repair task, and we're not looping.
    #
    # The for loop below is very expensive, because it must fetch the
    # full history, regardless of how many tasks we examine.  At the
    # time of this writing, this check against the diagnosis task
    # reduces the cost of finding loops in the full inventory from
    # hours to minutes.
    if history.last_diagnosis()[1].name != 'Repair':
        return False
    repair_ok_count = 0
    for task in history:
        if not task.is_special:
            # This is a test, so we're not looping.
            return False
        if task.diagnosis == status_history.BROKEN:
            # Failed a repair, so we're not looping.
            return False
        if (task.diagnosis == status_history.WORKING
                and task.name != 'Repair'):
            # Non-repair task succeeded, so we're not looping.
            return False
        # At this point, we have either a failed non-repair task, or
        # a successful repair.
        if task.name == 'Repair':
            repair_ok_count += 1
            if repair_ok_count >= _REPAIR_LOOP_THRESHOLD:
                return True
    # The history ran out before enough successful repairs were
    # seen.  Say so explicitly, rather than falling off the end and
    # returning None.
    return False

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

1010

def _report_untestable_dut(history, state):
    """Log one untestable DUT, and record it in the presence metric.

    @param history  History object for the DUT.  Its `hostname`,
                    `host_model` and `host_pool` attributes are
                    reported.
    @param state    Value to record in the metric's 'state' field.
    """
    fields = dict(
        dut_hostname=history.hostname,
        model=history.host_model,
        pool=history.host_pool,
        state=state)
    logging.info('Untestable DUT: %(dut_hostname)s, model: %(model)s, '
                 'pool: %(pool)s', fields)
    _UNTESTABLE_PRESENCE_METRIC.set(True, fields=fields)

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1020

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1021

Richard Barnette

2018-04-09 16:45:58 -0700

[diff] [blame]

1022

def _report_repair_loop_metrics(inventory):
    """Find and report DUTs stuck in a repair loop.

    Scans the working DUTs of every entry in `inventory`, and reports
    each DUT identified as being in a repair loop with the state
    'repair_loop'.

    @param inventory  `_LabInventory` object to be reported on.
    """
    logging.info('Scanning for DUTs in repair loops.')
    for counts in inventory.itervalues():
        for history in counts.get_working_list():
            # Managed DUTs with names that don't match
            # _HOSTNAME_PATTERN shouldn't be possible.  However, we
            # don't want arbitrary strings being attached to the
            # 'dut_hostname' field, so for safety, we exclude all
            # anomalies.
            if (_HOSTNAME_PATTERN.match(history.hostname)
                    and _dut_in_repair_loop(history)):
                _report_untestable_dut(history, 'repair_loop')

1042

1043

1044

def _report_idle_dut_metrics(inventory):
    """Report every idle DUT that is not locked.

    Walk the idle-DUT list of each entry in `inventory`, and report
    (with state 'idle_unlocked') each DUT whose host is not locked.

    @param inventory  `_LabInventory` object to be reported on.
    """
    logging.info('Scanning for idle, unlocked DUTs.')
    for dut_counts in inventory.itervalues():
        for dut_history in dut_counts.get_idle_list():
            # A managed DUT whose name fails _HOSTNAME_PATTERN should
            # not exist.  Still, the hostname ends up in the
            # 'dut_hostname' metric field, and arbitrary strings are
            # not welcome there, so anomalous names are skipped.
            name_ok = _HOSTNAME_PATTERN.match(dut_history.hostname)
            if name_ok and not dut_history.host.locked:
                _report_untestable_dut(dut_history, 'idle_unlocked')

1064

1065

1066

def _report_untestable_dut_metrics(inventory):
    """Report all DUTs in the inventory that cannot run tests.

    A DUT counts as "untestable" when either of these holds:
      * It is stuck in a repair loop: it regularly passes repair,
        but never passes other operations.
      * It runs no tasks at all, yet is not locked.

    Each condition is checked by its own scan over `inventory`; the
    repair-loop scan runs first, then the idle scan.  Findings are
    reported via a Monarch presence metric.

    Note:  So that a DUT isn't called "idle" merely for lack of work,
    a separate job runs ahead of regular inventory runs and schedules
    trivial work on any DUT that appears idle.

    @param inventory  `_LabInventory` object to be reported on.
    """
    scans = (_report_repair_loop_metrics, _report_idle_dut_metrics)
    for run_scan in scans:
        run_scan(inventory)

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1088

1089

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

1090

def _log_startup(arguments, startup_time):
    """Announce the start of an inventory run in the log.

    Emits debug messages naming the run and the reports it will
    include, and produces the string that identifies this run in log
    files and e-mail messages.

    @param arguments     Command-line arguments as returned by
                         `ArgumentParser`; `model_notify`,
                         `recommend`, and `pool_notify` are consulted.
    @param startup_time  A UNIX timestamp marking the moment when
                         this inventory run began.
    @returns A timestamp string (local time, hour resolution) that
             will be used to identify this run in logs and email
             output.
    """
    local_start = time.localtime(startup_time)
    timestamp = time.strftime('%Y-%m-%d.%H', local_start)
    logging.debug('Starting lab inventory for %s', timestamp)

    wants_model_report = arguments.model_notify
    if wants_model_report:
        # Repair recommendations are only mentioned alongside the
        # model report.
        if arguments.recommend:
            logging.debug('Will include repair recommendations')
        logging.debug('Will include model inventory')

    wants_pool_report = arguments.pool_notify
    if wants_pool_report:
        logging.debug('Will include pool inventory')

    return timestamp

def _create_inventory(arguments, end_time):
    """Build the `_LabInventory` instance used for reporting.

    The searched time range ends at `end_time` and reaches back
    `arguments.duration` hours.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`; `duration` and `modelnames`
                      are consulted.
    @param end_time   A UNIX timestamp for the end of the time range
                      to be searched in this inventory run.
    @returns The object returned by `_LabInventory.create_inventory()`.
    """
    lookback_seconds = arguments.duration * 60 * 60
    start_time = end_time - lookback_seconds
    afe = frontend_wrappers.RetryingAFE(server=None)
    inventory = _LabInventory.create_inventory(
            afe, start_time, end_time, arguments.modelnames)
    num_duts = inventory.get_num_duts()
    num_models = inventory.get_num_models()
    logging.info('Found %d hosts across %d models', num_duts, num_models)
    return inventory

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1130

def _perform_inventory_reports(arguments):
    """Run every inventory report requested on the command line.

    Builds the inventory once, then runs each report selected by the
    parsed command-line arguments against it.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    """
    # One timestamp serves as both the run identifier and the end of
    # the searched time range.
    run_started = time.time()
    run_label = _log_startup(arguments, run_started)
    inventory = _create_inventory(arguments, run_started)

    if arguments.debug:
        _populate_model_counts(inventory)

    if arguments.model_notify:
        _perform_model_inventory(arguments, inventory, run_label)

    if arguments.pool_notify:
        _perform_pool_inventory(arguments, inventory, run_label)

    if arguments.report_untestable:
        _report_untestable_dut_metrics(inventory)

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1150

1151

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1152

def _separate_email_addresses(address_list):
    """Flatten comma-separated address strings into one list.

    Each string in `address_list` is split on commas, and surrounding
    whitespace is stripped from every piece.  Pieces are kept in
    order; empty pieces are not filtered out.

    @param address_list  A list of strings containing comma
                         separated e-mail addresses.
    @return A list of the individual e-mail addresses.
    """
    return [piece.strip()
            for joined in address_list
            for piece in joined.split(',')]

def _verify_arguments(arguments):
    """Check the parsed command line for semantic validity.

    The `--model-notify` and `--pool-notify` values, which may each
    hold several comma separated addresses per option occurrence, are
    first flattened into plain lists of addresses (stored back on
    `arguments`).

    Outside of debug mode, at least one report must have been
    requested.  In debug mode, requesting no report means "run all
    the e-mail reports."

    On failure, an error message is written to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.
    """
    arguments.model_notify = _separate_email_addresses(
            arguments.model_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)

    requested = (arguments.model_notify,
                 arguments.pool_notify,
                 arguments.report_untestable)
    if any(requested):
        return True

    if arguments.debug:
        # Nothing was asked for, so run all the e-mail reports.  A
        # report with an empty notify list is skipped, so give each
        # list a placeholder entry to keep it non-empty.
        arguments.model_notify = ['']
        arguments.pool_notify = ['']
        return True

    sys.stderr.write('Must request at least one report via '
                     '--model-notify, --pool-notify, or '
                     '--report-untestable\n')
    return False

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1201

1202

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

1203

def _get_default_logdir(script):
    """Compute the default value of the `--logdir` option.

    The result is `_LOGDIR` joined onto the parent of the directory
    that holds `script`.

    @param script  Path to this script file.
    @return A path to a directory.
    """
    script_dir = os.path.dirname(os.path.abspath(script))
    parent_dir = os.path.dirname(script_dir)
    return os.path.join(parent_dir, _LOGDIR)

1215

1216

1217

def _parse_command(argv):
    """Build the command-line parser and parse `argv` with it.

    Declares every option this command accepts, parses the command
    line, and then runs the result through `_verify_arguments()`.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args(), or `None`
            if `_verify_arguments()` rejects the arguments.
    """
    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Gather and report lab inventory statistics')

    duration_help = ('number of hours back to search for status'
                     ' (default: %d)' % _DEFAULT_DURATION)
    parser.add_argument('-d', '--duration',
                        type=int,
                        default=_DEFAULT_DURATION,
                        metavar='HOURS',
                        help=duration_help)

    model_notify_help = ('Generate model inventory message, '
                         'and send it to the given e-mail address(es)')
    parser.add_argument('--model-notify',
                        action='append',
                        default=[],
                        metavar='ADDRESS',
                        help=model_notify_help)

    pool_notify_help = ('Generate pool inventory message, '
                        'and send it to the given address(es)')
    parser.add_argument('--pool-notify',
                        action='append',
                        default=[],
                        metavar='ADDRESS',
                        help=pool_notify_help)

    recommend_help = ('Specify how many DUTs should be '
                      'recommended for repair (default: no '
                      'recommendation)')
    parser.add_argument('-r', '--recommend',
                        type=int,
                        default=None,
                        help=recommend_help)

    parser.add_argument('--report-untestable',
                        action='store_true',
                        help='Check for devices unable to run tests.')

    debug_help = ('Print e-mail, metrics messages on stdout '
                  'without sending them.')
    parser.add_argument('--debug',
                        action='store_true',
                        help=debug_help)

    parser.add_argument('--logdir',
                        default=_get_default_logdir(argv[0]),
                        help='Directory where logs will be written.')

    modelnames_help = ('names of models to report on '
                       '(default: all models)')
    parser.add_argument('modelnames',
                        nargs='*',
                        metavar='MODEL',
                        help=modelnames_help)

    arguments = parser.parse_args(argv[1:])
    return arguments if _verify_arguments(arguments) else None

def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--debug` option was provided on
    the command line.
      * Without the option, we configure the logging to capture all
        potentially relevant events (DEBUG level) in a log file under
        `arguments.logdir`, creating that directory if needed.  The
        log file rotates weekly (`when='W4'`) and keeps 13 backups,
        i.e. ~3 months worth of history.
      * With the option, we expect stdout to contain other
        human-readable output (including the contents of the e-mail
        messages), so we restrict the output to INFO level.

    For convenience, when `--debug` is on, the logging format has
    no adornments, so that a call like `logging.info(msg)` simply writes
    `msg` to stdout, plus a trailing newline.

    On return, the root logger has exactly one handler:  the one
    created here.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    """
    root_logger = logging.getLogger()
    if arguments.debug:
        root_logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter())
    else:
        if not os.path.exists(arguments.logdir):
            os.mkdir(arguments.logdir)
        root_logger.setLevel(logging.DEBUG)
        logfile = os.path.join(arguments.logdir, _LOGFILE)
        handler = logging.handlers.TimedRotatingFileHandler(
                logfile, when='W4', backupCount=13)
        formatter = logging.Formatter(_LOG_FORMAT,
                                      time_utils.TIME_FMT)
        handler.setFormatter(formatter)
    # TODO(jrbarnette) This is gross.  Importing client.bin.utils
    # implicitly imported logging_config, which calls
    # logging.basicConfig() *at module level*.  That gives us an
    # extra logging handler that we don't want.  So, clear out all
    # the handlers here.
    #
    # Iterate over a copy:  removeHandler() mutates
    # `root_logger.handlers`, and removing entries from the list being
    # iterated skips every other handler, leaving stale ones installed.
    for stale_handler in list(root_logger.handlers):
        root_logger.removeHandler(stale_handler)
    root_logger.addHandler(handler)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1307

1308

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1309

def main(argv):
    """Entry point:  parse the command line and run the reports.

    Exits with status 1 if the command line is rejected.  Metrics are
    flushed when the reports finish, whether or not they succeeded.

    @param argv  Command line arguments, including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)

    # In debug mode, metrics are diverted to /dev/null rather than
    # being reported.
    metrics_file = None
    if arguments.debug:
        logging.info('--debug mode: Will not report metrics to monarch')
        metrics_file = '/dev/null'

    ts_mon_state = site_utils.SetupTsMonGlobalState(
            'lab_inventory', debug_file=metrics_file,
            auto_flush=False)
    with ts_mon_state:
        try:
            _perform_inventory_reports(arguments)
        except KeyboardInterrupt:
            # An interrupted run ends quietly.
            pass
        except Exception:
            # Our cron setup doesn't preserve stderr, so drop extra breadcrumbs.
            logging.exception('Error escaped main')
            raise
        finally:
            metrics.Flush()

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1338

1339

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

1340

def get_inventory(afe):
    """Create a `_LabInventory` covering the most recent 24 hours.

    @param afe  AFE object handed to
                `_LabInventory.create_inventory()`.
    @returns The object returned by `_LabInventory.create_inventory()`.
    """
    seconds_per_day = 24 * 60 * 60
    now = int(time.time())
    day_ago = now - seconds_per_day
    return _LabInventory.create_inventory(afe, day_ago, now)

1344

1345

1346

def get_managed_boards(afe):
    """Return the boards found in a fresh inventory.

    @param afe  AFE object handed to `get_inventory()`.
    @returns Whatever `get_boards()` returns for the inventory built
             by `get_inventory()`.
    """
    inventory = get_inventory(afe)
    return inventory.get_boards()

J. Richard Barnette

aa86893

2015-10-23 13:28:59 -0700

[diff] [blame]

1348

1349

J. Richard Barnette