Blame - site_utils/lab_inventory.py - platform/external/autotest

2015-03-27 17:23:52 -0700

[diff] [blame]

1

#!/usr/bin/env python

2

3

# Use of this source code is governed by a BSD-style license that can be

4

# found in the LICENSE file.

5

6

"""Create e-mail reports of the Lab's DUT inventory.

7

8

Gathers a list of all DUTs of interest in the Lab, segregated by

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame^]

9

model and pool, and determines whether each DUT is working or

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

10

broken. Then, send one or more e-mail reports summarizing the

11

status to e-mail addresses provided on the command line.

12

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame^]

13

usage: lab_inventory.py [ options ] [ model ... ]

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

14

15

Options:

16

--duration / -d <hours>

17

How far back in time to search job history to determine DUT

18

status.

19

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame^]

20

--model-notify <address>[,<address>]

21

Send the "model status" e-mail to all the specified e-mail

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

22

addresses.

23

24

--pool-notify <address>[,<address>]

25

Send the "pool status" e-mail to all the specified e-mail

26

addresses.

27

J. Richard Barnette

2015-06-09 10:06:17 -0700

[diff] [blame]

28

--recommend <number>

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame^]

29

When generating the "model status" e-mail, include a list of

J. Richard Barnette

2015-06-09 10:06:17 -0700

[diff] [blame]

30

<number> specific DUTs to be recommended for repair.

31

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

32

--repair-loops

33

Scan the inventory for DUTs stuck in repair loops, and report them

34

via a Monarch presence metric.

35

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

36

--logdir <directory>

37

Log progress and actions in a file under this directory. Text

38

of any e-mail sent will also be logged in a timestamped file in

39

this directory.

40

J. Richard Barnette

2015-10-13 16:02:47 -0700

[diff] [blame]

41

--debug

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

42

Suppress all logging, metrics reporting, and sending e-mail.

43

Instead, write the output that would be generated onto stdout.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

44

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame^]

45

<model> arguments:

46

With no arguments, gathers the status for all models in the lab.

47

With one or more named models on the command line, restricts

48

reporting to just those models.

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

"""

import argparse

Prathmesh Prabhu

021e784

2017-11-08 18:05:45 -0800

[diff] [blame]

54

import collections

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

55

import logging

56

import logging.handlers

57

import os

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

58

import re

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

import sys

import time

import common

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

63

from autotest_lib.client.bin import utils

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

64

from autotest_lib.client.common_lib import time_utils

Xixuan Wu

93e646c

2017-12-07 18:36:10 -0800

[diff] [blame]

65

from autotest_lib.server import constants

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

66

from autotest_lib.server import site_utils

J. Richard Barnette

a7c514e

2015-09-15 11:13:23 -0700

[diff] [blame]

67

from autotest_lib.server.cros.dynamic_suite import frontend_wrappers

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

68

from autotest_lib.server.hosts import servo_host

Aviv Keshet

7ee9586

2016-08-30 15:18:27 -0700

[diff] [blame]

69

from autotest_lib.server.lib import status_history

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

70

from autotest_lib.site_utils import gmail_lib

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

71

from chromite.lib import metrics

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

72

73

Richard Barnette

673573b

2016-12-12 09:46:39 -0800

[diff] [blame]

74

# Pool definitions are shared with the rest of the server code; they
# are re-exported here under short names for use in this module.
# SPARE_POOL is the default spare pool for the reporting code below,
# and MANAGED_POOLS is the set of pools gathered into the inventory.
MANAGED_POOLS = constants.Pools.MANAGED_POOLS
SPARE_POOL = constants.Pools.SPARE_POOL
CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

77

J. Richard Barnette

2016-03-17 17:03:57 -0700

[diff] [blame]

78

# _EXCLUDED_LABELS:
#     Any DUT carrying one of these labels is left out of monitoring
#     by this script.  The labels currently excluded are:
#       + 'adb' - Android and Brillo hosts aren't monitored yet.
#       + 'board:guado_moblab' - These hosts are looked after by a
#         separate process that doesn't use this script.

_EXCLUDED_LABELS = {'adb', 'board:guado_moblab'}

# _DEFAULT_DURATION:
#     Value of the --duration command line option when none is given,
#     i.e. how many hours of job history are searched in order to
#     determine DUT status.

_DEFAULT_DURATION = 24

# _LOGDIR:
#     Path fragment from which the default for the --logdir option is
#     derived.  It is interpreted relative to the root of the autotest
#     directory, which is located by way of sys.argv[0].
# _LOGFILE:
#     Name (without directory) of the file that receives general log
#     output.
# _LOG_FORMAT:
#     The format string applied to each log message.

_LOGDIR = os.path.join('logs', 'dut-data')
_LOGFILE = 'lab-inventory.log'
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

# _HOSTNAME_PATTERN:
#     Matches the location-based host names used in the Chrome OS test
#     labs.  A DUT's hostname encodes where the DUT lives:
#       * the lab, a room physically separated from the other labs
#         (i.e. there's a door);
#       * the row (or aisle) of DUTs within that lab;
#       * the vertical rack of shelves on that row;
#       * the specific host on one shelf of that rack.
#     The four components are captured as groups 1 through 4.

_HOSTNAME_PATTERN = re.compile(r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')

# _REPAIR_LOOP_THRESHOLD:
#     How many repeated Repair tasks have to be observed before a DUT
#     is declared to be stuck in a repair loop.

_REPAIR_LOOP_THRESHOLD = 4

123

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

124

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame^]

125

class _HostSetInventory(object):
    """Holds a group of related `HostJobHistory` objects.

    The recorded histories are partitioned into the disjoint
    categories "working", "broken", and "idle".  For each category
    there is one accessor returning the list of DUTs and one returning
    just the count.

    Performance note: The following methods can be costly:
      * `get_working()`
      * `get_working_list()`
      * `get_broken()`
      * `get_broken_list()`
      * `get_idle()`
      * `get_idle_list()`
    Each of them needs `last_diagnosis()` for every recorded history.
    On first use that entails multiple RPC calls backed by relatively
    expensive database queries.  The `HostJobHistory` objects cache
    the query results, so the full price is paid only once.

    On top of that, the three `get_*_list()` methods remember the list
    they computed, so the filtering isn't repeated on every call.  That
    cache is independent of the RPC result caching mentioned above, and
    is dropped whenever `record_host()` adds a new history.

    No queries are made in `record_host()`; the RPC cost is deliberately
    deferred to the accessors.  This makes it possible to build a
    complete `_LabInventory` without paying for the expensive queries
    at creation time.  `_populate_model_counts()` depends on this.

    As currently used, all DUTs in one instance belong to a single
    scheduling pool.  The class itself, however, doesn't assume any
    particular relationship among its DUTs.

    """

    def __init__(self):
        # `None` in a cache slot means "not calculated yet".
        self._idle_list = None
        self._broken_list = None
        self._working_list = None
        self._histories = []


    def record_host(self, host_history):
        """Remember one more `HostJobHistory` object.

        Adding a host invalidates all cached category lists.

        @param host_history  The `HostJobHistory` object to be added
                             to the collection.

        """
        self._histories.append(host_history)
        self._working_list = self._broken_list = self._idle_list = None


    def get_working_list(self):
        """Return a list of all working DUTs in the pool.

        Selects the entries of `self._histories` whose last diagnosis
        is `WORKING`.  The result is cached, so that it's calculated
        only once.

        @return A list of HostJobHistory objects.

        """
        if self._working_list is not None:
            return self._working_list
        selected = []
        for history in self._histories:
            if history.last_diagnosis()[0] == status_history.WORKING:
                selected.append(history)
        self._working_list = selected
        return selected


    def get_working(self):
        """Return the number of working DUTs in the pool."""
        working = self.get_working_list()
        return len(working)


    def get_broken_list(self):
        """Return a list of all broken DUTs in the pool.

        Selects the entries of `self._histories` whose last diagnosis
        is `BROKEN`.  The result is cached, so that it's calculated
        only once.

        @return A list of HostJobHistory objects.

        """
        if self._broken_list is not None:
            return self._broken_list
        selected = []
        for history in self._histories:
            if history.last_diagnosis()[0] == status_history.BROKEN:
                selected.append(history)
        self._broken_list = selected
        return selected


    def get_broken(self):
        """Return the number of broken DUTs in the pool."""
        broken = self.get_broken_list()
        return len(broken)


    def get_idle_list(self):
        """Return a list of all idle DUTs in the pool.

        Selects the entries of `self._histories` whose last diagnosis
        is either `UNUSED` or `UNKNOWN`.  The result is cached, so that
        it's calculated only once.

        @return A list of HostJobHistory objects.

        """
        idle_statuses = {status_history.UNUSED, status_history.UNKNOWN}
        if self._idle_list is not None:
            return self._idle_list
        selected = []
        for history in self._histories:
            if history.last_diagnosis()[0] in idle_statuses:
                selected.append(history)
        self._idle_list = selected
        return selected


    def get_idle(self):
        """Return the number of idle DUTs in the pool."""
        idle = self.get_idle_list()
        return len(idle)


    def get_total(self):
        """Return the total number of DUTs in the pool."""
        return len(self._histories)

252

253

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame^]

254

class _PoolSetInventory(object):
    """Maintains a set of `HostJobHistory`s for a set of pools.

    The collection is segregated into disjoint categories of "working",
    "broken", and "idle" DUTs.  Accessor methods allow finding both the
    list of DUTs in each category, as well as counts of each category.
    Count queries can be for an individual pool, or against all pools.

    Internally there is one `_HostSetInventory` per pool name.

    Performance note:  This class relies on `_HostSetInventory`.  Public
    methods in this class generally rely on methods of the same name in
    the underlying class, and so will have the same underlying
    performance characteristics.

    """

    def __init__(self, pools):
        """Create an empty inventory for the given pools.

        @param pools  Iterable of the pool names to be tracked.  Only
                      hosts from these pools can be recorded.

        """
        self._histories_by_pool = {
            pool: _HostSetInventory() for pool in pools
        }


    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        The object is filed under the pool named by its `host_pool`
        attribute.  A `KeyError` is raised if that pool isn't one of
        the pools supplied to the constructor.

        @param host_history  The `HostJobHistory` object to be
                             remembered.

        """
        pool = host_history.host_pool
        self._histories_by_pool[pool].record_host(host_history)


    def _count_pool(self, get_pool_count, pool=None):
        """Internal helper to count hosts in a given pool.

        The `get_pool_count` parameter is a function to calculate
        the exact count of interest for the pool.

        @param get_pool_count  Function to return a count from a
                               `_HostSetInventory` object.
        @param pool            The pool to be counted.  If `None`,
                               return the total across all pools.

        """
        if pool is None:
            return sum(get_pool_count(inventory)
                       for inventory in self._histories_by_pool.values())
        return get_pool_count(self._histories_by_pool[pool])


    def get_working_list(self):
        """Return a list of all working DUTs (across all pools).

        Go through all HostJobHistory objects across all pools,
        selecting the ones where the last diagnosis is `WORKING`.

        @return A list of HostJobHistory objects.

        """
        working = []
        for inventory in self._histories_by_pool.values():
            working.extend(inventory.get_working_list())
        return working


    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of working DUTs in the selected
                pool(s).
        """
        return self._count_pool(_HostSetInventory.get_working, pool)


    def get_broken_list(self):
        """Return a list of all broken DUTs (across all pools).

        Go through all HostJobHistory objects across all pools,
        selecting the ones where the last diagnosis is `BROKEN`.

        @return A list of HostJobHistory objects.

        """
        broken = []
        for inventory in self._histories_by_pool.values():
            broken.extend(inventory.get_broken_list())
        return broken


    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of broken DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_broken, pool)


    def get_idle_list(self, pool=None):
        """Return a list of all idle DUTs in the given pool.

        Go through all HostJobHistory objects in the given pool,
        selecting the ones where the last diagnosis is `UNUSED` or
        `UNKNOWN`.

        @param pool: The pool to be listed.  If `None`, return the
                     combined list across all pools.

        @return A list of HostJobHistory objects.
        """
        if pool is not None:
            return self._histories_by_pool[pool].get_idle_list()
        idle = []
        # Use `values()` like the sibling methods rather than the
        # Python-2-only `itervalues()`.
        for inventory in self._histories_by_pool.values():
            idle.extend(inventory.get_idle_list())
        return idle


    def get_idle(self, pool=None):
        """Return the number of idle DUTs in a pool.

        @param pool: The pool to be counted.  If `None`, return the total
                     across all pools.

        @return The total number of idle DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_idle, pool)


    def get_spares_buffer(self, spare_pool=SPARE_POOL):
        """Return the nominal number of working spares.

        Calculates and returns how many working spares there would
        be in the spares pool if all broken DUTs were in the spares
        pool.  This number may be negative, indicating a shortfall
        in the critical pools.

        @param spare_pool  Name of the pool holding the spares.

        @return The total number DUTs in the spares pool, less the total
                number of broken DUTs in all pools.
        """
        return self.get_total(spare_pool) - self.get_broken()


    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_total, pool)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

412

413

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame^]

414

def _eligible_host(afehost):
    """Decide whether a host should be monitored.

    To be eligible, a host must carry exactly one 'model' label and
    exactly one 'pool' label, and none of its labels may appear in
    the `_EXCLUDED_LABELS` set.

    @param afehost  The host to be tested for eligibility.

    @return `True` if the host is eligible, `False` otherwise.
    """
    # A DUT that lacks a 'model' or 'pool' label, or that has more
    # than one of either, doesn't belong in the managed inventory;
    # usually it's a sign of an error in the database.  Sadly, such
    # errors do show up from time to time.
    #
    # The _LabInventory constructor expects hosts to obey the label
    # restrictions, and may fail on hosts that don't.  Aborting an
    # entire inventory run because of one bad entry would be the
    # wrong thing, so the misfits are filtered out here, before they
    # reach the inventory.
    def _labels_with(prefix):
        # All labels of the host that begin with `prefix`.
        return [label for label in afehost.labels
                if label.startswith(prefix)]

    model_labels = _labels_with(constants.Labels.MODEL_PREFIX)
    pool_labels = _labels_with(constants.Labels.POOL_PREFIX)
    banned = _EXCLUDED_LABELS.intersection(afehost.labels)
    if len(model_labels) != 1:
        return False
    if len(pool_labels) != 1:
        return False
    return not banned

439

440

441

class _LabInventory(collections.Mapping):
    """Collection of `HostJobHistory` objects for the Lab's inventory.

    This is a dict-like collection indexed by model.  Indexing returns
    the _PoolSetInventory object associated with the model.
    """

    @classmethod
    def create_inventory(cls, afe, start_time, end_time, modellist=None):
        """Return a Lab inventory with specified parameters.

        By default, gathers inventory from `HostJobHistory` objects for
        all DUTs in the `MANAGED_POOLS` list.  If `modellist` is
        supplied, the inventory will be restricted to only the given
        models.

        @param afe          AFE object for constructing the
                            `HostJobHistory` objects.
        @param start_time   Start time for the `HostJobHistory` objects.
        @param end_time     End time for the `HostJobHistory` objects.
        @param modellist    List of models to include.  If empty or
                            `None`, include all available models.
        @return A `_LabInventory` object for the specified models.

        """
        target_pools = MANAGED_POOLS
        label_list = [constants.Labels.POOL_PREFIX + l for l in target_pools]
        afehosts = afe.get_hosts(labels__name__in=label_list)
        if modellist:
            # We're deliberately not checking host eligibility in this
            # code path.  This is a debug path, not used in production;
            # it may be useful to include ineligible hosts here.
            modelhosts = []
            for model in modellist:
                model_label = constants.Labels.MODEL_PREFIX + model
                modelhosts.extend(h for h in afehosts
                                  if model_label in h.labels)
            afehosts = modelhosts
        else:
            # `_eligible_host()` is a module-level function, not an
            # attribute of this class; calling it through `cls` would
            # raise AttributeError.
            afehosts = [h for h in afehosts if _eligible_host(h)]
        histories = [
            status_history.HostJobHistory(afe, host, start_time, end_time)
            for host in afehosts
        ]
        return cls(histories, target_pools)


    def __init__(self, histories, pools):
        """Build the inventory from a list of histories.

        @param histories  List of `HostJobHistory` objects, one per
                          DUT to be included in the inventory.
        @param pools      The pool names to be tracked for every
                          model; passed on to `_PoolSetInventory`.

        """
        models = {h.host_model for h in histories}
        self._modeldata = {model: _PoolSetInventory(pools) for model in models}
        self._dut_count = len(histories)
        for h in histories:
            self[h.host_model].record_host(h)
        self._boards = {h.host_board for h in histories}


    def __getitem__(self, key):
        return self._modeldata.__getitem__(key)


    def __len__(self):
        return self._modeldata.__len__()


    def __iter__(self):
        return self._modeldata.__iter__()


    def reportable_items(self, spare_pool=SPARE_POOL):
        """Iterate over all items subject to reporting.

        Yields the contents of `self.iteritems()` filtered to include
        only reportable models.  A model is reportable if it has DUTs in
        both `spare_pool` and at least one other pool.

        @param spare_pool  The spare pool to be tested for reporting.
        """
        for model, histories in self.iteritems():
            spares = histories.get_total(spare_pool)
            total = histories.get_total()
            # Skip models with no spares at all, and models whose DUTs
            # are all in the spare pool.
            if spares != 0 and spares != total:
                yield model, histories


    def get_num_duts(self):
        """Return the total number of DUTs in the inventory."""
        return self._dut_count


    def get_num_models(self):
        """Return the total number of models in the inventory."""
        return len(self)


    def get_pool_models(self, pool):
        """Return all models in `pool`.

        @param pool  The pool to be inventoried for models.

        @return The set of models with at least one DUT in `pool`.
        """
        return {m for m, h in self.iteritems() if h.get_total(pool)}


    def get_boards(self):
        """Return the set of boards of all DUTs in the inventory."""
        return self._boards

Prathmesh Prabhu

a5a0e3d

2017-11-09 08:53:53 -0800

[diff] [blame]

545

546

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

547

def _sort_by_location(inventory_list):
    """Return a list of DUTs, organized by location.

    Take the given list of `HostJobHistory` objects, separate it
    into a list per lab, and sort each lab's list by location.  The
    order of sorting within a lab is
      * By row number within the lab,
      * then by rack number within the row,
      * then by host shelf number within the rack.

    Return a list of the sorted lists.  DUTs whose hostname doesn't
    match `_HOSTNAME_PATTERN` are left out of the result.  The order
    of the per-lab lists follows dictionary iteration order.

    Implementation note: host locations are compared as
    `(row, rack, host)` tuples of integers, so the ordering is
    correct for numbers of any size.

    @param inventory_list  A list of `HostJobHistory` objects.

    @return A list of sorted lists of DUTs.
    """
    lab_lists = {}
    for history in inventory_list:
        location = _HOSTNAME_PATTERN.match(history.host.hostname)
        if not location:
            continue
        lab = location.group(1)
        # Tuples compare element by element, which gives row-major
        # ordering without packing the numbers into a single integer.
        key = tuple(int(idx) for idx in location.group(2, 3, 4))
        lab_lists.setdefault(lab, []).append((key, history))
    return_list = []
    for dut_list in lab_lists.values():
        # Sort on the location key only; the sort is stable, so DUTs
        # with equal keys keep their input order.
        dut_list.sort(key=lambda t: t[0])
        return_list.append([t[1] for t in dut_list])
    return return_list

def _score_repair_set(buffer_counts, repair_list):
    """Return a numeric score rating a set of DUTs to be repaired.

    `buffer_counts` is a dictionary mapping model names to the size of
    the model's spares buffer.

    `repair_list` is a list of `HostJobHistory` objects for the DUTs to
    be repaired.

    This function calculates the new set of buffer counts that would
    result from the proposed repairs, and scores the new set using two
    numbers:
      * Worst case buffer count for any model (higher is better).  This
        is the more significant number for comparison.
      * Number of models at the worst case (lower is better).  This is
        the less significant number.

    Implementation note:  The score could fail to reflect the intended
    criteria if there are more than 1000 models in the inventory.

    @param buffer_counts  A dictionary mapping models to buffer counts.
                          Must not be empty.
    @param repair_list    A list of `HostJobHistory` objects for the
                          DUTs to be repaired.
    @return A numeric score.
    """
    _NMODELS = 1000
    pools = {h.host_pool for h in repair_list}
    repair_inventory = _LabInventory(repair_list, pools)
    # Compute the buffer count each model would have after the
    # repairs.  The model names are discarded, as they don't
    # contribute to the final score.
    new_counts = []
    for model, count in buffer_counts.iteritems():
        if model in repair_inventory:
            repaired = repair_inventory[model].get_total()
        else:
            repaired = 0
        new_counts.append(count + repaired)
    # Find the worst available spares count, and count how many
    # models are at that worst case.
    worst_count = min(new_counts)
    num_worst = new_counts.count(worst_count)
    return _NMODELS * worst_count - num_worst

J. Richard Barnette

2015-06-01 16:00:35 -0700

[diff] [blame]

635

636

637

def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Returns a message recommending a list of broken DUTs to be repaired.
    The list of DUTs is selected based on these criteria:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical proximity.
      * DUTs for models with a low spares buffer are more important than
        DUTs with larger buffers.

    The algorithm used will guarantee that at least one DUT from a model
    with the lowest spares buffer will be recommended.  If the worst
    spares buffer number is shared by more than one model, the algorithm
    will tend to prefer repair sets that include more of those models
    over sets that cover fewer models.

    If there are no broken DUTs to choose from, the message consists
    of the heading lines only.

    @param inventory      `_LabInventory` object from which to generate
                          recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation message.
    """
    logging.debug('Creating DUT repair recommendations')
    model_buffer_counts = {}
    broken_list = []
    for model, counts in inventory.reportable_items():
        logging.debug('Listing failed DUTs for %s', model)
        if counts.get_broken() != 0:
            model_buffer_counts[model] = counts.get_spares_buffer()
            broken_list.extend(counts.get_broken_list())
    # N.B. The logic inside this loop may seem complicated, but
    # simplification is hard:
    #   * Calculating an initial recommendation outside of
    #     the loop likely would make things more complicated,
    #     not less.
    #   * It's necessary to calculate an initial lab slice once per
    #     lab _before_ the while loop, in case the number of broken
    #     DUTs in a lab is less than `num_recommend`.
    recommendation = None
    best_score = None
    for lab_duts in _sort_by_location(broken_list):
        # Slide a window of `num_recommend` DUTs across the lab's
        # location-sorted list, keeping the best scoring window.
        start = 0
        end = num_recommend
        lab_slice = lab_duts[start : end]
        lab_score = _score_repair_set(model_buffer_counts, lab_slice)
        while end < len(lab_duts):
            start += 1
            end += 1
            new_slice = lab_duts[start : end]
            new_score = _score_repair_set(model_buffer_counts, new_slice)
            if new_score > lab_score:
                lab_slice = new_slice
                lab_score = new_score
        if recommendation is None or lab_score > best_score:
            recommendation = lab_slice
            best_score = lab_score
    # N.B. The trailing space in `line_fmt` is mandatory:  Without it,
    # Gmail will parse the URL wrong.  Don't ask.  If you simply _must_
    # know more, go try it yourself...
    line_fmt = '%-30s %-16s %-6s\n %s '
    message = ['Repair recommendations:\n',
               line_fmt % ( 'Hostname', 'Model', 'Servo?', 'Logs URL')]
    # `recommendation` is still None when no lab had any broken DUTs;
    # in that case there are no hosts to list.
    for h in recommendation or []:
        servo_name = servo_host.make_servo_hostname(h.host.hostname)
        servo_present = utils.host_is_in_lab_zone(servo_name)
        _, event = h.last_diagnosis()
        line = line_fmt % (
                h.host.hostname, h.host_model,
                'Yes' if servo_present else 'No', event.job_url)
        message.append(line)
    return '\n'.join(message)

708

709

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame^]

710

def _generate_model_inventory_message(inventory):
    """Generate the "model inventory" e-mail message.

    The model inventory is a list by model summarizing the number of
    working, broken, and idle DUTs, and the total shortfall or surplus
    of working devices relative to the minimum critical pool
    requirement.

    The summary totals cover every model yielded by
    `inventory.reportable_items()`; the per-model table lists only
    models with at least one broken DUT.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating model inventory')
    nworking = 0
    nbroken = 0
    nidle = 0
    nbroken_models = 0
    ntotal_models = 0
    summaries = []
    column_names = (
        'Model', 'Avail', 'Bad', 'Idle', 'Good', 'Spare', 'Total')
    for model, counts in inventory.reportable_items():
        logging.debug('Counting %2d DUTS for model %s',
                      counts.get_total(), model)
        # Summary elements laid out in the same order as the column
        # headers:
        #     Model Avail   Bad  Idle  Good  Spare Total
        #      e[0]  e[1]  e[2]  e[3]  e[4]  e[5]  e[6]
        element = (model,
                   counts.get_spares_buffer(),
                   counts.get_broken(),
                   counts.get_idle(),
                   counts.get_working(),
                   counts.get_total(SPARE_POOL),
                   counts.get_total())
        if element[2]:
            summaries.append(element)
            nbroken_models += 1
        ntotal_models += 1
        nbroken += element[2]
        nidle += element[3]
        nworking += element[4]
    ntotal = nworking + nbroken + nidle
    # Smallest spares buffer first; ties broken by most broken DUTs.
    summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
    if ntotal:
        broken_percent = int(round(100.0 * nbroken / ntotal))
        idle_percent = int(round(100.0 * nidle / ntotal))
        # Derived by subtraction so the three figures sum to 100.
        working_percent = 100 - broken_percent - idle_percent
    else:
        # No DUTs at all:  report zeros rather than dividing by zero.
        broken_percent = idle_percent = working_percent = 0
    message = ['Summary of DUTs in inventory:',
               '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
               '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
                   nbroken, broken_percent,
                   nidle, idle_percent,
                   nworking, working_percent,
                   ntotal),
               '',
               'Models with failures: %d' % nbroken_models,
               'Models in inventory: %d' % ntotal_models,
               '', '',
               'Full model inventory:\n',
               '%-22s %5s %5s %5s %5s %5s %5s' % column_names]
    message.extend(
            ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
    return '\n'.join(message)

778

779

J. Richard Barnette

4845fcf

2015-04-20 14:26:25 -0700

[diff] [blame]

780

_POOL_INVENTORY_HEADER = '''\

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame^]

781

Notice to Infrastructure deputies: All models shown below are at

J. Richard Barnette

c9a143c

2015-06-04 11:11:19 -0700

[diff] [blame]

782

less than full strength, please take action to resolve the issues.

783

Once you're satisified that failures won't recur, failed DUTs can

784

be replaced with spares by running `balance_pool`. Detailed

785

instructions can be found here:

J. Richard Barnette

4845fcf

2015-04-20 14:26:25 -0700

[diff] [blame]

786

http://go/cros-manage-duts

'''

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

790

def _generate_pool_inventory_message(inventory):
    """Build the body of the "pool inventory" e-mail.

    For each pool in `CRITICAL_POOLS`, the message has a section
    listing by model the number of broken, idle, and working DUTs
    in that pool, along with the pool total.  A model appears in a
    section only if it has at least one broken or idle DUT in the
    pool; rows are ordered by decreasing number of broken DUTs.

    N.B. For sample output text formatted as users can expect to see
    it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating pool inventory')
    lines = [_POOL_INVENTORY_HEADER]
    # Every section after the first is preceded by a blank line.
    separator = ''
    for pool in CRITICAL_POOLS:
        lines.append(
            '%sStatus for pool:%s, by model:' % (separator, pool))
        lines.append(
            '%-20s %5s %5s %5s %5s' % (
                'Model', 'Bad', 'Idle', 'Good', 'Total'))
        rows = []
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            broken = counts.get_broken(pool)
            idle = counts.get_idle(pool)
            # Models at full strength are not reported.
            if broken or idle:
                rows.append((model, broken, idle,
                             counts.get_working(pool),
                             counts.get_total(pool)))
        if not rows:
            lines.append('(All models at full strength)')
        else:
            rows.sort(key=lambda row: -row[1])
            lines.extend('%-20s %5d %5d %5d %5d' % row for row in rows)
        separator = '\n'
    return '\n'.join(lines)

832

833

xixuan

2016-03-10 13:16:30 -0800

[diff] [blame]

834

_IDLE_INVENTORY_HEADER = '''\

835

Notice to Infrastructure deputies: The hosts shown below haven't

836

run any jobs for at least 24 hours. Please check each host; locked

837

hosts should normally be unlocked; stuck jobs should normally be

aborted.

'''

def _generate_idle_inventory_message(inventory):
    """Build the body of the "idle inventory" e-mail.

    The idle inventory lists, for every pool in `MANAGED_POOLS`, the
    hostname, model and pool of each DUT that the per-model counts
    report as idle for that pool.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating idle inventory')
    row_fmt = '%-30s %-20s %s'
    lines = [_IDLE_INVENTORY_HEADER,
             'Idle Host List:',
             row_fmt % ('Hostname', 'Model', 'Pool')]
    idle_rows = []
    for pool in MANAGED_POOLS:
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            for dut in counts.get_idle_list(pool):
                idle_rows.append((dut.host.hostname, model, pool))
    if not idle_rows:
        lines.append('(No idle DUTs)')
    else:
        lines.extend(row_fmt % row for row in idle_rows)
    return '\n'.join(lines)

871

872

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

873

def _send_email(arguments, tag, subject, recipients, body):
    """Send an inventory e-mail message.

    The message is logged in the selected log directory using `tag` for
    the file name.

    If the --debug option was requested, the message is neither logged
    nor sent, but merely printed on stdout.

    Failures to write the log file or to send the e-mail are reported
    with `logging.error()` and otherwise ignored; a failed log write
    does not prevent the attempt to send the e-mail.

    @param arguments   Parsed command-line options.
    @param tag         Tag identifying the inventory for logging
                       purposes.
    @param subject     E-mail Subject: header line.
    @param recipients  E-mail addresses for the To: header line.
    @param body        E-mail message body.
    """
    logging.debug('Generating email: "%s"', subject)
    all_recipients = ', '.join(recipients)
    report_body = '\n'.join([
            'To: %s' % all_recipients,
            'Subject: %s' % subject,
            '', body, ''])
    if arguments.debug:
        # Same output as a `print` statement, but valid syntax in
        # both Python 2 and Python 3.
        sys.stdout.write(report_body + '\n')
        return
    filename = os.path.join(arguments.logdir, tag)
    try:
        # `with` guarantees the file is closed even if the write fails.
        with open(filename, 'w') as report_file:
            report_file.write(report_body)
    except EnvironmentError as e:
        logging.error('Failed to write %s: %s', filename, e)
    try:
        gmail_lib.send_email(all_recipients, subject, body)
    except Exception as e:
        # Best effort:  a mail failure must not abort the inventory run.
        logging.error('Failed to send e-mail to %s: %s',
                      all_recipients, e)

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame^]

912

def _populate_model_counts(inventory):
    """Gather model counts while providing interactive feedback.

    Gathering the status of all individual DUTs in the lab can take
    considerable time (~30 minutes at the time of this writing).
    Normally, we pay that cost by querying as we go.  However, with
    the `--debug` option, we expect a human being to be watching the
    progress in real time.  So, we force the first (expensive) queries
    to happen up front, and provide simple ASCII output on sys.stdout
    to show a progress bar and results.

    One character is written per model:  '+' for every model whose
    ordinal ends in 5, the tens digit for every tenth model, and '.'
    otherwise.

    @param inventory  `_LabInventory` object from which to gather
                      counts.
    """
    total_broken = 0
    for ordinal, counts in enumerate(inventory.itervalues(), 1):
        tens, ones = divmod(ordinal, 10)
        if ones == 5:
            marker = '+'
        elif ones == 0:
            marker = '%d' % (tens % 10)
        else:
            marker = '.'
        sys.stdout.write(marker)
        sys.stdout.flush()
        # This next call is where all the time goes - it forces all of
        # a model's `HostJobHistory` objects to query the database and
        # cache their results.
        total_broken += counts.get_broken()
    sys.stdout.write('\n')
    sys.stdout.write('Found %d broken DUTs\n' % total_broken)

944

945

Richard Barnette

2017-12-15 09:53:42 -0800

[diff] [blame^]

946

def _perform_model_inventory(arguments, inventory, timestamp):
    """Generate and send the model inventory report.

    The model inventory report consists of the following:
      * A list of DUTs that are recommended to be repaired.  This list
        is optional, and only appears if the `--recommend` option is
        present.
      * A list of all models that have failed DUTs, with counts
        of working, broken, and spare DUTs, among others.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp
                      in logs and email output.
    """
    recommend_message = ''
    if arguments.recommend:
        recommendation = _generate_repair_recommendation(
                inventory, arguments.recommend)
        recommend_message = recommendation + '\n\n\n'
    body = recommend_message + _generate_model_inventory_message(inventory)
    tag = 'models-%s.txt' % timestamp
    subject = 'DUT model inventory %s' % timestamp
    _send_email(arguments, tag, subject, arguments.model_notify, body)

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

973

974

975

def _perform_pool_inventory(arguments, inventory, timestamp):
    """Generate and send the pool inventory report.

    The pool inventory report consists of the following:
      * A list of all critical pools that have failed DUTs, with counts
        of working, broken, and idle DUTs.
      * A list of all idle DUTs by hostname including the model and
        pool.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp in
                      logs and email output.
    """
    sections = [_generate_pool_inventory_message(inventory),
                _generate_idle_inventory_message(inventory)]
    tag = 'pools-%s.txt' % timestamp
    subject = 'DUT pool inventory %s' % timestamp
    _send_email(arguments, tag, subject, arguments.pool_notify,
                '\n\n\n'.join(sections))

997

998

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

999

def _dut_in_repair_loop(history):
    """Return whether a DUT's history indicates a repair loop.

    A DUT is considered looping if it runs no tests, and no tasks pass
    other than repair tasks.

    @param history  An instance of `status_history.HostJobHistory` to be
                    scanned for a repair loop.  The caller guarantees
                    that this history corresponds to a working DUT.
    @returns `True` if the DUT's most recent history indicates a repair
             loop, `False` otherwise.
    """
    # Our caller passes only histories for working DUTs; that means
    # we've already paid the cost of fetching the diagnosis task, and
    # we know that the task was successful.  The diagnosis task will be
    # one of the tasks we must scan to find a loop, so if the task isn't
    # a repair task, then our history includes a successful non-repair
    # task, and we're not looping.
    #
    # The for loop below is very expensive, because it must fetch the
    # full history, regardless of how many tasks we examine.  At the
    # time of this writing, this check against the diagnosis task
    # reduces the cost of finding loops in the full inventory from hours
    # to minutes.
    if history.last_diagnosis()[1].name != 'Repair':
        return False
    repair_ok_count = 0
    for task in history:
        if not task.is_special:
            # This is a test, so we're not looping.
            return False
        if task.diagnosis == status_history.BROKEN:
            # Failed a repair, so we're not looping.
            return False
        if (task.diagnosis == status_history.WORKING
                and task.name != 'Repair'):
            # Non-repair task succeeded, so we're not looping.
            return False
        # At this point, we have either a failed non-repair task, or
        # a successful repair.
        if task.name == 'Repair':
            repair_ok_count += 1
            if repair_ok_count >= _REPAIR_LOOP_THRESHOLD:
                return True
    # The history ran out before enough successful repairs were seen.
    return False

def _perform_repair_loop_report(arguments, inventory):
    """Report DUTs in the inventory that are stuck in a repair loop.

    This routine walks through the working DUTs of the given inventory
    looking for DUTs where the most recent history shows that the DUT
    is regularly passing repair tasks, but has not run any tests.  Each
    such DUT is logged, and reported through a boolean presence metric.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    """
    loop_presence = metrics.BooleanMetric(
        'chromeos/autotest/inventory/repair_loops',
        'DUTs stuck in repair loops')
    logging.info('Scanning for DUTs in repair loops.')
    for counts in inventory.itervalues():
        for history in counts.get_working_list():
            # Managed DUTs with names that don't match
            # _HOSTNAME_PATTERN shouldn't be possible.  However, we
            # don't want arbitrary strings being attached to the
            # 'dut_hostname' field, so for safety, we exclude all
            # anomalies.
            if (_HOSTNAME_PATTERN.match(history.hostname)
                    and _dut_in_repair_loop(history)):
                fields = {'dut_hostname': history.hostname,
                          'model': history.host_model,
                          'pool': history.host_pool}
                logging.info('Looping DUT: %(dut_hostname)s, '
                             'model: %(model)s, pool: %(pool)s',
                             fields)
                loop_presence.set(True, fields=fields)

1077

1078

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

1079

def _log_startup(arguments, startup_time):
    """Log the start of this inventory run.

    Emit debug log messages announcing the start of the run and which
    reports it will include.  Return a string based on `startup_time`
    that will be used to identify this run in log files and e-mail
    messages.

    @param arguments     Command-line arguments as returned by
                         `ArgumentParser`.
    @param startup_time  A UNIX timestamp marking the moment when
                         this inventory run began.
    @returns  A timestamp string (local time, hour resolution) that
              will be used to identify this run in logs and email
              output.
    """
    local_start = time.localtime(startup_time)
    timestamp = time.strftime('%Y-%m-%d.%H', local_start)
    logging.debug('Starting lab inventory for %s', timestamp)
    if arguments.model_notify:
        # Repair recommendations are only part of the model report.
        if arguments.recommend:
            logging.debug('Will include repair recommendations')
        logging.debug('Will include model inventory')
    if arguments.pool_notify:
        logging.debug('Will include pool inventory')
    return timestamp

def _create_inventory(arguments, end_time):
    """Build the `_LabInventory` object that the reports will use.

    The searched time range ends at `end_time` and reaches back
    `arguments.duration` hours.  `arguments.modelnames` is passed
    through to `_LabInventory.create_inventory()`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    @param end_time   A UNIX timestamp for the end of the time range
                      to be searched in this inventory run.
    @return The inventory returned by
            `_LabInventory.create_inventory()`.
    """
    # `duration` is in hours; the time range is expressed in seconds.
    start_time = end_time - arguments.duration * 60 * 60
    afe = frontend_wrappers.RetryingAFE(server=None)
    lab = _LabInventory.create_inventory(afe,
                                         start_time,
                                         end_time,
                                         arguments.modelnames)
    dut_count = lab.get_num_duts()
    model_count = lab.get_num_models()
    logging.info('Found %d hosts across %d models', dut_count, model_count)
    return lab

Richard Barnette

2017-10-24 18:13:11 -0700

[diff] [blame]

1119

def _perform_inventory_reports(arguments):
    """Run every inventory check selected on the command line.

    Logs the start of the run, creates the inventory, and then
    executes, in a fixed order, each report whose controlling option
    is set in `arguments`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    """
    startup_time = time.time()
    timestamp = _log_startup(arguments, startup_time)
    inventory = _create_inventory(arguments, startup_time)

    # (option name, action) pairs, in the order they must run.  Each
    # option is looked up only when its turn comes.
    steps = (
        ('debug',
         lambda: _populate_model_counts(inventory)),
        ('model_notify',
         lambda: _perform_model_inventory(arguments, inventory, timestamp)),
        ('pool_notify',
         lambda: _perform_pool_inventory(arguments, inventory, timestamp)),
        ('repair_loops',
         lambda: _perform_repair_loop_report(arguments, inventory)),
    )
    for option, run_step in steps:
        if getattr(arguments, option):
            run_step()

1139

1140

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1141

def _separate_email_addresses(address_list):
    """Parse a list of comma-separated lists of e-mail addresses.

    Whitespace around each address is stripped.  Entries that are
    empty after stripping (produced by a trailing or doubled comma,
    or by an empty option value) are dropped, so that they are never
    treated as a recipient and never make a notify list look
    non-empty.

    @param address_list  A list of strings containing comma-separated
                         e-mail addresses.
    @return A flat list of the individual, non-empty e-mail
            addresses, in the order they were given.
    """
    stripped = (email.strip()
                for arg in address_list
                for email in arg.split(','))
    return [email for email in stripped if email]

def _verify_arguments(arguments):
    """Check the parsed command line for semantic validity.

    The values given to `--model-notify` and `--pool-notify` (each
    possibly repeated, each possibly a comma-separated list) are
    flattened in place into a single list of addresses per option.

    Outside of debug mode, at least one of the two notification
    options must name an address.  In debug mode, naming none means
    "run all the reports."

    On failure an error message is written to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.  Its `model_notify` and
                      `pool_notify` attributes are rewritten.
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.
    """
    for option in ('model_notify', 'pool_notify'):
        raw_values = getattr(arguments, option)
        setattr(arguments, option, _separate_email_addresses(raw_values))

    if arguments.model_notify or arguments.pool_notify:
        return True

    if arguments.debug:
        # Run every report.  An empty notify list causes a report to
        # be skipped, so give both lists a placeholder entry.
        arguments.model_notify = ['']
        arguments.pool_notify = ['']
        return True

    sys.stderr.write('Must specify at least one of '
                     '--model-notify or --pool-notify\n')
    return False

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1190

1191

Richard Barnette

2017-10-23 17:57:50 -0700

[diff] [blame]

1192

def _get_default_logdir(script):
    """Compute the default value for the `--logdir` option.

    The result is `_LOGDIR` joined onto the parent of the directory
    that holds the script.

    @param script  Path to this script file.
    @return A path to a directory.
    """
    script_dir = os.path.dirname(os.path.abspath(script))
    parent_dir = os.path.dirname(script_dir)
    return os.path.join(parent_dir, _LOGDIR)

1205

1206

1207

def _parse_command(argv):
    """Parse and validate the command line.

    Builds the argument parser for this command, parses `argv`, and
    then validates the result with `_verify_arguments()`.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return The result of ArgumentParser.parse_args(), or None if
            the arguments failed validation.
    """
    cli = argparse.ArgumentParser(
        prog=argv[0],
        description='Gather and report lab inventory statistics')

    duration_help = ('number of hours back to search for status'
                     ' (default: %d)' % _DEFAULT_DURATION)
    cli.add_argument('-d', '--duration',
                     type=int,
                     default=_DEFAULT_DURATION,
                     metavar='HOURS',
                     help=duration_help)

    # Notification options may be repeated; each occurrence is
    # appended to the option's list.
    cli.add_argument('--model-notify',
                     action='append',
                     default=[],
                     metavar='ADDRESS',
                     help='Generate model inventory message, '
                          'and send it to the given e-mail address(es)')
    cli.add_argument('--pool-notify',
                     action='append',
                     default=[],
                     metavar='ADDRESS',
                     help='Generate pool inventory message, '
                          'and send it to the given address(es)')

    recommend_help = ('Specify how many DUTs should be '
                      'recommended for repair (default: no '
                      'recommendation)')
    cli.add_argument('-r', '--recommend',
                     type=int,
                     default=None,
                     help=recommend_help)

    cli.add_argument('--repair-loops',
                     action='store_true',
                     help='Check for devices stuck in repair loops.')
    cli.add_argument('--debug-metrics',
                     action='store_true',
                     help='Include debug information about the metrics '
                          'that would be reported ')
    cli.add_argument('--debug',
                     action='store_true',
                     help='Print e-mail messages on stdout '
                          'without sending them.')
    cli.add_argument('--logdir',
                     default=_get_default_logdir(argv[0]),
                     help='Directory where logs will be written.')
    cli.add_argument('modelnames',
                     nargs='*',
                     metavar='MODEL',
                     help='names of models to report on '
                          '(default: all models)')

    parsed = cli.parse_args(argv[1:])
    return parsed if _verify_arguments(parsed) else None

def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--debug` option was provided on
    the command line.
      * Without the option, we configure the logging to capture all
        potentially relevant events (DEBUG level) in a log file under
        `arguments.logdir`.  The log file rotates weekly
        (`when='W4'`) and keeps 13 backups, i.e. ~3 months worth of
        history.  The log directory is created if it doesn't exist.
      * With the option, we expect stdout to contain other
        human-readable output (including the contents of the e-mail
        messages), so we restrict the output to INFO level.

    For convenience, when `--debug` is on, the logging format has
    no adornments, so that a call like `logging.info(msg)` simply writes
    `msg` to stdout, plus a trailing newline.

    On return, the root logger has exactly one handler: the one
    created here.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    """
    root_logger = logging.getLogger()
    if arguments.debug:
        root_logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter())
    else:
        if not os.path.exists(arguments.logdir):
            os.mkdir(arguments.logdir)
        root_logger.setLevel(logging.DEBUG)
        logfile = os.path.join(arguments.logdir, _LOGFILE)
        handler = logging.handlers.TimedRotatingFileHandler(
                logfile, when='W4', backupCount=13)
        formatter = logging.Formatter(_LOG_FORMAT,
                                      time_utils.TIME_FMT)
        handler.setFormatter(formatter)
    # TODO(jrbarnette) This is gross.  Importing client.bin.utils
    # implicitly imported logging_config, which calls
    # logging.basicConfig() *at module level*.  That gives us an
    # extra logging handler that we don't want.  So, clear out all
    # the handlers here.
    #
    # Iterate over a copy: removeHandler() mutates the list, and
    # removing from the list being iterated skips every other
    # handler, leaving unwanted handlers installed.
    for h in list(root_logger.handlers):
        root_logger.removeHandler(h)
    root_logger.addHandler(handler)

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1301

1302

J. Richard Barnette

2015-03-27 17:23:52 -0700

[diff] [blame]

1303

def main(argv):
    """Standard main routine.

    Parses the command line (exiting with status 1 if it is
    rejected), configures logging, and runs the requested inventory
    reports.  Unless running with `--debug` alone, the reports run
    with ts_mon metrics set up, and the metrics are flushed
    afterwards.  A keyboard interrupt ends the run quietly; any other
    exception is logged rather than propagated.

    @param argv  Command line arguments, including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)
    try:
        if arguments.debug and not arguments.debug_metrics:
            # Plain debug run: no metrics setup at all.
            _perform_inventory_reports(arguments)
        else:
            # With --debug-metrics the metrics debug output goes to
            # /dev/null; otherwise no debug file is used.
            if arguments.debug_metrics:
                metrics_file = '/dev/null'
            else:
                metrics_file = None
            with site_utils.SetupTsMonGlobalState(
                    'repair_loops',
                    short_lived=True,
                    debug_file=metrics_file,
                    auto_flush=False):
                _perform_inventory_reports(arguments)
            # auto_flush is off, so flush explicitly once the
            # reports are done.
            metrics.Flush()
    except KeyboardInterrupt:
        pass
    except EnvironmentError as err:
        logging.exception('Unexpected OS error: %s', err)
    except Exception as err:
        logging.exception('Unexpected exception: %s', err)

1329

1330

Kevin Cheng

2016-04-19 14:51:39 -0700

[diff] [blame]

1331

def get_inventory(afe):
    """Create an inventory covering the 24 hours ending now.

    @param afe  Passed as the first argument to
                `_LabInventory.create_inventory()`; presumably an
                AFE frontend object -- confirm against callers.
    @return The inventory returned by
            `_LabInventory.create_inventory()`.
    """
    one_day = 24 * 60 * 60
    now = int(time.time())
    return _LabInventory.create_inventory(afe, now - one_day, now)

1335

1336

1337

def get_managed_boards(afe):
    """Return `get_boards()` of the inventory for the last 24 hours.

    @param afe  Passed through to `get_inventory()`.
    @return Whatever the inventory's `get_boards()` method returns.
    """
    inventory = get_inventory(afe)
    return inventory.get_boards()

J. Richard Barnette

aa86893

2015-10-23 13:28:59 -0700

[diff] [blame]

1339

1340

J. Richard Barnette