Blame - server/cros/dynamic_suite.py - platform/external/autotest

2012-02-15 14:21:02 -0800

[diff] [blame]

1

Chris Masone

2011-10-20 16:36:43 -0700

[diff] [blame]

2

# Use of this source code is governed by a BSD-style license that can be

3

# found in the LICENSE file.

4

5

import common

6

import compiler, logging, os, random, re, time

Chris Masone

2011-12-20 11:06:53 -0800

[diff] [blame]

7

from autotest_lib.client.common_lib import control_data, global_config, error

8

from autotest_lib.client.common_lib import utils

Chris Masone

2012-02-15 14:21:02 -0800

[diff] [blame]

9

from autotest_lib.server.cros import control_file_getter, frontend_wrappers

Chris Masone

2011-10-20 16:36:43 -0700

[diff] [blame]

10

from autotest_lib.server import frontend

11

12

Scott Zawalski

2012-02-16 11:48:26 -0500

[diff] [blame^]

13

# Prefix prepended to a build/image name to form the autotest label that
# identifies machines running that image version.
VERSION_PREFIX = 'cros-version:'

Chris Masone

2011-12-20 11:06:53 -0800

[diff] [blame]

14

# Shared handle on the global configuration object; settings used in this
# file are read from it with get_config_value().
CONFIG = global_config.global_config

15

16

Chris Masone

8b76425

2012-01-17 11:12:51 -0800

[diff] [blame]

17

def inject_vars(vars, control_file_in):
    """
    Inject the contents of |vars| into |control_file_in|.

    Every entry of |vars| is turned into one `key=<literal>` assignment
    line, and those lines are placed in front of the control file text.

    @param vars: a dict to shoehorn into the provided control file string.
                 Keys must be valid Python identifiers.
    @param control_file_in: the contents of a control file to munge.
    @return the modified control file string.
    """
    # repr() emits a correctly quoted and escaped literal.  Wrapping the
    # value in '...' by hand produced a broken (or injectable) control file
    # as soon as a value contained a quote, a backslash or a newline.
    assignments = ['%s=%s\n' % (key, repr(value))
                   for key, value in vars.items()]
    return ''.join(assignments) + control_file_in

29

30

Chris Masone

2011-12-20 11:06:53 -0800

[diff] [blame]

31

def _image_url_pattern():
    """
    Look up the image URL pattern in the global config.

    @return the string stored as 'image_url_pattern' in the 'CROS' section.
    """
    pattern = CONFIG.get_config_value('CROS', 'image_url_pattern', type=str)
    return pattern

33

34

35

def _package_url_pattern():
    """
    Look up the package URL pattern in the global config.

    @return the string stored as 'package_url_pattern' in the 'CROS' section.
    """
    pattern = CONFIG.get_config_value('CROS', 'package_url_pattern', type=str)
    return pattern

37

Chris Masone

2011-10-20 16:36:43 -0700

[diff] [blame]

38

39

class Reimager(object):
    """
    A class that can run jobs to reimage devices.

    @var _afe: a frontend.AFE instance used to talk to autotest.
    @var _tko: a frontend.TKO instance used to query the autotest results db.
    @var _pool: the pool of machines to use for scheduling, or None.
    @var _cf_getter: a ControlFileGetter used to get the AU control file.
    """


    def __init__(self, autotest_dir, afe=None, tko=None, pool=None):
        """
        Constructor

        @param autotest_dir: the place to find autotests.
        @param afe: an instance of AFE as defined in server/frontend.py.
        @param tko: an instance of TKO as defined in server/frontend.py.
        @param pool: Specify the pool of machines to use for scheduling
                purposes.
        """
        self._afe = afe or frontend_wrappers.RetryingAFE(timeout_min=30,
                                                         delay_sec=10,
                                                         debug=False)
        self._tko = tko or frontend_wrappers.RetryingTKO(timeout_min=30,
                                                         delay_sec=10,
                                                         debug=False)
        self._pool = pool
        self._cf_getter = control_file_getter.FileSystemGetter(
            [os.path.join(autotest_dir, 'server/site_tests')])


    def skip(self, g):
        """
        Report whether |g| asks for reimaging to be skipped.

        @param g: a dict-like object that may contain a 'SKIP_IMAGE' entry.
        @return False if 'SKIP_IMAGE' is absent, otherwise its value.
        """
        return 'SKIP_IMAGE' in g and g['SKIP_IMAGE']


    def attempt(self, build, board, record, num=None, pool=None):
        """
        Synchronously attempt to reimage some machines.

        Fire off attempts to reimage |num| machines of type |board|, using
        the image called |build|.  Wait for completion, polling every
        10s, and log results with |record| upon completion.

        @param build: the build to install e.g.
                      x86-alex-release/R18-1655.0.0-a1-b1584.
        @param board: which kind of devices to reimage.
        @param record: callable that records job status.
                       prototype:
                         record(status, subdir, name, reason)
        @param num: how many devices to reimage.  When not given (or 0),
                    the 'sharding_factor' setting of the 'CROS' config
                    section is used.
        @param pool: Specify the pool of machines to use for scheduling
                purposes.  When given, it replaces the pool stored on
                this instance.
        @return True if all reimaging jobs succeed, false otherwise.
        """
        if not num:
            num = CONFIG.get_config_value('CROS', 'sharding_factor', type=int)
        if pool:
            self._pool = pool
        logging.debug("scheduling reimaging across %d machines", num)
        wrapper_job_name = 'try new image'
        record('START', None, wrapper_job_name)
        self._ensure_version_label(VERSION_PREFIX + build)
        canary = self._schedule_reimage_job(build, num, board)
        logging.debug('Created re-imaging job: %d', canary.id)
        # First wait for the job to leave the queue, then for it to finish.
        while self._afe.get_jobs(id=canary.id, not_yet_run=True):
            time.sleep(10)
        logging.debug('Re-imaging job running.')
        while not self._afe.get_jobs(id=canary.id, finished=True):
            time.sleep(10)
        logging.debug('Re-imaging job finished.')
        canary.result = self._afe.poll_job_results(self._tko, canary, 0)

        if canary.result is True:
            self._report_results(canary, record)
            record('END GOOD', None, wrapper_job_name)
            return True

        if canary.result is None:
            record('FAIL', None, canary.name, 're-imaging tasks did not run')
        else:  # canary.result is False
            self._report_results(canary, record)

        record('END FAIL', None, wrapper_job_name)
        return False


    def _ensure_version_label(self, name):
        """
        Ensure that a label called |name| exists in the autotest DB.

        @param name: the label to check for/create.
        """
        if not self._afe.get_labels(name=name):
            self._afe.create_label(name=name)


    def _schedule_reimage_job(self, build, num_machines, board):
        """
        Schedules the reimaging of |num_machines| |board| devices with |build|.

        Sends an RPC to the autotest frontend to enqueue reimaging jobs on
        |num_machines| devices of type |board|

        @param build: the build to install (must be unique).
        @param num_machines: how many devices to reimage.
        @param board: which kind of devices to reimage.
        @return a frontend.Job object for the reimaging job we scheduled.
        """
        control_file = inject_vars(
            {'image_url': _image_url_pattern() % build, 'image_name': build},
            self._cf_getter.get_control_file_contents_by_name('autoupdate'))
        job_deps = []
        if self._pool:
            # Schedule against the pool and require the board as a dependency.
            meta_host = 'pool:%s' % self._pool
            board_label = 'board:%s' % board
            job_deps.append(board_label)
        else:
            # No pool specified use board.
            meta_host = 'board:%s' % board

        return self._afe.create_job(control_file=control_file,
                                    name=build + '-try',
                                    control_type='Server',
                                    meta_hosts=[meta_host] * num_machines,
                                    dependencies=job_deps)


    def _report_results(self, job, record):
        """
        Record results from a completed frontend.Job object.

        A job whose |result| is True gets a single GOOD entry.  Otherwise
        one entry is recorded per host (or per failed test on that host).

        @param job: a completed frontend.Job object populated by
               frontend.AFE.poll_job_results.
        @param record: callable that records job status.
               prototype:
                 record(status, subdir, name, reason)
        """
        # Identity check, consistent with how attempt() inspects the result.
        if job.result is True:
            record('GOOD', None, job.name)
            return

        for platform in job.results_platform_map:
            hosts_by_status = job.results_platform_map[platform]
            for status in hosts_by_status:
                # 'Total' is skipped; only the per-status entries are
                # reported.
                if status == 'Total':
                    continue
                for host in hosts_by_status[status]:
                    if host not in job.test_status:
                        record('ERROR', None, host, 'Job failed to run.')
                    elif status == 'Failed':
                        for test_status in job.test_status[host].fail:
                            record('FAIL', None, host, test_status.reason)
                    elif status == 'Aborted':
                        for test_status in job.test_status[host].fail:
                            record('ABORT', None, host, test_status.reason)
                    elif status == 'Completed':
                        record('GOOD', None, host)

class Suite(object):

"""

A suite of tests, defined by some predicate over control file variables.

201

202

Given a place to search for control files and a predicate to match the desired

203

tests, can gather tests and fire off jobs to run them, and then wait for

204

results.

205

206

@var _predicate: a function that should return True when run over a

207

ControlData representation of a control file that should be in

208

this Suite.

209

@var _tag: a string with which to tag jobs run in this suite.

210

@var _afe: an instance of AFE as defined in server/frontend.py.

211

@var _tko: an instance of TKO as defined in server/frontend.py.

212

@var _jobs: currently scheduled jobs, if any.

213

@var _cf_getter: a control_file_getter.ControlFileGetter

"""

Chris Masone

2012-01-17 11:16:32 -0800

[diff] [blame]

217

@staticmethod
def create_fs_getter(autotest_dir):
    """
    Build a control file getter rooted at |autotest_dir|.

    @param autotest_dir: the place to find autotests.
    @return a FileSystemGetter instance that looks under |autotest_dir|.
    """
    # The places to look for tests are hard-coded for now.
    search_dirs = []
    for subpath in ('server/site_tests', 'client/site_tests'):
        search_dirs.append(os.path.join(autotest_dir, subpath))
    return control_file_getter.FileSystemGetter(search_dirs)

227

228

229

@staticmethod
def create_from_name(name, autotest_dir, afe=None, tko=None, pool=None):
    """
    Create a Suite whose members are selected by the SUITE control file var.

    Builds a predicate out of |name| and hands it to the Suite constructor,
    together with |autotest_dir| (where tests are looked up), |afe| (used
    for scheduling) and |tko| (from which results are pulled upon
    completion).  |name| also serves as the tag of the new Suite.

    @param name: a value of the SUITE control file variable to search for.
    @param autotest_dir: the place to find autotests.
    @param afe: an instance of AFE as defined in server/frontend.py.
    @param tko: an instance of TKO as defined in server/frontend.py.
    @param pool: Specify the pool of machines to use for scheduling
            purposes.
    @return a Suite instance.
    """
    def in_named_suite(test):
        # Control files without a SUITE variable never match.
        return hasattr(test, 'suite') and test.suite == name

    return Suite(in_named_suite, name, autotest_dir, afe, tko, pool)

Chris Masone

2011-10-20 16:36:43 -0700

[diff] [blame]

248

249

Scott Zawalski

2012-02-16 11:48:26 -0500

[diff] [blame^]

250

def __init__(self, predicate, tag, autotest_dir, afe=None, tko=None,
             pool=None):
    """
    Constructor

    Stores the arguments, falls back to retrying AFE/TKO wrappers when
    none are supplied, and immediately gathers the matching tests
    (experimental ones included) from under |autotest_dir|.

    @param predicate: a function that should return True when run over a
           ControlData representation of a control file that should be in
           this Suite.
    @param tag: a string with which to tag jobs run in this suite.
    @param autotest_dir: the place to find autotests.
    @param afe: an instance of AFE as defined in server/frontend.py.
    @param tko: an instance of TKO as defined in server/frontend.py.
    @param pool: Specify the pool of machines to use for scheduling
            purposes.
    """
    retry_settings = dict(timeout_min=30, delay_sec=10, debug=False)
    if not afe:
        afe = frontend_wrappers.RetryingAFE(**retry_settings)
    if not tko:
        tko = frontend_wrappers.RetryingTKO(**retry_settings)

    self._predicate = predicate
    self._tag = tag
    self._afe = afe
    self._tko = tko
    self._pool = pool
    self._jobs = []

    self._cf_getter = Suite.create_fs_getter(autotest_dir)

    self._tests = Suite.find_and_parse_tests(self._cf_getter,
                                             self._predicate,
                                             add_experimental=True)

@property
def tests(self):
    """
    The ControlData objects that make up this suite, each one carrying an
    added |text| attr.
    """
    suite_tests = self._tests
    return suite_tests

def stable_tests(self):
    """
    The members of |self.tests| that are not marked experimental.
    """
    def is_stable(test):
        return not test.experimental

    return filter(is_stable, self.tests)

296

297

298

def unstable_tests(self):
    """
    The members of |self.tests| that are marked experimental.
    """
    def is_experimental(test):
        return test.experimental

    return filter(is_experimental, self.tests)

303

304

305

def _create_job(self, test, image_name):
    """
    Thin wrapper around frontend.AFE.create_job().

    With a pool configured, the job targets that pool and depends on the
    image's version label; without one, the version label itself is the
    meta-host.

    @param test: ControlData object for a test to run.
    @param image_name: the name of an image against which to test.
    @return frontend.Job object for the job just scheduled.
    """
    version_label = VERSION_PREFIX + image_name
    if self._pool:
        target = 'pool:%s' % self._pool
        dependencies = [version_label]
    else:
        # No pool specified use any machines with the version label.
        target = version_label
        dependencies = []

    job_name = '/'.join([image_name, self._tag, test.name])
    return self._afe.create_job(
        control_file=test.text,
        name=job_name,
        control_type=test.test_type.capitalize(),
        meta_hosts=[target],
        dependencies=dependencies)

Chris Masone

2011-10-20 16:36:43 -0700

[diff] [blame]

328

329

330

def run_and_wait(self, image_name, record, add_experimental=True):
    """
    Synchronously run tests in |self.tests|.

    Schedules tests against a device running image |image_name|, and
    then polls for status, using |record| to print status when each
    completes.

    Tests returned by self.stable_tests() will always be run, while tests
    in self.unstable_tests() will only be run if |add_experimental| is true.

    Exceptions are not propagated to the caller: they are logged and
    reported through |record| as a FAIL entry for this suite's tag.

    @param image_name: the name of an image against which to test.
    @param record: callable that records job status.
             prototype:
               record(status, subdir, name, reason)
    @param add_experimental: schedule experimental tests as well, or not.
    """
    try:
        record('INFO', None, 'Start %s' % self._tag)
        self.schedule(image_name, add_experimental)
        try:
            for result in self.wait_for_results():
                # |result| will be a tuple of a maximum of 4 entries and a
                # minimum of 3. We use the first 3 for START and END
                # entries so we separate those variables out for legible
                # variable names, nothing more.
                status = result[0]
                test_name = result[2]
                record('START', None, test_name)
                record(*result)
                record('END %s' % status, None, test_name)
        except Exception as e:
            # logging.exception keeps the traceback, which a plain
            # logging.error(e) would throw away.
            logging.exception(e)
            record('FAIL', None, self._tag,
                   'Exception waiting for results')
    except Exception as e:
        logging.exception(e)
        record('FAIL', None, self._tag,
               'Exception while scheduling suite')

Chris Masone

2011-10-20 16:36:43 -0700

[diff] [blame]

369

370

371

def schedule(self, image_name, add_experimental=True):
    """
    Schedule jobs using |self._afe|.

    A frontend.Job object is appended to |self._jobs| for every job that
    gets scheduled: first the stable tests, then (optionally) the
    experimental ones.

    @param image_name: the name of an image against which to test.
    @param add_experimental: schedule experimental tests as well, or not.
    """
    def enqueue(batch):
        for candidate in batch:
            logging.debug('Scheduling %s', candidate.name)
            self._jobs.append(self._create_job(candidate, image_name))

    enqueue(self.stable_tests())

    if not add_experimental:
        return
    # TODO(cmasone): ensure I can log results from these differently.
    enqueue(self.unstable_tests())

390

391

392

def _status_is_relevant(self, status):

393

"""

394

Indicates whether the status of a given test is meaningful or not.

395

396

@param status: frontend.TestStatus object to look at.

397

@return True if this is a test result worth looking at further.

398

"""

399

return not (status.test_name.startswith('SERVER_JOB') or

400

status.test_name.startswith('CLIENT_JOB'))

401

402

403

def _collate_aborted(self, current_value, entry):

404

"""

405

reduce() over a list of HostQueueEntries for a job; True if any aborted.

406

407

Functor that can be reduced()ed over a list of

408

HostQueueEntries for a job. If any were aborted

409

(|entry.aborted| exists and is True), then the reduce() will

return True.

Ex:

entries = self._afe.run('get_host_queue_entries', job=job.id)

414

reduce(self._collate_aborted, entries, False)

415

416

@param current_value: the current accumulator (a boolean).

417

@param entry: the current entry under consideration.

418

@return the value of |entry.aborted| if it exists, False if not.

419

"""

420

return current_value or ('aborted' in entry and entry['aborted'])

421

422

423

def wait_for_results(self):
    """
    Wait for results of all tests in all jobs in |self._jobs|.

    This is a generator.  It polls every 5s for finished jobs; each
    finished job is removed from |self._jobs| and its results are yielded
    right away, so callers receive results as jobs complete.  Iteration
    ends once |self._jobs| is empty.

    @return a generator of tuples.  A job that was aborted produces one
            (status, subdir, name) tuple; any other job produces one
            (status, subdir, name, reason) tuple per relevant test.
    """
    while self._jobs:
        # Iterate over a copy, since finished jobs are removed on the fly.
        for job in list(self._jobs):
            if not self._afe.get_jobs(id=job.id, finished=True):
                continue

            self._jobs.remove(job)

            entries = self._afe.run('get_host_queue_entries', job=job.id)
            # Fold by hand instead of using the builtin reduce(), which
            # only exists in Python 2.
            aborted = False
            for entry in entries:
                aborted = self._collate_aborted(aborted, entry)

            if aborted:
                yield ('ABORT', None, job.name)
            else:
                statuses = self._tko.get_status_counts(job=job.id)
                for s in statuses:
                    if self._status_is_relevant(s):
                        yield (s.status, None, s.test_name, s.reason)

        # Only pause when there is still something left to poll for;
        # sleeping after the last job was reported is wasted time.
        if self._jobs:
            time.sleep(5)

445

Chris Masone

2011-10-20 16:36:43 -0700

[diff] [blame]

446

Chris Masone

fef2138

2012-01-17 11:16:32 -0800

[diff] [blame]

447

@staticmethod

448

def find_and_parse_tests(cf_getter, predicate, add_experimental=False):

Chris Masone