Ahmad Sharif | 4467f00 | 2012-12-20 12:09:49 -0800 | [diff] [blame] | 1 | #!/usr/bin/python |
Yunlian Jiang | 36f91ad | 2013-03-28 17:13:29 -0700 | [diff] [blame] | 2 | |
| 3 | # Copyright (c) 2013 The Chromium OS Authors. All rights reserved. |
| 4 | # Use of this source code is governed by a BSD-style license that can be |
| 5 | # found in the LICENSE file. |
Ahmad Sharif | 4467f00 | 2012-12-20 12:09:49 -0800 | [diff] [blame] | 6 | |
Ahmad Sharif | f395c26 | 2012-10-09 17:48:09 -0700 | [diff] [blame] | 7 | import hashlib |
| 8 | import image_chromeos |
cmtice | 517dc98 | 2015-06-12 12:22:32 -0700 | [diff] [blame] | 9 | import file_lock_machine |
Ahmad Sharif | f395c26 | 2012-10-09 17:48:09 -0700 | [diff] [blame] | 10 | import math |
| 11 | import os.path |
Ahmad Sharif | 4467f00 | 2012-12-20 12:09:49 -0800 | [diff] [blame] | 12 | import re |
Ahmad Sharif | 0dcbc4b | 2012-02-02 16:37:18 -0800 | [diff] [blame] | 13 | import sys |
| 14 | import threading |
| 15 | import time |
Ahmad Sharif | 4467f00 | 2012-12-20 12:09:49 -0800 | [diff] [blame] | 16 | |
cmtice | 1505b6a | 2014-06-04 14:19:19 -0700 | [diff] [blame] | 17 | |
Ahmad Sharif | 0dcbc4b | 2012-02-02 16:37:18 -0800 | [diff] [blame] | 18 | from utils import command_executer |
| 19 | from utils import logger |
cmtice | 5c09fc2 | 2015-04-22 09:25:53 -0700 | [diff] [blame] | 20 | from utils import misc |
Ahmad Sharif | f1d70cb | 2012-02-06 21:51:59 -0800 | [diff] [blame] | 21 | from utils.file_utils import FileUtils |
Ahmad Sharif | 0dcbc4b | 2012-02-02 16:37:18 -0800 | [diff] [blame] | 22 | |
# Path of a file on the remote machine.  MachineManager runs `cat` on it when
# it first locks a machine and stores the stripped output as that machine's
# `checksum`, which is later compared against a label's checksum.
CHECKSUM_FILE = "/usr/local/osimage_checksum_file"
| 24 | |
class BadChecksum(Exception):
    """Signals that the machines of one label do not share a checksum."""
| 28 | |
class BadChecksumString(Exception):
    """Signals that the machines of one label differ in checksum string."""
Ahmad Sharif | 0dcbc4b | 2012-02-02 16:37:18 -0800 | [diff] [blame] | 32 | |
class MissingLocksDirectory(Exception):
    """Signals that the machine locks directory cannot be found/accessed."""
| 35 | |
class CrosCommandError(Exception):
    """Signals that running a command on the DUT failed."""
| 38 | |
class CrosMachine(object):
    """One remote ChromeOS machine plus the facts probed from it.

    Constructing an instance runs commands on the machine through the command
    executer to collect /proc/meminfo, /proc/cpuinfo and a machine id, and
    derives MD5 checksums from them (see SetUpChecksumInfo).
    """

    def __init__(self, name, chromeos_root, log_level, cmd_exec=None):
        """Record the machine's settings and probe it.

        Args:
          name: machine name, passed as `machine=` to every remote command.
          chromeos_root: passed as `chromeos_root=` to every remote command.
          log_level: log level handed to the default command executer.
          cmd_exec: optional command executer; when omitted (or falsy) one is
            obtained from command_executer.GetCommandExecuter().
        """
        self.name = name
        self.image = None
        # We relate a dut with a label if we reimage the dut using label or we
        # detect at the very beginning that the dut is running this label.
        self.label = None
        self.checksum = None
        self.locked = False
        self.released_time = time.time()
        self.test_run = None
        self.chromeos_root = chromeos_root
        self.log_level = log_level
        self.ce = cmd_exec or command_executer.GetCommandExecuter(
            log_level=self.log_level)
        # Give every probed attribute a defined value up front: when the
        # machine is unreachable SetUpChecksumInfo() returns early, and
        # readers of these attributes should see None rather than hit an
        # AttributeError.
        self.meminfo = None
        self.cpuinfo = None
        self.phys_kbytes = None
        self.checksum_string = None
        self.machine_id = None
        self.machine_checksum = None
        self.machine_id_checksum = None
        self.SetUpChecksumInfo()

    def SetUpChecksumInfo(self):
        """Probe the machine and compute its checksums.

        If the machine is not reachable, machine_checksum is set to None and
        nothing else is touched.  Otherwise memory info, cpu info and the
        machine id are fetched and machine_checksum / machine_id_checksum are
        set to the MD5 hex digests of checksum_string / machine_id.
        """
        if not self.IsReachable():
            self.machine_checksum = None
            return
        self._GetMemoryInfo()
        self._GetCPUInfo()
        self._ComputeMachineChecksumString()
        self._GetMachineID()
        self.machine_checksum = self._GetMD5Checksum(self.checksum_string)
        self.machine_id_checksum = self._GetMD5Checksum(self.machine_id)

    def IsReachable(self):
        """Return True if running `ls` on the machine reports success."""
        ret = self.ce.CrosRunCommand("ls",
                                     machine=self.name,
                                     chromeos_root=self.chromeos_root)
        return not ret

    def _ParseMemoryInfo(self):
        """Estimate physical memory (in kB) from self.meminfo.

        Reads the second whitespace-separated field of the first line of
        self.meminfo as the usable kB count and stores the rounded-up
        estimate in self.phys_kbytes.
        """
        first_line = self.meminfo.splitlines()[0]
        usable_kbytes = int(first_line.split()[1])
        # This code is from src/third_party/test/files/client/bin/base_utils.py
        # usable_kbytes is system's usable DRAM in kbytes,
        #   as reported by memtotal() from device /proc/meminfo memtotal
        #   after Linux deducts 1.5% to 9.5% for system table overhead
        # Undo the unknown actual deduction by rounding up
        #   to next small multiple of a big power-of-two
        #   eg  12GB - 5.1% gets rounded back up to 12GB
        # NOTE(review): the description above and below quotes a 1.5% lower
        # bound, but the value used here is 0.5%.  Left unchanged because
        # altering it would change the computed machine checksums.
        mindeduct = 0.005  # 0.5 percent
        maxdeduct = 0.095  # 9.5 percent
        # deduction range 1.5% .. 9.5% supports physical mem sizes
        #    6GB .. 12GB in steps of .5GB
        #   12GB .. 24GB in steps of 1 GB
        #   24GB .. 48GB in steps of 2 GB ...
        # Finer granularity in physical mem sizes would require
        #   tighter spread between min and max possible deductions

        # increase mem size by at least min deduction, without rounding
        min_kbytes = int(usable_kbytes / (1.0 - mindeduct))
        # increase mem size further by 2**n rounding, by 0..roundKb or more
        round_kbytes = int(usable_kbytes / (1.0 - maxdeduct)) - min_kbytes
        # find least binary roundup 2**n that covers worst-case roundKb
        mod2n = 1 << int(math.ceil(math.log(round_kbytes, 2)))
        # have round_kbytes <= mod2n < round_kbytes*2
        # round min_kbytes up to next multiple of mod2n
        phys_kbytes = min_kbytes + mod2n - 1
        phys_kbytes -= phys_kbytes % mod2n  # clear low bits
        self.phys_kbytes = phys_kbytes

    def _GetMemoryInfo(self):
        """Fetch /proc/meminfo from the machine and parse it.

        Raises:
          AssertionError: the remote command returned a non-zero status.
        """
        # TODO(yunlian): when the machine is rebooting, it will not return
        # meminfo, and the status check below does not catch it either.
        ret, self.meminfo, _ = self.ce.CrosRunCommand(
            "cat /proc/meminfo", return_output=True,
            machine=self.name,
            chromeos_root=self.chromeos_root)
        # Raised explicitly (rather than via `assert`) so the check is not
        # stripped under `python -O`; the exception type is unchanged.
        if ret != 0:
            raise AssertionError(
                "Could not get meminfo from machine: %s" % self.name)
        self._ParseMemoryInfo()

    # cpuinfo format is different across architectures;
    # need to find a better way to parse it.
    def _ParseCPUInfo(self, cpuinfo):
        """Placeholder: no parsing is done yet; always returns 0."""
        return 0

    def _GetCPUInfo(self):
        """Fetch /proc/cpuinfo from the machine into self.cpuinfo.

        Raises:
          AssertionError: the remote command returned a non-zero status.
        """
        ret, self.cpuinfo, _ = self.ce.CrosRunCommand(
            "cat /proc/cpuinfo", return_output=True,
            machine=self.name,
            chromeos_root=self.chromeos_root)
        # Explicit raise for the same reason as in _GetMemoryInfo.
        if ret != 0:
            raise AssertionError(
                "Could not get cpuinfo from machine: %s" % self.name)
        self._ParseCPUInfo(self.cpuinfo)

    def _ComputeMachineChecksumString(self):
        """Build self.checksum_string from cpuinfo and phys_kbytes.

        The cpuinfo lines are concatenated with no separator, skipping every
        line that mentions a clock-speed style field (MHz / BogoMIPS), and
        " <phys_kbytes>" is appended.
        """
        exclude_lines_list = ["MHz", "BogoMIPS", "bogomips"]
        kept_lines = [line for line in self.cpuinfo.splitlines()
                      if not any(e in line for e in exclude_lines_list)]
        self.checksum_string = "".join(kept_lines) + " " + str(self.phys_kbytes)

    def _GetMD5Checksum(self, ss):
        """Return the MD5 hex digest of `ss`, or "" when `ss` is empty/None.

        Text input is encoded as UTF-8 first; byte strings are hashed as-is,
        so plain Python 2 `str` values hash exactly as before.
        """
        if not ss:
            return ""
        if not isinstance(ss, bytes):
            ss = ss.encode("utf-8")
        return hashlib.md5(ss).hexdigest()

    def _GetMachineID(self):
        """Set self.machine_id from the machine's VPD log or ifconfig output.

        Tried in order: the first `dump_vpd_log` line containing "Product",
        then all `ifconfig` lines containing "HWaddr" joined with "_", then
        all `ifconfig` lines containing "ether" joined with "_".

        Raises:
          AssertionError: none of the sources produced a matching line.
        """
        _, vpd_out, _ = self.ce.CrosRunCommand(
            "dump_vpd_log --full --stdout", return_output=True,
            machine=self.name,
            chromeos_root=self.chromeos_root)
        product_lines = [l for l in vpd_out.splitlines() if "Product" in l]
        if product_lines:
            self.machine_id = product_lines[0]
            return

        _, if_out, _ = self.ce.CrosRunCommand(
            "ifconfig", return_output=True,
            machine=self.name,
            chromeos_root=self.chromeos_root)
        if_lines = if_out.splitlines()
        for marker in ("HWaddr", "ether"):
            matches = [l for l in if_lines if marker in l]
            if matches:
                self.machine_id = "_".join(matches)
                return
        raise AssertionError(
            "Could not get machine_id from machine: %s" % self.name)

    def __str__(self):
        """Return "name, image, checksum, locked, released_time"."""
        fields = [self.name,
                  str(self.image),
                  str(self.checksum),
                  str(self.locked),
                  str(self.released_time)]
        return ", ".join(fields)
| 179 | |
| 180 | |
| 181 | class MachineManager(object): |
cmtice | e5bc63b | 2015-05-27 16:59:37 -0700 | [diff] [blame] | 182 | """Lock, image and unlock machines locally for benchmark runs. |
| 183 | |
| 184 | This class contains methods and calls to lock, unlock and image |
| 185 | machines and distribute machines to each benchmark run. The assumption is |
| 186 | that all of the machines for the experiment have been globally locked |
| 187 | (using an AFE server) in the ExperimentRunner, but the machines still need |
| 188 | to be locally locked/unlocked (allocated to benchmark runs) to prevent |
| 189 | multiple benchmark runs within the same experiment from trying to use the |
| 190 | same machine at the same time. |
| 191 | """ |
cmtice | 5c09fc2 | 2015-04-22 09:25:53 -0700 | [diff] [blame] | 192 | def __init__(self, chromeos_root, acquire_timeout, log_level, locks_dir, |
| 193 | cmd_exec=None, lgr=None): |
Ahmad Sharif | 0dcbc4b | 2012-02-02 16:37:18 -0800 | [diff] [blame] | 194 | self._lock = threading.RLock() |
| 195 | self._all_machines = [] |
| 196 | self._machines = [] |
| 197 | self.image_lock = threading.Lock() |
| 198 | self.num_reimages = 0 |
| 199 | self.chromeos_root = None |
Ahmad Sharif | 4467f00 | 2012-12-20 12:09:49 -0800 | [diff] [blame] | 200 | self.machine_checksum = {} |
| 201 | self.machine_checksum_string = {} |
Luis Lozano | f81680c | 2013-03-15 14:44:13 -0700 | [diff] [blame] | 202 | self.acquire_timeout = acquire_timeout |
cmtice | 1390924 | 2014-03-11 13:38:07 -0700 | [diff] [blame] | 203 | self.log_level = log_level |
cmtice | 517dc98 | 2015-06-12 12:22:32 -0700 | [diff] [blame] | 204 | self.locks_dir = locks_dir |
cmtice | 1505b6a | 2014-06-04 14:19:19 -0700 | [diff] [blame] | 205 | self.ce = cmd_exec or command_executer.GetCommandExecuter( |
| 206 | log_level=self.log_level) |
| 207 | self.logger = lgr or logger.GetLogger() |
Ahmad Sharif | 4467f00 | 2012-12-20 12:09:49 -0800 | [diff] [blame] | 208 | |
cmtice | 517dc98 | 2015-06-12 12:22:32 -0700 | [diff] [blame] | 209 | if self.locks_dir and not os.path.isdir(self.locks_dir): |
| 210 | raise MissingLocksDirectory("Cannot access locks directory: %s" |
| 211 | % self.locks_dir) |
| 212 | |
Ahmad Sharif | 4467f00 | 2012-12-20 12:09:49 -0800 | [diff] [blame] | 213 | self._initialized_machines = [] |
Ahmad Sharif | 0dcbc4b | 2012-02-02 16:37:18 -0800 | [diff] [blame] | 214 | self.chromeos_root = chromeos_root |
| 215 | |
cmtice | f3eb803 | 2015-07-27 13:55:52 -0700 | [diff] [blame] | 216 | def RemoveNonLockedMachines(self, locked_machines): |
| 217 | for m in self._all_machines: |
| 218 | if m.name not in locked_machines: |
| 219 | self._all_machines.remove(m) |
| 220 | |
| 221 | for m in self._machines: |
| 222 | if m.name not in locked_machines: |
| 223 | self._machines.remove(m) |
| 224 | |
Caroline Tice | 31fedb0 | 2015-09-14 16:30:37 -0700 | [diff] [blame] | 225 | def GetChromeVersion(self, machine): |
| 226 | """Get the version of Chrome running on the DUT.""" |
| 227 | |
| 228 | cmd = "/opt/google/chrome/chrome --version" |
| 229 | ret, version, _ = self.ce.CrosRunCommand(cmd, return_output=True, |
| 230 | machine=machine.name, |
Caroline Tice | 31fedb0 | 2015-09-14 16:30:37 -0700 | [diff] [blame] | 231 | chromeos_root=self.chromeos_root) |
| 232 | if ret != 0: |
| 233 | raise CrosCommandError("Couldn't get Chrome version from %s." |
| 234 | % machine.name) |
| 235 | |
| 236 | if ret != 0: |
| 237 | version = "" |
| 238 | return version.rstrip() |
| 239 | |
Ahmad Sharif | 4467f00 | 2012-12-20 12:09:49 -0800 | [diff] [blame] | 240 | def ImageMachine(self, machine, label): |
Han Shen | ba64928 | 2015-08-05 17:19:55 -0700 | [diff] [blame] | 241 | checksum = label.checksum |
cmtice | 0cc4e77 | 2014-01-30 15:52:37 -0800 | [diff] [blame] | 242 | |
| 243 | if checksum and (machine.checksum == checksum): |
Ahmad Sharif | 0dcbc4b | 2012-02-02 16:37:18 -0800 | [diff] [blame] | 244 | return |
Ahmad Sharif | 4467f00 | 2012-12-20 12:09:49 -0800 | [diff] [blame] | 245 | chromeos_root = label.chromeos_root |
Ahmad Sharif | f1d70cb | 2012-02-06 21:51:59 -0800 | [diff] [blame] | 246 | if not chromeos_root: |
| 247 | chromeos_root = self.chromeos_root |
Ahmad Sharif | 4467f00 | 2012-12-20 12:09:49 -0800 | [diff] [blame] | 248 | image_chromeos_args = [image_chromeos.__file__, |
cmtice | e5bc63b | 2015-05-27 16:59:37 -0700 | [diff] [blame] | 249 | "--no_lock", |
Ahmad Sharif | 4467f00 | 2012-12-20 12:09:49 -0800 | [diff] [blame] | 250 | "--chromeos_root=%s" % chromeos_root, |
| 251 | "--image=%s" % label.chromeos_image, |
| 252 | "--image_args=%s" % label.image_args, |
cmtice | 1390924 | 2014-03-11 13:38:07 -0700 | [diff] [blame] | 253 | "--remote=%s" % machine.name, |
| 254 | "--logging_level=%s" % self.log_level] |
Ahmad Sharif | 4467f00 | 2012-12-20 12:09:49 -0800 | [diff] [blame] | 255 | if label.board: |
| 256 | image_chromeos_args.append("--board=%s" % label.board) |
Ahmad Sharif | 0dcbc4b | 2012-02-02 16:37:18 -0800 | [diff] [blame] | 257 | |
| 258 | # Currently can't image two machines at once. |
| 259 | # So have to serialized on this lock. |
cmtice | 1505b6a | 2014-06-04 14:19:19 -0700 | [diff] [blame] | 260 | save_ce_log_level = self.ce.log_level |
cmtice | 1390924 | 2014-03-11 13:38:07 -0700 | [diff] [blame] | 261 | if self.log_level != "verbose": |
cmtice | 1505b6a | 2014-06-04 14:19:19 -0700 | [diff] [blame] | 262 | self.ce.log_level = "average" |
| 263 | |
Ahmad Sharif | 0dcbc4b | 2012-02-02 16:37:18 -0800 | [diff] [blame] | 264 | with self.image_lock: |
cmtice | 1390924 | 2014-03-11 13:38:07 -0700 | [diff] [blame] | 265 | if self.log_level != "verbose": |
cmtice | 1505b6a | 2014-06-04 14:19:19 -0700 | [diff] [blame] | 266 | self.logger.LogOutput("Pushing image onto machine.") |
Han Shen | ba64928 | 2015-08-05 17:19:55 -0700 | [diff] [blame] | 267 | self.logger.LogOutput("Running image_chromeos.DoImage with %s" |
cmtice | 1390924 | 2014-03-11 13:38:07 -0700 | [diff] [blame] | 268 | % " ".join(image_chromeos_args)) |
Han Shen | ba64928 | 2015-08-05 17:19:55 -0700 | [diff] [blame] | 269 | retval = image_chromeos.DoImage(image_chromeos_args) |
Ahmad Sharif | 0dcbc4b | 2012-02-02 16:37:18 -0800 | [diff] [blame] | 270 | if retval: |
cmtice | 1505b6a | 2014-06-04 14:19:19 -0700 | [diff] [blame] | 271 | cmd = "reboot && exit" |
cmtice | 1390924 | 2014-03-11 13:38:07 -0700 | [diff] [blame] | 272 | if self.log_level != "verbose": |
cmtice | 1505b6a | 2014-06-04 14:19:19 -0700 | [diff] [blame] | 273 | self.logger.LogOutput("reboot & exit.") |
| 274 | self.ce.CrosRunCommand(cmd, machine=machine.name, |
| 275 | chromeos_root=self.chromeos_root) |
Yunlian Jiang | 36f91ad | 2013-03-28 17:13:29 -0700 | [diff] [blame] | 276 | time.sleep(60) |
cmtice | 1390924 | 2014-03-11 13:38:07 -0700 | [diff] [blame] | 277 | if self.log_level != "verbose": |
cmtice | 1505b6a | 2014-06-04 14:19:19 -0700 | [diff] [blame] | 278 | self.logger.LogOutput("Pushing image onto machine.") |
Han Shen | ba64928 | 2015-08-05 17:19:55 -0700 | [diff] [blame] | 279 | self.logger.LogOutput("Running image_chromeos.DoImage with %s" |
cmtice | 1390924 | 2014-03-11 13:38:07 -0700 | [diff] [blame] | 280 | % " ".join(image_chromeos_args)) |
Han Shen | ba64928 | 2015-08-05 17:19:55 -0700 | [diff] [blame] | 281 | retval = image_chromeos.DoImage(image_chromeos_args) |
Yunlian Jiang | 36f91ad | 2013-03-28 17:13:29 -0700 | [diff] [blame] | 282 | if retval: |
Ahmad Sharif | 0dcbc4b | 2012-02-02 16:37:18 -0800 | [diff] [blame] | 283 | raise Exception("Could not image machine: '%s'." % machine.name) |
Ahmad Sharif | 4467f00 | 2012-12-20 12:09:49 -0800 | [diff] [blame] | 284 | else: |
| 285 | self.num_reimages += 1 |
Ahmad Sharif | 0dcbc4b | 2012-02-02 16:37:18 -0800 | [diff] [blame] | 286 | machine.checksum = checksum |
Ahmad Sharif | 4467f00 | 2012-12-20 12:09:49 -0800 | [diff] [blame] | 287 | machine.image = label.chromeos_image |
Han Shen | ba64928 | 2015-08-05 17:19:55 -0700 | [diff] [blame] | 288 | machine.label = label |
Ahmad Sharif | 0dcbc4b | 2012-02-02 16:37:18 -0800 | [diff] [blame] | 289 | |
Caroline Tice | 31fedb0 | 2015-09-14 16:30:37 -0700 | [diff] [blame] | 290 | if not label.chrome_version: |
| 291 | label.chrome_version = self.GetChromeVersion(machine) |
| 292 | |
cmtice | 1505b6a | 2014-06-04 14:19:19 -0700 | [diff] [blame] | 293 | self.ce.log_level = save_ce_log_level |
Ahmad Sharif | 0dcbc4b | 2012-02-02 16:37:18 -0800 | [diff] [blame] | 294 | return retval |
| 295 | |
Ahmad Sharif | 4467f00 | 2012-12-20 12:09:49 -0800 | [diff] [blame] | 296 | def ComputeCommonCheckSum(self, label): |
Caroline Tice | 5ea9f00 | 2015-09-02 12:36:47 -0700 | [diff] [blame] | 297 | # Since this is used for cache lookups before the machines have been |
| 298 | # compared/verified, check here to make sure they all have the same |
| 299 | # checksum (otherwise the cache lookup may not be valid). |
| 300 | common_checksum = None |
Ahmad Sharif | 4467f00 | 2012-12-20 12:09:49 -0800 | [diff] [blame] | 301 | for machine in self.GetMachines(label): |
Caroline Tice | 5ea9f00 | 2015-09-02 12:36:47 -0700 | [diff] [blame] | 302 | # Make sure the machine's checksums are calculated. |
| 303 | if not machine.machine_checksum: |
| 304 | machine.SetUpChecksumInfo() |
| 305 | cs = machine.machine_checksum |
| 306 | # If this is the first machine we've examined, initialize |
| 307 | # common_checksum. |
| 308 | if not common_checksum: |
| 309 | common_checksum = cs |
| 310 | # Make sure this machine's checksum matches our 'common' checksum. |
| 311 | if cs != common_checksum: |
| 312 | raise BadChecksum("Machine checksums do not match!") |
| 313 | self.machine_checksum[label.name] = common_checksum |
Ahmad Sharif | f395c26 | 2012-10-09 17:48:09 -0700 | [diff] [blame] | 314 | |
Ahmad Sharif | 4467f00 | 2012-12-20 12:09:49 -0800 | [diff] [blame] | 315 | def ComputeCommonCheckSumString(self, label): |
Caroline Tice | 5ea9f00 | 2015-09-02 12:36:47 -0700 | [diff] [blame] | 316 | # The assumption is that this function is only called AFTER |
| 317 | # ComputeCommonCheckSum, so there is no need to verify the machines |
| 318 | # are the same here. If this is ever changed, this function should be |
| 319 | # modified to verify that all the machines for a given label are the |
| 320 | # same. |
Ahmad Sharif | 4467f00 | 2012-12-20 12:09:49 -0800 | [diff] [blame] | 321 | for machine in self.GetMachines(label): |
Ahmad Sharif | f395c26 | 2012-10-09 17:48:09 -0700 | [diff] [blame] | 322 | if machine.checksum_string: |
Ahmad Sharif | 4467f00 | 2012-12-20 12:09:49 -0800 | [diff] [blame] | 323 | self.machine_checksum_string[label.name] = machine.checksum_string |
Ahmad Sharif | f395c26 | 2012-10-09 17:48:09 -0700 | [diff] [blame] | 324 | break |
| 325 | |
Ahmad Sharif | 0dcbc4b | 2012-02-02 16:37:18 -0800 | [diff] [blame] | 326 | def _TryToLockMachine(self, cros_machine): |
| 327 | with self._lock: |
| 328 | assert cros_machine, "Machine can't be None" |
| 329 | for m in self._machines: |
Luis Lozano | f81680c | 2013-03-15 14:44:13 -0700 | [diff] [blame] | 330 | if m.name == cros_machine.name: |
| 331 | return |
cmtice | 517dc98 | 2015-06-12 12:22:32 -0700 | [diff] [blame] | 332 | locked = True |
| 333 | if self.locks_dir: |
| 334 | locked = file_lock_machine.Machine(cros_machine.name, |
| 335 | self.locks_dir).Lock(True, |
| 336 | sys.argv[0]) |
| 337 | if locked: |
| 338 | self._machines.append(cros_machine) |
| 339 | command = "cat %s" % CHECKSUM_FILE |
| 340 | ret, out, _ = self.ce.CrosRunCommand( |
| 341 | command, return_output=True, chromeos_root=self.chromeos_root, |
| 342 | machine=cros_machine.name) |
| 343 | if ret == 0: |
| 344 | cros_machine.checksum = out.strip() |
| 345 | elif self.locks_dir: |
| 346 | self.logger.LogOutput("Couldn't lock: %s" % cros_machine.name) |
Ahmad Sharif | 0dcbc4b | 2012-02-02 16:37:18 -0800 | [diff] [blame] | 347 | |
| 348 | # This is called from single threaded mode. |
| 349 | def AddMachine(self, machine_name): |
| 350 | with self._lock: |
| 351 | for m in self._all_machines: |
| 352 | assert m.name != machine_name, "Tried to double-add %s" % machine_name |
cmtice | 1505b6a | 2014-06-04 14:19:19 -0700 | [diff] [blame] | 353 | if self.log_level != "verbose": |
| 354 | self.logger.LogOutput("Setting up remote access to %s" |
| 355 | % machine_name) |
| 356 | self.logger.LogOutput("Checking machine characteristics for %s" |
| 357 | % machine_name) |
cmtice | 1390924 | 2014-03-11 13:38:07 -0700 | [diff] [blame] | 358 | cm = CrosMachine(machine_name, self.chromeos_root, self.log_level) |
Yunlian Jiang | 837e07a | 2013-05-22 16:23:28 -0700 | [diff] [blame] | 359 | if cm.machine_checksum: |
| 360 | self._all_machines.append(cm) |
Ahmad Sharif | f395c26 | 2012-10-09 17:48:09 -0700 | [diff] [blame] | 361 | |
cmtice | 1505b6a | 2014-06-04 14:19:19 -0700 | [diff] [blame] | 362 | |
Yunlian Jiang | e5b673f | 2013-05-23 11:42:53 -0700 | [diff] [blame] | 363 | def RemoveMachine(self, machine_name): |
| 364 | with self._lock: |
| 365 | self._machines = [m for m in self._machines |
| 366 | if m.name != machine_name] |
cmtice | 517dc98 | 2015-06-12 12:22:32 -0700 | [diff] [blame] | 367 | if self.locks_dir: |
| 368 | res = file_lock_machine.Machine(machine_name, |
| 369 | self.locks_dir).Unlock(True) |
| 370 | if not res: |
| 371 | self.logger.LogError("Could not unlock machine: '%s'." |
| 372 | % machine_name) |
Yunlian Jiang | e5b673f | 2013-05-23 11:42:53 -0700 | [diff] [blame] | 373 | |
cmtice | 798a8fa | 2014-05-12 13:56:42 -0700 | [diff] [blame] | 374 | def ForceSameImageToAllMachines(self, label): |
| 375 | machines = self.GetMachines(label) |
cmtice | 798a8fa | 2014-05-12 13:56:42 -0700 | [diff] [blame] | 376 | for m in machines: |
| 377 | self.ImageMachine(m, label) |
| 378 | m.SetUpChecksumInfo() |
| 379 | |
| 380 | def AcquireMachine(self, chromeos_image, label, throw=False): |
Han Shen | ba64928 | 2015-08-05 17:19:55 -0700 | [diff] [blame] | 381 | image_checksum = label.checksum |
Ahmad Sharif | 4467f00 | 2012-12-20 12:09:49 -0800 | [diff] [blame] | 382 | machines = self.GetMachines(label) |
Luis Lozano | f81680c | 2013-03-15 14:44:13 -0700 | [diff] [blame] | 383 | check_interval_time = 120 |
Ahmad Sharif | 0dcbc4b | 2012-02-02 16:37:18 -0800 | [diff] [blame] | 384 | with self._lock: |
| 385 | # Lazily external lock machines |
Luis Lozano | f81680c | 2013-03-15 14:44:13 -0700 | [diff] [blame] | 386 | while self.acquire_timeout >= 0: |
| 387 | for m in machines: |
| 388 | new_machine = m not in self._all_machines |
Ahmad Sharif | 4467f00 | 2012-12-20 12:09:49 -0800 | [diff] [blame] | 389 | self._TryToLockMachine(m) |
Luis Lozano | f81680c | 2013-03-15 14:44:13 -0700 | [diff] [blame] | 390 | if new_machine: |
| 391 | m.released_time = time.time() |
Luis Lozano | f81680c | 2013-03-15 14:44:13 -0700 | [diff] [blame] | 392 | if self.GetAvailableMachines(label): |
| 393 | break |
| 394 | else: |
| 395 | sleep_time = max(1, min(self.acquire_timeout, check_interval_time)) |
| 396 | time.sleep(sleep_time) |
| 397 | self.acquire_timeout -= sleep_time |
| 398 | |
| 399 | if self.acquire_timeout < 0: |
Ahmad Sharif | 0dcbc4b | 2012-02-02 16:37:18 -0800 | [diff] [blame] | 400 | machine_names = [] |
Ahmad Sharif | 4467f00 | 2012-12-20 12:09:49 -0800 | [diff] [blame] | 401 | for machine in machines: |
Ahmad Sharif | 0dcbc4b | 2012-02-02 16:37:18 -0800 | [diff] [blame] | 402 | machine_names.append(machine.name) |
cmtice | 1505b6a | 2014-06-04 14:19:19 -0700 | [diff] [blame] | 403 | self.logger.LogFatal("Could not acquire any of the " |
| 404 | "following machines: '%s'" |
| 405 | % ", ".join(machine_names)) |
Ahmad Sharif | 0dcbc4b | 2012-02-02 16:37:18 -0800 | [diff] [blame] | 406 | |
| 407 | ### for m in self._machines: |
| 408 | ### if (m.locked and time.time() - m.released_time < 10 and |
| 409 | ### m.checksum == image_checksum): |
| 410 | ### return None |
Ahmad Sharif | 4467f00 | 2012-12-20 12:09:49 -0800 | [diff] [blame] | 411 | for m in [machine for machine in self.GetAvailableMachines(label) |
| 412 | if not machine.locked]: |
cmtice | 0cc4e77 | 2014-01-30 15:52:37 -0800 | [diff] [blame] | 413 | if image_checksum and (m.checksum == image_checksum): |
Ahmad Sharif | 0dcbc4b | 2012-02-02 16:37:18 -0800 | [diff] [blame] | 414 | m.locked = True |
Yunlian Jiang | 04dc5dc | 2013-04-23 15:05:05 -0700 | [diff] [blame] | 415 | m.test_run = threading.current_thread() |
Ahmad Sharif | 0dcbc4b | 2012-02-02 16:37:18 -0800 | [diff] [blame] | 416 | return m |
Ahmad Sharif | 4467f00 | 2012-12-20 12:09:49 -0800 | [diff] [blame] | 417 | for m in [machine for machine in self.GetAvailableMachines(label) |
| 418 | if not machine.locked]: |
Ahmad Sharif | 0dcbc4b | 2012-02-02 16:37:18 -0800 | [diff] [blame] | 419 | if not m.checksum: |
| 420 | m.locked = True |
Yunlian Jiang | 04dc5dc | 2013-04-23 15:05:05 -0700 | [diff] [blame] | 421 | m.test_run = threading.current_thread() |
Ahmad Sharif | 0dcbc4b | 2012-02-02 16:37:18 -0800 | [diff] [blame] | 422 | return m |
| 423 | # This logic ensures that threads waiting on a machine will get a machine |
| 424 | # with a checksum equal to their image over other threads. This saves time |
| 425 | # when crosperf initially assigns the machines to threads by minimizing |
| 426 | # the number of re-images. |
| 427 | # TODO(asharif): If we centralize the thread-scheduler, we wont need this |
| 428 | # code and can implement minimal reimaging code more cleanly. |
Ahmad Sharif | 4467f00 | 2012-12-20 12:09:49 -0800 | [diff] [blame] | 429 | for m in [machine for machine in self.GetAvailableMachines(label) |
| 430 | if not machine.locked]: |
Yunlian Jiang | a844671 | 2015-02-26 10:25:11 -0800 | [diff] [blame] | 431 | if time.time() - m.released_time > 15: |
| 432 | # The release time gap is too large, so it is probably in the start |
| 433 | # stage, we need to reset the released_time. |
| 434 | m.released_time = time.time() |
| 435 | elif time.time() - m.released_time > 8: |
Ahmad Sharif | 0dcbc4b | 2012-02-02 16:37:18 -0800 | [diff] [blame] | 436 | m.locked = True |
Yunlian Jiang | 04dc5dc | 2013-04-23 15:05:05 -0700 | [diff] [blame] | 437 | m.test_run = threading.current_thread() |
Ahmad Sharif | 0dcbc4b | 2012-02-02 16:37:18 -0800 | [diff] [blame] | 438 | return m |
| 439 | return None |
| 440 | |
Ahmad Sharif | 4467f00 | 2012-12-20 12:09:49 -0800 | [diff] [blame] | 441 | def GetAvailableMachines(self, label=None): |
| 442 | if not label: |
| 443 | return self._machines |
| 444 | return [m for m in self._machines if m.name in label.remote] |
| 445 | |
| 446 | def GetMachines(self, label=None): |
| 447 | if not label: |
| 448 | return self._all_machines |
| 449 | return [m for m in self._all_machines if m.name in label.remote] |
Ahmad Sharif | 0dcbc4b | 2012-02-02 16:37:18 -0800 | [diff] [blame] | 450 | |
| 451 | def ReleaseMachine(self, machine): |
| 452 | with self._lock: |
| 453 | for m in self._machines: |
| 454 | if machine.name == m.name: |
| 455 | assert m.locked == True, "Tried to double-release %s" % m.name |
| 456 | m.released_time = time.time() |
| 457 | m.locked = False |
| 458 | m.status = "Available" |
| 459 | break |
| 460 | |
cmtice | 517dc98 | 2015-06-12 12:22:32 -0700 | [diff] [blame] | 461 | def Cleanup(self): |
| 462 | with self._lock: |
| 463 | # Unlock all machines (via file lock) |
| 464 | for m in self._machines: |
| 465 | res = file_lock_machine.Machine(m.name, self.locks_dir).Unlock(True) |
| 466 | |
| 467 | if not res: |
| 468 | self.logger.LogError("Could not unlock machine: '%s'." |
| 469 | % m.name) |
| 470 | |
Ahmad Sharif | 0dcbc4b | 2012-02-02 16:37:18 -0800 | [diff] [blame] | 471 | def __str__(self): |
| 472 | with self._lock: |
| 473 | l = ["MachineManager Status:"] |
| 474 | for m in self._machines: |
| 475 | l.append(str(m)) |
| 476 | return "\n".join(l) |
| 477 | |
def AsString(self):
    """Render the machines in `self._machines` as a fixed-width table.

    Each row shows the machine name, the name of its current test run, its
    lock flag, the last timeline event of that test run and its checksum.
    Machines without a test run get empty test-name and status columns.

    Returns:
      A string beginning with "Machine Status:" followed by the header row
      and one row per machine. A row that cannot be formatted is emitted as
      an empty line.
    """
    row_fmt = "%-30s %-10s %-4s %-25s %-32s"
    with self._lock:
        rows = [row_fmt % ("Machine", "Thread", "Lock", "Status", "Checksum")]
        for machine in self._machines:
            run_name = ""
            run_status = ""
            if machine.test_run:
                run_name = machine.test_run.name
                run_status = machine.test_run.timeline.GetLastEvent()

            try:
                row = row_fmt % (machine.name, run_name, machine.locked,
                                 run_status, machine.checksum)
            except Exception:
                # Best effort: a machine that cannot be rendered still gets
                # a (blank) row so the table keeps one line per machine.
                row = ""
            rows.append(row)
        return "Machine Status:\n%s" % "\n".join(rows)
| 502 | |
def GetAllCPUInfo(self, labels):
    """Get cpuinfo for labels, merging labels whose cpuinfo is the same.

    For each label, the first machine in `self._all_machines` whose name is
    in `label.remote` supplies the cpuinfo for that label. Labels with no
    matching machine are left out of the report.

    Args:
      labels: Iterable of objects exposing `name` and `remote`.

    Returns:
      A string with one section per distinct cpuinfo: the space-separated
      label names, a dashed separator line, the cpuinfo text and three
      newlines. Sections appear in the order their cpuinfo was first seen.
      Returns "" when no label matches any machine.
    """
    # Imported locally so this method does not depend on a module-level
    # import. OrderedDict keeps the section order deterministic on
    # interpreters whose plain dict does not preserve insertion order.
    from collections import OrderedDict

    labels_by_cpuinfo = OrderedDict()
    for label in labels:
        for machine in self._all_machines:
            if machine.name in label.remote:
                labels_by_cpuinfo.setdefault(machine.cpuinfo,
                                             []).append(label.name)
                # Only the first matching machine represents the label.
                break

    pieces = []
    for cpuinfo, label_names in labels_by_cpuinfo.items():
        pieces.append(" ".join(label_names))
        pieces.append("\n-------------------\n")
        pieces.append(cpuinfo)
        pieces.append("\n\n\n")
    return "".join(pieces)
Ahmad Sharif | 0dcbc4b | 2012-02-02 16:37:18 -0800 | [diff] [blame] | 521 | |
Ahmad Sharif | 0dcbc4b | 2012-02-02 16:37:18 -0800 | [diff] [blame] | 522 | |
class MockCrosMachine(CrosMachine):
    """Test double for CrosMachine that fills in its fields directly.

    The constructor does not call CrosMachine.__init__; every attribute is
    assigned here from the arguments or from fixed defaults.
    """

    def __init__(self, name, chromeos_root, log_level):
        """Initialise the mock machine.

        Args:
          name: Machine name; also the source of the fake checksum string.
          chromeos_root: Stored as-is on the instance.
          log_level: Stored as-is on the instance.
        """
        self.name = name
        self.image = None
        self.checksum = None
        self.locked = False
        self.released_time = time.time()
        self.test_run = None
        self.chromeos_root = chromeos_root
        # Raw string so that "\d" reaches the regex engine unchanged and is
        # not treated as an (invalid) string escape sequence.
        self.checksum_string = re.sub(r"\d", "", name)
        # In test, we assume "lumpy1", "lumpy2" are the same machine: with
        # the digits stripped, both names produce the same checksum string.
        # NOTE(review): _GetMD5Checksum is presumably inherited from
        # CrosMachine -- it is not defined in this class.
        self.machine_checksum = self._GetMD5Checksum(self.checksum_string)
        self.log_level = log_level
        self.label = None

    def IsReachable(self):
        """Always report the mock machine as reachable."""
        return True
| 540 | |
Ahmad Sharif | 4467f00 | 2012-12-20 12:09:49 -0800 | [diff] [blame] | 541 | |
class MockMachineManager(MachineManager):
    """Test double for MachineManager backed by MockCrosMachine objects."""

    def __init__(self, chromeos_root, acquire_timeout, log_level,
                 dummy_locks_dir):
        """Initialise the mock manager.

        Args:
          chromeos_root: Forwarded to MachineManager.__init__.
          acquire_timeout: Forwarded to MachineManager.__init__.
          log_level: Forwarded to MachineManager.__init__.
          dummy_locks_dir: Accepted for signature compatibility but ignored;
            file_lock_machine.Machine.LOCKS_DIR is passed to the base class
            instead.
        """
        super(MockMachineManager, self).__init__(
            chromeos_root, acquire_timeout, log_level,
            file_lock_machine.Machine.LOCKS_DIR)

    def _TryToLockMachine(self, cros_machine):
        """Record `cros_machine` in `self._machines` with an empty checksum."""
        self._machines.append(cros_machine)
        cros_machine.checksum = ""

    def AddMachine(self, machine_name):
        """Create a MockCrosMachine for `machine_name` and register it.

        Args:
          machine_name: Name of the machine to add.

        Raises:
          AssertionError: If a machine with this name was already added, or
            if the new mock machine has no machine checksum.
        """
        with self._lock:
            for m in self._all_machines:
                assert m.name != machine_name, (
                    "Tried to double-add %s" % machine_name)
            cm = MockCrosMachine(machine_name, self.chromeos_root,
                                 self.log_level)
            assert cm.machine_checksum, (
                "Could not find checksum for machine %s" % machine_name)
            # In the original MachineManager, the test is
            # 'if cm.machine_checksum:' - if a machine is unreachable, then
            # its machine_checksum is None. Here we cannot do this, because
            # machine_checksum is always faked, so we directly test
            # cm.IsReachable, which is properly mocked.
            if cm.IsReachable():
                self._all_machines.append(cm)

    def AcquireMachine(self, chromeos_image, label, throw=False):
        """Lock and return the first unlocked machine, or None if none is free.

        `chromeos_image`, `label` and `throw` are accepted but not used.
        """
        for machine in self._all_machines:
            if not machine.locked:
                machine.locked = True
                return machine
        return None

    def ImageMachine(self, machine_name, label):
        """Do nothing and return 0 (presumably the success code)."""
        return 0

    def ReleaseMachine(self, machine):
        """Clear the lock flag on `machine`."""
        machine.locked = False

    def GetMachines(self, label=None):
        """Return every added machine; `label` is ignored.

        `label` defaults to None so that the mock can be called exactly like
        MachineManager.GetMachines, whose `label` argument is optional.
        """
        return self._all_machines

    def GetAvailableMachines(self, label):
        """Return every added machine; `label` is ignored."""
        return self._all_machines