import os, select
import kvm_utils, kvm_vm, kvm_subprocess


class scheduler:
    """
    A scheduler that manages several parallel test execution pipelines on a
    single host.
    """

    def __init__(self, tests, num_workers, total_cpus, total_mem, bindir):
        """
        Initialize the class.

        @param tests: A list of test dictionaries.
        @param num_workers: The number of workers (pipelines).
        @param total_cpus: The total number of CPUs to dedicate to tests.
        @param total_mem: The total amount of memory to dedicate to tests.
        @param bindir: The directory where environment files reside.
        """
        self.tests = tests
        self.num_workers = num_workers
        self.total_cpus = total_cpus
        self.total_mem = total_mem
        self.bindir = bindir
        # Pipes -- s stands for scheduler, w stands for worker
        self.s2w = [os.pipe() for i in range(num_workers)]
        self.w2s = [os.pipe() for i in range(num_workers)]
        self.s2w_r = [os.fdopen(r, "r", 0) for r, w in self.s2w]
        self.s2w_w = [os.fdopen(w, "w", 0) for r, w in self.s2w]
        self.w2s_r = [os.fdopen(r, "r", 0) for r, w in self.w2s]
        self.w2s_w = [os.fdopen(w, "w", 0) for r, w in self.w2s]
        # "Personal" worker dicts contain modifications that are applied
        # specifically to each worker.  For example, each worker must use a
        # different environment file and a different MAC address pool.
        self.worker_dicts = [{"env": "env%d" % i} for i in range(num_workers)]


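    # Message protocol (one newline-terminated message per line):
    #   worker -> scheduler:  "ready", "done <test_index> <status>",
    #                         "cleanup_done"
    #   scheduler -> worker:  "run <test_index>", "cleanup", "terminate"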
    def worker(self, index, run_test_func):
        """
        The worker function.

        Waits for commands from the scheduler and processes them.

        @param index: The index of this worker (in the range 0..num_workers-1).
        @param run_test_func: A function to be called to run a test
                (e.g. job.run_test).
        """
        r = self.s2w_r[index]
        w = self.w2s_w[index]
        self_dict = self.worker_dicts[index]

        # Inform the scheduler this worker is ready
        w.write("ready\n")

        while True:
            line = r.readline()
            if not line:
                # EOF -- the scheduler went away without saying "terminate"
                break
            cmd = line.split()
            if not cmd:
                continue

            # The scheduler wants this worker to run a test
            if cmd[0] == "run":
                test_index = int(cmd[1])
                test = self.tests[test_index].copy()
                test.update(self_dict)
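                # Carve out this worker's share of any pooled resources
                # (e.g. the MAC address pool mentioned above)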
                test = kvm_utils.get_sub_pool(test, index, self.num_workers)
                test_iterations = int(test.get("iterations", 1))
                status = run_test_func("kvm", params=test,
                                       tag=test.get("shortname"),
                                       iterations=test_iterations)
                w.write("done %s %s\n" % (test_index, status))
                w.write("ready\n")

            # The scheduler wants this worker to free its used resources
            elif cmd[0] == "cleanup":
                env_filename = os.path.join(self.bindir, self_dict["env"])
                env = kvm_utils.load_env(env_filename, {})
                for obj in env.values():
                    if isinstance(obj, kvm_vm.VM):
                        obj.destroy()
                    elif isinstance(obj, kvm_subprocess.kvm_spawn):
                        obj.close()
                kvm_utils.dump_env(env, env_filename)
                w.write("cleanup_done\n")
                w.write("ready\n")

            # There's no more work for this worker
            elif cmd[0] == "terminate":
                break


    def scheduler(self):
        """
        The scheduler function.

        Sends commands to workers, telling them to run tests, clean up or
        terminate execution.
        """
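        # idle_workers: workers waiting to be given a test
        # closing_workers: workers currently freeing their resources
        # test_status: "waiting", "running", "pass" or "fail" per test
        # test_worker: the worker each test is assigned to (or None)
        # used_cpus/used_mem: resources currently held by each worker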
        idle_workers = []
        closing_workers = []
        test_status = ["waiting"] * len(self.tests)
        test_worker = [None] * len(self.tests)
        used_cpus = [0] * self.num_workers
        used_mem = [0] * self.num_workers

        while True:
            # Wait for a message from a worker
            r, w, x = select.select(self.w2s_r, [], [])

            someone_is_ready = False

            for pipe in r:
                worker_index = self.w2s_r.index(pipe)
                msg = pipe.readline().split()
                if not msg:
                    continue

                # A worker is ready -- add it to the idle_workers list
                if msg[0] == "ready":
                    idle_workers.append(worker_index)
                    someone_is_ready = True

                # A worker completed a test
                elif msg[0] == "done":
                    test_index = int(msg[1])
                    test = self.tests[test_index]
                    # run_test_func() returns a boolean, which arrives here
                    # as the string "True" or "False"; compare rather than
                    # eval() data read from a pipe
                    status = (msg[2] == "True")
                    test_status[test_index] = ("fail", "pass")[status]
                    # If the test failed, mark all tests that depend on it
                    # as "fail" too
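                    # (dependencies match by substring: a test listing
                    # "install" as a dependency depends on any test whose
                    # name contains "install")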
                    if not status:
                        for i, other_test in enumerate(self.tests):
                            for dep in other_test.get("depend", []):
                                if dep in test["name"]:
                                    test_status[i] = "fail"

                # A worker is done shutting down its VMs and other processes
                elif msg[0] == "cleanup_done":
                    used_cpus[worker_index] = 0
                    used_mem[worker_index] = 0
                    closing_workers.remove(worker_index)

            if not someone_is_ready:
                continue

            for worker in idle_workers[:]:
                # Find a test for this worker
                test_found = False
                for i, test in enumerate(self.tests):
                    # We only want "waiting" tests
                    if test_status[i] != "waiting":
                        continue
                    # Make sure the test isn't assigned to another worker
                    if test_worker[i] is not None and test_worker[i] != worker:
                        continue
                    # Make sure the test's dependencies are satisfied
                    dependencies_satisfied = True
                    for dep in test.get("depend", []):
                        dependencies = [j for j, t in enumerate(self.tests)
                                        if dep in t["name"]]
                        bad_status_deps = [j for j in dependencies
                                           if test_status[j] != "pass"]
                        if bad_status_deps:
                            dependencies_satisfied = False
                            break
                    if not dependencies_satisfied:
                        continue
                    # Make sure we have enough resources to run the test
                    test_used_cpus = int(test.get("used_cpus", 1))
                    test_used_mem = int(test.get("used_mem", 128))
                    # First make sure the other workers aren't using too many
                    # CPUs (not including the workers currently shutting down)
                    uc = (sum(used_cpus) - used_cpus[worker] -
                          sum(used_cpus[i] for i in closing_workers))
                    if uc and uc + test_used_cpus > self.total_cpus:
                        continue
                    # ... or too much memory
                    um = (sum(used_mem) - used_mem[worker] -
                          sum(used_mem[i] for i in closing_workers))
                    if um and um + test_used_mem > self.total_mem:
                        continue
                    # If we reached this point it means there are, or will
                    # soon be, enough resources to run the test
                    test_found = True
                    # Now check if the test can be run right now, i.e. if the
                    # other workers, including the ones currently shutting
                    # down, aren't using too many CPUs
                    uc = (sum(used_cpus) - used_cpus[worker])
                    if uc and uc + test_used_cpus > self.total_cpus:
                        continue
                    # ... or too much memory
                    um = (sum(used_mem) - used_mem[worker])
                    if um and um + test_used_mem > self.total_mem:
                        continue
                    # Everything is OK -- run the test
                    test_status[i] = "running"
                    test_worker[i] = worker
                    idle_workers.remove(worker)
                    # Update used_cpus and used_mem
                    used_cpus[worker] = test_used_cpus
                    used_mem[worker] = test_used_mem
                    # Assign all related tests to this worker
                    for j, other_test in enumerate(self.tests):
                        for other_dep in other_test.get("depend", []):
                            # All tests that depend on this test
                            if other_dep in test["name"]:
                                test_worker[j] = worker
                                break
                            # ... and all tests that share a dependency
                            # with this test
                            for dep in test.get("depend", []):
                                if dep in other_dep or other_dep in dep:
                                    test_worker[j] = worker
                                    break
                    # Tell the worker to run the test
                    self.s2w_w[worker].write("run %s\n" % i)
                    break

                # If there won't be any tests for this worker to run soon,
                # tell the worker to free its used resources
                if not test_found and (used_cpus[worker] or used_mem[worker]):
                    self.s2w_w[worker].write("cleanup\n")
                    idle_workers.remove(worker)
                    closing_workers.append(worker)

            # If there are no more new tests to run, terminate the workers
            # and the scheduler
            if len(idle_workers) == self.num_workers:
                for worker in idle_workers:
                    self.s2w_w[worker].write("terminate\n")
                break
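

# Minimal usage sketch (hypothetical; in practice the autotest control file
# wires this up).  Assumes `tests` is a list of test param dicts and that
# job.run_test and test.bindir are available in the calling context:
#
#     s = scheduler(tests, num_workers=4, total_cpus=4,
#                   total_mem=4096, bindir=test.bindir)
#     for i in range(s.num_workers):
#         if os.fork() == 0:          # child process becomes a worker
#             s.worker(i, job.run_test)
#             os._exit(0)
#     s.scheduler()                   # parent runs the scheduler loop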