import os, select
import kvm_utils, kvm_vm, kvm_subprocess


class scheduler:
    """
    A scheduler that manages several parallel test execution pipelines on a
    single host.
    """
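    # Rough usage sketch (the actual wiring lives in the harness's control
    # file; job.parallel and job.run_test are assumed to be supplied by the
    # surrounding autotest job object, so treat this as illustrative only):
    #
    #     s = scheduler(tests, num_workers, total_cpus, total_mem, bindir)
    #     job.parallel([s.scheduler],
    #                  *[[s.worker, i, job.run_test] for i in range(num_workers)])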

    def __init__(self, tests, num_workers, total_cpus, total_mem, bindir):
        """
        Initialize the class.

        @param tests: A list of test dictionaries.
        @param num_workers: The number of workers (pipelines).
        @param total_cpus: The total number of CPUs to dedicate to tests.
        @param total_mem: The total amount of memory to dedicate to tests.
        @param bindir: The directory where environment files reside.
        """
        self.tests = tests
        self.num_workers = num_workers
        self.total_cpus = total_cpus
        self.total_mem = total_mem
        self.bindir = bindir
        # Pipes -- s stands for scheduler, w stands for worker
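        # Each worker gets one pipe pair.  The messages exchanged over them
        # (see worker() and scheduler() below) are newline-terminated:
        #   scheduler -> worker: "run <test_index>", "cleanup", "terminate"
        #   worker -> scheduler: "ready", "done <test_index> <status>",
        #                        "cleanup_done"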
        self.s2w = [os.pipe() for i in range(num_workers)]
        self.w2s = [os.pipe() for i in range(num_workers)]
        self.s2w_r = [os.fdopen(r, "r", 0) for r, w in self.s2w]
        self.s2w_w = [os.fdopen(w, "w", 0) for r, w in self.s2w]
        self.w2s_r = [os.fdopen(r, "r", 0) for r, w in self.w2s]
        self.w2s_w = [os.fdopen(w, "w", 0) for r, w in self.w2s]
        # "Personal" worker dicts contain modifications that are applied
        # specifically to each worker.  For example, each worker must use a
        # different environment file and a different MAC address pool.
        self.worker_dicts = [{"env": "env%d" % i} for i in range(num_workers)]


    def worker(self, index, run_test_func):
        """
        The worker function.

        Waits for commands from the scheduler and processes them.

        @param index: The index of this worker (in the range 0..num_workers-1).
        @param run_test_func: A function to be called to run a test
                (e.g. job.run_test).
        """
        r = self.s2w_r[index]
        w = self.w2s_w[index]
        self_dict = self.worker_dicts[index]

        # Inform the scheduler this worker is ready
        w.write("ready\n")

        while True:
            cmd = r.readline().split()
            if not cmd:
                continue

            # The scheduler wants this worker to run a test
            if cmd[0] == "run":
                test_index = int(cmd[1])
                test = self.tests[test_index].copy()
                test.update(self_dict)
                test = kvm_utils.get_sub_pool(test, index, self.num_workers)
                test_iterations = int(test.get("iterations", 1))
                status = run_test_func("kvm", params=test,
                                       tag=test.get("shortname"),
                                       iterations=test_iterations)
                w.write("done %s %s\n" % (test_index, status))
                w.write("ready\n")

            # The scheduler wants this worker to free its used resources
            elif cmd[0] == "cleanup":
                env_filename = os.path.join(self.bindir, self_dict["env"])
                env = kvm_utils.Env(env_filename)
                for obj in env.values():
                    if isinstance(obj, kvm_vm.VM):
                        obj.destroy()
                    elif isinstance(obj, kvm_subprocess.Spawn):
                        obj.close()
                env.save()
                w.write("cleanup_done\n")
                w.write("ready\n")

            # There's no more work for this worker
            elif cmd[0] == "terminate":
                break


    def scheduler(self):
        """
        The scheduler function.

        Sends commands to workers, telling them to run tests, clean up or
        terminate execution.
        """
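        # Bookkeeping used by the loop below:
        #   test_status[i]  -- "waiting", "running", "pass" or "fail"
        #   test_worker[i]  -- worker the test is assigned to, or None
        #   used_cpus[w]/used_mem[w] -- resources claimed by worker w's test
        #   idle_workers    -- workers currently waiting for a command
        #   closing_workers -- workers told to clean up that have not yet
        #                      reported "cleanup_done"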
        idle_workers = []
        closing_workers = []
        test_status = ["waiting"] * len(self.tests)
        test_worker = [None] * len(self.tests)
        used_cpus = [0] * self.num_workers
        used_mem = [0] * self.num_workers

        while True:
            # Wait for a message from a worker
            r, w, x = select.select(self.w2s_r, [], [])

            someone_is_ready = False

            for pipe in r:
                worker_index = self.w2s_r.index(pipe)
                msg = pipe.readline().split()
                if not msg:
                    continue

                # A worker is ready -- add it to the idle_workers list
                if msg[0] == "ready":
                    idle_workers.append(worker_index)
                    someone_is_ready = True

                # A worker completed a test
                elif msg[0] == "done":
                    test_index = int(msg[1])
                    test = self.tests[test_index]
                    status = int(eval(msg[2]))
                    test_status[test_index] = ("fail", "pass")[status]
                    # If the test failed, mark all dependent tests as "failed" too
                    if not status:
                        for i, other_test in enumerate(self.tests):
                            for dep in other_test.get("depend", []):
                                if dep in test["name"]:
                                    test_status[i] = "fail"

                # A worker is done shutting down its VMs and other processes
                elif msg[0] == "cleanup_done":
                    used_cpus[worker_index] = 0
                    used_mem[worker_index] = 0
                    closing_workers.remove(worker_index)

            if not someone_is_ready:
                continue

            for worker in idle_workers[:]:
                # Find a test for this worker
                test_found = False
                for i, test in enumerate(self.tests):
                    # We only want "waiting" tests
                    if test_status[i] != "waiting":
                        continue
                    # Make sure the test isn't assigned to another worker
                    if test_worker[i] is not None and test_worker[i] != worker:
                        continue
                    # Make sure the test's dependencies are satisfied
                    dependencies_satisfied = True
                    for dep in test["depend"]:
                        dependencies = [j for j, t in enumerate(self.tests)
                                        if dep in t["name"]]
                        bad_status_deps = [j for j in dependencies
                                           if test_status[j] != "pass"]
                        if bad_status_deps:
                            dependencies_satisfied = False
                            break
                    if not dependencies_satisfied:
                        continue
                    # Make sure we have enough resources to run the test
                    test_used_cpus = int(test.get("used_cpus", 1))
                    test_used_mem = int(test.get("used_mem", 128))
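                    # Two capacity checks follow: the first ignores workers
                    # that are cleaning up (their resources will be free
                    # soon) and decides whether the test can fit at all; the
                    # second counts them and decides whether it fits right
                    # now.  For example, with total_cpus = 4, another worker
                    # running a 2-CPU test and a closing worker still holding
                    # 2 CPUs, a 2-CPU test passes the first check but fails
                    # the second, so it is only dispatched on a later pass,
                    # after that worker's "cleanup_done" arrives.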
                    # First make sure the other workers aren't using too many
                    # CPUs (not including the workers currently shutting down)
                    uc = (sum(used_cpus) - used_cpus[worker] -
                          sum(used_cpus[i] for i in closing_workers))
                    if uc and uc + test_used_cpus > self.total_cpus:
                        continue
                    # ... or too much memory
                    um = (sum(used_mem) - used_mem[worker] -
                          sum(used_mem[i] for i in closing_workers))
                    if um and um + test_used_mem > self.total_mem:
                        continue
                    # If we reached this point it means there are, or will
                    # soon be, enough resources to run the test
                    test_found = True
                    # Now check if the test can be run right now, i.e. if the
                    # other workers, including the ones currently shutting
                    # down, aren't using too many CPUs
                    uc = (sum(used_cpus) - used_cpus[worker])
                    if uc and uc + test_used_cpus > self.total_cpus:
                        continue
                    # ... or too much memory
                    um = (sum(used_mem) - used_mem[worker])
                    if um and um + test_used_mem > self.total_mem:
                        continue
                    # Everything is OK -- run the test
                    test_status[i] = "running"
                    test_worker[i] = worker
                    idle_workers.remove(worker)
                    # Update used_cpus and used_mem
                    used_cpus[worker] = test_used_cpus
                    used_mem[worker] = test_used_mem
                    # Assign all related tests to this worker
                    for j, other_test in enumerate(self.tests):
                        for other_dep in other_test["depend"]:
                            # All tests that depend on this test
                            if other_dep in test["name"]:
                                test_worker[j] = worker
                                break
                            # ... and all tests that share a dependency
                            # with this test
                            for dep in test["depend"]:
                                if dep in other_dep or other_dep in dep:
                                    test_worker[j] = worker
                                    break
                    # Tell the worker to run the test
                    self.s2w_w[worker].write("run %s\n" % i)
                    break

                # If there won't be any tests for this worker to run soon, tell
                # the worker to free its used resources
                if not test_found and (used_cpus[worker] or used_mem[worker]):
                    self.s2w_w[worker].write("cleanup\n")
                    idle_workers.remove(worker)
                    closing_workers.append(worker)

            # If there are no more new tests to run, terminate the workers and
            # the scheduler
            if len(idle_workers) == self.num_workers:
                for worker in idle_workers:
                    self.s2w_w[worker].write("terminate\n")
                break