blob: 194c231236736f7c3ed09205683653032c2e666b [file] [log] [blame]
sewardje663cb92002-04-12 10:26:32 +00001
2/*--------------------------------------------------------------------*/
3/*--- A user-space pthreads implementation. vg_scheduler.c ---*/
4/*--------------------------------------------------------------------*/
5
6/*
7 This file is part of Valgrind, an x86 protected-mode emulator
8 designed for debugging and profiling binaries on x86-Unixes.
9
10 Copyright (C) 2000-2002 Julian Seward
11 jseward@acm.org
sewardje663cb92002-04-12 10:26:32 +000012
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License as
15 published by the Free Software Foundation; either version 2 of the
16 License, or (at your option) any later version.
17
18 This program is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with this program; if not, write to the Free Software
25 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
26 02111-1307, USA.
27
28 The GNU General Public License is contained in the file LICENSE.
29*/
30
31#include "vg_include.h"
32#include "vg_constants.h"
33
34#include "valgrind.h" /* for VG_USERREQ__MAKE_NOACCESS and
35 VG_USERREQ__DO_LEAK_CHECK */
36
sewardj77e466c2002-04-14 02:29:29 +000037/* BORKAGE/ISSUES as of 14 Apr 02
sewardje663cb92002-04-12 10:26:32 +000038
sewardj77e466c2002-04-14 02:29:29 +000039Note! This pthreads implementation is so poor as to not be
40suitable for use by anyone at all!
sewardje663cb92002-04-12 10:26:32 +000041
sewardj77e466c2002-04-14 02:29:29 +000042- Currently, when a signal is run, just the ThreadStatus.status fields
43 are saved in the signal frame, along with the CPU state. Question:
44 should I also save and restore:
45 ThreadStatus.joiner
46 ThreadStatus.waited_on_mid
47 ThreadStatus.awaken_at
48 ThreadStatus.retval
49 Currently unsure, and so am not doing so.
sewardje663cb92002-04-12 10:26:32 +000050
sewardj77e466c2002-04-14 02:29:29 +000051- Signals interrupting read/write and nanosleep: SA_RESTART settings.
52 Read/write correctly return with EINTR when SA_RESTART isn't
53 specified and they are interrupted by a signal. nanosleep just
54 pretends signals don't exist -- should be fixed.
sewardje663cb92002-04-12 10:26:32 +000055
sewardj75fe1892002-04-14 02:46:33 +000056- Read/write syscall starts: don't crap out when the initial
57 nonblocking read/write returns an error.
sewardj8937c812002-04-12 20:12:20 +000058
sewardj9a199dc2002-04-14 13:01:38 +000059- Get rid of restrictions re use of sigaltstack; they are no longer
60 needed.
61
sewardj6072c362002-04-19 14:40:57 +000062- Fix signals properly, so that each thread has its own blocking mask.
63 Currently this isn't done, and (worse?) signals are delivered to
64 Thread 1 (the root thread) regardless.
65
66 So, what's the deal with signals and mutexes? If a thread is
67 blocked on a mutex, or for a condition variable for that matter, can
68 signals still be delivered to it? This has serious consequences --
69 deadlocks, etc.
70
sewardje462e202002-04-13 04:09:07 +000071*/
sewardje663cb92002-04-12 10:26:32 +000072
73
74/* ---------------------------------------------------------------------
75 Types and globals for the scheduler.
76 ------------------------------------------------------------------ */
77
78/* type ThreadId is defined in vg_include.h. */
79
80/* struct ThreadState is defined in vg_include.h. */
81
sewardj6072c362002-04-19 14:40:57 +000082/* Private globals. A statically allocated array of threads. NOTE:
83 [0] is never used, to simplify the simulation of initialisers for
84 LinuxThreads. */
sewardje663cb92002-04-12 10:26:32 +000085static ThreadState vg_threads[VG_N_THREADS];
86
sewardj1e8cdc92002-04-18 11:37:52 +000087/* The tid of the thread currently in VG_(baseBlock). */
88static Int vg_tid_currently_in_baseBlock = VG_INVALID_THREADID;
89
sewardje663cb92002-04-12 10:26:32 +000090
91/* vg_oursignalhandler() might longjmp(). Here's the jmp_buf. */
92jmp_buf VG_(scheduler_jmpbuf);
93/* ... and if so, here's the signal which caused it to do so. */
94Int VG_(longjmpd_on_signal);
95
96
97/* Machinery to keep track of which threads are waiting on which
98 fds. */
99typedef
100 struct {
101 /* The thread which made the request. */
102 ThreadId tid;
103
104 /* The next two fields describe the request. */
105 /* File descriptor waited for. -1 means this slot is not in use */
106 Int fd;
107 /* The syscall number the fd is used in. */
108 Int syscall_no;
109
110 /* False => still waiting for select to tell us the fd is ready
111 to go. True => the fd is ready, but the results have not yet
112 been delivered back to the calling thread. Once the latter
113 happens, this entire record is marked as no longer in use, by
114 making the fd field be -1. */
115 Bool ready;
116 }
117 VgWaitedOnFd;
118
119static VgWaitedOnFd vg_waiting_fds[VG_N_WAITING_FDS];
120
121
sewardje663cb92002-04-12 10:26:32 +0000122/* Forwards */
123static void do_nontrivial_clientreq ( ThreadId tid );
124
sewardj6072c362002-04-19 14:40:57 +0000125static void scheduler_sanity ( void );
126
sewardje663cb92002-04-12 10:26:32 +0000127
128/* ---------------------------------------------------------------------
129 Helper functions for the scheduler.
130 ------------------------------------------------------------------ */
131
sewardj604ec3c2002-04-18 22:38:41 +0000132static __inline__
133Bool is_valid_tid ( ThreadId tid )
134{
135 /* tid is unsigned, hence no < 0 test. */
sewardj6072c362002-04-19 14:40:57 +0000136 if (tid == 0) return False;
sewardj604ec3c2002-04-18 22:38:41 +0000137 if (tid >= VG_N_THREADS) return False;
sewardj604ec3c2002-04-18 22:38:41 +0000138 return True;
139}
140
141
sewardj1e8cdc92002-04-18 11:37:52 +0000142/* For constructing error messages only: try and identify a thread
143 whose stack this address currently falls within, or return
144 VG_INVALID_THREADID if it doesn't. A small complication is dealing
145 with any currently VG_(baseBlock)-resident thread.
146*/
147ThreadId VG_(identify_stack_addr)( Addr a )
148{
149 ThreadId tid, tid_to_skip;
150
151 tid_to_skip = VG_INVALID_THREADID;
152
153 /* First check to see if there's a currently-loaded thread in
154 VG_(baseBlock). */
155 if (vg_tid_currently_in_baseBlock != VG_INVALID_THREADID) {
156 tid = vg_tid_currently_in_baseBlock;
157 if (VG_(baseBlock)[VGOFF_(m_esp)] <= a
158 && a <= vg_threads[tid].stack_highest_word)
159 return tid;
160 else
161 tid_to_skip = tid;
162 }
163
sewardj6072c362002-04-19 14:40:57 +0000164 for (tid = 1; tid < VG_N_THREADS; tid++) {
sewardj1e8cdc92002-04-18 11:37:52 +0000165 if (vg_threads[tid].status == VgTs_Empty) continue;
166 if (tid == tid_to_skip) continue;
167 if (vg_threads[tid].m_esp <= a
168 && a <= vg_threads[tid].stack_highest_word)
169 return tid;
170 }
171 return VG_INVALID_THREADID;
172}
173
174
/* Print the scheduler status.  For each allocated thread (slot 0 is
   never used) prints its status, the mutex it waits on (if any), and
   a backtrace from its saved %eip/%ebp.  Debug aid only. */
void VG_(pp_sched_status) ( void )
{
   Int i;
   VG_(printf)("\nsched status:\n");
   for (i = 1; i < VG_N_THREADS; i++) {
      if (vg_threads[i].status == VgTs_Empty) continue;
      VG_(printf)("\nThread %d: status = ", i);
      switch (vg_threads[i].status) {
         case VgTs_Runnable:   VG_(printf)("Runnable"); break;
         case VgTs_WaitFD:     VG_(printf)("WaitFD"); break;
         case VgTs_WaitJoiner: VG_(printf)("WaitJoiner(%d)",
                                           vg_threads[i].joiner); break;
         case VgTs_WaitJoinee: VG_(printf)("WaitJoinee"); break;
         case VgTs_Sleeping:   VG_(printf)("Sleeping"); break;
         case VgTs_WaitMX:     VG_(printf)("WaitMX"); break;
         default: VG_(printf)("???"); break;  /* unknown status value */
      }
      VG_(printf)(", waited_on_mx = %p\n", vg_threads[i].waited_on_mx );
      /* Stack trace from the thread's saved registers (it is not the
         one currently in VG_(baseBlock), or its regs may be stale --
         NOTE(review): caller beware; confirm acceptable for debug). */
      VG_(pp_ExeContext)(
         VG_(get_ExeContext)( False, vg_threads[i].m_eip,
                              vg_threads[i].m_ebp ));
   }
   VG_(printf)("\n");
}
200
201static
202void add_waiting_fd ( ThreadId tid, Int fd, Int syscall_no )
203{
204 Int i;
205
206 vg_assert(fd != -1); /* avoid total chaos */
207
208 for (i = 0; i < VG_N_WAITING_FDS; i++)
209 if (vg_waiting_fds[i].fd == -1)
210 break;
211
212 if (i == VG_N_WAITING_FDS)
213 VG_(panic)("add_waiting_fd: VG_N_WAITING_FDS is too low");
214 /*
215 VG_(printf)("add_waiting_fd: add (tid %d, fd %d) at slot %d\n",
216 tid, fd, i);
217 */
218 vg_waiting_fds[i].fd = fd;
219 vg_waiting_fds[i].tid = tid;
220 vg_waiting_fds[i].ready = False;
221 vg_waiting_fds[i].syscall_no = syscall_no;
222}
223
224
225
/* Emit a one-line scheduler trace message tagged with tid. */
static
void print_sched_event ( ThreadId tid, Char* what )
{
   VG_(message)(Vg_DebugMsg, "  SCHED[%d]: %s", tid, what );
}
231
232
/* Emit a one-line pthread-emulation trace message tagged with tid. */
static
void print_pthread_event ( ThreadId tid, Char* what )
{
   VG_(message)(Vg_DebugMsg, "PTHREAD[%d]: %s", tid, what );
}
238
239
240static
241Char* name_of_sched_event ( UInt event )
242{
243 switch (event) {
sewardje663cb92002-04-12 10:26:32 +0000244 case VG_TRC_EBP_JMP_SYSCALL: return "SYSCALL";
245 case VG_TRC_EBP_JMP_CLIENTREQ: return "CLIENTREQ";
246 case VG_TRC_INNER_COUNTERZERO: return "COUNTERZERO";
247 case VG_TRC_INNER_FASTMISS: return "FASTMISS";
248 case VG_TRC_UNRESUMABLE_SIGNAL: return "FATALSIGNAL";
249 default: return "??UNKNOWN??";
250 }
251}
252
253
254/* Create a translation of the client basic block beginning at
255 orig_addr, and add it to the translation cache & translation table.
256 This probably doesn't really belong here, but, hey ...
257*/
/* Create a translation of the client basic block beginning at
   orig_addr on behalf of thread tid, and add it to the translation
   cache & translation table.  The call order below (LRU pass,
   translate, copy, free, register) is load-bearing; do not reorder. */
static
void create_translation_for ( ThreadId tid, Addr orig_addr )
{
   Addr trans_addr;
   TTEntry tte;
   Int orig_size, trans_size;
   /* Ensure there is space to hold a translation. */
   VG_(maybe_do_lru_pass)();
   VG_(translate)( &vg_threads[tid],
                   orig_addr, &orig_size, &trans_addr, &trans_size );
   /* Copy data at trans_addr into the translation cache.
      Returned pointer is to the code, not to the 4-byte
      header. */
   /* Since the .orig_size and .trans_size fields are
      UShort, be paranoid. */
   vg_assert(orig_size > 0 && orig_size < 65536);
   vg_assert(trans_size > 0 && trans_size < 65536);
   tte.orig_size  = orig_size;
   tte.orig_addr  = orig_addr;
   tte.trans_size = trans_size;
   tte.trans_addr = VG_(copy_to_transcache)
                       ( trans_addr, trans_size );
   tte.mru_epoch  = VG_(current_epoch);
   /* Free the intermediary -- was allocated by VG_(emit_code). */
   VG_(jitfree)( (void*)trans_addr );
   /* Add to trans tab and set back pointer. */
   VG_(add_to_trans_tab) ( &tte );
   /* Update stats. */
   VG_(this_epoch_in_count) ++;
   VG_(this_epoch_in_osize) += orig_size;
   VG_(this_epoch_in_tsize) += trans_size;
   VG_(overall_in_count) ++;
   VG_(overall_in_osize) += orig_size;
   VG_(overall_in_tsize) += trans_size;
   /* Record translated area for SMC detection. */
   VG_(smc_mark_original) ( orig_addr, orig_size );
}
295
296
297/* Allocate a completely empty ThreadState record. */
298static
299ThreadId vg_alloc_ThreadState ( void )
300{
301 Int i;
sewardj6072c362002-04-19 14:40:57 +0000302 for (i = 1; i < VG_N_THREADS; i++) {
sewardje663cb92002-04-12 10:26:32 +0000303 if (vg_threads[i].status == VgTs_Empty)
304 return i;
305 }
306 VG_(printf)("vg_alloc_ThreadState: no free slots available\n");
307 VG_(printf)("Increase VG_N_THREADS, rebuild and try again.\n");
308 VG_(panic)("VG_N_THREADS is too low");
309 /*NOTREACHED*/
310}
311
312
313ThreadState* VG_(get_thread_state) ( ThreadId tid )
314{
sewardj6072c362002-04-19 14:40:57 +0000315 vg_assert(is_valid_tid(tid));
sewardje663cb92002-04-12 10:26:32 +0000316 vg_assert(vg_threads[tid].status != VgTs_Empty);
317 return & vg_threads[tid];
318}
319
320
/* Return the ThreadState of whichever thread is currently resident
   in VG_(baseBlock); it is an error to call this when none is. */
ThreadState* VG_(get_current_thread_state) ( void )
{
   vg_assert(vg_tid_currently_in_baseBlock != VG_INVALID_THREADID);
   return VG_(get_thread_state) ( vg_tid_currently_in_baseBlock );
}
326
327
/* Return the tid of the thread currently resident in VG_(baseBlock);
   it is an error to call this when none is. */
ThreadId VG_(get_current_tid) ( void )
{
   vg_assert(vg_tid_currently_in_baseBlock != VG_INVALID_THREADID);
   return vg_tid_currently_in_baseBlock;
}
333
334
/* Copy the saved state of a thread into VG_(baseBlock), ready for it
   to be run.  Requires that VG_(baseBlock) is currently vacant;
   records tid as the new occupant. */
__inline__
void VG_(load_thread_state) ( ThreadId tid )
{
   Int i;
   /* Must not clobber another thread's resident state. */
   vg_assert(vg_tid_currently_in_baseBlock == VG_INVALID_THREADID);

   /* Integer registers, flags and %eip. */
   VG_(baseBlock)[VGOFF_(m_eax)] = vg_threads[tid].m_eax;
   VG_(baseBlock)[VGOFF_(m_ebx)] = vg_threads[tid].m_ebx;
   VG_(baseBlock)[VGOFF_(m_ecx)] = vg_threads[tid].m_ecx;
   VG_(baseBlock)[VGOFF_(m_edx)] = vg_threads[tid].m_edx;
   VG_(baseBlock)[VGOFF_(m_esi)] = vg_threads[tid].m_esi;
   VG_(baseBlock)[VGOFF_(m_edi)] = vg_threads[tid].m_edi;
   VG_(baseBlock)[VGOFF_(m_ebp)] = vg_threads[tid].m_ebp;
   VG_(baseBlock)[VGOFF_(m_esp)] = vg_threads[tid].m_esp;
   VG_(baseBlock)[VGOFF_(m_eflags)] = vg_threads[tid].m_eflags;
   VG_(baseBlock)[VGOFF_(m_eip)] = vg_threads[tid].m_eip;

   /* FPU image, word by word. */
   for (i = 0; i < VG_SIZE_OF_FPUSTATE_W; i++)
      VG_(baseBlock)[VGOFF_(m_fpustate) + i] = vg_threads[tid].m_fpu[i];

   /* Shadow registers. */
   VG_(baseBlock)[VGOFF_(sh_eax)] = vg_threads[tid].sh_eax;
   VG_(baseBlock)[VGOFF_(sh_ebx)] = vg_threads[tid].sh_ebx;
   VG_(baseBlock)[VGOFF_(sh_ecx)] = vg_threads[tid].sh_ecx;
   VG_(baseBlock)[VGOFF_(sh_edx)] = vg_threads[tid].sh_edx;
   VG_(baseBlock)[VGOFF_(sh_esi)] = vg_threads[tid].sh_esi;
   VG_(baseBlock)[VGOFF_(sh_edi)] = vg_threads[tid].sh_edi;
   VG_(baseBlock)[VGOFF_(sh_ebp)] = vg_threads[tid].sh_ebp;
   VG_(baseBlock)[VGOFF_(sh_esp)] = vg_threads[tid].sh_esp;
   VG_(baseBlock)[VGOFF_(sh_eflags)] = vg_threads[tid].sh_eflags;

   /* tid now owns VG_(baseBlock). */
   vg_tid_currently_in_baseBlock = tid;
}
369
370
/* Copy the state of a thread from VG_(baseBlock), presumably after it
   has been descheduled.  For sanity-check purposes, fill the vacated
   VG_(baseBlock) with garbage so as to make the system more likely to
   fail quickly if we erroneously continue to poke around inside
   VG_(baseBlock) without first doing a load_thread_state().
*/
__inline__
void VG_(save_thread_state) ( ThreadId tid )
{
   Int i;
   const UInt junk = 0xDEADBEEF;

   /* Some thread must actually be resident to be saved. */
   vg_assert(vg_tid_currently_in_baseBlock != VG_INVALID_THREADID);

   /* Integer registers, flags and %eip. */
   vg_threads[tid].m_eax = VG_(baseBlock)[VGOFF_(m_eax)];
   vg_threads[tid].m_ebx = VG_(baseBlock)[VGOFF_(m_ebx)];
   vg_threads[tid].m_ecx = VG_(baseBlock)[VGOFF_(m_ecx)];
   vg_threads[tid].m_edx = VG_(baseBlock)[VGOFF_(m_edx)];
   vg_threads[tid].m_esi = VG_(baseBlock)[VGOFF_(m_esi)];
   vg_threads[tid].m_edi = VG_(baseBlock)[VGOFF_(m_edi)];
   vg_threads[tid].m_ebp = VG_(baseBlock)[VGOFF_(m_ebp)];
   vg_threads[tid].m_esp = VG_(baseBlock)[VGOFF_(m_esp)];
   vg_threads[tid].m_eflags = VG_(baseBlock)[VGOFF_(m_eflags)];
   vg_threads[tid].m_eip = VG_(baseBlock)[VGOFF_(m_eip)];

   /* FPU image, word by word. */
   for (i = 0; i < VG_SIZE_OF_FPUSTATE_W; i++)
      vg_threads[tid].m_fpu[i] = VG_(baseBlock)[VGOFF_(m_fpustate) + i];

   /* Shadow registers. */
   vg_threads[tid].sh_eax = VG_(baseBlock)[VGOFF_(sh_eax)];
   vg_threads[tid].sh_ebx = VG_(baseBlock)[VGOFF_(sh_ebx)];
   vg_threads[tid].sh_ecx = VG_(baseBlock)[VGOFF_(sh_ecx)];
   vg_threads[tid].sh_edx = VG_(baseBlock)[VGOFF_(sh_edx)];
   vg_threads[tid].sh_esi = VG_(baseBlock)[VGOFF_(sh_esi)];
   vg_threads[tid].sh_edi = VG_(baseBlock)[VGOFF_(sh_edi)];
   vg_threads[tid].sh_ebp = VG_(baseBlock)[VGOFF_(sh_ebp)];
   vg_threads[tid].sh_esp = VG_(baseBlock)[VGOFF_(sh_esp)];
   vg_threads[tid].sh_eflags = VG_(baseBlock)[VGOFF_(sh_eflags)];

   /* Fill it up with junk. */
   VG_(baseBlock)[VGOFF_(m_eax)] = junk;
   VG_(baseBlock)[VGOFF_(m_ebx)] = junk;
   VG_(baseBlock)[VGOFF_(m_ecx)] = junk;
   VG_(baseBlock)[VGOFF_(m_edx)] = junk;
   VG_(baseBlock)[VGOFF_(m_esi)] = junk;
   VG_(baseBlock)[VGOFF_(m_edi)] = junk;
   VG_(baseBlock)[VGOFF_(m_ebp)] = junk;
   VG_(baseBlock)[VGOFF_(m_esp)] = junk;
   VG_(baseBlock)[VGOFF_(m_eflags)] = junk;
   VG_(baseBlock)[VGOFF_(m_eip)] = junk;

   for (i = 0; i < VG_SIZE_OF_FPUSTATE_W; i++)
      VG_(baseBlock)[VGOFF_(m_fpustate) + i] = junk;

   /* VG_(baseBlock) is now vacant. */
   vg_tid_currently_in_baseBlock = VG_INVALID_THREADID;
}
426
427
/* Run the thread tid for a while, and return a VG_TRC_* value to the
   scheduler indicating what happened.  The thread must be Runnable.
   If the client takes an unresumable fault, vg_oursignalhandler()
   longjmps back to the setjmp below and we report
   VG_TRC_UNRESUMABLE_SIGNAL instead of the innerloop's result. */
static
UInt run_thread_for_a_while ( ThreadId tid )
{
   /* NOTE(review): trc is a non-volatile automatic modified between
      setjmp and longjmp; per the C standard its value in the else-arm
      is formally indeterminate.  The assert below relies on it
      surviving -- confirm this is safe with the compilers used. */
   UInt trc = 0;
   vg_assert(is_valid_tid(tid));
   vg_assert(vg_threads[tid].status == VgTs_Runnable);
   vg_assert(VG_(bbs_to_go) > 0);

   VG_(load_thread_state) ( tid );
   if (__builtin_setjmp(VG_(scheduler_jmpbuf)) == 0) {
      /* try this ... */
      trc = VG_(run_innerloop)();
      /* We get here if the client didn't take a fault. */
   } else {
      /* We get here if the client took a fault, which caused our
         signal handler to longjmp. */
      vg_assert(trc == 0);
      trc = VG_TRC_UNRESUMABLE_SIGNAL;
   }
   VG_(save_thread_state) ( tid );
   return trc;
}
452
453
/* Increment the LRU epoch counter, optionally report the per-epoch
   translation statistics (verbosity > 2), and reset those per-epoch
   counters for the next epoch. */
static
void increment_epoch ( void )
{
   VG_(current_epoch)++;
   if (VG_(clo_verbosity) > 2) {
      UInt tt_used, tc_used;
      VG_(get_tt_tc_used) ( &tt_used, &tc_used );
      VG_(message)(Vg_UserMsg,
         "%lu bbs, in: %d (%d -> %d), out %d (%d -> %d), TT %d, TC %d",
         VG_(bbs_done),
         VG_(this_epoch_in_count),
         VG_(this_epoch_in_osize),
         VG_(this_epoch_in_tsize),
         VG_(this_epoch_out_count),
         VG_(this_epoch_out_osize),
         VG_(this_epoch_out_tsize),
         tt_used, tc_used
      );
   }
   /* Reset per-epoch counters; the VG_(overall_*) totals persist. */
   VG_(this_epoch_in_count) = 0;
   VG_(this_epoch_in_osize) = 0;
   VG_(this_epoch_in_tsize) = 0;
   VG_(this_epoch_out_count) = 0;
   VG_(this_epoch_out_osize) = 0;
   VG_(this_epoch_out_tsize) = 0;
}
481
482
483/* Initialise the scheduler. Create a single "main" thread ready to
sewardj6072c362002-04-19 14:40:57 +0000484 run, with special ThreadId of one. This is called at startup; the
sewardje663cb92002-04-12 10:26:32 +0000485 caller takes care to park the client's state is parked in
486 VG_(baseBlock).
487*/
488void VG_(scheduler_init) ( void )
489{
490 Int i;
491 Addr startup_esp;
492 ThreadId tid_main;
493
494 startup_esp = VG_(baseBlock)[VGOFF_(m_esp)];
495 if ((startup_esp & VG_STARTUP_STACK_MASK) != VG_STARTUP_STACK_MASK) {
sewardj9a199dc2002-04-14 13:01:38 +0000496 VG_(printf)("%%esp at startup = %p is not near %p; aborting\n",
497 (void*)startup_esp, (void*)VG_STARTUP_STACK_MASK);
sewardje663cb92002-04-12 10:26:32 +0000498 VG_(panic)("unexpected %esp at startup");
499 }
500
sewardj6072c362002-04-19 14:40:57 +0000501 for (i = 0 /* NB; not 1 */; i < VG_N_THREADS; i++) {
502 vg_threads[i].status = VgTs_Empty;
sewardje663cb92002-04-12 10:26:32 +0000503 vg_threads[i].stack_size = 0;
504 vg_threads[i].stack_base = (Addr)NULL;
sewardj1e8cdc92002-04-18 11:37:52 +0000505 vg_threads[i].tid = i;
sewardje663cb92002-04-12 10:26:32 +0000506 }
507
508 for (i = 0; i < VG_N_WAITING_FDS; i++)
509 vg_waiting_fds[i].fd = -1; /* not in use */
510
sewardje663cb92002-04-12 10:26:32 +0000511 /* Assert this is thread zero, which has certain magic
512 properties. */
513 tid_main = vg_alloc_ThreadState();
sewardj6072c362002-04-19 14:40:57 +0000514 vg_assert(tid_main == 1);
sewardje663cb92002-04-12 10:26:32 +0000515
sewardj6072c362002-04-19 14:40:57 +0000516 vg_threads[tid_main].status = VgTs_Runnable;
517 vg_threads[tid_main].joiner = VG_INVALID_THREADID;
518 vg_threads[tid_main].waited_on_mx = NULL;
519 vg_threads[tid_main].retval = NULL; /* not important */
sewardj1e8cdc92002-04-18 11:37:52 +0000520 vg_threads[tid_main].stack_highest_word
521 = vg_threads[tid_main].m_esp /* -4 ??? */;
sewardje663cb92002-04-12 10:26:32 +0000522
523 /* Copy VG_(baseBlock) state to tid_main's slot. */
sewardj1e8cdc92002-04-18 11:37:52 +0000524 vg_tid_currently_in_baseBlock = tid_main;
sewardje663cb92002-04-12 10:26:32 +0000525 VG_(save_thread_state) ( tid_main );
sewardj1e8cdc92002-04-18 11:37:52 +0000526
527 /* So now ... */
528 vg_assert(vg_tid_currently_in_baseBlock == VG_INVALID_THREADID);
sewardje663cb92002-04-12 10:26:32 +0000529}
530
531
532/* What if fd isn't a valid fd? */
533static
534void set_fd_nonblocking ( Int fd )
535{
536 Int res = VG_(fcntl)( fd, VKI_F_GETFL, 0 );
537 vg_assert(!VG_(is_kerror)(res));
538 res |= VKI_O_NONBLOCK;
539 res = VG_(fcntl)( fd, VKI_F_SETFL, res );
540 vg_assert(!VG_(is_kerror)(res));
541}
542
543static
544void set_fd_blocking ( Int fd )
545{
546 Int res = VG_(fcntl)( fd, VKI_F_GETFL, 0 );
547 vg_assert(!VG_(is_kerror)(res));
548 res &= ~VKI_O_NONBLOCK;
549 res = VG_(fcntl)( fd, VKI_F_SETFL, res );
550 vg_assert(!VG_(is_kerror)(res));
551}
552
553static
554Bool fd_is_blockful ( Int fd )
555{
556 Int res = VG_(fcntl)( fd, VKI_F_GETFL, 0 );
557 vg_assert(!VG_(is_kerror)(res));
558 return (res & VKI_O_NONBLOCK) ? False : True;
559}
560
561
562
/* Do a purely thread-local request for tid, and put the result in its
   %EDX, without changing its scheduling state in any way, nor that of
   any other threads.  Return True if so.

   If the request is non-trivial, return False; a more capable but
   slower mechanism will deal with it.
*/
static
Bool maybe_do_trivial_clientreq ( ThreadId tid )
{
   /* Store vvv as the request's result in the client's %EDX and
      report the request handled. */
#  define SIMPLE_RETURN(vvv)                      \
   { tst->m_edx = (vvv);                          \
     return True;                                 \
   }

   ThreadState* tst    = &vg_threads[tid];
   /* By convention the client parks a pointer to the request-argument
      block in its %EAX; arg[0] is the request code. */
   UInt*        arg    = (UInt*)(tst->m_eax);
   UInt         req_no = arg[0];

   switch (req_no) {
      case VG_USERREQ__MALLOC:
         SIMPLE_RETURN(
            (UInt)VG_(client_malloc) ( tst, arg[1], Vg_AllocMalloc )
         );
      case VG_USERREQ__BUILTIN_NEW:
         SIMPLE_RETURN(
            (UInt)VG_(client_malloc) ( tst, arg[1], Vg_AllocNew )
         );
      case VG_USERREQ__BUILTIN_VEC_NEW:
         SIMPLE_RETURN(
            (UInt)VG_(client_malloc) ( tst, arg[1], Vg_AllocNewVec )
         );
      case VG_USERREQ__FREE:
         VG_(client_free) ( tst, (void*)arg[1], Vg_AllocMalloc );
         SIMPLE_RETURN(0); /* irrelevant */
      case VG_USERREQ__BUILTIN_DELETE:
         VG_(client_free) ( tst, (void*)arg[1], Vg_AllocNew );
         SIMPLE_RETURN(0); /* irrelevant */
      case VG_USERREQ__BUILTIN_VEC_DELETE:
         VG_(client_free) ( tst, (void*)arg[1], Vg_AllocNewVec );
         SIMPLE_RETURN(0); /* irrelevant */
      case VG_USERREQ__CALLOC:
         SIMPLE_RETURN(
            (UInt)VG_(client_calloc) ( tst, arg[1], arg[2] )
         );
      case VG_USERREQ__REALLOC:
         SIMPLE_RETURN(
            (UInt)VG_(client_realloc) ( tst, (void*)arg[1], arg[2] )
         );
      case VG_USERREQ__MEMALIGN:
         SIMPLE_RETURN(
            (UInt)VG_(client_memalign) ( tst, arg[1], arg[2] )
         );

      /* These are heavily used. */
      case VG_USERREQ__PTHREAD_GET_THREADID:
         SIMPLE_RETURN(tid);
      case VG_USERREQ__RUNNING_ON_VALGRIND:
         SIMPLE_RETURN(1);
      case VG_USERREQ__GET_PTHREAD_TRACE_LEVEL:
         SIMPLE_RETURN(VG_(clo_trace_pthread_level));

      default:
         /* Too hard; wimp out. */
         return False;
   }
#  undef SIMPLE_RETURN
}
631
632
sewardj6072c362002-04-19 14:40:57 +0000633/* vthread tid is returning from a signal handler; modify its
634 stack/regs accordingly. */
635static
636void handle_signal_return ( ThreadId tid )
637{
638 Char msg_buf[100];
639 Bool restart_blocked_syscalls;
640
641 vg_assert(is_valid_tid(tid));
642
643 restart_blocked_syscalls = VG_(signal_returns)(tid);
644
645 if (restart_blocked_syscalls)
646 /* Easy; we don't have to do anything. */
647 return;
648
649 if (vg_threads[tid].status == VgTs_WaitFD) {
650 vg_assert(vg_threads[tid].m_eax == __NR_read
651 || vg_threads[tid].m_eax == __NR_write);
652 /* read() or write() interrupted. Force a return with EINTR. */
653 vg_threads[tid].m_eax = -VKI_EINTR;
654 vg_threads[tid].status = VgTs_Runnable;
655 if (VG_(clo_trace_sched)) {
656 VG_(sprintf)(msg_buf,
657 "read() / write() interrupted by signal; return EINTR" );
658 print_sched_event(tid, msg_buf);
659 }
660 return;
661 }
662
663 if (vg_threads[tid].status == VgTs_WaitFD) {
664 vg_assert(vg_threads[tid].m_eax == __NR_nanosleep);
665 /* We interrupted a nanosleep(). The right thing to do is to
666 write the unused time to nanosleep's second param and return
667 EINTR, but I'm too lazy for that. */
668 return;
669 }
670
671 /* All other cases? Just return. */
672}
673
674
/* tid has hit a syscall.  Three cases, all visible below:
   - nanosleep: compute the wakeup time and deschedule (VgTs_Sleeping);
   - read/write on a blocking fd: lodge the request non-blockingly
     with the kernel and, if it would block, deschedule (VgTs_WaitFD);
   - everything else: assume non-blocking and run it immediately. */
static
void sched_do_syscall ( ThreadId tid )
{
   UInt saved_eax;
   UInt res, syscall_no;
   UInt fd;
   Bool might_block, assumed_nonblocking;
   Bool orig_fd_blockness;
   Char msg_buf[100];

   vg_assert(is_valid_tid(tid));
   vg_assert(vg_threads[tid].status == VgTs_Runnable);

   syscall_no = vg_threads[tid].m_eax; /* syscall number */

   if (syscall_no == __NR_nanosleep) {
      ULong t_now, t_awaken;
      struct vki_timespec* req;
      req = (struct vki_timespec*)vg_threads[tid].m_ebx; /* arg1 */
      t_now = VG_(read_microsecond_timer)();
      /* Wakeup time in microseconds = now + requested interval. */
      t_awaken
         = t_now
           + (ULong)1000000ULL * (ULong)(req->tv_sec)
           + (ULong)( (UInt)(req->tv_nsec) / 1000 );
      vg_threads[tid].status    = VgTs_Sleeping;
      vg_threads[tid].awaken_at = t_awaken;
      if (VG_(clo_trace_sched)) {
         VG_(sprintf)(msg_buf, "at %lu: nanosleep for %lu",
                               t_now, t_awaken-t_now);
         print_sched_event(tid, msg_buf);
      }
      /* Force the scheduler to run something else for a while. */
      return;
   }

   /* Classify: only read/write on a blocking fd can block us. */
   switch (syscall_no) {
      case __NR_read:
      case __NR_write:
         assumed_nonblocking
            = False;
         might_block
            = fd_is_blockful(vg_threads[tid].m_ebx /* arg1 */);
         break;
      default:
         might_block = False;
         assumed_nonblocking = True;
   }

   if (assumed_nonblocking) {
      /* We think it's non-blocking.  Just do it in the normal way. */
      VG_(perform_assumed_nonblocking_syscall)(tid);
      /* The thread is still runnable. */
      return;
   }

   /* It might block.  Take evasive action. */
   switch (syscall_no) {
      case __NR_read:
      case __NR_write:
         fd = vg_threads[tid].m_ebx; break;
      default:
         /* Deliberately-false assert: cannot happen given the
            classification above. */
         vg_assert(3+3 == 7);
   }

   /* Set the fd to nonblocking, and do the syscall, which will return
      immediately, in order to lodge a request with the Linux kernel.
      We later poll for I/O completion using select().  */

   orig_fd_blockness = fd_is_blockful(fd);
   set_fd_nonblocking(fd);
   vg_assert(!fd_is_blockful(fd));
   VG_(check_known_blocking_syscall)(tid, syscall_no, NULL /* PRE */);

   /* This trashes the thread's %eax; we have to preserve it. */
   saved_eax = vg_threads[tid].m_eax;
   KERNEL_DO_SYSCALL(tid,res);

   /* Restore original blockfulness of the fd. */
   if (orig_fd_blockness)
      set_fd_blocking(fd);
   else
      set_fd_nonblocking(fd);

   if (res != -VKI_EWOULDBLOCK) {
      /* It didn't block; it went through immediately.  So finish off
         in the normal way.  Don't restore %EAX, since that now
         (correctly) holds the result of the call. */
      VG_(check_known_blocking_syscall)(tid, syscall_no, &res /* POST */);
      /* We're still runnable. */
      vg_assert(vg_threads[tid].status == VgTs_Runnable);

   } else {

      /* It would have blocked.  First, restore %EAX to what it was
         before our speculative call. */
      vg_threads[tid].m_eax = saved_eax;
      /* Put this fd in a table of fds on which we are waiting for
         completion. The arguments for select() later are constructed
         from this table.  */
      add_waiting_fd(tid, fd, saved_eax /* which holds the syscall # */);
      /* Deschedule thread until an I/O completion happens. */
      vg_threads[tid].status = VgTs_WaitFD;
      if (VG_(clo_trace_sched)) {
         VG_(sprintf)(msg_buf,"block until I/O ready on fd %d", fd);
         print_sched_event(tid, msg_buf);
      }

   }
}
784
785
786/* Find out which of the fds in vg_waiting_fds are now ready to go, by
787 making enquiries with select(), and mark them as ready. We have to
788 wait for the requesting threads to fall into the the WaitFD state
789 before we can actually finally deliver the results, so this
790 procedure doesn't do that; complete_blocked_syscalls() does it.
791
792 It might seem odd that a thread which has done a blocking syscall
793 is not in WaitFD state; the way this can happen is if it initially
794 becomes WaitFD, but then a signal is delivered to it, so it becomes
795 Runnable for a while. In this case we have to wait for the
796 sighandler to return, whereupon the WaitFD state is resumed, and
797 only at that point can the I/O result be delivered to it. However,
798 this point may be long after the fd is actually ready.
799
800 So, poll_for_ready_fds() merely detects fds which are ready.
801 complete_blocked_syscalls() does the second half of the trick,
802 possibly much later: it delivers the results from ready fds to
803 threads in WaitFD state.
804*/
sewardj9a199dc2002-04-14 13:01:38 +0000805static
sewardje663cb92002-04-12 10:26:32 +0000806void poll_for_ready_fds ( void )
807{
808 vki_ksigset_t saved_procmask;
809 vki_fd_set readfds;
810 vki_fd_set writefds;
811 vki_fd_set exceptfds;
812 struct vki_timeval timeout;
813 Int fd, fd_max, i, n_ready, syscall_no, n_ok;
814 ThreadId tid;
815 Bool rd_ok, wr_ok, ex_ok;
816 Char msg_buf[100];
817
sewardje462e202002-04-13 04:09:07 +0000818 struct vki_timespec* rem;
819 ULong t_now;
820
sewardje663cb92002-04-12 10:26:32 +0000821 /* Awaken any sleeping threads whose sleep has expired. */
sewardj6072c362002-04-19 14:40:57 +0000822 for (tid = 1; tid < VG_N_THREADS; tid++)
823 if (vg_threads[tid].status == VgTs_Sleeping)
824 break;
825
826 /* Avoid pointless calls to VG_(read_microsecond_timer). */
827 if (tid < VG_N_THREADS) {
828 t_now = VG_(read_microsecond_timer)();
829 for (tid = 1; tid < VG_N_THREADS; tid++) {
830 if (vg_threads[tid].status != VgTs_Sleeping)
831 continue;
832 if (t_now >= vg_threads[tid].awaken_at) {
833 /* Resume this thread. Set to zero the remaining-time
834 (second) arg of nanosleep, since it's used up all its
835 time. */
836 vg_assert(vg_threads[tid].m_eax == __NR_nanosleep);
837 rem = (struct vki_timespec *)vg_threads[tid].m_ecx; /* arg2 */
838 if (rem != NULL) {
839 rem->tv_sec = 0;
840 rem->tv_nsec = 0;
841 }
842 /* Make the syscall return 0 (success). */
843 vg_threads[tid].m_eax = 0;
844 /* Reschedule this thread. */
845 vg_threads[tid].status = VgTs_Runnable;
846 if (VG_(clo_trace_sched)) {
847 VG_(sprintf)(msg_buf, "at %lu: nanosleep done",
848 t_now);
849 print_sched_event(tid, msg_buf);
850 }
sewardje663cb92002-04-12 10:26:32 +0000851 }
852 }
853 }
sewardje663cb92002-04-12 10:26:32 +0000854
sewardje462e202002-04-13 04:09:07 +0000855 /* And look for threads waiting on file descriptors which are now
856 ready for I/O.*/
sewardje663cb92002-04-12 10:26:32 +0000857 timeout.tv_sec = 0;
858 timeout.tv_usec = 0;
859
860 VKI_FD_ZERO(&readfds);
861 VKI_FD_ZERO(&writefds);
862 VKI_FD_ZERO(&exceptfds);
863 fd_max = -1;
864 for (i = 0; i < VG_N_WAITING_FDS; i++) {
865 if (vg_waiting_fds[i].fd == -1 /* not in use */)
866 continue;
867 if (vg_waiting_fds[i].ready /* already ready? */)
868 continue;
869 fd = vg_waiting_fds[i].fd;
870 /* VG_(printf)("adding QUERY for fd %d\n", fd); */
sewardje462e202002-04-13 04:09:07 +0000871 vg_assert(fd >= 0);
sewardje663cb92002-04-12 10:26:32 +0000872 if (fd > fd_max)
873 fd_max = fd;
874 tid = vg_waiting_fds[i].tid;
sewardj6072c362002-04-19 14:40:57 +0000875 vg_assert(is_valid_tid(tid));
sewardje663cb92002-04-12 10:26:32 +0000876 syscall_no = vg_waiting_fds[i].syscall_no;
877 switch (syscall_no) {
878 case __NR_read:
879 VKI_FD_SET(fd, &readfds); break;
880 case __NR_write:
881 VKI_FD_SET(fd, &writefds); break;
882 default:
883 VG_(panic)("poll_for_ready_fds: unexpected syscall");
884 /*NOTREACHED*/
885 break;
886 }
887 }
888
sewardje462e202002-04-13 04:09:07 +0000889 /* Short cut: if no fds are waiting, give up now. */
890 if (fd_max == -1)
891 return;
892
sewardje663cb92002-04-12 10:26:32 +0000893 /* BLOCK ALL SIGNALS. We don't want the complication of select()
894 getting interrupted. */
895 VG_(block_all_host_signals)( &saved_procmask );
896
897 n_ready = VG_(select)
898 ( fd_max+1, &readfds, &writefds, &exceptfds, &timeout);
899 if (VG_(is_kerror)(n_ready)) {
900 VG_(printf)("poll_for_ready_fds: select returned %d\n", n_ready);
901 VG_(panic)("poll_for_ready_fds: select failed?!");
902 /*NOTREACHED*/
903 }
904
905 /* UNBLOCK ALL SIGNALS */
906 VG_(restore_host_signals)( &saved_procmask );
907
908 /* VG_(printf)("poll_for_io_completions: %d fs ready\n", n_ready); */
909
910 if (n_ready == 0)
911 return;
912
913 /* Inspect all the fds we know about, and handle any completions that
914 have happened. */
915 /*
916 VG_(printf)("\n\n");
917 for (fd = 0; fd < 100; fd++)
918 if (VKI_FD_ISSET(fd, &writefds) || VKI_FD_ISSET(fd, &readfds)) {
919 VG_(printf)("X"); } else { VG_(printf)("."); };
920 VG_(printf)("\n\nfd_max = %d\n", fd_max);
921 */
922
923 for (fd = 0; fd <= fd_max; fd++) {
924 rd_ok = VKI_FD_ISSET(fd, &readfds);
925 wr_ok = VKI_FD_ISSET(fd, &writefds);
926 ex_ok = VKI_FD_ISSET(fd, &exceptfds);
927
928 n_ok = (rd_ok ? 1 : 0) + (wr_ok ? 1 : 0) + (ex_ok ? 1 : 0);
929 if (n_ok == 0)
930 continue;
931 if (n_ok > 1) {
932 VG_(printf)("offending fd = %d\n", fd);
933 VG_(panic)("poll_for_ready_fds: multiple events on fd");
934 }
935
936 /* An I/O event completed for fd. Find the thread which
937 requested this. */
938 for (i = 0; i < VG_N_WAITING_FDS; i++) {
939 if (vg_waiting_fds[i].fd == -1 /* not in use */)
940 continue;
941 if (vg_waiting_fds[i].fd == fd)
942 break;
943 }
944
945 /* And a bit more paranoia ... */
946 vg_assert(i >= 0 && i < VG_N_WAITING_FDS);
947
948 /* Mark the fd as ready. */
949 vg_assert(! vg_waiting_fds[i].ready);
950 vg_waiting_fds[i].ready = True;
951 }
952}
953
954
/* See comment attached to poll_for_ready_fds() for explanation.
   Second half of blocked-syscall handling: for every fd slot that
   poll_for_ready_fds() marked ready, and whose owner thread is back
   in WaitFD state, actually (re)run the syscall -- which will now
   not block -- deliver its result, and make the thread runnable. */
static
void complete_blocked_syscalls ( void )
{
   Int      fd, i, res, syscall_no;
   ThreadId tid;
   Char     msg_buf[100];

   /* Inspect all the outstanding fds we know about. */

   for (i = 0; i < VG_N_WAITING_FDS; i++) {
      if (vg_waiting_fds[i].fd == -1 /* not in use */)
         continue;
      if (! vg_waiting_fds[i].ready)
         continue;

      fd  = vg_waiting_fds[i].fd;
      tid = vg_waiting_fds[i].tid;
      vg_assert(is_valid_tid(tid));

      /* The thread actually has to be waiting for the I/O event it
         requested before we can deliver the result!  (It may be
         temporarily Runnable because a signal handler is running.) */
      if (vg_threads[tid].status != VgTs_WaitFD)
         continue;

      /* Ok, actually do it!  We can safely use %EAX as the syscall
         number, because the speculative call made by
         sched_do_syscall() doesn't change %EAX in the case where the
         call would have blocked. */

      syscall_no = vg_waiting_fds[i].syscall_no;
      vg_assert(syscall_no == vg_threads[tid].m_eax);
      KERNEL_DO_SYSCALL(tid,res);
      /* Run the post-syscall memory-state update for this call. */
      VG_(check_known_blocking_syscall)(tid, syscall_no, &res /* POST */);

      /* Reschedule. */
      vg_threads[tid].status = VgTs_Runnable;
      /* Mark slot as no longer in use. */
      vg_waiting_fds[i].fd = -1;
      /* pp_sched_status(); */
      if (VG_(clo_trace_sched)) {
         VG_(sprintf)(msg_buf,"resume due to I/O completion on fd %d", fd);
         print_sched_event(tid, msg_buf);
      }
   }
}
1001
1002
1003static
1004void nanosleep_for_a_while ( void )
1005{
1006 Int res;
1007 struct vki_timespec req;
1008 struct vki_timespec rem;
1009 req.tv_sec = 0;
1010 req.tv_nsec = 20 * 1000 * 1000;
1011 res = VG_(nanosleep)( &req, &rem );
1012 /* VG_(printf)("after ns, unused = %d\n", rem.tv_nsec ); */
1013 vg_assert(res == 0);
1014}
1015
1016
1017/* ---------------------------------------------------------------------
1018 The scheduler proper.
1019 ------------------------------------------------------------------ */
1020
1021/* Run user-space threads until either
1022 * Deadlock occurs
1023 * One thread asks to shutdown Valgrind
1024 * The specified number of basic blocks has gone by.
1025*/
/* The top-level scheduler.  Run user-space threads until either
     * Deadlock occurs
     * One thread asks to shutdown Valgrind
     * The specified number of basic blocks has gone by.
   Returns a VgSchedReturnCode saying which of these happened.

   Structure: an infinite loop of three phases --
     Phase 1: service I/O completions and signals, then pick a
              runnable thread (or detect deadlock / idle-wait);
     Phase 2: dispatch that thread for up to a quantum of basic
              blocks, absorbing trivial events inline;
     Phase 3: handle the non-trivial event that ended the quantum
              (syscall, client request, signal, timeslice end). */
VgSchedReturnCode VG_(scheduler) ( void )
{
   ThreadId tid, tid_next;
   UInt     trc;                  /* thread return code from dispatch */
   UInt     dispatch_ctr_SAVED;   /* quantum size we asked for */
   Int      request_code, done_this_time, n_in_fdwait_or_sleep;
   Char     msg_buf[100];
   Addr     trans_addr;

   /* For the LRU structures, records when the epoch began. */
   ULong lru_epoch_started_at = 0;

   /* Start with the root thread.  tid in general indicates the
      currently runnable/just-finished-running thread. */
   tid = 1;

   /* This is the top level scheduler loop.  It falls into three
      phases. */
   while (True) {

      /* ======================= Phase 0 of 3 =======================
         Be paranoid.  Always a good idea. */
      scheduler_sanity();

      /* ======================= Phase 1 of 3 =======================
         Handle I/O completions and signals.  This may change the
         status of various threads.  Then select a new thread to run,
         or declare deadlock, or sleep if there are no runnable
         threads but some are blocked on I/O.  */

      /* Age the LRU structures if an epoch has been completed. */
      if (VG_(bbs_done) - lru_epoch_started_at >= VG_BBS_PER_EPOCH) {
         lru_epoch_started_at = VG_(bbs_done);
         increment_epoch();
      }

      /* Was a debug-stop requested? */
      if (VG_(bbs_to_go) == 0)
         goto debug_stop;

      /* Do the following loop until a runnable thread is found, or
         deadlock is detected. */
      while (True) {

         /* For stats purposes only. */
         VG_(num_scheduling_events_MAJOR) ++;

         /* See if any I/O operations which we were waiting for have
            completed, and, if so, make runnable the relevant waiting
            threads. */
         poll_for_ready_fds();
         complete_blocked_syscalls();

         /* See if there are any signals which need to be delivered.  If
            so, choose thread(s) to deliver them to, and build signal
            delivery frames on those thread(s) stacks. */

         /* Be careful about delivering signals to a thread waiting
            for a mutex.  In particular, when the handler is running,
            that thread is temporarily apparently-not-waiting for the
            mutex, so if it is unlocked by another thread whilst the
            handler is running, this thread is not informed.  When the
            handler returns, the thread resumes waiting on the mutex,
            even if, as a result, it has missed the unlocking of it.
            Potential deadlock.  This sounds all very strange, but the
            POSIX standard appears to require this behaviour. */
         VG_(deliver_signals)( 1 /*HACK*/ );
         VG_(do_sanity_checks)( 1 /*HACK*/, False );

         /* Try and find a thread (tid) to run: round-robin scan of
            the thread table starting just after the last-run tid,
            also counting how many threads are blocked in fd-wait or
            sleep (needed for deadlock detection below). */
         tid_next = tid;
         n_in_fdwait_or_sleep = 0;
         while (True) {
            tid_next++;
            if (tid_next >= VG_N_THREADS) tid_next = 1;
            if (vg_threads[tid_next].status == VgTs_WaitFD
                || vg_threads[tid_next].status == VgTs_Sleeping)
               n_in_fdwait_or_sleep ++;
            if (vg_threads[tid_next].status == VgTs_Runnable)
               break; /* We can run this one. */
            if (tid_next == tid)
               break; /* been all the way round */
         }
         tid = tid_next;

         if (vg_threads[tid].status == VgTs_Runnable) {
            /* Found a suitable candidate.  Fall out of this loop, so
               we can advance to stage 2 of the scheduler: actually
               running the thread. */
            break;
         }

         /* We didn't find a runnable thread.  Now what? */
         if (n_in_fdwait_or_sleep == 0) {
            /* No runnable threads and no prospect of any appearing
               even if we wait for an arbitrary length of time.  In
               short, we have a deadlock. */
            VG_(pp_sched_status)();
            return VgSrc_Deadlock;
         }

         /* At least one thread is in a fd-wait state.  Delay for a
            while, and go round again, in the hope that eventually a
            thread becomes runnable. */
         nanosleep_for_a_while();
         // pp_sched_status();
         // VG_(printf)(".\n");
      }


      /* ======================= Phase 2 of 3 =======================
         Wahey!  We've finally decided that thread tid is runnable, so
         we now do that.  Run it for as much of a quanta as possible.
         Trivial requests are handled and the thread continues.  The
         aim is not to do too many of Phase 1 since it is expensive.  */

      if (0)
         VG_(printf)("SCHED: tid %d, used %d\n", tid, VG_N_THREADS);

      /* Figure out how many bbs to ask vg_run_innerloop to do.  Note
         that it decrements the counter before testing it for zero, so
         that if VG_(dispatch_ctr) is set to N you get at most N-1
         iterations.  Also this means that VG_(dispatch_ctr) must
         exceed zero before entering the innerloop.  Also also, the
         decrement is done before the bb is actually run, so you
         always get at least one decrement even if nothing happens.
      */
      if (VG_(bbs_to_go) >= VG_SCHEDULING_QUANTUM)
         VG_(dispatch_ctr) = VG_SCHEDULING_QUANTUM + 1;
      else
         VG_(dispatch_ctr) = (UInt)VG_(bbs_to_go) + 1;

      /* ... and remember what we asked for. */
      dispatch_ctr_SAVED = VG_(dispatch_ctr);

      /* paranoia ... */
      vg_assert(vg_threads[tid].tid == tid);

      /* Actually run thread tid. */
      while (True) {

         /* For stats purposes only. */
         VG_(num_scheduling_events_MINOR) ++;

         if (0)
            VG_(message)(Vg_DebugMsg, "thread %d: running for %d bbs",
                                      tid, VG_(dispatch_ctr) - 1 );

         trc = run_thread_for_a_while ( tid );

         /* Deal quickly with trivial scheduling events, and resume the
            thread. */

         if (trc == VG_TRC_INNER_FASTMISS) {
            vg_assert(VG_(dispatch_ctr) > 0);

            /* Trivial event.  Miss in the fast-cache.  Do a full
               lookup for it. */
            trans_addr
               = VG_(search_transtab) ( vg_threads[tid].m_eip );
            if (trans_addr == (Addr)0) {
               /* Not found; we need to request a translation. */
               create_translation_for( tid, vg_threads[tid].m_eip );
               trans_addr = VG_(search_transtab) ( vg_threads[tid].m_eip );
               if (trans_addr == (Addr)0)
                  VG_(panic)("VG_TRC_INNER_FASTMISS: missing tt_fast entry");
            }
            continue; /* with this thread */
         }

         if (trc == VG_TRC_EBP_JMP_CLIENTREQ) {
            Bool is_triv = maybe_do_trivial_clientreq(tid);
            if (is_triv) {
               /* NOTE: a trivial request is something like a call to
                  malloc() or free().  It DOES NOT change the
                  Runnability of this thread nor the status of any
                  other thread; it is purely thread-local. */
               continue; /* with this thread */
            }
         }

         /* It's a non-trivial event.  Give up running this thread and
            handle things the expensive way. */
         break;
      }

      /* ======================= Phase 3 of 3 =======================
         Handle non-trivial thread requests, mostly pthread stuff. */

      /* Ok, we've fallen out of the dispatcher for a
         non-completely-trivial reason. First, update basic-block
         counters.  The extra -1 accounts for the unconditional
         decrement the dispatcher does on entry (see comment above). */

      done_this_time = (Int)dispatch_ctr_SAVED - (Int)VG_(dispatch_ctr) - 1;
      vg_assert(done_this_time >= 0);
      VG_(bbs_to_go)   -= (ULong)done_this_time;
      VG_(bbs_done)    += (ULong)done_this_time;

      if (0 && trc != VG_TRC_INNER_FASTMISS)
         VG_(message)(Vg_DebugMsg, "thread %d:   completed %d bbs, trc %d",
                                   tid, done_this_time, (Int)trc );

      if (0 && trc != VG_TRC_INNER_FASTMISS)
         VG_(message)(Vg_DebugMsg, "thread %d:  %ld bbs, event %s",
                                   tid, VG_(bbs_done),
                                   name_of_sched_event(trc) );

      /* Examine the thread's return code to figure out why it
         stopped, and handle requests. */

      switch (trc) {

         case VG_TRC_INNER_FASTMISS:
            /* Fast-cache misses were fully handled in Phase 2, so
               seeing one here means the inner loop above is broken. */
            VG_(panic)("VG_(scheduler): VG_TRC_INNER_FASTMISS");
            /*NOTREACHED*/
            break;

         case VG_TRC_INNER_COUNTERZERO:
            /* Timeslice is out.  Let a new thread be scheduled,
               simply by doing nothing, causing us to arrive back at
               Phase 1. */
            if (VG_(bbs_to_go) == 0) {
               goto debug_stop;
            }
            vg_assert(VG_(dispatch_ctr) == 0);
            break;

         case VG_TRC_UNRESUMABLE_SIGNAL:
            /* It got a SIGSEGV/SIGBUS, which we need to deliver right
               away.  Again, do nothing, so we wind up back at Phase
               1, whereupon the signal will be "delivered". */
            break;

         case VG_TRC_EBP_JMP_SYSCALL:
            /* Do a syscall for the vthread tid.  This could cause it
               to become non-runnable. */
            sched_do_syscall(tid);
            break;

         case VG_TRC_EBP_JMP_CLIENTREQ:
            /* Do a client request for the vthread tid.  Note that
               some requests will have been handled by
               maybe_do_trivial_clientreq(), so we don't expect to see
               those here.
            */
            /* The thread's %EAX points at an arg block, the first
               word of which is the request code. */
            request_code = ((UInt*)(vg_threads[tid].m_eax))[0];
            if (0) {
               VG_(sprintf)(msg_buf, "request 0x%x", request_code );
               print_sched_event(tid, msg_buf);
            }
            /* Do a non-trivial client request for thread tid.  tid's
               %EAX points to a short vector of argument words, the
               first of which is the request code.  The result of the
               request is put in tid's %EDX.  Alternatively, perhaps
               the request causes tid to become non-runnable and/or
               other blocked threads become runnable.  In general we
               can and often do mess with the state of arbitrary
               threads at this point. */
            if (request_code == VG_USERREQ__SHUTDOWN_VALGRIND) {
               return VgSrc_Shutdown;
            } else {
               do_nontrivial_clientreq(tid);
            }
            break;

         default:
            VG_(printf)("\ntrc = %d\n", trc);
            VG_(panic)("VG_(scheduler), phase 3: "
                       "unexpected thread return code");
            /* NOTREACHED */
            break;

      } /* switch (trc) */

      /* That completes Phase 3 of 3.  Return now to the top of the
         main scheduler loop, to Phase 1 of 3. */

   } /* top-level scheduler loop */


   /* NOTREACHED */
   VG_(panic)("scheduler: post-main-loop ?!");
   /* NOTREACHED */

 debug_stop:
   /* If we exited because of a debug stop, print the translation
      of the last block executed -- by translating it again, and
      throwing away the result. */
   VG_(printf)(
      "======vvvvvvvv====== LAST TRANSLATION ======vvvvvvvv======\n");
   VG_(translate)( &vg_threads[tid], vg_threads[tid].m_eip, NULL, NULL, NULL );
   VG_(printf)("\n");
   VG_(printf)(
      "======^^^^^^^^====== LAST TRANSLATION ======^^^^^^^^======\n");

   return VgSrc_BbsDone;
}
1325
1326
1327/* ---------------------------------------------------------------------
1328 The pthread implementation.
1329 ------------------------------------------------------------------ */
1330
1331#include <pthread.h>
1332#include <errno.h>
1333
1334#if !defined(PTHREAD_STACK_MIN)
1335# define PTHREAD_STACK_MIN (16384 - VG_AR_CLIENT_STACKBASE_REDZONE_SZB)
1336#endif
1337
1338/* /usr/include/bits/pthreadtypes.h:
1339 typedef unsigned long int pthread_t;
1340*/
1341
sewardje663cb92002-04-12 10:26:32 +00001342
sewardj604ec3c2002-04-18 22:38:41 +00001343/* -----------------------------------------------------------
1344 Thread CREATION, JOINAGE and CANCELLATION.
1345 -------------------------------------------------------- */
1346
sewardje663cb92002-04-12 10:26:32 +00001347static
1348void do_pthread_cancel ( ThreadId tid_canceller,
1349 pthread_t tid_cancellee )
1350{
1351 Char msg_buf[100];
1352 /* We want make is appear that this thread has returned to
1353 do_pthread_create_bogusRA with PTHREAD_CANCELED as the
1354 return value. So: simple: put PTHREAD_CANCELED into %EAX
1355 and &do_pthread_create_bogusRA into %EIP and keep going! */
sewardj8937c812002-04-12 20:12:20 +00001356 if (VG_(clo_trace_sched)) {
sewardje663cb92002-04-12 10:26:32 +00001357 VG_(sprintf)(msg_buf, "cancelled by %d", tid_canceller);
1358 print_sched_event(tid_cancellee, msg_buf);
1359 }
1360 vg_threads[tid_cancellee].m_eax = (UInt)PTHREAD_CANCELED;
sewardjbc5b99f2002-04-13 00:08:51 +00001361 vg_threads[tid_cancellee].m_eip = (UInt)&VG_(pthreadreturn_bogusRA);
sewardje663cb92002-04-12 10:26:32 +00001362 vg_threads[tid_cancellee].status = VgTs_Runnable;
1363}
1364
1365
1366
1367/* Thread tid is exiting, by returning from the function it was
sewardjbc5b99f2002-04-13 00:08:51 +00001368 created with. Or possibly due to pthread_exit or cancellation.
1369 The main complication here is to resume any thread waiting to join
1370 with this one. */
sewardje663cb92002-04-12 10:26:32 +00001371static
sewardjbc5b99f2002-04-13 00:08:51 +00001372void handle_pthread_return ( ThreadId tid, void* retval )
sewardje663cb92002-04-12 10:26:32 +00001373{
1374 ThreadId jnr; /* joiner, the thread calling pthread_join. */
1375 UInt* jnr_args;
1376 void** jnr_thread_return;
1377 Char msg_buf[100];
1378
1379 /* Mark it as not in use. Leave the stack in place so the next
1380 user of this slot doesn't reallocate it. */
sewardj6072c362002-04-19 14:40:57 +00001381 vg_assert(is_valid_tid(tid));
sewardje663cb92002-04-12 10:26:32 +00001382 vg_assert(vg_threads[tid].status != VgTs_Empty);
1383
sewardjbc5b99f2002-04-13 00:08:51 +00001384 vg_threads[tid].retval = retval;
sewardje663cb92002-04-12 10:26:32 +00001385
1386 if (vg_threads[tid].joiner == VG_INVALID_THREADID) {
1387 /* No one has yet done a join on me */
1388 vg_threads[tid].status = VgTs_WaitJoiner;
sewardj8937c812002-04-12 20:12:20 +00001389 if (VG_(clo_trace_sched)) {
sewardje663cb92002-04-12 10:26:32 +00001390 VG_(sprintf)(msg_buf,
1391 "root fn returns, waiting for a call pthread_join(%d)",
1392 tid);
1393 print_sched_event(tid, msg_buf);
1394 }
1395 } else {
1396 /* Some is waiting; make their join call return with success,
1397 putting my exit code in the place specified by the caller's
1398 thread_return param. This is all very horrible, since we
1399 need to consult the joiner's arg block -- pointed to by its
1400 %EAX -- in order to extract the 2nd param of its pthread_join
1401 call. TODO: free properly the slot (also below).
1402 */
1403 jnr = vg_threads[tid].joiner;
sewardj6072c362002-04-19 14:40:57 +00001404 vg_assert(is_valid_tid(jnr));
sewardje663cb92002-04-12 10:26:32 +00001405 vg_assert(vg_threads[jnr].status == VgTs_WaitJoinee);
1406 jnr_args = (UInt*)vg_threads[jnr].m_eax;
1407 jnr_thread_return = (void**)(jnr_args[2]);
1408 if (jnr_thread_return != NULL)
1409 *jnr_thread_return = vg_threads[tid].retval;
1410 vg_threads[jnr].m_edx = 0; /* success */
1411 vg_threads[jnr].status = VgTs_Runnable;
1412 vg_threads[tid].status = VgTs_Empty; /* bye! */
sewardj75fe1892002-04-14 02:46:33 +00001413 if (VG_(clo_instrument) && tid != 0)
1414 VGM_(make_noaccess)( vg_threads[tid].stack_base,
1415 vg_threads[tid].stack_size );
sewardj8937c812002-04-12 20:12:20 +00001416 if (VG_(clo_trace_sched)) {
sewardje663cb92002-04-12 10:26:32 +00001417 VG_(sprintf)(msg_buf,
1418 "root fn returns, to find a waiting pthread_join(%d)", tid);
1419 print_sched_event(tid, msg_buf);
1420 VG_(sprintf)(msg_buf,
1421 "my pthread_join(%d) returned; resuming", tid);
1422 print_sched_event(jnr, msg_buf);
1423 }
1424 }
1425
1426 /* Return value is irrelevant; this thread will not get
1427 rescheduled. */
1428}
1429
1430
/* Implement pthread_join: thread tid joins on thread jee, with the
   joinee's exit value to be stored through thread_return (if non-NULL).
   Error results (EDEADLK/EINVAL) and success (0) are delivered via
   tid's %EDX.  Either returns immediately (error, or jee already
   finished) or parks tid in WaitJoinee state. */
static
void do_pthread_join ( ThreadId tid, ThreadId jee, void** thread_return )
{
   Char msg_buf[100];

   /* jee, the joinee, is the thread specified as an arg in thread
      tid's call to pthread_join.  So tid is the join-er. */
   vg_assert(is_valid_tid(tid));
   vg_assert(vg_threads[tid].status == VgTs_Runnable);

   /* Self-join would deadlock. */
   if (jee == tid) {
      vg_threads[tid].m_edx = EDEADLK; /* libc constant, not a kernel one */
      vg_threads[tid].status = VgTs_Runnable;
      return;
   }

   /* NOTE(review): if ThreadId is an unsigned type the jee < 0 test
      is always false (and may draw a compiler warning) -- confirm
      ThreadId's signedness in vg_include.h. */
   if (jee < 0
       || jee >= VG_N_THREADS
       || vg_threads[jee].status == VgTs_Empty) {
      /* Invalid thread to join to. */
      vg_threads[tid].m_edx = EINVAL;
      vg_threads[tid].status = VgTs_Runnable;
      return;
   }

   if (vg_threads[jee].joiner != VG_INVALID_THREADID) {
      /* Someone already did join on this thread */
      vg_threads[tid].m_edx = EINVAL;
      vg_threads[tid].status = VgTs_Runnable;
      return;
   }

   /* if (vg_threads[jee].detached) ... */

   /* Perhaps the joinee has already finished?  If so return
      immediately with its return code, and free up the slot.  TODO:
      free it properly (also above). */
   if (vg_threads[jee].status == VgTs_WaitJoiner) {
      vg_assert(vg_threads[jee].joiner == VG_INVALID_THREADID);
      vg_threads[tid].m_edx = 0; /* success */
      if (thread_return != NULL)
         *thread_return = vg_threads[jee].retval;
      vg_threads[tid].status = VgTs_Runnable;
      vg_threads[jee].status = VgTs_Empty; /* bye! */
      /* Poison the dead joinee's stack so stray accesses are caught. */
      if (VG_(clo_instrument) && jee != 0)
         VGM_(make_noaccess)( vg_threads[jee].stack_base,
                              vg_threads[jee].stack_size );
      if (VG_(clo_trace_sched)) {
         VG_(sprintf)(msg_buf,
            "someone called pthread_join() on me; bye!");
         print_sched_event(jee, msg_buf);
         VG_(sprintf)(msg_buf,
            "my pthread_join(%d) returned immediately",
            jee );
         print_sched_event(tid, msg_buf);
      }
      return;
   }

   /* Ok, so we'll have to wait on jee.  handle_pthread_return() will
      wake us up when jee eventually exits. */
   vg_threads[jee].joiner = tid;
   vg_threads[tid].status = VgTs_WaitJoinee;
   if (VG_(clo_trace_sched)) {
      VG_(sprintf)(msg_buf,
         "blocking on call of pthread_join(%d)", jee );
      print_sched_event(tid, msg_buf);
   }
   /* So tid's join call does not return just now. */
}
1500
1501
/* Implement pthread_create on behalf of parent_tid: allocate a thread
   slot, clone the parent's CPU state into it, give it a stack if it
   lacks one, push arg and the magic return address on that stack,
   point %EIP at start_routine, and make it Runnable.  The new tid is
   stored into *thread; the parent's %EDX gets 0 (success).
   NOTE(review): the attr parameter is currently ignored -- confirm
   that's intentional at this stage of development. */
static
void do_pthread_create ( ThreadId parent_tid,
                         pthread_t* thread,
                         pthread_attr_t* attr,
                         void* (*start_routine)(void *),
                         void* arg )
{
   Addr     new_stack;
   UInt     new_stk_szb;
   ThreadId tid;
   Char     msg_buf[100];

   /* Paranoia ... pthread_t values are stored as UInt-sized tids. */
   vg_assert(sizeof(pthread_t) == sizeof(UInt));

   vg_assert(vg_threads[parent_tid].status != VgTs_Empty);

   tid = vg_alloc_ThreadState();

   /* If we've created the main thread's tid, we're in deep trouble :) */
   vg_assert(tid != 1);
   vg_assert(is_valid_tid(tid));

   /* Copy the parent's CPU state into the child's, in a roundabout
      way (via baseBlock). */
   VG_(load_thread_state)(parent_tid);
   VG_(save_thread_state)(tid);

   /* Consider allocating the child a stack, if the one it already has
      is inadequate. */
   new_stk_szb = PTHREAD_STACK_MIN;

   if (new_stk_szb > vg_threads[tid].stack_size) {
      /* Again, for good measure :) We definitely don't want to be
         allocating a stack for the main thread. */
      vg_assert(tid != 1);
      /* for now, we don't handle the case of anything other than
         assigning it for the first time. */
      vg_assert(vg_threads[tid].stack_size == 0);
      vg_assert(vg_threads[tid].stack_base == (Addr)NULL);
      new_stack = (Addr)VG_(get_memory_from_mmap)( new_stk_szb );
      vg_threads[tid].stack_base = new_stack;
      vg_threads[tid].stack_size = new_stk_szb;
      vg_threads[tid].stack_highest_word
         = new_stack + new_stk_szb
                     - VG_AR_CLIENT_STACKBASE_REDZONE_SZB; /* -4 ??? */;
   }

   /* Initial %ESP: top of stack, below the red zone. */
   vg_threads[tid].m_esp
      = vg_threads[tid].stack_base
        + vg_threads[tid].stack_size
        - VG_AR_CLIENT_STACKBASE_REDZONE_SZB;

   /* Poison the red zone so stack overruns are detected. */
   if (VG_(clo_instrument))
      VGM_(make_noaccess)( vg_threads[tid].m_esp,
                           VG_AR_CLIENT_STACKBASE_REDZONE_SZB );

   /* push arg */
   vg_threads[tid].m_esp -= 4;
   * (UInt*)(vg_threads[tid].m_esp) = (UInt)arg;

   /* push (magical) return address, so that when start_routine
      returns, control lands in the bogus-RA handler and the exit is
      turned into a handle_pthread_return(). */
   vg_threads[tid].m_esp -= 4;
   * (UInt*)(vg_threads[tid].m_esp) = (UInt)VG_(pthreadreturn_bogusRA);

   /* The two words just pushed (arg + return address) are valid. */
   if (VG_(clo_instrument))
      VGM_(make_readable)( vg_threads[tid].m_esp, 2 * 4 );

   /* this is where we start */
   vg_threads[tid].m_eip = (UInt)start_routine;

   if (VG_(clo_trace_sched)) {
      VG_(sprintf)(msg_buf,
         "new thread, created by %d", parent_tid );
      print_sched_event(tid, msg_buf);
   }

   /* store the thread id in *thread. */
   //   if (VG_(clo_instrument))
   // ***** CHECK *thread is writable
   *thread = (pthread_t)tid;

   vg_threads[tid].waited_on_mx = NULL;
   vg_threads[tid].joiner       = VG_INVALID_THREADID;
   vg_threads[tid].status       = VgTs_Runnable;

   /* return zero */
   vg_threads[tid].m_edx = 0; /* success */
}
1591
1592
sewardj604ec3c2002-04-18 22:38:41 +00001593/* -----------------------------------------------------------
1594 MUTEXes
1595 -------------------------------------------------------- */
1596
sewardj604ec3c2002-04-18 22:38:41 +00001597/* pthread_mutex_t is a struct with at 5 words:
sewardje663cb92002-04-12 10:26:32 +00001598 typedef struct
1599 {
1600 int __m_reserved; -- Reserved for future use
1601 int __m_count; -- Depth of recursive locking
1602 _pthread_descr __m_owner; -- Owner thread (if recursive or errcheck)
1603 int __m_kind; -- Mutex kind: fast, recursive or errcheck
1604 struct _pthread_fastlock __m_lock; -- Underlying fast lock
1605 } pthread_mutex_t;
sewardj604ec3c2002-04-18 22:38:41 +00001606
sewardj6072c362002-04-19 14:40:57 +00001607 #define PTHREAD_MUTEX_INITIALIZER \
1608 {0, 0, 0, PTHREAD_MUTEX_TIMED_NP, __LOCK_INITIALIZER}
1609 # define PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP \
1610 {0, 0, 0, PTHREAD_MUTEX_RECURSIVE_NP, __LOCK_INITIALIZER}
1611 # define PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP \
1612 {0, 0, 0, PTHREAD_MUTEX_ERRORCHECK_NP, __LOCK_INITIALIZER}
1613 # define PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP \
1614 {0, 0, 0, PTHREAD_MUTEX_ADAPTIVE_NP, __LOCK_INITIALIZER}
sewardj604ec3c2002-04-18 22:38:41 +00001615
sewardj6072c362002-04-19 14:40:57 +00001616 How we use it:
sewardj604ec3c2002-04-18 22:38:41 +00001617
sewardj6072c362002-04-19 14:40:57 +00001618 __m_kind never changes and indicates whether or not it is recursive.
1619
1620 __m_count indicates the lock count; if 0, the mutex is not owned by
1621 anybody.
1622
1623 __m_owner has a ThreadId value stuffed into it. We carefully arrange
1624 that ThreadId == 0 is invalid (VG_INVALID_THREADID), so that
1625 statically initialised mutexes correctly appear
1626 to belong to nobody.
1627
   In summary, a not-in-use mutex is distinguished by having __m_owner
1629 == 0 (VG_INVALID_THREADID) and __m_count == 0 too. If one of those
1630 conditions holds, the other should too.
1631
1632 There is no linked list of threads waiting for this mutex. Instead
1633 a thread in WaitMX state points at the mutex with its waited_on_mx
1634 field. This makes _unlock() inefficient, but simple to implement the
   right semantics vis-a-vis signals.
sewardje663cb92002-04-12 10:26:32 +00001636
sewardj604ec3c2002-04-18 22:38:41 +00001637 We don't have to deal with mutex initialisation; the client side
sewardj6072c362002-04-19 14:40:57 +00001638 deals with that for us.
1639*/
sewardje663cb92002-04-12 10:26:32 +00001640
1641
1642static
1643void do_pthread_mutex_lock( ThreadId tid, pthread_mutex_t *mutex )
1644{
sewardj604ec3c2002-04-18 22:38:41 +00001645 Char msg_buf[100];
sewardje663cb92002-04-12 10:26:32 +00001646
sewardj604ec3c2002-04-18 22:38:41 +00001647 if (VG_(clo_trace_pthread_level) >= 2) {
1648 VG_(sprintf)(msg_buf, "pthread_mutex_lock %p", mutex );
1649 print_pthread_event(tid, msg_buf);
1650 }
1651
1652 /* Paranoia ... */
1653 vg_assert(is_valid_tid(tid)
1654 && vg_threads[tid].status == VgTs_Runnable);
sewardje663cb92002-04-12 10:26:32 +00001655
1656 /* POSIX doesn't mandate this, but for sanity ... */
1657 if (mutex == NULL) {
1658 vg_threads[tid].m_edx = EINVAL;
1659 return;
1660 }
1661
sewardj604ec3c2002-04-18 22:38:41 +00001662 /* More paranoia ... */
1663 switch (mutex->__m_kind) {
1664 case PTHREAD_MUTEX_TIMED_NP:
1665 case PTHREAD_MUTEX_RECURSIVE_NP:
1666 case PTHREAD_MUTEX_ERRORCHECK_NP:
1667 case PTHREAD_MUTEX_ADAPTIVE_NP:
1668 if (mutex->__m_count >= 0) break;
1669 /* else fall thru */
1670 default:
1671 vg_threads[tid].m_edx = EINVAL;
1672 return;
sewardje663cb92002-04-12 10:26:32 +00001673 }
1674
sewardj604ec3c2002-04-18 22:38:41 +00001675 if (mutex->__m_count > 0) {
sewardje663cb92002-04-12 10:26:32 +00001676
sewardj604ec3c2002-04-18 22:38:41 +00001677 vg_assert(is_valid_tid((ThreadId)mutex->__m_owner));
sewardjf8f819e2002-04-17 23:21:37 +00001678
1679 /* Someone has it already. */
sewardj604ec3c2002-04-18 22:38:41 +00001680 if ((ThreadId)mutex->__m_owner == tid) {
sewardjf8f819e2002-04-17 23:21:37 +00001681 /* It's locked -- by me! */
sewardj604ec3c2002-04-18 22:38:41 +00001682 if (mutex->__m_kind == PTHREAD_MUTEX_RECURSIVE_NP) {
sewardjf8f819e2002-04-17 23:21:37 +00001683 /* return 0 (success). */
sewardj604ec3c2002-04-18 22:38:41 +00001684 mutex->__m_count++;
sewardjf8f819e2002-04-17 23:21:37 +00001685 vg_threads[tid].m_edx = 0;
sewardj604ec3c2002-04-18 22:38:41 +00001686 VG_(printf)("!!!!!! tid %d, mutex %p -> locked %d\n",
1687 tid, mutex, mutex->__m_count);
sewardjf8f819e2002-04-17 23:21:37 +00001688 return;
1689 } else {
1690 vg_threads[tid].m_edx = EDEADLK;
1691 return;
1692 }
1693 } else {
sewardj6072c362002-04-19 14:40:57 +00001694 /* Someone else has it; we have to wait. Mark ourselves
1695 thusly. */
1696 vg_threads[tid].status = VgTs_WaitMX;
1697 vg_threads[tid].waited_on_mx = mutex;
sewardjf8f819e2002-04-17 23:21:37 +00001698 /* No assignment to %EDX, since we're blocking. */
1699 if (VG_(clo_trace_pthread_level) >= 1) {
sewardj604ec3c2002-04-18 22:38:41 +00001700 VG_(sprintf)(msg_buf, "pthread_mutex_lock %p: BLOCK",
1701 mutex );
sewardjf8f819e2002-04-17 23:21:37 +00001702 print_pthread_event(tid, msg_buf);
1703 }
sewardje663cb92002-04-12 10:26:32 +00001704 return;
1705 }
sewardjf8f819e2002-04-17 23:21:37 +00001706
sewardje663cb92002-04-12 10:26:32 +00001707 } else {
sewardj6072c362002-04-19 14:40:57 +00001708 /* Nobody owns it. Sanity check ... */
1709 vg_assert(mutex->__m_owner == VG_INVALID_THREADID);
sewardjf8f819e2002-04-17 23:21:37 +00001710 /* We get it! [for the first time]. */
sewardj604ec3c2002-04-18 22:38:41 +00001711 mutex->__m_count = 1;
1712 mutex->__m_owner = (_pthread_descr)tid;
sewardj6072c362002-04-19 14:40:57 +00001713 vg_assert(vg_threads[tid].waited_on_mx == NULL);
sewardje663cb92002-04-12 10:26:32 +00001714 /* return 0 (success). */
1715 vg_threads[tid].m_edx = 0;
1716 }
sewardjf8f819e2002-04-17 23:21:37 +00001717
sewardje663cb92002-04-12 10:26:32 +00001718}
1719
1720
1721static
1722void do_pthread_mutex_unlock ( ThreadId tid,
1723 pthread_mutex_t *mutex )
1724{
sewardje663cb92002-04-12 10:26:32 +00001725 Int i;
1726 Char msg_buf[100];
1727
sewardj45b4b372002-04-16 22:50:32 +00001728 if (VG_(clo_trace_pthread_level) >= 2) {
sewardj604ec3c2002-04-18 22:38:41 +00001729 VG_(sprintf)(msg_buf, "pthread_mutex_unlock %p", mutex );
sewardj8937c812002-04-12 20:12:20 +00001730 print_pthread_event(tid, msg_buf);
1731 }
1732
sewardj604ec3c2002-04-18 22:38:41 +00001733 /* Paranoia ... */
1734 vg_assert(is_valid_tid(tid)
1735 && vg_threads[tid].status == VgTs_Runnable);
1736
1737 if (mutex == NULL) {
1738 vg_threads[tid].m_edx = EINVAL;
1739 return;
1740 }
1741
1742 /* More paranoia ... */
1743 switch (mutex->__m_kind) {
1744 case PTHREAD_MUTEX_TIMED_NP:
1745 case PTHREAD_MUTEX_RECURSIVE_NP:
1746 case PTHREAD_MUTEX_ERRORCHECK_NP:
1747 case PTHREAD_MUTEX_ADAPTIVE_NP:
1748 if (mutex->__m_count >= 0) break;
1749 /* else fall thru */
1750 default:
1751 vg_threads[tid].m_edx = EINVAL;
1752 return;
1753 }
sewardje663cb92002-04-12 10:26:32 +00001754
1755 /* Barf if we don't currently hold the mutex. */
sewardj604ec3c2002-04-18 22:38:41 +00001756 if (mutex->__m_count == 0 /* nobody holds it */
1757 || (ThreadId)mutex->__m_owner != tid /* we don't hold it */) {
sewardje663cb92002-04-12 10:26:32 +00001758 vg_threads[tid].m_edx = EPERM;
1759 return;
1760 }
1761
sewardjf8f819e2002-04-17 23:21:37 +00001762 /* If it's a multiply-locked recursive mutex, just decrement the
1763 lock count and return. */
sewardj604ec3c2002-04-18 22:38:41 +00001764 if (mutex->__m_count > 1) {
1765 vg_assert(mutex->__m_kind == PTHREAD_MUTEX_RECURSIVE_NP);
1766 mutex->__m_count --;
sewardjf8f819e2002-04-17 23:21:37 +00001767 vg_threads[tid].m_edx = 0; /* success */
1768 return;
1769 }
1770
sewardj604ec3c2002-04-18 22:38:41 +00001771 /* Now we're sure it is locked exactly once, and by the thread who
sewardjf8f819e2002-04-17 23:21:37 +00001772 is now doing an unlock on it. */
sewardj604ec3c2002-04-18 22:38:41 +00001773 vg_assert(mutex->__m_count == 1);
sewardj6072c362002-04-19 14:40:57 +00001774 vg_assert((ThreadId)mutex->__m_owner == tid);
sewardjf8f819e2002-04-17 23:21:37 +00001775
sewardj6072c362002-04-19 14:40:57 +00001776 /* Find some arbitrary thread waiting on this mutex, and make it
1777 runnable. If none are waiting, mark the mutex as not held. */
1778 for (i = 1; i < VG_N_THREADS; i++) {
1779 if (vg_threads[i].status == VgTs_Empty)
1780 continue;
1781 if (vg_threads[i].status == VgTs_WaitMX
1782 && vg_threads[i].waited_on_mx == mutex)
1783 break;
1784 }
sewardje663cb92002-04-12 10:26:32 +00001785
sewardj6072c362002-04-19 14:40:57 +00001786 vg_assert(i <= VG_N_THREADS);
1787 if (i == VG_N_THREADS) {
sewardje663cb92002-04-12 10:26:32 +00001788 /* Nobody else is waiting on it. */
sewardj604ec3c2002-04-18 22:38:41 +00001789 mutex->__m_count = 0;
sewardj6072c362002-04-19 14:40:57 +00001790 mutex->__m_owner = VG_INVALID_THREADID;
sewardje663cb92002-04-12 10:26:32 +00001791 } else {
1792 /* Notionally transfer the hold to thread i, whose
1793 pthread_mutex_lock() call now returns with 0 (success). */
sewardjf8f819e2002-04-17 23:21:37 +00001794 /* The .count is already == 1. */
sewardj6072c362002-04-19 14:40:57 +00001795 vg_assert(vg_threads[i].waited_on_mx == mutex);
sewardj604ec3c2002-04-18 22:38:41 +00001796 mutex->__m_owner = (_pthread_descr)i;
sewardj6072c362002-04-19 14:40:57 +00001797 vg_threads[i].status = VgTs_Runnable;
1798 vg_threads[i].waited_on_mx = NULL;
sewardje663cb92002-04-12 10:26:32 +00001799 vg_threads[i].m_edx = 0; /* pth_lock() success */
sewardj8937c812002-04-12 20:12:20 +00001800
sewardj45b4b372002-04-16 22:50:32 +00001801 if (VG_(clo_trace_pthread_level) >= 1) {
sewardj604ec3c2002-04-18 22:38:41 +00001802 VG_(sprintf)(msg_buf, "pthread_mutex_lock %p: RESUME",
1803 mutex );
1804 print_pthread_event(i, msg_buf);
sewardje663cb92002-04-12 10:26:32 +00001805 }
1806 }
1807
1808 /* In either case, our (tid's) pth_unlock() returns with 0
1809 (success). */
1810 vg_threads[tid].m_edx = 0; /* Success. */
1811}
1812
1813
sewardj6072c362002-04-19 14:40:57 +00001814/* -----------------------------------------------------------
1815 CONDITION VARIABLES
1816 -------------------------------------------------------- */
sewardje663cb92002-04-12 10:26:32 +00001817
sewardj6072c362002-04-19 14:40:57 +00001818/* The relevant native types are as follows:
1819 (copied from /usr/include/bits/pthreadtypes.h)
sewardj77e466c2002-04-14 02:29:29 +00001820
   -- Conditions (not abstract because of PTHREAD_COND_INITIALIZER)
1822 typedef struct
1823 {
1824 struct _pthread_fastlock __c_lock; -- Protect against concurrent access
1825 _pthread_descr __c_waiting; -- Threads waiting on this condition
1826 } pthread_cond_t;
sewardj77e466c2002-04-14 02:29:29 +00001827
   -- Attribute for condition variables.
1829 typedef struct
1830 {
1831 int __dummy;
1832 } pthread_condattr_t;
sewardj77e466c2002-04-14 02:29:29 +00001833
sewardj6072c362002-04-19 14:40:57 +00001834 #define PTHREAD_COND_INITIALIZER {__LOCK_INITIALIZER, 0}
sewardj77e466c2002-04-14 02:29:29 +00001835
sewardj6072c362002-04-19 14:40:57 +00001836 We'll just use the __c_waiting field to point to the head of the
1837 list of threads waiting on this condition. Note how the static
1838 initialiser has __c_waiting == 0 == VG_INVALID_THREADID.
1839
1840 Linux pthreads supports no attributes on condition variables, so we
1841 don't need to think too hard there.
1842*/
1843
sewardj77e466c2002-04-14 02:29:29 +00001844
1845
sewardje663cb92002-04-12 10:26:32 +00001846/* ---------------------------------------------------------------------
1847 Handle non-trivial client requests.
1848 ------------------------------------------------------------------ */
1849
1850static
1851void do_nontrivial_clientreq ( ThreadId tid )
1852{
1853 UInt* arg = (UInt*)(vg_threads[tid].m_eax);
1854 UInt req_no = arg[0];
1855 switch (req_no) {
1856
1857 case VG_USERREQ__PTHREAD_CREATE:
1858 do_pthread_create( tid,
1859 (pthread_t*)arg[1],
1860 (pthread_attr_t*)arg[2],
1861 (void*(*)(void*))arg[3],
1862 (void*)arg[4] );
1863 break;
1864
sewardjbc5b99f2002-04-13 00:08:51 +00001865 case VG_USERREQ__PTHREAD_RETURNS:
1866 handle_pthread_return( tid, (void*)arg[1] );
sewardje663cb92002-04-12 10:26:32 +00001867 break;
1868
1869 case VG_USERREQ__PTHREAD_JOIN:
1870 do_pthread_join( tid, arg[1], (void**)(arg[2]) );
1871 break;
1872
sewardje663cb92002-04-12 10:26:32 +00001873 case VG_USERREQ__PTHREAD_MUTEX_LOCK:
1874 do_pthread_mutex_lock( tid, (pthread_mutex_t *)(arg[1]) );
1875 break;
1876
1877 case VG_USERREQ__PTHREAD_MUTEX_UNLOCK:
1878 do_pthread_mutex_unlock( tid, (pthread_mutex_t *)(arg[1]) );
1879 break;
1880
sewardje663cb92002-04-12 10:26:32 +00001881 case VG_USERREQ__PTHREAD_CANCEL:
1882 do_pthread_cancel( tid, (pthread_t)(arg[1]) );
1883 break;
1884
1885 case VG_USERREQ__MAKE_NOACCESS:
1886 case VG_USERREQ__MAKE_WRITABLE:
1887 case VG_USERREQ__MAKE_READABLE:
1888 case VG_USERREQ__DISCARD:
1889 case VG_USERREQ__CHECK_WRITABLE:
1890 case VG_USERREQ__CHECK_READABLE:
1891 case VG_USERREQ__MAKE_NOACCESS_STACK:
1892 case VG_USERREQ__RUNNING_ON_VALGRIND:
1893 case VG_USERREQ__DO_LEAK_CHECK:
sewardj8c824512002-04-14 04:16:48 +00001894 vg_threads[tid].m_edx
1895 = VG_(handle_client_request) ( &vg_threads[tid], arg );
sewardje663cb92002-04-12 10:26:32 +00001896 break;
1897
sewardj77e466c2002-04-14 02:29:29 +00001898 case VG_USERREQ__SIGNAL_RETURNS:
1899 handle_signal_return(tid);
1900 break;
sewardj54cacf02002-04-12 23:24:59 +00001901
sewardje663cb92002-04-12 10:26:32 +00001902 default:
1903 VG_(printf)("panic'd on private request = 0x%x\n", arg[0] );
1904 VG_(panic)("handle_private_client_pthread_request: "
1905 "unknown request");
1906 /*NOTREACHED*/
1907 break;
1908 }
1909}
1910
1911
sewardj6072c362002-04-19 14:40:57 +00001912/* ---------------------------------------------------------------------
1913 Sanity checking.
1914 ------------------------------------------------------------------ */
1915
1916/* Internal consistency checks on the sched/pthread structures. */
1917static
1918void scheduler_sanity ( void )
1919{
1920 pthread_mutex_t* mutex;
1921 Int i;
1922 /* VG_(printf)("scheduler_sanity\n"); */
1923 for (i = 1; i < VG_N_THREADS; i++) {
1924 if (vg_threads[i].status == VgTs_WaitMX) {
1925 mutex = vg_threads[i].waited_on_mx;
1926 vg_assert(mutex != NULL);
1927 vg_assert(mutex->__m_count > 0);
1928 vg_assert(is_valid_tid((ThreadId)mutex->__m_owner));
1929 } else {
1930 vg_assert(vg_threads[i].waited_on_mx == NULL);
1931 }
1932 }
1933}
1934
1935
sewardje663cb92002-04-12 10:26:32 +00001936/*--------------------------------------------------------------------*/
1937/*--- end vg_scheduler.c ---*/
1938/*--------------------------------------------------------------------*/