1
2/*--------------------------------------------------------------------*/
3/*--- A user-space pthreads implementation. vg_scheduler.c ---*/
4/*--------------------------------------------------------------------*/
5
6/*
7 This file is part of Valgrind, an x86 protected-mode emulator
8 designed for debugging and profiling binaries on x86-Unixes.
9
10 Copyright (C) 2000-2002 Julian Seward
11 jseward@acm.org
12 Julian_Seward@muraroa.demon.co.uk
13
14 This program is free software; you can redistribute it and/or
15 modify it under the terms of the GNU General Public License as
16 published by the Free Software Foundation; either version 2 of the
17 License, or (at your option) any later version.
18
19 This program is distributed in the hope that it will be useful, but
20 WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 General Public License for more details.
23
24 You should have received a copy of the GNU General Public License
25 along with this program; if not, write to the Free Software
26 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
27 02111-1307, USA.
28
29 The GNU General Public License is contained in the file LICENSE.
30*/
31
32#include "vg_include.h"
33#include "vg_constants.h"
34
35#include "valgrind.h" /* for VG_USERREQ__MAKE_NOACCESS and
36 VG_USERREQ__DO_LEAK_CHECK */
37
38/* BORKAGE as of 11 Apr 02
39
40Note! This implementation is so poor as to not be suitable for use by
41anyone at all!
42
43- properly save scheduler private state in signal delivery frames.
44
45- fd-poll optimisation (don't select with empty sets)
46
47- signals interrupting read/write and nanosleep, and take notice
48 of SA_RESTART or not
49
50- return bogus RA: %EAX trashed, so pthread_joiner gets nonsense
51 exit codes
52
53- when a thread is done mark its stack as noaccess
54
55- make signal return and .fini call be detected via request mechanism
56
57 */
58
59
60/* ---------------------------------------------------------------------
61 Types and globals for the scheduler.
62 ------------------------------------------------------------------ */
63
64/* type ThreadId is defined in vg_include.h. */
65
66/* struct ThreadState is defined in vg_include.h. */
67
68/* Private globals. A statically allocated array of threads. */
69static ThreadState vg_threads[VG_N_THREADS];
70
71
72/* vg_oursignalhandler() might longjmp(). Here's the jmp_buf. */
73jmp_buf VG_(scheduler_jmpbuf);
74/* ... and if so, here's the signal which caused it to do so. */
75Int VG_(longjmpd_on_signal);
76
77
78/* Machinery to keep track of which threads are waiting on which
79 fds. */
80typedef
81 struct {
82 /* The thread which made the request. */
83 ThreadId tid;
84
85 /* The next two fields describe the request. */
86 /* File descriptor waited for. -1 means this slot is not in use */
87 Int fd;
88 /* The syscall number the fd is used in. */
89 Int syscall_no;
90
91 /* False => still waiting for select to tell us the fd is ready
92 to go. True => the fd is ready, but the results have not yet
93 been delivered back to the calling thread. Once the latter
94 happens, this entire record is marked as no longer in use, by
95 making the fd field be -1. */
96 Bool ready;
97 }
98 VgWaitedOnFd;
99
100static VgWaitedOnFd vg_waiting_fds[VG_N_WAITING_FDS];
101
102
103
104typedef
105 struct {
106 /* Is this slot in use, or free? */
107 Bool in_use;
108 /* If in_use, is this mutex held by some thread, or not? */
109 Bool held;
110 /* if held==True, owner indicates who by. */
111 ThreadId owner;
112 }
113 VgMutex;
114
115static VgMutex vg_mutexes[VG_N_MUTEXES];
116
117/* Forwards */
118static void do_nontrivial_clientreq ( ThreadId tid );
119
120
121/* ---------------------------------------------------------------------
122 Helper functions for the scheduler.
123 ------------------------------------------------------------------ */
124
125static
126void pp_sched_status ( void )
127{
128 Int i;
129 VG_(printf)("\nsched status:\n");
130 for (i = 0; i < VG_N_THREADS; i++) {
131 if (vg_threads[i].status == VgTs_Empty) continue;
132 VG_(printf)("tid %d: ", i);
133 switch (vg_threads[i].status) {
134 case VgTs_Runnable: VG_(printf)("Runnable\n"); break;
135 case VgTs_WaitFD: VG_(printf)("WaitFD\n"); break;
136 case VgTs_WaitJoiner: VG_(printf)("WaitJoiner(%d)\n",
137 vg_threads[i].joiner); break;
138 case VgTs_WaitJoinee: VG_(printf)("WaitJoinee\n"); break;
139 default: VG_(printf)("???"); break;
140 }
141 }
142 VG_(printf)("\n");
143}
144
145static
146void add_waiting_fd ( ThreadId tid, Int fd, Int syscall_no )
147{
148 Int i;
149
150 vg_assert(fd != -1); /* avoid total chaos */
151
152 for (i = 0; i < VG_N_WAITING_FDS; i++)
153 if (vg_waiting_fds[i].fd == -1)
154 break;
155
156 if (i == VG_N_WAITING_FDS)
157 VG_(panic)("add_waiting_fd: VG_N_WAITING_FDS is too low");
158 /*
159 VG_(printf)("add_waiting_fd: add (tid %d, fd %d) at slot %d\n",
160 tid, fd, i);
161 */
162 vg_waiting_fds[i].fd = fd;
163 vg_waiting_fds[i].tid = tid;
164 vg_waiting_fds[i].ready = False;
165 vg_waiting_fds[i].syscall_no = syscall_no;
166}
167
168
169
170static
171void print_sched_event ( ThreadId tid, Char* what )
172{
173   VG_(message)(Vg_DebugMsg, "SCHED[%d]: %s", tid, what );
174}
175
176
177static
178void print_pthread_event ( ThreadId tid, Char* what )
179{
180 VG_(message)(Vg_DebugMsg, "PTHREAD[%d]: %s", tid, what );
181}
182
183
184static
185Char* name_of_sched_event ( UInt event )
186{
187 switch (event) {
188 case VG_TRC_EBP_JMP_SPECIAL: return "JMP_SPECIAL";
189 case VG_TRC_EBP_JMP_SYSCALL: return "SYSCALL";
190 case VG_TRC_EBP_JMP_CLIENTREQ: return "CLIENTREQ";
191 case VG_TRC_INNER_COUNTERZERO: return "COUNTERZERO";
192 case VG_TRC_INNER_FASTMISS: return "FASTMISS";
193 case VG_TRC_UNRESUMABLE_SIGNAL: return "FATALSIGNAL";
194 default: return "??UNKNOWN??";
195 }
196}
197
198
199/* Create a translation of the client basic block beginning at
200 orig_addr, and add it to the translation cache & translation table.
201 This probably doesn't really belong here, but, hey ...
202*/
203void VG_(create_translation_for) ( Addr orig_addr )
204{
205 Addr trans_addr;
206 TTEntry tte;
207 Int orig_size, trans_size;
208 /* Ensure there is space to hold a translation. */
209 VG_(maybe_do_lru_pass)();
210 VG_(translate)( orig_addr, &orig_size, &trans_addr, &trans_size );
211 /* Copy data at trans_addr into the translation cache.
212 Returned pointer is to the code, not to the 4-byte
213 header. */
214 /* Since the .orig_size and .trans_size fields are
215 UShort, be paranoid. */
216 vg_assert(orig_size > 0 && orig_size < 65536);
217 vg_assert(trans_size > 0 && trans_size < 65536);
218 tte.orig_size = orig_size;
219 tte.orig_addr = orig_addr;
220 tte.trans_size = trans_size;
221 tte.trans_addr = VG_(copy_to_transcache)
222 ( trans_addr, trans_size );
223 tte.mru_epoch = VG_(current_epoch);
224 /* Free the intermediary -- was allocated by VG_(emit_code). */
225 VG_(jitfree)( (void*)trans_addr );
226 /* Add to trans tab and set back pointer. */
227 VG_(add_to_trans_tab) ( &tte );
228 /* Update stats. */
229 VG_(this_epoch_in_count) ++;
230 VG_(this_epoch_in_osize) += orig_size;
231 VG_(this_epoch_in_tsize) += trans_size;
232 VG_(overall_in_count) ++;
233 VG_(overall_in_osize) += orig_size;
234 VG_(overall_in_tsize) += trans_size;
235 /* Record translated area for SMC detection. */
236 VG_(smc_mark_original) ( orig_addr, orig_size );
237}
238
239
240/* Allocate a completely empty ThreadState record. */
241static
242ThreadId vg_alloc_ThreadState ( void )
243{
244 Int i;
245 for (i = 0; i < VG_N_THREADS; i++) {
246 if (vg_threads[i].status == VgTs_Empty)
247 return i;
248 }
249 VG_(printf)("vg_alloc_ThreadState: no free slots available\n");
250 VG_(printf)("Increase VG_N_THREADS, rebuild and try again.\n");
251 VG_(panic)("VG_N_THREADS is too low");
252 /*NOTREACHED*/
253}
254
255
256ThreadState* VG_(get_thread_state) ( ThreadId tid )
257{
258 vg_assert(tid >= 0 && tid < VG_N_THREADS);
259 vg_assert(vg_threads[tid].status != VgTs_Empty);
260 return & vg_threads[tid];
261}
262
263
264/* Find an unused VgMutex record. */
265static
266MutexId vg_alloc_VgMutex ( void )
267{
268 Int i;
269 for (i = 0; i < VG_N_MUTEXES; i++) {
270 if (!vg_mutexes[i].in_use)
271 return i;
272 }
273 VG_(printf)("vg_alloc_VgMutex: no free slots available\n");
274 VG_(printf)("Increase VG_N_MUTEXES, rebuild and try again.\n");
275 VG_(panic)("VG_N_MUTEXES is too low");
276 /*NOTREACHED*/
277}
278
279
280/* Copy the saved state of a thread into VG_(baseBlock), ready for it
281 to be run. */
282__inline__
283void VG_(load_thread_state) ( ThreadId tid )
284{
285 Int i;
286 VG_(baseBlock)[VGOFF_(m_eax)] = vg_threads[tid].m_eax;
287 VG_(baseBlock)[VGOFF_(m_ebx)] = vg_threads[tid].m_ebx;
288 VG_(baseBlock)[VGOFF_(m_ecx)] = vg_threads[tid].m_ecx;
289 VG_(baseBlock)[VGOFF_(m_edx)] = vg_threads[tid].m_edx;
290 VG_(baseBlock)[VGOFF_(m_esi)] = vg_threads[tid].m_esi;
291 VG_(baseBlock)[VGOFF_(m_edi)] = vg_threads[tid].m_edi;
292 VG_(baseBlock)[VGOFF_(m_ebp)] = vg_threads[tid].m_ebp;
293 VG_(baseBlock)[VGOFF_(m_esp)] = vg_threads[tid].m_esp;
294 VG_(baseBlock)[VGOFF_(m_eflags)] = vg_threads[tid].m_eflags;
295 VG_(baseBlock)[VGOFF_(m_eip)] = vg_threads[tid].m_eip;
296
297 for (i = 0; i < VG_SIZE_OF_FPUSTATE_W; i++)
298 VG_(baseBlock)[VGOFF_(m_fpustate) + i] = vg_threads[tid].m_fpu[i];
299
300 VG_(baseBlock)[VGOFF_(sh_eax)] = vg_threads[tid].sh_eax;
301 VG_(baseBlock)[VGOFF_(sh_ebx)] = vg_threads[tid].sh_ebx;
302 VG_(baseBlock)[VGOFF_(sh_ecx)] = vg_threads[tid].sh_ecx;
303 VG_(baseBlock)[VGOFF_(sh_edx)] = vg_threads[tid].sh_edx;
304 VG_(baseBlock)[VGOFF_(sh_esi)] = vg_threads[tid].sh_esi;
305 VG_(baseBlock)[VGOFF_(sh_edi)] = vg_threads[tid].sh_edi;
306 VG_(baseBlock)[VGOFF_(sh_ebp)] = vg_threads[tid].sh_ebp;
307 VG_(baseBlock)[VGOFF_(sh_esp)] = vg_threads[tid].sh_esp;
308 VG_(baseBlock)[VGOFF_(sh_eflags)] = vg_threads[tid].sh_eflags;
309}
310
311
312/* Copy the state of a thread from VG_(baseBlock), presumably after it
313 has been descheduled. For sanity-check purposes, fill the vacated
314 VG_(baseBlock) with garbage so as to make the system more likely to
315 fail quickly if we erroneously continue to poke around inside
316 VG_(baseBlock) without first doing a load_thread_state().
317*/
318__inline__
319void VG_(save_thread_state) ( ThreadId tid )
320{
321 Int i;
322 const UInt junk = 0xDEADBEEF;
323
324 vg_threads[tid].m_eax = VG_(baseBlock)[VGOFF_(m_eax)];
325 vg_threads[tid].m_ebx = VG_(baseBlock)[VGOFF_(m_ebx)];
326 vg_threads[tid].m_ecx = VG_(baseBlock)[VGOFF_(m_ecx)];
327 vg_threads[tid].m_edx = VG_(baseBlock)[VGOFF_(m_edx)];
328 vg_threads[tid].m_esi = VG_(baseBlock)[VGOFF_(m_esi)];
329 vg_threads[tid].m_edi = VG_(baseBlock)[VGOFF_(m_edi)];
330 vg_threads[tid].m_ebp = VG_(baseBlock)[VGOFF_(m_ebp)];
331 vg_threads[tid].m_esp = VG_(baseBlock)[VGOFF_(m_esp)];
332 vg_threads[tid].m_eflags = VG_(baseBlock)[VGOFF_(m_eflags)];
333 vg_threads[tid].m_eip = VG_(baseBlock)[VGOFF_(m_eip)];
334
335 for (i = 0; i < VG_SIZE_OF_FPUSTATE_W; i++)
336 vg_threads[tid].m_fpu[i] = VG_(baseBlock)[VGOFF_(m_fpustate) + i];
337
338 vg_threads[tid].sh_eax = VG_(baseBlock)[VGOFF_(sh_eax)];
339 vg_threads[tid].sh_ebx = VG_(baseBlock)[VGOFF_(sh_ebx)];
340 vg_threads[tid].sh_ecx = VG_(baseBlock)[VGOFF_(sh_ecx)];
341 vg_threads[tid].sh_edx = VG_(baseBlock)[VGOFF_(sh_edx)];
342 vg_threads[tid].sh_esi = VG_(baseBlock)[VGOFF_(sh_esi)];
343 vg_threads[tid].sh_edi = VG_(baseBlock)[VGOFF_(sh_edi)];
344 vg_threads[tid].sh_ebp = VG_(baseBlock)[VGOFF_(sh_ebp)];
345 vg_threads[tid].sh_esp = VG_(baseBlock)[VGOFF_(sh_esp)];
346 vg_threads[tid].sh_eflags = VG_(baseBlock)[VGOFF_(sh_eflags)];
347
348 /* Fill it up with junk. */
349 VG_(baseBlock)[VGOFF_(m_eax)] = junk;
350 VG_(baseBlock)[VGOFF_(m_ebx)] = junk;
351 VG_(baseBlock)[VGOFF_(m_ecx)] = junk;
352 VG_(baseBlock)[VGOFF_(m_edx)] = junk;
353 VG_(baseBlock)[VGOFF_(m_esi)] = junk;
354 VG_(baseBlock)[VGOFF_(m_edi)] = junk;
355 VG_(baseBlock)[VGOFF_(m_ebp)] = junk;
356 VG_(baseBlock)[VGOFF_(m_esp)] = junk;
357 VG_(baseBlock)[VGOFF_(m_eflags)] = junk;
358 VG_(baseBlock)[VGOFF_(m_eip)] = junk;
359
360 for (i = 0; i < VG_SIZE_OF_FPUSTATE_W; i++)
361 VG_(baseBlock)[VGOFF_(m_fpustate) + i] = junk;
362}
363
364
365/* Run the thread tid for a while, and return a VG_TRC_* value to the
366 scheduler indicating what happened. */
367static
368UInt run_thread_for_a_while ( ThreadId tid )
369{
370 UInt trc = 0;
371 vg_assert(tid >= 0 && tid < VG_N_THREADS);
372 vg_assert(vg_threads[tid].status != VgTs_Empty);
373 vg_assert(VG_(bbs_to_go) > 0);
374
375 VG_(load_thread_state) ( tid );
376 if (__builtin_setjmp(VG_(scheduler_jmpbuf)) == 0) {
377 /* try this ... */
378 trc = VG_(run_innerloop)();
379 /* We get here if the client didn't take a fault. */
380 } else {
381 /* We get here if the client took a fault, which caused our
382 signal handler to longjmp. */
383 vg_assert(trc == 0);
384 trc = VG_TRC_UNRESUMABLE_SIGNAL;
385 }
386 VG_(save_thread_state) ( tid );
387 return trc;
388}
389
390
391/* Increment the LRU epoch counter. */
392static
393void increment_epoch ( void )
394{
395 VG_(current_epoch)++;
396 if (VG_(clo_verbosity) > 2) {
397 UInt tt_used, tc_used;
398 VG_(get_tt_tc_used) ( &tt_used, &tc_used );
399 VG_(message)(Vg_UserMsg,
400 "%lu bbs, in: %d (%d -> %d), out %d (%d -> %d), TT %d, TC %d",
401 VG_(bbs_done),
402 VG_(this_epoch_in_count),
403 VG_(this_epoch_in_osize),
404 VG_(this_epoch_in_tsize),
405 VG_(this_epoch_out_count),
406 VG_(this_epoch_out_osize),
407 VG_(this_epoch_out_tsize),
408 tt_used, tc_used
409 );
410 }
411 VG_(this_epoch_in_count) = 0;
412 VG_(this_epoch_in_osize) = 0;
413 VG_(this_epoch_in_tsize) = 0;
414 VG_(this_epoch_out_count) = 0;
415 VG_(this_epoch_out_osize) = 0;
416 VG_(this_epoch_out_tsize) = 0;
417}
418
419
420/* Initialise the scheduler. Create a single "main" thread ready to
421 run, with special ThreadId of zero. This is called at startup; the
422 caller takes care to park the client's state is parked in
423 VG_(baseBlock).
424*/
425void VG_(scheduler_init) ( void )
426{
427 Int i;
428 Addr startup_esp;
429 ThreadId tid_main;
430
431 startup_esp = VG_(baseBlock)[VGOFF_(m_esp)];
432 if ((startup_esp & VG_STARTUP_STACK_MASK) != VG_STARTUP_STACK_MASK) {
433 VG_(printf)("%esp at startup = %p is not near %p; aborting\n",
434 startup_esp, VG_STARTUP_STACK_MASK);
435 VG_(panic)("unexpected %esp at startup");
436 }
437
438 for (i = 0; i < VG_N_THREADS; i++) {
439 vg_threads[i].stack_size = 0;
440 vg_threads[i].stack_base = (Addr)NULL;
441 }
442
443 for (i = 0; i < VG_N_WAITING_FDS; i++)
444 vg_waiting_fds[i].fd = -1; /* not in use */
445
446 for (i = 0; i < VG_N_MUTEXES; i++)
447 vg_mutexes[i].in_use = False;
448
449 /* Assert this is thread zero, which has certain magic
450 properties. */
451 tid_main = vg_alloc_ThreadState();
452 vg_assert(tid_main == 0);
453
454 vg_threads[tid_main].status = VgTs_Runnable;
455 vg_threads[tid_main].joiner = VG_INVALID_THREADID;
456 vg_threads[tid_main].retval = NULL; /* not important */
457
458 /* Copy VG_(baseBlock) state to tid_main's slot. */
459 VG_(save_thread_state) ( tid_main );
460}
461
462
463/* What if fd isn't a valid fd? */
464static
465void set_fd_nonblocking ( Int fd )
466{
467 Int res = VG_(fcntl)( fd, VKI_F_GETFL, 0 );
468 vg_assert(!VG_(is_kerror)(res));
469 res |= VKI_O_NONBLOCK;
470 res = VG_(fcntl)( fd, VKI_F_SETFL, res );
471 vg_assert(!VG_(is_kerror)(res));
472}
473
474static
475void set_fd_blocking ( Int fd )
476{
477 Int res = VG_(fcntl)( fd, VKI_F_GETFL, 0 );
478 vg_assert(!VG_(is_kerror)(res));
479 res &= ~VKI_O_NONBLOCK;
480 res = VG_(fcntl)( fd, VKI_F_SETFL, res );
481 vg_assert(!VG_(is_kerror)(res));
482}
483
484static
485Bool fd_is_blockful ( Int fd )
486{
487 Int res = VG_(fcntl)( fd, VKI_F_GETFL, 0 );
488 vg_assert(!VG_(is_kerror)(res));
489 return (res & VKI_O_NONBLOCK) ? False : True;
490}
491
492
493
494/* Do a purely thread-local request for tid, and put the result in its
495 %EDX, without changing its scheduling state in any way, nor that of
496   any other threads. Return True if the request was handled that way.
497
498 If the request is non-trivial, return False; a more capable but
499 slower mechanism will deal with it.
500*/
501static
502Bool maybe_do_trivial_clientreq ( ThreadId tid )
503{
504# define SIMPLE_RETURN(vvv) \
505 { vg_threads[tid].m_edx = (vvv); \
506 return True; \
507 }
508
509 UInt* arg = (UInt*)(vg_threads[tid].m_eax);
510 UInt req_no = arg[0];
511 switch (req_no) {
512 case VG_USERREQ__MALLOC:
513 SIMPLE_RETURN(
514 (UInt)VG_(client_malloc) ( arg[1], Vg_AllocMalloc )
515 );
516 case VG_USERREQ__BUILTIN_NEW:
517 SIMPLE_RETURN(
518 (UInt)VG_(client_malloc) ( arg[1], Vg_AllocNew )
519 );
520 case VG_USERREQ__BUILTIN_VEC_NEW:
521 SIMPLE_RETURN(
522 (UInt)VG_(client_malloc) ( arg[1], Vg_AllocNewVec )
523 );
524 case VG_USERREQ__FREE:
525 VG_(client_free) ( (void*)arg[1], Vg_AllocMalloc );
526 SIMPLE_RETURN(0); /* irrelevant */
527 case VG_USERREQ__BUILTIN_DELETE:
528 VG_(client_free) ( (void*)arg[1], Vg_AllocNew );
529 SIMPLE_RETURN(0); /* irrelevant */
530 case VG_USERREQ__BUILTIN_VEC_DELETE:
531 VG_(client_free) ( (void*)arg[1], Vg_AllocNewVec );
532 SIMPLE_RETURN(0); /* irrelevant */
533 case VG_USERREQ__CALLOC:
534 SIMPLE_RETURN(
535 (UInt)VG_(client_calloc) ( arg[1], arg[2] )
536 );
537 case VG_USERREQ__REALLOC:
538 SIMPLE_RETURN(
539 (UInt)VG_(client_realloc) ( (void*)arg[1], arg[2] )
540 );
541 case VG_USERREQ__MEMALIGN:
542 SIMPLE_RETURN(
543 (UInt)VG_(client_memalign) ( arg[1], arg[2] )
544 );
545 default:
546 /* Too hard; wimp out. */
547 return False;
548 }
549# undef SIMPLE_RETURN
550}
551
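/* A client-side sketch (illustrative only, not part of this file's
   control flow): a trivial request such as VG_USERREQ__MALLOC is issued
   with the VALGRIND_MAGIC_SEQUENCE macro, the same way
   do_pthread_create_bogusRA below issues its request:

      UInt res;
      VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */, VG_USERREQ__MALLOC,
                              nbytes, 0, 0, 0);

   The address of the argument vector lands in the client's %EAX, and
   the reply is picked up from %EDX, which is what SIMPLE_RETURN above
   writes.  'nbytes' is just a placeholder name for the requested size
   (arg[1]). */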
552
553static
554void sched_do_syscall ( ThreadId tid )
555{
556 UInt saved_eax;
557 UInt res, syscall_no;
558 UInt fd;
559 Bool might_block, assumed_nonblocking;
560 Bool orig_fd_blockness;
561 Char msg_buf[100];
562
563 vg_assert(tid >= 0 && tid < VG_N_THREADS);
564 vg_assert(vg_threads[tid].status == VgTs_Runnable);
565
566 syscall_no = vg_threads[tid].m_eax; /* syscall number */
567
568 if (syscall_no == __NR_nanosleep) {
569 ULong t_now, t_awaken;
570 struct vki_timespec* req;
571 req = (struct vki_timespec*)vg_threads[tid].m_ebx; /* arg1 */
572 t_now = VG_(read_microsecond_timer)();
573 t_awaken
574 = t_now
575 + (ULong)1000000ULL * (ULong)(req->tv_sec)
576 + (ULong)( (UInt)(req->tv_nsec) / 1000 );
577 vg_threads[tid].status = VgTs_Sleeping;
578 vg_threads[tid].awaken_at = t_awaken;
579      if (VG_(clo_trace_sched)) {
580         VG_(sprintf)(msg_buf, "at %lu: nanosleep for %lu",
581 t_now, t_awaken-t_now);
582 print_sched_event(tid, msg_buf);
583 }
584 /* Force the scheduler to run something else for a while. */
585 return;
586 }
587
588 switch (syscall_no) {
589 case __NR_read:
590 case __NR_write:
591 assumed_nonblocking
592 = False;
593 might_block
594 = fd_is_blockful(vg_threads[tid].m_ebx /* arg1 */);
595 break;
596 default:
597 might_block = False;
598 assumed_nonblocking = True;
599 }
600
601 if (assumed_nonblocking) {
602 /* We think it's non-blocking. Just do it in the normal way. */
603 VG_(perform_assumed_nonblocking_syscall)(tid);
604 /* The thread is still runnable. */
605 return;
606 }
607
608 /* It might block. Take evasive action. */
609 switch (syscall_no) {
610 case __NR_read:
611 case __NR_write:
612 fd = vg_threads[tid].m_ebx; break;
613 default:
614         vg_assert(3+3 == 7); /* unreachable: only read/write get here */
615 }
616
617 /* Set the fd to nonblocking, and do the syscall, which will return
618 immediately, in order to lodge a request with the Linux kernel.
619 We later poll for I/O completion using select(). */
620
621 orig_fd_blockness = fd_is_blockful(fd);
622 set_fd_nonblocking(fd);
623 vg_assert(!fd_is_blockful(fd));
624 VG_(check_known_blocking_syscall)(tid, syscall_no, NULL /* PRE */);
625
626 /* This trashes the thread's %eax; we have to preserve it. */
627 saved_eax = vg_threads[tid].m_eax;
628 KERNEL_DO_SYSCALL(tid,res);
629
630 /* Restore original blockfulness of the fd. */
631 if (orig_fd_blockness)
632 set_fd_blocking(fd);
633 else
634 set_fd_nonblocking(fd);
635
636 if (res != -VKI_EWOULDBLOCK) {
637 /* It didn't block; it went through immediately. So finish off
638 in the normal way. Don't restore %EAX, since that now
639 (correctly) holds the result of the call. */
640 VG_(check_known_blocking_syscall)(tid, syscall_no, &res /* POST */);
641 /* We're still runnable. */
642 vg_assert(vg_threads[tid].status == VgTs_Runnable);
643
644 } else {
645
646 /* It would have blocked. First, restore %EAX to what it was
647 before our speculative call. */
648 vg_threads[tid].m_eax = saved_eax;
649 /* Put this fd in a table of fds on which we are waiting for
650 completion. The arguments for select() later are constructed
651 from this table. */
652 add_waiting_fd(tid, fd, saved_eax /* which holds the syscall # */);
653 /* Deschedule thread until an I/O completion happens. */
654 vg_threads[tid].status = VgTs_WaitFD;
655      if (VG_(clo_trace_sched)) {
656         VG_(sprintf)(msg_buf,"block until I/O ready on fd %d", fd);
657 print_sched_event(tid, msg_buf);
658 }
659
660 }
661}
662
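/* In outline, the evasive action above is (sketch, not literal code):

      set_fd_nonblocking(fd);
      KERNEL_DO_SYSCALL(tid, res);             -- returns immediately
      restore the fd's original blockfulness;
      if (res != -VKI_EWOULDBLOCK)
         the call completed; deliver res and stay Runnable;
      else
         restore the saved %EAX, record (tid, fd, syscall_no) in
         vg_waiting_fds[], and park the thread in VgTs_WaitFD until
         poll_for_ready_fds() / complete_blocked_syscalls() below
         finish the job.
*/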
663
664/* Find out which of the fds in vg_waiting_fds are now ready to go, by
665 making enquiries with select(), and mark them as ready. We have to
666   wait for the requesting threads to fall into the WaitFD state
667 before we can actually finally deliver the results, so this
668 procedure doesn't do that; complete_blocked_syscalls() does it.
669
670 It might seem odd that a thread which has done a blocking syscall
671 is not in WaitFD state; the way this can happen is if it initially
672 becomes WaitFD, but then a signal is delivered to it, so it becomes
673 Runnable for a while. In this case we have to wait for the
674 sighandler to return, whereupon the WaitFD state is resumed, and
675 only at that point can the I/O result be delivered to it. However,
676 this point may be long after the fd is actually ready.
677
678 So, poll_for_ready_fds() merely detects fds which are ready.
679 complete_blocked_syscalls() does the second half of the trick,
680 possibly much later: it delivers the results from ready fds to
681 threads in WaitFD state.
682*/
683void poll_for_ready_fds ( void )
684{
685 vki_ksigset_t saved_procmask;
686 vki_fd_set readfds;
687 vki_fd_set writefds;
688 vki_fd_set exceptfds;
689 struct vki_timeval timeout;
690 Int fd, fd_max, i, n_ready, syscall_no, n_ok;
691 ThreadId tid;
692 Bool rd_ok, wr_ok, ex_ok;
693 Char msg_buf[100];
694
695 /* Awaken any sleeping threads whose sleep has expired. */
696 {
697 struct vki_timespec * rem;
698 ULong t_now = VG_(read_microsecond_timer)();
699 for (tid = 0; tid < VG_N_THREADS; tid++) {
700 if (vg_threads[tid].status != VgTs_Sleeping)
701 continue;
702 if (t_now >= vg_threads[tid].awaken_at) {
703 /* Resume this thread. Set to zero the remaining-time (second)
704 arg of nanosleep, since it's used up all its time. */
705 vg_assert(vg_threads[tid].m_eax == __NR_nanosleep);
706 rem = (struct vki_timespec *)vg_threads[tid].m_ecx; /* arg2 */
707 if (rem != NULL) {
708 rem->tv_sec = 0;
709 rem->tv_nsec = 0;
710 }
711 /* Make the syscall return 0 (success). */
712 vg_threads[tid].m_eax = 0;
713 /* Reschedule this thread. */
714 vg_threads[tid].status = VgTs_Runnable;
715            if (VG_(clo_trace_sched)) {
716               VG_(sprintf)(msg_buf, "at %lu: nanosleep done",
717 t_now);
718 print_sched_event(tid, msg_buf);
719 }
720 }
721 }
722 }
723
724 timeout.tv_sec = 0;
725 timeout.tv_usec = 0;
726
727 VKI_FD_ZERO(&readfds);
728 VKI_FD_ZERO(&writefds);
729 VKI_FD_ZERO(&exceptfds);
730 fd_max = -1;
731 for (i = 0; i < VG_N_WAITING_FDS; i++) {
732 if (vg_waiting_fds[i].fd == -1 /* not in use */)
733 continue;
734 if (vg_waiting_fds[i].ready /* already ready? */)
735 continue;
736 fd = vg_waiting_fds[i].fd;
737 /* VG_(printf)("adding QUERY for fd %d\n", fd); */
738 if (fd > fd_max)
739 fd_max = fd;
740 tid = vg_waiting_fds[i].tid;
741 vg_assert(tid >= 0 && tid < VG_N_THREADS);
742 syscall_no = vg_waiting_fds[i].syscall_no;
743 switch (syscall_no) {
744 case __NR_read:
745 VKI_FD_SET(fd, &readfds); break;
746 case __NR_write:
747 VKI_FD_SET(fd, &writefds); break;
748 default:
749 VG_(panic)("poll_for_ready_fds: unexpected syscall");
750 /*NOTREACHED*/
751 break;
752 }
753 }
754
755 /* BLOCK ALL SIGNALS. We don't want the complication of select()
756 getting interrupted. */
757 VG_(block_all_host_signals)( &saved_procmask );
758
759 n_ready = VG_(select)
760 ( fd_max+1, &readfds, &writefds, &exceptfds, &timeout);
761 if (VG_(is_kerror)(n_ready)) {
762 VG_(printf)("poll_for_ready_fds: select returned %d\n", n_ready);
763 VG_(panic)("poll_for_ready_fds: select failed?!");
764 /*NOTREACHED*/
765 }
766
767 /* UNBLOCK ALL SIGNALS */
768 VG_(restore_host_signals)( &saved_procmask );
769
770 /* VG_(printf)("poll_for_io_completions: %d fs ready\n", n_ready); */
771
772 if (n_ready == 0)
773 return;
774
775 /* Inspect all the fds we know about, and handle any completions that
776 have happened. */
777 /*
778 VG_(printf)("\n\n");
779 for (fd = 0; fd < 100; fd++)
780 if (VKI_FD_ISSET(fd, &writefds) || VKI_FD_ISSET(fd, &readfds)) {
781 VG_(printf)("X"); } else { VG_(printf)("."); };
782 VG_(printf)("\n\nfd_max = %d\n", fd_max);
783 */
784
785 for (fd = 0; fd <= fd_max; fd++) {
786 rd_ok = VKI_FD_ISSET(fd, &readfds);
787 wr_ok = VKI_FD_ISSET(fd, &writefds);
788 ex_ok = VKI_FD_ISSET(fd, &exceptfds);
789
790 n_ok = (rd_ok ? 1 : 0) + (wr_ok ? 1 : 0) + (ex_ok ? 1 : 0);
791 if (n_ok == 0)
792 continue;
793 if (n_ok > 1) {
794 VG_(printf)("offending fd = %d\n", fd);
795 VG_(panic)("poll_for_ready_fds: multiple events on fd");
796 }
797
798 /* An I/O event completed for fd. Find the thread which
799 requested this. */
800 for (i = 0; i < VG_N_WAITING_FDS; i++) {
801 if (vg_waiting_fds[i].fd == -1 /* not in use */)
802 continue;
803 if (vg_waiting_fds[i].fd == fd)
804 break;
805 }
806
807 /* And a bit more paranoia ... */
808 vg_assert(i >= 0 && i < VG_N_WAITING_FDS);
809
810 /* Mark the fd as ready. */
811 vg_assert(! vg_waiting_fds[i].ready);
812 vg_waiting_fds[i].ready = True;
813 }
814}
815
816
817/* See comment attached to poll_for_ready_fds() for explanation. */
818void complete_blocked_syscalls ( void )
819{
820 Int fd, i, res, syscall_no;
821 ThreadId tid;
822 Char msg_buf[100];
823
824 /* Inspect all the outstanding fds we know about. */
825
826 for (i = 0; i < VG_N_WAITING_FDS; i++) {
827 if (vg_waiting_fds[i].fd == -1 /* not in use */)
828 continue;
829 if (! vg_waiting_fds[i].ready)
830 continue;
831
832 fd = vg_waiting_fds[i].fd;
833 tid = vg_waiting_fds[i].tid;
834 vg_assert(tid >= 0 && tid < VG_N_THREADS);
835
836 /* The thread actually has to be waiting for the I/O event it
837 requested before we can deliver the result! */
838 if (vg_threads[tid].status != VgTs_WaitFD)
839 continue;
840
841 /* Ok, actually do it! We can safely use %EAX as the syscall
842 number, because the speculative call made by
843 sched_do_syscall() doesn't change %EAX in the case where the
844 call would have blocked. */
845
846 syscall_no = vg_waiting_fds[i].syscall_no;
847 vg_assert(syscall_no == vg_threads[tid].m_eax);
848 KERNEL_DO_SYSCALL(tid,res);
849 VG_(check_known_blocking_syscall)(tid, syscall_no, &res /* POST */);
850
851 /* Reschedule. */
852 vg_threads[tid].status = VgTs_Runnable;
853 /* Mark slot as no longer in use. */
854 vg_waiting_fds[i].fd = -1;
855 /* pp_sched_status(); */
856      if (VG_(clo_trace_sched)) {
857         VG_(sprintf)(msg_buf,"resume due to I/O completion on fd %d", fd);
858 print_sched_event(tid, msg_buf);
859 }
860 }
861}
862
863
864static
865void nanosleep_for_a_while ( void )
866{
867 Int res;
868 struct vki_timespec req;
869 struct vki_timespec rem;
870 req.tv_sec = 0;
871 req.tv_nsec = 20 * 1000 * 1000;
872 res = VG_(nanosleep)( &req, &rem );
873 /* VG_(printf)("after ns, unused = %d\n", rem.tv_nsec ); */
874 vg_assert(res == 0);
875}
876
877
878/* ---------------------------------------------------------------------
879 The scheduler proper.
880 ------------------------------------------------------------------ */
881
882/* Run user-space threads until either
883 * Deadlock occurs
884 * One thread asks to shutdown Valgrind
885 * The specified number of basic blocks has gone by.
886*/
887VgSchedReturnCode VG_(scheduler) ( void )
888{
889 ThreadId tid, tid_next;
890 UInt trc;
891 UInt dispatch_ctr_SAVED;
892 Int done_this_time, n_in_fdwait;
893 Char msg_buf[100];
894 Addr trans_addr;
895
896 /* For the LRU structures, records when the epoch began. */
897 ULong lru_epoch_started_at = 0;
898
899 /* Start with the root thread. tid in general indicates the
900 currently runnable/just-finished-running thread. */
901 tid = 0;
902
903 /* This is the top level scheduler loop. It falls into three
904 phases. */
905 while (True) {
906
907 /* ======================= Phase 1 of 3 =======================
908 Handle I/O completions and signals. This may change the
909 status of various threads. Then select a new thread to run,
910 or declare deadlock, or sleep if there are no runnable
911 threads but some are blocked on I/O. */
912
913 /* Age the LRU structures if an epoch has been completed. */
914 if (VG_(bbs_done) - lru_epoch_started_at >= VG_BBS_PER_EPOCH) {
915 lru_epoch_started_at = VG_(bbs_done);
916 increment_epoch();
917 }
918
919 /* Was a debug-stop requested? */
920 if (VG_(bbs_to_go) == 0)
921 goto debug_stop;
922
923 /* Do the following loop until a runnable thread is found, or
924 deadlock is detected. */
925 while (True) {
926
927 /* For stats purposes only. */
928 VG_(num_scheduling_events_MAJOR) ++;
929
930 /* See if any I/O operations which we were waiting for have
931 completed, and, if so, make runnable the relevant waiting
932 threads. */
933 poll_for_ready_fds();
934 complete_blocked_syscalls();
935
936 /* See if there are any signals which need to be delivered. If
937 so, choose thread(s) to deliver them to, and build signal
938 delivery frames on those thread(s) stacks. */
939 VG_(deliver_signals)( 0 /*HACK*/ );
940 VG_(do_sanity_checks)(0 /*HACK*/, False);
941
942 /* Try and find a thread (tid) to run. */
943 tid_next = tid;
944 n_in_fdwait = 0;
945 while (True) {
946 tid_next++;
947 if (tid_next >= VG_N_THREADS) tid_next = 0;
948 if (vg_threads[tid_next].status == VgTs_WaitFD)
949 n_in_fdwait ++;
950 if (vg_threads[tid_next].status == VgTs_Runnable)
951 break; /* We can run this one. */
952 if (tid_next == tid)
953 break; /* been all the way round */
954 }
955 tid = tid_next;
956
957 if (vg_threads[tid].status == VgTs_Runnable) {
958 /* Found a suitable candidate. Fall out of this loop, so
959 we can advance to stage 2 of the scheduler: actually
960 running the thread. */
961 break;
962 }
963
964 /* We didn't find a runnable thread. Now what? */
965 if (n_in_fdwait == 0) {
966         /* No runnable threads and none in fd-wait either.  Not
967 good. */
968 pp_sched_status();
969 return VgSrc_Deadlock;
970 }
971
972 /* At least one thread is in a fd-wait state. Delay for a
973 while, and go round again, in the hope that eventually a
974 thread becomes runnable. */
975 nanosleep_for_a_while();
976 // pp_sched_status();
977 // VG_(printf)(".\n");
978 }
979
980
981 /* ======================= Phase 2 of 3 =======================
982 Wahey! We've finally decided that thread tid is runnable, so
983         we now do that.  Run it for as much of a quantum as possible.
984 Trivial requests are handled and the thread continues. The
985 aim is not to do too many of Phase 1 since it is expensive. */
986
987 if (0)
988 VG_(printf)("SCHED: tid %d, used %d\n", tid, VG_N_THREADS);
989
990 /* Figure out how many bbs to ask vg_run_innerloop to do. Note
991 that it decrements the counter before testing it for zero, so
992 that if VG_(dispatch_ctr) is set to N you get at most N-1
993 iterations. Also this means that VG_(dispatch_ctr) must
994 exceed zero before entering the innerloop. Also also, the
995 decrement is done before the bb is actually run, so you
996 always get at least one decrement even if nothing happens.
997 */
998 if (VG_(bbs_to_go) >= VG_SCHEDULING_QUANTUM)
999 VG_(dispatch_ctr) = VG_SCHEDULING_QUANTUM + 1;
1000 else
1001 VG_(dispatch_ctr) = (UInt)VG_(bbs_to_go) + 1;
1002
1003 /* ... and remember what we asked for. */
1004 dispatch_ctr_SAVED = VG_(dispatch_ctr);
1005
1006 /* Actually run thread tid. */
1007 while (True) {
1008
1009 /* For stats purposes only. */
1010 VG_(num_scheduling_events_MINOR) ++;
1011
1012 if (0)
1013 VG_(message)(Vg_DebugMsg, "thread %d: running for %d bbs",
1014 tid, VG_(dispatch_ctr) - 1 );
1015
1016 trc = run_thread_for_a_while ( tid );
1017
1018 /* Deal quickly with trivial scheduling events, and resume the
1019 thread. */
1020
1021 if (trc == VG_TRC_INNER_FASTMISS) {
1022 vg_assert(VG_(dispatch_ctr) > 0);
1023
1024 /* Trivial event. Miss in the fast-cache. Do a full
1025 lookup for it. */
1026 trans_addr
1027 = VG_(search_transtab) ( vg_threads[tid].m_eip );
1028 if (trans_addr == (Addr)0) {
1029 /* Not found; we need to request a translation. */
1030 VG_(create_translation_for)( vg_threads[tid].m_eip );
1031 trans_addr = VG_(search_transtab) ( vg_threads[tid].m_eip );
1032 if (trans_addr == (Addr)0)
1033 VG_(panic)("VG_TRC_INNER_FASTMISS: missing tt_fast entry");
1034 }
1035 continue; /* with this thread */
1036 }
1037
1038 if (trc == VG_TRC_EBP_JMP_CLIENTREQ) {
1039 Bool is_triv = maybe_do_trivial_clientreq(tid);
1040 if (is_triv) {
1041 /* NOTE: a trivial request is something like a call to
1042 malloc() or free(). It DOES NOT change the
1043 Runnability of this thread nor the status of any
1044 other thread; it is purely thread-local. */
1045 continue; /* with this thread */
1046 }
1047 }
1048
1049 /* It's a non-trivial event. Give up running this thread and
1050 handle things the expensive way. */
1051 break;
1052 }
1053
1054 /* ======================= Phase 3 of 3 =======================
1055 Handle non-trivial thread requests, mostly pthread stuff. */
1056
1057 /* Ok, we've fallen out of the dispatcher for a
1058 non-completely-trivial reason. First, update basic-block
1059 counters. */
1060
1061 done_this_time = (Int)dispatch_ctr_SAVED - (Int)VG_(dispatch_ctr) - 1;
1062 vg_assert(done_this_time >= 0);
1063 VG_(bbs_to_go) -= (ULong)done_this_time;
1064 VG_(bbs_done) += (ULong)done_this_time;
1065
1066 if (0 && trc != VG_TRC_INNER_FASTMISS)
1067 VG_(message)(Vg_DebugMsg, "thread %d: completed %d bbs, trc %d",
1068 tid, done_this_time, (Int)trc );
1069
1070 if (0 && trc != VG_TRC_INNER_FASTMISS)
1071 VG_(message)(Vg_DebugMsg, "thread %d: %ld bbs, event %s",
1072 tid, VG_(bbs_done),
1073 name_of_sched_event(trc) );
1074
1075 /* Examine the thread's return code to figure out why it
1076 stopped, and handle requests. */
1077
1078 switch (trc) {
1079
1080 case VG_TRC_INNER_FASTMISS:
1081 VG_(panic)("VG_(scheduler): VG_TRC_INNER_FASTMISS");
1082 /*NOTREACHED*/
1083 break;
1084
1085 case VG_TRC_INNER_COUNTERZERO:
1086 /* Timeslice is out. Let a new thread be scheduled,
1087 simply by doing nothing, causing us to arrive back at
1088 Phase 1. */
1089 if (VG_(bbs_to_go) == 0) {
1090 goto debug_stop;
1091 }
1092 vg_assert(VG_(dispatch_ctr) == 0);
1093 break;
1094
1095 case VG_TRC_UNRESUMABLE_SIGNAL:
1096 /* It got a SIGSEGV/SIGBUS, which we need to deliver right
1097 away. Again, do nothing, so we wind up back at Phase
1098 1, whereupon the signal will be "delivered". */
1099 break;
1100
1101 case VG_TRC_EBP_JMP_SPECIAL: {
1102 Addr next_eip = vg_threads[tid].m_eip;
1103 if (next_eip == (Addr) & VG_(signalreturn_bogusRA)) {
1104 /* vthread tid is returning from a signal handler;
1105 modify its stack/regs accordingly. */
1106 VG_(signal_returns)(tid);
1107 }
1108 else
1109 if (next_eip == (Addr) & VG_(shutdown)) {
1110 return VgSrc_Shutdown;
1111 } else {
1112 VG_(panic)("vg_schedule: VG_TRC_EBP_JMP_SPECIAL");
1113 }
1114 break;
1115 }
1116
1117 case VG_TRC_EBP_JMP_SYSCALL:
1118 /* Do a syscall for the vthread tid. This could cause it
1119 to become non-runnable. */
1120 sched_do_syscall(tid);
1121 break;
1122
1123 case VG_TRC_EBP_JMP_CLIENTREQ:
1124 /* Do a client request for the vthread tid. Note that
1125 some requests will have been handled by
1126 maybe_do_trivial_clientreq(), so we don't expect to see
1127 those here.
1128 */
1129 if (0) {
1130 VG_(sprintf)(msg_buf, "request 0x%x",
1131 vg_threads[tid].m_eax);
1132 print_sched_event(tid, msg_buf);
1133 }
1134 /* Do a non-trivial client request for thread tid. tid's
1135 %EAX points to a short vector of argument words, the
1136 first of which is the request code. The result of the
1137 request is put in tid's %EDX. Alternatively, perhaps
1138 the request causes tid to become non-runnable and/or
1139 other blocked threads become runnable. In general we
1140 can and often do mess with the state of arbitrary
1141 threads at this point. */
1142 do_nontrivial_clientreq(tid);
1143 break;
1144
1145 default:
1146 VG_(printf)("\ntrc = %d\n", trc);
1147 VG_(panic)("VG_(scheduler), phase 3: "
1148 "unexpected thread return code");
1149 /* NOTREACHED */
1150 break;
1151
1152 } /* switch (trc) */
1153
1154 /* That completes Phase 3 of 3. Return now to the top of the
1155 main scheduler loop, to Phase 1 of 3. */
1156
1157 } /* top-level scheduler loop */
1158
1159
1160 /* NOTREACHED */
1161 VG_(panic)("scheduler: post-main-loop ?!");
1162 /* NOTREACHED */
1163
1164 debug_stop:
1165 /* If we exited because of a debug stop, print the translation
1166 of the last block executed -- by translating it again, and
1167 throwing away the result. */
1168 VG_(printf)(
1169 "======vvvvvvvv====== LAST TRANSLATION ======vvvvvvvv======\n");
1170 VG_(translate)( vg_threads[tid].m_eip, NULL, NULL, NULL );
1171 VG_(printf)("\n");
1172 VG_(printf)(
1173 "======^^^^^^^^====== LAST TRANSLATION ======^^^^^^^^======\n");
1174
1175 return VgSrc_BbsDone;
1176}
1177
1178
1179/* ---------------------------------------------------------------------
1180 The pthread implementation.
1181 ------------------------------------------------------------------ */
1182
1183#include <pthread.h>
1184#include <errno.h>
1185
1186#if !defined(PTHREAD_STACK_MIN)
1187# define PTHREAD_STACK_MIN (16384 - VG_AR_CLIENT_STACKBASE_REDZONE_SZB)
1188#endif
1189
1190/* /usr/include/bits/pthreadtypes.h:
1191 typedef unsigned long int pthread_t;
1192*/
1193
1194/* RUNS ON SIMD CPU!
1195 This is the return address that pthread_create uses.
1196*/
1197static
1198void do_pthread_create_bogusRA ( void )
1199{
1200 /* Tell the scheduler that this thread has returned. */
1201 Int res;
1202 VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
1203 VG_USERREQ__PTHREAD_CREATE_BOGUSRA,
1204 0, 0, 0, 0);
1205 VG_(panic)("do_pthread_create_bogusRA: shouldn't be still alive!");
1206}
1207
1208
1209static
1210void do_pthread_cancel ( ThreadId tid_canceller,
1211 pthread_t tid_cancellee )
1212{
1213 Char msg_buf[100];
1214   /* We want to make it appear that this thread has returned to
1215 do_pthread_create_bogusRA with PTHREAD_CANCELED as the
1216 return value. So: simple: put PTHREAD_CANCELED into %EAX
1217 and &do_pthread_create_bogusRA into %EIP and keep going! */
1218   if (VG_(clo_trace_sched)) {
1219      VG_(sprintf)(msg_buf, "cancelled by %d", tid_canceller);
1220 print_sched_event(tid_cancellee, msg_buf);
1221 }
1222 vg_threads[tid_cancellee].m_eax = (UInt)PTHREAD_CANCELED;
1223 vg_threads[tid_cancellee].m_eip = (UInt)&do_pthread_create_bogusRA;
1224 vg_threads[tid_cancellee].status = VgTs_Runnable;
1225}
1226
1227
1228
1229/* Thread tid is exiting, by returning from the function it was
1230 created with. The main complication here is to resume any thread
1231 waiting to join with this one. */
1232static
1233void do_pthread_create_exit_by_returning ( ThreadId tid )
1234{
1235 ThreadId jnr; /* joiner, the thread calling pthread_join. */
1236 UInt* jnr_args;
1237 void** jnr_thread_return;
1238 Char msg_buf[100];
1239
1240 /* Mark it as not in use. Leave the stack in place so the next
1241 user of this slot doesn't reallocate it. */
1242 vg_assert(tid >= 0 && tid < VG_N_THREADS);
1243 vg_assert(vg_threads[tid].status != VgTs_Empty);
1244
1245 vg_threads[tid].retval = (void*)vg_threads[tid].m_eax;
1246
1247 if (vg_threads[tid].joiner == VG_INVALID_THREADID) {
1248 /* No one has yet done a join on me */
1249 vg_threads[tid].status = VgTs_WaitJoiner;
1250      if (VG_(clo_trace_sched)) {
1251         VG_(sprintf)(msg_buf,
1252 "root fn returns, waiting for a call pthread_join(%d)",
1253 tid);
1254 print_sched_event(tid, msg_buf);
1255 }
1256 } else {
1257      /* Someone is waiting; make their join call return with success,
1258 putting my exit code in the place specified by the caller's
1259 thread_return param. This is all very horrible, since we
1260 need to consult the joiner's arg block -- pointed to by its
1261 %EAX -- in order to extract the 2nd param of its pthread_join
1262 call. TODO: free properly the slot (also below).
1263 */
1264 jnr = vg_threads[tid].joiner;
1265 vg_assert(jnr >= 0 && jnr < VG_N_THREADS);
1266 vg_assert(vg_threads[jnr].status == VgTs_WaitJoinee);
1267 jnr_args = (UInt*)vg_threads[jnr].m_eax;
1268 jnr_thread_return = (void**)(jnr_args[2]);
1269 if (jnr_thread_return != NULL)
1270 *jnr_thread_return = vg_threads[tid].retval;
1271 vg_threads[jnr].m_edx = 0; /* success */
1272 vg_threads[jnr].status = VgTs_Runnable;
1273 vg_threads[tid].status = VgTs_Empty; /* bye! */
1274      if (VG_(clo_trace_sched)) {
1275         VG_(sprintf)(msg_buf,
1276 "root fn returns, to find a waiting pthread_join(%d)", tid);
1277 print_sched_event(tid, msg_buf);
1278 VG_(sprintf)(msg_buf,
1279 "my pthread_join(%d) returned; resuming", tid);
1280 print_sched_event(jnr, msg_buf);
1281 }
1282 }
1283
1284 /* Return value is irrelevant; this thread will not get
1285 rescheduled. */
1286}
1287
1288
1289static
1290void do_pthread_join ( ThreadId tid, ThreadId jee, void** thread_return )
1291{
1292 Char msg_buf[100];
1293
1294 /* jee, the joinee, is the thread specified as an arg in thread
1295 tid's call to pthread_join. So tid is the join-er. */
1296 vg_assert(tid >= 0 && tid < VG_N_THREADS);
1297 vg_assert(vg_threads[tid].status == VgTs_Runnable);
1298
1299 if (jee == tid) {
1300 vg_threads[tid].m_edx = EDEADLK; /* libc constant, not a kernel one */
1301 vg_threads[tid].status = VgTs_Runnable;
1302 return;
1303 }
1304
1305 if (jee < 0
1306 || jee >= VG_N_THREADS
1307 || vg_threads[jee].status == VgTs_Empty) {
1308 /* Invalid thread to join to. */
1309 vg_threads[tid].m_edx = EINVAL;
1310 vg_threads[tid].status = VgTs_Runnable;
1311 return;
1312 }
1313
1314 if (vg_threads[jee].joiner != VG_INVALID_THREADID) {
1315 /* Someone already did join on this thread */
1316 vg_threads[tid].m_edx = EINVAL;
1317 vg_threads[tid].status = VgTs_Runnable;
1318 return;
1319 }
1320
1321 /* if (vg_threads[jee].detached) ... */
1322
1323 /* Perhaps the joinee has already finished? If so return
1324 immediately with its return code, and free up the slot. TODO:
1325 free it properly (also above). */
1326 if (vg_threads[jee].status == VgTs_WaitJoiner) {
1327 vg_assert(vg_threads[jee].joiner == VG_INVALID_THREADID);
1328 vg_threads[tid].m_edx = 0; /* success */
1329 if (thread_return != NULL)
1330 *thread_return = vg_threads[jee].retval;
1331 vg_threads[tid].status = VgTs_Runnable;
1332 vg_threads[jee].status = VgTs_Empty; /* bye! */
1333      if (VG_(clo_trace_sched)) {
1334         VG_(sprintf)(msg_buf,
1335 "someone called pthread_join() on me; bye!");
1336 print_sched_event(jee, msg_buf);
1337 VG_(sprintf)(msg_buf,
1338 "my pthread_join(%d) returned immediately",
1339 jee );
1340 print_sched_event(tid, msg_buf);
1341 }
1342 return;
1343 }
1344
1345 /* Ok, so we'll have to wait on jee. */
1346 vg_threads[jee].joiner = tid;
1347 vg_threads[tid].status = VgTs_WaitJoinee;
1348   if (VG_(clo_trace_sched)) {
1349      VG_(sprintf)(msg_buf,
1350 "blocking on call of pthread_join(%d)", jee );
1351 print_sched_event(tid, msg_buf);
1352 }
1353 /* So tid's join call does not return just now. */
1354}
1355
1356
1357static
1358void do_pthread_create ( ThreadId parent_tid,
1359 pthread_t* thread,
1360 pthread_attr_t* attr,
1361 void* (*start_routine)(void *),
1362 void* arg )
1363{
1364 Addr new_stack;
1365 UInt new_stk_szb;
1366 ThreadId tid;
1367 Char msg_buf[100];
1368
1369 /* Paranoia ... */
1370 vg_assert(sizeof(pthread_t) == sizeof(UInt));
1371
1372 vg_assert(vg_threads[parent_tid].status != VgTs_Empty);
1373
1374 tid = vg_alloc_ThreadState();
1375
1376 /* If we've created the main thread's tid, we're in deep trouble :) */
1377 vg_assert(tid != 0);
1378
1379 /* Copy the parent's CPU state into the child's, in a roundabout
1380 way (via baseBlock). */
1381 VG_(load_thread_state)(parent_tid);
1382 VG_(save_thread_state)(tid);
1383
1384 /* Consider allocating the child a stack, if the one it already has
1385 is inadequate. */
1386 new_stk_szb = PTHREAD_STACK_MIN;
1387
1388 if (new_stk_szb > vg_threads[tid].stack_size) {
1389 /* Again, for good measure :) We definitely don't want to be
1390 allocating a stack for the main thread. */
1391 vg_assert(tid != 0);
1392 /* for now, we don't handle the case of anything other than
1393 assigning it for the first time. */
1394 vg_assert(vg_threads[tid].stack_size == 0);
1395 vg_assert(vg_threads[tid].stack_base == (Addr)NULL);
1396 new_stack = (Addr)VG_(get_memory_from_mmap)( new_stk_szb );
1397 vg_threads[tid].stack_base = new_stack;
1398 vg_threads[tid].stack_size = new_stk_szb;
1399 vg_threads[tid].m_esp
1400 = new_stack + new_stk_szb
1401 - VG_AR_CLIENT_STACKBASE_REDZONE_SZB;
1402 }
1403 if (VG_(clo_instrument))
1404 VGM_(make_noaccess)( vg_threads[tid].m_esp,
1405 VG_AR_CLIENT_STACKBASE_REDZONE_SZB );
1406
1407 /* push arg */
1408 vg_threads[tid].m_esp -= 4;
1409 * (UInt*)(vg_threads[tid].m_esp) = (UInt)arg;
1410
1411 /* push (magical) return address */
1412 vg_threads[tid].m_esp -= 4;
1413 * (UInt*)(vg_threads[tid].m_esp) = (UInt)do_pthread_create_bogusRA;
1414
1415 if (VG_(clo_instrument))
1416 VGM_(make_readable)( vg_threads[tid].m_esp, 2 * 4 );
1417
1418 /* this is where we start */
1419 vg_threads[tid].m_eip = (UInt)start_routine;
1420
1421   if (VG_(clo_trace_sched)) {
1422      VG_(sprintf)(msg_buf,
1423 "new thread, created by %d", parent_tid );
1424 print_sched_event(tid, msg_buf);
1425 }
1426
1427 /* store the thread id in *thread. */
1428 // if (VG_(clo_instrument))
1429 // ***** CHECK *thread is writable
1430 *thread = (pthread_t)tid;
1431
1432 /* return zero */
1433 vg_threads[tid].joiner = VG_INVALID_THREADID;
1434 vg_threads[tid].status = VgTs_Runnable;
1435 vg_threads[tid].m_edx = 0; /* success */
1436}
1437
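/* For reference, a sketch of the child's initial state as set up by
   do_pthread_create above:

      m_esp + 4 :  arg                          (start_routine's parameter)
      m_esp + 0 :  &do_pthread_create_bogusRA   (fake return address)
      m_eip     :  start_routine

   so when the root function returns, it "returns" into
   do_pthread_create_bogusRA, which raises PTHREAD_CREATE_BOGUSRA and is
   handled by do_pthread_create_exit_by_returning() above. */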
1438
1439/* Horrible hacks to do with pthread_mutex_t: the real pthread_mutex_t
1440 is a struct with at least 5 words:
1441 typedef struct
1442 {
1443 int __m_reserved; -- Reserved for future use
1444 int __m_count; -- Depth of recursive locking
1445 _pthread_descr __m_owner; -- Owner thread (if recursive or errcheck)
1446 int __m_kind; -- Mutex kind: fast, recursive or errcheck
1447 struct _pthread_fastlock __m_lock; -- Underlying fast lock
1448 } pthread_mutex_t;
1449 Ours is just a single word, an index into vg_mutexes[].
1450 For now I'll park it in the __m_reserved field.
1451
1452 Uninitialised mutexes (PTHREAD_MUTEX_INITIALIZER) all have
1453 a zero __m_count field (see /usr/include/pthread.h). So I'll
1454 use zero to mean non-inited, and 1 to mean inited.
1455
1456 How convenient.
1457*/
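/* Worked example (illustrative only): a client mutex declared as

      pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;

   reaches do_pthread_mutex_lock() with m.__m_count == 0, so
   initialise_mutex() below runs first: it allocates a vg_mutexes[] slot,
   stores the slot's index in m.__m_reserved, and sets m.__m_count to 1
   to mark the mutex as initialised from then on. */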
1458
1459static
1460void initialise_mutex ( ThreadId tid, pthread_mutex_t *mutex )
1461{
1462   MutexId  mid;
1463   Char     msg_buf[100];
1464   /* vg_alloc_VgMutex aborts if we can't allocate a mutex, for
1465      whatever reason. */
1466   mid = vg_alloc_VgMutex();
1467 vg_mutexes[mid].in_use = True;
1468 vg_mutexes[mid].held = False;
1469 vg_mutexes[mid].owner = VG_INVALID_THREADID; /* irrelevant */
1470 mutex->__m_reserved = mid;
1471 mutex->__m_count = 1; /* initialised */
1472   if (VG_(clo_trace_pthread)) {
1473 VG_(sprintf)(msg_buf, "(initialise mutex) (%p) -> %d",
1474 mutex, mid );
1475 print_pthread_event(tid, msg_buf);
1476 }
1477}
1478
1479/* Allocate a new MutexId and write it into *mutex. Ideally take
1480 notice of the attributes in *mutexattr. */
1481static
1482void do_pthread_mutex_init ( ThreadId tid,
1483 pthread_mutex_t *mutex,
1484 const pthread_mutexattr_t *mutexattr)
1485{
1486   Char msg_buf[100];
1487   /* Paranoia ... */
1488   vg_assert(sizeof(pthread_mutex_t) >= sizeof(UInt));
1489
1490   initialise_mutex(tid, mutex);
1491
1492 if (VG_(clo_trace_pthread)) {
1493 VG_(sprintf)(msg_buf, "pthread_mutex_init (%p) -> %d",
1494 mutex, mutex->__m_reserved );
1495 print_pthread_event(tid, msg_buf);
1496 }
1497
1498   /*
1499 RETURN VALUE
1500 pthread_mutex_init always returns 0. The other mutex functions
1501 return 0 on success and a non-zero error code on error.
1502 */
1503 /* THIS THREAD returns with 0. */
1504 vg_threads[tid].m_edx = 0;
1505}
1506
1507
1508static
1509void do_pthread_mutex_lock( ThreadId tid, pthread_mutex_t *mutex )
1510{
1511 MutexId mid;
1512 Char msg_buf[100];
1513
1514   /* *mutex contains the MutexId, or one of the magic values
1515 PTHREAD_*MUTEX_INITIALIZER*, indicating we need to initialise it
1516      now.  See comment(s) above re use of __m_count to indicate
1517 initialisation status.
1518 */
1519
1520 /* POSIX doesn't mandate this, but for sanity ... */
1521 if (mutex == NULL) {
1522 vg_threads[tid].m_edx = EINVAL;
1523 return;
1524 }
1525
1526 if (mutex->__m_count == 0) {
1527      initialise_mutex(tid, mutex);
1528   }
1529
1530 mid = mutex->__m_reserved;
1531 if (mid < 0 || mid >= VG_N_MUTEXES || !vg_mutexes[mid].in_use) {
1532 vg_threads[tid].m_edx = EINVAL;
1533 return;
1534 }
1535
1536   if (VG_(clo_trace_pthread)) {
1537 VG_(sprintf)(msg_buf, "pthread_mutex_lock %d (%p)",
1538 mid, mutex );
1539 print_pthread_event(tid, msg_buf);
1540 }
1541
1542   /* Assert initialised. */
1543 vg_assert(mutex->__m_count == 1);
1544
1545 /* Assume tid valid. */
1546 vg_assert(vg_threads[tid].status == VgTs_Runnable);
1547
1548 if (vg_mutexes[mid].held) {
1549 if (vg_mutexes[mid].owner == tid) {
1550 vg_threads[tid].m_edx = EDEADLK;
1551 return;
1552 }
1553 /* Someone else has it; we have to wait. */
1554 vg_threads[tid].status = VgTs_WaitMX;
1555 vg_threads[tid].waited_on_mid = mid;
1556 /* No assignment to %EDX, since we're blocking. */
1557      if (VG_(clo_trace_pthread)) {
1558 VG_(sprintf)(msg_buf, "pthread_mutex_lock %d (%p): BLOCK",
1559 mid, mutex );
1560 print_pthread_event(tid, msg_buf);
1561      }
1562 } else {
1563 /* We get it! */
1564 vg_mutexes[mid].held = True;
1565 vg_mutexes[mid].owner = tid;
1566 /* return 0 (success). */
1567 vg_threads[tid].m_edx = 0;
1568 }
1569}
1570
1571
1572static
1573void do_pthread_mutex_unlock ( ThreadId tid,
1574 pthread_mutex_t *mutex )
1575{
1576 MutexId mid;
1577 Int i;
1578 Char msg_buf[100];
1579
1580   if (mutex == NULL
1581 || mutex->__m_count != 1) {
1582 vg_threads[tid].m_edx = EINVAL;
1583 return;
1584 }
1585
1586 mid = mutex->__m_reserved;
1587 if (mid < 0 || mid >= VG_N_MUTEXES || !vg_mutexes[mid].in_use) {
1588 vg_threads[tid].m_edx = EINVAL;
1589 return;
1590 }
1591
1592   if (VG_(clo_trace_pthread)) {
1593 VG_(sprintf)(msg_buf, "pthread_mutex_unlock %d (%p)",
1594 mid, mutex );
1595 print_pthread_event(tid, msg_buf);
1596 }
1597
1598   /* Assume tid valid */
1599 vg_assert(vg_threads[tid].status == VgTs_Runnable);
1600
1601 /* Barf if we don't currently hold the mutex. */
1602 if (!vg_mutexes[mid].held || vg_mutexes[mid].owner != tid) {
1603 vg_threads[tid].m_edx = EPERM;
1604 return;
1605 }
1606
1607 /* Find some arbitrary thread waiting on this mutex, and make it
1608 runnable. If none are waiting, mark the mutex as not held. */
1609 for (i = 0; i < VG_N_THREADS; i++) {
1610 if (vg_threads[i].status == VgTs_Empty)
1611 continue;
1612 if (vg_threads[i].status == VgTs_WaitMX
1613 && vg_threads[i].waited_on_mid == mid)
1614 break;
1615 }
1616
1617 vg_assert(i <= VG_N_THREADS);
1618 if (i == VG_N_THREADS) {
1619 /* Nobody else is waiting on it. */
1620 vg_mutexes[mid].held = False;
1621 } else {
1622 /* Notionally transfer the hold to thread i, whose
1623 pthread_mutex_lock() call now returns with 0 (success). */
1624 vg_mutexes[mid].owner = i;
1625 vg_threads[i].status = VgTs_Runnable;
1626 vg_threads[i].m_edx = 0; /* pth_lock() success */
1627
1628 if (VG_(clo_trace_pthread)) {
1629 VG_(sprintf)(msg_buf, "pthread_mutex_lock %d: RESUME",
1630 mid );
1631 print_pthread_event(tid, msg_buf);
1632      }
1633 }
1634
1635 /* In either case, our (tid's) pth_unlock() returns with 0
1636 (success). */
1637 vg_threads[tid].m_edx = 0; /* Success. */
1638}
1639
1640
1641static void do_pthread_mutex_destroy ( ThreadId tid,
1642 pthread_mutex_t *mutex )
1643{
1644   MutexId  mid;
1645   Char     msg_buf[100];
1646
1647 if (mutex == NULL
1648 || mutex->__m_count != 1) {
1649 vg_threads[tid].m_edx = EINVAL;
1650 return;
1651 }
1652
1653 mid = mutex->__m_reserved;
1654 if (mid < 0 || mid >= VG_N_MUTEXES || !vg_mutexes[mid].in_use) {
1655 vg_threads[tid].m_edx = EINVAL;
1656 return;
1657 }
1658
1659   if (VG_(clo_trace_pthread)) {
1660 VG_(sprintf)(msg_buf, "pthread_mutex_destroy %d (%p)",
1661 mid, mutex );
1662 print_pthread_event(tid, msg_buf);
1663 }
1664
1665   /* Assume tid valid */
1666 vg_assert(vg_threads[tid].status == VgTs_Runnable);
1667
1668 /* Barf if the mutex is currently held. */
1669 if (vg_mutexes[mid].held) {
1670 vg_threads[tid].m_edx = EBUSY;
1671 return;
1672 }
1673
1674 mutex->__m_count = 0; /* uninitialised */
1675 vg_mutexes[mid].in_use = False;
1676 vg_threads[tid].m_edx = 0;
1677}
1678
1679
1680/* ---------------------------------------------------------------------
1681 Handle non-trivial client requests.
1682 ------------------------------------------------------------------ */
1683
1684static
1685void do_nontrivial_clientreq ( ThreadId tid )
1686{
1687 UInt* arg = (UInt*)(vg_threads[tid].m_eax);
1688 UInt req_no = arg[0];
1689 switch (req_no) {
1690
1691 case VG_USERREQ__PTHREAD_CREATE:
1692 do_pthread_create( tid,
1693 (pthread_t*)arg[1],
1694 (pthread_attr_t*)arg[2],
1695 (void*(*)(void*))arg[3],
1696 (void*)arg[4] );
1697 break;
1698
1699 case VG_USERREQ__PTHREAD_CREATE_BOGUSRA:
1700 do_pthread_create_exit_by_returning( tid );
1701 break;
1702
1703 case VG_USERREQ__PTHREAD_JOIN:
1704 do_pthread_join( tid, arg[1], (void**)(arg[2]) );
1705 break;
1706
1707 /* Sigh ... this probably will cause huge numbers of major
1708 (expensive) scheduling events, for no real reason.
1709 Perhaps should be classified as a trivial-request. */
1710 case VG_USERREQ__PTHREAD_GET_THREADID:
1711 vg_threads[tid].m_edx = tid;
1712 break;
1713
1714 case VG_USERREQ__PTHREAD_MUTEX_INIT:
1715 do_pthread_mutex_init( tid,
1716 (pthread_mutex_t *)(arg[1]),
1717 (pthread_mutexattr_t *)(arg[2]) );
1718 break;
1719
1720 case VG_USERREQ__PTHREAD_MUTEX_LOCK:
1721 do_pthread_mutex_lock( tid, (pthread_mutex_t *)(arg[1]) );
1722 break;
1723
1724 case VG_USERREQ__PTHREAD_MUTEX_UNLOCK:
1725 do_pthread_mutex_unlock( tid, (pthread_mutex_t *)(arg[1]) );
1726 break;
1727
1728 case VG_USERREQ__PTHREAD_MUTEX_DESTROY:
1729 do_pthread_mutex_destroy( tid, (pthread_mutex_t *)(arg[1]) );
1730 break;
1731
1732 case VG_USERREQ__PTHREAD_CANCEL:
1733 do_pthread_cancel( tid, (pthread_t)(arg[1]) );
1734 break;
1735
1736 case VG_USERREQ__MAKE_NOACCESS:
1737 case VG_USERREQ__MAKE_WRITABLE:
1738 case VG_USERREQ__MAKE_READABLE:
1739 case VG_USERREQ__DISCARD:
1740 case VG_USERREQ__CHECK_WRITABLE:
1741 case VG_USERREQ__CHECK_READABLE:
1742 case VG_USERREQ__MAKE_NOACCESS_STACK:
1743 case VG_USERREQ__RUNNING_ON_VALGRIND:
1744 case VG_USERREQ__DO_LEAK_CHECK:
1745 vg_threads[tid].m_edx = VG_(handle_client_request) ( arg );
1746 break;
1747
1748 default:
1749 VG_(printf)("panic'd on private request = 0x%x\n", arg[0] );
1750 VG_(panic)("handle_private_client_pthread_request: "
1751 "unknown request");
1752 /*NOTREACHED*/
1753 break;
1754 }
1755}
1756
1757
1758/*--------------------------------------------------------------------*/
1759/*--- end vg_scheduler.c ---*/
1760/*--------------------------------------------------------------------*/