2/*--------------------------------------------------------------------*/
3/*--- A user-space pthreads implementation. vg_scheduler.c ---*/
4/*--------------------------------------------------------------------*/
5
6/*
7 This file is part of Valgrind, an x86 protected-mode emulator
8 designed for debugging and profiling binaries on x86-Unixes.
9
10 Copyright (C) 2000-2002 Julian Seward
11 jseward@acm.org
12 Julian_Seward@muraroa.demon.co.uk
13
14 This program is free software; you can redistribute it and/or
15 modify it under the terms of the GNU General Public License as
16 published by the Free Software Foundation; either version 2 of the
17 License, or (at your option) any later version.
18
19 This program is distributed in the hope that it will be useful, but
20 WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 General Public License for more details.
23
24 You should have received a copy of the GNU General Public License
25 along with this program; if not, write to the Free Software
26 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
27 02111-1307, USA.
28
29 The GNU General Public License is contained in the file LICENSE.
30*/
31
32#include "vg_include.h"
33#include "vg_constants.h"
34
35#include "valgrind.h" /* for VG_USERREQ__MAKE_NOACCESS and
36 VG_USERREQ__DO_LEAK_CHECK */
37
38/* BORKAGE as of 11 Apr 02
39
40Note! This implementation is so poor as to not be suitable for use by
41anyone at all!
42
43- properly save scheduler private state in signal delivery frames.
44
45- fd-poll optimisation (don't select with empty sets)
46
47- signals interrupting read/write and nanosleep, and take notice
48 of SA_RESTART or not
49
50- return bogus RA: %EAX trashed, so pthread_joiner gets nonsense
51 exit codes
52
53- when a thread is done mark its stack as noaccess */
54
55
56/* ---------------------------------------------------------------------
57 Types and globals for the scheduler.
58 ------------------------------------------------------------------ */
59
60/* type ThreadId is defined in vg_include.h. */
61
62/* struct ThreadState is defined in vg_include.h. */
63
64/* Private globals. A statically allocated array of threads. */
65static ThreadState vg_threads[VG_N_THREADS];
66
67
68/* vg_oursignalhandler() might longjmp(). Here's the jmp_buf. */
69jmp_buf VG_(scheduler_jmpbuf);
70/* ... and if so, here's the signal which caused it to do so. */
71Int VG_(longjmpd_on_signal);
72
73
74/* Machinery to keep track of which threads are waiting on which
75 fds. */
76typedef
77 struct {
78 /* The thread which made the request. */
79 ThreadId tid;
80
81 /* The next two fields describe the request. */
82 /* File descriptor waited for. -1 means this slot is not in use */
83 Int fd;
84 /* The syscall number the fd is used in. */
85 Int syscall_no;
86
87 /* False => still waiting for select to tell us the fd is ready
88 to go. True => the fd is ready, but the results have not yet
89 been delivered back to the calling thread. Once the latter
90 happens, this entire record is marked as no longer in use, by
91 making the fd field be -1. */
92 Bool ready;
93 }
94 VgWaitedOnFd;
95
96static VgWaitedOnFd vg_waiting_fds[VG_N_WAITING_FDS];
97
98
99
100typedef
101 struct {
102 /* Is this slot in use, or free? */
103 Bool in_use;
104 /* If in_use, is this mutex held by some thread, or not? */
105 Bool held;
106 /* if held==True, owner indicates who by. */
107 ThreadId owner;
108 }
109 VgMutex;
110
111static VgMutex vg_mutexes[VG_N_MUTEXES];
112
113/* Forwards */
114static void do_nontrivial_clientreq ( ThreadId tid );
115
116
117/* ---------------------------------------------------------------------
118 Helper functions for the scheduler.
119 ------------------------------------------------------------------ */
120
121static
122void pp_sched_status ( void )
123{
124 Int i;
125 VG_(printf)("\nsched status:\n");
126 for (i = 0; i < VG_N_THREADS; i++) {
127 if (vg_threads[i].status == VgTs_Empty) continue;
128 VG_(printf)("tid %d: ", i);
129 switch (vg_threads[i].status) {
130 case VgTs_Runnable: VG_(printf)("Runnable\n"); break;
131 case VgTs_WaitFD: VG_(printf)("WaitFD\n"); break;
132 case VgTs_WaitJoiner: VG_(printf)("WaitJoiner(%d)\n",
133 vg_threads[i].joiner); break;
134 case VgTs_WaitJoinee: VG_(printf)("WaitJoinee\n"); break;
135            default: VG_(printf)("???\n"); break;
136 }
137 }
138 VG_(printf)("\n");
139}
140
141static
142void add_waiting_fd ( ThreadId tid, Int fd, Int syscall_no )
143{
144 Int i;
145
146 vg_assert(fd != -1); /* avoid total chaos */
147
148 for (i = 0; i < VG_N_WAITING_FDS; i++)
149 if (vg_waiting_fds[i].fd == -1)
150 break;
151
152 if (i == VG_N_WAITING_FDS)
153 VG_(panic)("add_waiting_fd: VG_N_WAITING_FDS is too low");
154 /*
155 VG_(printf)("add_waiting_fd: add (tid %d, fd %d) at slot %d\n",
156 tid, fd, i);
157 */
158 vg_waiting_fds[i].fd = fd;
159 vg_waiting_fds[i].tid = tid;
160 vg_waiting_fds[i].ready = False;
161 vg_waiting_fds[i].syscall_no = syscall_no;
162}
163
164
165
166static
167void print_sched_event ( ThreadId tid, Char* what )
168{
169 if (1)
170 VG_(message)(Vg_DebugMsg, "SCHED[%d]: %s", tid, what );
171}
172
173
174static
175Char* name_of_sched_event ( UInt event )
176{
177 switch (event) {
178 case VG_TRC_EBP_JMP_SPECIAL: return "JMP_SPECIAL";
179 case VG_TRC_EBP_JMP_SYSCALL: return "SYSCALL";
180 case VG_TRC_EBP_JMP_CLIENTREQ: return "CLIENTREQ";
181 case VG_TRC_INNER_COUNTERZERO: return "COUNTERZERO";
182 case VG_TRC_INNER_FASTMISS: return "FASTMISS";
183 case VG_TRC_UNRESUMABLE_SIGNAL: return "FATALSIGNAL";
184 default: return "??UNKNOWN??";
185 }
186}
187
188
189/* Create a translation of the client basic block beginning at
190 orig_addr, and add it to the translation cache & translation table.
191 This probably doesn't really belong here, but, hey ...
192*/
193void VG_(create_translation_for) ( Addr orig_addr )
194{
195 Addr trans_addr;
196 TTEntry tte;
197 Int orig_size, trans_size;
198 /* Ensure there is space to hold a translation. */
199 VG_(maybe_do_lru_pass)();
200 VG_(translate)( orig_addr, &orig_size, &trans_addr, &trans_size );
201 /* Copy data at trans_addr into the translation cache.
202 Returned pointer is to the code, not to the 4-byte
203 header. */
204 /* Since the .orig_size and .trans_size fields are
205 UShort, be paranoid. */
206 vg_assert(orig_size > 0 && orig_size < 65536);
207 vg_assert(trans_size > 0 && trans_size < 65536);
208 tte.orig_size = orig_size;
209 tte.orig_addr = orig_addr;
210 tte.trans_size = trans_size;
211 tte.trans_addr = VG_(copy_to_transcache)
212 ( trans_addr, trans_size );
213 tte.mru_epoch = VG_(current_epoch);
214 /* Free the intermediary -- was allocated by VG_(emit_code). */
215 VG_(jitfree)( (void*)trans_addr );
216 /* Add to trans tab and set back pointer. */
217 VG_(add_to_trans_tab) ( &tte );
218 /* Update stats. */
219 VG_(this_epoch_in_count) ++;
220 VG_(this_epoch_in_osize) += orig_size;
221 VG_(this_epoch_in_tsize) += trans_size;
222 VG_(overall_in_count) ++;
223 VG_(overall_in_osize) += orig_size;
224 VG_(overall_in_tsize) += trans_size;
225 /* Record translated area for SMC detection. */
226 VG_(smc_mark_original) ( orig_addr, orig_size );
227}
228
229
230/* Allocate a completely empty ThreadState record. */
231static
232ThreadId vg_alloc_ThreadState ( void )
233{
234 Int i;
235 for (i = 0; i < VG_N_THREADS; i++) {
236 if (vg_threads[i].status == VgTs_Empty)
237 return i;
238 }
239 VG_(printf)("vg_alloc_ThreadState: no free slots available\n");
240 VG_(printf)("Increase VG_N_THREADS, rebuild and try again.\n");
241 VG_(panic)("VG_N_THREADS is too low");
242 /*NOTREACHED*/
243}
244
245
246ThreadState* VG_(get_thread_state) ( ThreadId tid )
247{
248 vg_assert(tid >= 0 && tid < VG_N_THREADS);
249 vg_assert(vg_threads[tid].status != VgTs_Empty);
250 return & vg_threads[tid];
251}
252
253
254/* Find an unused VgMutex record. */
255static
256MutexId vg_alloc_VgMutex ( void )
257{
258 Int i;
259 for (i = 0; i < VG_N_MUTEXES; i++) {
260 if (!vg_mutexes[i].in_use)
261 return i;
262 }
263 VG_(printf)("vg_alloc_VgMutex: no free slots available\n");
264 VG_(printf)("Increase VG_N_MUTEXES, rebuild and try again.\n");
265 VG_(panic)("VG_N_MUTEXES is too low");
266 /*NOTREACHED*/
267}
268
269
270/* Copy the saved state of a thread into VG_(baseBlock), ready for it
271 to be run. */
272__inline__
273void VG_(load_thread_state) ( ThreadId tid )
274{
275 Int i;
276 VG_(baseBlock)[VGOFF_(m_eax)] = vg_threads[tid].m_eax;
277 VG_(baseBlock)[VGOFF_(m_ebx)] = vg_threads[tid].m_ebx;
278 VG_(baseBlock)[VGOFF_(m_ecx)] = vg_threads[tid].m_ecx;
279 VG_(baseBlock)[VGOFF_(m_edx)] = vg_threads[tid].m_edx;
280 VG_(baseBlock)[VGOFF_(m_esi)] = vg_threads[tid].m_esi;
281 VG_(baseBlock)[VGOFF_(m_edi)] = vg_threads[tid].m_edi;
282 VG_(baseBlock)[VGOFF_(m_ebp)] = vg_threads[tid].m_ebp;
283 VG_(baseBlock)[VGOFF_(m_esp)] = vg_threads[tid].m_esp;
284 VG_(baseBlock)[VGOFF_(m_eflags)] = vg_threads[tid].m_eflags;
285 VG_(baseBlock)[VGOFF_(m_eip)] = vg_threads[tid].m_eip;
286
287 for (i = 0; i < VG_SIZE_OF_FPUSTATE_W; i++)
288 VG_(baseBlock)[VGOFF_(m_fpustate) + i] = vg_threads[tid].m_fpu[i];
289
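   /* The sh_* fields are the shadow registers maintained by the
      instrumentation (the validity state tracked alongside the real
      CPU registers); they are switched in and out per thread in
      exactly the same way as the real state. */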
290 VG_(baseBlock)[VGOFF_(sh_eax)] = vg_threads[tid].sh_eax;
291 VG_(baseBlock)[VGOFF_(sh_ebx)] = vg_threads[tid].sh_ebx;
292 VG_(baseBlock)[VGOFF_(sh_ecx)] = vg_threads[tid].sh_ecx;
293 VG_(baseBlock)[VGOFF_(sh_edx)] = vg_threads[tid].sh_edx;
294 VG_(baseBlock)[VGOFF_(sh_esi)] = vg_threads[tid].sh_esi;
295 VG_(baseBlock)[VGOFF_(sh_edi)] = vg_threads[tid].sh_edi;
296 VG_(baseBlock)[VGOFF_(sh_ebp)] = vg_threads[tid].sh_ebp;
297 VG_(baseBlock)[VGOFF_(sh_esp)] = vg_threads[tid].sh_esp;
298 VG_(baseBlock)[VGOFF_(sh_eflags)] = vg_threads[tid].sh_eflags;
299}
300
301
302/* Copy the state of a thread from VG_(baseBlock), presumably after it
303 has been descheduled. For sanity-check purposes, fill the vacated
304 VG_(baseBlock) with garbage so as to make the system more likely to
305 fail quickly if we erroneously continue to poke around inside
306 VG_(baseBlock) without first doing a load_thread_state().
307*/
308__inline__
309void VG_(save_thread_state) ( ThreadId tid )
310{
311 Int i;
312 const UInt junk = 0xDEADBEEF;
313
314 vg_threads[tid].m_eax = VG_(baseBlock)[VGOFF_(m_eax)];
315 vg_threads[tid].m_ebx = VG_(baseBlock)[VGOFF_(m_ebx)];
316 vg_threads[tid].m_ecx = VG_(baseBlock)[VGOFF_(m_ecx)];
317 vg_threads[tid].m_edx = VG_(baseBlock)[VGOFF_(m_edx)];
318 vg_threads[tid].m_esi = VG_(baseBlock)[VGOFF_(m_esi)];
319 vg_threads[tid].m_edi = VG_(baseBlock)[VGOFF_(m_edi)];
320 vg_threads[tid].m_ebp = VG_(baseBlock)[VGOFF_(m_ebp)];
321 vg_threads[tid].m_esp = VG_(baseBlock)[VGOFF_(m_esp)];
322 vg_threads[tid].m_eflags = VG_(baseBlock)[VGOFF_(m_eflags)];
323 vg_threads[tid].m_eip = VG_(baseBlock)[VGOFF_(m_eip)];
324
325 for (i = 0; i < VG_SIZE_OF_FPUSTATE_W; i++)
326 vg_threads[tid].m_fpu[i] = VG_(baseBlock)[VGOFF_(m_fpustate) + i];
327
328 vg_threads[tid].sh_eax = VG_(baseBlock)[VGOFF_(sh_eax)];
329 vg_threads[tid].sh_ebx = VG_(baseBlock)[VGOFF_(sh_ebx)];
330 vg_threads[tid].sh_ecx = VG_(baseBlock)[VGOFF_(sh_ecx)];
331 vg_threads[tid].sh_edx = VG_(baseBlock)[VGOFF_(sh_edx)];
332 vg_threads[tid].sh_esi = VG_(baseBlock)[VGOFF_(sh_esi)];
333 vg_threads[tid].sh_edi = VG_(baseBlock)[VGOFF_(sh_edi)];
334 vg_threads[tid].sh_ebp = VG_(baseBlock)[VGOFF_(sh_ebp)];
335 vg_threads[tid].sh_esp = VG_(baseBlock)[VGOFF_(sh_esp)];
336 vg_threads[tid].sh_eflags = VG_(baseBlock)[VGOFF_(sh_eflags)];
337
338 /* Fill it up with junk. */
339 VG_(baseBlock)[VGOFF_(m_eax)] = junk;
340 VG_(baseBlock)[VGOFF_(m_ebx)] = junk;
341 VG_(baseBlock)[VGOFF_(m_ecx)] = junk;
342 VG_(baseBlock)[VGOFF_(m_edx)] = junk;
343 VG_(baseBlock)[VGOFF_(m_esi)] = junk;
344 VG_(baseBlock)[VGOFF_(m_edi)] = junk;
345 VG_(baseBlock)[VGOFF_(m_ebp)] = junk;
346 VG_(baseBlock)[VGOFF_(m_esp)] = junk;
347 VG_(baseBlock)[VGOFF_(m_eflags)] = junk;
348 VG_(baseBlock)[VGOFF_(m_eip)] = junk;
349
350 for (i = 0; i < VG_SIZE_OF_FPUSTATE_W; i++)
351 VG_(baseBlock)[VGOFF_(m_fpustate) + i] = junk;
352}
353
354
355/* Run the thread tid for a while, and return a VG_TRC_* value to the
356 scheduler indicating what happened. */
357static
358UInt run_thread_for_a_while ( ThreadId tid )
359{
360 UInt trc = 0;
361 vg_assert(tid >= 0 && tid < VG_N_THREADS);
362 vg_assert(vg_threads[tid].status != VgTs_Empty);
363 vg_assert(VG_(bbs_to_go) > 0);
364
365 VG_(load_thread_state) ( tid );
366 if (__builtin_setjmp(VG_(scheduler_jmpbuf)) == 0) {
367 /* try this ... */
368 trc = VG_(run_innerloop)();
369 /* We get here if the client didn't take a fault. */
370 } else {
371 /* We get here if the client took a fault, which caused our
372 signal handler to longjmp. */
373 vg_assert(trc == 0);
374 trc = VG_TRC_UNRESUMABLE_SIGNAL;
375 }
376 VG_(save_thread_state) ( tid );
377 return trc;
378}
379
380
381/* Increment the LRU epoch counter. */
382static
383void increment_epoch ( void )
384{
385 VG_(current_epoch)++;
386 if (VG_(clo_verbosity) > 2) {
387 UInt tt_used, tc_used;
388 VG_(get_tt_tc_used) ( &tt_used, &tc_used );
389 VG_(message)(Vg_UserMsg,
390 "%lu bbs, in: %d (%d -> %d), out %d (%d -> %d), TT %d, TC %d",
391 VG_(bbs_done),
392 VG_(this_epoch_in_count),
393 VG_(this_epoch_in_osize),
394 VG_(this_epoch_in_tsize),
395 VG_(this_epoch_out_count),
396 VG_(this_epoch_out_osize),
397 VG_(this_epoch_out_tsize),
398 tt_used, tc_used
399 );
400 }
401 VG_(this_epoch_in_count) = 0;
402 VG_(this_epoch_in_osize) = 0;
403 VG_(this_epoch_in_tsize) = 0;
404 VG_(this_epoch_out_count) = 0;
405 VG_(this_epoch_out_osize) = 0;
406 VG_(this_epoch_out_tsize) = 0;
407}
408
409
410/* Initialise the scheduler. Create a single "main" thread ready to
411 run, with special ThreadId of zero. This is called at startup; the
412   caller takes care that the client's state is parked in
413 VG_(baseBlock).
414*/
415void VG_(scheduler_init) ( void )
416{
417 Int i;
418 Addr startup_esp;
419 ThreadId tid_main;
420
421 startup_esp = VG_(baseBlock)[VGOFF_(m_esp)];
422 if ((startup_esp & VG_STARTUP_STACK_MASK) != VG_STARTUP_STACK_MASK) {
423      VG_(printf)("%%esp at startup = %p is not near %p; aborting\n",
424 startup_esp, VG_STARTUP_STACK_MASK);
425 VG_(panic)("unexpected %esp at startup");
426 }
427
428 for (i = 0; i < VG_N_THREADS; i++) {
429 vg_threads[i].stack_size = 0;
430 vg_threads[i].stack_base = (Addr)NULL;
431 }
432
433 for (i = 0; i < VG_N_WAITING_FDS; i++)
434 vg_waiting_fds[i].fd = -1; /* not in use */
435
436 for (i = 0; i < VG_N_MUTEXES; i++)
437 vg_mutexes[i].in_use = False;
438
439 /* Assert this is thread zero, which has certain magic
440 properties. */
441 tid_main = vg_alloc_ThreadState();
442 vg_assert(tid_main == 0);
443
444 vg_threads[tid_main].status = VgTs_Runnable;
445 vg_threads[tid_main].joiner = VG_INVALID_THREADID;
446 vg_threads[tid_main].retval = NULL; /* not important */
447
448 /* Copy VG_(baseBlock) state to tid_main's slot. */
449 VG_(save_thread_state) ( tid_main );
450}
451
452
453/* What if fd isn't a valid fd? */
454static
455void set_fd_nonblocking ( Int fd )
456{
457 Int res = VG_(fcntl)( fd, VKI_F_GETFL, 0 );
458 vg_assert(!VG_(is_kerror)(res));
459 res |= VKI_O_NONBLOCK;
460 res = VG_(fcntl)( fd, VKI_F_SETFL, res );
461 vg_assert(!VG_(is_kerror)(res));
462}
463
464static
465void set_fd_blocking ( Int fd )
466{
467 Int res = VG_(fcntl)( fd, VKI_F_GETFL, 0 );
468 vg_assert(!VG_(is_kerror)(res));
469 res &= ~VKI_O_NONBLOCK;
470 res = VG_(fcntl)( fd, VKI_F_SETFL, res );
471 vg_assert(!VG_(is_kerror)(res));
472}
473
474static
475Bool fd_is_blockful ( Int fd )
476{
477 Int res = VG_(fcntl)( fd, VKI_F_GETFL, 0 );
478 vg_assert(!VG_(is_kerror)(res));
479 return (res & VKI_O_NONBLOCK) ? False : True;
480}
481
482
483
484/* Do a purely thread-local request for tid, and put the result in its
485 %EDX, without changing its scheduling state in any way, nor that of
486   any other threads.  Return True if the request could be handled that way.
487
488 If the request is non-trivial, return False; a more capable but
489 slower mechanism will deal with it.
490*/
491static
492Bool maybe_do_trivial_clientreq ( ThreadId tid )
493{
494# define SIMPLE_RETURN(vvv) \
495 { vg_threads[tid].m_edx = (vvv); \
496 return True; \
497 }
498
499 UInt* arg = (UInt*)(vg_threads[tid].m_eax);
500 UInt req_no = arg[0];
501 switch (req_no) {
502 case VG_USERREQ__MALLOC:
503 SIMPLE_RETURN(
504 (UInt)VG_(client_malloc) ( arg[1], Vg_AllocMalloc )
505 );
506 case VG_USERREQ__BUILTIN_NEW:
507 SIMPLE_RETURN(
508 (UInt)VG_(client_malloc) ( arg[1], Vg_AllocNew )
509 );
510 case VG_USERREQ__BUILTIN_VEC_NEW:
511 SIMPLE_RETURN(
512 (UInt)VG_(client_malloc) ( arg[1], Vg_AllocNewVec )
513 );
514 case VG_USERREQ__FREE:
515 VG_(client_free) ( (void*)arg[1], Vg_AllocMalloc );
516 SIMPLE_RETURN(0); /* irrelevant */
517 case VG_USERREQ__BUILTIN_DELETE:
518 VG_(client_free) ( (void*)arg[1], Vg_AllocNew );
519 SIMPLE_RETURN(0); /* irrelevant */
520 case VG_USERREQ__BUILTIN_VEC_DELETE:
521 VG_(client_free) ( (void*)arg[1], Vg_AllocNewVec );
522 SIMPLE_RETURN(0); /* irrelevant */
523 case VG_USERREQ__CALLOC:
524 SIMPLE_RETURN(
525 (UInt)VG_(client_calloc) ( arg[1], arg[2] )
526 );
527 case VG_USERREQ__REALLOC:
528 SIMPLE_RETURN(
529 (UInt)VG_(client_realloc) ( (void*)arg[1], arg[2] )
530 );
531 case VG_USERREQ__MEMALIGN:
532 SIMPLE_RETURN(
533 (UInt)VG_(client_memalign) ( arg[1], arg[2] )
534 );
535 default:
536 /* Too hard; wimp out. */
537 return False;
538 }
539# undef SIMPLE_RETURN
540}
541
542
543static
544void sched_do_syscall ( ThreadId tid )
545{
546 UInt saved_eax;
547 UInt res, syscall_no;
548 UInt fd;
549 Bool might_block, assumed_nonblocking;
550 Bool orig_fd_blockness;
551 Char msg_buf[100];
552
553 vg_assert(tid >= 0 && tid < VG_N_THREADS);
554 vg_assert(vg_threads[tid].status == VgTs_Runnable);
555
556 syscall_no = vg_threads[tid].m_eax; /* syscall number */
557
558 if (syscall_no == __NR_nanosleep) {
559 ULong t_now, t_awaken;
560 struct vki_timespec* req;
561 req = (struct vki_timespec*)vg_threads[tid].m_ebx; /* arg1 */
562 t_now = VG_(read_microsecond_timer)();
563 t_awaken
564 = t_now
565 + (ULong)1000000ULL * (ULong)(req->tv_sec)
566 + (ULong)( (UInt)(req->tv_nsec) / 1000 );
567 vg_threads[tid].status = VgTs_Sleeping;
568 vg_threads[tid].awaken_at = t_awaken;
569 if (1) {
570 VG_(sprintf)(msg_buf, "at %lu: nanosleep for %lu",
571 t_now, t_awaken-t_now);
572 print_sched_event(tid, msg_buf);
573 }
574 /* Force the scheduler to run something else for a while. */
575 return;
576 }
577
578 switch (syscall_no) {
579 case __NR_read:
580 case __NR_write:
581 assumed_nonblocking
582 = False;
583 might_block
584 = fd_is_blockful(vg_threads[tid].m_ebx /* arg1 */);
585 break;
586 default:
587 might_block = False;
588 assumed_nonblocking = True;
589 }
590
591 if (assumed_nonblocking) {
592 /* We think it's non-blocking. Just do it in the normal way. */
593 VG_(perform_assumed_nonblocking_syscall)(tid);
594 /* The thread is still runnable. */
595 return;
596 }
597
598 /* It might block. Take evasive action. */
599 switch (syscall_no) {
600 case __NR_read:
601 case __NR_write:
602 fd = vg_threads[tid].m_ebx; break;
603 default:
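         /* Unreachable: only read and write are classified above as
            possibly-blocking, so no other syscall number can get here.
            The always-false assertion below makes that assumption loud. */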
604 vg_assert(3+3 == 7);
605 }
606
607 /* Set the fd to nonblocking, and do the syscall, which will return
608 immediately, in order to lodge a request with the Linux kernel.
609 We later poll for I/O completion using select(). */
610
611 orig_fd_blockness = fd_is_blockful(fd);
612 set_fd_nonblocking(fd);
613 vg_assert(!fd_is_blockful(fd));
614 VG_(check_known_blocking_syscall)(tid, syscall_no, NULL /* PRE */);
615
616 /* This trashes the thread's %eax; we have to preserve it. */
617 saved_eax = vg_threads[tid].m_eax;
618 KERNEL_DO_SYSCALL(tid,res);
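   /* (KERNEL_DO_SYSCALL is defined outside this function; judging by its
       use here, it runs the real syscall with tid's register state and
       leaves the outcome both in res and in the thread's %EAX -- hence
       the save/restore of %EAX around this speculative call.) */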
619
620 /* Restore original blockfulness of the fd. */
621 if (orig_fd_blockness)
622 set_fd_blocking(fd);
623 else
624 set_fd_nonblocking(fd);
625
626 if (res != -VKI_EWOULDBLOCK) {
627 /* It didn't block; it went through immediately. So finish off
628 in the normal way. Don't restore %EAX, since that now
629 (correctly) holds the result of the call. */
630 VG_(check_known_blocking_syscall)(tid, syscall_no, &res /* POST */);
631 /* We're still runnable. */
632 vg_assert(vg_threads[tid].status == VgTs_Runnable);
633
634 } else {
635
636 /* It would have blocked. First, restore %EAX to what it was
637 before our speculative call. */
638 vg_threads[tid].m_eax = saved_eax;
639 /* Put this fd in a table of fds on which we are waiting for
640 completion. The arguments for select() later are constructed
641 from this table. */
642 add_waiting_fd(tid, fd, saved_eax /* which holds the syscall # */);
643 /* Deschedule thread until an I/O completion happens. */
644 vg_threads[tid].status = VgTs_WaitFD;
645 if (1) {
646 VG_(sprintf)(msg_buf,"block until I/O ready on fd %d", fd);
647 print_sched_event(tid, msg_buf);
648 }
649
650 }
651}
652
653
654/* Find out which of the fds in vg_waiting_fds are now ready to go, by
655   making enquiries with select(), and marking them as ready.  We have to
656   wait for the requesting threads to fall into the WaitFD state
657   before we can actually deliver the results, so this
658 procedure doesn't do that; complete_blocked_syscalls() does it.
659
660 It might seem odd that a thread which has done a blocking syscall
661 is not in WaitFD state; the way this can happen is if it initially
662 becomes WaitFD, but then a signal is delivered to it, so it becomes
663 Runnable for a while. In this case we have to wait for the
664 sighandler to return, whereupon the WaitFD state is resumed, and
665 only at that point can the I/O result be delivered to it. However,
666 this point may be long after the fd is actually ready.
667
668 So, poll_for_ready_fds() merely detects fds which are ready.
669 complete_blocked_syscalls() does the second half of the trick,
670 possibly much later: it delivers the results from ready fds to
671 threads in WaitFD state.
672*/
673void poll_for_ready_fds ( void )
674{
675 vki_ksigset_t saved_procmask;
676 vki_fd_set readfds;
677 vki_fd_set writefds;
678 vki_fd_set exceptfds;
679 struct vki_timeval timeout;
680 Int fd, fd_max, i, n_ready, syscall_no, n_ok;
681 ThreadId tid;
682 Bool rd_ok, wr_ok, ex_ok;
683 Char msg_buf[100];
684
685 /* Awaken any sleeping threads whose sleep has expired. */
686 {
687 struct vki_timespec * rem;
688 ULong t_now = VG_(read_microsecond_timer)();
689 for (tid = 0; tid < VG_N_THREADS; tid++) {
690 if (vg_threads[tid].status != VgTs_Sleeping)
691 continue;
692 if (t_now >= vg_threads[tid].awaken_at) {
693 /* Resume this thread. Set to zero the remaining-time (second)
694 arg of nanosleep, since it's used up all its time. */
695 vg_assert(vg_threads[tid].m_eax == __NR_nanosleep);
696 rem = (struct vki_timespec *)vg_threads[tid].m_ecx; /* arg2 */
697 if (rem != NULL) {
698 rem->tv_sec = 0;
699 rem->tv_nsec = 0;
700 }
701 /* Make the syscall return 0 (success). */
702 vg_threads[tid].m_eax = 0;
703 /* Reschedule this thread. */
704 vg_threads[tid].status = VgTs_Runnable;
705 if (1) {
706 VG_(sprintf)(msg_buf, "at %lu: nanosleep done",
707 t_now);
708 print_sched_event(tid, msg_buf);
709 }
710 }
711 }
712 }
713
714 timeout.tv_sec = 0;
715 timeout.tv_usec = 0;
716
717 VKI_FD_ZERO(&readfds);
718 VKI_FD_ZERO(&writefds);
719 VKI_FD_ZERO(&exceptfds);
720 fd_max = -1;
721 for (i = 0; i < VG_N_WAITING_FDS; i++) {
722 if (vg_waiting_fds[i].fd == -1 /* not in use */)
723 continue;
724 if (vg_waiting_fds[i].ready /* already ready? */)
725 continue;
726 fd = vg_waiting_fds[i].fd;
727 /* VG_(printf)("adding QUERY for fd %d\n", fd); */
728 if (fd > fd_max)
729 fd_max = fd;
730 tid = vg_waiting_fds[i].tid;
731 vg_assert(tid >= 0 && tid < VG_N_THREADS);
732 syscall_no = vg_waiting_fds[i].syscall_no;
733 switch (syscall_no) {
734 case __NR_read:
735 VKI_FD_SET(fd, &readfds); break;
736 case __NR_write:
737 VKI_FD_SET(fd, &writefds); break;
738 default:
739 VG_(panic)("poll_for_ready_fds: unexpected syscall");
740 /*NOTREACHED*/
741 break;
742 }
743 }
744
745 /* BLOCK ALL SIGNALS. We don't want the complication of select()
746 getting interrupted. */
747 VG_(block_all_host_signals)( &saved_procmask );
748
749 n_ready = VG_(select)
750 ( fd_max+1, &readfds, &writefds, &exceptfds, &timeout);
751 if (VG_(is_kerror)(n_ready)) {
752 VG_(printf)("poll_for_ready_fds: select returned %d\n", n_ready);
753 VG_(panic)("poll_for_ready_fds: select failed?!");
754 /*NOTREACHED*/
755 }
756
757 /* UNBLOCK ALL SIGNALS */
758 VG_(restore_host_signals)( &saved_procmask );
759
760 /* VG_(printf)("poll_for_io_completions: %d fs ready\n", n_ready); */
761
762 if (n_ready == 0)
763 return;
764
765 /* Inspect all the fds we know about, and handle any completions that
766 have happened. */
767 /*
768 VG_(printf)("\n\n");
769 for (fd = 0; fd < 100; fd++)
770 if (VKI_FD_ISSET(fd, &writefds) || VKI_FD_ISSET(fd, &readfds)) {
771 VG_(printf)("X"); } else { VG_(printf)("."); };
772 VG_(printf)("\n\nfd_max = %d\n", fd_max);
773 */
774
775 for (fd = 0; fd <= fd_max; fd++) {
776 rd_ok = VKI_FD_ISSET(fd, &readfds);
777 wr_ok = VKI_FD_ISSET(fd, &writefds);
778 ex_ok = VKI_FD_ISSET(fd, &exceptfds);
779
780 n_ok = (rd_ok ? 1 : 0) + (wr_ok ? 1 : 0) + (ex_ok ? 1 : 0);
781 if (n_ok == 0)
782 continue;
783 if (n_ok > 1) {
784 VG_(printf)("offending fd = %d\n", fd);
785 VG_(panic)("poll_for_ready_fds: multiple events on fd");
786 }
787
788 /* An I/O event completed for fd. Find the thread which
789 requested this. */
790 for (i = 0; i < VG_N_WAITING_FDS; i++) {
791 if (vg_waiting_fds[i].fd == -1 /* not in use */)
792 continue;
793 if (vg_waiting_fds[i].fd == fd)
794 break;
795 }
796
797 /* And a bit more paranoia ... */
798 vg_assert(i >= 0 && i < VG_N_WAITING_FDS);
799
800 /* Mark the fd as ready. */
801 vg_assert(! vg_waiting_fds[i].ready);
802 vg_waiting_fds[i].ready = True;
803 }
804}
805
806
807/* See comment attached to poll_for_ready_fds() for explanation. */
808void complete_blocked_syscalls ( void )
809{
810 Int fd, i, res, syscall_no;
811 ThreadId tid;
812 Char msg_buf[100];
813
814 /* Inspect all the outstanding fds we know about. */
815
816 for (i = 0; i < VG_N_WAITING_FDS; i++) {
817 if (vg_waiting_fds[i].fd == -1 /* not in use */)
818 continue;
819 if (! vg_waiting_fds[i].ready)
820 continue;
821
822 fd = vg_waiting_fds[i].fd;
823 tid = vg_waiting_fds[i].tid;
824 vg_assert(tid >= 0 && tid < VG_N_THREADS);
825
826 /* The thread actually has to be waiting for the I/O event it
827 requested before we can deliver the result! */
828 if (vg_threads[tid].status != VgTs_WaitFD)
829 continue;
830
831 /* Ok, actually do it! We can safely use %EAX as the syscall
832 number, because the speculative call made by
833 sched_do_syscall() doesn't change %EAX in the case where the
834 call would have blocked. */
835
836 syscall_no = vg_waiting_fds[i].syscall_no;
837 vg_assert(syscall_no == vg_threads[tid].m_eax);
838 KERNEL_DO_SYSCALL(tid,res);
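      /* (Same KERNEL_DO_SYSCALL as in sched_do_syscall(); this time the
          fd is known to be ready, so the call should complete without
          blocking.) */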
839 VG_(check_known_blocking_syscall)(tid, syscall_no, &res /* POST */);
840
841 /* Reschedule. */
842 vg_threads[tid].status = VgTs_Runnable;
843 /* Mark slot as no longer in use. */
844 vg_waiting_fds[i].fd = -1;
845 /* pp_sched_status(); */
846 if (1) {
847 VG_(sprintf)(msg_buf,"resume due to I/O completion on fd %d", fd);
848 print_sched_event(tid, msg_buf);
849 }
850 }
851}
852
853
854static
855void nanosleep_for_a_while ( void )
856{
857 Int res;
858 struct vki_timespec req;
859 struct vki_timespec rem;
860 req.tv_sec = 0;
861 req.tv_nsec = 20 * 1000 * 1000;
862 res = VG_(nanosleep)( &req, &rem );
863 /* VG_(printf)("after ns, unused = %d\n", rem.tv_nsec ); */
864 vg_assert(res == 0);
865}
866
867
868/* ---------------------------------------------------------------------
869 The scheduler proper.
870 ------------------------------------------------------------------ */
871
872/* Run user-space threads until either
873 * Deadlock occurs
874   * One thread asks to shut down Valgrind
875 * The specified number of basic blocks has gone by.
876*/
877VgSchedReturnCode VG_(scheduler) ( void )
878{
879 ThreadId tid, tid_next;
880 UInt trc;
881 UInt dispatch_ctr_SAVED;
882 Int done_this_time, n_in_fdwait;
883 Char msg_buf[100];
884 Addr trans_addr;
885
886 /* For the LRU structures, records when the epoch began. */
887 ULong lru_epoch_started_at = 0;
888
889 /* Start with the root thread. tid in general indicates the
890 currently runnable/just-finished-running thread. */
891 tid = 0;
892
893 /* This is the top level scheduler loop. It falls into three
894 phases. */
895 while (True) {
896
897 /* ======================= Phase 1 of 3 =======================
898 Handle I/O completions and signals. This may change the
899 status of various threads. Then select a new thread to run,
900 or declare deadlock, or sleep if there are no runnable
901 threads but some are blocked on I/O. */
902
903 /* Age the LRU structures if an epoch has been completed. */
904 if (VG_(bbs_done) - lru_epoch_started_at >= VG_BBS_PER_EPOCH) {
905 lru_epoch_started_at = VG_(bbs_done);
906 increment_epoch();
907 }
908
909 /* Was a debug-stop requested? */
910 if (VG_(bbs_to_go) == 0)
911 goto debug_stop;
912
913 /* Do the following loop until a runnable thread is found, or
914 deadlock is detected. */
915 while (True) {
916
917 /* For stats purposes only. */
918 VG_(num_scheduling_events_MAJOR) ++;
919
920 /* See if any I/O operations which we were waiting for have
921 completed, and, if so, make runnable the relevant waiting
922 threads. */
923 poll_for_ready_fds();
924 complete_blocked_syscalls();
925
926 /* See if there are any signals which need to be delivered. If
927 so, choose thread(s) to deliver them to, and build signal
928 delivery frames on those thread(s) stacks. */
929 VG_(deliver_signals)( 0 /*HACK*/ );
930 VG_(do_sanity_checks)(0 /*HACK*/, False);
931
932 /* Try and find a thread (tid) to run. */
933 tid_next = tid;
934 n_in_fdwait = 0;
935 while (True) {
936 tid_next++;
937 if (tid_next >= VG_N_THREADS) tid_next = 0;
938 if (vg_threads[tid_next].status == VgTs_WaitFD)
939 n_in_fdwait ++;
940 if (vg_threads[tid_next].status == VgTs_Runnable)
941 break; /* We can run this one. */
942 if (tid_next == tid)
943 break; /* been all the way round */
944 }
945 tid = tid_next;
946
947 if (vg_threads[tid].status == VgTs_Runnable) {
948 /* Found a suitable candidate. Fall out of this loop, so
949            we can advance to Phase 2 of the scheduler: actually
950 running the thread. */
951 break;
952 }
953
954 /* We didn't find a runnable thread. Now what? */
955 if (n_in_fdwait == 0) {
956            /* No runnable threads and none in fd-wait either.  Not
957 good. */
958 pp_sched_status();
959 return VgSrc_Deadlock;
960 }
961
962         /* At least one thread is in an fd-wait state.  Delay for a
963 while, and go round again, in the hope that eventually a
964 thread becomes runnable. */
965 nanosleep_for_a_while();
966 // pp_sched_status();
967 // VG_(printf)(".\n");
968 }
969
970
971 /* ======================= Phase 2 of 3 =======================
972 Wahey! We've finally decided that thread tid is runnable, so
973         we now do that.  Run it for as much of a quantum as possible.
974         Trivial requests are handled and the thread continues.  The
975         aim is not to go through Phase 1 too often, since it is expensive.  */
976
977 if (0)
978 VG_(printf)("SCHED: tid %d, used %d\n", tid, VG_N_THREADS);
979
980 /* Figure out how many bbs to ask vg_run_innerloop to do. Note
981 that it decrements the counter before testing it for zero, so
982 that if VG_(dispatch_ctr) is set to N you get at most N-1
983 iterations. Also this means that VG_(dispatch_ctr) must
984 exceed zero before entering the innerloop. Also also, the
985 decrement is done before the bb is actually run, so you
986 always get at least one decrement even if nothing happens.
987 */
988 if (VG_(bbs_to_go) >= VG_SCHEDULING_QUANTUM)
989 VG_(dispatch_ctr) = VG_SCHEDULING_QUANTUM + 1;
990 else
991 VG_(dispatch_ctr) = (UInt)VG_(bbs_to_go) + 1;
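      /* For example: setting VG_(dispatch_ctr) to VG_SCHEDULING_QUANTUM+1
         lets the thread run at most VG_SCHEDULING_QUANTUM basic blocks
         before the dispatcher returns VG_TRC_INNER_COUNTERZERO. */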
992
993 /* ... and remember what we asked for. */
994 dispatch_ctr_SAVED = VG_(dispatch_ctr);
995
996 /* Actually run thread tid. */
997 while (True) {
998
999 /* For stats purposes only. */
1000 VG_(num_scheduling_events_MINOR) ++;
1001
1002 if (0)
1003 VG_(message)(Vg_DebugMsg, "thread %d: running for %d bbs",
1004 tid, VG_(dispatch_ctr) - 1 );
1005
1006 trc = run_thread_for_a_while ( tid );
1007
1008 /* Deal quickly with trivial scheduling events, and resume the
1009 thread. */
1010
1011 if (trc == VG_TRC_INNER_FASTMISS) {
1012 vg_assert(VG_(dispatch_ctr) > 0);
1013
1014 /* Trivial event. Miss in the fast-cache. Do a full
1015 lookup for it. */
1016 trans_addr
1017 = VG_(search_transtab) ( vg_threads[tid].m_eip );
1018 if (trans_addr == (Addr)0) {
1019 /* Not found; we need to request a translation. */
1020 VG_(create_translation_for)( vg_threads[tid].m_eip );
1021 trans_addr = VG_(search_transtab) ( vg_threads[tid].m_eip );
1022 if (trans_addr == (Addr)0)
1023 VG_(panic)("VG_TRC_INNER_FASTMISS: missing tt_fast entry");
1024 }
1025 continue; /* with this thread */
1026 }
1027
1028 if (trc == VG_TRC_EBP_JMP_CLIENTREQ) {
1029 Bool is_triv = maybe_do_trivial_clientreq(tid);
1030 if (is_triv) {
1031 /* NOTE: a trivial request is something like a call to
1032 malloc() or free(). It DOES NOT change the
1033 Runnability of this thread nor the status of any
1034 other thread; it is purely thread-local. */
1035 continue; /* with this thread */
1036 }
1037 }
1038
1039 /* It's a non-trivial event. Give up running this thread and
1040 handle things the expensive way. */
1041 break;
1042 }
1043
1044 /* ======================= Phase 3 of 3 =======================
1045 Handle non-trivial thread requests, mostly pthread stuff. */
1046
1047 /* Ok, we've fallen out of the dispatcher for a
1048 non-completely-trivial reason. First, update basic-block
1049 counters. */
1050
1051 done_this_time = (Int)dispatch_ctr_SAVED - (Int)VG_(dispatch_ctr) - 1;
1052 vg_assert(done_this_time >= 0);
1053 VG_(bbs_to_go) -= (ULong)done_this_time;
1054 VG_(bbs_done) += (ULong)done_this_time;
1055
1056 if (0 && trc != VG_TRC_INNER_FASTMISS)
1057 VG_(message)(Vg_DebugMsg, "thread %d: completed %d bbs, trc %d",
1058 tid, done_this_time, (Int)trc );
1059
1060 if (0 && trc != VG_TRC_INNER_FASTMISS)
1061 VG_(message)(Vg_DebugMsg, "thread %d: %ld bbs, event %s",
1062 tid, VG_(bbs_done),
1063 name_of_sched_event(trc) );
1064
1065 /* Examine the thread's return code to figure out why it
1066 stopped, and handle requests. */
1067
1068 switch (trc) {
1069
1070 case VG_TRC_INNER_FASTMISS:
1071 VG_(panic)("VG_(scheduler): VG_TRC_INNER_FASTMISS");
1072 /*NOTREACHED*/
1073 break;
1074
1075 case VG_TRC_INNER_COUNTERZERO:
1076 /* Timeslice is out. Let a new thread be scheduled,
1077 simply by doing nothing, causing us to arrive back at
1078 Phase 1. */
1079 if (VG_(bbs_to_go) == 0) {
1080 goto debug_stop;
1081 }
1082 vg_assert(VG_(dispatch_ctr) == 0);
1083 break;
1084
1085 case VG_TRC_UNRESUMABLE_SIGNAL:
1086 /* It got a SIGSEGV/SIGBUS, which we need to deliver right
1087 away. Again, do nothing, so we wind up back at Phase
1088 1, whereupon the signal will be "delivered". */
1089 break;
1090
1091 case VG_TRC_EBP_JMP_SPECIAL: {
1092 Addr next_eip = vg_threads[tid].m_eip;
1093 if (next_eip == (Addr) & VG_(signalreturn_bogusRA)) {
1094 /* vthread tid is returning from a signal handler;
1095 modify its stack/regs accordingly. */
1096 VG_(signal_returns)(tid);
1097 }
1098 else
1099 if (next_eip == (Addr) & VG_(shutdown)) {
1100 return VgSrc_Shutdown;
1101 } else {
1102 VG_(panic)("vg_schedule: VG_TRC_EBP_JMP_SPECIAL");
1103 }
1104 break;
1105 }
1106
1107 case VG_TRC_EBP_JMP_SYSCALL:
1108 /* Do a syscall for the vthread tid. This could cause it
1109 to become non-runnable. */
1110 sched_do_syscall(tid);
1111 break;
1112
1113 case VG_TRC_EBP_JMP_CLIENTREQ:
1114 /* Do a client request for the vthread tid. Note that
1115 some requests will have been handled by
1116 maybe_do_trivial_clientreq(), so we don't expect to see
1117 those here.
1118 */
1119 if (0) {
1120 VG_(sprintf)(msg_buf, "request 0x%x",
1121 vg_threads[tid].m_eax);
1122 print_sched_event(tid, msg_buf);
1123 }
1124 /* Do a non-trivial client request for thread tid. tid's
1125 %EAX points to a short vector of argument words, the
1126 first of which is the request code. The result of the
1127 request is put in tid's %EDX. Alternatively, perhaps
1128 the request causes tid to become non-runnable and/or
1129 other blocked threads become runnable. In general we
1130 can and often do mess with the state of arbitrary
1131 threads at this point. */
1132 do_nontrivial_clientreq(tid);
1133 break;
1134
1135 default:
1136 VG_(printf)("\ntrc = %d\n", trc);
1137 VG_(panic)("VG_(scheduler), phase 3: "
1138 "unexpected thread return code");
1139 /* NOTREACHED */
1140 break;
1141
1142 } /* switch (trc) */
1143
1144 /* That completes Phase 3 of 3. Return now to the top of the
1145 main scheduler loop, to Phase 1 of 3. */
1146
1147 } /* top-level scheduler loop */
1148
1149
1150 /* NOTREACHED */
1151 VG_(panic)("scheduler: post-main-loop ?!");
1152 /* NOTREACHED */
1153
1154 debug_stop:
1155 /* If we exited because of a debug stop, print the translation
1156 of the last block executed -- by translating it again, and
1157 throwing away the result. */
1158 VG_(printf)(
1159 "======vvvvvvvv====== LAST TRANSLATION ======vvvvvvvv======\n");
1160 VG_(translate)( vg_threads[tid].m_eip, NULL, NULL, NULL );
1161 VG_(printf)("\n");
1162 VG_(printf)(
1163 "======^^^^^^^^====== LAST TRANSLATION ======^^^^^^^^======\n");
1164
1165 return VgSrc_BbsDone;
1166}
1167
1168
1169/* ---------------------------------------------------------------------
1170 The pthread implementation.
1171 ------------------------------------------------------------------ */
1172
1173#include <pthread.h>
1174#include <errno.h>
1175
1176#if !defined(PTHREAD_STACK_MIN)
1177# define PTHREAD_STACK_MIN (16384 - VG_AR_CLIENT_STACKBASE_REDZONE_SZB)
1178#endif
1179
1180/* /usr/include/bits/pthreadtypes.h:
1181 typedef unsigned long int pthread_t;
1182*/
1183
1184/* RUNS ON SIMD CPU!
1185 This is the return address that pthread_create uses.
1186*/
1187static
1188void do_pthread_create_bogusRA ( void )
1189{
1190 /* Tell the scheduler that this thread has returned. */
1191 Int res;
1192 VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
1193 VG_USERREQ__PTHREAD_CREATE_BOGUSRA,
1194 0, 0, 0, 0);
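   /* (VALGRIND_MAGIC_SEQUENCE, from valgrind.h, is the client-request
       mechanism: the dispatcher recognises it and passes the request to
       the scheduler, which handles it in do_pthread_create_exit_by_returning()
       and never reschedules this thread -- so the panic below should be
       unreachable.) */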
1195 VG_(panic)("do_pthread_create_bogusRA: shouldn't be still alive!");
1196}
1197
1198
1199static
1200void do_pthread_cancel ( ThreadId tid_canceller,
1201 pthread_t tid_cancellee )
1202{
1203 Char msg_buf[100];
1204   /* We want to make it appear that this thread has returned to
1205 do_pthread_create_bogusRA with PTHREAD_CANCELED as the
1206 return value. So: simple: put PTHREAD_CANCELED into %EAX
1207 and &do_pthread_create_bogusRA into %EIP and keep going! */
1208 if (1) {
1209 VG_(sprintf)(msg_buf, "cancelled by %d", tid_canceller);
1210 print_sched_event(tid_cancellee, msg_buf);
1211 }
1212 vg_threads[tid_cancellee].m_eax = (UInt)PTHREAD_CANCELED;
1213 vg_threads[tid_cancellee].m_eip = (UInt)&do_pthread_create_bogusRA;
1214 vg_threads[tid_cancellee].status = VgTs_Runnable;
1215}
1216
1217
1218
1219/* Thread tid is exiting, by returning from the function it was
1220 created with. The main complication here is to resume any thread
1221 waiting to join with this one. */
1222static
1223void do_pthread_create_exit_by_returning ( ThreadId tid )
1224{
1225 ThreadId jnr; /* joiner, the thread calling pthread_join. */
1226 UInt* jnr_args;
1227 void** jnr_thread_return;
1228 Char msg_buf[100];
1229
1230 /* Mark it as not in use. Leave the stack in place so the next
1231 user of this slot doesn't reallocate it. */
1232 vg_assert(tid >= 0 && tid < VG_N_THREADS);
1233 vg_assert(vg_threads[tid].status != VgTs_Empty);
1234
1235 vg_threads[tid].retval = (void*)vg_threads[tid].m_eax;
1236
1237 if (vg_threads[tid].joiner == VG_INVALID_THREADID) {
1238 /* No one has yet done a join on me */
1239 vg_threads[tid].status = VgTs_WaitJoiner;
1240 if (1) {
1241 VG_(sprintf)(msg_buf,
1242 "root fn returns, waiting for a call pthread_join(%d)",
1243 tid);
1244 print_sched_event(tid, msg_buf);
1245 }
1246 } else {
1247      /* Someone is waiting; make their join call return with success,
1248 putting my exit code in the place specified by the caller's
1249 thread_return param. This is all very horrible, since we
1250 need to consult the joiner's arg block -- pointed to by its
1251 %EAX -- in order to extract the 2nd param of its pthread_join
1252         call.  TODO: free the slot properly (also below).
1253 */
1254 jnr = vg_threads[tid].joiner;
1255 vg_assert(jnr >= 0 && jnr < VG_N_THREADS);
1256 vg_assert(vg_threads[jnr].status == VgTs_WaitJoinee);
1257 jnr_args = (UInt*)vg_threads[jnr].m_eax;
1258 jnr_thread_return = (void**)(jnr_args[2]);
1259 if (jnr_thread_return != NULL)
1260 *jnr_thread_return = vg_threads[tid].retval;
1261 vg_threads[jnr].m_edx = 0; /* success */
1262 vg_threads[jnr].status = VgTs_Runnable;
1263 vg_threads[tid].status = VgTs_Empty; /* bye! */
1264 if (1) {
1265 VG_(sprintf)(msg_buf,
1266 "root fn returns, to find a waiting pthread_join(%d)", tid);
1267 print_sched_event(tid, msg_buf);
1268 VG_(sprintf)(msg_buf,
1269 "my pthread_join(%d) returned; resuming", tid);
1270 print_sched_event(jnr, msg_buf);
1271 }
1272 }
1273
1274 /* Return value is irrelevant; this thread will not get
1275 rescheduled. */
1276}
1277
1278
1279static
1280void do_pthread_join ( ThreadId tid, ThreadId jee, void** thread_return )
1281{
1282 Char msg_buf[100];
1283
1284 /* jee, the joinee, is the thread specified as an arg in thread
1285 tid's call to pthread_join. So tid is the join-er. */
1286 vg_assert(tid >= 0 && tid < VG_N_THREADS);
1287 vg_assert(vg_threads[tid].status == VgTs_Runnable);
1288
1289 if (jee == tid) {
1290 vg_threads[tid].m_edx = EDEADLK; /* libc constant, not a kernel one */
1291 vg_threads[tid].status = VgTs_Runnable;
1292 return;
1293 }
1294
1295 if (jee < 0
1296 || jee >= VG_N_THREADS
1297 || vg_threads[jee].status == VgTs_Empty) {
1298 /* Invalid thread to join to. */
1299 vg_threads[tid].m_edx = EINVAL;
1300 vg_threads[tid].status = VgTs_Runnable;
1301 return;
1302 }
1303
1304 if (vg_threads[jee].joiner != VG_INVALID_THREADID) {
1305 /* Someone already did join on this thread */
1306 vg_threads[tid].m_edx = EINVAL;
1307 vg_threads[tid].status = VgTs_Runnable;
1308 return;
1309 }
1310
1311 /* if (vg_threads[jee].detached) ... */
1312
1313 /* Perhaps the joinee has already finished? If so return
1314 immediately with its return code, and free up the slot. TODO:
1315 free it properly (also above). */
1316 if (vg_threads[jee].status == VgTs_WaitJoiner) {
1317 vg_assert(vg_threads[jee].joiner == VG_INVALID_THREADID);
1318 vg_threads[tid].m_edx = 0; /* success */
1319 if (thread_return != NULL)
1320 *thread_return = vg_threads[jee].retval;
1321 vg_threads[tid].status = VgTs_Runnable;
1322 vg_threads[jee].status = VgTs_Empty; /* bye! */
1323 if (1) {
1324 VG_(sprintf)(msg_buf,
1325 "someone called pthread_join() on me; bye!");
1326 print_sched_event(jee, msg_buf);
1327 VG_(sprintf)(msg_buf,
1328 "my pthread_join(%d) returned immediately",
1329 jee );
1330 print_sched_event(tid, msg_buf);
1331 }
1332 return;
1333 }
1334
1335 /* Ok, so we'll have to wait on jee. */
1336 vg_threads[jee].joiner = tid;
1337 vg_threads[tid].status = VgTs_WaitJoinee;
1338 if (1) {
1339 VG_(sprintf)(msg_buf,
1340 "blocking on call of pthread_join(%d)", jee );
1341 print_sched_event(tid, msg_buf);
1342 }
1343 /* So tid's join call does not return just now. */
1344}
1345
1346
1347static
1348void do_pthread_create ( ThreadId parent_tid,
1349 pthread_t* thread,
1350 pthread_attr_t* attr,
1351 void* (*start_routine)(void *),
1352 void* arg )
1353{
1354 Addr new_stack;
1355 UInt new_stk_szb;
1356 ThreadId tid;
1357 Char msg_buf[100];
1358
1359 /* Paranoia ... */
1360 vg_assert(sizeof(pthread_t) == sizeof(UInt));
1361
1362 vg_assert(vg_threads[parent_tid].status != VgTs_Empty);
1363
1364 tid = vg_alloc_ThreadState();
1365
1366 /* If we've created the main thread's tid, we're in deep trouble :) */
1367 vg_assert(tid != 0);
1368
1369 /* Copy the parent's CPU state into the child's, in a roundabout
1370 way (via baseBlock). */
1371 VG_(load_thread_state)(parent_tid);
1372 VG_(save_thread_state)(tid);
1373
1374 /* Consider allocating the child a stack, if the one it already has
1375 is inadequate. */
1376 new_stk_szb = PTHREAD_STACK_MIN;
1377
1378 if (new_stk_szb > vg_threads[tid].stack_size) {
1379 /* Again, for good measure :) We definitely don't want to be
1380 allocating a stack for the main thread. */
1381 vg_assert(tid != 0);
1382 /* for now, we don't handle the case of anything other than
1383 assigning it for the first time. */
1384 vg_assert(vg_threads[tid].stack_size == 0);
1385 vg_assert(vg_threads[tid].stack_base == (Addr)NULL);
1386 new_stack = (Addr)VG_(get_memory_from_mmap)( new_stk_szb );
1387 vg_threads[tid].stack_base = new_stack;
1388 vg_threads[tid].stack_size = new_stk_szb;
1389 vg_threads[tid].m_esp
1390 = new_stack + new_stk_szb
1391 - VG_AR_CLIENT_STACKBASE_REDZONE_SZB;
1392 }
1393 if (VG_(clo_instrument))
1394 VGM_(make_noaccess)( vg_threads[tid].m_esp,
1395 VG_AR_CLIENT_STACKBASE_REDZONE_SZB );
1396
1397 /* push arg */
1398 vg_threads[tid].m_esp -= 4;
1399 * (UInt*)(vg_threads[tid].m_esp) = (UInt)arg;
1400
1401 /* push (magical) return address */
1402 vg_threads[tid].m_esp -= 4;
1403 * (UInt*)(vg_threads[tid].m_esp) = (UInt)do_pthread_create_bogusRA;
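   /* The child's stack now mimics an ordinary call frame: the fake return
      address is at %esp and arg is at %esp+4, so when start_routine
      eventually returns, control lands in do_pthread_create_bogusRA. */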
1404
1405 if (VG_(clo_instrument))
1406 VGM_(make_readable)( vg_threads[tid].m_esp, 2 * 4 );
1407
1408 /* this is where we start */
1409 vg_threads[tid].m_eip = (UInt)start_routine;
1410
1411 if (1) {
1412 VG_(sprintf)(msg_buf,
1413 "new thread, created by %d", parent_tid );
1414 print_sched_event(tid, msg_buf);
1415 }
1416
1417 /* store the thread id in *thread. */
1418 // if (VG_(clo_instrument))
1419 // ***** CHECK *thread is writable
1420 *thread = (pthread_t)tid;
1421
1422 /* return zero */
1423 vg_threads[tid].joiner = VG_INVALID_THREADID;
1424 vg_threads[tid].status = VgTs_Runnable;
1425 vg_threads[tid].m_edx = 0; /* success */
1426}
1427
1428
1429/* Horrible hacks to do with pthread_mutex_t: the real pthread_mutex_t
1430 is a struct with at least 5 words:
1431 typedef struct
1432 {
1433 int __m_reserved; -- Reserved for future use
1434 int __m_count; -- Depth of recursive locking
1435 _pthread_descr __m_owner; -- Owner thread (if recursive or errcheck)
1436 int __m_kind; -- Mutex kind: fast, recursive or errcheck
1437 struct _pthread_fastlock __m_lock; -- Underlying fast lock
1438 } pthread_mutex_t;
1439 Ours is just a single word, an index into vg_mutexes[].
1440 For now I'll park it in the __m_reserved field.
1441
1442 Uninitialised mutexes (PTHREAD_MUTEX_INITIALIZER) all have
1443 a zero __m_count field (see /usr/include/pthread.h). So I'll
1444 use zero to mean non-inited, and 1 to mean inited.
1445
1446 How convenient.
1447*/
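
/* Schematically, the lifecycle this file assumes (a sketch, not real code):

      pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
         // __m_count == 0 : not yet known to this scheduler
      initialise_mutex(&m);
         // __m_count := 1 (initialised), __m_reserved := index into vg_mutexes[]
*/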
1448
1449static
1450void initialise_mutex ( pthread_mutex_t *mutex )
1451{
1452 MutexId mid;
1453   /* vg_alloc_VgMutex aborts if we can't allocate a mutex, for
1454 whatever reason. */
1455VG_(printf)("initialise_mutex %p\n", mutex);
1456 mid = vg_alloc_VgMutex();
1457 vg_mutexes[mid].in_use = True;
1458 vg_mutexes[mid].held = False;
1459 vg_mutexes[mid].owner = VG_INVALID_THREADID; /* irrelevant */
1460 mutex->__m_reserved = mid;
1461 mutex->__m_count = 1; /* initialised */
1462}
1463
1464/* Allocate a new MutexId and write it into *mutex. Ideally take
1465 notice of the attributes in *mutexattr. */
1466static
1467void do_pthread_mutex_init ( ThreadId tid,
1468 pthread_mutex_t *mutex,
1469 const pthread_mutexattr_t *mutexattr)
1470{
1471 /* Paranoia ... */
1472VG_(printf)("mutex_init %d %p %p\n", tid, mutex, mutexattr);
1473
1474 vg_assert(sizeof(pthread_mutex_t) >= sizeof(UInt));
1475
1476 initialise_mutex(mutex);
1477 /*
1478 RETURN VALUE
1479 pthread_mutex_init always returns 0. The other mutex functions
1480 return 0 on success and a non-zero error code on error.
1481 */
1482 /* THIS THREAD returns with 0. */
1483 vg_threads[tid].m_edx = 0;
1484}
1485
1486
1487static
1488void do_pthread_mutex_lock( ThreadId tid, pthread_mutex_t *mutex )
1489{
1490 MutexId mid;
1491 Char msg_buf[100];
1492
1493VG_(printf)("mutex_lock %d %p\n", tid, mutex);
1494
1495 /* *mutex contains the MutexId, or one of the magic values
1496 PTHREAD_*MUTEX_INITIALIZER*, indicating we need to initialise it
1497      now.  See comment(s) above re use of __m_count to indicate
1498 initialisation status.
1499 */
1500
1501 /* POSIX doesn't mandate this, but for sanity ... */
1502 if (mutex == NULL) {
1503 vg_threads[tid].m_edx = EINVAL;
1504 return;
1505 }
1506
1507 if (mutex->__m_count == 0) {
1508 initialise_mutex(mutex);
1509 }
1510
1511 mid = mutex->__m_reserved;
1512 if (mid < 0 || mid >= VG_N_MUTEXES || !vg_mutexes[mid].in_use) {
1513 vg_threads[tid].m_edx = EINVAL;
1514 return;
1515 }
1516
1517 /* Assert initialised. */
1518 vg_assert(mutex->__m_count == 1);
1519
1520 /* Assume tid valid. */
1521 vg_assert(vg_threads[tid].status == VgTs_Runnable);
1522
1523 if (vg_mutexes[mid].held) {
1524 if (vg_mutexes[mid].owner == tid) {
1525 vg_threads[tid].m_edx = EDEADLK;
1526 return;
1527 }
1528 /* Someone else has it; we have to wait. */
1529 vg_threads[tid].status = VgTs_WaitMX;
1530 vg_threads[tid].waited_on_mid = mid;
1531 /* No assignment to %EDX, since we're blocking. */
1532 if (1) {
1533 VG_(sprintf)(msg_buf, "wait for mutex %d", mid );
1534 print_sched_event(tid, msg_buf);
1535 }
1536 } else {
1537 /* We get it! */
1538 vg_mutexes[mid].held = True;
1539 vg_mutexes[mid].owner = tid;
1540 /* return 0 (success). */
1541 vg_threads[tid].m_edx = 0;
1542 }
1543}
1544
1545
1546static
1547void do_pthread_mutex_unlock ( ThreadId tid,
1548 pthread_mutex_t *mutex )
1549{
1550 MutexId mid;
1551 Int i;
1552 Char msg_buf[100];
1553
1554VG_(printf)("mutex_unlock %d %p\n", tid, mutex);
1555
1556 if (mutex == NULL
1557 || mutex->__m_count != 1) {
1558 vg_threads[tid].m_edx = EINVAL;
1559 return;
1560 }
1561
1562 mid = mutex->__m_reserved;
1563 if (mid < 0 || mid >= VG_N_MUTEXES || !vg_mutexes[mid].in_use) {
1564 vg_threads[tid].m_edx = EINVAL;
1565 return;
1566 }
1567
1568 /* Assume tid valid */
1569 vg_assert(vg_threads[tid].status == VgTs_Runnable);
1570
1571 /* Barf if we don't currently hold the mutex. */
1572 if (!vg_mutexes[mid].held || vg_mutexes[mid].owner != tid) {
1573 vg_threads[tid].m_edx = EPERM;
1574 return;
1575 }
1576
1577 /* Find some arbitrary thread waiting on this mutex, and make it
1578 runnable. If none are waiting, mark the mutex as not held. */
1579 for (i = 0; i < VG_N_THREADS; i++) {
1580 if (vg_threads[i].status == VgTs_Empty)
1581 continue;
1582 if (vg_threads[i].status == VgTs_WaitMX
1583 && vg_threads[i].waited_on_mid == mid)
1584 break;
1585 }
1586
1587 vg_assert(i <= VG_N_THREADS);
1588 if (i == VG_N_THREADS) {
1589 /* Nobody else is waiting on it. */
1590 vg_mutexes[mid].held = False;
1591 } else {
1592 /* Notionally transfer the hold to thread i, whose
1593 pthread_mutex_lock() call now returns with 0 (success). */
1594 vg_mutexes[mid].owner = i;
1595 vg_threads[i].status = VgTs_Runnable;
1596 vg_threads[i].m_edx = 0; /* pth_lock() success */
1597 if (1) {
1598 VG_(sprintf)(msg_buf, "acquire mutex %d, resume", mid );
1599            print_sched_event(i, msg_buf);
1600 }
1601 }
1602
1603 /* In either case, our (tid's) pth_unlock() returns with 0
1604 (success). */
1605 vg_threads[tid].m_edx = 0; /* Success. */
1606}
1607
1608
1609static void do_pthread_mutex_destroy ( ThreadId tid,
1610 pthread_mutex_t *mutex )
1611{
1612 MutexId mid;
1613
1614VG_(printf)("mutex_destroy %d %p\n", tid, mutex);
1615
1616 if (mutex == NULL
1617 || mutex->__m_count != 1) {
1618 vg_threads[tid].m_edx = EINVAL;
1619 return;
1620 }
1621
1622 mid = mutex->__m_reserved;
1623 if (mid < 0 || mid >= VG_N_MUTEXES || !vg_mutexes[mid].in_use) {
1624 vg_threads[tid].m_edx = EINVAL;
1625 return;
1626 }
1627
1628 /* Assume tid valid */
1629 vg_assert(vg_threads[tid].status == VgTs_Runnable);
1630
1631 /* Barf if the mutex is currently held. */
1632 if (vg_mutexes[mid].held) {
1633 vg_threads[tid].m_edx = EBUSY;
1634 return;
1635 }
1636
1637 mutex->__m_count = 0; /* uninitialised */
1638 vg_mutexes[mid].in_use = False;
1639 vg_threads[tid].m_edx = 0;
1640}
1641
1642
1643/* ---------------------------------------------------------------------
1644 Handle non-trivial client requests.
1645 ------------------------------------------------------------------ */
1646
1647static
1648void do_nontrivial_clientreq ( ThreadId tid )
1649{
1650 UInt* arg = (UInt*)(vg_threads[tid].m_eax);
1651 UInt req_no = arg[0];
1652 switch (req_no) {
1653
1654 case VG_USERREQ__PTHREAD_CREATE:
1655 do_pthread_create( tid,
1656 (pthread_t*)arg[1],
1657 (pthread_attr_t*)arg[2],
1658 (void*(*)(void*))arg[3],
1659 (void*)arg[4] );
1660 break;
1661
1662 case VG_USERREQ__PTHREAD_CREATE_BOGUSRA:
1663 do_pthread_create_exit_by_returning( tid );
1664 break;
1665
1666 case VG_USERREQ__PTHREAD_JOIN:
1667 do_pthread_join( tid, arg[1], (void**)(arg[2]) );
1668 break;
1669
1670 /* Sigh ... this probably will cause huge numbers of major
1671 (expensive) scheduling events, for no real reason.
1672 Perhaps should be classified as a trivial-request. */
1673 case VG_USERREQ__PTHREAD_GET_THREADID:
1674 vg_threads[tid].m_edx = tid;
1675 break;
1676
1677 case VG_USERREQ__PTHREAD_MUTEX_INIT:
1678 do_pthread_mutex_init( tid,
1679 (pthread_mutex_t *)(arg[1]),
1680 (pthread_mutexattr_t *)(arg[2]) );
1681 break;
1682
1683 case VG_USERREQ__PTHREAD_MUTEX_LOCK:
1684 do_pthread_mutex_lock( tid, (pthread_mutex_t *)(arg[1]) );
1685 break;
1686
1687 case VG_USERREQ__PTHREAD_MUTEX_UNLOCK:
1688 do_pthread_mutex_unlock( tid, (pthread_mutex_t *)(arg[1]) );
1689 break;
1690
1691 case VG_USERREQ__PTHREAD_MUTEX_DESTROY:
1692 do_pthread_mutex_destroy( tid, (pthread_mutex_t *)(arg[1]) );
1693 break;
1694
1695 case VG_USERREQ__PTHREAD_CANCEL:
1696 do_pthread_cancel( tid, (pthread_t)(arg[1]) );
1697 break;
1698
1699 case VG_USERREQ__MAKE_NOACCESS:
1700 case VG_USERREQ__MAKE_WRITABLE:
1701 case VG_USERREQ__MAKE_READABLE:
1702 case VG_USERREQ__DISCARD:
1703 case VG_USERREQ__CHECK_WRITABLE:
1704 case VG_USERREQ__CHECK_READABLE:
1705 case VG_USERREQ__MAKE_NOACCESS_STACK:
1706 case VG_USERREQ__RUNNING_ON_VALGRIND:
1707 case VG_USERREQ__DO_LEAK_CHECK:
1708 vg_threads[tid].m_edx = VG_(handle_client_request) ( arg );
1709 break;
1710
1711 default:
1712 VG_(printf)("panic'd on private request = 0x%x\n", arg[0] );
1713 VG_(panic)("handle_private_client_pthread_request: "
1714 "unknown request");
1715 /*NOTREACHED*/
1716 break;
1717 }
1718}
1719
1720
1721/*--------------------------------------------------------------------*/
1722/*--- end vg_scheduler.c ---*/
1723/*--------------------------------------------------------------------*/