/*--------------------------------------------------------------------*/
/*--- A user-space pthreads implementation. vg_scheduler.c ---*/
/*--------------------------------------------------------------------*/
/*
This file is part of Valgrind, an x86 protected-mode emulator
designed for debugging and profiling binaries on x86-Unixes.
Copyright (C) 2000-2002 Julian Seward
jseward@acm.org
Julian_Seward@muraroa.demon.co.uk
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307, USA.
The GNU General Public License is contained in the file LICENSE.
*/
#include "vg_include.h"
#include "vg_constants.h"
#include "valgrind.h" /* for VG_USERREQ__MAKE_NOACCESS and
VG_USERREQ__DO_LEAK_CHECK */
/* BORKAGE/ISSUES as of 14 Apr 02
Note! This pthreads implementation is so poor as to not be
suitable for use by anyone at all!
- Currently, when a signal is delivered, just the ThreadStatus.status fields
are saved in the signal frame, along with the CPU state. Question:
should I also save and restore:
ThreadStatus.joiner
ThreadStatus.waited_on_mid
ThreadStatus.awaken_at
ThreadStatus.retval
Currently unsure, and so am not doing so.
- Signals interrupting read/write and nanosleep: SA_RESTART settings.
Read/write correctly return with EINTR when SA_RESTART isn't
specified and they are interrupted by a signal. nanosleep just
pretends signals don't exist -- should be fixed.
- Read/write syscall starts: don't crap out when the initial
nonblocking read/write returns an error.
- Get rid of restrictions re use of sigaltstack; they are no longer
needed.
*/
/* ---------------------------------------------------------------------
Types and globals for the scheduler.
------------------------------------------------------------------ */
/* type ThreadId is defined in vg_include.h. */
/* struct ThreadState is defined in vg_include.h. */
/* Private globals. A statically allocated array of threads. */
static ThreadState vg_threads[VG_N_THREADS];
/* vg_oursignalhandler() might longjmp(). Here's the jmp_buf. */
jmp_buf VG_(scheduler_jmpbuf);
/* ... and if so, here's the signal which caused it to do so. */
Int VG_(longjmpd_on_signal);
/* Machinery to keep track of which threads are waiting on which
fds. */
typedef
struct {
/* The thread which made the request. */
ThreadId tid;
/* The next two fields describe the request. */
/* File descriptor waited for. -1 means this slot is not in use */
Int fd;
/* The syscall number the fd is used in. */
Int syscall_no;
/* False => still waiting for select to tell us the fd is ready
to go. True => the fd is ready, but the results have not yet
been delivered back to the calling thread. Once the latter
happens, this entire record is marked as no longer in use, by
making the fd field be -1. */
Bool ready;
}
VgWaitedOnFd;
static VgWaitedOnFd vg_waiting_fds[VG_N_WAITING_FDS];
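/* Lifecycle of a vg_waiting_fds[] slot, as implemented below: a slot is
   claimed by add_waiting_fd() when a read/write in sched_do_syscall()
   would have blocked; poll_for_ready_fds() later sets .ready once
   select() reports the fd as usable; complete_blocked_syscalls() then
   re-issues the syscall for the waiting thread and frees the slot by
   setting .fd back to -1. */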
typedef
struct {
/* Is this slot in use, or free? */
Bool in_use;
/* If in_use, is this mutex held by some thread, or not? */
Bool held;
/* if held==True, owner indicates who by. */
ThreadId owner;
}
VgMutex;
static VgMutex vg_mutexes[VG_N_MUTEXES];
/* Forwards */
static void do_nontrivial_clientreq ( ThreadId tid );
/* ---------------------------------------------------------------------
Helper functions for the scheduler.
------------------------------------------------------------------ */
static
void pp_sched_status ( void )
{
Int i;
VG_(printf)("\nsched status:\n");
for (i = 0; i < VG_N_THREADS; i++) {
if (vg_threads[i].status == VgTs_Empty) continue;
VG_(printf)("tid %d: ", i);
switch (vg_threads[i].status) {
case VgTs_Runnable: VG_(printf)("Runnable\n"); break;
case VgTs_WaitFD: VG_(printf)("WaitFD\n"); break;
case VgTs_WaitJoiner: VG_(printf)("WaitJoiner(%d)\n",
vg_threads[i].joiner); break;
case VgTs_WaitJoinee: VG_(printf)("WaitJoinee\n"); break;
case VgTs_Sleeping: VG_(printf)("Sleeping\n"); break;
         default: VG_(printf)("???\n"); break;
}
}
VG_(printf)("\n");
}
static
void add_waiting_fd ( ThreadId tid, Int fd, Int syscall_no )
{
Int i;
vg_assert(fd != -1); /* avoid total chaos */
for (i = 0; i < VG_N_WAITING_FDS; i++)
if (vg_waiting_fds[i].fd == -1)
break;
if (i == VG_N_WAITING_FDS)
VG_(panic)("add_waiting_fd: VG_N_WAITING_FDS is too low");
/*
VG_(printf)("add_waiting_fd: add (tid %d, fd %d) at slot %d\n",
tid, fd, i);
*/
vg_waiting_fds[i].fd = fd;
vg_waiting_fds[i].tid = tid;
vg_waiting_fds[i].ready = False;
vg_waiting_fds[i].syscall_no = syscall_no;
}
static
void print_sched_event ( ThreadId tid, Char* what )
{
VG_(message)(Vg_DebugMsg, "SCHED[%d]: %s", tid, what );
}
static
void print_pthread_event ( ThreadId tid, Char* what )
{
VG_(message)(Vg_DebugMsg, "PTHREAD[%d]: %s", tid, what );
}
static
Char* name_of_sched_event ( UInt event )
{
switch (event) {
case VG_TRC_EBP_JMP_SYSCALL: return "SYSCALL";
case VG_TRC_EBP_JMP_CLIENTREQ: return "CLIENTREQ";
case VG_TRC_INNER_COUNTERZERO: return "COUNTERZERO";
case VG_TRC_INNER_FASTMISS: return "FASTMISS";
case VG_TRC_UNRESUMABLE_SIGNAL: return "FATALSIGNAL";
default: return "??UNKNOWN??";
}
}
/* Create a translation of the client basic block beginning at
orig_addr, and add it to the translation cache & translation table.
This probably doesn't really belong here, but, hey ...
*/
void VG_(create_translation_for) ( Addr orig_addr )
{
Addr trans_addr;
TTEntry tte;
Int orig_size, trans_size;
/* Ensure there is space to hold a translation. */
VG_(maybe_do_lru_pass)();
VG_(translate)( orig_addr, &orig_size, &trans_addr, &trans_size );
/* Copy data at trans_addr into the translation cache.
Returned pointer is to the code, not to the 4-byte
header. */
/* Since the .orig_size and .trans_size fields are
UShort, be paranoid. */
vg_assert(orig_size > 0 && orig_size < 65536);
vg_assert(trans_size > 0 && trans_size < 65536);
tte.orig_size = orig_size;
tte.orig_addr = orig_addr;
tte.trans_size = trans_size;
tte.trans_addr = VG_(copy_to_transcache)
( trans_addr, trans_size );
tte.mru_epoch = VG_(current_epoch);
/* Free the intermediary -- was allocated by VG_(emit_code). */
VG_(jitfree)( (void*)trans_addr );
/* Add to trans tab and set back pointer. */
VG_(add_to_trans_tab) ( &tte );
/* Update stats. */
VG_(this_epoch_in_count) ++;
VG_(this_epoch_in_osize) += orig_size;
VG_(this_epoch_in_tsize) += trans_size;
VG_(overall_in_count) ++;
VG_(overall_in_osize) += orig_size;
VG_(overall_in_tsize) += trans_size;
/* Record translated area for SMC detection. */
VG_(smc_mark_original) ( orig_addr, orig_size );
}
/* Allocate a completely empty ThreadState record. */
static
ThreadId vg_alloc_ThreadState ( void )
{
Int i;
for (i = 0; i < VG_N_THREADS; i++) {
if (vg_threads[i].status == VgTs_Empty)
return i;
}
VG_(printf)("vg_alloc_ThreadState: no free slots available\n");
VG_(printf)("Increase VG_N_THREADS, rebuild and try again.\n");
VG_(panic)("VG_N_THREADS is too low");
/*NOTREACHED*/
}
ThreadState* VG_(get_thread_state) ( ThreadId tid )
{
vg_assert(tid >= 0 && tid < VG_N_THREADS);
vg_assert(vg_threads[tid].status != VgTs_Empty);
return & vg_threads[tid];
}
/* Find an unused VgMutex record. */
static
MutexId vg_alloc_VgMutex ( void )
{
Int i;
for (i = 0; i < VG_N_MUTEXES; i++) {
if (!vg_mutexes[i].in_use)
return i;
}
VG_(printf)("vg_alloc_VgMutex: no free slots available\n");
VG_(printf)("Increase VG_N_MUTEXES, rebuild and try again.\n");
VG_(panic)("VG_N_MUTEXES is too low");
/*NOTREACHED*/
}
/* Copy the saved state of a thread into VG_(baseBlock), ready for it
to be run. */
__inline__
void VG_(load_thread_state) ( ThreadId tid )
{
Int i;
VG_(baseBlock)[VGOFF_(m_eax)] = vg_threads[tid].m_eax;
VG_(baseBlock)[VGOFF_(m_ebx)] = vg_threads[tid].m_ebx;
VG_(baseBlock)[VGOFF_(m_ecx)] = vg_threads[tid].m_ecx;
VG_(baseBlock)[VGOFF_(m_edx)] = vg_threads[tid].m_edx;
VG_(baseBlock)[VGOFF_(m_esi)] = vg_threads[tid].m_esi;
VG_(baseBlock)[VGOFF_(m_edi)] = vg_threads[tid].m_edi;
VG_(baseBlock)[VGOFF_(m_ebp)] = vg_threads[tid].m_ebp;
VG_(baseBlock)[VGOFF_(m_esp)] = vg_threads[tid].m_esp;
VG_(baseBlock)[VGOFF_(m_eflags)] = vg_threads[tid].m_eflags;
VG_(baseBlock)[VGOFF_(m_eip)] = vg_threads[tid].m_eip;
for (i = 0; i < VG_SIZE_OF_FPUSTATE_W; i++)
VG_(baseBlock)[VGOFF_(m_fpustate) + i] = vg_threads[tid].m_fpu[i];
VG_(baseBlock)[VGOFF_(sh_eax)] = vg_threads[tid].sh_eax;
VG_(baseBlock)[VGOFF_(sh_ebx)] = vg_threads[tid].sh_ebx;
VG_(baseBlock)[VGOFF_(sh_ecx)] = vg_threads[tid].sh_ecx;
VG_(baseBlock)[VGOFF_(sh_edx)] = vg_threads[tid].sh_edx;
VG_(baseBlock)[VGOFF_(sh_esi)] = vg_threads[tid].sh_esi;
VG_(baseBlock)[VGOFF_(sh_edi)] = vg_threads[tid].sh_edi;
VG_(baseBlock)[VGOFF_(sh_ebp)] = vg_threads[tid].sh_ebp;
VG_(baseBlock)[VGOFF_(sh_esp)] = vg_threads[tid].sh_esp;
VG_(baseBlock)[VGOFF_(sh_eflags)] = vg_threads[tid].sh_eflags;
}
/* Copy the state of a thread from VG_(baseBlock), presumably after it
has been descheduled. For sanity-check purposes, fill the vacated
VG_(baseBlock) with garbage so as to make the system more likely to
fail quickly if we erroneously continue to poke around inside
VG_(baseBlock) without first doing a load_thread_state().
*/
__inline__
void VG_(save_thread_state) ( ThreadId tid )
{
Int i;
const UInt junk = 0xDEADBEEF;
vg_threads[tid].m_eax = VG_(baseBlock)[VGOFF_(m_eax)];
vg_threads[tid].m_ebx = VG_(baseBlock)[VGOFF_(m_ebx)];
vg_threads[tid].m_ecx = VG_(baseBlock)[VGOFF_(m_ecx)];
vg_threads[tid].m_edx = VG_(baseBlock)[VGOFF_(m_edx)];
vg_threads[tid].m_esi = VG_(baseBlock)[VGOFF_(m_esi)];
vg_threads[tid].m_edi = VG_(baseBlock)[VGOFF_(m_edi)];
vg_threads[tid].m_ebp = VG_(baseBlock)[VGOFF_(m_ebp)];
vg_threads[tid].m_esp = VG_(baseBlock)[VGOFF_(m_esp)];
vg_threads[tid].m_eflags = VG_(baseBlock)[VGOFF_(m_eflags)];
vg_threads[tid].m_eip = VG_(baseBlock)[VGOFF_(m_eip)];
for (i = 0; i < VG_SIZE_OF_FPUSTATE_W; i++)
vg_threads[tid].m_fpu[i] = VG_(baseBlock)[VGOFF_(m_fpustate) + i];
vg_threads[tid].sh_eax = VG_(baseBlock)[VGOFF_(sh_eax)];
vg_threads[tid].sh_ebx = VG_(baseBlock)[VGOFF_(sh_ebx)];
vg_threads[tid].sh_ecx = VG_(baseBlock)[VGOFF_(sh_ecx)];
vg_threads[tid].sh_edx = VG_(baseBlock)[VGOFF_(sh_edx)];
vg_threads[tid].sh_esi = VG_(baseBlock)[VGOFF_(sh_esi)];
vg_threads[tid].sh_edi = VG_(baseBlock)[VGOFF_(sh_edi)];
vg_threads[tid].sh_ebp = VG_(baseBlock)[VGOFF_(sh_ebp)];
vg_threads[tid].sh_esp = VG_(baseBlock)[VGOFF_(sh_esp)];
vg_threads[tid].sh_eflags = VG_(baseBlock)[VGOFF_(sh_eflags)];
/* Fill it up with junk. */
VG_(baseBlock)[VGOFF_(m_eax)] = junk;
VG_(baseBlock)[VGOFF_(m_ebx)] = junk;
VG_(baseBlock)[VGOFF_(m_ecx)] = junk;
VG_(baseBlock)[VGOFF_(m_edx)] = junk;
VG_(baseBlock)[VGOFF_(m_esi)] = junk;
VG_(baseBlock)[VGOFF_(m_edi)] = junk;
VG_(baseBlock)[VGOFF_(m_ebp)] = junk;
VG_(baseBlock)[VGOFF_(m_esp)] = junk;
VG_(baseBlock)[VGOFF_(m_eflags)] = junk;
VG_(baseBlock)[VGOFF_(m_eip)] = junk;
for (i = 0; i < VG_SIZE_OF_FPUSTATE_W; i++)
VG_(baseBlock)[VGOFF_(m_fpustate) + i] = junk;
}
/* Run the thread tid for a while, and return a VG_TRC_* value to the
scheduler indicating what happened. */
static
UInt run_thread_for_a_while ( ThreadId tid )
{
UInt trc = 0;
vg_assert(tid >= 0 && tid < VG_N_THREADS);
vg_assert(vg_threads[tid].status != VgTs_Empty);
vg_assert(VG_(bbs_to_go) > 0);
VG_(load_thread_state) ( tid );
if (__builtin_setjmp(VG_(scheduler_jmpbuf)) == 0) {
/* try this ... */
trc = VG_(run_innerloop)();
/* We get here if the client didn't take a fault. */
} else {
/* We get here if the client took a fault, which caused our
signal handler to longjmp. */
vg_assert(trc == 0);
trc = VG_TRC_UNRESUMABLE_SIGNAL;
}
VG_(save_thread_state) ( tid );
return trc;
}
/* Increment the LRU epoch counter. */
static
void increment_epoch ( void )
{
VG_(current_epoch)++;
if (VG_(clo_verbosity) > 2) {
UInt tt_used, tc_used;
VG_(get_tt_tc_used) ( &tt_used, &tc_used );
VG_(message)(Vg_UserMsg,
"%lu bbs, in: %d (%d -> %d), out %d (%d -> %d), TT %d, TC %d",
VG_(bbs_done),
VG_(this_epoch_in_count),
VG_(this_epoch_in_osize),
VG_(this_epoch_in_tsize),
VG_(this_epoch_out_count),
VG_(this_epoch_out_osize),
VG_(this_epoch_out_tsize),
tt_used, tc_used
);
}
VG_(this_epoch_in_count) = 0;
VG_(this_epoch_in_osize) = 0;
VG_(this_epoch_in_tsize) = 0;
VG_(this_epoch_out_count) = 0;
VG_(this_epoch_out_osize) = 0;
VG_(this_epoch_out_tsize) = 0;
}
/* Initialise the scheduler. Create a single "main" thread ready to
run, with special ThreadId of zero. This is called at startup; the
   caller takes care to park the client's state in
VG_(baseBlock).
*/
void VG_(scheduler_init) ( void )
{
Int i;
Addr startup_esp;
ThreadId tid_main;
startup_esp = VG_(baseBlock)[VGOFF_(m_esp)];
if ((startup_esp & VG_STARTUP_STACK_MASK) != VG_STARTUP_STACK_MASK) {
VG_(printf)("%%esp at startup = %p is not near %p; aborting\n",
(void*)startup_esp, (void*)VG_STARTUP_STACK_MASK);
VG_(panic)("unexpected %esp at startup");
}
for (i = 0; i < VG_N_THREADS; i++) {
vg_threads[i].stack_size = 0;
vg_threads[i].stack_base = (Addr)NULL;
}
for (i = 0; i < VG_N_WAITING_FDS; i++)
vg_waiting_fds[i].fd = -1; /* not in use */
for (i = 0; i < VG_N_MUTEXES; i++)
vg_mutexes[i].in_use = False;
/* Assert this is thread zero, which has certain magic
properties. */
tid_main = vg_alloc_ThreadState();
vg_assert(tid_main == 0);
vg_threads[tid_main].status = VgTs_Runnable;
vg_threads[tid_main].joiner = VG_INVALID_THREADID;
vg_threads[tid_main].retval = NULL; /* not important */
/* Copy VG_(baseBlock) state to tid_main's slot. */
VG_(save_thread_state) ( tid_main );
}
/* What if fd isn't a valid fd? */
static
void set_fd_nonblocking ( Int fd )
{
Int res = VG_(fcntl)( fd, VKI_F_GETFL, 0 );
vg_assert(!VG_(is_kerror)(res));
res |= VKI_O_NONBLOCK;
res = VG_(fcntl)( fd, VKI_F_SETFL, res );
vg_assert(!VG_(is_kerror)(res));
}
static
void set_fd_blocking ( Int fd )
{
Int res = VG_(fcntl)( fd, VKI_F_GETFL, 0 );
vg_assert(!VG_(is_kerror)(res));
res &= ~VKI_O_NONBLOCK;
res = VG_(fcntl)( fd, VKI_F_SETFL, res );
vg_assert(!VG_(is_kerror)(res));
}
static
Bool fd_is_blockful ( Int fd )
{
Int res = VG_(fcntl)( fd, VKI_F_GETFL, 0 );
vg_assert(!VG_(is_kerror)(res));
return (res & VKI_O_NONBLOCK) ? False : True;
}
/* Do a purely thread-local request for tid, and put the result in its
   %EDX, without changing its scheduling state in any way, nor that of
   any other threads; if the request was handled this way, return True.
   If the request is non-trivial, return False; a more capable but
   slower mechanism will deal with it.
*/
static
Bool maybe_do_trivial_clientreq ( ThreadId tid )
{
# define SIMPLE_RETURN(vvv) \
{ tst->m_edx = (vvv); \
return True; \
}
ThreadState* tst = &vg_threads[tid];
UInt* arg = (UInt*)(tst->m_eax);
UInt req_no = arg[0];
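   /* Illustrative layout of the arg block, as handled by the cases
      below: arg[0] is the request code; for VG_USERREQ__MALLOC, arg[1]
      is the requested size; for VG_USERREQ__CALLOC, arg[1] and arg[2]
      are the two calloc arguments, and similarly for realloc/memalign. */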
switch (req_no) {
case VG_USERREQ__MALLOC:
SIMPLE_RETURN(
(UInt)VG_(client_malloc) ( tst, arg[1], Vg_AllocMalloc )
);
case VG_USERREQ__BUILTIN_NEW:
SIMPLE_RETURN(
(UInt)VG_(client_malloc) ( tst, arg[1], Vg_AllocNew )
);
case VG_USERREQ__BUILTIN_VEC_NEW:
SIMPLE_RETURN(
(UInt)VG_(client_malloc) ( tst, arg[1], Vg_AllocNewVec )
);
case VG_USERREQ__FREE:
VG_(client_free) ( tst, (void*)arg[1], Vg_AllocMalloc );
SIMPLE_RETURN(0); /* irrelevant */
case VG_USERREQ__BUILTIN_DELETE:
VG_(client_free) ( tst, (void*)arg[1], Vg_AllocNew );
SIMPLE_RETURN(0); /* irrelevant */
case VG_USERREQ__BUILTIN_VEC_DELETE:
VG_(client_free) ( tst, (void*)arg[1], Vg_AllocNewVec );
SIMPLE_RETURN(0); /* irrelevant */
case VG_USERREQ__CALLOC:
SIMPLE_RETURN(
(UInt)VG_(client_calloc) ( tst, arg[1], arg[2] )
);
case VG_USERREQ__REALLOC:
SIMPLE_RETURN(
(UInt)VG_(client_realloc) ( tst, (void*)arg[1], arg[2] )
);
case VG_USERREQ__MEMALIGN:
SIMPLE_RETURN(
(UInt)VG_(client_memalign) ( tst, arg[1], arg[2] )
);
default:
/* Too hard; wimp out. */
return False;
}
# undef SIMPLE_RETURN
}
static
void sched_do_syscall ( ThreadId tid )
{
UInt saved_eax;
UInt res, syscall_no;
UInt fd;
Bool might_block, assumed_nonblocking;
Bool orig_fd_blockness;
Char msg_buf[100];
vg_assert(tid >= 0 && tid < VG_N_THREADS);
vg_assert(vg_threads[tid].status == VgTs_Runnable);
syscall_no = vg_threads[tid].m_eax; /* syscall number */
if (syscall_no == __NR_nanosleep) {
ULong t_now, t_awaken;
struct vki_timespec* req;
req = (struct vki_timespec*)vg_threads[tid].m_ebx; /* arg1 */
t_now = VG_(read_microsecond_timer)();
t_awaken
= t_now
+ (ULong)1000000ULL * (ULong)(req->tv_sec)
+ (ULong)( (UInt)(req->tv_nsec) / 1000 );
vg_threads[tid].status = VgTs_Sleeping;
vg_threads[tid].awaken_at = t_awaken;
if (VG_(clo_trace_sched)) {
VG_(sprintf)(msg_buf, "at %lu: nanosleep for %lu",
t_now, t_awaken-t_now);
print_sched_event(tid, msg_buf);
}
/* Force the scheduler to run something else for a while. */
return;
}
switch (syscall_no) {
case __NR_read:
case __NR_write:
assumed_nonblocking
= False;
might_block
= fd_is_blockful(vg_threads[tid].m_ebx /* arg1 */);
break;
default:
might_block = False;
assumed_nonblocking = True;
}
if (assumed_nonblocking) {
/* We think it's non-blocking. Just do it in the normal way. */
VG_(perform_assumed_nonblocking_syscall)(tid);
/* The thread is still runnable. */
return;
}
/* It might block. Take evasive action. */
switch (syscall_no) {
case __NR_read:
case __NR_write:
fd = vg_threads[tid].m_ebx; break;
default:
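         /* Unreachable: only read/write get this far.  The deliberately
            false assertion below makes us panic if that ever changes. */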
vg_assert(3+3 == 7);
}
/* Set the fd to nonblocking, and do the syscall, which will return
immediately, in order to lodge a request with the Linux kernel.
We later poll for I/O completion using select(). */
orig_fd_blockness = fd_is_blockful(fd);
set_fd_nonblocking(fd);
vg_assert(!fd_is_blockful(fd));
VG_(check_known_blocking_syscall)(tid, syscall_no, NULL /* PRE */);
/* This trashes the thread's %eax; we have to preserve it. */
saved_eax = vg_threads[tid].m_eax;
KERNEL_DO_SYSCALL(tid,res);
/* Restore original blockfulness of the fd. */
if (orig_fd_blockness)
set_fd_blocking(fd);
else
set_fd_nonblocking(fd);
if (res != -VKI_EWOULDBLOCK) {
/* It didn't block; it went through immediately. So finish off
in the normal way. Don't restore %EAX, since that now
(correctly) holds the result of the call. */
VG_(check_known_blocking_syscall)(tid, syscall_no, &res /* POST */);
/* We're still runnable. */
vg_assert(vg_threads[tid].status == VgTs_Runnable);
} else {
/* It would have blocked. First, restore %EAX to what it was
before our speculative call. */
vg_threads[tid].m_eax = saved_eax;
/* Put this fd in a table of fds on which we are waiting for
completion. The arguments for select() later are constructed
from this table. */
add_waiting_fd(tid, fd, saved_eax /* which holds the syscall # */);
/* Deschedule thread until an I/O completion happens. */
vg_threads[tid].status = VgTs_WaitFD;
if (VG_(clo_trace_sched)) {
VG_(sprintf)(msg_buf,"block until I/O ready on fd %d", fd);
print_sched_event(tid, msg_buf);
}
}
}
/* Find out which of the fds in vg_waiting_fds are now ready to go, by
making enquiries with select(), and mark them as ready. We have to
   wait for the requesting threads to fall into the WaitFD state
   before we can finally deliver the results, so this
procedure doesn't do that; complete_blocked_syscalls() does it.
It might seem odd that a thread which has done a blocking syscall
is not in WaitFD state; the way this can happen is if it initially
becomes WaitFD, but then a signal is delivered to it, so it becomes
Runnable for a while. In this case we have to wait for the
sighandler to return, whereupon the WaitFD state is resumed, and
only at that point can the I/O result be delivered to it. However,
this point may be long after the fd is actually ready.
So, poll_for_ready_fds() merely detects fds which are ready.
complete_blocked_syscalls() does the second half of the trick,
possibly much later: it delivers the results from ready fds to
threads in WaitFD state.
*/
static
void poll_for_ready_fds ( void )
{
vki_ksigset_t saved_procmask;
vki_fd_set readfds;
vki_fd_set writefds;
vki_fd_set exceptfds;
struct vki_timeval timeout;
Int fd, fd_max, i, n_ready, syscall_no, n_ok;
ThreadId tid;
Bool rd_ok, wr_ok, ex_ok;
Char msg_buf[100];
struct vki_timespec* rem;
ULong t_now;
/* Awaken any sleeping threads whose sleep has expired. */
t_now = VG_(read_microsecond_timer)();
for (tid = 0; tid < VG_N_THREADS; tid++) {
if (vg_threads[tid].status != VgTs_Sleeping)
continue;
if (t_now >= vg_threads[tid].awaken_at) {
/* Resume this thread. Set to zero the remaining-time (second)
arg of nanosleep, since it's used up all its time. */
vg_assert(vg_threads[tid].m_eax == __NR_nanosleep);
rem = (struct vki_timespec *)vg_threads[tid].m_ecx; /* arg2 */
if (rem != NULL) {
rem->tv_sec = 0;
rem->tv_nsec = 0;
}
/* Make the syscall return 0 (success). */
vg_threads[tid].m_eax = 0;
/* Reschedule this thread. */
vg_threads[tid].status = VgTs_Runnable;
if (VG_(clo_trace_sched)) {
VG_(sprintf)(msg_buf, "at %lu: nanosleep done",
t_now);
print_sched_event(tid, msg_buf);
}
}
}
/* And look for threads waiting on file descriptors which are now
ready for I/O.*/
timeout.tv_sec = 0;
timeout.tv_usec = 0;
VKI_FD_ZERO(&readfds);
VKI_FD_ZERO(&writefds);
VKI_FD_ZERO(&exceptfds);
fd_max = -1;
for (i = 0; i < VG_N_WAITING_FDS; i++) {
if (vg_waiting_fds[i].fd == -1 /* not in use */)
continue;
if (vg_waiting_fds[i].ready /* already ready? */)
continue;
fd = vg_waiting_fds[i].fd;
/* VG_(printf)("adding QUERY for fd %d\n", fd); */
vg_assert(fd >= 0);
if (fd > fd_max)
fd_max = fd;
tid = vg_waiting_fds[i].tid;
vg_assert(tid >= 0 && tid < VG_N_THREADS);
syscall_no = vg_waiting_fds[i].syscall_no;
switch (syscall_no) {
case __NR_read:
VKI_FD_SET(fd, &readfds); break;
case __NR_write:
VKI_FD_SET(fd, &writefds); break;
default:
VG_(panic)("poll_for_ready_fds: unexpected syscall");
/*NOTREACHED*/
break;
}
}
/* Short cut: if no fds are waiting, give up now. */
if (fd_max == -1)
return;
/* BLOCK ALL SIGNALS. We don't want the complication of select()
getting interrupted. */
VG_(block_all_host_signals)( &saved_procmask );
n_ready = VG_(select)
( fd_max+1, &readfds, &writefds, &exceptfds, &timeout);
if (VG_(is_kerror)(n_ready)) {
VG_(printf)("poll_for_ready_fds: select returned %d\n", n_ready);
VG_(panic)("poll_for_ready_fds: select failed?!");
/*NOTREACHED*/
}
/* UNBLOCK ALL SIGNALS */
VG_(restore_host_signals)( &saved_procmask );
/* VG_(printf)("poll_for_io_completions: %d fs ready\n", n_ready); */
if (n_ready == 0)
return;
/* Inspect all the fds we know about, and handle any completions that
have happened. */
/*
VG_(printf)("\n\n");
for (fd = 0; fd < 100; fd++)
if (VKI_FD_ISSET(fd, &writefds) || VKI_FD_ISSET(fd, &readfds)) {
VG_(printf)("X"); } else { VG_(printf)("."); };
VG_(printf)("\n\nfd_max = %d\n", fd_max);
*/
for (fd = 0; fd <= fd_max; fd++) {
rd_ok = VKI_FD_ISSET(fd, &readfds);
wr_ok = VKI_FD_ISSET(fd, &writefds);
ex_ok = VKI_FD_ISSET(fd, &exceptfds);
n_ok = (rd_ok ? 1 : 0) + (wr_ok ? 1 : 0) + (ex_ok ? 1 : 0);
if (n_ok == 0)
continue;
if (n_ok > 1) {
VG_(printf)("offending fd = %d\n", fd);
VG_(panic)("poll_for_ready_fds: multiple events on fd");
}
/* An I/O event completed for fd. Find the thread which
requested this. */
for (i = 0; i < VG_N_WAITING_FDS; i++) {
if (vg_waiting_fds[i].fd == -1 /* not in use */)
continue;
if (vg_waiting_fds[i].fd == fd)
break;
}
/* And a bit more paranoia ... */
vg_assert(i >= 0 && i < VG_N_WAITING_FDS);
/* Mark the fd as ready. */
vg_assert(! vg_waiting_fds[i].ready);
vg_waiting_fds[i].ready = True;
}
}
/* See comment attached to poll_for_ready_fds() for explanation. */
static
void complete_blocked_syscalls ( void )
{
Int fd, i, res, syscall_no;
ThreadId tid;
Char msg_buf[100];
/* Inspect all the outstanding fds we know about. */
for (i = 0; i < VG_N_WAITING_FDS; i++) {
if (vg_waiting_fds[i].fd == -1 /* not in use */)
continue;
if (! vg_waiting_fds[i].ready)
continue;
fd = vg_waiting_fds[i].fd;
tid = vg_waiting_fds[i].tid;
vg_assert(tid >= 0 && tid < VG_N_THREADS);
/* The thread actually has to be waiting for the I/O event it
requested before we can deliver the result! */
if (vg_threads[tid].status != VgTs_WaitFD)
continue;
/* Ok, actually do it! We can safely use %EAX as the syscall
number, because the speculative call made by
sched_do_syscall() doesn't change %EAX in the case where the
call would have blocked. */
syscall_no = vg_waiting_fds[i].syscall_no;
vg_assert(syscall_no == vg_threads[tid].m_eax);
KERNEL_DO_SYSCALL(tid,res);
VG_(check_known_blocking_syscall)(tid, syscall_no, &res /* POST */);
/* Reschedule. */
vg_threads[tid].status = VgTs_Runnable;
/* Mark slot as no longer in use. */
vg_waiting_fds[i].fd = -1;
/* pp_sched_status(); */
if (VG_(clo_trace_sched)) {
VG_(sprintf)(msg_buf,"resume due to I/O completion on fd %d", fd);
print_sched_event(tid, msg_buf);
}
}
}
static
void nanosleep_for_a_while ( void )
{
Int res;
struct vki_timespec req;
struct vki_timespec rem;
req.tv_sec = 0;
req.tv_nsec = 20 * 1000 * 1000;
res = VG_(nanosleep)( &req, &rem );
/* VG_(printf)("after ns, unused = %d\n", rem.tv_nsec ); */
vg_assert(res == 0);
}
/* ---------------------------------------------------------------------
The scheduler proper.
------------------------------------------------------------------ */
/* Run user-space threads until either
* Deadlock occurs
* One thread asks to shutdown Valgrind
* The specified number of basic blocks has gone by.
*/
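/* These three outcomes correspond to the VgSrc_Deadlock, VgSrc_Shutdown
   and VgSrc_BbsDone return codes produced below. */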
VgSchedReturnCode VG_(scheduler) ( void )
{
ThreadId tid, tid_next;
UInt trc;
UInt dispatch_ctr_SAVED;
Int request_code, done_this_time, n_in_fdwait_or_sleep;
Char msg_buf[100];
Addr trans_addr;
/* For the LRU structures, records when the epoch began. */
ULong lru_epoch_started_at = 0;
/* Start with the root thread. tid in general indicates the
currently runnable/just-finished-running thread. */
tid = 0;
/* This is the top level scheduler loop. It falls into three
phases. */
while (True) {
/* ======================= Phase 1 of 3 =======================
Handle I/O completions and signals. This may change the
status of various threads. Then select a new thread to run,
or declare deadlock, or sleep if there are no runnable
threads but some are blocked on I/O. */
/* Age the LRU structures if an epoch has been completed. */
if (VG_(bbs_done) - lru_epoch_started_at >= VG_BBS_PER_EPOCH) {
lru_epoch_started_at = VG_(bbs_done);
increment_epoch();
}
/* Was a debug-stop requested? */
if (VG_(bbs_to_go) == 0)
goto debug_stop;
/* Do the following loop until a runnable thread is found, or
deadlock is detected. */
while (True) {
/* For stats purposes only. */
VG_(num_scheduling_events_MAJOR) ++;
/* See if any I/O operations which we were waiting for have
completed, and, if so, make runnable the relevant waiting
threads. */
poll_for_ready_fds();
complete_blocked_syscalls();
/* See if there are any signals which need to be delivered. If
so, choose thread(s) to deliver them to, and build signal
delivery frames on those thread(s) stacks. */
VG_(deliver_signals)( 0 /*HACK*/ );
VG_(do_sanity_checks)(0 /*HACK*/, False);
/* Try and find a thread (tid) to run. */
tid_next = tid;
n_in_fdwait_or_sleep = 0;
while (True) {
tid_next++;
if (tid_next >= VG_N_THREADS) tid_next = 0;
if (vg_threads[tid_next].status == VgTs_WaitFD
|| vg_threads[tid_next].status == VgTs_Sleeping)
n_in_fdwait_or_sleep ++;
if (vg_threads[tid_next].status == VgTs_Runnable)
break; /* We can run this one. */
if (tid_next == tid)
break; /* been all the way round */
}
tid = tid_next;
if (vg_threads[tid].status == VgTs_Runnable) {
/* Found a suitable candidate. Fall out of this loop, so
we can advance to stage 2 of the scheduler: actually
running the thread. */
break;
}
/* We didn't find a runnable thread. Now what? */
if (n_in_fdwait_or_sleep == 0) {
/* No runnable threads and no prospect of any appearing
even if we wait for an arbitrary length of time. In
short, we have a deadlock. */
pp_sched_status();
return VgSrc_Deadlock;
}
/* At least one thread is in a fd-wait state. Delay for a
while, and go round again, in the hope that eventually a
thread becomes runnable. */
nanosleep_for_a_while();
// pp_sched_status();
// VG_(printf)(".\n");
}
/* ======================= Phase 2 of 3 =======================
Wahey! We've finally decided that thread tid is runnable, so
      we now do that.  Run it for as much of a quantum as possible.
      Trivial requests are handled and the thread continues.  The
      aim is to avoid doing Phase 1 too often, since it is expensive. */
if (0)
VG_(printf)("SCHED: tid %d, used %d\n", tid, VG_N_THREADS);
/* Figure out how many bbs to ask vg_run_innerloop to do. Note
that it decrements the counter before testing it for zero, so
that if VG_(dispatch_ctr) is set to N you get at most N-1
iterations. Also this means that VG_(dispatch_ctr) must
exceed zero before entering the innerloop. Also also, the
decrement is done before the bb is actually run, so you
always get at least one decrement even if nothing happens.
*/
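      /* Worked example: setting VG_(dispatch_ctr) to
         VG_SCHEDULING_QUANTUM + 1, as done just below, allows at most
         VG_SCHEDULING_QUANTUM basic blocks to run before the counter
         reaches zero. */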
if (VG_(bbs_to_go) >= VG_SCHEDULING_QUANTUM)
VG_(dispatch_ctr) = VG_SCHEDULING_QUANTUM + 1;
else
VG_(dispatch_ctr) = (UInt)VG_(bbs_to_go) + 1;
/* ... and remember what we asked for. */
dispatch_ctr_SAVED = VG_(dispatch_ctr);
/* Actually run thread tid. */
while (True) {
/* For stats purposes only. */
VG_(num_scheduling_events_MINOR) ++;
if (0)
VG_(message)(Vg_DebugMsg, "thread %d: running for %d bbs",
tid, VG_(dispatch_ctr) - 1 );
trc = run_thread_for_a_while ( tid );
/* Deal quickly with trivial scheduling events, and resume the
thread. */
if (trc == VG_TRC_INNER_FASTMISS) {
vg_assert(VG_(dispatch_ctr) > 0);
/* Trivial event. Miss in the fast-cache. Do a full
lookup for it. */
trans_addr
= VG_(search_transtab) ( vg_threads[tid].m_eip );
if (trans_addr == (Addr)0) {
/* Not found; we need to request a translation. */
VG_(create_translation_for)( vg_threads[tid].m_eip );
trans_addr = VG_(search_transtab) ( vg_threads[tid].m_eip );
if (trans_addr == (Addr)0)
VG_(panic)("VG_TRC_INNER_FASTMISS: missing tt_fast entry");
}
continue; /* with this thread */
}
if (trc == VG_TRC_EBP_JMP_CLIENTREQ) {
Bool is_triv = maybe_do_trivial_clientreq(tid);
if (is_triv) {
/* NOTE: a trivial request is something like a call to
malloc() or free(). It DOES NOT change the
Runnability of this thread nor the status of any
other thread; it is purely thread-local. */
continue; /* with this thread */
}
}
/* It's a non-trivial event. Give up running this thread and
handle things the expensive way. */
break;
}
/* ======================= Phase 3 of 3 =======================
Handle non-trivial thread requests, mostly pthread stuff. */
/* Ok, we've fallen out of the dispatcher for a
non-completely-trivial reason. First, update basic-block
counters. */
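      /* The "- 1" below allows for the extra decrement noted in the
         Phase 2 comment above ("you always get at least one decrement
         even if nothing happens"). */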
done_this_time = (Int)dispatch_ctr_SAVED - (Int)VG_(dispatch_ctr) - 1;
vg_assert(done_this_time >= 0);
VG_(bbs_to_go) -= (ULong)done_this_time;
VG_(bbs_done) += (ULong)done_this_time;
if (0 && trc != VG_TRC_INNER_FASTMISS)
VG_(message)(Vg_DebugMsg, "thread %d: completed %d bbs, trc %d",
tid, done_this_time, (Int)trc );
if (0 && trc != VG_TRC_INNER_FASTMISS)
VG_(message)(Vg_DebugMsg, "thread %d: %ld bbs, event %s",
tid, VG_(bbs_done),
name_of_sched_event(trc) );
/* Examine the thread's return code to figure out why it
stopped, and handle requests. */
switch (trc) {
case VG_TRC_INNER_FASTMISS:
VG_(panic)("VG_(scheduler): VG_TRC_INNER_FASTMISS");
/*NOTREACHED*/
break;
case VG_TRC_INNER_COUNTERZERO:
/* Timeslice is out. Let a new thread be scheduled,
simply by doing nothing, causing us to arrive back at
Phase 1. */
if (VG_(bbs_to_go) == 0) {
goto debug_stop;
}
vg_assert(VG_(dispatch_ctr) == 0);
break;
case VG_TRC_UNRESUMABLE_SIGNAL:
/* It got a SIGSEGV/SIGBUS, which we need to deliver right
away. Again, do nothing, so we wind up back at Phase
1, whereupon the signal will be "delivered". */
break;
case VG_TRC_EBP_JMP_SYSCALL:
/* Do a syscall for the vthread tid. This could cause it
to become non-runnable. */
sched_do_syscall(tid);
break;
case VG_TRC_EBP_JMP_CLIENTREQ:
/* Do a client request for the vthread tid. Note that
some requests will have been handled by
maybe_do_trivial_clientreq(), so we don't expect to see
those here.
*/
/* The thread's %EAX points at an arg block, the first
word of which is the request code. */
request_code = ((UInt*)(vg_threads[tid].m_eax))[0];
if (0) {
VG_(sprintf)(msg_buf, "request 0x%x", request_code );
print_sched_event(tid, msg_buf);
}
/* Do a non-trivial client request for thread tid. tid's
%EAX points to a short vector of argument words, the
first of which is the request code. The result of the
request is put in tid's %EDX. Alternatively, perhaps
the request causes tid to become non-runnable and/or
other blocked threads become runnable. In general we
can and often do mess with the state of arbitrary
threads at this point. */
if (request_code == VG_USERREQ__SHUTDOWN_VALGRIND) {
return VgSrc_Shutdown;
} else {
do_nontrivial_clientreq(tid);
}
break;
default:
VG_(printf)("\ntrc = %d\n", trc);
VG_(panic)("VG_(scheduler), phase 3: "
"unexpected thread return code");
/* NOTREACHED */
break;
} /* switch (trc) */
/* That completes Phase 3 of 3. Return now to the top of the
main scheduler loop, to Phase 1 of 3. */
} /* top-level scheduler loop */
/* NOTREACHED */
VG_(panic)("scheduler: post-main-loop ?!");
/* NOTREACHED */
debug_stop:
/* If we exited because of a debug stop, print the translation
of the last block executed -- by translating it again, and
throwing away the result. */
VG_(printf)(
"======vvvvvvvv====== LAST TRANSLATION ======vvvvvvvv======\n");
VG_(translate)( vg_threads[tid].m_eip, NULL, NULL, NULL );
VG_(printf)("\n");
VG_(printf)(
"======^^^^^^^^====== LAST TRANSLATION ======^^^^^^^^======\n");
return VgSrc_BbsDone;
}
/* ---------------------------------------------------------------------
The pthread implementation.
------------------------------------------------------------------ */
#include <pthread.h>
#include <errno.h>
#if !defined(PTHREAD_STACK_MIN)
# define PTHREAD_STACK_MIN (16384 - VG_AR_CLIENT_STACKBASE_REDZONE_SZB)
#endif
/* /usr/include/bits/pthreadtypes.h:
typedef unsigned long int pthread_t;
*/
static
void do_pthread_cancel ( ThreadId tid_canceller,
pthread_t tid_cancellee )
{
Char msg_buf[100];
   /* We want to make it appear that this thread has returned to
      VG_(pthreadreturn_bogusRA) with PTHREAD_CANCELED as the
      return value.  So: simple: put PTHREAD_CANCELED into %EAX
      and &VG_(pthreadreturn_bogusRA) into %EIP and keep going! */
if (VG_(clo_trace_sched)) {
VG_(sprintf)(msg_buf, "cancelled by %d", tid_canceller);
print_sched_event(tid_cancellee, msg_buf);
}
vg_threads[tid_cancellee].m_eax = (UInt)PTHREAD_CANCELED;
vg_threads[tid_cancellee].m_eip = (UInt)&VG_(pthreadreturn_bogusRA);
vg_threads[tid_cancellee].status = VgTs_Runnable;
}
/* Thread tid is exiting, by returning from the function it was
created with. Or possibly due to pthread_exit or cancellation.
The main complication here is to resume any thread waiting to join
with this one. */
static
void handle_pthread_return ( ThreadId tid, void* retval )
{
ThreadId jnr; /* joiner, the thread calling pthread_join. */
UInt* jnr_args;
void** jnr_thread_return;
Char msg_buf[100];
/* Mark it as not in use. Leave the stack in place so the next
user of this slot doesn't reallocate it. */
vg_assert(tid >= 0 && tid < VG_N_THREADS);
vg_assert(vg_threads[tid].status != VgTs_Empty);
vg_threads[tid].retval = retval;
if (vg_threads[tid].joiner == VG_INVALID_THREADID) {
/* No one has yet done a join on me */
vg_threads[tid].status = VgTs_WaitJoiner;
if (VG_(clo_trace_sched)) {
VG_(sprintf)(msg_buf,
"root fn returns, waiting for a call pthread_join(%d)",
tid);
print_sched_event(tid, msg_buf);
}
} else {
      /* Someone is waiting; make their join call return with success,
putting my exit code in the place specified by the caller's
thread_return param. This is all very horrible, since we
need to consult the joiner's arg block -- pointed to by its
%EAX -- in order to extract the 2nd param of its pthread_join
call. TODO: free properly the slot (also below).
*/
jnr = vg_threads[tid].joiner;
vg_assert(jnr >= 0 && jnr < VG_N_THREADS);
vg_assert(vg_threads[jnr].status == VgTs_WaitJoinee);
jnr_args = (UInt*)vg_threads[jnr].m_eax;
jnr_thread_return = (void**)(jnr_args[2]);
if (jnr_thread_return != NULL)
*jnr_thread_return = vg_threads[tid].retval;
vg_threads[jnr].m_edx = 0; /* success */
vg_threads[jnr].status = VgTs_Runnable;
vg_threads[tid].status = VgTs_Empty; /* bye! */
if (VG_(clo_instrument) && tid != 0)
VGM_(make_noaccess)( vg_threads[tid].stack_base,
vg_threads[tid].stack_size );
if (VG_(clo_trace_sched)) {
VG_(sprintf)(msg_buf,
"root fn returns, to find a waiting pthread_join(%d)", tid);
print_sched_event(tid, msg_buf);
VG_(sprintf)(msg_buf,
"my pthread_join(%d) returned; resuming", tid);
print_sched_event(jnr, msg_buf);
}
}
/* Return value is irrelevant; this thread will not get
rescheduled. */
}
static
void do_pthread_join ( ThreadId tid, ThreadId jee, void** thread_return )
{
Char msg_buf[100];
/* jee, the joinee, is the thread specified as an arg in thread
tid's call to pthread_join. So tid is the join-er. */
vg_assert(tid >= 0 && tid < VG_N_THREADS);
vg_assert(vg_threads[tid].status == VgTs_Runnable);
if (jee == tid) {
vg_threads[tid].m_edx = EDEADLK; /* libc constant, not a kernel one */
vg_threads[tid].status = VgTs_Runnable;
return;
}
if (jee < 0
|| jee >= VG_N_THREADS
|| vg_threads[jee].status == VgTs_Empty) {
/* Invalid thread to join to. */
vg_threads[tid].m_edx = EINVAL;
vg_threads[tid].status = VgTs_Runnable;
return;
}
if (vg_threads[jee].joiner != VG_INVALID_THREADID) {
/* Someone already did join on this thread */
vg_threads[tid].m_edx = EINVAL;
vg_threads[tid].status = VgTs_Runnable;
return;
}
/* if (vg_threads[jee].detached) ... */
/* Perhaps the joinee has already finished? If so return
immediately with its return code, and free up the slot. TODO:
free it properly (also above). */
if (vg_threads[jee].status == VgTs_WaitJoiner) {
vg_assert(vg_threads[jee].joiner == VG_INVALID_THREADID);
vg_threads[tid].m_edx = 0; /* success */
if (thread_return != NULL)
*thread_return = vg_threads[jee].retval;
vg_threads[tid].status = VgTs_Runnable;
vg_threads[jee].status = VgTs_Empty; /* bye! */
if (VG_(clo_instrument) && jee != 0)
VGM_(make_noaccess)( vg_threads[jee].stack_base,
vg_threads[jee].stack_size );
if (VG_(clo_trace_sched)) {
VG_(sprintf)(msg_buf,
"someone called pthread_join() on me; bye!");
print_sched_event(jee, msg_buf);
VG_(sprintf)(msg_buf,
"my pthread_join(%d) returned immediately",
jee );
print_sched_event(tid, msg_buf);
}
return;
}
/* Ok, so we'll have to wait on jee. */
vg_threads[jee].joiner = tid;
vg_threads[tid].status = VgTs_WaitJoinee;
if (VG_(clo_trace_sched)) {
VG_(sprintf)(msg_buf,
"blocking on call of pthread_join(%d)", jee );
print_sched_event(tid, msg_buf);
}
/* So tid's join call does not return just now. */
}
static
void do_pthread_create ( ThreadId parent_tid,
pthread_t* thread,
pthread_attr_t* attr,
void* (*start_routine)(void *),
void* arg )
{
Addr new_stack;
UInt new_stk_szb;
ThreadId tid;
Char msg_buf[100];
/* Paranoia ... */
vg_assert(sizeof(pthread_t) == sizeof(UInt));
vg_assert(vg_threads[parent_tid].status != VgTs_Empty);
tid = vg_alloc_ThreadState();
/* If we've created the main thread's tid, we're in deep trouble :) */
vg_assert(tid != 0);
/* Copy the parent's CPU state into the child's, in a roundabout
way (via baseBlock). */
VG_(load_thread_state)(parent_tid);
VG_(save_thread_state)(tid);
/* Consider allocating the child a stack, if the one it already has
is inadequate. */
new_stk_szb = PTHREAD_STACK_MIN;
if (new_stk_szb > vg_threads[tid].stack_size) {
/* Again, for good measure :) We definitely don't want to be
allocating a stack for the main thread. */
vg_assert(tid != 0);
/* for now, we don't handle the case of anything other than
assigning it for the first time. */
vg_assert(vg_threads[tid].stack_size == 0);
vg_assert(vg_threads[tid].stack_base == (Addr)NULL);
new_stack = (Addr)VG_(get_memory_from_mmap)( new_stk_szb );
vg_threads[tid].stack_base = new_stack;
vg_threads[tid].stack_size = new_stk_szb;
vg_threads[tid].m_esp
= new_stack + new_stk_szb
- VG_AR_CLIENT_STACKBASE_REDZONE_SZB;
}
if (VG_(clo_instrument))
VGM_(make_noaccess)( vg_threads[tid].m_esp,
VG_AR_CLIENT_STACKBASE_REDZONE_SZB );
/* push arg */
vg_threads[tid].m_esp -= 4;
* (UInt*)(vg_threads[tid].m_esp) = (UInt)arg;
/* push (magical) return address */
vg_threads[tid].m_esp -= 4;
* (UInt*)(vg_threads[tid].m_esp) = (UInt)VG_(pthreadreturn_bogusRA);
if (VG_(clo_instrument))
VGM_(make_readable)( vg_threads[tid].m_esp, 2 * 4 );
/* this is where we start */
vg_threads[tid].m_eip = (UInt)start_routine;
if (VG_(clo_trace_sched)) {
VG_(sprintf)(msg_buf,
"new thread, created by %d", parent_tid );
print_sched_event(tid, msg_buf);
}
/* store the thread id in *thread. */
// if (VG_(clo_instrument))
// ***** CHECK *thread is writable
*thread = (pthread_t)tid;
/* return zero */
vg_threads[tid].joiner = VG_INVALID_THREADID;
vg_threads[tid].status = VgTs_Runnable;
vg_threads[tid].m_edx = 0; /* success */
}
/* Horrible hacks to do with pthread_mutex_t: the real pthread_mutex_t
is a struct with at least 5 words:
typedef struct
{
int __m_reserved; -- Reserved for future use
int __m_count; -- Depth of recursive locking
_pthread_descr __m_owner; -- Owner thread (if recursive or errcheck)
int __m_kind; -- Mutex kind: fast, recursive or errcheck
struct _pthread_fastlock __m_lock; -- Underlying fast lock
} pthread_mutex_t;
Ours is just a single word, an index into vg_mutexes[].
For now I'll park it in the __m_reserved field.
Uninitialised mutexes (PTHREAD_MUTEX_INITIALIZER) all have
a zero __m_count field (see /usr/include/pthread.h). So I'll
use zero to mean non-inited, and 1 to mean inited.
How convenient.
*/
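/* So, concretely: a statically initialised mutex
   (PTHREAD_MUTEX_INITIALIZER) arrives here with __m_count == 0 and is
   lazily set up by initialise_mutex(), which stores the allocated
   MutexId in __m_reserved and sets __m_count to 1; destroy puts
   __m_count back to 0. */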
static
void initialise_mutex ( ThreadId tid, pthread_mutex_t *mutex )
{
MutexId mid;
Char msg_buf[100];
/* vg_alloc_MutexId aborts if we can't allocate a mutex, for
whatever reason. */
mid = vg_alloc_VgMutex();
vg_mutexes[mid].in_use = True;
vg_mutexes[mid].held = False;
vg_mutexes[mid].owner = VG_INVALID_THREADID; /* irrelevant */
mutex->__m_reserved = mid;
mutex->__m_count = 1; /* initialised */
if (VG_(clo_trace_pthread)) {
VG_(sprintf)(msg_buf, "(initialise mutex) (%p) -> %d",
mutex, mid );
print_pthread_event(tid, msg_buf);
}
}
/* Allocate a new MutexId and write it into *mutex. Ideally take
notice of the attributes in *mutexattr. */
static
void do_pthread_mutex_init ( ThreadId tid,
pthread_mutex_t *mutex,
const pthread_mutexattr_t *mutexattr)
{
Char msg_buf[100];
/* Paranoia ... */
vg_assert(sizeof(pthread_mutex_t) >= sizeof(UInt));
initialise_mutex(tid, mutex);
if (VG_(clo_trace_pthread)) {
VG_(sprintf)(msg_buf, "pthread_mutex_init (%p) -> %d",
mutex, mutex->__m_reserved );
print_pthread_event(tid, msg_buf);
}
/*
RETURN VALUE
pthread_mutex_init always returns 0. The other mutex functions
return 0 on success and a non-zero error code on error.
*/
/* THIS THREAD returns with 0. */
vg_threads[tid].m_edx = 0;
}
static
void do_pthread_mutex_lock( ThreadId tid, pthread_mutex_t *mutex )
{
MutexId mid;
Char msg_buf[100];
/* *mutex contains the MutexId, or one of the magic values
PTHREAD_*MUTEX_INITIALIZER*, indicating we need to initialise it
      now.  See comment(s) above re use of __m_count to indicate
initialisation status.
*/
/* POSIX doesn't mandate this, but for sanity ... */
if (mutex == NULL) {
vg_threads[tid].m_edx = EINVAL;
return;
}
if (mutex->__m_count == 0) {
initialise_mutex(tid, mutex);
}
mid = mutex->__m_reserved;
if (mid < 0 || mid >= VG_N_MUTEXES || !vg_mutexes[mid].in_use) {
vg_threads[tid].m_edx = EINVAL;
return;
}
if (VG_(clo_trace_pthread)) {
VG_(sprintf)(msg_buf, "pthread_mutex_lock %d (%p)",
mid, mutex );
print_pthread_event(tid, msg_buf);
}
/* Assert initialised. */
vg_assert(mutex->__m_count == 1);
/* Assume tid valid. */
vg_assert(vg_threads[tid].status == VgTs_Runnable);
if (vg_mutexes[mid].held) {
if (vg_mutexes[mid].owner == tid) {
vg_threads[tid].m_edx = EDEADLK;
return;
}
/* Someone else has it; we have to wait. */
vg_threads[tid].status = VgTs_WaitMX;
vg_threads[tid].waited_on_mid = mid;
/* No assignment to %EDX, since we're blocking. */
if (VG_(clo_trace_pthread)) {
VG_(sprintf)(msg_buf, "pthread_mutex_lock %d (%p): BLOCK",
mid, mutex );
print_pthread_event(tid, msg_buf);
}
} else {
/* We get it! */
vg_mutexes[mid].held = True;
vg_mutexes[mid].owner = tid;
/* return 0 (success). */
vg_threads[tid].m_edx = 0;
}
}
static
void do_pthread_mutex_unlock ( ThreadId tid,
pthread_mutex_t *mutex )
{
MutexId mid;
Int i;
Char msg_buf[100];
if (mutex == NULL
|| mutex->__m_count != 1) {
vg_threads[tid].m_edx = EINVAL;
return;
}
mid = mutex->__m_reserved;
if (mid < 0 || mid >= VG_N_MUTEXES || !vg_mutexes[mid].in_use) {
vg_threads[tid].m_edx = EINVAL;
return;
}
if (VG_(clo_trace_pthread)) {
VG_(sprintf)(msg_buf, "pthread_mutex_unlock %d (%p)",
mid, mutex );
print_pthread_event(tid, msg_buf);
}
/* Assume tid valid */
vg_assert(vg_threads[tid].status == VgTs_Runnable);
/* Barf if we don't currently hold the mutex. */
if (!vg_mutexes[mid].held || vg_mutexes[mid].owner != tid) {
vg_threads[tid].m_edx = EPERM;
return;
}
/* Find some arbitrary thread waiting on this mutex, and make it
runnable. If none are waiting, mark the mutex as not held. */
for (i = 0; i < VG_N_THREADS; i++) {
if (vg_threads[i].status == VgTs_Empty)
continue;
if (vg_threads[i].status == VgTs_WaitMX
&& vg_threads[i].waited_on_mid == mid)
break;
}
vg_assert(i <= VG_N_THREADS);
if (i == VG_N_THREADS) {
/* Nobody else is waiting on it. */
vg_mutexes[mid].held = False;
} else {
/* Notionally transfer the hold to thread i, whose
pthread_mutex_lock() call now returns with 0 (success). */
vg_mutexes[mid].owner = i;
vg_threads[i].status = VgTs_Runnable;
vg_threads[i].m_edx = 0; /* pth_lock() success */
if (VG_(clo_trace_pthread)) {
VG_(sprintf)(msg_buf, "pthread_mutex_lock %d: RESUME",
mid );
print_pthread_event(tid, msg_buf);
}
}
/* In either case, our (tid's) pth_unlock() returns with 0
(success). */
vg_threads[tid].m_edx = 0; /* Success. */
}
static void do_pthread_mutex_destroy ( ThreadId tid,
pthread_mutex_t *mutex )
{
MutexId mid;
Char msg_buf[100];
if (mutex == NULL
|| mutex->__m_count != 1) {
vg_threads[tid].m_edx = EINVAL;
return;
}
mid = mutex->__m_reserved;
if (mid < 0 || mid >= VG_N_MUTEXES || !vg_mutexes[mid].in_use) {
vg_threads[tid].m_edx = EINVAL;
return;
}
if (VG_(clo_trace_pthread)) {
VG_(sprintf)(msg_buf, "pthread_mutex_destroy %d (%p)",
mid, mutex );
print_pthread_event(tid, msg_buf);
}
/* Assume tid valid */
vg_assert(vg_threads[tid].status == VgTs_Runnable);
/* Barf if the mutex is currently held. */
if (vg_mutexes[mid].held) {
vg_threads[tid].m_edx = EBUSY;
return;
}
mutex->__m_count = 0; /* uninitialised */
vg_mutexes[mid].in_use = False;
vg_threads[tid].m_edx = 0;
}
/* vthread tid is returning from a signal handler; modify its
stack/regs accordingly. */
static
void handle_signal_return ( ThreadId tid )
{
Char msg_buf[100];
Bool restart_blocked_syscalls = VG_(signal_returns)(tid);
if (restart_blocked_syscalls)
/* Easy; we don't have to do anything. */
return;
if (vg_threads[tid].status == VgTs_WaitFD) {
vg_assert(vg_threads[tid].m_eax == __NR_read
|| vg_threads[tid].m_eax == __NR_write);
/* read() or write() interrupted. Force a return with EINTR. */
vg_threads[tid].m_eax = -VKI_EINTR;
vg_threads[tid].status = VgTs_Runnable;
if (VG_(clo_trace_sched)) {
VG_(sprintf)(msg_buf,
"read() / write() interrupted by signal; return EINTR" );
print_sched_event(tid, msg_buf);
}
return;
}
   if (vg_threads[tid].status == VgTs_Sleeping) {
vg_assert(vg_threads[tid].m_eax == __NR_nanosleep);
/* We interrupted a nanosleep(). The right thing to do is to
write the unused time to nanosleep's second param and return
EINTR, but I'm too lazy for that. */
return;
}
/* All other cases? Just return. */
}
/* ---------------------------------------------------------------------
Handle non-trivial client requests.
------------------------------------------------------------------ */
static
void do_nontrivial_clientreq ( ThreadId tid )
{
UInt* arg = (UInt*)(vg_threads[tid].m_eax);
UInt req_no = arg[0];
switch (req_no) {
case VG_USERREQ__PTHREAD_CREATE:
do_pthread_create( tid,
(pthread_t*)arg[1],
(pthread_attr_t*)arg[2],
(void*(*)(void*))arg[3],
(void*)arg[4] );
break;
case VG_USERREQ__PTHREAD_RETURNS:
handle_pthread_return( tid, (void*)arg[1] );
break;
case VG_USERREQ__PTHREAD_JOIN:
do_pthread_join( tid, arg[1], (void**)(arg[2]) );
break;
/* Sigh ... this probably will cause huge numbers of major
(expensive) scheduling events, for no real reason.
Perhaps should be classified as a trivial-request. */
case VG_USERREQ__PTHREAD_GET_THREADID:
vg_threads[tid].m_edx = tid;
break;
case VG_USERREQ__PTHREAD_MUTEX_INIT:
do_pthread_mutex_init( tid,
(pthread_mutex_t *)(arg[1]),
(pthread_mutexattr_t *)(arg[2]) );
break;
case VG_USERREQ__PTHREAD_MUTEX_LOCK:
do_pthread_mutex_lock( tid, (pthread_mutex_t *)(arg[1]) );
break;
case VG_USERREQ__PTHREAD_MUTEX_UNLOCK:
do_pthread_mutex_unlock( tid, (pthread_mutex_t *)(arg[1]) );
break;
case VG_USERREQ__PTHREAD_MUTEX_DESTROY:
do_pthread_mutex_destroy( tid, (pthread_mutex_t *)(arg[1]) );
break;
case VG_USERREQ__PTHREAD_CANCEL:
do_pthread_cancel( tid, (pthread_t)(arg[1]) );
break;
case VG_USERREQ__MAKE_NOACCESS:
case VG_USERREQ__MAKE_WRITABLE:
case VG_USERREQ__MAKE_READABLE:
case VG_USERREQ__DISCARD:
case VG_USERREQ__CHECK_WRITABLE:
case VG_USERREQ__CHECK_READABLE:
case VG_USERREQ__MAKE_NOACCESS_STACK:
case VG_USERREQ__RUNNING_ON_VALGRIND:
case VG_USERREQ__DO_LEAK_CHECK:
vg_threads[tid].m_edx
= VG_(handle_client_request) ( &vg_threads[tid], arg );
break;
case VG_USERREQ__SIGNAL_RETURNS:
handle_signal_return(tid);
break;
default:
VG_(printf)("panic'd on private request = 0x%x\n", arg[0] );
VG_(panic)("handle_private_client_pthread_request: "
"unknown request");
/*NOTREACHED*/
break;
}
}
/*--------------------------------------------------------------------*/
/*--- end vg_scheduler.c ---*/
/*--------------------------------------------------------------------*/