Various upgrades, with the effect that Mozilla now runs, although
it has tremendous performance problems.

* Implement pthread_key_{create,delete} and pthread_{set,get}specific.
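
  In the libpthread replacement these are forwarded to the scheduler via
  client requests, which keep the values in a per-thread specifics[]
  array.  A minimal client program exercising the new calls (just a test
  sketch, not part of this change) would be:

     #include <assert.h>
     #include <pthread.h>
     #include <stdio.h>

     static pthread_key_t key;

     static void* worker ( void* arg )
     {
        /* each thread stores, then reads back, its own value for 'key' */
        assert(pthread_setspecific(key, arg) == 0);
        printf("thread sees %ld\n", (long)pthread_getspecific(key));
        return NULL;
     }

     int main ( void )
     {
        pthread_t t1, t2;
        assert(pthread_key_create(&key, NULL) == 0);
        pthread_create(&t1, NULL, worker, (void*)1L);
        pthread_create(&t2, NULL, worker, (void*)2L);
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);
        assert(pthread_key_delete(key) == 0);
        return 0;
     }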

* Implement pthread_cond_timedwait.  A nuisance.
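
  The wrapper's main job is to convert the caller's absolute timespec
  deadline into a point on Valgrind's internal millisecond timer, which
  is the only clock the scheduler understands.  A sketch of just that
  conversion (the helper name is made up; the real wrapper also checks
  it is running on Valgrind and asserts the deadline is not already in
  the past):

     #include <sys/time.h>
     #include <time.h>

     /* Given the scheduler's current millisecond reading (ms_now) and an
        absolute CLOCK_REALTIME deadline, compute the value handed to
        VG_USERREQ__PTHREAD_COND_TIMEDWAIT. */
     static unsigned int deadline_on_vg_timer ( unsigned int ms_now,
                                                const struct timespec* abstime )
     {
        struct timeval now;
        unsigned long long now_ms, end_ms;
        gettimeofday(&now, NULL);
        now_ms = 1000ULL * now.tv_sec     + now.tv_usec / 1000;
        end_ms = 1000ULL * abstime->tv_sec + abstime->tv_nsec / 1000000;
        /* remaining wallclock interval, added onto the internal counter */
        return ms_now + (unsigned int)(end_ms - now_ms);
     }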

* New timer infrastructure, based on the RDTSC instruction.  This
  allows fast, accurate time measurement without swamping the host with
  gettimeofday() syscalls.
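
  The scheme: calibrate the free-running cycle counter against
  gettimeofday() once, over a roughly 20 ms window at startup, and from
  then on turn raw RDTSC readings into elapsed milliseconds with one
  subtraction and one division.  It assumes the TSC ticks at a constant
  rate.  A standalone sketch of the same idea (not the vg_mylibc.c code
  itself):

     #include <stdio.h>
     #include <sys/time.h>
     #include <unistd.h>

     static unsigned long long rdtsc ( void )
     {
        unsigned long long x;
        __asm__ volatile ("rdtsc" : "=A" (x));   /* edx:eax on 32-bit x86 */
        return x;
     }

     int main ( void )
     {
        struct timeval tv0, tv1;
        unsigned long long t0, t1, ticks, usecs, ticks_per_ms;

        t0 = rdtsc();  gettimeofday(&tv0, NULL);
        usleep(20 * 1000);                       /* calibration window */
        t1 = rdtsc();  gettimeofday(&tv1, NULL);

        ticks = t1 - t0;
        usecs = 1000000ULL * (tv1.tv_sec - tv0.tv_sec)
                + (tv1.tv_usec - tv0.tv_usec);
        ticks_per_ms = ticks / (usecs / 1000);

        /* thereafter: elapsed ms == (rdtsc() - t1) / ticks_per_ms */
        printf("approx %llu ticks/ms (%llu MHz)\n",
               ticks_per_ms, ticks_per_ms / 1000);
        return 0;
     }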

There's something definitely screwy about the scheduler, making Opera
run slowly and Mozilla run unbelievably slowly.  To be investigated.


git-svn-id: svn://svn.valgrind.org/valgrind/trunk@119 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/coregrind/vg_include.h b/coregrind/vg_include.h
index 13e3f01..4ae3850 100644
--- a/coregrind/vg_include.h
+++ b/coregrind/vg_include.h
@@ -102,7 +102,7 @@
 
 /* These many bytes below %ESP are considered addressible if we're
    doing the --workaround-gcc296-bugs hack. */
-#define VG_GCC296_BUG_STACK_SLOP 256
+#define VG_GCC296_BUG_STACK_SLOP /*256*/ 1024
 
 /* The maximum number of calls we're prepared to save in a
    backtrace. */
@@ -123,10 +123,14 @@
 
 /* The maximum number of pthreads that we support.  This is
    deliberately not very high since our implementation of some of the
-   scheduler algorithms is surely O(N^2) in the number of threads,
-   since that's simple, at least.  And (in practice) we hope that most
+   scheduler algorithms is surely O(N) in the number of threads, since
+   that's simple, at least.  And (in practice) we hope that most
    programs do not need many threads. */
-#define VG_N_THREADS 20
+#define VG_N_THREADS 100
+
+/* Maximum number of pthread keys available.  Again, we start low until
+   the need for a higher number presents itself. */
+#define VG_N_THREAD_KEYS 10
 
 /* Number of file descriptors that can simultaneously be waited on for
    I/O to complete.  Perhaps this should be the same as VG_N_THREADS
@@ -403,8 +407,15 @@
 #define VG_USERREQ__PTHREAD_CANCEL          0x3007
 #define VG_USERREQ__PTHREAD_EXIT            0x3008
 #define VG_USERREQ__PTHREAD_COND_WAIT       0x3009
-#define VG_USERREQ__PTHREAD_COND_SIGNAL     0x300A
-#define VG_USERREQ__PTHREAD_COND_BROADCAST  0x300B
+#define VG_USERREQ__PTHREAD_COND_TIMEDWAIT  0x300A
+#define VG_USERREQ__PTHREAD_COND_SIGNAL     0x300B
+#define VG_USERREQ__PTHREAD_COND_BROADCAST  0x300C
+#define VG_USERREQ__PTHREAD_KEY_CREATE      0x300D
+#define VG_USERREQ__PTHREAD_KEY_DELETE      0x300E
+#define VG_USERREQ__PTHREAD_SETSPECIFIC     0x300F
+#define VG_USERREQ__PTHREAD_GETSPECIFIC     0x3010
+
+#define VG_USERREQ__READ_MILLISECOND_TIMER  0x4001
 
 /* Cosmetic ... */
 #define VG_USERREQ__GET_PTHREAD_TRACE_LEVEL 0x3101
@@ -466,7 +477,12 @@
          ALWAYS == the index in vg_threads[]. */
       ThreadId tid;
 
-      /* Current scheduling status. */
+      /* Current scheduling status. 
+
+         Complications: whenever this is set to VgTs_WaitMX, you
+         should also set .m_edx to whatever the required return value
+         is for pthread_mutex_lock / pthread_cond_timedwait for when
+         the mutex finally gets unblocked. */
       ThreadStatus status;
 
       /* Identity of joiner (thread who called join on me), or
@@ -483,12 +499,21 @@
          waiting for.  In all other cases, should be NULL. */
       void* /* pthread_cond_t* */ associated_cv;
 
-      /* If VgTs_Sleeping, this is when we should wake up. */
-      ULong awaken_at;
+      /* If VgTs_Sleeping, this is when we should wake up, measured in
+         milliseconds as supplied by VG_(read_millisecond_counter). 
+ 
+         If VgTs_WaitCV, this indicates the time at which
+         pthread_cond_timedwait should wake up.  If == 0xFFFFFFFF,
+         this means infinitely far in the future, viz,
+         pthread_cond_wait. */
+      UInt awaken_at;
 
       /* return value */
       void* retval;
 
+      /* thread-specific data */
+      void* specifics[VG_N_THREAD_KEYS];
+
       /* Stacks.  When a thread slot is freed, we don't deallocate its
          stack; we just leave it lying around for the next use of the
          slot.  If the next use of the slot requires a larger stack,
@@ -662,7 +687,10 @@
 
 extern Char* VG_(getenv) ( Char* name );
 extern Int   VG_(getpid) ( void );
-extern ULong VG_(read_microsecond_timer)( void );
+
+extern void VG_(start_rdtsc_calibration) ( void );
+extern void VG_(end_rdtsc_calibration) ( void );
+extern UInt VG_(read_millisecond_timer) ( void );
 
 
 extern Char VG_(toupper) ( Char c );
diff --git a/coregrind/vg_libpthread.c b/coregrind/vg_libpthread.c
index fb70ec6..76923a3 100644
--- a/coregrind/vg_libpthread.c
+++ b/coregrind/vg_libpthread.c
@@ -134,6 +134,8 @@
 #include <pthread.h>
 #include <stdio.h>
 #include <errno.h>
+#include <assert.h>
+#include <sys/time.h> /* gettimeofday */
 
 /* ---------------------------------------------------
    THREAD ATTRIBUTES
@@ -421,6 +423,40 @@
    return res;
 }
 
+int pthread_cond_timedwait ( pthread_cond_t *cond, 
+                             pthread_mutex_t *mutex, 
+                             const struct  timespec *abstime )
+{
+   int res;
+   unsigned int ms_now, ms_end;
+   struct  timeval timeval_now;
+   unsigned long long int ull_ms_now_after_1970;
+   unsigned long long int ull_ms_end_after_1970;
+
+   ensure_valgrind("pthread_cond_timedwait");
+   VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */,
+                           VG_USERREQ__READ_MILLISECOND_TIMER,
+                           0, 0, 0, 0);
+   assert(ms_now != 0xFFFFFFFF);
+   res = gettimeofday(&timeval_now, NULL);
+   assert(res == 0);
+
+   ull_ms_now_after_1970
+      = 1000ULL * ((unsigned long long int)(timeval_now.tv_sec))
+        + ((unsigned long long int)(timeval_now.tv_usec / 1000));
+   ull_ms_end_after_1970
+      = 1000ULL * ((unsigned long long int)(abstime->tv_sec))
+        + ((unsigned long long int)(abstime->tv_nsec / 1000000));
+   assert(ull_ms_end_after_1970 >= ull_ms_now_after_1970);
+   ms_end 
+      = ms_now + (unsigned int)(ull_ms_end_after_1970 - ull_ms_now_after_1970);
+   VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
+                           VG_USERREQ__PTHREAD_COND_TIMEDWAIT,
+                           cond, mutex, ms_end, 0);
+   return res;
+}
+
+
 int pthread_cond_signal(pthread_cond_t *cond)
 {
    int res;
@@ -471,8 +507,12 @@
 int pthread_key_create(pthread_key_t *key,  
                        void  (*destr_function)  (void *))
 {
-   ignored("pthread_key_create");
-   return 0;
+   int res;
+   ensure_valgrind("pthread_key_create");
+   VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
+                           VG_USERREQ__PTHREAD_KEY_CREATE,
+                           key, destr_function, 0, 0);
+   return res;
 }
 
 int pthread_key_delete(pthread_key_t key)
@@ -483,14 +523,22 @@
 
 int pthread_setspecific(pthread_key_t key, const void *pointer)
 {
-   ignored("pthread_setspecific");
-   return 0;
+   int res;
+   ensure_valgrind("pthread_setspecific");
+   VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
+                           VG_USERREQ__PTHREAD_SETSPECIFIC,
+                           key, pointer, 0, 0);
+   return res;
 }
 
 void * pthread_getspecific(pthread_key_t key)
 {
-   ignored("pthread_setspecific");
-   return NULL;
+   int res;
+   ensure_valgrind("pthread_getspecific");
+   VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
+                           VG_USERREQ__PTHREAD_GETSPECIFIC,
+                           key, 0, 0, 0);
+   return (void*)res;
 }
 
 
@@ -784,7 +832,6 @@
    * (unchecked) libc error numbers (EINTR etc) are the negation of the
      kernel's error numbers (VKI_EINTR etc).
 */
-#include <assert.h>
 
 
 int select ( int n, 
@@ -793,16 +840,19 @@
              fd_set *xfds, 
              struct timeval *timeout )
 {
+   unsigned int ms_now, ms_end;
    int    res;
    fd_set rfds_copy;
    fd_set wfds_copy;
    fd_set xfds_copy;
    struct vki_timeval  t_now;
-   struct vki_timeval  t_end;
    struct vki_timeval  zero_timeout;
    struct vki_timespec nanosleep_interval;
 
-   ensure_valgrind("select");
+   /* gcc complains about ms_end being used uninitialised -- classic
+      case it can't understand, where ms_end is both defined and used
+      only if timeout != NULL.  Hence ... */
+   ms_end = 0;
 
    /* We assume that the kernel and libc data layouts are identical
       for the following types.  These asserts provide a crude
@@ -811,8 +861,17 @@
        || sizeof(struct timeval) != sizeof(struct vki_timeval))
       barf("valgrind's hacky non-blocking select(): data sizes error");
 
-   /* If a zero timeout specified, this call is harmless. */
-   if (timeout && timeout->tv_sec == 0 && timeout->tv_usec == 0) {
+   /* Detect the current time and simultaneously find out if we are
+      running on Valgrind. */
+   VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */,
+                           VG_USERREQ__READ_MILLISECOND_TIMER,
+                           0, 0, 0, 0);
+
+   /* If a zero timeout specified, this call is harmless.  Also go
+      this route if we're not running on Valgrind, for whatever
+      reason. */
+   if ( (timeout && timeout->tv_sec == 0 && timeout->tv_usec == 0)
+        || (ms_now == 0xFFFFFFFF) ) {
       res = do_syscall_select( n, (vki_fd_set*)rfds, 
                                    (vki_fd_set*)wfds, 
                                    (vki_fd_set*)xfds, 
@@ -825,35 +884,29 @@
       }
    }
 
-   /* If a timeout was specified, set t_end to be the end wallclock
-      time. */
+   /* If a timeout was specified, set ms_end to be the end millisecond
+      counter [wallclock] time. */
    if (timeout) {
       res = my_do_syscall2(__NR_gettimeofday, (int)&t_now, (int)NULL);
       assert(res == 0);
-      t_end = t_now;
-      t_end.tv_usec += timeout->tv_usec;
-      t_end.tv_sec  += timeout->tv_sec;
-      if (t_end.tv_usec >= 1000000) {
-         t_end.tv_usec -= 1000000;
-         t_end.tv_sec += 1;
-      }
+      ms_end = ms_now;
+      ms_end += (timeout->tv_usec / 1000);
+      ms_end += (timeout->tv_sec * 1000);
       /* Stay sane ... */
-      assert (t_end.tv_sec > t_now.tv_sec
-              || (t_end.tv_sec == t_now.tv_sec 
-                  && t_end.tv_usec >= t_now.tv_usec));
+      assert (ms_end >= ms_now);
    }
 
    /* fprintf(stderr, "MY_SELECT: before loop\n"); */
 
    /* Either timeout == NULL, meaning wait indefinitely, or timeout !=
-      NULL, in which case t_end holds the end time. */
+      NULL, in which case ms_end holds the end time. */
    while (1) {
       if (timeout) {
-         res = my_do_syscall2(__NR_gettimeofday, (int)&t_now, (int)NULL);
-         assert(res == 0);
-         if (t_now.tv_sec > t_end.tv_sec
-             || (t_now.tv_sec == t_end.tv_sec 
-                 && t_now.tv_usec > t_end.tv_usec)) {
+         VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */,
+                                 VG_USERREQ__READ_MILLISECOND_TIMER,
+                                 0, 0, 0, 0);
+         assert(ms_now != 0xFFFFFFFF);
+         if (ms_now >= ms_end) {
             /* timeout; nothing interesting happened. */
             if (rfds) FD_ZERO(rfds);
             if (wfds) FD_ZERO(wfds);
@@ -892,7 +945,7 @@
       /* fprintf(stderr, "MY_SELECT: nanosleep\n"); */
       /* nanosleep and go round again */
       nanosleep_interval.tv_sec  = 0;
-      nanosleep_interval.tv_nsec = 75 * 1000 * 1000; /* 75 milliseconds */
+      nanosleep_interval.tv_nsec = 100 * 1000 * 1000; /* 100 milliseconds */
       /* It's critical here that valgrind's nanosleep implementation
          is nonblocking. */
       (void)my_do_syscall2(__NR_nanosleep, 
@@ -907,19 +960,28 @@
 
 int poll (struct pollfd *__fds, nfds_t __nfds, int __timeout)
 {
+   unsigned int        ms_now, ms_end;
    int                 res, i;
-   struct vki_timeval  t_now;
-   struct vki_timeval  t_end;
    struct vki_timespec nanosleep_interval;
 
    ensure_valgrind("poll");
 
+   /* Detect the current time and simultaneously find out if we are
+      running on Valgrind. */
+   VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */,
+                           VG_USERREQ__READ_MILLISECOND_TIMER,
+                           0, 0, 0, 0);
+
    if (/* CHECK SIZES FOR struct pollfd */
        sizeof(struct timeval) != sizeof(struct vki_timeval))
       barf("valgrind's hacky non-blocking poll(): data sizes error");
 
-   /* If a zero timeout specified, this call is harmless. */
-   if (__timeout == 0) {
+   /* dummy initialisation to keep gcc -Wall happy */
+   ms_end = 0;
+
+   /* If a zero timeout specified, this call is harmless.  Also do
+      this if not running on Valgrind. */
+   if (__timeout == 0 || ms_now == 0xFFFFFFFF) {
       res = my_do_syscall3(__NR_poll, (int)__fds, __nfds, __timeout);
       if (is_kerror(res)) {
          * (__errno_location()) = -res;
@@ -929,36 +991,25 @@
       }
    }
 
-   /* If a timeout was specified, set t_end to be the end wallclock
-      time. */
+   /* If a timeout was specified, set ms_end to be the end wallclock
+      time.  Easy considering that __timeout is in milliseconds. */
    if (__timeout > 0) {
-      res = my_do_syscall2(__NR_gettimeofday, (int)&t_now, (int)NULL);
-      assert(res == 0);
-      t_end = t_now;
-      t_end.tv_usec += 1000 * (__timeout % 1000);
-      t_end.tv_sec  += (__timeout / 1000);
-      if (t_end.tv_usec >= 1000000) {
-         t_end.tv_usec -= 1000000;
-         t_end.tv_sec += 1;
-      }
-      /* Stay sane ... */
-      assert (t_end.tv_sec > t_now.tv_sec
-              || (t_end.tv_sec == t_now.tv_sec 
-                  && t_end.tv_usec >= t_now.tv_usec));
+      ms_end = ms_now + (unsigned int)__timeout;
    }
 
    /* fprintf(stderr, "MY_POLL: before loop\n"); */
 
    /* Either timeout < 0, meaning wait indefinitely, or timeout > 0,
       in which case t_end holds the end time. */
+   assert(__timeout != 0);
+
    while (1) {
-      assert(__timeout != 0);
       if (__timeout > 0) {
-         res = my_do_syscall2(__NR_gettimeofday, (int)&t_now, (int)NULL);
-         assert(res == 0);
-         if (t_now.tv_sec > t_end.tv_sec
-             || (t_now.tv_sec == t_end.tv_sec 
-                 && t_now.tv_usec > t_end.tv_usec)) {
+         VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */,
+                                 VG_USERREQ__READ_MILLISECOND_TIMER,
+                                 0, 0, 0, 0);
+         assert(ms_now != 0xFFFFFFFF);
+         if (ms_now >= ms_end) {
             /* timeout; nothing interesting happened. */
             for (i = 0; i < __nfds; i++) 
                __fds[i].revents = 0;
@@ -966,8 +1017,7 @@
          }
       }
 
-      /* These could be trashed each time round the loop, so restore
-         them each time. */
+      /* Do a return-immediately poll. */
       res = my_do_syscall3(__NR_poll, (int)__fds, __nfds, 0 );
       if (is_kerror(res)) {
          /* Some kind of error.  Set errno and return.  */
@@ -981,7 +1031,7 @@
       /* fprintf(stderr, "MY_POLL: nanosleep\n"); */
       /* nanosleep and go round again */
       nanosleep_interval.tv_sec  = 0;
-      nanosleep_interval.tv_nsec = 100 * 1000 * 1000; /* 100 milliseconds */
+      nanosleep_interval.tv_nsec = 99 * 1000 * 1000; /* 99 milliseconds */
       /* It's critical here that valgrind's nanosleep implementation
          is nonblocking. */
       (void)my_do_syscall2(__NR_nanosleep, 
diff --git a/coregrind/vg_main.c b/coregrind/vg_main.c
index 47ea5be..5f7fe59 100644
--- a/coregrind/vg_main.c
+++ b/coregrind/vg_main.c
@@ -971,6 +971,9 @@
    VGP_(init_profiling)();
 #  endif
 
+   /* Start calibration of our RDTSC-based clock. */
+   VG_(start_rdtsc_calibration)();
+
    /* Hook to delay things long enough so we can get the pid and
       attach GDB in another shell. */
    /* {extern unsigned int sleep(unsigned int seconds); sleep(10);} */
@@ -984,6 +987,10 @@
       VGP_POPCC;
    }
 
+   /* End calibration of our RDTSC-based clock, leaving it as long as
+      we can. */
+   VG_(end_rdtsc_calibration)();
+
    /* This should come after init_memory_audit; otherwise the latter
       carefully sets up the permissions maps to cover the anonymous
       mmaps for the translation table and translation cache, which
diff --git a/coregrind/vg_mylibc.c b/coregrind/vg_mylibc.c
index a728f42..740b21e 100644
--- a/coregrind/vg_mylibc.c
+++ b/coregrind/vg_mylibc.c
@@ -291,13 +291,14 @@
    return res;
 }
 
-/* Returns -1 on error, but 0 if ok or interrupted. */
+/* Returns -1 on error, 0 if ok, 1 if interrupted. */
 Int VG_(nanosleep)( const struct vki_timespec *req, 
                     struct vki_timespec *rem )
 {
    Int res;
    res = vg_do_syscall2(__NR_nanosleep, (UInt)req, (UInt)rem);
    if (res == -VKI_EINVAL) return -1;
+   if (res == -VKI_EINTR)  return 1;
    return 0;
 }
 
@@ -936,17 +937,6 @@
    return res;
 }
 
-/* Read a notional elapsed (wallclock-time) timer, giving a 64-bit
-   microseconds count. */
-ULong VG_(read_microsecond_timer)( void )
-{
-   Int                res;
-   struct vki_timeval tv;
-   res = vg_do_syscall2(__NR_gettimeofday, (UInt)&tv, (UInt)NULL);
-   vg_assert(!VG_(is_kerror)(res));
-   return (1000000ULL * (ULong)(tv.tv_sec)) + (ULong)(tv.tv_usec);
-}
-
 /* Return -1 if error, else 0.  NOTE does not indicate return code of
    child! */
 Int VG_(system) ( Char* cmd )
@@ -982,6 +972,129 @@
 
 
 /* ---------------------------------------------------------------------
+   Support for a millisecond-granularity counter using RDTSC.
+   ------------------------------------------------------------------ */
+
+static __inline__ ULong do_rdtsc_insn ( void )
+{
+   ULong x;
+   __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
+   return x;
+}
+
+/* 0 = pre-calibration, 1 = calibration, 2 = running */
+static Int   rdtsc_calibration_state     = 0;
+static ULong rdtsc_ticks_per_millisecond = 0; /* invalid value */
+
+static struct vki_timeval rdtsc_cal_start_timeval;
+static struct vki_timeval rdtsc_cal_end_timeval;
+
+static ULong              rdtsc_cal_start_raw;
+static ULong              rdtsc_cal_end_raw;
+
+UInt VG_(read_millisecond_timer) ( void )
+{
+   ULong rdtsc_now;
+   vg_assert(rdtsc_calibration_state == 2);
+   rdtsc_now = do_rdtsc_insn();
+   vg_assert(rdtsc_now > rdtsc_cal_end_raw);
+   rdtsc_now -= rdtsc_cal_end_raw;
+   rdtsc_now /= rdtsc_ticks_per_millisecond;
+   return (UInt)rdtsc_now;
+}
+
+
+void VG_(start_rdtsc_calibration) ( void )
+{
+   Int res;
+   vg_assert(rdtsc_calibration_state == 0);
+   rdtsc_calibration_state = 1;
+   rdtsc_cal_start_raw = do_rdtsc_insn();
+   res = vg_do_syscall2(__NR_gettimeofday, (UInt)&rdtsc_cal_start_timeval, 
+                                           (UInt)NULL);
+   vg_assert(!VG_(is_kerror)(res));
+}
+
+void VG_(end_rdtsc_calibration) ( void )
+{
+   Int   res, loops;
+   ULong cpu_clock_MHZ;
+   ULong cal_clock_ticks;
+   ULong cal_wallclock_microseconds;
+   ULong wallclock_start_microseconds;
+   ULong wallclock_end_microseconds;
+   struct vki_timespec req;
+   struct vki_timespec rem;
+   
+   vg_assert(rdtsc_calibration_state == 1);
+   rdtsc_calibration_state = 2;
+
+   /* Try and delay for 20 milliseconds, so that we can at least have
+      some minimum level of accuracy. */
+   req.tv_sec = 0;
+   req.tv_nsec = 20 * 1000 * 1000;
+   loops = 0;
+   while (True) {
+      res = VG_(nanosleep)(&req, &rem);
+      vg_assert(res == 0 /*ok*/ || res == 1 /*interrupted*/);
+      if (res == 0)
+         break;
+      if (rem.tv_sec == 0 && rem.tv_nsec == 0) 
+         break;
+      req = rem;
+      loops++;
+      if (loops > 100) 
+         VG_(panic)("calibration nanosleep loop failed?!");
+   }
+
+   /* Now read both timers, and do the Math. */
+   rdtsc_cal_end_raw = do_rdtsc_insn();
+   res = vg_do_syscall2(__NR_gettimeofday, (UInt)&rdtsc_cal_end_timeval, 
+                                           (UInt)NULL);
+
+   vg_assert(rdtsc_cal_end_raw > rdtsc_cal_start_raw);
+   cal_clock_ticks = rdtsc_cal_end_raw - rdtsc_cal_start_raw;
+
+   wallclock_start_microseconds
+      = (1000000ULL * (ULong)(rdtsc_cal_start_timeval.tv_sec)) 
+         + (ULong)(rdtsc_cal_start_timeval.tv_usec);
+   wallclock_end_microseconds
+      = (1000000ULL * (ULong)(rdtsc_cal_end_timeval.tv_sec)) 
+         + (ULong)(rdtsc_cal_end_timeval.tv_usec);
+   vg_assert(wallclock_end_microseconds > wallclock_start_microseconds);
+   cal_wallclock_microseconds 
+      = wallclock_end_microseconds - wallclock_start_microseconds;
+
+   /* Since we just nanoslept for 20 ms ... */
+   vg_assert(cal_wallclock_microseconds >= 20000);
+
+   /* Now we know (roughly) that cal_clock_ticks on RDTSC take
+      cal_wallclock_microseconds elapsed time.  Calculate the RDTSC
+      ticks-per-millisecond value. */
+   if (0)
+      VG_(printf)("%lld ticks in %lld microseconds\n", 
+                  cal_clock_ticks,  cal_wallclock_microseconds );
+
+   rdtsc_ticks_per_millisecond   
+      = cal_clock_ticks / (cal_wallclock_microseconds / 1000ULL);
+   cpu_clock_MHZ
+      = (1000ULL * rdtsc_ticks_per_millisecond) / 1000000ULL;
+   if (VG_(clo_verbosity) >= 1)
+      VG_(message)(Vg_UserMsg, "Estimated CPU clock rate is %d MHz",
+                               (UInt)cpu_clock_MHZ);
+   if (cpu_clock_MHZ < 100 || cpu_clock_MHZ > 10000)
+      VG_(panic)("end_rdtsc_calibration: "
+                 "estimated CPU MHz outside range 100 .. 10000");
+   /* Paranoia about division by zero later. */
+   vg_assert(rdtsc_ticks_per_millisecond != 0);
+   if (0)
+      VG_(printf)("ticks per millisecond %llu\n", 
+                  rdtsc_ticks_per_millisecond);
+}
+
+
+
+/* ---------------------------------------------------------------------
    Primitive support for bagging memory via mmap.
    ------------------------------------------------------------------ */
 
diff --git a/coregrind/vg_scheduler.c b/coregrind/vg_scheduler.c
index 68dbf19..32201b9 100644
--- a/coregrind/vg_scheduler.c
+++ b/coregrind/vg_scheduler.c
@@ -119,7 +119,26 @@
 static VgWaitedOnFd vg_waiting_fds[VG_N_WAITING_FDS];
 
 
+/* Keeping track of keys. */
+typedef
+   struct {
+      /* Has this key been allocated ? */
+      Bool inuse;
+      /* If .inuse==True, records the address of the associated
+         destructor, or NULL if none. */
+      void (*destructor)(void*);
+   }
+   ThreadKeyState;
+
+/* And our array of thread keys. */
+static ThreadKeyState vg_thread_keys[VG_N_THREAD_KEYS];
+
+typedef UInt ThreadKey;
+
+
 /* Forwards */
+static void do_pthread_cond_timedwait_TIMEOUT ( ThreadId tid );
+
 static void do_nontrivial_clientreq ( ThreadId tid );
 
 static void scheduler_sanity ( void );
@@ -511,6 +530,11 @@
    for (i = 0; i < VG_N_WAITING_FDS; i++)
       vg_waiting_fds[i].fd = -1; /* not in use */
 
+   for (i = 0; i < VG_N_THREAD_KEYS; i++) {
+      vg_thread_keys[i].inuse      = False;
+      vg_thread_keys[i].destructor = NULL;
+   }
+
    /* Assert this is thread zero, which has certain magic
       properties. */
    tid_main = vg_alloc_ThreadState();
@@ -523,6 +547,8 @@
    vg_threads[tid_main].retval        = NULL; /* not important */
    vg_threads[tid_main].stack_highest_word 
       = vg_threads[tid_main].m_esp /* -4  ??? */;
+   for (i = 0; i < VG_N_THREAD_KEYS; i++)
+      vg_threads[tid_main].specifics[i] = NULL;
 
    /* Copy VG_(baseBlock) state to tid_main's slot. */
    vg_tid_currently_in_baseBlock = tid_main;
@@ -618,13 +644,16 @@
             (UInt)VG_(client_memalign) ( tst, arg[1], arg[2] )
          );
 
-      /* These are heavily used. */
+      /* These are heavily used -- or at least we want them to be
+         cheap. */
       case VG_USERREQ__PTHREAD_GET_THREADID:
          SIMPLE_RETURN(tid);
       case VG_USERREQ__RUNNING_ON_VALGRIND:
          SIMPLE_RETURN(1);
       case VG_USERREQ__GET_PTHREAD_TRACE_LEVEL:
          SIMPLE_RETURN(VG_(clo_trace_pthread_level));
+      case VG_USERREQ__READ_MILLISECOND_TIMER:
+         SIMPLE_RETURN(VG_(read_millisecond_timer)());
 
       default:
          /* Too hard; wimp out. */
@@ -692,18 +721,18 @@
    syscall_no = vg_threads[tid].m_eax; /* syscall number */
 
    if (syscall_no == __NR_nanosleep) {
-      ULong t_now, t_awaken;
+      UInt t_now, t_awaken;
       struct vki_timespec* req;
       req = (struct vki_timespec*)vg_threads[tid].m_ebx; /* arg1 */
-      t_now = VG_(read_microsecond_timer)();     
+      t_now = VG_(read_millisecond_timer)();     
       t_awaken 
          = t_now
-           + (ULong)1000000ULL * (ULong)(req->tv_sec) 
-           + (ULong)( (UInt)(req->tv_nsec) / 1000 );
+           + (UInt)1000ULL * (UInt)(req->tv_sec) 
+           + (UInt)(req->tv_nsec) / 1000000;
       vg_threads[tid].status    = VgTs_Sleeping;
       vg_threads[tid].awaken_at = t_awaken;
       if (VG_(clo_trace_sched)) {
-         VG_(sprintf)(msg_buf, "at %lu: nanosleep for %lu", 
+         VG_(sprintf)(msg_buf, "at %d: nanosleep for %d", 
                                t_now, t_awaken-t_now);
 	 print_sched_event(tid, msg_buf);
       }
@@ -820,16 +849,16 @@
    Char               msg_buf[100];
 
    struct vki_timespec* rem;
-   ULong                t_now;
+   UInt                 t_now;
 
    /* Awaken any sleeping threads whose sleep has expired. */
    for (tid = 1; tid < VG_N_THREADS; tid++)
      if (vg_threads[tid].status == VgTs_Sleeping)
         break;
 
-   /* Avoid pointless calls to VG_(read_microsecond_timer). */
+   /* Avoid pointless calls to VG_(read_millisecond_timer). */
    if (tid < VG_N_THREADS) {
-      t_now = VG_(read_microsecond_timer)();
+      t_now = VG_(read_millisecond_timer)();
       for (tid = 1; tid < VG_N_THREADS; tid++) {
          if (vg_threads[tid].status != VgTs_Sleeping)
             continue;
@@ -848,7 +877,7 @@
 	    /* Reschedule this thread. */
             vg_threads[tid].status = VgTs_Runnable;
             if (VG_(clo_trace_sched)) {
-               VG_(sprintf)(msg_buf, "at %lu: nanosleep done", 
+               VG_(sprintf)(msg_buf, "at %d: nanosleep done", 
                                      t_now);
                print_sched_event(tid, msg_buf);
             }
@@ -1005,16 +1034,30 @@
 
 
 static
+void check_for_pthread_cond_timedwait ( void )
+{
+   Int i;
+   for (i = 1; i < VG_N_THREADS; i++) {
+      if (vg_threads[i].status != VgTs_WaitCV)
+         continue;
+      if (vg_threads[i].awaken_at == 0xFFFFFFFF /* no timeout */)
+         continue;
+      if (VG_(read_millisecond_timer)() >= vg_threads[i].awaken_at)
+         do_pthread_cond_timedwait_TIMEOUT(i);
+   }
+}
+
+
+static
 void nanosleep_for_a_while ( void )
 {
    Int res;
    struct vki_timespec req;
    struct vki_timespec rem;
    req.tv_sec = 0;
-   req.tv_nsec = 20 * 1000 * 1000;
+   req.tv_nsec = 50 * 1000 * 1000;
    res = VG_(nanosleep)( &req, &rem );   
-   /* VG_(printf)("after ns, unused = %d\n", rem.tv_nsec ); */
-   vg_assert(res == 0);
+   vg_assert(res == 0 /* ok */ || res == 1 /* interrupted by signal */);
 }
 
 
@@ -1079,6 +1122,7 @@
             threads. */
          poll_for_ready_fds();
          complete_blocked_syscalls();
+         check_for_pthread_cond_timedwait();
 
          /* See if there are any signals which need to be delivered.  If
             so, choose thread(s) to deliver them to, and build signal
@@ -1527,6 +1571,7 @@
                          void* (*start_routine)(void *), 
                          void* arg )
 {
+   Int      i;
    Addr     new_stack;
    UInt     new_stk_szb;
    ThreadId tid;
@@ -1607,6 +1652,9 @@
    vg_threads[tid].joiner        = VG_INVALID_THREADID;
    vg_threads[tid].status        = VgTs_Runnable;
 
+   for (i = 0; i < VG_N_THREAD_KEYS; i++)
+      vg_threads[tid].specifics[i] = NULL;
+
    /* return zero */
    vg_threads[tid].m_edx  = 0; /* success */
 }
@@ -1691,7 +1739,7 @@
       mutex->__m_owner = (_pthread_descr)i;
       vg_threads[i].status        = VgTs_Runnable;
       vg_threads[i].associated_mx = NULL;
-      vg_threads[i].m_edx         = 0; /* pth_lock() success */
+      /* m_edx already holds pth_mx_lock() success (0) */
 
       if (VG_(clo_trace_pthread_level) >= 1) {
          VG_(sprintf)(msg_buf, "%s       mx %p: RESUME", 
@@ -1773,7 +1821,7 @@
          } else {
             vg_threads[tid].status        = VgTs_WaitMX;
             vg_threads[tid].associated_mx = mutex;
-            /* No assignment to %EDX, since we're blocking. */
+            vg_threads[tid].m_edx         = 0; /* pth_mx_lock success value */
             if (VG_(clo_trace_pthread_level) >= 1) {
                VG_(sprintf)(msg_buf, "%s    mx %p: BLOCK", 
                                      caller, mutex );
@@ -1890,6 +1938,56 @@
    don't need to think too hard there.  */
 
 
+static 
+void do_pthread_cond_timedwait_TIMEOUT ( ThreadId tid )
+{
+   Char             msg_buf[100];
+   pthread_mutex_t* mx;
+   pthread_cond_t*  cv;
+
+   vg_assert(is_valid_tid(tid) 
+             && vg_threads[tid].status == VgTs_WaitCV
+             && vg_threads[tid].awaken_at != 0xFFFFFFFF);
+   mx = vg_threads[tid].associated_mx;
+   vg_assert(mx != NULL);
+   cv = vg_threads[tid].associated_cv;
+   vg_assert(cv != NULL);
+
+   if (mx->__m_owner == VG_INVALID_THREADID) {
+      /* Currently unheld; hand it out to thread tid. */
+      vg_assert(mx->__m_count == 0);
+      vg_threads[tid].status        = VgTs_Runnable;
+      vg_threads[tid].m_edx         = ETIMEDOUT;
+                                      /* pthread_cond_timedwait return value */
+      vg_threads[tid].associated_cv = NULL;
+      vg_threads[tid].associated_mx = NULL;
+      mx->__m_owner = (_pthread_descr)tid;
+      mx->__m_count = 1;
+
+      if (VG_(clo_trace_pthread_level) >= 1) {
+         VG_(sprintf)(msg_buf, "pthread_cond_timedwait cv %p: TIMEOUT with mx %p", 
+                               cv, mx );
+         print_pthread_event(tid, msg_buf);
+      }
+   } else {
+      /* Currently held.  Make thread tid be blocked on it. */
+      vg_assert(mx->__m_count > 0);
+      vg_threads[tid].status        = VgTs_WaitMX;
+      vg_threads[tid].m_edx         = ETIMEDOUT;
+                                      /* pthread_cond_timedwait return value */
+      vg_threads[tid].associated_cv = NULL;
+      vg_threads[tid].associated_mx = mx;
+      if (VG_(clo_trace_pthread_level) >= 1) {
+         VG_(sprintf)(msg_buf, 
+            "pthread_cond_timedwait cv %p: TIMEOUT -> BLOCK for mx %p", 
+            cv, mx );
+         print_pthread_event(tid, msg_buf);
+      }
+
+   }
+}
+
+
 static
 void release_N_threads_waiting_on_cond ( pthread_cond_t* cond, 
                                          Int n_to_release, 
@@ -1920,8 +2018,6 @@
 
       mx = vg_threads[i].associated_mx;
       vg_assert(mx != NULL);
-      vg_assert(mx->__m_count > 0);
-      vg_assert(is_valid_tid((ThreadId)mx->__m_owner));
 
       if (mx->__m_owner == VG_INVALID_THREADID) {
          /* Currently unheld; hand it out to thread i. */
@@ -1931,7 +2027,7 @@
          vg_threads[i].associated_mx = NULL;
          mx->__m_owner = (_pthread_descr)i;
          mx->__m_count = 1;
-         vg_threads[i].m_edx = 0; /* pthread_cond_wait returns success */
+         /* .m_edx already holds pth_cond_wait success value (0) */
 
          if (VG_(clo_trace_pthread_level) >= 1) {
             VG_(sprintf)(msg_buf, "%s   cv %p: RESUME with mx %p", 
@@ -1941,9 +2037,11 @@
 
       } else {
          /* Currently held.  Make thread i be blocked on it. */
+         vg_assert(mx->__m_count > 0);
          vg_threads[i].status        = VgTs_WaitMX;
          vg_threads[i].associated_cv = NULL;
          vg_threads[i].associated_mx = mx;
+         vg_threads[i].m_edx         = 0; /* pth_cond_wait success value */
 
          if (VG_(clo_trace_pthread_level) >= 1) {
             VG_(sprintf)(msg_buf, "%s   cv %p: BLOCK for mx %p", 
@@ -1961,14 +2059,18 @@
 static
 void do_pthread_cond_wait ( ThreadId tid,
                             pthread_cond_t *cond, 
-                            pthread_mutex_t *mutex )
+                            pthread_mutex_t *mutex,
+                            UInt ms_end )
 {
    Char msg_buf[100];
 
+   /* If ms_end == 0xFFFFFFFF, wait forever (no timeout).  Otherwise,
+      ms_end is the ending millisecond. */
+
    /* pre: mutex should be a valid mutex and owned by tid. */
    if (VG_(clo_trace_pthread_level) >= 2) {
-      VG_(sprintf)(msg_buf, "pthread_cond_wait        cv %p, mx %p ...", 
-                            cond, mutex );
+      VG_(sprintf)(msg_buf, "pthread_cond_wait        cv %p, mx %p, end %d ...", 
+                            cond, mutex, ms_end );
       print_pthread_event(tid, msg_buf);
    }
 
@@ -2007,6 +2109,7 @@
    vg_threads[tid].status        = VgTs_WaitCV;
    vg_threads[tid].associated_cv = cond;
    vg_threads[tid].associated_mx = mutex;
+   vg_threads[tid].awaken_at     = ms_end;
 
    if (VG_(clo_trace_pthread_level) >= 1) {
       VG_(sprintf)(msg_buf, 
@@ -2055,6 +2158,133 @@
 }
 
 
+/* -----------------------------------------------------------
+   THREAD SPECIFIC DATA
+   -------------------------------------------------------- */
+
+static __inline__
+Bool is_valid_key ( ThreadKey k )
+{
+   /* k unsigned; hence no < 0 check */
+   if (k >= VG_N_THREAD_KEYS) return False;
+   if (!vg_thread_keys[k].inuse) return False;
+   return True;
+}
+
+static
+void do_pthread_key_create ( ThreadId tid,
+                             pthread_key_t* key,
+                             void (*destructor)(void*) )
+{
+   Int  i;
+   Char msg_buf[100];
+
+   if (VG_(clo_trace_pthread_level) >= 1) {
+      VG_(sprintf)(msg_buf, "pthread_key_create      *key %p, destr %p", 
+                            key, destructor );
+      print_pthread_event(tid, msg_buf);
+   }
+
+   vg_assert(sizeof(pthread_key_t) == sizeof(ThreadKey));
+   vg_assert(is_valid_tid(tid) 
+             && vg_threads[tid].status == VgTs_Runnable);
+
+   for (i = 0; i < VG_N_THREAD_KEYS; i++)
+      if (!vg_thread_keys[i].inuse)   
+         break;
+
+   if (i == VG_N_THREAD_KEYS) {
+      /* vg_threads[tid].m_edx = EAGAIN; 
+         return; 
+      */
+      VG_(panic)("pthread_key_create: VG_N_THREAD_KEYS is too low;"
+                 " increase and recompile");
+   }
+
+   vg_thread_keys[i].inuse = True;
+   /* TODO: check key for addressibility */
+   *key = i;
+   vg_threads[tid].m_edx = 0;
+}
+
+
+static
+void do_pthread_key_delete ( ThreadId tid, pthread_key_t key )
+{
+   Char msg_buf[100];
+   if (VG_(clo_trace_pthread_level) >= 1) {
+      VG_(sprintf)(msg_buf, "pthread_key_delete       key %d", 
+                            key );
+      print_pthread_event(tid, msg_buf);
+   }
+
+   vg_assert(is_valid_tid(tid) 
+             && vg_threads[tid].status == VgTs_Runnable);
+   
+   if (!is_valid_key(key)) {
+      vg_threads[tid].m_edx = EINVAL;
+      return;
+   }
+
+   vg_thread_keys[key].inuse = False;
+
+   /* Optional.  We're not required to do this, although it shouldn't
+      make any difference to programs which use the key/specifics
+      functions correctly.  */
+   for (tid = 1; tid < VG_N_THREADS; tid++) {
+      if (vg_threads[tid].status != VgTs_Empty)
+         vg_threads[tid].specifics[key] = NULL;
+   }
+}
+
+
+static 
+void do_pthread_getspecific ( ThreadId tid, pthread_key_t key )
+{
+   Char msg_buf[100];
+   if (VG_(clo_trace_pthread_level) >= 1) {
+      VG_(sprintf)(msg_buf, "pthread_getspecific      key %d", 
+                            key );
+      print_pthread_event(tid, msg_buf);
+   }
+
+   vg_assert(is_valid_tid(tid) 
+             && vg_threads[tid].status == VgTs_Runnable);
+
+   if (!is_valid_key(key)) {
+      vg_threads[tid].m_edx = (UInt)NULL;
+      return;
+   }
+
+   vg_threads[tid].m_edx = (UInt)vg_threads[tid].specifics[key];
+}
+
+
+static
+void do_pthread_setspecific ( ThreadId tid, 
+                              pthread_key_t key, 
+                              void *pointer )
+{
+   Char msg_buf[100];
+   if (VG_(clo_trace_pthread_level) >= 1) {
+      VG_(sprintf)(msg_buf, "pthread_setspecific      key %d, ptr %p", 
+                            key, pointer );
+      print_pthread_event(tid, msg_buf);
+   }
+
+   vg_assert(is_valid_tid(tid) 
+             && vg_threads[tid].status == VgTs_Runnable);
+
+   if (!is_valid_key(key)) {
+      vg_threads[tid].m_edx = EINVAL;
+      return;
+   }
+
+   vg_threads[tid].specifics[key] = pointer;
+   vg_threads[tid].m_edx = 0;
+}
+
+
 /* ---------------------------------------------------------------------
    Handle non-trivial client requests.
    ------------------------------------------------------------------ */
@@ -2105,7 +2335,15 @@
       case VG_USERREQ__PTHREAD_COND_WAIT:
          do_pthread_cond_wait( tid, 
                                (pthread_cond_t *)(arg[1]),
-                               (pthread_mutex_t *)(arg[2]) );
+                               (pthread_mutex_t *)(arg[2]),
+                               0xFFFFFFFF /* no timeout */ );
+         break;
+
+      case VG_USERREQ__PTHREAD_COND_TIMEDWAIT:
+         do_pthread_cond_wait( tid, 
+                               (pthread_cond_t *)(arg[1]),
+                               (pthread_mutex_t *)(arg[2]),
+                               arg[3] /* timeout millisecond point */ );
          break;
 
       case VG_USERREQ__PTHREAD_COND_SIGNAL:
@@ -2122,6 +2360,28 @@
             (pthread_cond_t *)(arg[1]) );
          break;
 
+      case VG_USERREQ__PTHREAD_KEY_CREATE:
+         do_pthread_key_create ( tid, 
+                                 (pthread_key_t*)(arg[1]),
+                                 (void(*)(void*))(arg[2]) );
+         break;
+
+      case VG_USERREQ__PTHREAD_KEY_DELETE:
+         do_pthread_key_delete ( tid, 
+                                 (pthread_key_t)(arg[1]) );
+         break;
+
+      case VG_USERREQ__PTHREAD_GETSPECIFIC:
+         do_pthread_getspecific ( tid, 
+                                  (pthread_key_t)(arg[1]) );
+         break;
+
+      case VG_USERREQ__PTHREAD_SETSPECIFIC:
+         do_pthread_setspecific ( tid, 
+                                  (pthread_key_t)(arg[1]),
+                                  (void*)(arg[2]) );
+         break;
+
       case VG_USERREQ__MAKE_NOACCESS:
       case VG_USERREQ__MAKE_WRITABLE:
       case VG_USERREQ__MAKE_READABLE:
@@ -2160,6 +2420,7 @@
    pthread_mutex_t* mx;
    pthread_cond_t*  cv;
    Int              i;
+
    /* VG_(printf)("scheduler_sanity\n"); */
    for (i = 1; i < VG_N_THREADS; i++) {
       mx = vg_threads[i].associated_mx;
@@ -2190,6 +2451,11 @@
          /* vg_assert(mx == NULL); */
       }
    }
+
+   for (i = 0; i < VG_N_THREAD_KEYS; i++) {
+      if (!vg_thread_keys[i].inuse)
+         vg_assert(vg_thread_keys[i].destructor == NULL);
+   }
 }