A complete reworking of Valgrind's handling of system calls and signals,
with the aim of making it more robust, more correct and perhaps faster.

This patch removes the need to poll blocking syscalls, by adding a proxy
LWP for each application thread.  This LWP is a kernel thread whose job
is to run all (potentially) blocking syscalls, and also to handle signals.

This allows the kernel to do more of the work of dealing with signals,
so on kernels which do this properly (2.6), Valgrind's behaviour is a
lot more POSIX-compliant.  On base 2.4 kernels, we emulate some of the
missing 2.6 functionality.


git-svn-id: svn://svn.valgrind.org/valgrind/trunk@1918 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/coregrind/vg_mylibc.c b/coregrind/vg_mylibc.c
index d0dcb57..811818f 100644
--- a/coregrind/vg_mylibc.c
+++ b/coregrind/vg_mylibc.c
@@ -32,101 +32,6 @@
 
 #include "vg_include.h"
 
-
-
-/* ---------------------------------------------------------------------
-   Really Actually DO system calls.
-   ------------------------------------------------------------------ */
-
-/* Ripped off from /usr/include/asm/unistd.h. */
-
-static
-UInt vg_do_syscall0 ( UInt syscallno )
-{ 
-   UInt __res;
-   __asm__ volatile ("int $0x80"
-                     : "=a" (__res)
-                     : "0" (syscallno) );
-   return __res;
-}
-
-
-static
-UInt vg_do_syscall1 ( UInt syscallno, UInt arg1 )
-{ 
-   UInt __res;
-   __asm__ volatile ("int $0x80"
-                     : "=a" (__res)
-                     : "0" (syscallno),
-                       "b" (arg1) );
-   return __res;
-}
-
-
-static
-UInt vg_do_syscall2 ( UInt syscallno, 
-                      UInt arg1, UInt arg2 )
-{ 
-   UInt __res;
-   __asm__ volatile ("int $0x80"
-                     : "=a" (__res)
-                     : "0" (syscallno),
-                       "b" (arg1),
-                       "c" (arg2) );
-   return __res;
-}
-
-
-static
-UInt vg_do_syscall3 ( UInt syscallno, 
-                      UInt arg1, UInt arg2, UInt arg3 )
-{ 
-   UInt __res;
-   __asm__ volatile ("int $0x80"
-                     : "=a" (__res)
-                     : "0" (syscallno),
-                       "b" (arg1),
-                       "c" (arg2),
-                       "d" (arg3) );
-   return __res;
-}
-
-
-static
-UInt vg_do_syscall4 ( UInt syscallno, 
-                      UInt arg1, UInt arg2, UInt arg3, UInt arg4 )
-{ 
-   UInt __res;
-   __asm__ volatile ("int $0x80"
-                     : "=a" (__res)
-                     : "0" (syscallno),
-                       "b" (arg1),
-                       "c" (arg2),
-                       "d" (arg3),
-                       "S" (arg4) );
-   return __res;
-}
-
-
-#if 0
-static
-UInt vg_do_syscall5 ( UInt syscallno, 
-                      UInt arg1, UInt arg2, UInt arg3, UInt arg4, 
-                      UInt arg5 )
-{ 
-   UInt __res;
-   __asm__ volatile ("int $0x80"
-                     : "=a" (__res)
-                     : "0" (syscallno),
-                       "b" (arg1),
-                       "c" (arg2),
-                       "d" (arg3),
-                       "S" (arg4),
-                       "D" (arg5) );
-   return __res;
-}
-#endif
-
 /* ---------------------------------------------------------------------
    Wrappers around system calls, and other stuff, to do with signals.
    ------------------------------------------------------------------ */
@@ -236,9 +141,9 @@
                        vki_ksigset_t* oldset)
 {
    Int res 
-      = vg_do_syscall4(__NR_rt_sigprocmask, 
-                       how, (UInt)set, (UInt)oldset, 
-                       VKI_KNSIG_WORDS * VKI_BYTES_PER_WORD);
+      = VG_(do_syscall)(__NR_rt_sigprocmask, 
+			how, (UInt)set, (UInt)oldset, 
+			VKI_KNSIG_WORDS * VKI_BYTES_PER_WORD);
    return VG_(is_kerror)(res) ? -1 : 0;
 }
 
@@ -248,9 +153,9 @@
                       vki_ksigaction* oldact)
 {
    Int res
-     = vg_do_syscall4(__NR_rt_sigaction,
-                      signum, (UInt)act, (UInt)oldact, 
-                      VKI_KNSIG_WORDS * VKI_BYTES_PER_WORD);
+     = VG_(do_syscall)(__NR_rt_sigaction,
+		       signum, (UInt)act, (UInt)oldact, 
+		       VKI_KNSIG_WORDS * VKI_BYTES_PER_WORD);
    /* VG_(printf)("res = %d\n",res); */
    return VG_(is_kerror)(res) ? -1 : 0;
 }
@@ -259,10 +164,17 @@
 Int VG_(ksigaltstack)( const vki_kstack_t* ss, vki_kstack_t* oss )
 {
    Int res
-     = vg_do_syscall2(__NR_sigaltstack, (UInt)ss, (UInt)oss);
+     = VG_(do_syscall)(__NR_sigaltstack, (UInt)ss, (UInt)oss);
    return VG_(is_kerror)(res) ? -1 : 0;
 }
 
+Int VG_(ksigtimedwait)( const vki_ksigset_t *set, vki_ksiginfo_t *info, 
+			const struct vki_timespec *timeout )
+{
+   Int res = VG_(do_syscall)(__NR_rt_sigtimedwait, set, info, timeout, sizeof(*set));
+
+   return VG_(is_kerror)(res) ? -1 : res;
+}
  
 Int VG_(ksignal)(Int signum, void (*sighandler)(Int))
 {
@@ -273,26 +185,60 @@
    sa.ksa_restorer = NULL;
    res = VG_(ksigemptyset)( &sa.ksa_mask );
    vg_assert(res == 0);
-   res = vg_do_syscall4(__NR_rt_sigaction,
-                        signum, (UInt)(&sa), (UInt)NULL,
-                        VKI_KNSIG_WORDS * VKI_BYTES_PER_WORD);
+   res = VG_(do_syscall)(__NR_rt_sigaction,
+			 signum, (UInt)(&sa), (UInt)NULL,
+			 VKI_KNSIG_WORDS * VKI_BYTES_PER_WORD);
    return VG_(is_kerror)(res) ? -1 : 0;
 }
 
 
 Int VG_(kkill)( Int pid, Int signo )
 {
-   Int res = vg_do_syscall2(__NR_kill, pid, signo);
+   Int res = VG_(do_syscall)(__NR_kill, pid, signo);
    return VG_(is_kerror)(res) ? -1 : 0;
 }
 
 
+Int VG_(ktkill)( Int tid, Int signo )
+{
+   Int ret = -VKI_ENOSYS;
+
+#ifdef __NR_tkill
+   ret = VG_(do_syscall)(__NR_tkill, tid, signo);
+#endif /* __NR_tkill */
+
+   if (ret == -VKI_ENOSYS)
+      ret = VG_(do_syscall)(__NR_kill, tid, signo);
+
+   return VG_(is_kerror)(ret) ? -1 : 0;
+}
+
 Int VG_(ksigpending) ( vki_ksigset_t* set )
 {
-   Int res = vg_do_syscall1(__NR_sigpending, (UInt)set);
+   Int res = VG_(do_syscall)(__NR_sigpending, (UInt)set);
    return VG_(is_kerror)(res) ? -1 : 0;
 }
 
+Int VG_(waitpid)(Int pid, Int *status, Int options)
+{
+   Int ret = VG_(do_syscall)(__NR_wait4, pid, status, options, NULL);
+
+   return VG_(is_kerror)(ret) ? -1 : ret;
+}
+
+Int VG_(gettid)(void)
+{
+   Int ret;
+
+   ret = VG_(do_syscall)(__NR_gettid);
+
+   if (ret == -VKI_ENOSYS)
+      ret = VG_(do_syscall)(__NR_getpid);
+
+   return ret;
+}
+
+
 
 /* ---------------------------------------------------------------------
    mmap/munmap, exit, fcntl
@@ -310,29 +256,31 @@
    args[3] = flags;
    args[4] = fd;
    args[5] = offset;
-   res = vg_do_syscall1(__NR_mmap, (UInt)(&(args[0])) );
+   res = VG_(do_syscall)(__NR_mmap, (UInt)(&(args[0])) );
    return VG_(is_kerror)(res) ? ((void*)(-1)) : (void*)res;
 }
 
 /* Returns -1 on failure. */
 Int VG_(munmap)( void* start, Int length )
 {
-   Int res = vg_do_syscall2(__NR_munmap, (UInt)start, (UInt)length );
+   Int res = VG_(do_syscall)(__NR_munmap, (UInt)start, (UInt)length );
    return VG_(is_kerror)(res) ? -1 : 0;
 }
 
 void VG_(exit)( Int status )
 {
-   (void)vg_do_syscall1(__NR_exit, (UInt)status );
+   (void)VG_(do_syscall)(__NR_exit_group, (UInt)status );
+   (void)VG_(do_syscall)(__NR_exit, (UInt)status );
    /* Why are we still alive here? */
    /*NOTREACHED*/
+   *(volatile Int *)0 = 'x';
    vg_assert(2+2 == 5);
 }
 
 /* Returns -1 on error. */
 Int VG_(fcntl) ( Int fd, Int cmd, Int arg )
 {
-   Int res = vg_do_syscall3(__NR_fcntl, fd, cmd, arg);
+   Int res = VG_(do_syscall)(__NR_fcntl, fd, cmd, arg);
    return VG_(is_kerror)(res) ? -1 : res;
 }
 
@@ -350,16 +298,23 @@
    args[2] = (UInt)writefds;
    args[3] = (UInt)exceptfds;
    args[4] = (UInt)timeout;
-   res = vg_do_syscall1(__NR_select, (UInt)(&(args[0])) );
+   res = VG_(do_syscall)(__NR_select, (UInt)(&(args[0])) );
    return VG_(is_kerror)(res) ? -1 : res;
 }
 
+Int VG_(poll)( struct vki_pollfd *ufds, UInt nfds, Int timeout)
+{
+   Int res = VG_(do_syscall)(__NR_poll, ufds, nfds, timeout);
+
+   return res;
+}
+
 /* Returns -1 on error, 0 if ok, 1 if interrupted. */
 Int VG_(nanosleep)( const struct vki_timespec *req, 
                     struct vki_timespec *rem )
 {
    Int res;
-   res = vg_do_syscall2(__NR_nanosleep, (UInt)req, (UInt)rem);
+   res = VG_(do_syscall)(__NR_nanosleep, (UInt)req, (UInt)rem);
    if (res == -VKI_EINVAL) return -1;
    if (res == -VKI_EINTR)  return 1;
    return 0;
@@ -368,7 +323,7 @@
 void* VG_(brk) ( void* end_data_segment )
 {
    Int res;
-   res = vg_do_syscall1(__NR_brk, (UInt)end_data_segment);
+   res = VG_(do_syscall)(__NR_brk, (UInt)end_data_segment);
    return (void*)(  VG_(is_kerror)(res) ? -1 : res  );
 }
 
@@ -1089,9 +1044,35 @@
    Assertery.
    ------------------------------------------------------------------ */
 
+/* Fake up an ExeContext which is of our actual real CPU state, so we
+   can print a stack trace.  This isn't terribly useful in the case
+   where we were killed by a signal, since we just get a backtrace
+   into the signal handler.  Also, it could be somewhat risky if we
+   actually got the panic/exception within the execontext/stack
+   dump/symtab code.  But it's better than nothing. */
+static inline ExeContext *get_real_execontext(Addr ret)
+{
+   ExeContext *ec;
+   Addr ebp;
+   Addr stacktop;
+   Addr esp = (Addr)&esp;
+
+   asm("movl %%ebp, %0" : "=r" (ebp));
+   stacktop = (Addr)&VG_(stack)[VG_STACK_SIZE_W];
+   if (esp >= (Addr)&VG_(sigstack)[0] && esp < (Addr)&VG_(sigstack)[VG_STACK_SIZE_W])
+      stacktop = (Addr)&VG_(sigstack)[VG_STACK_SIZE_W];
+      
+   ec = VG_(get_ExeContext2)(ret, ebp, esp, stacktop);
+
+   return ec;
+}
+
 __attribute__ ((noreturn))
 static void report_and_quit ( const Char* report )
 {
+   ExeContext *ec = get_real_execontext((Addr)__builtin_return_address(0));
+   VG_(pp_ExeContext)(ec);
+   
    VG_(pp_sched_status)();
    VG_(printf)("\n");
    VG_(printf)("Note: see also the FAQ.txt in the source distribution.\n");
@@ -1152,6 +1133,30 @@
    Primitive support for reading files.
    ------------------------------------------------------------------ */
 
+static inline Bool fd_exists(Int fd)
+{
+   struct vki_stat st;
+
+   return VG_(fstat)(fd, &st) == 0;
+}
+
+/* Move an fd into the Valgrind-safe range */
+Int VG_(safe_fd)(Int oldfd)
+{
+   Int newfd;
+
+   newfd = VG_(fcntl)(oldfd, VKI_F_DUPFD, VG_MAX_FD+1);
+   if (newfd != -1)
+      VG_(close)(oldfd);
+
+   VG_(fcntl)(newfd, VKI_F_SETFD, VKI_FD_CLOEXEC);
+
+   vg_assert(newfd > VG_MAX_FD);
+   return newfd;
+}
+
+
+
 /* Returns -1 on failure. */
 Int VG_(open) ( const Char* pathname, Int flags, Int mode )
 {  
@@ -1164,15 +1169,21 @@
    /* fd = open( pathname, O_RDONLY ); */
    /* ... so we go direct to the horse's mouth, which seems to work
       ok: */
-   fd = vg_do_syscall3(__NR_open, (UInt)pathname, flags, mode);
+   fd = VG_(do_syscall)(__NR_open, (UInt)pathname, flags, mode);
    /* VG_(printf)("result = %d\n", fd); */
    if (VG_(is_kerror)(fd)) fd = -1;
    return fd;
 }
 
+Int VG_(pipe) ( Int fd[2] )
+{
+   Int ret = VG_(do_syscall)(__NR_pipe, fd);
+   return VG_(is_kerror)(ret) ? -1 : 0;
+}
+
 void VG_(close) ( Int fd )
 {
-   vg_do_syscall1(__NR_close, fd);
+   VG_(do_syscall)(__NR_close, fd);
 }
 
 
@@ -1180,38 +1191,52 @@
 {
    Int res;
    /* res = read( fd, buf, count ); */
-   res = vg_do_syscall3(__NR_read, fd, (UInt)buf, count);
-   if (VG_(is_kerror)(res)) res = -1;
+   res = VG_(do_syscall)(__NR_read, fd, (UInt)buf, count);
+   /* return -ERRNO on error */
    return res;
 }
 
-Int VG_(write) ( Int fd, void* buf, Int count)
+Int VG_(write) ( Int fd, const void* buf, Int count)
 {
    Int res;
    /* res = write( fd, buf, count ); */
-   res = vg_do_syscall3(__NR_write, fd, (UInt)buf, count);
-   if (VG_(is_kerror)(res)) res = -1;
+   res = VG_(do_syscall)(__NR_write, fd, (UInt)buf, count);
+   /* return -ERRNO on error */
    return res;
 }
 
 Int VG_(stat) ( Char* file_name, struct vki_stat* buf )
 {
    Int res;
-   res = vg_do_syscall2(__NR_stat, (UInt)file_name, (UInt)buf);
+   res = VG_(do_syscall)(__NR_stat, (UInt)file_name, (UInt)buf);
    return VG_(is_kerror)(res) ? (-1) : 0;
 }
 
+Int VG_(fstat) ( Int fd, struct vki_stat* buf )
+{
+   Int res;
+   res = VG_(do_syscall)(__NR_fstat, (UInt)fd, (UInt)buf);
+   return VG_(is_kerror)(res) ? (-1) : 0;
+}
+
+Int VG_(dup2) ( Int oldfd, Int newfd )
+{
+   Int res;
+   res = VG_(do_syscall)(__NR_dup2, (UInt)oldfd, (UInt)newfd);
+   return VG_(is_kerror)(res) ? (-1) : res;
+}
+
 Int VG_(rename) ( Char* old_name, Char* new_name )
 {
    Int res;
-   res = vg_do_syscall2(__NR_rename, (UInt)old_name, (UInt)new_name);
+   res = VG_(do_syscall)(__NR_rename, (UInt)old_name, (UInt)new_name);
    return VG_(is_kerror)(res) ? (-1) : 0;
 }
 
 Int VG_(unlink) ( Char* file_name )
 {
    Int res;
-   res = vg_do_syscall1(__NR_unlink, (UInt)file_name);
+   res = VG_(do_syscall)(__NR_unlink, (UInt)file_name);
    return VG_(is_kerror)(res) ? (-1) : 0;
 }
 
@@ -1221,7 +1246,7 @@
 {
    Int res;
    vg_assert(buf != NULL);
-   res = vg_do_syscall2(__NR_getcwd, (UInt)buf, (UInt)size);
+   res = VG_(do_syscall)(__NR_getcwd, (UInt)buf, (UInt)size);
    return VG_(is_kerror)(res) ? ((Char*)NULL) : (Char*)res;
 }
 
@@ -1270,17 +1295,29 @@
 {
    Int res;
    /* res = getpid(); */
-   res = vg_do_syscall0(__NR_getpid);
+   res = VG_(do_syscall)(__NR_getpid);
+   return res;
+}
+
+Int VG_(getpgrp) ( void )
+{
+   Int res;
+   /* res = getpgrp(); */
+   res = VG_(do_syscall)(__NR_getpgrp);
    return res;
 }
 
 Int VG_(getppid) ( void )
 {
    Int res;
-   res = vg_do_syscall0(__NR_getppid);
+   res = VG_(do_syscall)(__NR_getppid);
    return res;
 }
 
+Int VG_(setpgid) ( Int pid, Int pgrp )
+{
+   return VG_(do_syscall)(__NR_setpgid, pid, pgrp);
+}
 
 /* Return -1 if error, else 0.  NOTE does not indicate return code of
    child! */
@@ -1290,7 +1327,7 @@
    void* environ[1] = { NULL };
    if (cmd == NULL)
       return 1;
-   pid = vg_do_syscall0(__NR_fork);
+   pid = VG_(do_syscall)(__NR_fork);
    if (VG_(is_kerror)(pid))
       return -1;
    if (pid == 0) {
@@ -1300,13 +1337,13 @@
       argv[1] = "-c";
       argv[2] = cmd;
       argv[3] = 0;
-      (void)vg_do_syscall3(__NR_execve, 
-                           (UInt)"/bin/sh", (UInt)argv, (UInt)&environ);
+      (void)VG_(do_syscall)(__NR_execve, 
+			    (UInt)"/bin/sh", (UInt)argv, (UInt)&environ);
       /* If we're still alive here, execve failed. */
       return -1;
    } else {
       /* parent */
-      res = vg_do_syscall3(__NR_waitpid, pid, (UInt)NULL, 0);
+      res = VG_(do_syscall)(__NR_waitpid, pid, (UInt)NULL, 0);
       if (VG_(is_kerror)(res)) {
          return -1;
       } else {
@@ -1357,8 +1394,8 @@
    vg_assert(rdtsc_calibration_state == 0);
    rdtsc_calibration_state = 1;
    rdtsc_cal_start_raw = do_rdtsc_insn();
-   res = vg_do_syscall2(__NR_gettimeofday, (UInt)&rdtsc_cal_start_timeval, 
-                                           (UInt)NULL);
+   res = VG_(do_syscall)(__NR_gettimeofday, (UInt)&rdtsc_cal_start_timeval, 
+			 (UInt)NULL);
    vg_assert(!VG_(is_kerror)(res));
 }
 
@@ -1396,8 +1433,8 @@
 
    /* Now read both timers, and do the Math. */
    rdtsc_cal_end_raw = do_rdtsc_insn();
-   res = vg_do_syscall2(__NR_gettimeofday, (UInt)&rdtsc_cal_end_timeval, 
-                                           (UInt)NULL);
+   res = VG_(do_syscall)(__NR_gettimeofday, (UInt)&rdtsc_cal_end_timeval, 
+			 (UInt)NULL);
 
    vg_assert(rdtsc_cal_end_raw > rdtsc_cal_start_raw);
    cal_clock_ticks = rdtsc_cal_end_raw - rdtsc_cal_start_raw;
@@ -1749,7 +1786,7 @@
    args[0] = domain;
    args[1] = type;
    args[2] = protocol;
-   res = vg_do_syscall2(__NR_socketcall, SYS_SOCKET, (UInt)&args);
+   res = VG_(do_syscall)(__NR_socketcall, SYS_SOCKET, (UInt)&args);
    if (VG_(is_kerror)(res)) 
       res = -1;
    return res;
@@ -1764,7 +1801,7 @@
    args[0] = sockfd;
    args[1] = (UInt)serv_addr;
    args[2] = addrlen;
-   res = vg_do_syscall2(__NR_socketcall, SYS_CONNECT, (UInt)&args);
+   res = VG_(do_syscall)(__NR_socketcall, SYS_CONNECT, (UInt)&args);
    if (VG_(is_kerror)(res)) 
       res = -1;
    return res;
@@ -1785,7 +1822,7 @@
    args[1] = (UInt)msg;
    args[2] = count;
    args[3] = flags;
-   res = vg_do_syscall2(__NR_socketcall, SYS_SEND, (UInt)&args);
+   res = VG_(do_syscall)(__NR_socketcall, SYS_SEND, (UInt)&args);
    if (VG_(is_kerror)(res)) 
       res = -1;
    return res;