Fix the use of brk.  This change removes the requirement for the "real" brk
segment to be moved up to stage2's brk segment.  Instead, Valgrind's own
use of brk is simulated with mmap.  To prevent any unwanted use of the
process's real brk segment, it also sets the RLIMIT_DATA soft limit to 0
(the hard limit is left alone), which will make brk always fail.  glibc's
malloc falls back to mmap when brk fails.  We also try to intercept glibc's
brk, but malloc seems to always use the library-internal version.  (The
client's use of brk has always been simulated, and is unaffected by this change.)


git-svn-id: svn://svn.valgrind.org/valgrind/trunk@2266 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/coregrind/Makefile.am b/coregrind/Makefile.am
index f20a1c2..33b9d02 100644
--- a/coregrind/Makefile.am
+++ b/coregrind/Makefile.am
@@ -50,6 +50,7 @@
 	vg_errcontext.c \
 	vg_execontext.c \
 	vg_from_ucode.c \
+	vg_glibc.c \
 	vg_hashtable.c \
 	vg_helpers.S \
 	vg_instrument.c \
diff --git a/coregrind/stage1.c b/coregrind/stage1.c
index 1b91f84..b73c32e 100644
--- a/coregrind/stage1.c
+++ b/coregrind/stage1.c
@@ -162,7 +162,6 @@
       - something else?
     */
    info.map_base = 0xb0000000;
-   info.setbrk = 1;		/* ask do_exec to move the brk-base */
    info.argv = NULL;
 
    snprintf(buf, sizeof(buf), "%s/%s", valgrind_lib, stage2);
diff --git a/coregrind/ume.c b/coregrind/ume.c
index d0a5f5a..ff76d0c 100644
--- a/coregrind/ume.c
+++ b/coregrind/ume.c
@@ -90,6 +90,7 @@
 #include <assert.h>
 
 #include "ume.h"
+#include "vg_include.h"
 
 static int padfile = -1;
 static struct stat padstat;
@@ -309,7 +310,7 @@
 #define REMAINS(x, a)   ((x)        & ((a)-1))
 
 /* Map an ELF file.  Returns the brk address. */
-ESZ(Addr) mapelf(struct elfinfo *e, ESZ(Addr) base, int setbrk)
+ESZ(Addr) mapelf(struct elfinfo *e, ESZ(Addr) base)
 {
    int i;
    ESZ(Addr) elfbrk = 0;
@@ -330,33 +331,6 @@
 	 elfbrk = brkaddr;
    }
 
-   if (setbrk) {
-      /* sneaking up on the brk limit works better than actually
-	 jumping directly there.  Unfortunately, setting the brk is
-	 tested against the datasize rlimit, even though we're not
-	 actually using any memory. */
-      char *b = sbrk(0);
-      char *initb = (char *)PGROUNDUP(b);
-
-      while(b < (char *)elfbrk) {
-	 unsigned delta = (char *)elfbrk - b;
-	 static const unsigned limit = 256*1024*1024;
-	 char *bb;
-
-	 if (delta > limit)
-	    delta = limit;
-	 //printf("elfbrk=%p b=%p delta=%u\n", elfbrk, b, delta);
-	 bb = sbrk(delta);
-	 if (bb != b) {
-	    fprintf(stderr, "sbrk failed while adjusting brk base: "
-		    "perhaps we hit the datasize ulimit?\n");
-	    return 0;
-	 }
-	 b += delta;
-      }
-      munmap(initb, (char *)PGROUNDDN(elfbrk)-initb);
-   }
-
    for(i = 0; i < e->e.e_phnum; i++) {
       ESZ(Phdr) *ph = &e->p[i];
       ESZ(Addr) addr, bss, brkaddr;
@@ -508,7 +482,7 @@
       }
    }
 
-   info->brkbase = mapelf(e, 0, info->setbrk);		/* map the executable */
+   info->brkbase = mapelf(e, 0);		/* map the executable */
 
    if (info->brkbase == 0)
       return ENOMEM;
@@ -528,7 +502,7 @@
 
       baseoff = base - interp_addr;
 
-      mapelf(interp, (ESZ(Addr))baseoff, 0);
+      mapelf(interp, (ESZ(Addr))baseoff);
 
       close(interp->fd);
       free(interp);
diff --git a/coregrind/ume.h b/coregrind/ume.h
index 286e356..4762f98 100644
--- a/coregrind/ume.h
+++ b/coregrind/ume.h
@@ -43,7 +43,6 @@
 
 struct exeinfo
 {
-   int		setbrk;		/* INPUT: if true, set the brk segment base */
    addr_t	map_base;	/* INPUT: if non-zero, base address of mappings  */
 
    addr_t	exe_base;	/* INOUT: lowest (allowed) address of exe	*/
@@ -82,7 +81,7 @@
 };
 
 struct elfinfo *readelf(int fd, const char *filename);
-ESZ(Addr) mapelf(struct elfinfo *e, ESZ(Addr) base, int setbrk);
+ESZ(Addr) mapelf(struct elfinfo *e, ESZ(Addr) base);
 
 struct ume_auxv
 {
diff --git a/coregrind/vg_glibc.c b/coregrind/vg_glibc.c
new file mode 100644
index 0000000..3454a55
--- /dev/null
+++ b/coregrind/vg_glibc.c
@@ -0,0 +1,39 @@
+#include <errno.h>
+#include <sys/mman.h>
+#include "vg_include.h"
+
+extern void *__curbrk;		/* in glibc */
+
+/* libc overrides, so that things can use normal allocators if they
+   wish.
+   XXX override malloc with VG_(arena_malloc)?
+*/
+/* brk(2) override: 0 on success; -1 with errno = ENOMEM when VG_(brk)
+   does not return exactly `end' (i.e. the break was not moved there). */
+int brk(void *end)
+{
+   if (VG_(brk)(end) != end) {
+      errno = ENOMEM;		/* errno values are positive */
+      return -1;
+   }
+   return 0;
+}
+int __brk(void *) __attribute__((alias ("brk")));
+
+/* sbrk(3) override.  Returns the break as it was before the call, or
+   (void *)-1 if brk() could not move it by inc bytes. */
+void *sbrk(ptrdiff_t inc)
+{
+   /* VG_(brk)(0) is a pure query: it sets up the simulated break on
+      first use and re-syncs __curbrk without moving anything. */
+   char *oldbrk = VG_(brk)(0);
+   if (inc != 0 && brk(oldbrk + inc) < 0)
+      return (void *)-1;
+   return oldbrk;
+}
+void *__sbrk(ptrdiff_t) __attribute__((alias ("sbrk")));
+
+void *mmap(void *addr, size_t len, int prot, int flags, int fd, __off_t offset)
+{  /* libc-level override: forward every argument unchanged to VG_(mmap) */
+   return VG_(mmap)(addr, len, prot, flags, fd, offset);
+}
diff --git a/coregrind/vg_include.h b/coregrind/vg_include.h
index 5e4d810..a764362 100644
--- a/coregrind/vg_include.h
+++ b/coregrind/vg_include.h
@@ -1359,6 +1359,8 @@
 extern Addr VG_(valgrind_mmap_end);
 extern Addr VG_(valgrind_end);
 
+extern vki_rlimit VG_(client_rlimit_data); /* client's original rlimit data */
+
 /* stage1 executable file descriptor */
 extern Int  VG_(vgexecfd);
 
diff --git a/coregrind/vg_main.c b/coregrind/vg_main.c
index f4f95ba..516a3ef 100644
--- a/coregrind/vg_main.c
+++ b/coregrind/vg_main.c
@@ -105,6 +105,8 @@
 Addr VG_(valgrind_mmap_end);	 /* valgrind's mmaps are between valgrind_base and here */
 Addr VG_(valgrind_end);
 
+vki_rlimit VG_(client_rlimit_data);
+
 /* This is set early to indicate whether this CPU has the
    SSE/fxsave/fxrestor features.  */
 Bool VG_(have_ssestate);
@@ -1364,7 +1366,6 @@
    }
 
    info->map_base = VG_(client_mapbase);
-   info->setbrk   = False;
 
    info->exe_base = VG_(client_base);
    info->exe_end  = VG_(client_end);
@@ -2664,6 +2665,7 @@
    Addr esp_at_startup;    /* client's %esp at the point we gained control. */
    UInt * client_auxv;
    VgSchedReturnCode src;
+   vki_rlimit zero = { 0, 0 };
 
    //============================================================
    // Nb: startup is complex.  Prerequisites are shown at every step.
@@ -2671,6 +2673,14 @@
    // *** Be very careful when messing with the order ***
    //============================================================
 
+   // Get the current process datasize rlimit, and set it to zero.
+   // This prevents any internal uses of brk() from having any effect.
+   // We remember the old value so we can restore it on exec, so that
+   // child processes will have a reasonable brk value.
+   VG_(getrlimit)(VKI_RLIMIT_DATA, &VG_(client_rlimit_data));
+   zero.rlim_max = VG_(client_rlimit_data).rlim_max;
+   VG_(setrlimit)(VKI_RLIMIT_DATA, &zero);
+   
    //--------------------------------------------------------------
    // Check we were launched by stage1
    //   p: n/a  [must be first step]
diff --git a/coregrind/vg_mylibc.c b/coregrind/vg_mylibc.c
index 6d64cc3..130c293 100644
--- a/coregrind/vg_mylibc.c
+++ b/coregrind/vg_mylibc.c
@@ -244,12 +244,30 @@
    mmap/munmap, exit, fcntl
    ------------------------------------------------------------------ */
 
+/* Bare munmap syscall: unlike VG_(munmap), this does not call
+   VG_(unmap_range) on the released range. */
+static Int munmap_inner(void *start, UInt length)
+{
+   return VG_(do_syscall)(__NR_munmap, (UInt)start, (UInt)length );
+}
+
+/* Bare mmap syscall: the six arguments are packed into an array whose
+   address is handed to the kernel.  The VKI_MAP_NOSYMS and
+   VKI_MAP_CLIENT bits are masked out of flags first. */
+static Addr mmap_inner(void *start, UInt length, UInt prot, UInt flags, UInt fd, UInt offset)
+{
+   const UInt kernel_flags = flags & ~(VKI_MAP_NOSYMS|VKI_MAP_CLIENT);
+   UInt packed[6] = {
+      (UInt)start, length, prot, kernel_flags, fd, offset
+   };
+   return VG_(do_syscall)(__NR_mmap, (UInt)packed );
+}
+
 /* Returns -1 on failure. */
 void* VG_(mmap)( void* start, UInt length, 
                  UInt prot, UInt flags, UInt fd, UInt offset)
 {
    Addr  res;
-   UInt args[6];
 
    if (!(flags & VKI_MAP_FIXED)) {
       start = (void *)VG_(find_map_space)((Addr)start, length, !!(flags & VKI_MAP_CLIENT));
@@ -259,13 +277,7 @@
       flags |= VKI_MAP_FIXED;
    }
 
-   args[0] = (UInt)start;
-   args[1] = length;
-   args[2] = prot;
-   args[3] = flags & ~(VKI_MAP_NOSYMS|VKI_MAP_CLIENT);
-   args[4] = fd;
-   args[5] = offset;
-   res = VG_(do_syscall)(__NR_mmap, (UInt)(&(args[0])) );
+   res = mmap_inner(start, length, prot, flags, fd, offset);
 
    if (!VG_(is_kerror)(res)) {
       UInt sf_flags = SF_MMAP;
@@ -305,7 +317,7 @@
 /* Returns -1 on failure. */
 Int VG_(munmap)( void* start, Int length )
 {
-   Int res = VG_(do_syscall)(__NR_munmap, (UInt)start, (UInt)length );
+   Int res = munmap_inner(start, length);
    if (!VG_(is_kerror)(res))
       VG_(unmap_range)((Addr)start, length);
    return VG_(is_kerror)(res) ? -1 : 0;
@@ -372,11 +384,49 @@
    return 0;
 }
 
+extern Char _end;
+Char *VG_(curbrk) = NULL;
+extern void *__curbrk;		/* in glibc */
+
+/* Simulated brk for Valgrind's own use, backed by anonymous mmaps that
+   start at the page after _end.  Returns end_data_segment on success.
+   A request below &_end (e.g. 0) or a failed mapping leaves the break
+   where it is and returns the current break instead. */
 void* VG_(brk) ( void* end_data_segment )
 {
-   Int res;
-   res = VG_(do_syscall)(__NR_brk, (UInt)end_data_segment);
-   return (void*)(  VG_(is_kerror)(res) ? -1 : res  );
+   Addr end, brkpage, endpage;
+
+   if (VG_(curbrk) == NULL) {
+      VG_(curbrk) = &_end;
+      __curbrk = (void *)VG_(curbrk);
+   }
+   end = (Addr)end_data_segment;
+   brkpage = PGROUNDUP(VG_(curbrk));
+   endpage = PGROUNDUP(end);
+   if (0 && VG_(curbrk) != __curbrk)
+      VG_(printf)("__curbrk changed unexpectedly: VG_(curbrk)=%p, __curbrk=%p\n",
+		  VG_(curbrk), __curbrk);
+   if (0)
+      VG_(printf)("brk(end_data_segment=%p); brkpage=%p endpage=%p end=%p curbrk=%p &_end=%p\n",
+		  end_data_segment, brkpage, endpage, end, VG_(curbrk), &_end);
+
+   if (endpage < (Addr)&_end) {
+      __curbrk = (void *)VG_(curbrk);
+      return (void *)VG_(curbrk);
+   }
+   if (brkpage > endpage) {
+      /* shrinking: the pages to release are [endpage, brkpage) */
+      munmap_inner((void *)endpage, brkpage-endpage);
+   } else if (brkpage < endpage) {
+      Addr res = mmap_inner((void *)brkpage, endpage-brkpage,
+		    VKI_PROT_READ|VKI_PROT_WRITE|VKI_PROT_EXEC,
+		    VKI_MAP_FIXED|VKI_MAP_PRIVATE|VKI_MAP_ANONYMOUS, -1, 0);
+      if (VG_(is_kerror)(res))
+         return (void *)VG_(curbrk);	/* could not grow: break unchanged */
+   }
+   VG_(curbrk) = (Char *)end_data_segment;
+   __curbrk = end_data_segment;
+   return end_data_segment;
 }
 
 
@@ -1532,6 +1582,9 @@
       static Char** envp = NULL;
       Char* argv[4];
 
+      /* restore the DATA rlimit for the child */
+      VG_(setrlimit)(VKI_RLIMIT_DATA, &VG_(client_rlimit_data));
+
       if (envp == NULL) {
          Int i;
          Char* ld_preload_str = NULL;
@@ -1616,24 +1669,13 @@
 {
    static UInt tot_alloc = 0;
    void* p;
+   Char *b = VG_(brk)(0);
 
-#if 0
-   p = VG_(mmap)( (void *)VG_(valgrind_base), nBytes,
-		  VKI_PROT_READ | VKI_PROT_WRITE | VKI_PROT_EXEC, 
-		  VKI_MAP_PRIVATE | VKI_MAP_ANONYMOUS, -1, 0 );
-#else
-   /* use brk, because it will definitely be in the valgrind address space */
-   {
-      Char *b = VG_(brk)(0);
+   p = (void *)PGROUNDUP(b);
+   b = VG_(brk)(p + PGROUNDUP(nBytes));
 
-      p = (void *)PGROUNDUP(b);
-      
-      b = VG_(brk)(p + PGROUNDUP(nBytes));
-
-      if (b != (p + PGROUNDUP(nBytes)))
-	 p = (void *)-1;
-   }
-#endif
+   if (b != (p + PGROUNDUP(nBytes)))
+       p = (void *)-1;
 
    if (p != ((void*)(-1))) {
       vg_assert(p >= (void *)VG_(valgrind_mmap_end) && p < (void *)VG_(valgrind_end));
diff --git a/coregrind/vg_syscalls.c b/coregrind/vg_syscalls.c
index c697588..05ba314 100644
--- a/coregrind/vg_syscalls.c
+++ b/coregrind/vg_syscalls.c
@@ -1994,6 +1994,9 @@
       VG_(ksigprocmask)(VKI_SIG_SETMASK, &tst->sig_mask, NULL);
    }
 
+   /* restore the DATA rlimit for the child */
+   VG_(setrlimit)(VKI_RLIMIT_DATA, &VG_(client_rlimit_data));
+
    res = VG_(do_syscall)(__NR_execve, arg1, arg2, arg3);
 
    /* If we got here, then the execve failed.  We've already made too much of a mess
@@ -2482,16 +2485,22 @@
    /* int getrlimit (int resource, struct rlimit *rlim); */
    MAYBE_PRINTF("getrlimit ( %d, %p )\n", arg1,arg2);
    SYSCALL_TRACK( pre_mem_write, tid, "getrlimit(rlim)", arg2, 
-		  sizeof(struct rlimit) );
+		  sizeof(struct vki_rlimit) );
 }
 
 POST(getrlimit)
 {
-   if (res == 0)
-      VG_TRACK( post_mem_write, arg2, sizeof(struct rlimit) );
+   /* Only touch the client's buffer when the syscall succeeded; on
+      failure there is no result in *arg2 to mark as written or patch. */
+   if (res == 0) {
+      VG_TRACK( post_mem_write, arg2, sizeof(vki_rlimit) );
 
-   if (res == 0 && arg1 == VKI_RLIMIT_NOFILE)
-      ((struct rlimit *)arg2)->rlim_cur = VG_(max_fd);
+      /* Substitute VG_(max_fd) and the saved client DATA limit. */
+      if (arg1 == VKI_RLIMIT_NOFILE)
+         ((vki_rlimit *)arg2)->rlim_cur = VG_(max_fd);
+      else if (arg1 == VKI_RLIMIT_DATA)
+         *((vki_rlimit *)arg2) = VG_(client_rlimit_data);
+   }
 }
 
 PREALIAS(ugetrlimit, getrlimit);
@@ -4245,7 +4254,12 @@
    /* int setrlimit (int resource, const struct rlimit *rlim); */
    MAYBE_PRINTF("setrlimit ( %d, %p )\n", arg1,arg2);
    SYSCALL_TRACK( pre_mem_read, tid, "setrlimit(rlim)", 
-		  arg2, sizeof(struct rlimit) );
+		  arg2, sizeof(struct vki_rlimit) );
+
+   if (arg1 == VKI_RLIMIT_DATA) {
+      VG_(client_rlimit_data) = *(vki_rlimit *)arg2;
+      res = 0;
+   }
 }
 
 PRE(setuid)
diff --git a/include/vg_kerneliface.h b/include/vg_kerneliface.h
index 6b70e6f..5473008 100644
--- a/include/vg_kerneliface.h
+++ b/include/vg_kerneliface.h
@@ -698,7 +698,17 @@
 	unsigned long rlim_max;
 } vki_rlimit;
 
-#define VKI_RLIMIT_NOFILE 7
+#define VKI_RLIMIT_CPU		0		/* CPU time in seconds */
+#define VKI_RLIMIT_FSIZE	1		/* Maximum filesize */
+#define VKI_RLIMIT_DATA		2		/* max data size */
+#define VKI_RLIMIT_STACK	3		/* max stack size */
+#define VKI_RLIMIT_CORE		4		/* max core file size */
+#define VKI_RLIMIT_RSS		5		/* max resident set size */
+#define VKI_RLIMIT_NPROC	6		/* max number of processes */
+#define VKI_RLIMIT_NOFILE	7		/* max number of open files */
+#define VKI_RLIMIT_MEMLOCK	8		/* max locked-in-memory address space */
+#define VKI_RLIMIT_AS		9		/* address space limit */
+#define VKI_RLIMIT_LOCKS	10		/* maximum file locks held */
 
 /* Socket stuff. */
 /*