Make host valgrind work with clang/llvm.

Bug: 28454823
* Add -fno-builtin-memset flag.

Bug: 31035712
* Make esp 16-byte aligned before calling x86 functions.
Bug: https://bugs.kde.org/show_bug.cgi?id=368120

Note that this does not fix other errors on the x86 emulator.

Change-Id: Ibc19d0902aaba68edbb9f3758da368c35294128b
Test: Run valgrind on linux host and arm/x86 targets.
Test: In art run "mm valgrind-test-art-host"
diff --git a/Android.build_host.mk b/Android.build_host.mk
index 77d11ba..ee07d2d 100644
--- a/Android.build_host.mk
+++ b/Android.build_host.mk
@@ -23,10 +23,8 @@
   vg_local_arch := amd64
 endif
 
-# TODO: This workaround is to avoid calling memset from VG(memset)
-# wrapper because of invalid clang optimization; This seems to be
-# limited to amd64/x86 codegen(?);
-LOCAL_CLANG := false
+# Do not call (builtin) memset from VG(memset).
+LOCAL_CLANG_CFLAGS += -fno-builtin-memset
 
 LOCAL_MODULE := $(vg_local_module)-$(vg_local_arch)-linux
 
diff --git a/Android.build_one.mk b/Android.build_one.mk
index 9ab3a2b..1705cf2 100644
--- a/Android.build_one.mk
+++ b/Android.build_one.mk
@@ -25,13 +25,15 @@
 vg_local_android_arch := $(TARGET_ARCH)
 endif
 
-# TODO: This workaround is to avoid calling memset from VG(memset)
-# wrapper because of invalid clang optimization; This seems to be
-# limited to amd64/x86 codegen(?);
-ifeq ($(filter $TARGET_ARCH,x86 x86_64),)
+# For arm and arm64 targets, clang compiled module has other
+# undefined errors, see bug 28454823. Only x86 and x86_64 use clang.
+ifeq ($(filter $(TARGET_ARCH),x86 x86_64),)
   LOCAL_CLANG := false
 endif
 
+# Do not call (builtin) memset from VG(memset).
+LOCAL_CLANG_CFLAGS += -fno-builtin-memset
+
 LOCAL_MODULE := $(vg_local_module)-$(vg_local_arch)-linux
 
 LOCAL_MODULE_TARGET_ARCH := $(vg_local_android_arch)
diff --git a/coregrind/m_main.c b/coregrind/m_main.c
index 1821c94..140efbf 100644
--- a/coregrind/m_main.c
+++ b/coregrind/m_main.c
@@ -2925,12 +2925,13 @@
     "\tmovl  $vgPlain_interim_stack, %eax\n"
     "\taddl  $"VG_STRINGIFY(VG_STACK_GUARD_SZB)", %eax\n"
     "\taddl  $"VG_STRINGIFY(VG_DEFAULT_STACK_ACTIVE_SZB)", %eax\n"
+    /* allocate at least 16 bytes on the new stack, and aligned */
     "\tsubl  $16, %eax\n"
     "\tandl  $~15, %eax\n"
     /* install it, and collect the original one */
     "\txchgl %eax, %esp\n"
     /* call _start_in_C_linux, passing it the startup %esp */
-    "\tpushl %eax\n"
+    "\tmovl  %eax, (%esp)\n"
     "\tcall  _start_in_C_linux\n"
     "\thlt\n"
     ".previous\n"
diff --git a/coregrind/m_syswrap/syswrap-x86-linux.c b/coregrind/m_syswrap/syswrap-x86-linux.c
index 24d7dc1..233886d 100644
--- a/coregrind/m_syswrap/syswrap-x86-linux.c
+++ b/coregrind/m_syswrap/syswrap-x86-linux.c
@@ -83,8 +83,9 @@
 ".globl vgModuleLocal_call_on_new_stack_0_1\n"
 "vgModuleLocal_call_on_new_stack_0_1:\n"
 "   movl %esp, %esi\n"     // remember old stack pointer
-"   movl 4(%esi), %esp\n"  // set stack
-"   pushl 16(%esi)\n"      // arg1 to stack
+"   movl 4(%esi), %esp\n"  // set stack, assume %esp is now 16-byte aligned
+"   subl $12, %esp\n"      // skip 12 bytes
+"   pushl 16(%esi)\n"      // arg1 to stack, %esp is 16-byte aligned
 "   pushl  8(%esi)\n"      // retaddr to stack
 "   pushl 12(%esi)\n"      // f to stack
 "   movl $0, %eax\n"       // zero all GP regs
@@ -150,7 +151,8 @@
 "        movl     4+"FSZ"(%esp), %ecx\n"    /* syscall arg2: child stack */
 "        movl    12+"FSZ"(%esp), %ebx\n"    /* fn arg */
 "        movl     0+"FSZ"(%esp), %eax\n"    /* fn */
-"        lea     -8(%ecx), %ecx\n"          /* make space on stack */
+"        andl    $-16, %ecx\n"              /* align to 16-byte */
+"        lea     -20(%ecx), %ecx\n"         /* allocate 16*n+4 bytes on stack */
 "        movl    %ebx, 4(%ecx)\n"           /*   fn arg */
 "        movl    %eax, 0(%ecx)\n"           /*   fn */
 
@@ -165,7 +167,7 @@
 "        jnz     1f\n"
 
          /* CHILD - call thread function */
-"        popl    %eax\n"
+"        popl    %eax\n"                    /* child %esp is 16-byte aligned */
 "        call    *%eax\n"                   /* call fn */
 
          /* exit with result */