Take ppc64 startup further along the road
 - fixed launcher.c to recognise ppc32/64-linux platforms properly
 - lots of assembly fixes to handle function descriptors, TOC references, 64-bit regs.
 - fixed var types in vki-ppc64-linux

Now gets as far as VG_(translate), but dies from a case of invalid orig_addr.




git-svn-id: svn://svn.valgrind.org/valgrind/trunk@5299 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/coregrind/launcher.c b/coregrind/launcher.c
index 3dedb06..d1961a4 100644
--- a/coregrind/launcher.c
+++ b/coregrind/launcher.c
@@ -133,26 +133,37 @@
       *interpend = '\0';
 
       platform = select_platform(interp);
-   } else if (memcmp(header, ELFMAG, SELFMAG) == 0 &&
-              header[EI_CLASS] == ELFCLASS32 &&
-              header[EI_DATA] == ELFDATA2LSB) {
-      const Elf32_Ehdr *ehdr = (Elf32_Ehdr *)header;
+   } else if (memcmp(header, ELFMAG, SELFMAG) == 0) {
 
-      if (ehdr->e_machine == EM_386 &&
-          ehdr->e_ident[EI_OSABI] == ELFOSABI_SYSV) {
-         platform = "x86-linux";
-      } else if (ehdr->e_machine == EM_PPC &&
-                 ehdr->e_ident[EI_OSABI] == ELFOSABI_SYSV) {
-         platform = "ppc32-linux";
-      }
-   } else if (memcmp(header, ELFMAG, SELFMAG) == 0 &&
-              header[EI_CLASS] == ELFCLASS64 &&
-              header[EI_DATA] == ELFDATA2LSB) {
-      const Elf64_Ehdr *ehdr = (Elf64_Ehdr *)header;
+      if (header[EI_CLASS] == ELFCLASS32) {
+         const Elf32_Ehdr *ehdr = (Elf32_Ehdr *)header;
 
-      if (ehdr->e_machine == EM_X86_64 &&
-          ehdr->e_ident[EI_OSABI] == ELFOSABI_SYSV) {
-         platform = "amd64-linux";
+         if (header[EI_DATA] == ELFDATA2LSB) {
+            if (ehdr->e_machine == EM_386 &&
+                ehdr->e_ident[EI_OSABI] == ELFOSABI_SYSV) {
+               platform = "x86-linux";
+            }
+         }
+         else if (header[EI_DATA] == ELFDATA2MSB) {
+            if (ehdr->e_machine == EM_PPC &&
+                ehdr->e_ident[EI_OSABI] == ELFOSABI_SYSV) {
+               platform = "ppc32-linux";
+            }
+         }
+      } else if (header[EI_CLASS] == ELFCLASS64) {
+         const Elf64_Ehdr *ehdr = (Elf64_Ehdr *)header;
+
+         if (header[EI_DATA] == ELFDATA2LSB) {
+            if (ehdr->e_machine == EM_X86_64 &&
+                ehdr->e_ident[EI_OSABI] == ELFOSABI_SYSV) {
+               platform = "amd64-linux";
+            }
+         } else if (header[EI_DATA] == ELFDATA2MSB) {
+            if (ehdr->e_machine == EM_PPC64 &&
+                ehdr->e_ident[EI_OSABI] == ELFOSABI_SYSV) {
+               platform = "ppc64-linux";
+            }
+         }
       }
    }
 
diff --git a/coregrind/m_debuginfo/dwarf.c b/coregrind/m_debuginfo/dwarf.c
index 560ded2..fcc5493 100644
--- a/coregrind/m_debuginfo/dwarf.c
+++ b/coregrind/m_debuginfo/dwarf.c
@@ -2209,7 +2209,7 @@
    Int    n_CIEs = 0;
    UChar* data = ehframe;
 
-#if defined(VGP_ppc32_linux)
+#if defined(VGP_ppc32_linux) || defined(VGP_ppc64_linux)
    // CAB: tmp hack for ppc - no stacktraces for now...
    return;
 #endif
diff --git a/coregrind/m_dispatch/dispatch-ppc32-linux.S b/coregrind/m_dispatch/dispatch-ppc32-linux.S
index 6973bb6..2220daa 100644
--- a/coregrind/m_dispatch/dispatch-ppc32-linux.S
+++ b/coregrind/m_dispatch/dispatch-ppc32-linux.S
@@ -197,8 +197,7 @@
         li      3,0
         stw     3,20(1)
         lfs     3,20(1)
-	/* load f3 to fpscr (0xFF = all bit fields) */
-        mtfsf   0xFF,3
+        mtfsf   0xFF,3   /* fpscr = f3 */
 LafterFP2:
 
         /* set host AltiVec control word to the default mode expected 
diff --git a/coregrind/m_dispatch/dispatch-ppc64-linux.S b/coregrind/m_dispatch/dispatch-ppc64-linux.S
index 1eea790..30bcd68 100644
--- a/coregrind/m_dispatch/dispatch-ppc64-linux.S
+++ b/coregrind/m_dispatch/dispatch-ppc64-linux.S
@@ -36,35 +36,48 @@
 
 /* References to globals via the TOC */
 
-	.section        ".toc","aw"
-tocent__vgPlain_tt_fast:
-	.tc vgPlain_tt_fast[TC],vgPlain_tt_fast
+/*
+        .globl vgPlain_tt_fast
+        .lcomm  vgPlain_tt_fast,4,4
+        .type   vgPlain_tt_fast, @object
+*/
+        .section        ".toc","aw"
+.tocent__vgPlain_tt_fast:
+        .tc vgPlain_tt_fast[TC],vgPlain_tt_fast
+.tocent__vgPlain_dispatch_ctr:
+        .tc vgPlain_dispatch_ctr[TC],vgPlain_dispatch_ctr
+.tocent__vgPlain_machine_ppc64_has_VMX:
+        .tc vgPlain_machine_ppc64_has_VMX[TC],vgPlain_machine_ppc64_has_VMX
 
 /*------------------------------------------------------------*/
 /*--- The dispatch loop.                                   ---*/
 /*------------------------------------------------------------*/
 
-	.section	".text"
-	.align 2
-
 /* signature: UWord VG_(run_innerloop) ( void* guest_state ) */
 
+        .section        ".text"
+        .align 2
+        .globl VG_(run_innerloop)
+        .section        ".opd","aw"
+        .align 3
+VG_(run_innerloop):
+        .quad   .VG_(run_innerloop),.TOC.@tocbase,0
+        .previous
+        .type   .VG_(run_innerloop),@function
         .globl  .VG_(run_innerloop)
+
 .VG_(run_innerloop):
         /* ----- entry point to VG_(run_innerloop) ----- */
 
-        /* For Linux/ppc32 we need the SysV ABI, which uses
-           LR->4(parent_sp), CR->anywhere.
-           (The AIX ABI, used on Darwin, and maybe Linux/ppc64?,
-           uses LR->8(prt_sp), CR->4(prt_sp))
-        */
-
-        /* Save lr */
+        /* PPC64 ABI saves LR->16(prt_sp), CR->8(prt_sp) */
+        /* Save lr, cr */
         mflr    0
-        stw     0,4(1)
+        std     0,16(1)
+        mfcr    0
+        std     0,8(1)
 
         /* New stack frame */
-        stwu    1,-624(1)  /* sp should maintain 16-byte alignment */
+        stdu    1,-624(1)  /* sp should maintain 16-byte alignment */
 
         /* Save callee-saved registers... */
 
@@ -89,36 +102,36 @@
         stfd    14,480(1)
 
         /* General reg save area : 144 bytes */
-        stw     31,472(1)
-        stw     30,464(1)
-        stw     29,456(1)
-        stw     28,448(1)
-        stw     27,440(1)
-        stw     26,432(1)
-        stw     25,424(1)
-        stw     24,416(1)
-        stw     23,408(1)
-        stw     22,400(1)
-        stw     21,392(1)
-        stw     20,384(1)
-        stw     19,376(1)
-        stw     18,368(1)
-        stw     17,360(1)
-        stw     16,352(1)
-        stw     15,344(1)
-        stw     14,336(1)
+        std     31,472(1)
+        std     30,464(1)
+        std     29,456(1)
+        std     28,448(1)
+        std     27,440(1)
+        std     26,432(1)
+        std     25,424(1)
+        std     24,416(1)
+        std     23,408(1)
+        std     22,400(1)
+        std     21,392(1)
+        std     20,384(1)
+        std     19,376(1)
+        std     18,368(1)
+        std     17,360(1)
+        std     16,352(1)
+        std     15,344(1)
+        std     14,336(1)
         /* Probably not necessary to save r13 (thread-specific ptr),
            as VEX stays clear of it... but what the hey. */
-        stw     13,328(1)
+        std     13,328(1)
 
         /* It's necessary to save/restore VRSAVE in the AIX / Darwin ABI.
            The Linux kernel might not actually use VRSAVE for its intended
            purpose, but it should be harmless to preserve anyway. */
 	/* r3 is live here (guest state ptr), so use r4 */
-        lis     4,VG_(machine_ppc64_has_VMX)@ha
-        lwz     4,VG_(machine_ppc64_has_VMX)@l(4)
-        cmplwi  4,0
-        beq     LafterVMX1
+        lis     4,.tocent__vgPlain_machine_ppc64_has_VMX@ha
+        ld      4,.tocent__vgPlain_machine_ppc64_has_VMX@l(4)
+        cmpldi  4,0
+        beq     .LafterVMX1
 
         /* VRSAVE save word : 32 bytes */
         mfspr   4,256         /* vrsave reg is spr number 256 */
@@ -151,24 +164,18 @@
         stvx    21,4,1
         li      4,128
         stvx    20,4,1
-LafterVMX1:
-
-        /* Save cr */
-        mfcr    0
-        stw     0,112(1)
-
+.LafterVMX1:
 
         /* Local variable space... */
 
         /* r3 holds guest_state */
         mr      31,3
-        stw     3,104(1)       /* spill orig guest_state ptr */
+        std     3,104(1)       /* spill orig guest_state ptr */
 
         /* 96(sp) used later to check FPSCR[RM] */
-
         /* 88(sp) used later to stop ctr reg being clobbered */
-
-	/* 48:87(sp) free */
+        /* 80(sp) used later to load fpscr with zero */
+	/* 48:79(sp) free */
 	
         /* Linkage Area (reserved)
            40(sp) : TOC
@@ -182,33 +189,39 @@
 // CAB TODO: Use a caller-saved reg for orig guest_state ptr
 // - rem to set non-allocateable in isel.c
 
-        /* hold dispatch_ctr in ctr reg */
-        lis     17,VG_(dispatch_ctr)@ha
-        lwz     17,VG_(dispatch_ctr)@l(17)
+        /* hold VG_(dispatch_ctr) (=32bit value) in ctr reg */
+        lis     17,.tocent__vgPlain_dispatch_ctr@ha
+        lwz     17,.tocent__vgPlain_dispatch_ctr@l(17)
         mtctr   17
 
         /* fetch %CIA into r30 */
-        lwz     30,OFFSET_ppc64_CIA(31)
+        ld      30,OFFSET_ppc64_CIA(31)
 
         /* set host FPU control word to the default mode expected 
            by VEX-generated code.  See comments in libvex.h for
            more info. */
-        fsub    3,3,3   /* generate zero */
-        mtfsf   0xFF,3
+        /* => get zero into f3 (tedious)
+           fsub 3,3,3 is not a reliable way to do this, since if
+           f3 holds a NaN or similar then we don't necessarily
+           wind up with zero. */
+        li      3,0
+        stw     3,80(1)
+        lfs     3,80(1)
+        mtfsf   0xFF,3   /* fpscr = lo32 of f3 */
 
         /* set host AltiVec control word to the default mode expected 
            by VEX-generated code. */
-        lis     3,VG_(machine_ppc64_has_VMX)@ha
-        lwz     3,VG_(machine_ppc64_has_VMX)@l(3)
-        cmplwi  3,0
-        beq     LafterVMX2
+        lis     3,.tocent__vgPlain_machine_ppc64_has_VMX@ha
+        ld      3,.tocent__vgPlain_machine_ppc64_has_VMX@l(3)
+        cmpldi  3,0
+        beq     .LafterVMX2
 
         vspltisw 3,0x0  /* generate zero */
         mtvscr  3
-LafterVMX2:
+.LafterVMX2:
 
         /* make a stack frame for the code we are calling */
-        stwu    1,-48(1)
+        stdu    1,-48(1)
 
         /* fall into main loop */
 
@@ -226,30 +239,30 @@
         0:43 (r1) (=stack frame header)
 */
 
-dispatch_boring:
+.dispatch_boring:
         /* save the jump address in the guest state */
         std     30,OFFSET_ppc64_CIA(31)
 
         /* Are we out of timeslice?  If yes, defer to scheduler. */
-        bdz     counter_is_zero  /* decrements ctr reg */
+        bdz     .counter_is_zero  /* decrements ctr reg */
 
         /* try a fast lookup in the translation cache */
         /* r4=((r30<<3) & (VG_TT_FAST_MASK<<3)) */
 	rldic	4,30, 3,64-3-VG_TT_FAST_BITS
 // CAB:	use a caller-saved reg for this ?
 	/* r5 = & VG_(tt_fast) */
-	ld	5, tocent__vgPlain_tt_fast@toc(2)
+	ld	5, .tocent__vgPlain_tt_fast@toc(2)
 	/* r5 = VG_(tt_fast)[r30 & VG_TT_FAST_MASK] */
 	ldx	5, 5,4
 	/* r6 = VG_(tt_fast)[r30 & VG_TT_FAST_MASK]->orig_addr */
 	ld	6, 0(5)
         cmpw    30,6
-        bne     fast_lookup_failed
+        bne     .fast_lookup_failed
 
-        /* increment bb profile counter */
+        /* increment bb profile counter VG_(tt_fastN)[x] (=32bit val) */
 // CAB:	use a caller-saved reg for this ?
 	/* r7 = & VG_(tt_fastN) */
-	ld	7, tocent__vgPlain_tt_fast@toc(2)
+	ld	7, .tocent__vgPlain_tt_fast@toc(2)
 	/* r7 = VG_(tt_fastN)[r30 & VG_TT_FAST_MASK] */
 	srdi	4, 4,1
 	lwzx	6, 7,4
@@ -263,7 +276,7 @@
 
         /* stop ctr being clobbered */
 // CAB:	use a caller-saved reg for this ?
-//      but then (bdz) => (decr, cmp, bc)... still better than a stw?
+//      but then (bdz) => (decr, cmp, bc)... still better than a std?
         mfctr   9
         std     9,136(1)         /* => 88(parent_sp) */
 
@@ -289,15 +302,15 @@
         mr      30,3             /* put CIA (=r3) in r30 */
         ld      16,152(1)        /* gst_state ptr => 104(prnt_sp) */
         cmpd    16,31
-        beq     dispatch_boring  /* r31 unchanged... */
+        beq     .dispatch_boring /* r31 unchanged... */
 
         mr      3,31             /* put return val (=r31) in r3 */
-        b       dispatch_exceptional
+        b       .dispatch_exceptional
 
 /* All exits from the dispatcher go through here.
    r3 holds the return value. 
 */
-run_innerloop_exit: 
+.run_innerloop_exit: 
         /* We're leaving.  Check that nobody messed with
            VSCR or FPSCR. */
 
@@ -310,14 +323,14 @@
         lwzx      6,5,1                   /* load to gpr */
         andi.     6,6,0xFF                /* mask wanted bits */
         cmplwi    6,0x0                   /* cmp with zero */
-        bne       invariant_violation     /* branch if not zero */
+        bne       .invariant_violation    /* branch if not zero */
 #endif
 
 	/* Using r11 - value used again further on, so don't trash! */
-        lis     11,VG_(machine_ppc64_has_VMX)@ha
-        lwz     11,VG_(machine_ppc64_has_VMX)@l(11)
-        cmplwi  11,0
-        beq     LafterVMX8
+        lis     11,.tocent__vgPlain_machine_ppc64_has_VMX@ha
+        ld      11,.tocent__vgPlain_machine_ppc64_has_VMX@l(11)
+        cmpldi  11,0
+        beq     .LafterVMX8
 
         /* Check VSCR[NJ] == 1 */
         /* first generate 4x 0x00010000 */
@@ -329,78 +342,74 @@
         vand      7,7,6                   /* gives NJ flag */
         vspltw    7,7,0x3                 /* flags-word to all lanes */
         vcmpequw. 8,6,7                   /* CR[24] = 1 if v6 == v7 */
-        bt        24,invariant_violation  /* branch if all_equal */
-LafterVMX8:
+        bt        24,.invariant_violation /* branch if all_equal */
+.LafterVMX8:
 
 	/* otherwise we're OK */
-        b       run_innerloop_exit_REALLY
+        b       .run_innerloop_exit_REALLY
 
 
-invariant_violation:
+.invariant_violation:
         li      3,VG_TRC_INVARIANT_FAILED
-        b       run_innerloop_exit_REALLY
+        b       .run_innerloop_exit_REALLY
 
-run_innerloop_exit_REALLY:
+.run_innerloop_exit_REALLY:
         /* r3 holds VG_TRC_* value to return */
 
         /* Return to parent stack */
         addi    1,1,48
 
-        /* Write ctr to VG(dispatch_ctr) */
+        /* Write ctr to VG_(dispatch_ctr) (=32bit value) */
         mfctr   17
-        lis     18,VG_(dispatch_ctr)@ha
-        stw     17,VG_(dispatch_ctr)@l(18)
-
-        /* Restore cr */
-        lwz     0,112(1)
-        mtcr    0
+        lis     18,.tocent__vgPlain_dispatch_ctr@ha
+        stw     17,.tocent__vgPlain_dispatch_ctr@l(18)
 
         /* Restore callee-saved registers... */
 
         /* Floating-point regs */
-        lfd    31,616(1)
-        lfd    30,608(1)
-        lfd    29,600(1)
-        lfd    28,592(1)
-        lfd    27,584(1)
-        lfd    26,576(1)
-        lfd    25,568(1)
-        lfd    24,560(1)
-        lfd    23,552(1)
-        lfd    22,544(1)
-        lfd    21,536(1)
-        lfd    20,528(1)
-        lfd    19,520(1)
-        lfd    18,512(1)
-        lfd    17,504(1)
-        lfd    16,496(1)
-        lfd    15,488(1)
-        lfd    14,480(1)
+        lfd     31,616(1)
+        lfd     30,608(1)
+        lfd     29,600(1)
+        lfd     28,592(1)
+        lfd     27,584(1)
+        lfd     26,576(1)
+        lfd     25,568(1)
+        lfd     24,560(1)
+        lfd     23,552(1)
+        lfd     22,544(1)
+        lfd     21,536(1)
+        lfd     20,528(1)
+        lfd     19,520(1)
+        lfd     18,512(1)
+        lfd     17,504(1)
+        lfd     16,496(1)
+        lfd     15,488(1)
+        lfd     14,480(1)
 
         /* General regs */
-        lwz     31,472(1)
-        lwz     30,464(1)
-        lwz     29,456(1)
-        lwz     28,448(1)
-        lwz     27,440(1)
-        lwz     26,432(1)
-        lwz     25,424(1)
-        lwz     24,416(1)
-        lwz     23,408(1)
-        lwz     22,400(1)
-        lwz     21,392(1)
-        lwz     20,384(1)
-        lwz     19,376(1)
-        lwz     18,368(1)
-        lwz     17,360(1)
-        lwz     16,352(1)
-        lwz     15,344(1)
-        lwz     14,336(1)
-        lwz     13,328(1)
+        ld      31,472(1)
+        ld      30,464(1)
+        ld      29,456(1)
+        ld      28,448(1)
+        ld      27,440(1)
+        ld      26,432(1)
+        ld      25,424(1)
+        ld      24,416(1)
+        ld      23,408(1)
+        ld      22,400(1)
+        ld      21,392(1)
+        ld      20,384(1)
+        ld      19,376(1)
+        ld      18,368(1)
+        ld      17,360(1)
+        ld      16,352(1)
+        ld      15,344(1)
+        ld      14,336(1)
+        ld      13,328(1)
 
         /* r11 already holds VG_(machine_ppc64_has_VMX) value */
-        cmplwi  11,0
-        beq     LafterVMX9
+        cmpldi  11,0
+        beq     .LafterVMX9
 
         /* VRSAVE */
         lwz     4,324(1)
@@ -431,10 +440,12 @@
         lvx     21,4,1
         li      4,128
         lvx     20,4,1
-LafterVMX9:
+.LafterVMX9:
 
-        /* reset lr & sp */
-        lwz     0,628(1)  /* stack_size + 4 */
+        /* reset cr, lr, sp */
+        ld      0,632(1)  /* stack_size + 8 */
+        mtcr    0
+        ld      0,640(1)  /* stack_size + 16 */
         mtlr    0
         addi    1,1,624   /* stack_size */
         blr
@@ -443,28 +454,28 @@
 /* Other ways of getting out of the inner loop.  Placed out-of-line to
    make it look cleaner. 
 */
-dispatch_exceptional:
+.dispatch_exceptional:
 	/* this is jumped to only, not fallen-through from above */
 	/* save r30 in %CIA and defer to sched */
-        lwz     16,152(1)
-        stw     30,OFFSET_ppc64_CIA(16)
-        b       run_innerloop_exit
+        ld      16,152(1)
+        std     30,OFFSET_ppc64_CIA(16)
+        b       .run_innerloop_exit
 
-fast_lookup_failed:
+.fast_lookup_failed:
 	/* %CIA is up to date here since dispatch_boring dominates */
         mfctr   17
         addi    17,17,1
 	mtctr   17
         li      3,VG_TRC_INNER_FASTMISS
-	b       run_innerloop_exit
+	b       .run_innerloop_exit
 
-counter_is_zero:
+.counter_is_zero:
 	/* %CIA is up to date here since dispatch_boring dominates */
         mfctr   17
         addi    17,17,1
 	mtctr   17
         li      3,VG_TRC_INNER_COUNTERZERO
-        b       run_innerloop_exit
+        b       .run_innerloop_exit
 
 /* Let the linker know we don't need an executable stack */
 .section .note.GNU-stack,"",@progbits
diff --git a/coregrind/m_main.c b/coregrind/m_main.c
index c01ee65..cb6892e 100644
--- a/coregrind/m_main.c
+++ b/coregrind/m_main.c
@@ -1743,7 +1743,7 @@
    arch->vex.guest_CIA  = client_ip;
 
 #elif defined(VGA_ppc64)
-   vg_assert(0 == sizeof(VexGuestPPC64State) % 8);
+   vg_assert(0 == sizeof(VexGuestPPC64State) % 16);
 
    /* Zero out the initial state, and set up the simulated FPU in a
       sane way. */
@@ -2865,9 +2865,17 @@
 #elif defined(VGP_ppc64_linux)
 asm("\n"
     ".text\n"
-    "\t.globl _start\n"
-    "\t.type _start,@function\n"
+    /* PPC64 ELF ABI says '_start' points to a function descriptor.
+       So we must have one, and that is what goes into the .opd section. */
+    "\t.global _start\n"
+    "\t.section \".opd\",\"aw\"\n"
+    "\t.align 3\n"
     "_start:\n"
+    "\t.quad ._start,.TOC.@tocbase,0\n"
+    "\t.previous\n"
+    "\t.type ._start,@function\n"
+    "\t.global  ._start\n"
+    "._start:\n"
     /* set up the new stack in r16 */
     "\tlis  16,   vgPlain_interim_stack@highest\n"
     "\tori  16,16,vgPlain_interim_stack@higher\n"
@@ -2889,7 +2897,8 @@
        call _start_in_C, passing it the initial SP. */
     "\tmr 3,1\n"
     "\tmr 1,16\n"
-    "\tbl _start_in_C\n"
+    "\tbl ._start_in_C\n"
+    "\tnop\n"
     "\ttrap\n"
     ".previous\n"
 );
diff --git a/coregrind/m_syscall.c b/coregrind/m_syscall.c
index 28a79c0..ae28f49 100644
--- a/coregrind/m_syscall.c
+++ b/coregrind/m_syscall.c
@@ -231,7 +231,6 @@
 "        sc\n"             /* result in r3 and cr0.so */
 "        ld   5,-16(1)\n"  /* reacquire argblock ptr (r5 is caller-save) */
 "        std  3,0(5)\n"    /* argblock[0] = r3 */
-"        xor  3,3,3\n"
 "        mfcr 3\n"
 "        srwi 3,3,28\n"
 "        andi. 3,3,1\n"
diff --git a/coregrind/m_syswrap/syswrap-linux.c b/coregrind/m_syswrap/syswrap-linux.c
index 866f0c5..c6587c7 100644
--- a/coregrind/m_syswrap/syswrap-linux.c
+++ b/coregrind/m_syswrap/syswrap-linux.c
@@ -264,6 +264,11 @@
    sp -= 16;
    sp &= ~0xF;
    *(UWord *)sp = 0;
+#elif defined(VGP_ppc64_linux)
+   /* make a stack frame */
+   sp -= 112;
+   sp &= ~((Addr)0xF);
+   *(UWord *)sp = 0;
 #endif
 
    /* If we can't even allocate the first thread's stack, we're hosed.
diff --git a/coregrind/m_syswrap/syswrap-ppc64-linux.c b/coregrind/m_syswrap/syswrap-ppc64-linux.c
index 017bc27..afbea0a 100644
--- a/coregrind/m_syswrap/syswrap-ppc64-linux.c
+++ b/coregrind/m_syswrap/syswrap-ppc64-linux.c
@@ -63,19 +63,31 @@
 __attribute__((noreturn))
 void ML_(call_on_new_stack_0_1) ( Addr stack,
                                   Addr retaddr,
-                                  void (*f)(Word),
+                                  void (*f_desc)(Word),
                                   Word arg1 );
 //    r3 = stack
 //    r4 = retaddr
-//    r5 = f
+//    r5 = function descriptor
 //    r6 = arg1
+/* On PPC64, a func ptr is represented by a pointer to a function
+   descriptor (in the .opd section).  The descriptor contains three
+   doublewords: the first is the function's entry point address, the
+   second is the TOC ptr (r2), and the third is the static chain value. */
 asm(
 ".text\n"
-".globl .vgModuleLocal_call_on_new_stack_0_1\n"
+"   .globl   vgModuleLocal_call_on_new_stack_0_1\n"
+"   .section \".opd\",\"aw\"\n"
+"   .align   3\n"
+"vgModuleLocal_call_on_new_stack_0_1:\n"
+"   .quad    .vgModuleLocal_call_on_new_stack_0_1,.TOC.@tocbase,0\n"
+"   .previous\n"
+"   .type    .vgModuleLocal_call_on_new_stack_0_1,@function\n"
+"   .globl   .vgModuleLocal_call_on_new_stack_0_1\n"
 ".vgModuleLocal_call_on_new_stack_0_1:\n"
 "   mr    %r1,%r3\n\t"     // stack to %sp
 "   mtlr  %r4\n\t"         // retaddr to %lr
-"   mtctr %r5\n\t"         // f to count reg
+"   ld 5,0(5)\n\t"         // load f_ptr from f_desc[0]
+"   mtctr %r5\n\t"         // f_ptr to count reg
 "   mr %r3,%r6\n\t"        // arg1 to %r3
 "   li 0,0\n\t"            // zero all GP regs
 "   li 4,0\n\t"
diff --git a/include/vki-ppc64-linux.h b/include/vki-ppc64-linux.h
index 41f4792..e31f1d4 100644
--- a/include/vki-ppc64-linux.h
+++ b/include/vki-ppc64-linux.h
@@ -399,10 +399,10 @@
 struct vki_stat {
   unsigned long   st_dev;
   unsigned long   st_ino;
-  unsigned short  st_nlink;
-  unsigned short  st_mode;
-  unsigned short  st_uid;
-  unsigned short  st_gid;
+  unsigned long   st_nlink;
+  unsigned int    st_mode;
+  unsigned int    st_uid;
+  unsigned int    st_gid;
   unsigned long   st_rdev;
   long            st_size;
   unsigned long   st_blksize;
diff --git a/memcheck/mc_main.c b/memcheck/mc_main.c
index e7d8fc3..91ea81d 100644
--- a/memcheck/mc_main.c
+++ b/memcheck/mc_main.c
@@ -1308,10 +1308,12 @@
 static void mc_post_reg_write ( CorePart part, ThreadId tid, 
                                 OffT offset, SizeT size)
 {
-   UChar area[1024];
-   tl_assert(size <= 1024);
+#  define MAX_REG_WRITE_SIZE 1120
+   UChar area[MAX_REG_WRITE_SIZE];
+   tl_assert(size <= MAX_REG_WRITE_SIZE);
    VG_(memset)(area, VGM_BYTE_VALID, size);
    VG_(set_shadow_regs_area)( tid, offset, size, area );
+#  undef MAX_REG_WRITE_SIZE
 }
 
 static