This commit is for Bugzilla 334834.  The Bugzilla contains patch 2 of 3
to add PPC64 LE support.  The other two patches can be found in Bugzillas
334384 and 334836.

POWER PC, add the functional Little Endian support, patch 2 

The IBM POWER processor now supports both Big Endian and Little Endian.
The ABI for Little Endian also changes.  Specifically, the function
descriptor is not used, the stack size changed, and the way the TOC is
accessed changed.  Functions now have a local and a global entry point.
Register r2 contains the TOC for local calls and register r12 contains
the TOC for global calls.  This patch makes the functional changes to
the Valgrind tool.  The patch also makes the changes needed for the
none/tests/ppc32 and none/tests/ppc64 Makefile.am files.  A number of
the ppc specific tests have Endian dependencies that are not fixed in
this patch.  They are fixed in the next patch.

Per Julian's comments, renamed coregrind/m_dispatch/dispatch-ppc64-linux.S
to coregrind/m_dispatch/dispatch-ppc64be-linux.S and created a new file for
LE, coregrind/m_dispatch/dispatch-ppc64le-linux.S.  The same was done for
coregrind/m_syswrap/syscall-ppc64-linux.S.

Signed-off-by: Carl Love <carll@us.ibm.com>

git-svn-id: svn://svn.valgrind.org/valgrind/trunk@14239 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/coregrind/Makefile.am b/coregrind/Makefile.am
index aabadcb..91ac84f 100644
--- a/coregrind/Makefile.am
+++ b/coregrind/Makefile.am
@@ -336,7 +336,8 @@
 	m_dispatch/dispatch-x86-linux.S \
 	m_dispatch/dispatch-amd64-linux.S \
 	m_dispatch/dispatch-ppc32-linux.S \
-	m_dispatch/dispatch-ppc64-linux.S \
+	m_dispatch/dispatch-ppc64be-linux.S \
+	m_dispatch/dispatch-ppc64le-linux.S \
 	m_dispatch/dispatch-arm-linux.S \
 	m_dispatch/dispatch-arm64-linux.S \
 	m_dispatch/dispatch-s390x-linux.S \
@@ -388,7 +389,8 @@
 	m_syswrap/syscall-x86-linux.S \
 	m_syswrap/syscall-amd64-linux.S \
 	m_syswrap/syscall-ppc32-linux.S \
-	m_syswrap/syscall-ppc64-linux.S \
+	m_syswrap/syscall-ppc64be-linux.S \
+	m_syswrap/syscall-ppc64le-linux.S \
 	m_syswrap/syscall-arm-linux.S \
 	m_syswrap/syscall-arm64-linux.S \
 	m_syswrap/syscall-s390x-linux.S \
diff --git a/coregrind/launcher-darwin.c b/coregrind/launcher-darwin.c
index 1f99026..8449285 100644
--- a/coregrind/launcher-darwin.c
+++ b/coregrind/launcher-darwin.c
@@ -64,6 +64,7 @@
    { CPU_TYPE_ARM,         "arm",     "arm" },
    { CPU_TYPE_POWERPC,     "ppc",     "ppc32" },
    { CPU_TYPE_POWERPC64BE, "ppc64be", "ppc64be" },
+   { CPU_TYPE_POWERPC64LE, "ppc64le", "ppc64le" },
 };
 static int valid_archs_count = sizeof(valid_archs)/sizeof(valid_archs[0]);
 
diff --git a/coregrind/launcher-linux.c b/coregrind/launcher-linux.c
index 38e4857..4f6b274 100644
--- a/coregrind/launcher-linux.c
+++ b/coregrind/launcher-linux.c
@@ -228,6 +228,10 @@
                 (ehdr->e_ident[EI_OSABI] == ELFOSABI_SYSV ||
                  ehdr->e_ident[EI_OSABI] == ELFOSABI_LINUX)) {
                platform = "arm64-linux";
+            } else if (ehdr->e_machine == EM_PPC64 &&
+                (ehdr->e_ident[EI_OSABI] == ELFOSABI_SYSV ||
+                 ehdr->e_ident[EI_OSABI] == ELFOSABI_LINUX)) {
+               platform = "ppc64le-linux";
             }
          } else if (header[EI_DATA] == ELFDATA2MSB) {
 #           if !defined(VGPV_arm_linux_android) \
@@ -321,6 +325,7 @@
        (0==strcmp(VG_PLATFORM,"amd64-linux"))  ||
        (0==strcmp(VG_PLATFORM,"ppc32-linux"))  ||
        (0==strcmp(VG_PLATFORM,"ppc64be-linux"))  ||
+       (0==strcmp(VG_PLATFORM,"ppc64le-linux"))  ||
        (0==strcmp(VG_PLATFORM,"arm-linux"))    ||
        (0==strcmp(VG_PLATFORM,"arm64-linux"))  ||
        (0==strcmp(VG_PLATFORM,"s390x-linux"))  ||
diff --git a/coregrind/m_coredump/coredump-elf.c b/coregrind/m_coredump/coredump-elf.c
index b5dd902..b125e7c 100644
--- a/coregrind/m_coredump/coredump-elf.c
+++ b/coregrind/m_coredump/coredump-elf.c
@@ -343,6 +343,27 @@
    regs->dsisr = 0;
    regs->result = 0;
 
+#elif defined(VGP_ppc64le_linux)
+#  define DO(n)  regs->gpr[n] = arch->vex.guest_GPR##n
+   DO(0);  DO(1);  DO(2);  DO(3);  DO(4);  DO(5);  DO(6);  DO(7);
+   DO(8);  DO(9);  DO(10); DO(11); DO(12); DO(13); DO(14); DO(15);
+   DO(16); DO(17); DO(18); DO(19); DO(20); DO(21); DO(22); DO(23);
+   DO(24); DO(25); DO(26); DO(27); DO(28); DO(29); DO(30); DO(31);
+#  undef DO
+
+   regs->nip = arch->vex.guest_CIA;
+   regs->msr = 0xf033;   /* pretty arbitrary */
+   regs->orig_gpr3 = arch->vex.guest_GPR3;
+   regs->ctr = arch->vex.guest_CTR;
+   regs->link = arch->vex.guest_LR;
+   regs->xer = LibVEX_GuestPPC64_get_XER( &((ThreadArchState*)arch)->vex );
+   regs->ccr = LibVEX_GuestPPC64_get_CR( &((ThreadArchState*)arch)->vex );
+   /* regs->mq = 0; */
+   regs->trap = 0;
+   regs->dar = 0; /* should be fault address? */
+   regs->dsisr = 0;
+   regs->result = 0;
+
 #elif defined(VGP_arm_linux)
    regs->ARM_r0   = arch->vex.guest_R0;
    regs->ARM_r1   = arch->vex.guest_R1;
diff --git a/coregrind/m_debuginfo/debuginfo.c b/coregrind/m_debuginfo/debuginfo.c
index e311f5e..e5114b6 100644
--- a/coregrind/m_debuginfo/debuginfo.c
+++ b/coregrind/m_debuginfo/debuginfo.c
@@ -1,4 +1,5 @@
 
+
 /*--------------------------------------------------------------------*/
 /*--- Top level management of symbols and debugging information.   ---*/
 /*---                                                  debuginfo.c ---*/
@@ -4138,6 +4139,7 @@
                                         Int idx,
                                   /*OUT*/Addr*    avma,
                                   /*OUT*/Addr*    tocptr,
+                                  /*OUT*/Addr*    local_ep,
                                   /*OUT*/UInt*    size,
                                   /*OUT*/HChar**  pri_name,
                                   /*OUT*/HChar*** sec_names,
@@ -4147,6 +4149,7 @@
    vg_assert(idx >= 0 && idx < si->symtab_used);
    if (avma)      *avma      = si->symtab[idx].addr;
    if (tocptr)    *tocptr    = si->symtab[idx].tocptr;
+   if (local_ep)  *local_ep  = si->symtab[idx].local_ep;
    if (size)      *size      = si->symtab[idx].size;
    if (pri_name)  *pri_name  = si->symtab[idx].pri_name;
    if (sec_names) *sec_names = (HChar **)si->symtab[idx].sec_names; // FIXME
diff --git a/coregrind/m_debuginfo/priv_storage.h b/coregrind/m_debuginfo/priv_storage.h
index bcc0b62..30491ec 100644
--- a/coregrind/m_debuginfo/priv_storage.h
+++ b/coregrind/m_debuginfo/priv_storage.h
@@ -71,7 +71,8 @@
 typedef 
    struct { 
       Addr    addr;    /* lowest address of entity */
-      Addr    tocptr;  /* ppc64-linux only: value that R2 should have */
+      Addr    tocptr;  /* ppc64be-linux only: value that R2 should have */
+      Addr    local_ep;  /* address for local entry point, ppc64le */
       HChar*  pri_name;  /* primary name, never NULL */
       HChar** sec_names; /* NULL, or a NULL term'd array of other names */
       // XXX: this could be shrunk (on 32-bit platforms) by using 30
@@ -816,7 +817,7 @@
    Bool   gotplt_present;
    Addr   gotplt_avma;
    SizeT  gotplt_size;
-   /* .opd -- needed on ppc64-linux for finding symbols */
+   /* .opd -- needed on ppc64be-linux for finding symbols */
    Bool   opd_present;
    Addr   opd_avma;
    SizeT  opd_size;
diff --git a/coregrind/m_debuginfo/readelf.c b/coregrind/m_debuginfo/readelf.c
index 627db42..7cbe4c7 100644
--- a/coregrind/m_debuginfo/readelf.c
+++ b/coregrind/m_debuginfo/readelf.c
@@ -241,7 +241,10 @@
         Bool*   from_opd_out,   /* ppc64be-linux only: did we deref an
                                   .opd entry? */
         Bool*   is_text_out,    /* is this a text symbol? */
-        Bool*   is_ifunc        /* is this a  STT_GNU_IFUNC function ?*/
+        Bool*   is_ifunc,       /* is this a  STT_GNU_IFUNC function ?*/
+        Addr*   sym_local_ep    /* addr for local entry point.  PPC64 LE
+                                   supports local and global entry points;
+                                   this returns the local one (0 if none). */
      )
 {
    Bool plausible;
@@ -259,6 +262,8 @@
    *sym_tocptr_out    = 0; /* unknown/inapplicable */
    *from_opd_out      = False;
    *is_ifunc          = False;
+   *sym_local_ep      = 0; /* unknown/inapplicable */
+
    /* Get the symbol size, but restrict it to fit in a signed 32 bit
       int.  Also, deal with the stupid case of negative size by making
       the size be 1.  Note that sym->st_size has type UWord,
@@ -671,14 +676,57 @@
    }
 
 #  if defined(VGP_ppc64be_linux)
-   /* It's crucial that we never add symbol addresses in the .opd
-      section.  This would completely mess up function redirection and
-      intercepting.  This assert ensures that any symbols that make it
-      into the symbol table on ppc64-linux don't point into .opd. */
    if (di->opd_present && di->opd_size > 0) {
       vg_assert(*sym_avma_out + *sym_size_out <= di->opd_avma
                 || *sym_avma_out >= di->opd_avma + di->opd_size);
    }
+#endif
+
+#  if defined(VGP_ppc64le_linux)
+   /* PPC64 LE ABI uses three bits in the st_other field to indicate the number
+    * of instructions between the function's global and local entry points. An
+    * offset of 0 indicates that there is one entry point.  The value must be:
+    *
+    * 0  - one entry point, local and global are the same
+    * 1  - reserved
+    * 2  - local entry point is one instruction after the global entry point
+    * 3  - local entry point is two instructions after the global entry point
+    * 4  - local entry point is four instructions after the global entry point
+    * 5  - local entry point is eight instructions after the global entry point
+    * 6  - local entry point is sixteen instructions after the global entry point
+    * 7  - reserved
+    *
+    *  To extract the three bit field from the st_other field:
+    *        (other_field & STO_PPC64_LOCAL_MASK) >> STO_PPC64_LOCAL_BIT
+    *
+    *  where the #define values are given in include/elf/powerpc.h file for
+    *  the PPC binutils.
+    *
+    * Conversion of the three bit field to bytes is given by
+    *
+    *       ((1 << bit_field) >> 2) << 2
+    */
+
+   #define STO_PPC64_LOCAL_BIT             5
+   #define STO_PPC64_LOCAL_MASK            (7 << STO_PPC64_LOCAL_BIT)
+   {
+      unsigned int bit_field, dist_to_local_entry;
+      /* extract the three bit local entry field from st_other */
+      bit_field = (sym->st_other & STO_PPC64_LOCAL_MASK) >> STO_PPC64_LOCAL_BIT;
+      if ((bit_field > 0) && (bit_field < 7)) {
+         /* store the local entry point address */
+         dist_to_local_entry = ((1 << bit_field) >> 2) << 2;
+         *sym_local_ep = *sym_avma_out + dist_to_local_entry;
+         /* Trace the full-width address; release the name copy after use. */
+         if (TRACE_SYMTAB_ENABLED) {
+            HChar* sym_name = ML_(img_strdup)(escn_strtab->img,
+                                             "di.gesi.5", sym_name_ioff);
+            VG_(printf)("Local entry point: %s at %#010lx\n",
+                        sym_name, *sym_local_ep);
+            ML_(dinfo_free)(sym_name);
+         }
+      }
+   }
 #  endif
 
    /* Acquire! */
@@ -687,7 +735,7 @@
 
 
 /* Read an ELF symbol table (normal or dynamic).  This one is for the
-   "normal" case ({x86,amd64,ppc32,arm,mips32,mips64}-linux). */
+   "normal" case ({x86,amd64,ppc32,arm,mips32,mips64, ppc64le}-linux). */
 static
 __attribute__((unused)) /* not referred to on all targets */
 void read_elf_symtab__normal( 
@@ -726,6 +774,7 @@
       Addr   sym_avma_really = 0;
       Int    sym_size = 0;
       Addr   sym_tocptr = 0;
+      Addr   local_ep = 0;
       Bool   from_opd = False, is_text = False, is_ifunc = False;
       DiOffT sym_name_really = DiOffT_INVALID;
       if (get_elf_symbol_info(di, &sym, sym_name, escn_strtab, 
@@ -735,7 +784,8 @@
                               &sym_avma_really,
                               &sym_size,
                               &sym_tocptr,
-                              &from_opd, &is_text, &is_ifunc)) {
+                              &from_opd, &is_text, &is_ifunc,
+                              &local_ep)) {
 
          DiSym  disym;
          VG_(memset)(&disym, 0, sizeof(disym));
@@ -743,6 +793,7 @@
                                        "di.res__n.1", sym_name_really);
          disym.addr      = sym_avma_really;
          disym.tocptr    = sym_tocptr;
+         disym.local_ep  = local_ep;
          disym.pri_name  = ML_(addStr) ( di, cstr, -1 );
          disym.sec_names = NULL;
          disym.size      = sym_size;
@@ -750,7 +801,7 @@
          disym.isIFunc   = is_ifunc;
          if (cstr) { ML_(dinfo_free)(cstr); cstr = NULL; }
          vg_assert(disym.pri_name);
-         vg_assert(disym.tocptr == 0); /* has no role except on ppc64-linux */
+         vg_assert(disym.tocptr == 0); /* has no role except on ppc64be-linux */
          ML_(addSym) ( di, &disym );
 
          if (TRACE_SYMTAB_ENABLED) {
@@ -762,6 +813,10 @@
                          (Int)disym.size,
                          (HChar*)disym.pri_name
             );
+	    if (local_ep != 0) {
+               TRACE_SYMTAB("               local entry point %#010lx\n",
+                            local_ep)
+	    }
          }
 
       }
@@ -857,6 +912,7 @@
       Addr   sym_avma_really = 0;
       Int    sym_size = 0;
       Addr   sym_tocptr = 0;
+      Addr   sym_local_ep = 0;
       Bool   from_opd = False, is_text = False, is_ifunc = False;
       DiOffT sym_name_really = DiOffT_INVALID;
       DiSym  disym;
@@ -868,7 +924,8 @@
                               &sym_avma_really,
                               &sym_size,
                               &sym_tocptr,
-                              &from_opd, &is_text, &is_ifunc)) {
+                              &from_opd, &is_text, &is_ifunc,
+                              &sym_local_ep)) {
 
          /* Check if we've seen this (name,addr) key before. */
          key.addr = sym_avma_really;
@@ -2825,6 +2882,7 @@
 #     if !defined(VGP_amd64_linux) \
          && !defined(VGP_s390x_linux) \
          && !defined(VGP_ppc64be_linux) \
+         && !defined(VGP_ppc64le_linux) \
          && !defined(VGPV_arm_linux_android) \
          && !defined(VGPV_x86_linux_android) \
          && !defined(VGP_mips64_linux)
diff --git a/coregrind/m_debuginfo/readmacho.c b/coregrind/m_debuginfo/readmacho.c
index 9926237..95b665c 100644
--- a/coregrind/m_debuginfo/readmacho.c
+++ b/coregrind/m_debuginfo/readmacho.c
@@ -199,6 +199,8 @@
          Int cputype = CPU_TYPE_POWERPC;
 #        elif defined(VGA_ppc64be)
          Int cputype = CPU_TYPE_POWERPC64BE;
+#        elif defined(VGA_ppc64le)
+         Int cputype = CPU_TYPE_POWERPC64LE;
 #        elif defined(VGA_x86)
          Int cputype = CPU_TYPE_X86;
 #        elif defined(VGA_amd64)
diff --git a/coregrind/m_debuginfo/storage.c b/coregrind/m_debuginfo/storage.c
index 440b469..5abe0fb 100644
--- a/coregrind/m_debuginfo/storage.c
+++ b/coregrind/m_debuginfo/storage.c
@@ -199,7 +199,7 @@
    SHOW_HOW(si_m->r11_how, si_m->r11_off);
    VG_(printf)(" R7=");
    SHOW_HOW(si_m->r7_how, si_m->r7_off);
-#  elif defined(VGA_ppc32) || defined(VGA_ppc64be) || defined(VGA_ppc64be)
+#  elif defined(VGA_ppc32) || defined(VGA_ppc64be) || defined(VGA_ppc64le)
 #  elif defined(VGA_s390x) || defined(VGA_mips32) || defined(VGA_mips64)
    VG_(printf)(" SP=");
    SHOW_HOW(si_m->sp_how, si_m->sp_off);
diff --git a/coregrind/m_dispatch/dispatch-ppc64-linux.S b/coregrind/m_dispatch/dispatch-ppc64be-linux.S
similarity index 98%
rename from coregrind/m_dispatch/dispatch-ppc64-linux.S
rename to coregrind/m_dispatch/dispatch-ppc64be-linux.S
index 35cefdf..945fa29 100644
--- a/coregrind/m_dispatch/dispatch-ppc64-linux.S
+++ b/coregrind/m_dispatch/dispatch-ppc64be-linux.S
@@ -28,7 +28,7 @@
   The GNU General Public License is contained in the file COPYING.
 */
 
-#if defined(VGP_ppc64be_linux) || defined(VGP_ppc64le_linux)
+#if defined(VGP_ppc64be_linux)
 
 #include "pub_core_basics_asm.h"
 #include "pub_core_dispatch_asm.h"
@@ -530,7 +530,7 @@
 /* Let the linker know we don't need an executable stack */
 .section .note.GNU-stack,"",@progbits
 
-#endif // defined(VGP_ppc64be_linux) || defined(VGP_ppc64le_linux)
+#endif // defined(VGP_ppc64be_linux)
 
 /*--------------------------------------------------------------------*/
 /*--- end                                                          ---*/
diff --git a/coregrind/m_dispatch/dispatch-ppc64-linux.S b/coregrind/m_dispatch/dispatch-ppc64le-linux.S
similarity index 83%
copy from coregrind/m_dispatch/dispatch-ppc64-linux.S
copy to coregrind/m_dispatch/dispatch-ppc64le-linux.S
index 35cefdf..7e5bc0d 100644
--- a/coregrind/m_dispatch/dispatch-ppc64-linux.S
+++ b/coregrind/m_dispatch/dispatch-ppc64le-linux.S
@@ -28,13 +28,22 @@
   The GNU General Public License is contained in the file COPYING.
 */
 
-#if defined(VGP_ppc64be_linux) || defined(VGP_ppc64le_linux)
+#if  defined(VGP_ppc64le_linux)
 
 #include "pub_core_basics_asm.h"
 #include "pub_core_dispatch_asm.h"
 #include "pub_core_transtab_asm.h"
 #include "libvex_guest_offsets.h"	/* for OFFSET_ppc64_CIA */
 
+/* NOTE: PPC64 supports Big Endian and Little Endian.  It also supports the
+	ELF version 1 and ELF version 2 ABIs.
+
+	Currently LE uses ELF version 2 and BE uses ELF version 1.  However,
+	BE and LE may support the other ELF version in the future.  So, the
+	_CALL_ELF macro is used in the assembly functions to enable code for a
+	specific ELF version independently of the Endianness of the machine.
+	The test "#if  _CALL_ELF == 2" checks if ELF version 2 is being used.
+*/
 
 /* References to globals via the TOC */
 
@@ -74,14 +83,26 @@
 .section ".text"
 .align   2
 .globl   VG_(disp_run_translations)
+#if _CALL_ELF == 2
+.type VG_(disp_run_translations),@function
+VG_(disp_run_translations):
+.type    .VG_(disp_run_translations),@function
+#else
 .section ".opd","aw"
 .align   3
 VG_(disp_run_translations):
 .quad    .VG_(disp_run_translations),.TOC.@tocbase,0
 .previous
 .type    .VG_(disp_run_translations),@function
+#endif
 .globl   .VG_(disp_run_translations)
 .VG_(disp_run_translations):
+#if  _CALL_ELF == 2
+0:      addis 2, 12,.TOC.-0b@ha
+        addi  2,2,.TOC.-0b@l
+        .localentry VG_(disp_run_translations), .-VG_(disp_run_translations)
+#endif
+
 	/* r3 holds two_words */
 	/* r4 holds guest_state */
         /* r5 holds host_addr */
@@ -228,8 +249,13 @@
         /* make a stack frame for the code we are calling */
         stdu    1,-48(1)
 
-        /* Set up the guest state ptr */
+	/* Set up the guest state ptr */
         mr      31,4      /* r31 (generated code gsp) = r4 */
+#if  _CALL_ELF == 2
+/*  for the LE ABI need to setup r2 and r12 */
+0:      addis 2, 12,.TOC.-0b@ha
+        addi  2,2,.TOC.-0b@l
+#endif
 
         /* and jump into the code cache.  Chained translations in
            the code cache run, until for whatever reason, they can't
@@ -384,6 +410,9 @@
         mtlr    0
         addi    1,1,624   /* stack_size */
         blr
+#if _CALL_ELF == 2
+	.size VG_(disp_run_translations),.-VG_(disp_run_translations)
+#endif
 
 
 /*----------------------------------------------------*/
@@ -394,15 +423,25 @@
         .section ".text"
         .align   2
         .globl   VG_(disp_cp_chain_me_to_slowEP)
-        .section ".opd","aw"
+#if  _CALL_ELF == 2
+        .type VG_(disp_cp_chain_me_to_slowEP),@function
+	VG_(disp_cp_chain_me_to_slowEP):
+#else
+	.section ".opd","aw"
         .align   3
 VG_(disp_cp_chain_me_to_slowEP):
         .quad    .VG_(disp_cp_chain_me_to_slowEP),.TOC.@tocbase,0
         .previous
+#endif
         .type    .VG_(disp_cp_chain_me_to_slowEP),@function
         .globl   .VG_(disp_cp_chain_me_to_slowEP)
 .VG_(disp_cp_chain_me_to_slowEP):
-        /* We got called.  The return address indicates
+#if  _CALL_ELF == 2
+0:      addis 2, 12,.TOC.-0b@ha
+        addi  2,2,.TOC.-0b@l
+        .localentry VG_(disp_cp_chain_me_to_slowEP), .-VG_(disp_cp_chain_me_to_slowEP)
+#endif
+	/* We got called.  The return address indicates
            where the patching needs to happen.  Collect
            the return address and, exit back to C land,
            handing the caller the pair (Chain_me_S, RA) */
@@ -414,20 +453,33 @@
         */
         subi 7,7,20+4+4
         b    .postamble
+#if  _CALL_ELF == 2
+        .size VG_(disp_cp_chain_me_to_slowEP),.-VG_(disp_cp_chain_me_to_slowEP)
+#endif
 
 /* ------ Chain me to fast entry point ------ */
         .section ".text"
         .align   2
         .globl   VG_(disp_cp_chain_me_to_fastEP)
-        .section ".opd","aw"
+#if  _CALL_ELF == 2
+        .type VG_(disp_cp_chain_me_to_fastEP),@function
+VG_(disp_cp_chain_me_to_fastEP):
+#else
+	.section ".opd","aw"
         .align   3
 VG_(disp_cp_chain_me_to_fastEP):
         .quad    .VG_(disp_cp_chain_me_to_fastEP),.TOC.@tocbase,0
         .previous
+#endif
         .type    .VG_(disp_cp_chain_me_to_fastEP),@function
         .globl   .VG_(disp_cp_chain_me_to_fastEP)
 .VG_(disp_cp_chain_me_to_fastEP):
-        /* We got called.  The return address indicates
+#if  _CALL_ELF == 2
+0:      addis 2, 12,.TOC.-0b@ha
+        addi  2,2,.TOC.-0b@l
+        .localentry VG_(disp_cp_chain_me_to_fastEP), .-VG_(disp_cp_chain_me_to_fastEP)
+#endif
+	/* We got called.  The return address indicates
            where the patching needs to happen.  Collect
            the return address and, exit back to C land,
            handing the caller the pair (Chain_me_S, RA) */
@@ -439,20 +491,33 @@
         */
         subi 7,7,20+4+4
         b    .postamble
+#if _CALL_ELF == 2
+        .size VG_(disp_cp_chain_me_to_fastEP),.-VG_(disp_cp_chain_me_to_fastEP)
+#endif
 
 /* ------ Indirect but boring jump ------ */
         .section ".text"
         .align   2
         .globl   VG_(disp_cp_xindir)
-        .section ".opd","aw"
+#if _CALL_ELF == 2
+        .type VG_(disp_cp_xindir),@function
+VG_(disp_cp_xindir):
+#else
+	.section ".opd","aw"
         .align   3
 VG_(disp_cp_xindir):
         .quad    .VG_(disp_cp_xindir),.TOC.@tocbase,0
         .previous
+#endif
         .type    .VG_(disp_cp_xindir),@function
         .globl   .VG_(disp_cp_xindir)
 .VG_(disp_cp_xindir):
-        /* Where are we going? */
+#if  _CALL_ELF == 2
+0:      addis 2, 12,.TOC.-0b@ha
+        addi  2,2,.TOC.-0b@l
+        .localentry VG_(disp_cp_xindir), .-VG_(disp_cp_xindir)
+#endif
+	/* Where are we going? */
         ld      3,OFFSET_ppc64_CIA(31)
 
         /* stats only */
@@ -478,6 +543,9 @@
         /* Found a match.  Jump to .host. */
         mtctr   7
         bctr
+#if _CALL_ELF == 2
+        .size VG_(disp_cp_xindir),.-VG_(disp_cp_xindir)
+#endif
 
 .fast_lookup_failed:
         /* stats only */
@@ -495,36 +563,61 @@
 .section ".text"
         .align   2
         .globl   VG_(disp_cp_xassisted)
-        .section ".opd","aw"
+#if _CALL_ELF == 2
+        .type VG_(disp_cp_xassisted),@function
+VG_(disp_cp_xassisted):
+#else
+	.section ".opd","aw"
         .align   3
 VG_(disp_cp_xassisted):
         .quad    .VG_(disp_cp_xassisted),.TOC.@tocbase,0
         .previous
-        .type    .VG_(disp_cp_xassisted),@function
+#endif
+#if  _CALL_ELF == 2
+0:      addis 2, 12,.TOC.-0b@ha
+        addi  2,2,.TOC.-0b@l
+        .localentry VG_(disp_cp_xassisted), .-VG_(disp_cp_xassisted)
+#endif
+	.type    .VG_(disp_cp_xassisted),@function
         .globl   .VG_(disp_cp_xassisted)
 .VG_(disp_cp_xassisted):
         /* r31 contains the TRC */
         mr      6,31
         li      7,0
         b       .postamble
+#if _CALL_ELF == 2
+        .size VG_(disp_cp_xassisted),.-VG_(disp_cp_xassisted)
+#endif
 
 /* ------ Event check failed ------ */
         .section ".text"
         .align   2
         .globl   VG_(disp_cp_evcheck_fail)
-        .section ".opd","aw"
+#if _CALL_ELF == 2
+        .type VG_(disp_cp_evcheck_fail),@function
+VG_(disp_cp_evcheck_fail):
+#else
+	.section ".opd","aw"
         .align   3
 VG_(disp_cp_evcheck_fail):
         .quad    .VG_(disp_cp_evcheck_fail),.TOC.@tocbase,0
         .previous
+#endif
+#if  _CALL_ELF == 2
+0:      addis 2, 12,.TOC.-0b@ha
+        addi  2,2,.TOC.-0b@l
+        .localentry VG_(disp_cp_evcheck_fail), .-VG_(disp_cp_evcheck_fail)
+#endif
         .type    .VG_(disp_cp_evcheck_fail),@function
         .globl   .VG_(disp_cp_evcheck_fail)
 .VG_(disp_cp_evcheck_fail):
         li      6,VG_TRC_INNER_COUNTERZERO
         li      7,0
         b       .postamble
+#if  _CALL_ELF == 2
+       .size VG_(disp_cp_evcheck_fail),.-VG_(disp_cp_evcheck_fail)
+#endif
 
-        
 .size .VG_(disp_run_translations), .-.VG_(disp_run_translations)
 
 /* Let the linker know we don't need an executable stack */
diff --git a/coregrind/m_initimg/initimg-linux.c b/coregrind/m_initimg/initimg-linux.c
index c021f56..400fa1b 100644
--- a/coregrind/m_initimg/initimg-linux.c
+++ b/coregrind/m_initimg/initimg-linux.c
@@ -1043,6 +1043,9 @@
    arch->vex.guest_GPR1 = iifii.initial_client_SP;
    arch->vex.guest_GPR2 = iifii.initial_client_TOC;
    arch->vex.guest_CIA  = iifii.initial_client_IP;
+#if defined(VGP_ppc64le_linux)
+   arch->vex.guest_GPR12 = iifii.initial_client_IP;
+#endif
 
 #  elif defined(VGP_arm_linux)
    /* Zero out the initial state, and set up the simulated FPU in a
diff --git a/coregrind/m_libcsetjmp.c b/coregrind/m_libcsetjmp.c
index 488e8e8..df56e3b 100644
--- a/coregrind/m_libcsetjmp.c
+++ b/coregrind/m_libcsetjmp.c
@@ -158,7 +158,6 @@
 ".align 2"                          "\n"
 ".p2align 4,,15"                    "\n"
 ".globl VG_MINIMAL_SETJMP"          "\n"
-
 ".section \".opd\",\"aw\""          "\n"
 ".align 3"                          "\n"
 "VG_MINIMAL_SETJMP:"                "\n"
@@ -267,9 +266,114 @@
 ""       "\n"
 
 ".previous"  "\n"
-".previous"  "\n"
 );
 
+#elif defined(VGP_ppc64le_linux)
+__asm__(
+".section \".toc\",\"aw\""          "\n"
+
+".section \".text\""                "\n"
+".align 2"                          "\n"
+".p2align 4,,15"                    "\n"
+".globl VG_MINIMAL_SETJMP"          "\n"
+".type VG_MINIMAL_SETJMP,@function" "\n"
+"VG_MINIMAL_SETJMP:"                "\n"
+"       .localentry VG_MINIMAL_SETJMP, .-VG_MINIMAL_SETJMP" "\n"
+"        std     0, 0(3)"  "\n"
+"        std     1, 8(3)"  "\n"
+"        std     2, 16(3)"  "\n"
+"        std     3, 24(3)"  "\n"
+"        std     4, 32(3)"  "\n"
+"        std     5, 40(3)"  "\n"
+"        std     6, 48(3)"  "\n"
+"        std     7, 56(3)"  "\n"
+"        std     8, 64(3)"  "\n"
+"        std     9, 72(3)"  "\n"
+"        std     10, 80(3)"  "\n"
+"        std     11, 88(3)"  "\n"
+"        std     12, 96(3)"  "\n"
+"        std     13, 104(3)"  "\n"
+"        std     14, 112(3)"  "\n"
+"        std     15, 120(3)"  "\n"
+"        std     16, 128(3)"  "\n"
+"        std     17, 136(3)"  "\n"
+"        std     18, 144(3)"  "\n"
+"        std     19, 152(3)"  "\n"
+"        std     20, 160(3)"  "\n"
+"        std     21, 168(3)"  "\n"
+"        std     22, 176(3)"  "\n"
+"        std     23, 184(3)"  "\n"
+"        std     24, 192(3)"  "\n"
+"        std     25, 200(3)"  "\n"
+"        std     26, 208(3)"  "\n"
+"        std     27, 216(3)"  "\n"
+"        std     28, 224(3)"  "\n"
+"        std     29, 232(3)"  "\n"
+"        std     30, 240(3)"  "\n"
+"        std     31, 248(3)"  "\n"
+// must use a caller-save register here as scratch, hence r4
+"        mflr    4"  "\n"
+"        std     4, 256(3)"  "\n"
+"        mfcr    4"  "\n"
+"        std     4, 264(3)"  "\n"
+"        li      3, 0"  "\n"
+"        blr"  "\n"
+""       "\n"
+
+
+".globl VG_MINIMAL_LONGJMP"                "\n"
+".type   VG_MINIMAL_LONGJMP, @function"    "\n"
+"VG_MINIMAL_LONGJMP:"                      "\n"
+"        .localentry VG_MINIMAL_LONGJMP, .-VG_MINIMAL_LONGJMP" "\n"
+         // do r4 = 1
+         // and park it in the restore slot for r3 (the ret reg)
+"        li      4, 1"  "\n"
+"        std     4, 24(3)"  "\n"
+         // restore everything except r3
+         // then r3 last of all
+         // then blr
+"        ld      0, 256(3)"  "\n"
+"        mtlr    0"  "\n"
+"        ld      0, 264(3)"  "\n"
+"        mtcr    0"  "\n"
+"        ld      0, 0(3)"  "\n"
+"        ld      1, 8(3)"  "\n"
+"        ld      2, 16(3)"  "\n"
+         // r3 is done at the end
+"        ld      4, 32(3)"  "\n"
+"        ld      5, 40(3)"  "\n"
+"        ld      6, 48(3)"  "\n"
+"        ld      7, 56(3)"  "\n"
+"        ld      8, 64(3)"  "\n"
+"        ld      9, 72(3)"  "\n"
+"        ld      10, 80(3)"  "\n"
+"        ld      11, 88(3)"  "\n"
+"        ld      12, 96(3)"  "\n"
+"        ld      13, 104(3)"  "\n"
+"        ld      14, 112(3)"  "\n"
+"        ld      15, 120(3)"  "\n"
+"        ld      16, 128(3)"  "\n"
+"        ld      17, 136(3)"  "\n"
+"        ld      18, 144(3)"  "\n"
+"        ld      19, 152(3)"  "\n"
+"        ld      20, 160(3)"  "\n"
+"        ld      21, 168(3)"  "\n"
+"        ld      22, 176(3)"  "\n"
+"        ld      23, 184(3)"  "\n"
+"        ld      24, 192(3)"  "\n"
+"        ld      25, 200(3)"  "\n"
+"        ld      26, 208(3)"  "\n"
+"        ld      27, 216(3)"  "\n"
+"        ld      28, 224(3)"  "\n"
+"        ld      29, 232(3)"  "\n"
+"        ld      30, 240(3)"  "\n"
+"        ld      31, 248(3)"  "\n"
+"        ld      3, 24(3)"  "\n"
+"        blr"               "\n"
+""       "\n"
+
+".previous"  "\n"
+);
 #endif /* VGP_ppc64be_linux */
 
 
diff --git a/coregrind/m_machine.c b/coregrind/m_machine.c
index b59b43a..1c767b6 100644
--- a/coregrind/m_machine.c
+++ b/coregrind/m_machine.c
@@ -1188,9 +1188,13 @@
      VG_(machine_ppc64_has_VMX) = have_V ? 1 : 0;
 
      va = VexArchPPC64;
-     // CARLL fixme: when the time comes, copy .endness setting code
-     // from the VGA_mips32 case
+#    if defined(VKI_LITTLE_ENDIAN)
+     vai.endness = VexEndnessLE;
+#    elif defined(VKI_BIG_ENDIAN)
      vai.endness = VexEndnessBE;
+#    else
+     vai.endness = VexEndness_INVALID;
+#    endif
 
      vai.hwcaps = 0;
      if (have_V)  vai.hwcaps |= VEX_HWCAPS_PPC64_V;
diff --git a/coregrind/m_main.c b/coregrind/m_main.c
index 477cce0..af7a6d9 100644
--- a/coregrind/m_main.c
+++ b/coregrind/m_main.c
@@ -2641,6 +2641,10 @@
    VG_(set_IP)(tid, __libc_freeres_wrapper);
 #  if defined(VGP_ppc64be_linux)
    VG_(threads)[tid].arch.vex.guest_GPR2 = r2;
+#  elif  defined(VGP_ppc64le_linux)
+   /* setting GPR2 but not really needed, GPR12 is needed */
+   VG_(threads)[tid].arch.vex.guest_GPR2  = __libc_freeres_wrapper;
+   VG_(threads)[tid].arch.vex.guest_GPR12 = __libc_freeres_wrapper;
 #  endif
    /* mips-linux note: we need to set t9 */
 #  if defined(VGP_mips32_linux) || defined(VGP_mips64_linux)
@@ -2882,6 +2886,51 @@
     "\tnop\n"
     "\ttrap\n"
 );
+#elif defined(VGP_ppc64le_linux)
+/* Little Endian uses ELF version 2 but in the future may also
+ * support other ELF versions.
+ */
+asm("\n"
+    "\t.align 2\n"
+    "\t.global _start\n"
+    "\t.type _start,@function\n"
+    "_start:\n"
+    "#if _CALL_ELF == 2    \n"
+    "0:  addis        2,12,.TOC.-0b@ha\n"
+    "    addi         2,2,.TOC.-0b@l\n"
+    "    .localentry  _start, .-_start\n"
+    "#endif \n"
+    /* set up the new stack in r16 */
+    "\tlis  16,   vgPlain_interim_stack@highest\n"
+    "\tori  16,16,vgPlain_interim_stack@higher\n"
+    "\tsldi 16,16,32\n"
+    "\toris 16,16,vgPlain_interim_stack@h\n"
+    "\tori  16,16,vgPlain_interim_stack@l\n"
+    "\txor  17,17,17\n"
+    "\tlis    17,("VG_STRINGIFY(VG_STACK_GUARD_SZB)" >> 16)\n"
+    "\tori 17,17,("VG_STRINGIFY(VG_STACK_GUARD_SZB)" & 0xFFFF)\n"
+    "\txor 18,18,18\n"
+    "\tlis    18,("VG_STRINGIFY(VG_STACK_ACTIVE_SZB)" >> 16)\n"
+    "\tori 18,18,("VG_STRINGIFY(VG_STACK_ACTIVE_SZB)" & 0xFFFF)\n"
+    "\tadd 16,17,16\n"
+    "\tadd 16,18,16\n"
+    "\trldicr 16,16,0,59\n"
+    /* now r16 = &vgPlain_interim_stack + VG_STACK_GUARD_SZB +
+       VG_STACK_ACTIVE_SZB rounded down to the nearest 16-byte
+       boundary.  And r1 is the original SP.  Set the SP to r16 and
+       call _start_in_C_linux, passing it the initial SP. */
+    "\tmr 3,1\n"
+    "\tmr 1,16\n"
+    "\tlis  14,   _start_in_C_linux@highest\n"
+    "\tori  14,14,_start_in_C_linux@higher\n"
+    "\tsldi 14,14,32\n"
+    "\toris 14,14,_start_in_C_linux@h\n"
+    "\tori  14,14,_start_in_C_linux@l\n"
+    "\tmtctr 14\n"
+    "\tbctrl\n"
+    "\tnop\n"
+    "\ttrap\n"
+);
 #elif defined(VGP_s390x_linux)
 /*
     This is the canonical entry point, usually the first thing in the text
diff --git a/coregrind/m_redir.c b/coregrind/m_redir.c
index dc77119..de70360 100644
--- a/coregrind/m_redir.c
+++ b/coregrind/m_redir.c
@@ -397,7 +397,7 @@
    TopSpec*     newts;
    HChar*       sym_name_pri;
    HChar**      sym_names_sec;
-   Addr         sym_addr, sym_toc;
+   Addr         sym_addr, sym_toc, local_ep;
    HChar        demangled_sopatt[N_DEMANGLED];
    HChar        demangled_fnpatt[N_DEMANGLED];
    Bool         check_ppcTOCs = False;
@@ -499,7 +499,7 @@
 
    nsyms = VG_(DebugInfo_syms_howmany)( newdi );
    for (i = 0; i < nsyms; i++) {
-      VG_(DebugInfo_syms_getidx)( newdi, i, &sym_addr, &sym_toc,
+      VG_(DebugInfo_syms_getidx)( newdi, i, &sym_addr, &sym_toc, &local_ep,
                                   NULL, &sym_name_pri, &sym_names_sec,
                                   &isText, NULL );
       /* Set up to conveniently iterate over all names for this symbol. */
@@ -592,7 +592,7 @@
 
    if (check_ppcTOCs) {
       for (i = 0; i < nsyms; i++) {
-         VG_(DebugInfo_syms_getidx)( newdi, i, &sym_addr, &sym_toc,
+         VG_(DebugInfo_syms_getidx)( newdi, i, &sym_addr, &sym_toc, &local_ep,
                                      NULL, &sym_name_pri, &sym_names_sec,
                                      &isText, NULL );
          HChar*  twoslots[2];
@@ -755,7 +755,9 @@
       of trashing the caches less. */
    nsyms = VG_(DebugInfo_syms_howmany)( di );
    for (i = 0; i < nsyms; i++) {
-      VG_(DebugInfo_syms_getidx)( di, i, &sym_addr, NULL,
+      Addr local_ep = 0;
+
+      VG_(DebugInfo_syms_getidx)( di, i, &sym_addr, NULL, &local_ep,
                                   NULL, &sym_name_pri, &sym_names_sec,
                                   &isText, &isIFunc );
       HChar*  twoslots[2];
@@ -783,6 +785,18 @@
                act.isIFunc     = isIFunc;
                sp->done = True;
                maybe_add_active( act );
+
+               /* If the function being wrapped has a local entry point
+                * redirect it to the global entry point.  The redirection
+                * must save and setup r2 then setup r12 for the new function.
+                * On return, r2 must be restored.  Local entry points are
+                * used in PPC64 Little Endian.
+                */
+               if (local_ep != 0) {
+                  act.from_addr = local_ep;
+                  maybe_add_active( act );
+               }
+
             }
          } /* for (sp = specs; sp; sp = sp->next) */
 
@@ -1298,6 +1312,27 @@
       );
    }
 
+#  elif defined(VGP_ppc64le_linux)
+   /* If we're using memcheck, use these intercepts right from
+    * the start, otherwise ld.so makes a lot of noise.
+    */
+   if (0==VG_(strcmp)("Memcheck", VG_(details).name)) {
+
+      /* this is mandatory - can't sanely continue without it */
+      add_hardwired_spec(
+         "ld64.so.2", "strlen",
+         (Addr)&VG_(ppc64_linux_REDIR_FOR_strlen),
+         complain_about_stripped_glibc_ldso
+      );
+
+      add_hardwired_spec(
+         "ld64.so.2", "index",
+         (Addr)&VG_(ppc64_linux_REDIR_FOR_strchr),
+         NULL /* not mandatory - so why bother at all? */
+         /* glibc-2.5 (FC6, ppc64) seems fine without it */
+      );
+   }
+
 #  elif defined(VGP_arm_linux)
    /* If we're using memcheck, use these intercepts right from the
       start, otherwise ld.so makes a lot of noise.  In most ARM-linux
@@ -1569,7 +1604,7 @@
          Bool    isText        = False;
          HChar*  sym_name_pri  = NULL;
          HChar** sym_names_sec = NULL;
-         VG_(DebugInfo_syms_getidx)( di, j, NULL, NULL,
+         VG_(DebugInfo_syms_getidx)( di, j, NULL, NULL, NULL,
                                      NULL, &sym_name_pri, &sym_names_sec,
                                      &isText, NULL );
          HChar*  twoslots[2];
diff --git a/coregrind/m_sigframe/sigframe-ppc64-linux.c b/coregrind/m_sigframe/sigframe-ppc64-linux.c
index bb53806..17a3c50 100644
--- a/coregrind/m_sigframe/sigframe-ppc64-linux.c
+++ b/coregrind/m_sigframe/sigframe-ppc64-linux.c
@@ -252,7 +252,11 @@
 #  undef DO
 
    frame->uc.uc_mcontext.gp_regs[VKI_PT_NIP]     = tst->arch.vex.guest_CIA;
-   frame->uc.uc_mcontext.gp_regs[VKI_PT_MSR]     = 0xf032;   /* pretty arbitrary */
+#ifdef VGP_ppc64le_linux
+   frame->uc.uc_mcontext.gp_regs[VKI_PT_MSR]     = 0xf033;  /* pretty arbitrary */
+#else
+   frame->uc.uc_mcontext.gp_regs[VKI_PT_MSR]     = 0xf032;  /* pretty arbitrary */
+#endif
    frame->uc.uc_mcontext.gp_regs[VKI_PT_ORIG_R3] = tst->arch.vex.guest_GPR3;
    frame->uc.uc_mcontext.gp_regs[VKI_PT_CTR]     = tst->arch.vex.guest_CTR;
    frame->uc.uc_mcontext.gp_regs[VKI_PT_LNK]     = tst->arch.vex.guest_LR;
@@ -302,9 +306,13 @@
 
    /* Handler is in fact a standard ppc64-linux function descriptor, 
       so extract the function entry point and also the toc ptr to use. */
+#if defined(VGP_ppc64be_linux)
    SET_SIGNAL_GPR(tid, 2, (Addr) ((ULong*)handler)[1]);
    tst->arch.vex.guest_CIA = (Addr) ((ULong*)handler)[0];
-
+#else
+   SET_SIGNAL_GPR(tid, 12, (Addr) handler);
+   tst->arch.vex.guest_CIA = (Addr) handler;
+#endif
    priv = &frame->priv;
    priv->magicPI       = 0x31415927;
    priv->sigNo_private = sigNo;
diff --git a/coregrind/m_signals.c b/coregrind/m_signals.c
index 0fd11c4..6ce03f8 100644
--- a/coregrind/m_signals.c
+++ b/coregrind/m_signals.c
@@ -866,6 +866,25 @@
    "	li	0, " #name "\n" \
    "	sc\n"
 
+#elif defined(VGP_ppc64le_linux)
+/* Little Endian supports ELF version 2.  In the future, it may
+ * support other versions.  As in the preceding variant, the syscall
+ * number 'name' is loaded into r0 before the 'sc'.
+ */
+#  define _MY_SIGRETURN(name) \
+   ".align   2\n" \
+   ".globl   my_sigreturn\n" \
+   ".type    my_sigreturn,@function\n" \
+   "my_sigreturn:\n" \
+   "#if _CALL_ELF == 2 \n" \
+   "0: addis        2,12,.TOC.-0b@ha\n" \
+   "   addi         2,2,.TOC.-0b@l\n" \
+   "   .localentry my_sigreturn,.-my_sigreturn\n" \
+   "#endif \n" \
+   "   li 0, " #name "\n" \
+   "   sc\n" \
+   "   .size my_sigreturn,.-my_sigreturn\n"
+
 #elif defined(VGP_arm_linux)
 #  define _MY_SIGRETURN(name) \
    ".text\n" \
diff --git a/coregrind/m_syscall.c b/coregrind/m_syscall.c
index b111e5c..cd33958 100644
--- a/coregrind/m_syscall.c
+++ b/coregrind/m_syscall.c
@@ -422,6 +422,45 @@
 "        blr\n"
 );
 
+#elif defined(VGP_ppc64le_linux)
+/* Due to the need to return 65 bits of result, this is completely
+   different from the ppc32 case.  The single arg register points to a
+   7-word block containing the syscall # and the 6 args (each slot is
+   8 bytes).  The syscall result proper is put in [0] of the block,
+   and %cr0.so is in the bottom bit of [1]. */
+extern void do_syscall_WRK ( ULong* argblock );
+/* Little Endian supports ELF version 2.  In the future, it may support
+ * other versions as well.
+ */
+asm(
+".align   2\n"
+".globl   do_syscall_WRK\n"
+".type    do_syscall_WRK,@function\n"
+"do_syscall_WRK:\n"
+"#if  _CALL_ELF == 2"               "\n"
+"0:      addis        2,12,.TOC.-0b@ha\n"
+"        addi         2,2,.TOC.-0b@l\n"
+"        .localentry do_syscall_WRK, .-do_syscall_WRK\n"
+"#endif"                            "\n"
+"        std  3,-16(1)\n"  /* stash arg below the SP; no frame is made */
+"        ld   8, 48(3)\n"  /* sc arg 6 */
+"        ld   7, 40(3)\n"  /* sc arg 5 */
+"        ld   6, 32(3)\n"  /* sc arg 4 */
+"        ld   5, 24(3)\n"  /* sc arg 3 */
+"        ld   4, 16(3)\n"  /* sc arg 2 */
+"        ld   0,  0(3)\n"  /* sc number */
+"        ld   3,  8(3)\n"  /* sc arg 1 (loaded last: r3 was the block ptr) */
+"        sc\n"             /* result in r3 and cr0.so */
+"        ld   5,-16(1)\n"  /* reacquire argblock ptr (r5 is caller-save) */
+"        std  3,0(5)\n"    /* argblock[0] = r3 */
+"        mfcr 3\n"         /* r3 = condition register */
+"        srwi 3,3,28\n"    /* bring the cr0 field down to the low 4 bits */
+"        andi. 3,3,1\n"    /* keep only the lowest bit of that field */
+"        std  3,8(5)\n"    /* argblock[1] = cr0.so & 1 */
+"        blr\n"
+"        .size do_syscall_WRK, .-do_syscall_WRK\n"
+);
+
 #elif defined(VGP_arm_linux)
 /* I think the conventions are:
    args  in r0 r1 r2 r3 r4 r5
diff --git a/coregrind/m_syswrap/syscall-ppc64-linux.S b/coregrind/m_syswrap/syscall-ppc64be-linux.S
similarity index 97%
rename from coregrind/m_syswrap/syscall-ppc64-linux.S
rename to coregrind/m_syswrap/syscall-ppc64be-linux.S
index df73940..a301612 100644
--- a/coregrind/m_syswrap/syscall-ppc64-linux.S
+++ b/coregrind/m_syswrap/syscall-ppc64be-linux.S
@@ -27,7 +27,7 @@
   The GNU General Public License is contained in the file COPYING.
 */
 
-#if defined(VGP_ppc64be_linux) || defined(VGP_ppc64le_linux)
+#if defined(VGP_ppc64be_linux)
 
 #include "pub_core_basics_asm.h"
 #include "pub_core_vkiscnums_asm.h"
@@ -165,7 +165,7 @@
 /* Let the linker know we don't need an executable stack */
 .section .note.GNU-stack,"",@progbits
 
-#endif // defined(VGP_ppc64be_linux) || defined(VGP_ppc64le_linux)
+#endif // defined(VGP_ppc64be_linux)
 
 /*--------------------------------------------------------------------*/
 /*--- end                                                          ---*/
diff --git a/coregrind/m_syswrap/syscall-ppc64-linux.S b/coregrind/m_syswrap/syscall-ppc64le-linux.S
similarity index 88%
copy from coregrind/m_syswrap/syscall-ppc64-linux.S
copy to coregrind/m_syswrap/syscall-ppc64le-linux.S
index df73940..9d3a7dd 100644
--- a/coregrind/m_syswrap/syscall-ppc64-linux.S
+++ b/coregrind/m_syswrap/syscall-ppc64le-linux.S
@@ -27,7 +27,7 @@
   The GNU General Public License is contained in the file COPYING.
 */
 
-#if defined(VGP_ppc64be_linux) || defined(VGP_ppc64le_linux)
+#if defined(VGP_ppc64le_linux)
 
 #include "pub_core_basics_asm.h"
 #include "pub_core_vkiscnums_asm.h"
@@ -75,12 +75,25 @@
 
 .align 2
 .globl ML_(do_syscall_for_client_WRK)
+#if _CALL_ELF == 2
+.type .ML_(do_syscall_for_client_WRK),@function
+ML_(do_syscall_for_client_WRK):
+0:      addis         2,12,.TOC.-0b@ha
+        addi          2,2,.TOC.-0b@l
+        .localentry   ML_(do_syscall_for_client_WRK), .-ML_(do_syscall_for_client_WRK)
+#else
 .section ".opd","aw"
 .align 3
-ML_(do_syscall_for_client_WRK):	
+ML_(do_syscall_for_client_WRK):
 .quad .ML_(do_syscall_for_client_WRK),.TOC.@tocbase,0
 .previous
-.type .ML_(do_syscall_for_client_WRK),@function
+#endif
+/* The ELFv2 global entry sequence (TOC pointer setup from r12) and the
+   .localentry directive are emitted exactly once, in the _CALL_ELF == 2
+   branch directly after the ML_(do_syscall_for_client_WRK) label above.
+   Repeating them here would derive r2 from a label 8 bytes past the
+   address held in r12 and would move the local entry to offset 16. */
+.type  .ML_(do_syscall_for_client_WRK),@function
 .globl .ML_(do_syscall_for_client_WRK)
 .ML_(do_syscall_for_client_WRK):
         /* make a stack frame */
@@ -144,7 +157,11 @@
 	/* failure: return 0x8000 | error code */
 7:	ori	3,3,0x8000	/* FAILURE -- ensure return value is nonzero */
         b       5b
-
+#if _CALL_ELF == 2
+        .size .ML_(do_syscall_for_client_WRK),.-.ML_(do_syscall_for_client_WRK)
+#else
+        .size .ML_(do_syscall_for_client_WRK),.-.ML_(do_syscall_for_client_WRK)
+#endif
 .section .rodata
 /* export the ranges so that
    VG_(fixup_guest_state_after_syscall_interrupted) can do the
@@ -165,7 +182,7 @@
 /* Let the linker know we don't need an executable stack */
 .section .note.GNU-stack,"",@progbits
 
-#endif // defined(VGP_ppc64be_linux) || defined(VGP_ppc64le_linux)
+#endif // defined(VGP_ppc64le_linux)
 
 /*--------------------------------------------------------------------*/
 /*--- end                                                          ---*/
diff --git a/coregrind/m_syswrap/syswrap-main.c b/coregrind/m_syswrap/syswrap-main.c
index 077a9b4..7c1e469 100644
--- a/coregrind/m_syswrap/syswrap-main.c
+++ b/coregrind/m_syswrap/syswrap-main.c
@@ -2007,6 +2007,25 @@
       vg_assert(p[0] == 0x44 && p[1] == 0x0 && p[2] == 0x0 && p[3] == 0x2);
    }
 
+#elif defined(VGP_ppc64le_linux)
+   arch->vex.guest_CIA -= 4;             // sizeof(ppc32 instr)
+
+   /* Make sure our caller is actually sane, and we're really backing
+      back over a syscall.
+
+      sc == 44 00 00 02 (byte-reversed in LE memory, hence p[3]..p[0])
+   */
+   {
+      UChar *p = (UChar *)arch->vex.guest_CIA;
+
+      if (p[3] != 0x44 || p[2] != 0x0 || p[1] != 0x0 || p[0] != 0x02)
+         VG_(message)(Vg_DebugMsg,
+                      "?! restarting over syscall at %#llx %02x %02x %02x %02x\n",
+                      arch->vex.guest_CIA + 0ULL, p[3], p[2], p[1], p[0]);
+
+      vg_assert(p[3] == 0x44 && p[2] == 0x0 && p[1] == 0x0 && p[0] == 0x2);
+   }
+
 #elif defined(VGP_arm_linux)
    if (arch->vex.guest_R15T & 1) {
       // Thumb mode.  SVC is a encoded as
diff --git a/coregrind/m_syswrap/syswrap-ppc64-linux.c b/coregrind/m_syswrap/syswrap-ppc64-linux.c
index 5d266b8..b67cf30 100644
--- a/coregrind/m_syswrap/syswrap-ppc64-linux.c
+++ b/coregrind/m_syswrap/syswrap-ppc64-linux.c
@@ -78,6 +78,7 @@
    address, the second word is the TOC ptr (r2), and the third word is
    the static chain value. */
 asm(
+#if defined(VGP_ppc64be_linux)
 "   .align   2\n"
 "   .globl   vgModuleLocal_call_on_new_stack_0_1\n"
 "   .section \".opd\",\"aw\"\n"
@@ -126,6 +127,55 @@
 "   mtcr 0\n\t"            // CAB: Need this?
 "   bctr\n\t"              // jump to dst
 "   trap\n"                // should never get here
+#else
+//  ppc64le_linux
+"   .align   2\n"
+"   .globl   vgModuleLocal_call_on_new_stack_0_1\n"
+"vgModuleLocal_call_on_new_stack_0_1:\n"
+"   .type    vgModuleLocal_call_on_new_stack_0_1,@function\n" // no dot-symbol is defined in this branch
+"#if _CALL_ELF == 2 \n"
+"0: addis        2,12,.TOC.-0b@ha\n"   // r2 = r12 + (.TOC. - 0b), high part
+"   addi         2,2,.TOC.-0b@l\n"     // ... plus the low part
+"#endif\n"
+".localentry vgModuleLocal_call_on_new_stack_0_1, .-vgModuleLocal_call_on_new_stack_0_1\n"
+"   mr    %r1,%r3\n\t"     // stack to %sp
+"   mtlr  %r4\n\t"         // retaddr to %lr
+"   mtctr %r5\n\t"         // f_ptr to count reg
+"   mr %r3,%r6\n\t"        // arg1 to %r3
+"   li 0,0\n\t"            // zero all GP regs
+"   li 4,0\n\t"
+"   li 5,0\n\t"
+"   li 6,0\n\t"
+"   li 7,0\n\t"
+"   li 8,0\n\t"
+"   li 9,0\n\t"
+"   li 10,0\n\t"
+"   li 11,0\n\t"
+"   li 12,0\n\t"
+"   li 13,0\n\t"
+"   li 14,0\n\t"
+"   li 15,0\n\t"
+"   li 16,0\n\t"
+"   li 17,0\n\t"
+"   li 18,0\n\t"
+"   li 19,0\n\t"
+"   li 20,0\n\t"
+"   li 21,0\n\t"
+"   li 22,0\n\t"
+"   li 23,0\n\t"
+"   li 24,0\n\t"
+"   li 25,0\n\t"
+"   li 26,0\n\t"
+"   li 27,0\n\t"
+"   li 28,0\n\t"
+"   li 29,0\n\t"
+"   li 30,0\n\t"
+"   li 31,0\n\t"
+"   mtxer 0\n\t"           // CAB: Need this?
+"   mtcr 0\n\t"            // CAB: Need this?
+"   bctr\n\t"              // jump to dst
+"   trap\n"                // should never get here
+#endif
 );
 
 
@@ -170,6 +220,7 @@
                                      Int*  parent_tid, 
                                      void/*vki_modify_ldt_t*/ * );
 asm(
+#if defined(VGP_ppc64be_linux)
 "   .align   2\n"
 "   .globl   do_syscall_clone_ppc64_linux\n"
 "   .section \".opd\",\"aw\"\n"
@@ -240,6 +291,78 @@
 "       ld      31,56(1)\n"
 "       addi    1,1,64\n"
 "       blr\n"
+#else
+"   .align   2\n"
+"   .globl   do_syscall_clone_ppc64_linux\n"
+"   .type    do_syscall_clone_ppc64_linux,@function\n"
+"do_syscall_clone_ppc64_linux:\n"
+"   .globl   .do_syscall_clone_ppc64_linux\n"
+".do_syscall_clone_ppc64_linux:\n"
+"#if _CALL_ELF == 2 \n"
+"0:     addis        2,12,.TOC.-0b@ha \n"
+"       addi         2,2,.TOC.-0b@l \n"
+"#endif \n"
+"   .localentry  do_syscall_clone_ppc64_linux, .-do_syscall_clone_ppc64_linux \n"
+"       stdu    1,-64(1)\n"
+"       std     29,40(1)\n"
+"       std     30,48(1)\n"
+"       std     31,56(1)\n"
+"       mr      30,3\n"              // preserve fn
+"       mr      31,6\n"              // preserve arg
+
+        // setup child stack
+"       rldicr  4,4, 0,59\n"         // trim sp to multiple of 16 bytes
+                                     // (r4 &= ~0xF)
+"       li      0,0\n"
+"       stdu    0,-32(4)\n"          // make initial stack frame
+"       mr      29,4\n"              // preserve sp
+
+        // setup syscall
+"       li      0,"__NR_CLONE"\n"    // syscall number
+"       mr      3,5\n"               // syscall arg1: flags
+        // r4 already setup          // syscall arg2: child_stack
+"       mr      5,8\n"               // syscall arg3: parent_tid
+"       mr      6,13\n"              // syscall arg4: REAL THREAD tls
+"       mr      7,7\n"               // syscall arg5: child_tid
+"       mr      8,8\n"               // syscall arg6: ????
+"       mr      9,9\n"               // syscall arg7: ????
+
+"       sc\n"                        // clone()
+
+"       mfcr    4\n"                 // CR now in low half r4
+"       sldi    4,4,32\n"            // CR now in hi half r4
+
+"       sldi    3,3,32\n"
+"       srdi    3,3,32\n"            // zero out hi half r3
+
+"       or      3,3,4\n"             // r3 = CR : syscall-retval
+"       cmpwi   3,0\n"               // child if retval == 0 (note, cmpw)
+"       bne     1f\n"                // jump if !child
+
+        /* CHILD - call thread function */
+        /* Note: 2.4 kernel doesn't set the child stack pointer,
+           so we do it here.
+           That does leave a small window for a signal to be delivered
+           on the wrong stack, unfortunately. */
+"       mr      1,29\n"
+"       mtctr   30\n"                // ctr reg = fn
+"       mr      3,31\n"              // r3 = arg
+"       bctrl\n"                     // call fn()
+
+        // exit with result
+"       li      0,"__NR_EXIT"\n"
+"       sc\n"
+
+        // Exit returned?!
+"       .long   0\n"
+
+        // PARENT or ERROR - return
+"1:     ld      29,40(1)\n"
+"       ld      30,48(1)\n"
+"       ld      31,56(1)\n"
+"       addi    1,1,64\n"
+"       blr\n"
+#endif
 );
 
 #undef __NR_CLONE
diff --git a/coregrind/m_trampoline.S b/coregrind/m_trampoline.S
index c859bdd..2394100 100644
--- a/coregrind/m_trampoline.S
+++ b/coregrind/m_trampoline.S
@@ -444,6 +444,12 @@
 	/* this function is written using the "dotless" ABI convention */
 	.align 2
 	.globl VG_(ppc64_linux_REDIR_FOR_strlen)
+#if !defined VGP_ppc64be_linux || _CALL_ELF == 2
+        /* Little Endian uses ELF version 2 */
+        .type VG_(ppc64_linux_REDIR_FOR_strlen),@function
+VG_(ppc64_linux_REDIR_FOR_strlen):
+#else
+        /* Big Endian uses ELF version 1 */
 	.section        ".opd","aw"
 	.align 3
 VG_(ppc64_linux_REDIR_FOR_strlen):
@@ -454,6 +460,12 @@
 	.type	VG_(ppc64_linux_REDIR_FOR_strlen), @function
 
 .L.VG_(ppc64_linux_REDIR_FOR_strlen):
+#endif
+#if _CALL_ELF == 2
+0:      addis        2,12,.TOC.-0b@ha
+        addi         2,2,.TOC.-0b@l
+        .localentry  VG_(ppc64_linux_REDIR_FOR_strlen), .-VG_(ppc64_linux_REDIR_FOR_strlen)
+#endif
         mr 9,3
         lbz 0,0(3)
         li 3,0
@@ -467,6 +479,12 @@
         cmpwi 7,0,0
         bne 7,.L01
         blr
+
+#if !defined VGP_ppc64be_linux || _CALL_ELF == 2
+        .size VG_(ppc64_linux_REDIR_FOR_strlen),.-VG_(ppc64_linux_REDIR_FOR_strlen)
+#else
+        .size VG_(ppc64_linux_REDIR_FOR_strlen),.-.L.VG_(ppc64_linux_REDIR_FOR_strlen)
+#endif
         .long 0
         .byte 0,0,0,0,0,0,0,0
 .L0end:
@@ -474,6 +492,10 @@
         /* this function is written using the "dotless" ABI convention */
         .align 2
         .globl VG_(ppc64_linux_REDIR_FOR_strchr)
+#if !defined VGP_ppc64be_linux || _CALL_ELF == 2
+        .type   VG_(ppc64_linux_REDIR_FOR_strchr),@function
+VG_(ppc64_linux_REDIR_FOR_strchr):
+#else
 	.section        ".opd","aw"
 	.align 3
 VG_(ppc64_linux_REDIR_FOR_strchr):
@@ -482,8 +504,14 @@
         .size   VG_(ppc64_linux_REDIR_FOR_strchr), \
                         .L1end-.L.VG_(ppc64_linux_REDIR_FOR_strchr)
         .type   VG_(ppc64_linux_REDIR_FOR_strchr),@function
-	
+
 .L.VG_(ppc64_linux_REDIR_FOR_strchr):
+#endif
+#if  _CALL_ELF == 2
+0:      addis         2,12,.TOC.-0b@ha
+        addi         2,2,.TOC.-0b@l
+        .localentry VG_(ppc64_linux_REDIR_FOR_strchr), .-VG_(ppc64_linux_REDIR_FOR_strchr)
+#endif
         lbz 0,0(3)
         rldicl 4,4,0,56
         cmpw 7,4,0
@@ -491,6 +519,11 @@
         cmpdi 7,0,0
         bne 7,.L18
         b .L14
+#if !defined VGP_ppc64be_linux || _CALL_ELF == 2
+        .size VG_(ppc64_linux_REDIR_FOR_strchr),.-VG_(ppc64_linux_REDIR_FOR_strchr)
+#else
+        .size VG_(ppc64_linux_REDIR_FOR_strchr),.-.L.VG_(ppc64_linux_REDIR_FOR_strchr)
+#endif
 .L19:	
         beq 6,.L14
 .L18:	
diff --git a/coregrind/m_translate.c b/coregrind/m_translate.c
index 73f800f..0b3d964 100644
--- a/coregrind/m_translate.c
+++ b/coregrind/m_translate.c
@@ -1118,6 +1118,10 @@
    return res;
 }
 
+#endif
+
+#if defined(VG_PLAT_USES_PPCTOC)
+
 /* Generate code to push LR and R2 onto this thread's redir stack,
    then set R2 to the new value (which is the TOC pointer to be used
    for the duration of the replacement function, as determined by
@@ -1140,6 +1144,9 @@
 #    error Platform is not TOC-afflicted, fortunately
 #  endif
 }
+#endif
+
+#if defined(VG_PLAT_USES_PPCTOC) || defined(VGP_ppc64le_linux)
 
 static void gen_pop_R2_LR_then_bLR ( IRSB* bb )
 {
@@ -1166,6 +1173,9 @@
 #    error Platform is not TOC-afflicted, fortunately
 #  endif
 }
+#endif
+
+#if defined(VG_PLAT_USES_PPCTOC) || defined(VGP_ppc64le_linux)
 
 static
 Bool mk_preamble__ppctoc_magic_return_stub ( void* closureV, IRSB* bb )
@@ -1187,6 +1197,30 @@
 }
 #endif
 
+#if defined(VGP_ppc64le_linux)
+/* ppc64le only: save the caller's LR and R2 on this thread's redir
+   stack, then point LR at the magic return stub.
+
+   R2 must be saved because a redirect may land on a function's
+   global entry point, which does not preserve R2.  Having LR point
+   at the stub lets us intercept the return and restore both R2 and
+   LR to the values pushed here.
+
+   This reuses the infrastructure that exists for the TOC-enabled
+   architectures, which is why several VG_PLAT_USES_PPCTOC code
+   sections are also enabled for this platform.  */
+
+static void gen_push_R2_and_set_LR ( IRSB* bb )
+{
+   Int    lr_off  = offsetof(VexGuestPPC64State,guest_LR);
+   Int    r2_off  = offsetof(VexGuestPPC64State,guest_GPR2);
+   Addr64 stub_RA = (Addr64)&VG_(ppctoc_magic_redirect_return_stub);
+   gen_PUSH( bb, IRExpr_Get(lr_off, Ity_I64) );   /* LR is pushed first */
+   gen_PUSH( bb, IRExpr_Get(r2_off, Ity_I64) );   /* then R2 */
+   addStmtToIRSB( bb, IRStmt_Put( lr_off, mkU64( stub_RA )) );
+}
+#  endif
+
 /* --------------- END helpers for with-TOC platforms --------------- */
 
 
@@ -1244,6 +1278,19 @@
      gen_push_and_set_LR_R2 ( bb, VG_(get_tocptr)( closure->readdr ) );
    }
 #  endif
+
+#if defined(VGP_ppc64le_linux)
+   VgCallbackClosure* closure = (VgCallbackClosure*)closureV;
+   Int offB_GPR12 = offsetof(VexGuestArchState, guest_GPR12);
+   addStmtToIRSB(bb, IRStmt_Put(offB_GPR12, mkU64(closure->readdr)));
+   addStmtToIRSB(bb,
+      IRStmt_Put(
+         offsetof(VexGuestArchState,guest_NRADDR_GPR2),
+         VG_WORDSIZE==8 ? mkU64(0) : mkU32(0)
+      )
+   );
+   gen_push_R2_and_set_LR ( bb );
+#endif
    return False;
 }
 
@@ -1277,7 +1324,7 @@
    Int offB_GPR25 = offsetof(VexGuestMIPS64State, guest_r25);
    addStmtToIRSB(bb, IRStmt_Put(offB_GPR25, mkU64(closure->readdr)));
 #  endif
-#  if defined(VGP_ppc64be_linux)
+#  if defined(VG_PLAT_USES_PPCTOC) && !defined(VGP_ppc64le_linux)
    addStmtToIRSB( 
       bb,
       IRStmt_Put( 
@@ -1288,6 +1335,22 @@
    );
    gen_push_and_set_LR_R2 ( bb, VG_(get_tocptr)( closure->readdr ) );
 #  endif
+#if defined(VGP_ppc64le_linux)
+   /* This saves the r2 before leaving the function.  We need to move
+    * guest_NRADDR_GPR2 back to R2 on return.
+    */
+   Int offB_GPR12 = offsetof(VexGuestArchState, guest_GPR12);
+   addStmtToIRSB(
+      bb,
+      IRStmt_Put(
+         offsetof(VexGuestArchState,guest_NRADDR_GPR2),
+         IRExpr_Get(offsetof(VexGuestArchState,guest_GPR2),
+                    VG_WORDSIZE==8 ? Ity_I64 : Ity_I32)
+      )
+   );
+   addStmtToIRSB(bb, IRStmt_Put(offB_GPR12, mkU64(closure->readdr)));
+   gen_push_R2_and_set_LR ( bb );
+#endif
    return False;
 }
 
@@ -1485,7 +1548,8 @@
    if (kind == T_Redir_Wrap)
       preamble_fn = mk_preamble__set_NRADDR_to_nraddr;
 
-#  if defined(VG_PLAT_USES_PPCTOC)
+   /* LE we setup the LR */
+#  if defined(VG_PLAT_USES_PPCTOC) || defined(VGP_ppc64le_linux)
    if (ULong_to_Ptr(nraddr)
        == (void*)&VG_(ppctoc_magic_redirect_return_stub)) {
       /* If entering the special return stub, this means a wrapped or
@@ -1527,6 +1591,11 @@
    vex_abiinfo.guest_ppc_zap_RZ_at_bl         = const_True;
    vex_abiinfo.host_ppc_calls_use_fndescrs    = True;
 #  endif
+#  if defined(VGP_ppc64le_linux)
+   vex_abiinfo.guest_ppc_zap_RZ_at_blr        = True;
+   vex_abiinfo.guest_ppc_zap_RZ_at_bl         = const_True;
+   vex_abiinfo.host_ppc_calls_use_fndescrs    = False;
+#  endif
 
    /* Set up closure args. */
    closure.tid    = tid;
diff --git a/coregrind/m_ume/elf.c b/coregrind/m_ume/elf.c
index 910bb7a..4615da3 100644
--- a/coregrind/m_ume/elf.c
+++ b/coregrind/m_ume/elf.c
@@ -495,6 +495,7 @@
       VG_(close)(interp->fd);
 
       entry = (void *)(advised - interp_addr + interp->e.e_entry);
+
       info->interp_offset = advised - interp_addr;
 
       VG_(free)(interp->p);
@@ -514,6 +515,10 @@
    info->init_toc = ((ULong*)entry)[1];
    info->init_ip  += info->interp_offset;
    info->init_toc += info->interp_offset;
+#elif defined(VGP_ppc64le_linux)
+   /* On PPC64LE, the ELF ver 2 ABI doesn't use a function descriptor */
+   info->init_ip  = (Addr)entry;
+   info->init_toc = 0; /* meaningless on this platform */
 #else
    info->init_ip  = (Addr)entry;
    info->init_toc = 0; /* meaningless on this platform */
diff --git a/coregrind/m_ume/macho.c b/coregrind/m_ume/macho.c
index 7608811..3dfddaf 100644
--- a/coregrind/m_ume/macho.c
+++ b/coregrind/m_ume/macho.c
@@ -701,6 +701,8 @@
    good_arch = CPU_TYPE_POWERPC;
 #elif defined(VGA_ppc64be)
    good_arch = CPU_TYPE_POWERPC64BE;
+#elif defined(VGA_ppc64le)
+   good_arch = CPU_TYPE_POWERPC64LE;
 #elif defined(VGA_x86)
    good_arch = CPU_TYPE_I386;
 #elif defined(VGA_amd64)
diff --git a/coregrind/pub_core_machine.h b/coregrind/pub_core_machine.h
index f1d839a..14ea354 100644
--- a/coregrind/pub_core_machine.h
+++ b/coregrind/pub_core_machine.h
@@ -61,6 +61,11 @@
 #  define VG_ELF_MACHINE      EM_PPC64
 #  define VG_ELF_CLASS        ELFCLASS64
 #  define VG_PLAT_USES_PPCTOC 1
+#elif defined(VGP_ppc64le_linux)
+#  define VG_ELF_DATA2XXX     ELFDATA2LSB
+#  define VG_ELF_MACHINE      EM_PPC64
+#  define VG_ELF_CLASS        ELFCLASS64
+#  undef VG_PLAT_USES_PPCTOC
 #elif defined(VGP_arm_linux)
 #  define VG_ELF_DATA2XXX     ELFDATA2LSB
 #  define VG_ELF_MACHINE      EM_ARM
diff --git a/coregrind/vgdb-invoker-ptrace.c b/coregrind/vgdb-invoker-ptrace.c
index b9bcaee..f96013b 100644
--- a/coregrind/vgdb-invoker-ptrace.c
+++ b/coregrind/vgdb-invoker-ptrace.c
@@ -1034,6 +1034,16 @@
       user_mod.regs.gpr[3] = check;
       /* put bad_return return address in Link Register */
       user_mod.regs.link = bad_return;
+#elif defined(VGA_ppc64le)
+      /* LE does not use the function pointer structure used in BE */
+      user_mod.regs.nip = shared64->invoke_gdbserver;
+      user_mod.regs.gpr[1] = sp - 512;
+      user_mod.regs.gpr[12] = user_mod.regs.nip;
+      user_mod.regs.trap = -1L;
+      /* put check arg in register 3 */
+      user_mod.regs.gpr[3] = check;
+      /* put bad_return return address in Link Register */
+      user_mod.regs.link = bad_return;
 #elif defined(VGA_s390x)
       /* put check arg in register r2 */
       user_mod.regs.gprs[2] = check;
diff --git a/helgrind/hg_main.c b/helgrind/hg_main.c
index 4a3005f..de01c89 100644
--- a/helgrind/hg_main.c
+++ b/helgrind/hg_main.c
@@ -4475,6 +4475,7 @@
    if (VG_STREQ(soname, VG_U_LD_LINUX_SO_2))        return True;
    if (VG_STREQ(soname, VG_U_LD_LINUX_X86_64_SO_2)) return True;
    if (VG_STREQ(soname, VG_U_LD64_SO_1))            return True;
+   if (VG_STREQ(soname, VG_U_LD64_SO_2))            return True;
    if (VG_STREQ(soname, VG_U_LD_SO_1))              return True;
 #  elif defined(VGO_darwin)
    if (VG_STREQ(soname, VG_U_DYLD)) return True;
diff --git a/include/pub_tool_debuginfo.h b/include/pub_tool_debuginfo.h
index 4f7a09d..9beb512 100644
--- a/include/pub_tool_debuginfo.h
+++ b/include/pub_tool_debuginfo.h
@@ -249,6 +249,7 @@
                                    Int idx,
                                    /*OUT*/Addr*    avma,
                                    /*OUT*/Addr*    tocptr,
+                                   /*OUT*/Addr*    local_ep,
                                    /*OUT*/UInt*    size,
                                    /*OUT*/HChar**  pri_name,
                                    /*OUT*/HChar*** sec_names,
diff --git a/include/pub_tool_redir.h b/include/pub_tool_redir.h
index 94b8766..62732c1 100644
--- a/include/pub_tool_redir.h
+++ b/include/pub_tool_redir.h
@@ -296,6 +296,7 @@
 
 #define  VG_Z_LD64_SO_1             ld64ZdsoZd1                // ld64.so.1
 #define  VG_U_LD64_SO_1             "ld64.so.1"
+#define  VG_U_LD64_SO_2             "ld64.so.2"                // PPC LE loader
 
 #define  VG_Z_LD_SO_1               ldZdsoZd1                  // ld.so.1
 #define  VG_U_LD_SO_1               "ld.so.1"
diff --git a/include/valgrind.h b/include/valgrind.h
index 81c946a..a65f03a 100644
--- a/include/valgrind.h
+++ b/include/valgrind.h
@@ -116,6 +116,7 @@
 #undef PLAT_amd64_linux
 #undef PLAT_ppc32_linux
 #undef PLAT_ppc64be_linux
+#undef PLAT_ppc64le_linux
 #undef PLAT_arm_linux
 #undef PLAT_arm64_linux
 #undef PLAT_s390x_linux
@@ -143,6 +144,9 @@
 #elif defined(__linux__) && defined(__powerpc__) && defined(__powerpc64__) && _CALL_ELF != 2
 /* Big Endian uses ELF version 1 */
 #  define PLAT_ppc64be_linux 1
+#elif defined(__linux__) && defined(__powerpc__) && defined(__powerpc64__) && _CALL_ELF == 2
+/* Little Endian uses ELF version 2 */
+#  define PLAT_ppc64le_linux 1
 #elif defined(__linux__) && defined(__arm__) && !defined(__aarch64__)
 #  define PLAT_arm_linux 1
 #elif defined(__linux__) && defined(__aarch64__) && !defined(__arm__)
@@ -599,6 +603,82 @@
 
 #endif /* PLAT_ppc64be_linux */
 
+#if defined(PLAT_ppc64le_linux)
+
+typedef
+   struct {
+      unsigned long long int nraddr; /* where's the code? */
+      unsigned long long int r2;     /* what tocptr do we need? */
+   }
+   OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE                            \
+                     "rotldi 0,0,3  ; rotldi 0,0,13\n\t"          \
+                     "rotldi 0,0,61 ; rotldi 0,0,51\n\t"
+
+#define VALGRIND_DO_CLIENT_REQUEST_EXPR(                          \
+        _zzq_default, _zzq_request,                               \
+        _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+                                                                  \
+  __extension__                                                   \
+  ({         unsigned long long int  _zzq_args[6];                \
+             unsigned long long int  _zzq_result;                 \
+             unsigned long long int* _zzq_ptr;                    \
+    _zzq_args[0] = (unsigned long long int)(_zzq_request);        \
+    _zzq_args[1] = (unsigned long long int)(_zzq_arg1);           \
+    _zzq_args[2] = (unsigned long long int)(_zzq_arg2);           \
+    _zzq_args[3] = (unsigned long long int)(_zzq_arg3);           \
+    _zzq_args[4] = (unsigned long long int)(_zzq_arg4);           \
+    _zzq_args[5] = (unsigned long long int)(_zzq_arg5);           \
+    _zzq_ptr = _zzq_args;                                         \
+    __asm__ volatile("mr 3,%1\n\t" /*default*/                    \
+                     "mr 4,%2\n\t" /*ptr*/                        \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = client_request ( %R4 ) */           \
+                     "or 1,1,1\n\t"                               \
+                     "mr %0,3"     /*result*/                     \
+                     : "=b" (_zzq_result)                         \
+                     : "b" (_zzq_default), "b" (_zzq_ptr)         \
+                     : "cc", "memory", "r3", "r4");               \
+    _zzq_result;                                                  \
+  })
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval)                       \
+  { volatile OrigFn* _zzq_orig = &(_zzq_rlval);                   \
+    unsigned long long int __addr;                                \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = guest_NRADDR */                     \
+                     "or 2,2,2\n\t"                               \
+                     "mr %0,3"                                    \
+                     : "=b" (__addr)                              \
+                     :                                            \
+                     : "cc", "memory", "r3"                       \
+                    );                                            \
+    _zzq_orig->nraddr = __addr;                                   \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = guest_NRADDR_GPR2 */                \
+                     "or 4,4,4\n\t"                               \
+                     "mr %0,3"                                    \
+                     : "=b" (__addr)                              \
+                     :                                            \
+                     : "cc", "memory", "r3"                       \
+                    );                                            \
+    _zzq_orig->r2 = __addr;                                       \
+  }
+
+#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R12                   \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* branch-and-link-to-noredir *%R12 */       \
+                     "or 3,3,3\n\t"
+
+#define VALGRIND_VEX_INJECT_IR()                                 \
+ do {                                                            \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE              \
+                     "or 5,5,5\n\t"                              \
+                    );                                           \
+ } while (0)
+
+#endif /* PLAT_ppc64le_linux */
 
 /* ------------------------- arm-linux ------------------------- */
 
@@ -3093,6 +3173,562 @@
 
 #endif /* PLAT_ppc64be_linux */
 
+/* ------------------------- ppc64le-linux ----------------------- */
+#if defined(PLAT_ppc64le_linux)
+
+/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */
+
+/* Clobber list for the CALL_FN_ asm: regs trashed by the hidden call. */
+#define __CALLER_SAVED_REGS                                       \
+   "lr", "ctr", "xer",                                            \
+   "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7",        \
+   "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",   \
+   "r11", "r12", "r13"
+
+/* Macros to save and align the stack before making a function
+   call and restore it afterwards as gcc may not keep the stack
+   pointer aligned if it doesn't realise calls are being made
+   to other functions. */
+
+#define VALGRIND_ALIGN_STACK               \
+      "mr 28,1\n\t"   /* r28 = r1 */       \
+      "rldicr 1,1,0,59\n\t" /* clear low 4 bits of r1 (16-byte align) */
+#define VALGRIND_RESTORE_STACK             \
+      "mr 1,28\n\t" /* r1 = r28, as saved by VALGRIND_ALIGN_STACK */
+
+/* These CALL_FN_ macros assume that on ppc64le-linux, sizeof(unsigned
+   long) == 8. */
+
+#define CALL_FN_W_v(lval, orig)                                   \
+   do { /* noredir call of orig.nraddr, no args; lval = r3 */     \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+0];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0]: slot where the asm saves the current r2 */   \
+      _argvec[1] = (unsigned long)_orig.r2;                       \
+      _argvec[2] = (unsigned long)_orig.nraddr;                   \
+      __asm__ volatile(                                           \
+         VALGRIND_ALIGN_STACK   /* r28 = r1, then align r1 */     \
+         "mr 12,%1\n\t"       /* r12 = &_argvec[2] */             \
+         "std 2,-16(12)\n\t"  /* _argvec[0] = r2 (save toc) */    \
+         "ld   2,-8(12)\n\t"  /* r2 = _argvec[1]: nraddr's toc */ \
+         "ld  12, 0(12)\n\t"  /* r12 = _argvec[2]: target */      \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R12  /* call *r12 */  \
+         "mr 12,%1\n\t"       /* reload &_argvec[2] */            \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "ld 2,-16(12)\n\t"   /* r2 = _argvec[0] (restore toc) */ \
+         VALGRIND_RESTORE_STACK /* r1 = r28 */                    \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_W(lval, orig, arg1)                             \
+   do { /* noredir call of orig.nraddr, 1 arg; lval = r3 */       \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+1];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0]: slot where the asm saves the current r2 */   \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      __asm__ volatile(                                           \
+         VALGRIND_ALIGN_STACK   /* r28 = r1, then align r1 */     \
+         "mr 12,%1\n\t"       /* r12 = &_argvec[2] */             \
+         "std 2,-16(12)\n\t"  /* _argvec[0] = r2 (save toc) */    \
+         "ld   2,-8(12)\n\t"  /* r2 = _argvec[1]: nraddr's toc */ \
+         "ld   3, 8(12)\n\t"  /* arg1->r3 */                      \
+         "ld  12, 0(12)\n\t"  /* r12 = _argvec[2]: target */      \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R12  /* call *r12 */  \
+         "mr 12,%1\n\t"       /* reload &_argvec[2] */            \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "ld 2,-16(12)\n\t"   /* r2 = _argvec[0] (restore toc) */ \
+         VALGRIND_RESTORE_STACK /* r1 = r28 */                    \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WW(lval, orig, arg1,arg2)                       \
+   do { /* noredir call of orig.nraddr, 2 args; lval = r3 */      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+2];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0]: slot where the asm saves the current r2 */   \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      __asm__ volatile(                                           \
+         VALGRIND_ALIGN_STACK   /* r28 = r1, then align r1 */     \
+         "mr 12,%1\n\t"       /* r12 = &_argvec[2] */             \
+         "std 2,-16(12)\n\t"  /* _argvec[0] = r2 (save toc) */    \
+         "ld   2,-8(12)\n\t"  /* r2 = _argvec[1]: nraddr's toc */ \
+         "ld   3, 8(12)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(12)\n\t" /* arg2->r4 */                      \
+         "ld  12, 0(12)\n\t"  /* r12 = _argvec[2]: target */      \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R12  /* call *r12 */  \
+         "mr 12,%1\n\t"       /* reload &_argvec[2] */            \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "ld 2,-16(12)\n\t"   /* r2 = _argvec[0] (restore toc) */ \
+         VALGRIND_RESTORE_STACK /* r1 = r28 */                    \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3)                 \
+   do { /* noredir call of orig.nraddr, 3 args; lval = r3 */      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+3];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0]: slot where the asm saves the current r2 */   \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      __asm__ volatile(                                           \
+         VALGRIND_ALIGN_STACK   /* r28 = r1, then align r1 */     \
+         "mr 12,%1\n\t"       /* r12 = &_argvec[2] */             \
+         "std 2,-16(12)\n\t"  /* _argvec[0] = r2 (save toc) */    \
+         "ld   2,-8(12)\n\t"  /* r2 = _argvec[1]: nraddr's toc */ \
+         "ld   3, 8(12)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(12)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(12)\n\t" /* arg3->r5 */                      \
+         "ld  12, 0(12)\n\t"  /* r12 = _argvec[2]: target */      \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R12  /* call *r12 */  \
+         "mr 12,%1\n\t"       /* reload &_argvec[2] */            \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "ld 2,-16(12)\n\t"   /* r2 = _argvec[0] (restore toc) */ \
+         VALGRIND_RESTORE_STACK /* r1 = r28 */                    \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4)           \
+   do { /* noredir call of orig.nraddr, 4 args; lval = r3 */      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+4];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0]: slot where the asm saves the current r2 */   \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      __asm__ volatile(                                           \
+         VALGRIND_ALIGN_STACK   /* r28 = r1, then align r1 */     \
+         "mr 12,%1\n\t"       /* r12 = &_argvec[2] */             \
+         "std 2,-16(12)\n\t"  /* _argvec[0] = r2 (save toc) */    \
+         "ld   2,-8(12)\n\t"  /* r2 = _argvec[1]: nraddr's toc */ \
+         "ld   3, 8(12)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(12)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(12)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(12)\n\t" /* arg4->r6 */                      \
+         "ld  12, 0(12)\n\t"  /* r12 = _argvec[2]: target */      \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R12  /* call *r12 */  \
+         "mr 12,%1\n\t"       /* reload &_argvec[2] */            \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "ld 2,-16(12)\n\t"   /* r2 = _argvec[0] (restore toc) */ \
+         VALGRIND_RESTORE_STACK /* r1 = r28 */                    \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5)        \
+   do { /* noredir call of orig.nraddr, 5 args; lval = r3 */      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+5];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0]: slot where the asm saves the current r2 */   \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      __asm__ volatile(                                           \
+         VALGRIND_ALIGN_STACK   /* r28 = r1, then align r1 */     \
+         "mr 12,%1\n\t"       /* r12 = &_argvec[2] */             \
+         "std 2,-16(12)\n\t"  /* _argvec[0] = r2 (save toc) */    \
+         "ld   2,-8(12)\n\t"  /* r2 = _argvec[1]: nraddr's toc */ \
+         "ld   3, 8(12)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(12)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(12)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(12)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(12)\n\t" /* arg5->r7 */                      \
+         "ld  12, 0(12)\n\t"  /* r12 = _argvec[2]: target */      \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R12  /* call *r12 */  \
+         "mr 12,%1\n\t"       /* reload &_argvec[2] */            \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "ld 2,-16(12)\n\t"   /* r2 = _argvec[0] (restore toc) */ \
+         VALGRIND_RESTORE_STACK /* r1 = r28 */                    \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6)   \
+   do { /* noredir call of orig.nraddr, 6 args; lval = r3 */      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+6];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0]: slot where the asm saves the current r2 */   \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      __asm__ volatile(                                           \
+         VALGRIND_ALIGN_STACK   /* r28 = r1, then align r1 */     \
+         "mr 12,%1\n\t"       /* r12 = &_argvec[2] */             \
+         "std 2,-16(12)\n\t"  /* _argvec[0] = r2 (save toc) */    \
+         "ld   2,-8(12)\n\t"  /* r2 = _argvec[1]: nraddr's toc */ \
+         "ld   3, 8(12)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(12)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(12)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(12)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(12)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(12)\n\t" /* arg6->r8 */                      \
+         "ld  12, 0(12)\n\t"  /* r12 = _argvec[2]: target */      \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R12  /* call *r12 */  \
+         "mr 12,%1\n\t"       /* reload &_argvec[2] */            \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "ld 2,-16(12)\n\t"   /* r2 = _argvec[0] (restore toc) */ \
+         VALGRIND_RESTORE_STACK /* r1 = r28 */                    \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7)                            \
+   do { /* noredir call of orig.nraddr, 7 args; lval = r3 */      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+7];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0]: slot where the asm saves the current r2 */   \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      __asm__ volatile(                                           \
+         VALGRIND_ALIGN_STACK   /* r28 = r1, then align r1 */     \
+         "mr 12,%1\n\t"       /* r12 = &_argvec[2] */             \
+         "std 2,-16(12)\n\t"  /* _argvec[0] = r2 (save toc) */    \
+         "ld   2,-8(12)\n\t"  /* r2 = _argvec[1]: nraddr's toc */ \
+         "ld   3, 8(12)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(12)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(12)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(12)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(12)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(12)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(12)\n\t" /* arg7->r9 */                      \
+         "ld  12, 0(12)\n\t"  /* r12 = _argvec[2]: target */      \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R12  /* call *r12 */  \
+         "mr 12,%1\n\t"       /* reload &_argvec[2] */            \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "ld 2,-16(12)\n\t"   /* r2 = _argvec[0] (restore toc) */ \
+         VALGRIND_RESTORE_STACK /* r1 = r28 */                    \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8)                       \
+   do { /* noredir call of orig.nraddr, 8 args; lval = r3 */      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+8];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0]: slot where the asm saves the current r2 */   \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      __asm__ volatile(                                           \
+         VALGRIND_ALIGN_STACK   /* r28 = r1, then align r1 */     \
+         "mr 12,%1\n\t"       /* r12 = &_argvec[2] */             \
+         "std 2,-16(12)\n\t"  /* _argvec[0] = r2 (save toc) */    \
+         "ld   2,-8(12)\n\t"  /* r2 = _argvec[1]: nraddr's toc */ \
+         "ld   3, 8(12)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(12)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(12)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(12)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(12)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(12)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(12)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(12)\n\t" /* arg8->r10 */                     \
+         "ld  12, 0(12)\n\t"  /* r12 = _argvec[2]: target */      \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R12  /* call *r12 */  \
+         "mr 12,%1\n\t"       /* reload &_argvec[2] */            \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "ld 2,-16(12)\n\t"   /* r2 = _argvec[0] (restore toc) */ \
+         VALGRIND_RESTORE_STACK /* r1 = r28 */                    \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8,arg9)                  \
+   do { /* noredir call of orig.nraddr, 9 args (arg9 on stack) */ \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+9];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0]: slot where the asm saves the current r2 */   \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      _argvec[2+9] = (unsigned long)arg9;                         \
+      __asm__ volatile(                                           \
+         VALGRIND_ALIGN_STACK   /* r28 = r1, then align r1 */     \
+         "mr 12,%1\n\t"       /* r12 = &_argvec[2] */             \
+         "std 2,-16(12)\n\t"  /* _argvec[0] = r2 (save toc) */    \
+         "ld   2,-8(12)\n\t"  /* r2 = _argvec[1]: nraddr's toc */ \
+         "addi 1,1,-128\n\t"  /* r1 -= 128: expand stack frame */ \
+         /* arg9: copied via r3 to 96(r1) */                      \
+         "ld  3,72(12)\n\t"                                       \
+         "std 3,96(1)\n\t"                                        \
+         /* args1-8: loaded into r3-r10 */                        \
+         "ld   3, 8(12)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(12)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(12)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(12)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(12)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(12)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(12)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(12)\n\t" /* arg8->r10 */                     \
+         "ld  12, 0(12)\n\t"  /* r12 = _argvec[2]: target */      \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R12  /* call *r12 */  \
+         "mr 12,%1\n\t"       /* reload &_argvec[2] */            \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "ld 2,-16(12)\n\t"   /* r2 = _argvec[0] (restore toc) */ \
+         VALGRIND_RESTORE_STACK /* r1 = r28 */                    \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10)           \
+   do { /* noredir call of orig.nraddr, 10 args (9-10 on stack) */ \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+10];                       \
+      volatile unsigned long _res;                                \
+      /* _argvec[0]: slot where the asm saves the current r2 */   \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      _argvec[2+9] = (unsigned long)arg9;                         \
+      _argvec[2+10] = (unsigned long)arg10;                       \
+      __asm__ volatile(                                           \
+         VALGRIND_ALIGN_STACK   /* r28 = r1, then align r1 */     \
+         "mr 12,%1\n\t"       /* r12 = &_argvec[2] */             \
+         "std 2,-16(12)\n\t"  /* _argvec[0] = r2 (save toc) */    \
+         "ld   2,-8(12)\n\t"  /* r2 = _argvec[1]: nraddr's toc */ \
+         "addi 1,1,-128\n\t"  /* r1 -= 128: expand stack frame */ \
+         /* arg10: copied via r3 to 104(r1) */                    \
+         "ld  3,80(12)\n\t"                                       \
+         "std 3,104(1)\n\t"                                       \
+         /* arg9: copied via r3 to 96(r1) */                      \
+         "ld  3,72(12)\n\t"                                       \
+         "std 3,96(1)\n\t"                                        \
+         /* args1-8: loaded into r3-r10 */                        \
+         "ld   3, 8(12)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(12)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(12)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(12)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(12)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(12)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(12)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(12)\n\t" /* arg8->r10 */                     \
+         "ld  12, 0(12)\n\t"  /* r12 = _argvec[2]: target */      \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R12  /* call *r12 */  \
+         "mr 12,%1\n\t"       /* reload &_argvec[2] */            \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "ld 2,-16(12)\n\t"   /* r2 = _argvec[0] (restore toc) */ \
+         VALGRIND_RESTORE_STACK /* r1 = r28 */                    \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10,arg11)     \
+   do { /* noredir call of orig.nraddr, 11 args (9-11 on stack) */ \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+11];                       \
+      volatile unsigned long _res;                                \
+      /* _argvec[0]: slot where the asm saves the current r2 */   \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      _argvec[2+9] = (unsigned long)arg9;                         \
+      _argvec[2+10] = (unsigned long)arg10;                       \
+      _argvec[2+11] = (unsigned long)arg11;                       \
+      __asm__ volatile(                                           \
+         VALGRIND_ALIGN_STACK   /* r28 = r1, then align r1 */     \
+         "mr 12,%1\n\t"       /* r12 = &_argvec[2] */             \
+         "std 2,-16(12)\n\t"  /* _argvec[0] = r2 (save toc) */    \
+         "ld   2,-8(12)\n\t"  /* r2 = _argvec[1]: nraddr's toc */ \
+         "addi 1,1,-144\n\t"  /* r1 -= 144: expand stack frame */ \
+         /* arg11: copied via r3 to 112(r1) */                    \
+         "ld  3,88(12)\n\t"                                       \
+         "std 3,112(1)\n\t"                                       \
+         /* arg10: copied via r3 to 104(r1) */                    \
+         "ld  3,80(12)\n\t"                                       \
+         "std 3,104(1)\n\t"                                       \
+         /* arg9: copied via r3 to 96(r1) */                      \
+         "ld  3,72(12)\n\t"                                       \
+         "std 3,96(1)\n\t"                                        \
+         /* args1-8: loaded into r3-r10 */                        \
+         "ld   3, 8(12)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(12)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(12)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(12)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(12)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(12)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(12)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(12)\n\t" /* arg8->r10 */                     \
+         "ld  12, 0(12)\n\t"  /* r12 = _argvec[2]: target */      \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R12  /* call *r12 */  \
+         "mr 12,%1\n\t"       /* reload &_argvec[2] */            \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "ld 2,-16(12)\n\t"   /* r2 = _argvec[0] (restore toc) */ \
+         VALGRIND_RESTORE_STACK /* r1 = r28 */                    \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                arg7,arg8,arg9,arg10,arg11,arg12) \
+   do { /* call orig with 12 args; result -> lval */              \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+12];                       \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2; /* nraddr's TOC */  \
+      _argvec[2]   = (unsigned long)_orig.nraddr; /* target */    \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      _argvec[2+9] = (unsigned long)arg9;                         \
+      _argvec[2+10] = (unsigned long)arg10;                       \
+      _argvec[2+11] = (unsigned long)arg11;                       \
+      _argvec[2+12] = (unsigned long)arg12;                       \
+      __asm__ volatile(                                           \
+         VALGRIND_ALIGN_STACK                                     \
+         "mr 12,%1\n\t"       /* r12 = &_argvec[2] */             \
+         "std 2,-16(12)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(12)\n\t"  /* use nraddr's tocptr */           \
+         "addi 1,1,-144\n\t"  /* expand stack frame */            \
+         /* arg12 -> 120(r1) */                                   \
+         "ld  3,96(12)\n\t"                                       \
+         "std 3,120(1)\n\t"                                       \
+         /* arg11 -> 112(r1) */                                   \
+         "ld  3,88(12)\n\t"                                       \
+         "std 3,112(1)\n\t"                                       \
+         /* arg10 -> 104(r1) */                                   \
+         "ld  3,80(12)\n\t"                                       \
+         "std 3,104(1)\n\t"                                       \
+         /* arg9 -> 96(r1) */                                     \
+         "ld  3,72(12)\n\t"                                       \
+         "std 3,96(1)\n\t"                                        \
+         /* args1-8 go in r3..r10 */                              \
+         "ld   3, 8(12)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(12)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(12)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(12)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(12)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(12)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(12)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(12)\n\t" /* arg8->r10 */                     \
+         "ld  12, 0(12)\n\t"  /* target->r12 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R12                  \
+         "mr 12,%1\n\t"       /* reload &_argvec[2] */            \
+         "mr %0,3\n\t"        /* r3 -> _res */                    \
+         "ld 2,-16(12)\n\t" /* restore tocptr */                  \
+         VALGRIND_RESTORE_STACK                                   \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#endif /* PLAT_ppc64le_linux */
+
 /* ------------------------- arm-linux ------------------------- */
 
 #if defined(PLAT_arm_linux)
@@ -5938,6 +6574,7 @@
 #undef PLAT_amd64_linux
 #undef PLAT_ppc32_linux
 #undef PLAT_ppc64be_linux
+#undef PLAT_ppc64le_linux
 #undef PLAT_arm_linux
 #undef PLAT_s390x_linux
 #undef PLAT_mips32_linux
diff --git a/include/vki/vki-ppc64-linux.h b/include/vki/vki-ppc64-linux.h
index 7f74558..1edc34d 100644
--- a/include/vki/vki-ppc64-linux.h
+++ b/include/vki/vki-ppc64-linux.h
@@ -31,9 +31,11 @@
 #ifndef __VKI_PPC64_LINUX_H
 #define __VKI_PPC64_LINUX_H
 
-// ppc64 is big-endian.
+#if defined(VGP_ppc32_linux) || defined(VGP_ppc64be_linux)
 #define VKI_BIG_ENDIAN  1
-
+#elif defined(VGP_ppc64le_linux)
+#define VKI_LITTLE_ENDIAN  1
+#endif
 //----------------------------------------------------------------------
 // From linux-2.6.13/include/asm-ppc64/types.h
 //----------------------------------------------------------------------
diff --git a/memcheck/mc_machine.c b/memcheck/mc_machine.c
index ccb431f..ff56a6e 100644
--- a/memcheck/mc_machine.c
+++ b/memcheck/mc_machine.c
@@ -167,7 +167,11 @@
             return GOF(GPRn);
          by testing ox instead of o, and setting ox back 4 bytes when sz == 4.
       */
+#if defined(VGA_ppc64le)
+      Int ox = o;
+#else
       Int ox = sz == 8 ? o : (o - 4);
+#endif
       if (ox == GOF(GPR0)) return ox;
       if (ox == GOF(GPR1)) return ox;
       if (ox == GOF(GPR2)) return ox;
@@ -367,7 +371,11 @@
    Int  o  = offset;
    Int  sz = szB;
    tl_assert(sz > 0);
+#if defined(VGA_ppc64)
    tl_assert(host_is_big_endian());
+#elif defined(VGA_ppc64le)
+   tl_assert(host_is_little_endian());
+#endif
 
    if (o == GOF(GPR0) && sz == 4) return o;
    if (o == GOF(GPR1) && sz == 4) return o;
diff --git a/tests/arch_test.c b/tests/arch_test.c
index 2fa3b48..ef00785 100644
--- a/tests/arch_test.c
+++ b/tests/arch_test.c
@@ -27,6 +27,7 @@
    "amd64",
    "ppc32",
    "ppc64",
+   "ppc64le",
    "arm",
    "s390x",
    "mips32",
@@ -54,6 +55,9 @@
    if ( 0 == strcmp( arch, "ppc32" ) ) return True;
 #endif
 
+#elif defined(VGP_ppc64le_linux)
+   if ( 0 == strcmp( arch, "ppc64" ) ) return True;
+
 #elif defined(VGP_s390x_linux)
    if ( 0 == strcmp( arch, "s390x" ) ) return True;