Fixed up the ppc64 assembly to emit function descriptors in .opd sections.

do_syscall_for_client_WRK() needed a bigger stack frame (64 -> 80 bytes) so that its saved registers sit above the linkage area instead of overlapping it.

Always use the dot-prefixed label when branching to a label (e.g. "bl .m_libcassert_get_ip").

Stopped wrapping the assembly with
.section ".text"
...
.previous
 - ppc64 doesn't like it; it seems we can't 'stack' more than one section to pop off with .previous.





git-svn-id: svn://svn.valgrind.org/valgrind/trunk@5405 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/coregrind/m_libcassert.c b/coregrind/m_libcassert.c
index 773b601..f5e4a17 100644
--- a/coregrind/m_libcassert.c
+++ b/coregrind/m_libcassert.c
@@ -78,8 +78,8 @@
 #elif defined(VGP_ppc64_linux)
 #  define GET_REAL_PC_SP_AND_FP(pc, sp, fp)                   \
       asm("mflr 0;"                   /* r0 = lr */           \
-          "bl m_libcassert_get_ip;"   /* lr = pc */           \
-          "m_libcassert_get_ip:\n"                            \
+          "bl .m_libcassert_get_ip;"  /* lr = pc */           \
+          ".m_libcassert_get_ip:\n"                           \
           "mflr %0;"                \
           "mtlr 0;"                   /* restore lr */        \
           "mr %1,1;"                \
diff --git a/coregrind/m_machine.c b/coregrind/m_machine.c
index 403b8fd..a8987e7 100644
--- a/coregrind/m_machine.c
+++ b/coregrind/m_machine.c
@@ -373,7 +373,7 @@
      /* VG_(printf)("FP %d VMX %d\n", (Int)have_fp, (Int)have_vmx); */
 
      /* We can only support 3 cases, not 4 (vmx but no fp).  So make
-	fp a prerequisite for vmx. */
+        fp a prerequisite for vmx. */
      if (have_vmx && !have_fp)
         have_vmx = False;
 
diff --git a/coregrind/m_main.c b/coregrind/m_main.c
index 97ba2d9..0bb7161 100644
--- a/coregrind/m_main.c
+++ b/coregrind/m_main.c
@@ -2850,9 +2850,9 @@
 );
 #elif defined(VGP_ppc64_linux)
 asm("\n"
-    ".text\n"
     /* PPC64 ELF ABI says '_start' points to a function descriptor.
        So we must have one, and that is what goes into the .opd section. */
+    "\t.align 2\n"
     "\t.global _start\n"
     "\t.section \".opd\",\"aw\"\n"
     "\t.align 3\n"
@@ -2886,7 +2886,6 @@
     "\tbl ._start_in_C\n"
     "\tnop\n"
     "\ttrap\n"
-    ".previous\n"
 );
 #else
 #error "_start: needs implementation on this platform"
diff --git a/coregrind/m_signals.c b/coregrind/m_signals.c
index e920271..a9d060c 100644
--- a/coregrind/m_signals.c
+++ b/coregrind/m_signals.c
@@ -477,11 +477,18 @@
    ".previous\n"
 #elif defined(VGP_ppc64_linux)
 #  define _MYSIG(name) \
-   ".text\n" \
+   ".align   2\n" \
+   ".globl   my_sigreturn\n" \
+   ".section \".opd\",\"aw\"\n" \
+   ".align   3\n" \
    "my_sigreturn:\n" \
+   ".quad    .my_sigreturn,.TOC.@tocbase,0\n" \
+   ".previous\n" \
+   ".type    .my_sigreturn,@function\n" \
+   ".globl   .my_sigreturn\n" \
+   ".my_sigreturn:\n" \
    "	li	0, " #name "\n" \
-   "	sc\n" \
-   ".previous\n"
+   "	sc\n"
 #else
 #  error Unknown platform
 #endif
diff --git a/coregrind/m_syscall.c b/coregrind/m_syscall.c
index ae28f49..db95656 100644
--- a/coregrind/m_syscall.c
+++ b/coregrind/m_syscall.c
@@ -218,7 +218,15 @@
    bottom but of [1]. */
 extern void do_syscall_WRK ( ULong* argblock );
 asm(
-".text\n"
+".align   2\n"
+".globl   do_syscall_WRK\n"
+".section \".opd\",\"aw\"\n"
+".align   3\n"
+"do_syscall_WRK:\n"
+".quad    .do_syscall_WRK,.TOC.@tocbase,0\n"
+".previous\n"
+".type    .do_syscall_WRK,@function\n"
+".globl   .do_syscall_WRK\n"
 ".do_syscall_WRK:\n"
 "        std  3,-16(1)\n"  /* stash arg */
 "        ld   8, 48(3)\n"  /* sc arg 6 */
@@ -236,7 +244,6 @@
 "        andi. 3,3,1\n"
 "        std  3,8(5)\n"    /* argblock[1] = cr0.s0 & 1 */
 "        blr\n"
-".previous\n"
 );
 #else
 #  error Unknown platform
diff --git a/coregrind/m_syswrap/syscall-ppc64-linux.S b/coregrind/m_syswrap/syscall-ppc64-linux.S
index 3d8134f..0a31c24 100644
--- a/coregrind/m_syswrap/syscall-ppc64-linux.S
+++ b/coregrind/m_syswrap/syscall-ppc64-linux.S
@@ -70,14 +70,22 @@
 /* from vki_arch.h */
 #define VKI_SIG_SETMASK 2
 
+.align 2
+.globl ML_(do_syscall_for_client_WRK)
+.section ".opd","aw"
+.align 3
+ML_(do_syscall_for_client_WRK):	
+.quad .ML_(do_syscall_for_client_WRK),.TOC.@tocbase,0
+.previous
+.type .ML_(do_syscall_for_client_WRK),@function
 .globl .ML_(do_syscall_for_client_WRK)
 .ML_(do_syscall_for_client_WRK):
         /* make a stack frame */
-        stdu    1,-64(1)
-        std     31,56(1)
-        std     30,48(1)
-        std     29,40(1)
-        std     28,32(1)
+        stdu    1,-80(1)
+        std     31,72(1)
+        std     30,64(1)
+        std     29,56(1)
+        std     28,48(1)
         mr      31,3            /* syscall number */
         mr      30,4            /* guest_state */
         mr      29,6            /* postmask */
@@ -122,11 +130,11 @@
         /* now safe from signals */
 
         /* pop off stack frame */
-5:      ld      28,32(1)
-        ld      29,40(1)
-        ld      30,48(1)
-        ld      31,56(1)
-        addi    1,1,64
+5:      ld      28,48(1)
+        ld      29,56(1)
+        ld      30,64(1)
+        ld      31,72(1)
+        addi    1,1,80
         blr
 
 	/* failure: return -ve error code */
@@ -149,8 +157,7 @@
 ML_(blksys_committed): .long 4b
 ML_(blksys_finished):  .long 5b
 
-.previous
-		
+
 /* Let the linker know we don't need an executable stack */
 .section .note.GNU-stack,"",@progbits
 
diff --git a/coregrind/m_syswrap/syswrap-ppc64-linux.c b/coregrind/m_syswrap/syswrap-ppc64-linux.c
index 35c068d..ef75719 100644
--- a/coregrind/m_syswrap/syswrap-ppc64-linux.c
+++ b/coregrind/m_syswrap/syswrap-ppc64-linux.c
@@ -74,7 +74,7 @@
    address, the second word is the TOC ptr (r2), and the third word is
    the static chain value. */
 asm(
-".text\n"
+"   .align   2\n"
 "   .globl   vgModuleLocal_call_on_new_stack_0_1\n"
 "   .section \".opd\",\"aw\"\n"
 "   .align   3\n"
@@ -122,7 +122,6 @@
 "   mtcr 0\n\t"            // CAB: Need this?
 "   bctr\n\t"              // jump to dst
 "   trap\n"                // should never get here
-".previous\n"
 );
 
 
@@ -166,7 +165,15 @@
                                      Int*  parent_tid, 
                                      void/*vki_modify_ldt_t*/ * );
 asm(
-".text\n"
+"   .align   2\n"
+"   .globl   do_syscall_clone_ppc64_linux\n"
+"   .section \".opd\",\"aw\"\n"
+"   .align   3\n"
+"do_syscall_clone_ppc64_linux:\n"
+"   .quad    .do_syscall_clone_ppc64_linux,.TOC.@tocbase,0\n"
+"   .previous\n"
+"   .type    .do_syscall_clone_ppc64_linux,@function\n"
+"   .globl   .do_syscall_clone_ppc64_linux\n"
 ".do_syscall_clone_ppc64_linux:\n"
 "       stdu    1,-64(1)\n"
 "       std     29,40(1)\n"
@@ -229,7 +236,6 @@
 "       ld      31,56(1)\n"
 "       addi    1,1,64\n"
 "       blr\n"
-".previous\n"
 );
 
 #undef __NR_CLONE
diff --git a/coregrind/m_trampoline.S b/coregrind/m_trampoline.S
index 8423fed..1da47dd 100644
--- a/coregrind/m_trampoline.S
+++ b/coregrind/m_trampoline.S
@@ -296,11 +296,28 @@
 	/* a leading page of unexecutable code */
 	UD2_PAGE
 
+.align 2
 .global VG_(trampoline_stuff_start)
+.section ".opd","aw"
+.align 3
 VG_(trampoline_stuff_start):
+.quad .VG_(trampoline_stuff_start),.TOC.@tocbase,0
+.previous
+.type .VG_(trampoline_stuff_start),@function
+.global  .VG_(trampoline_stuff_start)
+.VG_(trampoline_stuff_start):
 	
+
+.align 2
 .global VG_(trampoline_stuff_end)
+.section ".opd","aw"
+.align 3
 VG_(trampoline_stuff_end):
+.quad .VG_(trampoline_stuff_end),.TOC.@tocbase,0
+.previous
+.type .VG_(trampoline_stuff_end),@function
+.global  .VG_(trampoline_stuff_end)
+.VG_(trampoline_stuff_end):
 
 #	undef UD2_16
 #	undef UD2_64
diff --git a/coregrind/vki_unistd-ppc64-linux.h b/coregrind/vki_unistd-ppc64-linux.h
index 17f640c..ffd5397 100644
--- a/coregrind/vki_unistd-ppc64-linux.h
+++ b/coregrind/vki_unistd-ppc64-linux.h
@@ -309,4 +309,4 @@
 #define __NR_inotify_rm_watch   277
 
 
-#endif /* __VKI_UNISTD_PPC32_LINUX_H */
+#endif /* __VKI_UNISTD_PPC64_LINUX_H */
diff --git a/docs/internals/performance.txt b/docs/internals/performance.txt
index ddeeda6..ff2075c 100644
--- a/docs/internals/performance.txt
+++ b/docs/internals/performance.txt
@@ -16,7 +16,7 @@
   Saved 1--3% on a few programs.
 - r5345,r5346,r5352: Julian improved the dispatcher so that x86 and
   AMD64 use jumps instead of call/return for calling translations.
-  Also, on x86, amd64 and ppc32, --profile-flags style profiling was
+  Also, on x86, amd64, ppc32 and ppc64, --profile-flags style profiling was
   removed from the despatch loop unless --profile-flags is being used.
   Improved Nulgrind performance typically by 10--20%, and Memcheck
   performance typically by 2--20%.