make thread-pointer-loading asm non-volatile

this allows the compiler to cache and reuse the loaded thread pointer,
so code no longer has to be written carefully to avoid loading it more
than once for the sake of archs where the load may be expensive.

depends on commit 1c84c99913bf1cd47b866ed31e665848a0da84a2 for
correctness, since otherwise the compiler could hoist loads during
stage 3 of dynamic linking before the initial thread-pointer setup.
diff --git a/arch/aarch64/pthread_arch.h b/arch/aarch64/pthread_arch.h
index e8499d8..e64b126 100644
--- a/arch/aarch64/pthread_arch.h
+++ b/arch/aarch64/pthread_arch.h
@@ -1,7 +1,7 @@
 static inline struct pthread *__pthread_self()
 {
 	char *self;
-	__asm__ __volatile__ ("mrs %0,tpidr_el0" : "=r"(self));
+	__asm__ ("mrs %0,tpidr_el0" : "=r"(self));
 	return (void*)(self - sizeof(struct pthread));
 }
 
diff --git a/arch/arm/pthread_arch.h b/arch/arm/pthread_arch.h
index 5c6aff2..e689ea2 100644
--- a/arch/arm/pthread_arch.h
+++ b/arch/arm/pthread_arch.h
@@ -4,7 +4,7 @@
 static inline pthread_t __pthread_self()
 {
 	char *p;
-	__asm__ __volatile__ ( "mrc p15,0,%0,c13,c0,3" : "=r"(p) );
+	__asm__ ( "mrc p15,0,%0,c13,c0,3" : "=r"(p) );
 	return (void *)(p-sizeof(struct pthread));
 }
 
@@ -20,7 +20,7 @@
 {
 	extern hidden uintptr_t __a_gettp_ptr;
 	register uintptr_t p __asm__("r0");
-	__asm__ __volatile__ ( BLX " %1" : "=r"(p) : "r"(__a_gettp_ptr) : "cc", "lr" );
+	__asm__ ( BLX " %1" : "=r"(p) : "r"(__a_gettp_ptr) : "cc", "lr" );
 	return (void *)(p-sizeof(struct pthread));
 }
 
diff --git a/arch/i386/pthread_arch.h b/arch/i386/pthread_arch.h
index 7f38a56..6f600b9 100644
--- a/arch/i386/pthread_arch.h
+++ b/arch/i386/pthread_arch.h
@@ -1,7 +1,7 @@
 static inline struct pthread *__pthread_self()
 {
 	struct pthread *self;
-	__asm__ __volatile__ ("movl %%gs:0,%0" : "=r" (self) );
+	__asm__ ("movl %%gs:0,%0" : "=r" (self) );
 	return self;
 }
 
diff --git a/arch/microblaze/pthread_arch.h b/arch/microblaze/pthread_arch.h
index 08d1ba7..f6ba8de 100644
--- a/arch/microblaze/pthread_arch.h
+++ b/arch/microblaze/pthread_arch.h
@@ -1,7 +1,7 @@
 static inline struct pthread *__pthread_self()
 {
 	struct pthread *self;
-	__asm__ __volatile__ ("ori %0, r21, 0" : "=r" (self) );
+	__asm__ ("ori %0, r21, 0" : "=r" (self) );
 	return self;
 }
 
diff --git a/arch/mips/pthread_arch.h b/arch/mips/pthread_arch.h
index 5fea15a..1e7839e 100644
--- a/arch/mips/pthread_arch.h
+++ b/arch/mips/pthread_arch.h
@@ -2,10 +2,10 @@
 {
 #if __mips_isa_rev < 2
 	register char *tp __asm__("$3");
-	__asm__ __volatile__ (".word 0x7c03e83b" : "=r" (tp) );
+	__asm__ (".word 0x7c03e83b" : "=r" (tp) );
 #else
 	char *tp;
-	__asm__ __volatile__ ("rdhwr %0, $29" : "=r" (tp) );
+	__asm__ ("rdhwr %0, $29" : "=r" (tp) );
 #endif
 	return (pthread_t)(tp - 0x7000 - sizeof(struct pthread));
 }
diff --git a/arch/mips64/pthread_arch.h b/arch/mips64/pthread_arch.h
index 5fea15a..1e7839e 100644
--- a/arch/mips64/pthread_arch.h
+++ b/arch/mips64/pthread_arch.h
@@ -2,10 +2,10 @@
 {
 #if __mips_isa_rev < 2
 	register char *tp __asm__("$3");
-	__asm__ __volatile__ (".word 0x7c03e83b" : "=r" (tp) );
+	__asm__ (".word 0x7c03e83b" : "=r" (tp) );
 #else
 	char *tp;
-	__asm__ __volatile__ ("rdhwr %0, $29" : "=r" (tp) );
+	__asm__ ("rdhwr %0, $29" : "=r" (tp) );
 #endif
 	return (pthread_t)(tp - 0x7000 - sizeof(struct pthread));
 }
diff --git a/arch/mipsn32/pthread_arch.h b/arch/mipsn32/pthread_arch.h
index 5fea15a..1e7839e 100644
--- a/arch/mipsn32/pthread_arch.h
+++ b/arch/mipsn32/pthread_arch.h
@@ -2,10 +2,10 @@
 {
 #if __mips_isa_rev < 2
 	register char *tp __asm__("$3");
-	__asm__ __volatile__ (".word 0x7c03e83b" : "=r" (tp) );
+	__asm__ (".word 0x7c03e83b" : "=r" (tp) );
 #else
 	char *tp;
-	__asm__ __volatile__ ("rdhwr %0, $29" : "=r" (tp) );
+	__asm__ ("rdhwr %0, $29" : "=r" (tp) );
 #endif
 	return (pthread_t)(tp - 0x7000 - sizeof(struct pthread));
 }
diff --git a/arch/or1k/pthread_arch.h b/arch/or1k/pthread_arch.h
index 521b9c5..1b806f8 100644
--- a/arch/or1k/pthread_arch.h
+++ b/arch/or1k/pthread_arch.h
@@ -3,10 +3,10 @@
 {
 #ifdef __clang__
 	char *tp;
-	__asm__ __volatile__ ("l.ori %0, r10, 0" : "=r" (tp) );
+	__asm__ ("l.ori %0, r10, 0" : "=r" (tp) );
 #else
 	register char *tp __asm__("r10");
-	__asm__ __volatile__ ("" : "=r" (tp) );
+	__asm__ ("" : "=r" (tp) );
 #endif
 	return (struct pthread *) (tp - sizeof(struct pthread));
 }
diff --git a/arch/powerpc/pthread_arch.h b/arch/powerpc/pthread_arch.h
index bab2e6c..ae0f28d 100644
--- a/arch/powerpc/pthread_arch.h
+++ b/arch/powerpc/pthread_arch.h
@@ -1,7 +1,7 @@
 static inline struct pthread *__pthread_self()
 {
 	register char *tp __asm__("r2");
-	__asm__ __volatile__ ("" : "=r" (tp) );
+	__asm__ ("" : "=r" (tp) );
 	return (pthread_t)(tp - 0x7000 - sizeof(struct pthread));
 }
                         
diff --git a/arch/powerpc64/pthread_arch.h b/arch/powerpc64/pthread_arch.h
index 37b75e2..79c3ecd 100644
--- a/arch/powerpc64/pthread_arch.h
+++ b/arch/powerpc64/pthread_arch.h
@@ -1,7 +1,7 @@
 static inline struct pthread *__pthread_self()
 {
 	register char *tp __asm__("r13");
-	__asm__ __volatile__ ("" : "=r" (tp) );
+	__asm__ ("" : "=r" (tp) );
 	return (pthread_t)(tp - 0x7000 - sizeof(struct pthread));
 }
 
diff --git a/arch/s390x/pthread_arch.h b/arch/s390x/pthread_arch.h
index bd90016..e2251f1 100644
--- a/arch/s390x/pthread_arch.h
+++ b/arch/s390x/pthread_arch.h
@@ -1,7 +1,7 @@
 static inline struct pthread *__pthread_self()
 {
 	struct pthread *self;
-	__asm__ __volatile__ (
+	__asm__ (
 		"ear  %0, %%a0\n"
 		"sllg %0, %0, 32\n"
 		"ear  %0, %%a1\n"
diff --git a/arch/sh/pthread_arch.h b/arch/sh/pthread_arch.h
index a7dd27a..3ee9c1a 100644
--- a/arch/sh/pthread_arch.h
+++ b/arch/sh/pthread_arch.h
@@ -1,7 +1,7 @@
 static inline struct pthread *__pthread_self()
 {
 	char *self;
-	__asm__ __volatile__ ("stc gbr,%0" : "=r" (self) );
+	__asm__ ("stc gbr,%0" : "=r" (self) );
 	return (struct pthread *) (self - sizeof(struct pthread));
 }
 
diff --git a/arch/x32/pthread_arch.h b/arch/x32/pthread_arch.h
index 267ad07..f640a1a 100644
--- a/arch/x32/pthread_arch.h
+++ b/arch/x32/pthread_arch.h
@@ -1,7 +1,7 @@
 static inline struct pthread *__pthread_self()
 {
 	struct pthread *self;
-	__asm__ __volatile__ ("mov %%fs:0,%0" : "=r" (self) );
+	__asm__ ("mov %%fs:0,%0" : "=r" (self) );
 	return self;
 }
 
diff --git a/arch/x86_64/pthread_arch.h b/arch/x86_64/pthread_arch.h
index c61509c..65e880c 100644
--- a/arch/x86_64/pthread_arch.h
+++ b/arch/x86_64/pthread_arch.h
@@ -1,7 +1,7 @@
 static inline struct pthread *__pthread_self()
 {
 	struct pthread *self;
-	__asm__ __volatile__ ("mov %%fs:0,%0" : "=r" (self) );
+	__asm__ ("mov %%fs:0,%0" : "=r" (self) );
 	return self;
 }