ARM: 7984/1: prefetch: add prefetchw invocations for barriered atomics

Benchmarking the interaction between dmb and pldw shows that issuing
the pldw *after* the dmb instruction can give modest performance gains
(~3% improvement to atomic_add_return on a dual-core Cortex-A15).

This patch adds prefetchw invocations to our barriered atomic
operations, including cmpxchg, test_and_xxx and futexes.
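
As an illustration only (not part of the patch, which uses inline asm
in the kernel), the ordering being benchmarked can be sketched in
plain C with GCC builtins standing in for the kernel primitives:
__sync_synchronize() for smp_mb() (a dmb on ARMv7) and
__builtin_prefetch(p, 1) for prefetchw() (a pldw with the ARMv7 MP
extensions). The function name below is invented for the example.

    #include <stdatomic.h>

    /*
     * Illustrative sketch only: barrier first, then the write
     * prefetch, mirroring the smp_mb(); prefetchw(ptr); sequence
     * this patch adds ahead of the ldrex/strex loops.
     */
    static inline int atomic_add_return_sketch(int i, atomic_int *v)
    {
            __sync_synchronize();      /* full barrier, as smp_mb() */
            __builtin_prefetch(v, 1);  /* write prefetch, after the barrier */
            return atomic_fetch_add_explicit(v, i, memory_order_relaxed) + i;
    }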

Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

diff --git a/arch/arm/include/asm/cmpxchg.h b/arch/arm/include/asm/cmpxchg.h
index df2fbba..abb2c37 100644
--- a/arch/arm/include/asm/cmpxchg.h
+++ b/arch/arm/include/asm/cmpxchg.h
@@ -2,6 +2,7 @@
 #define __ASM_ARM_CMPXCHG_H
 
 #include <linux/irqflags.h>
+#include <linux/prefetch.h>
 #include <asm/barrier.h>
 
 #if defined(CONFIG_CPU_SA1100) || defined(CONFIG_CPU_SA110)
@@ -35,6 +36,7 @@
 #endif
 
 	smp_mb();
+	prefetchw((const void *)ptr);
 
 	switch (size) {
 #if __LINUX_ARM_ARCH__ >= 6
@@ -138,6 +140,8 @@
 {
 	unsigned long oldval, res;
 
+	prefetchw((const void *)ptr);
+
 	switch (size) {
 #ifndef CONFIG_CPU_V6	/* min ARCH >= ARMv6K */
 	case 1:
@@ -230,6 +234,8 @@
 	unsigned long long oldval;
 	unsigned long res;
 
+	prefetchw(ptr);
+
 	__asm__ __volatile__(
 "1:	ldrexd		%1, %H1, [%3]\n"
 "	teq		%1, %4\n"