ARM: 7984/1: prefetch: add prefetchw invocations for barriered atomics
After a bunch of benchmarking on the interaction between dmb and pldw,
it turns out that issuing the pldw *after* the dmb instruction can
give modest performance gains (~3% atomic_add_return improvement on a
dual A15).
This patch adds prefetchw invocations to our barriered atomic operations
including cmpxchg, test_and_xxx and futexes.
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
diff --git a/arch/arm/include/asm/cmpxchg.h b/arch/arm/include/asm/cmpxchg.h
index df2fbba..abb2c37 100644
--- a/arch/arm/include/asm/cmpxchg.h
+++ b/arch/arm/include/asm/cmpxchg.h
@@ -2,6 +2,7 @@
#define __ASM_ARM_CMPXCHG_H
#include <linux/irqflags.h>
+#include <linux/prefetch.h>
#include <asm/barrier.h>
#if defined(CONFIG_CPU_SA1100) || defined(CONFIG_CPU_SA110)
@@ -35,6 +36,7 @@
#endif
smp_mb();
+ prefetchw((const void *)ptr);
switch (size) {
#if __LINUX_ARM_ARCH__ >= 6
@@ -138,6 +140,8 @@
{
unsigned long oldval, res;
+ prefetchw((const void *)ptr);
+
switch (size) {
#ifndef CONFIG_CPU_V6 /* min ARCH >= ARMv6K */
case 1:
@@ -230,6 +234,8 @@
unsigned long long oldval;
unsigned long res;
+ prefetchw(ptr);
+
__asm__ __volatile__(
"1: ldrexd %1, %H1, [%3]\n"
" teq %1, %4\n"