ARM: 7984/1: prefetch: add prefetchw invocations for barriered atomics

After a bunch of benchmarking on the interaction between dmb and pldw, it turns out that issuing the pldw *after* the dmb instruction can give modest performance gains (~3% atomic_add_return improvement on a dual A15).

This patch adds prefetchw invocations to our barriered atomic operations including cmpxchg, test_and_xxx and futexes.

Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

commit: c32ffce0f66e5d1d4856254516e24f5ef275cd00 [log] [tgz]
author: Will Deacon <will.deacon@arm.com> Fri Feb 21 17:01:48 2014 +0100
committer: Russell King <rmk+kernel@arm.linux.org.uk> Tue Feb 25 11:30:20 2014 +0000
tree: 125229cdd38bfd6e7e62cff7eb8771a34cc999a7
parent: 6ea41c80115f49e7d8b80312ffc99973d283471f [diff] [blame]
diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h
index 2aff798..53e69da 100644
--- a/arch/arm/include/asm/futex.h
+++ b/arch/arm/include/asm/futex.h

@@ -23,6 +23,7 @@
 
 #define __futex_atomic_op(insn, ret, oldval, tmp, uaddr, oparg)	\
 	smp_mb();						\
+	prefetchw(uaddr);					\
 	__asm__ __volatile__(					\
 	"1:	ldrex	%1, [%3]\n"				\
 	"	" insn "\n"					\
@@ -46,6 +47,8 @@
 		return -EFAULT;
 
 	smp_mb();
+	/* Prefetching cannot fault */
+	prefetchw(uaddr);
 	__asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n"
 	"1:	ldrex	%1, [%4]\n"
 	"	teq	%1, %2\n"
commit	c32ffce0f66e5d1d4856254516e24f5ef275cd00	[log] [tgz]
author	Will Deacon <will.deacon@arm.com>	Fri Feb 21 17:01:48 2014 +0100
committer	Russell King <rmk+kernel@arm.linux.org.uk>	Tue Feb 25 11:30:20 2014 +0000
tree	125229cdd38bfd6e7e62cff7eb8771a34cc999a7
parent	6ea41c80115f49e7d8b80312ffc99973d283471f [diff] [blame]