arm64: atomics: fix grossly inconsistent asm constraints for exclusives

Our uses of inline asm constraints for atomic operations are fairly
wild and varied. We basically need to guarantee the following:

  1. Any instructions with barrier implications
     (load-acquire/store-release) have a "memory" clobber

  2. When performing exclusive accesses, the addressing mode is generated
     using the "Q" constraint

  3. Atomic blocks which use the condition flags have a "cc" clobber

This patch addresses these concerns. As well as fixing the semantics of
the code, it stops GCC complaining about impossible asm constraints.
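
For illustration, the pattern this patch converges on looks like the
following (a minimal standalone sketch, assuming an AArch64 GCC target;
the function name is hypothetical and not part of the kernel sources):

  static inline int example_add_return(int i, int *counter)
  {
  	unsigned long tmp;
  	int result;

  	asm volatile("// example_add_return\n"
  "1:	ldaxr	%w0, %2\n"	/* exclusive load-acquire */
  "	add	%w0, %w0, %w3\n"
  "	stlxr	%w1, %w0, %2\n"	/* exclusive store-release */
  "	cbnz	%w1, 1b"
  	: "=&r" (result), "=&r" (tmp), "+Q" (*counter)
  	: "Ir" (i)
  	: "cc", "memory");	/* (3) flags clobber; (1) barrier => "memory" */

  	return result;
  }

The "+Q" operand satisfies point 2: it guarantees a single-base-register
addressing mode with no offset, which is the only form the exclusive
instructions accept (the old "o" constraint paired with "[%3]" could
describe addresses that ldxr/stxr cannot encode, hence GCC's complaints).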

Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
diff --git a/arch/arm64/include/asm/atomic.h b/arch/arm64/include/asm/atomic.h
index 407717b..8363644 100644
--- a/arch/arm64/include/asm/atomic.h
+++ b/arch/arm64/include/asm/atomic.h
@@ -49,12 +49,12 @@
 	int result;
 
 	asm volatile("// atomic_add\n"
-"1:	ldxr	%w0, [%3]\n"
-"	add	%w0, %w0, %w4\n"
-"	stxr	%w1, %w0, [%3]\n"
+"1:	ldxr	%w0, %2\n"
+"	add	%w0, %w0, %w3\n"
+"	stxr	%w1, %w0, %2\n"
 "	cbnz	%w1, 1b"
-	: "=&r" (result), "=&r" (tmp), "+o" (v->counter)
-	: "r" (&v->counter), "Ir" (i)
+	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
+	: "Ir" (i)
 	: "cc");
 }
 
@@ -64,13 +64,13 @@
 	int result;
 
 	asm volatile("// atomic_add_return\n"
-"1:	ldaxr	%w0, [%3]\n"
-"	add	%w0, %w0, %w4\n"
-"	stlxr	%w1, %w0, [%3]\n"
+"1:	ldaxr	%w0, %2\n"
+"	add	%w0, %w0, %w3\n"
+"	stlxr	%w1, %w0, %2\n"
 "	cbnz	%w1, 1b"
-	: "=&r" (result), "=&r" (tmp), "+o" (v->counter)
-	: "r" (&v->counter), "Ir" (i)
-	: "cc");
+	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
+	: "Ir" (i)
+	: "cc", "memory");
 
 	return result;
 }
@@ -81,12 +81,12 @@
 	int result;
 
 	asm volatile("// atomic_sub\n"
-"1:	ldxr	%w0, [%3]\n"
-"	sub	%w0, %w0, %w4\n"
-"	stxr	%w1, %w0, [%3]\n"
+"1:	ldxr	%w0, %2\n"
+"	sub	%w0, %w0, %w3\n"
+"	stxr	%w1, %w0, %2\n"
 "	cbnz	%w1, 1b"
-	: "=&r" (result), "=&r" (tmp), "+o" (v->counter)
-	: "r" (&v->counter), "Ir" (i)
+	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
+	: "Ir" (i)
 	: "cc");
 }
 
@@ -96,13 +96,13 @@
 	int result;
 
 	asm volatile("// atomic_sub_return\n"
-"1:	ldaxr	%w0, [%3]\n"
-"	sub	%w0, %w0, %w4\n"
-"	stlxr	%w1, %w0, [%3]\n"
+"1:	ldaxr	%w0, %2\n"
+"	sub	%w0, %w0, %w3\n"
+"	stlxr	%w1, %w0, %2\n"
 "	cbnz	%w1, 1b"
-	: "=&r" (result), "=&r" (tmp), "+o" (v->counter)
-	: "r" (&v->counter), "Ir" (i)
-	: "cc");
+	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
+	: "Ir" (i)
+	: "cc", "memory");
 
 	return result;
 }
@@ -113,15 +113,15 @@
 	int oldval;
 
 	asm volatile("// atomic_cmpxchg\n"
-"1:	ldaxr	%w1, [%3]\n"
-"	cmp	%w1, %w4\n"
+"1:	ldaxr	%w1, %2\n"
+"	cmp	%w1, %w3\n"
 "	b.ne	2f\n"
-"	stlxr	%w0, %w5, [%3]\n"
+"	stlxr	%w0, %w4, %2\n"
 "	cbnz	%w0, 1b\n"
 "2:"
-	: "=&r" (tmp), "=&r" (oldval), "+o" (ptr->counter)
-	: "r" (&ptr->counter), "Ir" (old), "r" (new)
-	: "cc");
+	: "=&r" (tmp), "=&r" (oldval), "+Q" (ptr->counter)
+	: "Ir" (old), "r" (new)
+	: "cc", "memory");
 
 	return oldval;
 }
@@ -131,12 +131,12 @@
 	unsigned long tmp, tmp2;
 
 	asm volatile("// atomic_clear_mask\n"
-"1:	ldxr	%0, [%3]\n"
-"	bic	%0, %0, %4\n"
-"	stxr	%w1, %0, [%3]\n"
+"1:	ldxr	%0, %2\n"
+"	bic	%0, %0, %3\n"
+"	stxr	%w1, %0, %2\n"
 "	cbnz	%w1, 1b"
-	: "=&r" (tmp), "=&r" (tmp2), "+o" (*addr)
-	: "r" (addr), "Ir" (mask)
+	: "=&r" (tmp), "=&r" (tmp2), "+Q" (*addr)
+	: "Ir" (mask)
 	: "cc");
 }
 
@@ -182,12 +182,12 @@
 	unsigned long tmp;
 
 	asm volatile("// atomic64_add\n"
-"1:	ldxr	%0, [%3]\n"
-"	add	%0, %0, %4\n"
-"	stxr	%w1, %0, [%3]\n"
+"1:	ldxr	%0, %2\n"
+"	add	%0, %0, %3\n"
+"	stxr	%w1, %0, %2\n"
 "	cbnz	%w1, 1b"
-	: "=&r" (result), "=&r" (tmp), "+o" (v->counter)
-	: "r" (&v->counter), "Ir" (i)
+	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
+	: "Ir" (i)
 	: "cc");
 }
 
@@ -197,13 +197,13 @@
 	unsigned long tmp;
 
 	asm volatile("// atomic64_add_return\n"
-"1:	ldaxr	%0, [%3]\n"
-"	add	%0, %0, %4\n"
-"	stlxr	%w1, %0, [%3]\n"
+"1:	ldaxr	%0, %2\n"
+"	add	%0, %0, %3\n"
+"	stlxr	%w1, %0, %2\n"
 "	cbnz	%w1, 1b"
-	: "=&r" (result), "=&r" (tmp), "+o" (v->counter)
-	: "r" (&v->counter), "Ir" (i)
-	: "cc");
+	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
+	: "Ir" (i)
+	: "cc", "memory");
 
 	return result;
 }
@@ -214,12 +214,12 @@
 	unsigned long tmp;
 
 	asm volatile("// atomic64_sub\n"
-"1:	ldxr	%0, [%3]\n"
-"	sub	%0, %0, %4\n"
-"	stxr	%w1, %0, [%3]\n"
+"1:	ldxr	%0, %2\n"
+"	sub	%0, %0, %3\n"
+"	stxr	%w1, %0, %2\n"
 "	cbnz	%w1, 1b"
-	: "=&r" (result), "=&r" (tmp), "+o" (v->counter)
-	: "r" (&v->counter), "Ir" (i)
+	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
+	: "Ir" (i)
 	: "cc");
 }
 
@@ -229,13 +229,13 @@
 	unsigned long tmp;
 
 	asm volatile("// atomic64_sub_return\n"
-"1:	ldaxr	%0, [%3]\n"
-"	sub	%0, %0, %4\n"
-"	stlxr	%w1, %0, [%3]\n"
+"1:	ldaxr	%0, %2\n"
+"	sub	%0, %0, %3\n"
+"	stlxr	%w1, %0, %2\n"
 "	cbnz	%w1, 1b"
-	: "=&r" (result), "=&r" (tmp), "+o" (v->counter)
-	: "r" (&v->counter), "Ir" (i)
-	: "cc");
+	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
+	: "Ir" (i)
+	: "cc", "memory");
 
 	return result;
 }
@@ -246,15 +246,15 @@
 	unsigned long res;
 
 	asm volatile("// atomic64_cmpxchg\n"
-"1:	ldaxr	%1, [%3]\n"
-"	cmp	%1, %4\n"
+"1:	ldaxr	%1, %2\n"
+"	cmp	%1, %3\n"
 "	b.ne	2f\n"
-"	stlxr	%w0, %5, [%3]\n"
+"	stlxr	%w0, %4, %2\n"
 "	cbnz	%w0, 1b\n"
 "2:"
-	: "=&r" (res), "=&r" (oldval), "+o" (ptr->counter)
-	: "r" (&ptr->counter), "Ir" (old), "r" (new)
-	: "cc");
+	: "=&r" (res), "=&r" (oldval), "+Q" (ptr->counter)
+	: "Ir" (old), "r" (new)
+	: "cc", "memory");
 
 	return oldval;
 }
@@ -267,15 +267,15 @@
 	unsigned long tmp;
 
 	asm volatile("// atomic64_dec_if_positive\n"
-"1:	ldaxr	%0, [%3]\n"
+"1:	ldaxr	%0, %2\n"
 "	subs	%0, %0, #1\n"
 "	b.mi	2f\n"
-"	stlxr	%w1, %0, [%3]\n"
+"	stlxr	%w1, %0, %2\n"
 "	cbnz	%w1, 1b\n"
 "2:"
-	: "=&r" (result), "=&r" (tmp), "+o" (v->counter)
-	: "r" (&v->counter)
-	: "cc");
+	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
+	:
+	: "cc", "memory");
 
 	return result;
 }
diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h
index e0e65b0..968b5cb 100644
--- a/arch/arm64/include/asm/cmpxchg.h
+++ b/arch/arm64/include/asm/cmpxchg.h
@@ -29,39 +29,39 @@
 	switch (size) {
 	case 1:
 		asm volatile("//	__xchg1\n"
-		"1:	ldaxrb	%w0, [%3]\n"
-		"	stlxrb	%w1, %w2, [%3]\n"
+		"1:	ldaxrb	%w0, %2\n"
+		"	stlxrb	%w1, %w3, %2\n"
 		"	cbnz	%w1, 1b\n"
-			: "=&r" (ret), "=&r" (tmp)
-			: "r" (x), "r" (ptr)
-			: "memory", "cc");
+			: "=&r" (ret), "=&r" (tmp), "+Q" (*(u8 *)ptr)
+			: "r" (x)
+			: "cc", "memory");
 		break;
 	case 2:
 		asm volatile("//	__xchg2\n"
-		"1:	ldaxrh	%w0, [%3]\n"
-		"	stlxrh	%w1, %w2, [%3]\n"
+		"1:	ldaxrh	%w0, %2\n"
+		"	stlxrh	%w1, %w3, %2\n"
 		"	cbnz	%w1, 1b\n"
-			: "=&r" (ret), "=&r" (tmp)
-			: "r" (x), "r" (ptr)
-			: "memory", "cc");
+			: "=&r" (ret), "=&r" (tmp), "+Q" (*(u16 *)ptr)
+			: "r" (x)
+			: "cc", "memory");
 		break;
 	case 4:
 		asm volatile("//	__xchg4\n"
-		"1:	ldaxr	%w0, [%3]\n"
-		"	stlxr	%w1, %w2, [%3]\n"
+		"1:	ldaxr	%w0, %2\n"
+		"	stlxr	%w1, %w3, %2\n"
 		"	cbnz	%w1, 1b\n"
-			: "=&r" (ret), "=&r" (tmp)
-			: "r" (x), "r" (ptr)
-			: "memory", "cc");
+			: "=&r" (ret), "=&r" (tmp), "+Q" (*(u32 *)ptr)
+			: "r" (x)
+			: "cc", "memory");
 		break;
 	case 8:
 		asm volatile("//	__xchg8\n"
-		"1:	ldaxr	%0, [%3]\n"
-		"	stlxr	%w1, %2, [%3]\n"
+		"1:	ldaxr	%0, %2\n"
+		"	stlxr	%w1, %3, %2\n"
 		"	cbnz	%w1, 1b\n"
-			: "=&r" (ret), "=&r" (tmp)
-			: "r" (x), "r" (ptr)
-			: "memory", "cc");
+			: "=&r" (ret), "=&r" (tmp), "+Q" (*(u64 *)ptr)
+			: "r" (x)
+			: "cc", "memory");
 		break;
 	default:
 		BUILD_BUG();
@@ -82,14 +82,14 @@
 	case 1:
 		do {
 			asm volatile("// __cmpxchg1\n"
-			"	ldxrb	%w1, [%2]\n"
+			"	ldxrb	%w1, %2\n"
 			"	mov	%w0, #0\n"
 			"	cmp	%w1, %w3\n"
 			"	b.ne	1f\n"
-			"	stxrb	%w0, %w4, [%2]\n"
+			"	stxrb	%w0, %w4, %2\n"
 			"1:\n"
-				: "=&r" (res), "=&r" (oldval)
-				: "r" (ptr), "Ir" (old), "r" (new)
+				: "=&r" (res), "=&r" (oldval), "+Q" (*(u8 *)ptr)
+				: "Ir" (old), "r" (new)
 				: "cc");
 		} while (res);
 		break;
@@ -97,29 +97,29 @@
 	case 2:
 		do {
 			asm volatile("// __cmpxchg2\n"
-			"	ldxrh	%w1, [%2]\n"
+			"	ldxrh	%w1, %2\n"
 			"	mov	%w0, #0\n"
 			"	cmp	%w1, %w3\n"
 			"	b.ne	1f\n"
-			"	stxrh	%w0, %w4, [%2]\n"
+			"	stxrh	%w0, %w4, %2\n"
 			"1:\n"
-				: "=&r" (res), "=&r" (oldval)
-				: "r" (ptr), "Ir" (old), "r" (new)
-				: "memory", "cc");
+				: "=&r" (res), "=&r" (oldval), "+Q" (*(u16 *)ptr)
+				: "Ir" (old), "r" (new)
+				: "cc");
 		} while (res);
 		break;
 
 	case 4:
 		do {
 			asm volatile("// __cmpxchg4\n"
-			"	ldxr	%w1, [%2]\n"
+			"	ldxr	%w1, %2\n"
 			"	mov	%w0, #0\n"
 			"	cmp	%w1, %w3\n"
 			"	b.ne	1f\n"
-			"	stxr	%w0, %w4, [%2]\n"
+			"	stxr	%w0, %w4, %2\n"
 			"1:\n"
-				: "=&r" (res), "=&r" (oldval)
-				: "r" (ptr), "Ir" (old), "r" (new)
+				: "=&r" (res), "=&r" (oldval), "+Q" (*(u32 *)ptr)
+				: "Ir" (old), "r" (new)
 				: "cc");
 		} while (res);
 		break;
@@ -127,14 +127,14 @@
 	case 8:
 		do {
 			asm volatile("// __cmpxchg8\n"
-			"	ldxr	%1, [%2]\n"
+			"	ldxr	%1, %2\n"
 			"	mov	%w0, #0\n"
 			"	cmp	%1, %3\n"
 			"	b.ne	1f\n"
-			"	stxr	%w0, %4, [%2]\n"
+			"	stxr	%w0, %4, %2\n"
 			"1:\n"
-				: "=&r" (res), "=&r" (oldval)
-				: "r" (ptr), "Ir" (old), "r" (new)
+				: "=&r" (res), "=&r" (oldval), "+Q" (*(u64 *)ptr)
+				: "Ir" (old), "r" (new)
 				: "cc");
 		} while (res);
 		break;
diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
index 3468ae8..c582fa3 100644
--- a/arch/arm64/include/asm/futex.h
+++ b/arch/arm64/include/asm/futex.h
@@ -39,7 +39,7 @@
 "	.popsection\n"							\
 	: "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp)	\
 	: "r" (oparg), "Ir" (-EFAULT)					\
-	: "cc")
+	: "cc", "memory")
 
 static inline int
 futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h
index 41112fe..7065e92 100644
--- a/arch/arm64/include/asm/spinlock.h
+++ b/arch/arm64/include/asm/spinlock.h
@@ -45,13 +45,13 @@
 	asm volatile(
 	"	sevl\n"
 	"1:	wfe\n"
-	"2:	ldaxr	%w0, [%1]\n"
+	"2:	ldaxr	%w0, %1\n"
 	"	cbnz	%w0, 1b\n"
-	"	stxr	%w0, %w2, [%1]\n"
+	"	stxr	%w0, %w2, %1\n"
 	"	cbnz	%w0, 2b\n"
-	: "=&r" (tmp)
-	: "r" (&lock->lock), "r" (1)
-	: "memory");
+	: "=&r" (tmp), "+Q" (lock->lock)
+	: "r" (1)
+	: "cc", "memory");
 }
 
 static inline int arch_spin_trylock(arch_spinlock_t *lock)
@@ -59,13 +59,13 @@
 	unsigned int tmp;
 
 	asm volatile(
-	"	ldaxr	%w0, [%1]\n"
+	"	ldaxr	%w0, %1\n"
 	"	cbnz	%w0, 1f\n"
-	"	stxr	%w0, %w2, [%1]\n"
+	"	stxr	%w0, %w2, %1\n"
 	"1:\n"
-	: "=&r" (tmp)
-	: "r" (&lock->lock), "r" (1)
-	: "memory");
+	: "=&r" (tmp), "+Q" (lock->lock)
+	: "r" (1)
+	: "cc", "memory");
 
 	return !tmp;
 }
@@ -73,8 +73,8 @@
 static inline void arch_spin_unlock(arch_spinlock_t *lock)
 {
 	asm volatile(
-	"	stlr	%w1, [%0]\n"
-	: : "r" (&lock->lock), "r" (0) : "memory");
+	"	stlr	%w1, %0\n"
+	: "=Q" (lock->lock) : "r" (0) : "memory");
 }
 
 /*
@@ -94,13 +94,13 @@
 	asm volatile(
 	"	sevl\n"
 	"1:	wfe\n"
-	"2:	ldaxr	%w0, [%1]\n"
+	"2:	ldaxr	%w0, %1\n"
 	"	cbnz	%w0, 1b\n"
-	"	stxr	%w0, %w2, [%1]\n"
+	"	stxr	%w0, %w2, %1\n"
 	"	cbnz	%w0, 2b\n"
-	: "=&r" (tmp)
-	: "r" (&rw->lock), "r" (0x80000000)
-	: "memory");
+	: "=&r" (tmp), "+Q" (rw->lock)
+	: "r" (0x80000000)
+	: "cc", "memory");
 }
 
 static inline int arch_write_trylock(arch_rwlock_t *rw)
@@ -108,13 +108,13 @@
 	unsigned int tmp;
 
 	asm volatile(
-	"	ldaxr	%w0, [%1]\n"
+	"	ldaxr	%w0, %1\n"
 	"	cbnz	%w0, 1f\n"
-	"	stxr	%w0, %w2, [%1]\n"
+	"	stxr	%w0, %w2, %1\n"
 	"1:\n"
-	: "=&r" (tmp)
-	: "r" (&rw->lock), "r" (0x80000000)
-	: "memory");
+	: "=&r" (tmp), "+Q" (rw->lock)
+	: "r" (0x80000000)
+	: "cc", "memory");
 
 	return !tmp;
 }
@@ -122,8 +122,8 @@
 static inline void arch_write_unlock(arch_rwlock_t *rw)
 {
 	asm volatile(
-	"	stlr	%w1, [%0]\n"
-	: : "r" (&rw->lock), "r" (0) : "memory");
+	"	stlr	%w1, %0\n"
+	: "=Q" (rw->lock) : "r" (0) : "memory");
 }
 
 /* write_can_lock - would write_trylock() succeed? */
@@ -148,14 +148,14 @@
 	asm volatile(
 	"	sevl\n"
 	"1:	wfe\n"
-	"2:	ldaxr	%w0, [%2]\n"
+	"2:	ldaxr	%w0, %2\n"
 	"	add	%w0, %w0, #1\n"
 	"	tbnz	%w0, #31, 1b\n"
-	"	stxr	%w1, %w0, [%2]\n"
+	"	stxr	%w1, %w0, %2\n"
 	"	cbnz	%w1, 2b\n"
-	: "=&r" (tmp), "=&r" (tmp2)
-	: "r" (&rw->lock)
-	: "memory");
+	: "=&r" (tmp), "=&r" (tmp2), "+Q" (rw->lock)
+	:
+	: "cc", "memory");
 }
 
 static inline void arch_read_unlock(arch_rwlock_t *rw)
@@ -163,13 +163,13 @@
 	unsigned int tmp, tmp2;
 
 	asm volatile(
-	"1:	ldxr	%w0, [%2]\n"
+	"1:	ldxr	%w0, %2\n"
 	"	sub	%w0, %w0, #1\n"
-	"	stlxr	%w1, %w0, [%2]\n"
+	"	stlxr	%w1, %w0, %2\n"
 	"	cbnz	%w1, 1b\n"
-	: "=&r" (tmp), "=&r" (tmp2)
-	: "r" (&rw->lock)
-	: "memory");
+	: "=&r" (tmp), "=&r" (tmp2), "+Q" (rw->lock)
+	:
+	: "cc", "memory");
 }
 
 static inline int arch_read_trylock(arch_rwlock_t *rw)
@@ -177,14 +177,14 @@
 	unsigned int tmp, tmp2 = 1;
 
 	asm volatile(
-	"	ldaxr	%w0, [%2]\n"
+	"	ldaxr	%w0, %2\n"
 	"	add	%w0, %w0, #1\n"
 	"	tbnz	%w0, #31, 1f\n"
-	"	stxr	%w1, %w0, [%2]\n"
+	"	stxr	%w1, %w0, %2\n"
 	"1:\n"
-	: "=&r" (tmp), "+r" (tmp2)
-	: "r" (&rw->lock)
-	: "memory");
+	: "=&r" (tmp), "+r" (tmp2), "+Q" (rw->lock)
+	:
+	: "cc", "memory");
 
 	return !tmp2;
 }