s390/bitops: make use of interlocked-access facility 1 instructions

Make use of the interlocked-access facility 1 that got added with the
z196 architecture.
This facility added new instructions which can atomically update a
storage location without a compare-and-swap loop. E.g. setting a bit
within a "long" can be done with a single instruction.

The size of the kernel image gets ~30kb smaller. Considering that there
are approximately 1900 bitops call sites, this means that each one saves
about 15-16 bytes per call site, which is as expected.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h
index 10135a3..bb26481 100644
--- a/arch/s390/include/asm/bitops.h
+++ b/arch/s390/include/asm/bitops.h
@@ -65,7 +65,10 @@
 #define __BITOPS_AND		"nr"
 #define __BITOPS_XOR		"xr"
 
-#define __BITOPS_LOOP(__old, __new, __addr, __val, __op_string)	\
+#define __BITOPS_LOOP(__addr, __val, __op_string)		\
+({								\
+	unsigned long __old, __new;				\
+								\
 	asm volatile(						\
 		"	l	%0,%2\n"			\
 		"0:	lr	%1,%0\n"			\
@@ -75,15 +78,40 @@
 		: "=&d" (__old), "=&d" (__new),			\
 		  "=Q" (*(unsigned long *) __addr)		\
 		: "d" (__val), "Q" (*(unsigned long *) __addr)	\
-		: "cc");
+		: "cc");					\
+	__old;							\
+})
 
 #else /* CONFIG_64BIT */
 
+#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
+
+#define __BITOPS_OR		"laog"
+#define __BITOPS_AND		"lang"
+#define __BITOPS_XOR		"laxg"
+
+#define __BITOPS_LOOP(__addr, __val, __op_string)		\
+({								\
+	unsigned long __old;					\
+								\
+	asm volatile(						\
+		__op_string "	%0,%2,%1\n"			\
+		: "=d" (__old),	"+Q" (*(unsigned long *)__addr)	\
+		: "d" (__val)					\
+		: "cc");					\
+	__old;							\
+})
+
+#else /* CONFIG_HAVE_MARCH_Z196_FEATURES */
+
 #define __BITOPS_OR		"ogr"
 #define __BITOPS_AND		"ngr"
 #define __BITOPS_XOR		"xgr"
 
-#define __BITOPS_LOOP(__old, __new, __addr, __val, __op_string)	\
+#define __BITOPS_LOOP(__addr, __val, __op_string)		\
+({								\
+	unsigned long __old, __new;				\
+								\
 	asm volatile(						\
 		"	lg	%0,%2\n"			\
 		"0:	lgr	%1,%0\n"			\
@@ -93,7 +121,11 @@
 		: "=&d" (__old), "=&d" (__new),			\
 		  "=Q" (*(unsigned long *) __addr)		\
 		: "d" (__val), "Q" (*(unsigned long *) __addr)	\
-		: "cc");
+		: "cc");					\
+	__old;							\
+})
+
+#endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */
 
 #endif /* CONFIG_64BIT */
 
@@ -105,7 +137,7 @@
  */
 static inline void set_bit_cs(unsigned long nr, volatile unsigned long *ptr)
 {
-        unsigned long addr, old, new, mask;
+	unsigned long addr, mask;
 
 	addr = (unsigned long) ptr;
 	/* calculate address for CS */
@@ -113,7 +145,7 @@
 	/* make OR mask */
 	mask = 1UL << (nr & (BITS_PER_LONG - 1));
 	/* Do the atomic update. */
-	__BITOPS_LOOP(old, new, addr, mask, __BITOPS_OR);
+	__BITOPS_LOOP(addr, mask, __BITOPS_OR);
 }
 
 /*
@@ -121,7 +153,7 @@
  */
 static inline void clear_bit_cs(unsigned long nr, volatile unsigned long *ptr)
 {
-        unsigned long addr, old, new, mask;
+	unsigned long addr, mask;
 
 	addr = (unsigned long) ptr;
 	/* calculate address for CS */
@@ -129,7 +161,7 @@
 	/* make AND mask */
 	mask = ~(1UL << (nr & (BITS_PER_LONG - 1)));
 	/* Do the atomic update. */
-	__BITOPS_LOOP(old, new, addr, mask, __BITOPS_AND);
+	__BITOPS_LOOP(addr, mask, __BITOPS_AND);
 }
 
 /*
@@ -137,7 +169,7 @@
  */
 static inline void change_bit_cs(unsigned long nr, volatile unsigned long *ptr)
 {
-        unsigned long addr, old, new, mask;
+	unsigned long addr, mask;
 
 	addr = (unsigned long) ptr;
 	/* calculate address for CS */
@@ -145,7 +177,7 @@
 	/* make XOR mask */
 	mask = 1UL << (nr & (BITS_PER_LONG - 1));
 	/* Do the atomic update. */
-	__BITOPS_LOOP(old, new, addr, mask, __BITOPS_XOR);
+	__BITOPS_LOOP(addr, mask, __BITOPS_XOR);
 }
 
 /*
@@ -154,7 +186,7 @@
 static inline int
 test_and_set_bit_cs(unsigned long nr, volatile unsigned long *ptr)
 {
-        unsigned long addr, old, new, mask;
+	unsigned long addr, old, mask;
 
 	addr = (unsigned long) ptr;
 	/* calculate address for CS */
@@ -162,7 +194,7 @@
 	/* make OR/test mask */
 	mask = 1UL << (nr & (BITS_PER_LONG - 1));
 	/* Do the atomic update. */
-	__BITOPS_LOOP(old, new, addr, mask, __BITOPS_OR);
+	old = __BITOPS_LOOP(addr, mask, __BITOPS_OR);
 	barrier();
 	return (old & mask) != 0;
 }
@@ -173,7 +205,7 @@
 static inline int
 test_and_clear_bit_cs(unsigned long nr, volatile unsigned long *ptr)
 {
-        unsigned long addr, old, new, mask;
+	unsigned long addr, old, mask;
 
 	addr = (unsigned long) ptr;
 	/* calculate address for CS */
@@ -181,9 +213,9 @@
 	/* make AND/test mask */
 	mask = ~(1UL << (nr & (BITS_PER_LONG - 1)));
 	/* Do the atomic update. */
-	__BITOPS_LOOP(old, new, addr, mask, __BITOPS_AND);
+	old = __BITOPS_LOOP(addr, mask, __BITOPS_AND);
 	barrier();
-	return (old ^ new) != 0;
+	return (old & ~mask) != 0;
 }
 
 /*
@@ -192,7 +224,7 @@
 static inline int
 test_and_change_bit_cs(unsigned long nr, volatile unsigned long *ptr)
 {
-        unsigned long addr, old, new, mask;
+	unsigned long addr, old, mask;
 
 	addr = (unsigned long) ptr;
 	/* calculate address for CS */
@@ -200,7 +232,7 @@
 	/* make XOR/test mask */
 	mask = 1UL << (nr & (BITS_PER_LONG - 1));
 	/* Do the atomic update. */
-	__BITOPS_LOOP(old, new, addr, mask, __BITOPS_XOR);
+	old = __BITOPS_LOOP(addr, mask, __BITOPS_XOR);
 	barrier();
 	return (old & mask) != 0;
 }