x86: this_cpu_cmpxchg and this_cpu_xchg operations

Provide support as far as the hardware capabilities of the x86 cpus
allow.

Define CONFIG_CMPXCHG_LOCAL in Kconfig.cpu to allow core code to test for
fast cpuops implementations.

V1->V2:
	- Take out the definition for this_cpu_cmpxchg_8 and move it into
	  a separate patch.

tj: - Reordered ops to better follow this_cpu_* organization.
    - Renamed macro temp variables similar to their existing
      neighbours.

Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 2ac9069..15588a0 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -310,6 +310,9 @@
 config X86_CMPXCHG
 	def_bool X86_64 || (X86_32 && !M386)
 
+config CMPXCHG_LOCAL
+	def_bool X86_64 || (X86_32 && !M386)
+
 config X86_L1_CACHE_SHIFT
 	int
 	default "7" if MPENTIUM4 || MPSC
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index dd0cd4b..b85ade5 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -263,6 +263,83 @@
 })
 
 /*
+ * Beware: xchg on x86 has an implied lock prefix. There will be the cost of
+ * full lock semantics even though they are not needed.
+ */
+#define percpu_xchg_op(var, nval)					\
+({									\
+	typeof(var) pxo_ret__;						\
+	typeof(var) pxo_new__ = (nval);					\
+	switch (sizeof(var)) {						\
+	case 1:								\
+		asm("xchgb %2, "__percpu_arg(1)				\
+			    : "=a" (pxo_ret__), "+m" (var)		\
+			    : "q" (pxo_new__)				\
+			    : "memory");				\
+		break;							\
+	case 2:								\
+		asm("xchgw %2, "__percpu_arg(1)				\
+			    : "=a" (pxo_ret__), "+m" (var)		\
+			    : "r" (pxo_new__)				\
+			    : "memory");				\
+		break;							\
+	case 4:								\
+		asm("xchgl %2, "__percpu_arg(1)				\
+			    : "=a" (pxo_ret__), "+m" (var)		\
+			    : "r" (pxo_new__)				\
+			    : "memory");				\
+		break;							\
+	case 8:								\
+		asm("xchgq %2, "__percpu_arg(1)				\
+			    : "=a" (pxo_ret__), "+m" (var)		\
+			    : "r" (pxo_new__)				\
+			    : "memory");				\
+		break;							\
+	default: __bad_percpu_size();					\
+	}								\
+	pxo_ret__;							\
+})
+
+/*
+ * cmpxchg has no such implied lock semantics as a result it is much
+ * more efficient for cpu local operations.
+ */
+#define percpu_cmpxchg_op(var, oval, nval)				\
+({									\
+	typeof(var) pco_ret__;						\
+	typeof(var) pco_old__ = (oval);					\
+	typeof(var) pco_new__ = (nval);					\
+	switch (sizeof(var)) {						\
+	case 1:								\
+		asm("cmpxchgb %2, "__percpu_arg(1)			\
+			    : "=a" (pco_ret__), "+m" (var)		\
+			    : "q" (pco_new__), "0" (pco_old__)		\
+			    : "memory");				\
+		break;							\
+	case 2:								\
+		asm("cmpxchgw %2, "__percpu_arg(1)			\
+			    : "=a" (pco_ret__), "+m" (var)		\
+			    : "r" (pco_new__), "0" (pco_old__)		\
+			    : "memory");				\
+		break;							\
+	case 4:								\
+		asm("cmpxchgl %2, "__percpu_arg(1)			\
+			    : "=a" (pco_ret__), "+m" (var)		\
+			    : "r" (pco_new__), "0" (pco_old__)		\
+			    : "memory");				\
+		break;							\
+	case 8:								\
+		asm("cmpxchgq %2, "__percpu_arg(1)			\
+			    : "=a" (pco_ret__), "+m" (var)		\
+			    : "r" (pco_new__), "0" (pco_old__)		\
+			    : "memory");				\
+		break;							\
+	default: __bad_percpu_size();					\
+	}								\
+	pco_ret__;							\
+})
+
+/*
  * percpu_read() makes gcc load the percpu variable every time it is
  * accessed while percpu_read_stable() allows the value to be cached.
  * percpu_read_stable() is more efficient and can be used if its value
@@ -300,6 +377,12 @@
 #define __this_cpu_xor_1(pcp, val)	percpu_to_op("xor", (pcp), val)
 #define __this_cpu_xor_2(pcp, val)	percpu_to_op("xor", (pcp), val)
 #define __this_cpu_xor_4(pcp, val)	percpu_to_op("xor", (pcp), val)
+/*
+ * Generic fallback operations for __this_cpu_xchg_[1-4] are okay and much
+ * faster than an xchg with forced lock semantics.
+ */
+#define __this_cpu_xchg_8(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define __this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
 
 #define this_cpu_read_1(pcp)		percpu_from_op("mov", (pcp), "m"(pcp))
 #define this_cpu_read_2(pcp)		percpu_from_op("mov", (pcp), "m"(pcp))
@@ -319,6 +402,11 @@
 #define this_cpu_xor_1(pcp, val)	percpu_to_op("xor", (pcp), val)
 #define this_cpu_xor_2(pcp, val)	percpu_to_op("xor", (pcp), val)
 #define this_cpu_xor_4(pcp, val)	percpu_to_op("xor", (pcp), val)
+#define this_cpu_xchg_1(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_2(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_4(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_8(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
 
 #define irqsafe_cpu_add_1(pcp, val)	percpu_add_op((pcp), val)
 #define irqsafe_cpu_add_2(pcp, val)	percpu_add_op((pcp), val)
@@ -332,15 +420,32 @@
 #define irqsafe_cpu_xor_1(pcp, val)	percpu_to_op("xor", (pcp), val)
 #define irqsafe_cpu_xor_2(pcp, val)	percpu_to_op("xor", (pcp), val)
 #define irqsafe_cpu_xor_4(pcp, val)	percpu_to_op("xor", (pcp), val)
+#define irqsafe_cpu_xchg_1(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_2(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_4(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_8(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
 
 #ifndef CONFIG_M386
 #define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
 #define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val)
 #define __this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val)
+#define __this_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define __this_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define __this_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+
 #define this_cpu_add_return_1(pcp, val)	percpu_add_return_op(pcp, val)
 #define this_cpu_add_return_2(pcp, val)	percpu_add_return_op(pcp, val)
 #define this_cpu_add_return_4(pcp, val)	percpu_add_return_op(pcp, val)
-#endif
+#define this_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define this_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define this_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+
+#define irqsafe_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define irqsafe_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#endif /* !CONFIG_M386 */
+
 /*
  * Per cpu atomic 64 bit operations are only available under 64 bit.
  * 32 bit must fall back to generic operations.