powerpc/4xx: Extended DCR support v2

This adds supports to the "extended" DCR addressing via the indirect
mfdcrx/mtdcrx instructions supported by some 4xx cores (440H6 and
later).

I enabled the feature for now only on AMCC 460 chips.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Josh Boyer <jwboyer@linux.vnet.ibm.com>
Acked-by: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index f3d9d74..3188832 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -164,6 +164,7 @@
 #define CPU_FTR_NEED_PAIRED_STWCX	ASM_CONST(0x0000000004000000)
 #define CPU_FTR_LWSYNC			ASM_CONST(0x0000000008000000)
 #define CPU_FTR_NOEXECUTE		ASM_CONST(0x0000000010000000)
+#define CPU_FTR_INDEXED_DCR		ASM_CONST(0x0000000020000000)
 
 /*
  * Add the 64-bit processor unique features in the top half of the word;
@@ -369,6 +370,8 @@
 #define CPU_FTRS_8XX	(CPU_FTR_USE_TB)
 #define CPU_FTRS_40X	(CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | CPU_FTR_NOEXECUTE)
 #define CPU_FTRS_44X	(CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | CPU_FTR_NOEXECUTE)
+#define CPU_FTRS_440x6	(CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | CPU_FTR_NOEXECUTE | \
+	    CPU_FTR_INDEXED_DCR)
 #define CPU_FTRS_E200	(CPU_FTR_USE_TB | CPU_FTR_SPE_COMP | \
 	    CPU_FTR_NODSISRALIGN | CPU_FTR_COHERENT_ICACHE | \
 	    CPU_FTR_UNIFIED_ID_CACHE | CPU_FTR_NOEXECUTE)
@@ -455,7 +458,7 @@
 	    CPU_FTRS_40X |
 #endif
 #ifdef CONFIG_44x
-	    CPU_FTRS_44X |
+	    CPU_FTRS_44X | CPU_FTRS_440x6 |
 #endif
 #ifdef CONFIG_E200
 	    CPU_FTRS_E200 |
@@ -495,7 +498,7 @@
 	    CPU_FTRS_40X &
 #endif
 #ifdef CONFIG_44x
-	    CPU_FTRS_44X &
+	    CPU_FTRS_44X & CPU_FTRS_440x6 &
 #endif
 #ifdef CONFIG_E200
 	    CPU_FTRS_E200 &
diff --git a/arch/powerpc/include/asm/dcr-native.h b/arch/powerpc/include/asm/dcr-native.h
index 72d2b72..7d2e623 100644
--- a/arch/powerpc/include/asm/dcr-native.h
+++ b/arch/powerpc/include/asm/dcr-native.h
@@ -23,6 +23,7 @@
 #ifndef __ASSEMBLY__
 
 #include <linux/spinlock.h>
+#include <asm/cputable.h>
 
 typedef struct {
 	unsigned int base;
@@ -39,23 +40,45 @@
 #define dcr_read_native(host, dcr_n)		mfdcr(dcr_n + host.base)
 #define dcr_write_native(host, dcr_n, value)	mtdcr(dcr_n + host.base, value)
 
-/* Device Control Registers */
-void __mtdcr(int reg, unsigned int val);
-unsigned int __mfdcr(int reg);
+/* Table based DCR accessors */
+extern void __mtdcr(unsigned int reg, unsigned int val);
+extern unsigned int __mfdcr(unsigned int reg);
+
+/* mfdcrx/mtdcrx instruction based accessors. We hand code
+ * the opcodes in order not to depend on newer binutils
+ */
+static inline unsigned int mfdcrx(unsigned int reg)
+{
+	unsigned int ret;
+	asm volatile(".long 0x7c000206 | (%0 << 21) | (%1 << 16)"
+		     : "=r" (ret) : "r" (reg));
+	return ret;
+}
+
+static inline void mtdcrx(unsigned int reg, unsigned int val)
+{
+	asm volatile(".long 0x7c000306 | (%0 << 21) | (%1 << 16)"
+		     : : "r" (val), "r" (reg));
+}
+
 #define mfdcr(rn)						\
 	({unsigned int rval;					\
-	if (__builtin_constant_p(rn))				\
+	if (__builtin_constant_p(rn) && rn < 1024)		\
 		asm volatile("mfdcr %0," __stringify(rn)	\
 		              : "=r" (rval));			\
+	else if (likely(cpu_has_feature(CPU_FTR_INDEXED_DCR)))	\
+		rval = mfdcrx(rn);				\
 	else							\
 		rval = __mfdcr(rn);				\
 	rval;})
 
 #define mtdcr(rn, v)						\
 do {								\
-	if (__builtin_constant_p(rn))				\
+	if (__builtin_constant_p(rn) && rn < 1024)		\
 		asm volatile("mtdcr " __stringify(rn) ",%0"	\
 			      : : "r" (v)); 			\
+	else if (likely(cpu_has_feature(CPU_FTR_INDEXED_DCR)))	\
+		mtdcrx(rn, v);					\
 	else							\
 		__mtdcr(rn, v);					\
 } while (0)
@@ -69,8 +92,13 @@
 	unsigned int val;
 
 	spin_lock_irqsave(&dcr_ind_lock, flags);
-	__mtdcr(base_addr, reg);
-	val = __mfdcr(base_data);
+	if (cpu_has_feature(CPU_FTR_INDEXED_DCR)) {
+		mtdcrx(base_addr, reg);
+		val = mfdcrx(base_data);
+	} else {
+		__mtdcr(base_addr, reg);
+		val = __mfdcr(base_data);
+	}
 	spin_unlock_irqrestore(&dcr_ind_lock, flags);
 	return val;
 }
@@ -81,8 +109,13 @@
 	unsigned long flags;
 
 	spin_lock_irqsave(&dcr_ind_lock, flags);
-	__mtdcr(base_addr, reg);
-	__mtdcr(base_data, val);
+	if (cpu_has_feature(CPU_FTR_INDEXED_DCR)) {
+		mtdcrx(base_addr, reg);
+		mtdcrx(base_data, val);
+	} else {
+		__mtdcr(base_addr, reg);
+		__mtdcr(base_data, val);
+	}
 	spin_unlock_irqrestore(&dcr_ind_lock, flags);
 }
 
@@ -93,9 +126,15 @@
 	unsigned int val;
 
 	spin_lock_irqsave(&dcr_ind_lock, flags);
-	__mtdcr(base_addr, reg);
-	val = (__mfdcr(base_data) & ~clr) | set;
-	__mtdcr(base_data, val);
+	if (cpu_has_feature(CPU_FTR_INDEXED_DCR)) {
+		mtdcrx(base_addr, reg);
+		val = (mfdcrx(base_data) & ~clr) | set;
+		mtdcrx(base_data, val);
+	} else {
+		__mtdcr(base_addr, reg);
+		val = (__mfdcr(base_data) & ~clr) | set;
+		__mtdcr(base_data, val);
+	}
 	spin_unlock_irqrestore(&dcr_ind_lock, flags);
 }
 
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index 7e87195..921a229 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -1509,7 +1509,7 @@
 		.pvr_mask		= 0xffff0002,
 		.pvr_value		= 0x13020002,
 		.cpu_name		= "460EX",
-		.cpu_features		= CPU_FTRS_44X,
+		.cpu_features		= CPU_FTRS_440x6,
 		.cpu_user_features	= COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
@@ -1521,7 +1521,7 @@
 		.pvr_mask		= 0xffff0002,
 		.pvr_value		= 0x13020000,
 		.cpu_name		= "460GT",
-		.cpu_features		= CPU_FTRS_44X,
+		.cpu_features		= CPU_FTRS_440x6,
 		.cpu_user_features	= COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
diff --git a/arch/powerpc/sysdev/dcr-low.S b/arch/powerpc/sysdev/dcr-low.S
index 2078f39..d3098ef 100644
--- a/arch/powerpc/sysdev/dcr-low.S
+++ b/arch/powerpc/sysdev/dcr-low.S
@@ -11,14 +11,20 @@
 
 #include <asm/ppc_asm.h>
 #include <asm/processor.h>
+#include <asm/bug.h>
 
 #define DCR_ACCESS_PROLOG(table) \
+	cmpli	cr0,r3,1024;	 \
 	rlwinm  r3,r3,4,18,27;   \
 	lis     r5,table@h;      \
 	ori     r5,r5,table@l;   \
 	add     r3,r3,r5;        \
+	bge-	1f;		 \
 	mtctr   r3;              \
-	bctr
+	bctr;			 \
+1:	trap;			 \
+	EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0;	\
+	blr
 
 _GLOBAL(__mfdcr)
 	DCR_ACCESS_PROLOG(__mfdcr_table)