[SPARC64]: Refine code sequences to get the cpu id.

On uniprocessor, it's always zero, so optimize for that.

On SMP, the jmpl to the stub kills the return address stack in the cpu
branch prediction logic, so expand the code sequence inline and use a
code patching section to fix things up.  This also allows better and
explicit register selection, which will be taken advantage of in a
future changeset.

The hard_smp_processor_id() function is big, so do not inline it.

Fix up tests for Jalapeno to also test for Serrano chips.  These
tests want "jbus Ultra-IIIi" cases to match, so that is what we should
test for.

Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/include/asm-sparc64/cpudata.h b/include/asm-sparc64/cpudata.h
index f837688..da54b4f 100644
--- a/include/asm-sparc64/cpudata.h
+++ b/include/asm-sparc64/cpudata.h
@@ -60,9 +60,18 @@
 } __attribute__((aligned(64)));
 extern struct trap_per_cpu trap_block[NR_CPUS];
 extern void init_cur_cpu_trap(void);
-extern void per_cpu_patch(void);
 extern void setup_tba(void);
 
+#ifdef CONFIG_SMP
+struct cpuid_patch_entry {	/* one record per __GET_CPUID expansion, laid out as emitted into .cpuid_patch */
+	unsigned int	addr;			/* location of the inline (Spitfire default) sequence, from ".word 661b" */
+	unsigned int	cheetah_safari[4];	/* four-instruction alternative: Cheetah Safari */
+	unsigned int	cheetah_jbus[4];	/* four-instruction alternative: Cheetah JBUS */
+	unsigned int	starfire[4];		/* four-instruction alternative: Starfire */
+};
+extern struct cpuid_patch_entry __cpuid_patch, __cpuid_patch_end;
+#endif
+
 #endif /* !(__ASSEMBLY__) */
 
 #define TRAP_PER_CPU_THREAD	0x00
@@ -70,35 +79,58 @@
 
 #define TRAP_BLOCK_SZ_SHIFT	6
 
-/* Clobbers %g1, loads %g6 with local processor's cpuid */
-#define __GET_CPUID			\
-	ba,pt	%xcc, __get_cpu_id;	\
-	 rd	%pc, %g1;
+#ifdef CONFIG_SMP
+
+#define __GET_CPUID(REG)				\
+	/* Spitfire implementation (default). */	\
+661:	ldxa		[%g0] ASI_UPA_CONFIG, REG;	\
+	srlx		REG, 17, REG;			\
+	 and		REG, 0x1f, REG;			\
+	nop;						\
+	.section	.cpuid_patch, "ax";		\
+	/* Instruction location. */			\
+	.word		661b;				\
+	/* Cheetah Safari implementation. */		\
+	ldxa		[%g0] ASI_SAFARI_CONFIG, REG;	\
+	srlx		REG, 17, REG;			\
+	and		REG, 0x3ff, REG;		\
+	nop;						\
+	/* Cheetah JBUS implementation. */		\
+	ldxa		[%g0] ASI_JBUS_CONFIG, REG;	\
+	srlx		REG, 17, REG;			\
+	and		REG, 0x1f, REG;			\
+	nop;						\
+	/* Starfire implementation. */			\
+	sethi		%hi(0x1fff40000d0 >> 9), REG;	\
+	sllx		REG, 9, REG;			\
+	or		REG, 0xd0, REG;			\
+	lduwa		[REG] ASI_PHYS_BYPASS_EC_E, REG;\
+	.previous;
 
 /* Clobbers %g1, current address space PGD phys address into %g7.  */
 #define TRAP_LOAD_PGD_PHYS			\
-	__GET_CPUID				\
-	sllx	%g6, TRAP_BLOCK_SZ_SHIFT, %g6;	\
+	__GET_CPUID(%g1)			\
 	sethi	%hi(trap_block), %g7;		\
+	sllx	%g1, TRAP_BLOCK_SZ_SHIFT, %g1;	\
 	or	%g7, %lo(trap_block), %g7;	\
-	add	%g7, %g6, %g7;			\
+	add	%g7, %g1, %g7;			\
 	ldx	[%g7 + TRAP_PER_CPU_PGD_PADDR], %g7;
 
 /* Clobbers %g1, loads local processor's IRQ work area into %g6.  */
 #define TRAP_LOAD_IRQ_WORK			\
-	__GET_CPUID				\
-	sethi	%hi(__irq_work), %g1;		\
-	sllx	%g6, 6, %g6;			\
-	or	%g1, %lo(__irq_work), %g1;	\
-	add	%g1, %g6, %g6;
+	__GET_CPUID(%g1)			\
+	sethi	%hi(__irq_work), %g6;		\
+	sllx	%g1, 6, %g1;			\
+	or	%g6, %lo(__irq_work), %g6;	\
+	add	%g6, %g1, %g6;
 
 /* Clobbers %g1, loads %g6 with current thread info pointer.  */
 #define TRAP_LOAD_THREAD_REG			\
-	__GET_CPUID				\
-	sllx	%g6, TRAP_BLOCK_SZ_SHIFT, %g6;	\
-	sethi	%hi(trap_block), %g1;		\
-	or	%g1, %lo(trap_block), %g1;	\
-	ldx	[%g1 + %g6], %g6;
+	__GET_CPUID(%g1)			\
+	sethi	%hi(trap_block), %g6;		\
+	sllx	%g1, TRAP_BLOCK_SZ_SHIFT, %g1;	\
+	or	%g6, %lo(trap_block), %g6;	\
+	ldx	[%g6 + %g1], %g6;
 
 /* Given the current thread info pointer in %g6, load the per-cpu
  * area base of the current processor into %g5.  REG1, REG2, and REG3 are
@@ -109,7 +141,6 @@
  * trap will load the fully resolved %g5 per-cpu base.  This can corrupt
  * the calculations done by the macro mid-stream.
  */
-#ifdef CONFIG_SMP
 #define LOAD_PER_CPU_BASE(REG1, REG2, REG3)		\
 	ldub	[%g6 + TI_CPU], REG1;			\
 	sethi	%hi(__per_cpu_shift), REG3;		\
@@ -118,8 +149,26 @@
 	ldx	[REG2 + %lo(__per_cpu_base)], REG2;	\
 	sllx	REG1, REG3, REG3;			\
 	add	REG3, REG2, %g5;
+
 #else
+
+/* Uniprocessor versions, we know the cpuid is zero.  */
+#define TRAP_LOAD_PGD_PHYS			\
+	sethi	%hi(trap_block), %g7;		\
+	or	%g7, %lo(trap_block), %g7;	\
+	ldx	[%g7 + TRAP_PER_CPU_PGD_PADDR], %g7;
+
+#define TRAP_LOAD_IRQ_WORK			\
+	sethi	%hi(__irq_work), %g6;		\
+	or	%g6, %lo(__irq_work), %g6;
+
+#define TRAP_LOAD_THREAD_REG			\
+	sethi	%hi(trap_block), %g6;		\
+	ldx	[%g6 + %lo(trap_block)], %g6;
+
+/* No per-cpu areas on uniprocessor, so no need to load %g5.  */
 #define LOAD_PER_CPU_BASE(REG1, REG2, REG3)
-#endif
+
+#endif /* !(CONFIG_SMP) */
 
 #endif /* _SPARC64_CPUDATA_H */
diff --git a/include/asm-sparc64/head.h b/include/asm-sparc64/head.h
index 0abd3a6..731c842 100644
--- a/include/asm-sparc64/head.h
+++ b/include/asm-sparc64/head.h
@@ -10,6 +10,7 @@
 
 #define __CHEETAH_ID	0x003e0014
 #define __JALAPENO_ID	0x003e0016
+#define __SERRANO_ID	0x003e0022
 
 #define CHEETAH_MANUF		0x003e
 #define CHEETAH_IMPL		0x0014 /* Ultra-III   */
diff --git a/include/asm-sparc64/smp.h b/include/asm-sparc64/smp.h
index 473edb2..ad1d35a 100644
--- a/include/asm-sparc64/smp.h
+++ b/include/asm-sparc64/smp.h
@@ -37,33 +37,7 @@
  *	General functions that each host system must provide.
  */
 
-static __inline__ int hard_smp_processor_id(void)
-{
-	if (tlb_type == cheetah || tlb_type == cheetah_plus) {
-		unsigned long cfg, ver;
-		__asm__ __volatile__("rdpr %%ver, %0" : "=r" (ver));
-		if ((ver >> 32) == 0x003e0016) {
-			__asm__ __volatile__("ldxa [%%g0] %1, %0"
-					     : "=r" (cfg)
-					     : "i" (ASI_JBUS_CONFIG));
-			return ((cfg >> 17) & 0x1f);
-		} else {
-			__asm__ __volatile__("ldxa [%%g0] %1, %0"
-					     : "=r" (cfg)
-					     : "i" (ASI_SAFARI_CONFIG));
-			return ((cfg >> 17) & 0x3ff);
-		}
-	} else if (this_is_starfire != 0) {
-		return starfire_hard_smp_processor_id();
-	} else {
-		unsigned long upaconfig;
-		__asm__ __volatile__("ldxa	[%%g0] %1, %0"
-				     : "=r" (upaconfig)
-				     : "i" (ASI_UPA_CONFIG));
-		return ((upaconfig >> 17) & 0x1f);
-	}
-}
-
+extern int hard_smp_processor_id(void);
 #define raw_smp_processor_id() (current_thread_info()->cpu)
 
 extern void smp_setup_cpu_possible_map(void);