[SPARC64]: Initial sun4v TLB miss handling infrastructure.

Things are a little tricky because, unlike sun4u, we have
to:

1) do a hypervisor trap to do the TLB load.
2) do the TSB lookup calculations by hand

Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/arch/sparc64/kernel/head.S b/arch/sparc64/kernel/head.S
index 7840271..03fc0b5 100644
--- a/arch/sparc64/kernel/head.S
+++ b/arch/sparc64/kernel/head.S
@@ -474,6 +474,7 @@
 sparc64_boot_end:
 
 #include "systbls.S"
+#include "sun4v_tlb_miss.S"
 #include "ktlb.S"
 #include "tsb.S"
 #include "etrap.S"
diff --git a/arch/sparc64/kernel/ktlb.S b/arch/sparc64/kernel/ktlb.S
index c133543..2e55084 100644
--- a/arch/sparc64/kernel/ktlb.S
+++ b/arch/sparc64/kernel/ktlb.S
@@ -16,12 +16,16 @@
 	.text
 	.align		32
 
-	.globl		kvmap_itlb
 kvmap_itlb:
 	/* g6: TAG TARGET */
 	mov		TLB_TAG_ACCESS, %g4
 	ldxa		[%g4] ASI_IMMU, %g4
 
+	/* sun4v_itlb_miss branches here with the missing virtual
+	 * address already loaded into %g4
+	 */
+kvmap_itlb_4v:
+
 kvmap_itlb_nonlinear:
 	/* Catch kernel NULL pointer calls.  */
 	sethi		%hi(PAGE_SIZE), %g5
@@ -94,11 +98,15 @@
 	 nop
 
 	.align		32
-	.globl		kvmap_dtlb
 kvmap_dtlb:
 	/* %g6: TAG TARGET */
 	mov		TLB_TAG_ACCESS, %g4
 	ldxa		[%g4] ASI_DMMU, %g4
+
+	/* sun4v_dtlb_miss branches here with the missing virtual
+	 * address already loaded into %g4
+	 */
+kvmap_dtlb_4v:
 	brgez,pn	%g4, kvmap_dtlb_nonlinear
 	 nop
 
diff --git a/arch/sparc64/kernel/sun4v_tlb_miss.S b/arch/sparc64/kernel/sun4v_tlb_miss.S
new file mode 100644
index 0000000..58ea5dd
--- /dev/null
+++ b/arch/sparc64/kernel/sun4v_tlb_miss.S
@@ -0,0 +1,219 @@
+/* sun4v_tlb_miss.S: Sun4v TLB miss handlers.
+ *
+ * Copyright (C) 2006 <davem@davemloft.net>
+ */
+
+	.text
+	.align	32
+
+sun4v_itlb_miss:	/* sun4v I-TLB miss handler; tl0_iamiss and tl1_iamiss are patched to branch here */
+	/* Load CPU ID into %g3.  */
+	mov	SCRATCHPAD_CPUID, %g1
+	ldxa	[%g1] ASI_SCRATCHPAD, %g3
+	
+	/* Load UTSB reg into %g1.  */
+	ldxa	[%g1 + %g1] ASI_SCRATCHPAD, %g1	! offset is 2 * SCRATCHPAD_CPUID; presumably == SCRATCHPAD_UTSBREG1, TODO confirm
+
+	/* Load &trap_block[smp_processor_id()] into %g2.  */
+	sethi	%hi(trap_block), %g2
+	or	%g2, %lo(trap_block), %g2
+	sllx	%g3, TRAP_BLOCK_SZ_SHIFT, %g3	! byte offset of this cpu entry in trap_block[]
+	add	%g2, %g3, %g2
+
+	/* Create a TAG TARGET, "(vaddr>>22) | (ctx << 48)", in %g6.
+	 * Branch if kernel TLB miss.  The kernel TSB and user TSB miss
+	 * code wants the missing virtual address in %g4, so that value
+	 * cannot be modified through the entirety of this handler.
+	 */
+	ldx	[%g2 + TRAP_PER_CPU_FAULT_INFO + HV_FAULT_I_ADDR_OFFSET], %g4	! %g4 = missing vaddr
+	ldx	[%g2 + TRAP_PER_CPU_FAULT_INFO + HV_FAULT_I_CTX_OFFSET], %g5	! %g5 = context of the miss
+	srlx	%g4, 22, %g3
+	sllx	%g5, 48, %g6
+	or	%g6, %g3, %g6
+	brz,pn	%g5, kvmap_itlb_4v	! context zero is treated as a kernel miss
+	 nop
+
+	/* Create TSB pointer.  This is something like:
+	 *
+	 * index_mask = (512 << (tsb_reg & 0x7UL)) - 1UL;
+	 * tsb_base = tsb_reg & ~0x7UL;
+	 */
+	and	%g1, 0x7, %g3	! %g3 = low three bits of tsb_reg (shift count)
+	andn	%g1, 0x7, %g1	! %g1 = tsb_base
+	mov	512, %g7
+	sllx	%g7, %g3, %g7
+	sub	%g7, 1, %g7	! %g7 = index_mask
+
+	/* TSB index mask is in %g7, tsb base is in %g1.  Compute
+	 * the TSB entry pointer into %g1:
+	 *
+	 * tsb_index = ((vaddr >> PAGE_SHIFT) & tsb_mask);
+	 * tsb_ptr = tsb_base + (tsb_index * 16);
+	 */
+	srlx	%g4, PAGE_SHIFT, %g3
+	and	%g3, %g7, %g3
+	sllx	%g3, 4, %g3	! tsb_index * 16
+	add	%g1, %g3, %g1
+
+	/* Load TSB tag/pte into %g2/%g3 and compare the tag.  */
+	ldda	[%g1] ASI_QUAD_LDD_PHYS, %g2
+	cmp	%g2, %g6
+	sethi	%hi(_PAGE_EXEC), %g7	! mask for the exec test below; sethi leaves the condition codes alone
+	bne,a,pn %xcc, tsb_miss_page_table_walk
+	 mov	FAULT_CODE_ITLB, %g3	! annulled slot: runs only if the branch is taken, so the PTE in %g3 survives a tag match
+	andcc	%g3, %g7, %g0
+	be,a,pn	%xcc, tsb_do_fault	! PTE has none of the %hi(_PAGE_EXEC) bits set
+	 mov	FAULT_CODE_ITLB, %g3
+
+	/* We have a valid entry, make hypervisor call to load
+	 * I-TLB and return from trap.
+	 *
+	 * %g3:	PTE
+	 * %g4:	vaddr
+	 * %g6:	TAG TARGET (only "CTX << 48" part matters)
+	 */
+sun4v_itlb_load:
+	mov	%o0, %g1		! save %o0
+	mov	%o1, %g2		! save %o1
+	mov	%o2, %g5		! save %o2
+	mov	%o3, %g7		! save %o3
+	mov	%g4, %o0		! vaddr
+	srlx	%g6, 48, %o1		! ctx
+	mov	%g3, %o2		! PTE
+	mov	HV_MMU_IMMU, %o3	! flags
+	ta	HV_MMU_MAP_ADDR_TRAP	! hypervisor trap that does the TLB load; its result is not examined here
+	mov	%g1, %o0		! restore %o0
+	mov	%g2, %o1		! restore %o1
+	mov	%g5, %o2		! restore %o2
+	mov	%g7, %o3		! restore %o3
+
+	retry
+
+sun4v_dtlb_miss:	/* sun4v D-TLB miss handler; tl0_damiss and tl1_damiss are patched to branch here */
+	/* Load CPU ID into %g3.  */
+	mov	SCRATCHPAD_CPUID, %g1
+	ldxa	[%g1] ASI_SCRATCHPAD, %g3
+	
+	/* Load UTSB reg into %g1.  */
+	ldxa	[%g1 + %g1] ASI_SCRATCHPAD, %g1	! offset is 2 * SCRATCHPAD_CPUID; presumably == SCRATCHPAD_UTSBREG1, TODO confirm
+
+	/* Load &trap_block[smp_processor_id()] into %g2.  */
+	sethi	%hi(trap_block), %g2
+	or	%g2, %lo(trap_block), %g2
+	sllx	%g3, TRAP_BLOCK_SZ_SHIFT, %g3	! byte offset of this cpu entry in trap_block[]
+	add	%g2, %g3, %g2
+
+	/* Create a TAG TARGET, "(vaddr>>22) | (ctx << 48)", in %g6.
+	 * Branch if kernel TLB miss.  The kernel TSB and user TSB miss
+	 * code wants the missing virtual address in %g4, so that value
+	 * cannot be modified through the entirety of this handler.
+	 */
+	ldx	[%g2 + TRAP_PER_CPU_FAULT_INFO + HV_FAULT_D_ADDR_OFFSET], %g4	! %g4 = missing vaddr
+	ldx	[%g2 + TRAP_PER_CPU_FAULT_INFO + HV_FAULT_D_CTX_OFFSET], %g5	! %g5 = context of the miss
+	srlx	%g4, 22, %g3
+	sllx	%g5, 48, %g6
+	or	%g6, %g3, %g6
+	brz,pn	%g5, kvmap_dtlb_4v	! context zero is treated as a kernel miss
+	 nop
+
+	/* Create TSB pointer.  This is something like:
+	 *
+	 * index_mask = (512 << (tsb_reg & 0x7UL)) - 1UL;
+	 * tsb_base = tsb_reg & ~0x7UL;
+	 */
+	and	%g1, 0x7, %g3	! %g3 = low three bits of tsb_reg (shift count)
+	andn	%g1, 0x7, %g1	! %g1 = tsb_base
+	mov	512, %g7
+	sllx	%g7, %g3, %g7
+	sub	%g7, 1, %g7	! %g7 = index_mask
+
+	/* TSB index mask is in %g7, tsb base is in %g1.  Compute
+	 * the TSB entry pointer into %g1:
+	 *
+	 * tsb_index = ((vaddr >> PAGE_SHIFT) & tsb_mask);
+	 * tsb_ptr = tsb_base + (tsb_index * 16);
+	 */
+	srlx	%g4, PAGE_SHIFT, %g3
+	and	%g3, %g7, %g3
+	sllx	%g3, 4, %g3	! tsb_index * 16
+	add	%g1, %g3, %g1
+
+	/* Load TSB tag/pte into %g2/%g3 and compare the tag.  */
+	ldda	[%g1] ASI_QUAD_LDD_PHYS, %g2
+	cmp	%g2, %g6
+	bne,a,pn %xcc, tsb_miss_page_table_walk
+	 mov	FAULT_CODE_DTLB, %g3	! D-side fault code (not ITLB); annulled slot, so the PTE in %g3 survives a tag match
+
+	/* We have a valid entry, make hypervisor call to load
+	 * D-TLB and return from trap.
+	 *
+	 * %g3:	PTE
+	 * %g4:	vaddr
+	 * %g6:	TAG TARGET (only "CTX << 48" part matters)
+	 */
+sun4v_dtlb_load:
+	mov	%o0, %g1		! save %o0
+	mov	%o1, %g2		! save %o1
+	mov	%o2, %g5		! save %o2
+	mov	%o3, %g7		! save %o3
+	mov	%g4, %o0		! vaddr
+	srlx	%g6, 48, %o1		! ctx
+	mov	%g3, %o2		! PTE
+	mov	HV_MMU_DMMU, %o3	! flags
+	ta	HV_MMU_MAP_ADDR_TRAP	! hypervisor trap that does the TLB load; its result is not examined here
+	mov	%g1, %o0		! restore %o0
+	mov	%g2, %o1		! restore %o1
+	mov	%g5, %o2		! restore %o2
+	mov	%g7, %o3		! restore %o3
+
+	retry
+
+sun4v_dtlb_prot:	/* sun4v D-TLB protection fault; tl0_daprot and tl1_daprot are patched to branch here */
+	/* Load CPU ID into %g3.  */
+	mov	SCRATCHPAD_CPUID, %g1
+	ldxa	[%g1] ASI_SCRATCHPAD, %g3
+	
+	/* Load &trap_block[smp_processor_id()] into %g2.  */
+	sethi	%hi(trap_block), %g2
+	or	%g2, %lo(trap_block), %g2
+	sllx	%g3, TRAP_BLOCK_SZ_SHIFT, %g3	! byte offset of this cpu entry in trap_block[]
+	add	%g2, %g3, %g2
+
+	ldx	[%g2 + TRAP_PER_CPU_FAULT_INFO + HV_FAULT_D_ADDR_OFFSET], %g5	! %g5 = faulting vaddr
+	rdpr	%tl, %g1
+	cmp	%g1, 1
+	bgu,pn		%xcc, winfix_trampoline	! taken when the trap level is above 1
+	 mov		FAULT_CODE_DTLB | FAULT_CODE_WRITE, %g4	! non-annulled slot: %g4 holds the fault code on both exits
+	ba,pt		%xcc, sparc64_realfault_common
+	 nop
+
+#define BRANCH_ALWAYS	0x10680000	/* encoding of "ba,pt %xcc" with a zero displacement field */
+#define NOP		0x01000000	/* encoding of "nop" */
+#define SUN4V_DO_PATCH(OLD, NEW)	/* overwrite the two insns at OLD with "ba NEW; nop"; clobbers %g1-%g3 */ \
+	sethi	%hi(NEW), %g1; /* %g1 = address of NEW */ \
+	or	%g1, %lo(NEW), %g1; \
+	sethi	%hi(OLD), %g2; /* %g2 = address of OLD */ \
+	or	%g2, %lo(OLD), %g2; \
+	sub	%g1, %g2, %g1; /* byte offset from OLD to NEW */ \
+	sethi	%hi(BRANCH_ALWAYS), %g3; \
+	srl	%g1, 2, %g1; /* offset in instruction words */ \
+	or	%g3, %lo(BRANCH_ALWAYS), %g3; \
+	or	%g3, %g1, %g3; /* no masking: assumes NEW lies after OLD and the offset fits the displacement field */ \
+	stw	%g3, [%g2]; /* first insn at OLD becomes the branch */ \
+	sethi	%hi(NOP), %g3; \
+	or	%g3, %lo(NOP), %g3; \
+	stw	%g3, [%g2 + 0x4]; /* second insn (the delay slot of the branch) becomes a nop */ \
+	flush	%g2; /* single flush at OLD for both stores */
+
+	.globl	sun4v_patch_tlb_handlers	/* called from paging_init() when tlb_type == hypervisor */
+	.type	sun4v_patch_tlb_handlers,#function
+sun4v_patch_tlb_handlers:	/* redirect six trap entries to the sun4v handlers; clobbers %g1-%g3 via SUN4V_DO_PATCH */
+	SUN4V_DO_PATCH(tl0_iamiss, sun4v_itlb_miss)	/* I-TLB miss, tl0 entry */
+	SUN4V_DO_PATCH(tl1_iamiss, sun4v_itlb_miss)	/* I-TLB miss, tl1 entry */
+	SUN4V_DO_PATCH(tl0_damiss, sun4v_dtlb_miss)	/* D-TLB miss, tl0 entry */
+	SUN4V_DO_PATCH(tl1_damiss, sun4v_dtlb_miss)	/* D-TLB miss, tl1 entry */
+	SUN4V_DO_PATCH(tl0_daprot, sun4v_dtlb_prot)	/* D-TLB protection, tl0 entry */
+	SUN4V_DO_PATCH(tl1_daprot, sun4v_dtlb_prot)	/* D-TLB protection, tl1 entry */
+	retl	/* leaf return: no register window is allocated by this routine */
+	 nop
+	.size	sun4v_patch_tlb_handlers,.-sun4v_patch_tlb_handlers
diff --git a/arch/sparc64/kernel/tsb.S b/arch/sparc64/kernel/tsb.S
index 96e6316..818bc9e 100644
--- a/arch/sparc64/kernel/tsb.S
+++ b/arch/sparc64/kernel/tsb.S
@@ -18,30 +18,33 @@
 	 * %g4:	available temporary
 	 * %g5:	available temporary
 	 * %g6: TAG TARGET
-	 * %g7:	physical address base of the linux page
+	 * %g7:	available temporary, will be loaded by us with
+	 *      the physical address base of the linux page
 	 *      tables for the current address space
 	 */
-	.globl		tsb_miss_dtlb
 tsb_miss_dtlb:
 	mov		TLB_TAG_ACCESS, %g4
 	ldxa		[%g4] ASI_DMMU, %g4
 	ba,pt		%xcc, tsb_miss_page_table_walk
 	 nop
 
-	.globl		tsb_miss_itlb
 tsb_miss_itlb:
 	mov		TLB_TAG_ACCESS, %g4
 	ldxa		[%g4] ASI_IMMU, %g4
 	ba,pt		%xcc, tsb_miss_page_table_walk
 	 nop
 
+	/* The sun4v TLB miss handlers jump directly here instead
+	 * of tsb_miss_{d,i}tlb with the missing virtual address
+	 * already loaded into %g4.
+	 */
 tsb_miss_page_table_walk:
 	TRAP_LOAD_PGD_PHYS(%g7, %g5)
 
 	USER_PGTABLE_WALK_TL1(%g4, %g7, %g5, %g2, tsb_do_fault)
 
 tsb_reload:
-	TSB_LOCK_TAG(%g1, %g2, %g4)
+	TSB_LOCK_TAG(%g1, %g2, %g7)
 
 	/* Load and check PTE.  */
 	ldxa		[%g5] ASI_PHYS_USE_EC, %g5
@@ -52,9 +55,9 @@
 	 * bother putting it into the TSB.
 	 */
 	srlx		%g5, 32, %g2
-	sethi		%hi(_PAGE_ALL_SZ_BITS >> 32), %g4
+	sethi		%hi(_PAGE_ALL_SZ_BITS >> 32), %g7
+	and		%g2, %g7, %g2
 	sethi		%hi(_PAGE_SZBITS >> 32), %g7
-	and		%g2, %g4, %g2
 	cmp		%g2, %g7
 	bne,a,pn	%xcc, tsb_tlb_reload
 	 TSB_STORE(%g1, %g0)
@@ -68,12 +71,54 @@
 	 nop
 
 tsb_dtlb_load:
-	stxa		%g5, [%g0] ASI_DTLB_DATA_IN
+
+661:	stxa		%g5, [%g0] ASI_DTLB_DATA_IN
 	retry
+	.section	.gl_2insn_patch, "ax"
+	.word		661b
+	nop
+	nop
+	.previous
+
+	/* For sun4v the ASI_DTLB_DATA_IN store and the retry
+	 * instruction get nop'd out and we get here to branch
+	 * to the sun4v tlb load code.  The registers are setup
+	 * as follows:
+	 *
+	 * %g4: vaddr
+	 * %g5: PTE
+	 * %g6:	TAG
+	 *
+	 * The sun4v TLB load wants the PTE in %g3 so we fix that
+	 * up here.
+	 */
+	ba,pt		%xcc, sun4v_dtlb_load
+	 mov		%g5, %g3
 
 tsb_itlb_load:
-	stxa		%g5, [%g0] ASI_ITLB_DATA_IN
+
+661:	stxa		%g5, [%g0] ASI_ITLB_DATA_IN
 	retry
+	.section	.gl_2insn_patch, "ax"
+	.word		661b
+	nop
+	nop
+	.previous
+
+	/* For sun4v the ASI_ITLB_DATA_IN store and the retry
+	 * instruction get nop'd out and we get here to branch
+	 * to the sun4v tlb load code.  The registers are setup
+	 * as follows:
+	 *
+	 * %g4: vaddr
+	 * %g5: PTE
+	 * %g6:	TAG
+	 *
+	 * The sun4v TLB load wants the PTE in %g3 so we fix that
+	 * up here.
+	 */
+	ba,pt		%xcc, sun4v_itlb_load
+	 mov		%g5, %g3
 
 	/* No valid entry in the page tables, do full fault
 	 * processing.
@@ -95,10 +140,17 @@
 	 nop
 
 tsb_do_dtlb_fault:
-	rdpr	%tl, %g4
-	cmp	%g4, 1
-	mov	TLB_TAG_ACCESS, %g4
+	rdpr	%tl, %g3
+	cmp	%g3, 1
+
+661:	mov	TLB_TAG_ACCESS, %g4
 	ldxa	[%g4] ASI_DMMU, %g5
+	.section .gl_2insn_patch, "ax"
+	.word	661b
+	mov	%g4, %g5
+	nop
+	.previous
+
 	be,pt	%xcc, sparc64_realfault_common
 	 mov	FAULT_CODE_DTLB, %g4
 	ba,pt	%xcc, winfix_trampoline
@@ -196,12 +248,23 @@
 	add	%g2, %g1, %g2
 	stx	%o0, [%g2 + TRAP_PER_CPU_PGD_PADDR]
 
-	mov	TSB_REG, %g1
+661:	mov	TSB_REG, %g1
 	stxa	%o1, [%g1] ASI_DMMU
+	.section .gl_2insn_patch, "ax"
+	.word	661b
+	mov	SCRATCHPAD_UTSBREG1, %g1
+	stxa	%o1, [%g1] ASI_SCRATCHPAD
+	.previous
+
 	membar	#Sync
 
-	stxa	%o1, [%g1] ASI_IMMU
+661:	stxa	%o1, [%g1] ASI_IMMU
 	membar	#Sync
+	.section .gl_2insn_patch, "ax"
+	.word	661b
+	nop
+	nop
+	.previous
 
 	brz	%o2, 9f
 	 nop
diff --git a/arch/sparc64/kernel/vmlinux.lds.S b/arch/sparc64/kernel/vmlinux.lds.S
index 686bf6b..a09a8a2 100644
--- a/arch/sparc64/kernel/vmlinux.lds.S
+++ b/arch/sparc64/kernel/vmlinux.lds.S
@@ -71,6 +71,9 @@
   __con_initcall_end = .;
   SECURITY_INIT
   . = ALIGN(4);
+  __tsb_ldquad_phys_patch = .;
+  .tsb_ldquad_phys_patch : { *(.tsb_ldquad_phys_patch) }
+  __tsb_ldquad_phys_patch_end = .;
   __tsb_phys_patch = .;
   .tsb_phys_patch : { *(.tsb_phys_patch) }
   __tsb_phys_patch_end = .;
diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
index ab50cd9..e9aac42 100644
--- a/arch/sparc64/mm/init.c
+++ b/arch/sparc64/mm/init.c
@@ -1050,8 +1050,25 @@
 
 static void __init tsb_phys_patch(void)
 {
+	struct tsb_ldquad_phys_patch_entry *pquad;
 	struct tsb_phys_patch_entry *p;
 
+	pquad = &__tsb_ldquad_phys_patch;
+	while (pquad < &__tsb_ldquad_phys_patch_end) {
+		unsigned long addr = pquad->addr;
+
+		if (tlb_type == hypervisor)
+			*(unsigned int *) addr = pquad->sun4v_insn;
+		else
+			*(unsigned int *) addr = pquad->sun4u_insn;
+		wmb();
+		__asm__ __volatile__("flush	%0"
+				     : /* no outputs */
+				     : "r" (addr));
+
+		pquad++;
+	}
+
 	p = &__tsb_phys_patch;
 	while (p < &__tsb_phys_patch_end) {
 		unsigned long addr = p->addr;
@@ -1069,6 +1086,7 @@
 /* paging_init() sets up the page tables */
 
 extern void cheetah_ecache_flush_init(void);
+extern void sun4v_patch_tlb_handlers(void);
 
 static unsigned long last_valid_pfn;
 pgd_t swapper_pg_dir[2048];
@@ -1078,9 +1096,13 @@
 	unsigned long end_pfn, pages_avail, shift;
 	unsigned long real_end, i;
 
-	if (tlb_type == cheetah_plus)
+	if (tlb_type == cheetah_plus ||
+	    tlb_type == hypervisor)
 		tsb_phys_patch();
 
+	if (tlb_type == hypervisor)
+		sun4v_patch_tlb_handlers();
+
 	/* Find available physical memory... */
 	read_obp_memory("available", &pavail[0], &pavail_ents);
 
diff --git a/include/asm-sparc64/cpudata.h b/include/asm-sparc64/cpudata.h
index 998145b..a3dc4af 100644
--- a/include/asm-sparc64/cpudata.h
+++ b/include/asm-sparc64/cpudata.h
@@ -6,6 +6,8 @@
 #ifndef _SPARC64_CPUDATA_H
 #define _SPARC64_CPUDATA_H
 
+#include <asm/hypervisor.h>
+
 #ifndef __ASSEMBLY__
 
 #include <linux/percpu.h>
@@ -57,6 +59,9 @@
 
 /* D-cache line 2 */
 	unsigned long		__pad2[4];
+
+/* Dcache lines 3 and 4 */
+	struct hv_fault_status	fault_info;
 } __attribute__((aligned(64)));
 extern struct trap_per_cpu trap_block[NR_CPUS];
 extern void init_cur_cpu_trap(void);
@@ -88,8 +93,9 @@
 
 #define TRAP_PER_CPU_THREAD	0x00
 #define TRAP_PER_CPU_PGD_PADDR	0x08
+#define TRAP_PER_CPU_FAULT_INFO	0x40	/* fault_info follows two 32-byte D-cache lines in struct trap_per_cpu */
 
-#define TRAP_BLOCK_SZ_SHIFT	6
+#define TRAP_BLOCK_SZ_SHIFT	7
 
 #include <asm/scratchpad.h>
 
diff --git a/include/asm-sparc64/tsb.h b/include/asm-sparc64/tsb.h
index 44709cd..7f3abc3 100644
--- a/include/asm-sparc64/tsb.h
+++ b/include/asm-sparc64/tsb.h
@@ -53,6 +53,14 @@
  * kernel image, so we don't play these games for swapper_tsb access.
  */
 #ifndef __ASSEMBLY__
+struct tsb_ldquad_phys_patch_entry {	/* one record emitted by TSB_LOAD_QUAD into .tsb_ldquad_phys_patch */
+	unsigned int	addr;		/* address of the instruction to overwrite (.word 661b) */
+	unsigned int	sun4u_insn;	/* written by tsb_phys_patch() when tlb_type != hypervisor */
+	unsigned int	sun4v_insn;	/* written by tsb_phys_patch() when tlb_type == hypervisor */
+};
+extern struct tsb_ldquad_phys_patch_entry __tsb_ldquad_phys_patch,	/* section start, set in vmlinux.lds.S */
+	__tsb_ldquad_phys_patch_end;	/* section end; tsb_phys_patch() walks [start, end) */
+
 struct tsb_phys_patch_entry {
 	unsigned int	addr;
 	unsigned int	insn;
@@ -61,9 +69,10 @@
 #endif
 #define TSB_LOAD_QUAD(TSB, REG)	\
 661:	ldda		[TSB] ASI_NUCLEUS_QUAD_LDD, REG; \
-	.section	.tsb_phys_patch, "ax"; \
+	.section	.tsb_ldquad_phys_patch, "ax"; \
 	.word		661b; \
 	ldda		[TSB] ASI_QUAD_LDD_PHYS, REG; \
+	ldda		[TSB] ASI_QUAD_LDD_PHYS_4V, REG; \
 	.previous
 
 #define TSB_LOAD_TAG_HIGH(TSB, REG) \