sh: TLB miss fast-path optimizations.

Handle simple TLB miss faults that can be resolved entirely from
the page table in assembler, without dropping into the C page
fault path.

Signed-off-by: Stuart Menefy <stuart.menefy@st.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
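For reference, the logic the new assembly fast path implements looks
roughly like the C sketch below (illustrative only: the real handler
is hand-scheduled assembly, and read_mmu_tea/read_mmu_ttb,
write_mmu_ptel/write_mmu_ptea and load_tlb are hypothetical stand-ins
for the raw MMU register accesses and the ldtlb instruction):

	static int tlb_miss_fast_path(void)
	{
		unsigned long addr = read_mmu_tea();	/* faulting address */
		unsigned long *pgd = read_mmu_ttb();	/* current pgd base */
		unsigned long pgd_ent, *pte_page, pte, idx;

		pgd_ent = pgd[(addr >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)];
		if (!pgd_ent)
			return 1;	/* fall back to do_page_fault() */

		pte_page = (unsigned long *)pgd_ent;
		idx = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
		pte = pte_page[idx];
		if (!(pte & _PAGE_PRESENT))
			return 1;	/* fall back to do_page_fault() */

	#ifdef CONFIG_CPU_HAS_PTEA
		write_mmu_ptea(((pte >> 28) & 0xe) | (pte & 0x1));
	#endif
		/* PTEH already holds VPN|ASID, loaded by hardware on the miss */
		write_mmu_ptel(pte & _PAGE_FLAGS_HARDWARE_MASK);

		if (!(pte & _PAGE_ACCESSED))
			pte_page[idx] = pte | _PAGE_ACCESSED;	/* mark young */

		load_tlb();	/* ldtlb */
		return 0;
	}

Anything more complicated still takes the slow path into
do_page_fault().  Write protection and dirty marking don't need to be
checked here: a store to a read-only or clean page loaded this way
will simply raise the separate protection-violation or
initial-page-write exception.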
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index a03f155..48308dc 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -379,6 +379,9 @@
 	  See <file:Documentation/sh/register-banks.txt> for further
 	  information on SR.RB and register banking in the kernel in general.
 
+config CPU_HAS_PTEA
+	bool
+
 endmenu
 
 menu "Timer support"
diff --git a/arch/sh/kernel/cpu/sh3/entry.S b/arch/sh/kernel/cpu/sh3/entry.S
index 869d56f..5de99b4 100644
--- a/arch/sh/kernel/cpu/sh3/entry.S
+++ b/arch/sh/kernel/cpu/sh3/entry.S
@@ -13,8 +13,10 @@
 #include <linux/linkage.h>
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
-#include <asm/cpu/mmu_context.h>
 #include <asm/unistd.h>
+#include <asm/cpu/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/page.h>
 
 ! NOTE:
 ! GNU as (as of 2.9.1) changes bf/s into bt/s and bra, when the address
@@ -136,29 +138,14 @@
 
 call_dpf:
 	mov.l	1f, r0
-	mov	r5, r8
-	mov.l	@r0, r6
-	mov	r6, r9
-	mov.l	2f, r0
-	sts	pr, r10
-	jsr	@r0
-	 mov	r15, r4
-	!
-	tst	r0, r0
-	bf/s	0f
-	 lds	r10, pr
-	rts
-	 nop
-0:	sti
+	mov.l	@r0, r6		! address
 	mov.l	3f, r0
-	mov	r9, r6
-	mov	r8, r5
+	sti
 	jmp	@r0
-	 mov	r15, r4
+	 mov	r15, r4		! regs
 
 	.align 2
 1:	.long	MMU_TEA
-2:	.long	__do_page_fault
 3:	.long	do_page_fault
 
 	.align	2
@@ -344,9 +331,176 @@
 2:	.long	ret_from_exception
 !
 !
+
+/*
+ * This code makes some assumptions to improve performance.
+ * Make sure they are still true.
+ */
+#if PTRS_PER_PGD != PTRS_PER_PTE
+#error PGD and PTE sizes don't match
+#endif
+
+/* gas doesn't flag impossible values for mov #immediate as an error */
+#if (_PAGE_PRESENT >> 2) > 0x7f
+#error cannot load _PAGE_PRESENT as an immediate
+#endif
+#if _PAGE_DIRTY > 0x7f
+#error cannot load _PAGE_DIRTY as an immediate
+#endif
+#if (_PAGE_PRESENT << 2) != _PAGE_ACCESSED
+#error cannot derive _PAGE_ACCESSED from _PAGE_PRESENT
+#endif
+
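+/*
+ * On SH-3 the MMU_PTEH address fits in a sign-extended 8-bit mov
+ * immediate; on SH-4 it does not, so it has to come from the
+ * literal pool (label 8 below).
+ */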
+#if defined(CONFIG_CPU_SH4)
+#define ldmmupteh(r)	mov.l	8f, r
+#else
+#define ldmmupteh(r)	mov	#MMU_PTEH, r
+#endif
+
 	.balign 	1024,0,1024
 tlb_miss:
-	mov.l	1f, k2
+#ifdef COUNT_EXCEPTIONS
+	! Increment the counts
+	mov.l	9f, k1
+	mov.l	@k1, k2
+	add	#1, k2
+	mov.l	k2, @k1
+#endif
+
+	! k0 scratch
+	! k1 pgd and pte pointers
+	! k2 faulting address
+	! k3 pgd and pte index masks
+	! k4 shift
+
+	! Load up the pgd entry (k1)
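+	! (shld with a negative count is a right shift: the address
+	! shifted right by PGDIR_SHIFT-2 and masked with
+	! (PTRS_PER_PGD-1)<<2 is the pgd index pre-scaled to a 4-byte
+	! entry offset)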
+
+	ldmmupteh(k0)			!  9 LS (latency=2)	MMU_PTEH
+
+	mov.w	4f, k3			!  8 LS (latency=2)	(PTRS_PER_PGD-1) << 2
+	mov	#-(PGDIR_SHIFT-2), k4	!  6 EX
+
+	mov.l	@(MMU_TEA-MMU_PTEH,k0), k2	! 18 LS (latency=2)
+
+	mov.l	@(MMU_TTB-MMU_PTEH,k0), k1	! 18 LS (latency=2)
+
+	mov	k2, k0			!   5 MT (latency=0)
+	shld	k4, k0			!  99 EX
+
+	and	k3, k0			!  78 EX
+
+	mov.l	@(k0, k1), k1		!  21 LS (latency=2)
+	mov	#-(PAGE_SHIFT-2), k4	!   6 EX
+
+	! Load up the pte entry (k2)
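+	! (same shift-and-mask trick; k3 still holds the pgd index mask,
+	! which also works here because PTRS_PER_PGD == PTRS_PER_PTE,
+	! as asserted above)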
+
+	mov	k2, k0			!   5 MT (latency=0)
+	shld	k4, k0			!  99 EX
+
+	tst	k1, k1			!  86 MT
+
+	bt	20f			! 110 BR
+
+	and	k3, k0			!  78 EX
+	mov.w	5f, k4			!   8 LS (latency=2)	_PAGE_PRESENT
+
+	mov.l	@(k0, k1), k2		!  21 LS (latency=2)
+	add	k0, k1			!  49 EX
+
+#ifdef CONFIG_CPU_HAS_PTEA
+	! Test the entry for present and _PAGE_ACCESSED
+
+	mov	#-28, k3		!   6 EX
+	mov	k2, k0			!   5 MT (latency=0)
+
+	tst	k4, k2			!  68 MT
+	shld	k3, k0			!  99 EX
+
+	bt	20f			! 110 BR
+
+	! Set PTEA register
+	! MMU_PTEA = ((pteval >> 28) & 0xe) | (pteval & 0x1)
+	!
+	! k0=pte>>28, k1=pte*, k2=pte, k3=<unused>, k4=_PAGE_PRESENT
+
+	and	#0xe, k0		!  79 EX
+
+	mov	k0, k3			!   5 MT (latency=0)
+	mov	k2, k0			!   5 MT (latency=0)
+
+	and	#1, k0			!  79 EX
+
+	or	k0, k3			!  82 EX
+
+	ldmmupteh(k0)			!   9 LS (latency=2)
+	shll2	k4			! 101 EX		_PAGE_ACCESSED
+
+	tst	k4, k2			!  68 MT
+
+	mov.l	k3, @(MMU_PTEA-MMU_PTEH,k0)	! 27 LS
+
+	mov.l	7f, k3			!   9 LS (latency=2)	_PAGE_FLAGS_HARDWARE_MASK
+
+	! k0=MMU_PTEH, k1=pte*, k2=pte, k3=_PAGE_FLAGS_HARDWARE, k4=_PAGE_ACCESSED
+#else
+
+	! Test the entry for present and _PAGE_ACCESSED
+
+	mov.l	7f, k3			!   9 LS (latency=2)	_PAGE_FLAGS_HARDWARE_MASK
+	tst	k4, k2			!  68 MT
+
+	shll2	k4			! 101 EX		_PAGE_ACCESSED
+	ldmmupteh(k0)			!   9 LS (latency=2)
+
+	bt	20f			! 110 BR
+	tst	k4, k2			!  68 MT
+
+	! k0=MMU_PTEH, k1=pte*, k2=pte, k3=_PAGE_FLAGS_HARDWARE, k4=_PAGE_ACCESSED
+
+#endif
+
+	! Set up the entry
+
+	and	k2, k3			!  78 EX
+	bt/s	10f			! 108 BR
+
+	 mov.l	k3, @(MMU_PTEL-MMU_PTEH,k0)	! 27 LS
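+
+	! (the mov.l above sits in the bt/s delay slot, so PTEL is
+	! written on both the already-young and the not-yet-accessed
+	! paths)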
+
+	ldtlb				! 128 CO
+
+	! At least one instruction between ldtlb and rte
+	nop				! 119 NOP
+
+	rte				! 126 CO
+
+	 nop				! 119 NOP
+
+
+10:	or	k4, k2			!  82 EX
+
+	ldtlb				! 128 CO
+
+	! At least one instruction between ldtlb and rte
+	mov.l	k2, @k1			!  27 LS
+
+	rte				! 126 CO
+
+	! Note we cannot execute the mov here: the rte delay slot runs
+	! after SSR has been restored, so it would execute in user mode.
+	 nop				! 119 NOP
+
+
+	.align 5
+	! One cache line if possible...
+1:	.long	swapper_pg_dir
+4:	.short	(PTRS_PER_PGD-1) << 2
+5:	.short	_PAGE_PRESENT
+7:	.long	_PAGE_FLAGS_HARDWARE_MASK
+8:	.long	MMU_PTEH
+#ifdef COUNT_EXCEPTIONS
+9:	.long	exception_count_miss
+#endif
+
+	! Either pgd or pte not present
+20:	mov.l	1f, k2
 	mov.l	4f, k3
 	bra	handle_exception
 	 mov.l	@k2, k2
@@ -496,6 +650,15 @@
 	bf	interrupt_exception
 	shlr2	r8
 	shlr	r8
+
+#ifdef COUNT_EXCEPTIONS
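+	! Bump the per-event counter, using the same table offset in r8
+	! as the exception_handling_table lookup below.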
+	mov.l	5f, r9
+	add	r8, r9
+	mov.l	@r9, r10
+	add	#1, r10
+	mov.l	r10, @r9
+#endif
+
 	mov.l	4f, r9
 	add	r8, r9
 	mov.l	@r9, r9
@@ -509,6 +672,9 @@
 2:	.long	0x000080f0	! FD=1, IMASK=15
 3:	.long	0xcfffffff	! RB=0, BL=0
 4:	.long	exception_handling_table
+#ifdef COUNT_EXCEPTIONS
+5:	.long	exception_count_table
+#endif
 
 interrupt_exception:
 	mov.l	1f, r9
diff --git a/arch/sh/kernel/cpu/sh4/probe.c b/arch/sh/kernel/cpu/sh4/probe.c
index c294de1..afe0f1b 100644
--- a/arch/sh/kernel/cpu/sh4/probe.c
+++ b/arch/sh/kernel/cpu/sh4/probe.c
@@ -79,16 +79,16 @@
 	case 0x205:
 		cpu_data->type = CPU_SH7750;
 		cpu_data->flags |= CPU_HAS_P2_FLUSH_BUG | CPU_HAS_FPU |
-				   CPU_HAS_PERF_COUNTER | CPU_HAS_PTEA;
+				   CPU_HAS_PERF_COUNTER;
 		break;
 	case 0x206:
 		cpu_data->type = CPU_SH7750S;
 		cpu_data->flags |= CPU_HAS_P2_FLUSH_BUG | CPU_HAS_FPU |
-				   CPU_HAS_PERF_COUNTER | CPU_HAS_PTEA;
+				   CPU_HAS_PERF_COUNTER;
 		break;
 	case 0x1100:
 		cpu_data->type = CPU_SH7751;
-		cpu_data->flags |= CPU_HAS_FPU | CPU_HAS_PTEA;
+		cpu_data->flags |= CPU_HAS_FPU;
 		break;
 	case 0x2000:
 		cpu_data->type = CPU_SH73180;
@@ -126,23 +126,22 @@
 		break;
 	case 0x8000:
 		cpu_data->type = CPU_ST40RA;
-		cpu_data->flags |= CPU_HAS_FPU | CPU_HAS_PTEA;
+		cpu_data->flags |= CPU_HAS_FPU;
 		break;
 	case 0x8100:
 		cpu_data->type = CPU_ST40GX1;
-		cpu_data->flags |= CPU_HAS_FPU | CPU_HAS_PTEA;
+		cpu_data->flags |= CPU_HAS_FPU;
 		break;
 	case 0x700:
 		cpu_data->type = CPU_SH4_501;
 		cpu_data->icache.ways = 2;
 		cpu_data->dcache.ways = 2;
-		cpu_data->flags |= CPU_HAS_PTEA;
 		break;
 	case 0x600:
 		cpu_data->type = CPU_SH4_202;
 		cpu_data->icache.ways = 2;
 		cpu_data->dcache.ways = 2;
-		cpu_data->flags |= CPU_HAS_FPU | CPU_HAS_PTEA;
+		cpu_data->flags |= CPU_HAS_FPU;
 		break;
 	case 0x500 ... 0x501:
 		switch (prr) {
@@ -160,7 +159,7 @@
 		cpu_data->icache.ways = 2;
 		cpu_data->dcache.ways = 2;
 
-		cpu_data->flags |= CPU_HAS_FPU | CPU_HAS_PTEA;
+		cpu_data->flags |= CPU_HAS_FPU;
 
 		break;
 	default:
@@ -173,6 +172,10 @@
 	cpu_data->dcache.ways = 1;
 #endif
 
+#ifdef CONFIG_CPU_HAS_PTEA
+	cpu_data->flags |= CPU_HAS_PTEA;
+#endif
+
 	/*
 	 * On anything that's not a direct-mapped cache, look to the CVR
 	 * for I/D-cache specifics.
diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig
index 88e9663..6cd6d00 100644
--- a/arch/sh/mm/Kconfig
+++ b/arch/sh/mm/Kconfig
@@ -20,6 +20,7 @@
 	bool
 	select CPU_HAS_INTEVT
 	select CPU_HAS_SR_RB
+	select CPU_HAS_PTEA if !CPU_SUBTYPE_ST40
 
 config CPU_SH4A
 	bool
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index 128907e..123fb80 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -223,89 +223,3 @@
 	if (!user_mode(regs))
 		goto no_context;
 }
-
-#ifdef CONFIG_SH_STORE_QUEUES
-/*
- * This is a special case for the SH-4 store queues, as pages for this
- * space still need to be faulted in before it's possible to flush the
- * store queue cache for writeout to the remapped region.
- */
-#define P3_ADDR_MAX		(P4SEG_STORE_QUE + 0x04000000)
-#else
-#define P3_ADDR_MAX		P4SEG
-#endif
-
-/*
- * Called with interrupts disabled.
- */
-asmlinkage int __kprobes __do_page_fault(struct pt_regs *regs,
-					 unsigned long writeaccess,
-					 unsigned long address)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-	pte_t entry;
-	struct mm_struct *mm = current->mm;
-	spinlock_t *ptl;
-	int ret = 1;
-
-#ifdef CONFIG_SH_KGDB
-	if (kgdb_nofault && kgdb_bus_err_hook)
-		kgdb_bus_err_hook();
-#endif
-
-	/*
-	 * We don't take page faults for P1, P2, and parts of P4, these
-	 * are always mapped, whether it be due to legacy behaviour in
-	 * 29-bit mode, or due to PMB configuration in 32-bit mode.
-	 */
-	if (address >= P3SEG && address < P3_ADDR_MAX) {
-		pgd = pgd_offset_k(address);
-		mm = NULL;
-	} else {
-		if (unlikely(address >= TASK_SIZE || !mm))
-			return 1;
-
-		pgd = pgd_offset(mm, address);
-	}
-
-	pud = pud_offset(pgd, address);
-	if (pud_none_or_clear_bad(pud))
-		return 1;
-	pmd = pmd_offset(pud, address);
-	if (pmd_none_or_clear_bad(pmd))
-		return 1;
-
-	if (mm)
-		pte = pte_offset_map_lock(mm, pmd, address, &ptl);
-	else
-		pte = pte_offset_kernel(pmd, address);
-
-	entry = *pte;
-	if (unlikely(pte_none(entry) || pte_not_present(entry)))
-		goto unlock;
-	if (unlikely(writeaccess && !pte_write(entry)))
-		goto unlock;
-
-	if (writeaccess)
-		entry = pte_mkdirty(entry);
-	entry = pte_mkyoung(entry);
-
-#ifdef CONFIG_CPU_SH4
-	/*
-	 * ITLB is not affected by "ldtlb" instruction.
-	 * So, we need to flush the entry by ourselves.
-	 */
-	__flush_tlb_page(get_asid(), address & PAGE_MASK);
-#endif
-
-	set_pte(pte, entry);
-	update_mmu_cache(NULL, address, entry);
-	ret = 0;
-unlock:
-	if (mm)
-		pte_unmap_unlock(pte, ptl);
-	return ret;
-}