[PATCH] ppc64: support 64k pages

Adds a new CONFIG_PPC_64K_PAGES option which, when enabled, changes the
kernel base page size to 64K.  The resulting kernel still boots on any
hardware.  On current machines that support only 4K pages, the kernel
transparently maintains 16 "subpages" for each 64K page.

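As a rough, illustration-only sketch (this helper is hypothetical and is
not part of the patch): each 64K page covers 16 hardware 4K pages, and
the low-level hash code below selects a subpage using bits 12..15 of the
effective address, keeping a per-subpage HPTE slot hint ("hidx")
alongside the Linux PTE so individual 4K entries can be updated or
invalidated later:

  /* Hypothetical C rendering of the subpage selection performed by
   * "rldicl r25,r3,64-12,60" in __hash_page_4K below: which of the
   * 16 4K subpages of a 64K page does an effective address fall into?
   */
  static inline unsigned int subpage_index(unsigned long ea)
  {
          return (ea >> 12) & 0xf;        /* EA bits 12..15 */
  }
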
Note that while real 64K-capable hardware has been tested, the current
patch does not enable it yet, as such hardware has not been released and
I'm still verifying with the firmware architects the proper way to get
the information from the newer hypervisors.
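
The hpte management hooks also grow an explicit MMU_PAGE_* page-size
index in place of the old boolean "large" flag, and take a physical
address instead of a real page number.  A non-authoritative sketch of
the direction, taken from the native backend changes below (the real
prototypes live in headers that are not part of this diff):

  /* Old hook: page size was implied by a "large" bit folded into vflags.
   *
   *   long native_hpte_insert(unsigned long hpte_group, unsigned long va,
   *                           unsigned long prpn, unsigned long vflags,
   *                           unsigned long rflags);
   *
   * New hook, as changed by this patch:
   */
  long native_hpte_insert(unsigned long hpte_group, unsigned long va,
                          unsigned long pa, unsigned long rflags,
                          unsigned long vflags, int psize);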

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S
index d6ed910..e0d02c4 100644
--- a/arch/powerpc/mm/hash_low_64.S
+++ b/arch/powerpc/mm/hash_low_64.S
@@ -1,7 +1,7 @@
 /*
  * ppc64 MMU hashtable management routines
  *
- * (c) Copyright IBM Corp. 2003
+ * (c) Copyright IBM Corp. 2003, 2005
  *
  * Maintained by: Benjamin Herrenschmidt
  *                <benh@kernel.crashing.org>
@@ -10,6 +10,7 @@
  * described in the kernel's COPYING file.
  */
 
+#include <linux/config.h>
 #include <asm/reg.h>
 #include <asm/pgtable.h>
 #include <asm/mmu.h>
@@ -42,14 +43,24 @@
 /* Save non-volatile offsets */
 #define STK_REG(i)	(112 + ((i)-14)*8)
 
+
+#ifndef CONFIG_PPC_64K_PAGES
+
+/*****************************************************************************
+ *                                                                           *
+ *           4K SW & 4K HW pages implementation                              *
+ *                                                                           *
+ *****************************************************************************/
+
+
 /*
- * _hash_page(unsigned long ea, unsigned long access, unsigned long vsid,
- *		pte_t *ptep, unsigned long trap, int local)
+ * _hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
+ *		 pte_t *ptep, unsigned long trap, int local)
  *
- * Adds a page to the hash table. This is the non-LPAR version for now
+ * Adds a 4K page to the hash table in a segment of 4K pages only
  */
 
-_GLOBAL(__hash_page)
+_GLOBAL(__hash_page_4K)
 	mflr	r0
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
@@ -88,7 +99,8 @@
 	/* If so, just bail out and refault if needed. Someone else
 	 * is changing this PTE anyway and might hash it.
 	 */
-	bne-	bail_ok
+	bne-	htab_bail_ok
+
 	/* Prepare new PTE value (turn access RW into DIRTY, then
 	 * add BUSY,HASHPTE and ACCESSED)
 	 */
@@ -118,10 +130,10 @@
 
 	/* Convert linux PTE bits into HW equivalents */
 	andi.	r3,r30,0x1fe		/* Get basic set of flags */
-	xori	r3,r3,HW_NO_EXEC	/* _PAGE_EXEC -> NOEXEC */
+	xori	r3,r3,HPTE_R_N		/* _PAGE_EXEC -> NOEXEC */
 	rlwinm	r0,r30,32-9+1,30,30	/* _PAGE_RW -> _PAGE_USER (r0) */
 	rlwinm	r4,r30,32-7+1,30,30	/* _PAGE_DIRTY -> _PAGE_USER (r4) */
-	and	r0,r0,r4		/* _PAGE_RW & _PAGE_DIRTY -> r0 bit 30 */
+	and	r0,r0,r4		/* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
 	andc	r0,r30,r0		/* r0 = pte & ~r0 */
 	rlwimi	r3,r0,32-1,31,31	/* Insert result into PP lsb */
 
@@ -158,19 +170,21 @@
 	andc	r30,r30,r0
 	ori	r30,r30,_PAGE_HASHPTE
 
-	/* page number in r5 */
-	rldicl	r5,r31,64-PTE_SHIFT,PTE_SHIFT
+	/* physical address r5 */
+	rldicl	r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT
+	sldi	r5,r5,PAGE_SHIFT
 
 	/* Calculate primary group hash */
 	and	r0,r28,r27
-	rldicr	r3,r0,3,63-3	/* r0 = (hash & mask) << 3 */
+	rldicr	r3,r0,3,63-3		/* r3 = (hash & mask) << 3 */
 
 	/* Call ppc_md.hpte_insert */
-	ld	r7,STK_PARM(r4)(r1)	/* Retreive new pp bits */
+	ld	r6,STK_PARM(r4)(r1)	/* Retrieve new pp bits */
 	mr	r4,r29			/* Retreive va */
-	li	r6,0			/* no vflags */
+	li	r7,0			/* !bolted, !secondary */
+	li	r8,MMU_PAGE_4K		/* page size */
 _GLOBAL(htab_call_hpte_insert1)
-	bl	.			/* Will be patched by htab_finish_init() */
+	bl	.			/* Patched by htab_finish_init() */
 	cmpdi	0,r3,0
 	bge	htab_pte_insert_ok	/* Insertion successful */
 	cmpdi	0,r3,-2			/* Critical failure */
@@ -178,19 +192,21 @@
 
 	/* Now try secondary slot */
 	
-	/* page number in r5 */
-	rldicl	r5,r31,64-PTE_SHIFT,PTE_SHIFT
+	/* physical address r5 */
+	rldicl	r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT
+	sldi	r5,r5,PAGE_SHIFT
 
 	/* Calculate secondary group hash */
 	andc	r0,r27,r28
 	rldicr	r3,r0,3,63-3	/* r0 = (~hash & mask) << 3 */
 	
 	/* Call ppc_md.hpte_insert */
-	ld	r7,STK_PARM(r4)(r1)	/* Retreive new pp bits */
+	ld	r6,STK_PARM(r4)(r1)	/* Retrieve new pp bits */
 	mr	r4,r29			/* Retreive va */
-	li	r6,HPTE_V_SECONDARY@l	/* secondary slot */
+	li	r7,HPTE_V_SECONDARY	/* !bolted, secondary */
+	li	r8,MMU_PAGE_4K		/* page size */
 _GLOBAL(htab_call_hpte_insert2)
-	bl	.			/* Will be patched by htab_finish_init() */
+	bl	.			/* Patched by htab_finish_init() */
 	cmpdi	0,r3,0
 	bge+	htab_pte_insert_ok	/* Insertion successful */
 	cmpdi	0,r3,-2			/* Critical failure */
@@ -207,14 +223,14 @@
 	rldicr	r3,r0,3,63-3	/* r0 = (hash & mask) << 3 */	
 	/* Call ppc_md.hpte_remove */
 _GLOBAL(htab_call_hpte_remove)
-	bl	.			/* Will be patched by htab_finish_init() */
+	bl	.			/* Patched by htab_finish_init() */
 
 	/* Try all again */
 	b	htab_insert_pte	
 
-bail_ok:
+htab_bail_ok:
 	li	r3,0
-	b	bail
+	b	htab_bail
 
 htab_pte_insert_ok:
 	/* Insert slot number & secondary bit in PTE */
@@ -227,7 +243,7 @@
 	ld	r6,STK_PARM(r6)(r1)
 	std	r30,0(r6)
 	li	r3, 0
-bail:
+htab_bail:
 	ld	r27,STK_REG(r27)(r1)
 	ld	r28,STK_REG(r28)(r1)
 	ld	r29,STK_REG(r29)(r1)
@@ -256,10 +272,10 @@
 
 	/* Call ppc_md.hpte_updatepp */
 	mr	r5,r29			/* va */
-	li	r6,0			/* large is 0 */
+	li	r6,MMU_PAGE_4K		/* page size */
 	ld	r7,STK_PARM(r8)(r1)	/* get "local" param */
 _GLOBAL(htab_call_hpte_updatepp)
-	bl	.			/* Will be patched by htab_finish_init() */
+	bl	.			/* Patched by htab_finish_init() */
 
 	/* if we failed because typically the HPTE wasn't really here
 	 * we try an insertion. 
@@ -276,13 +292,556 @@
 	/* Bail out clearing reservation */
 	stdcx.	r31,0,r6
 	li	r3,1
-	b	bail
+	b	htab_bail
 
 htab_pte_insert_failure:
 	/* Bail out restoring old PTE */
 	ld	r6,STK_PARM(r6)(r1)
 	std	r31,0(r6)
 	li	r3,-1
-	b	bail
+	b	htab_bail
 
 
+#else /* CONFIG_PPC_64K_PAGES */
+
+
+/*****************************************************************************
+ *                                                                           *
+ *           64K SW & 4K or 64K HW in a 4K segment pages implementation      *
+ *                                                                           *
+ *****************************************************************************/
+
+/* _hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
+ *		 pte_t *ptep, unsigned long trap, int local)
+ */
+
+/*
+ * For now, we do NOT implement Admixed pages
+ */
+_GLOBAL(__hash_page_4K)
+	mflr	r0
+	std	r0,16(r1)
+	stdu	r1,-STACKFRAMESIZE(r1)
+	/* Save all params that we need after a function call */
+	std	r6,STK_PARM(r6)(r1)
+	std	r8,STK_PARM(r8)(r1)
+
+	/* Add _PAGE_PRESENT to access */
+	ori	r4,r4,_PAGE_PRESENT
+
+	/* Save non-volatile registers.
+	 * r31 will hold "old PTE"
+	 * r30 is "new PTE"
+	 * r29 is "va"
+	 * r28 is a hash value
+	 * r27 is hashtab mask (maybe dynamically patched instead?)
+	 * r26 is the hidx mask
+	 * r25 is the index in combo page
+	 */
+	std	r25,STK_REG(r25)(r1)
+	std	r26,STK_REG(r26)(r1)
+	std	r27,STK_REG(r27)(r1)
+	std	r28,STK_REG(r28)(r1)
+	std	r29,STK_REG(r29)(r1)
+	std	r30,STK_REG(r30)(r1)
+	std	r31,STK_REG(r31)(r1)
+
+	/* Step 1:
+	 *
+	 * Check permissions, atomically mark the linux PTE busy
+	 * and hashed.
+	 */
+1:
+	ldarx	r31,0,r6
+	/* Check access rights (access & ~(pte_val(*ptep))) */
+	andc.	r0,r4,r31
+	bne-	htab_wrong_access
+	/* Check if PTE is busy */
+	andi.	r0,r31,_PAGE_BUSY
+	/* If so, just bail out and refault if needed. Someone else
+	 * is changing this PTE anyway and might hash it.
+	 */
+	bne-	htab_bail_ok
+	/* Prepare new PTE value (turn access RW into DIRTY, then
+	 * add BUSY and ACCESSED)
+	 */
+	rlwinm	r30,r4,32-9+7,31-7,31-7	/* _PAGE_RW -> _PAGE_DIRTY */
+	or	r30,r30,r31
+	ori	r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE
+	/* Write the linux PTE atomically (setting busy) */
+	stdcx.	r30,0,r6
+	bne-	1b
+	isync
+
+	/* Step 2:
+	 *
+	 * Insert/Update the HPTE in the hash table. At this point,
+	 * r4 (access) is re-usable; we use it for the new HPTE flags
+	 */
+
+	/* Load the hidx index */
+	rldicl	r25,r3,64-12,60
+
+	/* Calc va and put it in r29 */
+	rldicr	r29,r5,28,63-28		/* r29 = (vsid << 28) */
+	rldicl	r3,r3,0,36		/* r3 = (ea & 0x0fffffff) */
+	or	r29,r3,r29		/* r29 = va */
+
+	/* Calculate hash value for primary slot and store it in r28 */
+	rldicl	r5,r5,0,25		/* vsid & 0x0000007fffffffff */
+	rldicl	r0,r3,64-12,48		/* (ea >> 12) & 0xffff */
+	xor	r28,r5,r0
+
+	/* Convert linux PTE bits into HW equivalents */
+	andi.	r3,r30,0x1fe		/* Get basic set of flags */
+	xori	r3,r3,HPTE_R_N		/* _PAGE_EXEC -> NOEXEC */
+	rlwinm	r0,r30,32-9+1,30,30	/* _PAGE_RW -> _PAGE_USER (r0) */
+	rlwinm	r4,r30,32-7+1,30,30	/* _PAGE_DIRTY -> _PAGE_USER (r4) */
+	and	r0,r0,r4		/* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
+	andc	r0,r30,r0		/* r0 = pte & ~r0 */
+	rlwimi	r3,r0,32-1,31,31	/* Insert result into PP lsb */
+
+	/* We eventually do the icache sync here (maybe inline that
+	 * code rather than call a C function...)
+	 */
+BEGIN_FTR_SECTION
+	mr	r4,r30
+	mr	r5,r7
+	bl	.hash_page_do_lazy_icache
+END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
+
+	/* At this point, r3 contains new PP bits, save them in
+	 * place of "access" in the param area (sic)
+	 */
+	std	r3,STK_PARM(r4)(r1)
+
+	/* Get htab_hash_mask */
+	ld	r4,htab_hash_mask@got(2)
+	ld	r27,0(r4)	/* htab_hash_mask -> r27 */
+
+	/* Check if we may already be in the hashtable, in this case, we
+	 * go to out-of-line code to try to modify the HPTE. We look for
+	 * the bit at (1 >> (index + 32))
+	 */
+	andi.	r0,r31,_PAGE_HASHPTE
+	li	r26,0			/* Default hidx */
+	beq	htab_insert_pte
+	ld	r6,STK_PARM(r6)(r1)
+	ori	r26,r6,0x8000		/* Load the hidx mask */
+	ld	r26,0(r26)
+	addi	r5,r25,36		/* Check actual HPTE_SUB bit, this */
+	rldcr.	r0,r31,r5,0		/* must match pgtable.h definition */
+	bne	htab_modify_pte
+
+htab_insert_pte:
+	/* real page number in r5, PTE RPN value + index */
+	rldicl	r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT
+	sldi	r5,r5,PAGE_SHIFT-HW_PAGE_SHIFT
+	add	r5,r5,r25
+	sldi	r5,r5,HW_PAGE_SHIFT
+
+	/* Calculate primary group hash */
+	and	r0,r28,r27
+	rldicr	r3,r0,3,63-3		/* r0 = (hash & mask) << 3 */
+
+	/* Call ppc_md.hpte_insert */
+	ld	r6,STK_PARM(r4)(r1)	/* Retrieve new pp bits */
+	mr	r4,r29			/* Retrieve va */
+	li	r7,0			/* !bolted, !secondary */
+	li	r8,MMU_PAGE_4K		/* page size */
+_GLOBAL(htab_call_hpte_insert1)
+	bl	.			/* patched by htab_finish_init() */
+	cmpdi	0,r3,0
+	bge	htab_pte_insert_ok	/* Insertion successful */
+	cmpdi	0,r3,-2			/* Critical failure */
+	beq-	htab_pte_insert_failure
+
+	/* Now try secondary slot */
+
+	/* real page number in r5, PTE RPN value + index */
+	rldicl	r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT
+	sldi	r5,r5,PAGE_SHIFT-HW_PAGE_SHIFT
+	add	r5,r5,r25
+	sldi	r5,r5,HW_PAGE_SHIFT
+
+	/* Calculate secondary group hash */
+	andc	r0,r27,r28
+	rldicr	r3,r0,3,63-3		/* r0 = (~hash & mask) << 3 */
+
+	/* Call ppc_md.hpte_insert */
+	ld	r6,STK_PARM(r4)(r1)	/* Retrieve new pp bits */
+	mr	r4,r29			/* Retrieve va */
+	li	r7,HPTE_V_SECONDARY	/* !bolted, secondary */
+	li	r8,MMU_PAGE_4K		/* page size */
+_GLOBAL(htab_call_hpte_insert2)
+	bl	.			/* patched by htab_finish_init() */
+	cmpdi	0,r3,0
+	bge+	htab_pte_insert_ok	/* Insertion successful */
+	cmpdi	0,r3,-2			/* Critical failure */
+	beq-	htab_pte_insert_failure
+
+	/* Both are full, we need to evict something */
+	mftb	r0
+	/* Pick a random group based on TB */
+	andi.	r0,r0,1
+	mr	r5,r28
+	bne	2f
+	not	r5,r5
+2:	and	r0,r5,r27
+	rldicr	r3,r0,3,63-3		/* r0 = (hash & mask) << 3 */
+	/* Call ppc_md.hpte_remove */
+_GLOBAL(htab_call_hpte_remove)
+	bl	.			/* patched by htab_finish_init() */
+
+	/* Try all again */
+	b	htab_insert_pte
+
+htab_bail_ok:
+	li	r3,0
+	b	htab_bail
+
+htab_pte_insert_ok:
+	/* Insert slot number & secondary bit in PTE second half,
+	 * clear _PAGE_BUSY and set appropriate HPTE slot bit
+	 */
+	ld	r6,STK_PARM(r6)(r1)
+	li	r0,_PAGE_BUSY
+	andc	r30,r30,r0
+	/* HPTE SUB bit */
+	li	r0,1
+	subfic	r5,r25,27		/* Must match bit position in */
+	sld	r0,r0,r5		/* pgtable.h */
+	or	r30,r30,r0
+	/* hindx */
+	sldi	r5,r25,2
+	sld	r3,r3,r5
+	li	r4,0xf
+	sld	r4,r4,r5
+	andc	r26,r26,r4
+	or	r26,r26,r3
+	ori	r5,r6,0x8000
+	std	r26,0(r5)
+	lwsync
+	std	r30,0(r6)
+	li	r3, 0
+htab_bail:
+	ld	r25,STK_REG(r25)(r1)
+	ld	r26,STK_REG(r26)(r1)
+	ld	r27,STK_REG(r27)(r1)
+	ld	r28,STK_REG(r28)(r1)
+	ld	r29,STK_REG(r29)(r1)
+	ld      r30,STK_REG(r30)(r1)
+	ld      r31,STK_REG(r31)(r1)
+	addi    r1,r1,STACKFRAMESIZE
+	ld      r0,16(r1)
+	mtlr    r0
+	blr
+
+htab_modify_pte:
+	/* Keep PP bits in r4 and slot idx from the PTE around in r3 */
+	mr	r4,r3
+	sldi	r5,r25,2
+	srd	r3,r26,r5
+
+	/* Secondary group? If yes, get an inverted hash value */
+	mr	r5,r28
+	andi.	r0,r3,0x8 /* page secondary ? */
+	beq	1f
+	not	r5,r5
+1:	andi.	r3,r3,0x7 /* extract idx alone */
+
+	/* Calculate proper slot value for ppc_md.hpte_updatepp */
+	and	r0,r5,r27
+	rldicr	r0,r0,3,63-3	/* r0 = (hash & mask) << 3 */
+	add	r3,r0,r3	/* add slot idx */
+
+	/* Call ppc_md.hpte_updatepp */
+	mr	r5,r29			/* va */
+	li	r6,MMU_PAGE_4K		/* page size */
+	ld	r7,STK_PARM(r8)(r1)	/* get "local" param */
+_GLOBAL(htab_call_hpte_updatepp)
+	bl	.			/* patched by htab_finish_init() */
+
+	/* if we failed because typically the HPTE wasn't really here
+	 * we try an insertion.
+	 */
+	cmpdi	0,r3,-1
+	beq-	htab_insert_pte
+
+	/* Clear the BUSY bit and Write out the PTE */
+	li	r0,_PAGE_BUSY
+	andc	r30,r30,r0
+	ld	r6,STK_PARM(r6)(r1)
+	std	r30,0(r6)
+	li	r3,0
+	b	htab_bail
+
+htab_wrong_access:
+	/* Bail out clearing reservation */
+	stdcx.	r31,0,r6
+	li	r3,1
+	b	htab_bail
+
+htab_pte_insert_failure:
+	/* Bail out restoring old PTE */
+	ld	r6,STK_PARM(r6)(r1)
+	std	r31,0(r6)
+	li	r3,-1
+	b	htab_bail
+
+
+/*****************************************************************************
+ *                                                                           *
+ *           64K SW & 64K HW in a 64K segment pages implementation           *
+ *                                                                           *
+ *****************************************************************************/
+
+_GLOBAL(__hash_page_64K)
+	mflr	r0
+	std	r0,16(r1)
+	stdu	r1,-STACKFRAMESIZE(r1)
+	/* Save all params that we need after a function call */
+	std	r6,STK_PARM(r6)(r1)
+	std	r8,STK_PARM(r8)(r1)
+
+	/* Add _PAGE_PRESENT to access */
+	ori	r4,r4,_PAGE_PRESENT
+
+	/* Save non-volatile registers.
+	 * r31 will hold "old PTE"
+	 * r30 is "new PTE"
+	 * r29 is "va"
+	 * r28 is a hash value
+	 * r27 is hashtab mask (maybe dynamically patched instead?)
+	 */
+	std	r27,STK_REG(r27)(r1)
+	std	r28,STK_REG(r28)(r1)
+	std	r29,STK_REG(r29)(r1)
+	std	r30,STK_REG(r30)(r1)
+	std	r31,STK_REG(r31)(r1)
+
+	/* Step 1:
+	 *
+	 * Check permissions, atomically mark the linux PTE busy
+	 * and hashed.
+	 */
+1:
+	ldarx	r31,0,r6
+	/* Check access rights (access & ~(pte_val(*ptep))) */
+	andc.	r0,r4,r31
+	bne-	ht64_wrong_access
+	/* Check if PTE is busy */
+	andi.	r0,r31,_PAGE_BUSY
+	/* If so, just bail out and refault if needed. Someone else
+	 * is changing this PTE anyway and might hash it.
+	 */
+	bne-	ht64_bail_ok
+	/* Prepare new PTE value (turn access RW into DIRTY, then
+	 * add BUSY,HASHPTE and ACCESSED)
+	 */
+	rlwinm	r30,r4,32-9+7,31-7,31-7	/* _PAGE_RW -> _PAGE_DIRTY */
+	or	r30,r30,r31
+	ori	r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE
+	/* Write the linux PTE atomically (setting busy) */
+	stdcx.	r30,0,r6
+	bne-	1b
+	isync
+
+	/* Step 2:
+	 *
+	 * Insert/Update the HPTE in the hash table. At this point,
+	 * r4 (access) is re-usable; we use it for the new HPTE flags
+	 */
+
+	/* Calc va and put it in r29 */
+	rldicr	r29,r5,28,63-28
+	rldicl	r3,r3,0,36
+	or	r29,r3,r29
+
+	/* Calculate hash value for primary slot and store it in r28 */
+	rldicl	r5,r5,0,25		/* vsid & 0x0000007fffffffff */
+	rldicl	r0,r3,64-16,52		/* (ea >> 16) & 0xfff */
+	xor	r28,r5,r0
+
+	/* Convert linux PTE bits into HW equivalents */
+	andi.	r3,r30,0x1fe		/* Get basic set of flags */
+	xori	r3,r3,HPTE_R_N		/* _PAGE_EXEC -> NOEXEC */
+	rlwinm	r0,r30,32-9+1,30,30	/* _PAGE_RW -> _PAGE_USER (r0) */
+	rlwinm	r4,r30,32-7+1,30,30	/* _PAGE_DIRTY -> _PAGE_USER (r4) */
+	and	r0,r0,r4		/* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
+	andc	r0,r30,r0		/* r0 = pte & ~r0 */
+	rlwimi	r3,r0,32-1,31,31	/* Insert result into PP lsb */
+
+	/* We eventually do the icache sync here (maybe inline that
+	 * code rather than call a C function...)
+	 */
+BEGIN_FTR_SECTION
+	mr	r4,r30
+	mr	r5,r7
+	bl	.hash_page_do_lazy_icache
+END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
+
+	/* At this point, r3 contains new PP bits, save them in
+	 * place of "access" in the param area (sic)
+	 */
+	std	r3,STK_PARM(r4)(r1)
+
+	/* Get htab_hash_mask */
+	ld	r4,htab_hash_mask@got(2)
+	ld	r27,0(r4)	/* htab_hash_mask -> r27 */
+
+	/* Check if we may already be in the hashtable, in this case, we
+	 * go to out-of-line code to try to modify the HPTE
+	 */
+	andi.	r0,r31,_PAGE_HASHPTE
+	bne	ht64_modify_pte
+
+ht64_insert_pte:
+	/* Clear hpte bits in new pte (we also clear BUSY btw) and
+	 * add _PAGE_HASHPTE
+	 */
+	lis	r0,_PAGE_HPTEFLAGS@h
+	ori	r0,r0,_PAGE_HPTEFLAGS@l
+	andc	r30,r30,r0
+	ori	r30,r30,_PAGE_HASHPTE
+
+	/* Physical address in r5 */
+	rldicl	r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT
+	sldi	r5,r5,PAGE_SHIFT
+
+	/* Calculate primary group hash */
+	and	r0,r28,r27
+	rldicr	r3,r0,3,63-3	/* r0 = (hash & mask) << 3 */
+
+	/* Call ppc_md.hpte_insert */
+	ld	r6,STK_PARM(r4)(r1)	/* Retrieve new pp bits */
+	mr	r4,r29			/* Retrieve va */
+	li	r7,0			/* !bolted, !secondary */
+	li	r8,MMU_PAGE_64K
+_GLOBAL(ht64_call_hpte_insert1)
+	bl	.			/* patched by htab_finish_init() */
+	cmpdi	0,r3,0
+	bge	ht64_pte_insert_ok	/* Insertion successful */
+	cmpdi	0,r3,-2			/* Critical failure */
+	beq-	ht64_pte_insert_failure
+
+	/* Now try secondary slot */
+
+	/* Physical address in r5 */
+	rldicl	r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT
+	sldi	r5,r5,PAGE_SHIFT
+
+	/* Calculate secondary group hash */
+	andc	r0,r27,r28
+	rldicr	r3,r0,3,63-3	/* r0 = (~hash & mask) << 3 */
+
+	/* Call ppc_md.hpte_insert */
+	ld	r6,STK_PARM(r4)(r1)	/* Retrieve new pp bits */
+	mr	r4,r29			/* Retrieve va */
+	li	r7,HPTE_V_SECONDARY	/* !bolted, secondary */
+	li	r8,MMU_PAGE_64K
+_GLOBAL(ht64_call_hpte_insert2)
+	bl	.			/* patched by htab_finish_init() */
+	cmpdi	0,r3,0
+	bge+	ht64_pte_insert_ok	/* Insertion successful */
+	cmpdi	0,r3,-2			/* Critical failure */
+	beq-	ht64_pte_insert_failure
+
+	/* Both are full, we need to evict something */
+	mftb	r0
+	/* Pick a random group based on TB */
+	andi.	r0,r0,1
+	mr	r5,r28
+	bne	2f
+	not	r5,r5
+2:	and	r0,r5,r27
+	rldicr	r3,r0,3,63-3	/* r0 = (hash & mask) << 3 */
+	/* Call ppc_md.hpte_remove */
+_GLOBAL(ht64_call_hpte_remove)
+	bl	.			/* patched by htab_finish_init() */
+
+	/* Try all again */
+	b	ht64_insert_pte
+
+ht64_bail_ok:
+	li	r3,0
+	b	ht64_bail
+
+ht64_pte_insert_ok:
+	/* Insert slot number & secondary bit in PTE */
+	rldimi	r30,r3,12,63-15
+
+	/* Write out the PTE with a normal write
+	 * (maybe adding an eieio would still be good?)
+	 */
+ht64_write_out_pte:
+	ld	r6,STK_PARM(r6)(r1)
+	std	r30,0(r6)
+	li	r3, 0
+ht64_bail:
+	ld	r27,STK_REG(r27)(r1)
+	ld	r28,STK_REG(r28)(r1)
+	ld	r29,STK_REG(r29)(r1)
+	ld      r30,STK_REG(r30)(r1)
+	ld      r31,STK_REG(r31)(r1)
+	addi    r1,r1,STACKFRAMESIZE
+	ld      r0,16(r1)
+	mtlr    r0
+	blr
+
+ht64_modify_pte:
+	/* Keep PP bits in r4 and slot idx from the PTE around in r3 */
+	mr	r4,r3
+	rlwinm	r3,r31,32-12,29,31
+
+	/* Secondary group? If yes, get an inverted hash value */
+	mr	r5,r28
+	andi.	r0,r31,_PAGE_F_SECOND
+	beq	1f
+	not	r5,r5
+1:
+	/* Calculate proper slot value for ppc_md.hpte_updatepp */
+	and	r0,r5,r27
+	rldicr	r0,r0,3,63-3	/* r0 = (hash & mask) << 3 */
+	add	r3,r0,r3	/* add slot idx */
+
+	/* Call ppc_md.hpte_updatepp */
+	mr	r5,r29			/* va */
+	li	r6,MMU_PAGE_64K
+	ld	r7,STK_PARM(r8)(r1)	/* get "local" param */
+_GLOBAL(ht64_call_hpte_updatepp)
+	bl	.			/* patched by htab_finish_init() */
+
+	/* if we failed because typically the HPTE wasn't really here
+	 * we try an insertion.
+	 */
+	cmpdi	0,r3,-1
+	beq-	ht64_insert_pte
+
+	/* Clear the BUSY bit and Write out the PTE */
+	li	r0,_PAGE_BUSY
+	andc	r30,r30,r0
+	b	ht64_write_out_pte
+
+ht64_wrong_access:
+	/* Bail out clearing reservation */
+	stdcx.	r31,0,r6
+	li	r3,1
+	b	ht64_bail
+
+ht64_pte_insert_failure:
+	/* Bail out restoring old PTE */
+	ld	r6,STK_PARM(r6)(r1)
+	std	r31,0(r6)
+	li	r3,-1
+	b	ht64_bail
+
+
+#endif /* CONFIG_PPC_64K_PAGES */
+
+
+/*****************************************************************************
+ *                                                                           *
+ *           Huge pages implementation is in hugetlbpage.c                   *
+ *                                                                           *
+ *****************************************************************************/
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 174d145..d96bcfe 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -9,6 +9,9 @@
  * as published by the Free Software Foundation; either version
  * 2 of the License, or (at your option) any later version.
  */
+
+#undef DEBUG_LOW
+
 #include <linux/spinlock.h>
 #include <linux/bitops.h>
 #include <linux/threads.h>
@@ -22,11 +25,84 @@
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
 #include <asm/cputable.h>
+#include <asm/udbg.h>
+
+#ifdef DEBUG_LOW
+#define DBG_LOW(fmt...) udbg_printf(fmt)
+#else
+#define DBG_LOW(fmt...)
+#endif
 
 #define HPTE_LOCK_BIT 3
 
 static DEFINE_SPINLOCK(native_tlbie_lock);
 
+static inline void __tlbie(unsigned long va, unsigned int psize)
+{
+	unsigned int penc;
+
+	/* clear top 16 bits, non SLS segment */
+	va &= ~(0xffffULL << 48);
+
+	switch (psize) {
+	case MMU_PAGE_4K:
+		va &= ~0xffful;
+		asm volatile("tlbie %0,0" : : "r" (va) : "memory");
+		break;
+	default:
+		penc = mmu_psize_defs[psize].penc;
+		va &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
+		va |= (0x7f >> (8 - penc)) << 12;
+		asm volatile("tlbie %0,1" : : "r" (va) : "memory");
+		break;
+	}
+}
+
+static inline void __tlbiel(unsigned long va, unsigned int psize)
+{
+	unsigned int penc;
+
+	/* clear top 16 bits, non SLS segment */
+	va &= ~(0xffffULL << 48);
+
+	switch (psize) {
+	case MMU_PAGE_4K:
+		va &= ~0xffful;
+		asm volatile(".long 0x7c000224 | (%0 << 11) | (0 << 21)"
+			     : : "r"(va) : "memory");
+		break;
+	default:
+		penc = mmu_psize_defs[psize].penc;
+		va &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
+		va |= (0x7f >> (8 - penc)) << 12;
+		asm volatile(".long 0x7c000224 | (%0 << 11) | (1 << 21)"
+			     : : "r"(va) : "memory");
+		break;
+	}
+
+}
+
+static inline void tlbie(unsigned long va, int psize, int local)
+{
+	unsigned int use_local = local && cpu_has_feature(CPU_FTR_TLBIEL);
+	int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
+
+	if (use_local)
+		use_local = mmu_psize_defs[psize].tlbiel;
+	if (lock_tlbie && !use_local)
+		spin_lock(&native_tlbie_lock);
+	asm volatile("ptesync": : :"memory");
+	if (use_local) {
+		__tlbiel(va, psize);
+		asm volatile("ptesync": : :"memory");
+	} else {
+		__tlbie(va, psize);
+		asm volatile("eieio; tlbsync; ptesync": : :"memory");
+	}
+	if (lock_tlbie && !use_local)
+		spin_unlock(&native_tlbie_lock);
+}
+
 static inline void native_lock_hpte(hpte_t *hptep)
 {
 	unsigned long *word = &hptep->v;
@@ -48,13 +124,19 @@
 }
 
 long native_hpte_insert(unsigned long hpte_group, unsigned long va,
-			unsigned long prpn, unsigned long vflags,
-			unsigned long rflags)
+			unsigned long pa, unsigned long rflags,
+			unsigned long vflags, int psize)
 {
 	hpte_t *hptep = htab_address + hpte_group;
 	unsigned long hpte_v, hpte_r;
 	int i;
 
+	if (!(vflags & HPTE_V_BOLTED)) {
+		DBG_LOW("    insert(group=%lx, va=%016lx, pa=%016lx,"
+			" rflags=%lx, vflags=%lx, psize=%d)\n",
+			hpte_group, va, pa, rflags, vflags, psize);
+	}
+
 	for (i = 0; i < HPTES_PER_GROUP; i++) {
 		if (! (hptep->v & HPTE_V_VALID)) {
 			/* retry with lock held */
@@ -70,10 +152,13 @@
 	if (i == HPTES_PER_GROUP)
 		return -1;
 
-	hpte_v = (va >> 23) << HPTE_V_AVPN_SHIFT | vflags | HPTE_V_VALID;
-	if (vflags & HPTE_V_LARGE)
-		va &= ~(1UL << HPTE_V_AVPN_SHIFT);
-	hpte_r = (prpn << HPTE_R_RPN_SHIFT) | rflags;
+	hpte_v = hpte_encode_v(va, psize) | vflags | HPTE_V_VALID;
+	hpte_r = hpte_encode_r(pa, psize) | rflags;
+
+	if (!(vflags & HPTE_V_BOLTED)) {
+		DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n",
+			i, hpte_v, hpte_r);
+	}
 
 	hptep->r = hpte_r;
 	/* Guarantee the second dword is visible before the valid bit */
@@ -96,6 +181,8 @@
 	int slot_offset;
 	unsigned long hpte_v;
 
+	DBG_LOW("    remove(group=%lx)\n", hpte_group);
+
 	/* pick a random entry to start at */
 	slot_offset = mftb() & 0x7;
 
@@ -126,34 +213,51 @@
 	return i;
 }
 
-static inline void set_pp_bit(unsigned long pp, hpte_t *addr)
+static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
+				 unsigned long va, int psize, int local)
 {
-	unsigned long old;
-	unsigned long *p = &addr->r;
+	hpte_t *hptep = htab_address + slot;
+	unsigned long hpte_v, want_v;
+	int ret = 0;
 
-	__asm__ __volatile__(
-	"1:	ldarx	%0,0,%3\n\
-		rldimi	%0,%2,0,61\n\
-		stdcx.	%0,0,%3\n\
-		bne	1b"
-	: "=&r" (old), "=m" (*p)
-	: "r" (pp), "r" (p), "m" (*p)
-	: "cc");
+	want_v = hpte_encode_v(va, psize);
+
+	DBG_LOW("    update(va=%016lx, avpnv=%016lx, hash=%016lx, newpp=%x)",
+		va, want_v & HPTE_V_AVPN, slot, newpp);
+
+	native_lock_hpte(hptep);
+
+	hpte_v = hptep->v;
+
+	/* Even if we miss, we need to invalidate the TLB */
+	if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) {
+		DBG_LOW(" -> miss\n");
+		native_unlock_hpte(hptep);
+		ret = -1;
+	} else {
+		DBG_LOW(" -> hit\n");
+		/* Update the HPTE */
+		hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) |
+			(newpp & (HPTE_R_PP | HPTE_R_N));
+		native_unlock_hpte(hptep);
+	}
+
+	/* Ensure it is out of the tlb too. */
+	tlbie(va, psize, local);
+
+	return ret;
 }
 
-/*
- * Only works on small pages. Yes its ugly to have to check each slot in
- * the group but we only use this during bootup.
- */
-static long native_hpte_find(unsigned long vpn)
+static long native_hpte_find(unsigned long va, int psize)
 {
 	hpte_t *hptep;
 	unsigned long hash;
 	unsigned long i, j;
 	long slot;
-	unsigned long hpte_v;
+	unsigned long want_v, hpte_v;
 
-	hash = hpt_hash(vpn, 0);
+	hash = hpt_hash(va, mmu_psize_defs[psize].shift);
+	want_v = hpte_encode_v(va, psize);
 
 	for (j = 0; j < 2; j++) {
 		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
@@ -161,7 +265,7 @@
 			hptep = htab_address + slot;
 			hpte_v = hptep->v;
 
-			if ((HPTE_V_AVPN_VAL(hpte_v) == (vpn >> 11))
+			if (HPTE_V_COMPARE(hpte_v, want_v)
 			    && (hpte_v & HPTE_V_VALID)
 			    && ( !!(hpte_v & HPTE_V_SECONDARY) == j)) {
 				/* HPTE matches */
@@ -177,118 +281,90 @@
 	return -1;
 }
 
-static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
-				 unsigned long va, int large, int local)
-{
-	hpte_t *hptep = htab_address + slot;
-	unsigned long hpte_v;
-	unsigned long avpn = va >> 23;
-	int ret = 0;
-
-	if (large)
-		avpn &= ~1;
-
-	native_lock_hpte(hptep);
-
-	hpte_v = hptep->v;
-
-	/* Even if we miss, we need to invalidate the TLB */
-	if ((HPTE_V_AVPN_VAL(hpte_v) != avpn)
-	    || !(hpte_v & HPTE_V_VALID)) {
-		native_unlock_hpte(hptep);
-		ret = -1;
-	} else {
-		set_pp_bit(newpp, hptep);
-		native_unlock_hpte(hptep);
-	}
-
-	/* Ensure it is out of the tlb too */
-	if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) {
-		tlbiel(va);
-	} else {
-		int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
-
-		if (lock_tlbie)
-			spin_lock(&native_tlbie_lock);
-		tlbie(va, large);
-		if (lock_tlbie)
-			spin_unlock(&native_tlbie_lock);
-	}
-
-	return ret;
-}
-
 /*
  * Update the page protection bits. Intended to be used to create
  * guard pages for kernel data structures on pages which are bolted
  * in the HPT. Assumes pages being operated on will not be stolen.
- * Does not work on large pages.
  *
  * No need to lock here because we should be the only user.
  */
-static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea)
+static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
+				       int psize)
 {
-	unsigned long vsid, va, vpn, flags = 0;
+	unsigned long vsid, va;
 	long slot;
 	hpte_t *hptep;
-	int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
 
 	vsid = get_kernel_vsid(ea);
 	va = (vsid << 28) | (ea & 0x0fffffff);
-	vpn = va >> PAGE_SHIFT;
 
-	slot = native_hpte_find(vpn);
+	slot = native_hpte_find(va, psize);
 	if (slot == -1)
 		panic("could not find page to bolt\n");
 	hptep = htab_address + slot;
 
-	set_pp_bit(newpp, hptep);
+	/* Update the HPTE */
+	hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) |
+		(newpp & (HPTE_R_PP | HPTE_R_N));
 
-	/* Ensure it is out of the tlb too */
-	if (lock_tlbie)
-		spin_lock_irqsave(&native_tlbie_lock, flags);
-	tlbie(va, 0);
-	if (lock_tlbie)
-		spin_unlock_irqrestore(&native_tlbie_lock, flags);
+	/* Ensure it is out of the tlb too. */
+	tlbie(va, psize, 0);
 }
 
 static void native_hpte_invalidate(unsigned long slot, unsigned long va,
-				    int large, int local)
+				   int psize, int local)
 {
 	hpte_t *hptep = htab_address + slot;
 	unsigned long hpte_v;
-	unsigned long avpn = va >> 23;
+	unsigned long want_v;
 	unsigned long flags;
-	int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
-
-	if (large)
-		avpn &= ~1;
 
 	local_irq_save(flags);
-	native_lock_hpte(hptep);
 
+	DBG_LOW("    invalidate(va=%016lx, hash: %x)\n", va, slot);
+
+	want_v = hpte_encode_v(va, psize);
+	native_lock_hpte(hptep);
 	hpte_v = hptep->v;
 
 	/* Even if we miss, we need to invalidate the TLB */
-	if ((HPTE_V_AVPN_VAL(hpte_v) != avpn)
-	    || !(hpte_v & HPTE_V_VALID)) {
+	if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
 		native_unlock_hpte(hptep);
-	} else {
+	else
 		/* Invalidate the hpte. NOTE: this also unlocks it */
 		hptep->v = 0;
+
+	/* Invalidate the TLB */
+	tlbie(va, psize, local);
+
+	local_irq_restore(flags);
+}
+
+/*
+ * XXX This needs fixing based on page size. It's only used by
+ * native_hpte_clear() for now, which needs fixing too, so they
+ * make a good pair...
+ */
+static unsigned long slot2va(unsigned long hpte_v, unsigned long slot)
+{
+	unsigned long avpn = HPTE_V_AVPN_VAL(hpte_v);
+	unsigned long va;
+
+	va = avpn << 23;
+
+	if (! (hpte_v & HPTE_V_LARGE)) {
+		unsigned long vpi, pteg;
+
+		pteg = slot / HPTES_PER_GROUP;
+		if (hpte_v & HPTE_V_SECONDARY)
+			pteg = ~pteg;
+
+		vpi = ((va >> 28) ^ pteg) & htab_hash_mask;
+
+		va |= vpi << PAGE_SHIFT;
 	}
 
-	/* Invalidate the tlb */
-	if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) {
-		tlbiel(va);
-	} else {
-		if (lock_tlbie)
-			spin_lock(&native_tlbie_lock);
-		tlbie(va, large);
-		if (lock_tlbie)
-			spin_unlock(&native_tlbie_lock);
-	}
-	local_irq_restore(flags);
+	return va;
 }
 
 /*
@@ -298,6 +374,8 @@
  *
  * TODO: add batching support when enabled.  remember, no dynamic memory here,
  * athough there is the control page available...
+ *
+ * XXX FIXME: 4k only for now !
  */
 static void native_hpte_clear(void)
 {
@@ -327,7 +405,7 @@
 
 		if (hpte_v & HPTE_V_VALID) {
 			hptep->v = 0;
-			tlbie(slot2va(hpte_v, slot), hpte_v & HPTE_V_LARGE);
+			tlbie(slot2va(hpte_v, slot), MMU_PAGE_4K, 0);
 		}
 	}
 
@@ -335,59 +413,59 @@
 	local_irq_restore(flags);
 }
 
+/*
+ * Batched hash table flush; we batch the tlbie's to avoid taking/releasing
+ * the lock all the time
+ */
 static void native_flush_hash_range(unsigned long number, int local)
 {
-	unsigned long va, vpn, hash, secondary, slot, flags, avpn;
-	int i, j;
+	unsigned long va, hash, index, hidx, shift, slot;
 	hpte_t *hptep;
 	unsigned long hpte_v;
+	unsigned long want_v;
+	unsigned long flags;
+	real_pte_t pte;
 	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
-	unsigned long large = batch->large;
+	unsigned long psize = batch->psize;
+	int i;
 
 	local_irq_save(flags);
 
-	j = 0;
 	for (i = 0; i < number; i++) {
-		va = batch->vaddr[j];
-		if (large)
-			vpn = va >> HPAGE_SHIFT;
-		else
-			vpn = va >> PAGE_SHIFT;
-		hash = hpt_hash(vpn, large);
-		secondary = (pte_val(batch->pte[i]) & _PAGE_SECONDARY) >> 15;
-		if (secondary)
-			hash = ~hash;
-		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-		slot += (pte_val(batch->pte[i]) & _PAGE_GROUP_IX) >> 12;
+		va = batch->vaddr[i];
+		pte = batch->pte[i];
 
-		hptep = htab_address + slot;
-
-		avpn = va >> 23;
-		if (large)
-			avpn &= ~0x1UL;
-
-		native_lock_hpte(hptep);
-
-		hpte_v = hptep->v;
-
-		/* Even if we miss, we need to invalidate the TLB */
-		if ((HPTE_V_AVPN_VAL(hpte_v) != avpn)
-		    || !(hpte_v & HPTE_V_VALID)) {
-			native_unlock_hpte(hptep);
-		} else {
-			/* Invalidate the hpte. NOTE: this also unlocks it */
-			hptep->v = 0;
-		}
-
-		j++;
+		pte_iterate_hashed_subpages(pte, psize, va, index, shift) {
+			hash = hpt_hash(va, shift);
+			hidx = __rpte_to_hidx(pte, index);
+			if (hidx & _PTEIDX_SECONDARY)
+				hash = ~hash;
+			slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+			slot += hidx & _PTEIDX_GROUP_IX;
+			hptep = htab_address + slot;
+			want_v = hpte_encode_v(va, psize);
+			native_lock_hpte(hptep);
+			hpte_v = hptep->v;
+			if (!HPTE_V_COMPARE(hpte_v, want_v) ||
+			    !(hpte_v & HPTE_V_VALID))
+				native_unlock_hpte(hptep);
+			else
+				hptep->v = 0;
+		} pte_iterate_hashed_end();
 	}
 
-	if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) {
+	if (cpu_has_feature(CPU_FTR_TLBIEL) &&
+	    mmu_psize_defs[psize].tlbiel && local) {
 		asm volatile("ptesync":::"memory");
+		for (i = 0; i < number; i++) {
+			va = batch->vaddr[i];
+			pte = batch->pte[i];
 
-		for (i = 0; i < j; i++)
-			__tlbiel(batch->vaddr[i]);
-
+			pte_iterate_hashed_subpages(pte, psize, va, index,
+						    shift) {
+				__tlbiel(va, psize);
+			} pte_iterate_hashed_end();
+		}
 		asm volatile("ptesync":::"memory");
 	} else {
 		int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
@@ -396,10 +474,15 @@
 			spin_lock(&native_tlbie_lock);
 
 		asm volatile("ptesync":::"memory");
+		for (i = 0; i < number; i++) {
+			va = batch->vaddr[i];
+			pte = batch->pte[i];
 
-		for (i = 0; i < j; i++)
-			__tlbie(batch->vaddr[i], large);
-
+			pte_iterate_hashed_subpages(pte, psize, va, index,
+						    shift) {
+				__tlbie(va, psize);
+			} pte_iterate_hashed_end();
+		}
 		asm volatile("eieio; tlbsync; ptesync":::"memory");
 
 		if (lock_tlbie)
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 6e9e05c..b2f3dbc 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -19,6 +19,7 @@
  */
 
 #undef DEBUG
+#undef DEBUG_LOW
 
 #include <linux/config.h>
 #include <linux/spinlock.h>
@@ -59,6 +60,15 @@
 #define DBG(fmt...)
 #endif
 
+#ifdef DEBUG_LOW
+#define DBG_LOW(fmt...) udbg_printf(fmt)
+#else
+#define DBG_LOW(fmt...)
+#endif
+
+#define KB (1024)
+#define MB (1024*KB)
+
 /*
  * Note:  pte   --> Linux PTE
  *        HPTE  --> PowerPC Hashed Page Table Entry
@@ -77,91 +87,290 @@
 
 hpte_t *htab_address;
 unsigned long htab_hash_mask;
-
 unsigned long _SDR1;
+struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
+int mmu_linear_psize = MMU_PAGE_4K;
+int mmu_virtual_psize = MMU_PAGE_4K;
+#ifdef CONFIG_HUGETLB_PAGE
+int mmu_huge_psize = MMU_PAGE_16M;
+unsigned int HPAGE_SHIFT;
+#endif
 
-#define KB (1024)
-#define MB (1024*KB)
+/* These are definitions of page-size arrays to be used when none
+ * is provided by the firmware.
+ */
 
-static inline void loop_forever(void)
+/* Pre-POWER4 CPUs (4k pages only)
+ */
+struct mmu_psize_def mmu_psize_defaults_old[] = {
+	[MMU_PAGE_4K] = {
+		.shift	= 12,
+		.sllp	= 0,
+		.penc	= 0,
+		.avpnm	= 0,
+		.tlbiel = 0,
+	},
+};
+
+/* POWER4, GPUL, POWER5
+ *
+ * Support for 16Mb large pages
+ */
+struct mmu_psize_def mmu_psize_defaults_gp[] = {
+	[MMU_PAGE_4K] = {
+		.shift	= 12,
+		.sllp	= 0,
+		.penc	= 0,
+		.avpnm	= 0,
+		.tlbiel = 1,
+	},
+	[MMU_PAGE_16M] = {
+		.shift	= 24,
+		.sllp	= SLB_VSID_L,
+		.penc	= 0,
+		.avpnm	= 0x1UL,
+		.tlbiel = 0,
+	},
+};
+
+
+int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
+		      unsigned long pstart, unsigned long mode, int psize)
 {
-	volatile unsigned long x = 1;
-	for(;x;x|=1)
-		;
-}
-
-static inline void create_pte_mapping(unsigned long start, unsigned long end,
-				      unsigned long mode, int large)
-{
-	unsigned long addr;
-	unsigned int step;
+	unsigned long vaddr, paddr;
+	unsigned int step, shift;
 	unsigned long tmp_mode;
-	unsigned long vflags;
+	int ret = 0;
 
-	if (large) {
-		step = 16*MB;
-		vflags = HPTE_V_BOLTED | HPTE_V_LARGE;
-	} else {
-		step = 4*KB;
-		vflags = HPTE_V_BOLTED;
-	}
+	shift = mmu_psize_defs[psize].shift;
+	step = 1 << shift;
 
-	for (addr = start; addr < end; addr += step) {
+	for (vaddr = vstart, paddr = pstart; vaddr < vend;
+	     vaddr += step, paddr += step) {
 		unsigned long vpn, hash, hpteg;
-		unsigned long vsid = get_kernel_vsid(addr);
-		unsigned long va = (vsid << 28) | (addr & 0xfffffff);
-		int ret = -1;
+		unsigned long vsid = get_kernel_vsid(vaddr);
+		unsigned long va = (vsid << 28) | (vaddr & 0x0fffffff);
 
-		if (large)
-			vpn = va >> HPAGE_SHIFT;
-		else
-			vpn = va >> PAGE_SHIFT;
-
-
+		vpn = va >> shift;
 		tmp_mode = mode;
 		
 		/* Make non-kernel text non-executable */
-		if (!in_kernel_text(addr))
-			tmp_mode = mode | HW_NO_EXEC;
+		if (!in_kernel_text(vaddr))
+			tmp_mode = mode | HPTE_R_N;
 
-		hash = hpt_hash(vpn, large);
-
+		hash = hpt_hash(va, shift);
 		hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
 
+		/* The crap below can be cleaned once ppc_md.probe() can
+		 * set up the hash callbacks, thus we can just use the
+		 * normal insert callback here.
+		 */
 #ifdef CONFIG_PPC_ISERIES
-		if (systemcfg->platform & PLATFORM_ISERIES_LPAR)
-			ret = iSeries_hpte_bolt_or_insert(hpteg, va,
-				virt_to_abs(addr) >> PAGE_SHIFT,
-				vflags, tmp_mode);
+		if (systemcfg->platform == PLATFORM_ISERIES_LPAR)
+			ret = iSeries_hpte_insert(hpteg, va,
+						  virt_to_abs(paddr),
+						  tmp_mode,
+						  HPTE_V_BOLTED,
+						  psize);
 		else
 #endif
 #ifdef CONFIG_PPC_PSERIES
 		if (systemcfg->platform & PLATFORM_LPAR)
 			ret = pSeries_lpar_hpte_insert(hpteg, va,
-				virt_to_abs(addr) >> PAGE_SHIFT,
-				vflags, tmp_mode);
+						       virt_to_abs(paddr),
+						       tmp_mode,
+						       HPTE_V_BOLTED,
+						       psize);
 		else
 #endif
 #ifdef CONFIG_PPC_MULTIPLATFORM
 			ret = native_hpte_insert(hpteg, va,
-				virt_to_abs(addr) >> PAGE_SHIFT,
-				vflags, tmp_mode);
+						 virt_to_abs(paddr),
+						 tmp_mode, HPTE_V_BOLTED,
+						 psize);
 #endif
-
-		if (ret == -1) {
-			ppc64_terminate_msg(0x20, "create_pte_mapping");
-			loop_forever();
-		}
+		if (ret < 0)
+			break;
 	}
+	return ret < 0 ? ret : 0;
 }
 
-static unsigned long get_hashtable_size(void)
+static int __init htab_dt_scan_page_sizes(unsigned long node,
+					  const char *uname, int depth,
+					  void *data)
+{
+	char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+	u32 *prop;
+	unsigned long size = 0;
+
+	/* We are scanning "cpu" nodes only */
+	if (type == NULL || strcmp(type, "cpu") != 0)
+		return 0;
+
+	prop = (u32 *)of_get_flat_dt_prop(node,
+					  "ibm,segment-page-sizes", &size);
+	if (prop != NULL) {
+		DBG("Page sizes from device-tree:\n");
+		size /= 4;
+		cur_cpu_spec->cpu_features &= ~(CPU_FTR_16M_PAGE);
+		while(size > 0) {
+			unsigned int shift = prop[0];
+			unsigned int slbenc = prop[1];
+			unsigned int lpnum = prop[2];
+			unsigned int lpenc = 0;
+			struct mmu_psize_def *def;
+			int idx = -1;
+
+			size -= 3; prop += 3;
+			while(size > 0 && lpnum) {
+				if (prop[0] == shift)
+					lpenc = prop[1];
+				prop += 2; size -= 2;
+				lpnum--;
+			}
+			switch(shift) {
+			case 0xc:
+				idx = MMU_PAGE_4K;
+				break;
+			case 0x10:
+				idx = MMU_PAGE_64K;
+				break;
+			case 0x14:
+				idx = MMU_PAGE_1M;
+				break;
+			case 0x18:
+				idx = MMU_PAGE_16M;
+				cur_cpu_spec->cpu_features |= CPU_FTR_16M_PAGE;
+				break;
+			case 0x22:
+				idx = MMU_PAGE_16G;
+				break;
+			}
+			if (idx < 0)
+				continue;
+			def = &mmu_psize_defs[idx];
+			def->shift = shift;
+			if (shift <= 23)
+				def->avpnm = 0;
+			else
+				def->avpnm = (1 << (shift - 23)) - 1;
+			def->sllp = slbenc;
+			def->penc = lpenc;
+			/* We don't know for sure what's up with tlbiel, so
+			 * for now we only set it for 4K and 64K pages
+			 */
+			if (idx == MMU_PAGE_4K || idx == MMU_PAGE_64K)
+				def->tlbiel = 1;
+			else
+				def->tlbiel = 0;
+
+			DBG(" %d: shift=%02x, sllp=%04x, avpnm=%08x, "
+			    "tlbiel=%d, penc=%d\n",
+			    idx, shift, def->sllp, def->avpnm, def->tlbiel,
+			    def->penc);
+		}
+		return 1;
+	}
+	return 0;
+}
+
+
+static void __init htab_init_page_sizes(void)
+{
+	int rc;
+
+	/* Default to 4K pages only */
+	memcpy(mmu_psize_defs, mmu_psize_defaults_old,
+	       sizeof(mmu_psize_defaults_old));
+
+	/*
+	 * Try to find the available page sizes in the device-tree
+	 */
+	rc = of_scan_flat_dt(htab_dt_scan_page_sizes, NULL);
+	if (rc != 0)  /* Found */
+		goto found;
+
+	/*
+	 * Not in the device-tree; let's fall back on the known size
+	 * list for 16M-capable GP & GR
+	 */
+	if ((systemcfg->platform != PLATFORM_ISERIES_LPAR) &&
+	    cpu_has_feature(CPU_FTR_16M_PAGE))
+		memcpy(mmu_psize_defs, mmu_psize_defaults_gp,
+		       sizeof(mmu_psize_defaults_gp));
+ found:
+	/*
+	 * Pick a size for the linear mapping. Currently, we only support
+	 * 16M, 1M and 4K which is the default
+	 */
+	if (mmu_psize_defs[MMU_PAGE_16M].shift)
+		mmu_linear_psize = MMU_PAGE_16M;
+	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
+		mmu_linear_psize = MMU_PAGE_1M;
+
+	/*
+	 * Pick a size for the ordinary pages. Default is 4K; we support
+	 * 64K if cache-inhibited large pages are supported by the
+	 * processor
+	 */
+#ifdef CONFIG_PPC_64K_PAGES
+	if (mmu_psize_defs[MMU_PAGE_64K].shift &&
+	    cpu_has_feature(CPU_FTR_CI_LARGE_PAGE))
+		mmu_virtual_psize = MMU_PAGE_64K;
+#endif
+
+	printk(KERN_INFO "Page orders: linear mapping = %d, others = %d\n",
+	       mmu_psize_defs[mmu_linear_psize].shift,
+	       mmu_psize_defs[mmu_virtual_psize].shift);
+
+#ifdef CONFIG_HUGETLB_PAGE
+	/* Init large page size. Currently, we pick 16M or 1M depending
+	 * on what is available
+	 */
+	if (mmu_psize_defs[MMU_PAGE_16M].shift)
+		mmu_huge_psize = MMU_PAGE_16M;
+	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
+		mmu_huge_psize = MMU_PAGE_1M;
+
+	/* Calculate HPAGE_SHIFT and sanity check it */
+	if (mmu_psize_defs[mmu_huge_psize].shift > 16 &&
+	    mmu_psize_defs[mmu_huge_psize].shift < 28)
+		HPAGE_SHIFT = mmu_psize_defs[mmu_huge_psize].shift;
+	else
+		HPAGE_SHIFT = 0; /* No huge pages dude ! */
+#endif /* CONFIG_HUGETLB_PAGE */
+}
+
+static int __init htab_dt_scan_pftsize(unsigned long node,
+				       const char *uname, int depth,
+				       void *data)
+{
+	char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+	u32 *prop;
+
+	/* We are scanning "cpu" nodes only */
+	if (type == NULL || strcmp(type, "cpu") != 0)
+		return 0;
+
+	prop = (u32 *)of_get_flat_dt_prop(node, "ibm,pft-size", NULL);
+	if (prop != NULL) {
+		/* pft_size[0] is the NUMA CEC cookie */
+		ppc64_pft_size = prop[1];
+		return 1;
+	}
+	return 0;
+}
+
+static unsigned long __init htab_get_table_size(void)
 {
 	unsigned long rnd_mem_size, pteg_count;
 
-	/* If hash size wasn't obtained in prom.c, we calculate it now based on
-	 * the total RAM size
+	/* If hash size isn't already provided by the platform, we try to
+	 * retrieve it from the device-tree. If it's not there either, we
+	 * calculate it now based on the total RAM size
 	 */
+	if (ppc64_pft_size == 0)
+		of_scan_flat_dt(htab_dt_scan_pftsize, NULL);
 	if (ppc64_pft_size)
 		return 1UL << ppc64_pft_size;
 
@@ -181,17 +390,21 @@
 	unsigned long table, htab_size_bytes;
 	unsigned long pteg_count;
 	unsigned long mode_rw;
-	int i, use_largepages = 0;
 	unsigned long base = 0, size = 0;
+	int i;
+
 	extern unsigned long tce_alloc_start, tce_alloc_end;
 
 	DBG(" -> htab_initialize()\n");
 
+	/* Initialize page sizes */
+	htab_init_page_sizes();
+
 	/*
 	 * Calculate the required size of the htab.  We want the number of
 	 * PTEGs to equal one half the number of real pages.
 	 */ 
-	htab_size_bytes = get_hashtable_size();
+	htab_size_bytes = htab_get_table_size();
 	pteg_count = htab_size_bytes >> 7;
 
 	/* For debug, make the HTAB 1/8 as big as it normally would be. */
@@ -211,14 +424,11 @@
 		 * the absolute address space.
 		 */
 		table = lmb_alloc(htab_size_bytes, htab_size_bytes);
+		BUG_ON(table == 0);
 
 		DBG("Hash table allocated at %lx, size: %lx\n", table,
 		    htab_size_bytes);
 
-		if ( !table ) {
-			ppc64_terminate_msg(0x20, "hpt space");
-			loop_forever();
-		}
 		htab_address = abs_to_virt(table);
 
 		/* htab absolute addr + encoded htabsize */
@@ -234,8 +444,6 @@
 	 * _NOT_ map it to avoid cache paradoxes as it's remapped non
 	 * cacheable later on
 	 */
-	if (cpu_has_feature(CPU_FTR_16M_PAGE))
-		use_largepages = 1;
 
 	/* create bolted the linear mapping in the hash table */
 	for (i=0; i < lmb.memory.cnt; i++) {
@@ -246,27 +454,32 @@
 
 #ifdef CONFIG_U3_DART
 		/* Do not map the DART space. Fortunately, it will be aligned
-		 * in such a way that it will not cross two lmb regions and will
-		 * fit within a single 16Mb page.
-		 * The DART space is assumed to be a full 16Mb region even if we
-		 * only use 2Mb of that space. We will use more of it later for
-		 * AGP GART. We have to use a full 16Mb large page.
+		 * in such a way that it will not cross two lmb regions and
+		 * will fit within a single 16Mb page.
+		 * The DART space is assumed to be a full 16Mb region even if
+		 * we only use 2Mb of that space. We will use more of it later
+		 * for AGP GART. We have to use a full 16Mb large page.
 		 */
 		DBG("DART base: %lx\n", dart_tablebase);
 
 		if (dart_tablebase != 0 && dart_tablebase >= base
 		    && dart_tablebase < (base + size)) {
 			if (base != dart_tablebase)
-				create_pte_mapping(base, dart_tablebase, mode_rw,
-						   use_largepages);
+				BUG_ON(htab_bolt_mapping(base, dart_tablebase,
+							 base, mode_rw,
+							 mmu_linear_psize));
 			if ((base + size) > (dart_tablebase + 16*MB))
-				create_pte_mapping(dart_tablebase + 16*MB, base + size,
-						   mode_rw, use_largepages);
+				BUG_ON(htab_bolt_mapping(dart_tablebase+16*MB,
+							 base + size,
+							 dart_tablebase+16*MB,
+							 mode_rw,
+							 mmu_linear_psize));
 			continue;
 		}
 #endif /* CONFIG_U3_DART */
-		create_pte_mapping(base, base + size, mode_rw, use_largepages);
-	}
+		BUG_ON(htab_bolt_mapping(base, base + size, base,
+					 mode_rw, mmu_linear_psize));
+       }
 
 	/*
 	 * If we have a memory_limit and we've allocated TCEs then we need to
@@ -282,8 +495,9 @@
 		if (base + size >= tce_alloc_start)
 			tce_alloc_start = base + size + 1;
 
-		create_pte_mapping(tce_alloc_start, tce_alloc_end,
-			mode_rw, use_largepages);
+ 		BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end,
+ 					 tce_alloc_start, mode_rw,
+					 mmu_linear_psize));
 	}
 
 	DBG(" <- htab_initialize()\n");
@@ -298,9 +512,6 @@
 {
 	struct page *page;
 
-	if (!pfn_valid(pte_pfn(pte)))
-		return pp;
-
 	page = pte_page(pte);
 
 	/* page is dirty */
@@ -309,7 +520,7 @@
 			__flush_dcache_icache(page_address(page));
 			set_bit(PG_arch_1, &page->flags);
 		} else
-			pp |= HW_NO_EXEC;
+			pp |= HPTE_R_N;
 	}
 	return pp;
 }
@@ -325,94 +536,169 @@
 	unsigned long vsid;
 	struct mm_struct *mm;
 	pte_t *ptep;
-	int ret;
-	int user_region = 0;
-	int local = 0;
 	cpumask_t tmp;
+	int rc, user_region = 0, local = 0;
 
-	if ((ea & ~REGION_MASK) >= PGTABLE_RANGE)
-		return 1;
+	DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n",
+		ea, access, trap);
 
+	if ((ea & ~REGION_MASK) >= PGTABLE_RANGE) {
+		DBG_LOW(" out of pgtable range !\n");
+ 		return 1;
+	}
+
+	/* Get region & vsid */
  	switch (REGION_ID(ea)) {
 	case USER_REGION_ID:
 		user_region = 1;
 		mm = current->mm;
-		if (! mm)
+		if (! mm) {
+			DBG_LOW(" user region with no mm !\n");
 			return 1;
-
+		}
 		vsid = get_vsid(mm->context.id, ea);
 		break;
 	case VMALLOC_REGION_ID:
 		mm = &init_mm;
 		vsid = get_kernel_vsid(ea);
 		break;
-#if 0
-	case KERNEL_REGION_ID:
-		/*
-		 * Should never get here - entire 0xC0... region is bolted.
-		 * Send the problem up to do_page_fault 
-		 */
-#endif
 	default:
 		/* Not a valid range
 		 * Send the problem up to do_page_fault 
 		 */
 		return 1;
-		break;
 	}
+	DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid);
 
+	/* Get pgdir */
 	pgdir = mm->pgd;
-
 	if (pgdir == NULL)
 		return 1;
 
+	/* Check CPU locality */
 	tmp = cpumask_of_cpu(smp_processor_id());
 	if (user_region && cpus_equal(mm->cpu_vm_mask, tmp))
 		local = 1;
 
-	/* Is this a huge page ? */
-	if (unlikely(in_hugepage_area(mm->context, ea)))
-		ret = hash_huge_page(mm, access, ea, vsid, local);
-	else {
-		ptep = find_linux_pte(pgdir, ea);
-		if (ptep == NULL)
-			return 1;
-		ret = __hash_page(ea, access, vsid, ptep, trap, local);
+	/* Handle hugepage regions */
+	if (unlikely(in_hugepage_area(mm->context, ea))) {
+		DBG_LOW(" -> huge page !\n");
+		return hash_huge_page(mm, access, ea, vsid, local);
 	}
 
-	return ret;
+	/* Get PTE and page size from page tables */
+	ptep = find_linux_pte(pgdir, ea);
+	if (ptep == NULL || !pte_present(*ptep)) {
+		DBG_LOW(" no PTE !\n");
+		return 1;
+	}
+
+#ifndef CONFIG_PPC_64K_PAGES
+	DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
+#else
+	DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep),
+		pte_val(*(ptep + PTRS_PER_PTE)));
+#endif
+	/* Pre-check access permissions (will be re-checked atomically
+	 * in __hash_page_XX but this pre-check is a fast path
+	 */
+	if (access & ~pte_val(*ptep)) {
+		DBG_LOW(" no access !\n");
+		return 1;
+	}
+
+	/* Do actual hashing */
+#ifndef CONFIG_PPC_64K_PAGES
+	rc = __hash_page_4K(ea, access, vsid, ptep, trap, local);
+#else
+	if (mmu_virtual_psize == MMU_PAGE_64K)
+		rc = __hash_page_64K(ea, access, vsid, ptep, trap, local);
+	else
+		rc = __hash_page_4K(ea, access, vsid, ptep, trap, local);
+#endif /* CONFIG_PPC_64K_PAGES */
+
+#ifndef CONFIG_PPC_64K_PAGES
+	DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep));
+#else
+	DBG_LOW(" o-pte: %016lx %016lx\n", pte_val(*ptep),
+		pte_val(*(ptep + PTRS_PER_PTE)));
+#endif
+	DBG_LOW(" -> rc=%d\n", rc);
+	return rc;
 }
 
-void flush_hash_page(unsigned long va, pte_t pte, int local)
+void hash_preload(struct mm_struct *mm, unsigned long ea,
+		  unsigned long access, unsigned long trap)
 {
-	unsigned long vpn, hash, secondary, slot;
-	unsigned long huge = pte_huge(pte);
+	unsigned long vsid;
+	void *pgdir;
+	pte_t *ptep;
+	cpumask_t mask;
+	unsigned long flags;
+	int local = 0;
 
-	if (huge)
-		vpn = va >> HPAGE_SHIFT;
+	/* We don't want huge pages prefaulted for now
+	 */
+	if (unlikely(in_hugepage_area(mm->context, ea)))
+		return;
+
+	DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx,"
+		" trap=%lx\n", mm, mm->pgd, ea, access, trap);
+
+	/* Get PTE, VSID, access mask */
+	pgdir = mm->pgd;
+	if (pgdir == NULL)
+		return;
+	ptep = find_linux_pte(pgdir, ea);
+	if (!ptep)
+		return;
+	vsid = get_vsid(mm->context.id, ea);
+
+	/* Hash it in */
+	local_irq_save(flags);
+	mask = cpumask_of_cpu(smp_processor_id());
+	if (cpus_equal(mm->cpu_vm_mask, mask))
+		local = 1;
+#ifndef CONFIG_PPC_64K_PAGES
+	__hash_page_4K(ea, access, vsid, ptep, trap, local);
+#else
+	if (mmu_virtual_psize == MMU_PAGE_64K)
+		__hash_page_64K(ea, access, vsid, ptep, trap, local);
 	else
-		vpn = va >> PAGE_SHIFT;
-	hash = hpt_hash(vpn, huge);
-	secondary = (pte_val(pte) & _PAGE_SECONDARY) >> 15;
-	if (secondary)
-		hash = ~hash;
-	slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-	slot += (pte_val(pte) & _PAGE_GROUP_IX) >> 12;
+		__hash_page_4K(ea, access, vsid, ptep, trap, local);
+#endif /* CONFIG_PPC_64K_PAGES */
+	local_irq_restore(flags);
+}
 
-	ppc_md.hpte_invalidate(slot, va, huge, local);
+void flush_hash_page(unsigned long va, real_pte_t pte, int psize, int local)
+{
+	unsigned long hash, index, shift, hidx, slot;
+
+	DBG_LOW("flush_hash_page(va=%016x)\n", va);
+	pte_iterate_hashed_subpages(pte, psize, va, index, shift) {
+		hash = hpt_hash(va, shift);
+		hidx = __rpte_to_hidx(pte, index);
+		if (hidx & _PTEIDX_SECONDARY)
+			hash = ~hash;
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		slot += hidx & _PTEIDX_GROUP_IX;
+		DBG_LOW(" sub %d: hash=%x, hidx=%x\n", index, slot, hidx);
+		ppc_md.hpte_invalidate(slot, va, psize, local);
+	} pte_iterate_hashed_end();
 }
 
 void flush_hash_range(unsigned long number, int local)
 {
-	if (ppc_md.flush_hash_range) {
+	if (ppc_md.flush_hash_range)
 		ppc_md.flush_hash_range(number, local);
-	} else {
+	else {
 		int i;
 		struct ppc64_tlb_batch *batch =
 			&__get_cpu_var(ppc64_tlb_batch);
 
 		for (i = 0; i < number; i++)
-			flush_hash_page(batch->vaddr[i], batch->pte[i], local);
+			flush_hash_page(batch->vaddr[i], batch->pte[i],
+					batch->psize, local);
 	}
 }
 
@@ -452,6 +738,18 @@
 	extern unsigned int *htab_call_hpte_remove;
 	extern unsigned int *htab_call_hpte_updatepp;
 
+#ifdef CONFIG_PPC_64K_PAGES
+	extern unsigned int *ht64_call_hpte_insert1;
+	extern unsigned int *ht64_call_hpte_insert2;
+	extern unsigned int *ht64_call_hpte_remove;
+	extern unsigned int *ht64_call_hpte_updatepp;
+
+	make_bl(ht64_call_hpte_insert1, ppc_md.hpte_insert);
+	make_bl(ht64_call_hpte_insert2, ppc_md.hpte_insert);
+	make_bl(ht64_call_hpte_remove, ppc_md.hpte_remove);
+	make_bl(ht64_call_hpte_updatepp, ppc_md.hpte_updatepp);
+#endif /* CONFIG_PPC_64K_PAGES */
+
 	make_bl(htab_call_hpte_insert1, ppc_md.hpte_insert);
 	make_bl(htab_call_hpte_insert2, ppc_md.hpte_insert);
 	make_bl(htab_call_hpte_remove, ppc_md.hpte_remove);
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 0ea0994..0073a04 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -47,10 +47,25 @@
 		pu = pud_offset(pg, addr);
 		if (!pud_none(*pu)) {
 			pm = pmd_offset(pu, addr);
+#ifdef CONFIG_PPC_64K_PAGES
+			/* Currently, we use the normal PTE offset within full
+			 * size PTE pages, thus our huge PTEs are scattered in
+			 * the PTE page and we do waste some. We may change
+			 * that in the future, but the current mechanism keeps
+			 * things much simpler
+			 */
+			if (!pmd_none(*pm)) {
+				/* Note: pte_offset_* are all equivalent on
+				 * ppc64 as we don't have HIGHMEM
+				 */
+				pt = pte_offset_kernel(pm, addr);
+				return pt;
+			}
+#else /* CONFIG_PPC_64K_PAGES */
+			/* On 4k pages, we put huge PTEs in the PMD page */
 			pt = (pte_t *)pm;
-			BUG_ON(!pmd_none(*pm)
-			       && !(pte_present(*pt) && pte_huge(*pt)));
 			return pt;
+#endif /* CONFIG_PPC_64K_PAGES */
 		}
 	}
 
@@ -74,9 +89,16 @@
 	if (pu) {
 		pm = pmd_alloc(mm, pu, addr);
 		if (pm) {
+#ifdef CONFIG_PPC_64K_PAGES
+			/* See comment in huge_pte_offset. Note that if we ever
+			 * want to put the page size in the PMD, we would have
+			 * to open code our own pte_alloc* function in order
+			 * to populate and set the size atomically
+			 */
+			pt = pte_alloc_map(mm, pm, addr);
+#else /* CONFIG_PPC_64K_PAGES */
 			pt = (pte_t *)pm;
-			BUG_ON(!pmd_none(*pm)
-			       && !(pte_present(*pt) && pte_huge(*pt)));
+#endif /* CONFIG_PPC_64K_PAGES */
 			return pt;
 		}
 	}
@@ -84,35 +106,29 @@
 	return NULL;
 }
 
-#define HUGEPTE_BATCH_SIZE	(HPAGE_SIZE / PMD_SIZE)
-
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 		     pte_t *ptep, pte_t pte)
 {
-	int i;
-
 	if (pte_present(*ptep)) {
-		pte_clear(mm, addr, ptep);
+		/* We open-code pte_clear because we need to pass the right
+		 * argument to hpte_update (huge / !huge)
+		 */
+		unsigned long old = pte_update(ptep, ~0UL);
+		if (old & _PAGE_HASHPTE)
+			hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
 		flush_tlb_pending();
 	}
-
-	for (i = 0; i < HUGEPTE_BATCH_SIZE; i++) {
-		*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
-		ptep++;
-	}
+	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
 }
 
 pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 			      pte_t *ptep)
 {
 	unsigned long old = pte_update(ptep, ~0UL);
-	int i;
 
 	if (old & _PAGE_HASHPTE)
-		hpte_update(mm, addr, old, 0);
-
-	for (i = 1; i < HUGEPTE_BATCH_SIZE; i++)
-		ptep[i] = __pte(0);
+		hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
+	*ptep = __pte(0);
 
 	return __pte(old);
 }
@@ -563,6 +579,8 @@
 	int lastshift;
 	u16 areamask, curareas;
 
+	if (HPAGE_SHIFT == 0)
+		return -EINVAL;
 	if (len & ~HPAGE_MASK)
 		return -EINVAL;
 
@@ -619,19 +637,15 @@
 		   unsigned long ea, unsigned long vsid, int local)
 {
 	pte_t *ptep;
-	unsigned long va, vpn;
-	pte_t old_pte, new_pte;
-	unsigned long rflags, prpn;
+	unsigned long old_pte, new_pte;
+	unsigned long va, rflags, pa;
 	long slot;
 	int err = 1;
 
-	spin_lock(&mm->page_table_lock);
-
 	ptep = huge_pte_offset(mm, ea);
 
 	/* Search the Linux page table for a match with va */
 	va = (vsid << 28) | (ea & 0x0fffffff);
-	vpn = va >> HPAGE_SHIFT;
 
 	/*
 	 * If no pte found or not present, send the problem up to
@@ -640,8 +654,6 @@
 	if (unlikely(!ptep || pte_none(*ptep)))
 		goto out;
 
-/* 	BUG_ON(pte_bad(*ptep)); */
-
 	/* 
 	 * Check the user's access rights to the page.  If access should be
 	 * prevented then send the problem up to do_page_fault.
@@ -661,58 +673,64 @@
 	 */
 
 
-	old_pte = *ptep;
-	new_pte = old_pte;
+	do {
+		old_pte = pte_val(*ptep);
+		if (old_pte & _PAGE_BUSY)
+			goto out;
+		new_pte = old_pte | _PAGE_BUSY |
+			_PAGE_ACCESSED | _PAGE_HASHPTE;
+	} while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
+					 old_pte, new_pte));
 
-	rflags = 0x2 | (! (pte_val(new_pte) & _PAGE_RW));
+	rflags = 0x2 | (!(new_pte & _PAGE_RW));
  	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
-	rflags |= ((pte_val(new_pte) & _PAGE_EXEC) ? 0 : HW_NO_EXEC);
+	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
 
 	/* Check if pte already has an hpte (case 2) */
-	if (unlikely(pte_val(old_pte) & _PAGE_HASHPTE)) {
+	if (unlikely(old_pte & _PAGE_HASHPTE)) {
 		/* There MIGHT be an HPTE for this pte */
 		unsigned long hash, slot;
 
-		hash = hpt_hash(vpn, 1);
-		if (pte_val(old_pte) & _PAGE_SECONDARY)
+		hash = hpt_hash(va, HPAGE_SHIFT);
+		if (old_pte & _PAGE_F_SECOND)
 			hash = ~hash;
 		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-		slot += (pte_val(old_pte) & _PAGE_GROUP_IX) >> 12;
+		slot += (old_pte & _PAGE_F_GIX) >> 12;
 
 		if (ppc_md.hpte_updatepp(slot, rflags, va, 1, local) == -1)
-			pte_val(old_pte) &= ~_PAGE_HPTEFLAGS;
+			old_pte &= ~_PAGE_HPTEFLAGS;
 	}
 
-	if (likely(!(pte_val(old_pte) & _PAGE_HASHPTE))) {
-		unsigned long hash = hpt_hash(vpn, 1);
+	if (likely(!(old_pte & _PAGE_HASHPTE))) {
+		unsigned long hash = hpt_hash(va, HPAGE_SHIFT);
 		unsigned long hpte_group;
 
-		prpn = pte_pfn(old_pte);
+		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
 
 repeat:
 		hpte_group = ((hash & htab_hash_mask) *
 			      HPTES_PER_GROUP) & ~0x7UL;
 
-		/* Update the linux pte with the HPTE slot */
-		pte_val(new_pte) &= ~_PAGE_HPTEFLAGS;
-		pte_val(new_pte) |= _PAGE_HASHPTE;
+		/* clear HPTE slot information in the new PTE */
+		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
 
 		/* Add in WIMG bits */
 		/* XXX We should store these in the pte */
+		/* --BenH: I think they are ... */
 		rflags |= _PAGE_COHERENT;
 
-		slot = ppc_md.hpte_insert(hpte_group, va, prpn,
-					  HPTE_V_LARGE, rflags);
+		/* Insert into the hash table, primary slot */
+		slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
+					  mmu_huge_psize);
 
 		/* Primary is full, try the secondary */
 		if (unlikely(slot == -1)) {
-			pte_val(new_pte) |= _PAGE_SECONDARY;
+			new_pte |= _PAGE_F_SECOND;
 			hpte_group = ((~hash & htab_hash_mask) *
 				      HPTES_PER_GROUP) & ~0x7UL; 
-			slot = ppc_md.hpte_insert(hpte_group, va, prpn,
-						  HPTE_V_LARGE |
+			slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
 						  HPTE_V_SECONDARY,
-						  rflags);
+						  mmu_huge_psize);
 			if (slot == -1) {
 				if (mftb() & 0x1)
 					hpte_group = ((hash & htab_hash_mask) *
@@ -726,20 +744,18 @@
 		if (unlikely(slot == -2))
 			panic("hash_huge_page: pte_insert failed\n");
 
-		pte_val(new_pte) |= (slot<<12) & _PAGE_GROUP_IX;
-
-		/* 
-		 * No need to use ldarx/stdcx here because all who
-		 * might be updating the pte will hold the
-		 * page_table_lock
-		 */
-		*ptep = new_pte;
+		new_pte |= (slot << 12) & _PAGE_F_GIX;
 	}
 
+	/*
+	 * No need to use ldarx/stdcx here because all who
+	 * might be updating the pte will hold the
+	 * page_table_lock
+	 */
+	*ptep = __pte(new_pte & ~_PAGE_BUSY);
+
 	err = 0;
 
  out:
-	spin_unlock(&mm->page_table_lock);
-
 	return err;
 }
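
The hash_huge_page() rework above drops the page_table_lock and instead claims the PTE lock-free: it loops on __cmpxchg_u64() until it can set _PAGE_BUSY (bailing out if another CPU already holds it), performs the hash insertion, and finally stores the new flags with _PAGE_BUSY cleared. Below is a minimal userspace sketch of that claim/release pattern; the PTE_* flag values and the gcc __sync builtin are stand-ins for illustration, not the kernel's _PAGE_* encoding or __cmpxchg_u64().

#include <stdint.h>
#include <stdio.h>

/* Illustrative flag values only -- not the kernel's _PAGE_* encoding */
#define PTE_BUSY	0x1UL
#define PTE_ACCESSED	0x2UL
#define PTE_HASHPTE	0x4UL

/* Claim the PTE for hashing; returns 0 if another CPU already holds
 * the busy bit, in which case the caller just refaults if needed.
 */
static int pte_claim(uint64_t *ptep, uint64_t *old_out)
{
	uint64_t old, new;

	do {
		old = *ptep;
		if (old & PTE_BUSY)
			return 0;
		new = old | PTE_BUSY | PTE_ACCESSED | PTE_HASHPTE;
	} while (__sync_val_compare_and_swap(ptep, old, new) != old);

	*old_out = old;
	return 1;
}

/* Publish the final PTE value and drop the busy bit */
static void pte_release(uint64_t *ptep, uint64_t new)
{
	*ptep = new & ~PTE_BUSY;
}

int main(void)
{
	uint64_t pte = 0, old = 0;

	if (pte_claim(&pte, &old)) {
		/* ... hash table insertion would happen here ... */
		pte_release(&pte, pte);
	}
	printf("pte = %#llx (was %#llx)\n",
	       (unsigned long long)pte, (unsigned long long)old);
	return 0;
}
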
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index b0fc822..dfe7fa3 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -188,12 +188,21 @@
 	memset(addr, 0, kmem_cache_size(cache));
 }
 
+#ifdef CONFIG_PPC_64K_PAGES
+static const int pgtable_cache_size[2] = {
+	PTE_TABLE_SIZE, PGD_TABLE_SIZE
+};
+static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
+	"pte_pmd_cache", "pgd_cache",
+};
+#else
 static const int pgtable_cache_size[2] = {
 	PTE_TABLE_SIZE, PMD_TABLE_SIZE
 };
 static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
 	"pgd_pte_cache", "pud_pmd_cache",
 };
+#endif /* CONFIG_PPC_64K_PAGES */
 
 kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
 
@@ -201,19 +210,14 @@
 {
 	int i;
 
-	BUILD_BUG_ON(PTE_TABLE_SIZE != pgtable_cache_size[PTE_CACHE_NUM]);
-	BUILD_BUG_ON(PMD_TABLE_SIZE != pgtable_cache_size[PMD_CACHE_NUM]);
-	BUILD_BUG_ON(PUD_TABLE_SIZE != pgtable_cache_size[PUD_CACHE_NUM]);
-	BUILD_BUG_ON(PGD_TABLE_SIZE != pgtable_cache_size[PGD_CACHE_NUM]);
-
 	for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) {
 		int size = pgtable_cache_size[i];
 		const char *name = pgtable_cache_name[i];
 
 		pgtable_cache[i] = kmem_cache_create(name,
 						     size, size,
-						     SLAB_HWCACHE_ALIGN
-						     | SLAB_MUST_HWCACHE_ALIGN,
+						     SLAB_HWCACHE_ALIGN |
+						     SLAB_MUST_HWCACHE_ALIGN,
 						     zero_ctor,
 						     NULL);
 		if (! pgtable_cache[i])
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 117b000..7faa46b 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -61,6 +61,9 @@
 int mem_init_done;
 unsigned long memory_limit;
 
+extern void hash_preload(struct mm_struct *mm, unsigned long ea,
+			 unsigned long access, unsigned long trap);
+
 /*
  * This is called by /dev/mem to know if a given address has to
  * be mapped non-cacheable or not
@@ -493,18 +496,10 @@
 void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 		      pte_t pte)
 {
-	/* handle i-cache coherency */
-	unsigned long pfn = pte_pfn(pte);
-#ifdef CONFIG_PPC32
-	pmd_t *pmd;
-#else
-	unsigned long vsid;
-	void *pgdir;
-	pte_t *ptep;
-	int local = 0;
-	cpumask_t tmp;
-	unsigned long flags;
+#ifdef CONFIG_PPC_STD_MMU
+	unsigned long access = 0, trap;
 #endif
+	unsigned long pfn = pte_pfn(pte);
 
 	/* handle i-cache coherency */
 	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE) &&
@@ -535,30 +530,21 @@
 	/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
 	if (!pte_young(pte) || address >= TASK_SIZE)
 		return;
-#ifdef CONFIG_PPC32
-	if (Hash == 0)
+
+	/* We try to figure out if we are coming from an instruction
+	 * access fault and pass that down to __hash_page so we avoid
+	 * double-faulting on execution of fresh text. We have to test
+	 * for regs NULL since init will get here first thing at boot
+	 *
+	 * We also avoid filling the hash if not coming from a fault
+	 */
+	if (current->thread.regs == NULL)
 		return;
-	pmd = pmd_offset(pgd_offset(vma->vm_mm, address), address);
-	if (!pmd_none(*pmd))
-		add_hash_page(vma->vm_mm->context, address, pmd_val(*pmd));
-#else
-	pgdir = vma->vm_mm->pgd;
-	if (pgdir == NULL)
+	trap = TRAP(current->thread.regs);
+	if (trap == 0x400)
+		access |= _PAGE_EXEC;
+	else if (trap != 0x300)
 		return;
-
-	ptep = find_linux_pte(pgdir, address);
-	if (!ptep)
-		return;
-
-	vsid = get_vsid(vma->vm_mm->context.id, address);
-
-	local_irq_save(flags);
-	tmp = cpumask_of_cpu(smp_processor_id());
-	if (cpus_equal(vma->vm_mm->cpu_vm_mask, tmp))
-		local = 1;
-
-	__hash_page(address, 0, vsid, ptep, 0x300, local);
-	local_irq_restore(flags);
-#endif
-#endif
+	hash_preload(vma->vm_mm, address, access, trap);
+#endif /* CONFIG_PPC_STD_MMU */
 }
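
The new update_mmu_cache() derives the access type from the trap that brought us here: 0x400 (instruction storage interrupt) requests execute permission, 0x300 (data storage interrupt) is a plain data access, and anything else skips the hash preload. A small standalone sketch of that decision follows; the PAGE_EXEC value is made up for the example and access_for_trap() is a hypothetical helper, not a kernel function.

#include <stdio.h>

/* Illustrative value only, not the kernel's _PAGE_EXEC */
#define PAGE_EXEC 0x4UL

/* Map the exception number to the access bits to preload with,
 * or -1 when the fault is not one we want to preload for.
 * 0x400 = instruction storage interrupt, 0x300 = data storage interrupt.
 */
static long access_for_trap(unsigned long trap)
{
	unsigned long access = 0;

	if (trap == 0x400)
		access |= PAGE_EXEC;
	else if (trap != 0x300)
		return -1;
	return (long)access;
}

int main(void)
{
	printf("0x400 -> %ld\n", access_for_trap(0x400));
	printf("0x300 -> %ld\n", access_for_trap(0x300));
	printf("0x700 -> %ld\n", access_for_trap(0x700));
	return 0;
}
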
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index b79a782..51b7869 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -101,7 +101,6 @@
 	pud_t *pudp;
 	pmd_t *pmdp;
 	pte_t *ptep;
-	unsigned long vsid;
 
 	if (mem_init_done) {
 		pgdp = pgd_offset_k(ea);
@@ -117,28 +116,15 @@
 		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
 							  __pgprot(flags)));
 	} else {
-		unsigned long va, vpn, hash, hpteg;
-
 		/*
 		 * If the mm subsystem is not fully up, we cannot create a
 		 * linux page table entry for this mapping.  Simply bolt an
 		 * entry in the hardware page table.
+		 *
 		 */
-		vsid = get_kernel_vsid(ea);
-		va = (vsid << 28) | (ea & 0xFFFFFFF);
-		vpn = va >> PAGE_SHIFT;
-
-		hash = hpt_hash(vpn, 0);
-
-		hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
-
-		/* Panic if a pte grpup is full */
-		if (ppc_md.hpte_insert(hpteg, va, pa >> PAGE_SHIFT,
-				       HPTE_V_BOLTED,
-				       _PAGE_NO_CACHE|_PAGE_GUARDED|PP_RWXX)
-		    == -1) {
-			panic("map_io_page: could not insert mapping");
-		}
+		if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags,
+				      mmu_virtual_psize))
+			panic("Can't map bolted IO mapping");
 	}
 	return 0;
 }
diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c
index cef9e83..d137abd 100644
--- a/arch/powerpc/mm/ppc_mmu_32.c
+++ b/arch/powerpc/mm/ppc_mmu_32.c
@@ -179,6 +179,21 @@
 }
 
 /*
+ * Preload a translation in the hash table
+ */
+void hash_preload(struct mm_struct *mm, unsigned long ea,
+		  unsigned long access, unsigned long trap)
+{
+	pmd_t *pmd;
+
+	if (Hash == 0)
+		return;
+	pmd = pmd_offset(pgd_offset(mm, ea), ea);
+	if (!pmd_none(*pmd))
+		add_hash_page(mm->context, ea, pmd_val(*pmd));
+}
+
+/*
  * Initialize the hash table and patch the instructions in hashtable.S.
  */
 void __init MMU_init_hw(void)
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 0473953..60e852f 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -14,14 +14,32 @@
  *      2 of the License, or (at your option) any later version.
  */
 
+#undef DEBUG
+
 #include <linux/config.h>
 #include <asm/pgtable.h>
 #include <asm/mmu.h>
 #include <asm/mmu_context.h>
 #include <asm/paca.h>
 #include <asm/cputable.h>
+#include <asm/cacheflush.h>
 
-extern void slb_allocate(unsigned long ea);
+#ifdef DEBUG
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+extern void slb_allocate_realmode(unsigned long ea);
+extern void slb_allocate_user(unsigned long ea);
+
+static void slb_allocate(unsigned long ea)
+{
+	/* Currently, we do real mode for all SLBs including user, but
+	 * that will change if we bring back dynamic VSIDs
+	 */
+	slb_allocate_realmode(ea);
+}
 
 static inline unsigned long mk_esid_data(unsigned long ea, unsigned long slot)
 {
@@ -46,13 +64,15 @@
 {
 	/* If you change this make sure you change SLB_NUM_BOLTED
 	 * appropriately too. */
-	unsigned long ksp_flags = SLB_VSID_KERNEL;
+	unsigned long linear_llp, virtual_llp, lflags, vflags;
 	unsigned long ksp_esid_data;
 
 	WARN_ON(!irqs_disabled());
 
-	if (cpu_has_feature(CPU_FTR_16M_PAGE))
-		ksp_flags |= SLB_VSID_L;
+	linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
+	virtual_llp = mmu_psize_defs[mmu_virtual_psize].sllp;
+	lflags = SLB_VSID_KERNEL | linear_llp;
+	vflags = SLB_VSID_KERNEL | virtual_llp;
 
 	ksp_esid_data = mk_esid_data(get_paca()->kstack, 2);
 	if ((ksp_esid_data & ESID_MASK) == KERNELBASE)
@@ -67,9 +87,9 @@
 		     /* Slot 2 - kernel stack */
 		     "slbmte	%2,%3\n"
 		     "isync"
-		     :: "r"(mk_vsid_data(VMALLOCBASE, SLB_VSID_KERNEL)),
+		     :: "r"(mk_vsid_data(VMALLOCBASE, vflags)),
 		        "r"(mk_esid_data(VMALLOCBASE, 1)),
-		        "r"(mk_vsid_data(ksp_esid_data, ksp_flags)),
+		        "r"(mk_vsid_data(ksp_esid_data, lflags)),
 		        "r"(ksp_esid_data)
 		     : "memory");
 }
@@ -102,6 +122,9 @@
 
 	get_paca()->slb_cache_ptr = 0;
 	get_paca()->context = mm->context;
+#ifdef CONFIG_PPC_64K_PAGES
+	get_paca()->pgdir = mm->pgd;
+#endif /* CONFIG_PPC_64K_PAGES */
 
 	/*
 	 * preload some userspace segments into the SLB.
@@ -131,28 +154,77 @@
 	slb_allocate(unmapped_base);
 }
 
+static inline void patch_slb_encoding(unsigned int *insn_addr,
+				      unsigned int immed)
+{
+	/* Assume the instruction had a "0" immediate value, just
+	 * "or" in the new value
+	 */
+	*insn_addr |= immed;
+	flush_icache_range((unsigned long)insn_addr,
+			   (unsigned long)insn_addr + 4);
+}
+
 void slb_initialize(void)
 {
+	unsigned long linear_llp, virtual_llp;
+	static int slb_encoding_inited;
+	extern unsigned int *slb_miss_kernel_load_linear;
+	extern unsigned int *slb_miss_kernel_load_virtual;
+	extern unsigned int *slb_miss_user_load_normal;
+#ifdef CONFIG_HUGETLB_PAGE
+	extern unsigned int *slb_miss_user_load_huge;
+	unsigned long huge_llp;
+
+	huge_llp = mmu_psize_defs[mmu_huge_psize].sllp;
+#endif
+
+	/* Prepare our SLB miss handler based on our page size */
+	linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
+	virtual_llp = mmu_psize_defs[mmu_virtual_psize].sllp;
+	if (!slb_encoding_inited) {
+		slb_encoding_inited = 1;
+		patch_slb_encoding(slb_miss_kernel_load_linear,
+				   SLB_VSID_KERNEL | linear_llp);
+		patch_slb_encoding(slb_miss_kernel_load_virtual,
+				   SLB_VSID_KERNEL | virtual_llp);
+		patch_slb_encoding(slb_miss_user_load_normal,
+				   SLB_VSID_USER | virtual_llp);
+
+		DBG("SLB: linear  LLP = %04x\n", linear_llp);
+		DBG("SLB: virtual LLP = %04x\n", virtual_llp);
+#ifdef CONFIG_HUGETLB_PAGE
+		patch_slb_encoding(slb_miss_user_load_huge,
+				   SLB_VSID_USER | huge_llp);
+		DBG("SLB: huge    LLP = %04x\n", huge_llp);
+#endif
+	}
+
 	/* On iSeries the bolted entries have already been set up by
 	 * the hypervisor from the lparMap data in head.S */
 #ifndef CONFIG_PPC_ISERIES
-	unsigned long flags = SLB_VSID_KERNEL;
+ {
+	unsigned long lflags, vflags;
 
- 	/* Invalidate the entire SLB (even slot 0) & all the ERATS */
- 	if (cpu_has_feature(CPU_FTR_16M_PAGE))
- 		flags |= SLB_VSID_L;
+	lflags = SLB_VSID_KERNEL | linear_llp;
+	vflags = SLB_VSID_KERNEL | virtual_llp;
 
- 	asm volatile("isync":::"memory");
- 	asm volatile("slbmte  %0,%0"::"r" (0) : "memory");
+	/* Invalidate the entire SLB (even slot 0) & all the ERATS */
+	asm volatile("isync":::"memory");
+	asm volatile("slbmte  %0,%0"::"r" (0) : "memory");
 	asm volatile("isync; slbia; isync":::"memory");
-	create_slbe(KERNELBASE, flags, 0);
-	create_slbe(VMALLOCBASE, SLB_VSID_KERNEL, 1);
+	create_slbe(KERNELBASE, lflags, 0);
+
+	/* VMALLOC space has 4K pages always for now */
+	create_slbe(VMALLOCBASE, vflags, 1);
+
 	/* We don't bolt the stack for the time being - we're in boot,
 	 * so the stack is in the bolted segment.  By the time it goes
 	 * elsewhere, we'll call _switch() which will bolt in the new
 	 * one. */
 	asm volatile("isync":::"memory");
-#endif
+ }
+#endif /* CONFIG_PPC_ISERIES */
 
 	get_paca()->stab_rr = SLB_NUM_BOLTED;
 }
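
patch_slb_encoding() above works because every slb_miss_*_load_* site in slb_low.S is an "li rN,0" instruction: the 16-bit immediate field is all zeroes, so the page-size/flags encoding can simply be OR-ed into the instruction word before the icache flush. A standalone sketch of that patching step is below; the hand-encoded opcode and the example flag value are illustrative assumptions, and the real code of course also calls flush_icache_range().

#include <stdint.h>
#include <stdio.h>

/* Hand-encoded "li r11,0" (addi r11,0,0): opcode 14, rt=11, ra=0, simm=0.
 * Believed to be 0x39600000, but treat this as an illustration and check
 * the ISA before relying on the encoding.
 */
#define LI_R11_0 0x39600000u

/* Same idea as patch_slb_encoding(): the immediate field is known to be
 * zero, so OR-ing in the new value rewrites it in place.  The kernel
 * additionally flushes the icache over the patched word.
 */
static void patch_li_immediate(uint32_t *insn, uint16_t immed)
{
	*insn |= immed;
}

int main(void)
{
	uint32_t insn = LI_R11_0;

	/* 0x0110 is a made-up flags|llp value, not a real SLB encoding */
	patch_li_immediate(&insn, 0x0110);
	printf("patched insn = 0x%08x, immediate now 0x%04x\n",
	       (unsigned)insn, (unsigned)(insn & 0xffff));
	return 0;
}
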
diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
index a3a03da..3e18241 100644
--- a/arch/powerpc/mm/slb_low.S
+++ b/arch/powerpc/mm/slb_low.S
@@ -18,25 +18,161 @@
 
 #include <linux/config.h>
 #include <asm/processor.h>
-#include <asm/page.h>
-#include <asm/mmu.h>
 #include <asm/ppc_asm.h>
 #include <asm/asm-offsets.h>
 #include <asm/cputable.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
 
-/* void slb_allocate(unsigned long ea);
+/* void slb_allocate_realmode(unsigned long ea);
  *
  * Create an SLB entry for the given EA (user or kernel).
  * 	r3 = faulting address, r13 = PACA
  *	r9, r10, r11 are clobbered by this function
  * No other registers are examined or changed.
  */
-_GLOBAL(slb_allocate)
-	/*
-	 * First find a slot, round robin. Previously we tried to find
-	 * a free slot first but that took too long. Unfortunately we
-	 * dont have any LRU information to help us choose a slot.
+_GLOBAL(slb_allocate_realmode)
+	/* r3 = faulting address */
+
+	srdi	r9,r3,60		/* get region */
+	srdi	r10,r3,28		/* get esid */
+	cmpldi	cr7,r9,0xc		/* cmp KERNELBASE for later use */
+
+	/* r3 = address, r10 = esid, cr7 = <>KERNELBASE */
+	blt	cr7,0f			/* user or kernel? */
+
+	/* kernel address: proto-VSID = ESID */
+	/* WARNING - MAGIC: we don't use the VSID 0xfffffffff, but
+	 * this code will generate the protoVSID 0xfffffffff for the
+	 * top segment.  That's ok, the scramble below will translate
+	 * it to VSID 0, which is reserved as a bad VSID - one which
+	 * will never have any pages in it.  */
+
+	/* Check if hitting the linear mapping of the vmalloc/ioremap
+	 * kernel space
+	 */
+	bne	cr7,1f
+
+	/* Linear mapping encoding bits, the "li" instruction below will
+	 * be patched by the kernel at boot
 	 */
+_GLOBAL(slb_miss_kernel_load_linear)
+	li	r11,0
+	b	slb_finish_load
+
+1:	/* vmalloc/ioremap mapping encoding bits, the "li" instruction below
+	 * will be patched by the kernel at boot
+	 */
+_GLOBAL(slb_miss_kernel_load_virtual)
+	li	r11,0
+	b	slb_finish_load
+
+
+0:	/* user address: proto-VSID = context << 15 | ESID. First check
+	 * if the address is within the boundaries of the user region
+	 */
+	srdi.	r9,r10,USER_ESID_BITS
+	bne-	8f			/* invalid ea bits set */
+
+	/* Figure out if the segment contains huge pages */
+#ifdef CONFIG_HUGETLB_PAGE
+BEGIN_FTR_SECTION
+	b	1f
+END_FTR_SECTION_IFCLR(CPU_FTR_16M_PAGE)
+	lhz	r9,PACAHIGHHTLBAREAS(r13)
+	srdi	r11,r10,(HTLB_AREA_SHIFT-SID_SHIFT)
+	srd	r9,r9,r11
+	lhz	r11,PACALOWHTLBAREAS(r13)
+	srd	r11,r11,r10
+	or.	r9,r9,r11
+	beq	1f
+_GLOBAL(slb_miss_user_load_huge)
+	li	r11,0
+	b	2f
+1:
+#endif /* CONFIG_HUGETLB_PAGE */
+
+_GLOBAL(slb_miss_user_load_normal)
+	li	r11,0
+
+2:
+	ld	r9,PACACONTEXTID(r13)
+	rldimi	r10,r9,USER_ESID_BITS,0
+	b	slb_finish_load
+
+8:	/* invalid EA */
+	li	r10,0			/* BAD_VSID */
+	li	r11,SLB_VSID_USER	/* flags don't much matter */
+	b	slb_finish_load
+
+#ifdef __DISABLED__
+
+/* void slb_allocate_user(unsigned long ea);
+ *
+ * Create an SLB entry for the given EA (user or kernel).
+ * 	r3 = faulting address, r13 = PACA
+ *	r9, r10, r11 are clobbered by this function
+ * No other registers are examined or changed.
+ *
+ * It is called with translation enabled in order to be able to walk the
+ * page tables. This is not currently used.
+ */
+_GLOBAL(slb_allocate_user)
+	/* r3 = faulting address */
+	srdi	r10,r3,28		/* get esid */
+
+	crset	4*cr7+lt		/* set "user" flag for later */
+
+	/* check if we fit in the range covered by the pagetables */
+	srdi.	r9,r3,PGTABLE_EADDR_SIZE
+	crnot	4*cr0+eq,4*cr0+eq
+	beqlr
+
+	/* now we need to get to the page tables in order to get the page
+	 * size encoding from the PMD. In the future, we'll be able to deal
+	 * with 1T segments too by getting the encoding from the PGD instead
+	 */
+	ld	r9,PACAPGDIR(r13)
+	cmpldi	cr0,r9,0
+	beqlr
+	rlwinm	r11,r10,8,25,28
+	ldx	r9,r9,r11		/* get pgd_t */
+	cmpldi	cr0,r9,0
+	beqlr
+	rlwinm	r11,r10,3,17,28
+	ldx	r9,r9,r11		/* get pmd_t */
+	cmpldi	cr0,r9,0
+	beqlr
+
+	/* build vsid flags */
+	andi.	r11,r9,SLB_VSID_LLP
+	ori	r11,r11,SLB_VSID_USER
+
+	/* get context to calculate proto-VSID */
+	ld	r9,PACACONTEXTID(r13)
+	rldimi	r10,r9,USER_ESID_BITS,0
+
+	/* fall through slb_finish_load */
+
+#endif /* __DISABLED__ */
+
+
+/*
+ * Finish loading of an SLB entry and return
+ *
+ * r3 = EA, r10 = proto-VSID, r11 = flags, clobbers r9, cr7 = <>KERNELBASE
+ */
+slb_finish_load:
+	ASM_VSID_SCRAMBLE(r10,r9)
+	rldimi	r11,r10,SLB_VSID_SHIFT,16	/* combine VSID and flags */
+
+	/* r3 = EA, r11 = VSID data */
+	/*
+	 * Find a slot, round robin. Previously we tried to find a
+	 * free slot first but that took too long. Unfortunately we
+	 * don't have any LRU information to help us choose a slot.
+	 */
 #ifdef CONFIG_PPC_ISERIES
 	/*
 	 * On iSeries, the "bolted" stack segment can be cast out on
@@ -45,9 +181,9 @@
 	 */
 	ld	r9,PACAKSAVE(r13)
 	clrrdi	r9,r9,28
-	clrrdi	r11,r3,28
+	clrrdi	r3,r3,28
 	li	r10,SLB_NUM_BOLTED-1	/* Stack goes in last bolted slot */
-	cmpld	r9,r11
+	cmpld	r9,r3
 	beq	3f
 #endif /* CONFIG_PPC_ISERIES */
 
@@ -61,63 +197,12 @@
 
 4:
 	std	r10,PACASTABRR(r13)
+
 3:
-	/* r3 = faulting address, r10 = entry */
+	rldimi	r3,r10,0,36		/* r3 = EA[0:35] | entry */
+	oris	r10,r3,SLB_ESID_V@h	/* r10 = r3 | SLB_ESID_V */
 
-	srdi	r9,r3,60		/* get region */
-	srdi	r3,r3,28		/* get esid */
-	cmpldi	cr7,r9,0xc		/* cmp KERNELBASE for later use */
-
-	rldimi	r10,r3,28,0		/* r10= ESID<<28 | entry */
-	oris	r10,r10,SLB_ESID_V@h	/* r10 |= SLB_ESID_V */
-
-	/* r3 = esid, r10 = esid_data, cr7 = <>KERNELBASE */
-
-	blt	cr7,0f			/* user or kernel? */
-
-	/* kernel address: proto-VSID = ESID */
-	/* WARNING - MAGIC: we don't use the VSID 0xfffffffff, but
-	 * this code will generate the protoVSID 0xfffffffff for the
-	 * top segment.  That's ok, the scramble below will translate
-	 * it to VSID 0, which is reserved as a bad VSID - one which
-	 * will never have any pages in it.  */
-	li	r11,SLB_VSID_KERNEL
-BEGIN_FTR_SECTION
-	bne	cr7,9f
-	li	r11,(SLB_VSID_KERNEL|SLB_VSID_L)
-END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
-	b	9f
-
-0:	/* user address: proto-VSID = context<<15 | ESID */
-	srdi.	r9,r3,USER_ESID_BITS
-	bne-	8f			/* invalid ea bits set */
-
-#ifdef CONFIG_HUGETLB_PAGE
-BEGIN_FTR_SECTION
-	lhz	r9,PACAHIGHHTLBAREAS(r13)
-	srdi	r11,r3,(HTLB_AREA_SHIFT-SID_SHIFT)
-	srd	r9,r9,r11
-	lhz	r11,PACALOWHTLBAREAS(r13)
-	srd	r11,r11,r3
-	or	r9,r9,r11
-END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
-#endif /* CONFIG_HUGETLB_PAGE */
-
-	li	r11,SLB_VSID_USER
-
-#ifdef CONFIG_HUGETLB_PAGE
-BEGIN_FTR_SECTION
-	rldimi	r11,r9,8,55		/* shift masked bit into SLB_VSID_L */
-END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
-#endif /* CONFIG_HUGETLB_PAGE */
-
-	ld	r9,PACACONTEXTID(r13)
-	rldimi	r3,r9,USER_ESID_BITS,0
-
-9:	/* r3 = protovsid, r11 = flags, r10 = esid_data, cr7 = <>KERNELBASE */
-	ASM_VSID_SCRAMBLE(r3,r9)
-
-	rldimi	r11,r3,SLB_VSID_SHIFT,16	/* combine VSID and flags */
+	/* r3 = ESID data, r11 = VSID data */
 
 	/*
 	 * No need for an isync before or after this slbmte. The exception
@@ -125,7 +210,9 @@
 	 */
 	slbmte	r11,r10
 
-	bgelr	cr7			/* we're done for kernel addresses */
+	/* we're done for kernel addresses */
+	crclr	4*cr0+eq		/* set result to "success" */
+	bgelr	cr7
 
 	/* Update the slb cache */
 	lhz	r3,PACASLBCACHEPTR(r13)	/* offset = paca->slb_cache_ptr */
@@ -143,9 +230,6 @@
 	li	r3,SLB_CACHE_ENTRIES+1
 2:
 	sth	r3,PACASLBCACHEPTR(r13)	/* paca->slb_cache_ptr = offset */
+	crclr	4*cr0+eq		/* set result to "success" */
 	blr
 
-8:	/* invalid EA */
-	li	r3,0			/* BAD_VSID */
-	li	r11,SLB_VSID_USER	/* flags don't much matter */
-	b	9b
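
The comment in slb_finish_load notes that victim selection is plain round robin over the non-bolted SLB entries (searching for a free slot was tried and found too slow), with the current position kept in the PACA's stab_rr field. The C model below captures that policy under assumed sizes (a 64-entry SLB, SLB_NUM_BOLTED = 3); take the exact constants and wrap point from the assembly, not from this sketch.

#include <stdio.h>

/* Assumed sizes for the sketch: a 64-entry SLB whose first
 * SLB_NUM_BOLTED slots hold bolted translations and are never recycled.
 */
#define SLB_ENTRIES	64
#define SLB_NUM_BOLTED	3

/* Round-robin victim selection: no LRU state, just advance the per-cpu
 * pointer (stab_rr in the PACA) and wrap around past the bolted slots.
 */
static int next_slb_slot(int *rr)
{
	int slot = *rr + 1;

	if (slot >= SLB_ENTRIES)
		slot = SLB_NUM_BOLTED;
	*rr = slot;
	return slot;
}

int main(void)
{
	int rr = SLB_NUM_BOLTED;	/* matches slb_initialize() */
	int i;

	for (i = 0; i < 5; i++)
		printf("victim slot %d\n", next_slb_slot(&rr));
	return 0;
}
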
diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c
index 1b83f00..fa325db 100644
--- a/arch/powerpc/mm/stab.c
+++ b/arch/powerpc/mm/stab.c
@@ -26,7 +26,6 @@
 	unsigned long vsid_data;
 };
 
-/* Both the segment table and SLB code uses the following cache */
 #define NR_STAB_CACHE_ENTRIES 8
 DEFINE_PER_CPU(long, stab_cache_ptr);
 DEFINE_PER_CPU(long, stab_cache[NR_STAB_CACHE_ENTRIES]);
@@ -186,7 +185,7 @@
 		/* Never flush the first entry. */
 		ste += 1;
 		for (entry = 1;
-		     entry < (PAGE_SIZE / sizeof(struct stab_entry));
+		     entry < (HW_PAGE_SIZE / sizeof(struct stab_entry));
 		     entry++, ste++) {
 			unsigned long ea;
 			ea = ste->esid_data & ESID_MASK;
@@ -200,6 +199,10 @@
 
 	__get_cpu_var(stab_cache_ptr) = 0;
 
+#ifdef CONFIG_PPC_64K_PAGES
+	get_paca()->pgdir = mm->pgd;
+#endif /* CONFIG_PPC_64K_PAGES */
+
 	/* Now preload some entries for the new task */
 	if (test_tsk_thread_flag(tsk, TIF_32BIT))
 		unmapped_base = TASK_UNMAPPED_BASE_USER32;
@@ -223,8 +226,6 @@
 	asm volatile("sync" : : : "memory");
 }
 
-extern void slb_initialize(void);
-
 /*
  * Allocate segment tables for secondary CPUs.  These must all go in
  * the first (bolted) segment, so that do_stab_bolted won't get a
@@ -243,18 +244,21 @@
 		if (cpu == 0)
 			continue; /* stab for CPU 0 is statically allocated */
 
-		newstab = lmb_alloc_base(PAGE_SIZE, PAGE_SIZE, 1<<SID_SHIFT);
+		newstab = lmb_alloc_base(HW_PAGE_SIZE, HW_PAGE_SIZE,
+					 1<<SID_SHIFT);
 		if (! newstab)
 			panic("Unable to allocate segment table for CPU %d.\n",
 			      cpu);
 
 		newstab += KERNELBASE;
 
-		memset((void *)newstab, 0, PAGE_SIZE);
+		memset((void *)newstab, 0, HW_PAGE_SIZE);
 
 		paca[cpu].stab_addr = newstab;
 		paca[cpu].stab_real = virt_to_abs(newstab);
-		printk(KERN_DEBUG "Segment table for CPU %d at 0x%lx virtual, 0x%lx absolute\n", cpu, paca[cpu].stab_addr, paca[cpu].stab_real);
+		printk(KERN_DEBUG "Segment table for CPU %d at 0x%lx "
+		       "virtual, 0x%lx absolute\n",
+		       cpu, paca[cpu].stab_addr, paca[cpu].stab_real);
 	}
 }
 
@@ -267,13 +271,9 @@
 {
 	unsigned long vsid = get_kernel_vsid(KERNELBASE);
 
-	if (cpu_has_feature(CPU_FTR_SLB)) {
-		slb_initialize();
-	} else {
-		asm volatile("isync; slbia; isync":::"memory");
-		make_ste(stab, GET_ESID(KERNELBASE), vsid);
+	asm volatile("isync; slbia; isync":::"memory");
+	make_ste(stab, GET_ESID(KERNELBASE), vsid);
 
-		/* Order update */
-		asm volatile("sync":::"memory");
-	}
+	/* Order update */
+	asm volatile("sync":::"memory");
 }
diff --git a/arch/powerpc/mm/tlb_64.c b/arch/powerpc/mm/tlb_64.c
index 09ab81a..53e31b8 100644
--- a/arch/powerpc/mm/tlb_64.c
+++ b/arch/powerpc/mm/tlb_64.c
@@ -21,6 +21,7 @@
  *  as published by the Free Software Foundation; either version
  *  2 of the License, or (at your option) any later version.
  */
+
 #include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
@@ -30,7 +31,7 @@
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
-#include <linux/highmem.h>
+#include <asm/bug.h>
 
 DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
 
@@ -126,28 +127,46 @@
  * (if we remove it we should clear the _PTE_HPTEFLAGS bits).
  */
 void hpte_update(struct mm_struct *mm, unsigned long addr,
-		 unsigned long pte, int wrprot)
+		 pte_t *ptep, unsigned long pte, int huge)
 {
 	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
 	unsigned long vsid;
+	unsigned int psize = mmu_virtual_psize;
 	int i;
 
 	i = batch->index;
 
+	/* We mask the address for the base page size. Huge pages will
+	 * have applied their own masking already
+	 */
+	addr &= PAGE_MASK;
+
+	/* Get page size (maybe move back to caller) */
+	if (huge) {
+#ifdef CONFIG_HUGETLB_PAGE
+		psize = mmu_huge_psize;
+#else
+		BUG();
+#endif
+	}
+
 	/*
 	 * This can happen when we are in the middle of a TLB batch and
 	 * we encounter memory pressure (eg copy_page_range when it tries
 	 * to allocate a new pte). If we have to reclaim memory and end
 	 * up scanning and resetting referenced bits then our batch context
 	 * will change mid stream.
+	 *
+	 * We also need to ensure only one page size is present in a given
+	 * batch
 	 */
-	if (i != 0 && (mm != batch->mm || batch->large != pte_huge(pte))) {
+	if (i != 0 && (mm != batch->mm || batch->psize != psize)) {
 		flush_tlb_pending();
 		i = 0;
 	}
 	if (i == 0) {
 		batch->mm = mm;
-		batch->large = pte_huge(pte);
+		batch->psize = psize;
 	}
 	if (addr < KERNELBASE) {
 		vsid = get_vsid(mm->context.id, addr);
@@ -155,7 +174,7 @@
 	} else
 		vsid = get_kernel_vsid(addr);
 	batch->vaddr[i] = (vsid << 28 ) | (addr & 0x0fffffff);
-	batch->pte[i] = __pte(pte);
+	batch->pte[i] = __real_pte(__pte(pte), ptep);
 	batch->index = ++i;
 	if (i >= PPC64_TLB_BATCH_NR)
 		flush_tlb_pending();
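
hpte_update() now tags the whole TLB flush batch with a single page size: if an entry arrives for a different mm or a different psize than what is already queued, the batch is flushed first, so flush_hash_range() never sees mixed sizes. A compact sketch of that invariant follows, with made-up types and a small batch size standing in for the kernel's ppc64_tlb_batch and PPC64_TLB_BATCH_NR.

#include <stdio.h>

/* Small batch size for the sketch; the kernel's PPC64_TLB_BATCH_NR differs */
#define TLB_BATCH_NR 8

struct tlb_batch {
	void *mm;		/* owning mm_struct, opaque here */
	int psize;		/* the one page size for the whole batch */
	int index;		/* entries queued so far */
	unsigned long vaddr[TLB_BATCH_NR];
};

static void flush_batch(struct tlb_batch *b)
{
	if (b->index)
		printf("flushing %d entries of psize %d\n",
		       b->index, b->psize);
	b->index = 0;
}

/* Queue one address; flush first if the mm or the page size changes
 * mid-batch, so a flush never has to deal with mixed sizes.
 */
static void batch_add(struct tlb_batch *b, void *mm, int psize,
		      unsigned long vaddr)
{
	if (b->index != 0 && (b->mm != mm || b->psize != psize))
		flush_batch(b);
	if (b->index == 0) {
		b->mm = mm;
		b->psize = psize;
	}
	b->vaddr[b->index++] = vaddr;
	if (b->index >= TLB_BATCH_NR)
		flush_batch(b);
}

int main(void)
{
	struct tlb_batch b = { 0 };
	int mm_a, mm_b;

	batch_add(&b, &mm_a, 0, 0x1000);
	batch_add(&b, &mm_a, 0, 0x2000);
	batch_add(&b, &mm_a, 1, 0x3000);	/* psize change forces a flush */
	batch_add(&b, &mm_b, 1, 0x4000);	/* mm change forces a flush */
	flush_batch(&b);
	return 0;
}
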
@@ -177,7 +196,8 @@
 		local = 1;
 
 	if (i == 1)
-		flush_hash_page(batch->vaddr[0], batch->pte[0], local);
+		flush_hash_page(batch->vaddr[0], batch->pte[0],
+				batch->psize, local);
 	else
 		flush_hash_range(i, local);
 	batch->index = 0;