[ARM] Feroceon: Feroceon-specific WA-cache compatible {copy,clear}_user_page()

This patch implements a set of Feroceon-specific
{copy,clear}_user_page() routines that perform better than
the generic implementations.  It also deals with write-allocate
caches (Feroceon can run its L1 D-cache in WA mode), which otherwise
prevent Linux from booting.

[nico: optimized the code even further]

Signed-off-by: Lennert Buytenhek <buytenh@marvell.com>
Tested-by: Sylver Bruneau <sylver.bruneau@googlemail.com>
Tested-by: Martin Michlmayr <tbm@cyrius.com>
Signed-off-by: Nicolas Pitre <nico@marvell.com>
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index a92a577..33ed048 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -372,7 +372,7 @@
 	select CPU_PABRT_NOIFAR
 	select CPU_CACHE_VIVT
 	select CPU_CP15_MMU
-	select CPU_COPY_V4WB if MMU
+	select CPU_COPY_FEROCEON if MMU
 	select CPU_TLB_V4WBI if MMU
 
 config CPU_FEROCEON_OLD_ID
@@ -523,6 +523,9 @@
 config CPU_COPY_V4WB
 	bool
 
+config CPU_COPY_FEROCEON
+	bool
+
 config CPU_COPY_V6
 	bool
 
diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile
index 44536a0..32b2d2d 100644
--- a/arch/arm/mm/Makefile
+++ b/arch/arm/mm/Makefile
@@ -36,6 +36,7 @@
 obj-$(CONFIG_CPU_COPY_V3)	+= copypage-v3.o
 obj-$(CONFIG_CPU_COPY_V4WT)	+= copypage-v4wt.o
 obj-$(CONFIG_CPU_COPY_V4WB)	+= copypage-v4wb.o
+obj-$(CONFIG_CPU_COPY_FEROCEON)	+= copypage-feroceon.o
 obj-$(CONFIG_CPU_COPY_V6)	+= copypage-v6.o context.o
 obj-$(CONFIG_CPU_SA1100)	+= copypage-v4mc.o
 obj-$(CONFIG_CPU_XSCALE)	+= copypage-xscale.o
diff --git a/arch/arm/mm/copypage-feroceon.S b/arch/arm/mm/copypage-feroceon.S
new file mode 100644
index 0000000..7eb0d32
--- /dev/null
+++ b/arch/arm/mm/copypage-feroceon.S
@@ -0,0 +1,95 @@
+/*
+ *  linux/arch/arm/mm/copypage-feroceon.S
+ *
+ *  Copyright (C) 2008 Marvell Semiconductors
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This handles copy_user_page and clear_user_page on Feroceon
+ * more optimally than the generic implementations.
+ */
+#include <linux/linkage.h>
+#include <linux/init.h>
+#include <asm/asm-offsets.h>
+
+	.text
+	.align	5
+
+ENTRY(feroceon_copy_user_page)
+	stmfd	sp!, {r4-r9, lr}		@ copy PAGE_SZ bytes from r1 to r0; r4-r9 and lr are scratch below
+	mov	ip, #PAGE_SZ			@ ip = bytes still to copy
+1:	mov	lr, r1				@ lr = source address at the start of this 256-byte chunk
+	ldmia	r1!, {r2 - r9}			@ load 32 bytes (8 words), r1 += 32
+	pld	[lr, #32]			@ preload hints for the remaining seven
+	pld	[lr, #64]			@ 32-byte pieces of this source chunk
+	pld	[lr, #96]
+	pld	[lr, #128]
+	pld	[lr, #160]
+	pld	[lr, #192]
+	pld	[lr, #224]
+	stmia	r0, {r2 - r9}			@ store 32 bytes; r0 is not advanced yet
+	ldmia	r1!, {r2 - r9}			@ next 32 source bytes, loaded before the line op
+	mcr	p15, 0, r0, c7, c14, 1		@ clean and invalidate D line
+	add	r0, r0, #32			@ step past the 32 bytes just written
+	stmia	r0, {r2 - r9}
+	ldmia	r1!, {r2 - r9}
+	mcr	p15, 0, r0, c7, c14, 1		@ clean and invalidate D line
+	add	r0, r0, #32
+	stmia	r0, {r2 - r9}
+	ldmia	r1!, {r2 - r9}
+	mcr	p15, 0, r0, c7, c14, 1		@ clean and invalidate D line
+	add	r0, r0, #32
+	stmia	r0, {r2 - r9}
+	ldmia	r1!, {r2 - r9}
+	mcr	p15, 0, r0, c7, c14, 1		@ clean and invalidate D line
+	add	r0, r0, #32
+	stmia	r0, {r2 - r9}
+	ldmia	r1!, {r2 - r9}
+	mcr	p15, 0, r0, c7, c14, 1		@ clean and invalidate D line
+	add	r0, r0, #32
+	stmia	r0, {r2 - r9}
+	ldmia	r1!, {r2 - r9}
+	mcr	p15, 0, r0, c7, c14, 1		@ clean and invalidate D line
+	add	r0, r0, #32
+	stmia	r0, {r2 - r9}
+	ldmia	r1!, {r2 - r9}
+	mcr	p15, 0, r0, c7, c14, 1		@ clean and invalidate D line
+	add	r0, r0, #32
+	stmia	r0, {r2 - r9}			@ eighth and last 32-byte store of this chunk
+	subs	ip, ip, #(32 * 8)		@ 256 bytes done; Z is set when ip reaches 0
+	mcr	p15, 0, r0, c7, c14, 1		@ clean and invalidate D line
+	add	r0, r0, #32
+	bne	1b				@ tests the flags from subs; mcr and add do not change them
+	mcr	p15, 0, ip, c7, c10, 4		@ drain WB (ip is 0 here)
+	ldmfd	sp!, {r4-r9, pc}		@ restore r4-r9 and return by popping the saved lr into pc
+
+	.align	5
+
+ENTRY(feroceon_clear_user_page)
+	stmfd	sp!, {r4-r7, lr}		@ zero PAGE_SZ bytes at r0; r4-r7 and lr are scratch below
+	mov	r1, #PAGE_SZ/32			@ r1 = number of 32-byte stores still to do
+	mov	r2, #0				@ r2-r7, ip and lr hold the eight zero words
+	mov	r3, #0				@ written by each stmia in the loop
+	mov	r4, #0
+	mov	r5, #0
+	mov	r6, #0
+	mov	r7, #0
+	mov	ip, #0
+	mov	lr, #0
+1:	stmia	r0, {r2-r7, ip, lr}		@ write 32 zero bytes; r0 is not advanced yet
+	subs	r1, r1, #1			@ Z is set after the last store
+	mcr	p15, 0, r0, c7, c14, 1		@ clean and invalidate D line
+	add	r0, r0, #32			@ step past the 32 bytes just written
+	bne	1b				@ tests the flags from subs; mcr and add do not change them
+	mcr	p15, 0, r1, c7, c10, 4		@ drain WB (r1 is 0 here)
+	ldmfd	sp!, {r4-r7, pc}		@ restore r4-r7 and return by popping the saved lr into pc
+
+	__INITDATA
+
+	.type	feroceon_user_fns, #object	@ table referenced from the proc_info records in proc-feroceon.S
+ENTRY(feroceon_user_fns)
+	.long	feroceon_clear_user_page	@ first slot: page clear routine
+	.long	feroceon_copy_user_page		@ second slot: page copy routine; slot order presumably fixed by the consumer - confirm
+	.size	feroceon_user_fns, . - feroceon_user_fns	@ size = the two words above
diff --git a/arch/arm/mm/proc-feroceon.S b/arch/arm/mm/proc-feroceon.S
index f37abd7..a02c171 100644
--- a/arch/arm/mm/proc-feroceon.S
+++ b/arch/arm/mm/proc-feroceon.S
@@ -440,7 +440,7 @@
 	.long	cpu_feroceon_name
 	.long	feroceon_processor_functions
 	.long	v4wbi_tlb_fns
-	.long	v4wb_user_fns
+	.long	feroceon_user_fns
 	.long	feroceon_cache_fns
 	.size	__feroceon_old_id_proc_info, . - __feroceon_old_id_proc_info
 #endif
@@ -466,6 +466,6 @@
 	.long	cpu_feroceon_name
 	.long	feroceon_processor_functions
 	.long	v4wbi_tlb_fns
-	.long	v4wb_user_fns
+	.long	feroceon_user_fns
 	.long	feroceon_cache_fns
 	.size	__feroceon_proc_info, . - __feroceon_proc_info