ARM: Add support for FA526 v2

Adds support for Faraday FA526 core. This core is used at least by:
Cortina Systems Gemini and Centroid family
Cavium Networks ECONA family
Grain Media GM8120
Pixelplus ImageARM
Prolific PL-1029
Faraday IP evaluation boards

v2:
- move TLB_BTB to separate patch
- update copyrights

Signed-off-by: Paulius Zaleckas <paulius.zaleckas@teltonika.lt>
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index 24e0f01..d29f926 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -72,6 +72,7 @@
 tune-$(CONFIG_CPU_ARM922T)	:=-mtune=arm9tdmi
 tune-$(CONFIG_CPU_ARM925T)	:=-mtune=arm9tdmi
 tune-$(CONFIG_CPU_ARM926T)	:=-mtune=arm9tdmi
+tune-$(CONFIG_CPU_FA526)	:=-mtune=arm9tdmi
 tune-$(CONFIG_CPU_SA110)	:=-mtune=strongarm110
 tune-$(CONFIG_CPU_SA1100)	:=-mtune=strongarm1100
 tune-$(CONFIG_CPU_XSCALE)	:=$(call cc-option,-mtune=xscale,-mtune=strongarm110) -Wa,-mcpu=xscale
diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S
index 77d61423..def0248 100644
--- a/arch/arm/boot/compressed/head.S
+++ b/arch/arm/boot/compressed/head.S
@@ -459,6 +459,20 @@
 		mcr	p15, 0, r0, c7, c5, 4	@ ISB
 		mov	pc, r12
 
+__fa526_cache_on:
+		mov	r12, lr
+		bl	__setup_mmu
+		mov	r0, #0
+		mcr	p15, 0, r0, c7, c7, 0	@ Invalidate whole cache
+		mcr	p15, 0, r0, c7, c10, 4	@ drain write buffer
+		mcr	p15, 0, r0, c8, c7, 0	@ flush UTLB
+		mrc	p15, 0, r0, c1, c0, 0	@ read control reg
+		orr	r0, r0, #0x1000		@ I-cache enable
+		bl	__common_mmu_cache_on
+		mov	r0, #0
+		mcr	p15, 0, r0, c8, c7, 0	@ flush UTLB
+		mov	pc, r12
+
 __arm6_mmu_cache_on:
 		mov	r12, lr
 		bl	__setup_mmu
@@ -636,6 +650,12 @@
 		b	__armv4_mmu_cache_off
 		b	__armv5tej_mmu_cache_flush
 
+		.word	0x66015261		@ FA526
+		.word	0xff01fff1
+		b	__fa526_cache_on
+		b	__armv4_mmu_cache_off
+		b	__fa526_cache_flush
+
 		@ These match on the architecture ID
 
 		.word	0x00020000		@ ARMv4T
@@ -775,6 +795,12 @@
 		mcr	p15, 0, ip, c7, c10, 4	@ drain WB
 		mov	pc, lr
 		
+__fa526_cache_flush:
+		mov	r1, #0
+		mcr	p15, 0, r1, c7, c14, 0	@ clean and invalidate D cache
+		mcr	p15, 0, r1, c7, c5, 0	@ flush I cache
+		mcr	p15, 0, r1, c7, c10, 4	@ drain WB
+		mov	pc, lr
 
 __armv6_mmu_cache_flush:
 		mov	r1, #0
diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index 6cbd8fd..a6b8b90 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -46,6 +46,14 @@
 # define MULTI_CACHE 1
 #endif
 
+#if defined(CONFIG_CPU_FA526)
+# ifdef _CACHE
+#  define MULTI_CACHE 1
+# else
+#  define _CACHE fa
+# endif
+#endif
+
 #if defined(CONFIG_CPU_ARM926T)
 # ifdef _CACHE
 #  define MULTI_CACHE 1
diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h
index f341c9d..e6eb8a6 100644
--- a/arch/arm/include/asm/page.h
+++ b/arch/arm/include/asm/page.h
@@ -76,6 +76,14 @@
 # endif
 #endif
 
+#ifdef CONFIG_CPU_COPY_FA
+# ifdef _USER
+#  define MULTI_USER 1
+# else
+#  define _USER fa
+# endif
+#endif
+
 #ifdef CONFIG_CPU_SA1100
 # ifdef _USER
 #  define MULTI_USER 1
diff --git a/arch/arm/include/asm/proc-fns.h b/arch/arm/include/asm/proc-fns.h
index db80203..0094928 100644
--- a/arch/arm/include/asm/proc-fns.h
+++ b/arch/arm/include/asm/proc-fns.h
@@ -89,6 +89,14 @@
 #   define CPU_NAME cpu_arm922
 #  endif
 # endif
+# ifdef CONFIG_CPU_FA526
+#  ifdef CPU_NAME
+#   undef  MULTI_CPU
+#   define MULTI_CPU
+#  else
+#   define CPU_NAME cpu_fa526
+#  endif
+# endif
 # ifdef CONFIG_CPU_ARM925T
 #  ifdef CPU_NAME
 #   undef  MULTI_CPU
diff --git a/arch/arm/include/asm/system.h b/arch/arm/include/asm/system.h
index 811be55..d6a4dad 100644
--- a/arch/arm/include/asm/system.h
+++ b/arch/arm/include/asm/system.h
@@ -125,6 +125,12 @@
 				    : : "r" (0) : "memory")
 #define dmb() __asm__ __volatile__ ("mcr p15, 0, %0, c7, c10, 5" \
 				    : : "r" (0) : "memory")
+#elif defined(CONFIG_CPU_FA526)
+#define isb() __asm__ __volatile__ ("mcr p15, 0, %0, c7, c5, 4" \
+				    : : "r" (0) : "memory")
+#define dsb() __asm__ __volatile__ ("mcr p15, 0, %0, c7, c10, 4" \
+				    : : "r" (0) : "memory")
+#define dmb() __asm__ __volatile__ ("" : : : "memory")
 #else
 #define isb() __asm__ __volatile__ ("" : : : "memory")
 #define dsb() __asm__ __volatile__ ("mcr p15, 0, %0, c7, c10, 4" \
diff --git a/arch/arm/include/asm/tlbflush.h b/arch/arm/include/asm/tlbflush.h
index ffedd24..a622180 100644
--- a/arch/arm/include/asm/tlbflush.h
+++ b/arch/arm/include/asm/tlbflush.h
@@ -54,6 +54,7 @@
  *	  v4wb  - ARMv4 with write buffer without I TLB flush entry instruction
  *	  v4wbi - ARMv4 with write buffer with I TLB flush entry instruction
  *	  fr    - Feroceon (v4wbi with non-outer-cacheable page table walks)
+ *	  fa    - Faraday (v4 with write buffer with UTLB and branch target buffer (BTB))
  *	  v6wbi - ARMv6 with write buffer with I TLB flush entry instruction
  *	  v7wbi - identical to v6wbi
  */
@@ -90,6 +91,22 @@
 # define v4_always_flags	(-1UL)
 #endif
 
+#define fa_tlb_flags	(TLB_WB | TLB_BTB | TLB_DCLEAN | \
+			 TLB_V4_U_FULL | TLB_V4_U_PAGE)
+
+#ifdef CONFIG_CPU_TLB_FA
+# define fa_possible_flags	fa_tlb_flags
+# define fa_always_flags	fa_tlb_flags
+# ifdef _TLB
+#  define MULTI_TLB 1
+# else
+#  define _TLB fa
+# endif
+#else
+# define fa_possible_flags	0
+# define fa_always_flags	(-1UL)
+#endif
+
 #define v4wbi_tlb_flags	(TLB_WB | TLB_DCLEAN | \
 			 TLB_V4_I_FULL | TLB_V4_D_FULL | \
 			 TLB_V4_I_PAGE | TLB_V4_D_PAGE)
@@ -268,6 +285,7 @@
 				 v4wbi_possible_flags | \
 				 fr_possible_flags | \
 				 v4wb_possible_flags | \
+				 fa_possible_flags | \
 				 v6wbi_possible_flags | \
 				 v7wbi_possible_flags)
 
@@ -276,6 +294,7 @@
 				 v4wbi_always_flags & \
 				 fr_always_flags & \
 				 v4wb_always_flags & \
+				 fa_always_flags & \
 				 v6wbi_always_flags & \
 				 v7wbi_always_flags)
 
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index d490f37..bc333186 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -186,6 +186,24 @@
 	  Say Y if you want support for the ARM926T processor.
 	  Otherwise, say N.
 
+# FA526
+config CPU_FA526
+	bool
+	select CPU_32v4
+	select CPU_ABRT_EV4
+	select CPU_PABRT_NOIFAR
+	select CPU_CACHE_VIVT
+	select CPU_CP15_MMU
+	select CPU_CACHE_FA
+	select CPU_COPY_FA if MMU
+	select CPU_TLB_FA if MMU
+	help
+	  The FA526 is a version of the ARMv4 compatible processor with
+	  Branch Target Buffer, Unified TLB and cache line size 16.
+
+	  Say Y if you want support for the FA526 processor.
+	  Otherwise, say N.
+
 # ARM940T
 config CPU_ARM940T
 	bool "Support ARM940T processor" if ARCH_INTEGRATOR
@@ -484,6 +502,9 @@
 config CPU_CACHE_VIPT
 	bool
 
+config CPU_CACHE_FA
+	bool
+
 if MMU
 # The copy-page model
 config CPU_COPY_V3
@@ -498,6 +519,9 @@
 config CPU_COPY_FEROCEON
 	bool
 
+config CPU_COPY_FA
+	bool
+
 config CPU_COPY_V6
 	bool
 
@@ -528,6 +552,13 @@
 	help
 	  Feroceon TLB (v4wbi with non-outer-cachable page table walks).
 
+config CPU_TLB_FA
+	bool
+	help
+	  Faraday ARM FA526 architecture, unified TLB with writeback cache
+	  and invalidate instruction cache entry. Branch target buffer is
+	  also supported.
+
 config CPU_TLB_V6
 	bool
 
@@ -638,7 +669,7 @@
 
 config CPU_DCACHE_WRITETHROUGH
 	bool "Force write through D-cache"
-	depends on (CPU_ARM740T || CPU_ARM920T || CPU_ARM922T || CPU_ARM925T || CPU_ARM926T || CPU_ARM940T || CPU_ARM946E || CPU_ARM1020) && !CPU_DCACHE_DISABLE
+	depends on (CPU_ARM740T || CPU_ARM920T || CPU_ARM922T || CPU_ARM925T || CPU_ARM926T || CPU_ARM940T || CPU_ARM946E || CPU_ARM1020 || CPU_FA526) && !CPU_DCACHE_DISABLE
 	default y if CPU_ARM925T
 	help
 	  Say Y here to use the data cache in writethrough mode. Unless you
@@ -653,7 +684,7 @@
 
 config CPU_BPREDICT_DISABLE
 	bool "Disable branch prediction"
-	depends on CPU_ARM1020 || CPU_V6 || CPU_XSC3 || CPU_V7
+	depends on CPU_ARM1020 || CPU_V6 || CPU_XSC3 || CPU_V7 || CPU_FA526
 	help
 	  Say Y here to disable branch prediction.  If unsure, say N.
 
diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile
index 480f78a..40f941c 100644
--- a/arch/arm/mm/Makefile
+++ b/arch/arm/mm/Makefile
@@ -32,6 +32,7 @@
 obj-$(CONFIG_CPU_CACHE_V4WB)	+= cache-v4wb.o
 obj-$(CONFIG_CPU_CACHE_V6)	+= cache-v6.o
 obj-$(CONFIG_CPU_CACHE_V7)	+= cache-v7.o
+obj-$(CONFIG_CPU_CACHE_FA)	+= cache-fa.o
 
 obj-$(CONFIG_CPU_COPY_V3)	+= copypage-v3.o
 obj-$(CONFIG_CPU_COPY_V4WT)	+= copypage-v4wt.o
@@ -41,6 +42,7 @@
 obj-$(CONFIG_CPU_SA1100)	+= copypage-v4mc.o
 obj-$(CONFIG_CPU_XSCALE)	+= copypage-xscale.o
 obj-$(CONFIG_CPU_XSC3)		+= copypage-xsc3.o
+obj-$(CONFIG_CPU_COPY_FA)	+= copypage-fa.o
 
 obj-$(CONFIG_CPU_TLB_V3)	+= tlb-v3.o
 obj-$(CONFIG_CPU_TLB_V4WT)	+= tlb-v4.o
@@ -49,6 +51,7 @@
 obj-$(CONFIG_CPU_TLB_FEROCEON)	+= tlb-v4wbi.o	# reuse v4wbi TLB functions
 obj-$(CONFIG_CPU_TLB_V6)	+= tlb-v6.o
 obj-$(CONFIG_CPU_TLB_V7)	+= tlb-v7.o
+obj-$(CONFIG_CPU_TLB_FA)	+= tlb-fa.o
 
 obj-$(CONFIG_CPU_ARM610)	+= proc-arm6_7.o
 obj-$(CONFIG_CPU_ARM710)	+= proc-arm6_7.o
@@ -62,6 +65,7 @@
 obj-$(CONFIG_CPU_ARM926T)	+= proc-arm926.o
 obj-$(CONFIG_CPU_ARM940T)	+= proc-arm940.o
 obj-$(CONFIG_CPU_ARM946E)	+= proc-arm946.o
+obj-$(CONFIG_CPU_FA526)		+= proc-fa526.o
 obj-$(CONFIG_CPU_ARM1020)	+= proc-arm1020.o
 obj-$(CONFIG_CPU_ARM1020E)	+= proc-arm1020e.o
 obj-$(CONFIG_CPU_ARM1022)	+= proc-arm1022.o
diff --git a/arch/arm/mm/cache-fa.S b/arch/arm/mm/cache-fa.S
new file mode 100644
index 0000000..b63a8f7
--- /dev/null
+++ b/arch/arm/mm/cache-fa.S
@@ -0,0 +1,220 @@
+/*
+ *  linux/arch/arm/mm/cache-fa.S
+ *
+ *  Copyright (C) 2005 Faraday Corp.
+ *  Copyright (C) 2008-2009 Paulius Zaleckas <paulius.zaleckas@teltonika.lt>
+ *
+ * Based on cache-v4wb.S:
+ *  Copyright (C) 1997-2002 Russell king
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *  Processors: FA520 FA526 FA626	
+ */
+#include <linux/linkage.h>
+#include <linux/init.h>
+#include <asm/memory.h>
+#include <asm/page.h>
+
+#include "proc-macros.S"
+
+/*
+ * The size of one data cache line.
+ */
+#define CACHE_DLINESIZE	16
+
+/*
+ * The total size of the data cache.
+ */
+#ifdef CONFIG_ARCH_GEMINI
+#define CACHE_DSIZE	8192
+#else
+#define CACHE_DSIZE	16384 
+#endif 
+
+/* FIXME: put optimal value here. Current one is just estimation */
+#define CACHE_DLIMIT	(CACHE_DSIZE * 2)
+
+/*
+ *	flush_user_cache_all()
+ *
+ *	Clean and invalidate all cache entries in a particular address
+ *	space.
+ */
+ENTRY(fa_flush_user_cache_all)
+	/* FALLTHROUGH */
+/*
+ *	flush_kern_cache_all()
+ *
+ *	Clean and invalidate the entire cache.
+ */
+ENTRY(fa_flush_kern_cache_all)
+	mov	ip, #0
+	mov	r2, #VM_EXEC
+__flush_whole_cache:
+	mcr	p15, 0, ip, c7, c14, 0		@ clean/invalidate D cache
+	tst	r2, #VM_EXEC
+	mcrne	p15, 0, ip, c7, c5, 0		@ invalidate I cache
+	mcrne	p15, 0, ip, c7, c5, 6		@ invalidate BTB
+	mcrne	p15, 0, ip, c7, c10, 4		@ drain write buffer
+	mcrne	p15, 0, ip, c7, c5, 4		@ prefetch flush
+	mov	pc, lr
+
+/*
+ *	flush_user_cache_range(start, end, flags)
+ *
+ *	Invalidate a range of cache entries in the specified
+ *	address space.
+ *
+ *	- start - start address (inclusive, page aligned)
+ *	- end	- end address (exclusive, page aligned)
+ *	- flags	- vma_area_struct flags describing address space
+ */
+ENTRY(fa_flush_user_cache_range)
+	mov	ip, #0
+	sub	r3, r1, r0			@ calculate total size
+	cmp	r3, #CACHE_DLIMIT		@ total size >= limit?
+	bhs	__flush_whole_cache		@ flush whole D cache
+
+1:	tst	r2, #VM_EXEC
+	mcrne	p15, 0, r0, c7, c5, 1		@ invalidate I line
+	mcr	p15, 0, r0, c7, c14, 1		@ clean and invalidate D entry
+	add	r0, r0, #CACHE_DLINESIZE
+	cmp	r0, r1
+	blo	1b
+	tst	r2, #VM_EXEC
+	mcrne	p15, 0, ip, c7, c5, 6		@ invalidate BTB
+	mcrne	p15, 0, ip, c7, c10, 4		@ data write barrier
+	mcrne	p15, 0, ip, c7, c5, 4		@ prefetch flush
+	mov	pc, lr
+
+/*
+ *	coherent_kern_range(start, end)
+ *
+ *	Ensure coherency between the Icache and the Dcache in the
+ *	region described by start.  If you have non-snooping
+ *	Harvard caches, you need to implement this function.
+ *
+ *	- start  - virtual start address
+ *	- end	 - virtual end address
+ */
+ENTRY(fa_coherent_kern_range)
+	/* fall through */
+
+/*
+ *	coherent_user_range(start, end)
+ *
+ *	Ensure coherency between the Icache and the Dcache in the
+ *	region described by start.  If you have non-snooping
+ *	Harvard caches, you need to implement this function.
+ *
+ *	- start  - virtual start address
+ *	- end	 - virtual end address
+ */
+ENTRY(fa_coherent_user_range)
+	bic	r0, r0, #CACHE_DLINESIZE - 1
+1:	mcr	p15, 0, r0, c7, c14, 1		@ clean and invalidate D entry
+	mcr	p15, 0, r0, c7, c5, 1		@ invalidate I entry
+	add	r0, r0, #CACHE_DLINESIZE
+	cmp	r0, r1
+	blo	1b
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c5, 6		@ invalidate BTB
+	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
+	mcr	p15, 0, r0, c7, c5, 4		@ prefetch flush
+	mov	pc, lr
+
+/*
+ *	flush_kern_dcache_page(kaddr)
+ *
+ *	Ensure that the data held in the page kaddr is written back
+ *	to the page in question.
+ *
+ *	- kaddr   - kernel address (guaranteed to be page aligned)
+ */
+ENTRY(fa_flush_kern_dcache_page)
+	add	r1, r0, #PAGE_SZ
+1:	mcr	p15, 0, r0, c7, c14, 1		@ clean & invalidate D line
+	add	r0, r0, #CACHE_DLINESIZE
+	cmp	r0, r1
+	blo	1b
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c5, 0		@ invalidate I cache
+	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
+	mov	pc, lr
+
+/*
+ *	dma_inv_range(start, end)
+ *
+ *	Invalidate (discard) the specified virtual address range.
+ *	May not write back any entries.  If 'start' or 'end'
+ *	are not cache line aligned, those lines must be written
+ *	back.
+ *
+ *	- start  - virtual start address
+ *	- end	 - virtual end address
+ */
+ENTRY(fa_dma_inv_range)
+	tst	r0, #CACHE_DLINESIZE - 1
+	bic	r0, r0, #CACHE_DLINESIZE - 1
+	mcrne	p15, 0, r0, c7, c14, 1		@ clean & invalidate D entry
+	tst	r1, #CACHE_DLINESIZE - 1
+	bic	r1, r1, #CACHE_DLINESIZE - 1
+	mcrne	p15, 0, r1, c7, c14, 1		@ clean & invalidate D entry
+1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
+	add	r0, r0, #CACHE_DLINESIZE
+	cmp	r0, r1
+	blo	1b
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
+	mov	pc, lr
+
+/*
+ *	dma_clean_range(start, end)
+ *
+ *	Clean (write back) the specified virtual address range.
+ *
+ *	- start  - virtual start address
+ *	- end	 - virtual end address
+ */
+ENTRY(fa_dma_clean_range)
+	bic	r0, r0, #CACHE_DLINESIZE - 1
+1:	mcr	p15, 0, r0, c7, c10, 1		@ clean D entry
+	add	r0, r0, #CACHE_DLINESIZE
+	cmp	r0, r1
+	blo	1b
+	mov	r0, #0	
+	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
+	mov	pc, lr
+
+/*
+ *	dma_flush_range(start,end)
+ *	- start   - virtual start address of region
+ *	- end     - virtual end address of region
+ */
+ENTRY(fa_dma_flush_range)
+	bic	r0, r0, #CACHE_DLINESIZE - 1
+1:	mcr	p15, 0, r0, c7, c14, 1		@ clean & invalidate D entry
+	add	r0, r0, #CACHE_DLINESIZE
+	cmp	r0, r1
+	blo	1b
+	mov	r0, #0	
+	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
+	mov	pc, lr
+
+	__INITDATA
+
+	.type	fa_cache_fns, #object
+ENTRY(fa_cache_fns)
+	.long	fa_flush_kern_cache_all
+	.long	fa_flush_user_cache_all
+	.long	fa_flush_user_cache_range
+	.long	fa_coherent_kern_range
+	.long	fa_coherent_user_range
+	.long	fa_flush_kern_dcache_page
+	.long	fa_dma_inv_range
+	.long	fa_dma_clean_range
+	.long	fa_dma_flush_range
+	.size	fa_cache_fns, . - fa_cache_fns
diff --git a/arch/arm/mm/copypage-fa.c b/arch/arm/mm/copypage-fa.c
new file mode 100644
index 0000000..b2a6008
--- /dev/null
+++ b/arch/arm/mm/copypage-fa.c
@@ -0,0 +1,86 @@
+/*
+ *  linux/arch/arm/lib/copypage-fa.S
+ *
+ *  Copyright (C) 2005 Faraday Corp.
+ *  Copyright (C) 2008-2009 Paulius Zaleckas <paulius.zaleckas@teltonika.lt>
+ *
+ * Based on copypage-v4wb.S:
+ *  Copyright (C) 1995-1999 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/init.h>
+#include <linux/highmem.h>
+
+/*
+ * Faraday optimised copy_user_page
+ */
+static void __naked
+fa_copy_user_page(void *kto, const void *kfrom)
+{
+	asm("\
+	stmfd	sp!, {r4, lr}			@ 2\n\
+	mov	r2, %0				@ 1\n\
+1:	ldmia	r1!, {r3, r4, ip, lr}		@ 4\n\
+	stmia	r0, {r3, r4, ip, lr}		@ 4\n\
+	mcr	p15, 0, r0, c7, c14, 1		@ 1   clean and invalidate D line\n\
+	add	r0, r0, #16			@ 1\n\
+	ldmia	r1!, {r3, r4, ip, lr}		@ 4\n\
+	stmia	r0, {r3, r4, ip, lr}		@ 4\n\
+	mcr	p15, 0, r0, c7, c14, 1		@ 1   clean and invalidate D line\n\
+	add	r0, r0, #16			@ 1\n\
+	subs	r2, r2, #1			@ 1\n\
+	bne	1b				@ 1\n\
+	mcr	p15, 0, r2, c7, c10, 4		@ 1   drain WB\n\
+	ldmfd	sp!, {r4, pc}			@ 3"
+	:
+	: "I" (PAGE_SIZE / 32));
+}
+
+void fa_copy_user_highpage(struct page *to, struct page *from,
+	unsigned long vaddr)
+{
+	void *kto, *kfrom;
+
+	kto = kmap_atomic(to, KM_USER0);
+	kfrom = kmap_atomic(from, KM_USER1);
+	fa_copy_user_page(kto, kfrom);
+	kunmap_atomic(kfrom, KM_USER1);
+	kunmap_atomic(kto, KM_USER0);
+}
+
+/*
+ * Faraday optimised clear_user_page
+ *
+ * Same story as above.
+ */
+void fa_clear_user_highpage(struct page *page, unsigned long vaddr)
+{
+	void *ptr, *kaddr = kmap_atomic(page, KM_USER0);
+	asm volatile("\
+	mov	r1, %2				@ 1\n\
+	mov	r2, #0				@ 1\n\
+	mov	r3, #0				@ 1\n\
+	mov	ip, #0				@ 1\n\
+	mov	lr, #0				@ 1\n\
+1:	stmia	%0, {r2, r3, ip, lr}		@ 4\n\
+	mcr	p15, 0, %0, c7, c14, 1		@ 1   clean and invalidate D line\n\
+	add	%0, %0, #16			@ 1\n\
+	stmia	%0, {r2, r3, ip, lr}		@ 4\n\
+	mcr	p15, 0, %0, c7, c14, 1		@ 1   clean and invalidate D line\n\
+	add	%0, %0, #16			@ 1\n\
+	subs	r1, r1, #1			@ 1\n\
+	bne	1b				@ 1\n\
+	mcr	p15, 0, r1, c7, c10, 4		@ 1   drain WB"
+	: "=r" (ptr)
+	: "0" (kaddr), "I" (PAGE_SIZE / 32)
+	: "r1", "r2", "r3", "ip", "lr");
+	kunmap_atomic(kaddr, KM_USER0);
+}
+
+struct cpu_user_fns fa_user_fns __initdata = {
+	.cpu_clear_user_highpage = fa_clear_user_highpage,
+	.cpu_copy_user_highpage	= fa_copy_user_highpage,
+};
diff --git a/arch/arm/mm/proc-fa526.S b/arch/arm/mm/proc-fa526.S
new file mode 100644
index 0000000..08b8a95
--- /dev/null
+++ b/arch/arm/mm/proc-fa526.S
@@ -0,0 +1,248 @@
+/*
+ *  linux/arch/arm/mm/proc-fa526.S: MMU functions for FA526
+ *
+ *  Written by : Luke Lee
+ *  Copyright (C) 2005 Faraday Corp.
+ *  Copyright (C) 2008-2009 Paulius Zaleckas <paulius.zaleckas@teltonika.lt>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ *
+ * These are the low level assembler for performing cache and TLB
+ * functions on the fa526.
+ */
+#include <linux/linkage.h>
+#include <linux/init.h>
+#include <asm/assembler.h>
+#include <asm/hwcap.h>
+#include <asm/pgtable-hwdef.h>
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/ptrace.h>
+#include <asm/system.h>
+
+#include "proc-macros.S"
+
+#define CACHE_DLINESIZE	16
+
+	.text
+/*
+ * cpu_fa526_proc_init()
+ */
+ENTRY(cpu_fa526_proc_init)
+	mov	pc, lr
+
+/*
+ * cpu_fa526_proc_fin()
+ */
+ENTRY(cpu_fa526_proc_fin)
+	stmfd	sp!, {lr}
+	mov	ip, #PSR_F_BIT | PSR_I_BIT | SVC_MODE
+	msr	cpsr_c, ip
+	bl	fa_flush_kern_cache_all
+	mrc	p15, 0, r0, c1, c0, 0		@ ctrl register
+	bic	r0, r0, #0x1000			@ ...i............
+	bic	r0, r0, #0x000e			@ ............wca.
+	mcr	p15, 0, r0, c1, c0, 0		@ disable caches
+	nop
+	nop
+	ldmfd	sp!, {pc}
+
+/*
+ * cpu_fa526_reset(loc)
+ *
+ * Perform a soft reset of the system.  Put the CPU into the
+ * same state as it would be if it had been reset, and branch
+ * to what would be the reset vector.
+ *
+ * loc: location to jump to for soft reset
+ */
+	.align	4
+ENTRY(cpu_fa526_reset)
+/* TODO: Use CP8 if possible... */
+	mov	ip, #0
+	mcr	p15, 0, ip, c7, c7, 0		@ invalidate I,D caches
+	mcr	p15, 0, ip, c7, c10, 4		@ drain WB
+#ifdef CONFIG_MMU
+	mcr	p15, 0, ip, c8, c7, 0		@ invalidate I & D TLBs
+#endif
+	mrc	p15, 0, ip, c1, c0, 0		@ ctrl register
+	bic	ip, ip, #0x000f			@ ............wcam
+	bic	ip, ip, #0x1100			@ ...i...s........
+	bic	ip, ip, #0x0800			@ BTB off
+	mcr	p15, 0, ip, c1, c0, 0		@ ctrl register
+	nop
+	nop
+	mov	pc, r0
+
+/*
+ * cpu_fa526_do_idle()
+ */
+	.align	4
+ENTRY(cpu_fa526_do_idle)
+	mcr	p15, 0, r0, c7, c0, 4		@ Wait for interrupt
+	mov	pc, lr
+
+
+ENTRY(cpu_fa526_dcache_clean_area)
+1:	mcr	p15, 0, r0, c7, c10, 1		@ clean D entry
+	add	r0, r0, #CACHE_DLINESIZE
+	subs	r1, r1, #CACHE_DLINESIZE
+	bhi	1b
+	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
+	mov	pc, lr
+
+/* =============================== PageTable ============================== */
+
+/*
+ * cpu_fa526_switch_mm(pgd)
+ *
+ * Set the translation base pointer to be as described by pgd.
+ *
+ * pgd: new page tables
+ */
+	.align	4
+ENTRY(cpu_fa526_switch_mm)
+#ifdef CONFIG_MMU
+	mov	ip, #0
+#ifdef CONFIG_CPU_DCACHE_WRITETHROUGH
+	mcr	p15, 0, ip, c7, c6, 0		@ invalidate D cache
+#else
+	mcr	p15, 0, ip, c7, c14, 0		@ clean and invalidate whole D cache
+#endif
+	mcr	p15, 0, ip, c7, c5, 0		@ invalidate I cache
+	mcr	p15, 0, ip, c7, c5, 6		@ invalidate BTB since mm changed
+	mcr	p15, 0, ip, c7, c10, 4		@ data write barrier
+	mcr	p15, 0, ip, c7, c5, 4		@ prefetch flush
+	mcr	p15, 0, r0, c2, c0, 0		@ load page table pointer
+	mcr	p15, 0, ip, c8, c7, 0		@ invalidate UTLB
+#endif
+	mov	pc, lr
+
+/*
+ * cpu_fa526_set_pte_ext(ptep, pte, ext)
+ *
+ * Set a PTE and flush it out
+ */
+	.align	4
+ENTRY(cpu_fa526_set_pte_ext)
+#ifdef CONFIG_MMU
+	armv3_set_pte_ext
+	mov	r0, r0
+	mcr	p15, 0, r0, c7, c10, 1		@ clean D entry
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
+#endif
+	mov	pc, lr
+
+	__INIT
+
+	.type	__fa526_setup, #function
+__fa526_setup:
+	/* On return of this routine, r0 must carry correct flags for CFG register */
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c7		@ invalidate I,D caches on v4
+	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer on v4
+#ifdef CONFIG_MMU
+	mcr	p15, 0, r0, c8, c7		@ invalidate I,D TLBs on v4
+#endif
+	mcr	p15, 0, r0, c7, c5, 5		@ invalidate IScratchpad RAM
+
+	mov	r0, #1
+	mcr	p15, 0, r0, c1, c1, 0		@ turn-on ECR
+
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c5, 6		@ invalidate BTB All
+	mcr	p15, 0, r0, c7, c10, 4		@ data write barrier
+	mcr	p15, 0, r0, c7, c5, 4		@ prefetch flush
+
+	mov	r0, #0x1f			@ Domains 0, 1 = manager, 2 = client
+	mcr	p15, 0, r0, c3, c0		@ load domain access register
+
+	mrc	p15, 0, r0, c1, c0		@ get control register v4
+	ldr	r5, fa526_cr1_clear
+	bic	r0, r0, r5
+	ldr	r5, fa526_cr1_set
+	orr	r0, r0, r5
+	mov	pc, lr
+	.size	__fa526_setup, . - __fa526_setup
+
+	/*
+	 * .RVI ZFRS BLDP WCAM
+	 * ..11 1001 .111 1101
+	 *
+	 */
+	.type	fa526_cr1_clear, #object
+	.type	fa526_cr1_set, #object
+fa526_cr1_clear:
+	.word	0x3f3f
+fa526_cr1_set:
+	.word	0x397D
+
+	__INITDATA
+
+/*
+ * Purpose : Function pointers used to access above functions - all calls
+ *	     come through these
+ */
+	.type	fa526_processor_functions, #object
+fa526_processor_functions:
+	.word	v4_early_abort
+	.word	pabort_noifar
+	.word	cpu_fa526_proc_init
+	.word	cpu_fa526_proc_fin
+	.word	cpu_fa526_reset
+	.word   cpu_fa526_do_idle
+	.word	cpu_fa526_dcache_clean_area
+	.word	cpu_fa526_switch_mm
+	.word	cpu_fa526_set_pte_ext
+	.size	fa526_processor_functions, . - fa526_processor_functions
+
+	.section ".rodata"
+
+	.type	cpu_arch_name, #object
+cpu_arch_name:
+	.asciz	"armv4"
+	.size	cpu_arch_name, . - cpu_arch_name
+
+	.type	cpu_elf_name, #object
+cpu_elf_name:
+	.asciz	"v4"
+	.size	cpu_elf_name, . - cpu_elf_name
+
+	.type	cpu_fa526_name, #object
+cpu_fa526_name:
+	.asciz	"FA526"
+	.size	cpu_fa526_name, . - cpu_fa526_name
+
+	.align
+
+	.section ".proc.info.init", #alloc, #execinstr
+
+	.type	__fa526_proc_info,#object
+__fa526_proc_info:
+	.long	0x66015261
+	.long	0xff01fff1
+	.long   PMD_TYPE_SECT | \
+		PMD_SECT_BUFFERABLE | \
+		PMD_SECT_CACHEABLE | \
+		PMD_BIT4 | \
+		PMD_SECT_AP_WRITE | \
+		PMD_SECT_AP_READ
+	.long   PMD_TYPE_SECT | \
+		PMD_BIT4 | \
+		PMD_SECT_AP_WRITE | \
+		PMD_SECT_AP_READ
+	b	__fa526_setup
+	.long	cpu_arch_name
+	.long	cpu_elf_name
+	.long	HWCAP_SWP | HWCAP_HALF
+	.long	cpu_fa526_name
+	.long	fa526_processor_functions
+	.long	fa_tlb_fns
+	.long	fa_user_fns
+	.long	fa_cache_fns
+	.size	__fa526_proc_info, . - __fa526_proc_info
diff --git a/arch/arm/mm/tlb-fa.S b/arch/arm/mm/tlb-fa.S
new file mode 100644
index 0000000..9694f1f
--- /dev/null
+++ b/arch/arm/mm/tlb-fa.S
@@ -0,0 +1,75 @@
+/*
+ *  linux/arch/arm/mm/tlb-fa.S
+ *
+ *  Copyright (C) 2005 Faraday Corp.
+ *  Copyright (C) 2008-2009 Paulius Zaleckas <paulius.zaleckas@teltonika.lt>
+ *
+ * Based on tlb-v4wbi.S:
+ *  Copyright (C) 1997-2002 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *  ARM architecture version 4, Faraday variation.
+ *  This assume an unified TLBs, with a write buffer, and branch target buffer (BTB)
+ *
+ *  Processors: FA520 FA526 FA626
+ */
+#include <linux/linkage.h>
+#include <linux/init.h>
+#include <asm/asm-offsets.h>
+#include <asm/tlbflush.h>
+#include "proc-macros.S"
+
+
+/*
+ *	flush_user_tlb_range(start, end, mm)
+ *
+ *	Invalidate a range of TLB entries in the specified address space.
+ *
+ *	- start - range start address
+ *	- end   - range end address
+ *	- mm    - mm_struct describing address space
+ */
+	.align	4
+ENTRY(fa_flush_user_tlb_range)
+	vma_vm_mm ip, r2
+	act_mm	r3				@ get current->active_mm
+	eors	r3, ip, r3			@ == mm ?
+	movne	pc, lr				@ no, we dont do anything
+	mov	r3, #0
+	mcr	p15, 0, r3, c7, c10, 4		@ drain WB
+	bic	r0, r0, #0x0ff
+	bic	r0, r0, #0xf00
+1:	mcr	p15, 0, r0, c8, c7, 1		@ invalidate UTLB entry
+	add	r0, r0, #PAGE_SZ
+	cmp	r0, r1
+	blo	1b
+	mcr	p15, 0, r3, c7, c5, 6		@ invalidate BTB
+	mcr	p15, 0, r3, c7, c10, 4		@ data write barrier
+	mov	pc, lr
+
+
+ENTRY(fa_flush_kern_tlb_range)
+	mov	r3, #0
+	mcr	p15, 0, r3, c7, c10, 4		@ drain WB
+	bic	r0, r0, #0x0ff
+	bic	r0, r0, #0xf00
+1:	mcr	p15, 0, r0, c8, c7, 1		@ invalidate UTLB entry
+	add	r0, r0, #PAGE_SZ
+	cmp	r0, r1
+	blo	1b
+	mcr	p15, 0, r3, c7, c5, 6		@ invalidate BTB
+	mcr	p15, 0, r3, c7, c10, 4		@ data write barrier
+	mcr	p15, 0, r3, c7, c5, 4		@ prefetch flush
+	mov	pc, lr
+
+	__INITDATA
+
+	.type	fa_tlb_fns, #object
+ENTRY(fa_tlb_fns)
+	.long	fa_flush_user_tlb_range
+	.long	fa_flush_kern_tlb_range
+	.long	fa_tlb_flags
+	.size	fa_tlb_fns, . - fa_tlb_fns