ARM: P2V: extend to 16-bit translation offsets

MSM's memory is aligned to 2MB, which is more than we can do with our
existing method as we're limited to the upper 8 bits.  Extend this by
using two instructions to 16 bits, automatically selected when MSM is
enabled.

Acked-by: Tony Lindgren <tony@atomide.com>
Reviewed-by: Nicolas Pitre <nicolas.pitre@linaro.org>
Tested-by: Nicolas Pitre <nicolas.pitre@linaro.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 4147f76..b357c29 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -195,7 +195,6 @@
 	bool "Patch physical to virtual translations at runtime (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
 	depends on !XIP_KERNEL && !THUMB2_KERNEL && MMU
-	depends on !ARCH_MSM
 	depends on !ARCH_REALVIEW || !SPARSEMEM
 	help
 	  Patch phys-to-virt translation functions at runtime according to
@@ -204,6 +203,10 @@
 	  This can only be used with non-XIP, non-Thumb2, MMU kernels where
 	  the base of physical memory is at a 16MB boundary.
 
+config ARM_PATCH_PHYS_VIRT_16BIT
+	def_bool y
+	depends on ARM_PATCH_PHYS_VIRT && ARCH_MSM
+
 source "init/Kconfig"
 
 source "kernel/Kconfig.freezer"
diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h
index 7197879..2398b3f 100644
--- a/arch/arm/include/asm/memory.h
+++ b/arch/arm/include/asm/memory.h
@@ -154,29 +154,42 @@
 #ifndef __virt_to_phys
 #ifdef CONFIG_ARM_PATCH_PHYS_VIRT
 
+/*
+ * Constants used to force the right instruction encodings and shifts
+ * so that all we need to do is modify the 8-bit constant field.
+ */
+#define __PV_BITS_31_24	0x81000000
+#define __PV_BITS_23_16	0x00810000
+
 extern unsigned long __pv_phys_offset;
 #define PHYS_OFFSET __pv_phys_offset
 
-#define __pv_stub(from,to,instr)			\
+#define __pv_stub(from,to,instr,type)			\
 	__asm__("@ __pv_stub\n"				\
 	"1:	" instr "	%0, %1, %2\n"		\
 	"	.pushsection .pv_table,\"a\"\n"		\
 	"	.long	1b\n"				\
 	"	.popsection\n"				\
 	: "=r" (to)					\
-	: "r" (from), "I" (0x81000000))
+	: "r" (from), "I" (type))
 
 static inline unsigned long __virt_to_phys(unsigned long x)
 {
 	unsigned long t;
-	__pv_stub(x, t, "add");
+	__pv_stub(x, t, "add", __PV_BITS_31_24);
+#ifdef CONFIG_ARM_PATCH_PHYS_VIRT_16BIT
+	__pv_stub(t, t, "add", __PV_BITS_23_16);
+#endif
 	return t;
 }
 
 static inline unsigned long __phys_to_virt(unsigned long x)
 {
 	unsigned long t;
-	__pv_stub(x, t, "sub");
+	__pv_stub(x, t, "sub", __PV_BITS_31_24);
+#ifdef CONFIG_ARM_PATCH_PHYS_VIRT_16BIT
+	__pv_stub(t, t, "sub", __PV_BITS_23_16);
+#endif
 	return t;
 }
 #else
diff --git a/arch/arm/include/asm/module.h b/arch/arm/include/asm/module.h
index d072c21..a2b775b 100644
--- a/arch/arm/include/asm/module.h
+++ b/arch/arm/include/asm/module.h
@@ -31,7 +31,11 @@
 
 /* Add __virt_to_phys patching state as well */
 #ifdef CONFIG_ARM_PATCH_PHYS_VIRT
+#ifdef CONFIG_ARM_PATCH_PHYS_VIRT_16BIT
+#define MODULE_ARCH_VERMAGIC_P2V "p2v16 "
+#else
 #define MODULE_ARCH_VERMAGIC_P2V "p2v8 "
+#endif
 #else
 #define MODULE_ARCH_VERMAGIC_P2V ""
 #endif
diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S
index 1db8ead..a94dd99 100644
--- a/arch/arm/kernel/head.S
+++ b/arch/arm/kernel/head.S
@@ -456,8 +456,13 @@
 	add	r4, r4, r3	@ adjust table start address
 	add	r5, r5, r3	@ adjust table end address
 	str	r8, [r7, r3]!	@ save computed PHYS_OFFSET to __pv_phys_offset
+#ifndef CONFIG_ARM_PATCH_PHYS_VIRT_16BIT
 	mov	r6, r3, lsr #24	@ constant for add/sub instructions
 	teq	r3, r6, lsl #24 @ must be 16MiB aligned
+#else
+	mov	r6, r3, lsr #16	@ constant for add/sub instructions
+	teq	r3, r6, lsl #16	@ must be 64kiB aligned
+#endif
 	bne	__error
 	str	r6, [r7, #4]	@ save to __pv_offset
 	b	__fixup_a_pv_table
@@ -471,10 +476,18 @@
 
 	.text
 __fixup_a_pv_table:
+#ifdef CONFIG_ARM_PATCH_PHYS_VIRT_16BIT
+	and	r0, r6, #255	@ offset bits 23-16
+	mov	r6, r6, lsr #8	@ offset bits 31-24
+#else
+	mov	r0, #0		@ just in case...
+#endif
 	b	3f
 2:	ldr	ip, [r7, r3]
 	bic	ip, ip, #0x000000ff
-	orr	ip, ip, r6
+	tst	ip, #0x400	@ rotate shift tells us LS or MS byte
+	orrne	ip, ip, r6	@ mask in offset bits 31-24
+	orreq	ip, ip, r0	@ mask in offset bits 23-16
 	str	ip, [r7, r3]
 3:	cmp	r4, r5
 	ldrcc	r7, [r4], #4	@ use branch for delay slot