ARM: 7452/1: delay: allow timer-based delay implementation to be selected

This patch allows a timer-based delay implementation to be selected by
switching the delay routines over to use get_cycles, which is
implemented in terms of read_current_timer. This further allows us to
skip the loop calibration and have a consistent delay function in the
face of core frequency scaling.

To avoid the pain of dealing with memory-mapped counters, this
implementation uses the co-processor interface to the architected timers
when they are available. The previous loop-based implementation is
kept around for CPUs without the architected timers and we retain both
the maximum delay (2ms) and the corresponding conversion factors for
determining the number of loops required for a given interval. Since the
indirection of the timer routines will only work when called from C,
the sa1100 sleep routines are modified to branch to the loop-based delay
functions directly.

Tested-by: Shinya Kuribayashi <shinya.kuribayashi.px@renesas.com>
Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
diff --git a/arch/arm/lib/delay-loop.S b/arch/arm/lib/delay-loop.S
new file mode 100644
index 0000000..36b668d
--- /dev/null
+++ b/arch/arm/lib/delay-loop.S
@@ -0,0 +1,67 @@
+/*
+ *  linux/arch/arm/lib/delay-loop.S
+ *
+ *  Copyright (C) 1995, 1996 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/delay.h>
+		.text
+
+.LC0:		.word	loops_per_jiffy
+.LC1:		.word	UDELAY_MULT
+
+/*
+ * r0  <= 2000
+ * lpj <= 0x01ffffff (max. 3355 bogomips)
+ * HZ  <= 1000
+ */
+
+ENTRY(__loop_udelay)
+		ldr	r2, .LC1		@ r2 = multiplier word stored at .LC1
+		mul	r0, r2, r0		@ r0 = r0 * r2, then fall through
+ENTRY(__loop_const_udelay)			@ 0 <= r0 <= 0x7fffff06
+		mov	r1, #-1			@ r1 = 0xffffffff, source of the rounding masks
+		ldr	r2, .LC0		@ r2 = address of loops_per_jiffy
+		ldr	r2, [r2]		@ max = 0x01ffffff
+		add	r0, r0, r1, lsr #32-14	@ r0 += 0x3fff so the next shift rounds up
+		mov	r0, r0, lsr #14		@ max = 0x0001ffff
+		add	r2, r2, r1, lsr #32-10	@ r2 += 0x3ff so the next shift rounds up
+		mov	r2, r2, lsr #10		@ max = 0x00007fff
+		mul	r0, r2, r0		@ max = 2^32-1
+		add	r0, r0, r1, lsr #32-6	@ r0 += 0x3f so the next shift rounds up
+		movs	r0, r0, lsr #6		@ r0 = loop count, Z set when it is zero
+		moveq	pc, lr			@ zero loops: return without spinning
+
+/*
+ * loops = r0 * HZ * loops_per_jiffy / 1000000
+ */
+
+@ Delay routine: spins until the count in r0 is used up
+ENTRY(__loop_delay)
+		subs	r0, r0, #1		@ one iteration done, flags feed the bhi below
+#if 0
+		movls	pc, lr
+		subs	r0, r0, #1
+		movls	pc, lr
+		subs	r0, r0, #1
+		movls	pc, lr
+		subs	r0, r0, #1
+		movls	pc, lr
+		subs	r0, r0, #1
+		movls	pc, lr
+		subs	r0, r0, #1
+		movls	pc, lr
+		subs	r0, r0, #1
+		movls	pc, lr
+		subs	r0, r0, #1
+#endif
+		bhi	__loop_delay		@ repeat while r0 is nonzero and subs did not borrow
+		mov	pc, lr			@ return to the caller
+ENDPROC(__loop_udelay)
+ENDPROC(__loop_const_udelay)
+ENDPROC(__loop_delay)