[POWERPC] Implement clockevents driver for powerpc

This registers a clock event structure for the decrementer and turns
on CONFIG_GENERIC_CLOCKEVENTS, which means that we now don't need
most of timer_interrupt(), since the work is done in generic code.
For secondary CPUs, their decrementer clockevent is registered when
the CPU comes up (the generic code automatically removes the
clockevent when the CPU goes down).

Signed-off-by: Tony Breeds <tony@bakeyournoodle.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 6819a94..a46f811 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -35,6 +35,9 @@
 config GENERIC_TIME_VSYSCALL
 	def_bool y
 
+config GENERIC_CLOCKEVENTS
+	def_bool y
+
 config GENERIC_HARDIRQS
 	bool
 	default y
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index b24dcba..d30f08f 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -569,6 +569,8 @@
 	if (system_state > SYSTEM_BOOTING)
 		snapshot_timebase();
 
+	secondary_cpu_time_init();
+
 	spin_lock(&call_lock);
 	cpu_set(cpu, cpu_online_map);
 	spin_unlock(&call_lock);
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index e71a0d8..d20947c 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -73,6 +73,7 @@
 
 /* powerpc clocksource/clockevent code */
 
+#include <linux/clockchips.h>
 #include <linux/clocksource.h>
 
 static cycle_t rtc_read(void);
@@ -97,6 +98,27 @@
 	.read         = timebase_read,
 };
 
+#define DECREMENTER_MAX	0x7fffffff
+
+static int decrementer_set_next_event(unsigned long evt,
+				      struct clock_event_device *dev);
+static void decrementer_set_mode(enum clock_event_mode mode,
+				 struct clock_event_device *dev);
+
+static struct clock_event_device decrementer_clockevent = {
+       .name           = "decrementer",
+       .rating         = 200,
+       .shift          = 32,
+       .mult           = 0,	/* To be filled in */
+       .irq            = 0,
+       .set_next_event = decrementer_set_next_event,
+       .set_mode       = decrementer_set_mode,
+       .features       = CLOCK_EVT_FEAT_ONESHOT,
+};
+
+static DEFINE_PER_CPU(struct clock_event_device, decrementers);
+void init_decrementer_clockevent(void);
+
 #ifdef CONFIG_PPC_ISERIES
 static unsigned long __initdata iSeries_recal_titan;
 static signed long __initdata iSeries_recal_tb;
@@ -517,10 +539,12 @@
 void timer_interrupt(struct pt_regs * regs)
 {
 	struct pt_regs *old_regs;
-	int next_dec;
 	int cpu = smp_processor_id();
-	unsigned long ticks;
-	u64 tb_next_jiffy;
+	struct clock_event_device *evt = &per_cpu(decrementers, cpu);
+
+	/* Ensure a positive value is written to the decrementer, or else
+	 * some CPUs will continuue to take decrementer exceptions */
+	set_dec(DECREMENTER_MAX);
 
 #ifdef CONFIG_PPC32
 	if (atomic_read(&ppc_n_lost_interrupts) != 0)
@@ -530,7 +554,6 @@
 	old_regs = set_irq_regs(regs);
 	irq_enter();
 
-	profile_tick(CPU_PROFILING);
 	calculate_steal_time();
 
 #ifdef CONFIG_PPC_ISERIES
@@ -538,44 +561,20 @@
 		get_lppaca()->int_dword.fields.decr_int = 0;
 #endif
 
-	while ((ticks = tb_ticks_since(per_cpu(last_jiffy, cpu)))
-	       >= tb_ticks_per_jiffy) {
-		/* Update last_jiffy */
-		per_cpu(last_jiffy, cpu) += tb_ticks_per_jiffy;
-		/* Handle RTCL overflow on 601 */
-		if (__USE_RTC() && per_cpu(last_jiffy, cpu) >= 1000000000)
-			per_cpu(last_jiffy, cpu) -= 1000000000;
+	/*
+	 * We cannot disable the decrementer, so in the period
+	 * between this cpu's being marked offline in cpu_online_map
+	 * and calling stop-self, it is taking timer interrupts.
+	 * Avoid calling into the scheduler rebalancing code if this
+	 * is the case.
+	 */
+	if (!cpu_is_offline(cpu))
+		account_process_time(regs);
 
-		/*
-		 * We cannot disable the decrementer, so in the period
-		 * between this cpu's being marked offline in cpu_online_map
-		 * and calling stop-self, it is taking timer interrupts.
-		 * Avoid calling into the scheduler rebalancing code if this
-		 * is the case.
-		 */
-		if (!cpu_is_offline(cpu))
-			account_process_time(regs);
-
-		/*
-		 * No need to check whether cpu is offline here; boot_cpuid
-		 * should have been fixed up by now.
-		 */
-		if (cpu != boot_cpuid)
-			continue;
-
-		write_seqlock(&xtime_lock);
-		tb_next_jiffy = tb_last_jiffy + tb_ticks_per_jiffy;
-		if (__USE_RTC() && tb_next_jiffy >= 1000000000)
-			tb_next_jiffy -= 1000000000;
-		if (per_cpu(last_jiffy, cpu) >= tb_next_jiffy) {
-			tb_last_jiffy = tb_next_jiffy;
-			do_timer(1);
-		}
-		write_sequnlock(&xtime_lock);
-	}
-	
-	next_dec = tb_ticks_per_jiffy - ticks;
-	set_dec(next_dec);
+	if (evt->event_handler)
+		evt->event_handler(evt);
+	else
+		evt->set_next_event(DECREMENTER_MAX, evt);
 
 #ifdef CONFIG_PPC_ISERIES
 	if (firmware_has_feature(FW_FEATURE_ISERIES) && hvlpevent_is_pending())
@@ -795,6 +794,53 @@
 	       clock->name, clock->mult, clock->shift);
 }
 
+static int decrementer_set_next_event(unsigned long evt,
+				      struct clock_event_device *dev)
+{
+	set_dec(evt);
+	return 0;
+}
+
+static void decrementer_set_mode(enum clock_event_mode mode,
+				 struct clock_event_device *dev)
+{
+	if (mode != CLOCK_EVT_MODE_ONESHOT)
+		decrementer_set_next_event(DECREMENTER_MAX, dev);
+}
+
+static void register_decrementer_clockevent(int cpu)
+{
+	struct clock_event_device *dec = &per_cpu(decrementers, cpu);
+
+	*dec = decrementer_clockevent;
+	dec->cpumask = cpumask_of_cpu(cpu);
+
+	printk(KERN_ERR "clockevent: %s mult[%lx] shift[%d] cpu[%d]\n",
+	       dec->name, dec->mult, dec->shift, cpu);
+
+	clockevents_register_device(dec);
+}
+
+void init_decrementer_clockevent(void)
+{
+	int cpu = smp_processor_id();
+
+	decrementer_clockevent.mult = div_sc(ppc_tb_freq, NSEC_PER_SEC,
+					     decrementer_clockevent.shift);
+	decrementer_clockevent.max_delta_ns =
+		clockevent_delta2ns(DECREMENTER_MAX, &decrementer_clockevent);
+	decrementer_clockevent.min_delta_ns = 1000;
+
+	register_decrementer_clockevent(cpu);
+}
+
+void secondary_cpu_time_init(void)
+{
+	/* FIME: Should make unrelatred change to move snapshot_timebase
+	 * call here ! */
+	register_decrementer_clockevent(smp_processor_id());
+}
+
 /* This function is only called on the boot processor */
 void __init time_init(void)
 {
@@ -908,8 +954,7 @@
 	if (!firmware_has_feature(FW_FEATURE_ISERIES))
 		clocksource_init();
 
-	/* Not exact, but the timer interrupt takes care of this */
-	set_dec(tb_ticks_per_jiffy);
+	init_decrementer_clockevent();
 }
 
 
diff --git a/include/asm-powerpc/time.h b/include/asm-powerpc/time.h
index fa331da..f058955 100644
--- a/include/asm-powerpc/time.h
+++ b/include/asm-powerpc/time.h
@@ -245,6 +245,7 @@
 #define snapshot_timebases()			do { } while (0)
 #endif
 
+extern void secondary_cpu_time_init(void);
 extern void iSeries_time_init_early(void);
 
 #endif /* __KERNEL__ */