sched,perf: Fix periodic timers

In the below two commits (see Fixes) we have periodic timers that can
stop themselves when they're no longer required, but need to be
(re)-started when their idle condition changes.

Further complications is that we want the timer handler to always do
the forward such that it will always correctly deal with the overruns,
and we do not want to race such that the handler has already decided
to stop, but the (external) restart sees the timer still active and we
end up with a 'lost' timer.

The problem with the current code is that the re-start can come before
the callback does the forward, at which point the forward from the
callback will WARN about forwarding an enqueued timer.

Now, conceptually its easy to detect if you're before or after the fwd
by comparing the expiration time against the current time. Of course,
that's expensive (and racy) because we don't have the current time.

Alternatively one could cache this state inside the timer, but then
everybody pays the overhead of maintaining this extra state, and that
is undesired.

The only other option that I could see is the external timer_active
variable, which I tried to kill before. I would love a nicer interface
for this seemingly simple 'problem' but alas.

Fixes: 272325c4821f ("perf: Fix mux_interval hrtimer wreckage")
Fixes: 77a4d1a1b9a1 ("sched: Cleanup bandwidth timers")
Cc: pjt@google.com
Cc: tglx@linutronix.de
Cc: klamm@yandex-team.ru
Cc: mingo@kernel.org
Cc: bsegall@google.com
Cc: hpa@zytor.com
Cc: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20150514102311.GX21418@twins.programming.kicks-ass.net
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f528829..d9c93f3 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -752,24 +752,21 @@
 static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
 {
 	struct perf_cpu_context *cpuctx;
-	enum hrtimer_restart ret = HRTIMER_NORESTART;
 	int rotations = 0;
 
 	WARN_ON(!irqs_disabled());
 
 	cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
-
 	rotations = perf_rotate_context(cpuctx);
 
-	/*
-	 * arm timer if needed
-	 */
-	if (rotations) {
+	raw_spin_lock(&cpuctx->hrtimer_lock);
+	if (rotations)
 		hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
-		ret = HRTIMER_RESTART;
-	}
+	else
+		cpuctx->hrtimer_active = 0;
+	raw_spin_unlock(&cpuctx->hrtimer_lock);
 
-	return ret;
+	return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
 }
 
 static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
@@ -792,7 +789,8 @@
 
 	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
 
-	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+	raw_spin_lock_init(&cpuctx->hrtimer_lock);
+	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
 	timer->function = perf_mux_hrtimer_handler;
 }
 
@@ -800,15 +798,20 @@
 {
 	struct hrtimer *timer = &cpuctx->hrtimer;
 	struct pmu *pmu = cpuctx->ctx.pmu;
+	unsigned long flags;
 
 	/* not for SW PMU */
 	if (pmu->task_ctx_nr == perf_sw_context)
 		return 0;
 
-	if (hrtimer_is_queued(timer))
-		return 0;
+	raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
+	if (!cpuctx->hrtimer_active) {
+		cpuctx->hrtimer_active = 1;
+		hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
+		hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+	}
+	raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
 
-	hrtimer_start(timer, cpuctx->hrtimer_interval, HRTIMER_MODE_REL_PINNED);
 	return 0;
 }