cpu/hotplug: Fix rollback during error-out in __cpu_disable()

The recent introduction of the hotplug thread which invokes the callbacks on
the plugged cpu, cased the following regression:

If takedown_cpu() fails, then we run into several issues:

 1) The rollback of the target cpu states is not invoked. That leaves the smp
    threads and the hotplug thread in disabled state.

 2) notify_online() is executed due to a missing skip_onerr flag. That causes
    that both CPU_DOWN_FAILED and CPU_ONLINE notifications are invoked which
    confuses quite some notifiers.

 3) The CPU_DOWN_FAILED notification is not invoked on the target CPU. That's
    not an issue per se, but it is inconsistent and in consequence blocks the
    patches which rely on these states being invoked on the target CPU and not
    on the controlling cpu. It also does not preserve the strict call order on
    rollback which is problematic for the ongoing state machine conversion as
    well.

To fix this we add a rollback flag to the remote callback machinery and invoke
the rollback including the CPU_DOWN_FAILED notification on the remote
cpu. Further mark the notify online state with 'skip_onerr' so we don't get a
double invokation.

This workaround will go away once we moved the unplug invocation to the target
cpu itself.

[ tglx: Massaged changelog and moved the CPU_DOWN_FAILED notifiaction to the
  	target cpu ]

Fixes: 4cb28ced23c4 ("cpu/hotplug: Create hotplug threads")
Reported-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: linux-s390@vger.kernel.org
Cc: rt@linutronix.de
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Anna-Maria Gleixner <anna-maria@linutronix.de>
Link: http://lkml.kernel.org/r/20160408124015.GA21960@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6ea42e8..3e3f6e4 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -36,6 +36,7 @@
  * @target:	The target state
  * @thread:	Pointer to the hotplug thread
  * @should_run:	Thread should execute
+ * @rollback:	Perform a rollback
  * @cb_stat:	The state for a single callback (install/uninstall)
  * @cb:		Single callback function (install/uninstall)
  * @result:	Result of the operation
@@ -47,6 +48,7 @@
 #ifdef CONFIG_SMP
 	struct task_struct	*thread;
 	bool			should_run;
+	bool			rollback;
 	enum cpuhp_state	cb_state;
 	int			(*cb)(unsigned int cpu);
 	int			result;
@@ -301,6 +303,11 @@
 	return __cpu_notify(val, cpu, -1, NULL);
 }
 
+static void cpu_notify_nofail(unsigned long val, unsigned int cpu)
+{
+	BUG_ON(cpu_notify(val, cpu));
+}
+
 /* Notifier wrappers for transitioning to state machine */
 static int notify_prepare(unsigned int cpu)
 {
@@ -477,6 +484,16 @@
 		} else {
 			ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb);
 		}
+	} else if (st->rollback) {
+		BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
+
+		undo_cpu_down(cpu, st, cpuhp_ap_states);
+		/*
+		 * This is a momentary workaround to keep the notifier users
+		 * happy. Will go away once we got rid of the notifiers.
+		 */
+		cpu_notify_nofail(CPU_DOWN_FAILED, cpu);
+		st->rollback = false;
 	} else {
 		/* Cannot happen .... */
 		BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
@@ -636,11 +653,6 @@
 	read_unlock(&tasklist_lock);
 }
 
-static void cpu_notify_nofail(unsigned long val, unsigned int cpu)
-{
-	BUG_ON(cpu_notify(val, cpu));
-}
-
 static int notify_down_prepare(unsigned int cpu)
 {
 	int err, nr_calls = 0;
@@ -721,9 +733,10 @@
 	 */
 	err = stop_machine(take_cpu_down, NULL, cpumask_of(cpu));
 	if (err) {
-		/* CPU didn't die: tell everyone.  Can't complain. */
-		cpu_notify_nofail(CPU_DOWN_FAILED, cpu);
+		/* CPU refused to die */
 		irq_unlock_sparse();
+		/* Unpark the hotplug thread so we can rollback there */
+		kthread_unpark(per_cpu_ptr(&cpuhp_state, cpu)->thread);
 		return err;
 	}
 	BUG_ON(cpu_online(cpu));
@@ -832,6 +845,11 @@
 	 * to do the further cleanups.
 	 */
 	ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target);
+	if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) {
+		st->target = prev_state;
+		st->rollback = true;
+		cpuhp_kick_ap_work(cpu);
+	}
 
 	hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE;
 out:
@@ -1249,6 +1267,7 @@
 		.name			= "notify:online",
 		.startup		= notify_online,
 		.teardown		= notify_down_prepare,
+		.skip_onerr		= true,
 	},
 #endif
 	/*