[PATCH] elevator switching race

There's a race between shutting down one io scheduler and firing up the
next, in which a new io could enter and cause the io scheduler to be
invoked with bad or NULL data.

To fix this, we need to maintain the queue lock for a bit longer.
Unfortunately we cannot do that, since the elevator init requires to be
run without the lock held.  This isn't easily fixable, without also
changing the mempool API.  So split the initialization into two parts,
and alloc-init operation and an attach operation.  Then we can
preallocate the io scheduler and related structures, and run the attach
inside the lock after we detach the old one.

This patch has survived 30 minutes of 1 second io scheduler switching
with a very busy io load.

Signed-off-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/block/elevator.c b/block/elevator.c
index 8768a36..a0afdd3 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -121,16 +121,16 @@
 	return e;
 }
 
-static int elevator_attach(request_queue_t *q, struct elevator_queue *eq)
+static void *elevator_init_queue(request_queue_t *q, struct elevator_queue *eq)
 {
-	int ret = 0;
+	return eq->ops->elevator_init_fn(q, eq);
+}
 
+static void elevator_attach(request_queue_t *q, struct elevator_queue *eq,
+			   void *data)
+{
 	q->elevator = eq;
-
-	if (eq->ops->elevator_init_fn)
-		ret = eq->ops->elevator_init_fn(q, eq);
-
-	return ret;
+	eq->elevator_data = data;
 }
 
 static char chosen_elevator[16];
@@ -181,6 +181,7 @@
 	struct elevator_type *e = NULL;
 	struct elevator_queue *eq;
 	int ret = 0;
+	void *data;
 
 	INIT_LIST_HEAD(&q->queue_head);
 	q->last_merge = NULL;
@@ -202,10 +203,13 @@
 	if (!eq)
 		return -ENOMEM;
 
-	ret = elevator_attach(q, eq);
-	if (ret)
+	data = elevator_init_queue(q, eq);
+	if (!data) {
 		kobject_put(&eq->kobj);
+		return -ENOMEM;
+	}
 
+	elevator_attach(q, eq, data);
 	return ret;
 }
 
@@ -722,13 +726,16 @@
 	return error;
 }
 
+static void __elv_unregister_queue(elevator_t *e)
+{
+	kobject_uevent(&e->kobj, KOBJ_REMOVE);
+	kobject_del(&e->kobj);
+}
+
 void elv_unregister_queue(struct request_queue *q)
 {
-	if (q) {
-		elevator_t *e = q->elevator;
-		kobject_uevent(&e->kobj, KOBJ_REMOVE);
-		kobject_del(&e->kobj);
-	}
+	if (q)
+		__elv_unregister_queue(q->elevator);
 }
 
 int elv_register(struct elevator_type *e)
@@ -780,6 +787,7 @@
 static int elevator_switch(request_queue_t *q, struct elevator_type *new_e)
 {
 	elevator_t *old_elevator, *e;
+	void *data;
 
 	/*
 	 * Allocate new elevator
@@ -788,6 +796,12 @@
 	if (!e)
 		return 0;
 
+	data = elevator_init_queue(q, e);
+	if (!data) {
+		kobject_put(&e->kobj);
+		return 0;
+	}
+
 	/*
 	 * Turn on BYPASS and drain all requests w/ elevator private data
 	 */
@@ -806,19 +820,19 @@
 		elv_drain_elevator(q);
 	}
 
-	spin_unlock_irq(q->queue_lock);
-
 	/*
-	 * unregister old elevator data
+	 * Remember old elevator.
 	 */
-	elv_unregister_queue(q);
 	old_elevator = q->elevator;
 
 	/*
 	 * attach and start new elevator
 	 */
-	if (elevator_attach(q, e))
-		goto fail;
+	elevator_attach(q, e, data);
+
+	spin_unlock_irq(q->queue_lock);
+
+	__elv_unregister_queue(old_elevator);
 
 	if (elv_register_queue(q))
 		goto fail_register;
@@ -837,7 +851,6 @@
 	 */
 	elevator_exit(e);
 	e = NULL;
-fail:
 	q->elevator = old_elevator;
 	elv_register_queue(q);
 	clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);