drbd: close race when detaching from disk BUG: unable to handle kernel NULL pointer dereference at 0000000000000058 IP: bd_release+0x21/0x70 Process drbd_w_t7146 Call Trace: close_bdev_exclusive drbd_free_ldev [drbd] drbd_ldev_destroy [drbd] w_after_state_ch [drbd] Race probably went like this: state.disk = D_FAILED ... first one to hit zero during D_FAILED: put_ldev() /* ----------------> 0 */ i = atomic_dec_return() if (i == 0) if (state.disk == D_FAILED) schedule_work(go_diskless) /* 1 <------ */ get_ldev_if_state() go_diskless() do_some_pre_cleanup() corresponding put_ldev(): force_state(D_DISKLESS) /* 0 <------ */ i = atomic_dec_return() if (i == 0) atomic_inc() /* ---------> 1 */ state.disk = D_DISKLESS schedule_work(after_state_ch) /* execution pre-empted by IRQ ? */ after_state_ch() put_ldev() i = atomic_dec_return() /* 0 */ if (i == 0) if (state.disk == D_DISKLESS) if (state.disk == D_DISKLESS) drbd_ldev_destroy() drbd_ldev_destroy(); Trying to fix this by checking the disk state *before* the atomic_dec_return(), which implies memory barriers, and by inserting extra memory barriers around the state assignment in __drbd_set_state(). Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com> Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>

commit: ba3c6fb87d2df008eed8faaf01bb198e512fa72f [log] [tgz]
author: Lars Ellenberg <lars.ellenberg@linbit.com> Tue Feb 11 08:57:18 2014 +0100
committer: Philipp Reisner <philipp.reisner@linbit.com> Thu Jul 10 18:34:54 2014 +0200
tree: 85faa78b91dac8b5eadcbacab2781c1d7a26046f
parent: 2ed912e9d35b4d7d766a530f5dd14513a3ab9f64 [diff]
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index a16f9ae..a0ffc19 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h

@@ -1946,6 +1946,11 @@
 
 static inline void put_ldev(struct drbd_device *device)
 {
+	enum drbd_disk_state ds = device->state.disk;
+	/* We must check the state *before* the atomic_dec becomes visible,
+	 * or we have a theoretical race where someone hitting zero,
+	 * while state still D_FAILED, will then see D_DISKLESS in the
+	 * condition below and calling into destroy, where he must not, yet. */
 	int i = atomic_dec_return(&device->local_cnt);
 
 	/* This may be called from some endio handler,
@@ -1954,10 +1959,10 @@
 	__release(local);
 	D_ASSERT(device, i >= 0);
 	if (i == 0) {
-		if (device->state.disk == D_DISKLESS)
+		if (ds == D_DISKLESS)
 			/* even internal references gone, safe to destroy */
 			drbd_ldev_destroy(device);
-		if (device->state.disk == D_FAILED) {
+		if (ds == D_FAILED) {
 			/* all application IO references gone. */
 			if (!test_and_set_bit(GO_DISKLESS, &device->flags))
 				drbd_queue_work(&first_peer_device(device)->connection->sender_work,

diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 1bddd6c..6629f46 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c

@@ -958,7 +958,6 @@
 	enum drbd_state_rv rv = SS_SUCCESS;
 	enum sanitize_state_warnings ssw;
 	struct after_state_chg_work *ascw;
-	bool did_remote, should_do_remote;
 
 	os = drbd_read_state(device);
 
@@ -1010,18 +1009,22 @@
 	    (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
 		atomic_inc(&device->local_cnt);
 
-	did_remote = drbd_should_do_remote(device->state);
 	if (!is_sync_state(os.conn) && is_sync_state(ns.conn))
 		clear_bit(RS_DONE, &device->flags);
 
+	/* changes to local_cnt and device flags should be visible before
+	 * changes to state, which again should be visible before anything else
+	 * depending on that change happens. */
+	smp_wmb();
 	device->state.i = ns.i;
-	should_do_remote = drbd_should_do_remote(device->state);
 	device->resource->susp = ns.susp;
 	device->resource->susp_nod = ns.susp_nod;
 	device->resource->susp_fen = ns.susp_fen;
+	smp_wmb();
 
 	/* put replicated vs not-replicated requests in seperate epochs */
-	if (did_remote != should_do_remote)
+	if (drbd_should_do_remote((union drbd_dev_state)os.i) !=
+	    drbd_should_do_remote((union drbd_dev_state)ns.i))
 		start_new_tl_epoch(connection);
 
 	if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
commit	ba3c6fb87d2df008eed8faaf01bb198e512fa72f	[log] [tgz]
author	Lars Ellenberg <lars.ellenberg@linbit.com>	Tue Feb 11 08:57:18 2014 +0100
committer	Philipp Reisner <philipp.reisner@linbit.com>	Thu Jul 10 18:34:54 2014 +0200
tree	85faa78b91dac8b5eadcbacab2781c1d7a26046f
parent	2ed912e9d35b4d7d766a530f5dd14513a3ab9f64 [diff]