dm: always hold bdev reference

Fix a potential deadlock when creating multiple snapshots by holding a
reference to struct block_device for the whole lifecycle of every dm
device instead of obtaining it independently at each point it is needed.

Previously bdget_disk() was called from dm_suspend() while the device
was being suspended.  However, other devices may already be suspended
at that point, for example when creating additional snapshots of a
device.  bdget_disk() can wait for I/O and allocate memory, either of
which may end up waiting on an already-suspended device, resulting in
a deadlock.

This patch changes the code so that it gets the reference to struct
block_device when struct mapped_device is allocated and initialized in
alloc_dev() where it is always OK to allocate memory or wait for I/O.
It drops the reference when it is destroyed in free_dev().  Thus there
is no call to bdget_disk() while any device is suspended.

Previously unlock_fs() was called only if bdev was held.  Now it is
called unconditionally, but the superfluous calls are harmless because
it returns immediately if the filesystem was not previously frozen.

Because the bdev is now always held, this patch also allows the device
size to be changed during a noflush suspend.  This has no adverse effect.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 1cfd9b7..5e06f1e 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1180,6 +1180,10 @@
 	if (!md->wq)
 		goto bad_thread;
 
+	md->bdev = bdget_disk(md->disk, 0);
+	if (!md->bdev)
+		goto bad_bdev;
+
 	/* Populate the mapping, nobody knows we exist yet */
 	spin_lock(&_minor_lock);
 	old_md = idr_replace(&_minor_idr, md, minor);
@@ -1189,6 +1193,8 @@
 
 	return md;
 
+bad_bdev:
+	destroy_workqueue(md->wq);
 bad_thread:
 	put_disk(md->disk);
 bad_disk:
@@ -1214,10 +1220,8 @@
 {
 	int minor = MINOR(disk_devt(md->disk));
 
-	if (md->bdev) {
-		unlock_fs(md);
-		bdput(md->bdev);
-	}
+	unlock_fs(md);
+	bdput(md->bdev);
 	destroy_workqueue(md->wq);
 	mempool_destroy(md->tio_pool);
 	mempool_destroy(md->io_pool);
@@ -1277,8 +1281,7 @@
 	if (size != get_capacity(md->disk))
 		memset(&md->geometry, 0, sizeof(md->geometry));
 
-	if (md->bdev)
-		__set_size(md, size);
+	__set_size(md, size);
 
 	if (!size) {
 		dm_table_destroy(t);
@@ -1520,11 +1523,6 @@
 	if (!dm_suspended(md))
 		goto out;
 
-	/* without bdev, the device size cannot be changed */
-	if (!md->bdev)
-		if (get_capacity(md->disk) != dm_table_get_size(table))
-			goto out;
-
 	__unbind(md);
 	r = __bind(md, table);
 
@@ -1552,9 +1550,6 @@
 
 	set_bit(DMF_FROZEN, &md->flags);
 
-	/* don't bdput right now, we don't want the bdev
-	 * to go away while it is locked.
-	 */
 	return 0;
 }
 
@@ -1601,24 +1596,14 @@
 	/* This does not get reverted if there's an error later. */
 	dm_table_presuspend_targets(map);
 
-	/* bdget() can stall if the pending I/Os are not flushed */
-	if (!noflush) {
-		md->bdev = bdget_disk(md->disk, 0);
-		if (!md->bdev) {
-			DMWARN("bdget failed in dm_suspend");
-			r = -ENOMEM;
+	/*
+	 * Flush I/O to the device. noflush supersedes do_lockfs,
+	 * because lock_fs() needs to flush I/Os.
+	 */
+	if (!noflush && do_lockfs) {
+		r = lock_fs(md);
+		if (r)
 			goto out;
-		}
-
-		/*
-		 * Flush I/O to the device. noflush supersedes do_lockfs,
-		 * because lock_fs() needs to flush I/Os.
-		 */
-		if (do_lockfs) {
-			r = lock_fs(md);
-			if (r)
-				goto out;
-		}
 	}
 
 	/*
@@ -1675,11 +1660,6 @@
 	set_bit(DMF_SUSPENDED, &md->flags);
 
 out:
-	if (r && md->bdev) {
-		bdput(md->bdev);
-		md->bdev = NULL;
-	}
-
 	dm_table_put(map);
 
 out_unlock:
@@ -1708,11 +1688,6 @@
 
 	unlock_fs(md);
 
-	if (md->bdev) {
-		bdput(md->bdev);
-		md->bdev = NULL;
-	}
-
 	clear_bit(DMF_SUSPENDED, &md->flags);
 
 	dm_table_unplug_all(map);