dm thin: add read only and fail io modes

Add read-only and fail-io modes to thin provisioning.

If a transaction commit fails the pool's metadata device will transition
to "read-only" mode.  If a commit fails once already in read-only mode
the transition to "fail-io" mode occurs.

Once in fail-io mode the pool and all associated thin devices will
report a status of "Fail".

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 98c50f9..087e9b3 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2011 Red Hat UK.
+ * Copyright (C) 2011-2012 Red Hat UK.
  *
  * This file is released under the GPL.
  */
@@ -496,12 +496,27 @@
  */
 struct dm_thin_new_mapping;
 
+/*
+ * The pool runs in 3 modes.  Ordered in degraded order for comparisons.
+ */
+enum pool_mode {
+	PM_WRITE,		/* metadata may be changed */
+	PM_READ_ONLY,		/* metadata may not be changed */
+	PM_FAIL,		/* all I/O fails */
+};
+
 struct pool_features {
+	enum pool_mode mode;
+
 	unsigned zero_new_blocks:1;
 	unsigned discard_enabled:1;
 	unsigned discard_passdown:1;
 };
 
+struct thin_c;
+typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
+typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
+
 struct pool {
 	struct list_head list;
 	struct dm_target *ti;	/* Only set if a pool target is bound */
@@ -542,8 +557,17 @@
 	struct dm_thin_new_mapping *next_mapping;
 	mempool_t *mapping_pool;
 	mempool_t *endio_hook_pool;
+
+	process_bio_fn process_bio;
+	process_bio_fn process_discard;
+
+	process_mapping_fn process_prepared_mapping;
+	process_mapping_fn process_prepared_discard;
 };
 
+static enum pool_mode get_pool_mode(struct pool *pool);
+static void set_pool_mode(struct pool *pool, enum pool_mode mode);
+
 /*
  * Target context for a pool.
  */
@@ -718,16 +742,28 @@
 	struct pool *pool = tc->pool;
 	unsigned long flags;
 
-	/*
-	 * Batch together any FUA/FLUSH bios we find and then issue
-	 * a single commit for them in process_deferred_bios().
-	 */
-	if (bio_triggers_commit(tc, bio)) {
-		spin_lock_irqsave(&pool->lock, flags);
-		bio_list_add(&pool->deferred_flush_bios, bio);
-		spin_unlock_irqrestore(&pool->lock, flags);
-	} else
+	if (!bio_triggers_commit(tc, bio)) {
 		generic_make_request(bio);
+		return;
+	}
+
+	/*
+	 * Complete bio with an error if earlier I/O caused changes to
+	 * the metadata that can't be committed e.g, due to I/O errors
+	 * on the metadata device.
+	 */
+	if (dm_thin_aborted_changes(tc->td)) {
+		bio_io_error(bio);
+		return;
+	}
+
+	/*
+	 * Batch together any bios that trigger commits and then issue a
+	 * single commit for them in process_deferred_bios().
+	 */
+	spin_lock_irqsave(&pool->lock, flags);
+	bio_list_add(&pool->deferred_flush_bios, bio);
+	spin_unlock_irqrestore(&pool->lock, flags);
 }
 
 static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
@@ -864,6 +900,14 @@
 	wake_worker(pool);
 }
 
+static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
+{
+	if (m->bio)
+		m->bio->bi_end_io = m->saved_bi_end_io;
+	cell_error(m->cell);
+	list_del(&m->list);
+	mempool_free(m, m->tc->pool->mapping_pool);
+}
 static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 {
 	struct thin_c *tc = m->tc;
@@ -908,18 +952,20 @@
 	mempool_free(m, tc->pool->mapping_pool);
 }
 
-static void process_prepared_discard(struct dm_thin_new_mapping *m)
+static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
 {
-	int r;
 	struct thin_c *tc = m->tc;
 
-	r = dm_thin_remove_block(tc->td, m->virt_block);
-	if (r)
-		DMERR("dm_thin_remove_block() failed");
+	bio_io_error(m->bio);
+	cell_defer_except(tc, m->cell);
+	cell_defer_except(tc, m->cell2);
+	mempool_free(m, tc->pool->mapping_pool);
+}
 
-	/*
-	 * Pass the discard down to the underlying device?
-	 */
+static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
+{
+	struct thin_c *tc = m->tc;
+
 	if (m->pass_discard)
 		remap_and_issue(tc, m->bio, m->data_block);
 	else
@@ -930,8 +976,20 @@
 	mempool_free(m, tc->pool->mapping_pool);
 }
 
+static void process_prepared_discard(struct dm_thin_new_mapping *m)
+{
+	int r;
+	struct thin_c *tc = m->tc;
+
+	r = dm_thin_remove_block(tc->td, m->virt_block);
+	if (r)
+		DMERR("dm_thin_remove_block() failed");
+
+	process_prepared_discard_passdown(m);
+}
+
 static void process_prepared(struct pool *pool, struct list_head *head,
-			     void (*fn)(struct dm_thin_new_mapping *))
+			     process_mapping_fn *fn)
 {
 	unsigned long flags;
 	struct list_head maps;
@@ -943,7 +1001,7 @@
 	spin_unlock_irqrestore(&pool->lock, flags);
 
 	list_for_each_entry_safe(m, tmp, &maps, list)
-		fn(m);
+		(*fn)(m);
 }
 
 /*
@@ -1109,6 +1167,35 @@
 	}
 }
 
+static int commit(struct pool *pool)
+{
+	int r;
+
+	r = dm_pool_commit_metadata(pool->pmd);
+	if (r)
+		DMERR("commit failed, error = %d", r);
+
+	return r;
+}
+
+/*
+ * A non-zero return indicates read_only or fail_io mode.
+ * Many callers don't care about the return value.
+ */
+static int commit_or_fallback(struct pool *pool)
+{
+	int r;
+
+	if (get_pool_mode(pool) != PM_WRITE)
+		return -EINVAL;
+
+	r = commit(pool);
+	if (r)
+		set_pool_mode(pool, PM_READ_ONLY);
+
+	return r;
+}
+
 static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 {
 	int r;
@@ -1137,12 +1224,7 @@
 			 * Try to commit to see if that will free up some
 			 * more space.
 			 */
-			r = dm_pool_commit_metadata(pool->pmd);
-			if (r) {
-				DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
-				      __func__, r);
-				return r;
-			}
+			(void) commit_or_fallback(pool);
 
 			r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
 			if (r)
@@ -1373,6 +1455,7 @@
 
 	default:
 		DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
+		set_pool_mode(tc->pool, PM_READ_ONLY);
 		cell_error(cell);
 		break;
 	}
@@ -1430,6 +1513,49 @@
 	}
 }
 
+static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
+{
+	int r;
+	int rw = bio_data_dir(bio);
+	dm_block_t block = get_bio_block(tc, bio);
+	struct dm_thin_lookup_result lookup_result;
+
+	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
+	switch (r) {
+	case 0:
+		if (lookup_result.shared && (rw == WRITE) && bio->bi_size)
+			bio_io_error(bio);
+		else
+			remap_and_issue(tc, bio, lookup_result.block);
+		break;
+
+	case -ENODATA:
+		if (rw != READ) {
+			bio_io_error(bio);
+			break;
+		}
+
+		if (tc->origin_dev) {
+			remap_to_origin_and_issue(tc, bio);
+			break;
+		}
+
+		zero_fill_bio(bio);
+		bio_endio(bio, 0);
+		break;
+
+	default:
+		DMERR("dm_thin_find_block() failed, error = %d", r);
+		bio_io_error(bio);
+		break;
+	}
+}
+
+static void process_bio_fail(struct thin_c *tc, struct bio *bio)
+{
+	bio_io_error(bio);
+}
+
 static int need_commit_due_to_time(struct pool *pool)
 {
 	return jiffies < pool->last_commit_jiffies ||
@@ -1441,7 +1567,6 @@
 	unsigned long flags;
 	struct bio *bio;
 	struct bio_list bios;
-	int r;
 
 	bio_list_init(&bios);
 
@@ -1468,9 +1593,9 @@
 		}
 
 		if (bio->bi_rw & REQ_DISCARD)
-			process_discard(tc, bio);
+			pool->process_discard(tc, bio);
 		else
-			process_bio(tc, bio);
+			pool->process_bio(tc, bio);
 	}
 
 	/*
@@ -1486,10 +1611,7 @@
 	if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
 		return;
 
-	r = dm_pool_commit_metadata(pool->pmd);
-	if (r) {
-		DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
-		      __func__, r);
+	if (commit_or_fallback(pool)) {
 		while ((bio = bio_list_pop(&bios)))
 			bio_io_error(bio);
 		return;
@@ -1504,8 +1626,8 @@
 {
 	struct pool *pool = container_of(ws, struct pool, worker);
 
-	process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping);
-	process_prepared(pool, &pool->prepared_discards, process_prepared_discard);
+	process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
+	process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
 	process_deferred_bios(pool);
 }
 
@@ -1522,6 +1644,52 @@
 
 /*----------------------------------------------------------------*/
 
+static enum pool_mode get_pool_mode(struct pool *pool)
+{
+	return pool->pf.mode;
+}
+
+static void set_pool_mode(struct pool *pool, enum pool_mode mode)
+{
+	int r;
+
+	pool->pf.mode = mode;
+
+	switch (mode) {
+	case PM_FAIL:
+		DMERR("switching pool to failure mode");
+		pool->process_bio = process_bio_fail;
+		pool->process_discard = process_bio_fail;
+		pool->process_prepared_mapping = process_prepared_mapping_fail;
+		pool->process_prepared_discard = process_prepared_discard_fail;
+		break;
+
+	case PM_READ_ONLY:
+		DMERR("switching pool to read-only mode");
+		r = dm_pool_abort_metadata(pool->pmd);
+		if (r) {
+			DMERR("aborting transaction failed");
+			set_pool_mode(pool, PM_FAIL);
+		} else {
+			dm_pool_metadata_read_only(pool->pmd);
+			pool->process_bio = process_bio_read_only;
+			pool->process_discard = process_discard;
+			pool->process_prepared_mapping = process_prepared_mapping_fail;
+			pool->process_prepared_discard = process_prepared_discard_passdown;
+		}
+		break;
+
+	case PM_WRITE:
+		pool->process_bio = process_bio;
+		pool->process_discard = process_discard;
+		pool->process_prepared_mapping = process_prepared_mapping;
+		pool->process_prepared_discard = process_prepared_discard;
+		break;
+	}
+}
+
+/*----------------------------------------------------------------*/
+
 /*
  * Mapping functions.
  */
@@ -1567,6 +1735,12 @@
 	struct dm_thin_lookup_result result;
 
 	map_context->ptr = thin_hook_bio(tc, bio);
+
+	if (get_pool_mode(tc->pool) == PM_FAIL) {
+		bio_io_error(bio);
+		return DM_MAPIO_SUBMITTED;
+	}
+
 	if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
 		thin_defer_bio(tc, bio);
 		return DM_MAPIO_SUBMITTED;
@@ -1603,14 +1777,35 @@
 		break;
 
 	case -ENODATA:
+		if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
+			/*
+			 * This block isn't provisioned, and we have no way
+			 * of doing so.  Just error it.
+			 */
+			bio_io_error(bio);
+			r = DM_MAPIO_SUBMITTED;
+			break;
+		}
+		/* fall through */
+
+	case -EWOULDBLOCK:
 		/*
 		 * In future, the failed dm_thin_find_block above could
 		 * provide the hint to load the metadata into cache.
 		 */
-	case -EWOULDBLOCK:
 		thin_defer_bio(tc, bio);
 		r = DM_MAPIO_SUBMITTED;
 		break;
+
+	default:
+		/*
+		 * Must always call bio_io_error on failure.
+		 * dm_thin_find_block can fail with -EINVAL if the
+		 * pool is switched to fail-io mode.
+		 */
+		bio_io_error(bio);
+		r = DM_MAPIO_SUBMITTED;
+		break;
 	}
 
 	return r;
@@ -1647,15 +1842,26 @@
 {
 	struct pool_c *pt = ti->private;
 
+	/*
+	 * We want to make sure that degraded pools are never upgraded.
+	 */
+	enum pool_mode old_mode = pool->pf.mode;
+	enum pool_mode new_mode = pt->pf.mode;
+
+	if (old_mode > new_mode)
+		new_mode = old_mode;
+
 	pool->ti = ti;
 	pool->low_water_blocks = pt->low_water_blocks;
 	pool->pf = pt->pf;
+	set_pool_mode(pool, new_mode);
 
 	/*
 	 * If discard_passdown was enabled verify that the data device
 	 * supports discards.  Disable discard_passdown if not; otherwise
 	 * -EOPNOTSUPP will be returned.
 	 */
+	/* FIXME: pull this out into a sep fn. */
 	if (pt->pf.discard_passdown) {
 		struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
 		if (!q || !blk_queue_discard(q)) {
@@ -1681,6 +1887,7 @@
 /* Initialize pool features. */
 static void pool_features_init(struct pool_features *pf)
 {
+	pf->mode = PM_WRITE;
 	pf->zero_new_blocks = 1;
 	pf->discard_enabled = 1;
 	pf->discard_passdown = 1;
@@ -1711,14 +1918,16 @@
 
 static struct pool *pool_create(struct mapped_device *pool_md,
 				struct block_device *metadata_dev,
-				unsigned long block_size, char **error)
+				unsigned long block_size,
+				int read_only, char **error)
 {
 	int r;
 	void *err_p;
 	struct pool *pool;
 	struct dm_pool_metadata *pmd;
+	bool format_device = read_only ? false : true;
 
-	pmd = dm_pool_metadata_open(metadata_dev, block_size, true);
+	pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
 	if (IS_ERR(pmd)) {
 		*error = "Error creating metadata object";
 		return (struct pool *)pmd;
@@ -1835,8 +2044,8 @@
 
 static struct pool *__pool_find(struct mapped_device *pool_md,
 				struct block_device *metadata_dev,
-				unsigned long block_size, char **error,
-				int *created)
+				unsigned long block_size, int read_only,
+				char **error, int *created)
 {
 	struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
 
@@ -1857,7 +2066,7 @@
 			__pool_inc(pool);
 
 		} else {
-			pool = pool_create(pool_md, metadata_dev, block_size, error);
+			pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
 			*created = 1;
 		}
 	}
@@ -1908,19 +2117,23 @@
 		arg_name = dm_shift_arg(as);
 		argc--;
 
-		if (!strcasecmp(arg_name, "skip_block_zeroing")) {
+		if (!strcasecmp(arg_name, "skip_block_zeroing"))
 			pf->zero_new_blocks = 0;
-			continue;
-		} else if (!strcasecmp(arg_name, "ignore_discard")) {
-			pf->discard_enabled = 0;
-			continue;
-		} else if (!strcasecmp(arg_name, "no_discard_passdown")) {
-			pf->discard_passdown = 0;
-			continue;
-		}
 
-		ti->error = "Unrecognised pool feature requested";
-		r = -EINVAL;
+		else if (!strcasecmp(arg_name, "ignore_discard"))
+			pf->discard_enabled = 0;
+
+		else if (!strcasecmp(arg_name, "no_discard_passdown"))
+			pf->discard_passdown = 0;
+
+		else if (!strcasecmp(arg_name, "read_only"))
+			pf->mode = PM_READ_ONLY;
+
+		else {
+			ti->error = "Unrecognised pool feature requested";
+			r = -EINVAL;
+			break;
+		}
 	}
 
 	return r;
@@ -2013,7 +2226,7 @@
 	}
 
 	pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
-			   block_size, &ti->error, &pool_created);
+			   block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
 	if (IS_ERR(pool)) {
 		r = PTR_ERR(pool);
 		goto out_free_pt;
@@ -2146,15 +2359,12 @@
 		r = dm_pool_resize_data_dev(pool->pmd, data_size);
 		if (r) {
 			DMERR("failed to resize data device");
+			/* FIXME Stricter than necessary: Rollback transaction instead here */
+			set_pool_mode(pool, PM_READ_ONLY);
 			return r;
 		}
 
-		r = dm_pool_commit_metadata(pool->pmd);
-		if (r) {
-			DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
-			      __func__, r);
-			return r;
-		}
+		(void) commit_or_fallback(pool);
 	}
 
 	return 0;
@@ -2177,19 +2387,12 @@
 
 static void pool_postsuspend(struct dm_target *ti)
 {
-	int r;
 	struct pool_c *pt = ti->private;
 	struct pool *pool = pt->pool;
 
 	cancel_delayed_work(&pool->waker);
 	flush_workqueue(pool->wq);
-
-	r = dm_pool_commit_metadata(pool->pmd);
-	if (r < 0) {
-		DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
-		      __func__, r);
-		/* FIXME: invalidate device? error the next FUA or FLUSH bio ?*/
-	}
+	(void) commit_or_fallback(pool);
 }
 
 static int check_arg_count(unsigned argc, unsigned args_required)
@@ -2323,12 +2526,7 @@
 	if (r)
 		return r;
 
-	r = dm_pool_commit_metadata(pool->pmd);
-	if (r) {
-		DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
-		      __func__, r);
-		return r;
-	}
+	(void) commit_or_fallback(pool);
 
 	r = dm_pool_reserve_metadata_snap(pool->pmd);
 	if (r)
@@ -2389,16 +2587,32 @@
 	else
 		DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
 
-	if (!r) {
-		r = dm_pool_commit_metadata(pool->pmd);
-		if (r)
-			DMERR("%s message: dm_pool_commit_metadata() failed, error = %d",
-			      argv[0], r);
-	}
+	if (!r)
+		(void) commit_or_fallback(pool);
 
 	return r;
 }
 
+static void emit_flags(struct pool_features *pf, char *result,
+		       unsigned sz, unsigned maxlen)
+{
+	unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
+		!pf->discard_passdown + (pf->mode == PM_READ_ONLY);
+	DMEMIT("%u ", count);
+
+	if (!pf->zero_new_blocks)
+		DMEMIT("skip_block_zeroing ");
+
+	if (!pf->discard_enabled)
+		DMEMIT("ignore_discard ");
+
+	if (!pf->discard_passdown)
+		DMEMIT("no_discard_passdown ");
+
+	if (pf->mode == PM_READ_ONLY)
+		DMEMIT("read_only ");
+}
+
 /*
  * Status line is:
  *    <transaction id> <used metadata sectors>/<total metadata sectors>
@@ -2407,7 +2621,7 @@
 static int pool_status(struct dm_target *ti, status_type_t type,
 		       char *result, unsigned maxlen)
 {
-	int r, count;
+	int r;
 	unsigned sz = 0;
 	uint64_t transaction_id;
 	dm_block_t nr_free_blocks_data;
@@ -2422,6 +2636,11 @@
 
 	switch (type) {
 	case STATUSTYPE_INFO:
+		if (get_pool_mode(pool) == PM_FAIL) {
+			DMEMIT("Fail");
+			break;
+		}
+
 		r = dm_pool_get_metadata_transaction_id(pool->pmd,
 							&transaction_id);
 		if (r)
@@ -2457,9 +2676,19 @@
 		       (unsigned long long)nr_blocks_data);
 
 		if (held_root)
-			DMEMIT("%llu", held_root);
+			DMEMIT("%llu ", held_root);
 		else
-			DMEMIT("-");
+			DMEMIT("- ");
+
+		if (pool->pf.mode == PM_READ_ONLY)
+			DMEMIT("ro ");
+		else
+			DMEMIT("rw ");
+
+		if (pool->pf.discard_enabled && pool->pf.discard_passdown)
+			DMEMIT("discard_passdown");
+		else
+			DMEMIT("no_discard_passdown");
 
 		break;
 
@@ -2469,20 +2698,7 @@
 		       format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
 		       (unsigned long)pool->sectors_per_block,
 		       (unsigned long long)pt->low_water_blocks);
-
-		count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled +
-			!pt->pf.discard_passdown;
-		DMEMIT("%u ", count);
-
-		if (!pool->pf.zero_new_blocks)
-			DMEMIT("skip_block_zeroing ");
-
-		if (!pool->pf.discard_enabled)
-			DMEMIT("ignore_discard ");
-
-		if (!pt->pf.discard_passdown)
-			DMEMIT("no_discard_passdown ");
-
+		emit_flags(&pt->pf, result, sz, maxlen);
 		break;
 	}
 
@@ -2542,7 +2758,7 @@
 	.name = "thin-pool",
 	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
 		    DM_TARGET_IMMUTABLE,
-	.version = {1, 2, 0},
+	.version = {1, 3, 0},
 	.module = THIS_MODULE,
 	.ctr = pool_ctr,
 	.dtr = pool_dtr,
@@ -2647,6 +2863,11 @@
 	}
 	__pool_inc(tc->pool);
 
+	if (get_pool_mode(tc->pool) == PM_FAIL) {
+		ti->error = "Couldn't open thin device, Pool is in fail mode";
+		goto bad_thin_open;
+	}
+
 	r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
 	if (r) {
 		ti->error = "Couldn't open thin internal device";
@@ -2755,6 +2976,11 @@
 	char buf[BDEVNAME_SIZE];
 	struct thin_c *tc = ti->private;
 
+	if (get_pool_mode(tc->pool) == PM_FAIL) {
+		DMEMIT("Fail");
+		return 0;
+	}
+
 	if (!tc->td)
 		DMEMIT("-");
 	else {
@@ -2823,7 +3049,7 @@
 
 static struct target_type thin_target = {
 	.name = "thin",
-	.version = {1, 2, 0},
+	.version = {1, 3, 0},
 	.module	= THIS_MODULE,
 	.ctr = thin_ctr,
 	.dtr = thin_dtr,