Merge tag 'dm-4.2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper fixes from Mike Snitzer:
 "Apologies for not pressing this request-based DM partial completion
  issue further, it was an oversight on my part.  We'll have to get it
  fixed up properly and revisit for a future release.

   - Revert block and DM core changes the removed request-based DM's
     ability to handle partial request completions -- otherwise with the
     current SCSI LLDs these changes could lead to silent data
     corruption.

   - Fix two DM version bumps that were missing from the initial 4.2 DM
     pull request (enabled userspace lvm2 to know certain changes have
     been made)"

* tag 'dm-4.2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm:
  dm cache policy smq: fix "default" version to be 1.4.0
  dm: bump the ioctl version to 4.32.0
  Revert "block, dm: don't copy bios for request clones"
  Revert "dm: do not allocate any mempools for blk-mq request-based DM"
diff --git a/block/blk-core.c b/block/blk-core.c
index 688ae94..82819e6 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -142,7 +142,7 @@
 static void req_bio_endio(struct request *rq, struct bio *bio,
 			  unsigned int nbytes, int error)
 {
-	if (error && !(rq->cmd_flags & REQ_CLONE))
+	if (error)
 		clear_bit(BIO_UPTODATE, &bio->bi_flags);
 	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 		error = -EIO;
@@ -153,8 +153,7 @@
 	bio_advance(bio, nbytes);
 
 	/* don't actually finish bio if it's part of flush sequence */
-	if (bio->bi_iter.bi_size == 0 &&
-	    !(rq->cmd_flags & (REQ_FLUSH_SEQ|REQ_CLONE)))
+	if (bio->bi_iter.bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
 		bio_endio(bio, error);
 }
 
@@ -2927,22 +2926,95 @@
 }
 EXPORT_SYMBOL_GPL(blk_lld_busy);
 
-void blk_rq_prep_clone(struct request *dst, struct request *src)
+/**
+ * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
+ * @rq: the clone request to be cleaned up
+ *
+ * Description:
+ *     Free all bios in @rq for a cloned request.
+ */
+void blk_rq_unprep_clone(struct request *rq)
+{
+	struct bio *bio;
+
+	while ((bio = rq->bio) != NULL) {
+		rq->bio = bio->bi_next;
+
+		bio_put(bio);
+	}
+}
+EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
+
+/*
+ * Copy attributes of the original request to the clone request.
+ * The actual data parts (e.g. ->cmd, ->sense) are not copied.
+ */
+static void __blk_rq_prep_clone(struct request *dst, struct request *src)
 {
 	dst->cpu = src->cpu;
-	dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK);
-	dst->cmd_flags |= REQ_NOMERGE | REQ_CLONE;
+	dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE;
 	dst->cmd_type = src->cmd_type;
 	dst->__sector = blk_rq_pos(src);
 	dst->__data_len = blk_rq_bytes(src);
 	dst->nr_phys_segments = src->nr_phys_segments;
 	dst->ioprio = src->ioprio;
 	dst->extra_len = src->extra_len;
-	dst->bio = src->bio;
-	dst->biotail = src->biotail;
-	dst->cmd = src->cmd;
-	dst->cmd_len = src->cmd_len;
-	dst->sense = src->sense;
+}
+
+/**
+ * blk_rq_prep_clone - Helper function to setup clone request
+ * @rq: the request to be setup
+ * @rq_src: original request to be cloned
+ * @bs: bio_set that bios for clone are allocated from
+ * @gfp_mask: memory allocation mask for bio
+ * @bio_ctr: setup function to be called for each clone bio.
+ *           Returns %0 for success, non %0 for failure.
+ * @data: private data to be passed to @bio_ctr
+ *
+ * Description:
+ *     Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
+ *     The actual data parts of @rq_src (e.g. ->cmd, ->sense)
+ *     are not copied, and copying such parts is the caller's responsibility.
+ *     Also, pages which the original bios are pointing to are not copied
+ *     and the cloned bios just point same pages.
+ *     So cloned bios must be completed before original bios, which means
+ *     the caller must complete @rq before @rq_src.
+ */
+int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
+		      struct bio_set *bs, gfp_t gfp_mask,
+		      int (*bio_ctr)(struct bio *, struct bio *, void *),
+		      void *data)
+{
+	struct bio *bio, *bio_src;
+
+	if (!bs)
+		bs = fs_bio_set;
+
+	__rq_for_each_bio(bio_src, rq_src) {
+		bio = bio_clone_fast(bio_src, gfp_mask, bs);
+		if (!bio)
+			goto free_and_out;
+
+		if (bio_ctr && bio_ctr(bio, bio_src, data))
+			goto free_and_out;
+
+		if (rq->bio) {
+			rq->biotail->bi_next = bio;
+			rq->biotail = bio;
+		} else
+			rq->bio = rq->biotail = bio;
+	}
+
+	__blk_rq_prep_clone(rq, rq_src);
+
+	return 0;
+
+free_and_out:
+	if (bio)
+		bio_put(bio);
+	blk_rq_unprep_clone(rq);
+
+	return -ENOMEM;
 }
 EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
 
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index 80f02d3..b6f2265 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -1750,7 +1750,7 @@
 
 static struct dm_cache_policy_type default_policy_type = {
 	.name = "default",
-	.version = {1, 0, 0},
+	.version = {1, 4, 0},
 	.hint_size = 4,
 	.owner = THIS_MODULE,
 	.create = smq_create,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 85e1d39..16ba55a 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -942,30 +942,23 @@
 {
 	unsigned type = dm_table_get_type(t);
 	unsigned per_bio_data_size = 0;
+	struct dm_target *tgt;
 	unsigned i;
 
-	switch (type) {
-	case DM_TYPE_BIO_BASED:
-		for (i = 0; i < t->num_targets; i++) {
-			struct dm_target *tgt = t->targets + i;
-
-			per_bio_data_size = max(per_bio_data_size,
-						tgt->per_bio_data_size);
-		}
-		t->mempools = dm_alloc_bio_mempools(t->integrity_supported,
-						    per_bio_data_size);
-		break;
-	case DM_TYPE_REQUEST_BASED:
-	case DM_TYPE_MQ_REQUEST_BASED:
-		t->mempools = dm_alloc_rq_mempools(md, type);
-		break;
-	default:
+	if (unlikely(type == DM_TYPE_NONE)) {
 		DMWARN("no table type is set, can't allocate mempools");
 		return -EINVAL;
 	}
 
-	if (IS_ERR(t->mempools))
-		return PTR_ERR(t->mempools);
+	if (type == DM_TYPE_BIO_BASED)
+		for (i = 0; i < t->num_targets; i++) {
+			tgt = t->targets + i;
+			per_bio_data_size = max(per_bio_data_size, tgt->per_bio_data_size);
+		}
+
+	t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_bio_data_size);
+	if (!t->mempools)
+		return -ENOMEM;
 
 	return 0;
 }
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 2fe0992..f331d88 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -993,6 +993,57 @@
 	dec_pending(io, error);
 }
 
+/*
+ * Partial completion handling for request-based dm
+ */
+static void end_clone_bio(struct bio *clone, int error)
+{
+	struct dm_rq_clone_bio_info *info =
+		container_of(clone, struct dm_rq_clone_bio_info, clone);
+	struct dm_rq_target_io *tio = info->tio;
+	struct bio *bio = info->orig;
+	unsigned int nr_bytes = info->orig->bi_iter.bi_size;
+
+	bio_put(clone);
+
+	if (tio->error)
+		/*
+		 * An error has already been detected on the request.
+		 * Once error occurred, just let clone->end_io() handle
+		 * the remainder.
+		 */
+		return;
+	else if (error) {
+		/*
+		 * Don't notice the error to the upper layer yet.
+		 * The error handling decision is made by the target driver,
+		 * when the request is completed.
+		 */
+		tio->error = error;
+		return;
+	}
+
+	/*
+	 * I/O for the bio successfully completed.
+	 * Notice the data completion to the upper layer.
+	 */
+
+	/*
+	 * bios are processed from the head of the list.
+	 * So the completing bio should always be rq->bio.
+	 * If it's not, something wrong is happening.
+	 */
+	if (tio->orig->bio != bio)
+		DMERR("bio completion is going in the middle of the request");
+
+	/*
+	 * Update the original request.
+	 * Do not use blk_end_request() here, because it may complete
+	 * the original request before the clone, and break the ordering.
+	 */
+	blk_update_request(tio->orig, 0, nr_bytes);
+}
+
 static struct dm_rq_target_io *tio_from_request(struct request *rq)
 {
 	return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
@@ -1050,6 +1101,8 @@
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	struct mapped_device *md = tio->md;
 
+	blk_rq_unprep_clone(clone);
+
 	if (md->type == DM_TYPE_MQ_REQUEST_BASED)
 		/* stacked on blk-mq queue(s) */
 		tio->ti->type->release_clone_rq(clone);
@@ -1784,13 +1837,39 @@
 		dm_complete_request(rq, r);
 }
 
-static void setup_clone(struct request *clone, struct request *rq,
-		        struct dm_rq_target_io *tio)
+static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
+				 void *data)
 {
-	blk_rq_prep_clone(clone, rq);
+	struct dm_rq_target_io *tio = data;
+	struct dm_rq_clone_bio_info *info =
+		container_of(bio, struct dm_rq_clone_bio_info, clone);
+
+	info->orig = bio_orig;
+	info->tio = tio;
+	bio->bi_end_io = end_clone_bio;
+
+	return 0;
+}
+
+static int setup_clone(struct request *clone, struct request *rq,
+		       struct dm_rq_target_io *tio, gfp_t gfp_mask)
+{
+	int r;
+
+	r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask,
+			      dm_rq_bio_constructor, tio);
+	if (r)
+		return r;
+
+	clone->cmd = rq->cmd;
+	clone->cmd_len = rq->cmd_len;
+	clone->sense = rq->sense;
 	clone->end_io = end_clone_request;
 	clone->end_io_data = tio;
+
 	tio->clone = clone;
+
+	return 0;
 }
 
 static struct request *clone_rq(struct request *rq, struct mapped_device *md,
@@ -1811,7 +1890,12 @@
 		clone = tio->clone;
 
 	blk_rq_init(NULL, clone);
-	setup_clone(clone, rq, tio);
+	if (setup_clone(clone, rq, tio, gfp_mask)) {
+		/* -ENOMEM */
+		if (alloc_clone)
+			free_clone_request(md, clone);
+		return NULL;
+	}
 
 	return clone;
 }
@@ -1905,7 +1989,11 @@
 		}
 		if (r != DM_MAPIO_REMAPPED)
 			return r;
-		setup_clone(clone, rq, tio);
+		if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
+			/* -ENOMEM */
+			ti->type->release_clone_rq(clone);
+			return DM_MAPIO_REQUEUE;
+		}
 	}
 
 	switch (r) {
@@ -2349,42 +2437,30 @@
 	kfree(md);
 }
 
-static unsigned filter_md_type(unsigned type, struct mapped_device *md)
-{
-	if (type == DM_TYPE_BIO_BASED)
-		return type;
-
-	return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED;
-}
-
 static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
 {
 	struct dm_md_mempools *p = dm_table_get_md_mempools(t);
 
-	switch (filter_md_type(dm_table_get_type(t), md)) {
-	case DM_TYPE_BIO_BASED:
-		if (md->bs && md->io_pool) {
+	if (md->bs) {
+		/* The md already has necessary mempools. */
+		if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
 			/*
-			 * This bio-based md already has necessary mempools.
 			 * Reload bioset because front_pad may have changed
 			 * because a different table was loaded.
 			 */
 			bioset_free(md->bs);
 			md->bs = p->bs;
 			p->bs = NULL;
-			goto out;
 		}
-		break;
-	case DM_TYPE_REQUEST_BASED:
-		if (md->rq_pool && md->io_pool)
-			/*
-			 * This request-based md already has necessary mempools.
-			 */
-			goto out;
-		break;
-	case DM_TYPE_MQ_REQUEST_BASED:
-		BUG_ON(p); /* No mempools needed */
-		return;
+		/*
+		 * There's no need to reload with request-based dm
+		 * because the size of front_pad doesn't change.
+		 * Note for future: If you are to reload bioset,
+		 * prep-ed requests in the queue may refer
+		 * to bio from the old bioset, so you must walk
+		 * through the queue to unprep.
+		 */
+		goto out;
 	}
 
 	BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);
@@ -2395,6 +2471,7 @@
 	p->rq_pool = NULL;
 	md->bs = p->bs;
 	p->bs = NULL;
+
 out:
 	/* mempool bind completed, no longer need any mempools in the table */
 	dm_table_free_md_mempools(t);
@@ -2774,6 +2851,14 @@
 	return err;
 }
 
+static unsigned filter_md_type(unsigned type, struct mapped_device *md)
+{
+	if (type == DM_TYPE_BIO_BASED)
+		return type;
+
+	return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED;
+}
+
 /*
  * Setup the DM device's queue based on md's type
  */
@@ -3486,23 +3571,48 @@
 }
 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
 
-struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity,
-					     unsigned per_bio_data_size)
+struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
+					    unsigned integrity, unsigned per_bio_data_size)
 {
-	struct dm_md_mempools *pools;
-	unsigned int pool_size = dm_get_reserved_bio_based_ios();
+	struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
+	struct kmem_cache *cachep = NULL;
+	unsigned int pool_size = 0;
 	unsigned int front_pad;
 
-	pools = kzalloc(sizeof(*pools), GFP_KERNEL);
 	if (!pools)
-		return ERR_PTR(-ENOMEM);
+		return NULL;
 
-	front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) +
-		offsetof(struct dm_target_io, clone);
+	type = filter_md_type(type, md);
 
-	pools->io_pool = mempool_create_slab_pool(pool_size, _io_cache);
-	if (!pools->io_pool)
-		goto out;
+	switch (type) {
+	case DM_TYPE_BIO_BASED:
+		cachep = _io_cache;
+		pool_size = dm_get_reserved_bio_based_ios();
+		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
+		break;
+	case DM_TYPE_REQUEST_BASED:
+		cachep = _rq_tio_cache;
+		pool_size = dm_get_reserved_rq_based_ios();
+		pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
+		if (!pools->rq_pool)
+			goto out;
+		/* fall through to setup remaining rq-based pools */
+	case DM_TYPE_MQ_REQUEST_BASED:
+		if (!pool_size)
+			pool_size = dm_get_reserved_rq_based_ios();
+		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
+		/* per_bio_data_size is not used. See __bind_mempools(). */
+		WARN_ON(per_bio_data_size != 0);
+		break;
+	default:
+		BUG();
+	}
+
+	if (cachep) {
+		pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
+		if (!pools->io_pool)
+			goto out;
+	}
 
 	pools->bs = bioset_create_nobvec(pool_size, front_pad);
 	if (!pools->bs)
@@ -3512,37 +3622,11 @@
 		goto out;
 
 	return pools;
+
 out:
 	dm_free_md_mempools(pools);
-	return ERR_PTR(-ENOMEM);
-}
 
-struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md,
-					    unsigned type)
-{
-	unsigned int pool_size;
-	struct dm_md_mempools *pools;
-
-	if (filter_md_type(type, md) == DM_TYPE_MQ_REQUEST_BASED)
-		return NULL; /* No mempools needed */
-
-	pool_size = dm_get_reserved_rq_based_ios();
-	pools = kzalloc(sizeof(*pools), GFP_KERNEL);
-	if (!pools)
-		return ERR_PTR(-ENOMEM);
-
-	pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
-	if (!pools->rq_pool)
-		goto out;
-
-	pools->io_pool = mempool_create_slab_pool(pool_size, _rq_tio_cache);
-	if (!pools->io_pool)
-		goto out;
-
-	return pools;
-out:
-	dm_free_md_mempools(pools);
-	return ERR_PTR(-ENOMEM);
+	return NULL;
 }
 
 void dm_free_md_mempools(struct dm_md_mempools *pools)
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 7fff744..4e98499 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -223,9 +223,8 @@
 /*
  * Mempool operations
  */
-struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity,
-					     unsigned per_bio_data_size);
-struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md, unsigned type);
+struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
+					    unsigned integrity, unsigned per_bio_data_size);
 void dm_free_md_mempools(struct dm_md_mempools *pools);
 
 /*
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 6ab9d12..7303b34 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -192,7 +192,6 @@
 	__REQ_HASHED,		/* on IO scheduler merge hash */
 	__REQ_MQ_INFLIGHT,	/* track inflight for MQ */
 	__REQ_NO_TIMEOUT,	/* requests may never expire */
-	__REQ_CLONE,		/* cloned bios */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -247,6 +246,5 @@
 #define REQ_HASHED		(1ULL << __REQ_HASHED)
 #define REQ_MQ_INFLIGHT		(1ULL << __REQ_MQ_INFLIGHT)
 #define REQ_NO_TIMEOUT		(1ULL << __REQ_NO_TIMEOUT)
-#define REQ_CLONE		(1ULL << __REQ_CLONE)
 
 #endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7f2f54b..d4068c1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -774,7 +774,11 @@
 		unsigned int len);
 extern int blk_rq_check_limits(struct request_queue *q, struct request *rq);
 extern int blk_lld_busy(struct request_queue *q);
-extern void blk_rq_prep_clone(struct request *rq, struct request *rq_src);
+extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
+			     struct bio_set *bs, gfp_t gfp_mask,
+			     int (*bio_ctr)(struct bio *, struct bio *, void *),
+			     void *data);
+extern void blk_rq_unprep_clone(struct request *rq);
 extern int blk_insert_cloned_request(struct request_queue *q,
 				     struct request *rq);
 extern void blk_delay_queue(struct request_queue *, unsigned long);
diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h
index eac8c36..061aca3 100644
--- a/include/uapi/linux/dm-ioctl.h
+++ b/include/uapi/linux/dm-ioctl.h
@@ -267,9 +267,9 @@
 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	31
+#define DM_VERSION_MINOR	32
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2015-3-12)"
+#define DM_VERSION_EXTRA	"-ioctl (2015-6-26)"
 
 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */