Merge branch 'for-3.16/blk-mq-tagging' into for-3.16/core

Signed-off-by: Jens Axboe <axboe@fb.com>

Conflicts:
	block/blk-mq-tag.c
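
For reference while reading the resolution below, here is a minimal userspace
sketch (illustrative only, not part of this patch) of how the merged ctx_map
layout distributes software-queue pending bits across per-word bitmaps. The
struct fields mirror blk_align_bitmap and blk_mq_ctxmap as added to
block/blk-mq.h and include/linux/blk-mq.h in this diff; ctxmap_alloc() and
ctxmap_mark() are made-up helper names standing in for the logic of
blk_mq_alloc_bitmap() and blk_mq_hctx_mark_pending():

/*
 * Illustrative sketch: in the kernel, each blk_align_bitmap is
 * ____cacheline_aligned_in_smp so every word gets its own cacheline;
 * plain structs are used here for a standalone example.
 */
#include <stdio.h>
#include <stdlib.h>

struct blk_align_bitmap {
	unsigned long word;
	unsigned long depth;
};

struct blk_mq_ctxmap {
	unsigned int map_size;
	unsigned int bits_per_word;
	struct blk_align_bitmap *map;
};

/* mimics blk_mq_alloc_bitmap(): one word per bits_per_word contexts */
static int ctxmap_alloc(struct blk_mq_ctxmap *m, unsigned int nr_ctx)
{
	unsigned int total = nr_ctx, i;

	m->bits_per_word = 8;
	m->map_size = (nr_ctx + m->bits_per_word - 1) / m->bits_per_word;
	m->map = calloc(m->map_size, sizeof(*m->map));
	if (!m->map)
		return -1;

	for (i = 0; i < m->map_size; i++) {
		m->map[i].depth = total < m->bits_per_word ?
					total : m->bits_per_word;
		total -= m->map[i].depth;
	}
	return 0;
}

/* mimics blk_mq_hctx_mark_pending(): index_hw selects a word and a bit */
static void ctxmap_mark(struct blk_mq_ctxmap *m, unsigned int index_hw)
{
	struct blk_align_bitmap *bm = &m->map[index_hw / m->bits_per_word];

	bm->word |= 1UL << (index_hw & (m->bits_per_word - 1));
}

int main(void)
{
	struct blk_mq_ctxmap m;
	unsigned int i;

	if (ctxmap_alloc(&m, 20))
		return 1;

	/* mark software queues 3 and 17 pending */
	ctxmap_mark(&m, 3);
	ctxmap_mark(&m, 17);

	for (i = 0; i < m.map_size; i++)
		printf("map[%u]: depth=%lu word=0x%lx\n",
		       i, m.map[i].depth, m.map[i].word);

	free(m.map);
	return 0;
}
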
diff --git a/block/Makefile b/block/Makefile
index 20645e8..b4c4d3b 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -2,12 +2,13 @@
 # Makefile for the kernel block layer
 #
 
-obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
+obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
 			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
 			blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
 			blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
-			genhd.o scsi_ioctl.o partition-generic.o partitions/
+			genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
+			partitions/
 
 obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
 obj-$(CONFIG_BLK_DEV_BSGLIB)	+= bsg-lib.o
@@ -20,3 +21,4 @@
 obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
 obj-$(CONFIG_BLK_DEV_INTEGRITY)	+= blk-integrity.o
 obj-$(CONFIG_BLK_CMDLINE_PARSER)	+= cmdline-parser.o
+obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
diff --git a/fs/bio-integrity.c b/block/bio-integrity.c
similarity index 100%
rename from fs/bio-integrity.c
rename to block/bio-integrity.c
diff --git a/fs/bio.c b/block/bio.c
similarity index 100%
rename from fs/bio.c
rename to block/bio.c
diff --git a/block/blk-core.c b/block/blk-core.c
index c426970..a6bd3e7 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1233,12 +1233,15 @@
 static void part_round_stats_single(int cpu, struct hd_struct *part,
 				    unsigned long now)
 {
+	int inflight;
+
 	if (now == part->stamp)
 		return;
 
-	if (part_in_flight(part)) {
+	inflight = part_in_flight(part);
+	if (inflight) {
 		__part_stat_add(cpu, part, time_in_queue,
-				part_in_flight(part) * (now - part->stamp));
+				inflight * (now - part->stamp));
 		__part_stat_add(cpu, part, io_ticks, (now - part->stamp));
 	}
 	part->stamp = now;
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index c80086c..e6b3fba 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -20,7 +20,7 @@
 	int i;
 
 	for (i = 0; i < bt->map_nr; i++) {
-		struct blk_mq_bitmap *bm = &bt->map[i];
+		struct blk_align_bitmap *bm = &bt->map[i];
 		int ret;
 
 		ret = find_first_zero_bit(&bm->word, bm->depth);
@@ -117,7 +117,7 @@
 	return atomic_read(&hctx->nr_active) < depth;
 }
 
-static int __bt_get_word(struct blk_mq_bitmap *bm, unsigned int last_tag)
+static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag)
 {
 	int tag, org_last_tag, end;
 
@@ -360,7 +360,7 @@
 	int i;
 
 	for (i = 0; i < bt->map_nr; i++) {
-		struct blk_mq_bitmap *bm = &bt->map[i];
+		struct blk_align_bitmap *bm = &bt->map[i];
 		int bit = 0;
 
 		do {
@@ -400,7 +400,7 @@
 	unsigned int i, used;
 
 	for (i = 0, used = 0; i < bt->map_nr; i++) {
-		struct blk_mq_bitmap *bm = &bt->map[i];
+		struct blk_align_bitmap *bm = &bt->map[i];
 
 		used += bitmap_weight(&bm->word, bm->depth);
 	}
@@ -438,7 +438,7 @@
 		}
 
 		nr = ALIGN(depth, tags_per_word) / tags_per_word;
-		bt->map = kzalloc_node(nr * sizeof(struct blk_mq_bitmap),
+		bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap),
 						GFP_KERNEL, node);
 		if (!bt->map)
 			return -ENOMEM;
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 0f5ec8b..e144f68 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -1,6 +1,8 @@
 #ifndef INT_BLK_MQ_TAG_H
 #define INT_BLK_MQ_TAG_H
 
+#include "blk-mq.h"
+
 enum {
 	BT_WAIT_QUEUES	= 8,
 	BT_WAIT_BATCH	= 8,
@@ -14,18 +16,13 @@
 #define TAG_TO_INDEX(bt, tag)	((tag) >> (bt)->bits_per_word)
 #define TAG_TO_BIT(bt, tag)	((tag) & ((1 << (bt)->bits_per_word) - 1))
 
-struct blk_mq_bitmap {
-	unsigned long word;
-	unsigned long depth;
-} ____cacheline_aligned_in_smp;
-
 struct blk_mq_bitmap_tags {
 	unsigned int depth;
 	unsigned int wake_cnt;
 	unsigned int bits_per_word;
 
 	unsigned int map_nr;
-	struct blk_mq_bitmap *map;
+	struct blk_align_bitmap *map;
 
 	unsigned int wake_index;
 	struct bt_wait_state *bs;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3c4f1fc..0fbef7e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -56,21 +56,40 @@
 {
 	unsigned int i;
 
-	for (i = 0; i < hctx->nr_ctx_map; i++)
-		if (hctx->ctx_map[i])
+	for (i = 0; i < hctx->ctx_map.map_size; i++)
+		if (hctx->ctx_map.map[i].word)
 			return true;
 
 	return false;
 }
 
+static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx,
+					      struct blk_mq_ctx *ctx)
+{
+	return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word];
+}
+
+#define CTX_TO_BIT(hctx, ctx)	\
+	((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1))
+
 /*
  * Mark this ctx as having pending work in this hardware queue
  */
 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
 				     struct blk_mq_ctx *ctx)
 {
-	if (!test_bit(ctx->index_hw, hctx->ctx_map))
-		set_bit(ctx->index_hw, hctx->ctx_map);
+	struct blk_align_bitmap *bm = get_bm(hctx, ctx);
+
+	if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word))
+		set_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
+}
+
+static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
+				      struct blk_mq_ctx *ctx)
+{
+	struct blk_align_bitmap *bm = get_bm(hctx, ctx);
+
+	clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
 }
 
 static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
@@ -630,6 +649,40 @@
 }
 
 /*
+ * Process software queues that have been marked busy, splicing them
+ * to the for-dispatch list
+ */
+static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
+{
+	struct blk_mq_ctx *ctx;
+	int i;
+
+	for (i = 0; i < hctx->ctx_map.map_size; i++) {
+		struct blk_align_bitmap *bm = &hctx->ctx_map.map[i];
+		unsigned int off, bit;
+
+		if (!bm->word)
+			continue;
+
+		bit = 0;
+		off = i * hctx->ctx_map.bits_per_word;
+		do {
+			bit = find_next_bit(&bm->word, bm->depth, bit);
+			if (bit >= bm->depth)
+				break;
+
+			ctx = hctx->ctxs[bit + off];
+			clear_bit(bit, &bm->word);
+			spin_lock(&ctx->lock);
+			list_splice_tail_init(&ctx->rq_list, list);
+			spin_unlock(&ctx->lock);
+
+			bit++;
+		} while (1);
+	}
+}
+
+/*
  * Run this hardware queue, pulling any software queues mapped to it in.
  * Note that this function currently has various problems around ordering
  * of IO. In particular, we'd like FIFO behaviour on handling existing
@@ -638,10 +691,9 @@
 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
-	struct blk_mq_ctx *ctx;
 	struct request *rq;
 	LIST_HEAD(rq_list);
-	int bit, queued;
+	int queued;
 
 	WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
 
@@ -653,14 +705,7 @@
 	/*
 	 * Touch any software queue that has pending entries.
 	 */
-	for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) {
-		clear_bit(bit, hctx->ctx_map);
-		ctx = hctx->ctxs[bit];
-
-		spin_lock(&ctx->lock);
-		list_splice_tail_init(&ctx->rq_list, &rq_list);
-		spin_unlock(&ctx->lock);
-	}
+	flush_busy_ctxs(hctx, &rq_list);
 
 	/*
 	 * If we have previous entries on our dispatch list, grab them
@@ -674,13 +719,9 @@
 	}
 
 	/*
-	 * Delete and return all entries from our dispatch list
-	 */
-	queued = 0;
-
-	/*
 	 * Now process all the entries, sending them to the driver.
 	 */
+	queued = 0;
 	while (!list_empty(&rq_list)) {
 		int ret;
 
@@ -1103,17 +1144,15 @@
 	}
 
 	if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE)) {
-		init_request_from_bio(rq, bio);
-
+		blk_mq_bio_to_request(rq, bio);
 		spin_lock(&ctx->lock);
 insert_rq:
 		__blk_mq_insert_request(hctx, rq, false);
 		spin_unlock(&ctx->lock);
-		blk_account_io_start(rq, 1);
 	} else {
 		spin_lock(&ctx->lock);
 		if (!blk_mq_attempt_merge(q, ctx, bio)) {
-			init_request_from_bio(rq, bio);
+			blk_mq_bio_to_request(rq, bio);
 			goto insert_rq;
 		}
 
@@ -1175,7 +1214,7 @@
 	spin_lock(&ctx->lock);
 	if (!list_empty(&ctx->rq_list)) {
 		list_splice_init(&ctx->rq_list, &tmp);
-		clear_bit(ctx->index_hw, hctx->ctx_map);
+		blk_mq_hctx_clear_pending(hctx, ctx);
 	}
 	spin_unlock(&ctx->lock);
 
@@ -1315,6 +1354,34 @@
 	return NULL;
 }
 
+static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap)
+{
+	kfree(bitmap->map);
+}
+
+static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
+{
+	unsigned int bpw = 8, total, num_maps, i;
+
+	bitmap->bits_per_word = bpw;
+
+	num_maps = ALIGN(nr_cpu_ids, bpw) / bpw;
+	bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap),
+					GFP_KERNEL, node);
+	if (!bitmap->map)
+		return -ENOMEM;
+
+	bitmap->map_size = num_maps;
+
+	total = nr_cpu_ids;
+	for (i = 0; i < num_maps; i++) {
+		bitmap->map[i].depth = min(total, bitmap->bits_per_word);
+		total -= bitmap->map[i].depth;
+	}
+
+	return 0;
+}
+
 static int blk_mq_init_hw_queues(struct request_queue *q,
 		struct blk_mq_tag_set *set)
 {
@@ -1325,7 +1392,6 @@
 	 * Initialize hardware queues
 	 */
 	queue_for_each_hw_ctx(q, hctx, i) {
-		unsigned int num_maps;
 		int node;
 
 		node = hctx->numa_node;
@@ -1356,13 +1422,9 @@
 		if (!hctx->ctxs)
 			break;
 
-		num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG;
-		hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long),
-						GFP_KERNEL, node);
-		if (!hctx->ctx_map)
+		if (blk_mq_alloc_bitmap(&hctx->ctx_map, node))
 			break;
 
-		hctx->nr_ctx_map = num_maps;
 		hctx->nr_ctx = 0;
 
 		if (set->ops->init_hctx &&
@@ -1385,7 +1447,7 @@
 
 		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
 		kfree(hctx->ctxs);
-		kfree(hctx->ctx_map);
+		blk_mq_free_bitmap(&hctx->ctx_map);
 	}
 
 	return 1;
@@ -1614,7 +1676,6 @@
 	blk_mq_del_queue_tag_set(q);
 
 	queue_for_each_hw_ctx(q, hctx, i) {
-		kfree(hctx->ctx_map);
 		kfree(hctx->ctxs);
 		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
 		if (q->mq_ops->exit_hctx)
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 97cfab9..5e5a378 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -52,4 +52,13 @@
 extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set);
 extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);
 
+/*
+ * Basic implementation of a sparser bitmap, allowing the user to spread
+ * the bits over more cachelines.
+ */
+struct blk_align_bitmap {
+	unsigned long word;
+	unsigned long depth;
+} ____cacheline_aligned_in_smp;
+
 #endif
diff --git a/fs/ioprio.c b/block/ioprio.c
similarity index 100%
rename from fs/ioprio.c
rename to block/ioprio.c
diff --git a/fs/Makefile b/fs/Makefile
index f9cb987..4030cbf 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -14,14 +14,13 @@
 		stack.o fs_struct.o statfs.o
 
 ifeq ($(CONFIG_BLOCK),y)
-obj-y +=	buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
+obj-y +=	buffer.o block_dev.o direct-io.o mpage.o
 else
 obj-y +=	no-block.o
 endif
 
 obj-$(CONFIG_PROC_FS) += proc_namespace.o
 
-obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
 obj-y				+= notify/
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
 obj-$(CONFIG_ANON_INODES)	+= anon_inodes.o
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 379f88d..a06ca7b 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -11,6 +11,12 @@
 	void (*notify)(void *data, unsigned long action, unsigned int cpu);
 };
 
+struct blk_mq_ctxmap {
+	unsigned int map_size;
+	unsigned int bits_per_word;
+	struct blk_align_bitmap *map;
+};
+
 struct blk_mq_hw_ctx {
 	struct {
 		spinlock_t		lock;
@@ -31,8 +37,8 @@
 
 	void			*driver_data;
 
-	unsigned int 		nr_ctx_map;
-	unsigned long		*ctx_map;
+	struct blk_mq_ctxmap	ctx_map;
+
 	unsigned int		nr_ctx;
 	struct blk_mq_ctx	**ctxs;