blk-mq: use sparser tag layout for lower queue depth

Spreading tags over multiple cachelines makes tagging more
efficient on multicore systems. But since we have
8 * sizeof(unsigned long) tags per cacheline, we don't always get
a nice spread.

Attempt to spread the tags over at least 4 cachelines, using fewer
bits per unsigned long if we have to. This improves tagging
performance in setups with 32-128 tags. For higher depths, the
spread is the same as before (BITS_PER_LONG tags per cacheline).
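
As an illustration, here is a minimal user-space sketch of the same
word-sizing loop (pick_tags_per_word() is only a name used for this
example, not a kernel helper):

  #include <stdio.h>

  #define BITS_PER_LONG	(8 * sizeof(unsigned long))

  /* Mirror of the word-sizing loop added by this patch */
  static unsigned int pick_tags_per_word(unsigned int depth)
  {
  	unsigned int tags_per_word = BITS_PER_LONG;

  	/* Halve the word size until the tags span at least 4 words */
  	if (depth >= 4) {
  		while (tags_per_word * 4 > depth)
  			tags_per_word >>= 1;
  	}
  	return tags_per_word;
  }

  int main(void)
  {
  	unsigned int depths[] = { 32, 64, 128, 256 };
  	unsigned int i;

  	for (i = 0; i < 4; i++) {
  		unsigned int tpw = pick_tags_per_word(depths[i]);

  		printf("depth=%3u -> %2u tags/word, %u words\n",
  		       depths[i], tpw, (depths[i] + tpw - 1) / tpw);
  	}
  	return 0;
  }

On a 64-bit build this gives 4 words (one cacheline each) for depths
of 32-128; from 256 tags and up it settles on BITS_PER_LONG tags per
word, i.e. the old layout.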

Signed-off-by: Jens Axboe <axboe@fb.com>
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 467f3a2..6c78c08 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -44,7 +44,7 @@
 {
 	int tag, org_last_tag, end;
 
-	org_last_tag = last_tag = TAG_TO_BIT(last_tag);
+	org_last_tag = last_tag;
 	end = bm->depth;
 	do {
 restart:
@@ -84,12 +84,12 @@
 	int index, i, tag;
 
 	last_tag = org_last_tag = *tag_cache;
-	index = TAG_TO_INDEX(last_tag);
+	index = TAG_TO_INDEX(bt, last_tag);
 
 	for (i = 0; i < bt->map_nr; i++) {
-		tag = __bt_get_word(&bt->map[index], last_tag);
+		tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag));
 		if (tag != -1) {
-			tag += index * BITS_PER_LONG;
+			tag += (index << bt->bits_per_word);
 			goto done;
 		}
 
@@ -233,10 +233,10 @@
 
 static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
 {
-	const int index = TAG_TO_INDEX(tag);
+	const int index = TAG_TO_INDEX(bt, tag);
 	struct bt_wait_state *bs;
 
-	clear_bit(TAG_TO_BIT(tag), &bt->map[index].word);
+	clear_bit(TAG_TO_BIT(bt, tag), &bt->map[index].word);
 
 	bs = bt_wake_ptr(bt);
 	if (bs && atomic_dec_and_test(&bs->wait_cnt)) {
@@ -292,7 +292,7 @@
 			bit++;
 		} while (1);
 
-		off += BITS_PER_LONG;
+		off += (1 << bt->bits_per_word);
 	}
 }
 
@@ -333,14 +333,31 @@
 {
 	int i;
 
+	bt->bits_per_word = ilog2(BITS_PER_LONG);
+
 	/*
 	 * Depth can be zero for reserved tags, that's not a failure
 	 * condition.
 	 */
 	if (depth) {
-		int nr, i, map_depth;
+		unsigned int nr, i, map_depth, tags_per_word;
 
-		nr = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG;
+		tags_per_word = (1 << bt->bits_per_word);
+
+		/*
+		 * If the tag space is small, shrink the number of tags
+		 * per word so we spread over a few cachelines, at least.
+		 * If less than 4 tags, just forget about it, it's not
+		 * going to work optimally anyway.
+		 */
+		if (depth >= 4) {
+			while (tags_per_word * 4 > depth) {
+				bt->bits_per_word--;
+				tags_per_word = (1 << bt->bits_per_word);
+			}
+		}
+
+		nr = ALIGN(depth, tags_per_word) / tags_per_word;
 		bt->map = kzalloc_node(nr * sizeof(struct blk_mq_bitmap),
 						GFP_KERNEL, node);
 		if (!bt->map)
@@ -349,8 +366,8 @@
 		bt->map_nr = nr;
 		map_depth = depth;
 		for (i = 0; i < nr; i++) {
-			bt->map[i].depth = min(map_depth, BITS_PER_LONG);
-			map_depth -= BITS_PER_LONG;
+			bt->map[i].depth = min(map_depth, tags_per_word);
+			map_depth -= tags_per_word;
 		}
 	}
 
@@ -443,8 +460,10 @@
 	if (!tags)
 		return 0;
 
-	page += sprintf(page, "nr_tags=%u, reserved_tags=%u\n",
-			tags->nr_tags, tags->nr_reserved_tags);
+	page += sprintf(page, "nr_tags=%u, reserved_tags=%u, "
+			"bits_per_word=%u\n",
+			tags->nr_tags, tags->nr_reserved_tags,
+			tags->bitmap_tags.bits_per_word);
 
 	free = bt_unused_tags(&tags->bitmap_tags);
 	res = bt_unused_tags(&tags->breserved_tags);