blk-mq: dequeue request one by one from sw queue if hctx is busy

Dequeuing requests one by one from the sw queue is not efficient, but we
have to do that when the queue is busy in order to get better merge
performance.

This patch uses an Exponentially Weighted Moving Average (EWMA) to figure
out whether the queue is busy, and only dequeues requests one by one from
the sw queue when the queue is busy.

Fixes: b347689ffbca ("blk-mq-sched: improve dispatching from sw queue")
Cc: Kashyap Desai <kashyap.desai@broadcom.com>
Cc: Laurence Oberman <loberman@redhat.com>
Cc: Omar Sandoval <osandov@fb.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Bart Van Assche <bart.vanassche@wdc.com>
Cc: Hannes Reinecke <hare@suse.de>
Reported-by: Kashyap Desai <kashyap.desai@broadcom.com>
Tested-by: Kashyap Desai <kashyap.desai@broadcom.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>

commit: 6e768717304bdbe8d2897ca8298f6b58863fdc41 [log] [tgz]
author: Ming Lei <ming.lei@redhat.com> Tue Jul 03 09:03:16 2018 -0600
committer: Jens Axboe <axboe@kernel.dk> Mon Jul 09 09:07:53 2018 -0600
tree: 64f259002c9bcca0bd1f187cdd7d067e68a29d45
parent: d893ff86034f7107f89d8b740c2b5902a21a49db [diff]
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 7efe268..cb1e6cf 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c

@@ -622,6 +622,14 @@
 	return 0;
 }
 
+static int hctx_dispatch_busy_show(void *data, struct seq_file *m)
+{
+	struct blk_mq_hw_ctx *hctx = data;
+
+	seq_printf(m, "%u\n", hctx->dispatch_busy);
+	return 0;
+}
+
 static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos)
 	__acquires(&ctx->lock)
 {
@@ -783,6 +791,7 @@
 	{"queued", 0600, hctx_queued_show, hctx_queued_write},
 	{"run", 0600, hctx_run_show, hctx_run_write},
 	{"active", 0400, hctx_active_show},
+	{"dispatch_busy", 0400, hctx_dispatch_busy_show},
 	{},
 };
 

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index f3b4b5c..fdc129e 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c

@@ -206,15 +206,8 @@
 		}
 	} else if (has_sched_dispatch) {
 		blk_mq_do_dispatch_sched(hctx);
-	} else if (q->mq_ops->get_budget) {
-		/*
-		 * If we need to get budget before queuing request, we
-		 * dequeue request one by one from sw queue for avoiding
-		 * to mess up I/O merge when dispatch runs out of resource.
-		 *
-		 * TODO: get more budgets, and dequeue more requests in
-		 * one time.
-		 */
+	} else if (hctx->dispatch_busy) {
+		/* dequeue request one by one from sw queue if queue is busy */
 		blk_mq_do_dispatch_ctx(hctx);
 	} else {
 		blk_mq_flush_busy_ctxs(hctx, &rq_list);

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 795ba85..850fdd0 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c

@@ -1074,6 +1074,35 @@
 	return true;
 }
 
+#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT  8
+#define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR  4
+/*
+ * Update dispatch busy with an Exponentially Weighted Moving Average (EWMA):
+ * - EWMA is a simple way to compute a running average
+ * - weights 7/8 (old value) and 1/8 (new sample) make it decay exponentially
+ * - a busy sample is scaled by 1 << 4 so the result does not round down to 0;
+ *   the exact factor doesn't matter because EWMA decreases exponentially
+ */
+static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
+{
+	unsigned int ewma;
+
+	if (hctx->queue->elevator)
+		return;
+
+	ewma = hctx->dispatch_busy;
+
+	if (!ewma && !busy)
+		return;
+
+	ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1;
+	if (busy)
+		ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR;
+	ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;
+
+	hctx->dispatch_busy = ewma;
+}
+
 #define BLK_MQ_RESOURCE_DELAY	3		/* ms units */
 
 /*
@@ -1210,8 +1239,10 @@
 		else if (needs_restart && (ret == BLK_STS_RESOURCE))
 			blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
 
+		blk_mq_update_dispatch_busy(hctx, true);
 		return false;
-	}
+	} else
+		blk_mq_update_dispatch_busy(hctx, false);
 
 	/*
 	 * If the host/device is unable to accept more work, inform the

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ea69025..d710e92 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h

@@ -35,9 +35,10 @@
 	struct sbitmap		ctx_map;
 
 	struct blk_mq_ctx	*dispatch_from;
+	unsigned int		dispatch_busy;
 
-	struct blk_mq_ctx	**ctxs;
 	unsigned int		nr_ctx;
+	struct blk_mq_ctx	**ctxs;
 
 	spinlock_t		dispatch_wait_lock;
 	wait_queue_entry_t	dispatch_wait;
commit	6e768717304bdbe8d2897ca8298f6b58863fdc41	[log] [tgz]
author	Ming Lei <ming.lei@redhat.com>	Tue Jul 03 09:03:16 2018 -0600
committer	Jens Axboe <axboe@kernel.dk>	Mon Jul 09 09:07:53 2018 -0600
tree	64f259002c9bcca0bd1f187cdd7d067e68a29d45
parent	d893ff86034f7107f89d8b740c2b5902a21a49db [diff]