blk-mq: introduce .get_budget and .put_budget in blk_mq_ops

For SCSI devices, there is often a per-request-queue depth limit, which
needs to be respected before queuing a request.

Currently blk-mq always dequeues a request first, then calls .queue_rq()
to dispatch the request to the LLD (low-level driver). One obvious issue
with this approach is that I/O merging may fail: when the per-request-queue
depth can't be respected, .queue_rq() has to return BLK_STS_RESOURCE, and
the request then has to stay on the hctx->dispatch list, where it never
gets a chance to be merged with other I/O.

This patch introduces the .get_budget and .put_budget callbacks in
blk_mq_ops, so that we can try to reserve budget before dequeuing a
request. If the budget for queueing I/O can't be obtained, the request is
not dequeued at all; it can be left in the I/O scheduler queue, giving it
more merging opportunities.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
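
The dispatch path below calls blk_mq_get_dispatch_budget() and
blk_mq_put_dispatch_budget(), which are defined outside the hunk shown
here. As a rough sketch, they are assumed to be thin wrappers around the
new callbacks that fall back to BLK_STS_OK / a no-op when a driver does
not implement them:

  static inline blk_status_t blk_mq_get_dispatch_budget(struct blk_mq_hw_ctx *hctx)
  {
          struct request_queue *q = hctx->queue;

          /* Drivers without a budget limit always get BLK_STS_OK. */
          if (q->mq_ops->get_budget)
                  return q->mq_ops->get_budget(hctx);
          return BLK_STS_OK;
  }

  static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx)
  {
          struct request_queue *q = hctx->queue;

          if (q->mq_ops->put_budget)
                  q->mq_ops->put_budget(hctx);
  }
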
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index be29ba8..8e525e6 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -89,31 +89,57 @@
 	return false;
 }
 
-static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
+/* return true if hctx needs to be run again */
+static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	struct elevator_queue *e = q->elevator;
 	LIST_HEAD(rq_list);
 
 	do {
-		struct request *rq = e->type->ops.mq.dispatch_request(hctx);
+		struct request *rq;
+		blk_status_t ret;
 
-		if (!rq)
+		if (e->type->ops.mq.has_work &&
+				!e->type->ops.mq.has_work(hctx))
 			break;
+
+		ret = blk_mq_get_dispatch_budget(hctx);
+		if (ret == BLK_STS_RESOURCE)
+			return true;
+
+		rq = e->type->ops.mq.dispatch_request(hctx);
+		if (!rq) {
+			blk_mq_put_dispatch_budget(hctx);
+			break;
+		} else if (ret != BLK_STS_OK) {
+			blk_mq_end_request(rq, ret);
+			continue;
+		}
+
+		/*
+		 * Now this rq owns the budget which has to be released
+		 * if this rq won't be queued to driver via .queue_rq()
+		 * in blk_mq_dispatch_rq_list().
+		 */
 		list_add(&rq->queuelist, &rq_list);
-	} while (blk_mq_dispatch_rq_list(q, &rq_list));
+	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+
+	return false;
 }
 
-void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
+/* return true if hw queue needs to be run again */
+bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	struct elevator_queue *e = q->elevator;
 	const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
 	LIST_HEAD(rq_list);
+	bool run_queue = false;
 
 	/* RCU or SRCU read lock is needed before checking quiesced flag */
 	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
-		return;
+		return false;
 
 	hctx->run++;
 
@@ -143,14 +169,23 @@
 	 */
 	if (!list_empty(&rq_list)) {
 		blk_mq_sched_mark_restart_hctx(hctx);
-		if (blk_mq_dispatch_rq_list(q, &rq_list) && has_sched_dispatch)
-			blk_mq_do_dispatch_sched(hctx);
+		if (blk_mq_dispatch_rq_list(q, &rq_list, false) &&
+				has_sched_dispatch)
+			run_queue = blk_mq_do_dispatch_sched(hctx);
 	} else if (has_sched_dispatch) {
-		blk_mq_do_dispatch_sched(hctx);
+		run_queue = blk_mq_do_dispatch_sched(hctx);
 	} else {
 		blk_mq_flush_busy_ctxs(hctx, &rq_list);
-		blk_mq_dispatch_rq_list(q, &rq_list);
+		blk_mq_dispatch_rq_list(q, &rq_list, false);
 	}
+
+	if (run_queue && !blk_mq_sched_needs_restart(hctx) &&
+			!test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) {
+		blk_mq_sched_mark_restart_hctx(hctx);
+		return true;
+	}
+
+	return false;
 }
 
 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,