blkcg: implement per-blkg request allocation

Currently, request_queue has one request_list from which to allocate
requests, regardless of the blkcg of the IO being issued.  When the
unified request pool is used up, cfq proportional IO limits become
meaningless - whoever grabs the next freed request wins the race
regardless of the configured weights.

This can be easily demonstrated by creating a blkio cgroup with a
very low weight, putting a program which issues a lot of random
direct IOs in it, and running a sequential IO from a different
cgroup.  As soon as the request pool is used up, the sequential IO
bandwidth crashes.

This patch implements per-blkg request_list.  Each blkg has its own
request_list, and any IO allocates its request from the matching
blkg, making blkcgs completely isolated in terms of request
allocation.
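
As a rough sketch (simplified; the accessors below are the ones used
in the diff, with the struct body abbreviated), each request records
the request_list it was allocated from so that freeing it can return
it to the right per-blkg pool:

	/* each request remembers the request_list it came from */
	struct request {
		/* ... existing fields ... */
		struct request_list *rl;	/* rl this rq came from */
	};

	static inline void blk_rq_set_rl(struct request *rq,
					 struct request_list *rl)
	{
		rq->rl = rl;
	}

	static inline struct request_list *blk_rq_rl(struct request *rq)
	{
		return rq->rl;
	}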

* Root blkcg uses the request_list embedded in each request_queue,
  which was renamed to @q->root_rl from @q->rq.  While this makes
  blkcg rl handling a bit hairier, it avoids most of the overhead for
  the root blkcg.

* Queue fullness is now properly per-request_list, but bdi isn't
  blkcg-aware yet, so congestion state currently just follows the
  root blkcg.  As writeback isn't aware of blkcg yet, this works okay
  for async congestion, but readahead may get the wrong signals.
  It's still better than blkcg completely collapsing with a shared
  request_list, and it needs to be improved with future changes.

* After this change, each block cgroup gets a full request pool,
  making the resource consumption of each cgroup higher.  This makes
  allowing non-root users to create cgroups less desirable; however,
  note that allowing non-root users to directly manage cgroups is
  already severely broken regardless of this patch - each block
  cgroup consumes kernel memory and skews IO weights (IO weights are
  not hierarchical).

v2: queue-sysfs.txt and the patch description updated as suggested
    by Vivek.

v3: blk_get_rl() wasn't checking the error return from
    blkg_lookup_create() and could oops on lookup failure.  Fix it
    by falling back to root_rl on blkg lookup failures.  This
    problem was spotted by Rakesh Iyer <rni@google.com>.
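
In outline, the lookup with this fallback behaves as below (a
simplified sketch: the real blk_get_rl() also holds rcu_read_lock()
across the lookup and takes a reference on the blkg, which
blk_put_rl() drops when the request is freed):

	static struct request_list *blk_get_rl(struct request_queue *q,
					       struct bio *bio)
	{
		struct blkcg *blkcg = bio_blkcg(bio);
		struct blkcg_gq *blkg;

		/* root blkcg bypasses blkg lookup and uses @q->root_rl */
		if (blkcg == &blkcg_root)
			return &q->root_rl;

		/*
		 * Lookup/creation can fail, e.g. under memory pressure
		 * or while the queue is dying.  Fall back to root_rl
		 * instead of dereferencing the error pointer.
		 */
		blkg = blkg_lookup_create(blkcg, q);
		if (IS_ERR(blkg))
			return &q->root_rl;

		return &blkg->rl;
	}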

v4: Updated to accommodate 458f27a982 "block: Avoid missed wakeup in
    request waitqueue".  blk_drain_queue() now wakes up waiters on all
    blkg->rl on the target queue.
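
blk_queue_for_each_rl() used below walks @q->root_rl first and then
the rl of each blkg on the queue; schematically (a sketch of the
iterator, with the traversal helper's internals omitted):

	#define blk_queue_for_each_rl(rl, q)	\
		for ((rl) = &(q)->root_rl; (rl);	\
		     (rl) = __blk_queue_next_rl((rl), (q)))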

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
diff --git a/block/blk-core.c b/block/blk-core.c
index f392a2e..dd134d8 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -416,9 +416,14 @@
 	 * left with hung waiters. We need to wake up those waiters.
 	 */
 	if (q->request_fn) {
+		struct request_list *rl;
+
 		spin_lock_irq(q->queue_lock);
-		for (i = 0; i < ARRAY_SIZE(q->rq.wait); i++)
-			wake_up_all(&q->rq.wait[i]);
+
+		blk_queue_for_each_rl(rl, q)
+			for (i = 0; i < ARRAY_SIZE(rl->wait); i++)
+				wake_up_all(&rl->wait[i]);
+
 		spin_unlock_irq(q->queue_lock);
 	}
 }
@@ -685,7 +690,7 @@
 	if (!q)
 		return NULL;
 
-	if (blk_init_rl(&q->rq, q, GFP_KERNEL))
+	if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
 		return NULL;
 
 	q->request_fn		= rfn;
@@ -776,7 +781,12 @@
 {
 	struct request_queue *q = rl->q;
 
-	if (rl->count[sync] < queue_congestion_off_threshold(q))
+	/*
+	 * bdi isn't aware of blkcg yet.  As all async IOs end up root
+	 * blkcg anyway, just use root blkcg state.
+	 */
+	if (rl == &q->root_rl &&
+	    rl->count[sync] < queue_congestion_off_threshold(q))
 		blk_clear_queue_congested(q, sync);
 
 	if (rl->count[sync] + 1 <= q->nr_requests) {
@@ -897,7 +907,12 @@
 				}
 			}
 		}
-		blk_set_queue_congested(q, is_sync);
+		/*
+		 * bdi isn't aware of blkcg yet.  As all async IOs end up
+		 * root blkcg anyway, just use root blkcg state.
+		 */
+		if (rl == &q->root_rl)
+			blk_set_queue_congested(q, is_sync);
 	}
 
 	/*
@@ -939,6 +954,7 @@
 		goto fail_alloc;
 
 	blk_rq_init(q, rq);
+	blk_rq_set_rl(rq, rl);
 	rq->cmd_flags = rw_flags | REQ_ALLOCED;
 
 	/* init elvpriv */
@@ -1032,15 +1048,19 @@
 {
 	const bool is_sync = rw_is_sync(rw_flags) != 0;
 	DEFINE_WAIT(wait);
-	struct request_list *rl = &q->rq;
+	struct request_list *rl;
 	struct request *rq;
+
+	rl = blk_get_rl(q, bio);	/* transferred to @rq on success */
 retry:
-	rq = __get_request(&q->rq, rw_flags, bio, gfp_mask);
+	rq = __get_request(rl, rw_flags, bio, gfp_mask);
 	if (rq)
 		return rq;
 
-	if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dead(q)))
+	if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dead(q))) {
+		blk_put_rl(rl);
 		return NULL;
+	}
 
 	/* wait on @rl and retry */
 	prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
@@ -1231,12 +1251,14 @@
 	 */
 	if (req->cmd_flags & REQ_ALLOCED) {
 		unsigned int flags = req->cmd_flags;
+		struct request_list *rl = blk_rq_rl(req);
 
 		BUG_ON(!list_empty(&req->queuelist));
 		BUG_ON(!hlist_unhashed(&req->hash));
 
-		blk_free_request(&q->rq, req);
-		freed_request(&q->rq, flags);
+		blk_free_request(rl, req);
+		freed_request(rl, flags);
+		blk_put_rl(rl);
 	}
 }
 EXPORT_SYMBOL_GPL(__blk_put_request);