blk-throttle: add latency target support One hard problem in adding the .low limit is detecting an idle cgroup. If one cgroup doesn't dispatch enough IO against its low limit, we must have a mechanism to determine if other cgroups dispatch more IO. We added the think time detection mechanism before, but it doesn't work for all workloads. Here we add a latency based approach. We already have a mechanism to calculate the latency threshold for each IO size. For every IO dispatched from a cgroup, we compare its latency against its threshold and record the info. If most IO latency is below the threshold (in the code I use 75%), the cgroup could be treated as idle and other cgroups can dispatch more IO. Currently this latency target check is only for SSDs, as we can't calculate the latency target for hard disks. And this is only for cgroup leaf nodes so far. Signed-off-by: Shaohua Li <shli@fb.com> Signed-off-by: Jens Axboe <axboe@fb.com>

commit: 53696b8d212f4a0f0e5dcdb3df64558dcdf03d1a [log] [tgz]
author: Shaohua Li <shli@fb.com> Mon Mar 27 15:19:43 2017 -0700
committer: Jens Axboe <axboe@fb.com> Tue Mar 28 08:02:20 2017 -0600
tree: 843090a882b03bbaa9731b1bab546be79f7a79c5
parent: b9147dd1bae2b15d6931ecd42f8606c775fecbc9 [diff] [blame]
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 140da29..c82bf9b 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c

@@ -165,6 +165,10 @@
 	unsigned long checked_last_finish_time; /* ns / 1024 */
 	unsigned long avg_idletime; /* ns / 1024 */
 	unsigned long idletime_threshold; /* us */
+
+	unsigned int bio_cnt; /* total bios */
+	unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
+	unsigned long bio_cnt_reset_time;
 };
 
 /* We measure latency for request size from <= 4k to >= 1M */
@@ -1720,12 +1724,15 @@
 	 * - single idle is too long, longer than a fixed value (in case user
 	 *   configure a too big threshold) or 4 times of slice
 	 * - average think time is more than threshold
+	 * - IO latency is largely below threshold
 	 */
 	unsigned long time = jiffies_to_usecs(4 * tg->td->throtl_slice);
 
 	time = min_t(unsigned long, MAX_IDLE_TIME, time);
 	return (ktime_get_ns() >> 10) - tg->last_finish_time > time ||
-	       tg->avg_idletime > tg->idletime_threshold;
+	       tg->avg_idletime > tg->idletime_threshold ||
+	       (tg->latency_target && tg->bio_cnt &&
+		tg->bad_bio_cnt * 5 < tg->bio_cnt);
 }
 
 static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
@@ -2194,12 +2201,36 @@
 
 	start_time = blk_stat_time(&bio->bi_issue_stat) >> 10;
 	finish_time = __blk_stat_time(finish_time_ns) >> 10;
+	if (!start_time || finish_time <= start_time)
+		return;
+
+	lat = finish_time - start_time;
 	/* this is only for bio based driver */
-	if (start_time && finish_time > start_time &&
-	    !(bio->bi_issue_stat.stat & SKIP_LATENCY)) {
-		lat = finish_time - start_time;
+	if (!(bio->bi_issue_stat.stat & SKIP_LATENCY))
 		throtl_track_latency(tg->td, blk_stat_size(&bio->bi_issue_stat),
 			bio_op(bio), lat);
+
+	if (tg->latency_target) {
+		int bucket;
+		unsigned int threshold;
+
+		bucket = request_bucket_index(
+			blk_stat_size(&bio->bi_issue_stat));
+		threshold = tg->td->avg_buckets[bucket].latency +
+			tg->latency_target;
+		if (lat > threshold)
+			tg->bad_bio_cnt++;
+		/*
+		 * Not race free, could get wrong count, which means cgroups
+		 * will be throttled
+		 */
+		tg->bio_cnt++;
+	}
+
+	if (time_after(jiffies, tg->bio_cnt_reset_time) || tg->bio_cnt > 1024) {
+		tg->bio_cnt_reset_time = tg->td->throtl_slice + jiffies;
+		tg->bio_cnt /= 2;
+		tg->bad_bio_cnt /= 2;
 	}
 }
 #endif
commit	53696b8d212f4a0f0e5dcdb3df64558dcdf03d1a	[log] [tgz]
author	Shaohua Li <shli@fb.com>	Mon Mar 27 15:19:43 2017 -0700
committer	Jens Axboe <axboe@fb.com>	Tue Mar 28 08:02:20 2017 -0600
tree	843090a882b03bbaa9731b1bab546be79f7a79c5
parent	b9147dd1bae2b15d6931ecd42f8606c775fecbc9 [diff] [blame]