blob: a1660bafc912452a37730ac0ed2bf02f6c8a99bc [file] [log] [blame]
Omar Sandoval00e04392017-04-14 01:00:02 -07001/*
2 * The Kyber I/O scheduler. Controls latency by throttling queue depths using
3 * scalable techniques.
4 *
5 * Copyright (C) 2017 Facebook
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public
9 * License v2 as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program. If not, see <https://www.gnu.org/licenses/>.
18 */
19
20#include <linux/kernel.h>
21#include <linux/blkdev.h>
22#include <linux/blk-mq.h>
23#include <linux/elevator.h>
24#include <linux/module.h>
25#include <linux/sbitmap.h>
26
27#include "blk.h"
28#include "blk-mq.h"
Omar Sandoval16b738f2017-05-04 00:31:33 -070029#include "blk-mq-debugfs.h"
Omar Sandoval00e04392017-04-14 01:00:02 -070030#include "blk-mq-sched.h"
31#include "blk-mq-tag.h"
32#include "blk-stat.h"
33
34/* Scheduling domains. */
35enum {
36 KYBER_READ,
37 KYBER_SYNC_WRITE,
38 KYBER_OTHER, /* Async writes, discard, etc. */
39 KYBER_NUM_DOMAINS,
40};
41
42enum {
43 KYBER_MIN_DEPTH = 256,
44
45 /*
46 * In order to prevent starvation of synchronous requests by a flood of
47 * asynchronous requests, we reserve 25% of requests for synchronous
48 * operations.
49 */
50 KYBER_ASYNC_PERCENT = 75,
51};
52
53/*
54 * Initial device-wide depths for each scheduling domain.
55 *
56 * Even for fast devices with lots of tags like NVMe, you can saturate
57 * the device with only a fraction of the maximum possible queue depth.
58 * So, we cap these to a reasonable value.
59 */
60static const unsigned int kyber_depth[] = {
61 [KYBER_READ] = 256,
62 [KYBER_SYNC_WRITE] = 128,
63 [KYBER_OTHER] = 64,
64};
65
66/*
67 * Scheduling domain batch sizes. We favor reads.
68 */
69static const unsigned int kyber_batch_size[] = {
70 [KYBER_READ] = 16,
71 [KYBER_SYNC_WRITE] = 8,
72 [KYBER_OTHER] = 8,
73};
74
Jianchao Wanga6088842018-05-30 10:47:40 -060075/*
76 * There is a same mapping between ctx & hctx and kcq & khd,
77 * we use request->mq_ctx->index_hw to index the kcq in khd.
78 */
79struct kyber_ctx_queue {
80 /*
81 * Used to ensure operations on rq_list and kcq_map to be an atmoic one.
82 * Also protect the rqs on rq_list when merge.
83 */
84 spinlock_t lock;
85 struct list_head rq_list[KYBER_NUM_DOMAINS];
86} ____cacheline_aligned_in_smp;
87
Omar Sandoval00e04392017-04-14 01:00:02 -070088struct kyber_queue_data {
89 struct request_queue *q;
90
91 struct blk_stat_callback *cb;
92
93 /*
94 * The device is divided into multiple scheduling domains based on the
95 * request type. Each domain has a fixed number of in-flight requests of
96 * that type device-wide, limited by these tokens.
97 */
98 struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
99
100 /*
101 * Async request percentage, converted to per-word depth for
102 * sbitmap_get_shallow().
103 */
104 unsigned int async_depth;
105
106 /* Target latencies in nanoseconds. */
107 u64 read_lat_nsec, write_lat_nsec;
108};
109
110struct kyber_hctx_data {
111 spinlock_t lock;
112 struct list_head rqs[KYBER_NUM_DOMAINS];
113 unsigned int cur_domain;
114 unsigned int batching;
Jianchao Wanga6088842018-05-30 10:47:40 -0600115 struct kyber_ctx_queue *kcqs;
116 struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
Ingo Molnarac6424b2017-06-20 12:06:13 +0200117 wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
Omar Sandovalfcf38cd2017-12-05 22:57:43 -0800118 struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
Omar Sandoval00e04392017-04-14 01:00:02 -0700119 atomic_t wait_index[KYBER_NUM_DOMAINS];
120};
121
Omar Sandovalfcf38cd2017-12-05 22:57:43 -0800122static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
123 void *key);
124
Jianchao Wanga6088842018-05-30 10:47:40 -0600125static unsigned int kyber_sched_domain(unsigned int op)
Omar Sandoval00e04392017-04-14 01:00:02 -0700126{
Omar Sandoval00e04392017-04-14 01:00:02 -0700127 if ((op & REQ_OP_MASK) == REQ_OP_READ)
128 return KYBER_READ;
129 else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
130 return KYBER_SYNC_WRITE;
131 else
132 return KYBER_OTHER;
133}
134
135enum {
136 NONE = 0,
137 GOOD = 1,
138 GREAT = 2,
139 BAD = -1,
140 AWFUL = -2,
141};
142
143#define IS_GOOD(status) ((status) > 0)
144#define IS_BAD(status) ((status) < 0)
145
146static int kyber_lat_status(struct blk_stat_callback *cb,
147 unsigned int sched_domain, u64 target)
148{
149 u64 latency;
150
151 if (!cb->stat[sched_domain].nr_samples)
152 return NONE;
153
154 latency = cb->stat[sched_domain].mean;
155 if (latency >= 2 * target)
156 return AWFUL;
157 else if (latency > target)
158 return BAD;
159 else if (latency <= target / 2)
160 return GREAT;
161 else /* (latency <= target) */
162 return GOOD;
163}
164
165/*
166 * Adjust the read or synchronous write depth given the status of reads and
167 * writes. The goal is that the latencies of the two domains are fair (i.e., if
168 * one is good, then the other is good).
169 */
170static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
171 unsigned int sched_domain, int this_status,
172 int other_status)
173{
174 unsigned int orig_depth, depth;
175
176 /*
177 * If this domain had no samples, or reads and writes are both good or
178 * both bad, don't adjust the depth.
179 */
180 if (this_status == NONE ||
181 (IS_GOOD(this_status) && IS_GOOD(other_status)) ||
182 (IS_BAD(this_status) && IS_BAD(other_status)))
183 return;
184
185 orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;
186
187 if (other_status == NONE) {
188 depth++;
189 } else {
190 switch (this_status) {
191 case GOOD:
192 if (other_status == AWFUL)
193 depth -= max(depth / 4, 1U);
194 else
195 depth -= max(depth / 8, 1U);
196 break;
197 case GREAT:
198 if (other_status == AWFUL)
199 depth /= 2;
200 else
201 depth -= max(depth / 4, 1U);
202 break;
203 case BAD:
204 depth++;
205 break;
206 case AWFUL:
207 if (other_status == GREAT)
208 depth += 2;
209 else
210 depth++;
211 break;
212 }
213 }
214
215 depth = clamp(depth, 1U, kyber_depth[sched_domain]);
216 if (depth != orig_depth)
217 sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
218}
219
220/*
221 * Adjust the depth of other requests given the status of reads and synchronous
222 * writes. As long as either domain is doing fine, we don't throttle, but if
223 * both domains are doing badly, we throttle heavily.
224 */
225static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
226 int read_status, int write_status,
227 bool have_samples)
228{
229 unsigned int orig_depth, depth;
230 int status;
231
232 orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth;
233
234 if (read_status == NONE && write_status == NONE) {
235 depth += 2;
236 } else if (have_samples) {
237 if (read_status == NONE)
238 status = write_status;
239 else if (write_status == NONE)
240 status = read_status;
241 else
242 status = max(read_status, write_status);
243 switch (status) {
244 case GREAT:
245 depth += 2;
246 break;
247 case GOOD:
248 depth++;
249 break;
250 case BAD:
251 depth -= max(depth / 4, 1U);
252 break;
253 case AWFUL:
254 depth /= 2;
255 break;
256 }
257 }
258
259 depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]);
260 if (depth != orig_depth)
261 sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth);
262}
263
264/*
265 * Apply heuristics for limiting queue depths based on gathered latency
266 * statistics.
267 */
268static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
269{
270 struct kyber_queue_data *kqd = cb->data;
271 int read_status, write_status;
272
273 read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
274 write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec);
275
276 kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status);
277 kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status);
278 kyber_adjust_other_depth(kqd, read_status, write_status,
279 cb->stat[KYBER_OTHER].nr_samples != 0);
280
281 /*
282 * Continue monitoring latencies if we aren't hitting the targets or
283 * we're still throttling other requests.
284 */
285 if (!blk_stat_is_active(kqd->cb) &&
286 ((IS_BAD(read_status) || IS_BAD(write_status) ||
287 kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])))
288 blk_stat_activate_msecs(kqd->cb, 100);
289}
290
291static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
292{
293 /*
294 * All of the hardware queues have the same depth, so we can just grab
295 * the shift of the first one.
296 */
297 return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
298}
299
Jianchao Wanga6088842018-05-30 10:47:40 -0600300static int kyber_bucket_fn(const struct request *rq)
301{
302 return kyber_sched_domain(rq->cmd_flags);
303}
304
Omar Sandoval00e04392017-04-14 01:00:02 -0700305static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
306{
307 struct kyber_queue_data *kqd;
308 unsigned int max_tokens;
309 unsigned int shift;
310 int ret = -ENOMEM;
311 int i;
312
313 kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
314 if (!kqd)
315 goto err;
316 kqd->q = q;
317
Jianchao Wanga6088842018-05-30 10:47:40 -0600318 kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, kyber_bucket_fn,
Omar Sandoval00e04392017-04-14 01:00:02 -0700319 KYBER_NUM_DOMAINS, kqd);
320 if (!kqd->cb)
321 goto err_kqd;
322
323 /*
324 * The maximum number of tokens for any scheduling domain is at least
325 * the queue depth of a single hardware queue. If the hardware doesn't
326 * have many tags, still provide a reasonable number.
327 */
328 max_tokens = max_t(unsigned int, q->tag_set->queue_depth,
329 KYBER_MIN_DEPTH);
330 for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
331 WARN_ON(!kyber_depth[i]);
332 WARN_ON(!kyber_batch_size[i]);
333 ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
334 max_tokens, -1, false, GFP_KERNEL,
335 q->node);
336 if (ret) {
337 while (--i >= 0)
338 sbitmap_queue_free(&kqd->domain_tokens[i]);
339 goto err_cb;
340 }
341 sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]);
342 }
343
344 shift = kyber_sched_tags_shift(kqd);
345 kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
346
347 kqd->read_lat_nsec = 2000000ULL;
348 kqd->write_lat_nsec = 10000000ULL;
349
350 return kqd;
351
352err_cb:
353 blk_stat_free_callback(kqd->cb);
354err_kqd:
355 kfree(kqd);
356err:
357 return ERR_PTR(ret);
358}
359
360static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
361{
362 struct kyber_queue_data *kqd;
363 struct elevator_queue *eq;
364
365 eq = elevator_alloc(q, e);
366 if (!eq)
367 return -ENOMEM;
368
369 kqd = kyber_queue_data_alloc(q);
370 if (IS_ERR(kqd)) {
371 kobject_put(&eq->kobj);
372 return PTR_ERR(kqd);
373 }
374
375 eq->elevator_data = kqd;
376 q->elevator = eq;
377
378 blk_stat_add_callback(q, kqd->cb);
379
380 return 0;
381}
382
383static void kyber_exit_sched(struct elevator_queue *e)
384{
385 struct kyber_queue_data *kqd = e->elevator_data;
386 struct request_queue *q = kqd->q;
387 int i;
388
389 blk_stat_remove_callback(q, kqd->cb);
390
391 for (i = 0; i < KYBER_NUM_DOMAINS; i++)
392 sbitmap_queue_free(&kqd->domain_tokens[i]);
393 blk_stat_free_callback(kqd->cb);
394 kfree(kqd);
395}
396
Jianchao Wanga6088842018-05-30 10:47:40 -0600397static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq)
398{
399 unsigned int i;
400
401 spin_lock_init(&kcq->lock);
402 for (i = 0; i < KYBER_NUM_DOMAINS; i++)
403 INIT_LIST_HEAD(&kcq->rq_list[i]);
404}
405
Omar Sandoval00e04392017-04-14 01:00:02 -0700406static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
407{
Jens Axboe28820642018-05-09 13:55:14 -0600408 struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
Omar Sandoval00e04392017-04-14 01:00:02 -0700409 struct kyber_hctx_data *khd;
410 int i;
411
412 khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node);
413 if (!khd)
414 return -ENOMEM;
415
Jianchao Wanga6088842018-05-30 10:47:40 -0600416 khd->kcqs = kmalloc_array_node(hctx->nr_ctx,
417 sizeof(struct kyber_ctx_queue),
418 GFP_KERNEL, hctx->numa_node);
419 if (!khd->kcqs)
420 goto err_khd;
421
422 for (i = 0; i < hctx->nr_ctx; i++)
423 kyber_ctx_queue_init(&khd->kcqs[i]);
424
425 for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
426 if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx,
427 ilog2(8), GFP_KERNEL, hctx->numa_node)) {
428 while (--i >= 0)
429 sbitmap_free(&khd->kcq_map[i]);
430 goto err_kcqs;
431 }
432 }
433
Omar Sandoval00e04392017-04-14 01:00:02 -0700434 spin_lock_init(&khd->lock);
435
436 for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
437 INIT_LIST_HEAD(&khd->rqs[i]);
Omar Sandovalfcf38cd2017-12-05 22:57:43 -0800438 init_waitqueue_func_entry(&khd->domain_wait[i],
439 kyber_domain_wake);
440 khd->domain_wait[i].private = hctx;
Ingo Molnar2055da92017-06-20 12:06:46 +0200441 INIT_LIST_HEAD(&khd->domain_wait[i].entry);
Omar Sandoval00e04392017-04-14 01:00:02 -0700442 atomic_set(&khd->wait_index[i], 0);
443 }
444
445 khd->cur_domain = 0;
446 khd->batching = 0;
447
448 hctx->sched_data = khd;
Jens Axboe28820642018-05-09 13:55:14 -0600449 sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags,
450 kqd->async_depth);
Omar Sandoval00e04392017-04-14 01:00:02 -0700451
452 return 0;
Jianchao Wanga6088842018-05-30 10:47:40 -0600453
454err_kcqs:
455 kfree(khd->kcqs);
456err_khd:
457 kfree(khd);
458 return -ENOMEM;
Omar Sandoval00e04392017-04-14 01:00:02 -0700459}
460
461static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
462{
Jianchao Wanga6088842018-05-30 10:47:40 -0600463 struct kyber_hctx_data *khd = hctx->sched_data;
464 int i;
465
466 for (i = 0; i < KYBER_NUM_DOMAINS; i++)
467 sbitmap_free(&khd->kcq_map[i]);
468 kfree(khd->kcqs);
Omar Sandoval00e04392017-04-14 01:00:02 -0700469 kfree(hctx->sched_data);
470}
471
472static int rq_get_domain_token(struct request *rq)
473{
474 return (long)rq->elv.priv[0];
475}
476
477static void rq_set_domain_token(struct request *rq, int token)
478{
479 rq->elv.priv[0] = (void *)(long)token;
480}
481
482static void rq_clear_domain_token(struct kyber_queue_data *kqd,
483 struct request *rq)
484{
485 unsigned int sched_domain;
486 int nr;
487
488 nr = rq_get_domain_token(rq);
489 if (nr != -1) {
Jianchao Wanga6088842018-05-30 10:47:40 -0600490 sched_domain = kyber_sched_domain(rq->cmd_flags);
Omar Sandoval00e04392017-04-14 01:00:02 -0700491 sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
492 rq->mq_ctx->cpu);
493 }
494}
495
Christoph Hellwig5bbf4e52017-06-16 18:15:26 +0200496static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
Omar Sandoval00e04392017-04-14 01:00:02 -0700497{
Omar Sandoval00e04392017-04-14 01:00:02 -0700498 /*
499 * We use the scheduler tags as per-hardware queue queueing tokens.
500 * Async requests can be limited at this stage.
501 */
Christoph Hellwig5bbf4e52017-06-16 18:15:26 +0200502 if (!op_is_sync(op)) {
503 struct kyber_queue_data *kqd = data->q->elevator->elevator_data;
Omar Sandoval00e04392017-04-14 01:00:02 -0700504
Christoph Hellwig5bbf4e52017-06-16 18:15:26 +0200505 data->shallow_depth = kqd->async_depth;
506 }
507}
508
Jianchao Wanga6088842018-05-30 10:47:40 -0600509static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
510{
511 struct kyber_hctx_data *khd = hctx->sched_data;
512 struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue);
513 struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw];
514 unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
515 struct list_head *rq_list = &kcq->rq_list[sched_domain];
516 bool merged;
517
518 spin_lock(&kcq->lock);
519 merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio);
520 spin_unlock(&kcq->lock);
521 blk_mq_put_ctx(ctx);
522
523 return merged;
524}
525
Christoph Hellwig5bbf4e52017-06-16 18:15:26 +0200526static void kyber_prepare_request(struct request *rq, struct bio *bio)
527{
528 rq_set_domain_token(rq, -1);
Omar Sandoval00e04392017-04-14 01:00:02 -0700529}
530
Jianchao Wanga6088842018-05-30 10:47:40 -0600531static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
532 struct list_head *rq_list, bool at_head)
533{
534 struct kyber_hctx_data *khd = hctx->sched_data;
535 struct request *rq, *next;
536
537 list_for_each_entry_safe(rq, next, rq_list, queuelist) {
538 unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
539 struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw];
540 struct list_head *head = &kcq->rq_list[sched_domain];
541
542 spin_lock(&kcq->lock);
543 if (at_head)
544 list_move(&rq->queuelist, head);
545 else
546 list_move_tail(&rq->queuelist, head);
547 sbitmap_set_bit(&khd->kcq_map[sched_domain],
548 rq->mq_ctx->index_hw);
549 blk_mq_sched_request_inserted(rq);
550 spin_unlock(&kcq->lock);
551 }
552}
553
Christoph Hellwig7b9e9362017-06-16 18:15:21 +0200554static void kyber_finish_request(struct request *rq)
Omar Sandoval00e04392017-04-14 01:00:02 -0700555{
Christoph Hellwig7b9e9362017-06-16 18:15:21 +0200556 struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
Omar Sandoval00e04392017-04-14 01:00:02 -0700557
558 rq_clear_domain_token(kqd, rq);
Omar Sandoval00e04392017-04-14 01:00:02 -0700559}
560
561static void kyber_completed_request(struct request *rq)
562{
563 struct request_queue *q = rq->q;
564 struct kyber_queue_data *kqd = q->elevator->elevator_data;
565 unsigned int sched_domain;
566 u64 now, latency, target;
567
568 /*
569 * Check if this request met our latency goal. If not, quickly gather
570 * some statistics and start throttling.
571 */
Jianchao Wanga6088842018-05-30 10:47:40 -0600572 sched_domain = kyber_sched_domain(rq->cmd_flags);
Omar Sandoval00e04392017-04-14 01:00:02 -0700573 switch (sched_domain) {
574 case KYBER_READ:
575 target = kqd->read_lat_nsec;
576 break;
577 case KYBER_SYNC_WRITE:
578 target = kqd->write_lat_nsec;
579 break;
580 default:
581 return;
582 }
583
584 /* If we are already monitoring latencies, don't check again. */
585 if (blk_stat_is_active(kqd->cb))
586 return;
587
Omar Sandoval544ccc8d2018-05-09 02:08:50 -0700588 now = ktime_get_ns();
589 if (now < rq->io_start_time_ns)
Omar Sandoval00e04392017-04-14 01:00:02 -0700590 return;
591
Omar Sandoval544ccc8d2018-05-09 02:08:50 -0700592 latency = now - rq->io_start_time_ns;
Omar Sandoval00e04392017-04-14 01:00:02 -0700593
594 if (latency > target)
595 blk_stat_activate_msecs(kqd->cb, 10);
596}
597
Jianchao Wanga6088842018-05-30 10:47:40 -0600598struct flush_kcq_data {
599 struct kyber_hctx_data *khd;
600 unsigned int sched_domain;
601 struct list_head *list;
602};
603
604static bool flush_busy_kcq(struct sbitmap *sb, unsigned int bitnr, void *data)
Omar Sandoval00e04392017-04-14 01:00:02 -0700605{
Jianchao Wanga6088842018-05-30 10:47:40 -0600606 struct flush_kcq_data *flush_data = data;
607 struct kyber_ctx_queue *kcq = &flush_data->khd->kcqs[bitnr];
Omar Sandoval00e04392017-04-14 01:00:02 -0700608
Jianchao Wanga6088842018-05-30 10:47:40 -0600609 spin_lock(&kcq->lock);
610 list_splice_tail_init(&kcq->rq_list[flush_data->sched_domain],
611 flush_data->list);
612 sbitmap_clear_bit(sb, bitnr);
613 spin_unlock(&kcq->lock);
Omar Sandoval00e04392017-04-14 01:00:02 -0700614
Jianchao Wanga6088842018-05-30 10:47:40 -0600615 return true;
616}
617
618static void kyber_flush_busy_kcqs(struct kyber_hctx_data *khd,
619 unsigned int sched_domain,
620 struct list_head *list)
621{
622 struct flush_kcq_data data = {
623 .khd = khd,
624 .sched_domain = sched_domain,
625 .list = list,
626 };
627
628 sbitmap_for_each_set(&khd->kcq_map[sched_domain],
629 flush_busy_kcq, &data);
Omar Sandoval00e04392017-04-14 01:00:02 -0700630}
631
Ingo Molnarac6424b2017-06-20 12:06:13 +0200632static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
Omar Sandoval00e04392017-04-14 01:00:02 -0700633 void *key)
634{
635 struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private);
636
Ingo Molnar2055da92017-06-20 12:06:46 +0200637 list_del_init(&wait->entry);
Omar Sandoval00e04392017-04-14 01:00:02 -0700638 blk_mq_run_hw_queue(hctx, true);
639 return 1;
640}
641
642static int kyber_get_domain_token(struct kyber_queue_data *kqd,
643 struct kyber_hctx_data *khd,
644 struct blk_mq_hw_ctx *hctx)
645{
646 unsigned int sched_domain = khd->cur_domain;
647 struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
Ingo Molnarac6424b2017-06-20 12:06:13 +0200648 wait_queue_entry_t *wait = &khd->domain_wait[sched_domain];
Omar Sandoval00e04392017-04-14 01:00:02 -0700649 struct sbq_wait_state *ws;
650 int nr;
651
652 nr = __sbitmap_queue_get(domain_tokens);
Omar Sandoval00e04392017-04-14 01:00:02 -0700653
654 /*
655 * If we failed to get a domain token, make sure the hardware queue is
656 * run when one becomes available. Note that this is serialized on
657 * khd->lock, but we still need to be careful about the waker.
658 */
Omar Sandovalfcf38cd2017-12-05 22:57:43 -0800659 if (nr < 0 && list_empty_careful(&wait->entry)) {
Omar Sandoval00e04392017-04-14 01:00:02 -0700660 ws = sbq_wait_ptr(domain_tokens,
661 &khd->wait_index[sched_domain]);
Omar Sandovalfcf38cd2017-12-05 22:57:43 -0800662 khd->domain_ws[sched_domain] = ws;
Omar Sandoval00e04392017-04-14 01:00:02 -0700663 add_wait_queue(&ws->wait, wait);
664
665 /*
666 * Try again in case a token was freed before we got on the wait
Omar Sandovalfcf38cd2017-12-05 22:57:43 -0800667 * queue.
Omar Sandoval00e04392017-04-14 01:00:02 -0700668 */
669 nr = __sbitmap_queue_get(domain_tokens);
670 }
Omar Sandovalfcf38cd2017-12-05 22:57:43 -0800671
672 /*
673 * If we got a token while we were on the wait queue, remove ourselves
674 * from the wait queue to ensure that all wake ups make forward
675 * progress. It's possible that the waker already deleted the entry
676 * between the !list_empty_careful() check and us grabbing the lock, but
677 * list_del_init() is okay with that.
678 */
679 if (nr >= 0 && !list_empty_careful(&wait->entry)) {
680 ws = khd->domain_ws[sched_domain];
681 spin_lock_irq(&ws->wait.lock);
682 list_del_init(&wait->entry);
683 spin_unlock_irq(&ws->wait.lock);
684 }
685
Omar Sandoval00e04392017-04-14 01:00:02 -0700686 return nr;
687}
688
689static struct request *
690kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
691 struct kyber_hctx_data *khd,
Jianchao Wanga6088842018-05-30 10:47:40 -0600692 struct blk_mq_hw_ctx *hctx)
Omar Sandoval00e04392017-04-14 01:00:02 -0700693{
694 struct list_head *rqs;
695 struct request *rq;
696 int nr;
697
698 rqs = &khd->rqs[khd->cur_domain];
Omar Sandoval00e04392017-04-14 01:00:02 -0700699
700 /*
Jianchao Wanga6088842018-05-30 10:47:40 -0600701 * If we already have a flushed request, then we just need to get a
702 * token for it. Otherwise, if there are pending requests in the kcqs,
703 * flush the kcqs, but only if we can get a token. If not, we should
704 * leave the requests in the kcqs so that they can be merged. Note that
705 * khd->lock serializes the flushes, so if we observed any bit set in
706 * the kcq_map, we will always get a request.
Omar Sandoval00e04392017-04-14 01:00:02 -0700707 */
Jianchao Wanga6088842018-05-30 10:47:40 -0600708 rq = list_first_entry_or_null(rqs, struct request, queuelist);
Omar Sandoval00e04392017-04-14 01:00:02 -0700709 if (rq) {
710 nr = kyber_get_domain_token(kqd, khd, hctx);
711 if (nr >= 0) {
712 khd->batching++;
713 rq_set_domain_token(rq, nr);
714 list_del_init(&rq->queuelist);
715 return rq;
716 }
Jianchao Wanga6088842018-05-30 10:47:40 -0600717 } else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) {
718 nr = kyber_get_domain_token(kqd, khd, hctx);
719 if (nr >= 0) {
720 kyber_flush_busy_kcqs(khd, khd->cur_domain, rqs);
721 rq = list_first_entry(rqs, struct request, queuelist);
722 khd->batching++;
723 rq_set_domain_token(rq, nr);
724 list_del_init(&rq->queuelist);
725 return rq;
726 }
Omar Sandoval00e04392017-04-14 01:00:02 -0700727 }
728
729 /* There were either no pending requests or no tokens. */
730 return NULL;
731}
732
733static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
734{
735 struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
736 struct kyber_hctx_data *khd = hctx->sched_data;
Omar Sandoval00e04392017-04-14 01:00:02 -0700737 struct request *rq;
738 int i;
739
740 spin_lock(&khd->lock);
741
742 /*
743 * First, if we are still entitled to batch, try to dispatch a request
744 * from the batch.
745 */
746 if (khd->batching < kyber_batch_size[khd->cur_domain]) {
Jianchao Wanga6088842018-05-30 10:47:40 -0600747 rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
Omar Sandoval00e04392017-04-14 01:00:02 -0700748 if (rq)
749 goto out;
750 }
751
752 /*
753 * Either,
754 * 1. We were no longer entitled to a batch.
755 * 2. The domain we were batching didn't have any requests.
756 * 3. The domain we were batching was out of tokens.
757 *
758 * Start another batch. Note that this wraps back around to the original
759 * domain if no other domains have requests or tokens.
760 */
761 khd->batching = 0;
762 for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
763 if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
764 khd->cur_domain = 0;
765 else
766 khd->cur_domain++;
767
Jianchao Wanga6088842018-05-30 10:47:40 -0600768 rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
Omar Sandoval00e04392017-04-14 01:00:02 -0700769 if (rq)
770 goto out;
771 }
772
773 rq = NULL;
774out:
775 spin_unlock(&khd->lock);
776 return rq;
777}
778
779static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
780{
781 struct kyber_hctx_data *khd = hctx->sched_data;
782 int i;
783
784 for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
Jianchao Wanga6088842018-05-30 10:47:40 -0600785 if (!list_empty_careful(&khd->rqs[i]) ||
786 sbitmap_any_bit_set(&khd->kcq_map[i]))
Omar Sandoval00e04392017-04-14 01:00:02 -0700787 return true;
788 }
Jianchao Wanga6088842018-05-30 10:47:40 -0600789
790 return false;
Omar Sandoval00e04392017-04-14 01:00:02 -0700791}
792
793#define KYBER_LAT_SHOW_STORE(op) \
794static ssize_t kyber_##op##_lat_show(struct elevator_queue *e, \
795 char *page) \
796{ \
797 struct kyber_queue_data *kqd = e->elevator_data; \
798 \
799 return sprintf(page, "%llu\n", kqd->op##_lat_nsec); \
800} \
801 \
802static ssize_t kyber_##op##_lat_store(struct elevator_queue *e, \
803 const char *page, size_t count) \
804{ \
805 struct kyber_queue_data *kqd = e->elevator_data; \
806 unsigned long long nsec; \
807 int ret; \
808 \
809 ret = kstrtoull(page, 10, &nsec); \
810 if (ret) \
811 return ret; \
812 \
813 kqd->op##_lat_nsec = nsec; \
814 \
815 return count; \
816}
817KYBER_LAT_SHOW_STORE(read);
818KYBER_LAT_SHOW_STORE(write);
819#undef KYBER_LAT_SHOW_STORE
820
821#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
822static struct elv_fs_entry kyber_sched_attrs[] = {
823 KYBER_LAT_ATTR(read),
824 KYBER_LAT_ATTR(write),
825 __ATTR_NULL
826};
827#undef KYBER_LAT_ATTR
828
Omar Sandoval16b738f2017-05-04 00:31:33 -0700829#ifdef CONFIG_BLK_DEBUG_FS
830#define KYBER_DEBUGFS_DOMAIN_ATTRS(domain, name) \
831static int kyber_##name##_tokens_show(void *data, struct seq_file *m) \
832{ \
833 struct request_queue *q = data; \
834 struct kyber_queue_data *kqd = q->elevator->elevator_data; \
835 \
836 sbitmap_queue_show(&kqd->domain_tokens[domain], m); \
837 return 0; \
838} \
839 \
840static void *kyber_##name##_rqs_start(struct seq_file *m, loff_t *pos) \
841 __acquires(&khd->lock) \
842{ \
843 struct blk_mq_hw_ctx *hctx = m->private; \
844 struct kyber_hctx_data *khd = hctx->sched_data; \
845 \
846 spin_lock(&khd->lock); \
847 return seq_list_start(&khd->rqs[domain], *pos); \
848} \
849 \
850static void *kyber_##name##_rqs_next(struct seq_file *m, void *v, \
851 loff_t *pos) \
852{ \
853 struct blk_mq_hw_ctx *hctx = m->private; \
854 struct kyber_hctx_data *khd = hctx->sched_data; \
855 \
856 return seq_list_next(v, &khd->rqs[domain], pos); \
857} \
858 \
859static void kyber_##name##_rqs_stop(struct seq_file *m, void *v) \
860 __releases(&khd->lock) \
861{ \
862 struct blk_mq_hw_ctx *hctx = m->private; \
863 struct kyber_hctx_data *khd = hctx->sched_data; \
864 \
865 spin_unlock(&khd->lock); \
866} \
867 \
868static const struct seq_operations kyber_##name##_rqs_seq_ops = { \
869 .start = kyber_##name##_rqs_start, \
870 .next = kyber_##name##_rqs_next, \
871 .stop = kyber_##name##_rqs_stop, \
872 .show = blk_mq_debugfs_rq_show, \
873}; \
874 \
875static int kyber_##name##_waiting_show(void *data, struct seq_file *m) \
876{ \
877 struct blk_mq_hw_ctx *hctx = data; \
878 struct kyber_hctx_data *khd = hctx->sched_data; \
Ingo Molnarac6424b2017-06-20 12:06:13 +0200879 wait_queue_entry_t *wait = &khd->domain_wait[domain]; \
Omar Sandoval16b738f2017-05-04 00:31:33 -0700880 \
Ingo Molnar2055da92017-06-20 12:06:46 +0200881 seq_printf(m, "%d\n", !list_empty_careful(&wait->entry)); \
Omar Sandoval16b738f2017-05-04 00:31:33 -0700882 return 0; \
883}
884KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
885KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_SYNC_WRITE, sync_write)
886KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
887#undef KYBER_DEBUGFS_DOMAIN_ATTRS
888
889static int kyber_async_depth_show(void *data, struct seq_file *m)
890{
891 struct request_queue *q = data;
892 struct kyber_queue_data *kqd = q->elevator->elevator_data;
893
894 seq_printf(m, "%u\n", kqd->async_depth);
895 return 0;
896}
897
898static int kyber_cur_domain_show(void *data, struct seq_file *m)
899{
900 struct blk_mq_hw_ctx *hctx = data;
901 struct kyber_hctx_data *khd = hctx->sched_data;
902
903 switch (khd->cur_domain) {
904 case KYBER_READ:
905 seq_puts(m, "READ\n");
906 break;
907 case KYBER_SYNC_WRITE:
908 seq_puts(m, "SYNC_WRITE\n");
909 break;
910 case KYBER_OTHER:
911 seq_puts(m, "OTHER\n");
912 break;
913 default:
914 seq_printf(m, "%u\n", khd->cur_domain);
915 break;
916 }
917 return 0;
918}
919
920static int kyber_batching_show(void *data, struct seq_file *m)
921{
922 struct blk_mq_hw_ctx *hctx = data;
923 struct kyber_hctx_data *khd = hctx->sched_data;
924
925 seq_printf(m, "%u\n", khd->batching);
926 return 0;
927}
928
929#define KYBER_QUEUE_DOMAIN_ATTRS(name) \
930 {#name "_tokens", 0400, kyber_##name##_tokens_show}
931static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
932 KYBER_QUEUE_DOMAIN_ATTRS(read),
933 KYBER_QUEUE_DOMAIN_ATTRS(sync_write),
934 KYBER_QUEUE_DOMAIN_ATTRS(other),
935 {"async_depth", 0400, kyber_async_depth_show},
936 {},
937};
938#undef KYBER_QUEUE_DOMAIN_ATTRS
939
940#define KYBER_HCTX_DOMAIN_ATTRS(name) \
941 {#name "_rqs", 0400, .seq_ops = &kyber_##name##_rqs_seq_ops}, \
942 {#name "_waiting", 0400, kyber_##name##_waiting_show}
943static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
944 KYBER_HCTX_DOMAIN_ATTRS(read),
945 KYBER_HCTX_DOMAIN_ATTRS(sync_write),
946 KYBER_HCTX_DOMAIN_ATTRS(other),
947 {"cur_domain", 0400, kyber_cur_domain_show},
948 {"batching", 0400, kyber_batching_show},
949 {},
950};
951#undef KYBER_HCTX_DOMAIN_ATTRS
952#endif
953
Omar Sandoval00e04392017-04-14 01:00:02 -0700954static struct elevator_type kyber_sched = {
955 .ops.mq = {
956 .init_sched = kyber_init_sched,
957 .exit_sched = kyber_exit_sched,
958 .init_hctx = kyber_init_hctx,
959 .exit_hctx = kyber_exit_hctx,
Christoph Hellwig5bbf4e52017-06-16 18:15:26 +0200960 .limit_depth = kyber_limit_depth,
Jianchao Wanga6088842018-05-30 10:47:40 -0600961 .bio_merge = kyber_bio_merge,
Christoph Hellwig5bbf4e52017-06-16 18:15:26 +0200962 .prepare_request = kyber_prepare_request,
Jianchao Wanga6088842018-05-30 10:47:40 -0600963 .insert_requests = kyber_insert_requests,
Christoph Hellwig7b9e9362017-06-16 18:15:21 +0200964 .finish_request = kyber_finish_request,
Ming Leiba989a02018-02-23 23:36:57 +0800965 .requeue_request = kyber_finish_request,
Omar Sandoval00e04392017-04-14 01:00:02 -0700966 .completed_request = kyber_completed_request,
967 .dispatch_request = kyber_dispatch_request,
968 .has_work = kyber_has_work,
969 },
970 .uses_mq = true,
Omar Sandoval16b738f2017-05-04 00:31:33 -0700971#ifdef CONFIG_BLK_DEBUG_FS
972 .queue_debugfs_attrs = kyber_queue_debugfs_attrs,
973 .hctx_debugfs_attrs = kyber_hctx_debugfs_attrs,
974#endif
Omar Sandoval00e04392017-04-14 01:00:02 -0700975 .elevator_attrs = kyber_sched_attrs,
976 .elevator_name = "kyber",
977 .elevator_owner = THIS_MODULE,
978};
979
980static int __init kyber_init(void)
981{
982 return elv_register(&kyber_sched);
983}
984
985static void __exit kyber_exit(void)
986{
987 elv_unregister(&kyber_sched);
988}
989
990module_init(kyber_init);
991module_exit(kyber_exit);
992
993MODULE_AUTHOR("Omar Sandoval");
994MODULE_LICENSE("GPL");
995MODULE_DESCRIPTION("Kyber I/O scheduler");