1/*
2 * Budget Fair Queueing (BFQ) disk scheduler.
3 *
4 * Based on ideas and code from CFQ:
5 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
6 *
7 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
8 * Paolo Valente <paolo.valente@unimore.it>
9 *
10 * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
11 *
12 * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ
13 * file.
14 *
15 * BFQ is a proportional-share storage-I/O scheduling algorithm based on
16 * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets,
17 * measured in number of sectors, to processes instead of time slices. The
18 * device is not granted to the in-service process for a given time slice,
19 * but until it has exhausted its assigned budget. This change from the time
20 * to the service domain allows BFQ to distribute the device throughput
21 * among processes as desired, without any distortion due to ZBR, workload
22 * fluctuations or other factors. BFQ uses an ad hoc internal scheduler,
23 * called B-WF2Q+, to schedule processes according to their budgets. More
24 * precisely, BFQ schedules queues associated with processes. Thanks to the
25 * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to
26 * I/O-bound processes issuing sequential requests (to boost the
27 * throughput), and yet guarantee a low latency to interactive and soft
28 * real-time applications.
29 *
30 * BFQ is described in [1], which also contains a reference to the
31 * initial, more theoretical paper on BFQ. The interested reader can find
32 * in the latter paper full details on the main algorithm, as well as
33 * formulas of the guarantees and formal proofs of all the properties.
34 * With respect to the version of BFQ presented in these papers, this
35 * implementation adds a few more heuristics, such as the one that
36 * guarantees a low latency to soft real-time applications, and a
37 * hierarchical extension based on H-WF2Q+.
38 *
39 * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
40 * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
41 * complexity derives from the one introduced with EEVDF in [3].
42 *
43 * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness
44 * with the BFQ Disk I/O Scheduler'',
45 * Proceedings of the 5th Annual International Systems and Storage
46 * Conference (SYSTOR '12), June 2012.
47 *
48 * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
49 *
50 * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
51 * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
52 * Oct 1997.
53 *
54 * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
55 *
56 * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
57 * First: A Flexible and Accurate Mechanism for Proportional Share
58 * Resource Allocation,'' technical report.
59 *
60 * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
61 */
62#include <linux/module.h>
63#include <linux/slab.h>
64#include <linux/blkdev.h>
65#include <linux/cgroup.h>
66#include <linux/elevator.h>
67#include <linux/jiffies.h>
68#include <linux/rbtree.h>
69#include <linux/ioprio.h>
70#include "bfq.h"
71#include "blk.h"
72
73/* Expiration time of sync (0) and async (1) requests, in jiffies. */
74static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
75
76/* Maximum backwards seek, in KiB. */
77static const int bfq_back_max = 16 * 1024;
78
79/* Penalty of a backwards seek, in number of sectors. */
80static const int bfq_back_penalty = 2;
81
82/* Idling period duration, in jiffies. */
83static int bfq_slice_idle = HZ / 125;
84
85/* Default maximum budget values, in sectors and number of requests. */
86static const int bfq_default_max_budget = 16 * 1024;
87static const int bfq_max_budget_async_rq = 4;
88
89/*
90 * Async to sync throughput distribution is controlled as follows:
91 * when an async request is served, the entity is charged the number
92 * of sectors of the request, multiplied by the factor below
93 */
94static const int bfq_async_charge_factor = 10;
95
96/* Default timeout values, in jiffies, approximating CFQ defaults. */
97static const int bfq_timeout_sync = HZ / 8;
98static int bfq_timeout_async = HZ / 25;
99
100struct kmem_cache *bfq_pool;
101
102/* Below this threshold (in ms), we consider thinktime immediate. */
103#define BFQ_MIN_TT 2
104
105/* hw_tag detection: parallel requests threshold and min samples needed. */
106#define BFQ_HW_QUEUE_THRESHOLD 4
107#define BFQ_HW_QUEUE_SAMPLES 32
108
109#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
110#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
111
112/* Min samples used for peak rate estimation (for autotuning). */
113#define BFQ_PEAK_RATE_SAMPLES 32
114
115/* Shift used for peak rate fixed precision calculations. */
116#define BFQ_RATE_SHIFT 16
117
118/*
119 * By default, BFQ computes the duration of the weight raising for
120 * interactive applications automatically, using the following formula:
121 * duration = (R / r) * T, where r is the peak rate of the device, and
122 * R and T are two reference parameters.
123 * In particular, R is the peak rate of the reference device (see below),
124 * and T is a reference time: given the systems that are likely to be
125 * installed on the reference device according to its speed class, T is
126 * about the maximum time needed, under BFQ and while reading two files in
127 * parallel, to load typical large applications on these systems.
128 * In practice, the slower/faster the device at hand is, the more/less
129 * time it takes to load applications with respect to the reference device.
130 * Accordingly, BFQ grants weight raising to interactive applications for
131 * a longer/shorter time.
132 *
133 * BFQ uses four different reference pairs (R, T), depending on:
134 * . whether the device is rotational or non-rotational;
135 * . whether the device is slow, such as old or portable HDDs, as well as
136 * SD cards, or fast, such as newer HDDs and SSDs.
137 *
138 * The device's speed class is dynamically (re)detected in
139 * bfq_update_peak_rate() every time the estimated peak rate is updated.
140 *
141 * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0]
142 * are the reference values for a slow/fast rotational device, whereas
143 * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for
144 * a slow/fast non-rotational device. Finally, device_speed_thresh are the
145 * thresholds used to switch between speed classes.
146 * Both the reference peak rates and the thresholds are measured in
147 * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
148 */
149static int R_slow[2] = {1536, 10752};
150static int R_fast[2] = {17415, 34791};
151/*
152 * To improve readability, a conversion function is used to initialize
153 * the following arrays. As a consequence, they can be initialized only
154 * inside a function.
155 */
156static int T_slow[2];
157static int T_fast[2];
158static int device_speed_thresh[2];
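/*
 * Illustrative sketch (not part of this file): one way the reference
 * times and speed thresholds declared above could be filled in at init
 * time, as anticipated by the comment on the (R, T) pairs. All concrete
 * values, the helper name and the "halfway between R_slow and R_fast"
 * threshold choice are assumptions made only for illustration.
 */
#if 0
static void bfq_init_reference_params_example(void)
{
	/* Reference times, converted from milliseconds to jiffies. */
	T_slow[0] = msecs_to_jiffies(3500);	/* slow rotational */
	T_slow[1] = msecs_to_jiffies(6000);	/* slow non-rotational, e.g. SD */
	T_fast[0] = msecs_to_jiffies(1500);	/* fast rotational */
	T_fast[1] = msecs_to_jiffies(1000);	/* fast non-rotational, e.g. SSD */

	/*
	 * Place each threshold between the slow and fast reference peak
	 * rates of the corresponding device class, so that the speed
	 * class can be (re)detected by comparing the estimated peak
	 * rate against the threshold.
	 */
	device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2;
	device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2;
}
#endif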
159
160#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
161 { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
162
163#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
164#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
165
166static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
167
168#include "bfq-ioc.c"
169#include "bfq-sched.c"
170#include "bfq-cgroup.c"
171
172#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
173 IOPRIO_CLASS_IDLE)
174#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
175 IOPRIO_CLASS_RT)
176
177#define bfq_sample_valid(samples) ((samples) > 80)
178
179/*
180 * The following macro groups conditions that need to be evaluated when
181 * checking if existing queues and groups form a symmetric scenario
182 * and therefore idling can be reduced or disabled for some of the
183 * queues. See the comment to the function bfq_bfqq_must_not_expire()
184 * for further details.
185 */
186#ifdef CONFIG_CGROUP_BFQIO
187#define symmetric_scenario (!bfqd->active_numerous_groups && \
188 !bfq_differentiated_weights(bfqd))
189#else
190#define symmetric_scenario (!bfq_differentiated_weights(bfqd))
191#endif
192
193/*
194 * We regard a request as SYNC if it is either a read or has the SYNC bit
195 * set (in which case it could also be a direct WRITE).
196 */
197static inline int bfq_bio_sync(struct bio *bio)
198{
199 if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
200 return 1;
201
202 return 0;
203}
204
205/*
206 * Schedule a run of the queue if there are requests pending and there is
207 * no request in the driver that will restart queueing.
208 */
209static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
210{
211 if (bfqd->queued != 0) {
212 bfq_log(bfqd, "schedule dispatch");
213 kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);
214 }
215}
216
217/*
218 * Lifted from AS - choose which of rq1 and rq2 is best served now.
219 * We choose the request that is closest to the head right now. Distance
220 * behind the head is penalized and only allowed to a certain extent.
221 */
222static struct request *bfq_choose_req(struct bfq_data *bfqd,
223 struct request *rq1,
224 struct request *rq2,
225 sector_t last)
226{
227 sector_t s1, s2, d1 = 0, d2 = 0;
228 unsigned long back_max;
229#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
230#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
231 unsigned wrap = 0; /* bit mask: requests behind the disk head? */
232
233 if (rq1 == NULL || rq1 == rq2)
234 return rq2;
235 if (rq2 == NULL)
236 return rq1;
237
238 if (rq_is_sync(rq1) && !rq_is_sync(rq2))
239 return rq1;
240 else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
241 return rq2;
242 if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
243 return rq1;
244 else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
245 return rq2;
246
247 s1 = blk_rq_pos(rq1);
248 s2 = blk_rq_pos(rq2);
249
250 /*
251 * By definition, 1KiB is 2 sectors.
252 */
253 back_max = bfqd->bfq_back_max * 2;
254
255 /*
256 * Strict one way elevator _except_ in the case where we allow
257 * short backward seeks which are biased as twice the cost of a
258 * similar forward seek.
259 */
260 if (s1 >= last)
261 d1 = s1 - last;
262 else if (s1 + back_max >= last)
263 d1 = (last - s1) * bfqd->bfq_back_penalty;
264 else
265 wrap |= BFQ_RQ1_WRAP;
266
267 if (s2 >= last)
268 d2 = s2 - last;
269 else if (s2 + back_max >= last)
270 d2 = (last - s2) * bfqd->bfq_back_penalty;
271 else
272 wrap |= BFQ_RQ2_WRAP;
273
274 /* Found required data */
275
276 /*
277 * By doing switch() on the bit mask "wrap" we avoid having to
278 * check two variables for all permutations: --> faster!
279 */
280 switch (wrap) {
281 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
282 if (d1 < d2)
283 return rq1;
284 else if (d2 < d1)
285 return rq2;
286 else {
287 if (s1 >= s2)
288 return rq1;
289 else
290 return rq2;
291 }
292
293 case BFQ_RQ2_WRAP:
294 return rq1;
295 case BFQ_RQ1_WRAP:
296 return rq2;
297 case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
298 default:
299 /*
300 * Since both rqs are wrapped,
301 * start with the one that's further behind head
302 * (--> only *one* back seek required),
303 * since back seek takes more time than forward.
304 */
305 if (s1 <= s2)
306 return rq1;
307 else
308 return rq2;
309 }
310}
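/*
 * Illustrative sketch (not part of this file): the distance metric that
 * bfq_choose_req() computes above, restated for a single request as a
 * stand-alone helper. The helper name and parameters are hypothetical;
 * the rule is the one coded above: forward seeks cost their distance,
 * short backward seeks cost their distance times bfq_back_penalty, and
 * anything farther behind the head than back_max "wraps".
 */
#if 0
static sector_t bfq_seek_cost_example(sector_t pos, sector_t head,
				      unsigned long back_max,
				      unsigned int penalty, bool *wraps)
{
	*wraps = false;
	if (pos >= head)			/* forward seek */
		return pos - head;
	if (pos + back_max >= head)		/* short backward seek */
		return (head - pos) * penalty;
	*wraps = true;				/* too far behind the head */
	return 0;
}
#endif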
311
312static struct bfq_queue *
313bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
314 sector_t sector, struct rb_node **ret_parent,
315 struct rb_node ***rb_link)
316{
317 struct rb_node **p, *parent;
318 struct bfq_queue *bfqq = NULL;
319
320 parent = NULL;
321 p = &root->rb_node;
322 while (*p) {
323 struct rb_node **n;
324
325 parent = *p;
326 bfqq = rb_entry(parent, struct bfq_queue, pos_node);
327
328 /*
329 * Sort strictly based on sector. Smallest to the left,
330 * largest to the right.
331 */
332 if (sector > blk_rq_pos(bfqq->next_rq))
333 n = &(*p)->rb_right;
334 else if (sector < blk_rq_pos(bfqq->next_rq))
335 n = &(*p)->rb_left;
336 else
337 break;
338 p = n;
339 bfqq = NULL;
340 }
341
342 *ret_parent = parent;
343 if (rb_link)
344 *rb_link = p;
345
346 bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
347 (long long unsigned)sector,
348 bfqq != NULL ? bfqq->pid : 0);
349
350 return bfqq;
351}
352
353static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
354{
355 struct rb_node **p, *parent;
356 struct bfq_queue *__bfqq;
357
358 if (bfqq->pos_root != NULL) {
359 rb_erase(&bfqq->pos_node, bfqq->pos_root);
360 bfqq->pos_root = NULL;
361 }
362
363 if (bfq_class_idle(bfqq))
364 return;
365 if (!bfqq->next_rq)
366 return;
367
368 bfqq->pos_root = &bfqd->rq_pos_tree;
369 __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
370 blk_rq_pos(bfqq->next_rq), &parent, &p);
371 if (__bfqq == NULL) {
372 rb_link_node(&bfqq->pos_node, parent, p);
373 rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
374 } else
375 bfqq->pos_root = NULL;
376}
377
378/*
379 * Tell whether there are active queues or groups with differentiated weights.
380 */
381static inline bool bfq_differentiated_weights(struct bfq_data *bfqd)
382{
383 /*
384 * For weights to differ, at least one of the trees must contain
385 * at least two nodes.
386 */
387 return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
388 (bfqd->queue_weights_tree.rb_node->rb_left ||
389 bfqd->queue_weights_tree.rb_node->rb_right)
390#ifdef CONFIG_CGROUP_BFQIO
391 ) ||
392 (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&
393 (bfqd->group_weights_tree.rb_node->rb_left ||
394 bfqd->group_weights_tree.rb_node->rb_right)
395#endif
396 );
397}
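/*
 * Illustrative sketch (not part of this file): the "at least two nodes"
 * test used above, isolated as a hypothetical helper. An rb-tree
 * contains two or more nodes exactly when its root exists and has at
 * least one child, which is all that differentiated weights require.
 */
#if 0
static bool bfq_tree_has_two_nodes_example(struct rb_root *root)
{
	return !RB_EMPTY_ROOT(root) &&
	       (root->rb_node->rb_left || root->rb_node->rb_right);
}
#endif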
398
399/*
400 * If the weight-counter tree passed as input contains no counter for
401 * the weight of the input entity, then add that counter; otherwise just
402 * increment the existing counter.
403 *
404 * Note that weight-counter trees contain few nodes in mostly symmetric
405 * scenarios. For example, if all queues have the same weight, then the
406 * weight-counter tree for the queues may contain at most one node.
407 * This holds even if low_latency is on, because weight-raised queues
408 * are not inserted in the tree.
409 * In most scenarios, the rate at which nodes are created/destroyed
410 * should be low too.
411 */
412static void bfq_weights_tree_add(struct bfq_data *bfqd,
413 struct bfq_entity *entity,
414 struct rb_root *root)
415{
416 struct rb_node **new = &(root->rb_node), *parent = NULL;
417
418 /*
419 * Do not insert if the entity is already associated with a
420 * counter, which happens if:
421 * 1) the entity is associated with a queue,
422 * 2) a request arrival has caused the queue to become both
423 * non-weight-raised, and hence change its weight, and
424 * backlogged; in this respect, each of the two events
425 * causes an invocation of this function,
426 * 3) this is the invocation of this function caused by the
427 * second event. This second invocation is actually useless,
428 * and we handle this fact by exiting immediately. More
429 * efficient or clearer solutions might possibly be adopted.
430 */
431 if (entity->weight_counter)
432 return;
433
434 while (*new) {
435 struct bfq_weight_counter *__counter = container_of(*new,
436 struct bfq_weight_counter,
437 weights_node);
438 parent = *new;
439
440 if (entity->weight == __counter->weight) {
441 entity->weight_counter = __counter;
442 goto inc_counter;
443 }
444 if (entity->weight < __counter->weight)
445 new = &((*new)->rb_left);
446 else
447 new = &((*new)->rb_right);
448 }
449
450 entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
451 GFP_ATOMIC);
452 entity->weight_counter->weight = entity->weight;
453 rb_link_node(&entity->weight_counter->weights_node, parent, new);
454 rb_insert_color(&entity->weight_counter->weights_node, root);
455
456inc_counter:
457 entity->weight_counter->num_active++;
458}
459
460/*
461 * Decrement the weight counter associated with the entity, and, if the
462 * counter reaches 0, remove the counter from the tree.
463 * See the comments to the function bfq_weights_tree_add() for considerations
464 * about overhead.
465 */
466static void bfq_weights_tree_remove(struct bfq_data *bfqd,
467 struct bfq_entity *entity,
468 struct rb_root *root)
469{
470 if (!entity->weight_counter)
471 return;
472
473 BUG_ON(RB_EMPTY_ROOT(root));
474 BUG_ON(entity->weight_counter->weight != entity->weight);
475
476 BUG_ON(!entity->weight_counter->num_active);
477 entity->weight_counter->num_active--;
478 if (entity->weight_counter->num_active > 0)
479 goto reset_entity_pointer;
480
481 rb_erase(&entity->weight_counter->weights_node, root);
482 kfree(entity->weight_counter);
483
484reset_entity_pointer:
485 entity->weight_counter = NULL;
486}
487
488static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
489 struct bfq_queue *bfqq,
490 struct request *last)
491{
492 struct rb_node *rbnext = rb_next(&last->rb_node);
493 struct rb_node *rbprev = rb_prev(&last->rb_node);
494 struct request *next = NULL, *prev = NULL;
495
496 BUG_ON(RB_EMPTY_NODE(&last->rb_node));
497
498 if (rbprev != NULL)
499 prev = rb_entry_rq(rbprev);
500
501 if (rbnext != NULL)
502 next = rb_entry_rq(rbnext);
503 else {
504 rbnext = rb_first(&bfqq->sort_list);
505 if (rbnext && rbnext != &last->rb_node)
506 next = rb_entry_rq(rbnext);
507 }
508
509 return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
510}
511
512/* see the definition of bfq_async_charge_factor for details */
513static inline unsigned long bfq_serv_to_charge(struct request *rq,
514 struct bfq_queue *bfqq)
515{
516 return blk_rq_sectors(rq) *
517 (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) *
518 bfq_async_charge_factor));
519}
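/*
 * Illustrative sketch (not part of this file): the charging rule coded
 * compactly above, unfolded for readability with hypothetical names.
 * Only async requests of non-weight-raised queues pay the extra
 * bfq_async_charge_factor: e.g., an 8-sector async request of such a
 * queue is charged 8 * (1 + 10) = 88 sectors, while a sync request is
 * always charged its actual size.
 */
#if 0
static unsigned long bfq_charge_example(unsigned long sectors,
					bool sync, bool weight_raised)
{
	if (!sync && !weight_raised)
		return sectors * (1 + bfq_async_charge_factor);
	return sectors;
}
#endif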
520
521/**
522 * bfq_updated_next_req - update the queue after a new next_rq selection.
523 * @bfqd: the device data the queue belongs to.
524 * @bfqq: the queue to update.
525 *
526 * If the first request of a queue changes we make sure that the queue
527 * has enough budget to serve at least its first request (if the
528 * request has grown). We do this because, if the queue does not have
529 * enough budget for its first request, it has to go through two dispatch
530 * rounds to actually get it dispatched.
531 */
532static void bfq_updated_next_req(struct bfq_data *bfqd,
533 struct bfq_queue *bfqq)
534{
535 struct bfq_entity *entity = &bfqq->entity;
536 struct bfq_service_tree *st = bfq_entity_service_tree(entity);
537 struct request *next_rq = bfqq->next_rq;
538 unsigned long new_budget;
539
540 if (next_rq == NULL)
541 return;
542
543 if (bfqq == bfqd->in_service_queue)
544 /*
545 * In order not to break guarantees, budgets cannot be
546 * changed after an entity has been selected.
547 */
548 return;
549
550 BUG_ON(entity->tree != &st->active);
551 BUG_ON(entity == entity->sched_data->in_service_entity);
552
553 new_budget = max_t(unsigned long, bfqq->max_budget,
554 bfq_serv_to_charge(next_rq, bfqq));
555 if (entity->budget != new_budget) {
556 entity->budget = new_budget;
557 bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
558 new_budget);
559 bfq_activate_bfqq(bfqd, bfqq);
560 }
561}
562
563static inline unsigned int bfq_wr_duration(struct bfq_data *bfqd)
564{
565 u64 dur;
566
567 if (bfqd->bfq_wr_max_time > 0)
568 return bfqd->bfq_wr_max_time;
569
570 dur = bfqd->RT_prod;
571 do_div(dur, bfqd->peak_rate);
572
573 return dur;
574}
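/*
 * Illustrative sketch (not part of this file): a numeric instance of the
 * duration = (R / r) * T rule implemented just above, with R * T cached
 * as in bfqd->RT_prod. The helper is hypothetical; it only shows that a
 * device whose peak rate r is half the reference rate R gets twice the
 * reference weight-raising time T.
 */
#if 0
static u64 bfq_wr_duration_example(u64 RT_prod, u32 peak_rate)
{
	u64 dur = RT_prod;

	do_div(dur, peak_rate);	/* if peak_rate == R / 2, dur == 2 * T */
	return dur;
}
#endif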
575
576static inline unsigned
577bfq_bfqq_cooperations(struct bfq_queue *bfqq)
578{
579 return bfqq->bic ? bfqq->bic->cooperations : 0;
580}
581
582static inline void
583bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
584{
585 if (bic->saved_idle_window)
586 bfq_mark_bfqq_idle_window(bfqq);
587 else
588 bfq_clear_bfqq_idle_window(bfqq);
589 if (bic->saved_IO_bound)
590 bfq_mark_bfqq_IO_bound(bfqq);
591 else
592 bfq_clear_bfqq_IO_bound(bfqq);
593 /* Assuming that the flag in_large_burst is already correctly set */
594 if (bic->wr_time_left && bfqq->bfqd->low_latency &&
595 !bfq_bfqq_in_large_burst(bfqq) &&
596 bic->cooperations < bfqq->bfqd->bfq_coop_thresh) {
597 /*
598 * Start a weight raising period with the duration given by
599 * the raising_time_left snapshot.
600 */
601 if (bfq_bfqq_busy(bfqq))
602 bfqq->bfqd->wr_busy_queues++;
603 bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff;
604 bfqq->wr_cur_max_time = bic->wr_time_left;
605 bfqq->last_wr_start_finish = jiffies;
606 bfqq->entity.ioprio_changed = 1;
607 }
608 /*
609 * Clear wr_time_left to prevent bfq_bfqq_save_state() from
610 * getting confused about the queue's need of a weight-raising
611 * period.
612 */
613 bic->wr_time_left = 0;
614}
615
616/* Must be called with the queue_lock held. */
617static int bfqq_process_refs(struct bfq_queue *bfqq)
618{
619 int process_refs, io_refs;
620
621 io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
622 process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
623 BUG_ON(process_refs < 0);
624 return process_refs;
625}
626
627/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */
628static inline void bfq_reset_burst_list(struct bfq_data *bfqd,
629 struct bfq_queue *bfqq)
630{
631 struct bfq_queue *item;
632 struct hlist_node *pos, *n;
633
634 hlist_for_each_entry_safe(item, pos, n,
635 &bfqd->burst_list, burst_list_node)
636 hlist_del_init(&item->burst_list_node);
637 hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
638 bfqd->burst_size = 1;
639}
640
641/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */
642static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
643{
644 /* Increment burst size to take into account also bfqq */
645 bfqd->burst_size++;
646
647 if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) {
648 struct bfq_queue *pos, *bfqq_item;
649 struct hlist_node *p, *n;
650
651 /*
652 * Enough queues have been activated shortly after each
653 * other to consider this burst as large.
654 */
655 bfqd->large_burst = true;
656
657 /*
658 * We can now mark all queues in the burst list as
659 * belonging to a large burst.
660 */
661 hlist_for_each_entry(bfqq_item, n, &bfqd->burst_list,
662 burst_list_node)
663 bfq_mark_bfqq_in_large_burst(bfqq_item);
664 bfq_mark_bfqq_in_large_burst(bfqq);
665
666 /*
667 * From now on, and until the current burst finishes, any
668 * new queue being activated shortly after the last queue
669 * was inserted in the burst can be immediately marked as
670 * belonging to a large burst. So the burst list is not
671 * needed any more. Remove it.
672 */
673 hlist_for_each_entry_safe(pos, p, n, &bfqd->burst_list,
674 burst_list_node)
675 hlist_del_init(&pos->burst_list_node);
676 } else /* burst not yet large: add bfqq to the burst list */
677 hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
678}
679
680/*
681 * If many queues happen to become active shortly after each other, then,
682 * to help the processes associated with these queues get their job done as
683 * soon as possible, it is usually better to not grant either weight-raising
684 * or device idling to these queues. In this comment we describe, firstly,
685 * the reasons why this fact holds, and, secondly, the next function, which
686 * implements the main steps needed to properly mark these queues so that
687 * they can then be treated in a different way.
688 *
689 * As for the terminology, we say that a queue becomes active, i.e.,
690 * switches from idle to backlogged, either when it is created (as a
691 * consequence of the arrival of an I/O request), or, if already existing,
692 * when a new request for the queue arrives while the queue is idle.
693 * Bursts of activations, i.e., activations of different queues occurring
694 * shortly after each other, are typically caused by services or applications
695 * that spawn or reactivate many parallel threads/processes. Examples are
696 * systemd during boot or git grep.
697 *
698 * These services or applications benefit mostly from a high throughput:
699 * the quicker the requests of the activated queues are cumulatively served,
700 * the sooner the target job of these queues gets completed. As a consequence,
701 * weight-raising any of these queues, which also implies idling the device
702 * for it, is almost always counterproductive: in most cases it just lowers
703 * throughput.
704 *
705 * On the other hand, a burst of activations may also be caused by the start
706 * of an application that does not consist of a lot of parallel I/O-bound
707 * threads. In fact, with a complex application, the burst may be just a
708 * consequence of the fact that several processes need to be executed to
709 * start up the application. To start an application as quickly as possible,
710 * the best thing to do is to privilege the I/O related to the application
711 * with respect to all other I/O. Therefore, the best strategy to start as
712 * quickly as possible an application that causes a burst of activations is
713 * to weight-raise all the queues activated during the burst. This is the
714 * exact opposite of the best strategy for the other type of bursts.
715 *
716 * In the end, to take the best action for each of the two cases, the two
717 * types of bursts need to be distinguished. Fortunately, this seems
718 * relatively easy to do, by looking at the sizes of the bursts. In
719 * particular, we found a threshold such that bursts with a larger size
720 * than that threshold are apparently caused only by services or commands
721 * such as systemd or git grep. For brevity, hereafter we call just 'large'
722 * these bursts. BFQ *does not* weight-raise queues whose activations occur
723 * in a large burst. In addition, for each of these queues BFQ performs or
724 * does not perform idling depending on which choice boosts the throughput
725 * most. The exact choice depends on the device and request pattern at
726 * hand.
727 *
728 * Turning back to the next function, it implements all the steps needed
729 * to detect the occurrence of a large burst and to properly mark all the
730 * queues belonging to it (so that they can then be treated in a different
731 * way). This goal is achieved by maintaining a special "burst list" that
732 * holds, temporarily, the queues that belong to the burst in progress. The
733 * list is then used to mark these queues as belonging to a large burst if
734 * the burst does become large. The main steps are the following.
735 *
736 * . when the very first queue is activated, the queue is inserted into the
737 * list (as it could be the first queue in a possible burst)
738 *
739 * . if the current burst has not yet become large, and a queue Q that does
740 * not yet belong to the burst is activated shortly after the last time
741 * at which a new queue entered the burst list, then the function appends
742 * Q to the burst list
743 *
744 * . if, as a consequence of the previous step, the burst size reaches
745 * the large-burst threshold, then
746 *
747 * . all the queues in the burst list are marked as belonging to a
748 * large burst
749 *
750 * . the burst list is deleted; in fact, the burst list already served
751 * its purpose (keeping temporarily track of the queues in a burst,
752 * so as to be able to mark them as belonging to a large burst in the
753 * previous sub-step), and now is not needed any more
754 *
755 * . the device enters a large-burst mode
756 *
757 * . if a queue Q that does not belong to the burst is activated while
758 * the device is in large-burst mode and shortly after the last time
759 * at which a queue either entered the burst list or was marked as
760 * belonging to the current large burst, then Q is immediately marked
761 * as belonging to a large burst.
762 *
763 * . if a queue Q that does not belong to the burst is activated a while
764 * after, i.e., not shortly after, the last time at which a queue
765 * either entered the burst list or was marked as belonging to the
766 * current large burst, then the current burst is deemed finished and:
767 *
768 * . the large-burst mode is reset if set
769 *
770 * . the burst list is emptied
771 *
772 * . Q is inserted in the burst list, as Q may be the first queue
773 * in a possible new burst (then the burst list contains just Q
774 * after this step).
775 */
776static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq,
777 bool idle_for_long_time)
778{
779 /*
780 * If bfqq happened to be activated in a burst, but has been idle
781 * for at least as long as an interactive queue, then we assume
782 * that, in the overall I/O initiated in the burst, the I/O
783 * associated to bfqq is finished. So bfqq does not need to be
784 * treated as a queue belonging to a burst anymore. Accordingly,
785 * we reset bfqq's in_large_burst flag if set, and remove bfqq
786 * from the burst list if it's there. We do not decrement instead
787 * burst_size, because the fact that bfqq does not need to belong
788 * to the burst list any more does not invalidate the fact that
789 * bfqq may have been activated during the current burst.
790 */
791 if (idle_for_long_time) {
792 hlist_del_init(&bfqq->burst_list_node);
793 bfq_clear_bfqq_in_large_burst(bfqq);
794 }
795
796 /*
797 * If bfqq is already in the burst list or is part of a large
798 * burst, then there is nothing else to do.
799 */
800 if (!hlist_unhashed(&bfqq->burst_list_node) ||
801 bfq_bfqq_in_large_burst(bfqq))
802 return;
803
804 /*
805 * If bfqq's activation happens late enough, then the current
806 * burst is finished, and related data structures must be reset.
807 *
808 * In this respect, consider the special case where bfqq is the very
809 * first queue being activated. In this case, last_ins_in_burst is
810 * not yet significant when we get here. But it is easy to verify
811 * that, whether or not the following condition is true, bfqq will
812 * end up being inserted into the burst list. In particular the
813 * list will happen to contain only bfqq. And this is exactly what
814 * has to happen, as bfqq may be the first queue in a possible
815 * burst.
816 */
817 if (time_is_before_jiffies(bfqd->last_ins_in_burst +
818 bfqd->bfq_burst_interval)) {
819 bfqd->large_burst = false;
820 bfq_reset_burst_list(bfqd, bfqq);
821 return;
822 }
823
824 /*
825 * If we get here, then bfqq is being activated shortly after the
826 * last queue. So, if the current burst is also large, we can mark
827 * bfqq as belonging to this large burst immediately.
828 */
829 if (bfqd->large_burst) {
830 bfq_mark_bfqq_in_large_burst(bfqq);
831 return;
832 }
833
834 /*
835 * If we get here, then a large-burst state has not yet been
836 * reached, but bfqq is being activated shortly after the last
837 * queue. Then we add bfqq to the burst.
838 */
839 bfq_add_to_burst(bfqd, bfqq);
840}
841
842static void bfq_add_request(struct request *rq)
843{
844 struct bfq_queue *bfqq = RQ_BFQQ(rq);
845 struct bfq_entity *entity = &bfqq->entity;
846 struct bfq_data *bfqd = bfqq->bfqd;
847 struct request *next_rq, *prev;
848 unsigned long old_wr_coeff = bfqq->wr_coeff;
849 bool interactive = false;
850
851 bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
852 bfqq->queued[rq_is_sync(rq)]++;
853 bfqd->queued++;
854
855 elv_rb_add(&bfqq->sort_list, rq);
856
857 /*
858 * Check if this request is a better next-serve candidate.
859 */
860 prev = bfqq->next_rq;
861 next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
862 BUG_ON(next_rq == NULL);
863 bfqq->next_rq = next_rq;
864
865 /*
866 * Adjust priority tree position, if next_rq changes.
867 */
868 if (prev != bfqq->next_rq)
869 bfq_rq_pos_tree_add(bfqd, bfqq);
870
871 if (!bfq_bfqq_busy(bfqq)) {
872 bool soft_rt, coop_or_in_burst,
873 idle_for_long_time = time_is_before_jiffies(
874 bfqq->budget_timeout +
875 bfqd->bfq_wr_min_idle_time);
876
877 if (bfq_bfqq_sync(bfqq)) {
878 bool already_in_burst =
879 !hlist_unhashed(&bfqq->burst_list_node) ||
880 bfq_bfqq_in_large_burst(bfqq);
881 bfq_handle_burst(bfqd, bfqq, idle_for_long_time);
882 /*
883 * If bfqq was not already in the current burst,
884 * then, at this point, bfqq either has been
885 * added to the current burst or has caused the
886 * current burst to terminate. In particular, in
887 * the second case, bfqq has become the first
888 * queue in a possible new burst.
889 * In both cases last_ins_in_burst needs to be
890 * moved forward.
891 */
892 if (!already_in_burst)
893 bfqd->last_ins_in_burst = jiffies;
894 }
895
896 coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) ||
897 bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh;
898 soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
899 !coop_or_in_burst &&
900 time_is_before_jiffies(bfqq->soft_rt_next_start);
901 interactive = !coop_or_in_burst && idle_for_long_time;
902 entity->budget = max_t(unsigned long, bfqq->max_budget,
903 bfq_serv_to_charge(next_rq, bfqq));
904
905 if (!bfq_bfqq_IO_bound(bfqq)) {
906 if (time_before(jiffies,
907 RQ_BIC(rq)->ttime.last_end_request +
908 bfqd->bfq_slice_idle)) {
909 bfqq->requests_within_timer++;
910 if (bfqq->requests_within_timer >=
911 bfqd->bfq_requests_within_timer)
912 bfq_mark_bfqq_IO_bound(bfqq);
913 } else
914 bfqq->requests_within_timer = 0;
915 }
916
917 if (!bfqd->low_latency)
918 goto add_bfqq_busy;
919
920 if (bfq_bfqq_just_split(bfqq))
921 goto set_ioprio_changed;
922
923 /*
924 * If the queue:
925 * - is not being boosted,
926 * - has been idle for enough time,
927 * - is not a sync queue or is linked to a bfq_io_cq (it is
928 * shared "by nature" or it is not shared and its requests
929 * have not been redirected to a shared queue),
930 * then start a weight-raising period.
931 */
932 if (old_wr_coeff == 1 && (interactive || soft_rt) &&
933 (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) {
934 bfqq->wr_coeff = bfqd->bfq_wr_coeff;
935 if (interactive)
936 bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
937 else
938 bfqq->wr_cur_max_time =
939 bfqd->bfq_wr_rt_max_time;
940 bfq_log_bfqq(bfqd, bfqq,
941 "wrais starting at %lu, rais_max_time %u",
942 jiffies,
943 jiffies_to_msecs(bfqq->wr_cur_max_time));
944 } else if (old_wr_coeff > 1) {
945 if (interactive)
946 bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
947 else if (coop_or_in_burst ||
948 (bfqq->wr_cur_max_time ==
949 bfqd->bfq_wr_rt_max_time &&
950 !soft_rt)) {
951 bfqq->wr_coeff = 1;
952 bfq_log_bfqq(bfqd, bfqq,
953 "wrais ending at %lu, rais_max_time %u",
954 jiffies,
955 jiffies_to_msecs(bfqq->
956 wr_cur_max_time));
957 } else if (time_before(
958 bfqq->last_wr_start_finish +
959 bfqq->wr_cur_max_time,
960 jiffies +
961 bfqd->bfq_wr_rt_max_time) &&
962 soft_rt) {
963 /*
964 *
965 * The remaining weight-raising time is lower
966 * than bfqd->bfq_wr_rt_max_time, which means
967 * that the application is enjoying weight
968 * raising either because deemed soft-rt in
969 * the near past, or because deemed interactive
970 * long ago.
971 * In both cases, resetting now the current
972 * remaining weight-raising time for the
973 * application to the weight-raising duration
974 * for soft rt applications would not cause any
975 * latency increase for the application (as the
976 * new duration would be higher than the
977 * remaining time).
978 *
979 * In addition, the application is now meeting
980 * the requirements for being deemed soft rt.
981 * In the end we can correctly and safely
982 * (re)charge the weight-raising duration for
983 * the application with the weight-raising
984 * duration for soft rt applications.
985 *
986 * In particular, doing this recharge now, i.e.,
987 * before the weight-raising period for the
988 * application finishes, reduces the probability
989 * of the following negative scenario:
990 * 1) the weight of a soft rt application is
991 * raised at startup (as for any newly
992 * created application),
993 * 2) since the application is not interactive,
994 * at a certain time weight-raising is
995 * stopped for the application,
996 * 3) at that time the application happens to
997 * still have pending requests, and hence
998 * is destined to not have a chance to be
999 * deemed soft rt before these requests are
1000 * completed (see the comments to the
1001 * function bfq_bfqq_softrt_next_start()
1002 * for details on soft rt detection),
1003 * 4) these pending requests experience a high
1004 * latency because the application is not
1005 * weight-raised while they are pending.
1006 */
1007 bfqq->last_wr_start_finish = jiffies;
1008 bfqq->wr_cur_max_time =
1009 bfqd->bfq_wr_rt_max_time;
1010 }
1011 }
1012set_ioprio_changed:
1013 if (old_wr_coeff != bfqq->wr_coeff)
1014 entity->ioprio_changed = 1;
1015add_bfqq_busy:
1016 bfqq->last_idle_bklogged = jiffies;
1017 bfqq->service_from_backlogged = 0;
1018 bfq_clear_bfqq_softrt_update(bfqq);
1019 bfq_add_bfqq_busy(bfqd, bfqq);
1020 } else {
1021 if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&
1022 time_is_before_jiffies(
1023 bfqq->last_wr_start_finish +
1024 bfqd->bfq_wr_min_inter_arr_async)) {
1025 bfqq->wr_coeff = bfqd->bfq_wr_coeff;
1026 bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
1027
1028 bfqd->wr_busy_queues++;
1029 entity->ioprio_changed = 1;
1030 bfq_log_bfqq(bfqd, bfqq,
1031 "non-idle wrais starting at %lu, rais_max_time %u",
1032 jiffies,
1033 jiffies_to_msecs(bfqq->wr_cur_max_time));
1034 }
1035 if (prev != bfqq->next_rq)
1036 bfq_updated_next_req(bfqd, bfqq);
1037 }
1038
1039 if (bfqd->low_latency &&
1040 (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))
1041 bfqq->last_wr_start_finish = jiffies;
1042}
1043
1044static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
1045 struct bio *bio)
1046{
1047 struct task_struct *tsk = current;
1048 struct bfq_io_cq *bic;
1049 struct bfq_queue *bfqq;
1050
1051 bic = bfq_bic_lookup(bfqd, tsk->io_context);
1052 if (bic == NULL)
1053 return NULL;
1054
1055 bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
1056 if (bfqq != NULL) {
1057 sector_t sector = bio->bi_sector + bio_sectors(bio);
1058
1059 return elv_rb_find(&bfqq->sort_list, sector);
1060 }
1061
1062 return NULL;
1063}
1064
1065static void bfq_activate_request(struct request_queue *q, struct request *rq)
1066{
1067 struct bfq_data *bfqd = q->elevator->elevator_data;
1068
1069 bfqd->rq_in_driver++;
1070 bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
1071 bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
1072 (long long unsigned)bfqd->last_position);
1073}
1074
1075static inline void bfq_deactivate_request(struct request_queue *q,
1076 struct request *rq)
1077{
1078 struct bfq_data *bfqd = q->elevator->elevator_data;
1079
1080 BUG_ON(bfqd->rq_in_driver == 0);
1081 bfqd->rq_in_driver--;
1082}
1083
1084static void bfq_remove_request(struct request *rq)
1085{
1086 struct bfq_queue *bfqq = RQ_BFQQ(rq);
1087 struct bfq_data *bfqd = bfqq->bfqd;
1088 const int sync = rq_is_sync(rq);
1089
1090 if (bfqq->next_rq == rq) {
1091 bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
1092 bfq_updated_next_req(bfqd, bfqq);
1093 }
1094
1095 if (rq->queuelist.prev != &rq->queuelist)
1096 list_del_init(&rq->queuelist);
1097 BUG_ON(bfqq->queued[sync] == 0);
1098 bfqq->queued[sync]--;
1099 bfqd->queued--;
1100 elv_rb_del(&bfqq->sort_list, rq);
1101
1102 if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
1103 if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue)
1104 bfq_del_bfqq_busy(bfqd, bfqq, 1);
1105 /*
1106 * Remove queue from request-position tree as it is empty.
1107 */
1108 if (bfqq->pos_root != NULL) {
1109 rb_erase(&bfqq->pos_node, bfqq->pos_root);
1110 bfqq->pos_root = NULL;
1111 }
1112 }
1113
1114 if (rq->cmd_flags & REQ_META) {
1115 BUG_ON(bfqq->meta_pending == 0);
1116 bfqq->meta_pending--;
1117 }
1118}
1119
1120static int bfq_merge(struct request_queue *q, struct request **req,
1121 struct bio *bio)
1122{
1123 struct bfq_data *bfqd = q->elevator->elevator_data;
1124 struct request *__rq;
1125
1126 __rq = bfq_find_rq_fmerge(bfqd, bio);
1127 if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
1128 *req = __rq;
1129 return ELEVATOR_FRONT_MERGE;
1130 }
1131
1132 return ELEVATOR_NO_MERGE;
1133}
1134
1135static void bfq_merged_request(struct request_queue *q, struct request *req,
1136 int type)
1137{
1138 if (type == ELEVATOR_FRONT_MERGE &&
1139 rb_prev(&req->rb_node) &&
1140 blk_rq_pos(req) <
1141 blk_rq_pos(container_of(rb_prev(&req->rb_node),
1142 struct request, rb_node))) {
1143 struct bfq_queue *bfqq = RQ_BFQQ(req);
1144 struct bfq_data *bfqd = bfqq->bfqd;
1145 struct request *prev, *next_rq;
1146
1147 /* Reposition request in its sort_list */
1148 elv_rb_del(&bfqq->sort_list, req);
1149 elv_rb_add(&bfqq->sort_list, req);
1150 /* Choose next request to be served for bfqq */
1151 prev = bfqq->next_rq;
1152 next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
1153 bfqd->last_position);
1154 BUG_ON(next_rq == NULL);
1155 bfqq->next_rq = next_rq;
1156 /*
1157 * If next_rq changes, update both the queue's budget to
1158 * fit the new request and the queue's position in its
1159 * rq_pos_tree.
1160 */
1161 if (prev != bfqq->next_rq) {
1162 bfq_updated_next_req(bfqd, bfqq);
1163 bfq_rq_pos_tree_add(bfqd, bfqq);
1164 }
1165 }
1166}
1167
1168static void bfq_merged_requests(struct request_queue *q, struct request *rq,
1169 struct request *next)
1170{
1171 struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);
1172
1173 /*
1174 * If next and rq belong to the same bfq_queue and next is older
1175 * than rq, then reposition rq in the fifo (by substituting next
1176 * with rq). Otherwise, if next and rq belong to different
1177 * bfq_queues, never reposition rq: in fact, we would have to
1178 * reposition it with respect to next's position in its own fifo,
1179 * which would most certainly be too expensive with respect to
1180 * the benefits.
1181 */
1182 if (bfqq == next_bfqq &&
1183 !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
1184 time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
1185 list_del_init(&rq->queuelist);
1186 list_replace_init(&next->queuelist, &rq->queuelist);
1187 rq_set_fifo_time(rq, rq_fifo_time(next));
1188 }
1189
1190 if (bfqq->next_rq == next)
1191 bfqq->next_rq = rq;
1192
1193 bfq_remove_request(next);
1194}
1195
1196/* Must be called with bfqq != NULL */
1197static inline void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
1198{
1199 BUG_ON(bfqq == NULL);
1200 if (bfq_bfqq_busy(bfqq))
1201 bfqq->bfqd->wr_busy_queues--;
1202 bfqq->wr_coeff = 1;
1203 bfqq->wr_cur_max_time = 0;
1204 /* Trigger a weight change on the next activation of the queue */
1205 bfqq->entity.ioprio_changed = 1;
1206}
1207
1208static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
1209 struct bfq_group *bfqg)
1210{
1211 int i, j;
1212
1213 for (i = 0; i < 2; i++)
1214 for (j = 0; j < IOPRIO_BE_NR; j++)
1215 if (bfqg->async_bfqq[i][j] != NULL)
1216 bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);
1217 if (bfqg->async_idle_bfqq != NULL)
1218 bfq_bfqq_end_wr(bfqg->async_idle_bfqq);
1219}
1220
1221static void bfq_end_wr(struct bfq_data *bfqd)
1222{
1223 struct bfq_queue *bfqq;
1224
1225 spin_lock_irq(bfqd->queue->queue_lock);
1226
1227 list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
1228 bfq_bfqq_end_wr(bfqq);
1229 list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
1230 bfq_bfqq_end_wr(bfqq);
1231 bfq_end_wr_async(bfqd);
1232
1233 spin_unlock_irq(bfqd->queue->queue_lock);
1234}
1235
1236static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)
1237{
1238 if (request)
1239 return blk_rq_pos(io_struct);
1240 else
1241 return ((struct bio *)io_struct)->bi_sector;
1242}
1243
1244static inline sector_t bfq_dist_from(sector_t pos1,
1245 sector_t pos2)
1246{
1247 if (pos1 >= pos2)
1248 return pos1 - pos2;
1249 else
1250 return pos2 - pos1;
1251}
1252
1253static inline int bfq_rq_close_to_sector(void *io_struct, bool request,
1254 sector_t sector)
1255{
1256 return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <=
1257 BFQQ_SEEK_THR;
1258}
1259
1260static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)
1261{
1262 struct rb_root *root = &bfqd->rq_pos_tree;
1263 struct rb_node *parent, *node;
1264 struct bfq_queue *__bfqq;
1265
1266 if (RB_EMPTY_ROOT(root))
1267 return NULL;
1268
1269 /*
1270 * First, if we find a request starting at the end of the last
1271 * request, choose it.
1272 */
1273 __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
1274 if (__bfqq != NULL)
1275 return __bfqq;
1276
1277 /*
1278 * If the exact sector wasn't found, the parent of the NULL leaf
1279 * will contain the closest sector (rq_pos_tree sorted by
1280 * next_request position).
1281 */
1282 __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
1283 if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
1284 return __bfqq;
1285
1286 if (blk_rq_pos(__bfqq->next_rq) < sector)
1287 node = rb_next(&__bfqq->pos_node);
1288 else
1289 node = rb_prev(&__bfqq->pos_node);
1290 if (node == NULL)
1291 return NULL;
1292
1293 __bfqq = rb_entry(node, struct bfq_queue, pos_node);
1294 if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
1295 return __bfqq;
1296
1297 return NULL;
1298}
1299
1300/*
1301 * bfqd - obvious
1302 * cur_bfqq - passed in so that we don't decide that the current queue
1303 * is closely cooperating with itself
1304 * sector - used as a reference point to search for a close queue
1305 */
1306static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
1307 struct bfq_queue *cur_bfqq,
1308 sector_t sector)
1309{
1310 struct bfq_queue *bfqq;
1311
1312 if (bfq_class_idle(cur_bfqq))
1313 return NULL;
1314 if (!bfq_bfqq_sync(cur_bfqq))
1315 return NULL;
1316 if (BFQQ_SEEKY(cur_bfqq))
1317 return NULL;
1318
1319 /* If device has only one backlogged bfq_queue, don't search. */
1320 if (bfqd->busy_queues == 1)
1321 return NULL;
1322
1323 /*
1324 * We should notice if some of the queues are cooperating, e.g.
1325 * working closely on the same area of the disk. In that case,
1326 * we can group them together and don't waste time idling.
1327 */
1328 bfqq = bfqq_close(bfqd, sector);
1329 if (bfqq == NULL || bfqq == cur_bfqq)
1330 return NULL;
1331
1332 /*
1333 * Do not merge queues from different bfq_groups.
1334 */
1335 if (bfqq->entity.parent != cur_bfqq->entity.parent)
1336 return NULL;
1337
1338 /*
1339 * It only makes sense to merge sync queues.
1340 */
1341 if (!bfq_bfqq_sync(bfqq))
1342 return NULL;
1343 if (BFQQ_SEEKY(bfqq))
1344 return NULL;
1345
1346 /*
1347 * Do not merge queues of different priority classes.
1348 */
1349 if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
1350 return NULL;
1351
1352 return bfqq;
1353}
1354
1355static struct bfq_queue *
1356bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
1357{
1358 int process_refs, new_process_refs;
1359 struct bfq_queue *__bfqq;
1360
1361 /*
1362 * If there are no process references on the new_bfqq, then it is
1363 * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
1364 * may have dropped their last reference (not just their last process
1365 * reference).
1366 */
1367 if (!bfqq_process_refs(new_bfqq))
1368 return NULL;
1369
1370 /* Avoid a circular list and skip interim queue merges. */
1371 while ((__bfqq = new_bfqq->new_bfqq)) {
1372 if (__bfqq == bfqq)
1373 return NULL;
1374 new_bfqq = __bfqq;
1375 }
1376
1377 process_refs = bfqq_process_refs(bfqq);
1378 new_process_refs = bfqq_process_refs(new_bfqq);
1379 /*
1380 * If the process for the bfqq has gone away, there is no
1381 * sense in merging the queues.
1382 */
1383 if (process_refs == 0 || new_process_refs == 0)
1384 return NULL;
1385
1386 bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
1387 new_bfqq->pid);
1388
1389 /*
1390 * Merging is just a redirection: the requests of the process
1391 * owning one of the two queues are redirected to the other queue.
1392 * The latter queue, in its turn, is set as shared if this is the
1393 * first time that the requests of some process are redirected to
1394 * it.
1395 *
1396 * We redirect bfqq to new_bfqq and not the opposite, because we
1397 * are in the context of the process owning bfqq, hence we have
1398 * the io_cq of this process. So we can immediately configure this
1399 * io_cq to redirect the requests of the process to new_bfqq.
1400 *
1401 * NOTE, even if new_bfqq coincides with the in-service queue, the
1402 * io_cq of new_bfqq is not available, because, if the in-service
1403 * queue is shared, bfqd->in_service_bic may not point to the
1404 * io_cq of the in-service queue.
1405 * Redirecting the requests of the process owning bfqq to the
1406 * currently in-service queue is in any case the best option, as
1407 * we feed the in-service queue with new requests close to the
1408 * last request served and, by doing so, hopefully increase the
1409 * throughput.
1410 */
1411 bfqq->new_bfqq = new_bfqq;
1412 atomic_add(process_refs, &new_bfqq->ref);
1413 return new_bfqq;
1414}
1415
1416/*
1417 * Attempt to schedule a merge of bfqq with the currently in-service queue
1418 * or with a close queue among the scheduled queues.
1419 * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue
1420 * structure otherwise.
1421 *
1422 * The OOM queue is not allowed to participate in cooperation: in fact, since
1423 * the requests temporarily redirected to the OOM queue could be redirected
1424 * again to dedicated queues at any time, the state needed to correctly
1425 * handle merging with the OOM queue would be quite complex and expensive
1426 * to maintain. Besides, in a condition as critical as an out-of-memory one,
1427 * the benefits of queue merging may be of little relevance, or even negligible.
1428 */
1429static struct bfq_queue *
1430bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
1431 void *io_struct, bool request)
1432{
1433 struct bfq_queue *in_service_bfqq, *new_bfqq;
1434
1435 if (bfqq->new_bfqq)
1436 return bfqq->new_bfqq;
1437
1438 if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))
1439 return NULL;
1440
1441 in_service_bfqq = bfqd->in_service_queue;
1442
1443 if (in_service_bfqq == NULL || in_service_bfqq == bfqq ||
1444 !bfqd->in_service_bic ||
1445 unlikely(in_service_bfqq == &bfqd->oom_bfqq))
1446 goto check_scheduled;
1447
1448 if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq))
1449 goto check_scheduled;
1450
1451 if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq))
1452 goto check_scheduled;
1453
1454 if (in_service_bfqq->entity.parent != bfqq->entity.parent)
1455 goto check_scheduled;
1456
1457 if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
1458 bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) {
1459 new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
1460 if (new_bfqq != NULL)
1461 return new_bfqq; /* Merge with in-service queue */
1462 }
1463
1464 /*
1465 * Check whether there is a cooperator among currently scheduled
1466 * queues. The only thing we need is that the bio/request is not
1467 * NULL, as we need it to establish whether a cooperator exists.
1468 */
1469check_scheduled:
1470 new_bfqq = bfq_close_cooperator(bfqd, bfqq,
1471 bfq_io_struct_pos(io_struct, request));
1472 if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq))
1473 return bfq_setup_merge(bfqq, new_bfqq);
1474
1475 return NULL;
1476}
1477
1478static inline void
1479bfq_bfqq_save_state(struct bfq_queue *bfqq)
1480{
1481 /*
1482 * If bfqq->bic == NULL, the queue is already shared or its requests
1483 * have already been redirected to a shared queue; both idle window
1484 * and weight raising state have already been saved. Do nothing.
1485 */
1486 if (bfqq->bic == NULL)
1487 return;
1488 if (bfqq->bic->wr_time_left)
1489 /*
1490 * This is the queue of a just-started process, and would
1491 * deserve weight raising: we set wr_time_left to the full
1492 * weight-raising duration to trigger weight-raising when
1493 * and if the queue is split and the first request of the
1494 * queue is enqueued.
1495 */
1496 bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd);
1497 else if (bfqq->wr_coeff > 1) {
1498 unsigned long wr_duration =
1499 jiffies - bfqq->last_wr_start_finish;
1500 /*
1501 * It may happen that a queue's weight raising period lasts
1502 * longer than its wr_cur_max_time, as weight raising is
1503 * handled only when a request is enqueued or dispatched (it
1504 * does not use any timer). If the weight raising period is
1505 * about to end, don't save it.
1506 */
1507 if (bfqq->wr_cur_max_time <= wr_duration)
1508 bfqq->bic->wr_time_left = 0;
1509 else
1510 bfqq->bic->wr_time_left =
1511 bfqq->wr_cur_max_time - wr_duration;
1512 /*
1513 * The bfq_queue is becoming shared or the requests of the
1514 * process owning the queue are being redirected to a shared
1515 * queue. Stop the weight raising period of the queue, as in
1516 * both cases it should not be owned by an interactive or
1517 * soft real-time application.
1518 */
1519 bfq_bfqq_end_wr(bfqq);
1520 } else
1521 bfqq->bic->wr_time_left = 0;
1522 bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
1523 bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
1524 bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
1525 bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
1526 bfqq->bic->cooperations++;
1527 bfqq->bic->failed_cooperations = 0;
1528}
1529
1530static inline void
1531bfq_get_bic_reference(struct bfq_queue *bfqq)
1532{
1533 /*
1534 * If bfqq->bic has a non-NULL value, the bic to which it belongs
1535 * is about to begin using a shared bfq_queue.
1536 */
1537 if (bfqq->bic)
1538 atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
1539}
1540
1541static void
1542bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
1543 struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
1544{
1545 bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
1546 (long unsigned)new_bfqq->pid);
1547 /* Save weight raising and idle window of the merged queues */
1548 bfq_bfqq_save_state(bfqq);
1549 bfq_bfqq_save_state(new_bfqq);
1550 if (bfq_bfqq_IO_bound(bfqq))
1551 bfq_mark_bfqq_IO_bound(new_bfqq);
1552 bfq_clear_bfqq_IO_bound(bfqq);
1553 /*
1554 * Grab a reference to the bic, to prevent it from being destroyed
1555 * before being possibly touched by a bfq_split_bfqq().
1556 */
1557 bfq_get_bic_reference(bfqq);
1558 bfq_get_bic_reference(new_bfqq);
1559 /*
1560 * Merge queues (that is, let bic redirect its requests to new_bfqq)
1561 */
1562 bic_set_bfqq(bic, new_bfqq, 1);
1563 bfq_mark_bfqq_coop(new_bfqq);
1564 /*
1565 * new_bfqq now belongs to at least two bics (it is a shared queue):
1566 * set new_bfqq->bic to NULL. bfqq either:
1567 * - does not belong to any bic any more, and hence bfqq->bic must
1568 * be set to NULL, or
1569 * - is a queue whose owning bics have already been redirected to a
1570 * different queue, hence the queue is destined to not belong to
1571 * any bic soon and bfqq->bic is already NULL (therefore the next
1572 * assignment causes no harm).
1573 */
1574 new_bfqq->bic = NULL;
1575 bfqq->bic = NULL;
1576 bfq_put_queue(bfqq);
1577}
1578
1579static inline void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq)
1580{
1581 struct bfq_io_cq *bic = bfqq->bic;
1582 struct bfq_data *bfqd = bfqq->bfqd;
1583
1584 if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) {
1585 bic->failed_cooperations++;
1586 if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations)
1587 bic->cooperations = 0;
1588 }
1589}
1590
1591static int bfq_allow_merge(struct request_queue *q, struct request *rq,
1592 struct bio *bio)
1593{
1594 struct bfq_data *bfqd = q->elevator->elevator_data;
1595 struct bfq_io_cq *bic;
1596 struct bfq_queue *bfqq, *new_bfqq;
1597
1598 /*
1599 * Disallow merge of a sync bio into an async request.
1600 */
1601 if (bfq_bio_sync(bio) && !rq_is_sync(rq))
1602 return 0;
1603
1604 /*
1605 * Lookup the bfqq that this bio will be queued with. Allow
1606 * merge only if rq is queued there.
1607 * Queue lock is held here.
1608 */
1609 bic = bfq_bic_lookup(bfqd, current->io_context);
1610 if (bic == NULL)
1611 return 0;
1612
1613 bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
Mauro Andreolini0b335882015-06-07 02:34:22 +02001614 /*
1615 * We take advantage of this function to perform an early merge
1616 * of the queues of possible cooperating processes.
1617 */
1618 if (bfqq != NULL) {
1619 new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
1620 if (new_bfqq != NULL) {
1621 bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
1622 /*
1623 * If we get here, the bio will be queued in the
1624 * shared queue, i.e., new_bfqq, so use new_bfqq
1625 * to decide whether bio and rq can be merged.
1626 */
1627 bfqq = new_bfqq;
1628 } else
1629 bfq_bfqq_increase_failed_cooperations(bfqq);
1630 }
1631
Paolo Valente70f28712013-05-09 19:10:02 +02001632 return bfqq == RQ_BFQQ(rq);
1633}
1634
1635static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
1636 struct bfq_queue *bfqq)
1637{
1638 if (bfqq != NULL) {
1639 bfq_mark_bfqq_must_alloc(bfqq);
1640 bfq_mark_bfqq_budget_new(bfqq);
1641 bfq_clear_bfqq_fifo_expire(bfqq);
1642
1643 bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
1644
1645 bfq_log_bfqq(bfqd, bfqq,
1646 "set_in_service_queue, cur-budget = %lu",
1647 bfqq->entity.budget);
1648 }
1649
1650 bfqd->in_service_queue = bfqq;
1651}
1652
1653/*
1654 * Get and set a new queue for service.
1655 */
Mauro Andreolini0b335882015-06-07 02:34:22 +02001656static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
Paolo Valente70f28712013-05-09 19:10:02 +02001657{
Mauro Andreolini0b335882015-06-07 02:34:22 +02001658 struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
Paolo Valente70f28712013-05-09 19:10:02 +02001659
1660 __bfq_set_in_service_queue(bfqd, bfqq);
1661 return bfqq;
1662}
1663
Paolo Valente70f28712013-05-09 19:10:02 +02001664/*
1665 * If enough samples have been computed, return the current max budget
1666 * stored in bfqd, which is dynamically updated according to the
1667 * estimated disk peak rate; otherwise return the default max budget
1668 */
1669static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
1670{
1671 if (bfqd->budgets_assigned < 194)
1672 return bfq_default_max_budget;
1673 else
1674 return bfqd->bfq_max_budget;
1675}
1676
1677/*
1678 * Return min budget, which is a fraction of the current or default
1679 * max budget (trying with 1/32)
1680 */
1681static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
1682{
1683 if (bfqd->budgets_assigned < 194)
1684 return bfq_default_max_budget / 32;
1685 else
1686 return bfqd->bfq_max_budget / 32;
1687}
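/*
 * About the 194 threshold used here and in bfq_max_budget(): the counter
 * bfqd->budgets_assigned is updated in __bfq_set_in_service_queue() as
 * (x * 7 + 256) / 8, so it converges to 256 and, starting from 0, crosses
 * 194 after roughly a dozen budget assignments. Only from that point on is
 * the autotuned bfqd->bfq_max_budget considered reliable enough to replace
 * bfq_default_max_budget.
 */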
1688
1689static void bfq_arm_slice_timer(struct bfq_data *bfqd)
1690{
1691 struct bfq_queue *bfqq = bfqd->in_service_queue;
1692 struct bfq_io_cq *bic;
1693 unsigned long sl;
1694
1695 BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
1696
1697 /* Processes have exited, don't wait. */
1698 bic = bfqd->in_service_bic;
1699 if (bic == NULL || atomic_read(&bic->icq.ioc->nr_tasks) == 0)
1700 return;
1701
1702 bfq_mark_bfqq_wait_request(bfqq);
1703
1704 /*
1705 * We don't want to idle for seeks, but we do want to allow
1706 * fair distribution of slice time for a process doing back-to-back
1707	 * seeks. So allow a little bit of time for it to submit a new rq.
1708 *
1709 * To prevent processes with (partly) seeky workloads from
1710 * being too ill-treated, grant them a small fraction of the
1711 * assigned budget before reducing the waiting time to
1712 * BFQ_MIN_TT. This happened to help reduce latency.
1713 */
1714 sl = bfqd->bfq_slice_idle;
1715 /*
1716 * Unless the queue is being weight-raised or the scenario is
1717 * asymmetric, grant only minimum idle time if the queue either
1718 * has been seeky for long enough or has already proved to be
1719 * constantly seeky.
1720 */
1721 if (bfq_sample_valid(bfqq->seek_samples) &&
1722 ((BFQQ_SEEKY(bfqq) && bfqq->entity.service >
1723 bfq_max_budget(bfqq->bfqd) / 8) ||
1724 bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 &&
1725 symmetric_scenario)
1726 sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
1727 else if (bfqq->wr_coeff > 1)
1728 sl = sl * 3;
1729 bfqd->last_idling_start = ktime_get();
1730 mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
1731 bfq_log(bfqd, "arm idle: %u/%u ms",
1732 jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
1733}
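/*
 * Rough example of the resulting idle times, assuming the default
 * bfq_slice_idle of 8 ms and BFQ_MIN_TT of 2 ms (actual values may differ):
 * a weight-raised queue waits up to 3 * 8 = 24 ms for its next request, a
 * non-raised queue normally waits 8 ms, and a non-raised queue that has
 * proved to be (constantly) seeky, in a symmetric scenario, waits only
 * about 2 ms before being expired.
 */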
1734
1735/*
1736 * Set the maximum time for the in-service queue to consume its
1737 * budget. This prevents seeky processes from lowering the disk
1738 * throughput (always guaranteed with a time slice scheme as in CFQ).
1739 */
1740static void bfq_set_budget_timeout(struct bfq_data *bfqd)
1741{
1742 struct bfq_queue *bfqq = bfqd->in_service_queue;
1743 unsigned int timeout_coeff;
1744 if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time)
1745 timeout_coeff = 1;
1746 else
1747 timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
1748
1749 bfqd->last_budget_start = ktime_get();
1750
1751 bfq_clear_bfqq_budget_new(bfqq);
1752 bfqq->budget_timeout = jiffies +
1753 bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
1754
1755 bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
1756 jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
1757 timeout_coeff));
1758}
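/*
 * Example of the scaling above (illustrative values): with a sync budget
 * timeout of, say, 125 ms, a queue whose weight has been raised by a factor
 * of, say, 10 for interactivity gets a budget timeout of about 1250 ms,
 * whereas a queue in the soft real-time raising state
 * (wr_cur_max_time == bfq_wr_rt_max_time) keeps the plain 125 ms.
 */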
1759
1760/*
1761 * Move request from internal lists to the request queue dispatch list.
1762 */
1763static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
1764{
1765 struct bfq_data *bfqd = q->elevator->elevator_data;
1766 struct bfq_queue *bfqq = RQ_BFQQ(rq);
1767
1768 /*
1769 * For consistency, the next instruction should have been executed
1770 * after removing the request from the queue and dispatching it.
1771 * We execute instead this instruction before bfq_remove_request()
1772 * (and hence introduce a temporary inconsistency), for efficiency.
1773	 * In fact, in a forced_dispatch, this prevents two counters related
1774	 * to bfqq->dispatched from being uselessly decremented if bfqq
1775	 * is not in service, and then incremented again after
1776	 * incrementing bfqq->dispatched.
1777 */
1778 bfqq->dispatched++;
1779 bfq_remove_request(rq);
1780 elv_dispatch_sort(q, rq);
1781
1782 if (bfq_bfqq_sync(bfqq))
1783 bfqd->sync_flight++;
1784}
1785
1786/*
1787 * Return expired entry, or NULL to just start from scratch in rbtree.
1788 */
1789static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
1790{
1791 struct request *rq = NULL;
1792
1793 if (bfq_bfqq_fifo_expire(bfqq))
1794 return NULL;
1795
1796 bfq_mark_bfqq_fifo_expire(bfqq);
1797
1798 if (list_empty(&bfqq->fifo))
1799 return NULL;
1800
1801 rq = rq_entry_fifo(bfqq->fifo.next);
1802
1803 if (time_before(jiffies, rq_fifo_time(rq)))
1804 return NULL;
1805
1806 return rq;
1807}
1808
Paolo Valente70f28712013-05-09 19:10:02 +02001809static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
1810{
1811 struct bfq_entity *entity = &bfqq->entity;
1812 return entity->budget - entity->service;
1813}
1814
1815static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1816{
1817 BUG_ON(bfqq != bfqd->in_service_queue);
1818
1819 __bfq_bfqd_reset_in_service(bfqd);
1820
1821 /*
1822 * If this bfqq is shared between multiple processes, check
1823 * to make sure that those processes are still issuing I/Os
1824 * within the mean seek distance. If not, it may be time to
1825 * break the queues apart again.
1826 */
1827 if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
1828 bfq_mark_bfqq_split_coop(bfqq);
1829
1830 if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
1831 /*
1832 * Overloading budget_timeout field to store the time
1833 * at which the queue remains with no backlog; used by
1834 * the weight-raising mechanism.
1835 */
1836 bfqq->budget_timeout = jiffies;
1837 bfq_del_bfqq_busy(bfqd, bfqq, 1);
1838 } else {
1839 bfq_activate_bfqq(bfqd, bfqq);
1840 /*
1841 * Resort priority tree of potential close cooperators.
1842 */
1843 bfq_rq_pos_tree_add(bfqd, bfqq);
1844 }
1845}
1846
1847/**
1848 * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
1849 * @bfqd: device data.
1850 * @bfqq: queue to update.
1851 * @reason: reason for expiration.
1852 *
1853 * Handle the feedback on @bfqq budget. See the body for detailed
1854 * comments.
1855 */
1856static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
1857 struct bfq_queue *bfqq,
1858 enum bfqq_expiration reason)
1859{
1860 struct request *next_rq;
1861 unsigned long budget, min_budget;
1862
1863 budget = bfqq->max_budget;
1864 min_budget = bfq_min_budget(bfqd);
1865
1866 BUG_ON(bfqq != bfqd->in_service_queue);
1867
1868 bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
1869 bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
1870 bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
1871 budget, bfq_min_budget(bfqd));
1872 bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
1873 bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
1874
1875 if (bfq_bfqq_sync(bfqq)) {
1876 switch (reason) {
1877 /*
1878 * Caveat: in all the following cases we trade latency
1879 * for throughput.
1880 */
1881 case BFQ_BFQQ_TOO_IDLE:
1882 /*
1883 * This is the only case where we may reduce
1884 * the budget: if there is no request of the
1885 * process still waiting for completion, then
1886 * we assume (tentatively) that the timer has
1887 * expired because the batch of requests of
1888 * the process could have been served with a
1889 * smaller budget. Hence, betting that
1890		 * the process will behave in the same way when it
1891 * becomes backlogged again, we reduce its
1892 * next budget. As long as we guess right,
1893 * this budget cut reduces the latency
1894 * experienced by the process.
1895 *
1896 * However, if there are still outstanding
1897 * requests, then the process may have not yet
1898 * issued its next request just because it is
1899 * still waiting for the completion of some of
1900 * the still outstanding ones. So in this
1901 * subcase we do not reduce its budget, on the
1902 * contrary we increase it to possibly boost
1903 * the throughput, as discussed in the
1904 * comments to the BUDGET_TIMEOUT case.
1905 */
1906 if (bfqq->dispatched > 0) /* still outstanding reqs */
1907 budget = min(budget * 2, bfqd->bfq_max_budget);
1908 else {
1909 if (budget > 5 * min_budget)
1910 budget -= 4 * min_budget;
1911 else
1912 budget = min_budget;
1913 }
1914 break;
1915 case BFQ_BFQQ_BUDGET_TIMEOUT:
1916 /*
1917 * We double the budget here because: 1) it
1918 * gives the chance to boost the throughput if
1919 * this is not a seeky process (which may have
1920 * bumped into this timeout because of, e.g.,
1921 * ZBR), 2) together with charge_full_budget
1922 * it helps give seeky processes higher
1923 * timestamps, and hence be served less
1924 * frequently.
1925 */
1926 budget = min(budget * 2, bfqd->bfq_max_budget);
1927 break;
1928 case BFQ_BFQQ_BUDGET_EXHAUSTED:
1929 /*
1930 * The process still has backlog, and did not
1931 * let either the budget timeout or the disk
1932 * idling timeout expire. Hence it is not
1933 * seeky, has a short thinktime and may be
1934 * happy with a higher budget too. So
1935 * definitely increase the budget of this good
1936 * candidate to boost the disk throughput.
1937 */
1938 budget = min(budget * 4, bfqd->bfq_max_budget);
1939 break;
1940 case BFQ_BFQQ_NO_MORE_REQUESTS:
1941 /*
1942 * Leave the budget unchanged.
1943 */
1944 default:
1945 return;
1946 }
1947 } else /* async queue */
1948		/* async queues always get the maximum possible budget
1949 * (their ability to dispatch is limited by
1950 * @bfqd->bfq_max_budget_async_rq).
1951 */
1952 budget = bfqd->bfq_max_budget;
1953
1954 bfqq->max_budget = budget;
1955
1956 if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
1957 bfqq->max_budget > bfqd->bfq_max_budget)
1958 bfqq->max_budget = bfqd->bfq_max_budget;
1959
1960 /*
1961 * Make sure that we have enough budget for the next request.
1962 * Since the finish time of the bfqq must be kept in sync with
1963 * the budget, be sure to call __bfq_bfqq_expire() after the
1964 * update.
1965 */
1966 next_rq = bfqq->next_rq;
1967 if (next_rq != NULL)
1968 bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
1969 bfq_serv_to_charge(next_rq, bfqq));
1970 else
1971 bfqq->entity.budget = bfqq->max_budget;
1972
1973 bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
1974 next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
1975 bfqq->entity.budget);
1976}
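/*
 * Worked example of the feedback above (illustrative numbers): assume
 * max_budget has settled at 16384 sectors, so min_budget is 512. A sync
 * queue expired for BFQ_BFQQ_TOO_IDLE with no request in flight and a
 * current max_budget of 4096 gets 4096 - 4 * 512 = 2048; the same queue
 * expired for BFQ_BFQQ_BUDGET_TIMEOUT would get min(2 * 4096, 16384) = 8192,
 * and for BFQ_BFQQ_BUDGET_EXHAUSTED min(4 * 4096, 16384) = 16384.
 */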
1977
1978static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
1979{
1980 unsigned long max_budget;
1981
1982 /*
1983 * The max_budget calculated when autotuning is equal to the
1984	 * number of sectors transferred in timeout_sync at the
1985 * estimated peak rate.
1986 */
1987 max_budget = (unsigned long)(peak_rate * 1000 *
1988 timeout >> BFQ_RATE_SHIFT);
1989
1990 return max_budget;
1991}
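/*
 * Numerical example for the formula above (example figures only):
 * peak_rate stores sectors/usec in fixed point (<< BFQ_RATE_SHIFT). A
 * device sustaining about 100 MiB/s transfers roughly 204800 sectors/s,
 * i.e., about 0.2048 sectors/usec; with a sync timeout of 125 ms the
 * autotuned max_budget then becomes roughly 0.2048 * 125000 = 25600
 * sectors, i.e., 12.5 MiB per budget.
 */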
1992
1993/*
1994 * In addition to updating the peak rate, checks whether the process
1995 * is "slow", and returns 1 if so. This slow flag is used, in addition
1996 * to the budget timeout, to reduce the amount of service provided to
1997 * seeky processes, and hence reduce their chances to lower the
1998 * throughput. See the code for more details.
1999 */
2000static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2001 int compensate, enum bfqq_expiration reason)
2002{
2003 u64 bw, usecs, expected, timeout;
2004 ktime_t delta;
2005 int update = 0;
2006
2007 if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
2008 return 0;
2009
2010 if (compensate)
2011 delta = bfqd->last_idling_start;
2012 else
2013 delta = ktime_get();
2014 delta = ktime_sub(delta, bfqd->last_budget_start);
2015 usecs = ktime_to_us(delta);
2016
2017 /* Don't trust short/unrealistic values. */
2018 if (usecs < 100 || usecs >= LONG_MAX)
2019 return 0;
2020
2021 /*
2022 * Calculate the bandwidth for the last slice. We use a 64 bit
2023 * value to store the peak rate, in sectors per usec in fixed
2024 * point math. We do so to have enough precision in the estimate
2025 * and to avoid overflows.
2026 */
2027 bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
2028 do_div(bw, (unsigned long)usecs);
2029
2030 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
2031
2032 /*
2033 * Use only long (> 20ms) intervals to filter out spikes for
2034 * the peak rate estimation.
2035 */
2036 if (usecs > 20000) {
2037 if (bw > bfqd->peak_rate ||
2038 (!BFQQ_SEEKY(bfqq) &&
2039 reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
2040 bfq_log(bfqd, "measured bw =%llu", bw);
2041 /*
2042 * To smooth oscillations use a low-pass filter with
2043 * alpha=7/8, i.e.,
2044 * new_rate = (7/8) * old_rate + (1/8) * bw
2045 */
2046 do_div(bw, 8);
2047 if (bw == 0)
2048 return 0;
2049 bfqd->peak_rate *= 7;
2050 do_div(bfqd->peak_rate, 8);
2051 bfqd->peak_rate += bw;
2052 update = 1;
2053 bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
2054 }
2055
2056 update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
2057
2058 if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
2059 bfqd->peak_rate_samples++;
2060
2061 if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
2062 update) {
2063 int dev_type = blk_queue_nonrot(bfqd->queue);
2064 if (bfqd->bfq_user_max_budget == 0) {
2065 bfqd->bfq_max_budget =
2066 bfq_calc_max_budget(bfqd->peak_rate,
2067 timeout);
2068 bfq_log(bfqd, "new max_budget=%lu",
2069 bfqd->bfq_max_budget);
2070 }
2071 if (bfqd->device_speed == BFQ_BFQD_FAST &&
2072 bfqd->peak_rate < device_speed_thresh[dev_type]) {
2073 bfqd->device_speed = BFQ_BFQD_SLOW;
2074 bfqd->RT_prod = R_slow[dev_type] *
2075 T_slow[dev_type];
2076 } else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
2077 bfqd->peak_rate > device_speed_thresh[dev_type]) {
2078 bfqd->device_speed = BFQ_BFQD_FAST;
2079 bfqd->RT_prod = R_fast[dev_type] *
2080 T_fast[dev_type];
2081 }
2082 }
2083 }
2084
2085 /*
2086	 * If the process has been served for too short a time
2087	 * interval to let its possible sequential accesses prevail over
2088	 * the initial seek time needed to move the disk head to the
2089	 * first sector it requested, then give the process a chance
2090	 * and for the moment return false.
2091 */
2092 if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
2093 return 0;
2094
2095 /*
2096 * A process is considered ``slow'' (i.e., seeky, so that we
2097 * cannot treat it fairly in the service domain, as it would
2098 * slow down too much the other processes) if, when a slice
2099 * ends for whatever reason, it has received service at a
2100 * rate that would not be high enough to complete the budget
2101 * before the budget timeout expiration.
2102 */
2103 expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
2104
2105 /*
2106	 * Caveat: processes doing I/O in the slower disk zones will
2107 * tend to be slow(er) even if not seeky. And the estimated
2108 * peak rate will actually be an average over the disk
2109 * surface. Hence, to not be too harsh with unlucky processes,
2110 * we keep a budget/3 margin of safety before declaring a
2111 * process slow.
2112 */
2113 return expected > (4 * bfqq->entity.budget) / 3;
2114}
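/*
 * Example of the low-pass filtering performed above (illustrative figures):
 * if the current peak_rate corresponds to 100 MiB/s and a valid sample bw
 * corresponds to 180 MiB/s, the new estimate becomes
 * 7/8 * 100 + 1/8 * 180 = 110 MiB/s, so a single spike moves the estimate
 * by at most 1/8 of the difference.
 */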
2115
2116/*
2117 * To be deemed as soft real-time, an application must meet two
2118 * requirements. First, the application must not require an average
2119 * bandwidth higher than the approximate bandwidth required to play back or
2120 * record a compressed high-definition video.
2121 * The next function is invoked on the completion of the last request of a
2122 * batch, to compute the next-start time instant, soft_rt_next_start, such
2123 * that, if the next request of the application does not arrive before
2124 * soft_rt_next_start, then the above requirement on the bandwidth is met.
2125 *
2126 * The second requirement is that the request pattern of the application is
2127 * isochronous, i.e., that, after issuing a request or a batch of requests,
2128 * the application stops issuing new requests until all its pending requests
2129 * have been completed. After that, the application may issue a new batch,
2130 * and so on.
2131 * For this reason the next function is invoked to compute
2132 * soft_rt_next_start only for applications that meet this requirement,
2133 * whereas soft_rt_next_start is set to infinity for applications that do
2134 * not.
2135 *
2136 * Unfortunately, even a greedy application may happen to behave in an
2137 * isochronous way if the CPU load is high. In fact, the application may
2138 * stop issuing requests while the CPUs are busy serving other processes,
2139 * then restart, then stop again for a while, and so on. In addition, if
2140 * the disk achieves a low enough throughput with the request pattern
2141 * issued by the application (e.g., because the request pattern is random
2142 * and/or the device is slow), then the application may meet the above
2143 * bandwidth requirement too. To prevent such a greedy application from
2144 * being deemed as soft real-time, a further rule is used in the computation
2145 * of soft_rt_next_start: soft_rt_next_start must be higher than the current
2146 * time plus the maximum time for which the arrival of a request is waited
2147 * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle.
2148 * This filters out greedy applications, as the latter issue instead their
2149 * next request as soon as possible after the last one has been completed
2150 * (in contrast, when a batch of requests is completed, a soft real-time
2151 * application spends some time processing data).
2152 *
2153 * Unfortunately, the last filter may easily generate false positives if
2154 * only bfqd->bfq_slice_idle is used as a reference time interval and one
2155 * or both the following cases occur:
2156 * 1) HZ is so low that the duration of a jiffy is comparable to or higher
2157 * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
2158 * HZ=100.
2159 * 2) jiffies, instead of increasing at a constant rate, may stop increasing
2160 * for a while, then suddenly 'jump' by several units to recover the lost
2161 * increments. This seems to happen, e.g., inside virtual machines.
2162 * To address this issue, we do not use as a reference time interval just
2163 * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
2164 * particular we add the minimum number of jiffies for which the filter
2165 * seems to be quite precise also in embedded systems and KVM/QEMU virtual
2166 * machines.
2167 */
2168static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
2169 struct bfq_queue *bfqq)
2170{
2171 return max(bfqq->last_idle_bklogged +
2172 HZ * bfqq->service_from_backlogged /
2173 bfqd->bfq_wr_max_softrt_rate,
2174 jiffies + bfqq->bfqd->bfq_slice_idle + 4);
2175}
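/*
 * Example for the computation above (the rate used is just an example value
 * for bfqd->bfq_wr_max_softrt_rate, in sectors/sec): if an application has
 * received 7000 sectors of service since it last became non-backlogged and
 * the threshold rate is 7000 sectors/sec, then soft_rt_next_start is (at
 * least) last_idle_bklogged + 1 s: a new batch issued earlier than that
 * would imply a bandwidth above the threshold, and the queue would not
 * qualify as soft real-time.
 */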
2176
2177/*
2178 * Return the largest-possible time instant such that, for as long as possible,
2179 * the current time will be lower than this time instant according to the macro
2180 * time_is_before_jiffies().
2181 */
2182static inline unsigned long bfq_infinity_from_now(unsigned long now)
2183{
2184 return now + ULONG_MAX / 2;
2185}
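/*
 * This works because jiffies comparisons use wrapping arithmetic:
 * time_is_before_jiffies(t) expands (through time_before()) to
 * (long)(t - jiffies) < 0, so a value ULONG_MAX / 2 ticks in the future
 * stays "not yet reached" for as long as the unsigned representation
 * allows, which is what "infinity" means here.
 */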
2186
2187/**
2188 * bfq_bfqq_expire - expire a queue.
2189 * @bfqd: device owning the queue.
2190 * @bfqq: the queue to expire.
2191 * @compensate: if true, compensate for the time spent idling.
2192 * @reason: the reason causing the expiration.
2193 *
2194 *
2195 * If the process associated to the queue is slow (i.e., seeky), or in
2196 * case of budget timeout, or, finally, if it is async, we
2197 * artificially charge it an entire budget (independently of the
2198 * actual service it received). As a consequence, the queue will get
2199 * higher timestamps than the correct ones upon reactivation, and
2200 * hence it will be rescheduled as if it had received more service
2201 * than what it actually received. In the end, this class of processes
2202 * will receive less service in proportion to how slowly they consume
2203 * their budgets (and hence how seriously they tend to lower the
2204 * throughput).
2205 *
2206 * In contrast, when a queue expires because it has been idling for
2207 * too long or because it exhausted its budget, we do not touch the
2208 * amount of service it has received. Hence when the queue will be
2209 * reactivated and its timestamps updated, the latter will be in sync
2210 * with the actual service received by the queue until expiration.
2211 *
2212 * Charging a full budget to the first type of queues and the exact
2213 * service to the others has the effect of using the WF2Q+ policy to
2214 * schedule the former on a timeslice basis, without violating the
2215 * service domain guarantees of the latter.
2216 */
2217static void bfq_bfqq_expire(struct bfq_data *bfqd,
2218 struct bfq_queue *bfqq,
2219 int compensate,
2220 enum bfqq_expiration reason)
2221{
2222 int slow;
2223 BUG_ON(bfqq != bfqd->in_service_queue);
2224
2225 /* Update disk peak rate for autotuning and check whether the
2226 * process is slow (see bfq_update_peak_rate).
2227 */
2228 slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
2229
2230 /*
2231	 * As explained above, 'punish' slow (i.e., seeky), timed-out
2232 * and async queues, to favor sequential sync workloads.
2233 *
2234 * Processes doing I/O in the slower disk zones will tend to be
2235 * slow(er) even if not seeky. Hence, since the estimated peak
2236 * rate is actually an average over the disk surface, these
2237 * processes may timeout just for bad luck. To avoid punishing
2238 * them we do not charge a full budget to a process that
2239 * succeeded in consuming at least 2/3 of its budget.
2240 */
2241 if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
2242 bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
2243 bfq_bfqq_charge_full_budget(bfqq);
2244
2245 bfqq->service_from_backlogged += bfqq->entity.service;
2246
2247 if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
2248 !bfq_bfqq_constantly_seeky(bfqq)) {
2249 bfq_mark_bfqq_constantly_seeky(bfqq);
2250 if (!blk_queue_nonrot(bfqd->queue))
2251 bfqd->const_seeky_busy_in_flight_queues++;
2252 }
2253
2254 if (reason == BFQ_BFQQ_TOO_IDLE &&
2255	    bfqq->entity.service <= 2 * bfqq->entity.budget / 10)
2256 bfq_clear_bfqq_IO_bound(bfqq);
2257
2258 if (bfqd->low_latency && bfqq->wr_coeff == 1)
2259 bfqq->last_wr_start_finish = jiffies;
2260
2261 if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&
2262 RB_EMPTY_ROOT(&bfqq->sort_list)) {
2263 /*
2264 * If we get here, and there are no outstanding requests,
2265 * then the request pattern is isochronous (see the comments
2266 * to the function bfq_bfqq_softrt_next_start()). Hence we
2267 * can compute soft_rt_next_start. If, instead, the queue
2268 * still has outstanding requests, then we have to wait
2269 * for the completion of all the outstanding requests to
2270 * discover whether the request pattern is actually
2271 * isochronous.
2272 */
2273 if (bfqq->dispatched == 0)
2274 bfqq->soft_rt_next_start =
2275 bfq_bfqq_softrt_next_start(bfqd, bfqq);
2276 else {
2277 /*
2278 * The application is still waiting for the
2279 * completion of one or more requests:
2280 * prevent it from possibly being incorrectly
2281 * deemed as soft real-time by setting its
2282 * soft_rt_next_start to infinity. In fact,
2283 * without this assignment, the application
2284 * would be incorrectly deemed as soft
2285 * real-time if:
2286 * 1) it issued a new request before the
2287 * completion of all its in-flight
2288 * requests, and
2289 * 2) at that time, its soft_rt_next_start
2290 * happened to be in the past.
2291 */
2292 bfqq->soft_rt_next_start =
2293 bfq_infinity_from_now(jiffies);
2294 /*
2295 * Schedule an update of soft_rt_next_start to when
2296 * the task may be discovered to be isochronous.
2297 */
2298 bfq_mark_bfqq_softrt_update(bfqq);
2299 }
2300 }
2301
2302 bfq_log_bfqq(bfqd, bfqq,
2303 "expire (%d, slow %d, num_disp %d, idle_win %d)", reason,
2304 slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
2305
2306 /*
2307 * Increase, decrease or leave budget unchanged according to
2308 * reason.
2309 */
2310 __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
2311 __bfq_bfqq_expire(bfqd, bfqq);
2312}
2313
2314/*
2315 * Budget timeout is not implemented through a dedicated timer, but
2316 * just checked on request arrivals and completions, as well as on
2317 * idle timer expirations.
2318 */
2319static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
2320{
2321 if (bfq_bfqq_budget_new(bfqq) ||
2322 time_before(jiffies, bfqq->budget_timeout))
2323 return 0;
2324 return 1;
2325}
2326
2327/*
2328 * If we expire a queue that is waiting for the arrival of a new
2329 * request, we may prevent the fictitious timestamp back-shifting that
2330 * allows the guarantees of the queue to be preserved (see [1] for
2331 * this tricky aspect). Hence we return true only if this condition
2332 * does not hold, or if the queue is slow enough to deserve only to be
2333 * kicked off for preserving a high throughput.
2334 */
2335static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
2336{
2337 bfq_log_bfqq(bfqq->bfqd, bfqq,
2338 "may_budget_timeout: wait_request %d left %d timeout %d",
2339 bfq_bfqq_wait_request(bfqq),
2340 bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
2341 bfq_bfqq_budget_timeout(bfqq));
2342
2343 return (!bfq_bfqq_wait_request(bfqq) ||
2344 bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
2345 &&
2346 bfq_bfqq_budget_timeout(bfqq);
2347}
2348
2349/*
2350 * Device idling is allowed only for the queues for which this function
2351 * returns true. For this reason, the return value of this function plays a
2352 * critical role for both throughput boosting and service guarantees. The
2353 * return value is computed through a logical expression. In this rather
2354 * long comment, we try to briefly describe all the details and motivations
2355 * behind the components of this logical expression.
2356 *
2357 * First, the expression is false if bfqq is not sync, or if: bfqq happened
2358 * to become active during a large burst of queue activations, and the
2359 * pattern of requests bfqq contains boosts the throughput if bfqq is
2360 * expired. In fact, queues that became active during a large burst benefit
2361 * only from throughput, as discussed in the comments to bfq_handle_burst.
2362 * In this respect, expiring bfqq certainly boosts the throughput on NCQ-
2363 * capable flash-based devices, whereas, on rotational devices, it boosts
2364 * the throughput only if bfqq contains random requests.
2365 *
2366 * On the opposite end, if (a) bfqq is sync, (b) the above burst-related
2367 * condition does not hold, and (c) bfqq is being weight-raised, then the
2368 * expression always evaluates to true, as device idling is instrumental
2369 * for preserving low-latency guarantees (see [1]). If, instead, conditions
2370 * (a) and (b) do hold, but (c) does not, then the expression evaluates to
2371 * true only if: (1) bfqq is I/O-bound and has a non-null idle window, and
2372 * (2) at least one of the following two conditions holds.
2373 * The first condition is that the device is not performing NCQ, because
2374 * idling the device most certainly boosts the throughput if this condition
2375 * holds and bfqq is I/O-bound and has been granted a non-null idle window.
2376 * The second compound condition is made of the logical AND of two components.
2377 *
2378 * The first component is true only if there is no weight-raised busy
2379 * queue. This guarantees that the device is not idled for a sync non-
2380 * weight-raised queue when there are busy weight-raised queues. The former
2381 * is then expired immediately if empty. Combined with the timestamping
2382 * rules of BFQ (see [1] for details), this causes sync non-weight-raised
2383 * queues to get a lower number of requests served, and hence to ask for a
2384 * lower number of requests from the request pool, before the busy weight-
2385 * raised queues get served again.
2386 *
2387 * This is beneficial for the processes associated with weight-raised
2388 * queues, when the request pool is saturated (e.g., in the presence of
2389 * write hogs). In fact, if the processes associated with the other queues
2390 * ask for requests at a lower rate, then weight-raised processes have a
2391 * higher probability to get a request from the pool immediately (or at
2392 * least soon) when they need one. Hence they have a higher probability to
2393 * actually get a fraction of the disk throughput proportional to their
2394 * high weight. This is especially true with NCQ-capable drives, which
2395 * enqueue several requests in advance and further reorder internally-
2396 * queued requests.
2397 *
2398 * In the end, mistreating non-weight-raised queues when there are busy
2399 * weight-raised queues seems to mitigate starvation problems in the
2400 * presence of heavy write workloads and NCQ, and hence to guarantee a
2401 * higher application and system responsiveness in these hostile scenarios.
2402 *
2403 * If the first component of the compound condition is instead true, i.e.,
2404 * there is no weight-raised busy queue, then the second component of the
2405 * compound condition takes into account service-guarantee and throughput
2406 * issues related to NCQ (recall that the compound condition is evaluated
2407 * only if the device is detected as supporting NCQ).
2408 *
2409 * As for service guarantees, allowing the drive to enqueue more than one
2410 * request at a time, and hence delegating de facto final scheduling
2411 * decisions to the drive's internal scheduler, causes loss of control on
2412 * the actual request service order. In this respect, when the drive is
2413 * allowed to enqueue more than one request at a time, the service
2414 * distribution enforced by the drive's internal scheduler is likely to
2415 * coincide with the desired device-throughput distribution only in the
2416 * following, perfectly symmetric, scenario:
2417 * 1) all active queues have the same weight,
2418 * 2) all active groups at the same level in the groups tree have the same
2419 * weight,
2420 * 3) all active groups at the same level in the groups tree have the same
2421 * number of children.
2422 *
2423 * Even in such a scenario, sequential I/O may still receive a preferential
2424 * treatment, but this is not likely to be a big issue with flash-based
2425 * devices, because of their non-dramatic loss of throughput with random
2426 * I/O. Things do differ with HDDs, for which additional care is taken, as
2427 * explained after completing the discussion for flash-based devices.
2428 *
2429 * Unfortunately, keeping the necessary state for evaluating exactly the
2430 * above symmetry conditions would be quite complex and time-consuming.
2431 * Therefore BFQ evaluates instead the following stronger sub-conditions,
2432 * for which it is much easier to maintain the needed state:
2433 * 1) all active queues have the same weight,
2434 * 2) all active groups have the same weight,
2435 * 3) all active groups have at most one active child each.
2436 * In particular, the last two conditions are always true if hierarchical
2437 * support and the cgroups interface are not enabled, hence no state needs
2438 * to be maintained in this case.
2439 *
2440 * According to the above considerations, the second component of the
2441 * compound condition evaluates to true if any of the above symmetry
2442 * sub-conditions does not hold, or the device is not flash-based. Therefore,
2443 * if also the first component is true, then idling is allowed for a sync
2444 * queue. These are the only sub-conditions considered if the device is
2445 * flash-based, as, for such a device, it is sensible to force idling only
2446 * for service-guarantee issues. In fact, as for throughput, idling
2447 * NCQ-capable flash-based devices would not boost the throughput even
2448 * with sequential I/O; rather it would lower the throughput in proportion
2449 * to how fast the device is. In the end, (only) if all the three
2450 * sub-conditions hold and the device is flash-based, the compound
2451 * condition evaluates to false and therefore no idling is performed.
2452 *
2453 * As already said, things change with a rotational device, where idling
2454 * boosts the throughput with sequential I/O (even with NCQ). Hence, for
2455 * such a device the second component of the compound condition evaluates
2456 * to true also if the following additional sub-condition does not hold:
2457 * the queue is constantly seeky. Unfortunately, this different behavior
2458 * with respect to flash-based devices causes an additional asymmetry: if
2459 * some sync queues enjoy idling and some other sync queues do not, then
2460 * the latter get a low share of the device throughput, simply because the
2461 * former get many requests served after being set as in service, whereas
2462 * the latter do not. As a consequence, to guarantee the desired throughput
2463 * distribution, on HDDs the compound expression evaluates to true (and
2464 * hence device idling is performed) also if the following last symmetry
2465 * condition does not hold: no other queue is benefiting from idling. Also
2466 * this last condition is actually replaced with a simpler-to-maintain and
2467 * stronger condition: there is no busy queue which is not constantly seeky
2468 * (and hence may also benefit from idling).
2469 *
2470 * To sum up, when all the required symmetry and throughput-boosting
2471 * sub-conditions hold, the second component of the compound condition
2472 * evaluates to false, and hence no idling is performed. This helps to
2473 * keep the drives' internal queues full on NCQ-capable devices, and hence
2474 * to boost the throughput, without causing 'almost' any loss of service
2475 * guarantees. The 'almost' follows from the fact that, if the internal
2476 * queue of one such device is filled while all the sub-conditions hold,
2477 * but at some point in time some sub-condition ceases to hold, then it may
2478 * become impossible to let requests be served in the new desired order
2479 * until all the requests already queued in the device have been served.
2480 */
2481static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq)
2482{
2483 struct bfq_data *bfqd = bfqq->bfqd;
2484#define cond_for_seeky_on_ncq_hdd (bfq_bfqq_constantly_seeky(bfqq) && \
2485 bfqd->busy_in_flight_queues == \
2486 bfqd->const_seeky_busy_in_flight_queues)
2487
2488#define cond_for_expiring_in_burst (bfq_bfqq_in_large_burst(bfqq) && \
2489 bfqd->hw_tag && \
2490 (blk_queue_nonrot(bfqd->queue) || \
2491 bfq_bfqq_constantly_seeky(bfqq)))
2492
2493/*
2494 * Condition for expiring a non-weight-raised queue (and hence not idling
2495 * the device).
2496 */
2497#define cond_for_expiring_non_wr (bfqd->hw_tag && \
2498 (bfqd->wr_busy_queues > 0 || \
2499 (blk_queue_nonrot(bfqd->queue) || \
2500 cond_for_seeky_on_ncq_hdd)))
2501
2502 return bfq_bfqq_sync(bfqq) &&
2503 !cond_for_expiring_in_burst &&
2504 (bfqq->wr_coeff > 1 || !symmetric_scenario ||
2505 (bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_idle_window(bfqq) &&
2506 !cond_for_expiring_non_wr)
2507 );
2508}
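/*
 * Two concrete instances of the expression above (derived from the macros,
 * not additional conditions): a sync queue that is being weight-raised and
 * did not join a large burst makes the function return true, so the device
 * is idled for it regardless of NCQ or of the symmetry of the scenario;
 * conversely, a sync, non-weight-raised queue on an NCQ-capable flash-based
 * device (hw_tag set, blk_queue_nonrot() true) in a symmetric scenario
 * makes cond_for_expiring_non_wr true, so no idling is performed and the
 * queue is expired as soon as it becomes empty.
 */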
2509
2510/*
2511 * If the in-service queue is empty but sync, and the function
2512 * bfq_bfqq_must_not_expire returns true, then:
2513 * 1) the queue must remain in service and cannot be expired, and
2514 * 2) the disk must be idled to wait for the possible arrival of a new
2515 * request for the queue.
2516 * See the comments to the function bfq_bfqq_must_not_expire for the reasons
2517 * why performing device idling is the best choice to boost the throughput
2518 * and preserve service guarantees when bfq_bfqq_must_not_expire itself
2519 * returns true.
2520 */
2521static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
2522{
2523 struct bfq_data *bfqd = bfqq->bfqd;
2524
2525 return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
2526 bfq_bfqq_must_not_expire(bfqq);
2527}
2528
2529/*
2530 * Select a queue for service. If we have a current queue in service,
2531 * check whether to continue servicing it, or retrieve and set a new one.
2532 */
2533static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
2534{
Mauro Andreolini0b335882015-06-07 02:34:22 +02002535 struct bfq_queue *bfqq;
Paolo Valente70f28712013-05-09 19:10:02 +02002536 struct request *next_rq;
2537 enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
2538
2539 bfqq = bfqd->in_service_queue;
2540 if (bfqq == NULL)
2541 goto new_queue;
2542
2543 bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
2544
Paolo Valente70f28712013-05-09 19:10:02 +02002545 if (bfq_may_expire_for_budg_timeout(bfqq) &&
2546 !timer_pending(&bfqd->idle_slice_timer) &&
2547 !bfq_bfqq_must_idle(bfqq))
2548 goto expire;
2549
2550 next_rq = bfqq->next_rq;
2551 /*
2552 * If bfqq has requests queued and it has enough budget left to
2553 * serve them, keep the queue, otherwise expire it.
2554 */
2555 if (next_rq != NULL) {
2556 if (bfq_serv_to_charge(next_rq, bfqq) >
2557 bfq_bfqq_budget_left(bfqq)) {
2558 reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
2559 goto expire;
2560 } else {
2561 /*
2562 * The idle timer may be pending because we may
2563 * not disable disk idling even when a new request
2564 * arrives.
2565 */
2566 if (timer_pending(&bfqd->idle_slice_timer)) {
2567 /*
2568				 * If we get here: 1) at least one new request
2569 * has arrived but we have not disabled the
2570 * timer because the request was too small,
2571				 * and 2) the block layer has unplugged
2572 * the device, causing the dispatch to be
2573 * invoked.
2574 *
2575 * Since the device is unplugged, now the
2576 * requests are probably large enough to
2577 * provide a reasonable throughput.
2578 * So we disable idling.
2579 */
2580 bfq_clear_bfqq_wait_request(bfqq);
2581 del_timer(&bfqd->idle_slice_timer);
2582 }
Mauro Andreolini0b335882015-06-07 02:34:22 +02002583 goto keep_queue;
Paolo Valente70f28712013-05-09 19:10:02 +02002584 }
2585 }
2586
2587 /*
2588 * No requests pending. However, if the in-service queue is idling
2589 * for a new request, or has requests waiting for a completion and
2590 * may idle after their completion, then keep it anyway.
2591 */
Mauro Andreolini0b335882015-06-07 02:34:22 +02002592 if (timer_pending(&bfqd->idle_slice_timer) ||
2593 (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) {
Paolo Valente70f28712013-05-09 19:10:02 +02002594 bfqq = NULL;
2595 goto keep_queue;
Paolo Valente70f28712013-05-09 19:10:02 +02002596 }
2597
2598 reason = BFQ_BFQQ_NO_MORE_REQUESTS;
2599expire:
2600 bfq_bfqq_expire(bfqd, bfqq, 0, reason);
2601new_queue:
Mauro Andreolini0b335882015-06-07 02:34:22 +02002602 bfqq = bfq_set_in_service_queue(bfqd);
Paolo Valente70f28712013-05-09 19:10:02 +02002603 bfq_log(bfqd, "select_queue: new queue %d returned",
2604 bfqq != NULL ? bfqq->pid : 0);
2605keep_queue:
2606 return bfqq;
2607}
2608
Mauro Andreolini0b335882015-06-07 02:34:22 +02002609static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
Paolo Valente70f28712013-05-09 19:10:02 +02002610{
Mauro Andreolini0b335882015-06-07 02:34:22 +02002611 struct bfq_entity *entity = &bfqq->entity;
2612 if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */
Paolo Valente70f28712013-05-09 19:10:02 +02002613 bfq_log_bfqq(bfqd, bfqq,
2614 "raising period dur %u/%u msec, old coeff %u, w %d(%d)",
Mauro Andreolini0b335882015-06-07 02:34:22 +02002615 jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
Paolo Valente70f28712013-05-09 19:10:02 +02002616 jiffies_to_msecs(bfqq->wr_cur_max_time),
2617 bfqq->wr_coeff,
2618 bfqq->entity.weight, bfqq->entity.orig_weight);
2619
2620 BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=
2621 entity->orig_weight * bfqq->wr_coeff);
2622 if (entity->ioprio_changed)
2623 bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
Mauro Andreolini0b335882015-06-07 02:34:22 +02002624
Paolo Valente70f28712013-05-09 19:10:02 +02002625 /*
2626 * If the queue was activated in a burst, or
2627 * too much time has elapsed from the beginning
Mauro Andreolini0b335882015-06-07 02:34:22 +02002628 * of this weight-raising period, or the queue has
2629 * exceeded the acceptable number of cooperations,
2630 * then end weight raising.
Paolo Valente70f28712013-05-09 19:10:02 +02002631 */
2632 if (bfq_bfqq_in_large_burst(bfqq) ||
Mauro Andreolini0b335882015-06-07 02:34:22 +02002633 bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh ||
Paolo Valente70f28712013-05-09 19:10:02 +02002634 time_is_before_jiffies(bfqq->last_wr_start_finish +
2635 bfqq->wr_cur_max_time)) {
2636 bfqq->last_wr_start_finish = jiffies;
2637 bfq_log_bfqq(bfqd, bfqq,
2638 "wrais ending at %lu, rais_max_time %u",
2639 bfqq->last_wr_start_finish,
2640 jiffies_to_msecs(bfqq->wr_cur_max_time));
2641 bfq_bfqq_end_wr(bfqq);
Paolo Valente70f28712013-05-09 19:10:02 +02002642 }
2643 }
Mauro Andreolini0b335882015-06-07 02:34:22 +02002644 /* Update weight both if it must be raised and if it must be lowered */
2645 if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))
2646 __bfq_entity_update_weight_prio(
2647 bfq_entity_service_tree(entity),
2648 entity);
Paolo Valente70f28712013-05-09 19:10:02 +02002649}
2650
2651/*
2652 * Dispatch one request from bfqq, moving it to the request queue
2653 * dispatch list.
2654 */
2655static int bfq_dispatch_request(struct bfq_data *bfqd,
2656 struct bfq_queue *bfqq)
2657{
2658 int dispatched = 0;
2659 struct request *rq;
2660 unsigned long service_to_charge;
2661
2662 BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
2663
2664 /* Follow expired path, else get first next available. */
2665 rq = bfq_check_fifo(bfqq);
2666 if (rq == NULL)
2667 rq = bfqq->next_rq;
2668 service_to_charge = bfq_serv_to_charge(rq, bfqq);
2669
2670 if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
2671 /*
2672 * This may happen if the next rq is chosen in fifo order
2673 * instead of sector order. The budget is properly
2674 * dimensioned to be always sufficient to serve the next
2675 * request only if it is chosen in sector order. The reason
2676		 * is that it would be quite inefficient and of little use
2677 * to always make sure that the budget is large enough to
2678 * serve even the possible next rq in fifo order.
2679 * In fact, requests are seldom served in fifo order.
2680 *
2681 * Expire the queue for budget exhaustion, and make sure
2682 * that the next act_budget is enough to serve the next
2683 * request, even if it comes from the fifo expired path.
2684 */
2685 bfqq->next_rq = rq;
2686 /*
2687		 * Since this dispatch failed, make sure that
2688 * a new one will be performed
2689 */
2690 if (!bfqd->rq_in_driver)
2691 bfq_schedule_dispatch(bfqd);
2692 goto expire;
2693 }
2694
2695 /* Finally, insert request into driver dispatch list. */
2696 bfq_bfqq_served(bfqq, service_to_charge);
2697 bfq_dispatch_insert(bfqd->queue, rq);
2698
2699 bfq_update_wr_data(bfqd, bfqq);
2700
2701 bfq_log_bfqq(bfqd, bfqq,
2702 "dispatched %u sec req (%llu), budg left %lu",
2703 blk_rq_sectors(rq),
2704 (long long unsigned)blk_rq_pos(rq),
2705 bfq_bfqq_budget_left(bfqq));
2706
2707 dispatched++;
2708
2709 if (bfqd->in_service_bic == NULL) {
2710 atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
2711 bfqd->in_service_bic = RQ_BIC(rq);
2712 }
2713
2714 if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
2715 dispatched >= bfqd->bfq_max_budget_async_rq) ||
2716 bfq_class_idle(bfqq)))
2717 goto expire;
2718
2719 return dispatched;
2720
2721expire:
2722 bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
2723 return dispatched;
2724}
2725
2726static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
2727{
2728 int dispatched = 0;
2729
2730 while (bfqq->next_rq != NULL) {
2731 bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
2732 dispatched++;
2733 }
2734
2735 BUG_ON(!list_empty(&bfqq->fifo));
2736 return dispatched;
2737}
2738
2739/*
2740 * Drain our current requests.
2741 * Used for barriers and when switching io schedulers on-the-fly.
2742 */
2743static int bfq_forced_dispatch(struct bfq_data *bfqd)
2744{
2745 struct bfq_queue *bfqq, *n;
2746 struct bfq_service_tree *st;
2747 int dispatched = 0;
2748
2749 bfqq = bfqd->in_service_queue;
2750 if (bfqq != NULL)
2751 __bfq_bfqq_expire(bfqd, bfqq);
2752
2753 /*
2754 * Loop through classes, and be careful to leave the scheduler
2755 * in a consistent state, as feedback mechanisms and vtime
2756 * updates cannot be disabled during the process.
2757 */
2758 list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
2759 st = bfq_entity_service_tree(&bfqq->entity);
2760
2761 dispatched += __bfq_forced_dispatch_bfqq(bfqq);
2762 bfqq->max_budget = bfq_max_budget(bfqd);
2763
2764 bfq_forget_idle(st);
2765 }
2766
2767 BUG_ON(bfqd->busy_queues != 0);
2768
2769 return dispatched;
2770}
2771
2772static int bfq_dispatch_requests(struct request_queue *q, int force)
2773{
2774 struct bfq_data *bfqd = q->elevator->elevator_data;
2775 struct bfq_queue *bfqq;
2776 int max_dispatch;
2777
2778 bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
2779 if (bfqd->busy_queues == 0)
2780 return 0;
2781
2782 if (unlikely(force))
2783 return bfq_forced_dispatch(bfqd);
2784
2785 bfqq = bfq_select_queue(bfqd);
2786 if (bfqq == NULL)
2787 return 0;
2788
2789 if (bfq_class_idle(bfqq))
2790 max_dispatch = 1;
2791
2792 if (!bfq_bfqq_sync(bfqq))
2793 max_dispatch = bfqd->bfq_max_budget_async_rq;
2794
2795 if (!bfq_bfqq_sync(bfqq) && bfqq->dispatched >= max_dispatch) {
2796 if (bfqd->busy_queues > 1)
2797 return 0;
2798 if (bfqq->dispatched >= 4 * max_dispatch)
2799 return 0;
2800 }
2801
2802 if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
2803 return 0;
2804
2805 bfq_clear_bfqq_wait_request(bfqq);
2806 BUG_ON(timer_pending(&bfqd->idle_slice_timer));
2807
2808 if (!bfq_dispatch_request(bfqd, bfqq))
2809 return 0;
2810
2811 bfq_log_bfqq(bfqd, bfqq, "dispatched %s request",
2812 bfq_bfqq_sync(bfqq) ? "sync" : "async");
2813
2814 return 1;
2815}
2816
2817/*
2818 * Task holds one reference to the queue, dropped when task exits. Each rq
2819 * in-flight on this queue also holds a reference, dropped when rq is freed.
2820 *
2821 * Queue lock must be held here.
2822 */
2823static void bfq_put_queue(struct bfq_queue *bfqq)
2824{
2825 struct bfq_data *bfqd = bfqq->bfqd;
2826
2827 BUG_ON(atomic_read(&bfqq->ref) <= 0);
2828
2829 bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
2830 atomic_read(&bfqq->ref));
2831 if (!atomic_dec_and_test(&bfqq->ref))
2832 return;
2833
2834 BUG_ON(rb_first(&bfqq->sort_list) != NULL);
2835 BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
2836 BUG_ON(bfqq->entity.tree != NULL);
2837 BUG_ON(bfq_bfqq_busy(bfqq));
2838 BUG_ON(bfqd->in_service_queue == bfqq);
2839
2840 if (bfq_bfqq_sync(bfqq))
2841 /*
2842 * The fact that this queue is being destroyed does not
2843 * invalidate the fact that this queue may have been
2844 * activated during the current burst. As a consequence,
2845 * although the queue does not exist anymore, and hence
2846 * needs to be removed from the burst list if there,
2847		 * the burst size must not be decremented.
2848 */
2849 hlist_del_init(&bfqq->burst_list_node);
2850
2851 bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
2852
2853 kmem_cache_free(bfq_pool, bfqq);
2854}
2855
2856static void bfq_put_cooperator(struct bfq_queue *bfqq)
2857{
2858 struct bfq_queue *__bfqq, *next;
2859
2860 /*
2861 * If this queue was scheduled to merge with another queue, be
2862 * sure to drop the reference taken on that queue (and others in
2863 * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
2864 */
2865 __bfqq = bfqq->new_bfqq;
2866 while (__bfqq) {
2867 if (__bfqq == bfqq)
2868 break;
2869 next = __bfqq->new_bfqq;
2870 bfq_put_queue(__bfqq);
2871 __bfqq = next;
2872 }
2873}
2874
2875static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2876{
2877 if (bfqq == bfqd->in_service_queue) {
2878 __bfq_bfqq_expire(bfqd, bfqq);
2879 bfq_schedule_dispatch(bfqd);
2880 }
2881
2882 bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
2883 atomic_read(&bfqq->ref));
2884
2885 bfq_put_cooperator(bfqq);
2886
2887 bfq_put_queue(bfqq);
2888}
2889
2890static inline void bfq_init_icq(struct io_cq *icq)
2891{
2892 struct bfq_io_cq *bic = icq_to_bic(icq);
2893
2894 bic->ttime.last_end_request = jiffies;
Mauro Andreolini0b335882015-06-07 02:34:22 +02002895 /*
2896 * A newly created bic indicates that the process has just
2897 * started doing I/O, and is probably mapping into memory its
2898 * executable and libraries: it definitely needs weight raising.
2899 * There is however the possibility that the process performs,
2900 * for a while, I/O close to some other process. EQM intercepts
2901 * this behavior and may merge the queue corresponding to the
2902 * process with some other queue, BEFORE the weight of the queue
2903 * is raised. Merged queues are not weight-raised (they are assumed
2904 * to belong to processes that benefit only from high throughput).
2905 * If the merge is basically the consequence of an accident, then
2906 * the queue will be split soon and will get back its old weight.
2907 * It is then important to write down somewhere that this queue
2908 * does need weight raising, even if it did not make it to get its
2909 * weight raised before being merged. To this purpose, we overload
2910	 * the field wr_time_left and assign 1 to it, to mark the queue
2911 * as needing weight raising.
2912 */
2913 bic->wr_time_left = 1;
Paolo Valente70f28712013-05-09 19:10:02 +02002914}
2915
2916static void bfq_exit_icq(struct io_cq *icq)
2917{
2918 struct bfq_io_cq *bic = icq_to_bic(icq);
2919 struct bfq_data *bfqd = bic_to_bfqd(bic);
2920
2921 if (bic->bfqq[BLK_RW_ASYNC]) {
2922 bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
2923 bic->bfqq[BLK_RW_ASYNC] = NULL;
2924 }
2925
2926 if (bic->bfqq[BLK_RW_SYNC]) {
Mauro Andreolini0b335882015-06-07 02:34:22 +02002927 /*
2928 * If the bic is using a shared queue, put the reference
2929 * taken on the io_context when the bic started using a
2930 * shared bfq_queue.
2931 */
2932 if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))
2933 put_io_context(icq->ioc);
Paolo Valente70f28712013-05-09 19:10:02 +02002934 bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
2935 bic->bfqq[BLK_RW_SYNC] = NULL;
2936 }
2937}
2938
2939/*
2940 * Update the entity prio values; note that the new values will not
2941 * be used until the next (re)activation.
2942 */
2943static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
2944{
2945 struct task_struct *tsk = current;
2946 struct io_context *ioc = bic->icq.ioc;
2947 int ioprio_class;
2948
2949 ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
2950 switch (ioprio_class) {
2951 default:
2952 dev_err(bfqq->bfqd->queue->backing_dev_info.dev,
2953 "bfq: bad prio class %d\n", ioprio_class);
2954 case IOPRIO_CLASS_NONE:
2955 /*
2956 * No prio set, inherit CPU scheduling settings.
2957 */
2958 bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
2959 bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
2960 break;
2961 case IOPRIO_CLASS_RT:
2962 bfqq->entity.new_ioprio = task_ioprio(ioc);
2963 bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
2964 break;
2965 case IOPRIO_CLASS_BE:
2966 bfqq->entity.new_ioprio = task_ioprio(ioc);
2967 bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
2968 break;
2969 case IOPRIO_CLASS_IDLE:
2970 bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
2971 bfqq->entity.new_ioprio = 7;
2972 bfq_clear_bfqq_idle_window(bfqq);
2973 break;
2974 }
2975
2976 if (bfqq->entity.new_ioprio < 0 ||
2977 bfqq->entity.new_ioprio >= IOPRIO_BE_NR) {
2978 printk(KERN_CRIT "bfq_set_next_ioprio_data: new_ioprio %d\n",
2979 bfqq->entity.new_ioprio);
2980 BUG();
2981 }
2982
2983 bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->entity.new_ioprio);
2984 bfqq->entity.ioprio_changed = 1;
2985}
2986
2987static void bfq_check_ioprio_change(struct io_context *ioc,
2988 struct bfq_io_cq *bic)
2989{
2990 struct bfq_data *bfqd;
2991 struct bfq_queue *bfqq, *new_bfqq;
2992 struct bfq_group *bfqg;
2993 unsigned long uninitialized_var(flags);
2994 int ioprio = bic->icq.ioc->ioprio;
2995
2996 bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
2997 &flags);
2998 if (unlikely(bfqd == NULL))
2999 return;
3000
3001 bic->ioprio = ioprio;
3002
3003 bfqq = bic->bfqq[BLK_RW_ASYNC];
3004 if (bfqq != NULL) {
3005 bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
3006 sched_data);
3007 new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic->icq.ioc,
3008 GFP_ATOMIC);
3009 if (new_bfqq != NULL) {
3010 bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
3011 bfq_log_bfqq(bfqd, bfqq,
3012 "check_ioprio_change: bfqq %p %d",
3013 bfqq, atomic_read(&bfqq->ref));
3014 bfq_put_queue(bfqq);
3015 }
3016 }
3017
3018 bfqq = bic->bfqq[BLK_RW_SYNC];
3019 if (bfqq != NULL)
3020 bfq_set_next_ioprio_data(bfqq, bic);
3021
3022 bfq_put_bfqd_unlock(bfqd, &flags);
3023}
3024
3025static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3026 struct bfq_io_cq *bic, pid_t pid, int is_sync)
3027{
3028 RB_CLEAR_NODE(&bfqq->entity.rb_node);
3029 INIT_LIST_HEAD(&bfqq->fifo);
3030 INIT_HLIST_NODE(&bfqq->burst_list_node);
3031
3032 atomic_set(&bfqq->ref, 0);
3033 bfqq->bfqd = bfqd;
3034
3035 if (bic)
3036 bfq_set_next_ioprio_data(bfqq, bic);
3037
3038 if (is_sync) {
3039 if (!bfq_class_idle(bfqq))
3040 bfq_mark_bfqq_idle_window(bfqq);
3041 bfq_mark_bfqq_sync(bfqq);
3042 }
3043 bfq_mark_bfqq_IO_bound(bfqq);
3044
3045 /* Tentative initial value to trade off between thr and lat */
3046 bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
3047 bfqq->pid = pid;
3048
3049 bfqq->wr_coeff = 1;
3050 bfqq->last_wr_start_finish = 0;
3051 /*
3052 * Set to the value for which bfqq will not be deemed as
3053 * soft rt when it becomes backlogged.
3054 */
3055 bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies);
3056}
3057
3058static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
3059 struct bfq_group *bfqg,
3060 int is_sync,
3061 struct io_context *ioc,
3062 gfp_t gfp_mask)
3063{
3064 struct bfq_queue *bfqq, *new_bfqq = NULL;
3065 struct bfq_io_cq *bic;
3066
3067retry:
3068 bic = bfq_bic_lookup(bfqd, ioc);
3069 /* bic always exists here */
3070 bfqq = bic_to_bfqq(bic, is_sync);
3071
3072 /*
3073 * Always try a new alloc if we fall back to the OOM bfqq
3074 * originally, since it should just be a temporary situation.
3075 */
3076 if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
3077 bfqq = NULL;
3078 if (new_bfqq != NULL) {
3079 bfqq = new_bfqq;
3080 new_bfqq = NULL;
3081 } else if (gfp_mask & __GFP_WAIT) {
3082 spin_unlock_irq(bfqd->queue->queue_lock);
3083 new_bfqq = kmem_cache_alloc_node(bfq_pool,
3084 gfp_mask | __GFP_ZERO,
3085 bfqd->queue->node);
3086 spin_lock_irq(bfqd->queue->queue_lock);
3087 if (new_bfqq != NULL)
3088 goto retry;
3089 } else {
3090 bfqq = kmem_cache_alloc_node(bfq_pool,
3091 gfp_mask | __GFP_ZERO,
3092 bfqd->queue->node);
3093 }
3094
3095 if (bfqq != NULL) {
3096 bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
3097 is_sync);
3098 bfq_init_entity(&bfqq->entity, bfqg);
3099 bfq_log_bfqq(bfqd, bfqq, "allocated");
3100 } else {
3101 bfqq = &bfqd->oom_bfqq;
3102 bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
3103 }
3104 }
3105
3106 if (new_bfqq != NULL)
3107 kmem_cache_free(bfq_pool, new_bfqq);
3108
3109 return bfqq;
3110}
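/*
 * Allocation pattern used above: if the caller may sleep, the
 * queue_lock is dropped around kmem_cache_alloc_node() and the lookup
 * is retried afterwards, because another context may have installed a
 * queue for this bic in the meantime. If the allocation is not allowed
 * to sleep and fails, the request is parked on the per-device oom_bfqq,
 * which always exists, so the I/O can still make progress.
 */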
3111
3112static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
3113 struct bfq_group *bfqg,
3114 int ioprio_class, int ioprio)
3115{
3116 switch (ioprio_class) {
3117 case IOPRIO_CLASS_RT:
3118 return &bfqg->async_bfqq[0][ioprio];
3119 case IOPRIO_CLASS_BE:
3120 return &bfqg->async_bfqq[1][ioprio];
3121 case IOPRIO_CLASS_IDLE:
3122 return &bfqg->async_idle_bfqq;
3123 default:
3124 BUG();
3125 }
3126}
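/*
 * Layout of the async queue table used above: row 0 holds the RT
 * queues and row 1 the BE queues, one per ioprio level, while a single
 * queue serves the whole IDLE class.
 */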
3127
3128static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
3129 struct bfq_group *bfqg, int is_sync,
3130 struct io_context *ioc, gfp_t gfp_mask)
3131{
3132 const int ioprio = task_ioprio(ioc);
3133 const int ioprio_class = task_ioprio_class(ioc);
3134 struct bfq_queue **async_bfqq = NULL;
3135 struct bfq_queue *bfqq = NULL;
3136
3137 if (!is_sync) {
3138 async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
3139 ioprio);
3140 bfqq = *async_bfqq;
3141 }
3142
3143 if (bfqq == NULL)
3144 bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, ioc, gfp_mask);
3145
3146 /*
3147 * Pin the queue now that it's allocated, scheduler exit will
3148 * prune it.
3149 */
3150 if (!is_sync && *async_bfqq == NULL) {
3151 atomic_inc(&bfqq->ref);
3152 bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
3153 bfqq, atomic_read(&bfqq->ref));
3154 *async_bfqq = bfqq;
3155 }
3156
3157 atomic_inc(&bfqq->ref);
3158 bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
3159 atomic_read(&bfqq->ref));
3160 return bfqq;
3161}
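/*
 * Reference counting above: an async queue gets one reference for the
 * slot in the group's async table (released only when the scheduler
 * tears the group down) plus one reference for the caller; a sync queue
 * gets only the caller's reference.
 */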
3162
3163static void bfq_update_io_thinktime(struct bfq_data *bfqd,
3164 struct bfq_io_cq *bic)
3165{
3166 unsigned long elapsed = jiffies - bic->ttime.last_end_request;
3167 unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
3168
3169 bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
3170 bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
3171 bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) /
3172 bic->ttime.ttime_samples;
3173}
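/*
 * The update above is an exponentially weighted moving average with a
 * 7/8 decay factor, kept in fixed point: both ttime_samples and
 * ttime_total carry a scale factor of 256, which cancels out in the
 * final division
 *
 *   ttime_mean = (ttime_total + 128) / ttime_samples
 *
 * where the +128 merely rounds the result. Each new think time is
 * clamped to twice bfq_slice_idle, so one long pause cannot dominate
 * the mean.
 */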
3174
3175static void bfq_update_io_seektime(struct bfq_data *bfqd,
3176 struct bfq_queue *bfqq,
3177 struct request *rq)
3178{
3179 sector_t sdist;
3180 u64 total;
3181
3182 if (bfqq->last_request_pos < blk_rq_pos(rq))
3183 sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
3184 else
3185 sdist = bfqq->last_request_pos - blk_rq_pos(rq);
3186
3187 /*
3188 * Don't allow the seek distance to get too large from the
3189 * odd fragment, pagein, etc.
3190 */
3191 if (bfqq->seek_samples == 0) /* first request, not really a seek */
3192 sdist = 0;
3193 else if (bfqq->seek_samples <= 60) /* second & third seek */
3194 sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
3195 else
3196 sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
3197
3198 bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
3199 bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
3200 total = bfqq->seek_total + (bfqq->seek_samples/2);
3201 do_div(total, bfqq->seek_samples);
3202 bfqq->seek_mean = (sector_t)total;
3203
3204 bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
3205 (u64)bfqq->seek_mean);
3206}
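/*
 * Seek statistics follow the same fixed-point EWMA scheme as the think
 * time above: seek_samples and seek_total decay by 7/8 per request, and
 * seek_mean = seek_total / seek_samples, rounded by the +samples/2
 * term. The very first request is not counted as a seek, and later raw
 * distances are capped to roughly four times the current mean plus some
 * slack, so that a single huge seek (e.g. a far-away page-in) does not
 * distort the estimate.
 */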
3207
3208/*
3209 * Disable idle window if the process thinks too long or seeks so much that
3210 * it doesn't matter.
3211 */
3212static void bfq_update_idle_window(struct bfq_data *bfqd,
3213 struct bfq_queue *bfqq,
3214 struct bfq_io_cq *bic)
3215{
3216 int enable_idle;
3217
3218 /* Don't idle for async or idle io prio class. */
3219 if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
3220 return;
3221
Mauro Andreolini0b335882015-06-07 02:34:22 +02003222 /* Idle window just restored, statistics are meaningless. */
3223 if (bfq_bfqq_just_split(bfqq))
3224 return;
3225
Paolo Valente70f28712013-05-09 19:10:02 +02003226 enable_idle = bfq_bfqq_idle_window(bfqq);
3227
3228 if (atomic_read(&bic->icq.ioc->nr_tasks) == 0 ||
3229 bfqd->bfq_slice_idle == 0 ||
3230 (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
3231 bfqq->wr_coeff == 1))
3232 enable_idle = 0;
3233 else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
3234 if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
3235 bfqq->wr_coeff == 1)
3236 enable_idle = 0;
3237 else
3238 enable_idle = 1;
3239 }
3240 bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
3241 enable_idle);
3242
3243 if (enable_idle)
3244 bfq_mark_bfqq_idle_window(bfqq);
3245 else
3246 bfq_clear_bfqq_idle_window(bfqq);
3247}
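/*
 * Summary of the decision above: idling is disabled when no task is
 * attached to the queue's io_context any more, when slice idling is
 * administratively disabled (bfq_slice_idle == 0), or when the device
 * does internal queueing (hw_tag) and the queue is seeky and not being
 * weight-raised. If enough think-time samples exist, idling is also
 * disabled for queues whose mean think time exceeds bfq_slice_idle,
 * again unless the queue is weight-raised, in which case idling is kept
 * to preserve its latency guarantees.
 */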
3248
3249/*
3250 * Called when a new fs request (rq) is added to bfqq. Check if there's
3251 * something we should do about it.
3252 */
3253static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3254 struct request *rq)
3255{
3256 struct bfq_io_cq *bic = RQ_BIC(rq);
3257
3258 if (rq->cmd_flags & REQ_META)
3259 bfqq->meta_pending++;
3260
3261 bfq_update_io_thinktime(bfqd, bic);
3262 bfq_update_io_seektime(bfqd, bfqq, rq);
3263 if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) {
3264 bfq_clear_bfqq_constantly_seeky(bfqq);
3265 if (!blk_queue_nonrot(bfqd->queue)) {
3266 BUG_ON(!bfqd->const_seeky_busy_in_flight_queues);
3267 bfqd->const_seeky_busy_in_flight_queues--;
3268 }
3269 }
3270 if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
3271 !BFQQ_SEEKY(bfqq))
3272 bfq_update_idle_window(bfqd, bfqq, bic);
Mauro Andreolini0b335882015-06-07 02:34:22 +02003273 bfq_clear_bfqq_just_split(bfqq);
Paolo Valente70f28712013-05-09 19:10:02 +02003274
3275 bfq_log_bfqq(bfqd, bfqq,
3276 "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
3277 bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
3278 (long long unsigned)bfqq->seek_mean);
3279
3280 bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
3281
3282 if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
3283 int small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
3284 blk_rq_sectors(rq) < 32;
3285 int budget_timeout = bfq_bfqq_budget_timeout(bfqq);
3286
3287 /*
3288 * There is just this request queued: if the request
3289 * is small and the queue is not to be expired, then
3290 * just exit.
3291 *
3292 * In this way, if the disk is being idled to wait for
3293 * a new request from the in-service queue, we avoid
3294 * unplugging the device and committing the disk to serve
3295 * just a small request. On the contrary, we wait for
3296 * the block layer to decide when to unplug the device:
3297 * hopefully, new requests will be merged to this one
3298 * quickly, then the device will be unplugged and
3299 * larger requests will be dispatched.
3300 */
3301 if (small_req && !budget_timeout)
3302 return;
3303
3304 /*
3305 * A large enough request arrived, or the queue is to
3306 * be expired: in both cases disk idling is to be
3307 * stopped, so clear wait_request flag and reset
3308 * timer.
3309 */
3310 bfq_clear_bfqq_wait_request(bfqq);
3311 del_timer(&bfqd->idle_slice_timer);
3312
3313 /*
3314 * The queue is not empty, because a new request just
3315 * arrived. Hence we can safely expire the queue, in
3316 * case of budget timeout, without risking that the
3317 * timestamps of the queue are not updated correctly.
3318 * See [1] for more details.
3319 */
3320 if (budget_timeout)
3321 bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
3322
3323 /*
3324 * Let the request rip immediately, or let a new queue be
3325 * selected if bfqq has just been expired.
3326 */
3327 __blk_run_queue(bfqd->queue);
3328 }
3329}
3330
3331static void bfq_insert_request(struct request_queue *q, struct request *rq)
3332{
3333 struct bfq_data *bfqd = q->elevator->elevator_data;
Mauro Andreolini0b335882015-06-07 02:34:22 +02003334 struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
Paolo Valente70f28712013-05-09 19:10:02 +02003335
3336 assert_spin_locked(bfqd->queue->queue_lock);
3337
Mauro Andreolini0b335882015-06-07 02:34:22 +02003338 /*
3339 * An unplug may trigger a requeue of a request from the device
3340 * driver: make sure we are in process context while trying to
3341 * merge two bfq_queues.
3342 */
3343 if (!in_interrupt()) {
3344 new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
3345 if (new_bfqq != NULL) {
3346 if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
3347 new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
3348 /*
3349 * Release the request's reference to the old bfqq
3350 * and make sure one is taken to the shared queue.
3351 */
3352 new_bfqq->allocated[rq_data_dir(rq)]++;
3353 bfqq->allocated[rq_data_dir(rq)]--;
3354 atomic_inc(&new_bfqq->ref);
3355 bfq_put_queue(bfqq);
3356 if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
3357 bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
3358 bfqq, new_bfqq);
3359 rq->elv.priv[1] = new_bfqq;
3360 bfqq = new_bfqq;
3361 } else
3362 bfq_bfqq_increase_failed_cooperations(bfqq);
3363 }
3364
Paolo Valente70f28712013-05-09 19:10:02 +02003365 bfq_add_request(rq);
3366
Mauro Andreolini0b335882015-06-07 02:34:22 +02003367 /*
3368 * Here a newly-created bfq_queue has already started a weight-raising
3369 * period: clear raising_time_left to prevent bfq_bfqq_save_state()
3370 * from assigning it a full weight-raising period. See the detailed
3371 * comments about this field in bfq_init_icq().
3372 */
3373 if (bfqq->bic != NULL)
3374 bfqq->bic->wr_time_left = 0;
Paolo Valente70f28712013-05-09 19:10:02 +02003375 rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
3376 list_add_tail(&rq->queuelist, &bfqq->fifo);
3377
3378 bfq_rq_enqueued(bfqd, bfqq, rq);
3379}
3380
3381static void bfq_update_hw_tag(struct bfq_data *bfqd)
3382{
3383 bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
3384 bfqd->rq_in_driver);
3385
3386 if (bfqd->hw_tag == 1)
3387 return;
3388
3389 /*
3390	 * This sample is valid only if the number of outstanding requests
3391	 * is large enough to allow queueing behavior. Note that the sum
3392	 * is not exact, as it does not take deactivated requests into
3393	 * account.
3394 */
3395 if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
3396 return;
3397
3398 if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
3399 return;
3400
3401 bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
3402 bfqd->max_rq_in_driver = 0;
3403 bfqd->hw_tag_samples = 0;
3404}
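/*
 * hw_tag detection above: samples are taken only when enough requests
 * are outstanding (rq_in_driver + queued) for internal queueing to be
 * observable; after BFQ_HW_QUEUE_SAMPLES such samples, the device is
 * flagged as doing internal command queueing (e.g. NCQ) when the
 * maximum number of requests observed in the driver exceeds
 * BFQ_HW_QUEUE_THRESHOLD, and the counters are reset for the next
 * round.
 */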
3405
3406static void bfq_completed_request(struct request_queue *q, struct request *rq)
3407{
3408 struct bfq_queue *bfqq = RQ_BFQQ(rq);
3409 struct bfq_data *bfqd = bfqq->bfqd;
3410 bool sync = bfq_bfqq_sync(bfqq);
3411
3412 bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)",
3413 blk_rq_sectors(rq), sync);
3414
3415 bfq_update_hw_tag(bfqd);
3416
3417 BUG_ON(!bfqd->rq_in_driver);
3418 BUG_ON(!bfqq->dispatched);
3419 bfqd->rq_in_driver--;
3420 bfqq->dispatched--;
3421
3422 if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
3423 bfq_weights_tree_remove(bfqd, &bfqq->entity,
3424 &bfqd->queue_weights_tree);
3425 if (!blk_queue_nonrot(bfqd->queue)) {
3426 BUG_ON(!bfqd->busy_in_flight_queues);
3427 bfqd->busy_in_flight_queues--;
3428 if (bfq_bfqq_constantly_seeky(bfqq)) {
3429 BUG_ON(!bfqd->
3430 const_seeky_busy_in_flight_queues);
3431 bfqd->const_seeky_busy_in_flight_queues--;
3432 }
3433 }
3434 }
3435
3436 if (sync) {
3437 bfqd->sync_flight--;
3438 RQ_BIC(rq)->ttime.last_end_request = jiffies;
3439 }
3440
3441 /*
3442 * If we are waiting to discover whether the request pattern of the
3443 * task associated with the queue is actually isochronous, and
3444	 * both requirements for this condition to hold are satisfied, then
3445 * compute soft_rt_next_start (see the comments to the function
3446 * bfq_bfqq_softrt_next_start()).
3447 */
3448 if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
3449 RB_EMPTY_ROOT(&bfqq->sort_list))
3450 bfqq->soft_rt_next_start =
3451 bfq_bfqq_softrt_next_start(bfqd, bfqq);
3452
3453 /*
3454 * If this is the in-service queue, check if it needs to be expired,
3455 * or if we want to idle in case it has no pending requests.
3456 */
3457 if (bfqd->in_service_queue == bfqq) {
3458 if (bfq_bfqq_budget_new(bfqq))
3459 bfq_set_budget_timeout(bfqd);
3460
3461 if (bfq_bfqq_must_idle(bfqq)) {
3462 bfq_arm_slice_timer(bfqd);
3463 goto out;
3464 } else if (bfq_may_expire_for_budg_timeout(bfqq))
3465 bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
3466 else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
3467 (bfqq->dispatched == 0 ||
3468 !bfq_bfqq_must_not_expire(bfqq)))
3469 bfq_bfqq_expire(bfqd, bfqq, 0,
3470 BFQ_BFQQ_NO_MORE_REQUESTS);
3471 }
3472
3473 if (!bfqd->rq_in_driver)
3474 bfq_schedule_dispatch(bfqd);
3475
3476out:
3477 return;
3478}
3479
3480static inline int __bfq_may_queue(struct bfq_queue *bfqq)
3481{
3482 if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
3483 bfq_clear_bfqq_must_alloc(bfqq);
3484 return ELV_MQUEUE_MUST;
3485 }
3486
3487 return ELV_MQUEUE_MAY;
3488}
3489
3490static int bfq_may_queue(struct request_queue *q, int rw)
3491{
3492 struct bfq_data *bfqd = q->elevator->elevator_data;
3493 struct task_struct *tsk = current;
3494 struct bfq_io_cq *bic;
3495 struct bfq_queue *bfqq;
3496
3497 /*
3498 * Don't force setup of a queue from here, as a call to may_queue
3499 * does not necessarily imply that a request actually will be
3500 * queued. So just lookup a possibly existing queue, or return
3501 * 'may queue' if that fails.
3502 */
3503 bic = bfq_bic_lookup(bfqd, tsk->io_context);
3504 if (bic == NULL)
3505 return ELV_MQUEUE_MAY;
3506
3507 bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
3508 if (bfqq != NULL)
3509 return __bfq_may_queue(bfqq);
3510
3511 return ELV_MQUEUE_MAY;
3512}
3513
3514/*
3515 * Queue lock held here.
3516 */
3517static void bfq_put_request(struct request *rq)
3518{
3519 struct bfq_queue *bfqq = RQ_BFQQ(rq);
3520
3521 if (bfqq != NULL) {
3522 const int rw = rq_data_dir(rq);
3523
3524 BUG_ON(!bfqq->allocated[rw]);
3525 bfqq->allocated[rw]--;
3526
3527 rq->elv.priv[0] = NULL;
3528 rq->elv.priv[1] = NULL;
3529
3530 bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
3531 bfqq, atomic_read(&bfqq->ref));
3532 bfq_put_queue(bfqq);
3533 }
3534}
3535
Paolo Valente70f28712013-05-09 19:10:02 +02003536/*
3537 * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
3538 * was the last process referring to said bfqq.
3539 */
3540static struct bfq_queue *
3541bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
3542{
3543 bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
Mauro Andreolini0b335882015-06-07 02:34:22 +02003544
3545 put_io_context(bic->icq.ioc);
3546
Paolo Valente70f28712013-05-09 19:10:02 +02003547 if (bfqq_process_refs(bfqq) == 1) {
3548 bfqq->pid = current->pid;
3549 bfq_clear_bfqq_coop(bfqq);
3550 bfq_clear_bfqq_split_coop(bfqq);
3551 return bfqq;
3552 }
3553
3554 bic_set_bfqq(bic, NULL, 1);
3555
3556 bfq_put_cooperator(bfqq);
3557
3558 bfq_put_queue(bfqq);
3559 return NULL;
3560}
3561
3562/*
3563 * Allocate bfq data structures associated with this request.
3564 */
3565static int bfq_set_request(struct request_queue *q, struct request *rq,
3566 gfp_t gfp_mask)
3567{
3568 struct bfq_data *bfqd = q->elevator->elevator_data;
3569 struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
3570 const int rw = rq_data_dir(rq);
3571 const int is_sync = rq_is_sync(rq);
3572 struct bfq_queue *bfqq;
3573 struct bfq_group *bfqg;
3574 unsigned long flags;
Mauro Andreolini0b335882015-06-07 02:34:22 +02003575 bool split = false;
Paolo Valente70f28712013-05-09 19:10:02 +02003576
3577 /* handle changed prio notifications; cgroup change is handled separately */
3578 if (unlikely(icq_get_changed(&bic->icq) & ICQ_IOPRIO_CHANGED))
3579 bfq_check_ioprio_change(bic->icq.ioc, bic);
3580
3581 might_sleep_if(gfp_mask & __GFP_WAIT);
3582
3583 spin_lock_irqsave(q->queue_lock, flags);
3584
3585 if (bic == NULL)
3586 goto queue_fail;
3587
3588 bfqg = bfq_bic_update_cgroup(bic);
3589
3590new_queue:
3591 bfqq = bic_to_bfqq(bic, is_sync);
3592 if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
3593 bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic->icq.ioc, gfp_mask);
3594 bic_set_bfqq(bic, bfqq, is_sync);
Mauro Andreolini0b335882015-06-07 02:34:22 +02003595 if (split && is_sync) {
3596 if ((bic->was_in_burst_list && bfqd->large_burst) ||
3597 bic->saved_in_large_burst)
3598 bfq_mark_bfqq_in_large_burst(bfqq);
3599 else {
3600 bfq_clear_bfqq_in_large_burst(bfqq);
3601 if (bic->was_in_burst_list)
3602 hlist_add_head(&bfqq->burst_list_node,
3603 &bfqd->burst_list);
3604 }
3605 }
Paolo Valente70f28712013-05-09 19:10:02 +02003606 } else {
Mauro Andreolini0b335882015-06-07 02:34:22 +02003607 /* If the queue was seeky for too long, break it apart. */
Paolo Valente70f28712013-05-09 19:10:02 +02003608 if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
3609 bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
3610 bfqq = bfq_split_bfqq(bic, bfqq);
Mauro Andreolini0b335882015-06-07 02:34:22 +02003611 split = true;
Paolo Valente70f28712013-05-09 19:10:02 +02003612 if (!bfqq)
3613 goto new_queue;
3614 }
Paolo Valente70f28712013-05-09 19:10:02 +02003615 }
3616
3617 bfqq->allocated[rw]++;
3618 atomic_inc(&bfqq->ref);
3619 bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
3620 atomic_read(&bfqq->ref));
3621
3622 rq->elv.priv[0] = bic;
3623 rq->elv.priv[1] = bfqq;
3624
Mauro Andreolini0b335882015-06-07 02:34:22 +02003625 /*
3626 * If a bfq_queue has only one process reference, it is owned
3627 * by only one bfq_io_cq: we can set the bic field of the
3628 * bfq_queue to the address of that structure. Also, if the
3629 * queue has just been split, mark a flag so that the
3630 * information is available to the other scheduler hooks.
3631 */
3632 if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {
3633 bfqq->bic = bic;
3634 if (split) {
3635 bfq_mark_bfqq_just_split(bfqq);
3636 /*
3637 * If the queue has just been split from a shared
3638 * queue, restore the idle window and the possible
3639 * weight raising period.
3640 */
3641 bfq_bfqq_resume_state(bfqq, bic);
3642 }
3643 }
3644
Paolo Valente70f28712013-05-09 19:10:02 +02003645 spin_unlock_irqrestore(q->queue_lock, flags);
3646
3647 return 0;
3648
3649queue_fail:
3650 bfq_schedule_dispatch(bfqd);
3651 spin_unlock_irqrestore(q->queue_lock, flags);
3652
3653 return 1;
3654}
3655
3656static void bfq_kick_queue(struct work_struct *work)
3657{
3658 struct bfq_data *bfqd =
3659 container_of(work, struct bfq_data, unplug_work);
3660 struct request_queue *q = bfqd->queue;
3661
3662 spin_lock_irq(q->queue_lock);
3663 __blk_run_queue(q);
3664 spin_unlock_irq(q->queue_lock);
3665}
3666
3667/*
3668 * Handler of the expiration of the timer running if the in-service queue
3669 * is idling inside its time slice.
3670 */
3671static void bfq_idle_slice_timer(unsigned long data)
3672{
3673 struct bfq_data *bfqd = (struct bfq_data *)data;
3674 struct bfq_queue *bfqq;
3675 unsigned long flags;
3676 enum bfqq_expiration reason;
3677
3678 spin_lock_irqsave(bfqd->queue->queue_lock, flags);
3679
3680 bfqq = bfqd->in_service_queue;
3681 /*
3682 * Theoretical race here: the in-service queue can be NULL or
3683 * different from the queue that was idling if the timer handler
3684 * spins on the queue_lock and a new request arrives for the
3685 * current queue and there is a full dispatch cycle that changes
3686 * the in-service queue. This can hardly happen, but in the worst
3687 * case we just expire a queue too early.
3688 */
3689 if (bfqq != NULL) {
3690 bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
3691 if (bfq_bfqq_budget_timeout(bfqq))
3692 /*
3693 * Also here the queue can be safely expired
3694 * for budget timeout without wasting
3695 * guarantees
3696 */
3697 reason = BFQ_BFQQ_BUDGET_TIMEOUT;
3698 else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
3699 /*
3700 * The queue may not be empty upon timer expiration,
3701 * because we may not disable the timer when the
3702 * first request of the in-service queue arrives
3703 * during disk idling.
3704 */
3705 reason = BFQ_BFQQ_TOO_IDLE;
3706 else
3707 goto schedule_dispatch;
3708
3709 bfq_bfqq_expire(bfqd, bfqq, 1, reason);
3710 }
3711
3712schedule_dispatch:
3713 bfq_schedule_dispatch(bfqd);
3714
3715 spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
3716}
3717
3718static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
3719{
3720 del_timer_sync(&bfqd->idle_slice_timer);
3721 cancel_work_sync(&bfqd->unplug_work);
3722}
3723
3724static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
3725 struct bfq_queue **bfqq_ptr)
3726{
3727 struct bfq_group *root_group = bfqd->root_group;
3728 struct bfq_queue *bfqq = *bfqq_ptr;
3729
3730 bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
3731 if (bfqq != NULL) {
3732 bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
3733 bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
3734 bfqq, atomic_read(&bfqq->ref));
3735 bfq_put_queue(bfqq);
3736 *bfqq_ptr = NULL;
3737 }
3738}
3739
3740/*
3741 * Release all the bfqg references to its async queues. If we are
3742 * deallocating the group these queues may still contain requests, so
3743 * we reparent them to the root cgroup (i.e., the only one that will
3744 * exist for sure until all the requests on a device are gone).
3745 */
3746static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
3747{
3748 int i, j;
3749
3750 for (i = 0; i < 2; i++)
3751 for (j = 0; j < IOPRIO_BE_NR; j++)
3752 __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
3753
3754 __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
3755}
3756
3757static void bfq_exit_queue(struct elevator_queue *e)
3758{
3759 struct bfq_data *bfqd = e->elevator_data;
3760 struct request_queue *q = bfqd->queue;
3761 struct bfq_queue *bfqq, *n;
3762
3763 bfq_shutdown_timer_wq(bfqd);
3764
3765 spin_lock_irq(q->queue_lock);
3766
3767 BUG_ON(bfqd->in_service_queue != NULL);
3768 list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
3769 bfq_deactivate_bfqq(bfqd, bfqq, 0);
3770
3771 bfq_disconnect_groups(bfqd);
3772 spin_unlock_irq(q->queue_lock);
3773
3774 bfq_shutdown_timer_wq(bfqd);
3775
3776 synchronize_rcu();
3777
3778 BUG_ON(timer_pending(&bfqd->idle_slice_timer));
3779
3780 bfq_free_root_group(bfqd);
3781 kfree(bfqd);
3782}
3783
3784static void *bfq_init_queue(struct request_queue *q)
3785{
3786 struct bfq_group *bfqg;
3787 struct bfq_data *bfqd;
3788
3789 bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
3790 if (bfqd == NULL)
3791 return NULL;
3792
3793 /*
3794 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
3795 * Grab a permanent reference to it, so that the normal code flow
3796 * will not attempt to free it.
3797 */
3798 bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
3799 atomic_inc(&bfqd->oom_bfqq.ref);
3800 bfqd->oom_bfqq.entity.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
3801 bfqd->oom_bfqq.entity.new_ioprio_class = IOPRIO_CLASS_BE;
3802 bfqd->oom_bfqq.entity.new_weight =
3803 bfq_ioprio_to_weight(bfqd->oom_bfqq.entity.new_ioprio);
3804 /*
3805 * Trigger weight initialization, according to ioprio, at the
3806 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
3807 * class won't be changed any more.
3808 */
3809 bfqd->oom_bfqq.entity.ioprio_changed = 1;
3810
3811 bfqd->queue = q;
3812
3813 bfqg = bfq_alloc_root_group(bfqd, q->node);
3814 if (bfqg == NULL) {
3815 kfree(bfqd);
3816 return NULL;
3817 }
3818
3819 bfqd->root_group = bfqg;
3820 bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
3821#ifdef CONFIG_CGROUP_BFQIO
3822 bfqd->active_numerous_groups = 0;
3823#endif
3824
3825 init_timer(&bfqd->idle_slice_timer);
3826 bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
3827 bfqd->idle_slice_timer.data = (unsigned long)bfqd;
3828
3829 bfqd->rq_pos_tree = RB_ROOT;
3830 bfqd->queue_weights_tree = RB_ROOT;
3831 bfqd->group_weights_tree = RB_ROOT;
3832
3833 INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
3834
3835 INIT_LIST_HEAD(&bfqd->active_list);
3836 INIT_LIST_HEAD(&bfqd->idle_list);
3837 INIT_HLIST_HEAD(&bfqd->burst_list);
3838
3839 bfqd->hw_tag = -1;
3840
3841 bfqd->bfq_max_budget = bfq_default_max_budget;
3842
3843 bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
3844 bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
3845 bfqd->bfq_back_max = bfq_back_max;
3846 bfqd->bfq_back_penalty = bfq_back_penalty;
3847 bfqd->bfq_slice_idle = bfq_slice_idle;
3848 bfqd->bfq_class_idle_last_service = 0;
3849 bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
3850 bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
3851 bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
3852
3853 bfqd->bfq_coop_thresh = 2;
3854 bfqd->bfq_failed_cooperations = 7000;
3855 bfqd->bfq_requests_within_timer = 120;
3856
3857 bfqd->bfq_large_burst_thresh = 11;
3858 bfqd->bfq_burst_interval = msecs_to_jiffies(500);
3859
3860 bfqd->low_latency = true;
3861
3862 bfqd->bfq_wr_coeff = 20;
3863 bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);
3864 bfqd->bfq_wr_max_time = 0;
3865 bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
3866 bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);
3867 bfqd->bfq_wr_max_softrt_rate = 7000; /*
3868 * Approximate rate required
3869 * to playback or record a
3870 * high-definition compressed
3871 * video.
3872 */
3873 bfqd->wr_busy_queues = 0;
3874 bfqd->busy_in_flight_queues = 0;
3875 bfqd->const_seeky_busy_in_flight_queues = 0;
3876
3877 /*
3878 * Begin by assuming, optimistically, that the device peak rate is
3879 * equal to the highest reference rate.
3880 */
3881 bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
3882 T_fast[blk_queue_nonrot(bfqd->queue)];
3883 bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)];
3884 bfqd->device_speed = BFQ_BFQD_FAST;
3885
3886 return bfqd;
3887}
3888
3889static void bfq_slab_kill(void)
3890{
3891 if (bfq_pool != NULL)
3892 kmem_cache_destroy(bfq_pool);
3893}
3894
3895static int __init bfq_slab_setup(void)
3896{
3897 bfq_pool = KMEM_CACHE(bfq_queue, 0);
3898 if (bfq_pool == NULL)
3899 return -ENOMEM;
3900 return 0;
3901}
3902
3903static ssize_t bfq_var_show(unsigned int var, char *page)
3904{
3905 return sprintf(page, "%d\n", var);
3906}
3907
3908static ssize_t bfq_var_store(unsigned long *var, const char *page,
3909 size_t count)
3910{
3911 unsigned long new_val;
3912 int ret = kstrtoul(page, 10, &new_val);
3913
3914 if (ret == 0)
3915 *var = new_val;
3916
3917 return count;
3918}
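/*
 * Note: if kstrtoul() fails, *var is simply left unchanged and the full
 * count is still returned, so malformed sysfs writes are silently
 * ignored rather than reported as errors.
 */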
3919
3920static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page)
3921{
3922 struct bfq_data *bfqd = e->elevator_data;
3923 return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ?
3924 jiffies_to_msecs(bfqd->bfq_wr_max_time) :
3925 jiffies_to_msecs(bfq_wr_duration(bfqd)));
3926}
3927
3928static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
3929{
3930 struct bfq_queue *bfqq;
3931 struct bfq_data *bfqd = e->elevator_data;
3932 ssize_t num_char = 0;
3933
3934 num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
3935 bfqd->queued);
3936
3937 spin_lock_irq(bfqd->queue->queue_lock);
3938
3939 num_char += sprintf(page + num_char, "Active:\n");
3940 list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
3941 num_char += sprintf(page + num_char,
3942 "pid%d: weight %hu, nr_queued %d %d, dur %d/%u\n",
3943 bfqq->pid,
3944 bfqq->entity.weight,
3945 bfqq->queued[0],
3946 bfqq->queued[1],
3947 jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
3948 jiffies_to_msecs(bfqq->wr_cur_max_time));
3949 }
3950
3951 num_char += sprintf(page + num_char, "Idle:\n");
3952 list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
3953 num_char += sprintf(page + num_char,
3954 "pid%d: weight %hu, dur %d/%u\n",
3955 bfqq->pid,
3956 bfqq->entity.weight,
3957 jiffies_to_msecs(jiffies -
3958 bfqq->last_wr_start_finish),
3959 jiffies_to_msecs(bfqq->wr_cur_max_time));
3960 }
3961
3962 spin_unlock_irq(bfqd->queue->queue_lock);
3963
3964 return num_char;
3965}
3966
3967#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
3968static ssize_t __FUNC(struct elevator_queue *e, char *page) \
3969{ \
3970 struct bfq_data *bfqd = e->elevator_data; \
3971 unsigned int __data = __VAR; \
3972 if (__CONV) \
3973 __data = jiffies_to_msecs(__data); \
3974 return bfq_var_show(__data, (page)); \
3975}
3976SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
3977SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
3978SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
3979SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
3980SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
3981SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
3982SHOW_FUNCTION(bfq_max_budget_async_rq_show,
3983 bfqd->bfq_max_budget_async_rq, 0);
3984SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
3985SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
3986SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
3987SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0);
3988SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1);
3989SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1);
3990SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async,
3991 1);
3992SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0);
3993#undef SHOW_FUNCTION
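/*
 * Each SHOW_FUNCTION() instance above backs one sysfs attribute under
 * /sys/block/<dev>/queue/iosched/. For example, reading slice_idle
 * invokes bfq_slice_idle_show(), which converts bfqd->bfq_slice_idle
 * from jiffies to milliseconds (because __CONV is 1) before printing it
 * with bfq_var_show().
 */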
3994
3995#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
3996static ssize_t \
3997__FUNC(struct elevator_queue *e, const char *page, size_t count) \
3998{ \
3999 struct bfq_data *bfqd = e->elevator_data; \
4000 unsigned long uninitialized_var(__data); \
4001 int ret = bfq_var_store(&__data, (page), count); \
4002 if (__data < (MIN)) \
4003 __data = (MIN); \
4004 else if (__data > (MAX)) \
4005 __data = (MAX); \
4006 if (__CONV) \
4007 *(__PTR) = msecs_to_jiffies(__data); \
4008 else \
4009 *(__PTR) = __data; \
4010 return ret; \
4011}
4012STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
4013 INT_MAX, 1);
4014STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
4015 INT_MAX, 1);
4016STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
4017STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
4018 INT_MAX, 0);
4019STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
4020STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
4021 1, INT_MAX, 0);
4022STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
4023 INT_MAX, 1);
4024STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0);
4025STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1);
4026STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX,
4027 1);
4028STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0,
4029 INT_MAX, 1);
4030STORE_FUNCTION(bfq_wr_min_inter_arr_async_store,
4031 &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1);
4032STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0,
4033 INT_MAX, 0);
4034#undef STORE_FUNCTION
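/*
 * The STORE_FUNCTION() instances are the writable counterparts: the
 * value written is parsed with bfq_var_store(), clamped to [MIN, MAX]
 * and, when __CONV is 1, interpreted as milliseconds and converted to
 * jiffies. Writing "8" to slice_idle, for instance, sets
 * bfqd->bfq_slice_idle to msecs_to_jiffies(8).
 */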
4035
4036/* do nothing for the moment */
4037static ssize_t bfq_weights_store(struct elevator_queue *e,
4038 const char *page, size_t count)
4039{
4040 return count;
4041}
4042
4043static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
4044{
4045 u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
4046
4047 if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
4048 return bfq_calc_max_budget(bfqd->peak_rate, timeout);
4049 else
4050 return bfq_default_max_budget;
4051}
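/*
 * In other words: once enough peak-rate samples have been collected,
 * the maximum budget is sized so that a queue can consume it within the
 * sync timeout at the estimated peak rate; until then the conservative
 * compile-time default is used.
 */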
4052
4053static ssize_t bfq_max_budget_store(struct elevator_queue *e,
4054 const char *page, size_t count)
4055{
4056 struct bfq_data *bfqd = e->elevator_data;
4057 unsigned long uninitialized_var(__data);
4058 int ret = bfq_var_store(&__data, (page), count);
4059
4060 if (__data == 0)
4061 bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4062 else {
4063 if (__data > INT_MAX)
4064 __data = INT_MAX;
4065 bfqd->bfq_max_budget = __data;
4066 }
4067
4068 bfqd->bfq_user_max_budget = __data;
4069
4070 return ret;
4071}
4072
4073static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
4074 const char *page, size_t count)
4075{
4076 struct bfq_data *bfqd = e->elevator_data;
4077 unsigned long uninitialized_var(__data);
4078 int ret = bfq_var_store(&__data, (page), count);
4079
4080 if (__data < 1)
4081 __data = 1;
4082 else if (__data > INT_MAX)
4083 __data = INT_MAX;
4084
4085 bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
4086 if (bfqd->bfq_user_max_budget == 0)
4087 bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4088
4089 return ret;
4090}
4091
4092static ssize_t bfq_low_latency_store(struct elevator_queue *e,
4093 const char *page, size_t count)
4094{
4095 struct bfq_data *bfqd = e->elevator_data;
4096 unsigned long uninitialized_var(__data);
4097 int ret = bfq_var_store(&__data, (page), count);
4098
4099 if (__data > 1)
4100 __data = 1;
4101 if (__data == 0 && bfqd->low_latency != 0)
4102 bfq_end_wr(bfqd);
4103 bfqd->low_latency = __data;
4104
4105 return ret;
4106}
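/*
 * When low_latency is switched off, the store hook above calls
 * bfq_end_wr() so that any ongoing weight-raising periods are
 * terminated immediately, instead of lingering until their natural
 * expiration while the feature is disabled.
 */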
4107
4108#define BFQ_ATTR(name) \
4109 __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
4110
4111static struct elv_fs_entry bfq_attrs[] = {
4112 BFQ_ATTR(fifo_expire_sync),
4113 BFQ_ATTR(fifo_expire_async),
4114 BFQ_ATTR(back_seek_max),
4115 BFQ_ATTR(back_seek_penalty),
4116 BFQ_ATTR(slice_idle),
4117 BFQ_ATTR(max_budget),
4118 BFQ_ATTR(max_budget_async_rq),
4119 BFQ_ATTR(timeout_sync),
4120 BFQ_ATTR(timeout_async),
4121 BFQ_ATTR(low_latency),
4122 BFQ_ATTR(wr_coeff),
4123 BFQ_ATTR(wr_max_time),
4124 BFQ_ATTR(wr_rt_max_time),
4125 BFQ_ATTR(wr_min_idle_time),
4126 BFQ_ATTR(wr_min_inter_arr_async),
4127 BFQ_ATTR(wr_max_softrt_rate),
4128 BFQ_ATTR(weights),
4129 __ATTR_NULL
4130};
4131
4132static struct elevator_type iosched_bfq = {
4133 .ops = {
4134 .elevator_merge_fn = bfq_merge,
4135 .elevator_merged_fn = bfq_merged_request,
4136 .elevator_merge_req_fn = bfq_merged_requests,
4137 .elevator_allow_merge_fn = bfq_allow_merge,
4138 .elevator_dispatch_fn = bfq_dispatch_requests,
4139 .elevator_add_req_fn = bfq_insert_request,
4140 .elevator_activate_req_fn = bfq_activate_request,
4141 .elevator_deactivate_req_fn = bfq_deactivate_request,
4142 .elevator_completed_req_fn = bfq_completed_request,
4143 .elevator_former_req_fn = elv_rb_former_request,
4144 .elevator_latter_req_fn = elv_rb_latter_request,
4145 .elevator_init_icq_fn = bfq_init_icq,
4146 .elevator_exit_icq_fn = bfq_exit_icq,
4147 .elevator_set_req_fn = bfq_set_request,
4148 .elevator_put_req_fn = bfq_put_request,
4149 .elevator_may_queue_fn = bfq_may_queue,
4150 .elevator_init_fn = bfq_init_queue,
4151 .elevator_exit_fn = bfq_exit_queue,
4152 },
4153 .icq_size = sizeof(struct bfq_io_cq),
4154 .icq_align = __alignof__(struct bfq_io_cq),
4155 .elevator_attrs = bfq_attrs,
4156 .elevator_name = "bfq",
4157 .elevator_owner = THIS_MODULE,
4158};
4159
4160static int __init bfq_init(void)
4161{
4162 /*
4163 * Can be 0 on HZ < 1000 setups.
4164 */
4165 if (bfq_slice_idle == 0)
4166 bfq_slice_idle = 1;
4167
4168 if (bfq_timeout_async == 0)
4169 bfq_timeout_async = 1;
4170
4171 if (bfq_slab_setup())
4172 return -ENOMEM;
4173
4174 /*
4175 * Times to load large popular applications for the typical systems
4176 * installed on the reference devices (see the comments before the
4177 * definitions of the two arrays).
4178 */
4179 T_slow[0] = msecs_to_jiffies(2600);
4180 T_slow[1] = msecs_to_jiffies(1000);
4181 T_fast[0] = msecs_to_jiffies(5500);
4182 T_fast[1] = msecs_to_jiffies(2000);
4183
4184 /*
4185 * Thresholds that determine the switch between speed classes (see
4186 * the comments before the definition of the array).
4187 */
4188 device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2;
4189 device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2;
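	/*
	 * Each threshold is the midpoint between the slow and fast
	 * reference rates for the corresponding device class: index 0
	 * for rotational devices, index 1 for non-rotational ones,
	 * matching the blk_queue_nonrot() indexing used in
	 * bfq_init_queue().
	 */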
4190
4191 elv_register(&iosched_bfq);
4192 pr_info("BFQ I/O-scheduler: v7r8");
4193
4194 return 0;
4195}
4196
4197static void __exit bfq_exit(void)
4198{
4199 elv_unregister(&iosched_bfq);
4200 bfq_slab_kill();
4201}
4202
4203module_init(bfq_init);
4204module_exit(bfq_exit);
4205
4206MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
4207MODULE_LICENSE("GPL");