/*
 * Interface for controlling IO bandwidth on a request queue
 *
 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/blktrace_api.h>
#include "blk-cgroup.h"
#include "blk.h"

/* Max dispatch from a group in 1 round */
static int throtl_grp_quantum = 8;

/* Total max dispatch from all groups in one round */
static int throtl_quantum = 32;

/* Throttling is performed over 100ms slice and after that slice is renewed */
static unsigned long throtl_slice = HZ/10;	/* 100 ms */

/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;
static void throtl_schedule_delayed_work(struct throtl_data *td,
				unsigned long delay);

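/*
 * Per-queue service tree of active (i.e. throttled, bio-holding) groups,
 * ordered by disptime. ->left caches the leftmost node so the next group
 * due for dispatch can be found without walking the rbtree, and
 * ->min_disptime mirrors that group's dispatch time for the work timer.
 */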
struct throtl_rb_root {
	struct rb_root rb;
	struct rb_node *left;
	unsigned int count;
	unsigned long min_disptime;
};

#define THROTL_RB_ROOT	(struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \
			.count = 0, .min_disptime = 0}

#define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)

struct throtl_grp {
	/* List of throtl groups on the request queue */
	struct hlist_node tg_node;

	/* active throtl group service_tree member */
	struct rb_node rb_node;

	/*
	 * Dispatch time in jiffies. This is the estimated time when the group
	 * will unthrottle and is ready to dispatch more bios. It is used as a
	 * key to sort active groups in the service tree.
	 */
	unsigned long disptime;

	struct blkio_group blkg;
	atomic_t ref;
	unsigned int flags;

	/* Two lists for READ and WRITE */
	struct bio_list bio_lists[2];

	/* Number of queued bios on READ and WRITE lists */
	unsigned int nr_queued[2];

	/* bytes per second rate limits */
	uint64_t bps[2];

	/* IOPS limits */
	unsigned int iops[2];

	/* Number of bytes dispatched in current slice */
	uint64_t bytes_disp[2];
	/* Number of bios dispatched in current slice */
	unsigned int io_disp[2];

	/* When did we start a new slice */
	unsigned long slice_start[2];
	unsigned long slice_end[2];

	/* Some throttle limits got updated for the group */
	int limits_changed;

	struct rcu_head rcu_head;
};

struct throtl_data
{
	/* List of throtl groups */
	struct hlist_head tg_list;

	/* service tree for active throtl groups */
	struct throtl_rb_root tg_service_tree;

	struct throtl_grp *root_tg;
	struct request_queue *queue;

	/* Total number of queued bios on READ and WRITE lists */
	unsigned int nr_queued[2];

	/*
	 * number of total undestroyed groups
	 */
	unsigned int nr_undestroyed_grps;

	/* Work for dispatching throttled bios */
	struct delayed_work throtl_work;

	int limits_changed;
};

enum tg_state_flags {
	THROTL_TG_FLAG_on_rr = 0,	/* on round-robin busy list */
};

#define THROTL_TG_FNS(name)						\
static inline void throtl_mark_tg_##name(struct throtl_grp *tg)	\
{									\
	(tg)->flags |= (1 << THROTL_TG_FLAG_##name);			\
}									\
static inline void throtl_clear_tg_##name(struct throtl_grp *tg)	\
{									\
	(tg)->flags &= ~(1 << THROTL_TG_FLAG_##name);			\
}									\
static inline int throtl_tg_##name(const struct throtl_grp *tg)	\
{									\
	return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0;	\
}

THROTL_TG_FNS(on_rr);
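
/*
 * The THROTL_TG_FNS(on_rr) expansion above provides throtl_mark_tg_on_rr(),
 * throtl_clear_tg_on_rr() and throtl_tg_on_rr(), which the enqueue/dequeue
 * helpers further down use to track whether a group is on the service tree.
 */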

#define throtl_log_tg(td, tg, fmt, args...)				\
	blk_add_trace_msg((td)->queue, "throtl %s " fmt,		\
			blkg_path(&(tg)->blkg), ##args);		\

#define throtl_log(td, fmt, args...)	\
	blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)

static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg)
{
	if (blkg)
		return container_of(blkg, struct throtl_grp, blkg);

	return NULL;
}

static inline unsigned int total_nr_queued(struct throtl_data *td)
{
	return td->nr_queued[0] + td->nr_queued[1];
}

static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
{
	atomic_inc(&tg->ref);
	return tg;
}

static void throtl_free_tg(struct rcu_head *head)
{
	struct throtl_grp *tg;

	tg = container_of(head, struct throtl_grp, rcu_head);
	free_percpu(tg->blkg.stats_cpu);
	kfree(tg);
}
165
Vivek Goyale43473b2010-09-15 17:06:35 -0400166static void throtl_put_tg(struct throtl_grp *tg)
167{
168 BUG_ON(atomic_read(&tg->ref) <= 0);
169 if (!atomic_dec_and_test(&tg->ref))
170 return;
Vivek Goyal4843c692011-05-19 15:38:27 -0400171
172 /*
173 * A group is freed in rcu manner. But having an rcu lock does not
174 * mean that one can access all the fields of blkg and assume these
175 * are valid. For example, don't try to follow throtl_data and
176 * request queue links.
177 *
178 * Having a reference to blkg under an rcu allows acess to only
179 * values local to groups like group stats and group rate limits
180 */
181 call_rcu(&tg->rcu_head, throtl_free_tg);
Vivek Goyale43473b2010-09-15 17:06:35 -0400182}

static struct blkio_group *throtl_alloc_blkio_group(struct request_queue *q,
						     struct blkio_cgroup *blkcg)
{
	struct throtl_grp *tg;

	tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, q->node);
	if (!tg)
		return NULL;

	INIT_HLIST_NODE(&tg->tg_node);
	RB_CLEAR_NODE(&tg->rb_node);
	bio_list_init(&tg->bio_lists[0]);
	bio_list_init(&tg->bio_lists[1]);
	tg->limits_changed = false;

	tg->bps[READ] = -1;
	tg->bps[WRITE] = -1;
	tg->iops[READ] = -1;
	tg->iops[WRITE] = -1;

	/*
	 * Take the initial reference that will be released on destroy.
	 * This can be thought of as a joint reference by cgroup and
	 * request queue which will be dropped by either request queue
	 * exit or cgroup deletion path, depending on who is exiting first.
	 */
	atomic_set(&tg->ref, 1);

	return &tg->blkg;
}

static void throtl_link_blkio_group(struct request_queue *q,
				    struct blkio_group *blkg)
{
	struct throtl_data *td = q->td;
	struct throtl_grp *tg = tg_of_blkg(blkg);

	hlist_add_head(&tg->tg_node, &td->tg_list);
	td->nr_undestroyed_grps++;
}

static struct
throtl_grp *throtl_lookup_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
{
	/*
	 * This is the common case when there are no blkio cgroups.
	 * Avoid lookup in this case
	 */
	if (blkcg == &blkio_root_cgroup)
		return td->root_tg;

	return tg_of_blkg(blkg_lookup(blkcg, td->queue, BLKIO_POLICY_THROTL));
}

static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
						  struct blkio_cgroup *blkcg)
{
	struct request_queue *q = td->queue;
	struct throtl_grp *tg = NULL;

	/*
	 * This is the common case when there are no blkio cgroups.
	 * Avoid lookup in this case
	 */
	if (blkcg == &blkio_root_cgroup) {
		tg = td->root_tg;
	} else {
		struct blkio_group *blkg;

		blkg = blkg_lookup_create(blkcg, q, BLKIO_POLICY_THROTL, false);

		/* if %NULL and @q is alive, fall back to root_tg */
		if (!IS_ERR(blkg))
			tg = tg_of_blkg(blkg);
		else if (!blk_queue_dead(q))
			tg = td->root_tg;
	}

	return tg;
}

static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root)
{
	/* Service tree is empty */
	if (!root->count)
		return NULL;

	if (!root->left)
		root->left = rb_first(&root->rb);

	if (root->left)
		return rb_entry_tg(root->left);

	return NULL;
}

static void rb_erase_init(struct rb_node *n, struct rb_root *root)
{
	rb_erase(n, root);
	RB_CLEAR_NODE(n);
}

static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root)
{
	if (root->left == n)
		root->left = NULL;
	rb_erase_init(n, &root->rb);
	--root->count;
}

static void update_min_dispatch_time(struct throtl_rb_root *st)
{
	struct throtl_grp *tg;

	tg = throtl_rb_first(st);
	if (!tg)
		return;

	st->min_disptime = tg->disptime;
}

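/*
 * Insert @tg into the service tree keyed by its disptime. Ties go to the
 * right, so among groups with the same dispatch time the one queued first
 * stays leftmost and is picked first.
 */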
static void
tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
{
	struct rb_node **node = &st->rb.rb_node;
	struct rb_node *parent = NULL;
	struct throtl_grp *__tg;
	unsigned long key = tg->disptime;
	int left = 1;

	while (*node != NULL) {
		parent = *node;
		__tg = rb_entry_tg(parent);

		if (time_before(key, __tg->disptime))
			node = &parent->rb_left;
		else {
			node = &parent->rb_right;
			left = 0;
		}
	}

	if (left)
		st->left = &tg->rb_node;

	rb_link_node(&tg->rb_node, parent, node);
	rb_insert_color(&tg->rb_node, &st->rb);
}

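/*
 * A group sits on the service tree at most once; THROTL_TG_FLAG_on_rr
 * tracks membership so the wrappers below never double-add or double-erase.
 */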
static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	struct throtl_rb_root *st = &td->tg_service_tree;

	tg_service_tree_add(st, tg);
	throtl_mark_tg_on_rr(tg);
	st->count++;
}

static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	if (!throtl_tg_on_rr(tg))
		__throtl_enqueue_tg(td, tg);
}

static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	throtl_rb_erase(&tg->rb_node, &td->tg_service_tree);
	throtl_clear_tg_on_rr(tg);
}

static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	if (throtl_tg_on_rr(tg))
		__throtl_dequeue_tg(td, tg);
}

static void throtl_schedule_next_dispatch(struct throtl_data *td)
{
	struct throtl_rb_root *st = &td->tg_service_tree;

	/*
	 * If there are more bios pending, schedule more work.
	 */
	if (!total_nr_queued(td))
		return;

	BUG_ON(!st->count);

	update_min_dispatch_time(st);

	if (time_before_eq(st->min_disptime, jiffies))
		throtl_schedule_delayed_work(td, 0);
	else
		throtl_schedule_delayed_work(td, (st->min_disptime - jiffies));
}

static inline void
throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{
	tg->bytes_disp[rw] = 0;
	tg->io_disp[rw] = 0;
	tg->slice_start[rw] = jiffies;
	tg->slice_end[rw] = jiffies + throtl_slice;
	throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
			rw == READ ? 'R' : 'W', tg->slice_start[rw],
			tg->slice_end[rw], jiffies);
}

static inline void throtl_set_slice_end(struct throtl_data *td,
		struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
{
	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
}

static inline void throtl_extend_slice(struct throtl_data *td,
		struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
{
	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
	throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
			rw == READ ? 'R' : 'W', tg->slice_start[rw],
			tg->slice_end[rw], jiffies);
}

/* Determine if previously allocated or extended slice is complete or not */
static bool
throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{
	if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
		return 0;

	return 1;
}

/* Trim the used slices and adjust slice start accordingly */
static inline void
throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{
	unsigned long nr_slices, time_elapsed, io_trim;
	u64 bytes_trim, tmp;

	BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));

	/*
	 * If bps is unlimited (-1), then the time slice doesn't get
	 * renewed. Don't try to trim the slice if it has already expired.
	 * A new slice will start when appropriate.
	 */
	if (throtl_slice_used(td, tg, rw))
		return;

	/*
	 * A bio has been dispatched. Also adjust slice_end. It might happen
	 * that initially the cgroup limit was very low, resulting in a high
	 * slice_end, but later the limit was bumped up and the bio was
	 * dispatched sooner; then we need to reduce slice_end. A high bogus
	 * slice_end is bad because it does not allow a new slice to start.
	 */

	throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice);

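	/*
	 * Illustrative numbers (an example, assuming HZ=1000 so throtl_slice
	 * is 100 jiffies): with bps = 1048576 (1 MiB/s) and one full slice
	 * elapsed, bytes_trim = 1048576 * 100 * 1 / 1000 ~= 104857 bytes are
	 * forgiven from bytes_disp and slice_start moves forward by 100
	 * jiffies.
	 */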
	time_elapsed = jiffies - tg->slice_start[rw];

	nr_slices = time_elapsed / throtl_slice;

	if (!nr_slices)
		return;
	tmp = tg->bps[rw] * throtl_slice * nr_slices;
	do_div(tmp, HZ);
	bytes_trim = tmp;

	io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;

	if (!bytes_trim && !io_trim)
		return;

	if (tg->bytes_disp[rw] >= bytes_trim)
		tg->bytes_disp[rw] -= bytes_trim;
	else
		tg->bytes_disp[rw] = 0;

	if (tg->io_disp[rw] >= io_trim)
		tg->io_disp[rw] -= io_trim;
	else
		tg->io_disp[rw] = 0;

	tg->slice_start[rw] += nr_slices * throtl_slice;

	throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
			" start=%lu end=%lu jiffies=%lu",
			rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
			tg->slice_start[rw], tg->slice_end[rw], jiffies);
}

static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
		struct bio *bio, unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	unsigned int io_allowed;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
	u64 tmp;

	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

	/* Slice has just started. Consider one slice interval */
	if (!jiffy_elapsed)
		jiffy_elapsed_rnd = throtl_slice;

	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);

	/*
	 * jiffy_elapsed_rnd should not be a big value, as the minimum iops
	 * can be 1. In that case jiffy_elapsed should be at most equivalent
	 * to 1 second, since we will allow a dispatch after 1 second and
	 * after that the slice should have been trimmed.
	 */

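	/*
	 * Example (illustrative, assuming HZ=1000): iops = 100 and
	 * jiffy_elapsed_rnd = 100 (one slice) give io_allowed =
	 * 100 * 100 / 1000 = 10 ios for this slice.
	 */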
	tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
	do_div(tmp, HZ);

	if (tmp > UINT_MAX)
		io_allowed = UINT_MAX;
	else
		io_allowed = tmp;

	if (tg->io_disp[rw] + 1 <= io_allowed) {
		if (wait)
			*wait = 0;
		return 1;
	}

	/* Calc approx time to dispatch */
	jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;

	if (jiffy_wait > jiffy_elapsed)
		jiffy_wait = jiffy_wait - jiffy_elapsed;
	else
		jiffy_wait = 1;

	if (wait)
		*wait = jiffy_wait;
	return 0;
}

static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
		struct bio *bio, unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	u64 bytes_allowed, extra_bytes, tmp;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;

	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

	/* Slice has just started. Consider one slice interval */
	if (!jiffy_elapsed)
		jiffy_elapsed_rnd = throtl_slice;

	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);

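	/*
	 * Example (illustrative, assuming HZ=1000): bps = 1048576 and
	 * jiffy_elapsed_rnd = 100 allow roughly 104857 bytes in this slice;
	 * anything beyond that is converted back into a wait time below.
	 */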
	tmp = tg->bps[rw] * jiffy_elapsed_rnd;
	do_div(tmp, HZ);
	bytes_allowed = tmp;

	if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
		if (wait)
			*wait = 0;
		return 1;
	}

	/* Calc approx time to dispatch */
	extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
	jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);

	if (!jiffy_wait)
		jiffy_wait = 1;

	/*
	 * This wait time is without taking into consideration the rounding
	 * up we did. Add that time also.
	 */
	jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
	if (wait)
		*wait = jiffy_wait;
	return 0;
}

static bool tg_no_rule_group(struct throtl_grp *tg, bool rw)
{
	if (tg->bps[rw] == -1 && tg->iops[rw] == -1)
		return 1;
	return 0;
}

/*
 * Returns whether one can dispatch a bio or not. Also returns the approximate
 * number of jiffies to wait before this bio is within the IO rate and can be
 * dispatched.
 */
static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
				struct bio *bio, unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;

	/*
	 * Currently the whole state machine of the group depends on the first
	 * bio queued in the group bio list. So one should not be calling
	 * this function with a different bio if there are other bios
	 * queued.
	 */
	BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));

	/* If tg->bps = -1, then BW is unlimited */
	if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
		if (wait)
			*wait = 0;
		return 1;
	}

	/*
	 * If the previous slice expired, start a new one, otherwise renew or
	 * extend the existing slice to make sure it is at least throtl_slice
	 * interval long since now.
	 */
	if (throtl_slice_used(td, tg, rw))
		throtl_start_new_slice(td, tg, rw);
	else {
		if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
			throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
	}

	if (tg_with_in_bps_limit(td, tg, bio, &bps_wait)
	    && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) {
		if (wait)
			*wait = 0;
		return 1;
	}

	max_wait = max(bps_wait, iops_wait);

	if (wait)
		*wait = max_wait;

	if (time_before(tg->slice_end[rw], jiffies + max_wait))
		throtl_extend_slice(td, tg, rw, jiffies + max_wait);

	return 0;
}

static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
{
	bool rw = bio_data_dir(bio);
	bool sync = rw_is_sync(bio->bi_rw);

	/* Charge the bio to the group */
	tg->bytes_disp[rw] += bio->bi_size;
	tg->io_disp[rw]++;

	blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
}

static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
			struct bio *bio)
{
	bool rw = bio_data_dir(bio);

	bio_list_add(&tg->bio_lists[rw], bio);
	/* Take a bio reference on tg */
	throtl_ref_get_tg(tg);
	tg->nr_queued[rw]++;
	td->nr_queued[rw]++;
	throtl_enqueue_tg(td, tg);
}

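/*
 * Recompute when this group may dispatch again: take the earlier of the
 * waits required by the bios at the head of the READ and WRITE queues and
 * re-key the group in the service tree accordingly.
 */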
static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
{
	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
	struct bio *bio;

	if ((bio = bio_list_peek(&tg->bio_lists[READ])))
		tg_may_dispatch(td, tg, bio, &read_wait);

	if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
		tg_may_dispatch(td, tg, bio, &write_wait);

	min_wait = min(read_wait, write_wait);
	disptime = jiffies + min_wait;

	/* Update dispatch time */
	throtl_dequeue_tg(td, tg);
	tg->disptime = disptime;
	throtl_enqueue_tg(td, tg);
}

static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
				bool rw, struct bio_list *bl)
{
	struct bio *bio;

	bio = bio_list_pop(&tg->bio_lists[rw]);
	tg->nr_queued[rw]--;
	/* Drop bio reference on tg */
	throtl_put_tg(tg);

	BUG_ON(td->nr_queued[rw] <= 0);
	td->nr_queued[rw]--;

	throtl_charge_bio(tg, bio);
	bio_list_add(bl, bio);
	bio->bi_rw |= REQ_THROTTLED;

	throtl_trim_slice(td, tg, rw);
}

static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
				struct bio_list *bl)
{
	unsigned int nr_reads = 0, nr_writes = 0;
	unsigned int max_nr_reads = throtl_grp_quantum*3/4;
	unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
	struct bio *bio;

	/* Try to dispatch 75% READS and 25% WRITES */

	while ((bio = bio_list_peek(&tg->bio_lists[READ]))
		&& tg_may_dispatch(td, tg, bio, NULL)) {

		tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
		nr_reads++;

		if (nr_reads >= max_nr_reads)
			break;
	}

	while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))
		&& tg_may_dispatch(td, tg, bio, NULL)) {

		tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
		nr_writes++;

		if (nr_writes >= max_nr_writes)
			break;
	}

	return nr_reads + nr_writes;
}

static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
{
	unsigned int nr_disp = 0;
	struct throtl_grp *tg;
	struct throtl_rb_root *st = &td->tg_service_tree;

	while (1) {
		tg = throtl_rb_first(st);

		if (!tg)
			break;

		if (time_before(jiffies, tg->disptime))
			break;

		throtl_dequeue_tg(td, tg);

		nr_disp += throtl_dispatch_tg(td, tg, bl);

		if (tg->nr_queued[0] || tg->nr_queued[1]) {
			tg_update_disptime(td, tg);
			throtl_enqueue_tg(td, tg);
		}

		if (nr_disp >= throtl_quantum)
			break;
	}

	return nr_disp;
}

static void throtl_process_limit_change(struct throtl_data *td)
{
	struct throtl_grp *tg;
	struct hlist_node *pos, *n;

	if (!td->limits_changed)
		return;

	xchg(&td->limits_changed, false);

	throtl_log(td, "limits changed");

	hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
		if (!tg->limits_changed)
			continue;

		if (!xchg(&tg->limits_changed, false))
			continue;

		throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
			" riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE],
			tg->iops[READ], tg->iops[WRITE]);

		/*
		 * Restart the slices for both READ and WRITE. It
		 * might happen that a group's limits were dropped
		 * suddenly and we don't want to account recently
		 * dispatched IO against the new low rate.
		 */
		throtl_start_new_slice(td, tg, 0);
		throtl_start_new_slice(td, tg, 1);

		if (throtl_tg_on_rr(tg))
			tg_update_disptime(td, tg);
	}
}

/* Dispatch throttled bios. Should be called without queue lock held. */
static int throtl_dispatch(struct request_queue *q)
{
	struct throtl_data *td = q->td;
	unsigned int nr_disp = 0;
	struct bio_list bio_list_on_stack;
	struct bio *bio;
	struct blk_plug plug;

	spin_lock_irq(q->queue_lock);

	throtl_process_limit_change(td);

	if (!total_nr_queued(td))
		goto out;

	bio_list_init(&bio_list_on_stack);

	throtl_log(td, "dispatch nr_queued=%u read=%u write=%u",
			total_nr_queued(td), td->nr_queued[READ],
			td->nr_queued[WRITE]);

	nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);

	if (nr_disp)
		throtl_log(td, "bios disp=%u", nr_disp);

	throtl_schedule_next_dispatch(td);
out:
	spin_unlock_irq(q->queue_lock);

	/*
	 * If we dispatched some requests, unplug the queue to make sure
	 * they are issued immediately.
	 */
	if (nr_disp) {
		blk_start_plug(&plug);
		while ((bio = bio_list_pop(&bio_list_on_stack)))
			generic_make_request(bio);
		blk_finish_plug(&plug);
	}
	return nr_disp;
}

void blk_throtl_work(struct work_struct *work)
{
	struct throtl_data *td = container_of(work, struct throtl_data,
					throtl_work.work);
	struct request_queue *q = td->queue;

	throtl_dispatch(q);
}

/* Call with queue lock held */
static void
throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
{
	struct delayed_work *dwork = &td->throtl_work;

	/* schedule work if limits changed even if no bio is queued */
	if (total_nr_queued(td) || td->limits_changed) {
		/*
		 * We might have a work scheduled to be executed in future.
		 * Cancel that and schedule a new one.
		 */
		__cancel_delayed_work(dwork);
		queue_delayed_work(kthrotld_workqueue, dwork, delay);
		throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
				delay, jiffies);
	}
}

static void
throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	/* Something wrong if we are trying to remove same group twice */
	BUG_ON(hlist_unhashed(&tg->tg_node));

	hlist_del_init(&tg->tg_node);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	throtl_put_tg(tg);
	td->nr_undestroyed_grps--;
}

static bool throtl_release_tgs(struct throtl_data *td, bool release_root)
{
	struct hlist_node *pos, *n;
	struct throtl_grp *tg;
	bool empty = true;

	hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
		/* skip root? */
		if (!release_root && tg == td->root_tg)
			continue;

		/*
		 * If the cgroup removal path got to the blkio_group first
		 * and removed it from the cgroup list, then it will take
		 * care of destroying the throtl_grp as well.
		 */
		if (!blkiocg_del_blkio_group(&tg->blkg))
			throtl_destroy_tg(td, tg);
		else
			empty = false;
	}
	return empty;
}

/*
 * Blk cgroup controller notification saying that blkio_group object is being
 * delinked as the associated cgroup object is going away. That also means that
 * no new IO will come in this group. So get rid of this group as soon as
 * any pending IO in the group is finished.
 *
 * This function is called under rcu_read_lock(). @q is the rcu protected
 * pointer. That means @q is a valid request_queue pointer as long as we
 * are under the rcu read lock.
 *
 * @q was fetched from blkio_group under blkio_cgroup->lock. That means
 * it should not be NULL as even if the queue was going away, the cgroup
 * deletion path got to it first.
 */
void throtl_unlink_blkio_group(struct request_queue *q,
			struct blkio_group *blkg)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	throtl_destroy_tg(q->td, tg_of_blkg(blkg));
	spin_unlock_irqrestore(q->queue_lock, flags);
}

static bool throtl_clear_queue(struct request_queue *q)
{
	lockdep_assert_held(q->queue_lock);

	/*
	 * Clear tgs but leave the root one alone. This is necessary
	 * because root_tg is expected to be persistent and safe because
	 * blk-throtl can never be disabled while @q is alive. This is a
	 * kludge to prepare for unified blkg. This whole function will be
	 * removed soon.
	 */
	return throtl_release_tgs(q->td, false);
}

static void throtl_update_blkio_group_common(struct throtl_data *td,
				struct throtl_grp *tg)
{
	xchg(&tg->limits_changed, true);
	xchg(&td->limits_changed, true);
	/* Schedule a work now to process the limit change */
	throtl_schedule_delayed_work(td, 0);
}

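/*
 * The update callbacks below run when a limit file is written in the blkio
 * cgroup. Illustrative usage from userspace (mount point and group name are
 * an assumption, not taken from this file):
 *
 *	echo "8:16 1048576" > /cgroup/blkio/grp1/blkio.throttle.read_bps_device
 *
 * which ends up in throtl_update_blkio_group_read_bps() for the matching
 * group and queue.
 */
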
/*
 * For all update functions, @q should be a valid pointer because these
 * update functions are called under blkcg_lock; that means blkg is
 * valid and in turn @q is valid. The queue exit path cannot race because
 * of blkcg_lock.
 *
 * Cannot take the queue lock in the update functions, as taking the queue
 * lock under blkcg_lock is not allowed. Under other paths we take
 * blkcg_lock under queue_lock.
 */
static void throtl_update_blkio_group_read_bps(struct request_queue *q,
				struct blkio_group *blkg, u64 read_bps)
{
	struct throtl_grp *tg = tg_of_blkg(blkg);

	tg->bps[READ] = read_bps;
	throtl_update_blkio_group_common(q->td, tg);
}

static void throtl_update_blkio_group_write_bps(struct request_queue *q,
				struct blkio_group *blkg, u64 write_bps)
{
	struct throtl_grp *tg = tg_of_blkg(blkg);

	tg->bps[WRITE] = write_bps;
	throtl_update_blkio_group_common(q->td, tg);
}

static void throtl_update_blkio_group_read_iops(struct request_queue *q,
				struct blkio_group *blkg, unsigned int read_iops)
{
	struct throtl_grp *tg = tg_of_blkg(blkg);

	tg->iops[READ] = read_iops;
	throtl_update_blkio_group_common(q->td, tg);
}

static void throtl_update_blkio_group_write_iops(struct request_queue *q,
				struct blkio_group *blkg, unsigned int write_iops)
{
	struct throtl_grp *tg = tg_of_blkg(blkg);

	tg->iops[WRITE] = write_iops;
	throtl_update_blkio_group_common(q->td, tg);
}

static void throtl_shutdown_wq(struct request_queue *q)
{
	struct throtl_data *td = q->td;

	cancel_delayed_work_sync(&td->throtl_work);
}

static struct blkio_policy_type blkio_policy_throtl = {
	.ops = {
		.blkio_alloc_group_fn = throtl_alloc_blkio_group,
		.blkio_link_group_fn = throtl_link_blkio_group,
		.blkio_unlink_group_fn = throtl_unlink_blkio_group,
		.blkio_clear_queue_fn = throtl_clear_queue,
		.blkio_update_group_read_bps_fn =
					throtl_update_blkio_group_read_bps,
		.blkio_update_group_write_bps_fn =
					throtl_update_blkio_group_write_bps,
		.blkio_update_group_read_iops_fn =
					throtl_update_blkio_group_read_iops,
		.blkio_update_group_write_iops_fn =
					throtl_update_blkio_group_write_iops,
	},
	.plid = BLKIO_POLICY_THROTL,
};

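/*
 * Main entry point from the block layer. Returns false if @bio is within
 * limits (or unthrottled) and the caller should issue it now; returns true
 * if the bio has been queued on its group and will be resubmitted later by
 * the dispatch worker.
 */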
bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
{
	struct throtl_data *td = q->td;
	struct throtl_grp *tg;
	bool rw = bio_data_dir(bio), update_disptime = true;
	struct blkio_cgroup *blkcg;
	bool throttled = false;

	if (bio->bi_rw & REQ_THROTTLED) {
		bio->bi_rw &= ~REQ_THROTTLED;
		goto out;
	}

	/*
	 * A throtl_grp pointer retrieved under rcu can be used to access
	 * basic fields like stats and io rates. If a group has no rules,
	 * just update the dispatch stats in lockless manner and return.
	 */
	rcu_read_lock();
	blkcg = task_blkio_cgroup(current);
	tg = throtl_lookup_tg(td, blkcg);
	if (tg) {
		if (tg_no_rule_group(tg, rw)) {
			blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size,
						      rw, rw_is_sync(bio->bi_rw));
			goto out_unlock_rcu;
		}
	}

	/*
	 * Either the group has not been allocated yet or it is not an
	 * unlimited IO group.
	 */
	spin_lock_irq(q->queue_lock);
	tg = throtl_lookup_create_tg(td, blkcg);
	if (unlikely(!tg))
		goto out_unlock;

	if (tg->nr_queued[rw]) {
		/*
		 * There is already another bio queued in same dir. No
		 * need to update dispatch time.
		 */
		update_disptime = false;
		goto queue_bio;
	}

	/* Bio is within rate limit of group */
	if (tg_may_dispatch(td, tg, bio, NULL)) {
		throtl_charge_bio(tg, bio);

		/*
		 * We need to trim slice even when bios are not being queued
		 * otherwise it might happen that a bio is not queued for
		 * a long time and slice keeps on extending and trim is not
		 * called for a long time. Now if limits are reduced suddenly
		 * we take into account all the IO dispatched so far at the new
		 * low rate and newly queued IO gets a really long dispatch
		 * time.
		 *
		 * So keep on trimming slice even if bio is not queued.
		 */
		throtl_trim_slice(td, tg, rw);
		goto out_unlock;
	}

queue_bio:
	throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
			" iodisp=%u iops=%u queued=%d/%d",
			rw == READ ? 'R' : 'W',
			tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
			tg->io_disp[rw], tg->iops[rw],
			tg->nr_queued[READ], tg->nr_queued[WRITE]);

	throtl_add_bio_tg(q->td, tg, bio);
	throttled = true;

	if (update_disptime) {
		tg_update_disptime(td, tg);
		throtl_schedule_next_dispatch(td);
	}

out_unlock:
	spin_unlock_irq(q->queue_lock);
out_unlock_rcu:
	rcu_read_unlock();
out:
	return throttled;
}

/**
 * blk_throtl_drain - drain throttled bios
 * @q: request_queue to drain throttled bios for
 *
 * Dispatch all currently throttled bios on @q through ->make_request_fn().
 */
void blk_throtl_drain(struct request_queue *q)
	__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct throtl_data *td = q->td;
	struct throtl_rb_root *st = &td->tg_service_tree;
	struct throtl_grp *tg;
	struct bio_list bl;
	struct bio *bio;

	WARN_ON_ONCE(!queue_is_locked(q));

	bio_list_init(&bl);

	while ((tg = throtl_rb_first(st))) {
		throtl_dequeue_tg(td, tg);

		while ((bio = bio_list_peek(&tg->bio_lists[READ])))
			tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
		while ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
			tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
	}
	spin_unlock_irq(q->queue_lock);

	while ((bio = bio_list_pop(&bl)))
		generic_make_request(bio);

	spin_lock_irq(q->queue_lock);
}

int blk_throtl_init(struct request_queue *q)
{
	struct throtl_data *td;
	struct blkio_group *blkg;

	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
	if (!td)
		return -ENOMEM;

	INIT_HLIST_HEAD(&td->tg_list);
	td->tg_service_tree = THROTL_RB_ROOT;
	td->limits_changed = false;
	INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);

	q->td = td;
	td->queue = q;

	/* alloc and init root group. */
	rcu_read_lock();
	spin_lock_irq(q->queue_lock);

	blkg = blkg_lookup_create(&blkio_root_cgroup, q, BLKIO_POLICY_THROTL,
				  true);
	if (!IS_ERR(blkg))
		td->root_tg = tg_of_blkg(blkg);

	spin_unlock_irq(q->queue_lock);
	rcu_read_unlock();

	if (!td->root_tg) {
		kfree(td);
		return -ENOMEM;
	}
	return 0;
}

void blk_throtl_exit(struct request_queue *q)
{
	struct throtl_data *td = q->td;
	bool wait = false;

	BUG_ON(!td);

	throtl_shutdown_wq(q);

	spin_lock_irq(q->queue_lock);
	throtl_release_tgs(td, true);

	/* If there are other groups */
	if (td->nr_undestroyed_grps > 0)
		wait = true;

	spin_unlock_irq(q->queue_lock);

	/*
	 * Wait for tg->blkg->q accessors to exit their grace periods.
	 * Do this wait only if there are other undestroyed groups out
	 * there (other than root group). This can happen if cgroup deletion
	 * path claimed the responsibility of cleaning up a group before
	 * queue cleanup code got to the group.
	 *
	 * Do not call synchronize_rcu() unconditionally as there are drivers
	 * which create/delete request queue hundreds of times during scan/boot
	 * and synchronize_rcu() can take significant time and slow down boot.
	 */
	if (wait)
		synchronize_rcu();

	/*
	 * Just being safe to make sure that, after the previous flush, if
	 * somebody updated limits through cgroup and another work got queued,
	 * we cancel it.
	 */
	throtl_shutdown_wq(q);
}

void blk_throtl_release(struct request_queue *q)
{
	kfree(q->td);
}

static int __init throtl_init(void)
{
	kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
	if (!kthrotld_workqueue)
		panic("Failed to create kthrotld\n");

	blkio_policy_register(&blkio_policy_throtl);
	return 0;
}

module_init(throtl_init);