1/* SPDX-License-Identifier: GPL-2.0
2 *
3 * IO cost model based controller.
4 *
5 * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
6 * Copyright (C) 2019 Andy Newell <newella@fb.com>
7 * Copyright (C) 2019 Facebook
8 *
9 * One challenge of controlling IO resources is the lack of trivially
10 * observable cost metric. This is distinguished from CPU and memory where
11 * wallclock time and the number of bytes can serve as accurate enough
12 * approximations.
13 *
14 * Bandwidth and iops are the most commonly used metrics for IO devices but
15 * depending on the type and specifics of the device, different IO patterns
16 * easily lead to multiple orders of magnitude variations rendering them
17 * useless for the purpose of IO capacity distribution. While on-device
18 * time, with a lot of crutches, could serve as a useful approximation for
19 * non-queued rotational devices, this is no longer viable with modern
20 * devices, even the rotational ones.
21 *
22 * While there is no cost metric we can trivially observe, it isn't a
23 * complete mystery. For example, on a rotational device, seek cost
24 * dominates while a contiguous transfer contributes a smaller amount
25 * proportional to the size. If we can characterize at least the relative
26 * costs of these different types of IOs, it should be possible to
27 * implement a reasonable work-conserving proportional IO resource
28 * distribution.
29 *
30 * 1. IO Cost Model
31 *
32 * IO cost model estimates the cost of an IO given its basic parameters and
33 * history (e.g. the end sector of the last IO). The cost is measured in
34 * device time. If a given IO is estimated to cost 10ms, the device should
35 * be able to process ~100 of those IOs in a second.
36 *
37 * Currently, there's only one builtin cost model - linear. Each IO is
38 * classified as sequential or random and given a base cost accordingly.
39 * On top of that, a size cost proportional to the length of the IO is
40 * added. While simple, this model captures the operational
41 * characteristics of a wide variety of devices well enough. Default
42 * parameters for several different classes of devices are provided and the
43 * parameters can be configured from userspace via
44 * /sys/fs/cgroup/io.cost.model.
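 *
 * As a rough sketch, the linear model charges each IO
 *
 *	cost = (seek distance > threshold ? random : sequential base cost)
 *	       + number of 4k pages * per-page cost
 *
 * which is what calc_vtime_cost_builtin() below computes.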
45 *
46 * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
47 * device-specific coefficients.
48 *
52 * 2. Control Strategy
53 *
54 * The device virtual time (vtime) is used as the primary control metric.
55 * The control strategy is composed of the following three parts.
56 *
57 * 2-1. Vtime Distribution
58 *
59 * When a cgroup becomes active in terms of IOs, its hierarchical share is
60 * calculated. Please consider the following hierarchy where the numbers
61 * inside parentheses denote the configured weights.
62 *
63 * root
64 * / \
65 * A (w:100) B (w:300)
66 * / \
67 * A0 (w:100) A1 (w:100)
68 *
69 * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
70 * of equal weight, each gets 50% share. If then B starts issuing IOs, B
71 * gets 300/(100+300) or 75% share, and A0 and A1 equally split the rest,
72 * 12.5% each. The distribution mechanism only cares about these flattened
73 * shares. They're called hweights (hierarchical weights) and always add
74 * up to 1 (HWEIGHT_WHOLE).
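 *
 * For example, A0's hweight above is the product of its share at each
 * level: (100 / (100 + 100)) * (100 / (100 + 300)) = 1/2 * 1/4 = 12.5%,
 * i.e. HWEIGHT_WHOLE / 8.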
75 *
76 * A given cgroup's vtime runs slower in inverse proportion to its hweight.
77 * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
78 * against the device vtime - an IO which takes 10ms on the underlying
79 * device is considered to take 80ms on A0.
80 *
81 * This constitutes the basis of IO capacity distribution. Each cgroup's
82 * vtime is running at a rate determined by its hweight. A cgroup tracks
83 * the vtime consumed by past IOs and can issue a new IO iff doing so
84 * wouldn't outrun the current device vtime. Otherwise, the IO is
85 * suspended until the vtime has progressed enough to cover it.
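 *
 * In code, this is the admission check in ioc_rqos_throttle(): a bio of
 * cost `cost` is issued immediately only if no one else is already
 * waiting and time_before_eq64(vtime + cost, vnow) holds; otherwise the
 * issuer sleeps on iocg->waitq until the budget catches up.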
86 *
87 * 2-2. Vrate Adjustment
88 *
89 * It's unrealistic to expect the cost model to be perfect. There are too
90 * many devices and even on the same device the overall performance
91 * fluctuates depending on numerous factors such as IO mixture and device
92 * internal garbage collection. The controller needs to adapt dynamically.
93 *
94 * This is achieved by adjusting the overall IO rate according to how busy
95 * the device is. If the device becomes overloaded, we're sending down too
96 * many IOs and should generally slow down. If there are waiting issuers
97 * but the device isn't saturated, we're issuing too few and should
98 * generally speed up.
99 *
100 * To slow down, we lower the vrate - the rate at which the device vtime
101 * passes compared to the wall clock. For example, if the vtime is running
102 * at the vrate of 75%, all cgroups added up would only be able to issue
103 * 750ms worth of IOs per second, and vice-versa for speeding up.
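 *
 * Concretely, ioc_now() advances the device vtime by (wallclock delta) *
 * vrate, so at 100% vrate one second hands out VTIME_PER_SEC worth of
 * budget across all cgroups, while at 75% only three quarters of that is
 * handed out.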
104 *
105 * Device busyness is determined using two criteria - rq wait and
106 * completion latencies.
107 *
108 * When a device gets saturated, the on-device and then the request queues
109 * fill up and a bio which is ready to be issued has to wait for a request
110 * to become available. When this delay becomes noticeable, it's a clear
111 * indication that the device is saturated and we lower the vrate. This
112 * saturation signal is fairly conservative as it only triggers when both
113 * hardware and software queues are filled up, and is used as the default
114 * busy signal.
115 *
116 * As devices can have deep queues and be unfair in how the queued commands
117 * are executed, solely depending on rq wait may not result in satisfactory
118 * control quality. For a better control quality, completion latency QoS
119 * parameters can be configured so that the device is considered saturated
120 * if N'th percentile completion latency rises above the set point.
121 *
122 * The completion latency requirements are a function of both the
123 * underlying device characteristics and the desired IO latency quality of
124 * service. There is an inherent trade-off - the tighter the latency QoS,
125 * the higher the bandwidth lossage. Latency QoS is disabled by default
126 * and can be set through /sys/fs/cgroup/io.cost.qos.
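 *
 * (The knob format is described in Documentation/admin-guide/cgroup-v2.rst;
 * roughly one line per device with key=value pairs corresponding to the
 * QOS_* parameters below, e.g. something like "MAJ:MIN enable=1 ctrl=auto
 * rpct=95.00 rlat=5000 wpct=95.00 wlat=5000 min=50.00 max=150.00" - the
 * exact keys and values here are illustrative, not prescriptive.)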
127 *
128 * 2-3. Work Conservation
129 *
130 * Imagine two cgroups A and B with equal weights. A is issuing a small IO
131 * periodically while B is sending out enough parallel IOs to saturate the
132 * device on its own. Let's say A's usage amounts to 100ms worth of IO
133 * cost per second, i.e., 10% of the device capacity. The naive
134 * distribution of half and half would lead to 60% utilization of the
135 * device, a significant reduction in the total amount of work done
136 * compared to free-for-all competition. This is too high a cost to pay
137 * for IO control.
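 *
 * (A consumes only the 10% it needs out of its 50% share while B is
 * capped at its 50%, hence 10% + 50% = 60% total utilization.)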
138 *
139 * To conserve the total amount of work done, we keep track of how much
140 * each active cgroup is actually using and yield part of its weight if
141 * there are other cgroups which can make use of it. In the above case,
142 * A's weight will be lowered so that it hovers above the actual usage and
143 * B would be able to use the rest.
144 *
145 * As we don't want to penalize a cgroup for donating its weight, the
146 * surplus weight adjustment factors in a margin and has an immediate
147 * snapback mechanism in case the cgroup needs more IO vtime for itself.
148 *
149 * Note that adjusting down surplus weights has the same effects as
150 * accelerating vtime for other cgroups and work conservation can also be
151 * implemented by adjusting vrate dynamically. However, settling who can
152 * donate and who should take back how much requires hweight propagations
153 * anyway, which makes it easier to implement and understand as a separate
154 * mechanism.
155 *
156 * 3. Monitoring
157 *
158 * Instead of debugfs or other clumsy monitoring mechanisms, this
159 * controller uses a drgn based monitoring script -
160 * tools/cgroup/iocost_monitor.py. For details on drgn, please see
161 * https://github.com/osandov/drgn. The output looks like the following.
162 *
163 * sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
164 * active weight hweight% inflt% del_ms usages%
165 * test/a * 50/ 50 33.33/ 33.33 27.65 0*041 033:033:033
166 * test/b * 100/ 100 66.67/ 66.67 17.56 0*000 066:079:077
167 *
168 * - per : Timer period
169 * - cur_per : Internal wall and device vtime clock
170 * - vrate : Device virtual time rate against wall clock
171 * - weight : Surplus-adjusted and configured weights
172 * - hweight : Surplus-adjusted and configured hierarchical weights
173 * - inflt : The percentage of in-flight IO cost at the end of last period
174 * - del_ms : Deferred issuer delay induction level and duration
175 * - usages : Usage history
176 */
177
178#include <linux/kernel.h>
179#include <linux/module.h>
180#include <linux/timer.h>
181#include <linux/time64.h>
182#include <linux/parser.h>
183#include <linux/sched/signal.h>
184#include <linux/blk-cgroup.h>
185#include "blk-rq-qos.h"
186#include "blk-stat.h"
187#include "blk-wbt.h"
188
189#ifdef CONFIG_TRACEPOINTS
190
191/* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
192#define TRACE_IOCG_PATH_LEN 1024
193static DEFINE_SPINLOCK(trace_iocg_path_lock);
194static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
195
196#define TRACE_IOCG_PATH(type, iocg, ...) \
197 do { \
198 unsigned long flags; \
199 if (trace_iocost_##type##_enabled()) { \
200 spin_lock_irqsave(&trace_iocg_path_lock, flags); \
201 cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \
202 trace_iocg_path, TRACE_IOCG_PATH_LEN); \
203 trace_iocost_##type(iocg, trace_iocg_path, \
204 ##__VA_ARGS__); \
205 spin_unlock_irqrestore(&trace_iocg_path_lock, flags); \
206 } \
207 } while (0)
208
209#else /* CONFIG_TRACEPOINTS */
210#define TRACE_IOCG_PATH(type, iocg, ...) do { } while (0)
211#endif /* CONFIG_TRACEPOINTS */
212
213enum {
214 MILLION = 1000000,
215
216 /* timer period is calculated from latency requirements, bound it */
217 MIN_PERIOD = USEC_PER_MSEC,
218 MAX_PERIOD = USEC_PER_SEC,
219
220 /*
221 * A cgroup's vtime can run 50% behind the device vtime, which
222 * serves as its IO credit buffer. Surplus weight adjustment is
223 * immediately canceled if the vtime margin runs below 10%.
224 */
225 MARGIN_PCT = 50,
226 INUSE_MARGIN_PCT = 10,
227
228 /* Have some play in waitq timer operations */
229 WAITQ_TIMER_MARGIN_PCT = 5,
230
231 /*
232 * vtime can wrap well within a reasonable uptime when vrate is
233 * consistently raised. Don't trust recorded cgroup vtime if the
234 * period counter indicates that it's older than 5mins.
235 */
236 VTIME_VALID_DUR = 300 * USEC_PER_SEC,
237
238 /*
239 * Remember the past three non-zero usages and use the max for
240 * surplus calculation. Three slots guarantee that we remember one
241 * full period usage from the last active stretch even after
242 * partial deactivation and re-activation periods. Don't start
243 * giving away weight before collecting two data points to prevent
244 * hweight adjustments based on one partial activation period.
245 */
246 NR_USAGE_SLOTS = 3,
247 MIN_VALID_USAGES = 2,
248
249 /* 1/64k is granular enough and can easily be handled w/ u32 */
250 HWEIGHT_WHOLE = 1 << 16,
251
252 /*
253 * As vtime is used to calculate the cost of each IO, it needs to
254 * be fairly high precision. For example, it should be able to
255 * represent the cost of a single page worth of discard with
256 * sufficient accuracy. At the same time, it should be able to
257 * represent reasonably long enough durations to be useful and
258 * convenient during operation.
259 *
260 * 1s worth of vtime is 2^37. This gives us both sub-nanosecond
261 * granularity and days of wrap-around time even at extreme vrates.
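 *
 * (At the nominal vrate a u64 vtime wraps only after 2^(64-37)
 * seconds, roughly 4 years; even at the 10000% VRATE_MAX_PPM
 * ceiling that is still over two weeks.)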
262 */
263 VTIME_PER_SEC_SHIFT = 37,
264 VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT,
265 VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC,
266
267 /* bound vrate adjustments within two orders of magnitude */
268 VRATE_MIN_PPM = 10000, /* 1% */
269 VRATE_MAX_PPM = 100000000, /* 10000% */
270
271 VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
272 VRATE_CLAMP_ADJ_PCT = 4,
273
274 /* if IOs end up waiting for requests, issue less */
275 RQ_WAIT_BUSY_PCT = 5,
276
277 /* unbusy hysteresis */
278 UNBUSY_THR_PCT = 75,
279
280 /* don't let cmds which take a very long time pin lagging for too long */
281 MAX_LAGGING_PERIODS = 10,
282
283 /*
284 * If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
285 * donate the surplus.
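 *
 * e.g. usage 20% vs hweight 40%: 20 * 1.25 + 2 = 27 and
 * 27 + 3 <= 40, so the iocg is a donation candidate and its
 * inuse would be shrunk towards 27%.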
286 */
287 SURPLUS_SCALE_PCT = 125, /* * 125% */
288 SURPLUS_SCALE_ABS = HWEIGHT_WHOLE / 50, /* + 2% */
289 SURPLUS_MIN_ADJ_DELTA = HWEIGHT_WHOLE / 33, /* 3% */
290
291 /* switch iff the conditions are met for longer than this */
292 AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC,
293
294 /*
295 * Count IO size in 4k pages. The 12bit shift helps keep the
296 * size-proportional components of the cost calculation at a similar
297 * order of magnitude to the per-IO cost components.
298 */
299 IOC_PAGE_SHIFT = 12,
300 IOC_PAGE_SIZE = 1 << IOC_PAGE_SHIFT,
301 IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT,
302
303 /* if apart further than 16M, consider randio for linear model */
304 LCOEF_RANDIO_PAGES = 4096,
305};
306
307enum ioc_running {
308 IOC_IDLE,
309 IOC_RUNNING,
310 IOC_STOP,
311};
312
313/* io.cost.qos controls including per-dev enable of the whole controller */
314enum {
315 QOS_ENABLE,
316 QOS_CTRL,
317 NR_QOS_CTRL_PARAMS,
318};
319
320/* io.cost.qos params */
321enum {
322 QOS_RPPM,
323 QOS_RLAT,
324 QOS_WPPM,
325 QOS_WLAT,
326 QOS_MIN,
327 QOS_MAX,
328 NR_QOS_PARAMS,
329};
330
331/* io.cost.model controls */
332enum {
333 COST_CTRL,
334 COST_MODEL,
335 NR_COST_CTRL_PARAMS,
336};
337
338/* builtin linear cost model coefficients */
339enum {
340 I_LCOEF_RBPS,
341 I_LCOEF_RSEQIOPS,
342 I_LCOEF_RRANDIOPS,
343 I_LCOEF_WBPS,
344 I_LCOEF_WSEQIOPS,
345 I_LCOEF_WRANDIOPS,
346 NR_I_LCOEFS,
347};
348
349enum {
350 LCOEF_RPAGE,
351 LCOEF_RSEQIO,
352 LCOEF_RRANDIO,
353 LCOEF_WPAGE,
354 LCOEF_WSEQIO,
355 LCOEF_WRANDIO,
356 NR_LCOEFS,
357};
358
359enum {
360 AUTOP_INVALID,
361 AUTOP_HDD,
362 AUTOP_SSD_QD1,
363 AUTOP_SSD_DFL,
364 AUTOP_SSD_FAST,
365};
366
367struct ioc_gq;
368
369struct ioc_params {
370 u32 qos[NR_QOS_PARAMS];
371 u64 i_lcoefs[NR_I_LCOEFS];
372 u64 lcoefs[NR_LCOEFS];
373 u32 too_fast_vrate_pct;
374 u32 too_slow_vrate_pct;
375};
376
377struct ioc_missed {
378 u32 nr_met;
379 u32 nr_missed;
380 u32 last_met;
381 u32 last_missed;
382};
383
384struct ioc_pcpu_stat {
385 struct ioc_missed missed[2];
386
387 u64 rq_wait_ns;
388 u64 last_rq_wait_ns;
389};
390
391/* per device */
392struct ioc {
393 struct rq_qos rqos;
394
395 bool enabled;
396
397 struct ioc_params params;
398 u32 period_us;
399 u32 margin_us;
400 u64 vrate_min;
401 u64 vrate_max;
402
403 spinlock_t lock;
404 struct timer_list timer;
405 struct list_head active_iocgs; /* active cgroups */
406 struct ioc_pcpu_stat __percpu *pcpu_stat;
407
408 enum ioc_running running;
409 atomic64_t vtime_rate;
410
411 seqcount_t period_seqcount;
412 u32 period_at; /* wallclock starttime */
413 u64 period_at_vtime; /* vtime starttime */
414
415 atomic64_t cur_period; /* inc'd each period */
416 int busy_level; /* saturation history */
417
418 u64 inuse_margin_vtime;
419 bool weights_updated;
420 atomic_t hweight_gen; /* for lazy hweights */
421
422 u64 autop_too_fast_at;
423 u64 autop_too_slow_at;
424 int autop_idx;
425 bool user_qos_params:1;
426 bool user_cost_model:1;
427};
428
429/* per device-cgroup pair */
430struct ioc_gq {
431 struct blkg_policy_data pd;
432 struct ioc *ioc;
433
434 /*
435 * An iocg can get its weight from two sources - an explicit
436 * per-device-cgroup configuration or the default weight of the
437 * cgroup. `cfg_weight` is the explicit per-device-cgroup
438 * configuration. `weight` is the effective weight considering both
439 * sources.
440 *
441 * When an idle cgroup becomes active its `active` goes from 0 to
442 * `weight`. `inuse` is the surplus adjusted active weight.
443 * `active` and `inuse` are used to calculate `hweight_active` and
444 * `hweight_inuse`.
445 *
446 * `last_inuse` remembers `inuse` while an iocg is idle to persist
447 * surplus adjustments.
448 */
449 u32 cfg_weight;
450 u32 weight;
451 u32 active;
452 u32 inuse;
453 u32 last_inuse;
454
455 sector_t cursor; /* to detect randio */
456
457 /*
458 * `vtime` is this iocg's vtime cursor which progresses as IOs are
459 * issued. If lagging behind device vtime, the delta represents
460 * the currently available IO budget. If running ahead, the
461 * overage.
462 *
463 * `vtime_done` is the same but progressed on completion rather
464 * than issue. The delta behind `vtime` represents the cost of
465 * currently in-flight IOs.
466 *
467 * `last_vtime` is used to remember `vtime` at the end of the last
468 * period to calculate utilization.
469 */
470 atomic64_t vtime;
471 atomic64_t done_vtime;
472 u64 last_vtime;
473
474 /*
475 * The period this iocg was last active in. Used for deactivation
476 * and invalidating `vtime`.
477 */
478 atomic64_t active_period;
479 struct list_head active_list;
480
481 /* see __propagate_active_weight() and current_hweight() for details */
482 u64 child_active_sum;
483 u64 child_inuse_sum;
484 int hweight_gen;
485 u32 hweight_active;
486 u32 hweight_inuse;
487 bool has_surplus;
488
489 struct wait_queue_head waitq;
490 struct hrtimer waitq_timer;
491 struct hrtimer delay_timer;
492
493 /* usage is recorded as fractions of HWEIGHT_WHOLE */
494 int usage_idx;
495 u32 usages[NR_USAGE_SLOTS];
496
497 /* this iocg's depth in the hierarchy and ancestors including self */
498 int level;
499 struct ioc_gq *ancestors[];
500};
501
502/* per cgroup */
503struct ioc_cgrp {
504 struct blkcg_policy_data cpd;
505 unsigned int dfl_weight;
506};
507
508struct ioc_now {
509 u64 now_ns;
510 u32 now;
511 u64 vnow;
512 u64 vrate;
513};
514
515struct iocg_wait {
516 struct wait_queue_entry wait;
517 struct bio *bio;
518 u64 abs_cost;
519 bool committed;
520};
521
522struct iocg_wake_ctx {
523 struct ioc_gq *iocg;
524 u32 hw_inuse;
525 s64 vbudget;
526};
527
528static const struct ioc_params autop[] = {
529 [AUTOP_HDD] = {
530 .qos = {
531 [QOS_RLAT] = 50000, /* 50ms */
532 [QOS_WLAT] = 50000,
533 [QOS_MIN] = VRATE_MIN_PPM,
534 [QOS_MAX] = VRATE_MAX_PPM,
535 },
536 .i_lcoefs = {
537 [I_LCOEF_RBPS] = 174019176,
538 [I_LCOEF_RSEQIOPS] = 41708,
539 [I_LCOEF_RRANDIOPS] = 370,
540 [I_LCOEF_WBPS] = 178075866,
541 [I_LCOEF_WSEQIOPS] = 42705,
542 [I_LCOEF_WRANDIOPS] = 378,
543 },
544 },
545 [AUTOP_SSD_QD1] = {
546 .qos = {
547 [QOS_RLAT] = 25000, /* 25ms */
548 [QOS_WLAT] = 25000,
549 [QOS_MIN] = VRATE_MIN_PPM,
550 [QOS_MAX] = VRATE_MAX_PPM,
551 },
552 .i_lcoefs = {
553 [I_LCOEF_RBPS] = 245855193,
554 [I_LCOEF_RSEQIOPS] = 61575,
555 [I_LCOEF_RRANDIOPS] = 6946,
556 [I_LCOEF_WBPS] = 141365009,
557 [I_LCOEF_WSEQIOPS] = 33716,
558 [I_LCOEF_WRANDIOPS] = 26796,
559 },
560 },
561 [AUTOP_SSD_DFL] = {
562 .qos = {
563 [QOS_RLAT] = 25000, /* 25ms */
564 [QOS_WLAT] = 25000,
565 [QOS_MIN] = VRATE_MIN_PPM,
566 [QOS_MAX] = VRATE_MAX_PPM,
567 },
568 .i_lcoefs = {
569 [I_LCOEF_RBPS] = 488636629,
570 [I_LCOEF_RSEQIOPS] = 8932,
571 [I_LCOEF_RRANDIOPS] = 8518,
572 [I_LCOEF_WBPS] = 427891549,
573 [I_LCOEF_WSEQIOPS] = 28755,
574 [I_LCOEF_WRANDIOPS] = 21940,
575 },
576 .too_fast_vrate_pct = 500,
577 },
578 [AUTOP_SSD_FAST] = {
579 .qos = {
580 [QOS_RLAT] = 5000, /* 5ms */
581 [QOS_WLAT] = 5000,
582 [QOS_MIN] = VRATE_MIN_PPM,
583 [QOS_MAX] = VRATE_MAX_PPM,
584 },
585 .i_lcoefs = {
586 [I_LCOEF_RBPS] = 3102524156LLU,
587 [I_LCOEF_RSEQIOPS] = 724816,
588 [I_LCOEF_RRANDIOPS] = 778122,
589 [I_LCOEF_WBPS] = 1742780862LLU,
590 [I_LCOEF_WSEQIOPS] = 425702,
591 [I_LCOEF_WRANDIOPS] = 443193,
592 },
593 .too_slow_vrate_pct = 10,
594 },
595};
596
597/*
598 * vrate adjust percentages indexed by ioc->busy_level. We adjust up on
599 * vtime credit shortage and down on device saturation.
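 *
 * e.g. at busy_level 8 (eight straight busy periods) vrate is scaled by
 * 100 - vrate_adj_pct[8] = 99% each period, while at busy_level -52 and
 * below it is scaled by 116% per period.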
600 */
601static u32 vrate_adj_pct[] =
602 { 0, 0, 0, 0,
603 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
604 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
605 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
606
607static struct blkcg_policy blkcg_policy_iocost;
608
609/* accessors and helpers */
610static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
611{
612 return container_of(rqos, struct ioc, rqos);
613}
614
615static struct ioc *q_to_ioc(struct request_queue *q)
616{
617 return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
618}
619
620static const char *q_name(struct request_queue *q)
621{
622 if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
623 return kobject_name(q->kobj.parent);
624 else
625 return "<unknown>";
626}
627
628static const char __maybe_unused *ioc_name(struct ioc *ioc)
629{
630 return q_name(ioc->rqos.q);
631}
632
633static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
634{
635 return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
636}
637
638static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
639{
640 return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
641}
642
643static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
644{
645 return pd_to_blkg(&iocg->pd);
646}
647
648static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
649{
650 return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
651 struct ioc_cgrp, cpd);
652}
653
654/*
655 * Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical
656 * weight, the more expensive each IO.
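 *
 * e.g. at an hw_inuse of HWEIGHT_WHOLE / 4 (25%), each unit of abs_cost
 * is charged as four units of vtime.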
657 */
658static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
659{
660 return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse);
661}
662
663static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost)
664{
665 bio->bi_iocost_cost = cost;
666 atomic64_add(cost, &iocg->vtime);
667}
668
669#define CREATE_TRACE_POINTS
670#include <trace/events/iocost.h>
671
672/* latency QoS params changed, update period_us and all the dependent params */
673static void ioc_refresh_period_us(struct ioc *ioc)
674{
675 u32 ppm, lat, multi, period_us;
676
677 lockdep_assert_held(&ioc->lock);
678
679 /* pick the higher latency target */
680 if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
681 ppm = ioc->params.qos[QOS_RPPM];
682 lat = ioc->params.qos[QOS_RLAT];
683 } else {
684 ppm = ioc->params.qos[QOS_WPPM];
685 lat = ioc->params.qos[QOS_WLAT];
686 }
687
688 /*
689 * We want the period to be long enough to contain a healthy number
690 * of IOs while short enough for granular control. Define it as a
691 * multiple of the latency target. Ideally, the multiplier should
692 * be scaled according to the percentile so that it would nominally
693 * contain a certain number of requests. Let's be simpler and
694 * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
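 *
 * e.g. a 95th percentile target (ppm == 950000) gives multi ==
 * max((1000000 - 950000) / 50000, 2) == max(1, 2) == 2 while a
 * 50th percentile target gives multi == 10.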
695 */
696 if (ppm)
697 multi = max_t(u32, (MILLION - ppm) / 50000, 2);
698 else
699 multi = 2;
700 period_us = multi * lat;
701 period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
702
703 /* calculate dependent params */
704 ioc->period_us = period_us;
705 ioc->margin_us = period_us * MARGIN_PCT / 100;
706 ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
707 period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100);
708}
709
710static int ioc_autop_idx(struct ioc *ioc)
711{
712 int idx = ioc->autop_idx;
713 const struct ioc_params *p = &autop[idx];
714 u32 vrate_pct;
715 u64 now_ns;
716
717 /* rotational? */
718 if (!blk_queue_nonrot(ioc->rqos.q))
719 return AUTOP_HDD;
720
721 /* handle SATA SSDs w/ broken NCQ */
722 if (blk_queue_depth(ioc->rqos.q) == 1)
723 return AUTOP_SSD_QD1;
724
725 /* use one of the normal ssd sets */
726 if (idx < AUTOP_SSD_DFL)
727 return AUTOP_SSD_DFL;
728
729 /* if user is overriding anything, maintain what was there */
730 if (ioc->user_qos_params || ioc->user_cost_model)
731 return idx;
732
733 /* step up/down based on the vrate */
734 vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100,
735 VTIME_PER_USEC);
736 now_ns = ktime_get_ns();
737
738 if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
739 if (!ioc->autop_too_fast_at)
740 ioc->autop_too_fast_at = now_ns;
741 if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
742 return idx + 1;
743 } else {
744 ioc->autop_too_fast_at = 0;
745 }
746
747 if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
748 if (!ioc->autop_too_slow_at)
749 ioc->autop_too_slow_at = now_ns;
750 if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
751 return idx - 1;
752 } else {
753 ioc->autop_too_slow_at = 0;
754 }
755
756 return idx;
757}
758
759/*
760 * Take the following as input
761 *
762 * @bps maximum sequential throughput
763 * @seqiops maximum sequential 4k iops
764 * @randiops maximum random 4k iops
765 *
766 * and calculate the linear model cost coefficients.
767 *
768 * *@page per-page cost 1s / (@bps / 4096)
769 * *@seqio base cost of a seq IO max((1s / @seqiops) - *@page, 0)
770 * *@randio base cost of a rand IO max((1s / @randiops) - *@page, 0)
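 *
 * e.g. with @bps of 4096 * 100000 (~400MB/s), *@page becomes
 * VTIME_PER_SEC / 100000, i.e. the device is expected to move 100k pages
 * per second and each page costs 10us worth of device vtime.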
771 */
772static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
773 u64 *page, u64 *seqio, u64 *randio)
774{
775 u64 v;
776
777 *page = *seqio = *randio = 0;
778
779 if (bps)
780 *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
781 DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
782
783 if (seqiops) {
784 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
785 if (v > *page)
786 *seqio = v - *page;
787 }
788
789 if (randiops) {
790 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
791 if (v > *page)
792 *randio = v - *page;
793 }
794}
795
796static void ioc_refresh_lcoefs(struct ioc *ioc)
797{
798 u64 *u = ioc->params.i_lcoefs;
799 u64 *c = ioc->params.lcoefs;
800
801 calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
802 &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
803 calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
804 &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
805}
806
807static bool ioc_refresh_params(struct ioc *ioc, bool force)
808{
809 const struct ioc_params *p;
810 int idx;
811
812 lockdep_assert_held(&ioc->lock);
813
814 idx = ioc_autop_idx(ioc);
815 p = &autop[idx];
816
817 if (idx == ioc->autop_idx && !force)
818 return false;
819
820 if (idx != ioc->autop_idx)
821 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
822
823 ioc->autop_idx = idx;
824 ioc->autop_too_fast_at = 0;
825 ioc->autop_too_slow_at = 0;
826
827 if (!ioc->user_qos_params)
828 memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
829 if (!ioc->user_cost_model)
830 memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
831
832 ioc_refresh_period_us(ioc);
833 ioc_refresh_lcoefs(ioc);
834
835 ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
836 VTIME_PER_USEC, MILLION);
837 ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
838 VTIME_PER_USEC, MILLION);
839
840 return true;
841}
842
843/* take a snapshot of the current [v]time and vrate */
844static void ioc_now(struct ioc *ioc, struct ioc_now *now)
845{
846 unsigned seq;
847
848 now->now_ns = ktime_get();
849 now->now = ktime_to_us(now->now_ns);
850 now->vrate = atomic64_read(&ioc->vtime_rate);
851
852 /*
853 * The current vtime is
854 *
855 * vtime at period start + (wallclock time since the start) * vrate
856 *
857 * As a consistent snapshot of `period_at_vtime` and `period_at` is
858 * needed, they're seqcount protected.
859 */
860 do {
861 seq = read_seqcount_begin(&ioc->period_seqcount);
862 now->vnow = ioc->period_at_vtime +
863 (now->now - ioc->period_at) * now->vrate;
864 } while (read_seqcount_retry(&ioc->period_seqcount, seq));
865}
866
867static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
868{
869 lockdep_assert_held(&ioc->lock);
870 WARN_ON_ONCE(ioc->running != IOC_RUNNING);
871
872 write_seqcount_begin(&ioc->period_seqcount);
873 ioc->period_at = now->now;
874 ioc->period_at_vtime = now->vnow;
875 write_seqcount_end(&ioc->period_seqcount);
876
877 ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
878 add_timer(&ioc->timer);
879}
880
881/*
882 * Update @iocg's `active` and `inuse` to @active and @inuse, update level
883 * weight sums and propagate upwards accordingly.
884 */
885static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
886{
887 struct ioc *ioc = iocg->ioc;
888 int lvl;
889
890 lockdep_assert_held(&ioc->lock);
891
892 inuse = min(active, inuse);
893
894 for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
895 struct ioc_gq *parent = iocg->ancestors[lvl];
896 struct ioc_gq *child = iocg->ancestors[lvl + 1];
897 u32 parent_active = 0, parent_inuse = 0;
898
899 /* update the level sums */
900 parent->child_active_sum += (s32)(active - child->active);
901 parent->child_inuse_sum += (s32)(inuse - child->inuse);
902 /* apply the updates */
903 child->active = active;
904 child->inuse = inuse;
905
906 /*
907 * The delta between inuse and active sums indicates that
908 * that much of weight is being given away. Parent's inuse
909 * and active should reflect the ratio.
910 */
911 if (parent->child_active_sum) {
912 parent_active = parent->weight;
913 parent_inuse = DIV64_U64_ROUND_UP(
914 parent_active * parent->child_inuse_sum,
915 parent->child_active_sum);
916 }
917
918 /* do we need to keep walking up? */
919 if (parent_active == parent->active &&
920 parent_inuse == parent->inuse)
921 break;
922
923 active = parent_active;
924 inuse = parent_inuse;
925 }
926
927 ioc->weights_updated = true;
928}
929
930static void commit_active_weights(struct ioc *ioc)
931{
932 lockdep_assert_held(&ioc->lock);
933
934 if (ioc->weights_updated) {
935 /* paired with rmb in current_hweight(), see there */
936 smp_wmb();
937 atomic_inc(&ioc->hweight_gen);
938 ioc->weights_updated = false;
939 }
940}
941
942static void propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
943{
944 __propagate_active_weight(iocg, active, inuse);
945 commit_active_weights(iocg->ioc);
946}
947
948static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
949{
950 struct ioc *ioc = iocg->ioc;
951 int lvl;
952 u32 hwa, hwi;
953 int ioc_gen;
954
955 /* hot path - if uptodate, use cached */
956 ioc_gen = atomic_read(&ioc->hweight_gen);
957 if (ioc_gen == iocg->hweight_gen)
958 goto out;
959
960 /*
961 * Paired with wmb in commit_active_weights(). If we saw the
962 * updated hweight_gen, all the weight updates from
963 * __propagate_active_weight() are visible too.
964 *
965 * We can race with weight updates during calculation and get it
966 * wrong. However, hweight_gen would have changed and a future
967 * reader will recalculate and we're guaranteed to discard the
968 * wrong result soon.
969 */
970 smp_rmb();
971
972 hwa = hwi = HWEIGHT_WHOLE;
973 for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
974 struct ioc_gq *parent = iocg->ancestors[lvl];
975 struct ioc_gq *child = iocg->ancestors[lvl + 1];
976 u32 active_sum = READ_ONCE(parent->child_active_sum);
977 u32 inuse_sum = READ_ONCE(parent->child_inuse_sum);
978 u32 active = READ_ONCE(child->active);
979 u32 inuse = READ_ONCE(child->inuse);
980
981 /* we can race with deactivations and either may read as zero */
982 if (!active_sum || !inuse_sum)
983 continue;
984
985 active_sum = max(active, active_sum);
986 hwa = hwa * active / active_sum; /* max 16bits * 10000 */
987
988 inuse_sum = max(inuse, inuse_sum);
989 hwi = hwi * inuse / inuse_sum; /* max 16bits * 10000 */
990 }
991
992 iocg->hweight_active = max_t(u32, hwa, 1);
993 iocg->hweight_inuse = max_t(u32, hwi, 1);
994 iocg->hweight_gen = ioc_gen;
995out:
996 if (hw_activep)
997 *hw_activep = iocg->hweight_active;
998 if (hw_inusep)
999 *hw_inusep = iocg->hweight_inuse;
1000}
1001
1002static void weight_updated(struct ioc_gq *iocg)
1003{
1004 struct ioc *ioc = iocg->ioc;
1005 struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1006 struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1007 u32 weight;
1008
1009 lockdep_assert_held(&ioc->lock);
1010
1011 weight = iocg->cfg_weight ?: iocc->dfl_weight;
1012 if (weight != iocg->weight && iocg->active)
1013 propagate_active_weight(iocg, weight,
1014 DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight));
1015 iocg->weight = weight;
1016}
1017
1018static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1019{
1020 struct ioc *ioc = iocg->ioc;
1021 u64 last_period, cur_period, max_period_delta;
1022 u64 vtime, vmargin, vmin;
1023 int i;
1024
1025 /*
1026 * If we seem to be already active, just update the stamp to tell the
1027 * timer that we're still active. We don't mind occasional races.
1028 */
1029 if (!list_empty(&iocg->active_list)) {
1030 ioc_now(ioc, now);
1031 cur_period = atomic64_read(&ioc->cur_period);
1032 if (atomic64_read(&iocg->active_period) != cur_period)
1033 atomic64_set(&iocg->active_period, cur_period);
1034 return true;
1035 }
1036
1037 /* racy check on internal node IOs, treat as root level IOs */
1038 if (iocg->child_active_sum)
1039 return false;
1040
1041 spin_lock_irq(&ioc->lock);
1042
1043 ioc_now(ioc, now);
1044
1045 /* update period */
1046 cur_period = atomic64_read(&ioc->cur_period);
1047 last_period = atomic64_read(&iocg->active_period);
1048 atomic64_set(&iocg->active_period, cur_period);
1049
1050 /* already activated or breaking leaf-only constraint? */
1051 for (i = iocg->level; i > 0; i--)
1052 if (!list_empty(&iocg->active_list))
1053 goto fail_unlock;
1054 if (iocg->child_active_sum)
1055 goto fail_unlock;
1056
1057 /*
1058 * vtime may wrap when vrate is raised substantially due to
1059 * underestimated IO costs. Look at the period and ignore its
1060 * vtime if the iocg has been idle for too long. Also, cap the
1061 * budget it can start with to the margin.
1062 */
1063 max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us);
1064 vtime = atomic64_read(&iocg->vtime);
1065 vmargin = ioc->margin_us * now->vrate;
1066 vmin = now->vnow - vmargin;
1067
1068 if (last_period + max_period_delta < cur_period ||
1069 time_before64(vtime, vmin)) {
1070 atomic64_add(vmin - vtime, &iocg->vtime);
1071 atomic64_add(vmin - vtime, &iocg->done_vtime);
1072 vtime = vmin;
1073 }
1074
1075 /*
1076 * Activate, propagate weight and start period timer if not
1077 * running. Reset hweight_gen to avoid accidental match from
1078 * wrapping.
1079 */
1080 iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1081 list_add(&iocg->active_list, &ioc->active_iocgs);
1082 propagate_active_weight(iocg, iocg->weight,
1083 iocg->last_inuse ?: iocg->weight);
1084
1085 TRACE_IOCG_PATH(iocg_activate, iocg, now,
1086 last_period, cur_period, vtime);
1087
1088 iocg->last_vtime = vtime;
1089
1090 if (ioc->running == IOC_IDLE) {
1091 ioc->running = IOC_RUNNING;
1092 ioc_start_period(ioc, now);
1093 }
1094
1095 spin_unlock_irq(&ioc->lock);
1096 return true;
1097
1098fail_unlock:
1099 spin_unlock_irq(&ioc->lock);
1100 return false;
1101}
1102
1103static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1104 int flags, void *key)
1105{
1106 struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1107 struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
1108 u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1109
1110 ctx->vbudget -= cost;
1111
1112 if (ctx->vbudget < 0)
1113 return -1;
1114
1115 iocg_commit_bio(ctx->iocg, wait->bio, cost);
1116
1117 /*
1118 * autoremove_wake_function() removes the wait entry only when it
1119 * actually changed the task state. We want the wait always
1120 * removed. Remove explicitly and use default_wake_function().
1121 */
1122 list_del_init(&wq_entry->entry);
1123 wait->committed = true;
1124
1125 default_wake_function(wq_entry, mode, flags, key);
1126 return 0;
1127}
1128
1129static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
1130{
1131 struct ioc *ioc = iocg->ioc;
1132 struct iocg_wake_ctx ctx = { .iocg = iocg };
1133 u64 margin_ns = (u64)(ioc->period_us *
1134 WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
1135 u64 vshortage, expires, oexpires;
1136
1137 lockdep_assert_held(&iocg->waitq.lock);
1138
1139 /*
1140 * Wake up the ones which are due and see how much vtime we'll need
1141 * for the next one.
1142 */
1143 current_hweight(iocg, NULL, &ctx.hw_inuse);
1144 ctx.vbudget = now->vnow - atomic64_read(&iocg->vtime);
1145 __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1146 if (!waitqueue_active(&iocg->waitq))
1147 return;
1148 if (WARN_ON_ONCE(ctx.vbudget >= 0))
1149 return;
1150
1151 /* determine next wakeup, add a quarter margin to guarantee chunking */
1152 vshortage = -ctx.vbudget;
1153 expires = now->now_ns +
1154 DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC;
1155 expires += margin_ns / 4;
1156
1157 /* if already active and close enough, don't bother */
1158 oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1159 if (hrtimer_is_queued(&iocg->waitq_timer) &&
1160 abs(oexpires - expires) <= margin_ns / 4)
1161 return;
1162
1163 hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
1164 margin_ns / 4, HRTIMER_MODE_ABS);
1165}
1166
1167static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1168{
1169 struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
1170 struct ioc_now now;
1171 unsigned long flags;
1172
1173 ioc_now(iocg->ioc, &now);
1174
1175 spin_lock_irqsave(&iocg->waitq.lock, flags);
1176 iocg_kick_waitq(iocg, &now);
1177 spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1178
1179 return HRTIMER_NORESTART;
1180}
1181
1182static void iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost)
1183{
1184 struct ioc *ioc = iocg->ioc;
1185 struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1186 u64 vtime = atomic64_read(&iocg->vtime);
1187 u64 vmargin = ioc->margin_us * now->vrate;
1188 u64 margin_ns = ioc->margin_us * NSEC_PER_USEC;
1189 u64 expires, oexpires;
1190
1191 /* clear or maintain depending on the overage */
1192 if (time_before_eq64(vtime, now->vnow)) {
1193 blkcg_clear_delay(blkg);
1194 return;
1195 }
1196 if (!atomic_read(&blkg->use_delay) &&
1197 time_before_eq64(vtime, now->vnow + vmargin))
1198 return;
1199
1200 /* use delay */
1201 if (cost) {
1202 u64 cost_ns = DIV64_U64_ROUND_UP(cost * NSEC_PER_USEC,
1203 now->vrate);
1204 blkcg_add_delay(blkg, now->now_ns, cost_ns);
1205 }
1206 blkcg_use_delay(blkg);
1207
1208 expires = now->now_ns + DIV64_U64_ROUND_UP(vtime - now->vnow,
1209 now->vrate) * NSEC_PER_USEC;
1210
1211 /* if already active and close enough, don't bother */
1212 oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
1213 if (hrtimer_is_queued(&iocg->delay_timer) &&
1214 abs(oexpires - expires) <= margin_ns / 4)
1215 return;
1216
1217 hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires),
1218 margin_ns / 4, HRTIMER_MODE_ABS);
1219}
1220
1221static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
1222{
1223 struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
1224 struct ioc_now now;
1225
1226 ioc_now(iocg->ioc, &now);
1227 iocg_kick_delay(iocg, &now, 0);
1228
1229 return HRTIMER_NORESTART;
1230}
1231
1232static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1233{
1234 u32 nr_met[2] = { };
1235 u32 nr_missed[2] = { };
1236 u64 rq_wait_ns = 0;
1237 int cpu, rw;
1238
1239 for_each_online_cpu(cpu) {
1240 struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1241 u64 this_rq_wait_ns;
1242
1243 for (rw = READ; rw <= WRITE; rw++) {
1244 u32 this_met = READ_ONCE(stat->missed[rw].nr_met);
1245 u32 this_missed = READ_ONCE(stat->missed[rw].nr_missed);
1246
1247 nr_met[rw] += this_met - stat->missed[rw].last_met;
1248 nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1249 stat->missed[rw].last_met = this_met;
1250 stat->missed[rw].last_missed = this_missed;
1251 }
1252
1253 this_rq_wait_ns = READ_ONCE(stat->rq_wait_ns);
1254 rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1255 stat->last_rq_wait_ns = this_rq_wait_ns;
1256 }
1257
1258 for (rw = READ; rw <= WRITE; rw++) {
1259 if (nr_met[rw] + nr_missed[rw])
1260 missed_ppm_ar[rw] =
1261 DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1262 nr_met[rw] + nr_missed[rw]);
1263 else
1264 missed_ppm_ar[rw] = 0;
1265 }
1266
1267 *rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1268 ioc->period_us * NSEC_PER_USEC);
1269}
1270
1271/* was iocg idle this period? */
1272static bool iocg_is_idle(struct ioc_gq *iocg)
1273{
1274 struct ioc *ioc = iocg->ioc;
1275
1276 /* did something get issued this period? */
1277 if (atomic64_read(&iocg->active_period) ==
1278 atomic64_read(&ioc->cur_period))
1279 return false;
1280
1281 /* is something in flight? */
1282 if (atomic64_read(&iocg->done_vtime) < atomic64_read(&iocg->vtime))
1283 return false;
1284
1285 return true;
1286}
1287
1288/* returns usage with margin added if surplus is large enough */
1289static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse)
1290{
1291 /* add margin */
1292 usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
1293 usage += SURPLUS_SCALE_ABS;
1294
1295 /* don't bother if the surplus is too small */
1296 if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse)
1297 return 0;
1298
1299 return usage;
1300}
1301
1302static void ioc_timer_fn(struct timer_list *timer)
1303{
1304 struct ioc *ioc = container_of(timer, struct ioc, timer);
1305 struct ioc_gq *iocg, *tiocg;
1306 struct ioc_now now;
1307 int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0;
1308 u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
1309 u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
1310 u32 missed_ppm[2], rq_wait_pct;
1311 u64 period_vtime;
1312 int i;
1313
1314 /* how were the latencies during the period? */
1315 ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
1316
1317 /* take care of active iocgs */
1318 spin_lock_irq(&ioc->lock);
1319
1320 ioc_now(ioc, &now);
1321
1322 period_vtime = now.vnow - ioc->period_at_vtime;
1323 if (WARN_ON_ONCE(!period_vtime)) {
1324 spin_unlock_irq(&ioc->lock);
1325 return;
1326 }
1327
1328 /*
1329 * Waiters determine the sleep durations based on the vrate they
1330 * saw at the time of sleep. If vrate has increased, some waiters
1331 * could be sleeping for too long. Wake up tardy waiters which
1332 * should have woken up in the last period and expire idle iocgs.
1333 */
1334 list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
1335 if (!waitqueue_active(&iocg->waitq) && !iocg_is_idle(iocg))
1336 continue;
1337
1338 spin_lock(&iocg->waitq.lock);
1339
1340 if (waitqueue_active(&iocg->waitq)) {
1341 /* might be oversleeping vtime / hweight changes, kick */
1342 iocg_kick_waitq(iocg, &now);
1343 iocg_kick_delay(iocg, &now, 0);
1344 } else if (iocg_is_idle(iocg)) {
1345 /* no waiter and idle, deactivate */
1346 iocg->last_inuse = iocg->inuse;
1347 __propagate_active_weight(iocg, 0, 0);
1348 list_del_init(&iocg->active_list);
1349 }
1350
1351 spin_unlock(&iocg->waitq.lock);
1352 }
1353 commit_active_weights(ioc);
1354
1355 /* calc usages and see whether some weights need to be moved around */
1356 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1357 u64 vdone, vtime, vusage, vmargin, vmin;
1358 u32 hw_active, hw_inuse, usage;
1359
1360 /*
1361 * Collect unused and wind vtime closer to vnow to prevent
1362 * iocgs from accumulating a large amount of budget.
1363 */
1364 vdone = atomic64_read(&iocg->done_vtime);
1365 vtime = atomic64_read(&iocg->vtime);
1366 current_hweight(iocg, &hw_active, &hw_inuse);
1367
1368 /*
1369 * Latency QoS detection doesn't account for IOs which are
1370 * in-flight for longer than a period. Detect them by
1371 * comparing vdone against period start. If lagging behind
1372 * IOs from past periods, don't increase vrate.
1373 */
1374 if (!atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
1375 time_after64(vtime, vdone) &&
1376 time_after64(vtime, now.vnow -
1377 MAX_LAGGING_PERIODS * period_vtime) &&
1378 time_before64(vdone, now.vnow - period_vtime))
1379 nr_lagging++;
1380
1381 if (waitqueue_active(&iocg->waitq))
1382 vusage = now.vnow - iocg->last_vtime;
1383 else if (time_before64(iocg->last_vtime, vtime))
1384 vusage = vtime - iocg->last_vtime;
1385 else
1386 vusage = 0;
1387
1388 iocg->last_vtime += vusage;
1389 /*
1390 * Factor in in-flight vtime into vusage to avoid
1391 * high-latency completions appearing as idle. This should
1392 * be done after the above ->last_vtime adjustment.
1393 */
1394 vusage = max(vusage, vtime - vdone);
1395
1396 /* calculate hweight based usage ratio and record */
1397 if (vusage) {
1398 usage = DIV64_U64_ROUND_UP(vusage * hw_inuse,
1399 period_vtime);
1400 iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
1401 iocg->usages[iocg->usage_idx] = usage;
1402 } else {
1403 usage = 0;
1404 }
1405
1406 /* see whether there's surplus vtime */
1407 vmargin = ioc->margin_us * now.vrate;
1408 vmin = now.vnow - vmargin;
1409
1410 iocg->has_surplus = false;
1411
1412 if (!waitqueue_active(&iocg->waitq) &&
1413 time_before64(vtime, vmin)) {
1414 u64 delta = vmin - vtime;
1415
1416 /* throw away surplus vtime */
1417 atomic64_add(delta, &iocg->vtime);
1418 atomic64_add(delta, &iocg->done_vtime);
1419 iocg->last_vtime += delta;
1420 /* if usage is sufficiently low, maybe it can donate */
1421 if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) {
1422 iocg->has_surplus = true;
1423 nr_surpluses++;
1424 }
1425 } else if (hw_inuse < hw_active) {
1426 u32 new_hwi, new_inuse;
1427
1428 /* was donating but might need to take back some */
1429 if (waitqueue_active(&iocg->waitq)) {
1430 new_hwi = hw_active;
1431 } else {
1432 new_hwi = max(hw_inuse,
1433 usage * SURPLUS_SCALE_PCT / 100 +
1434 SURPLUS_SCALE_ABS);
1435 }
1436
1437 new_inuse = div64_u64((u64)iocg->inuse * new_hwi,
1438 hw_inuse);
1439 new_inuse = clamp_t(u32, new_inuse, 1, iocg->active);
1440
1441 if (new_inuse > iocg->inuse) {
1442 TRACE_IOCG_PATH(inuse_takeback, iocg, &now,
1443 iocg->inuse, new_inuse,
1444 hw_inuse, new_hwi);
1445 __propagate_active_weight(iocg, iocg->weight,
1446 new_inuse);
1447 }
1448 } else {
1449 /* genuinely out of vtime */
1450 nr_shortages++;
1451 }
1452 }
1453
1454 if (!nr_shortages || !nr_surpluses)
1455 goto skip_surplus_transfers;
1456
1457 /* there are both shortages and surpluses, transfer surpluses */
1458 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1459 u32 usage, hw_active, hw_inuse, new_hwi, new_inuse;
1460 int nr_valid = 0;
1461
1462 if (!iocg->has_surplus)
1463 continue;
1464
1465 /* base the decision on max historical usage */
1466 for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) {
1467 if (iocg->usages[i]) {
1468 usage = max(usage, iocg->usages[i]);
1469 nr_valid++;
1470 }
1471 }
1472 if (nr_valid < MIN_VALID_USAGES)
1473 continue;
1474
1475 current_hweight(iocg, &hw_active, &hw_inuse);
1476 new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse);
1477 if (!new_hwi)
1478 continue;
1479
1480 new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi,
1481 hw_inuse);
1482 if (new_inuse < iocg->inuse) {
1483 TRACE_IOCG_PATH(inuse_giveaway, iocg, &now,
1484 iocg->inuse, new_inuse,
1485 hw_inuse, new_hwi);
1486 __propagate_active_weight(iocg, iocg->weight, new_inuse);
1487 }
1488 }
1489skip_surplus_transfers:
1490 commit_active_weights(ioc);
1491
1492 /*
1493 * If q is getting clogged or we're missing too much, we're issuing
1494 * too much IO and should lower vtime rate. If we're not missing
1495 * and experiencing shortages but not surpluses, we're too stingy
1496 * and should increase vtime rate.
1497 */
1498 if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
1499 missed_ppm[READ] > ppm_rthr ||
1500 missed_ppm[WRITE] > ppm_wthr) {
1501 ioc->busy_level = max(ioc->busy_level, 0);
1502 ioc->busy_level++;
1503 } else if (nr_lagging) {
1504 ioc->busy_level = max(ioc->busy_level, 0);
1505 } else if (nr_shortages && !nr_surpluses &&
1506 rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
1507 missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
1508 missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
1509 ioc->busy_level = min(ioc->busy_level, 0);
1510 ioc->busy_level--;
1511 } else {
1512 ioc->busy_level = 0;
1513 }
1514
1515 ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
1516
1517 if (ioc->busy_level) {
1518 u64 vrate = atomic64_read(&ioc->vtime_rate);
1519 u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
1520
1521 /* rq_wait signal is always reliable, ignore user vrate_min */
1522 if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
1523 vrate_min = VRATE_MIN;
1524
1525 /*
1526 * If vrate is out of bounds, apply clamp gradually as the
1527 * bounds can change abruptly. Otherwise, apply busy_level
1528 * based adjustment.
1529 */
1530 if (vrate < vrate_min) {
1531 vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
1532 100);
1533 vrate = min(vrate, vrate_min);
1534 } else if (vrate > vrate_max) {
1535 vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
1536 100);
1537 vrate = max(vrate, vrate_max);
1538 } else {
1539 int idx = min_t(int, abs(ioc->busy_level),
1540 ARRAY_SIZE(vrate_adj_pct) - 1);
1541 u32 adj_pct = vrate_adj_pct[idx];
1542
1543 if (ioc->busy_level > 0)
1544 adj_pct = 100 - adj_pct;
1545 else
1546 adj_pct = 100 + adj_pct;
1547
1548 vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
1549 vrate_min, vrate_max);
1550 }
1551
1552 trace_iocost_ioc_vrate_adj(ioc, vrate, &missed_ppm, rq_wait_pct,
1553 nr_lagging, nr_shortages,
1554 nr_surpluses);
1555
1556 atomic64_set(&ioc->vtime_rate, vrate);
1557 ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
1558 ioc->period_us * vrate * INUSE_MARGIN_PCT, 100);
1559 }
1560
1561 ioc_refresh_params(ioc, false);
1562
1563 /*
1564 * This period is done. Move onto the next one. If nothing's
1565 * going on with the device, stop the timer.
1566 */
1567 atomic64_inc(&ioc->cur_period);
1568
1569 if (ioc->running != IOC_STOP) {
1570 if (!list_empty(&ioc->active_iocgs)) {
1571 ioc_start_period(ioc, &now);
1572 } else {
1573 ioc->busy_level = 0;
1574 ioc->running = IOC_IDLE;
1575 }
1576 }
1577
1578 spin_unlock_irq(&ioc->lock);
1579}
1580
1581static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
1582 bool is_merge, u64 *costp)
1583{
1584 struct ioc *ioc = iocg->ioc;
1585 u64 coef_seqio, coef_randio, coef_page;
1586 u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
1587 u64 seek_pages = 0;
1588 u64 cost = 0;
1589
1590 switch (bio_op(bio)) {
1591 case REQ_OP_READ:
1592 coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO];
1593 coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO];
1594 coef_page = ioc->params.lcoefs[LCOEF_RPAGE];
1595 break;
1596 case REQ_OP_WRITE:
1597 coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO];
1598 coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO];
1599 coef_page = ioc->params.lcoefs[LCOEF_WPAGE];
1600 break;
1601 default:
1602 goto out;
1603 }
1604
1605 if (iocg->cursor) {
1606 seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
1607 seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
1608 }
1609
1610 if (!is_merge) {
1611 if (seek_pages > LCOEF_RANDIO_PAGES) {
1612 cost += coef_randio;
1613 } else {
1614 cost += coef_seqio;
1615 }
1616 }
1617 cost += pages * coef_page;
1618out:
1619 *costp = cost;
1620}
1621
1622static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
1623{
1624 u64 cost;
1625
1626 calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
1627 return cost;
1628}
1629
1630static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
1631{
1632 struct blkcg_gq *blkg = bio->bi_blkg;
1633 struct ioc *ioc = rqos_to_ioc(rqos);
1634 struct ioc_gq *iocg = blkg_to_iocg(blkg);
1635 struct ioc_now now;
1636 struct iocg_wait wait;
1637 u32 hw_active, hw_inuse;
1638 u64 abs_cost, cost, vtime;
1639
1640 /* bypass IOs if disabled or for root cgroup */
1641 if (!ioc->enabled || !iocg->level)
1642 return;
1643
1644 /* always activate so that even 0 cost IOs get protected to some level */
1645 if (!iocg_activate(iocg, &now))
1646 return;
1647
1648 /* calculate the absolute vtime cost */
1649 abs_cost = calc_vtime_cost(bio, iocg, false);
1650 if (!abs_cost)
1651 return;
1652
1653 iocg->cursor = bio_end_sector(bio);
1654
1655 vtime = atomic64_read(&iocg->vtime);
1656 current_hweight(iocg, &hw_active, &hw_inuse);
1657
1658 if (hw_inuse < hw_active &&
1659 time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) {
1660 TRACE_IOCG_PATH(inuse_reset, iocg, &now,
1661 iocg->inuse, iocg->weight, hw_inuse, hw_active);
1662 spin_lock_irq(&ioc->lock);
1663 propagate_active_weight(iocg, iocg->weight, iocg->weight);
1664 spin_unlock_irq(&ioc->lock);
1665 current_hweight(iocg, &hw_active, &hw_inuse);
1666 }
1667
1668 cost = abs_cost_to_cost(abs_cost, hw_inuse);
1669
1670 /*
1671 * If no one's waiting and within budget, issue right away. The
1672 * tests are racy but the races aren't systemic - we only miss once
1673 * in a while which is fine.
1674 */
1675 if (!waitqueue_active(&iocg->waitq) &&
1676 time_before_eq64(vtime + cost, now.vnow)) {
1677 iocg_commit_bio(iocg, bio, cost);
1678 return;
1679 }
1680
1681 if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
1682 iocg_commit_bio(iocg, bio, cost);
1683 iocg_kick_delay(iocg, &now, cost);
1684 return;
1685 }
1686
1687 /*
1688 * Append self to the waitq and schedule the wakeup timer if we're
1689 * the first waiter. The timer duration is calculated based on the
1690 * current vrate. vtime and hweight changes can make it too short
1691 * or too long. Each wait entry records the absolute cost it's
1692 * waiting for to allow re-evaluation using a custom wait entry.
1693 *
1694 * If too short, the timer simply reschedules itself. If too long,
1695 * the period timer will notice and trigger wakeups.
1696 *
1697 * All waiters are on iocg->waitq and the wait states are
1698 * synchronized using waitq.lock.
1699 */
1700 spin_lock_irq(&iocg->waitq.lock);
1701
1702 /*
1703 * We activated above but w/o any synchronization. Deactivation is
1704 * synchronized with waitq.lock and we won't get deactivated as
1705 * long as we're waiting, so we're good if we're activated here.
1706 * In the unlikely case that we are deactivated, just issue the IO.
1707 */
1708 if (unlikely(list_empty(&iocg->active_list))) {
1709 spin_unlock_irq(&iocg->waitq.lock);
1710 iocg_commit_bio(iocg, bio, cost);
1711 return;
1712 }
1713
1714 init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
1715 wait.wait.private = current;
1716 wait.bio = bio;
1717 wait.abs_cost = abs_cost;
1718 wait.committed = false; /* will be set true by waker */
1719
1720 __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
1721 iocg_kick_waitq(iocg, &now);
1722
1723 spin_unlock_irq(&iocg->waitq.lock);
1724
1725 while (true) {
1726 set_current_state(TASK_UNINTERRUPTIBLE);
1727 if (wait.committed)
1728 break;
1729 io_schedule();
1730 }
1731
1732 /* waker already committed us, proceed */
1733 finish_wait(&iocg->waitq, &wait.wait);
1734}
1735
1736static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
1737 struct bio *bio)
1738{
1739 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1740 sector_t bio_end = bio_end_sector(bio);
1741 u32 hw_inuse;
1742 u64 abs_cost, cost;
1743
1744 /* add iff the existing request has cost assigned */
1745 if (!rq->bio || !rq->bio->bi_iocost_cost)
1746 return;
1747
1748 abs_cost = calc_vtime_cost(bio, iocg, true);
1749 if (!abs_cost)
1750 return;
1751
1752 /* update cursor if backmerging into the request at the cursor */
1753 if (blk_rq_pos(rq) < bio_end &&
1754 blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
1755 iocg->cursor = bio_end;
1756
1757 current_hweight(iocg, NULL, &hw_inuse);
1758 cost = div64_u64(abs_cost * HWEIGHT_WHOLE, hw_inuse);
1759 bio->bi_iocost_cost = cost;
1760
1761 atomic64_add(cost, &iocg->vtime);
1762}
1763
1764static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
1765{
1766 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1767
1768 if (iocg && bio->bi_iocost_cost)
1769 atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
1770}
1771
1772static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
1773{
1774 struct ioc *ioc = rqos_to_ioc(rqos);
1775 u64 on_q_ns, rq_wait_ns;
1776 int pidx, rw;
1777
1778 if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
1779 return;
1780
1781 switch (req_op(rq) & REQ_OP_MASK) {
1782 case REQ_OP_READ:
1783 pidx = QOS_RLAT;
1784 rw = READ;
1785 break;
1786 case REQ_OP_WRITE:
1787 pidx = QOS_WLAT;
1788 rw = WRITE;
1789 break;
1790 default:
1791 return;
1792 }
1793
1794 on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
1795 rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
1796
1797 if (on_q_ns <= ioc->params.qos[pidx] * NSEC_PER_USEC)
1798 this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met);
1799 else
1800 this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed);
1801
1802 this_cpu_add(ioc->pcpu_stat->rq_wait_ns, rq_wait_ns);
1803}
1804
1805static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
1806{
1807 struct ioc *ioc = rqos_to_ioc(rqos);
1808
1809 spin_lock_irq(&ioc->lock);
1810 ioc_refresh_params(ioc, false);
1811 spin_unlock_irq(&ioc->lock);
1812}
1813
1814static void ioc_rqos_exit(struct rq_qos *rqos)
1815{
1816 struct ioc *ioc = rqos_to_ioc(rqos);
1817
1818 blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
1819
1820 spin_lock_irq(&ioc->lock);
1821 ioc->running = IOC_STOP;
1822 spin_unlock_irq(&ioc->lock);
1823
1824 del_timer_sync(&ioc->timer);
1825 free_percpu(ioc->pcpu_stat);
1826 kfree(ioc);
1827}
1828
1829static struct rq_qos_ops ioc_rqos_ops = {
1830 .throttle = ioc_rqos_throttle,
1831 .merge = ioc_rqos_merge,
1832 .done_bio = ioc_rqos_done_bio,
1833 .done = ioc_rqos_done,
1834 .queue_depth_changed = ioc_rqos_queue_depth_changed,
1835 .exit = ioc_rqos_exit,
1836};
1837
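/*
 * Set up iocost for @q - allocate the ioc and per-cpu stats, register the
 * rq_qos ops and activate the blkcg policy.  The controller starts out
 * IOC_IDLE with vtime_rate at VTIME_PER_USEC, i.e. device vtime advancing
 * at wall-clock speed; the period timer adjusts the rate once IOs start
 * flowing.
 */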
1838static int blk_iocost_init(struct request_queue *q)
1839{
1840 struct ioc *ioc;
1841 struct rq_qos *rqos;
1842 int ret;
1843
1844 ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
1845 if (!ioc)
1846 return -ENOMEM;
1847
1848 ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
1849 if (!ioc->pcpu_stat) {
1850 kfree(ioc);
1851 return -ENOMEM;
1852 }
1853
1854 rqos = &ioc->rqos;
1855 rqos->id = RQ_QOS_COST;
1856 rqos->ops = &ioc_rqos_ops;
1857 rqos->q = q;
1858
1859 spin_lock_init(&ioc->lock);
1860 timer_setup(&ioc->timer, ioc_timer_fn, 0);
1861 INIT_LIST_HEAD(&ioc->active_iocgs);
1862
1863 ioc->running = IOC_IDLE;
1864 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
1865 seqcount_init(&ioc->period_seqcount);
1866 ioc->period_at = ktime_to_us(ktime_get());
1867 atomic64_set(&ioc->cur_period, 0);
1868 atomic_set(&ioc->hweight_gen, 0);
1869
1870 spin_lock_irq(&ioc->lock);
1871 ioc->autop_idx = AUTOP_INVALID;
1872 ioc_refresh_params(ioc, true);
1873 spin_unlock_irq(&ioc->lock);
1874
1875 rq_qos_add(q, rqos);
1876 ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
1877 if (ret) {
1878 rq_qos_del(q, rqos);
1879		free_percpu(ioc->pcpu_stat);
1880		kfree(ioc);
1881 return ret;
1882 }
1883 return 0;
1884}
1885
1886static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
1887{
1888 struct ioc_cgrp *iocc;
1889
1890 iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
1891	if (!iocc)
1892		return NULL;
1893
1894	iocc->dfl_weight = CGROUP_WEIGHT_DFL;
1895	return &iocc->cpd;
1896}
1897
1898static void ioc_cpd_free(struct blkcg_policy_data *cpd)
1899{
1900 kfree(container_of(cpd, struct ioc_cgrp, cpd));
1901}
1902
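/*
 * Per-(cgroup, queue) state.  Each iocg embeds a flexible array of
 * ancestor pointers, one slot per level from the root down to itself
 * (a cgroup at level 2 needs 3 slots); ioc_pd_init() fills it by
 * walking blkg->parent.
 */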
1903static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
1904 struct blkcg *blkcg)
1905{
1906 int levels = blkcg->css.cgroup->level + 1;
1907 struct ioc_gq *iocg;
1908
1909 iocg = kzalloc_node(sizeof(*iocg) + levels * sizeof(iocg->ancestors[0]),
1910 gfp, q->node);
1911 if (!iocg)
1912 return NULL;
1913
1914 return &iocg->pd;
1915}
1916
1917static void ioc_pd_init(struct blkg_policy_data *pd)
1918{
1919 struct ioc_gq *iocg = pd_to_iocg(pd);
1920 struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
1921 struct ioc *ioc = q_to_ioc(blkg->q);
1922 struct ioc_now now;
1923 struct blkcg_gq *tblkg;
1924 unsigned long flags;
1925
1926 ioc_now(ioc, &now);
1927
1928 iocg->ioc = ioc;
1929 atomic64_set(&iocg->vtime, now.vnow);
1930 atomic64_set(&iocg->done_vtime, now.vnow);
1931 atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
1932 INIT_LIST_HEAD(&iocg->active_list);
1933 iocg->hweight_active = HWEIGHT_WHOLE;
1934 iocg->hweight_inuse = HWEIGHT_WHOLE;
1935
1936 init_waitqueue_head(&iocg->waitq);
1937 hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1938 iocg->waitq_timer.function = iocg_waitq_timer_fn;
1939 hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1940 iocg->delay_timer.function = iocg_delay_timer_fn;
1941
1942 iocg->level = blkg->blkcg->css.cgroup->level;
1943
1944 for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
1945 struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
1946 iocg->ancestors[tiocg->level] = tiocg;
1947 }
1948
1949 spin_lock_irqsave(&ioc->lock, flags);
1950 weight_updated(iocg);
1951 spin_unlock_irqrestore(&ioc->lock, flags);
1952}
1953
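/*
 * The cgroup is going away.  Take it off the active list, zeroing its
 * contribution to the hierarchical weights via propagate_active_weight(),
 * and cancel any pending waitq/delay timers before freeing.
 */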
1954static void ioc_pd_free(struct blkg_policy_data *pd)
1955{
1956 struct ioc_gq *iocg = pd_to_iocg(pd);
1957 struct ioc *ioc = iocg->ioc;
1958
1959 if (ioc) {
1960		spin_lock(&ioc->lock);
1961 if (!list_empty(&iocg->active_list)) {
1962 propagate_active_weight(iocg, 0, 0);
1963 list_del_init(&iocg->active_list);
1964 }
1965 spin_unlock(&ioc->lock);
1966
1967 hrtimer_cancel(&iocg->waitq_timer);
1968 hrtimer_cancel(&iocg->delay_timer);
1969	}
1970 kfree(iocg);
1971}
1972
1973static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
1974 int off)
1975{
1976 const char *dname = blkg_dev_name(pd->blkg);
1977 struct ioc_gq *iocg = pd_to_iocg(pd);
1978
1979 if (dname && iocg->cfg_weight)
1980 seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight);
1981 return 0;
1982}
1983
1984
1985static int ioc_weight_show(struct seq_file *sf, void *v)
1986{
1987 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
1988 struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
1989
1990 seq_printf(sf, "default %u\n", iocc->dfl_weight);
1991 blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
1992 &blkcg_policy_iocost, seq_cft(sf)->private, false);
1993 return 0;
1994}
1995
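/*
 * io.weight accepts either a cgroup-wide default or a per-device
 * override.  Illustrative writes ("8:16" is a made-up MAJ:MIN):
 *
 *   echo "default 100" > io.weight
 *   echo "8:16 200" > io.weight
 *   echo "8:16 default" > io.weight	- drop the per-device override
 *
 * Weights must fall between CGROUP_WEIGHT_MIN and CGROUP_WEIGHT_MAX.
 */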
1996static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
1997 size_t nbytes, loff_t off)
1998{
1999 struct blkcg *blkcg = css_to_blkcg(of_css(of));
2000 struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2001 struct blkg_conf_ctx ctx;
2002 struct ioc_gq *iocg;
2003 u32 v;
2004 int ret;
2005
2006 if (!strchr(buf, ':')) {
2007 struct blkcg_gq *blkg;
2008
2009 if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
2010 return -EINVAL;
2011
2012 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2013 return -EINVAL;
2014
2015 spin_lock(&blkcg->lock);
2016 iocc->dfl_weight = v;
2017 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
2018 struct ioc_gq *iocg = blkg_to_iocg(blkg);
2019
2020 if (iocg) {
2021 spin_lock_irq(&iocg->ioc->lock);
2022 weight_updated(iocg);
2023 spin_unlock_irq(&iocg->ioc->lock);
2024 }
2025 }
2026 spin_unlock(&blkcg->lock);
2027
2028 return nbytes;
2029 }
2030
2031 ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
2032 if (ret)
2033 return ret;
2034
2035 iocg = blkg_to_iocg(ctx.blkg);
2036
2037 if (!strncmp(ctx.body, "default", 7)) {
2038 v = 0;
2039 } else {
2040 if (!sscanf(ctx.body, "%u", &v))
2041 goto einval;
2042 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2043 goto einval;
2044 }
2045
2046 spin_lock_irq(&iocg->ioc->lock);
2047 iocg->cfg_weight = v;
2048 weight_updated(iocg);
2049 spin_unlock_irq(&iocg->ioc->lock);
2050
2051 blkg_conf_finish(&ctx);
2052 return nbytes;
2053
2054einval:
2055 blkg_conf_finish(&ctx);
2056 return -EINVAL;
2057}
2058
2059static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2060 int off)
2061{
2062 const char *dname = blkg_dev_name(pd->blkg);
2063 struct ioc *ioc = pd_to_iocg(pd)->ioc;
2064
2065 if (!dname)
2066 return 0;
2067
2068 seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
2069 dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
2070 ioc->params.qos[QOS_RPPM] / 10000,
2071 ioc->params.qos[QOS_RPPM] % 10000 / 100,
2072 ioc->params.qos[QOS_RLAT],
2073 ioc->params.qos[QOS_WPPM] / 10000,
2074 ioc->params.qos[QOS_WPPM] % 10000 / 100,
2075 ioc->params.qos[QOS_WLAT],
2076 ioc->params.qos[QOS_MIN] / 10000,
2077 ioc->params.qos[QOS_MIN] % 10000 / 100,
2078 ioc->params.qos[QOS_MAX] / 10000,
2079 ioc->params.qos[QOS_MAX] % 10000 / 100);
2080 return 0;
2081}
2082
2083static int ioc_qos_show(struct seq_file *sf, void *v)
2084{
2085 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2086
2087 blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
2088 &blkcg_policy_iocost, seq_cft(sf)->private, false);
2089 return 0;
2090}
2091
2092static const match_table_t qos_ctrl_tokens = {
2093 { QOS_ENABLE, "enable=%u" },
2094 { QOS_CTRL, "ctrl=%s" },
2095 { NR_QOS_CTRL_PARAMS, NULL },
2096};
2097
2098static const match_table_t qos_tokens = {
2099 { QOS_RPPM, "rpct=%s" },
2100 { QOS_RLAT, "rlat=%u" },
2101 { QOS_WPPM, "wpct=%s" },
2102 { QOS_WLAT, "wlat=%u" },
2103 { QOS_MIN, "min=%s" },
2104 { QOS_MAX, "max=%s" },
2105 { NR_QOS_PARAMS, NULL },
2106};
2107
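/*
 * Illustrative io.cost.qos write ("8:16" is a made-up MAJ:MIN and the
 * values are arbitrary):
 *
 *   echo "8:16 enable=1 ctrl=user rpct=95.00 rlat=5000 wpct=95.00 \
 *         wlat=10000 min=50.00 max=150.00" > io.cost.qos
 *
 * rlat/wlat are in microseconds.  rpct/wpct/min/max take up to two
 * decimal places and are stored internally as parts per million.
 */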
2108static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
2109 size_t nbytes, loff_t off)
2110{
2111 struct gendisk *disk;
2112 struct ioc *ioc;
2113 u32 qos[NR_QOS_PARAMS];
2114 bool enable, user;
2115 char *p;
2116 int ret;
2117
2118 disk = blkcg_conf_get_disk(&input);
2119 if (IS_ERR(disk))
2120 return PTR_ERR(disk);
2121
2122 ioc = q_to_ioc(disk->queue);
2123 if (!ioc) {
2124 ret = blk_iocost_init(disk->queue);
2125 if (ret)
2126 goto err;
2127 ioc = q_to_ioc(disk->queue);
2128 }
2129
2130 spin_lock_irq(&ioc->lock);
2131 memcpy(qos, ioc->params.qos, sizeof(qos));
2132 enable = ioc->enabled;
2133 user = ioc->user_qos_params;
2134 spin_unlock_irq(&ioc->lock);
2135
2136 while ((p = strsep(&input, " \t\n"))) {
2137 substring_t args[MAX_OPT_ARGS];
2138 char buf[32];
2139 int tok;
2140 s64 v;
2141
2142 if (!*p)
2143 continue;
2144
2145 switch (match_token(p, qos_ctrl_tokens, args)) {
2146 case QOS_ENABLE:
2147 match_u64(&args[0], &v);
2148 enable = v;
2149 continue;
2150 case QOS_CTRL:
2151 match_strlcpy(buf, &args[0], sizeof(buf));
2152 if (!strcmp(buf, "auto"))
2153 user = false;
2154 else if (!strcmp(buf, "user"))
2155 user = true;
2156 else
2157 goto einval;
2158 continue;
2159 }
2160
2161 tok = match_token(p, qos_tokens, args);
2162 switch (tok) {
2163 case QOS_RPPM:
2164 case QOS_WPPM:
2165 if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2166 sizeof(buf))
2167 goto einval;
2168 if (cgroup_parse_float(buf, 2, &v))
2169 goto einval;
2170 if (v < 0 || v > 10000)
2171 goto einval;
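			/* two-decimal percentage -> ppm, e.g. "95.00" becomes 950000 */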
2172 qos[tok] = v * 100;
2173 break;
2174 case QOS_RLAT:
2175 case QOS_WLAT:
2176 if (match_u64(&args[0], &v))
2177 goto einval;
2178 qos[tok] = v;
2179 break;
2180 case QOS_MIN:
2181 case QOS_MAX:
2182 if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2183 sizeof(buf))
2184 goto einval;
2185 if (cgroup_parse_float(buf, 2, &v))
2186 goto einval;
2187 if (v < 0)
2188 goto einval;
2189 qos[tok] = clamp_t(s64, v * 100,
2190 VRATE_MIN_PPM, VRATE_MAX_PPM);
2191 break;
2192 default:
2193 goto einval;
2194 }
2195 user = true;
2196 }
2197
2198 if (qos[QOS_MIN] > qos[QOS_MAX])
2199 goto einval;
2200
2201 spin_lock_irq(&ioc->lock);
2202
2203 if (enable) {
2204 blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2205 ioc->enabled = true;
2206 } else {
2207 blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2208 ioc->enabled = false;
2209 }
2210
2211 if (user) {
2212 memcpy(ioc->params.qos, qos, sizeof(qos));
2213 ioc->user_qos_params = true;
2214 } else {
2215 ioc->user_qos_params = false;
2216 }
2217
2218 ioc_refresh_params(ioc, true);
2219 spin_unlock_irq(&ioc->lock);
2220
2221 put_disk_and_module(disk);
2222 return nbytes;
2223einval:
2224 ret = -EINVAL;
2225err:
2226 put_disk_and_module(disk);
2227 return ret;
2228}
2229
2230static u64 ioc_cost_model_prfill(struct seq_file *sf,
2231 struct blkg_policy_data *pd, int off)
2232{
2233 const char *dname = blkg_dev_name(pd->blkg);
2234 struct ioc *ioc = pd_to_iocg(pd)->ioc;
2235 u64 *u = ioc->params.i_lcoefs;
2236
2237 if (!dname)
2238 return 0;
2239
2240 seq_printf(sf, "%s ctrl=%s model=linear "
2241 "rbps=%llu rseqiops=%llu rrandiops=%llu "
2242 "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
2243 dname, ioc->user_cost_model ? "user" : "auto",
2244 u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
2245 u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
2246 return 0;
2247}
2248
2249static int ioc_cost_model_show(struct seq_file *sf, void *v)
2250{
2251 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2252
2253 blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
2254 &blkcg_policy_iocost, seq_cft(sf)->private, false);
2255 return 0;
2256}
2257
2258static const match_table_t cost_ctrl_tokens = {
2259 { COST_CTRL, "ctrl=%s" },
2260 { COST_MODEL, "model=%s" },
2261 { NR_COST_CTRL_PARAMS, NULL },
2262};
2263
2264static const match_table_t i_lcoef_tokens = {
2265 { I_LCOEF_RBPS, "rbps=%u" },
2266 { I_LCOEF_RSEQIOPS, "rseqiops=%u" },
2267 { I_LCOEF_RRANDIOPS, "rrandiops=%u" },
2268 { I_LCOEF_WBPS, "wbps=%u" },
2269 { I_LCOEF_WSEQIOPS, "wseqiops=%u" },
2270 { I_LCOEF_WRANDIOPS, "wrandiops=%u" },
2271 { NR_I_LCOEFS, NULL },
2272};
2273
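/*
 * Illustrative io.cost.model write ("8:16" and the coefficients are
 * made-up values):
 *
 *   echo "8:16 ctrl=user model=linear rbps=2000000000 rseqiops=100000 \
 *         rrandiops=80000 wbps=1000000000 wseqiops=90000 wrandiops=70000" \
 *        > io.cost.model
 *
 * Only the builtin "linear" model is accepted; coefficients that aren't
 * specified keep their current values.
 */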
2274static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
2275 size_t nbytes, loff_t off)
2276{
2277 struct gendisk *disk;
2278 struct ioc *ioc;
2279 u64 u[NR_I_LCOEFS];
2280 bool user;
2281 char *p;
2282 int ret;
2283
2284 disk = blkcg_conf_get_disk(&input);
2285 if (IS_ERR(disk))
2286 return PTR_ERR(disk);
2287
2288 ioc = q_to_ioc(disk->queue);
2289 if (!ioc) {
2290 ret = blk_iocost_init(disk->queue);
2291 if (ret)
2292 goto err;
2293 ioc = q_to_ioc(disk->queue);
2294 }
2295
2296 spin_lock_irq(&ioc->lock);
2297 memcpy(u, ioc->params.i_lcoefs, sizeof(u));
2298 user = ioc->user_cost_model;
2299 spin_unlock_irq(&ioc->lock);
2300
2301 while ((p = strsep(&input, " \t\n"))) {
2302 substring_t args[MAX_OPT_ARGS];
2303 char buf[32];
2304 int tok;
2305 u64 v;
2306
2307 if (!*p)
2308 continue;
2309
2310 switch (match_token(p, cost_ctrl_tokens, args)) {
2311 case COST_CTRL:
2312 match_strlcpy(buf, &args[0], sizeof(buf));
2313 if (!strcmp(buf, "auto"))
2314 user = false;
2315 else if (!strcmp(buf, "user"))
2316 user = true;
2317 else
2318 goto einval;
2319 continue;
2320 case COST_MODEL:
2321 match_strlcpy(buf, &args[0], sizeof(buf));
2322 if (strcmp(buf, "linear"))
2323 goto einval;
2324 continue;
2325 }
2326
2327 tok = match_token(p, i_lcoef_tokens, args);
2328 if (tok == NR_I_LCOEFS)
2329 goto einval;
2330 if (match_u64(&args[0], &v))
2331 goto einval;
2332 u[tok] = v;
2333 user = true;
2334 }
2335
2336 spin_lock_irq(&ioc->lock);
2337 if (user) {
2338 memcpy(ioc->params.i_lcoefs, u, sizeof(u));
2339 ioc->user_cost_model = true;
2340 } else {
2341 ioc->user_cost_model = false;
2342 }
2343 ioc_refresh_params(ioc, true);
2344 spin_unlock_irq(&ioc->lock);
2345
2346 put_disk_and_module(disk);
2347 return nbytes;
2348
2349einval:
2350 ret = -EINVAL;
2351err:
2352 put_disk_and_module(disk);
2353 return ret;
2354}
2355
2356static struct cftype ioc_files[] = {
2357 {
2358 .name = "weight",
2359 .flags = CFTYPE_NOT_ON_ROOT,
2360 .seq_show = ioc_weight_show,
2361 .write = ioc_weight_write,
2362 },
2363 {
2364 .name = "cost.qos",
2365 .flags = CFTYPE_ONLY_ON_ROOT,
2366 .seq_show = ioc_qos_show,
2367 .write = ioc_qos_write,
2368 },
2369 {
2370 .name = "cost.model",
2371 .flags = CFTYPE_ONLY_ON_ROOT,
2372 .seq_show = ioc_cost_model_show,
2373 .write = ioc_cost_model_write,
2374 },
2375 {}
2376};
2377
2378static struct blkcg_policy blkcg_policy_iocost = {
2379 .dfl_cftypes = ioc_files,
2380 .cpd_alloc_fn = ioc_cpd_alloc,
2381 .cpd_free_fn = ioc_cpd_free,
2382 .pd_alloc_fn = ioc_pd_alloc,
2383 .pd_init_fn = ioc_pd_init,
2384 .pd_free_fn = ioc_pd_free,
2385};
2386
2387static int __init ioc_init(void)
2388{
2389 return blkcg_policy_register(&blkcg_policy_iocost);
2390}
2391
2392static void __exit ioc_exit(void)
2393{
2394 return blkcg_policy_unregister(&blkcg_policy_iocost);
2395}
2396
2397module_init(ioc_init);
2398module_exit(ioc_exit);