1/* SPDX-License-Identifier: GPL-2.0
2 *
3 * IO cost model based controller.
4 *
5 * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
6 * Copyright (C) 2019 Andy Newell <newella@fb.com>
7 * Copyright (C) 2019 Facebook
8 *
 9 * One challenge of controlling IO resources is the lack of a trivially
10 * observable cost metric. This is distinguished from CPU and memory where
11 * wallclock time and the number of bytes can serve as accurate enough
12 * approximations.
13 *
14 * Bandwidth and iops are the most commonly used metrics for IO devices but
15 * depending on the type and specifics of the device, different IO patterns
16 * easily lead to multiple orders of magnitude variations rendering them
17 * useless for the purpose of IO capacity distribution. While on-device
 18 * time, with a lot of crutches, could serve as a useful approximation for
19 * non-queued rotational devices, this is no longer viable with modern
20 * devices, even the rotational ones.
21 *
22 * While there is no cost metric we can trivially observe, it isn't a
23 * complete mystery. For example, on a rotational device, seek cost
24 * dominates while a contiguous transfer contributes a smaller amount
25 * proportional to the size. If we can characterize at least the relative
26 * costs of these different types of IOs, it should be possible to
27 * implement a reasonable work-conserving proportional IO resource
28 * distribution.
29 *
30 * 1. IO Cost Model
31 *
32 * IO cost model estimates the cost of an IO given its basic parameters and
33 * history (e.g. the end sector of the last IO). The cost is measured in
34 * device time. If a given IO is estimated to cost 10ms, the device should
35 * be able to process ~100 of those IOs in a second.
36 *
37 * Currently, there's only one builtin cost model - linear. Each IO is
38 * classified as sequential or random and given a base cost accordingly.
39 * On top of that, a size cost proportional to the length of the IO is
40 * added. While simple, this model captures the operational
 41 * characteristics of a wide variety of devices well enough. Default
 42 * parameters for several different classes of devices are provided and the
43 * parameters can be configured from userspace via
44 * /sys/fs/cgroup/io.cost.model.
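 45 *
 45 * As a rough illustration (not a definitive reference), for a fresh,
 45 * non-merged IO the linear model charges
 45 *
 45 *	cost = (seq or rand base cost) + nr_4k_pages * per-page cost
 45 *
 45 * with every term expressed in vtime units (see VTIME_PER_SEC below).
 45 * A hypothetical io.cost.model configuration, reusing the builtin HDD
 45 * coefficients from this file for an example 8:16 device, might look
 45 * like the following (a single line in practice):
 45 *
 45 *	8:16 ctrl=user model=linear rbps=174019176 rseqiops=41708
 45 *	rrandiops=370 wbps=178075866 wseqiops=42705 wrandiops=378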
45 *
46 * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
47 * device-specific coefficients.
48 *
49 * 2. Control Strategy
50 *
51 * The device virtual time (vtime) is used as the primary control metric.
52 * The control strategy is composed of the following three parts.
53 *
54 * 2-1. Vtime Distribution
55 *
56 * When a cgroup becomes active in terms of IOs, its hierarchical share is
57 * calculated. Please consider the following hierarchy where the numbers
58 * inside parentheses denote the configured weights.
59 *
60 * root
61 * / \
62 * A (w:100) B (w:300)
63 * / \
64 * A0 (w:100) A1 (w:100)
65 *
66 * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
67 * of equal weight, each gets 50% share. If then B starts issuing IOs, B
 68 * gets 300/(100+300) or 75% share, and A0 and A1 equally split the rest,
69 * 12.5% each. The distribution mechanism only cares about these flattened
70 * shares. They're called hweights (hierarchical weights) and always add
 71 * up to 1 (HWEIGHT_WHOLE).
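 71 *
 71 * Concretely, an iocg's hweight is the product of its share at each
 71 * level of the hierarchy (see current_hweight() below):
 71 *
 71 *	hweight = prod(active weight / sum of active sibling weights)
 71 *
 71 * With B active, A0's hweight is 100/(100+100) * 100/(100+300) = 1/8,
 71 * i.e. HWEIGHT_WHOLE / 8 = 8192.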
72 *
73 * A given cgroup's vtime runs slower in inverse proportion to its hweight.
74 * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
 75 * than the device vtime - an IO which takes 10ms on the underlying
76 * device is considered to take 80ms on A0.
77 *
78 * This constitutes the basis of IO capacity distribution. Each cgroup's
79 * vtime is running at a rate determined by its hweight. A cgroup tracks
80 * the vtime consumed by past IOs and can issue a new IO iff doing so
81 * wouldn't outrun the current device vtime. Otherwise, the IO is
82 * suspended until the vtime has progressed enough to cover it.
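 82 *
 82 * For example, an active cgroup whose vtime currently trails the device
 82 * vtime by 30ms worth of vtime can immediately issue IOs whose total
 82 * cost adds up to 30ms; anything beyond that waits until the device
 82 * vtime catches up.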
83 *
84 * 2-2. Vrate Adjustment
85 *
86 * It's unrealistic to expect the cost model to be perfect. There are too
87 * many devices and even on the same device the overall performance
88 * fluctuates depending on numerous factors such as IO mixture and device
89 * internal garbage collection. The controller needs to adapt dynamically.
90 *
91 * This is achieved by adjusting the overall IO rate according to how busy
92 * the device is. If the device becomes overloaded, we're sending down too
93 * many IOs and should generally slow down. If there are waiting issuers
94 * but the device isn't saturated, we're issuing too few and should
95 * generally speed up.
96 *
97 * To slow down, we lower the vrate - the rate at which the device vtime
98 * passes compared to the wall clock. For example, if the vtime is running
99 * at the vrate of 75%, all cgroups added up would only be able to issue
100 * 750ms worth of IOs per second, and vice-versa for speeding up.
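 100 *
 100 * Within a period, the current device vtime is simply
 100 *
 100 *	vnow = vtime at period start + (wallclock since start) * vrate
 100 *
 100 * which is what ioc_now() below computes.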
101 *
 102 * Device busyness is determined using two criteria - rq wait and
103 * completion latencies.
104 *
105 * When a device gets saturated, the on-device and then the request queues
106 * fill up and a bio which is ready to be issued has to wait for a request
107 * to become available. When this delay becomes noticeable, it's a clear
108 * indication that the device is saturated and we lower the vrate. This
109 * saturation signal is fairly conservative as it only triggers when both
110 * hardware and software queues are filled up, and is used as the default
111 * busy signal.
112 *
113 * As devices can have deep queues and be unfair in how the queued commands
 114 * are executed, solely depending on rq wait may not result in satisfactory
115 * control quality. For a better control quality, completion latency QoS
116 * parameters can be configured so that the device is considered saturated
117 * if N'th percentile completion latency rises above the set point.
118 *
119 * The completion latency requirements are a function of both the
120 * underlying device characteristics and the desired IO latency quality of
121 * service. There is an inherent trade-off - the tighter the latency QoS,
 122 * the higher the bandwidth loss. Latency QoS is disabled by default
123 * and can be set through /sys/fs/cgroup/io.cost.qos.
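 123 *
 123 * As a purely illustrative example (key names follow the cgroup2
 123 * documentation, values are made up), a QoS configuration such as
 123 *
 123 *	8:16 enable=1 ctrl=user rpct=95.00 rlat=10000 wpct=95.00
 123 *	wlat=50000 min=50.00 max=150.00
 123 *
 123 * would treat the device as saturated when the 95th percentile read
 123 * completion latency exceeds 10ms or the 95th percentile write latency
 123 * exceeds 50ms, while keeping vrate between 50% and 150%.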
124 *
125 * 2-3. Work Conservation
126 *
127 * Imagine two cgroups A and B with equal weights. A is issuing a small IO
128 * periodically while B is sending out enough parallel IOs to saturate the
129 * device on its own. Let's say A's usage amounts to 100ms worth of IO
130 * cost per second, i.e., 10% of the device capacity. The naive
131 * distribution of half and half would lead to 60% utilization of the
132 * device, a significant reduction in the total amount of work done
133 * compared to free-for-all competition. This is too high a cost to pay
134 * for IO control.
135 *
136 * To conserve the total amount of work done, we keep track of how much
137 * each active cgroup is actually using and yield part of its weight if
138 * there are other cgroups which can make use of it. In the above case,
139 * A's weight will be lowered so that it hovers above the actual usage and
140 * B would be able to use the rest.
141 *
142 * As we don't want to penalize a cgroup for donating its weight, the
143 * surplus weight adjustment factors in a margin and has an immediate
144 * snapback mechanism in case the cgroup needs more IO vtime for itself.
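 144 *
 144 * With the constants used below, a cgroup only donates if
 144 * usage * 125% + 2% is still at least 3% below its current
 144 * hweight_inuse. For example, a cgroup using 10% of the device while
 144 * holding a 50% share would have its inuse share lowered to roughly
 144 * 14.5%, leaving the remainder for others to consume.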
145 *
146 * Note that adjusting down surplus weights has the same effects as
147 * accelerating vtime for other cgroups and work conservation can also be
 148 * implemented by adjusting vrate dynamically. However, determining who
 149 * can donate and who should take back how much requires hweight
 150 * propagation anyway, which makes it easier to implement and understand
 151 * as a separate mechanism.
152 */
153
154#include <linux/kernel.h>
155#include <linux/module.h>
156#include <linux/timer.h>
157#include <linux/time64.h>
158#include <linux/parser.h>
159#include <linux/sched/signal.h>
160#include <linux/blk-cgroup.h>
161#include "blk-rq-qos.h"
162#include "blk-stat.h"
163#include "blk-wbt.h"
164
165#ifdef CONFIG_TRACEPOINTS
166
167/* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
168#define TRACE_IOCG_PATH_LEN 1024
169static DEFINE_SPINLOCK(trace_iocg_path_lock);
170static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
171
172#define TRACE_IOCG_PATH(type, iocg, ...) \
173 do { \
174 unsigned long flags; \
175 if (trace_iocost_##type##_enabled()) { \
176 spin_lock_irqsave(&trace_iocg_path_lock, flags); \
177 cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \
178 trace_iocg_path, TRACE_IOCG_PATH_LEN); \
179 trace_iocost_##type(iocg, trace_iocg_path, \
180 ##__VA_ARGS__); \
181 spin_unlock_irqrestore(&trace_iocg_path_lock, flags); \
182 } \
183 } while (0)
184
 185#else /* CONFIG_TRACEPOINTS */
186#define TRACE_IOCG_PATH(type, iocg, ...) do { } while (0)
 187#endif /* CONFIG_TRACEPOINTS */
188
189enum {
190 MILLION = 1000000,
191
192 /* timer period is calculated from latency requirements, bound it */
193 MIN_PERIOD = USEC_PER_MSEC,
194 MAX_PERIOD = USEC_PER_SEC,
195
196 /*
197 * A cgroup's vtime can run 50% behind the device vtime, which
198 * serves as its IO credit buffer. Surplus weight adjustment is
199 * immediately canceled if the vtime margin runs below 10%.
200 */
201 MARGIN_PCT = 50,
202 INUSE_MARGIN_PCT = 10,
203
204 /* Have some play in waitq timer operations */
205 WAITQ_TIMER_MARGIN_PCT = 5,
206
207 /*
208 * vtime can wrap well within a reasonable uptime when vrate is
209 * consistently raised. Don't trust recorded cgroup vtime if the
210 * period counter indicates that it's older than 5mins.
211 */
212 VTIME_VALID_DUR = 300 * USEC_PER_SEC,
213
214 /*
215 * Remember the past three non-zero usages and use the max for
216 * surplus calculation. Three slots guarantee that we remember one
217 * full period usage from the last active stretch even after
218 * partial deactivation and re-activation periods. Don't start
219 * giving away weight before collecting two data points to prevent
220 * hweight adjustments based on one partial activation period.
221 */
222 NR_USAGE_SLOTS = 3,
223 MIN_VALID_USAGES = 2,
224
225 /* 1/64k is granular enough and can easily be handled w/ u32 */
226 HWEIGHT_WHOLE = 1 << 16,
227
228 /*
229 * As vtime is used to calculate the cost of each IO, it needs to
230 * be fairly high precision. For example, it should be able to
231 * represent the cost of a single page worth of discard with
 232 * sufficient accuracy. At the same time, it should be able to
233 * represent reasonably long enough durations to be useful and
234 * convenient during operation.
235 *
236 * 1s worth of vtime is 2^37. This gives us both sub-nanosecond
237 * granularity and days of wrap-around time even at extreme vrates.
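 237 *
 237 * For reference, VTIME_PER_SEC / USEC_PER_SEC works out to roughly
 237 * 137 thousand vtime units per microsecond of device time.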
238 */
239 VTIME_PER_SEC_SHIFT = 37,
240 VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT,
241 VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC,
242
243 /* bound vrate adjustments within two orders of magnitude */
244 VRATE_MIN_PPM = 10000, /* 1% */
245 VRATE_MAX_PPM = 100000000, /* 10000% */
246
247 VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
248 VRATE_CLAMP_ADJ_PCT = 4,
249
250 /* if IOs end up waiting for requests, issue less */
251 RQ_WAIT_BUSY_PCT = 5,
252
 253 /* unbusy hysteresis */
254 UNBUSY_THR_PCT = 75,
255
 256 /* don't let cmds which take a very long time pin the lagging state for too long */
257 MAX_LAGGING_PERIODS = 10,
258
259 /*
260 * If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
261 * donate the surplus.
262 */
263 SURPLUS_SCALE_PCT = 125, /* * 125% */
264 SURPLUS_SCALE_ABS = HWEIGHT_WHOLE / 50, /* + 2% */
265 SURPLUS_MIN_ADJ_DELTA = HWEIGHT_WHOLE / 33, /* 3% */
266
267 /* switch iff the conditions are met for longer than this */
268 AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC,
269
270 /*
 271 * Count IO size in 4k pages. The 12bit shift helps keep the
 272 * size-proportional components of the cost calculation within a
 273 * similar number of digits as the per-IO cost components.
274 */
275 IOC_PAGE_SHIFT = 12,
276 IOC_PAGE_SIZE = 1 << IOC_PAGE_SHIFT,
277 IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT,
278
279 /* if apart further than 16M, consider randio for linear model */
280 LCOEF_RANDIO_PAGES = 4096,
281};
282
283enum ioc_running {
284 IOC_IDLE,
285 IOC_RUNNING,
286 IOC_STOP,
287};
288
289/* io.cost.qos controls including per-dev enable of the whole controller */
290enum {
291 QOS_ENABLE,
292 QOS_CTRL,
293 NR_QOS_CTRL_PARAMS,
294};
295
296/* io.cost.qos params */
297enum {
298 QOS_RPPM,
299 QOS_RLAT,
300 QOS_WPPM,
301 QOS_WLAT,
302 QOS_MIN,
303 QOS_MAX,
304 NR_QOS_PARAMS,
305};
306
307/* io.cost.model controls */
308enum {
309 COST_CTRL,
310 COST_MODEL,
311 NR_COST_CTRL_PARAMS,
312};
313
314/* builtin linear cost model coefficients */
315enum {
316 I_LCOEF_RBPS,
317 I_LCOEF_RSEQIOPS,
318 I_LCOEF_RRANDIOPS,
319 I_LCOEF_WBPS,
320 I_LCOEF_WSEQIOPS,
321 I_LCOEF_WRANDIOPS,
322 NR_I_LCOEFS,
323};
324
325enum {
326 LCOEF_RPAGE,
327 LCOEF_RSEQIO,
328 LCOEF_RRANDIO,
329 LCOEF_WPAGE,
330 LCOEF_WSEQIO,
331 LCOEF_WRANDIO,
332 NR_LCOEFS,
333};
334
335enum {
336 AUTOP_INVALID,
337 AUTOP_HDD,
338 AUTOP_SSD_QD1,
339 AUTOP_SSD_DFL,
340 AUTOP_SSD_FAST,
341};
342
343struct ioc_gq;
344
345struct ioc_params {
346 u32 qos[NR_QOS_PARAMS];
347 u64 i_lcoefs[NR_I_LCOEFS];
348 u64 lcoefs[NR_LCOEFS];
349 u32 too_fast_vrate_pct;
350 u32 too_slow_vrate_pct;
351};
352
353struct ioc_missed {
354 u32 nr_met;
355 u32 nr_missed;
356 u32 last_met;
357 u32 last_missed;
358};
359
360struct ioc_pcpu_stat {
361 struct ioc_missed missed[2];
362
363 u64 rq_wait_ns;
364 u64 last_rq_wait_ns;
365};
366
367/* per device */
368struct ioc {
369 struct rq_qos rqos;
370
371 bool enabled;
372
373 struct ioc_params params;
374 u32 period_us;
375 u32 margin_us;
376 u64 vrate_min;
377 u64 vrate_max;
378
379 spinlock_t lock;
380 struct timer_list timer;
381 struct list_head active_iocgs; /* active cgroups */
382 struct ioc_pcpu_stat __percpu *pcpu_stat;
383
384 enum ioc_running running;
385 atomic64_t vtime_rate;
386
387 seqcount_t period_seqcount;
388 u32 period_at; /* wallclock starttime */
389 u64 period_at_vtime; /* vtime starttime */
390
391 atomic64_t cur_period; /* inc'd each period */
392 int busy_level; /* saturation history */
393
394 u64 inuse_margin_vtime;
395 bool weights_updated;
396 atomic_t hweight_gen; /* for lazy hweights */
397
398 u64 autop_too_fast_at;
399 u64 autop_too_slow_at;
400 int autop_idx;
401 bool user_qos_params:1;
402 bool user_cost_model:1;
403};
404
405/* per device-cgroup pair */
406struct ioc_gq {
407 struct blkg_policy_data pd;
408 struct ioc *ioc;
409
410 /*
 411 * An iocg can get its weight from two sources - an explicit
412 * per-device-cgroup configuration or the default weight of the
413 * cgroup. `cfg_weight` is the explicit per-device-cgroup
 414 * configuration. `weight` is the effective weight considering
 415 * both sources.
416 *
417 * When an idle cgroup becomes active its `active` goes from 0 to
418 * `weight`. `inuse` is the surplus adjusted active weight.
419 * `active` and `inuse` are used to calculate `hweight_active` and
420 * `hweight_inuse`.
421 *
422 * `last_inuse` remembers `inuse` while an iocg is idle to persist
423 * surplus adjustments.
424 */
425 u32 cfg_weight;
426 u32 weight;
427 u32 active;
428 u32 inuse;
429 u32 last_inuse;
430
431 sector_t cursor; /* to detect randio */
432
433 /*
434 * `vtime` is this iocg's vtime cursor which progresses as IOs are
435 * issued. If lagging behind device vtime, the delta represents
 436 * the currently available IO budget. If running ahead, the
437 * overage.
438 *
439 * `vtime_done` is the same but progressed on completion rather
440 * than issue. The delta behind `vtime` represents the cost of
441 * currently in-flight IOs.
442 *
443 * `last_vtime` is used to remember `vtime` at the end of the last
444 * period to calculate utilization.
445 */
446 atomic64_t vtime;
447 atomic64_t done_vtime;
448 u64 last_vtime;
449
450 /*
451 * The period this iocg was last active in. Used for deactivation
452 * and invalidating `vtime`.
453 */
454 atomic64_t active_period;
455 struct list_head active_list;
456
457 /* see __propagate_active_weight() and current_hweight() for details */
458 u64 child_active_sum;
459 u64 child_inuse_sum;
460 int hweight_gen;
461 u32 hweight_active;
462 u32 hweight_inuse;
463 bool has_surplus;
464
465 struct wait_queue_head waitq;
466 struct hrtimer waitq_timer;
467 struct hrtimer delay_timer;
468
469 /* usage is recorded as fractions of HWEIGHT_WHOLE */
470 int usage_idx;
471 u32 usages[NR_USAGE_SLOTS];
472
473 /* this iocg's depth in the hierarchy and ancestors including self */
474 int level;
475 struct ioc_gq *ancestors[];
476};
477
478/* per cgroup */
479struct ioc_cgrp {
480 struct blkcg_policy_data cpd;
481 unsigned int dfl_weight;
482};
483
484struct ioc_now {
485 u64 now_ns;
486 u32 now;
487 u64 vnow;
488 u64 vrate;
489};
490
491struct iocg_wait {
492 struct wait_queue_entry wait;
493 struct bio *bio;
494 u64 abs_cost;
495 bool committed;
496};
497
498struct iocg_wake_ctx {
499 struct ioc_gq *iocg;
500 u32 hw_inuse;
501 s64 vbudget;
502};
503
504static const struct ioc_params autop[] = {
505 [AUTOP_HDD] = {
506 .qos = {
507 [QOS_RLAT] = 50000, /* 50ms */
508 [QOS_WLAT] = 50000,
509 [QOS_MIN] = VRATE_MIN_PPM,
510 [QOS_MAX] = VRATE_MAX_PPM,
511 },
512 .i_lcoefs = {
513 [I_LCOEF_RBPS] = 174019176,
514 [I_LCOEF_RSEQIOPS] = 41708,
515 [I_LCOEF_RRANDIOPS] = 370,
516 [I_LCOEF_WBPS] = 178075866,
517 [I_LCOEF_WSEQIOPS] = 42705,
518 [I_LCOEF_WRANDIOPS] = 378,
519 },
520 },
521 [AUTOP_SSD_QD1] = {
522 .qos = {
523 [QOS_RLAT] = 25000, /* 25ms */
524 [QOS_WLAT] = 25000,
525 [QOS_MIN] = VRATE_MIN_PPM,
526 [QOS_MAX] = VRATE_MAX_PPM,
527 },
528 .i_lcoefs = {
529 [I_LCOEF_RBPS] = 245855193,
530 [I_LCOEF_RSEQIOPS] = 61575,
531 [I_LCOEF_RRANDIOPS] = 6946,
532 [I_LCOEF_WBPS] = 141365009,
533 [I_LCOEF_WSEQIOPS] = 33716,
534 [I_LCOEF_WRANDIOPS] = 26796,
535 },
536 },
537 [AUTOP_SSD_DFL] = {
538 .qos = {
539 [QOS_RLAT] = 25000, /* 25ms */
540 [QOS_WLAT] = 25000,
541 [QOS_MIN] = VRATE_MIN_PPM,
542 [QOS_MAX] = VRATE_MAX_PPM,
543 },
544 .i_lcoefs = {
545 [I_LCOEF_RBPS] = 488636629,
546 [I_LCOEF_RSEQIOPS] = 8932,
547 [I_LCOEF_RRANDIOPS] = 8518,
548 [I_LCOEF_WBPS] = 427891549,
549 [I_LCOEF_WSEQIOPS] = 28755,
550 [I_LCOEF_WRANDIOPS] = 21940,
551 },
552 .too_fast_vrate_pct = 500,
553 },
554 [AUTOP_SSD_FAST] = {
555 .qos = {
556 [QOS_RLAT] = 5000, /* 5ms */
557 [QOS_WLAT] = 5000,
558 [QOS_MIN] = VRATE_MIN_PPM,
559 [QOS_MAX] = VRATE_MAX_PPM,
560 },
561 .i_lcoefs = {
562 [I_LCOEF_RBPS] = 3102524156LLU,
563 [I_LCOEF_RSEQIOPS] = 724816,
564 [I_LCOEF_RRANDIOPS] = 778122,
565 [I_LCOEF_WBPS] = 1742780862LLU,
566 [I_LCOEF_WSEQIOPS] = 425702,
567 [I_LCOEF_WRANDIOPS] = 443193,
568 },
569 .too_slow_vrate_pct = 10,
570 },
571};
572
573/*
574 * vrate adjust percentages indexed by ioc->busy_level. We adjust up on
575 * vtime credit shortage and down on device saturation.
576 */
577static u32 vrate_adj_pct[] =
578 { 0, 0, 0, 0,
579 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
580 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
581 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
582
583static struct blkcg_policy blkcg_policy_iocost;
584
585/* accessors and helpers */
586static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
587{
588 return container_of(rqos, struct ioc, rqos);
589}
590
591static struct ioc *q_to_ioc(struct request_queue *q)
592{
593 return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
594}
595
596static const char *q_name(struct request_queue *q)
597{
598 if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
599 return kobject_name(q->kobj.parent);
600 else
601 return "<unknown>";
602}
603
604static const char __maybe_unused *ioc_name(struct ioc *ioc)
605{
606 return q_name(ioc->rqos.q);
607}
608
609static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
610{
611 return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
612}
613
614static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
615{
616 return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
617}
618
619static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
620{
621 return pd_to_blkg(&iocg->pd);
622}
623
624static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
625{
626 return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
627 struct ioc_cgrp, cpd);
628}
629
630/*
631 * Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical
632 * weight, the more expensive each IO.
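 * For example, an iocg whose hw_inuse is half of HWEIGHT_WHOLE gets
 * charged twice the absolute cost for each IO.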
633 */
634static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
635{
636 return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse);
637}
638
639static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost)
640{
641 bio->bi_iocost_cost = cost;
642 atomic64_add(cost, &iocg->vtime);
643}
644
645#define CREATE_TRACE_POINTS
646#include <trace/events/iocost.h>
647
 648/* latency QoS params changed, update period_us and all the dependent params */
649static void ioc_refresh_period_us(struct ioc *ioc)
650{
651 u32 ppm, lat, multi, period_us;
652
653 lockdep_assert_held(&ioc->lock);
654
655 /* pick the higher latency target */
656 if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
657 ppm = ioc->params.qos[QOS_RPPM];
658 lat = ioc->params.qos[QOS_RLAT];
659 } else {
660 ppm = ioc->params.qos[QOS_WPPM];
661 lat = ioc->params.qos[QOS_WLAT];
662 }
663
664 /*
665 * We want the period to be long enough to contain a healthy number
666 * of IOs while short enough for granular control. Define it as a
667 * multiple of the latency target. Ideally, the multiplier should
668 * be scaled according to the percentile so that it would nominally
669 * contain a certain number of requests. Let's be simpler and
670 * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
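	 *
	 * For example, a 50th percentile target (ppm == 500000) yields
	 * multi = (1000000 - 500000) / 50000 = 10; combined with a 25ms
	 * latency target that's a 250ms period, before the MIN/MAX_PERIOD
	 * clamp below.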
671 */
672 if (ppm)
673 multi = max_t(u32, (MILLION - ppm) / 50000, 2);
674 else
675 multi = 2;
676 period_us = multi * lat;
677 period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
678
679 /* calculate dependent params */
680 ioc->period_us = period_us;
681 ioc->margin_us = period_us * MARGIN_PCT / 100;
682 ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
683 period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100);
684}
685
686static int ioc_autop_idx(struct ioc *ioc)
687{
688 int idx = ioc->autop_idx;
689 const struct ioc_params *p = &autop[idx];
690 u32 vrate_pct;
691 u64 now_ns;
692
693 /* rotational? */
694 if (!blk_queue_nonrot(ioc->rqos.q))
695 return AUTOP_HDD;
696
697 /* handle SATA SSDs w/ broken NCQ */
698 if (blk_queue_depth(ioc->rqos.q) == 1)
699 return AUTOP_SSD_QD1;
700
701 /* use one of the normal ssd sets */
702 if (idx < AUTOP_SSD_DFL)
703 return AUTOP_SSD_DFL;
704
705 /* if user is overriding anything, maintain what was there */
706 if (ioc->user_qos_params || ioc->user_cost_model)
707 return idx;
708
709 /* step up/down based on the vrate */
710 vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100,
711 VTIME_PER_USEC);
712 now_ns = ktime_get_ns();
713
714 if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
715 if (!ioc->autop_too_fast_at)
716 ioc->autop_too_fast_at = now_ns;
717 if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
718 return idx + 1;
719 } else {
720 ioc->autop_too_fast_at = 0;
721 }
722
723 if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
724 if (!ioc->autop_too_slow_at)
725 ioc->autop_too_slow_at = now_ns;
726 if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
727 return idx - 1;
728 } else {
729 ioc->autop_too_slow_at = 0;
730 }
731
732 return idx;
733}
734
735/*
 736 * Take the following as input
737 *
738 * @bps maximum sequential throughput
739 * @seqiops maximum sequential 4k iops
740 * @randiops maximum random 4k iops
741 *
742 * and calculate the linear model cost coefficients.
743 *
744 * *@page per-page cost 1s / (@bps / 4096)
745 * *@seqio base cost of a seq IO max((1s / @seqiops) - *@page, 0)
 746 *	*@randio	base cost of a rand IO	max((1s / @randiops) - *@page, 0)
747 */
748static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
749 u64 *page, u64 *seqio, u64 *randio)
750{
751 u64 v;
752
753 *page = *seqio = *randio = 0;
754
755 if (bps)
756 *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
757 DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
758
759 if (seqiops) {
760 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
761 if (v > *page)
762 *seqio = v - *page;
763 }
764
765 if (randiops) {
766 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
767 if (v > *page)
768 *randio = v - *page;
769 }
770}
771
772static void ioc_refresh_lcoefs(struct ioc *ioc)
773{
774 u64 *u = ioc->params.i_lcoefs;
775 u64 *c = ioc->params.lcoefs;
776
777 calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
778 &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
779 calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
780 &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
781}
782
783static bool ioc_refresh_params(struct ioc *ioc, bool force)
784{
785 const struct ioc_params *p;
786 int idx;
787
788 lockdep_assert_held(&ioc->lock);
789
790 idx = ioc_autop_idx(ioc);
791 p = &autop[idx];
792
793 if (idx == ioc->autop_idx && !force)
794 return false;
795
796 if (idx != ioc->autop_idx)
797 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
798
799 ioc->autop_idx = idx;
800 ioc->autop_too_fast_at = 0;
801 ioc->autop_too_slow_at = 0;
802
803 if (!ioc->user_qos_params)
804 memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
805 if (!ioc->user_cost_model)
806 memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
807
808 ioc_refresh_period_us(ioc);
809 ioc_refresh_lcoefs(ioc);
810
811 ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
812 VTIME_PER_USEC, MILLION);
813 ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
814 VTIME_PER_USEC, MILLION);
815
816 return true;
817}
818
819/* take a snapshot of the current [v]time and vrate */
820static void ioc_now(struct ioc *ioc, struct ioc_now *now)
821{
822 unsigned seq;
823
824 now->now_ns = ktime_get();
825 now->now = ktime_to_us(now->now_ns);
826 now->vrate = atomic64_read(&ioc->vtime_rate);
827
828 /*
829 * The current vtime is
830 *
831 * vtime at period start + (wallclock time since the start) * vrate
832 *
833 * As a consistent snapshot of `period_at_vtime` and `period_at` is
834 * needed, they're seqcount protected.
835 */
836 do {
837 seq = read_seqcount_begin(&ioc->period_seqcount);
838 now->vnow = ioc->period_at_vtime +
839 (now->now - ioc->period_at) * now->vrate;
840 } while (read_seqcount_retry(&ioc->period_seqcount, seq));
841}
842
843static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
844{
845 lockdep_assert_held(&ioc->lock);
846 WARN_ON_ONCE(ioc->running != IOC_RUNNING);
847
848 write_seqcount_begin(&ioc->period_seqcount);
849 ioc->period_at = now->now;
850 ioc->period_at_vtime = now->vnow;
851 write_seqcount_end(&ioc->period_seqcount);
852
853 ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
854 add_timer(&ioc->timer);
855}
856
857/*
858 * Update @iocg's `active` and `inuse` to @active and @inuse, update level
859 * weight sums and propagate upwards accordingly.
860 */
861static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
862{
863 struct ioc *ioc = iocg->ioc;
864 int lvl;
865
866 lockdep_assert_held(&ioc->lock);
867
868 inuse = min(active, inuse);
869
870 for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
871 struct ioc_gq *parent = iocg->ancestors[lvl];
872 struct ioc_gq *child = iocg->ancestors[lvl + 1];
873 u32 parent_active = 0, parent_inuse = 0;
874
875 /* update the level sums */
876 parent->child_active_sum += (s32)(active - child->active);
877 parent->child_inuse_sum += (s32)(inuse - child->inuse);
 878		/* apply the updates */
879 child->active = active;
880 child->inuse = inuse;
881
882 /*
 883		 * The delta between the inuse and active sums indicates how
 884		 * much weight is being given away. The parent's inuse and
 885		 * active should reflect the ratio.
886 */
887 if (parent->child_active_sum) {
888 parent_active = parent->weight;
889 parent_inuse = DIV64_U64_ROUND_UP(
890 parent_active * parent->child_inuse_sum,
891 parent->child_active_sum);
892 }
893
894 /* do we need to keep walking up? */
895 if (parent_active == parent->active &&
896 parent_inuse == parent->inuse)
897 break;
898
899 active = parent_active;
900 inuse = parent_inuse;
901 }
902
903 ioc->weights_updated = true;
904}
905
906static void commit_active_weights(struct ioc *ioc)
907{
908 lockdep_assert_held(&ioc->lock);
909
910 if (ioc->weights_updated) {
911 /* paired with rmb in current_hweight(), see there */
912 smp_wmb();
913 atomic_inc(&ioc->hweight_gen);
914 ioc->weights_updated = false;
915 }
916}
917
918static void propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
919{
920 __propagate_active_weight(iocg, active, inuse);
921 commit_active_weights(iocg->ioc);
922}
923
924static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
925{
926 struct ioc *ioc = iocg->ioc;
927 int lvl;
928 u32 hwa, hwi;
929 int ioc_gen;
930
931 /* hot path - if uptodate, use cached */
932 ioc_gen = atomic_read(&ioc->hweight_gen);
933 if (ioc_gen == iocg->hweight_gen)
934 goto out;
935
936 /*
937 * Paired with wmb in commit_active_weights(). If we saw the
938 * updated hweight_gen, all the weight updates from
939 * __propagate_active_weight() are visible too.
940 *
941 * We can race with weight updates during calculation and get it
942 * wrong. However, hweight_gen would have changed and a future
943 * reader will recalculate and we're guaranteed to discard the
944 * wrong result soon.
945 */
946 smp_rmb();
947
948 hwa = hwi = HWEIGHT_WHOLE;
949 for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
950 struct ioc_gq *parent = iocg->ancestors[lvl];
951 struct ioc_gq *child = iocg->ancestors[lvl + 1];
952 u32 active_sum = READ_ONCE(parent->child_active_sum);
953 u32 inuse_sum = READ_ONCE(parent->child_inuse_sum);
954 u32 active = READ_ONCE(child->active);
955 u32 inuse = READ_ONCE(child->inuse);
956
957 /* we can race with deactivations and either may read as zero */
958 if (!active_sum || !inuse_sum)
959 continue;
960
961 active_sum = max(active, active_sum);
962 hwa = hwa * active / active_sum; /* max 16bits * 10000 */
963
964 inuse_sum = max(inuse, inuse_sum);
965 hwi = hwi * inuse / inuse_sum; /* max 16bits * 10000 */
966 }
967
968 iocg->hweight_active = max_t(u32, hwa, 1);
969 iocg->hweight_inuse = max_t(u32, hwi, 1);
970 iocg->hweight_gen = ioc_gen;
971out:
972 if (hw_activep)
973 *hw_activep = iocg->hweight_active;
974 if (hw_inusep)
975 *hw_inusep = iocg->hweight_inuse;
976}
977
978static void weight_updated(struct ioc_gq *iocg)
979{
980 struct ioc *ioc = iocg->ioc;
981 struct blkcg_gq *blkg = iocg_to_blkg(iocg);
982 struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
983 u32 weight;
984
985 lockdep_assert_held(&ioc->lock);
986
987 weight = iocg->cfg_weight ?: iocc->dfl_weight;
988 if (weight != iocg->weight && iocg->active)
989 propagate_active_weight(iocg, weight,
990 DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight));
991 iocg->weight = weight;
992}
993
994static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
995{
996 struct ioc *ioc = iocg->ioc;
997 u64 last_period, cur_period, max_period_delta;
998 u64 vtime, vmargin, vmin;
999 int i;
1000
1001 /*
 1002	 * If we seem to be already active, just update the stamp to tell the
 1003	 * timer that we're still active. We don't mind occasional races.
1004 */
1005 if (!list_empty(&iocg->active_list)) {
1006 ioc_now(ioc, now);
1007 cur_period = atomic64_read(&ioc->cur_period);
1008 if (atomic64_read(&iocg->active_period) != cur_period)
1009 atomic64_set(&iocg->active_period, cur_period);
1010 return true;
1011 }
1012
1013 /* racy check on internal node IOs, treat as root level IOs */
1014 if (iocg->child_active_sum)
1015 return false;
1016
1017 spin_lock_irq(&ioc->lock);
1018
1019 ioc_now(ioc, now);
1020
1021 /* update period */
1022 cur_period = atomic64_read(&ioc->cur_period);
1023 last_period = atomic64_read(&iocg->active_period);
1024 atomic64_set(&iocg->active_period, cur_period);
1025
1026 /* already activated or breaking leaf-only constraint? */
1027 for (i = iocg->level; i > 0; i--)
 1028		if (!list_empty(&iocg->ancestors[i]->active_list))
1029 goto fail_unlock;
1030 if (iocg->child_active_sum)
1031 goto fail_unlock;
1032
1033 /*
1034 * vtime may wrap when vrate is raised substantially due to
1035 * underestimated IO costs. Look at the period and ignore its
1036 * vtime if the iocg has been idle for too long. Also, cap the
1037 * budget it can start with to the margin.
1038 */
1039 max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us);
1040 vtime = atomic64_read(&iocg->vtime);
1041 vmargin = ioc->margin_us * now->vrate;
1042 vmin = now->vnow - vmargin;
1043
1044 if (last_period + max_period_delta < cur_period ||
1045 time_before64(vtime, vmin)) {
1046 atomic64_add(vmin - vtime, &iocg->vtime);
1047 atomic64_add(vmin - vtime, &iocg->done_vtime);
1048 vtime = vmin;
1049 }
1050
1051 /*
1052 * Activate, propagate weight and start period timer if not
1053 * running. Reset hweight_gen to avoid accidental match from
1054 * wrapping.
1055 */
1056 iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1057 list_add(&iocg->active_list, &ioc->active_iocgs);
1058 propagate_active_weight(iocg, iocg->weight,
1059 iocg->last_inuse ?: iocg->weight);
1060
1061 TRACE_IOCG_PATH(iocg_activate, iocg, now,
1062 last_period, cur_period, vtime);
1063
1064 iocg->last_vtime = vtime;
1065
1066 if (ioc->running == IOC_IDLE) {
1067 ioc->running = IOC_RUNNING;
1068 ioc_start_period(ioc, now);
1069 }
1070
1071 spin_unlock_irq(&ioc->lock);
1072 return true;
1073
1074fail_unlock:
1075 spin_unlock_irq(&ioc->lock);
1076 return false;
1077}
1078
1079static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1080 int flags, void *key)
1081{
1082 struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1083 struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
1084 u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1085
1086 ctx->vbudget -= cost;
1087
1088 if (ctx->vbudget < 0)
1089 return -1;
1090
1091 iocg_commit_bio(ctx->iocg, wait->bio, cost);
1092
1093 /*
1094 * autoremove_wake_function() removes the wait entry only when it
1095 * actually changed the task state. We want the wait always
1096 * removed. Remove explicitly and use default_wake_function().
1097 */
1098 list_del_init(&wq_entry->entry);
1099 wait->committed = true;
1100
1101 default_wake_function(wq_entry, mode, flags, key);
1102 return 0;
1103}
1104
1105static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
1106{
1107 struct ioc *ioc = iocg->ioc;
1108 struct iocg_wake_ctx ctx = { .iocg = iocg };
1109 u64 margin_ns = (u64)(ioc->period_us *
1110 WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
1111 u64 vshortage, expires, oexpires;
1112
1113 lockdep_assert_held(&iocg->waitq.lock);
1114
1115 /*
1116 * Wake up the ones which are due and see how much vtime we'll need
1117 * for the next one.
1118 */
1119 current_hweight(iocg, NULL, &ctx.hw_inuse);
1120 ctx.vbudget = now->vnow - atomic64_read(&iocg->vtime);
1121 __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1122 if (!waitqueue_active(&iocg->waitq))
1123 return;
1124 if (WARN_ON_ONCE(ctx.vbudget >= 0))
1125 return;
1126
1127 /* determine next wakeup, add a quarter margin to guarantee chunking */
1128 vshortage = -ctx.vbudget;
1129 expires = now->now_ns +
1130 DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC;
1131 expires += margin_ns / 4;
1132
1133 /* if already active and close enough, don't bother */
1134 oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1135 if (hrtimer_is_queued(&iocg->waitq_timer) &&
1136 abs(oexpires - expires) <= margin_ns / 4)
1137 return;
1138
1139 hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
1140 margin_ns / 4, HRTIMER_MODE_ABS);
1141}
1142
1143static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1144{
1145 struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
1146 struct ioc_now now;
1147 unsigned long flags;
1148
1149 ioc_now(iocg->ioc, &now);
1150
1151 spin_lock_irqsave(&iocg->waitq.lock, flags);
1152 iocg_kick_waitq(iocg, &now);
1153 spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1154
1155 return HRTIMER_NORESTART;
1156}
1157
1158static void iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost)
1159{
1160 struct ioc *ioc = iocg->ioc;
1161 struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1162 u64 vtime = atomic64_read(&iocg->vtime);
1163 u64 vmargin = ioc->margin_us * now->vrate;
1164 u64 margin_ns = ioc->margin_us * NSEC_PER_USEC;
1165 u64 expires, oexpires;
1166
1167 /* clear or maintain depending on the overage */
1168 if (time_before_eq64(vtime, now->vnow)) {
1169 blkcg_clear_delay(blkg);
1170 return;
1171 }
1172 if (!atomic_read(&blkg->use_delay) &&
1173 time_before_eq64(vtime, now->vnow + vmargin))
1174 return;
1175
1176 /* use delay */
1177 if (cost) {
1178 u64 cost_ns = DIV64_U64_ROUND_UP(cost * NSEC_PER_USEC,
1179 now->vrate);
1180 blkcg_add_delay(blkg, now->now_ns, cost_ns);
1181 }
1182 blkcg_use_delay(blkg);
1183
1184 expires = now->now_ns + DIV64_U64_ROUND_UP(vtime - now->vnow,
1185 now->vrate) * NSEC_PER_USEC;
1186
1187 /* if already active and close enough, don't bother */
1188 oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
1189 if (hrtimer_is_queued(&iocg->delay_timer) &&
1190 abs(oexpires - expires) <= margin_ns / 4)
1191 return;
1192
1193 hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires),
1194 margin_ns / 4, HRTIMER_MODE_ABS);
1195}
1196
1197static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
1198{
1199 struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
1200 struct ioc_now now;
1201
1202 ioc_now(iocg->ioc, &now);
1203 iocg_kick_delay(iocg, &now, 0);
1204
1205 return HRTIMER_NORESTART;
1206}
1207
1208static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1209{
1210 u32 nr_met[2] = { };
1211 u32 nr_missed[2] = { };
1212 u64 rq_wait_ns = 0;
1213 int cpu, rw;
1214
1215 for_each_online_cpu(cpu) {
1216 struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1217 u64 this_rq_wait_ns;
1218
1219 for (rw = READ; rw <= WRITE; rw++) {
1220 u32 this_met = READ_ONCE(stat->missed[rw].nr_met);
1221 u32 this_missed = READ_ONCE(stat->missed[rw].nr_missed);
1222
1223 nr_met[rw] += this_met - stat->missed[rw].last_met;
1224 nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1225 stat->missed[rw].last_met = this_met;
1226 stat->missed[rw].last_missed = this_missed;
1227 }
1228
1229 this_rq_wait_ns = READ_ONCE(stat->rq_wait_ns);
1230 rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1231 stat->last_rq_wait_ns = this_rq_wait_ns;
1232 }
1233
1234 for (rw = READ; rw <= WRITE; rw++) {
1235 if (nr_met[rw] + nr_missed[rw])
1236 missed_ppm_ar[rw] =
1237 DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1238 nr_met[rw] + nr_missed[rw]);
1239 else
1240 missed_ppm_ar[rw] = 0;
1241 }
1242
1243 *rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1244 ioc->period_us * NSEC_PER_USEC);
1245}
1246
1247/* was iocg idle this period? */
1248static bool iocg_is_idle(struct ioc_gq *iocg)
1249{
1250 struct ioc *ioc = iocg->ioc;
1251
1252 /* did something get issued this period? */
1253 if (atomic64_read(&iocg->active_period) ==
1254 atomic64_read(&ioc->cur_period))
1255 return false;
1256
1257 /* is something in flight? */
1258 if (atomic64_read(&iocg->done_vtime) < atomic64_read(&iocg->vtime))
1259 return false;
1260
1261 return true;
1262}
1263
1264/* returns usage with margin added if surplus is large enough */
1265static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse)
1266{
1267 /* add margin */
1268 usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
1269 usage += SURPLUS_SCALE_ABS;
1270
1271 /* don't bother if the surplus is too small */
1272 if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse)
1273 return 0;
1274
1275 return usage;
1276}
1277
1278static void ioc_timer_fn(struct timer_list *timer)
1279{
1280 struct ioc *ioc = container_of(timer, struct ioc, timer);
1281 struct ioc_gq *iocg, *tiocg;
1282 struct ioc_now now;
1283 int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0;
1284 u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
1285 u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
1286 u32 missed_ppm[2], rq_wait_pct;
1287 u64 period_vtime;
1288 int i;
1289
1290 /* how were the latencies during the period? */
1291 ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
1292
1293 /* take care of active iocgs */
1294 spin_lock_irq(&ioc->lock);
1295
1296 ioc_now(ioc, &now);
1297
1298 period_vtime = now.vnow - ioc->period_at_vtime;
1299 if (WARN_ON_ONCE(!period_vtime)) {
1300 spin_unlock_irq(&ioc->lock);
1301 return;
1302 }
1303
1304 /*
1305 * Waiters determine the sleep durations based on the vrate they
1306 * saw at the time of sleep. If vrate has increased, some waiters
1307 * could be sleeping for too long. Wake up tardy waiters which
1308 * should have woken up in the last period and expire idle iocgs.
1309 */
1310 list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
1311 if (!waitqueue_active(&iocg->waitq) && !iocg_is_idle(iocg))
1312 continue;
1313
1314 spin_lock(&iocg->waitq.lock);
1315
1316 if (waitqueue_active(&iocg->waitq)) {
1317 /* might be oversleeping vtime / hweight changes, kick */
1318 iocg_kick_waitq(iocg, &now);
1319 iocg_kick_delay(iocg, &now, 0);
1320 } else if (iocg_is_idle(iocg)) {
1321 /* no waiter and idle, deactivate */
1322 iocg->last_inuse = iocg->inuse;
1323 __propagate_active_weight(iocg, 0, 0);
1324 list_del_init(&iocg->active_list);
1325 }
1326
1327 spin_unlock(&iocg->waitq.lock);
1328 }
1329 commit_active_weights(ioc);
1330
1331 /* calc usages and see whether some weights need to be moved around */
1332 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1333 u64 vdone, vtime, vusage, vmargin, vmin;
1334 u32 hw_active, hw_inuse, usage;
1335
1336 /*
1337 * Collect unused and wind vtime closer to vnow to prevent
1338 * iocgs from accumulating a large amount of budget.
1339 */
1340 vdone = atomic64_read(&iocg->done_vtime);
1341 vtime = atomic64_read(&iocg->vtime);
1342 current_hweight(iocg, &hw_active, &hw_inuse);
1343
1344 /*
1345 * Latency QoS detection doesn't account for IOs which are
1346 * in-flight for longer than a period. Detect them by
1347 * comparing vdone against period start. If lagging behind
1348 * IOs from past periods, don't increase vrate.
1349 */
1350 if (!atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
1351 time_after64(vtime, vdone) &&
1352 time_after64(vtime, now.vnow -
1353 MAX_LAGGING_PERIODS * period_vtime) &&
1354 time_before64(vdone, now.vnow - period_vtime))
1355 nr_lagging++;
1356
1357 if (waitqueue_active(&iocg->waitq))
1358 vusage = now.vnow - iocg->last_vtime;
1359 else if (time_before64(iocg->last_vtime, vtime))
1360 vusage = vtime - iocg->last_vtime;
1361 else
1362 vusage = 0;
1363
1364 iocg->last_vtime += vusage;
1365 /*
1366 * Factor in in-flight vtime into vusage to avoid
1367 * high-latency completions appearing as idle. This should
1368 * be done after the above ->last_time adjustment.
1369 */
1370 vusage = max(vusage, vtime - vdone);
1371
1372 /* calculate hweight based usage ratio and record */
1373 if (vusage) {
1374 usage = DIV64_U64_ROUND_UP(vusage * hw_inuse,
1375 period_vtime);
1376 iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
1377 iocg->usages[iocg->usage_idx] = usage;
1378 } else {
1379 usage = 0;
1380 }
1381
1382 /* see whether there's surplus vtime */
1383 vmargin = ioc->margin_us * now.vrate;
1384 vmin = now.vnow - vmargin;
1385
1386 iocg->has_surplus = false;
1387
1388 if (!waitqueue_active(&iocg->waitq) &&
1389 time_before64(vtime, vmin)) {
1390 u64 delta = vmin - vtime;
1391
1392 /* throw away surplus vtime */
1393 atomic64_add(delta, &iocg->vtime);
1394 atomic64_add(delta, &iocg->done_vtime);
1395 iocg->last_vtime += delta;
1396 /* if usage is sufficiently low, maybe it can donate */
1397 if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) {
1398 iocg->has_surplus = true;
1399 nr_surpluses++;
1400 }
1401 } else if (hw_inuse < hw_active) {
1402 u32 new_hwi, new_inuse;
1403
1404 /* was donating but might need to take back some */
1405 if (waitqueue_active(&iocg->waitq)) {
1406 new_hwi = hw_active;
1407 } else {
1408 new_hwi = max(hw_inuse,
1409 usage * SURPLUS_SCALE_PCT / 100 +
1410 SURPLUS_SCALE_ABS);
1411 }
1412
1413 new_inuse = div64_u64((u64)iocg->inuse * new_hwi,
1414 hw_inuse);
1415 new_inuse = clamp_t(u32, new_inuse, 1, iocg->active);
1416
1417 if (new_inuse > iocg->inuse) {
1418 TRACE_IOCG_PATH(inuse_takeback, iocg, &now,
1419 iocg->inuse, new_inuse,
1420 hw_inuse, new_hwi);
1421 __propagate_active_weight(iocg, iocg->weight,
1422 new_inuse);
1423 }
1424 } else {
 1425			/* genuinely out of vtime */
1426 nr_shortages++;
1427 }
1428 }
1429
1430 if (!nr_shortages || !nr_surpluses)
1431 goto skip_surplus_transfers;
1432
1433 /* there are both shortages and surpluses, transfer surpluses */
1434 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1435 u32 usage, hw_active, hw_inuse, new_hwi, new_inuse;
1436 int nr_valid = 0;
1437
1438 if (!iocg->has_surplus)
1439 continue;
1440
1441 /* base the decision on max historical usage */
1442 for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) {
1443 if (iocg->usages[i]) {
1444 usage = max(usage, iocg->usages[i]);
1445 nr_valid++;
1446 }
1447 }
1448 if (nr_valid < MIN_VALID_USAGES)
1449 continue;
1450
1451 current_hweight(iocg, &hw_active, &hw_inuse);
1452 new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse);
1453 if (!new_hwi)
1454 continue;
1455
1456 new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi,
1457 hw_inuse);
1458 if (new_inuse < iocg->inuse) {
1459 TRACE_IOCG_PATH(inuse_giveaway, iocg, &now,
1460 iocg->inuse, new_inuse,
1461 hw_inuse, new_hwi);
1462 __propagate_active_weight(iocg, iocg->weight, new_inuse);
1463 }
1464 }
1465skip_surplus_transfers:
1466 commit_active_weights(ioc);
1467
1468 /*
1469 * If q is getting clogged or we're missing too much, we're issuing
1470 * too much IO and should lower vtime rate. If we're not missing
1471 * and experiencing shortages but not surpluses, we're too stingy
1472 * and should increase vtime rate.
1473 */
1474 if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
1475 missed_ppm[READ] > ppm_rthr ||
1476 missed_ppm[WRITE] > ppm_wthr) {
1477 ioc->busy_level = max(ioc->busy_level, 0);
1478 ioc->busy_level++;
1479 } else if (nr_lagging) {
1480 ioc->busy_level = max(ioc->busy_level, 0);
1481 } else if (nr_shortages && !nr_surpluses &&
1482 rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
1483 missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
1484 missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
1485 ioc->busy_level = min(ioc->busy_level, 0);
1486 ioc->busy_level--;
1487 } else {
1488 ioc->busy_level = 0;
1489 }
1490
1491 ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
1492
1493 if (ioc->busy_level) {
1494 u64 vrate = atomic64_read(&ioc->vtime_rate);
1495 u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
1496
1497 /* rq_wait signal is always reliable, ignore user vrate_min */
1498 if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
1499 vrate_min = VRATE_MIN;
1500
1501 /*
1502 * If vrate is out of bounds, apply clamp gradually as the
1503 * bounds can change abruptly. Otherwise, apply busy_level
1504 * based adjustment.
1505 */
1506 if (vrate < vrate_min) {
1507 vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
1508 100);
1509 vrate = min(vrate, vrate_min);
1510 } else if (vrate > vrate_max) {
1511 vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
1512 100);
1513 vrate = max(vrate, vrate_max);
1514 } else {
1515 int idx = min_t(int, abs(ioc->busy_level),
1516 ARRAY_SIZE(vrate_adj_pct) - 1);
1517 u32 adj_pct = vrate_adj_pct[idx];
1518
1519 if (ioc->busy_level > 0)
1520 adj_pct = 100 - adj_pct;
1521 else
1522 adj_pct = 100 + adj_pct;
1523
1524 vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
1525 vrate_min, vrate_max);
1526 }
1527
1528 trace_iocost_ioc_vrate_adj(ioc, vrate, &missed_ppm, rq_wait_pct,
1529 nr_lagging, nr_shortages,
1530 nr_surpluses);
1531
1532 atomic64_set(&ioc->vtime_rate, vrate);
1533 ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
1534 ioc->period_us * vrate * INUSE_MARGIN_PCT, 100);
1535 }
1536
1537 ioc_refresh_params(ioc, false);
1538
1539 /*
1540 * This period is done. Move onto the next one. If nothing's
1541 * going on with the device, stop the timer.
1542 */
1543 atomic64_inc(&ioc->cur_period);
1544
1545 if (ioc->running != IOC_STOP) {
1546 if (!list_empty(&ioc->active_iocgs)) {
1547 ioc_start_period(ioc, &now);
1548 } else {
1549 ioc->busy_level = 0;
1550 ioc->running = IOC_IDLE;
1551 }
1552 }
1553
1554 spin_unlock_irq(&ioc->lock);
1555}
1556
1557static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
1558 bool is_merge, u64 *costp)
1559{
1560 struct ioc *ioc = iocg->ioc;
1561 u64 coef_seqio, coef_randio, coef_page;
1562 u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
1563 u64 seek_pages = 0;
1564 u64 cost = 0;
1565
1566 switch (bio_op(bio)) {
1567 case REQ_OP_READ:
1568 coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO];
1569 coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO];
1570 coef_page = ioc->params.lcoefs[LCOEF_RPAGE];
1571 break;
1572 case REQ_OP_WRITE:
1573 coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO];
1574 coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO];
1575 coef_page = ioc->params.lcoefs[LCOEF_WPAGE];
1576 break;
1577 default:
1578 goto out;
1579 }
1580
1581 if (iocg->cursor) {
1582 seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
1583 seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
1584 }
1585
1586 if (!is_merge) {
1587 if (seek_pages > LCOEF_RANDIO_PAGES) {
1588 cost += coef_randio;
1589 } else {
1590 cost += coef_seqio;
1591 }
1592 }
1593 cost += pages * coef_page;
1594out:
1595 *costp = cost;
1596}
1597
1598static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
1599{
1600 u64 cost;
1601
1602 calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
1603 return cost;
1604}
1605
1606static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
1607{
1608 struct blkcg_gq *blkg = bio->bi_blkg;
1609 struct ioc *ioc = rqos_to_ioc(rqos);
1610 struct ioc_gq *iocg = blkg_to_iocg(blkg);
1611 struct ioc_now now;
1612 struct iocg_wait wait;
1613 u32 hw_active, hw_inuse;
1614 u64 abs_cost, cost, vtime;
1615
1616 /* bypass IOs if disabled or for root cgroup */
1617 if (!ioc->enabled || !iocg->level)
1618 return;
1619
1620 /* always activate so that even 0 cost IOs get protected to some level */
1621 if (!iocg_activate(iocg, &now))
1622 return;
1623
1624 /* calculate the absolute vtime cost */
1625 abs_cost = calc_vtime_cost(bio, iocg, false);
1626 if (!abs_cost)
1627 return;
1628
1629 iocg->cursor = bio_end_sector(bio);
1630
1631 vtime = atomic64_read(&iocg->vtime);
1632 current_hweight(iocg, &hw_active, &hw_inuse);
1633
1634 if (hw_inuse < hw_active &&
1635 time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) {
1636 TRACE_IOCG_PATH(inuse_reset, iocg, &now,
1637 iocg->inuse, iocg->weight, hw_inuse, hw_active);
1638 spin_lock_irq(&ioc->lock);
1639 propagate_active_weight(iocg, iocg->weight, iocg->weight);
1640 spin_unlock_irq(&ioc->lock);
1641 current_hweight(iocg, &hw_active, &hw_inuse);
1642 }
1643
1644 cost = abs_cost_to_cost(abs_cost, hw_inuse);
1645
1646 /*
1647 * If no one's waiting and within budget, issue right away. The
1648 * tests are racy but the races aren't systemic - we only miss once
1649 * in a while which is fine.
1650 */
1651 if (!waitqueue_active(&iocg->waitq) &&
1652 time_before_eq64(vtime + cost, now.vnow)) {
1653 iocg_commit_bio(iocg, bio, cost);
1654 return;
1655 }
1656
1657 if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
1658 iocg_commit_bio(iocg, bio, cost);
1659 iocg_kick_delay(iocg, &now, cost);
1660 return;
1661 }
1662
1663 /*
1664 * Append self to the waitq and schedule the wakeup timer if we're
1665 * the first waiter. The timer duration is calculated based on the
1666 * current vrate. vtime and hweight changes can make it too short
1667 * or too long. Each wait entry records the absolute cost it's
1668 * waiting for to allow re-evaluation using a custom wait entry.
1669 *
1670 * If too short, the timer simply reschedules itself. If too long,
1671 * the period timer will notice and trigger wakeups.
1672 *
1673 * All waiters are on iocg->waitq and the wait states are
1674 * synchronized using waitq.lock.
1675 */
1676 spin_lock_irq(&iocg->waitq.lock);
1677
1678 /*
1679 * We activated above but w/o any synchronization. Deactivation is
1680 * synchronized with waitq.lock and we won't get deactivated as
1681 * long as we're waiting, so we're good if we're activated here.
1682 * In the unlikely case that we are deactivated, just issue the IO.
1683 */
1684 if (unlikely(list_empty(&iocg->active_list))) {
1685 spin_unlock_irq(&iocg->waitq.lock);
1686 iocg_commit_bio(iocg, bio, cost);
1687 return;
1688 }
1689
1690 init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
1691 wait.wait.private = current;
1692 wait.bio = bio;
1693 wait.abs_cost = abs_cost;
1694 wait.committed = false; /* will be set true by waker */
1695
1696 __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
1697 iocg_kick_waitq(iocg, &now);
1698
1699 spin_unlock_irq(&iocg->waitq.lock);
1700
1701 while (true) {
1702 set_current_state(TASK_UNINTERRUPTIBLE);
1703 if (wait.committed)
1704 break;
1705 io_schedule();
1706 }
1707
1708 /* waker already committed us, proceed */
1709 finish_wait(&iocg->waitq, &wait.wait);
1710}
1711
1712static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
1713 struct bio *bio)
1714{
1715 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1716 sector_t bio_end = bio_end_sector(bio);
1717 u32 hw_inuse;
1718 u64 abs_cost, cost;
1719
1720 /* add iff the existing request has cost assigned */
1721 if (!rq->bio || !rq->bio->bi_iocost_cost)
1722 return;
1723
1724 abs_cost = calc_vtime_cost(bio, iocg, true);
1725 if (!abs_cost)
1726 return;
1727
1728 /* update cursor if backmerging into the request at the cursor */
1729 if (blk_rq_pos(rq) < bio_end &&
1730 blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
1731 iocg->cursor = bio_end;
1732
1733 current_hweight(iocg, NULL, &hw_inuse);
1734 cost = div64_u64(abs_cost * HWEIGHT_WHOLE, hw_inuse);
1735 bio->bi_iocost_cost = cost;
1736
1737 atomic64_add(cost, &iocg->vtime);
1738}
1739
1740static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
1741{
1742 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1743
1744 if (iocg && bio->bi_iocost_cost)
1745 atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
1746}
1747
1748static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
1749{
1750 struct ioc *ioc = rqos_to_ioc(rqos);
1751 u64 on_q_ns, rq_wait_ns;
1752 int pidx, rw;
1753
1754 if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
1755 return;
1756
1757 switch (req_op(rq) & REQ_OP_MASK) {
1758 case REQ_OP_READ:
1759 pidx = QOS_RLAT;
1760 rw = READ;
1761 break;
1762 case REQ_OP_WRITE:
1763 pidx = QOS_WLAT;
1764 rw = WRITE;
1765 break;
1766 default:
1767 return;
1768 }
1769
1770 on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
1771 rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
1772
1773 if (on_q_ns <= ioc->params.qos[pidx] * NSEC_PER_USEC)
1774 this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met);
1775 else
1776 this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed);
1777
1778 this_cpu_add(ioc->pcpu_stat->rq_wait_ns, rq_wait_ns);
1779}
1780
1781static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
1782{
1783 struct ioc *ioc = rqos_to_ioc(rqos);
1784
1785 spin_lock_irq(&ioc->lock);
1786 ioc_refresh_params(ioc, false);
1787 spin_unlock_irq(&ioc->lock);
1788}
1789
1790static void ioc_rqos_exit(struct rq_qos *rqos)
1791{
1792 struct ioc *ioc = rqos_to_ioc(rqos);
1793
1794 blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
1795
1796 spin_lock_irq(&ioc->lock);
1797 ioc->running = IOC_STOP;
1798 spin_unlock_irq(&ioc->lock);
1799
1800 del_timer_sync(&ioc->timer);
1801 free_percpu(ioc->pcpu_stat);
1802 kfree(ioc);
1803}
1804
1805static struct rq_qos_ops ioc_rqos_ops = {
1806 .throttle = ioc_rqos_throttle,
1807 .merge = ioc_rqos_merge,
1808 .done_bio = ioc_rqos_done_bio,
1809 .done = ioc_rqos_done,
1810 .queue_depth_changed = ioc_rqos_queue_depth_changed,
1811 .exit = ioc_rqos_exit,
1812};
1813
1814static int blk_iocost_init(struct request_queue *q)
1815{
1816 struct ioc *ioc;
1817 struct rq_qos *rqos;
1818 int ret;
1819
1820 ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
1821 if (!ioc)
1822 return -ENOMEM;
1823
1824 ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
1825 if (!ioc->pcpu_stat) {
1826 kfree(ioc);
1827 return -ENOMEM;
1828 }
1829
1830 rqos = &ioc->rqos;
1831 rqos->id = RQ_QOS_COST;
1832 rqos->ops = &ioc_rqos_ops;
1833 rqos->q = q;
1834
1835 spin_lock_init(&ioc->lock);
1836 timer_setup(&ioc->timer, ioc_timer_fn, 0);
1837 INIT_LIST_HEAD(&ioc->active_iocgs);
1838
1839 ioc->running = IOC_IDLE;
1840 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
1841 seqcount_init(&ioc->period_seqcount);
1842 ioc->period_at = ktime_to_us(ktime_get());
1843 atomic64_set(&ioc->cur_period, 0);
1844 atomic_set(&ioc->hweight_gen, 0);
1845
1846 spin_lock_irq(&ioc->lock);
1847 ioc->autop_idx = AUTOP_INVALID;
1848 ioc_refresh_params(ioc, true);
1849 spin_unlock_irq(&ioc->lock);
1850
1851 rq_qos_add(q, rqos);
1852 ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
1853 if (ret) {
1854 rq_qos_del(q, rqos);
 /* don't leak the percpu stats allocated above */
 free_percpu(ioc->pcpu_stat);
1855 kfree(ioc);
1856 return ret;
1857 }
1858 return 0;
1859}
1860
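/*
 * Per-blkcg (cgroup-wide) data. It only carries the default weight,
 * which applies on every device the cgroup issues IO against unless a
 * per-device weight has been configured.
 */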
1861static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
1862{
1863 struct ioc_cgrp *iocc;
1864
1865 iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
 if (!iocc)
 return NULL;
1866 iocc->dfl_weight = CGROUP_WEIGHT_DFL;
1867
1868 return &iocc->cpd;
1869}
1870
1871static void ioc_cpd_free(struct blkcg_policy_data *cpd)
1872{
1873 kfree(container_of(cpd, struct ioc_cgrp, cpd));
1874}
1875
1876static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
1877 struct blkcg *blkcg)
1878{
1879 int levels = blkcg->css.cgroup->level + 1;
1880 struct ioc_gq *iocg;
1881
1882 iocg = kzalloc_node(sizeof(*iocg) + levels * sizeof(iocg->ancestors[0]),
1883 gfp, q->node);
1884 if (!iocg)
1885 return NULL;
1886
1887 return &iocg->pd;
1888}
1889
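/*
 * Per-cgroup-per-device state. A fresh iocg starts with its vtime
 * synced to the device's current vnow, whole hierarchical weights, and
 * a table of its ancestors so that hweight propagation can walk the
 * path to the root directly.
 */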
1890static void ioc_pd_init(struct blkg_policy_data *pd)
1891{
1892 struct ioc_gq *iocg = pd_to_iocg(pd);
1893 struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
1894 struct ioc *ioc = q_to_ioc(blkg->q);
1895 struct ioc_now now;
1896 struct blkcg_gq *tblkg;
1897 unsigned long flags;
1898
1899 ioc_now(ioc, &now);
1900
1901 iocg->ioc = ioc;
1902 atomic64_set(&iocg->vtime, now.vnow);
1903 atomic64_set(&iocg->done_vtime, now.vnow);
1904 atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
1905 INIT_LIST_HEAD(&iocg->active_list);
1906 iocg->hweight_active = HWEIGHT_WHOLE;
1907 iocg->hweight_inuse = HWEIGHT_WHOLE;
1908
1909 init_waitqueue_head(&iocg->waitq);
1910 hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1911 iocg->waitq_timer.function = iocg_waitq_timer_fn;
1912 hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1913 iocg->delay_timer.function = iocg_delay_timer_fn;
1914
1915 iocg->level = blkg->blkcg->css.cgroup->level;
1916
1917 for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
1918 struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
1919 iocg->ancestors[tiocg->level] = tiocg;
1920 }
1921
1922 spin_lock_irqsave(&ioc->lock, flags);
1923 weight_updated(iocg);
1924 spin_unlock_irqrestore(&ioc->lock, flags);
1925}
1926
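/*
 * An iocg may be destroyed while still on the active list (e.g. cgroup
 * removal shortly after IO activity), so take it off and return its
 * weight to the hierarchy before freeing.
 */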
1927static void ioc_pd_free(struct blkg_policy_data *pd)
1928{
1929 struct ioc_gq *iocg = pd_to_iocg(pd);
1930 struct ioc *ioc = iocg->ioc;
 unsigned long flags;
1931
1932 if (ioc) {
1933 hrtimer_cancel(&iocg->waitq_timer);
1934 hrtimer_cancel(&iocg->delay_timer);
1935
 /*
 * ioc->lock is taken with IRQs disabled everywhere else (e.g.
 * from the period timer), so do the same here; otherwise the
 * timer could deadlock against this path.
 */
1936 spin_lock_irqsave(&ioc->lock, flags);
1937 if (!list_empty(&iocg->active_list)) {
1938 propagate_active_weight(iocg, 0, 0);
1939 list_del_init(&iocg->active_list);
1940 }
1941 spin_unlock_irqrestore(&ioc->lock, flags);
1942 }
1943 kfree(iocg);
1944}
1945
1946static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
1947 int off)
1948{
1949 const char *dname = blkg_dev_name(pd->blkg);
1950 struct ioc_gq *iocg = pd_to_iocg(pd);
1951
1952 if (dname && iocg->cfg_weight)
1953 seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight);
1954 return 0;
1955}
1956
1958static int ioc_weight_show(struct seq_file *sf, void *v)
1959{
1960 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
1961 struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
1962
1963 seq_printf(sf, "default %u\n", iocc->dfl_weight);
1964 blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
1965 &blkcg_policy_iocost, seq_cft(sf)->private, false);
1966 return 0;
1967}
1968
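/*
 * io.weight accepts either "default WEIGHT" (or a bare "WEIGHT"), which
 * updates the cgroup-wide default, or "MAJ:MIN WEIGHT" for a per-device
 * override; "MAJ:MIN default" clears the override. Weights must be in
 * the CGROUP_WEIGHT_MIN..CGROUP_WEIGHT_MAX range. For example (device
 * numbers are illustrative):
 *
 *   echo 200 > io.weight
 *   echo "8:16 50" > io.weight
 */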
1969static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
1970 size_t nbytes, loff_t off)
1971{
1972 struct blkcg *blkcg = css_to_blkcg(of_css(of));
1973 struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
1974 struct blkg_conf_ctx ctx;
1975 struct ioc_gq *iocg;
1976 u32 v;
1977 int ret;
1978
1979 if (!strchr(buf, ':')) {
1980 struct blkcg_gq *blkg;
1981
1982 if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
1983 return -EINVAL;
1984
1985 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
1986 return -EINVAL;
1987
 /*
 * Keep IRQs disabled across the whole walk. ioc->lock nests
 * inside blkcg->lock here, so it must not use the _irq lock
 * variants which would unconditionally re-enable interrupts
 * on unlock.
 */
1988 spin_lock_irq(&blkcg->lock);
1989 iocc->dfl_weight = v;
1990 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
1991 struct ioc_gq *iocg = blkg_to_iocg(blkg);
1992
1993 if (iocg) {
1994 spin_lock(&iocg->ioc->lock);
1995 weight_updated(iocg);
1996 spin_unlock(&iocg->ioc->lock);
1997 }
1998 }
1999 spin_unlock_irq(&blkcg->lock);
2000
2001 return nbytes;
2002 }
2003
2004 ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
2005 if (ret)
2006 return ret;
2007
2008 iocg = blkg_to_iocg(ctx.blkg);
2009
2010 if (!strncmp(ctx.body, "default", 7)) {
2011 v = 0;
2012 } else {
2013 if (!sscanf(ctx.body, "%u", &v))
2014 goto einval;
2015 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2016 goto einval;
2017 }
2018
2019 spin_lock_irq(&iocg->ioc->lock);
2020 iocg->cfg_weight = v;
2021 weight_updated(iocg);
2022 spin_unlock_irq(&iocg->ioc->lock);
2023
2024 blkg_conf_finish(&ctx);
2025 return nbytes;
2026
2027einval:
2028 blkg_conf_finish(&ctx);
2029 return -EINVAL;
2030}
2031
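/*
 * The percentage QoS parameters (rpct, wpct, min, max) are stored in
 * parts per million: val / 10000 is the integer percent and
 * val % 10000 / 100 the two decimals printed below.
 */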
2032static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2033 int off)
2034{
2035 const char *dname = blkg_dev_name(pd->blkg);
2036 struct ioc *ioc = pd_to_iocg(pd)->ioc;
2037
2038 if (!dname)
2039 return 0;
2040
2041 seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
2042 dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
2043 ioc->params.qos[QOS_RPPM] / 10000,
2044 ioc->params.qos[QOS_RPPM] % 10000 / 100,
2045 ioc->params.qos[QOS_RLAT],
2046 ioc->params.qos[QOS_WPPM] / 10000,
2047 ioc->params.qos[QOS_WPPM] % 10000 / 100,
2048 ioc->params.qos[QOS_WLAT],
2049 ioc->params.qos[QOS_MIN] / 10000,
2050 ioc->params.qos[QOS_MIN] % 10000 / 100,
2051 ioc->params.qos[QOS_MAX] / 10000,
2052 ioc->params.qos[QOS_MAX] % 10000 / 100);
2053 return 0;
2054}
2055
2056static int ioc_qos_show(struct seq_file *sf, void *v)
2057{
2058 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2059
2060 blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
2061 &blkcg_policy_iocost, seq_cft(sf)->private, false);
2062 return 0;
2063}
2064
2065static const match_table_t qos_ctrl_tokens = {
2066 { QOS_ENABLE, "enable=%u" },
2067 { QOS_CTRL, "ctrl=%s" },
2068 { NR_QOS_CTRL_PARAMS, NULL },
2069};
2070
2071static const match_table_t qos_tokens = {
2072 { QOS_RPPM, "rpct=%s" },
2073 { QOS_RLAT, "rlat=%u" },
2074 { QOS_WPPM, "wpct=%s" },
2075 { QOS_WLAT, "wlat=%u" },
2076 { QOS_MIN, "min=%s" },
2077 { QOS_MAX, "max=%s" },
2078 { NR_QOS_PARAMS, NULL },
2079};
2080
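/*
 * io.cost.qos is written on the root cgroup as "MAJ:MIN key=value..."
 * pairs. enable turns the controller on/off, ctrl picks user-supplied
 * vs auto-selected QoS parameters, rpct/rlat (and wpct/wlat) express
 * the target "rpct% of reads complete within rlat microseconds", and
 * min/max bound how far the device vtime rate may be scaled, in
 * percent. An illustrative example (device number made up):
 *
 *   echo "8:16 enable=1 ctrl=user rpct=95.00 rlat=5000" > io.cost.qos
 */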
2081static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
2082 size_t nbytes, loff_t off)
2083{
2084 struct gendisk *disk;
2085 struct ioc *ioc;
2086 u32 qos[NR_QOS_PARAMS];
2087 bool enable, user;
2088 char *p;
2089 int ret;
2090
2091 disk = blkcg_conf_get_disk(&input);
2092 if (IS_ERR(disk))
2093 return PTR_ERR(disk);
2094
2095 ioc = q_to_ioc(disk->queue);
2096 if (!ioc) {
2097 ret = blk_iocost_init(disk->queue);
2098 if (ret)
2099 goto err;
2100 ioc = q_to_ioc(disk->queue);
2101 }
2102
2103 spin_lock_irq(&ioc->lock);
2104 memcpy(qos, ioc->params.qos, sizeof(qos));
2105 enable = ioc->enabled;
2106 user = ioc->user_qos_params;
2107 spin_unlock_irq(&ioc->lock);
2108
2109 while ((p = strsep(&input, " \t\n"))) {
2110 substring_t args[MAX_OPT_ARGS];
2111 char buf[32];
2112 int tok;
2113 s64 v;
2114
2115 if (!*p)
2116 continue;
2117
2118 switch (match_token(p, qos_ctrl_tokens, args)) {
2119 case QOS_ENABLE:
2120 if (match_u64(&args[0], &v))
 goto einval;
2121 enable = v;
2122 continue;
2123 case QOS_CTRL:
2124 match_strlcpy(buf, &args[0], sizeof(buf));
2125 if (!strcmp(buf, "auto"))
2126 user = false;
2127 else if (!strcmp(buf, "user"))
2128 user = true;
2129 else
2130 goto einval;
2131 continue;
2132 }
2133
2134 tok = match_token(p, qos_tokens, args);
2135 switch (tok) {
2136 case QOS_RPPM:
2137 case QOS_WPPM:
2138 if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2139 sizeof(buf))
2140 goto einval;
2141 if (cgroup_parse_float(buf, 2, &v))
2142 goto einval;
2143 if (v < 0 || v > 10000)
2144 goto einval;
2145 qos[tok] = v * 100;
2146 break;
2147 case QOS_RLAT:
2148 case QOS_WLAT:
2149 if (match_u64(&args[0], &v))
2150 goto einval;
2151 qos[tok] = v;
2152 break;
2153 case QOS_MIN:
2154 case QOS_MAX:
2155 if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2156 sizeof(buf))
2157 goto einval;
2158 if (cgroup_parse_float(buf, 2, &v))
2159 goto einval;
2160 if (v < 0)
2161 goto einval;
2162 qos[tok] = clamp_t(s64, v * 100,
2163 VRATE_MIN_PPM, VRATE_MAX_PPM);
2164 break;
2165 default:
2166 goto einval;
2167 }
2168 user = true;
2169 }
2170
2171 if (qos[QOS_MIN] > qos[QOS_MAX])
2172 goto einval;
2173
2174 spin_lock_irq(&ioc->lock);
2175
2176 if (enable) {
2177 blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2178 ioc->enabled = true;
2179 } else {
2180 blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2181 ioc->enabled = false;
2182 }
2183
2184 if (user) {
2185 memcpy(ioc->params.qos, qos, sizeof(qos));
2186 ioc->user_qos_params = true;
2187 } else {
2188 ioc->user_qos_params = false;
2189 }
2190
2191 ioc_refresh_params(ioc, true);
2192 spin_unlock_irq(&ioc->lock);
2193
2194 put_disk_and_module(disk);
2195 return nbytes;
2196einval:
2197 ret = -EINVAL;
2198err:
2199 put_disk_and_module(disk);
2200 return ret;
2201}
2202
2203static u64 ioc_cost_model_prfill(struct seq_file *sf,
2204 struct blkg_policy_data *pd, int off)
2205{
2206 const char *dname = blkg_dev_name(pd->blkg);
2207 struct ioc *ioc = pd_to_iocg(pd)->ioc;
2208 u64 *u = ioc->params.i_lcoefs;
2209
2210 if (!dname)
2211 return 0;
2212
2213 seq_printf(sf, "%s ctrl=%s model=linear "
2214 "rbps=%llu rseqiops=%llu rrandiops=%llu "
2215 "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
2216 dname, ioc->user_cost_model ? "user" : "auto",
2217 u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
2218 u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
2219 return 0;
2220}
2221
2222static int ioc_cost_model_show(struct seq_file *sf, void *v)
2223{
2224 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2225
2226 blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
2227 &blkcg_policy_iocost, seq_cft(sf)->private, false);
2228 return 0;
2229}
2230
2231static const match_table_t cost_ctrl_tokens = {
2232 { COST_CTRL, "ctrl=%s" },
2233 { COST_MODEL, "model=%s" },
2234 { NR_COST_CTRL_PARAMS, NULL },
2235};
2236
2237static const match_table_t i_lcoef_tokens = {
2238 { I_LCOEF_RBPS, "rbps=%u" },
2239 { I_LCOEF_RSEQIOPS, "rseqiops=%u" },
2240 { I_LCOEF_RRANDIOPS, "rrandiops=%u" },
2241 { I_LCOEF_WBPS, "wbps=%u" },
2242 { I_LCOEF_WSEQIOPS, "wseqiops=%u" },
2243 { I_LCOEF_WRANDIOPS, "wrandiops=%u" },
2244 { NR_I_LCOEFS, NULL },
2245};
2246
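/*
 * io.cost.model takes the linear model coefficients directly: peak
 * read/write bandwidth in bytes/sec and sequential/random IOPS.
 * ctrl=user pins the values, ctrl=auto reverts to the builtin
 * per-device-class defaults. Example with made-up numbers:
 *
 *   echo "8:16 rbps=2000000000 rseqiops=200000 rrandiops=150000" \
 *       > io.cost.model
 */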
2247static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
2248 size_t nbytes, loff_t off)
2249{
2250 struct gendisk *disk;
2251 struct ioc *ioc;
2252 u64 u[NR_I_LCOEFS];
2253 bool user;
2254 char *p;
2255 int ret;
2256
2257 disk = blkcg_conf_get_disk(&input);
2258 if (IS_ERR(disk))
2259 return PTR_ERR(disk);
2260
2261 ioc = q_to_ioc(disk->queue);
2262 if (!ioc) {
2263 ret = blk_iocost_init(disk->queue);
2264 if (ret)
2265 goto err;
2266 ioc = q_to_ioc(disk->queue);
2267 }
2268
2269 spin_lock_irq(&ioc->lock);
2270 memcpy(u, ioc->params.i_lcoefs, sizeof(u));
2271 user = ioc->user_cost_model;
2272 spin_unlock_irq(&ioc->lock);
2273
2274 while ((p = strsep(&input, " \t\n"))) {
2275 substring_t args[MAX_OPT_ARGS];
2276 char buf[32];
2277 int tok;
2278 u64 v;
2279
2280 if (!*p)
2281 continue;
2282
2283 switch (match_token(p, cost_ctrl_tokens, args)) {
2284 case COST_CTRL:
2285 match_strlcpy(buf, &args[0], sizeof(buf));
2286 if (!strcmp(buf, "auto"))
2287 user = false;
2288 else if (!strcmp(buf, "user"))
2289 user = true;
2290 else
2291 goto einval;
2292 continue;
2293 case COST_MODEL:
2294 match_strlcpy(buf, &args[0], sizeof(buf));
2295 if (strcmp(buf, "linear"))
2296 goto einval;
2297 continue;
2298 }
2299
2300 tok = match_token(p, i_lcoef_tokens, args);
2301 if (tok == NR_I_LCOEFS)
2302 goto einval;
2303 if (match_u64(&args[0], &v))
2304 goto einval;
2305 u[tok] = v;
2306 user = true;
2307 }
2308
2309 spin_lock_irq(&ioc->lock);
2310 if (user) {
2311 memcpy(ioc->params.i_lcoefs, u, sizeof(u));
2312 ioc->user_cost_model = true;
2313 } else {
2314 ioc->user_cost_model = false;
2315 }
2316 ioc_refresh_params(ioc, true);
2317 spin_unlock_irq(&ioc->lock);
2318
2319 put_disk_and_module(disk);
2320 return nbytes;
2321
2322einval:
2323 ret = -EINVAL;
2324err:
2325 put_disk_and_module(disk);
2326 return ret;
2327}
2328
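/*
 * cgroup interface files: "io.weight" on non-root cgroups plus the
 * device-wide "io.cost.qos" and "io.cost.model" knobs on the root
 * cgroup only.
 */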
2329static struct cftype ioc_files[] = {
2330 {
2331 .name = "weight",
2332 .flags = CFTYPE_NOT_ON_ROOT,
2333 .seq_show = ioc_weight_show,
2334 .write = ioc_weight_write,
2335 },
2336 {
2337 .name = "cost.qos",
2338 .flags = CFTYPE_ONLY_ON_ROOT,
2339 .seq_show = ioc_qos_show,
2340 .write = ioc_qos_write,
2341 },
2342 {
2343 .name = "cost.model",
2344 .flags = CFTYPE_ONLY_ON_ROOT,
2345 .seq_show = ioc_cost_model_show,
2346 .write = ioc_cost_model_write,
2347 },
2348 {}
2349};
2350
2351static struct blkcg_policy blkcg_policy_iocost = {
2352 .dfl_cftypes = ioc_files,
2353 .cpd_alloc_fn = ioc_cpd_alloc,
2354 .cpd_free_fn = ioc_cpd_free,
2355 .pd_alloc_fn = ioc_pd_alloc,
2356 .pd_init_fn = ioc_pd_init,
2357 .pd_free_fn = ioc_pd_free,
2358};
2359
2360static int __init ioc_init(void)
2361{
2362 return blkcg_policy_register(&blkcg_policy_iocost);
2363}
2364
2365static void __exit ioc_exit(void)
2366{
2367 return blkcg_policy_unregister(&blkcg_policy_iocost);
2368}
2369
2370module_init(ioc_init);
2371module_exit(ioc_exit);