Tejun Heo | 7caa471 | 2019-08-28 15:05:58 -0700 | [diff] [blame^] | 1 | /* SPDX-License-Identifier: GPL-2.0 |
| 2 | * |
| 3 | * IO cost model based controller. |
| 4 | * |
| 5 | * Copyright (C) 2019 Tejun Heo <tj@kernel.org> |
| 6 | * Copyright (C) 2019 Andy Newell <newella@fb.com> |
| 7 | * Copyright (C) 2019 Facebook |
| 8 | * |
| 9 | * One challenge of controlling IO resources is the lack of trivially |
| 10 | * observable cost metric. This is distinguished from CPU and memory where |
| 11 | * wallclock time and the number of bytes can serve as accurate enough |
| 12 | * approximations. |
| 13 | * |
| 14 | * Bandwidth and iops are the most commonly used metrics for IO devices but |
| 15 | * depending on the type and specifics of the device, different IO patterns |
| 16 | * easily lead to multiple orders of magnitude variations rendering them |
| 17 | * useless for the purpose of IO capacity distribution. While on-device |
| 18 | * time, with a lot of clutches, could serve as a useful approximation for |
| 19 | * non-queued rotational devices, this is no longer viable with modern |
| 20 | * devices, even the rotational ones. |
| 21 | * |
| 22 | * While there is no cost metric we can trivially observe, it isn't a |
| 23 | * complete mystery. For example, on a rotational device, seek cost |
| 24 | * dominates while a contiguous transfer contributes a smaller amount |
| 25 | * proportional to the size. If we can characterize at least the relative |
| 26 | * costs of these different types of IOs, it should be possible to |
| 27 | * implement a reasonable work-conserving proportional IO resource |
| 28 | * distribution. |
| 29 | * |
| 30 | * 1. IO Cost Model |
| 31 | * |
| 32 | * IO cost model estimates the cost of an IO given its basic parameters and |
| 33 | * history (e.g. the end sector of the last IO). The cost is measured in |
| 34 | * device time. If a given IO is estimated to cost 10ms, the device should |
| 35 | * be able to process ~100 of those IOs in a second. |
| 36 | * |
| 37 | * Currently, there's only one builtin cost model - linear. Each IO is |
| 38 | * classified as sequential or random and given a base cost accordingly. |
| 39 | * On top of that, a size cost proportional to the length of the IO is |
| 40 | * added. While simple, this model captures the operational |
| 41 | * characteristics of a wide varienty of devices well enough. Default |
| 42 | * paramters for several different classes of devices are provided and the |
| 43 | * parameters can be configured from userspace via |
| 44 | * /sys/fs/cgroup/io.cost.model. |
| 45 | * |
| 46 | * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate |
| 47 | * device-specific coefficients. |
| 48 | * |
| 49 | * 2. Control Strategy |
| 50 | * |
| 51 | * The device virtual time (vtime) is used as the primary control metric. |
| 52 | * The control strategy is composed of the following three parts. |
| 53 | * |
| 54 | * 2-1. Vtime Distribution |
| 55 | * |
| 56 | * When a cgroup becomes active in terms of IOs, its hierarchical share is |
| 57 | * calculated. Please consider the following hierarchy where the numbers |
| 58 | * inside parentheses denote the configured weights. |
| 59 | * |
| 60 | * root |
| 61 | * / \ |
| 62 | * A (w:100) B (w:300) |
| 63 | * / \ |
| 64 | * A0 (w:100) A1 (w:100) |
| 65 | * |
| 66 | * If B is idle and only A0 and A1 are actively issuing IOs, as the two are |
| 67 | * of equal weight, each gets 50% share. If then B starts issuing IOs, B |
| 68 | * gets 300/(100+300) or 75% share, and A0 and A1 equally splits the rest, |
| 69 | * 12.5% each. The distribution mechanism only cares about these flattened |
| 70 | * shares. They're called hweights (hierarchical weights) and always add |
| 71 | * upto 1 (HWEIGHT_WHOLE). |
| 72 | * |
| 73 | * A given cgroup's vtime runs slower in inverse proportion to its hweight. |
| 74 | * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5) |
| 75 | * against the device vtime - an IO which takes 10ms on the underlying |
| 76 | * device is considered to take 80ms on A0. |
| 77 | * |
| 78 | * This constitutes the basis of IO capacity distribution. Each cgroup's |
| 79 | * vtime is running at a rate determined by its hweight. A cgroup tracks |
| 80 | * the vtime consumed by past IOs and can issue a new IO iff doing so |
| 81 | * wouldn't outrun the current device vtime. Otherwise, the IO is |
| 82 | * suspended until the vtime has progressed enough to cover it. |
| 83 | * |
| 84 | * 2-2. Vrate Adjustment |
| 85 | * |
| 86 | * It's unrealistic to expect the cost model to be perfect. There are too |
| 87 | * many devices and even on the same device the overall performance |
| 88 | * fluctuates depending on numerous factors such as IO mixture and device |
| 89 | * internal garbage collection. The controller needs to adapt dynamically. |
| 90 | * |
| 91 | * This is achieved by adjusting the overall IO rate according to how busy |
| 92 | * the device is. If the device becomes overloaded, we're sending down too |
| 93 | * many IOs and should generally slow down. If there are waiting issuers |
| 94 | * but the device isn't saturated, we're issuing too few and should |
| 95 | * generally speed up. |
| 96 | * |
| 97 | * To slow down, we lower the vrate - the rate at which the device vtime |
| 98 | * passes compared to the wall clock. For example, if the vtime is running |
| 99 | * at the vrate of 75%, all cgroups added up would only be able to issue |
| 100 | * 750ms worth of IOs per second, and vice-versa for speeding up. |
| 101 | * |
| 102 | * Device business is determined using two criteria - rq wait and |
| 103 | * completion latencies. |
| 104 | * |
| 105 | * When a device gets saturated, the on-device and then the request queues |
| 106 | * fill up and a bio which is ready to be issued has to wait for a request |
| 107 | * to become available. When this delay becomes noticeable, it's a clear |
| 108 | * indication that the device is saturated and we lower the vrate. This |
| 109 | * saturation signal is fairly conservative as it only triggers when both |
| 110 | * hardware and software queues are filled up, and is used as the default |
| 111 | * busy signal. |
| 112 | * |
| 113 | * As devices can have deep queues and be unfair in how the queued commands |
| 114 | * are executed, soley depending on rq wait may not result in satisfactory |
| 115 | * control quality. For a better control quality, completion latency QoS |
| 116 | * parameters can be configured so that the device is considered saturated |
| 117 | * if N'th percentile completion latency rises above the set point. |
| 118 | * |
| 119 | * The completion latency requirements are a function of both the |
| 120 | * underlying device characteristics and the desired IO latency quality of |
| 121 | * service. There is an inherent trade-off - the tighter the latency QoS, |
| 122 | * the higher the bandwidth lossage. Latency QoS is disabled by default |
| 123 | * and can be set through /sys/fs/cgroup/io.cost.qos. |
| 124 | * |
| 125 | * 2-3. Work Conservation |
| 126 | * |
| 127 | * Imagine two cgroups A and B with equal weights. A is issuing a small IO |
| 128 | * periodically while B is sending out enough parallel IOs to saturate the |
| 129 | * device on its own. Let's say A's usage amounts to 100ms worth of IO |
| 130 | * cost per second, i.e., 10% of the device capacity. The naive |
| 131 | * distribution of half and half would lead to 60% utilization of the |
| 132 | * device, a significant reduction in the total amount of work done |
| 133 | * compared to free-for-all competition. This is too high a cost to pay |
| 134 | * for IO control. |
| 135 | * |
| 136 | * To conserve the total amount of work done, we keep track of how much |
| 137 | * each active cgroup is actually using and yield part of its weight if |
| 138 | * there are other cgroups which can make use of it. In the above case, |
| 139 | * A's weight will be lowered so that it hovers above the actual usage and |
| 140 | * B would be able to use the rest. |
| 141 | * |
| 142 | * As we don't want to penalize a cgroup for donating its weight, the |
| 143 | * surplus weight adjustment factors in a margin and has an immediate |
| 144 | * snapback mechanism in case the cgroup needs more IO vtime for itself. |
| 145 | * |
| 146 | * Note that adjusting down surplus weights has the same effects as |
| 147 | * accelerating vtime for other cgroups and work conservation can also be |
| 148 | * implemented by adjusting vrate dynamically. However, squaring who can |
| 149 | * donate and should take back how much requires hweight propagations |
| 150 | * anyway making it easier to implement and understand as a separate |
| 151 | * mechanism. |
| 152 | */ |
| 153 | |
| 154 | #include <linux/kernel.h> |
| 155 | #include <linux/module.h> |
| 156 | #include <linux/timer.h> |
| 157 | #include <linux/time64.h> |
| 158 | #include <linux/parser.h> |
| 159 | #include <linux/sched/signal.h> |
| 160 | #include <linux/blk-cgroup.h> |
| 161 | #include "blk-rq-qos.h" |
| 162 | #include "blk-stat.h" |
| 163 | #include "blk-wbt.h" |
| 164 | |
| 165 | #ifdef CONFIG_TRACEPOINTS |
| 166 | |
| 167 | /* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */ |
| 168 | #define TRACE_IOCG_PATH_LEN 1024 |
| 169 | static DEFINE_SPINLOCK(trace_iocg_path_lock); |
| 170 | static char trace_iocg_path[TRACE_IOCG_PATH_LEN]; |
| 171 | |
| 172 | #define TRACE_IOCG_PATH(type, iocg, ...) \ |
| 173 | do { \ |
| 174 | unsigned long flags; \ |
| 175 | if (trace_iocost_##type##_enabled()) { \ |
| 176 | spin_lock_irqsave(&trace_iocg_path_lock, flags); \ |
| 177 | cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \ |
| 178 | trace_iocg_path, TRACE_IOCG_PATH_LEN); \ |
| 179 | trace_iocost_##type(iocg, trace_iocg_path, \ |
| 180 | ##__VA_ARGS__); \ |
| 181 | spin_unlock_irqrestore(&trace_iocg_path_lock, flags); \ |
| 182 | } \ |
| 183 | } while (0) |
| 184 | |
| 185 | #else /* CONFIG_TRACE_POINTS */ |
| 186 | #define TRACE_IOCG_PATH(type, iocg, ...) do { } while (0) |
| 187 | #endif /* CONFIG_TRACE_POINTS */ |
| 188 | |
| 189 | enum { |
| 190 | MILLION = 1000000, |
| 191 | |
| 192 | /* timer period is calculated from latency requirements, bound it */ |
| 193 | MIN_PERIOD = USEC_PER_MSEC, |
| 194 | MAX_PERIOD = USEC_PER_SEC, |
| 195 | |
| 196 | /* |
| 197 | * A cgroup's vtime can run 50% behind the device vtime, which |
| 198 | * serves as its IO credit buffer. Surplus weight adjustment is |
| 199 | * immediately canceled if the vtime margin runs below 10%. |
| 200 | */ |
| 201 | MARGIN_PCT = 50, |
| 202 | INUSE_MARGIN_PCT = 10, |
| 203 | |
| 204 | /* Have some play in waitq timer operations */ |
| 205 | WAITQ_TIMER_MARGIN_PCT = 5, |
| 206 | |
| 207 | /* |
| 208 | * vtime can wrap well within a reasonable uptime when vrate is |
| 209 | * consistently raised. Don't trust recorded cgroup vtime if the |
| 210 | * period counter indicates that it's older than 5mins. |
| 211 | */ |
| 212 | VTIME_VALID_DUR = 300 * USEC_PER_SEC, |
| 213 | |
| 214 | /* |
| 215 | * Remember the past three non-zero usages and use the max for |
| 216 | * surplus calculation. Three slots guarantee that we remember one |
| 217 | * full period usage from the last active stretch even after |
| 218 | * partial deactivation and re-activation periods. Don't start |
| 219 | * giving away weight before collecting two data points to prevent |
| 220 | * hweight adjustments based on one partial activation period. |
| 221 | */ |
| 222 | NR_USAGE_SLOTS = 3, |
| 223 | MIN_VALID_USAGES = 2, |
| 224 | |
| 225 | /* 1/64k is granular enough and can easily be handled w/ u32 */ |
| 226 | HWEIGHT_WHOLE = 1 << 16, |
| 227 | |
| 228 | /* |
| 229 | * As vtime is used to calculate the cost of each IO, it needs to |
| 230 | * be fairly high precision. For example, it should be able to |
| 231 | * represent the cost of a single page worth of discard with |
| 232 | * suffificient accuracy. At the same time, it should be able to |
| 233 | * represent reasonably long enough durations to be useful and |
| 234 | * convenient during operation. |
| 235 | * |
| 236 | * 1s worth of vtime is 2^37. This gives us both sub-nanosecond |
| 237 | * granularity and days of wrap-around time even at extreme vrates. |
| 238 | */ |
| 239 | VTIME_PER_SEC_SHIFT = 37, |
| 240 | VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT, |
| 241 | VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC, |
| 242 | |
| 243 | /* bound vrate adjustments within two orders of magnitude */ |
| 244 | VRATE_MIN_PPM = 10000, /* 1% */ |
| 245 | VRATE_MAX_PPM = 100000000, /* 10000% */ |
| 246 | |
| 247 | VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION, |
| 248 | VRATE_CLAMP_ADJ_PCT = 4, |
| 249 | |
| 250 | /* if IOs end up waiting for requests, issue less */ |
| 251 | RQ_WAIT_BUSY_PCT = 5, |
| 252 | |
| 253 | /* unbusy hysterisis */ |
| 254 | UNBUSY_THR_PCT = 75, |
| 255 | |
| 256 | /* don't let cmds which take a very long time pin lagging for too long */ |
| 257 | MAX_LAGGING_PERIODS = 10, |
| 258 | |
| 259 | /* |
| 260 | * If usage% * 1.25 + 2% is lower than hweight% by more than 3%, |
| 261 | * donate the surplus. |
| 262 | */ |
| 263 | SURPLUS_SCALE_PCT = 125, /* * 125% */ |
| 264 | SURPLUS_SCALE_ABS = HWEIGHT_WHOLE / 50, /* + 2% */ |
| 265 | SURPLUS_MIN_ADJ_DELTA = HWEIGHT_WHOLE / 33, /* 3% */ |
| 266 | |
| 267 | /* switch iff the conditions are met for longer than this */ |
| 268 | AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC, |
| 269 | |
| 270 | /* |
| 271 | * Count IO size in 4k pages. The 12bit shift helps keeping |
| 272 | * size-proportional components of cost calculation in closer |
| 273 | * numbers of digits to per-IO cost components. |
| 274 | */ |
| 275 | IOC_PAGE_SHIFT = 12, |
| 276 | IOC_PAGE_SIZE = 1 << IOC_PAGE_SHIFT, |
| 277 | IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT, |
| 278 | |
| 279 | /* if apart further than 16M, consider randio for linear model */ |
| 280 | LCOEF_RANDIO_PAGES = 4096, |
| 281 | }; |
| 282 | |
| 283 | enum ioc_running { |
| 284 | IOC_IDLE, |
| 285 | IOC_RUNNING, |
| 286 | IOC_STOP, |
| 287 | }; |
| 288 | |
| 289 | /* io.cost.qos controls including per-dev enable of the whole controller */ |
| 290 | enum { |
| 291 | QOS_ENABLE, |
| 292 | QOS_CTRL, |
| 293 | NR_QOS_CTRL_PARAMS, |
| 294 | }; |
| 295 | |
| 296 | /* io.cost.qos params */ |
| 297 | enum { |
| 298 | QOS_RPPM, |
| 299 | QOS_RLAT, |
| 300 | QOS_WPPM, |
| 301 | QOS_WLAT, |
| 302 | QOS_MIN, |
| 303 | QOS_MAX, |
| 304 | NR_QOS_PARAMS, |
| 305 | }; |
| 306 | |
| 307 | /* io.cost.model controls */ |
| 308 | enum { |
| 309 | COST_CTRL, |
| 310 | COST_MODEL, |
| 311 | NR_COST_CTRL_PARAMS, |
| 312 | }; |
| 313 | |
| 314 | /* builtin linear cost model coefficients */ |
| 315 | enum { |
| 316 | I_LCOEF_RBPS, |
| 317 | I_LCOEF_RSEQIOPS, |
| 318 | I_LCOEF_RRANDIOPS, |
| 319 | I_LCOEF_WBPS, |
| 320 | I_LCOEF_WSEQIOPS, |
| 321 | I_LCOEF_WRANDIOPS, |
| 322 | NR_I_LCOEFS, |
| 323 | }; |
| 324 | |
| 325 | enum { |
| 326 | LCOEF_RPAGE, |
| 327 | LCOEF_RSEQIO, |
| 328 | LCOEF_RRANDIO, |
| 329 | LCOEF_WPAGE, |
| 330 | LCOEF_WSEQIO, |
| 331 | LCOEF_WRANDIO, |
| 332 | NR_LCOEFS, |
| 333 | }; |
| 334 | |
| 335 | enum { |
| 336 | AUTOP_INVALID, |
| 337 | AUTOP_HDD, |
| 338 | AUTOP_SSD_QD1, |
| 339 | AUTOP_SSD_DFL, |
| 340 | AUTOP_SSD_FAST, |
| 341 | }; |
| 342 | |
| 343 | struct ioc_gq; |
| 344 | |
| 345 | struct ioc_params { |
| 346 | u32 qos[NR_QOS_PARAMS]; |
| 347 | u64 i_lcoefs[NR_I_LCOEFS]; |
| 348 | u64 lcoefs[NR_LCOEFS]; |
| 349 | u32 too_fast_vrate_pct; |
| 350 | u32 too_slow_vrate_pct; |
| 351 | }; |
| 352 | |
| 353 | struct ioc_missed { |
| 354 | u32 nr_met; |
| 355 | u32 nr_missed; |
| 356 | u32 last_met; |
| 357 | u32 last_missed; |
| 358 | }; |
| 359 | |
| 360 | struct ioc_pcpu_stat { |
| 361 | struct ioc_missed missed[2]; |
| 362 | |
| 363 | u64 rq_wait_ns; |
| 364 | u64 last_rq_wait_ns; |
| 365 | }; |
| 366 | |
| 367 | /* per device */ |
| 368 | struct ioc { |
| 369 | struct rq_qos rqos; |
| 370 | |
| 371 | bool enabled; |
| 372 | |
| 373 | struct ioc_params params; |
| 374 | u32 period_us; |
| 375 | u32 margin_us; |
| 376 | u64 vrate_min; |
| 377 | u64 vrate_max; |
| 378 | |
| 379 | spinlock_t lock; |
| 380 | struct timer_list timer; |
| 381 | struct list_head active_iocgs; /* active cgroups */ |
| 382 | struct ioc_pcpu_stat __percpu *pcpu_stat; |
| 383 | |
| 384 | enum ioc_running running; |
| 385 | atomic64_t vtime_rate; |
| 386 | |
| 387 | seqcount_t period_seqcount; |
| 388 | u32 period_at; /* wallclock starttime */ |
| 389 | u64 period_at_vtime; /* vtime starttime */ |
| 390 | |
| 391 | atomic64_t cur_period; /* inc'd each period */ |
| 392 | int busy_level; /* saturation history */ |
| 393 | |
| 394 | u64 inuse_margin_vtime; |
| 395 | bool weights_updated; |
| 396 | atomic_t hweight_gen; /* for lazy hweights */ |
| 397 | |
| 398 | u64 autop_too_fast_at; |
| 399 | u64 autop_too_slow_at; |
| 400 | int autop_idx; |
| 401 | bool user_qos_params:1; |
| 402 | bool user_cost_model:1; |
| 403 | }; |
| 404 | |
| 405 | /* per device-cgroup pair */ |
| 406 | struct ioc_gq { |
| 407 | struct blkg_policy_data pd; |
| 408 | struct ioc *ioc; |
| 409 | |
| 410 | /* |
| 411 | * A iocg can get its weight from two sources - an explicit |
| 412 | * per-device-cgroup configuration or the default weight of the |
| 413 | * cgroup. `cfg_weight` is the explicit per-device-cgroup |
| 414 | * configuration. `weight` is the effective considering both |
| 415 | * sources. |
| 416 | * |
| 417 | * When an idle cgroup becomes active its `active` goes from 0 to |
| 418 | * `weight`. `inuse` is the surplus adjusted active weight. |
| 419 | * `active` and `inuse` are used to calculate `hweight_active` and |
| 420 | * `hweight_inuse`. |
| 421 | * |
| 422 | * `last_inuse` remembers `inuse` while an iocg is idle to persist |
| 423 | * surplus adjustments. |
| 424 | */ |
| 425 | u32 cfg_weight; |
| 426 | u32 weight; |
| 427 | u32 active; |
| 428 | u32 inuse; |
| 429 | u32 last_inuse; |
| 430 | |
| 431 | sector_t cursor; /* to detect randio */ |
| 432 | |
| 433 | /* |
| 434 | * `vtime` is this iocg's vtime cursor which progresses as IOs are |
| 435 | * issued. If lagging behind device vtime, the delta represents |
| 436 | * the currently available IO budget. If runnning ahead, the |
| 437 | * overage. |
| 438 | * |
| 439 | * `vtime_done` is the same but progressed on completion rather |
| 440 | * than issue. The delta behind `vtime` represents the cost of |
| 441 | * currently in-flight IOs. |
| 442 | * |
| 443 | * `last_vtime` is used to remember `vtime` at the end of the last |
| 444 | * period to calculate utilization. |
| 445 | */ |
| 446 | atomic64_t vtime; |
| 447 | atomic64_t done_vtime; |
| 448 | u64 last_vtime; |
| 449 | |
| 450 | /* |
| 451 | * The period this iocg was last active in. Used for deactivation |
| 452 | * and invalidating `vtime`. |
| 453 | */ |
| 454 | atomic64_t active_period; |
| 455 | struct list_head active_list; |
| 456 | |
| 457 | /* see __propagate_active_weight() and current_hweight() for details */ |
| 458 | u64 child_active_sum; |
| 459 | u64 child_inuse_sum; |
| 460 | int hweight_gen; |
| 461 | u32 hweight_active; |
| 462 | u32 hweight_inuse; |
| 463 | bool has_surplus; |
| 464 | |
| 465 | struct wait_queue_head waitq; |
| 466 | struct hrtimer waitq_timer; |
| 467 | struct hrtimer delay_timer; |
| 468 | |
| 469 | /* usage is recorded as fractions of HWEIGHT_WHOLE */ |
| 470 | int usage_idx; |
| 471 | u32 usages[NR_USAGE_SLOTS]; |
| 472 | |
| 473 | /* this iocg's depth in the hierarchy and ancestors including self */ |
| 474 | int level; |
| 475 | struct ioc_gq *ancestors[]; |
| 476 | }; |
| 477 | |
| 478 | /* per cgroup */ |
| 479 | struct ioc_cgrp { |
| 480 | struct blkcg_policy_data cpd; |
| 481 | unsigned int dfl_weight; |
| 482 | }; |
| 483 | |
| 484 | struct ioc_now { |
| 485 | u64 now_ns; |
| 486 | u32 now; |
| 487 | u64 vnow; |
| 488 | u64 vrate; |
| 489 | }; |
| 490 | |
| 491 | struct iocg_wait { |
| 492 | struct wait_queue_entry wait; |
| 493 | struct bio *bio; |
| 494 | u64 abs_cost; |
| 495 | bool committed; |
| 496 | }; |
| 497 | |
| 498 | struct iocg_wake_ctx { |
| 499 | struct ioc_gq *iocg; |
| 500 | u32 hw_inuse; |
| 501 | s64 vbudget; |
| 502 | }; |
| 503 | |
| 504 | static const struct ioc_params autop[] = { |
| 505 | [AUTOP_HDD] = { |
| 506 | .qos = { |
| 507 | [QOS_RLAT] = 50000, /* 50ms */ |
| 508 | [QOS_WLAT] = 50000, |
| 509 | [QOS_MIN] = VRATE_MIN_PPM, |
| 510 | [QOS_MAX] = VRATE_MAX_PPM, |
| 511 | }, |
| 512 | .i_lcoefs = { |
| 513 | [I_LCOEF_RBPS] = 174019176, |
| 514 | [I_LCOEF_RSEQIOPS] = 41708, |
| 515 | [I_LCOEF_RRANDIOPS] = 370, |
| 516 | [I_LCOEF_WBPS] = 178075866, |
| 517 | [I_LCOEF_WSEQIOPS] = 42705, |
| 518 | [I_LCOEF_WRANDIOPS] = 378, |
| 519 | }, |
| 520 | }, |
| 521 | [AUTOP_SSD_QD1] = { |
| 522 | .qos = { |
| 523 | [QOS_RLAT] = 25000, /* 25ms */ |
| 524 | [QOS_WLAT] = 25000, |
| 525 | [QOS_MIN] = VRATE_MIN_PPM, |
| 526 | [QOS_MAX] = VRATE_MAX_PPM, |
| 527 | }, |
| 528 | .i_lcoefs = { |
| 529 | [I_LCOEF_RBPS] = 245855193, |
| 530 | [I_LCOEF_RSEQIOPS] = 61575, |
| 531 | [I_LCOEF_RRANDIOPS] = 6946, |
| 532 | [I_LCOEF_WBPS] = 141365009, |
| 533 | [I_LCOEF_WSEQIOPS] = 33716, |
| 534 | [I_LCOEF_WRANDIOPS] = 26796, |
| 535 | }, |
| 536 | }, |
| 537 | [AUTOP_SSD_DFL] = { |
| 538 | .qos = { |
| 539 | [QOS_RLAT] = 25000, /* 25ms */ |
| 540 | [QOS_WLAT] = 25000, |
| 541 | [QOS_MIN] = VRATE_MIN_PPM, |
| 542 | [QOS_MAX] = VRATE_MAX_PPM, |
| 543 | }, |
| 544 | .i_lcoefs = { |
| 545 | [I_LCOEF_RBPS] = 488636629, |
| 546 | [I_LCOEF_RSEQIOPS] = 8932, |
| 547 | [I_LCOEF_RRANDIOPS] = 8518, |
| 548 | [I_LCOEF_WBPS] = 427891549, |
| 549 | [I_LCOEF_WSEQIOPS] = 28755, |
| 550 | [I_LCOEF_WRANDIOPS] = 21940, |
| 551 | }, |
| 552 | .too_fast_vrate_pct = 500, |
| 553 | }, |
| 554 | [AUTOP_SSD_FAST] = { |
| 555 | .qos = { |
| 556 | [QOS_RLAT] = 5000, /* 5ms */ |
| 557 | [QOS_WLAT] = 5000, |
| 558 | [QOS_MIN] = VRATE_MIN_PPM, |
| 559 | [QOS_MAX] = VRATE_MAX_PPM, |
| 560 | }, |
| 561 | .i_lcoefs = { |
| 562 | [I_LCOEF_RBPS] = 3102524156LLU, |
| 563 | [I_LCOEF_RSEQIOPS] = 724816, |
| 564 | [I_LCOEF_RRANDIOPS] = 778122, |
| 565 | [I_LCOEF_WBPS] = 1742780862LLU, |
| 566 | [I_LCOEF_WSEQIOPS] = 425702, |
| 567 | [I_LCOEF_WRANDIOPS] = 443193, |
| 568 | }, |
| 569 | .too_slow_vrate_pct = 10, |
| 570 | }, |
| 571 | }; |
| 572 | |
| 573 | /* |
| 574 | * vrate adjust percentages indexed by ioc->busy_level. We adjust up on |
| 575 | * vtime credit shortage and down on device saturation. |
| 576 | */ |
| 577 | static u32 vrate_adj_pct[] = |
| 578 | { 0, 0, 0, 0, |
| 579 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 580 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| 581 | 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 }; |
| 582 | |
| 583 | static struct blkcg_policy blkcg_policy_iocost; |
| 584 | |
| 585 | /* accessors and helpers */ |
| 586 | static struct ioc *rqos_to_ioc(struct rq_qos *rqos) |
| 587 | { |
| 588 | return container_of(rqos, struct ioc, rqos); |
| 589 | } |
| 590 | |
| 591 | static struct ioc *q_to_ioc(struct request_queue *q) |
| 592 | { |
| 593 | return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST)); |
| 594 | } |
| 595 | |
| 596 | static const char *q_name(struct request_queue *q) |
| 597 | { |
| 598 | if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) |
| 599 | return kobject_name(q->kobj.parent); |
| 600 | else |
| 601 | return "<unknown>"; |
| 602 | } |
| 603 | |
| 604 | static const char __maybe_unused *ioc_name(struct ioc *ioc) |
| 605 | { |
| 606 | return q_name(ioc->rqos.q); |
| 607 | } |
| 608 | |
| 609 | static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd) |
| 610 | { |
| 611 | return pd ? container_of(pd, struct ioc_gq, pd) : NULL; |
| 612 | } |
| 613 | |
| 614 | static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg) |
| 615 | { |
| 616 | return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost)); |
| 617 | } |
| 618 | |
| 619 | static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg) |
| 620 | { |
| 621 | return pd_to_blkg(&iocg->pd); |
| 622 | } |
| 623 | |
| 624 | static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg) |
| 625 | { |
| 626 | return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost), |
| 627 | struct ioc_cgrp, cpd); |
| 628 | } |
| 629 | |
| 630 | /* |
| 631 | * Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical |
| 632 | * weight, the more expensive each IO. |
| 633 | */ |
| 634 | static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse) |
| 635 | { |
| 636 | return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse); |
| 637 | } |
| 638 | |
| 639 | static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost) |
| 640 | { |
| 641 | bio->bi_iocost_cost = cost; |
| 642 | atomic64_add(cost, &iocg->vtime); |
| 643 | } |
| 644 | |
| 645 | #define CREATE_TRACE_POINTS |
| 646 | #include <trace/events/iocost.h> |
| 647 | |
| 648 | /* latency Qos params changed, update period_us and all the dependent params */ |
| 649 | static void ioc_refresh_period_us(struct ioc *ioc) |
| 650 | { |
| 651 | u32 ppm, lat, multi, period_us; |
| 652 | |
| 653 | lockdep_assert_held(&ioc->lock); |
| 654 | |
| 655 | /* pick the higher latency target */ |
| 656 | if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) { |
| 657 | ppm = ioc->params.qos[QOS_RPPM]; |
| 658 | lat = ioc->params.qos[QOS_RLAT]; |
| 659 | } else { |
| 660 | ppm = ioc->params.qos[QOS_WPPM]; |
| 661 | lat = ioc->params.qos[QOS_WLAT]; |
| 662 | } |
| 663 | |
| 664 | /* |
| 665 | * We want the period to be long enough to contain a healthy number |
| 666 | * of IOs while short enough for granular control. Define it as a |
| 667 | * multiple of the latency target. Ideally, the multiplier should |
| 668 | * be scaled according to the percentile so that it would nominally |
| 669 | * contain a certain number of requests. Let's be simpler and |
| 670 | * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50). |
| 671 | */ |
| 672 | if (ppm) |
| 673 | multi = max_t(u32, (MILLION - ppm) / 50000, 2); |
| 674 | else |
| 675 | multi = 2; |
| 676 | period_us = multi * lat; |
| 677 | period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD); |
| 678 | |
| 679 | /* calculate dependent params */ |
| 680 | ioc->period_us = period_us; |
| 681 | ioc->margin_us = period_us * MARGIN_PCT / 100; |
| 682 | ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP( |
| 683 | period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100); |
| 684 | } |
| 685 | |
| 686 | static int ioc_autop_idx(struct ioc *ioc) |
| 687 | { |
| 688 | int idx = ioc->autop_idx; |
| 689 | const struct ioc_params *p = &autop[idx]; |
| 690 | u32 vrate_pct; |
| 691 | u64 now_ns; |
| 692 | |
| 693 | /* rotational? */ |
| 694 | if (!blk_queue_nonrot(ioc->rqos.q)) |
| 695 | return AUTOP_HDD; |
| 696 | |
| 697 | /* handle SATA SSDs w/ broken NCQ */ |
| 698 | if (blk_queue_depth(ioc->rqos.q) == 1) |
| 699 | return AUTOP_SSD_QD1; |
| 700 | |
| 701 | /* use one of the normal ssd sets */ |
| 702 | if (idx < AUTOP_SSD_DFL) |
| 703 | return AUTOP_SSD_DFL; |
| 704 | |
| 705 | /* if user is overriding anything, maintain what was there */ |
| 706 | if (ioc->user_qos_params || ioc->user_cost_model) |
| 707 | return idx; |
| 708 | |
| 709 | /* step up/down based on the vrate */ |
| 710 | vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100, |
| 711 | VTIME_PER_USEC); |
| 712 | now_ns = ktime_get_ns(); |
| 713 | |
| 714 | if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) { |
| 715 | if (!ioc->autop_too_fast_at) |
| 716 | ioc->autop_too_fast_at = now_ns; |
| 717 | if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC) |
| 718 | return idx + 1; |
| 719 | } else { |
| 720 | ioc->autop_too_fast_at = 0; |
| 721 | } |
| 722 | |
| 723 | if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) { |
| 724 | if (!ioc->autop_too_slow_at) |
| 725 | ioc->autop_too_slow_at = now_ns; |
| 726 | if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC) |
| 727 | return idx - 1; |
| 728 | } else { |
| 729 | ioc->autop_too_slow_at = 0; |
| 730 | } |
| 731 | |
| 732 | return idx; |
| 733 | } |
| 734 | |
| 735 | /* |
| 736 | * Take the followings as input |
| 737 | * |
| 738 | * @bps maximum sequential throughput |
| 739 | * @seqiops maximum sequential 4k iops |
| 740 | * @randiops maximum random 4k iops |
| 741 | * |
| 742 | * and calculate the linear model cost coefficients. |
| 743 | * |
| 744 | * *@page per-page cost 1s / (@bps / 4096) |
| 745 | * *@seqio base cost of a seq IO max((1s / @seqiops) - *@page, 0) |
| 746 | * @randiops base cost of a rand IO max((1s / @randiops) - *@page, 0) |
| 747 | */ |
| 748 | static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops, |
| 749 | u64 *page, u64 *seqio, u64 *randio) |
| 750 | { |
| 751 | u64 v; |
| 752 | |
| 753 | *page = *seqio = *randio = 0; |
| 754 | |
| 755 | if (bps) |
| 756 | *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC, |
| 757 | DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE)); |
| 758 | |
| 759 | if (seqiops) { |
| 760 | v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops); |
| 761 | if (v > *page) |
| 762 | *seqio = v - *page; |
| 763 | } |
| 764 | |
| 765 | if (randiops) { |
| 766 | v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops); |
| 767 | if (v > *page) |
| 768 | *randio = v - *page; |
| 769 | } |
| 770 | } |
| 771 | |
| 772 | static void ioc_refresh_lcoefs(struct ioc *ioc) |
| 773 | { |
| 774 | u64 *u = ioc->params.i_lcoefs; |
| 775 | u64 *c = ioc->params.lcoefs; |
| 776 | |
| 777 | calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS], |
| 778 | &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]); |
| 779 | calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS], |
| 780 | &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]); |
| 781 | } |
| 782 | |
| 783 | static bool ioc_refresh_params(struct ioc *ioc, bool force) |
| 784 | { |
| 785 | const struct ioc_params *p; |
| 786 | int idx; |
| 787 | |
| 788 | lockdep_assert_held(&ioc->lock); |
| 789 | |
| 790 | idx = ioc_autop_idx(ioc); |
| 791 | p = &autop[idx]; |
| 792 | |
| 793 | if (idx == ioc->autop_idx && !force) |
| 794 | return false; |
| 795 | |
| 796 | if (idx != ioc->autop_idx) |
| 797 | atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); |
| 798 | |
| 799 | ioc->autop_idx = idx; |
| 800 | ioc->autop_too_fast_at = 0; |
| 801 | ioc->autop_too_slow_at = 0; |
| 802 | |
| 803 | if (!ioc->user_qos_params) |
| 804 | memcpy(ioc->params.qos, p->qos, sizeof(p->qos)); |
| 805 | if (!ioc->user_cost_model) |
| 806 | memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs)); |
| 807 | |
| 808 | ioc_refresh_period_us(ioc); |
| 809 | ioc_refresh_lcoefs(ioc); |
| 810 | |
| 811 | ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] * |
| 812 | VTIME_PER_USEC, MILLION); |
| 813 | ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] * |
| 814 | VTIME_PER_USEC, MILLION); |
| 815 | |
| 816 | return true; |
| 817 | } |
| 818 | |
| 819 | /* take a snapshot of the current [v]time and vrate */ |
| 820 | static void ioc_now(struct ioc *ioc, struct ioc_now *now) |
| 821 | { |
| 822 | unsigned seq; |
| 823 | |
| 824 | now->now_ns = ktime_get(); |
| 825 | now->now = ktime_to_us(now->now_ns); |
| 826 | now->vrate = atomic64_read(&ioc->vtime_rate); |
| 827 | |
| 828 | /* |
| 829 | * The current vtime is |
| 830 | * |
| 831 | * vtime at period start + (wallclock time since the start) * vrate |
| 832 | * |
| 833 | * As a consistent snapshot of `period_at_vtime` and `period_at` is |
| 834 | * needed, they're seqcount protected. |
| 835 | */ |
| 836 | do { |
| 837 | seq = read_seqcount_begin(&ioc->period_seqcount); |
| 838 | now->vnow = ioc->period_at_vtime + |
| 839 | (now->now - ioc->period_at) * now->vrate; |
| 840 | } while (read_seqcount_retry(&ioc->period_seqcount, seq)); |
| 841 | } |
| 842 | |
| 843 | static void ioc_start_period(struct ioc *ioc, struct ioc_now *now) |
| 844 | { |
| 845 | lockdep_assert_held(&ioc->lock); |
| 846 | WARN_ON_ONCE(ioc->running != IOC_RUNNING); |
| 847 | |
| 848 | write_seqcount_begin(&ioc->period_seqcount); |
| 849 | ioc->period_at = now->now; |
| 850 | ioc->period_at_vtime = now->vnow; |
| 851 | write_seqcount_end(&ioc->period_seqcount); |
| 852 | |
| 853 | ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us); |
| 854 | add_timer(&ioc->timer); |
| 855 | } |
| 856 | |
| 857 | /* |
| 858 | * Update @iocg's `active` and `inuse` to @active and @inuse, update level |
| 859 | * weight sums and propagate upwards accordingly. |
| 860 | */ |
| 861 | static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse) |
| 862 | { |
| 863 | struct ioc *ioc = iocg->ioc; |
| 864 | int lvl; |
| 865 | |
| 866 | lockdep_assert_held(&ioc->lock); |
| 867 | |
| 868 | inuse = min(active, inuse); |
| 869 | |
| 870 | for (lvl = iocg->level - 1; lvl >= 0; lvl--) { |
| 871 | struct ioc_gq *parent = iocg->ancestors[lvl]; |
| 872 | struct ioc_gq *child = iocg->ancestors[lvl + 1]; |
| 873 | u32 parent_active = 0, parent_inuse = 0; |
| 874 | |
| 875 | /* update the level sums */ |
| 876 | parent->child_active_sum += (s32)(active - child->active); |
| 877 | parent->child_inuse_sum += (s32)(inuse - child->inuse); |
| 878 | /* apply the udpates */ |
| 879 | child->active = active; |
| 880 | child->inuse = inuse; |
| 881 | |
| 882 | /* |
| 883 | * The delta between inuse and active sums indicates that |
| 884 | * that much of weight is being given away. Parent's inuse |
| 885 | * and active should reflect the ratio. |
| 886 | */ |
| 887 | if (parent->child_active_sum) { |
| 888 | parent_active = parent->weight; |
| 889 | parent_inuse = DIV64_U64_ROUND_UP( |
| 890 | parent_active * parent->child_inuse_sum, |
| 891 | parent->child_active_sum); |
| 892 | } |
| 893 | |
| 894 | /* do we need to keep walking up? */ |
| 895 | if (parent_active == parent->active && |
| 896 | parent_inuse == parent->inuse) |
| 897 | break; |
| 898 | |
| 899 | active = parent_active; |
| 900 | inuse = parent_inuse; |
| 901 | } |
| 902 | |
| 903 | ioc->weights_updated = true; |
| 904 | } |
| 905 | |
| 906 | static void commit_active_weights(struct ioc *ioc) |
| 907 | { |
| 908 | lockdep_assert_held(&ioc->lock); |
| 909 | |
| 910 | if (ioc->weights_updated) { |
| 911 | /* paired with rmb in current_hweight(), see there */ |
| 912 | smp_wmb(); |
| 913 | atomic_inc(&ioc->hweight_gen); |
| 914 | ioc->weights_updated = false; |
| 915 | } |
| 916 | } |
| 917 | |
| 918 | static void propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse) |
| 919 | { |
| 920 | __propagate_active_weight(iocg, active, inuse); |
| 921 | commit_active_weights(iocg->ioc); |
| 922 | } |
| 923 | |
| 924 | static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep) |
| 925 | { |
| 926 | struct ioc *ioc = iocg->ioc; |
| 927 | int lvl; |
| 928 | u32 hwa, hwi; |
| 929 | int ioc_gen; |
| 930 | |
| 931 | /* hot path - if uptodate, use cached */ |
| 932 | ioc_gen = atomic_read(&ioc->hweight_gen); |
| 933 | if (ioc_gen == iocg->hweight_gen) |
| 934 | goto out; |
| 935 | |
| 936 | /* |
| 937 | * Paired with wmb in commit_active_weights(). If we saw the |
| 938 | * updated hweight_gen, all the weight updates from |
| 939 | * __propagate_active_weight() are visible too. |
| 940 | * |
| 941 | * We can race with weight updates during calculation and get it |
| 942 | * wrong. However, hweight_gen would have changed and a future |
| 943 | * reader will recalculate and we're guaranteed to discard the |
| 944 | * wrong result soon. |
| 945 | */ |
| 946 | smp_rmb(); |
| 947 | |
| 948 | hwa = hwi = HWEIGHT_WHOLE; |
| 949 | for (lvl = 0; lvl <= iocg->level - 1; lvl++) { |
| 950 | struct ioc_gq *parent = iocg->ancestors[lvl]; |
| 951 | struct ioc_gq *child = iocg->ancestors[lvl + 1]; |
| 952 | u32 active_sum = READ_ONCE(parent->child_active_sum); |
| 953 | u32 inuse_sum = READ_ONCE(parent->child_inuse_sum); |
| 954 | u32 active = READ_ONCE(child->active); |
| 955 | u32 inuse = READ_ONCE(child->inuse); |
| 956 | |
| 957 | /* we can race with deactivations and either may read as zero */ |
| 958 | if (!active_sum || !inuse_sum) |
| 959 | continue; |
| 960 | |
| 961 | active_sum = max(active, active_sum); |
| 962 | hwa = hwa * active / active_sum; /* max 16bits * 10000 */ |
| 963 | |
| 964 | inuse_sum = max(inuse, inuse_sum); |
| 965 | hwi = hwi * inuse / inuse_sum; /* max 16bits * 10000 */ |
| 966 | } |
| 967 | |
| 968 | iocg->hweight_active = max_t(u32, hwa, 1); |
| 969 | iocg->hweight_inuse = max_t(u32, hwi, 1); |
| 970 | iocg->hweight_gen = ioc_gen; |
| 971 | out: |
| 972 | if (hw_activep) |
| 973 | *hw_activep = iocg->hweight_active; |
| 974 | if (hw_inusep) |
| 975 | *hw_inusep = iocg->hweight_inuse; |
| 976 | } |
| 977 | |
| 978 | static void weight_updated(struct ioc_gq *iocg) |
| 979 | { |
| 980 | struct ioc *ioc = iocg->ioc; |
| 981 | struct blkcg_gq *blkg = iocg_to_blkg(iocg); |
| 982 | struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg); |
| 983 | u32 weight; |
| 984 | |
| 985 | lockdep_assert_held(&ioc->lock); |
| 986 | |
| 987 | weight = iocg->cfg_weight ?: iocc->dfl_weight; |
| 988 | if (weight != iocg->weight && iocg->active) |
| 989 | propagate_active_weight(iocg, weight, |
| 990 | DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight)); |
| 991 | iocg->weight = weight; |
| 992 | } |
| 993 | |
| 994 | static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) |
| 995 | { |
| 996 | struct ioc *ioc = iocg->ioc; |
| 997 | u64 last_period, cur_period, max_period_delta; |
| 998 | u64 vtime, vmargin, vmin; |
| 999 | int i; |
| 1000 | |
| 1001 | /* |
| 1002 | * If seem to be already active, just update the stamp to tell the |
| 1003 | * timer that we're still active. We don't mind occassional races. |
| 1004 | */ |
| 1005 | if (!list_empty(&iocg->active_list)) { |
| 1006 | ioc_now(ioc, now); |
| 1007 | cur_period = atomic64_read(&ioc->cur_period); |
| 1008 | if (atomic64_read(&iocg->active_period) != cur_period) |
| 1009 | atomic64_set(&iocg->active_period, cur_period); |
| 1010 | return true; |
| 1011 | } |
| 1012 | |
| 1013 | /* racy check on internal node IOs, treat as root level IOs */ |
| 1014 | if (iocg->child_active_sum) |
| 1015 | return false; |
| 1016 | |
| 1017 | spin_lock_irq(&ioc->lock); |
| 1018 | |
| 1019 | ioc_now(ioc, now); |
| 1020 | |
| 1021 | /* update period */ |
| 1022 | cur_period = atomic64_read(&ioc->cur_period); |
| 1023 | last_period = atomic64_read(&iocg->active_period); |
| 1024 | atomic64_set(&iocg->active_period, cur_period); |
| 1025 | |
| 1026 | /* already activated or breaking leaf-only constraint? */ |
| 1027 | for (i = iocg->level; i > 0; i--) |
| 1028 | if (!list_empty(&iocg->active_list)) |
| 1029 | goto fail_unlock; |
| 1030 | if (iocg->child_active_sum) |
| 1031 | goto fail_unlock; |
| 1032 | |
| 1033 | /* |
| 1034 | * vtime may wrap when vrate is raised substantially due to |
| 1035 | * underestimated IO costs. Look at the period and ignore its |
| 1036 | * vtime if the iocg has been idle for too long. Also, cap the |
| 1037 | * budget it can start with to the margin. |
| 1038 | */ |
| 1039 | max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us); |
| 1040 | vtime = atomic64_read(&iocg->vtime); |
| 1041 | vmargin = ioc->margin_us * now->vrate; |
| 1042 | vmin = now->vnow - vmargin; |
| 1043 | |
| 1044 | if (last_period + max_period_delta < cur_period || |
| 1045 | time_before64(vtime, vmin)) { |
| 1046 | atomic64_add(vmin - vtime, &iocg->vtime); |
| 1047 | atomic64_add(vmin - vtime, &iocg->done_vtime); |
| 1048 | vtime = vmin; |
| 1049 | } |
| 1050 | |
| 1051 | /* |
| 1052 | * Activate, propagate weight and start period timer if not |
| 1053 | * running. Reset hweight_gen to avoid accidental match from |
| 1054 | * wrapping. |
| 1055 | */ |
| 1056 | iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1; |
| 1057 | list_add(&iocg->active_list, &ioc->active_iocgs); |
| 1058 | propagate_active_weight(iocg, iocg->weight, |
| 1059 | iocg->last_inuse ?: iocg->weight); |
| 1060 | |
| 1061 | TRACE_IOCG_PATH(iocg_activate, iocg, now, |
| 1062 | last_period, cur_period, vtime); |
| 1063 | |
| 1064 | iocg->last_vtime = vtime; |
| 1065 | |
| 1066 | if (ioc->running == IOC_IDLE) { |
| 1067 | ioc->running = IOC_RUNNING; |
| 1068 | ioc_start_period(ioc, now); |
| 1069 | } |
| 1070 | |
| 1071 | spin_unlock_irq(&ioc->lock); |
| 1072 | return true; |
| 1073 | |
| 1074 | fail_unlock: |
| 1075 | spin_unlock_irq(&ioc->lock); |
| 1076 | return false; |
| 1077 | } |
| 1078 | |
| 1079 | static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode, |
| 1080 | int flags, void *key) |
| 1081 | { |
| 1082 | struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait); |
| 1083 | struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key; |
| 1084 | u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse); |
| 1085 | |
| 1086 | ctx->vbudget -= cost; |
| 1087 | |
| 1088 | if (ctx->vbudget < 0) |
| 1089 | return -1; |
| 1090 | |
| 1091 | iocg_commit_bio(ctx->iocg, wait->bio, cost); |
| 1092 | |
| 1093 | /* |
| 1094 | * autoremove_wake_function() removes the wait entry only when it |
| 1095 | * actually changed the task state. We want the wait always |
| 1096 | * removed. Remove explicitly and use default_wake_function(). |
| 1097 | */ |
| 1098 | list_del_init(&wq_entry->entry); |
| 1099 | wait->committed = true; |
| 1100 | |
| 1101 | default_wake_function(wq_entry, mode, flags, key); |
| 1102 | return 0; |
| 1103 | } |
| 1104 | |
| 1105 | static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now) |
| 1106 | { |
| 1107 | struct ioc *ioc = iocg->ioc; |
| 1108 | struct iocg_wake_ctx ctx = { .iocg = iocg }; |
| 1109 | u64 margin_ns = (u64)(ioc->period_us * |
| 1110 | WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC; |
| 1111 | u64 vshortage, expires, oexpires; |
| 1112 | |
| 1113 | lockdep_assert_held(&iocg->waitq.lock); |
| 1114 | |
| 1115 | /* |
| 1116 | * Wake up the ones which are due and see how much vtime we'll need |
| 1117 | * for the next one. |
| 1118 | */ |
| 1119 | current_hweight(iocg, NULL, &ctx.hw_inuse); |
| 1120 | ctx.vbudget = now->vnow - atomic64_read(&iocg->vtime); |
| 1121 | __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx); |
| 1122 | if (!waitqueue_active(&iocg->waitq)) |
| 1123 | return; |
| 1124 | if (WARN_ON_ONCE(ctx.vbudget >= 0)) |
| 1125 | return; |
| 1126 | |
| 1127 | /* determine next wakeup, add a quarter margin to guarantee chunking */ |
| 1128 | vshortage = -ctx.vbudget; |
| 1129 | expires = now->now_ns + |
| 1130 | DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC; |
| 1131 | expires += margin_ns / 4; |
| 1132 | |
| 1133 | /* if already active and close enough, don't bother */ |
| 1134 | oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer)); |
| 1135 | if (hrtimer_is_queued(&iocg->waitq_timer) && |
| 1136 | abs(oexpires - expires) <= margin_ns / 4) |
| 1137 | return; |
| 1138 | |
| 1139 | hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires), |
| 1140 | margin_ns / 4, HRTIMER_MODE_ABS); |
| 1141 | } |
| 1142 | |
| 1143 | static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer) |
| 1144 | { |
| 1145 | struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer); |
| 1146 | struct ioc_now now; |
| 1147 | unsigned long flags; |
| 1148 | |
| 1149 | ioc_now(iocg->ioc, &now); |
| 1150 | |
| 1151 | spin_lock_irqsave(&iocg->waitq.lock, flags); |
| 1152 | iocg_kick_waitq(iocg, &now); |
| 1153 | spin_unlock_irqrestore(&iocg->waitq.lock, flags); |
| 1154 | |
| 1155 | return HRTIMER_NORESTART; |
| 1156 | } |
| 1157 | |
| 1158 | static void iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost) |
| 1159 | { |
| 1160 | struct ioc *ioc = iocg->ioc; |
| 1161 | struct blkcg_gq *blkg = iocg_to_blkg(iocg); |
| 1162 | u64 vtime = atomic64_read(&iocg->vtime); |
| 1163 | u64 vmargin = ioc->margin_us * now->vrate; |
| 1164 | u64 margin_ns = ioc->margin_us * NSEC_PER_USEC; |
| 1165 | u64 expires, oexpires; |
| 1166 | |
| 1167 | /* clear or maintain depending on the overage */ |
| 1168 | if (time_before_eq64(vtime, now->vnow)) { |
| 1169 | blkcg_clear_delay(blkg); |
| 1170 | return; |
| 1171 | } |
| 1172 | if (!atomic_read(&blkg->use_delay) && |
| 1173 | time_before_eq64(vtime, now->vnow + vmargin)) |
| 1174 | return; |
| 1175 | |
| 1176 | /* use delay */ |
| 1177 | if (cost) { |
| 1178 | u64 cost_ns = DIV64_U64_ROUND_UP(cost * NSEC_PER_USEC, |
| 1179 | now->vrate); |
| 1180 | blkcg_add_delay(blkg, now->now_ns, cost_ns); |
| 1181 | } |
| 1182 | blkcg_use_delay(blkg); |
| 1183 | |
| 1184 | expires = now->now_ns + DIV64_U64_ROUND_UP(vtime - now->vnow, |
| 1185 | now->vrate) * NSEC_PER_USEC; |
| 1186 | |
| 1187 | /* if already active and close enough, don't bother */ |
| 1188 | oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer)); |
| 1189 | if (hrtimer_is_queued(&iocg->delay_timer) && |
| 1190 | abs(oexpires - expires) <= margin_ns / 4) |
| 1191 | return; |
| 1192 | |
| 1193 | hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires), |
| 1194 | margin_ns / 4, HRTIMER_MODE_ABS); |
| 1195 | } |
| 1196 | |
| 1197 | static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer) |
| 1198 | { |
| 1199 | struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer); |
| 1200 | struct ioc_now now; |
| 1201 | |
| 1202 | ioc_now(iocg->ioc, &now); |
| 1203 | iocg_kick_delay(iocg, &now, 0); |
| 1204 | |
| 1205 | return HRTIMER_NORESTART; |
| 1206 | } |
| 1207 | |
| 1208 | static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p) |
| 1209 | { |
| 1210 | u32 nr_met[2] = { }; |
| 1211 | u32 nr_missed[2] = { }; |
| 1212 | u64 rq_wait_ns = 0; |
| 1213 | int cpu, rw; |
| 1214 | |
| 1215 | for_each_online_cpu(cpu) { |
| 1216 | struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu); |
| 1217 | u64 this_rq_wait_ns; |
| 1218 | |
| 1219 | for (rw = READ; rw <= WRITE; rw++) { |
| 1220 | u32 this_met = READ_ONCE(stat->missed[rw].nr_met); |
| 1221 | u32 this_missed = READ_ONCE(stat->missed[rw].nr_missed); |
| 1222 | |
| 1223 | nr_met[rw] += this_met - stat->missed[rw].last_met; |
| 1224 | nr_missed[rw] += this_missed - stat->missed[rw].last_missed; |
| 1225 | stat->missed[rw].last_met = this_met; |
| 1226 | stat->missed[rw].last_missed = this_missed; |
| 1227 | } |
| 1228 | |
| 1229 | this_rq_wait_ns = READ_ONCE(stat->rq_wait_ns); |
| 1230 | rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns; |
| 1231 | stat->last_rq_wait_ns = this_rq_wait_ns; |
| 1232 | } |
| 1233 | |
| 1234 | for (rw = READ; rw <= WRITE; rw++) { |
| 1235 | if (nr_met[rw] + nr_missed[rw]) |
| 1236 | missed_ppm_ar[rw] = |
| 1237 | DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION, |
| 1238 | nr_met[rw] + nr_missed[rw]); |
| 1239 | else |
| 1240 | missed_ppm_ar[rw] = 0; |
| 1241 | } |
| 1242 | |
| 1243 | *rq_wait_pct_p = div64_u64(rq_wait_ns * 100, |
| 1244 | ioc->period_us * NSEC_PER_USEC); |
| 1245 | } |
| 1246 | |
| 1247 | /* was iocg idle this period? */ |
| 1248 | static bool iocg_is_idle(struct ioc_gq *iocg) |
| 1249 | { |
| 1250 | struct ioc *ioc = iocg->ioc; |
| 1251 | |
| 1252 | /* did something get issued this period? */ |
| 1253 | if (atomic64_read(&iocg->active_period) == |
| 1254 | atomic64_read(&ioc->cur_period)) |
| 1255 | return false; |
| 1256 | |
| 1257 | /* is something in flight? */ |
| 1258 | if (atomic64_read(&iocg->done_vtime) < atomic64_read(&iocg->vtime)) |
| 1259 | return false; |
| 1260 | |
| 1261 | return true; |
| 1262 | } |
| 1263 | |
| 1264 | /* returns usage with margin added if surplus is large enough */ |
| 1265 | static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse) |
| 1266 | { |
| 1267 | /* add margin */ |
| 1268 | usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100); |
| 1269 | usage += SURPLUS_SCALE_ABS; |
| 1270 | |
| 1271 | /* don't bother if the surplus is too small */ |
| 1272 | if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse) |
| 1273 | return 0; |
| 1274 | |
| 1275 | return usage; |
| 1276 | } |
| 1277 | |
| 1278 | static void ioc_timer_fn(struct timer_list *timer) |
| 1279 | { |
| 1280 | struct ioc *ioc = container_of(timer, struct ioc, timer); |
| 1281 | struct ioc_gq *iocg, *tiocg; |
| 1282 | struct ioc_now now; |
| 1283 | int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0; |
| 1284 | u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; |
| 1285 | u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; |
| 1286 | u32 missed_ppm[2], rq_wait_pct; |
| 1287 | u64 period_vtime; |
| 1288 | int i; |
| 1289 | |
| 1290 | /* how were the latencies during the period? */ |
| 1291 | ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct); |
| 1292 | |
| 1293 | /* take care of active iocgs */ |
| 1294 | spin_lock_irq(&ioc->lock); |
| 1295 | |
| 1296 | ioc_now(ioc, &now); |
| 1297 | |
| 1298 | period_vtime = now.vnow - ioc->period_at_vtime; |
| 1299 | if (WARN_ON_ONCE(!period_vtime)) { |
| 1300 | spin_unlock_irq(&ioc->lock); |
| 1301 | return; |
| 1302 | } |
| 1303 | |
| 1304 | /* |
| 1305 | * Waiters determine the sleep durations based on the vrate they |
| 1306 | * saw at the time of sleep. If vrate has increased, some waiters |
| 1307 | * could be sleeping for too long. Wake up tardy waiters which |
| 1308 | * should have woken up in the last period and expire idle iocgs. |
| 1309 | */ |
| 1310 | list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { |
| 1311 | if (!waitqueue_active(&iocg->waitq) && !iocg_is_idle(iocg)) |
| 1312 | continue; |
| 1313 | |
| 1314 | spin_lock(&iocg->waitq.lock); |
| 1315 | |
| 1316 | if (waitqueue_active(&iocg->waitq)) { |
| 1317 | /* might be oversleeping vtime / hweight changes, kick */ |
| 1318 | iocg_kick_waitq(iocg, &now); |
| 1319 | iocg_kick_delay(iocg, &now, 0); |
| 1320 | } else if (iocg_is_idle(iocg)) { |
| 1321 | /* no waiter and idle, deactivate */ |
| 1322 | iocg->last_inuse = iocg->inuse; |
| 1323 | __propagate_active_weight(iocg, 0, 0); |
| 1324 | list_del_init(&iocg->active_list); |
| 1325 | } |
| 1326 | |
| 1327 | spin_unlock(&iocg->waitq.lock); |
| 1328 | } |
| 1329 | commit_active_weights(ioc); |
| 1330 | |
| 1331 | /* calc usages and see whether some weights need to be moved around */ |
| 1332 | list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { |
| 1333 | u64 vdone, vtime, vusage, vmargin, vmin; |
| 1334 | u32 hw_active, hw_inuse, usage; |
| 1335 | |
| 1336 | /* |
| 1337 | * Collect unused and wind vtime closer to vnow to prevent |
| 1338 | * iocgs from accumulating a large amount of budget. |
| 1339 | */ |
| 1340 | vdone = atomic64_read(&iocg->done_vtime); |
| 1341 | vtime = atomic64_read(&iocg->vtime); |
| 1342 | current_hweight(iocg, &hw_active, &hw_inuse); |
| 1343 | |
| 1344 | /* |
| 1345 | * Latency QoS detection doesn't account for IOs which are |
| 1346 | * in-flight for longer than a period. Detect them by |
| 1347 | * comparing vdone against period start. If lagging behind |
| 1348 | * IOs from past periods, don't increase vrate. |
| 1349 | */ |
| 1350 | if (!atomic_read(&iocg_to_blkg(iocg)->use_delay) && |
| 1351 | time_after64(vtime, vdone) && |
| 1352 | time_after64(vtime, now.vnow - |
| 1353 | MAX_LAGGING_PERIODS * period_vtime) && |
| 1354 | time_before64(vdone, now.vnow - period_vtime)) |
| 1355 | nr_lagging++; |
| 1356 | |
| 1357 | if (waitqueue_active(&iocg->waitq)) |
| 1358 | vusage = now.vnow - iocg->last_vtime; |
| 1359 | else if (time_before64(iocg->last_vtime, vtime)) |
| 1360 | vusage = vtime - iocg->last_vtime; |
| 1361 | else |
| 1362 | vusage = 0; |
| 1363 | |
| 1364 | iocg->last_vtime += vusage; |
| 1365 | /* |
| 1366 | * Factor in in-flight vtime into vusage to avoid |
| 1367 | * high-latency completions appearing as idle. This should |
| 1368 | * be done after the above ->last_time adjustment. |
| 1369 | */ |
| 1370 | vusage = max(vusage, vtime - vdone); |
| 1371 | |
| 1372 | /* calculate hweight based usage ratio and record */ |
| 1373 | if (vusage) { |
| 1374 | usage = DIV64_U64_ROUND_UP(vusage * hw_inuse, |
| 1375 | period_vtime); |
| 1376 | iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS; |
| 1377 | iocg->usages[iocg->usage_idx] = usage; |
| 1378 | } else { |
| 1379 | usage = 0; |
| 1380 | } |
| 1381 | |
| 1382 | /* see whether there's surplus vtime */ |
| 1383 | vmargin = ioc->margin_us * now.vrate; |
| 1384 | vmin = now.vnow - vmargin; |
| 1385 | |
| 1386 | iocg->has_surplus = false; |
| 1387 | |
| 1388 | if (!waitqueue_active(&iocg->waitq) && |
| 1389 | time_before64(vtime, vmin)) { |
| 1390 | u64 delta = vmin - vtime; |
| 1391 | |
| 1392 | /* throw away surplus vtime */ |
| 1393 | atomic64_add(delta, &iocg->vtime); |
| 1394 | atomic64_add(delta, &iocg->done_vtime); |
| 1395 | iocg->last_vtime += delta; |
| 1396 | /* if usage is sufficiently low, maybe it can donate */ |
| 1397 | if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) { |
| 1398 | iocg->has_surplus = true; |
| 1399 | nr_surpluses++; |
| 1400 | } |
| 1401 | } else if (hw_inuse < hw_active) { |
| 1402 | u32 new_hwi, new_inuse; |
| 1403 | |
| 1404 | /* was donating but might need to take back some */ |
| 1405 | if (waitqueue_active(&iocg->waitq)) { |
| 1406 | new_hwi = hw_active; |
| 1407 | } else { |
| 1408 | new_hwi = max(hw_inuse, |
| 1409 | usage * SURPLUS_SCALE_PCT / 100 + |
| 1410 | SURPLUS_SCALE_ABS); |
| 1411 | } |
| 1412 | |
| 1413 | new_inuse = div64_u64((u64)iocg->inuse * new_hwi, |
| 1414 | hw_inuse); |
| 1415 | new_inuse = clamp_t(u32, new_inuse, 1, iocg->active); |
| 1416 | |
| 1417 | if (new_inuse > iocg->inuse) { |
| 1418 | TRACE_IOCG_PATH(inuse_takeback, iocg, &now, |
| 1419 | iocg->inuse, new_inuse, |
| 1420 | hw_inuse, new_hwi); |
| 1421 | __propagate_active_weight(iocg, iocg->weight, |
| 1422 | new_inuse); |
| 1423 | } |
| 1424 | } else { |
| 1425 | /* genuninely out of vtime */ |
| 1426 | nr_shortages++; |
| 1427 | } |
| 1428 | } |
| 1429 | |
| 1430 | if (!nr_shortages || !nr_surpluses) |
| 1431 | goto skip_surplus_transfers; |
| 1432 | |
| 1433 | /* there are both shortages and surpluses, transfer surpluses */ |
| 1434 | list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { |
| 1435 | u32 usage, hw_active, hw_inuse, new_hwi, new_inuse; |
| 1436 | int nr_valid = 0; |
| 1437 | |
| 1438 | if (!iocg->has_surplus) |
| 1439 | continue; |
| 1440 | |
| 1441 | /* base the decision on max historical usage */ |
| 1442 | for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) { |
| 1443 | if (iocg->usages[i]) { |
| 1444 | usage = max(usage, iocg->usages[i]); |
| 1445 | nr_valid++; |
| 1446 | } |
| 1447 | } |
| 1448 | if (nr_valid < MIN_VALID_USAGES) |
| 1449 | continue; |
| 1450 | |
| 1451 | current_hweight(iocg, &hw_active, &hw_inuse); |
| 1452 | new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse); |
| 1453 | if (!new_hwi) |
| 1454 | continue; |
| 1455 | |
| 1456 | new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi, |
| 1457 | hw_inuse); |
| 1458 | if (new_inuse < iocg->inuse) { |
| 1459 | TRACE_IOCG_PATH(inuse_giveaway, iocg, &now, |
| 1460 | iocg->inuse, new_inuse, |
| 1461 | hw_inuse, new_hwi); |
| 1462 | __propagate_active_weight(iocg, iocg->weight, new_inuse); |
| 1463 | } |
| 1464 | } |
| 1465 | skip_surplus_transfers: |
| 1466 | commit_active_weights(ioc); |
| 1467 | |
| 1468 | /* |
| 1469 | * If q is getting clogged or we're missing too much, we're issuing |
| 1470 | * too much IO and should lower vtime rate. If we're not missing |
| 1471 | * and experiencing shortages but not surpluses, we're too stingy |
| 1472 | * and should increase vtime rate. |
| 1473 | */ |
| 1474 | if (rq_wait_pct > RQ_WAIT_BUSY_PCT || |
| 1475 | missed_ppm[READ] > ppm_rthr || |
| 1476 | missed_ppm[WRITE] > ppm_wthr) { |
| 1477 | ioc->busy_level = max(ioc->busy_level, 0); |
| 1478 | ioc->busy_level++; |
| 1479 | } else if (nr_lagging) { |
| 1480 | ioc->busy_level = max(ioc->busy_level, 0); |
| 1481 | } else if (nr_shortages && !nr_surpluses && |
| 1482 | rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 && |
| 1483 | missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 && |
| 1484 | missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) { |
| 1485 | ioc->busy_level = min(ioc->busy_level, 0); |
| 1486 | ioc->busy_level--; |
| 1487 | } else { |
| 1488 | ioc->busy_level = 0; |
| 1489 | } |
| 1490 | |
| 1491 | ioc->busy_level = clamp(ioc->busy_level, -1000, 1000); |
| 1492 | |
| 1493 | if (ioc->busy_level) { |
| 1494 | u64 vrate = atomic64_read(&ioc->vtime_rate); |
| 1495 | u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; |
| 1496 | |
| 1497 | /* rq_wait signal is always reliable, ignore user vrate_min */ |
| 1498 | if (rq_wait_pct > RQ_WAIT_BUSY_PCT) |
| 1499 | vrate_min = VRATE_MIN; |
| 1500 | |
| 1501 | /* |
| 1502 | * If vrate is out of bounds, apply clamp gradually as the |
| 1503 | * bounds can change abruptly. Otherwise, apply busy_level |
| 1504 | * based adjustment. |
| 1505 | */ |
| 1506 | if (vrate < vrate_min) { |
| 1507 | vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), |
| 1508 | 100); |
| 1509 | vrate = min(vrate, vrate_min); |
| 1510 | } else if (vrate > vrate_max) { |
| 1511 | vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), |
| 1512 | 100); |
| 1513 | vrate = max(vrate, vrate_max); |
| 1514 | } else { |
| 1515 | int idx = min_t(int, abs(ioc->busy_level), |
| 1516 | ARRAY_SIZE(vrate_adj_pct) - 1); |
| 1517 | u32 adj_pct = vrate_adj_pct[idx]; |
| 1518 | |
| 1519 | if (ioc->busy_level > 0) |
| 1520 | adj_pct = 100 - adj_pct; |
| 1521 | else |
| 1522 | adj_pct = 100 + adj_pct; |
| 1523 | |
| 1524 | vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100), |
| 1525 | vrate_min, vrate_max); |
| 1526 | } |
| 1527 | |
| 1528 | trace_iocost_ioc_vrate_adj(ioc, vrate, &missed_ppm, rq_wait_pct, |
| 1529 | nr_lagging, nr_shortages, |
| 1530 | nr_surpluses); |
| 1531 | |
| 1532 | atomic64_set(&ioc->vtime_rate, vrate); |
| 1533 | ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP( |
| 1534 | ioc->period_us * vrate * INUSE_MARGIN_PCT, 100); |
| 1535 | } |
| 1536 | |
| 1537 | ioc_refresh_params(ioc, false); |
| 1538 | |
| 1539 | /* |
| 1540 | * This period is done. Move onto the next one. If nothing's |
| 1541 | * going on with the device, stop the timer. |
| 1542 | */ |
| 1543 | atomic64_inc(&ioc->cur_period); |
| 1544 | |
| 1545 | if (ioc->running != IOC_STOP) { |
| 1546 | if (!list_empty(&ioc->active_iocgs)) { |
| 1547 | ioc_start_period(ioc, &now); |
| 1548 | } else { |
| 1549 | ioc->busy_level = 0; |
| 1550 | ioc->running = IOC_IDLE; |
| 1551 | } |
| 1552 | } |
| 1553 | |
| 1554 | spin_unlock_irq(&ioc->lock); |
| 1555 | } |
| 1556 | |
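|      | /* |
|      |  * Linear cost model: pick a seqio or randio base cost depending on how |
|      |  * far @bio starts from the previous IO's end sector (iocg->cursor) and |
|      |  * add a per-page cost proportional to the bio size.  Merges skip the |
|      |  * base cost as they don't issue a new device command.  The result is |
|      |  * returned in absolute vtime units via @costp. |
|      |  */ |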
| 1557 | static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg, |
| 1558 | bool is_merge, u64 *costp) |
| 1559 | { |
| 1560 | struct ioc *ioc = iocg->ioc; |
| 1561 | u64 coef_seqio, coef_randio, coef_page; |
| 1562 | u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1); |
| 1563 | u64 seek_pages = 0; |
| 1564 | u64 cost = 0; |
| 1565 | |
| 1566 | switch (bio_op(bio)) { |
| 1567 | case REQ_OP_READ: |
| 1568 | coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO]; |
| 1569 | coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO]; |
| 1570 | coef_page = ioc->params.lcoefs[LCOEF_RPAGE]; |
| 1571 | break; |
| 1572 | case REQ_OP_WRITE: |
| 1573 | coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO]; |
| 1574 | coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO]; |
| 1575 | coef_page = ioc->params.lcoefs[LCOEF_WPAGE]; |
| 1576 | break; |
| 1577 | default: |
| 1578 | goto out; |
| 1579 | } |
| 1580 | |
| 1581 | if (iocg->cursor) { |
| 1582 | seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor); |
| 1583 | seek_pages >>= IOC_SECT_TO_PAGE_SHIFT; |
| 1584 | } |
| 1585 | |
| 1586 | if (!is_merge) { |
| 1587 | if (seek_pages > LCOEF_RANDIO_PAGES) { |
| 1588 | cost += coef_randio; |
| 1589 | } else { |
| 1590 | cost += coef_seqio; |
| 1591 | } |
| 1592 | } |
| 1593 | cost += pages * coef_page; |
| 1594 | out: |
| 1595 | *costp = cost; |
| 1596 | } |
| 1597 | |
| 1598 | static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge) |
| 1599 | { |
| 1600 | u64 cost; |
| 1601 | |
| 1602 | calc_vtime_cost_builtin(bio, iocg, is_merge, &cost); |
| 1603 | return cost; |
| 1604 | } |
| 1605 | |
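|      | /* |
|      |  * Main issue path.  Compute the absolute cost of @bio, scale it by the |
|      |  * cgroup's current hweight_inuse, and let the bio through right away if |
|      |  * the cgroup's vtime is within budget.  Otherwise, append to |
|      |  * iocg->waitq and sleep until the waker commits the bio. |
|      |  */ |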
| 1606 | static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) |
| 1607 | { |
| 1608 | struct blkcg_gq *blkg = bio->bi_blkg; |
| 1609 | struct ioc *ioc = rqos_to_ioc(rqos); |
| 1610 | struct ioc_gq *iocg = blkg_to_iocg(blkg); |
| 1611 | struct ioc_now now; |
| 1612 | struct iocg_wait wait; |
| 1613 | u32 hw_active, hw_inuse; |
| 1614 | u64 abs_cost, cost, vtime; |
| 1615 | |
| 1616 | /* bypass IOs if disabled or for root cgroup */ |
| 1617 | if (!ioc->enabled || !iocg->level) |
| 1618 | return; |
| 1619 | |
| 1620 | /* always activate so that even 0 cost IOs get protected to some level */ |
| 1621 | if (!iocg_activate(iocg, &now)) |
| 1622 | return; |
| 1623 | |
| 1624 | /* calculate the absolute vtime cost */ |
| 1625 | abs_cost = calc_vtime_cost(bio, iocg, false); |
| 1626 | if (!abs_cost) |
| 1627 | return; |
| 1628 | |
| 1629 | iocg->cursor = bio_end_sector(bio); |
| 1630 | |
| 1631 | vtime = atomic64_read(&iocg->vtime); |
| 1632 | current_hweight(iocg, &hw_active, &hw_inuse); |
| 1633 | |
| 1634 | if (hw_inuse < hw_active && |
| 1635 | time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) { |
| 1636 | TRACE_IOCG_PATH(inuse_reset, iocg, &now, |
| 1637 | iocg->inuse, iocg->weight, hw_inuse, hw_active); |
| 1638 | spin_lock_irq(&ioc->lock); |
| 1639 | propagate_active_weight(iocg, iocg->weight, iocg->weight); |
| 1640 | spin_unlock_irq(&ioc->lock); |
| 1641 | current_hweight(iocg, &hw_active, &hw_inuse); |
| 1642 | } |
| 1643 | |
| 1644 | cost = abs_cost_to_cost(abs_cost, hw_inuse); |
| 1645 | |
| 1646 | /* |
| 1647 | * If no one's waiting and within budget, issue right away. The |
| 1648 | * tests are racy but the races aren't systemic - we only miss once |
| 1649 | * in a while which is fine. |
| 1650 | */ |
| 1651 | if (!waitqueue_active(&iocg->waitq) && |
| 1652 | time_before_eq64(vtime + cost, now.vnow)) { |
| 1653 | iocg_commit_bio(iocg, bio, cost); |
| 1654 | return; |
| 1655 | } |
| 1656 | |
| 1657 | if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) { |
| 1658 | iocg_commit_bio(iocg, bio, cost); |
| 1659 | iocg_kick_delay(iocg, &now, cost); |
| 1660 | return; |
| 1661 | } |
| 1662 | |
| 1663 | /* |
| 1664 | * Append self to the waitq and schedule the wakeup timer if we're |
| 1665 | * the first waiter. The timer duration is calculated based on the |
| 1666 | * current vrate. vtime and hweight changes can make it too short |
| 1667 | * or too long. Each wait entry records the absolute cost it's |
| 1668 | * waiting for to allow re-evaluation using a custom wait entry. |
| 1669 | * |
| 1670 | * If too short, the timer simply reschedules itself. If too long, |
| 1671 | * the period timer will notice and trigger wakeups. |
| 1672 | * |
| 1673 | * All waiters are on iocg->waitq and the wait states are |
| 1674 | * synchronized using waitq.lock. |
| 1675 | */ |
| 1676 | spin_lock_irq(&iocg->waitq.lock); |
| 1677 | |
| 1678 | /* |
| 1679 | * We activated above but w/o any synchronization. Deactivation is |
| 1680 | * synchronized with waitq.lock and we won't get deactivated as |
| 1681 | * long as we're waiting, so we're good if we're activated here. |
| 1682 | * In the unlikely case that we are deactivated, just issue the IO. |
| 1683 | */ |
| 1684 | if (unlikely(list_empty(&iocg->active_list))) { |
| 1685 | spin_unlock_irq(&iocg->waitq.lock); |
| 1686 | iocg_commit_bio(iocg, bio, cost); |
| 1687 | return; |
| 1688 | } |
| 1689 | |
| 1690 | init_waitqueue_func_entry(&wait.wait, iocg_wake_fn); |
| 1691 | wait.wait.private = current; |
| 1692 | wait.bio = bio; |
| 1693 | wait.abs_cost = abs_cost; |
| 1694 | wait.committed = false; /* will be set true by waker */ |
| 1695 | |
| 1696 | __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait); |
| 1697 | iocg_kick_waitq(iocg, &now); |
| 1698 | |
| 1699 | spin_unlock_irq(&iocg->waitq.lock); |
| 1700 | |
| 1701 | while (true) { |
| 1702 | set_current_state(TASK_UNINTERRUPTIBLE); |
| 1703 | if (wait.committed) |
| 1704 | break; |
| 1705 | io_schedule(); |
| 1706 | } |
| 1707 | |
| 1708 | /* waker already committed us, proceed */ |
| 1709 | finish_wait(&iocg->waitq, &wait.wait); |
| 1710 | } |
| 1711 | |
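|      | /* |
|      |  * Bios merged into an existing request don't go through |
|      |  * ioc_rqos_throttle(), so charge their cost directly to the cgroup's |
|      |  * vtime here. |
|      |  */ |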
| 1712 | static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, |
| 1713 | struct bio *bio) |
| 1714 | { |
| 1715 | struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg); |
| 1716 | sector_t bio_end = bio_end_sector(bio); |
| 1717 | u32 hw_inuse; |
| 1718 | u64 abs_cost, cost; |
| 1719 | |
| 1720 | /* add iff the existing request has cost assigned */ |
| 1721 | if (!rq->bio || !rq->bio->bi_iocost_cost) |
| 1722 | return; |
| 1723 | |
| 1724 | abs_cost = calc_vtime_cost(bio, iocg, true); |
| 1725 | if (!abs_cost) |
| 1726 | return; |
| 1727 | |
| 1728 | /* update cursor if backmerging into the request at the cursor */ |
| 1729 | if (blk_rq_pos(rq) < bio_end && |
| 1730 | blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor) |
| 1731 | iocg->cursor = bio_end; |
| 1732 | |
| 1733 | current_hweight(iocg, NULL, &hw_inuse); |
| 1734 | cost = div64_u64(abs_cost * HWEIGHT_WHOLE, hw_inuse); |
| 1735 | bio->bi_iocost_cost = cost; |
| 1736 | |
| 1737 | atomic64_add(cost, &iocg->vtime); |
| 1738 | } |
| 1739 | |
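|      | /* |
|      |  * Bio completion: transfer the charged cost to done_vtime so the period |
|      |  * timer can tell how much of the issued vtime is still in flight. |
|      |  */ |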
| 1740 | static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio) |
| 1741 | { |
| 1742 | struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg); |
| 1743 | |
| 1744 | if (iocg && bio->bi_iocost_cost) |
| 1745 | atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime); |
| 1746 | } |
| 1747 | |
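|      | /* |
|      |  * Request completion: compare total alloc-to-completion time against |
|      |  * the per-op QoS latency target to count met vs missed requests, and |
|      |  * accumulate allocation wait time.  Both feed the vrate adjustment in |
|      |  * ioc_timer_fn(). |
|      |  */ |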
| 1748 | static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq) |
| 1749 | { |
| 1750 | struct ioc *ioc = rqos_to_ioc(rqos); |
| 1751 | u64 on_q_ns, rq_wait_ns; |
| 1752 | int pidx, rw; |
| 1753 | |
| 1754 | if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns) |
| 1755 | return; |
| 1756 | |
| 1757 | switch (req_op(rq) & REQ_OP_MASK) { |
| 1758 | case REQ_OP_READ: |
| 1759 | pidx = QOS_RLAT; |
| 1760 | rw = READ; |
| 1761 | break; |
| 1762 | case REQ_OP_WRITE: |
| 1763 | pidx = QOS_WLAT; |
| 1764 | rw = WRITE; |
| 1765 | break; |
| 1766 | default: |
| 1767 | return; |
| 1768 | } |
| 1769 | |
| 1770 | on_q_ns = ktime_get_ns() - rq->alloc_time_ns; |
| 1771 | rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns; |
| 1772 | |
| 1773 | if (on_q_ns <= ioc->params.qos[pidx] * NSEC_PER_USEC) |
| 1774 | this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met); |
| 1775 | else |
| 1776 | this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed); |
| 1777 | |
| 1778 | this_cpu_add(ioc->pcpu_stat->rq_wait_ns, rq_wait_ns); |
| 1779 | } |
| 1780 | |
| 1781 | static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos) |
| 1782 | { |
| 1783 | struct ioc *ioc = rqos_to_ioc(rqos); |
| 1784 | |
| 1785 | spin_lock_irq(&ioc->lock); |
| 1786 | ioc_refresh_params(ioc, false); |
| 1787 | spin_unlock_irq(&ioc->lock); |
| 1788 | } |
| 1789 | |
| 1790 | static void ioc_rqos_exit(struct rq_qos *rqos) |
| 1791 | { |
| 1792 | struct ioc *ioc = rqos_to_ioc(rqos); |
| 1793 | |
| 1794 | blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost); |
| 1795 | |
| 1796 | spin_lock_irq(&ioc->lock); |
| 1797 | ioc->running = IOC_STOP; |
| 1798 | spin_unlock_irq(&ioc->lock); |
| 1799 | |
| 1800 | del_timer_sync(&ioc->timer); |
| 1801 | free_percpu(ioc->pcpu_stat); |
| 1802 | kfree(ioc); |
| 1803 | } |
| 1804 | |
| 1805 | static struct rq_qos_ops ioc_rqos_ops = { |
| 1806 | .throttle = ioc_rqos_throttle, |
| 1807 | .merge = ioc_rqos_merge, |
| 1808 | .done_bio = ioc_rqos_done_bio, |
| 1809 | .done = ioc_rqos_done, |
| 1810 | .queue_depth_changed = ioc_rqos_queue_depth_changed, |
| 1811 | .exit = ioc_rqos_exit, |
| 1812 | }; |
| 1813 | |
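|      | /* |
|      |  * Set up the per-queue iocost state: percpu stats, rq_qos hooks, the |
|      |  * period timer and default parameters, then activate the iocost blkcg |
|      |  * policy on @q. |
|      |  */ |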
| 1814 | static int blk_iocost_init(struct request_queue *q) |
| 1815 | { |
| 1816 | struct ioc *ioc; |
| 1817 | struct rq_qos *rqos; |
| 1818 | int ret; |
| 1819 | |
| 1820 | ioc = kzalloc(sizeof(*ioc), GFP_KERNEL); |
| 1821 | if (!ioc) |
| 1822 | return -ENOMEM; |
| 1823 | |
| 1824 | ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat); |
| 1825 | if (!ioc->pcpu_stat) { |
| 1826 | kfree(ioc); |
| 1827 | return -ENOMEM; |
| 1828 | } |
| 1829 | |
| 1830 | rqos = &ioc->rqos; |
| 1831 | rqos->id = RQ_QOS_COST; |
| 1832 | rqos->ops = &ioc_rqos_ops; |
| 1833 | rqos->q = q; |
| 1834 | |
| 1835 | spin_lock_init(&ioc->lock); |
| 1836 | timer_setup(&ioc->timer, ioc_timer_fn, 0); |
| 1837 | INIT_LIST_HEAD(&ioc->active_iocgs); |
| 1838 | |
| 1839 | ioc->running = IOC_IDLE; |
| 1840 | atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); |
| 1841 | seqcount_init(&ioc->period_seqcount); |
| 1842 | ioc->period_at = ktime_to_us(ktime_get()); |
| 1843 | atomic64_set(&ioc->cur_period, 0); |
| 1844 | atomic_set(&ioc->hweight_gen, 0); |
| 1845 | |
| 1846 | spin_lock_irq(&ioc->lock); |
| 1847 | ioc->autop_idx = AUTOP_INVALID; |
| 1848 | ioc_refresh_params(ioc, true); |
| 1849 | spin_unlock_irq(&ioc->lock); |
| 1850 | |
| 1851 | rq_qos_add(q, rqos); |
| 1852 | ret = blkcg_activate_policy(q, &blkcg_policy_iocost); |
| 1853 | if (ret) { |
| 1854 | rq_qos_del(q, rqos); |
|      | free_percpu(ioc->pcpu_stat); |
| 1855 | kfree(ioc); |
| 1856 | return ret; |
| 1857 | } |
| 1858 | return 0; |
| 1859 | } |
| 1860 | |
| 1861 | static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp) |
| 1862 | { |
| 1863 | struct ioc_cgrp *iocc; |
| 1864 | |
| 1865 | iocc = kzalloc(sizeof(struct ioc_cgrp), gfp); |
|      | if (!iocc) |
|      | return NULL; |
|      | |
| 1866 | iocc->dfl_weight = CGROUP_WEIGHT_DFL; |
| 1867 | |
| 1868 | return &iocc->cpd; |
| 1869 | } |
| 1870 | |
| 1871 | static void ioc_cpd_free(struct blkcg_policy_data *cpd) |
| 1872 | { |
| 1873 | kfree(container_of(cpd, struct ioc_cgrp, cpd)); |
| 1874 | } |
| 1875 | |
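|      | /* allocate the pd with a trailing ancestors[] sized for the cgroup's depth */ |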
| 1876 | static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q, |
| 1877 | struct blkcg *blkcg) |
| 1878 | { |
| 1879 | int levels = blkcg->css.cgroup->level + 1; |
| 1880 | struct ioc_gq *iocg; |
| 1881 | |
| 1882 | iocg = kzalloc_node(sizeof(*iocg) + levels * sizeof(iocg->ancestors[0]), |
| 1883 | gfp, q->node); |
| 1884 | if (!iocg) |
| 1885 | return NULL; |
| 1886 | |
| 1887 | return &iocg->pd; |
| 1888 | } |
| 1889 | |
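|      | /* |
|      |  * Per-device cgroup init: start vtime at the current device vtime, set |
|      |  * up the waitq and delay timers, and record the level and ancestor |
|      |  * table that current_hweight() walks. |
|      |  */ |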
| 1890 | static void ioc_pd_init(struct blkg_policy_data *pd) |
| 1891 | { |
| 1892 | struct ioc_gq *iocg = pd_to_iocg(pd); |
| 1893 | struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd); |
| 1894 | struct ioc *ioc = q_to_ioc(blkg->q); |
| 1895 | struct ioc_now now; |
| 1896 | struct blkcg_gq *tblkg; |
| 1897 | unsigned long flags; |
| 1898 | |
| 1899 | ioc_now(ioc, &now); |
| 1900 | |
| 1901 | iocg->ioc = ioc; |
| 1902 | atomic64_set(&iocg->vtime, now.vnow); |
| 1903 | atomic64_set(&iocg->done_vtime, now.vnow); |
| 1904 | atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period)); |
| 1905 | INIT_LIST_HEAD(&iocg->active_list); |
| 1906 | iocg->hweight_active = HWEIGHT_WHOLE; |
| 1907 | iocg->hweight_inuse = HWEIGHT_WHOLE; |
| 1908 | |
| 1909 | init_waitqueue_head(&iocg->waitq); |
| 1910 | hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); |
| 1911 | iocg->waitq_timer.function = iocg_waitq_timer_fn; |
| 1912 | hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); |
| 1913 | iocg->delay_timer.function = iocg_delay_timer_fn; |
| 1914 | |
| 1915 | iocg->level = blkg->blkcg->css.cgroup->level; |
| 1916 | |
| 1917 | for (tblkg = blkg; tblkg; tblkg = tblkg->parent) { |
| 1918 | struct ioc_gq *tiocg = blkg_to_iocg(tblkg); |
| 1919 | iocg->ancestors[tiocg->level] = tiocg; |
| 1920 | } |
| 1921 | |
| 1922 | spin_lock_irqsave(&ioc->lock, flags); |
| 1923 | weight_updated(iocg); |
| 1924 | spin_unlock_irqrestore(&ioc->lock, flags); |
| 1925 | } |
| 1926 | |
| 1927 | static void ioc_pd_free(struct blkg_policy_data *pd) |
| 1928 | { |
| 1929 | struct ioc_gq *iocg = pd_to_iocg(pd); |
| 1930 | struct ioc *ioc = iocg->ioc; |
|      | unsigned long flags; |
| 1931 | |
| 1932 | if (ioc) { |
| 1933 | hrtimer_cancel(&iocg->waitq_timer); |
| 1934 | hrtimer_cancel(&iocg->delay_timer); |
| 1935 | |
| 1936 | spin_lock_irqsave(&ioc->lock, flags); |
| 1937 | if (!list_empty(&iocg->active_list)) { |
| 1938 | propagate_active_weight(iocg, 0, 0); |
| 1939 | list_del_init(&iocg->active_list); |
| 1940 | } |
| 1941 | spin_unlock_irqrestore(&ioc->lock, flags); |
| 1942 | } |
| 1943 | kfree(iocg); |
| 1944 | } |
| 1945 | |
| 1946 | static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd, |
| 1947 | int off) |
| 1948 | { |
| 1949 | const char *dname = blkg_dev_name(pd->blkg); |
| 1950 | struct ioc_gq *iocg = pd_to_iocg(pd); |
| 1951 | |
| 1952 | if (dname && iocg->cfg_weight) |
| 1953 | seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight); |
| 1954 | return 0; |
| 1955 | } |
| 1956 | |
| 1958 | static int ioc_weight_show(struct seq_file *sf, void *v) |
| 1959 | { |
| 1960 | struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); |
| 1961 | struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg); |
| 1962 | |
| 1963 | seq_printf(sf, "default %u\n", iocc->dfl_weight); |
| 1964 | blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill, |
| 1965 | &blkcg_policy_iocost, seq_cft(sf)->private, false); |
| 1966 | return 0; |
| 1967 | } |
| 1968 | |
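|      | /* |
|      |  * io.weight accepts either "default $WEIGHT" (or a bare weight) to set |
|      |  * the cgroup-wide default, or "$MAJ:$MIN $WEIGHT" for a per-device |
|      |  * weight; "$MAJ:$MIN default" clears the per-device override. |
|      |  */ |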
| 1969 | static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, |
| 1970 | size_t nbytes, loff_t off) |
| 1971 | { |
| 1972 | struct blkcg *blkcg = css_to_blkcg(of_css(of)); |
| 1973 | struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg); |
| 1974 | struct blkg_conf_ctx ctx; |
| 1975 | struct ioc_gq *iocg; |
| 1976 | u32 v; |
| 1977 | int ret; |
| 1978 | |
| 1979 | if (!strchr(buf, ':')) { |
| 1980 | struct blkcg_gq *blkg; |
| 1981 | |
| 1982 | if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v)) |
| 1983 | return -EINVAL; |
| 1984 | |
| 1985 | if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX) |
| 1986 | return -EINVAL; |
| 1987 | |
|      | /* disable IRQs on the outer lock; nesting the _irq variants would re-enable them early */ |
| 1988 | spin_lock_irq(&blkcg->lock); |
| 1989 | iocc->dfl_weight = v; |
| 1990 | hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { |
| 1991 | struct ioc_gq *iocg = blkg_to_iocg(blkg); |
| 1992 | |
| 1993 | if (iocg) { |
| 1994 | spin_lock(&iocg->ioc->lock); |
| 1995 | weight_updated(iocg); |
| 1996 | spin_unlock(&iocg->ioc->lock); |
| 1997 | } |
| 1998 | } |
| 1999 | spin_unlock_irq(&blkcg->lock); |
| 2000 | |
| 2001 | return nbytes; |
| 2002 | } |
| 2003 | |
| 2004 | ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx); |
| 2005 | if (ret) |
| 2006 | return ret; |
| 2007 | |
| 2008 | iocg = blkg_to_iocg(ctx.blkg); |
| 2009 | |
| 2010 | if (!strncmp(ctx.body, "default", 7)) { |
| 2011 | v = 0; |
| 2012 | } else { |
| 2013 | if (!sscanf(ctx.body, "%u", &v)) |
| 2014 | goto einval; |
| 2015 | if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX) |
| 2016 | goto einval; |
| 2017 | } |
| 2018 | |
| 2019 | spin_lock_irq(&iocg->ioc->lock); |
| 2020 | iocg->cfg_weight = v; |
| 2021 | weight_updated(iocg); |
| 2022 | spin_unlock_irq(&iocg->ioc->lock); |
| 2023 | |
| 2024 | blkg_conf_finish(&ctx); |
| 2025 | return nbytes; |
| 2026 | |
| 2027 | einval: |
| 2028 | blkg_conf_finish(&ctx); |
| 2029 | return -EINVAL; |
| 2030 | } |
| 2031 | |
| 2032 | static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd, |
| 2033 | int off) |
| 2034 | { |
| 2035 | const char *dname = blkg_dev_name(pd->blkg); |
| 2036 | struct ioc *ioc = pd_to_iocg(pd)->ioc; |
| 2037 | |
| 2038 | if (!dname) |
| 2039 | return 0; |
| 2040 | |
| 2041 | seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n", |
| 2042 | dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto", |
| 2043 | ioc->params.qos[QOS_RPPM] / 10000, |
| 2044 | ioc->params.qos[QOS_RPPM] % 10000 / 100, |
| 2045 | ioc->params.qos[QOS_RLAT], |
| 2046 | ioc->params.qos[QOS_WPPM] / 10000, |
| 2047 | ioc->params.qos[QOS_WPPM] % 10000 / 100, |
| 2048 | ioc->params.qos[QOS_WLAT], |
| 2049 | ioc->params.qos[QOS_MIN] / 10000, |
| 2050 | ioc->params.qos[QOS_MIN] % 10000 / 100, |
| 2051 | ioc->params.qos[QOS_MAX] / 10000, |
| 2052 | ioc->params.qos[QOS_MAX] % 10000 / 100); |
| 2053 | return 0; |
| 2054 | } |
| 2055 | |
| 2056 | static int ioc_qos_show(struct seq_file *sf, void *v) |
| 2057 | { |
| 2058 | struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); |
| 2059 | |
| 2060 | blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill, |
| 2061 | &blkcg_policy_iocost, seq_cft(sf)->private, false); |
| 2062 | return 0; |
| 2063 | } |
| 2064 | |
| 2065 | static const match_table_t qos_ctrl_tokens = { |
| 2066 | { QOS_ENABLE, "enable=%u" }, |
| 2067 | { QOS_CTRL, "ctrl=%s" }, |
| 2068 | { NR_QOS_CTRL_PARAMS, NULL }, |
| 2069 | }; |
| 2070 | |
| 2071 | static const match_table_t qos_tokens = { |
| 2072 | { QOS_RPPM, "rpct=%s" }, |
| 2073 | { QOS_RLAT, "rlat=%u" }, |
| 2074 | { QOS_WPPM, "wpct=%s" }, |
| 2075 | { QOS_WLAT, "wlat=%u" }, |
| 2076 | { QOS_MIN, "min=%s" }, |
| 2077 | { QOS_MAX, "max=%s" }, |
| 2078 | { NR_QOS_PARAMS, NULL }, |
| 2079 | }; |
| 2080 | |
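|      | /* |
|      |  * io.cost.qos takes "$MAJ:$MIN" followed by key=value pairs: enable=, |
|      |  * ctrl=, rpct=/rlat=/wpct=/wlat= and min=/max=.  Writing any QoS |
|      |  * parameter switches the device to user-configured parameters until |
|      |  * "ctrl=auto" is written. |
|      |  */ |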
| 2081 | static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, |
| 2082 | size_t nbytes, loff_t off) |
| 2083 | { |
| 2084 | struct gendisk *disk; |
| 2085 | struct ioc *ioc; |
| 2086 | u32 qos[NR_QOS_PARAMS]; |
| 2087 | bool enable, user; |
| 2088 | char *p; |
| 2089 | int ret; |
| 2090 | |
| 2091 | disk = blkcg_conf_get_disk(&input); |
| 2092 | if (IS_ERR(disk)) |
| 2093 | return PTR_ERR(disk); |
| 2094 | |
| 2095 | ioc = q_to_ioc(disk->queue); |
| 2096 | if (!ioc) { |
| 2097 | ret = blk_iocost_init(disk->queue); |
| 2098 | if (ret) |
| 2099 | goto err; |
| 2100 | ioc = q_to_ioc(disk->queue); |
| 2101 | } |
| 2102 | |
| 2103 | spin_lock_irq(&ioc->lock); |
| 2104 | memcpy(qos, ioc->params.qos, sizeof(qos)); |
| 2105 | enable = ioc->enabled; |
| 2106 | user = ioc->user_qos_params; |
| 2107 | spin_unlock_irq(&ioc->lock); |
| 2108 | |
| 2109 | while ((p = strsep(&input, " \t\n"))) { |
| 2110 | substring_t args[MAX_OPT_ARGS]; |
| 2111 | char buf[32]; |
| 2112 | int tok; |
| 2113 | s64 v; |
| 2114 | |
| 2115 | if (!*p) |
| 2116 | continue; |
| 2117 | |
| 2118 | switch (match_token(p, qos_ctrl_tokens, args)) { |
| 2119 | case QOS_ENABLE: |
| 2120 | if (match_u64(&args[0], &v)) |
|      | goto einval; |
| 2121 | enable = v; |
| 2122 | continue; |
| 2123 | case QOS_CTRL: |
| 2124 | match_strlcpy(buf, &args[0], sizeof(buf)); |
| 2125 | if (!strcmp(buf, "auto")) |
| 2126 | user = false; |
| 2127 | else if (!strcmp(buf, "user")) |
| 2128 | user = true; |
| 2129 | else |
| 2130 | goto einval; |
| 2131 | continue; |
| 2132 | } |
| 2133 | |
| 2134 | tok = match_token(p, qos_tokens, args); |
| 2135 | switch (tok) { |
| 2136 | case QOS_RPPM: |
| 2137 | case QOS_WPPM: |
| 2138 | if (match_strlcpy(buf, &args[0], sizeof(buf)) >= |
| 2139 | sizeof(buf)) |
| 2140 | goto einval; |
| 2141 | if (cgroup_parse_float(buf, 2, &v)) |
| 2142 | goto einval; |
| 2143 | if (v < 0 || v > 10000) |
| 2144 | goto einval; |
| 2145 | qos[tok] = v * 100; |
| 2146 | break; |
| 2147 | case QOS_RLAT: |
| 2148 | case QOS_WLAT: |
| 2149 | if (match_u64(&args[0], &v)) |
| 2150 | goto einval; |
| 2151 | qos[tok] = v; |
| 2152 | break; |
| 2153 | case QOS_MIN: |
| 2154 | case QOS_MAX: |
| 2155 | if (match_strlcpy(buf, &args[0], sizeof(buf)) >= |
| 2156 | sizeof(buf)) |
| 2157 | goto einval; |
| 2158 | if (cgroup_parse_float(buf, 2, &v)) |
| 2159 | goto einval; |
| 2160 | if (v < 0) |
| 2161 | goto einval; |
| 2162 | qos[tok] = clamp_t(s64, v * 100, |
| 2163 | VRATE_MIN_PPM, VRATE_MAX_PPM); |
| 2164 | break; |
| 2165 | default: |
| 2166 | goto einval; |
| 2167 | } |
| 2168 | user = true; |
| 2169 | } |
| 2170 | |
| 2171 | if (qos[QOS_MIN] > qos[QOS_MAX]) |
| 2172 | goto einval; |
| 2173 | |
| 2174 | spin_lock_irq(&ioc->lock); |
| 2175 | |
| 2176 | if (enable) { |
| 2177 | blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q); |
| 2178 | ioc->enabled = true; |
| 2179 | } else { |
| 2180 | blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q); |
| 2181 | ioc->enabled = false; |
| 2182 | } |
| 2183 | |
| 2184 | if (user) { |
| 2185 | memcpy(ioc->params.qos, qos, sizeof(qos)); |
| 2186 | ioc->user_qos_params = true; |
| 2187 | } else { |
| 2188 | ioc->user_qos_params = false; |
| 2189 | } |
| 2190 | |
| 2191 | ioc_refresh_params(ioc, true); |
| 2192 | spin_unlock_irq(&ioc->lock); |
| 2193 | |
| 2194 | put_disk_and_module(disk); |
| 2195 | return nbytes; |
| 2196 | einval: |
| 2197 | ret = -EINVAL; |
| 2198 | err: |
| 2199 | put_disk_and_module(disk); |
| 2200 | return ret; |
| 2201 | } |
| 2202 | |
| 2203 | static u64 ioc_cost_model_prfill(struct seq_file *sf, |
| 2204 | struct blkg_policy_data *pd, int off) |
| 2205 | { |
| 2206 | const char *dname = blkg_dev_name(pd->blkg); |
| 2207 | struct ioc *ioc = pd_to_iocg(pd)->ioc; |
| 2208 | u64 *u = ioc->params.i_lcoefs; |
| 2209 | |
| 2210 | if (!dname) |
| 2211 | return 0; |
| 2212 | |
| 2213 | seq_printf(sf, "%s ctrl=%s model=linear " |
| 2214 | "rbps=%llu rseqiops=%llu rrandiops=%llu " |
| 2215 | "wbps=%llu wseqiops=%llu wrandiops=%llu\n", |
| 2216 | dname, ioc->user_cost_model ? "user" : "auto", |
| 2217 | u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS], |
| 2218 | u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]); |
| 2219 | return 0; |
| 2220 | } |
| 2221 | |
| 2222 | static int ioc_cost_model_show(struct seq_file *sf, void *v) |
| 2223 | { |
| 2224 | struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); |
| 2225 | |
| 2226 | blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill, |
| 2227 | &blkcg_policy_iocost, seq_cft(sf)->private, false); |
| 2228 | return 0; |
| 2229 | } |
| 2230 | |
| 2231 | static const match_table_t cost_ctrl_tokens = { |
| 2232 | { COST_CTRL, "ctrl=%s" }, |
| 2233 | { COST_MODEL, "model=%s" }, |
| 2234 | { NR_COST_CTRL_PARAMS, NULL }, |
| 2235 | }; |
| 2236 | |
| 2237 | static const match_table_t i_lcoef_tokens = { |
| 2238 | { I_LCOEF_RBPS, "rbps=%u" }, |
| 2239 | { I_LCOEF_RSEQIOPS, "rseqiops=%u" }, |
| 2240 | { I_LCOEF_RRANDIOPS, "rrandiops=%u" }, |
| 2241 | { I_LCOEF_WBPS, "wbps=%u" }, |
| 2242 | { I_LCOEF_WSEQIOPS, "wseqiops=%u" }, |
| 2243 | { I_LCOEF_WRANDIOPS, "wrandiops=%u" }, |
| 2244 | { NR_I_LCOEFS, NULL }, |
| 2245 | }; |
| 2246 | |
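|      | /* |
|      |  * io.cost.model takes "$MAJ:$MIN" followed by ctrl=, model=linear and |
|      |  * the linear model coefficients (rbps, rseqiops, rrandiops, wbps, |
|      |  * wseqiops, wrandiops).  Writing any coefficient switches to the user |
|      |  * cost model. |
|      |  */ |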
| 2247 | static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, |
| 2248 | size_t nbytes, loff_t off) |
| 2249 | { |
| 2250 | struct gendisk *disk; |
| 2251 | struct ioc *ioc; |
| 2252 | u64 u[NR_I_LCOEFS]; |
| 2253 | bool user; |
| 2254 | char *p; |
| 2255 | int ret; |
| 2256 | |
| 2257 | disk = blkcg_conf_get_disk(&input); |
| 2258 | if (IS_ERR(disk)) |
| 2259 | return PTR_ERR(disk); |
| 2260 | |
| 2261 | ioc = q_to_ioc(disk->queue); |
| 2262 | if (!ioc) { |
| 2263 | ret = blk_iocost_init(disk->queue); |
| 2264 | if (ret) |
| 2265 | goto err; |
| 2266 | ioc = q_to_ioc(disk->queue); |
| 2267 | } |
| 2268 | |
| 2269 | spin_lock_irq(&ioc->lock); |
| 2270 | memcpy(u, ioc->params.i_lcoefs, sizeof(u)); |
| 2271 | user = ioc->user_cost_model; |
| 2272 | spin_unlock_irq(&ioc->lock); |
| 2273 | |
| 2274 | while ((p = strsep(&input, " \t\n"))) { |
| 2275 | substring_t args[MAX_OPT_ARGS]; |
| 2276 | char buf[32]; |
| 2277 | int tok; |
| 2278 | u64 v; |
| 2279 | |
| 2280 | if (!*p) |
| 2281 | continue; |
| 2282 | |
| 2283 | switch (match_token(p, cost_ctrl_tokens, args)) { |
| 2284 | case COST_CTRL: |
| 2285 | match_strlcpy(buf, &args[0], sizeof(buf)); |
| 2286 | if (!strcmp(buf, "auto")) |
| 2287 | user = false; |
| 2288 | else if (!strcmp(buf, "user")) |
| 2289 | user = true; |
| 2290 | else |
| 2291 | goto einval; |
| 2292 | continue; |
| 2293 | case COST_MODEL: |
| 2294 | match_strlcpy(buf, &args[0], sizeof(buf)); |
| 2295 | if (strcmp(buf, "linear")) |
| 2296 | goto einval; |
| 2297 | continue; |
| 2298 | } |
| 2299 | |
| 2300 | tok = match_token(p, i_lcoef_tokens, args); |
| 2301 | if (tok == NR_I_LCOEFS) |
| 2302 | goto einval; |
| 2303 | if (match_u64(&args[0], &v)) |
| 2304 | goto einval; |
| 2305 | u[tok] = v; |
| 2306 | user = true; |
| 2307 | } |
| 2308 | |
| 2309 | spin_lock_irq(&ioc->lock); |
| 2310 | if (user) { |
| 2311 | memcpy(ioc->params.i_lcoefs, u, sizeof(u)); |
| 2312 | ioc->user_cost_model = true; |
| 2313 | } else { |
| 2314 | ioc->user_cost_model = false; |
| 2315 | } |
| 2316 | ioc_refresh_params(ioc, true); |
| 2317 | spin_unlock_irq(&ioc->lock); |
| 2318 | |
| 2319 | put_disk_and_module(disk); |
| 2320 | return nbytes; |
| 2321 | |
| 2322 | einval: |
| 2323 | ret = -EINVAL; |
| 2324 | err: |
| 2325 | put_disk_and_module(disk); |
| 2326 | return ret; |
| 2327 | } |
| 2328 | |
| 2329 | static struct cftype ioc_files[] = { |
| 2330 | { |
| 2331 | .name = "weight", |
| 2332 | .flags = CFTYPE_NOT_ON_ROOT, |
| 2333 | .seq_show = ioc_weight_show, |
| 2334 | .write = ioc_weight_write, |
| 2335 | }, |
| 2336 | { |
| 2337 | .name = "cost.qos", |
| 2338 | .flags = CFTYPE_ONLY_ON_ROOT, |
| 2339 | .seq_show = ioc_qos_show, |
| 2340 | .write = ioc_qos_write, |
| 2341 | }, |
| 2342 | { |
| 2343 | .name = "cost.model", |
| 2344 | .flags = CFTYPE_ONLY_ON_ROOT, |
| 2345 | .seq_show = ioc_cost_model_show, |
| 2346 | .write = ioc_cost_model_write, |
| 2347 | }, |
| 2348 | {} |
| 2349 | }; |
| 2350 | |
| 2351 | static struct blkcg_policy blkcg_policy_iocost = { |
| 2352 | .dfl_cftypes = ioc_files, |
| 2353 | .cpd_alloc_fn = ioc_cpd_alloc, |
| 2354 | .cpd_free_fn = ioc_cpd_free, |
| 2355 | .pd_alloc_fn = ioc_pd_alloc, |
| 2356 | .pd_init_fn = ioc_pd_init, |
| 2357 | .pd_free_fn = ioc_pd_free, |
| 2358 | }; |
| 2359 | |
| 2360 | static int __init ioc_init(void) |
| 2361 | { |
| 2362 | return blkcg_policy_register(&blkcg_policy_iocost); |
| 2363 | } |
| 2364 | |
| 2365 | static void __exit ioc_exit(void) |
| 2366 | { |
| 2367 | return blkcg_policy_unregister(&blkcg_policy_iocost); |
| 2368 | } |
| 2369 | |
| 2370 | module_init(ioc_init); |
| 2371 | module_exit(ioc_exit); |