1/*
2 * Budget Fair Queueing (BFQ) disk scheduler.
3 *
4 * Based on ideas and code from CFQ:
5 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
6 *
7 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
8 * Paolo Valente <paolo.valente@unimore.it>
9 *
10 * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
11 *
12 * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ
13 * file.
14 *
15 * BFQ is a proportional-share storage-I/O scheduling algorithm based on
16 * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets,
17 * measured in number of sectors, to processes instead of time slices. The
18 * device is not granted to the in-service process for a given time slice,
19 * but until it has exhausted its assigned budget. This change from the time
20 * to the service domain allows BFQ to distribute the device throughput
21 * among processes as desired, without any distortion due to ZBR, workload
22 * fluctuations or other factors. BFQ uses an ad hoc internal scheduler,
23 * called B-WF2Q+, to schedule processes according to their budgets. More
24 * precisely, BFQ schedules queues associated with processes. Thanks to the
25 * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to
26 * I/O-bound processes issuing sequential requests (to boost the
27 * throughput), and yet guarantee a low latency to interactive and soft
28 * real-time applications.
29 *
30 * BFQ is described in [1], which also contains a reference to the
31 * initial, more theoretical paper on BFQ. The interested reader can find
32 * in the latter paper full details on the main algorithm, as well as
33 * formulas of the guarantees and formal proofs of all the properties.
34 * With respect to the version of BFQ presented in these papers, this
35 * implementation adds a few more heuristics, such as the one that
36 * guarantees a low latency to soft real-time applications, and a
37 * hierarchical extension based on H-WF2Q+.
38 *
39 * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
40 * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
41 * complexity derives from the one introduced with EEVDF in [3].
42 *
43 * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness
44 * with the BFQ Disk I/O Scheduler'',
45 * Proceedings of the 5th Annual International Systems and Storage
46 * Conference (SYSTOR '12), June 2012.
47 *
48 * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
49 *
50 * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
51 * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
52 * Oct 1997.
53 *
54 * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
55 *
56 * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
57 * First: A Flexible and Accurate Mechanism for Proportional Share
58 * Resource Allocation,'' technical report.
59 *
60 * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
61 */
62#include <linux/module.h>
63#include <linux/slab.h>
64#include <linux/blkdev.h>
65#include <linux/cgroup.h>
66#include <linux/elevator.h>
67#include <linux/jiffies.h>
68#include <linux/rbtree.h>
69#include <linux/ioprio.h>
70#include "bfq.h"
71#include "blk.h"
72
73/* Expiration time of sync (0) and async (1) requests, in jiffies. */
74static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
75
76/* Maximum backwards seek, in KiB. */
77static const int bfq_back_max = 16 * 1024;
78
79/* Penalty of a backwards seek, in number of sectors. */
80static const int bfq_back_penalty = 2;
81
82/* Idling period duration, in jiffies. */
83static int bfq_slice_idle = HZ / 125;
84
85/* Default maximum budget values, in sectors and number of requests. */
86static const int bfq_default_max_budget = 16 * 1024;
87static const int bfq_max_budget_async_rq = 4;
88
89/*
90 * Async to sync throughput distribution is controlled as follows:
91 * when an async request is served, the entity is charged the number
92 * of sectors of the request, multiplied by the factor below
93 */
94static const int bfq_async_charge_factor = 10;
95
96/* Default timeout values, in jiffies, approximating CFQ defaults. */
97static const int bfq_timeout_sync = HZ / 8;
98static int bfq_timeout_async = HZ / 25;
99
100struct kmem_cache *bfq_pool;
101
102/* Below this threshold (in ms), we consider thinktime immediate. */
103#define BFQ_MIN_TT 2
104
105/* hw_tag detection: parallel requests threshold and min samples needed. */
106#define BFQ_HW_QUEUE_THRESHOLD 4
107#define BFQ_HW_QUEUE_SAMPLES 32
108
109#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
110#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
111
112/* Min samples used for peak rate estimation (for autotuning). */
113#define BFQ_PEAK_RATE_SAMPLES 32
114
115/* Shift used for peak rate fixed precision calculations. */
116#define BFQ_RATE_SHIFT 16
117
118/*
119 * By default, BFQ computes the duration of the weight raising for
120 * interactive applications automatically, using the following formula:
121 * duration = (R / r) * T, where r is the peak rate of the device, and
122 * R and T are two reference parameters.
123 * In particular, R is the peak rate of the reference device (see below),
124 * and T is a reference time: given the systems that are likely to be
125 * installed on the reference device according to its speed class, T is
126 * about the maximum time needed, under BFQ and while reading two files in
127 * parallel, to load typical large applications on these systems.
128 * In practice, the slower/faster the device at hand is, the more/less
129 * time it takes to load applications with respect to the reference device.
130 * Accordingly, BFQ grants weight raising to interactive applications for
131 * a longer/shorter time.
132 *
133 * BFQ uses four different reference pairs (R, T), depending on:
134 * . whether the device is rotational or non-rotational;
135 * . whether the device is slow, such as old or portable HDDs, as well as
136 * SD cards, or fast, such as newer HDDs and SSDs.
137 *
138 * The device's speed class is dynamically (re)detected in
139 * bfq_update_peak_rate() every time the estimated peak rate is updated.
140 *
141 * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0]
142 * are the reference values for a slow/fast rotational device, whereas
143 * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for
144 * a slow/fast non-rotational device. Finally, device_speed_thresh are the
145 * thresholds used to switch between speed classes.
146 * Both the reference peak rates and the thresholds are measured in
147 * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
148 */
149static int R_slow[2] = {1536, 10752};
150static int R_fast[2] = {17415, 34791};
151/*
152 * To improve readability, a conversion function is used to initialize
153 * the following arrays. As a consequence, they can be initialized only
154 * inside a function.
155 */
156static int T_slow[2];
157static int T_fast[2];
158static int device_speed_thresh[2];
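/*
 * Illustrative sketch (not part of this file): one way the reference
 * times and speed thresholds declared above could be filled in at init
 * time, as anticipated by the comment on the (R, T) pairs. All concrete
 * values, the helper name and the "halfway between R_slow and R_fast"
 * threshold choice are assumptions made only for illustration.
 */
#if 0
static void bfq_init_reference_params_example(void)
{
	/* Reference times, converted from milliseconds to jiffies. */
	T_slow[0] = msecs_to_jiffies(3500);	/* slow rotational */
	T_slow[1] = msecs_to_jiffies(6000);	/* slow non-rotational, e.g. SD */
	T_fast[0] = msecs_to_jiffies(1500);	/* fast rotational */
	T_fast[1] = msecs_to_jiffies(1000);	/* fast non-rotational, e.g. SSD */

	/*
	 * Place each threshold between the slow and fast reference peak
	 * rates of the corresponding device class, so that the speed
	 * class can be (re)detected by comparing the estimated peak
	 * rate against the threshold.
	 */
	device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2;
	device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2;
}
#endif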
159
160#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
161 { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
162
163#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
164#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
165
166static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
167
168#include "bfq-ioc.c"
169#include "bfq-sched.c"
170#include "bfq-cgroup.c"
171
172#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
173 IOPRIO_CLASS_IDLE)
174#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
175 IOPRIO_CLASS_RT)
176
177#define bfq_sample_valid(samples) ((samples) > 80)
178
179/*
180 * The following macro groups conditions that need to be evaluated when
181 * checking if existing queues and groups form a symmetric scenario
182 * and therefore idling can be reduced or disabled for some of the
183 * queues. See the comment to the function bfq_bfqq_must_not_expire()
184 * for further details.
185 */
186#ifdef CONFIG_CGROUP_BFQIO
187#define symmetric_scenario (!bfqd->active_numerous_groups && \
188 !bfq_differentiated_weights(bfqd))
189#else
190#define symmetric_scenario (!bfq_differentiated_weights(bfqd))
191#endif
192
193/*
194 * We regard a request as SYNC if it is either a read or has the SYNC bit
195 * set (in which case it could also be a direct WRITE).
196 */
197static inline int bfq_bio_sync(struct bio *bio)
198{
199 if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
200 return 1;
201
202 return 0;
203}
204
205/*
206 * Schedule a run of the queue if there are requests pending and there is
207 * no request in the driver that will restart queueing.
208 */
209static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
210{
211 if (bfqd->queued != 0) {
212 bfq_log(bfqd, "schedule dispatch");
213 kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);
214 }
215}
216
217/*
218 * Lifted from AS - choose which of rq1 and rq2 is best served now.
219 * We choose the request that is closest to the head right now. Distance
220 * behind the head is penalized and only allowed to a certain extent.
221 */
222static struct request *bfq_choose_req(struct bfq_data *bfqd,
223 struct request *rq1,
224 struct request *rq2,
225 sector_t last)
226{
227 sector_t s1, s2, d1 = 0, d2 = 0;
228 unsigned long back_max;
229#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
230#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
231 unsigned wrap = 0; /* bit mask: requests behind the disk head? */
232
233 if (rq1 == NULL || rq1 == rq2)
234 return rq2;
235 if (rq2 == NULL)
236 return rq1;
237
238 if (rq_is_sync(rq1) && !rq_is_sync(rq2))
239 return rq1;
240 else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
241 return rq2;
242 if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
243 return rq1;
244 else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
245 return rq2;
246
247 s1 = blk_rq_pos(rq1);
248 s2 = blk_rq_pos(rq2);
249
250 /*
251 * By definition, 1KiB is 2 sectors.
252 */
253 back_max = bfqd->bfq_back_max * 2;
254
255 /*
256 * Strict one way elevator _except_ in the case where we allow
257 * short backward seeks which are biased as twice the cost of a
258 * similar forward seek.
259 */
260 if (s1 >= last)
261 d1 = s1 - last;
262 else if (s1 + back_max >= last)
263 d1 = (last - s1) * bfqd->bfq_back_penalty;
264 else
265 wrap |= BFQ_RQ1_WRAP;
266
267 if (s2 >= last)
268 d2 = s2 - last;
269 else if (s2 + back_max >= last)
270 d2 = (last - s2) * bfqd->bfq_back_penalty;
271 else
272 wrap |= BFQ_RQ2_WRAP;
273
274 /* Found required data */
275
276 /*
277 * By doing switch() on the bit mask "wrap" we avoid having to
278 * check two variables for all permutations: --> faster!
279 */
280 switch (wrap) {
281 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
282 if (d1 < d2)
283 return rq1;
284 else if (d2 < d1)
285 return rq2;
286 else {
287 if (s1 >= s2)
288 return rq1;
289 else
290 return rq2;
291 }
292
293 case BFQ_RQ2_WRAP:
294 return rq1;
295 case BFQ_RQ1_WRAP:
296 return rq2;
297 case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
298 default:
299 /*
300 * Since both rqs are wrapped,
301 * start with the one that's further behind head
302 * (--> only *one* back seek required),
303 * since back seek takes more time than forward.
304 */
305 if (s1 <= s2)
306 return rq1;
307 else
308 return rq2;
309 }
310}
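/*
 * Illustrative sketch (not part of this file): the distance metric that
 * bfq_choose_req() computes above, restated for a single request as a
 * stand-alone helper. The helper name and parameters are hypothetical;
 * the rule is the one coded above: forward seeks cost their distance,
 * short backward seeks cost their distance times bfq_back_penalty, and
 * anything farther behind the head than back_max "wraps".
 */
#if 0
static sector_t bfq_seek_cost_example(sector_t pos, sector_t head,
				      unsigned long back_max,
				      unsigned int penalty, bool *wraps)
{
	*wraps = false;
	if (pos >= head)			/* forward seek */
		return pos - head;
	if (pos + back_max >= head)		/* short backward seek */
		return (head - pos) * penalty;
	*wraps = true;				/* too far behind the head */
	return 0;
}
#endif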
311
312static struct bfq_queue *
313bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
314 sector_t sector, struct rb_node **ret_parent,
315 struct rb_node ***rb_link)
316{
317 struct rb_node **p, *parent;
318 struct bfq_queue *bfqq = NULL;
319
320 parent = NULL;
321 p = &root->rb_node;
322 while (*p) {
323 struct rb_node **n;
324
325 parent = *p;
326 bfqq = rb_entry(parent, struct bfq_queue, pos_node);
327
328 /*
329 * Sort strictly based on sector. Smallest to the left,
330 * largest to the right.
331 */
332 if (sector > blk_rq_pos(bfqq->next_rq))
333 n = &(*p)->rb_right;
334 else if (sector < blk_rq_pos(bfqq->next_rq))
335 n = &(*p)->rb_left;
336 else
337 break;
338 p = n;
339 bfqq = NULL;
340 }
341
342 *ret_parent = parent;
343 if (rb_link)
344 *rb_link = p;
345
346 bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
347 (long long unsigned)sector,
348 bfqq != NULL ? bfqq->pid : 0);
349
350 return bfqq;
351}
352
353static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
354{
355 struct rb_node **p, *parent;
356 struct bfq_queue *__bfqq;
357
358 if (bfqq->pos_root != NULL) {
359 rb_erase(&bfqq->pos_node, bfqq->pos_root);
360 bfqq->pos_root = NULL;
361 }
362
363 if (bfq_class_idle(bfqq))
364 return;
365 if (!bfqq->next_rq)
366 return;
367
368 bfqq->pos_root = &bfqd->rq_pos_tree;
369 __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
370 blk_rq_pos(bfqq->next_rq), &parent, &p);
371 if (__bfqq == NULL) {
372 rb_link_node(&bfqq->pos_node, parent, p);
373 rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
374 } else
375 bfqq->pos_root = NULL;
376}
377
378/*
379 * Tell whether there are active queues or groups with differentiated weights.
380 */
381static inline bool bfq_differentiated_weights(struct bfq_data *bfqd)
382{
383 /*
384 * For weights to differ, at least one of the trees must contain
385 * at least two nodes.
386 */
387 return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
388 (bfqd->queue_weights_tree.rb_node->rb_left ||
389 bfqd->queue_weights_tree.rb_node->rb_right)
390#ifdef CONFIG_CGROUP_BFQIO
391 ) ||
392 (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&
393 (bfqd->group_weights_tree.rb_node->rb_left ||
394 bfqd->group_weights_tree.rb_node->rb_right)
395#endif
396 );
397}
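/*
 * Illustrative sketch (not part of this file): the "at least two nodes"
 * test used above, isolated as a hypothetical helper. An rb-tree
 * contains two or more nodes exactly when its root exists and has at
 * least one child, which is all that differentiated weights require.
 */
#if 0
static bool bfq_tree_has_two_nodes_example(struct rb_root *root)
{
	return !RB_EMPTY_ROOT(root) &&
	       (root->rb_node->rb_left || root->rb_node->rb_right);
}
#endif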
398
399/*
400 * If the weight-counter tree passed as input contains no counter for
401 * the weight of the input entity, then add that counter; otherwise just
402 * increment the existing counter.
403 *
404 * Note that weight-counter trees contain few nodes in mostly symmetric
405 * scenarios. For example, if all queues have the same weight, then the
406 * weight-counter tree for the queues may contain at most one node.
407 * This holds even if low_latency is on, because weight-raised queues
408 * are not inserted in the tree.
409 * In most scenarios, the rate at which nodes are created/destroyed
410 * should be low too.
411 */
412static void bfq_weights_tree_add(struct bfq_data *bfqd,
413 struct bfq_entity *entity,
414 struct rb_root *root)
415{
416 struct rb_node **new = &(root->rb_node), *parent = NULL;
417
418 /*
419 * Do not insert if the entity is already associated with a
420 * counter, which happens if:
421 * 1) the entity is associated with a queue,
422 * 2) a request arrival has caused the queue to become both
423 * non-weight-raised, and hence change its weight, and
424 * backlogged; in this respect, each of the two events
425 * causes an invocation of this function,
426 * 3) this is the invocation of this function caused by the
427 * second event. This second invocation is actually useless,
428 * and we handle this fact by exiting immediately. More
429 * efficient or clearer solutions might possibly be adopted.
430 */
431 if (entity->weight_counter)
432 return;
433
434 while (*new) {
435 struct bfq_weight_counter *__counter = container_of(*new,
436 struct bfq_weight_counter,
437 weights_node);
438 parent = *new;
439
440 if (entity->weight == __counter->weight) {
441 entity->weight_counter = __counter;
442 goto inc_counter;
443 }
444 if (entity->weight < __counter->weight)
445 new = &((*new)->rb_left);
446 else
447 new = &((*new)->rb_right);
448 }
449
450 entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
451 GFP_ATOMIC);
452 entity->weight_counter->weight = entity->weight;
453 rb_link_node(&entity->weight_counter->weights_node, parent, new);
454 rb_insert_color(&entity->weight_counter->weights_node, root);
455
456inc_counter:
457 entity->weight_counter->num_active++;
458}
459
460/*
461 * Decrement the weight counter associated with the entity, and, if the
462 * counter reaches 0, remove the counter from the tree.
463 * See the comments to the function bfq_weights_tree_add() for considerations
464 * about overhead.
465 */
466static void bfq_weights_tree_remove(struct bfq_data *bfqd,
467 struct bfq_entity *entity,
468 struct rb_root *root)
469{
470 if (!entity->weight_counter)
471 return;
472
473 BUG_ON(RB_EMPTY_ROOT(root));
474 BUG_ON(entity->weight_counter->weight != entity->weight);
475
476 BUG_ON(!entity->weight_counter->num_active);
477 entity->weight_counter->num_active--;
478 if (entity->weight_counter->num_active > 0)
479 goto reset_entity_pointer;
480
481 rb_erase(&entity->weight_counter->weights_node, root);
482 kfree(entity->weight_counter);
483
484reset_entity_pointer:
485 entity->weight_counter = NULL;
486}
487
488static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
489 struct bfq_queue *bfqq,
490 struct request *last)
491{
492 struct rb_node *rbnext = rb_next(&last->rb_node);
493 struct rb_node *rbprev = rb_prev(&last->rb_node);
494 struct request *next = NULL, *prev = NULL;
495
496 BUG_ON(RB_EMPTY_NODE(&last->rb_node));
497
498 if (rbprev != NULL)
499 prev = rb_entry_rq(rbprev);
500
501 if (rbnext != NULL)
502 next = rb_entry_rq(rbnext);
503 else {
504 rbnext = rb_first(&bfqq->sort_list);
505 if (rbnext && rbnext != &last->rb_node)
506 next = rb_entry_rq(rbnext);
507 }
508
509 return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
510}
511
512/* see the definition of bfq_async_charge_factor for details */
513static inline unsigned long bfq_serv_to_charge(struct request *rq,
514 struct bfq_queue *bfqq)
515{
516 return blk_rq_sectors(rq) *
517 (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) *
518 bfq_async_charge_factor));
519}
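/*
 * Illustrative sketch (not part of this file): the charging rule coded
 * compactly above, unfolded for readability with hypothetical names.
 * Only async requests of non-weight-raised queues pay the extra
 * bfq_async_charge_factor: e.g., an 8-sector async request of such a
 * queue is charged 8 * (1 + 10) = 88 sectors, while a sync request is
 * always charged its actual size.
 */
#if 0
static unsigned long bfq_charge_example(unsigned long sectors,
					bool sync, bool weight_raised)
{
	if (!sync && !weight_raised)
		return sectors * (1 + bfq_async_charge_factor);
	return sectors;
}
#endif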
520
521/**
522 * bfq_updated_next_req - update the queue after a new next_rq selection.
523 * @bfqd: the device data the queue belongs to.
524 * @bfqq: the queue to update.
525 *
526 * If the first request of a queue changes we make sure that the queue
527 * has enough budget to serve at least its first request (if the
528 * request has grown). We do this because, if the queue does not have
529 * enough budget for its first request, it has to go through two dispatch
530 * rounds to actually get it dispatched.
531 */
532static void bfq_updated_next_req(struct bfq_data *bfqd,
533 struct bfq_queue *bfqq)
534{
535 struct bfq_entity *entity = &bfqq->entity;
536 struct bfq_service_tree *st = bfq_entity_service_tree(entity);
537 struct request *next_rq = bfqq->next_rq;
538 unsigned long new_budget;
539
540 if (next_rq == NULL)
541 return;
542
543 if (bfqq == bfqd->in_service_queue)
544 /*
545 * In order not to break guarantees, budgets cannot be
546 * changed after an entity has been selected.
547 */
548 return;
549
550 BUG_ON(entity->tree != &st->active);
551 BUG_ON(entity == entity->sched_data->in_service_entity);
552
553 new_budget = max_t(unsigned long, bfqq->max_budget,
554 bfq_serv_to_charge(next_rq, bfqq));
555 if (entity->budget != new_budget) {
556 entity->budget = new_budget;
557 bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
558 new_budget);
559 bfq_activate_bfqq(bfqd, bfqq);
560 }
561}
562
563static inline unsigned int bfq_wr_duration(struct bfq_data *bfqd)
564{
565 u64 dur;
566
567 if (bfqd->bfq_wr_max_time > 0)
568 return bfqd->bfq_wr_max_time;
569
570 dur = bfqd->RT_prod;
571 do_div(dur, bfqd->peak_rate);
572
573 return dur;
574}
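/*
 * Illustrative sketch (not part of this file): a numeric instance of the
 * duration = (R / r) * T rule implemented just above, with R * T cached
 * as in bfqd->RT_prod. The helper is hypothetical; it only shows that a
 * device whose peak rate r is half the reference rate R gets twice the
 * reference weight-raising time T.
 */
#if 0
static u64 bfq_wr_duration_example(u64 RT_prod, u32 peak_rate)
{
	u64 dur = RT_prod;

	do_div(dur, peak_rate);	/* if peak_rate == R / 2, dur == 2 * T */
	return dur;
}
#endif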
575
576static inline unsigned
577bfq_bfqq_cooperations(struct bfq_queue *bfqq)
578{
579 return bfqq->bic ? bfqq->bic->cooperations : 0;
580}
581
582static inline void
583bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
584{
585 if (bic->saved_idle_window)
586 bfq_mark_bfqq_idle_window(bfqq);
587 else
588 bfq_clear_bfqq_idle_window(bfqq);
589 if (bic->saved_IO_bound)
590 bfq_mark_bfqq_IO_bound(bfqq);
591 else
592 bfq_clear_bfqq_IO_bound(bfqq);
593 /* Assuming that the flag in_large_burst is already correctly set */
594 if (bic->wr_time_left && bfqq->bfqd->low_latency &&
595 !bfq_bfqq_in_large_burst(bfqq) &&
596 bic->cooperations < bfqq->bfqd->bfq_coop_thresh) {
597 /*
598 * Start a weight raising period with the duration given by
599 * the raising_time_left snapshot.
600 */
601 if (bfq_bfqq_busy(bfqq))
602 bfqq->bfqd->wr_busy_queues++;
603 bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff;
604 bfqq->wr_cur_max_time = bic->wr_time_left;
605 bfqq->last_wr_start_finish = jiffies;
606 bfqq->entity.ioprio_changed = 1;
607 }
608 /*
609 * Clear wr_time_left to prevent bfq_bfqq_save_state() from
610 * getting confused about the queue's need of a weight-raising
611 * period.
612 */
613 bic->wr_time_left = 0;
614}
615
616/* Must be called with the queue_lock held. */
617static int bfqq_process_refs(struct bfq_queue *bfqq)
618{
619 int process_refs, io_refs;
620
621 io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
622 process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
623 BUG_ON(process_refs < 0);
624 return process_refs;
625}
626
627/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */
628static inline void bfq_reset_burst_list(struct bfq_data *bfqd,
629 struct bfq_queue *bfqq)
630{
631 struct bfq_queue *item;
632 struct hlist_node *pos, *n;
633
634 hlist_for_each_entry_safe(item, pos, n,
635 &bfqd->burst_list, burst_list_node)
636 hlist_del_init(&item->burst_list_node);
637 hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
638 bfqd->burst_size = 1;
639}
640
641/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */
642static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
643{
644 /* Increment burst size to take into account also bfqq */
645 bfqd->burst_size++;
646
647 if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) {
648 struct bfq_queue *pos, *bfqq_item;
649 struct hlist_node *p, *n;
650
651 /*
652 * Enough queues have been activated shortly after each
653 * other to consider this burst as large.
654 */
655 bfqd->large_burst = true;
656
657 /*
658 * We can now mark all queues in the burst list as
659 * belonging to a large burst.
660 */
661 hlist_for_each_entry(bfqq_item, n, &bfqd->burst_list,
662 burst_list_node)
663 bfq_mark_bfqq_in_large_burst(bfqq_item);
664 bfq_mark_bfqq_in_large_burst(bfqq);
665
666 /*
667 * From now on, and until the current burst finishes, any
668 * new queue being activated shortly after the last queue
669 * was inserted in the burst can be immediately marked as
670 * belonging to a large burst. So the burst list is not
671 * needed any more. Remove it.
672 */
673 hlist_for_each_entry_safe(pos, p, n, &bfqd->burst_list,
674 burst_list_node)
675 hlist_del_init(&pos->burst_list_node);
676 } else /* burst not yet large: add bfqq to the burst list */
677 hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
678}
679
680/*
681 * If many queues happen to become active shortly after each other, then,
682 * to help the processes associated with these queues get their job done as
683 * soon as possible, it is usually better to not grant either weight-raising
684 * or device idling to these queues. In this comment we describe, firstly,
685 * the reasons why this fact holds, and, secondly, the next function, which
686 * implements the main steps needed to properly mark these queues so that
687 * they can then be treated in a different way.
688 *
689 * As for the terminology, we say that a queue becomes active, i.e.,
690 * switches from idle to backlogged, either when it is created (as a
691 * consequence of the arrival of an I/O request), or, if already existing,
692 * when a new request for the queue arrives while the queue is idle.
693 * Bursts of activations, i.e., activations of different queues occurring
694 * shortly after each other, are typically caused by services or applications
695 * that spawn or reactivate many parallel threads/processes. Examples are
696 * systemd during boot or git grep.
697 *
698 * These services or applications benefit mostly from a high throughput:
699 * the quicker the requests of the activated queues are cumulatively served,
700 * the sooner the target job of these queues gets completed. As a consequence,
701 * weight-raising any of these queues, which also implies idling the device
702 * for it, is almost always counterproductive: in most cases it just lowers
703 * throughput.
704 *
705 * On the other hand, a burst of activations may also be caused by the start
706 * of an application that does not consist of a lot of parallel I/O-bound
707 * threads. In fact, with a complex application, the burst may be just a
708 * consequence of the fact that several processes need to be executed to
709 * start up the application. To start an application as quickly as possible,
710 * the best thing to do is to privilege the I/O related to the application
711 * with respect to all other I/O. Therefore, the best strategy to start as
712 * quickly as possible an application that causes a burst of activations is
713 * to weight-raise all the queues activated during the burst. This is the
714 * exact opposite of the best strategy for the other type of bursts.
715 *
716 * In the end, to take the best action for each of the two cases, the two
717 * types of bursts need to be distinguished. Fortunately, this seems
718 * relatively easy to do, by looking at the sizes of the bursts. In
719 * particular, we found a threshold such that bursts with a larger size
720 * than that threshold are apparently caused only by services or commands
721 * such as systemd or git grep. For brevity, hereafter we call just 'large'
722 * these bursts. BFQ *does not* weight-raise queues whose activations occur
723 * in a large burst. In addition, for each of these queues BFQ performs or
724 * does not perform idling depending on which choice boosts the throughput
725 * most. The exact choice depends on the device and request pattern at
726 * hand.
727 *
728 * Turning back to the next function, it implements all the steps needed
729 * to detect the occurrence of a large burst and to properly mark all the
730 * queues belonging to it (so that they can then be treated in a different
731 * way). This goal is achieved by maintaining a special "burst list" that
732 * holds, temporarily, the queues that belong to the burst in progress. The
733 * list is then used to mark these queues as belonging to a large burst if
734 * the burst does become large. The main steps are the following.
735 *
736 * . when the very first queue is activated, the queue is inserted into the
737 * list (as it could be the first queue in a possible burst)
738 *
739 * . if the current burst has not yet become large, and a queue Q that does
740 * not yet belong to the burst is activated shortly after the last time
741 * at which a new queue entered the burst list, then the function appends
742 * Q to the burst list
743 *
744 * . if, as a consequence of the previous step, the burst size reaches
745 * the large-burst threshold, then
746 *
747 * . all the queues in the burst list are marked as belonging to a
748 * large burst
749 *
750 * . the burst list is deleted; in fact, the burst list already served
751 * its purpose (keeping temporarily track of the queues in a burst,
752 * so as to be able to mark them as belonging to a large burst in the
753 * previous sub-step), and now is not needed any more
754 *
755 * . the device enters a large-burst mode
756 *
757 * . if a queue Q that does not belong to the burst is activated while
758 * the device is in large-burst mode and shortly after the last time
759 * at which a queue either entered the burst list or was marked as
760 * belonging to the current large burst, then Q is immediately marked
761 * as belonging to a large burst.
762 *
763 * . if a queue Q that does not belong to the burst is activated a while
764 * after, i.e., not shortly after, the last time at which a queue
765 * either entered the burst list or was marked as belonging to the
766 * current large burst, then the current burst is deemed finished and:
767 *
768 * . the large-burst mode is reset if set
769 *
770 * . the burst list is emptied
771 *
772 * . Q is inserted in the burst list, as Q may be the first queue
773 * in a possible new burst (then the burst list contains just Q
774 * after this step).
775 */
776static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq,
777 bool idle_for_long_time)
778{
779 /*
780 * If bfqq happened to be activated in a burst, but has been idle
781 * for at least as long as an interactive queue, then we assume
782 * that, in the overall I/O initiated in the burst, the I/O
783 * associated to bfqq is finished. So bfqq does not need to be
784 * treated as a queue belonging to a burst anymore. Accordingly,
785 * we reset bfqq's in_large_burst flag if set, and remove bfqq
786 * from the burst list if it's there. We do not decrement instead
787 * burst_size, because the fact that bfqq does not need to belong
788 * to the burst list any more does not invalidate the fact that
789 * bfqq may have been activated during the current burst.
790 */
791 if (idle_for_long_time) {
792 hlist_del_init(&bfqq->burst_list_node);
793 bfq_clear_bfqq_in_large_burst(bfqq);
794 }
795
796 /*
797 * If bfqq is already in the burst list or is part of a large
798 * burst, then there is nothing else to do.
799 */
800 if (!hlist_unhashed(&bfqq->burst_list_node) ||
801 bfq_bfqq_in_large_burst(bfqq))
802 return;
803
804 /*
805 * If bfqq's activation happens late enough, then the current
806 * burst is finished, and related data structures must be reset.
807 *
808 * In this respect, consider the special case where bfqq is the very
809 * first queue being activated. In this case, last_ins_in_burst is
810 * not yet significant when we get here. But it is easy to verify
811 * that, whether or not the following condition is true, bfqq will
812 * end up being inserted into the burst list. In particular the
813 * list will happen to contain only bfqq. And this is exactly what
814 * has to happen, as bfqq may be the first queue in a possible
815 * burst.
816 */
817 if (time_is_before_jiffies(bfqd->last_ins_in_burst +
818 bfqd->bfq_burst_interval)) {
819 bfqd->large_burst = false;
820 bfq_reset_burst_list(bfqd, bfqq);
821 return;
822 }
823
824 /*
825 * If we get here, then bfqq is being activated shortly after the
826 * last queue. So, if the current burst is also large, we can mark
827 * bfqq as belonging to this large burst immediately.
828 */
829 if (bfqd->large_burst) {
830 bfq_mark_bfqq_in_large_burst(bfqq);
831 return;
832 }
833
834 /*
835 * If we get here, then a large-burst state has not yet been
836 * reached, but bfqq is being activated shortly after the last
837 * queue. Then we add bfqq to the burst.
838 */
839 bfq_add_to_burst(bfqd, bfqq);
840}
841
842static void bfq_add_request(struct request *rq)
843{
844 struct bfq_queue *bfqq = RQ_BFQQ(rq);
845 struct bfq_entity *entity = &bfqq->entity;
846 struct bfq_data *bfqd = bfqq->bfqd;
847 struct request *next_rq, *prev;
848 unsigned long old_wr_coeff = bfqq->wr_coeff;
849 bool interactive = false;
850
851 bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
852 bfqq->queued[rq_is_sync(rq)]++;
853 bfqd->queued++;
854
855 elv_rb_add(&bfqq->sort_list, rq);
856
857 /*
858 * Check if this request is a better next-serve candidate.
859 */
860 prev = bfqq->next_rq;
861 next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
862 BUG_ON(next_rq == NULL);
863 bfqq->next_rq = next_rq;
864
865 /*
866 * Adjust priority tree position, if next_rq changes.
867 */
868 if (prev != bfqq->next_rq)
869 bfq_rq_pos_tree_add(bfqd, bfqq);
870
871 if (!bfq_bfqq_busy(bfqq)) {
872 bool soft_rt, coop_or_in_burst,
873 idle_for_long_time = time_is_before_jiffies(
874 bfqq->budget_timeout +
875 bfqd->bfq_wr_min_idle_time);
876
877 if (bfq_bfqq_sync(bfqq)) {
878 bool already_in_burst =
879 !hlist_unhashed(&bfqq->burst_list_node) ||
880 bfq_bfqq_in_large_burst(bfqq);
881 bfq_handle_burst(bfqd, bfqq, idle_for_long_time);
882 /*
883 * If bfqq was not already in the current burst,
884 * then, at this point, bfqq either has been
885 * added to the current burst or has caused the
886 * current burst to terminate. In particular, in
887 * the second case, bfqq has become the first
888 * queue in a possible new burst.
889 * In both cases last_ins_in_burst needs to be
890 * moved forward.
891 */
892 if (!already_in_burst)
893 bfqd->last_ins_in_burst = jiffies;
894 }
895
896 coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) ||
897 bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh;
898 soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
899 !coop_or_in_burst &&
900 time_is_before_jiffies(bfqq->soft_rt_next_start);
901 interactive = !coop_or_in_burst && idle_for_long_time;
902 entity->budget = max_t(unsigned long, bfqq->max_budget,
903 bfq_serv_to_charge(next_rq, bfqq));
904
905 if (!bfq_bfqq_IO_bound(bfqq)) {
906 if (time_before(jiffies,
907 RQ_BIC(rq)->ttime.last_end_request +
908 bfqd->bfq_slice_idle)) {
909 bfqq->requests_within_timer++;
910 if (bfqq->requests_within_timer >=
911 bfqd->bfq_requests_within_timer)
912 bfq_mark_bfqq_IO_bound(bfqq);
913 } else
914 bfqq->requests_within_timer = 0;
915 }
916
917 if (!bfqd->low_latency)
918 goto add_bfqq_busy;
919
920 if (bfq_bfqq_just_split(bfqq))
921 goto set_ioprio_changed;
922
923 /*
924 * If the queue:
925 * - is not being boosted,
926 * - has been idle for enough time,
927 * - is not a sync queue or is linked to a bfq_io_cq (it is
928 * shared "by nature" or it is not shared and its requests
929 * have not been redirected to a shared queue),
930 * then start a weight-raising period.
931 */
932 if (old_wr_coeff == 1 && (interactive || soft_rt) &&
933 (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) {
934 bfqq->wr_coeff = bfqd->bfq_wr_coeff;
935 if (interactive)
936 bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
937 else
938 bfqq->wr_cur_max_time =
939 bfqd->bfq_wr_rt_max_time;
940 bfq_log_bfqq(bfqd, bfqq,
941 "wrais starting at %lu, rais_max_time %u",
942 jiffies,
943 jiffies_to_msecs(bfqq->wr_cur_max_time));
944 } else if (old_wr_coeff > 1) {
945 if (interactive)
946 bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
947 else if (coop_or_in_burst ||
948 (bfqq->wr_cur_max_time ==
949 bfqd->bfq_wr_rt_max_time &&
950 !soft_rt)) {
951 bfqq->wr_coeff = 1;
952 bfq_log_bfqq(bfqd, bfqq,
953 "wrais ending at %lu, rais_max_time %u",
954 jiffies,
955 jiffies_to_msecs(bfqq->
956 wr_cur_max_time));
957 } else if (time_before(
958 bfqq->last_wr_start_finish +
959 bfqq->wr_cur_max_time,
960 jiffies +
961 bfqd->bfq_wr_rt_max_time) &&
962 soft_rt) {
963 /*
964 *
965 * The remaining weight-raising time is lower
966 * than bfqd->bfq_wr_rt_max_time, which means
967 * that the application is enjoying weight
968 * raising either because deemed soft-rt in
969 * the near past, or because deemed interactive
970 * long ago.
971 * In both cases, resetting now the current
972 * remaining weight-raising time for the
973 * application to the weight-raising duration
974 * for soft rt applications would not cause any
975 * latency increase for the application (as the
976 * new duration would be higher than the
977 * remaining time).
978 *
979 * In addition, the application is now meeting
980 * the requirements for being deemed soft rt.
981 * In the end we can correctly and safely
982 * (re)charge the weight-raising duration for
983 * the application with the weight-raising
984 * duration for soft rt applications.
985 *
986 * In particular, doing this recharge now, i.e.,
987 * before the weight-raising period for the
988 * application finishes, reduces the probability
989 * of the following negative scenario:
990 * 1) the weight of a soft rt application is
991 * raised at startup (as for any newly
992 * created application),
993 * 2) since the application is not interactive,
994 * at a certain time weight-raising is
995 * stopped for the application,
996 * 3) at that time the application happens to
997 * still have pending requests, and hence
998 * is destined to not have a chance to be
999 * deemed soft rt before these requests are
1000 * completed (see the comments to the
1001 * function bfq_bfqq_softrt_next_start()
1002 * for details on soft rt detection),
1003 * 4) these pending requests experience a high
1004 * latency because the application is not
1005 * weight-raised while they are pending.
1006 */
1007 bfqq->last_wr_start_finish = jiffies;
1008 bfqq->wr_cur_max_time =
1009 bfqd->bfq_wr_rt_max_time;
1010 }
1011 }
1012set_ioprio_changed:
1013 if (old_wr_coeff != bfqq->wr_coeff)
1014 entity->ioprio_changed = 1;
1015add_bfqq_busy:
1016 bfqq->last_idle_bklogged = jiffies;
1017 bfqq->service_from_backlogged = 0;
1018 bfq_clear_bfqq_softrt_update(bfqq);
1019 bfq_add_bfqq_busy(bfqd, bfqq);
1020 } else {
1021 if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&
1022 time_is_before_jiffies(
1023 bfqq->last_wr_start_finish +
1024 bfqd->bfq_wr_min_inter_arr_async)) {
1025 bfqq->wr_coeff = bfqd->bfq_wr_coeff;
1026 bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
1027
1028 bfqd->wr_busy_queues++;
1029 entity->ioprio_changed = 1;
1030 bfq_log_bfqq(bfqd, bfqq,
1031 "non-idle wrais starting at %lu, rais_max_time %u",
1032 jiffies,
1033 jiffies_to_msecs(bfqq->wr_cur_max_time));
1034 }
1035 if (prev != bfqq->next_rq)
1036 bfq_updated_next_req(bfqd, bfqq);
1037 }
1038
1039 if (bfqd->low_latency &&
1040 (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))
1041 bfqq->last_wr_start_finish = jiffies;
1042}
1043
1044static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
1045 struct bio *bio)
1046{
1047 struct task_struct *tsk = current;
1048 struct bfq_io_cq *bic;
1049 struct bfq_queue *bfqq;
1050
1051 bic = bfq_bic_lookup(bfqd, tsk->io_context);
1052 if (bic == NULL)
1053 return NULL;
1054
1055 bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
1056 if (bfqq != NULL) {
1057 sector_t sector = bio->bi_sector + bio_sectors(bio);
1058
1059 return elv_rb_find(&bfqq->sort_list, sector);
1060 }
1061
1062 return NULL;
1063}
1064
1065static void bfq_activate_request(struct request_queue *q, struct request *rq)
1066{
1067 struct bfq_data *bfqd = q->elevator->elevator_data;
1068
1069 bfqd->rq_in_driver++;
1070 bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
1071 bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
1072 (long long unsigned)bfqd->last_position);
1073}
1074
1075static inline void bfq_deactivate_request(struct request_queue *q,
1076 struct request *rq)
1077{
1078 struct bfq_data *bfqd = q->elevator->elevator_data;
1079
1080 BUG_ON(bfqd->rq_in_driver == 0);
1081 bfqd->rq_in_driver--;
1082}
1083
1084static void bfq_remove_request(struct request *rq)
1085{
1086 struct bfq_queue *bfqq = RQ_BFQQ(rq);
1087 struct bfq_data *bfqd = bfqq->bfqd;
1088 const int sync = rq_is_sync(rq);
1089
1090 if (bfqq->next_rq == rq) {
1091 bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
1092 bfq_updated_next_req(bfqd, bfqq);
1093 }
1094
1095 if (rq->queuelist.prev != &rq->queuelist)
1096 list_del_init(&rq->queuelist);
1097 BUG_ON(bfqq->queued[sync] == 0);
1098 bfqq->queued[sync]--;
1099 bfqd->queued--;
1100 elv_rb_del(&bfqq->sort_list, rq);
1101
1102 if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
1103 if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue)
1104 bfq_del_bfqq_busy(bfqd, bfqq, 1);
1105 /*
1106 * Remove queue from request-position tree as it is empty.
1107 */
1108 if (bfqq->pos_root != NULL) {
1109 rb_erase(&bfqq->pos_node, bfqq->pos_root);
1110 bfqq->pos_root = NULL;
1111 }
1112 }
1113
1114 if (rq->cmd_flags & REQ_META) {
1115 BUG_ON(bfqq->meta_pending == 0);
1116 bfqq->meta_pending--;
1117 }
1118}
1119
1120static int bfq_merge(struct request_queue *q, struct request **req,
1121 struct bio *bio)
1122{
1123 struct bfq_data *bfqd = q->elevator->elevator_data;
1124 struct request *__rq;
1125
1126 __rq = bfq_find_rq_fmerge(bfqd, bio);
1127 if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
1128 *req = __rq;
1129 return ELEVATOR_FRONT_MERGE;
1130 }
1131
1132 return ELEVATOR_NO_MERGE;
1133}
1134
1135static void bfq_merged_request(struct request_queue *q, struct request *req,
1136 int type)
1137{
1138 if (type == ELEVATOR_FRONT_MERGE &&
1139 rb_prev(&req->rb_node) &&
1140 blk_rq_pos(req) <
1141 blk_rq_pos(container_of(rb_prev(&req->rb_node),
1142 struct request, rb_node))) {
1143 struct bfq_queue *bfqq = RQ_BFQQ(req);
1144 struct bfq_data *bfqd = bfqq->bfqd;
1145 struct request *prev, *next_rq;
1146
1147 /* Reposition request in its sort_list */
1148 elv_rb_del(&bfqq->sort_list, req);
1149 elv_rb_add(&bfqq->sort_list, req);
1150 /* Choose next request to be served for bfqq */
1151 prev = bfqq->next_rq;
1152 next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
1153 bfqd->last_position);
1154 BUG_ON(next_rq == NULL);
1155 bfqq->next_rq = next_rq;
1156 /*
1157 * If next_rq changes, update both the queue's budget to
1158 * fit the new request and the queue's position in its
1159 * rq_pos_tree.
1160 */
1161 if (prev != bfqq->next_rq) {
1162 bfq_updated_next_req(bfqd, bfqq);
1163 bfq_rq_pos_tree_add(bfqd, bfqq);
1164 }
1165 }
1166}
1167
1168static void bfq_merged_requests(struct request_queue *q, struct request *rq,
1169 struct request *next)
1170{
1171 struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);
1172
1173 /*
1174 * If next and rq belong to the same bfq_queue and next is older
1175 * than rq, then reposition rq in the fifo (by substituting next
1176 * with rq). Otherwise, if next and rq belong to different
1177 * bfq_queues, never reposition rq: in fact, we would have to
1178 * reposition it with respect to next's position in its own fifo,
1179 * which would most certainly be too expensive with respect to
1180 * the benefits.
1181 */
1182 if (bfqq == next_bfqq &&
1183 !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
1184 time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
1185 list_del_init(&rq->queuelist);
1186 list_replace_init(&next->queuelist, &rq->queuelist);
1187 rq_set_fifo_time(rq, rq_fifo_time(next));
1188 }
1189
1190 if (bfqq->next_rq == next)
1191 bfqq->next_rq = rq;
1192
1193 bfq_remove_request(next);
1194}
1195
1196/* Must be called with bfqq != NULL */
1197static inline void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
1198{
1199 BUG_ON(bfqq == NULL);
1200 if (bfq_bfqq_busy(bfqq))
1201 bfqq->bfqd->wr_busy_queues--;
1202 bfqq->wr_coeff = 1;
1203 bfqq->wr_cur_max_time = 0;
1204 /* Trigger a weight change on the next activation of the queue */
1205 bfqq->entity.ioprio_changed = 1;
1206}
1207
1208static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
1209 struct bfq_group *bfqg)
1210{
1211 int i, j;
1212
1213 for (i = 0; i < 2; i++)
1214 for (j = 0; j < IOPRIO_BE_NR; j++)
1215 if (bfqg->async_bfqq[i][j] != NULL)
1216 bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);
1217 if (bfqg->async_idle_bfqq != NULL)
1218 bfq_bfqq_end_wr(bfqg->async_idle_bfqq);
1219}
1220
1221static void bfq_end_wr(struct bfq_data *bfqd)
1222{
1223 struct bfq_queue *bfqq;
1224
1225 spin_lock_irq(bfqd->queue->queue_lock);
1226
1227 list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
1228 bfq_bfqq_end_wr(bfqq);
1229 list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
1230 bfq_bfqq_end_wr(bfqq);
1231 bfq_end_wr_async(bfqd);
1232
1233 spin_unlock_irq(bfqd->queue->queue_lock);
1234}
1235
1236static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)
1237{
1238 if (request)
1239 return blk_rq_pos(io_struct);
1240 else
1241 return ((struct bio *)io_struct)->bi_sector;
1242}
1243
1244static inline sector_t bfq_dist_from(sector_t pos1,
1245 sector_t pos2)
1246{
1247 if (pos1 >= pos2)
1248 return pos1 - pos2;
1249 else
1250 return pos2 - pos1;
1251}
1252
1253static inline int bfq_rq_close_to_sector(void *io_struct, bool request,
1254 sector_t sector)
1255{
1256 return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <=
1257 BFQQ_SEEK_THR;
1258}
1259
1260static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)
1261{
1262 struct rb_root *root = &bfqd->rq_pos_tree;
1263 struct rb_node *parent, *node;
1264 struct bfq_queue *__bfqq;
1265
1266 if (RB_EMPTY_ROOT(root))
1267 return NULL;
1268
1269 /*
1270 * First, if we find a request starting at the end of the last
1271 * request, choose it.
1272 */
1273 __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
1274 if (__bfqq != NULL)
1275 return __bfqq;
1276
1277 /*
1278 * If the exact sector wasn't found, the parent of the NULL leaf
1279 * will contain the closest sector (rq_pos_tree sorted by
1280 * next_request position).
1281 */
1282 __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
1283 if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
1284 return __bfqq;
1285
1286 if (blk_rq_pos(__bfqq->next_rq) < sector)
1287 node = rb_next(&__bfqq->pos_node);
1288 else
1289 node = rb_prev(&__bfqq->pos_node);
1290 if (node == NULL)
1291 return NULL;
1292
1293 __bfqq = rb_entry(node, struct bfq_queue, pos_node);
1294 if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
1295 return __bfqq;
1296
1297 return NULL;
1298}
1299
1300/*
1301 * bfqd - obvious
1302 * cur_bfqq - passed in so that we don't decide that the current queue
1303 * is closely cooperating with itself
1304 * sector - used as a reference point to search for a close queue
1305 */
1306static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
1307 struct bfq_queue *cur_bfqq,
1308 sector_t sector)
1309{
1310 struct bfq_queue *bfqq;
1311
1312 if (bfq_class_idle(cur_bfqq))
1313 return NULL;
1314 if (!bfq_bfqq_sync(cur_bfqq))
1315 return NULL;
1316 if (BFQQ_SEEKY(cur_bfqq))
1317 return NULL;
1318
1319 /* If device has only one backlogged bfq_queue, don't search. */
1320 if (bfqd->busy_queues == 1)
1321 return NULL;
1322
1323 /*
1324 * We should notice if some of the queues are cooperating, e.g.
1325 * working closely on the same area of the disk. In that case,
1326 * we can group them together and don't waste time idling.
1327 */
1328 bfqq = bfqq_close(bfqd, sector);
1329 if (bfqq == NULL || bfqq == cur_bfqq)
1330 return NULL;
1331
1332 /*
1333 * Do not merge queues from different bfq_groups.
1334 */
1335 if (bfqq->entity.parent != cur_bfqq->entity.parent)
1336 return NULL;
1337
1338 /*
1339 * It only makes sense to merge sync queues.
1340 */
1341 if (!bfq_bfqq_sync(bfqq))
1342 return NULL;
1343 if (BFQQ_SEEKY(bfqq))
1344 return NULL;
1345
1346 /*
1347 * Do not merge queues of different priority classes.
1348 */
1349 if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
1350 return NULL;
1351
1352 return bfqq;
1353}
1354
1355static struct bfq_queue *
1356bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
1357{
1358 int process_refs, new_process_refs;
1359 struct bfq_queue *__bfqq;
1360
1361 /*
1362 * If there are no process references on the new_bfqq, then it is
1363 * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
1364 * may have dropped their last reference (not just their last process
1365 * reference).
1366 */
1367 if (!bfqq_process_refs(new_bfqq))
1368 return NULL;
1369
1370 /* Avoid a circular list and skip interim queue merges. */
1371 while ((__bfqq = new_bfqq->new_bfqq)) {
1372 if (__bfqq == bfqq)
1373 return NULL;
1374 new_bfqq = __bfqq;
1375 }
1376
1377 process_refs = bfqq_process_refs(bfqq);
1378 new_process_refs = bfqq_process_refs(new_bfqq);
1379 /*
1380 * If the process for the bfqq has gone away, there is no
1381 * sense in merging the queues.
1382 */
1383 if (process_refs == 0 || new_process_refs == 0)
1384 return NULL;
1385
1386 bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
1387 new_bfqq->pid);
1388
1389 /*
1390 * Merging is just a redirection: the requests of the process
1391 * owning one of the two queues are redirected to the other queue.
1392 * The latter queue, in its turn, is set as shared if this is the
1393 * first time that the requests of some process are redirected to
1394 * it.
1395 *
1396 * We redirect bfqq to new_bfqq and not the opposite, because we
1397 * are in the context of the process owning bfqq, hence we have
1398 * the io_cq of this process. So we can immediately configure this
1399 * io_cq to redirect the requests of the process to new_bfqq.
1400 *
1401 * NOTE, even if new_bfqq coincides with the in-service queue, the
1402 * io_cq of new_bfqq is not available, because, if the in-service
1403 * queue is shared, bfqd->in_service_bic may not point to the
1404 * io_cq of the in-service queue.
1405 * Redirecting the requests of the process owning bfqq to the
1406 * currently in-service queue is in any case the best option, as
1407 * we feed the in-service queue with new requests close to the
1408 * last request served and, by doing so, hopefully increase the
1409 * throughput.
1410 */
1411 bfqq->new_bfqq = new_bfqq;
1412 atomic_add(process_refs, &new_bfqq->ref);
1413 return new_bfqq;
1414}
1415
1416/*
1417 * Attempt to schedule a merge of bfqq with the currently in-service queue
1418 * or with a close queue among the scheduled queues.
1419 * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue
1420 * structure otherwise.
1421 *
1422 * The OOM queue is not allowed to participate in cooperation: in fact, since
1423 * the requests temporarily redirected to the OOM queue could be redirected
1424 * again to dedicated queues at any time, the state needed to correctly
1425 * handle merging with the OOM queue would be quite complex and expensive
1426 * to maintain. Besides, in a condition as critical as an out-of-memory one,
1427 * the benefits of queue merging may be of little relevance, or even negligible.
1428 */
1429static struct bfq_queue *
1430bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
1431 void *io_struct, bool request)
1432{
1433 struct bfq_queue *in_service_bfqq, *new_bfqq;
1434
1435 if (bfqq->new_bfqq)
1436 return bfqq->new_bfqq;
1437
1438 if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))
1439 return NULL;
1440
1441 in_service_bfqq = bfqd->in_service_queue;
1442
1443 if (in_service_bfqq == NULL || in_service_bfqq == bfqq ||
1444 !bfqd->in_service_bic ||
1445 unlikely(in_service_bfqq == &bfqd->oom_bfqq))
1446 goto check_scheduled;
1447
1448 if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq))
1449 goto check_scheduled;
1450
1451 if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq))
1452 goto check_scheduled;
1453
1454 if (in_service_bfqq->entity.parent != bfqq->entity.parent)
1455 goto check_scheduled;
1456
1457 if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
1458 bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) {
1459 new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
1460 if (new_bfqq != NULL)
1461 return new_bfqq; /* Merge with in-service queue */
1462 }
1463
1464 /*
1465 * Check whether there is a cooperator among currently scheduled
1466 * queues. The only thing we need is that the bio/request is not
1467 * NULL, as we need it to establish whether a cooperator exists.
1468 */
1469check_scheduled:
1470 new_bfqq = bfq_close_cooperator(bfqd, bfqq,
1471 bfq_io_struct_pos(io_struct, request));
1472 if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq))
1473 return bfq_setup_merge(bfqq, new_bfqq);
1474
1475 return NULL;
1476}
1477
1478static inline void
1479bfq_bfqq_save_state(struct bfq_queue *bfqq)
1480{
1481 /*
1482 * If bfqq->bic == NULL, the queue is already shared or its requests
1483 * have already been redirected to a shared queue; both idle window
1484 * and weight raising state have already been saved. Do nothing.
1485 */
1486 if (bfqq->bic == NULL)
1487 return;
1488 if (bfqq->bic->wr_time_left)
1489 /*
1490 * This is the queue of a just-started process, and would
1491 * deserve weight raising: we set wr_time_left to the full
1492 * weight-raising duration to trigger weight-raising when
1493 * and if the queue is split and the first request of the
1494 * queue is enqueued.
1495 */
1496 bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd);
1497 else if (bfqq->wr_coeff > 1) {
1498 unsigned long wr_duration =
1499 jiffies - bfqq->last_wr_start_finish;
1500 /*
1501 * It may happen that a queue's weight raising period lasts
1502 * longer than its wr_cur_max_time, as weight raising is
1503 * handled only when a request is enqueued or dispatched (it
1504 * does not use any timer). If the weight raising period is
1505 * about to end, don't save it.
1506 */
1507 if (bfqq->wr_cur_max_time <= wr_duration)
1508 bfqq->bic->wr_time_left = 0;
1509 else
1510 bfqq->bic->wr_time_left =
1511 bfqq->wr_cur_max_time - wr_duration;
1512 /*
1513 * The bfq_queue is becoming shared or the requests of the
1514 * process owning the queue are being redirected to a shared
1515 * queue. Stop the weight raising period of the queue, as in
1516 * both cases it should not be owned by an interactive or
1517 * soft real-time application.
1518 */
1519 bfq_bfqq_end_wr(bfqq);
1520 } else
1521 bfqq->bic->wr_time_left = 0;
1522 bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
1523 bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
1524 bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
1525 bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
1526 bfqq->bic->cooperations++;
1527 bfqq->bic->failed_cooperations = 0;
1528}
1529
1530static inline void
1531bfq_get_bic_reference(struct bfq_queue *bfqq)
1532{
1533 /*
1534 * If bfqq->bic has a non-NULL value, the bic to which it belongs
1535 * is about to begin using a shared bfq_queue.
1536 */
1537 if (bfqq->bic)
1538 atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
1539}
1540
1541static void
1542bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
1543 struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
1544{
1545 bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
1546 (long unsigned)new_bfqq->pid);
1547 /* Save weight raising and idle window of the merged queues */
1548 bfq_bfqq_save_state(bfqq);
1549 bfq_bfqq_save_state(new_bfqq);
1550 if (bfq_bfqq_IO_bound(bfqq))
1551 bfq_mark_bfqq_IO_bound(new_bfqq);
1552 bfq_clear_bfqq_IO_bound(bfqq);
1553 /*
1554 * Grab a reference to the bic, to prevent it from being destroyed
1555 * before being possibly touched by a bfq_split_bfqq().
1556 */
1557 bfq_get_bic_reference(bfqq);
1558 bfq_get_bic_reference(new_bfqq);
1559 /*
1560 * Merge queues (that is, let bic redirect its requests to new_bfqq)
1561 */
1562 bic_set_bfqq(bic, new_bfqq, 1);
1563 bfq_mark_bfqq_coop(new_bfqq);
1564 /*
1565 * new_bfqq now belongs to at least two bics (it is a shared queue):
1566 * set new_bfqq->bic to NULL. bfqq either:
1567 * - does not belong to any bic any more, and hence bfqq->bic must
1568 * be set to NULL, or
1569 * - is a queue whose owning bics have already been redirected to a
1570 * different queue, hence the queue is destined to not belong to
1571 * any bic soon and bfqq->bic is already NULL (therefore the next
1572 * assignment causes no harm).
1573 */
1574 new_bfqq->bic = NULL;
1575 bfqq->bic = NULL;
1576 bfq_put_queue(bfqq);
1577}
1578
1579static inline void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq)
1580{
1581 struct bfq_io_cq *bic = bfqq->bic;
1582 struct bfq_data *bfqd = bfqq->bfqd;
1583
1584 if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) {
1585 bic->failed_cooperations++;
1586 if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations)
1587 bic->cooperations = 0;
1588 }
1589}
1590
1591static int bfq_allow_merge(struct request_queue *q, struct request *rq,
1592 struct bio *bio)
1593{
1594 struct bfq_data *bfqd = q->elevator->elevator_data;
1595 struct bfq_io_cq *bic;
1596 struct bfq_queue *bfqq, *new_bfqq;
1597
1598 /*
1599 * Disallow merge of a sync bio into an async request.
1600 */
1601 if (bfq_bio_sync(bio) && !rq_is_sync(rq))
1602 return 0;
1603
1604 /*
1605 * Lookup the bfqq that this bio will be queued with. Allow
1606 * merge only if rq is queued there.
1607 * Queue lock is held here.
1608 */
1609 bic = bfq_bic_lookup(bfqd, current->io_context);
1610 if (bic == NULL)
1611 return 0;
1612
1613 bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
Mauro Andreolini0b335882015-06-07 02:34:22 +02001614 /*
1615 * We take advantage of this function to perform an early merge
1616 * of the queues of possible cooperating processes.
1617 */
1618 if (bfqq != NULL) {
1619 new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
1620 if (new_bfqq != NULL) {
1621 bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
1622 /*
1623 * If we get here, the bio will be queued in the
1624 * shared queue, i.e., new_bfqq, so use new_bfqq
1625 * to decide whether bio and rq can be merged.
1626 */
1627 bfqq = new_bfqq;
1628 } else
1629 bfq_bfqq_increase_failed_cooperations(bfqq);
1630 }
1631
Paolo Valente70f28712013-05-09 19:10:02 +02001632 return bfqq == RQ_BFQQ(rq);
1633}
1634
1635static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
1636 struct bfq_queue *bfqq)
1637{
1638 if (bfqq != NULL) {
1639 bfq_mark_bfqq_must_alloc(bfqq);
1640 bfq_mark_bfqq_budget_new(bfqq);
1641 bfq_clear_bfqq_fifo_expire(bfqq);
1642
1643 bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
1644
1645 bfq_log_bfqq(bfqd, bfqq,
1646 "set_in_service_queue, cur-budget = %lu",
1647 bfqq->entity.budget);
1648 }
1649
1650 bfqd->in_service_queue = bfqq;
1651}
1652
1653/*
1654 * Get and set a new queue for service.
1655 */
Mauro Andreolini0b335882015-06-07 02:34:22 +02001656static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
Paolo Valente70f28712013-05-09 19:10:02 +02001657{
Mauro Andreolini0b335882015-06-07 02:34:22 +02001658 struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
Paolo Valente70f28712013-05-09 19:10:02 +02001659
1660 __bfq_set_in_service_queue(bfqd, bfqq);
1661 return bfqq;
1662}
1663
Paolo Valente70f28712013-05-09 19:10:02 +02001664/*
1665 * If enough samples have been computed, return the current max budget
1666 * stored in bfqd, which is dynamically updated according to the
1667 * estimated disk peak rate; otherwise return the default max budget
1668 */
1669static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
1670{
1671 if (bfqd->budgets_assigned < 194)
1672 return bfq_default_max_budget;
1673 else
1674 return bfqd->bfq_max_budget;
1675}
1676
1677/*
1678 * Return min budget, which is a fraction of the current or default
1679 * max budget (trying with 1/32)
1680 */
1681static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
1682{
1683 if (bfqd->budgets_assigned < 194)
1684 return bfq_default_max_budget / 32;
1685 else
1686 return bfqd->bfq_max_budget / 32;
1687}
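/*
 * About the 194 threshold used here and in bfq_max_budget(): the counter
 * bfqd->budgets_assigned is updated in __bfq_set_in_service_queue() as
 * (x * 7 + 256) / 8, so it converges to 256 and, starting from 0, crosses
 * 194 after roughly a dozen budget assignments. Only from that point on is
 * the autotuned bfqd->bfq_max_budget considered reliable enough to replace
 * bfq_default_max_budget.
 */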
1688
1689static void bfq_arm_slice_timer(struct bfq_data *bfqd)
1690{
1691 struct bfq_queue *bfqq = bfqd->in_service_queue;
1692 struct bfq_io_cq *bic;
1693 unsigned long sl;
1694
1695 BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
1696
1697 /* Processes have exited, don't wait. */
1698 bic = bfqd->in_service_bic;
1699 if (bic == NULL || atomic_read(&bic->icq.ioc->nr_tasks) == 0)
1700 return;
1701
1702 bfq_mark_bfqq_wait_request(bfqq);
1703
1704 /*
1705 * We don't want to idle for seeks, but we do want to allow
1706 * fair distribution of slice time for a process doing back-to-back
1707	 * seeks. So allow a little bit of time for it to submit a new rq.
1708 *
1709 * To prevent processes with (partly) seeky workloads from
1710 * being too ill-treated, grant them a small fraction of the
1711 * assigned budget before reducing the waiting time to
1712 * BFQ_MIN_TT. This happened to help reduce latency.
1713 */
1714 sl = bfqd->bfq_slice_idle;
1715 /*
1716 * Unless the queue is being weight-raised or the scenario is
1717 * asymmetric, grant only minimum idle time if the queue either
1718 * has been seeky for long enough or has already proved to be
1719 * constantly seeky.
1720 */
1721 if (bfq_sample_valid(bfqq->seek_samples) &&
1722 ((BFQQ_SEEKY(bfqq) && bfqq->entity.service >
1723 bfq_max_budget(bfqq->bfqd) / 8) ||
1724 bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 &&
1725 symmetric_scenario)
1726 sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
1727 else if (bfqq->wr_coeff > 1)
1728 sl = sl * 3;
1729 bfqd->last_idling_start = ktime_get();
1730 mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
1731 bfq_log(bfqd, "arm idle: %u/%u ms",
1732 jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
1733}
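/*
 * Rough example of the resulting idle times, assuming the default
 * bfq_slice_idle of 8 ms and BFQ_MIN_TT of 2 ms (actual values may differ):
 * a weight-raised queue waits up to 3 * 8 = 24 ms for its next request, a
 * non-raised queue normally waits 8 ms, and a non-raised queue that has
 * proved to be (constantly) seeky, in a symmetric scenario, waits only
 * about 2 ms before being expired.
 */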
1734
1735/*
1736 * Set the maximum time for the in-service queue to consume its
1737 * budget. This prevents seeky processes from lowering the disk
1738 * throughput (always guaranteed with a time slice scheme as in CFQ).
1739 */
1740static void bfq_set_budget_timeout(struct bfq_data *bfqd)
1741{
1742 struct bfq_queue *bfqq = bfqd->in_service_queue;
1743 unsigned int timeout_coeff;
1744 if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time)
1745 timeout_coeff = 1;
1746 else
1747 timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
1748
1749 bfqd->last_budget_start = ktime_get();
1750
1751 bfq_clear_bfqq_budget_new(bfqq);
1752 bfqq->budget_timeout = jiffies +
1753 bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
1754
1755 bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
1756 jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
1757 timeout_coeff));
1758}
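/*
 * Example of the scaling above (illustrative values): with a sync budget
 * timeout of, say, 125 ms, a queue whose weight has been raised by a factor
 * of, say, 10 for interactivity gets a budget timeout of about 1250 ms,
 * whereas a queue in the soft real-time raising state
 * (wr_cur_max_time == bfq_wr_rt_max_time) keeps the plain 125 ms.
 */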
1759
1760/*
1761 * Move request from internal lists to the request queue dispatch list.
1762 */
1763static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
1764{
1765 struct bfq_data *bfqd = q->elevator->elevator_data;
1766 struct bfq_queue *bfqq = RQ_BFQQ(rq);
1767
1768 /*
1769 * For consistency, the next instruction should have been executed
1770 * after removing the request from the queue and dispatching it.
1771 * We execute instead this instruction before bfq_remove_request()
1772 * (and hence introduce a temporary inconsistency), for efficiency.
1773	 * In fact, in a forced_dispatch, this prevents two counters related
1774	 * to bfqq->dispatched from being uselessly decremented if bfqq
1775	 * is not in service, and then incremented again after
1776	 * incrementing bfqq->dispatched.
1777 */
1778 bfqq->dispatched++;
1779 bfq_remove_request(rq);
1780 elv_dispatch_sort(q, rq);
1781
1782 if (bfq_bfqq_sync(bfqq))
1783 bfqd->sync_flight++;
1784}
1785
1786/*
1787 * Return expired entry, or NULL to just start from scratch in rbtree.
1788 */
1789static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
1790{
1791 struct request *rq = NULL;
1792
1793 if (bfq_bfqq_fifo_expire(bfqq))
1794 return NULL;
1795
1796 bfq_mark_bfqq_fifo_expire(bfqq);
1797
1798 if (list_empty(&bfqq->fifo))
1799 return NULL;
1800
1801 rq = rq_entry_fifo(bfqq->fifo.next);
1802
1803 if (time_before(jiffies, rq_fifo_time(rq)))
1804 return NULL;
1805
1806 return rq;
1807}
1808
Paolo Valente70f28712013-05-09 19:10:02 +02001809static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
1810{
1811 struct bfq_entity *entity = &bfqq->entity;
1812 return entity->budget - entity->service;
1813}
1814
1815static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1816{
1817 BUG_ON(bfqq != bfqd->in_service_queue);
1818
1819 __bfq_bfqd_reset_in_service(bfqd);
1820
1821 /*
1822 * If this bfqq is shared between multiple processes, check
1823 * to make sure that those processes are still issuing I/Os
1824 * within the mean seek distance. If not, it may be time to
1825 * break the queues apart again.
1826 */
1827 if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
1828 bfq_mark_bfqq_split_coop(bfqq);
1829
1830 if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
1831 /*
1832 * Overloading budget_timeout field to store the time
1833 * at which the queue remains with no backlog; used by
1834 * the weight-raising mechanism.
1835 */
1836 bfqq->budget_timeout = jiffies;
1837 bfq_del_bfqq_busy(bfqd, bfqq, 1);
1838 } else {
1839 bfq_activate_bfqq(bfqd, bfqq);
1840 /*
1841 * Resort priority tree of potential close cooperators.
1842 */
1843 bfq_rq_pos_tree_add(bfqd, bfqq);
1844 }
1845}
1846
1847/**
1848 * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
1849 * @bfqd: device data.
1850 * @bfqq: queue to update.
1851 * @reason: reason for expiration.
1852 *
1853 * Handle the feedback on @bfqq budget. See the body for detailed
1854 * comments.
1855 */
1856static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
1857 struct bfq_queue *bfqq,
1858 enum bfqq_expiration reason)
1859{
1860 struct request *next_rq;
1861 unsigned long budget, min_budget;
1862
1863 budget = bfqq->max_budget;
1864 min_budget = bfq_min_budget(bfqd);
1865
1866 BUG_ON(bfqq != bfqd->in_service_queue);
1867
1868 bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
1869 bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
1870 bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
1871 budget, bfq_min_budget(bfqd));
1872 bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
1873 bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
1874
1875 if (bfq_bfqq_sync(bfqq)) {
1876 switch (reason) {
1877 /*
1878 * Caveat: in all the following cases we trade latency
1879 * for throughput.
1880 */
1881 case BFQ_BFQQ_TOO_IDLE:
1882 /*
1883 * This is the only case where we may reduce
1884 * the budget: if there is no request of the
1885 * process still waiting for completion, then
1886 * we assume (tentatively) that the timer has
1887 * expired because the batch of requests of
1888 * the process could have been served with a
1889 * smaller budget. Hence, betting that
1890		 * the process will behave in the same way when it
1891 * becomes backlogged again, we reduce its
1892 * next budget. As long as we guess right,
1893 * this budget cut reduces the latency
1894 * experienced by the process.
1895 *
1896 * However, if there are still outstanding
1897 * requests, then the process may have not yet
1898 * issued its next request just because it is
1899 * still waiting for the completion of some of
1900 * the still outstanding ones. So in this
1901 * subcase we do not reduce its budget, on the
1902 * contrary we increase it to possibly boost
1903 * the throughput, as discussed in the
1904 * comments to the BUDGET_TIMEOUT case.
1905 */
1906 if (bfqq->dispatched > 0) /* still outstanding reqs */
1907 budget = min(budget * 2, bfqd->bfq_max_budget);
1908 else {
1909 if (budget > 5 * min_budget)
1910 budget -= 4 * min_budget;
1911 else
1912 budget = min_budget;
1913 }
1914 break;
1915 case BFQ_BFQQ_BUDGET_TIMEOUT:
1916 /*
1917 * We double the budget here because: 1) it
1918 * gives the chance to boost the throughput if
1919 * this is not a seeky process (which may have
1920 * bumped into this timeout because of, e.g.,
1921 * ZBR), 2) together with charge_full_budget
1922 * it helps give seeky processes higher
1923 * timestamps, and hence be served less
1924 * frequently.
1925 */
1926 budget = min(budget * 2, bfqd->bfq_max_budget);
1927 break;
1928 case BFQ_BFQQ_BUDGET_EXHAUSTED:
1929 /*
1930 * The process still has backlog, and did not
1931 * let either the budget timeout or the disk
1932 * idling timeout expire. Hence it is not
1933 * seeky, has a short thinktime and may be
1934 * happy with a higher budget too. So
1935 * definitely increase the budget of this good
1936 * candidate to boost the disk throughput.
1937 */
1938 budget = min(budget * 4, bfqd->bfq_max_budget);
1939 break;
1940 case BFQ_BFQQ_NO_MORE_REQUESTS:
1941 /*
1942 * Leave the budget unchanged.
1943 */
1944 default:
1945 return;
1946 }
1947 } else /* async queue */
1948		/* async queues always get the maximum possible budget
1949 * (their ability to dispatch is limited by
1950 * @bfqd->bfq_max_budget_async_rq).
1951 */
1952 budget = bfqd->bfq_max_budget;
1953
1954 bfqq->max_budget = budget;
1955
1956 if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
1957 bfqq->max_budget > bfqd->bfq_max_budget)
1958 bfqq->max_budget = bfqd->bfq_max_budget;
1959
1960 /*
1961 * Make sure that we have enough budget for the next request.
1962 * Since the finish time of the bfqq must be kept in sync with
1963 * the budget, be sure to call __bfq_bfqq_expire() after the
1964 * update.
1965 */
1966 next_rq = bfqq->next_rq;
1967 if (next_rq != NULL)
1968 bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
1969 bfq_serv_to_charge(next_rq, bfqq));
1970 else
1971 bfqq->entity.budget = bfqq->max_budget;
1972
1973 bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
1974 next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
1975 bfqq->entity.budget);
1976}
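/*
 * Worked example of the feedback above (illustrative numbers): assume
 * max_budget has settled at 16384 sectors, so min_budget is 512. A sync
 * queue expired for BFQ_BFQQ_TOO_IDLE with no request in flight and a
 * current max_budget of 4096 gets 4096 - 4 * 512 = 2048; the same queue
 * expired for BFQ_BFQQ_BUDGET_TIMEOUT would get min(2 * 4096, 16384) = 8192,
 * and for BFQ_BFQQ_BUDGET_EXHAUSTED min(4 * 4096, 16384) = 16384.
 */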
1977
1978static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
1979{
1980 unsigned long max_budget;
1981
1982 /*
1983 * The max_budget calculated when autotuning is equal to the
1984	 * number of sectors transferred in timeout_sync at the
1985 * estimated peak rate.
1986 */
1987 max_budget = (unsigned long)(peak_rate * 1000 *
1988 timeout >> BFQ_RATE_SHIFT);
1989
1990 return max_budget;
1991}
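/*
 * Numerical example for the formula above (example figures only):
 * peak_rate stores sectors/usec in fixed point (<< BFQ_RATE_SHIFT). A
 * device sustaining about 100 MiB/s transfers roughly 204800 sectors/s,
 * i.e., about 0.2048 sectors/usec; with a sync timeout of 125 ms the
 * autotuned max_budget then becomes roughly 0.2048 * 125000 = 25600
 * sectors, i.e., 12.5 MiB per budget.
 */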
1992
1993/*
1994 * In addition to updating the peak rate, checks whether the process
1995 * is "slow", and returns 1 if so. This slow flag is used, in addition
1996 * to the budget timeout, to reduce the amount of service provided to
1997 * seeky processes, and hence reduce their chances to lower the
1998 * throughput. See the code for more details.
1999 */
2000static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2001 int compensate, enum bfqq_expiration reason)
2002{
2003 u64 bw, usecs, expected, timeout;
2004 ktime_t delta;
2005 int update = 0;
2006
2007 if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
2008 return 0;
2009
2010 if (compensate)
2011 delta = bfqd->last_idling_start;
2012 else
2013 delta = ktime_get();
2014 delta = ktime_sub(delta, bfqd->last_budget_start);
2015 usecs = ktime_to_us(delta);
2016
2017 /* Don't trust short/unrealistic values. */
2018 if (usecs < 100 || usecs >= LONG_MAX)
2019 return 0;
2020
2021 /*
2022 * Calculate the bandwidth for the last slice. We use a 64 bit
2023 * value to store the peak rate, in sectors per usec in fixed
2024 * point math. We do so to have enough precision in the estimate
2025 * and to avoid overflows.
2026 */
2027 bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
2028 do_div(bw, (unsigned long)usecs);
2029
2030 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
2031
2032 /*
2033 * Use only long (> 20ms) intervals to filter out spikes for
2034 * the peak rate estimation.
2035 */
2036 if (usecs > 20000) {
2037 if (bw > bfqd->peak_rate ||
2038 (!BFQQ_SEEKY(bfqq) &&
2039 reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
2040 bfq_log(bfqd, "measured bw =%llu", bw);
2041 /*
2042 * To smooth oscillations use a low-pass filter with
2043 * alpha=7/8, i.e.,
2044 * new_rate = (7/8) * old_rate + (1/8) * bw
2045 */
2046 do_div(bw, 8);
2047 if (bw == 0)
2048 return 0;
2049 bfqd->peak_rate *= 7;
2050 do_div(bfqd->peak_rate, 8);
2051 bfqd->peak_rate += bw;
2052 update = 1;
2053 bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
2054 }
2055
2056 update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
2057
2058 if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
2059 bfqd->peak_rate_samples++;
2060
2061 if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
2062 update) {
2063 int dev_type = blk_queue_nonrot(bfqd->queue);
2064 if (bfqd->bfq_user_max_budget == 0) {
2065 bfqd->bfq_max_budget =
2066 bfq_calc_max_budget(bfqd->peak_rate,
2067 timeout);
2068 bfq_log(bfqd, "new max_budget=%lu",
2069 bfqd->bfq_max_budget);
2070 }
2071 if (bfqd->device_speed == BFQ_BFQD_FAST &&
2072 bfqd->peak_rate < device_speed_thresh[dev_type]) {
2073 bfqd->device_speed = BFQ_BFQD_SLOW;
2074 bfqd->RT_prod = R_slow[dev_type] *
2075 T_slow[dev_type];
2076 } else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
2077 bfqd->peak_rate > device_speed_thresh[dev_type]) {
2078 bfqd->device_speed = BFQ_BFQD_FAST;
2079 bfqd->RT_prod = R_fast[dev_type] *
2080 T_fast[dev_type];
2081 }
2082 }
2083 }
2084
2085 /*
2086	 * If the process has been served for too short a time
2087	 * interval to let its possible sequential accesses prevail over
2088	 * the initial seek time needed to move the disk head to the
2089	 * first sector it requested, then give the process a chance
2090	 * and for the moment return false.
2091 */
2092 if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
2093 return 0;
2094
2095 /*
2096 * A process is considered ``slow'' (i.e., seeky, so that we
2097 * cannot treat it fairly in the service domain, as it would
2098 * slow down too much the other processes) if, when a slice
2099 * ends for whatever reason, it has received service at a
2100 * rate that would not be high enough to complete the budget
2101 * before the budget timeout expiration.
2102 */
2103 expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
2104
2105 /*
2106	 * Caveat: processes doing I/O in the slower disk zones will
2107 * tend to be slow(er) even if not seeky. And the estimated
2108 * peak rate will actually be an average over the disk
2109 * surface. Hence, to not be too harsh with unlucky processes,
2110 * we keep a budget/3 margin of safety before declaring a
2111 * process slow.
2112 */
2113 return expected > (4 * bfqq->entity.budget) / 3;
2114}
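/*
 * Example of the low-pass filtering performed above (illustrative figures):
 * if the current peak_rate corresponds to 100 MiB/s and a valid sample bw
 * corresponds to 180 MiB/s, the new estimate becomes
 * 7/8 * 100 + 1/8 * 180 = 110 MiB/s, so a single spike moves the estimate
 * by at most 1/8 of the difference.
 */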
2115
2116/*
2117 * To be deemed as soft real-time, an application must meet two
2118 * requirements. First, the application must not require an average
2119 * bandwidth higher than the approximate bandwidth required to play back or
2120 * record a compressed high-definition video.
2121 * The next function is invoked on the completion of the last request of a
2122 * batch, to compute the next-start time instant, soft_rt_next_start, such
2123 * that, if the next request of the application does not arrive before
2124 * soft_rt_next_start, then the above requirement on the bandwidth is met.
2125 *
2126 * The second requirement is that the request pattern of the application is
2127 * isochronous, i.e., that, after issuing a request or a batch of requests,
2128 * the application stops issuing new requests until all its pending requests
2129 * have been completed. After that, the application may issue a new batch,
2130 * and so on.
2131 * For this reason the next function is invoked to compute
2132 * soft_rt_next_start only for applications that meet this requirement,
2133 * whereas soft_rt_next_start is set to infinity for applications that do
2134 * not.
2135 *
2136 * Unfortunately, even a greedy application may happen to behave in an
2137 * isochronous way if the CPU load is high. In fact, the application may
2138 * stop issuing requests while the CPUs are busy serving other processes,
2139 * then restart, then stop again for a while, and so on. In addition, if
2140 * the disk achieves a low enough throughput with the request pattern
2141 * issued by the application (e.g., because the request pattern is random
2142 * and/or the device is slow), then the application may meet the above
2143 * bandwidth requirement too. To prevent such a greedy application from
2144 * being deemed as soft real-time, a further rule is used in the computation
2145 * of soft_rt_next_start: soft_rt_next_start must be higher than the current
2146 * time plus the maximum time for which the arrival of a request is waited
2147 * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle.
2148 * This filters out greedy applications, as the latter issue instead their
2149 * next request as soon as possible after the last one has been completed
2150 * (in contrast, when a batch of requests is completed, a soft real-time
2151 * application spends some time processing data).
2152 *
2153 * Unfortunately, the last filter may easily generate false positives if
2154 * only bfqd->bfq_slice_idle is used as a reference time interval and one
2155 * or both the following cases occur:
2156 * 1) HZ is so low that the duration of a jiffy is comparable to or higher
2157 * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
2158 * HZ=100.
2159 * 2) jiffies, instead of increasing at a constant rate, may stop increasing
2160 * for a while, then suddenly 'jump' by several units to recover the lost
2161 * increments. This seems to happen, e.g., inside virtual machines.
2162 * To address this issue, we do not use as a reference time interval just
2163 * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
2164 * particular we add the minimum number of jiffies for which the filter
2165 * seems to be quite precise also in embedded systems and KVM/QEMU virtual
2166 * machines.
2167 */
2168static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
2169 struct bfq_queue *bfqq)
2170{
2171 return max(bfqq->last_idle_bklogged +
2172 HZ * bfqq->service_from_backlogged /
2173 bfqd->bfq_wr_max_softrt_rate,
2174 jiffies + bfqq->bfqd->bfq_slice_idle + 4);
2175}
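/*
 * Example for the computation above (the rate used is just an example value
 * for bfqd->bfq_wr_max_softrt_rate, in sectors/sec): if an application has
 * received 7000 sectors of service since it last became non-backlogged and
 * the threshold rate is 7000 sectors/sec, then soft_rt_next_start is (at
 * least) last_idle_bklogged + 1 s: a new batch issued earlier than that
 * would imply a bandwidth above the threshold, and the queue would not
 * qualify as soft real-time.
 */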
2176
2177/*
2178 * Return the largest-possible time instant such that, for as long as possible,
2179 * the current time will be lower than this time instant according to the macro
2180 * time_is_before_jiffies().
2181 */
2182static inline unsigned long bfq_infinity_from_now(unsigned long now)
2183{
2184 return now + ULONG_MAX / 2;
2185}
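/*
 * This works because jiffies comparisons use wrapping arithmetic:
 * time_is_before_jiffies(t) expands (through time_before()) to
 * (long)(t - jiffies) < 0, so a value ULONG_MAX / 2 ticks in the future
 * stays "not yet reached" for as long as the unsigned representation
 * allows, which is what "infinity" means here.
 */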
2186
2187/**
2188 * bfq_bfqq_expire - expire a queue.
2189 * @bfqd: device owning the queue.
2190 * @bfqq: the queue to expire.
2191 * @compensate: if true, compensate for the time spent idling.
2192 * @reason: the reason causing the expiration.
2193 *
2194 *
2195 * If the process associated to the queue is slow (i.e., seeky), or in
2196 * case of budget timeout, or, finally, if it is async, we
2197 * artificially charge it an entire budget (independently of the
2198 * actual service it received). As a consequence, the queue will get
2199 * higher timestamps than the correct ones upon reactivation, and
2200 * hence it will be rescheduled as if it had received more service
2201 * than what it actually received. In the end, this class of processes
2202 * will receive less service in proportion to how slowly they consume
2203 * their budgets (and hence how seriously they tend to lower the
2204 * throughput).
2205 *
2206 * In contrast, when a queue expires because it has been idling for
2207 * too long or because it exhausted its budget, we do not touch the
2208 * amount of service it has received. Hence when the queue will be
2209 * reactivated and its timestamps updated, the latter will be in sync
2210 * with the actual service received by the queue until expiration.
2211 *
2212 * Charging a full budget to the first type of queues and the exact
2213 * service to the others has the effect of using the WF2Q+ policy to
2214 * schedule the former on a timeslice basis, without violating the
2215 * service domain guarantees of the latter.
2216 */
2217static void bfq_bfqq_expire(struct bfq_data *bfqd,
2218 struct bfq_queue *bfqq,
2219 int compensate,
2220 enum bfqq_expiration reason)
2221{
2222 int slow;
2223 BUG_ON(bfqq != bfqd->in_service_queue);
2224
2225 /* Update disk peak rate for autotuning and check whether the
2226 * process is slow (see bfq_update_peak_rate).
2227 */
2228 slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
2229
2230 /*
2231	 * As explained above, 'punish' slow (i.e., seeky), timed-out
2232 * and async queues, to favor sequential sync workloads.
2233 *
2234 * Processes doing I/O in the slower disk zones will tend to be
2235 * slow(er) even if not seeky. Hence, since the estimated peak
2236 * rate is actually an average over the disk surface, these
2237 * processes may timeout just for bad luck. To avoid punishing
2238 * them we do not charge a full budget to a process that
2239 * succeeded in consuming at least 2/3 of its budget.
2240 */
2241 if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
2242 bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
2243 bfq_bfqq_charge_full_budget(bfqq);
2244
2245 bfqq->service_from_backlogged += bfqq->entity.service;
2246
2247 if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
2248 !bfq_bfqq_constantly_seeky(bfqq)) {
2249 bfq_mark_bfqq_constantly_seeky(bfqq);
2250 if (!blk_queue_nonrot(bfqd->queue))
2251 bfqd->const_seeky_busy_in_flight_queues++;
2252 }
2253
2254 if (reason == BFQ_BFQQ_TOO_IDLE &&
2255	    bfqq->entity.service <= 2 * bfqq->entity.budget / 10)
2256 bfq_clear_bfqq_IO_bound(bfqq);
2257
2258 if (bfqd->low_latency && bfqq->wr_coeff == 1)
2259 bfqq->last_wr_start_finish = jiffies;
2260
2261 if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&
2262 RB_EMPTY_ROOT(&bfqq->sort_list)) {
2263 /*
2264 * If we get here, and there are no outstanding requests,
2265 * then the request pattern is isochronous (see the comments
2266 * to the function bfq_bfqq_softrt_next_start()). Hence we
2267 * can compute soft_rt_next_start. If, instead, the queue
2268 * still has outstanding requests, then we have to wait
2269 * for the completion of all the outstanding requests to
2270 * discover whether the request pattern is actually
2271 * isochronous.
2272 */
2273 if (bfqq->dispatched == 0)
2274 bfqq->soft_rt_next_start =
2275 bfq_bfqq_softrt_next_start(bfqd, bfqq);
2276 else {
2277 /*
2278 * The application is still waiting for the
2279 * completion of one or more requests:
2280 * prevent it from possibly being incorrectly
2281 * deemed as soft real-time by setting its
2282 * soft_rt_next_start to infinity. In fact,
2283 * without this assignment, the application
2284 * would be incorrectly deemed as soft
2285 * real-time if:
2286 * 1) it issued a new request before the
2287 * completion of all its in-flight
2288 * requests, and
2289 * 2) at that time, its soft_rt_next_start
2290 * happened to be in the past.
2291 */
2292 bfqq->soft_rt_next_start =
2293 bfq_infinity_from_now(jiffies);
2294 /*
2295 * Schedule an update of soft_rt_next_start to when
2296 * the task may be discovered to be isochronous.
2297 */
2298 bfq_mark_bfqq_softrt_update(bfqq);
2299 }
2300 }
2301
2302 bfq_log_bfqq(bfqd, bfqq,
2303 "expire (%d, slow %d, num_disp %d, idle_win %d)", reason,
2304 slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
2305
2306 /*
2307 * Increase, decrease or leave budget unchanged according to
2308 * reason.
2309 */
2310 __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
2311 __bfq_bfqq_expire(bfqd, bfqq);
2312}
2313
2314/*
2315 * Budget timeout is not implemented through a dedicated timer, but
2316 * just checked on request arrivals and completions, as well as on
2317 * idle timer expirations.
2318 */
2319static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
2320{
2321 if (bfq_bfqq_budget_new(bfqq) ||
2322 time_before(jiffies, bfqq->budget_timeout))
2323 return 0;
2324 return 1;
2325}
2326
2327/*
2328 * If we expire a queue that is waiting for the arrival of a new
2329 * request, we may prevent the fictitious timestamp back-shifting that
2330 * allows the guarantees of the queue to be preserved (see [1] for
2331 * this tricky aspect). Hence we return true only if this condition
2332 * does not hold, or if the queue is slow enough to deserve only to be
2333 * kicked off for preserving a high throughput.
2334 */
2335static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
2336{
2337 bfq_log_bfqq(bfqq->bfqd, bfqq,
2338 "may_budget_timeout: wait_request %d left %d timeout %d",
2339 bfq_bfqq_wait_request(bfqq),
2340 bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
2341 bfq_bfqq_budget_timeout(bfqq));
2342
2343 return (!bfq_bfqq_wait_request(bfqq) ||
2344 bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
2345 &&
2346 bfq_bfqq_budget_timeout(bfqq);
2347}
2348
2349/*
2350 * Device idling is allowed only for the queues for which this function
2351 * returns true. For this reason, the return value of this function plays a
2352 * critical role for both throughput boosting and service guarantees. The
2353 * return value is computed through a logical expression. In this rather
2354 * long comment, we try to briefly describe all the details and motivations
2355 * behind the components of this logical expression.
2356 *
2357 * First, the expression is false if bfqq is not sync, or if: bfqq happened
2358 * to become active during a large burst of queue activations, and the
2359 * pattern of requests bfqq contains boosts the throughput if bfqq is
2360 * expired. In fact, queues that became active during a large burst benefit
2361 * only from throughput, as discussed in the comments to bfq_handle_burst.
2362 * In this respect, expiring bfqq certainly boosts the throughput on NCQ-
2363 * capable flash-based devices, whereas, on rotational devices, it boosts
2364 * the throughput only if bfqq contains random requests.
2365 *
2366 * On the opposite end, if (a) bfqq is sync, (b) the above burst-related
2367 * condition does not hold, and (c) bfqq is being weight-raised, then the
2368 * expression always evaluates to true, as device idling is instrumental
2369 * for preserving low-latency guarantees (see [1]). If, instead, conditions
2370 * (a) and (b) do hold, but (c) does not, then the expression evaluates to
2371 * true only if: (1) bfqq is I/O-bound and has a non-null idle window, and
2372 * (2) at least one of the following two conditions holds.
2373 * The first condition is that the device is not performing NCQ, because
2374 * idling the device most certainly boosts the throughput if this condition
2375 * holds and bfqq is I/O-bound and has been granted a non-null idle window.
2376 * The second compound condition is made of the logical AND of two components.
2377 *
2378 * The first component is true only if there is no weight-raised busy
2379 * queue. This guarantees that the device is not idled for a sync non-
2380 * weight-raised queue when there are busy weight-raised queues. The former
2381 * is then expired immediately if empty. Combined with the timestamping
2382 * rules of BFQ (see [1] for details), this causes sync non-weight-raised
2383 * queues to get a lower number of requests served, and hence to ask for a
2384 * lower number of requests from the request pool, before the busy weight-
2385 * raised queues get served again.
2386 *
2387 * This is beneficial for the processes associated with weight-raised
2388 * queues, when the request pool is saturated (e.g., in the presence of
2389 * write hogs). In fact, if the processes associated with the other queues
2390 * ask for requests at a lower rate, then weight-raised processes have a
2391 * higher probability to get a request from the pool immediately (or at
2392 * least soon) when they need one. Hence they have a higher probability to
2393 * actually get a fraction of the disk throughput proportional to their
2394 * high weight. This is especially true with NCQ-capable drives, which
2395 * enqueue several requests in advance and further reorder internally-
2396 * queued requests.
2397 *
2398 * In the end, mistreating non-weight-raised queues when there are busy
2399 * weight-raised queues seems to mitigate starvation problems in the
2400 * presence of heavy write workloads and NCQ, and hence to guarantee a
2401 * higher application and system responsiveness in these hostile scenarios.
2402 *
2403 * If the first component of the compound condition is instead true, i.e.,
2404 * there is no weight-raised busy queue, then the second component of the
2405 * compound condition takes into account service-guarantee and throughput
2406 * issues related to NCQ (recall that the compound condition is evaluated
2407 * only if the device is detected as supporting NCQ).
2408 *
2409 * As for service guarantees, allowing the drive to enqueue more than one
2410 * request at a time, and hence delegating de facto final scheduling
2411 * decisions to the drive's internal scheduler, causes loss of control on
2412 * the actual request service order. In this respect, when the drive is
2413 * allowed to enqueue more than one request at a time, the service
2414 * distribution enforced by the drive's internal scheduler is likely to
2415 * coincide with the desired device-throughput distribution only in the
2416 * following, perfectly symmetric, scenario:
2417 * 1) all active queues have the same weight,
2418 * 2) all active groups at the same level in the groups tree have the same
2419 * weight,
2420 * 3) all active groups at the same level in the groups tree have the same
2421 * number of children.
2422 *
2423 * Even in such a scenario, sequential I/O may still receive a preferential
2424 * treatment, but this is not likely to be a big issue with flash-based
2425 * devices, because of their non-dramatic loss of throughput with random
2426 * I/O. Things do differ with HDDs, for which additional care is taken, as
2427 * explained after completing the discussion for flash-based devices.
2428 *
2429 * Unfortunately, keeping the necessary state for evaluating exactly the
2430 * above symmetry conditions would be quite complex and time-consuming.
2431 * Therefore BFQ evaluates instead the following stronger sub-conditions,
2432 * for which it is much easier to maintain the needed state:
2433 * 1) all active queues have the same weight,
2434 * 2) all active groups have the same weight,
2435 * 3) all active groups have at most one active child each.
2436 * In particular, the last two conditions are always true if hierarchical
2437 * support and the cgroups interface are not enabled, hence no state needs
2438 * to be maintained in this case.
2439 *
2440 * According to the above considerations, the second component of the
2441 * compound condition evaluates to true if any of the above symmetry
2442 * sub-conditions does not hold, or the device is not flash-based. Therefore,
2443 * if also the first component is true, then idling is allowed for a sync
2444 * queue. These are the only sub-conditions considered if the device is
2445 * flash-based, as, for such a device, it is sensible to force idling only
2446 * for service-guarantee issues. In fact, as for throughput, idling
2447 * NCQ-capable flash-based devices would not boost the throughput even
2448 * with sequential I/O; rather it would lower the throughput in proportion
2449 * to how fast the device is. In the end, (only) if all the three
2450 * sub-conditions hold and the device is flash-based, the compound
2451 * condition evaluates to false and therefore no idling is performed.
2452 *
2453 * As already said, things change with a rotational device, where idling
2454 * boosts the throughput with sequential I/O (even with NCQ). Hence, for
2455 * such a device the second component of the compound condition evaluates
2456 * to true also if the following additional sub-condition does not hold:
2457 * the queue is constantly seeky. Unfortunately, this different behavior
2458 * with respect to flash-based devices causes an additional asymmetry: if
2459 * some sync queues enjoy idling and some other sync queues do not, then
2460 * the latter get a low share of the device throughput, simply because the
2461 * former get many requests served after being set as in service, whereas
2462 * the latter do not. As a consequence, to guarantee the desired throughput
2463 * distribution, on HDDs the compound expression evaluates to true (and
2464 * hence device idling is performed) also if the following last symmetry
2465 * condition does not hold: no other queue is benefiting from idling. Also
2466 * this last condition is actually replaced with a simpler-to-maintain and
2467 * stronger condition: there is no busy queue which is not constantly seeky
2468 * (and hence may also benefit from idling).
2469 *
2470 * To sum up, when all the required symmetry and throughput-boosting
2471 * sub-conditions hold, the second component of the compound condition
2472 * evaluates to false, and hence no idling is performed. This helps to
2473 * keep the drives' internal queues full on NCQ-capable devices, and hence
2474 * to boost the throughput, without causing 'almost' any loss of service
2475 * guarantees. The 'almost' follows from the fact that, if the internal
2476 * queue of one such device is filled while all the sub-conditions hold,
2477 * but at some point in time some sub-condition ceases to hold, then it may
2478 * become impossible to let requests be served in the new desired order
2479 * until all the requests already queued in the device have been served.
2480 */
2481static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq)
2482{
2483 struct bfq_data *bfqd = bfqq->bfqd;
2484#define cond_for_seeky_on_ncq_hdd (bfq_bfqq_constantly_seeky(bfqq) && \
2485 bfqd->busy_in_flight_queues == \
2486 bfqd->const_seeky_busy_in_flight_queues)
2487
2488#define cond_for_expiring_in_burst (bfq_bfqq_in_large_burst(bfqq) && \
2489 bfqd->hw_tag && \
2490 (blk_queue_nonrot(bfqd->queue) || \
2491 bfq_bfqq_constantly_seeky(bfqq)))
2492
2493/*
2494 * Condition for expiring a non-weight-raised queue (and hence not idling
2495 * the device).
2496 */
2497#define cond_for_expiring_non_wr (bfqd->hw_tag && \
2498 (bfqd->wr_busy_queues > 0 || \
2499 (blk_queue_nonrot(bfqd->queue) || \
2500 cond_for_seeky_on_ncq_hdd)))
2501
2502 return bfq_bfqq_sync(bfqq) &&
2503 !cond_for_expiring_in_burst &&
2504 (bfqq->wr_coeff > 1 || !symmetric_scenario ||
2505 (bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_idle_window(bfqq) &&
2506 !cond_for_expiring_non_wr)
2507 );
2508}
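/*
 * Two concrete instances of the expression above (derived from the macros,
 * not additional conditions): a sync queue that is being weight-raised and
 * did not join a large burst makes the function return true, so the device
 * is idled for it regardless of NCQ or of the symmetry of the scenario;
 * conversely, a sync, non-weight-raised queue on an NCQ-capable flash-based
 * device (hw_tag set, blk_queue_nonrot() true) in a symmetric scenario
 * makes cond_for_expiring_non_wr true, so no idling is performed and the
 * queue is expired as soon as it becomes empty.
 */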
2509
2510/*
2511 * If the in-service queue is empty but sync, and the function
2512 * bfq_bfqq_must_not_expire returns true, then:
2513 * 1) the queue must remain in service and cannot be expired, and
2514 * 2) the disk must be idled to wait for the possible arrival of a new
2515 * request for the queue.
2516 * See the comments to the function bfq_bfqq_must_not_expire for the reasons
2517 * why performing device idling is the best choice to boost the throughput
2518 * and preserve service guarantees when bfq_bfqq_must_not_expire itself
2519 * returns true.
2520 */
2521static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
2522{
2523 struct bfq_data *bfqd = bfqq->bfqd;
2524
2525 return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
2526 bfq_bfqq_must_not_expire(bfqq);
2527}
2528
2529/*
2530 * Select a queue for service. If we have a current queue in service,
2531 * check whether to continue servicing it, or retrieve and set a new one.
2532 */
2533static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
2534{
Mauro Andreolini0b335882015-06-07 02:34:22 +02002535 struct bfq_queue *bfqq;
Paolo Valente70f28712013-05-09 19:10:02 +02002536 struct request *next_rq;
2537 enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
2538
2539 bfqq = bfqd->in_service_queue;
2540 if (bfqq == NULL)
2541 goto new_queue;
2542
2543 bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
2544
Paolo Valente70f28712013-05-09 19:10:02 +02002545 if (bfq_may_expire_for_budg_timeout(bfqq) &&
2546 !timer_pending(&bfqd->idle_slice_timer) &&
2547 !bfq_bfqq_must_idle(bfqq))
2548 goto expire;
2549
2550 next_rq = bfqq->next_rq;
2551 /*
2552 * If bfqq has requests queued and it has enough budget left to
2553 * serve them, keep the queue, otherwise expire it.
2554 */
2555 if (next_rq != NULL) {
2556 if (bfq_serv_to_charge(next_rq, bfqq) >
2557 bfq_bfqq_budget_left(bfqq)) {
2558 reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
2559 goto expire;
2560 } else {
2561 /*
2562 * The idle timer may be pending because we may
2563 * not disable disk idling even when a new request
2564 * arrives.
2565 */
2566 if (timer_pending(&bfqd->idle_slice_timer)) {
2567 /*
2568				 * If we get here: 1) at least one new request
2569 * has arrived but we have not disabled the
2570 * timer because the request was too small,
2571				 * and 2) the block layer has unplugged
2572 * the device, causing the dispatch to be
2573 * invoked.
2574 *
2575 * Since the device is unplugged, now the
2576 * requests are probably large enough to
2577 * provide a reasonable throughput.
2578 * So we disable idling.
2579 */
2580 bfq_clear_bfqq_wait_request(bfqq);
2581 del_timer(&bfqd->idle_slice_timer);
2582 }
Mauro Andreolini0b335882015-06-07 02:34:22 +02002583 goto keep_queue;
Paolo Valente70f28712013-05-09 19:10:02 +02002584 }
2585 }
2586
2587 /*
2588 * No requests pending. However, if the in-service queue is idling
2589 * for a new request, or has requests waiting for a completion and
2590 * may idle after their completion, then keep it anyway.
2591 */
Mauro Andreolini0b335882015-06-07 02:34:22 +02002592 if (timer_pending(&bfqd->idle_slice_timer) ||
2593 (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) {
Paolo Valente70f28712013-05-09 19:10:02 +02002594 bfqq = NULL;
2595 goto keep_queue;
Paolo Valente70f28712013-05-09 19:10:02 +02002596 }
2597
2598 reason = BFQ_BFQQ_NO_MORE_REQUESTS;
2599expire:
2600 bfq_bfqq_expire(bfqd, bfqq, 0, reason);
2601new_queue:
Mauro Andreolini0b335882015-06-07 02:34:22 +02002602 bfqq = bfq_set_in_service_queue(bfqd);
Paolo Valente70f28712013-05-09 19:10:02 +02002603 bfq_log(bfqd, "select_queue: new queue %d returned",
2604 bfqq != NULL ? bfqq->pid : 0);
2605keep_queue:
2606 return bfqq;
2607}
2608
Mauro Andreolini0b335882015-06-07 02:34:22 +02002609static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
Paolo Valente70f28712013-05-09 19:10:02 +02002610{
Mauro Andreolini0b335882015-06-07 02:34:22 +02002611 struct bfq_entity *entity = &bfqq->entity;
2612 if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */
Paolo Valente70f28712013-05-09 19:10:02 +02002613 bfq_log_bfqq(bfqd, bfqq,
2614 "raising period dur %u/%u msec, old coeff %u, w %d(%d)",
Mauro Andreolini0b335882015-06-07 02:34:22 +02002615 jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
Paolo Valente70f28712013-05-09 19:10:02 +02002616 jiffies_to_msecs(bfqq->wr_cur_max_time),
2617 bfqq->wr_coeff,
2618 bfqq->entity.weight, bfqq->entity.orig_weight);
2619
2620 BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=
2621 entity->orig_weight * bfqq->wr_coeff);
2622 if (entity->ioprio_changed)
2623 bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
Mauro Andreolini0b335882015-06-07 02:34:22 +02002624
Paolo Valente70f28712013-05-09 19:10:02 +02002625 /*
2626 * If the queue was activated in a burst, or
2627 * too much time has elapsed from the beginning
Mauro Andreolini0b335882015-06-07 02:34:22 +02002628 * of this weight-raising period, or the queue has
2629 * exceeded the acceptable number of cooperations,
2630 * then end weight raising.
Paolo Valente70f28712013-05-09 19:10:02 +02002631 */
2632 if (bfq_bfqq_in_large_burst(bfqq) ||
Mauro Andreolini0b335882015-06-07 02:34:22 +02002633 bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh ||
Paolo Valente70f28712013-05-09 19:10:02 +02002634 time_is_before_jiffies(bfqq->last_wr_start_finish +
2635 bfqq->wr_cur_max_time)) {
2636 bfqq->last_wr_start_finish = jiffies;
2637 bfq_log_bfqq(bfqd, bfqq,
2638 "wrais ending at %lu, rais_max_time %u",
2639 bfqq->last_wr_start_finish,
2640 jiffies_to_msecs(bfqq->wr_cur_max_time));
2641 bfq_bfqq_end_wr(bfqq);
Paolo Valente70f28712013-05-09 19:10:02 +02002642 }
2643 }
Mauro Andreolini0b335882015-06-07 02:34:22 +02002644 /* Update weight both if it must be raised and if it must be lowered */
2645 if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))
2646 __bfq_entity_update_weight_prio(
2647 bfq_entity_service_tree(entity),
2648 entity);
Paolo Valente70f28712013-05-09 19:10:02 +02002649}
2650
2651/*
2652 * Dispatch one request from bfqq, moving it to the request queue
2653 * dispatch list.
2654 */
2655static int bfq_dispatch_request(struct bfq_data *bfqd,
2656 struct bfq_queue *bfqq)
2657{
2658 int dispatched = 0;
2659 struct request *rq;
2660 unsigned long service_to_charge;
2661
2662 BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
2663
2664 /* Follow expired path, else get first next available. */
2665 rq = bfq_check_fifo(bfqq);
2666 if (rq == NULL)
2667 rq = bfqq->next_rq;
2668 service_to_charge = bfq_serv_to_charge(rq, bfqq);
2669
2670 if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
2671 /*
2672 * This may happen if the next rq is chosen in fifo order
2673 * instead of sector order. The budget is properly
2674 * dimensioned to be always sufficient to serve the next
2675 * request only if it is chosen in sector order. The reason
2676		 * is that it would be quite inefficient and of little use
2677 * to always make sure that the budget is large enough to
2678 * serve even the possible next rq in fifo order.
2679 * In fact, requests are seldom served in fifo order.
2680 *
2681 * Expire the queue for budget exhaustion, and make sure
2682 * that the next act_budget is enough to serve the next
2683 * request, even if it comes from the fifo expired path.
2684 */
2685 bfqq->next_rq = rq;
2686 /*
2687		 * Since this dispatch failed, make sure that
2688 * a new one will be performed
2689 */
2690 if (!bfqd->rq_in_driver)
2691 bfq_schedule_dispatch(bfqd);
2692 goto expire;
2693 }
2694
2695 /* Finally, insert request into driver dispatch list. */
2696 bfq_bfqq_served(bfqq, service_to_charge);
2697 bfq_dispatch_insert(bfqd->queue, rq);
2698
2699 bfq_update_wr_data(bfqd, bfqq);
2700
2701 bfq_log_bfqq(bfqd, bfqq,
2702 "dispatched %u sec req (%llu), budg left %lu",
2703 blk_rq_sectors(rq),
2704 (long long unsigned)blk_rq_pos(rq),
2705 bfq_bfqq_budget_left(bfqq));
2706
2707 dispatched++;
2708
2709 if (bfqd->in_service_bic == NULL) {
2710 atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
2711 bfqd->in_service_bic = RQ_BIC(rq);
2712 }
2713
2714 if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
2715 dispatched >= bfqd->bfq_max_budget_async_rq) ||
2716 bfq_class_idle(bfqq)))
2717 goto expire;
2718
2719 return dispatched;
2720
2721expire:
2722 bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
2723 return dispatched;
2724}
2725
2726static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
2727{
2728 int dispatched = 0;
2729
2730 while (bfqq->next_rq != NULL) {
2731 bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
2732 dispatched++;
2733 }
2734
2735 BUG_ON(!list_empty(&bfqq->fifo));
2736 return dispatched;
2737}
2738
2739/*
2740 * Drain our current requests.
2741 * Used for barriers and when switching io schedulers on-the-fly.
2742 */
2743static int bfq_forced_dispatch(struct bfq_data *bfqd)
2744{
2745 struct bfq_queue *bfqq, *n;
2746 struct bfq_service_tree *st;
2747 int dispatched = 0;
2748
2749 bfqq = bfqd->in_service_queue;
2750 if (bfqq != NULL)
2751 __bfq_bfqq_expire(bfqd, bfqq);
2752
2753 /*
2754 * Loop through classes, and be careful to leave the scheduler
2755 * in a consistent state, as feedback mechanisms and vtime
2756 * updates cannot be disabled during the process.
2757 */
2758 list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
2759 st = bfq_entity_service_tree(&bfqq->entity);
2760
2761 dispatched += __bfq_forced_dispatch_bfqq(bfqq);
2762 bfqq->max_budget = bfq_max_budget(bfqd);
2763
2764 bfq_forget_idle(st);
2765 }
2766
2767 BUG_ON(bfqd->busy_queues != 0);
2768
2769 return dispatched;
2770}
2771
2772static int bfq_dispatch_requests(struct request_queue *q, int force)
2773{
2774 struct bfq_data *bfqd = q->elevator->elevator_data;
2775 struct bfq_queue *bfqq;
2776 int max_dispatch;
2777
2778 bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
2779 if (bfqd->busy_queues == 0)
2780 return 0;
2781
2782 if (unlikely(force))
2783 return bfq_forced_dispatch(bfqd);
2784
2785 bfqq = bfq_select_queue(bfqd);
2786 if (bfqq == NULL)
2787 return 0;
2788
2789 if (bfq_class_idle(bfqq))
2790 max_dispatch = 1;
2791
2792 if (!bfq_bfqq_sync(bfqq))
2793 max_dispatch = bfqd->bfq_max_budget_async_rq;
2794
2795 if (!bfq_bfqq_sync(bfqq) && bfqq->dispatched >= max_dispatch) {
2796 if (bfqd->busy_queues > 1)
2797 return 0;
2798 if (bfqq->dispatched >= 4 * max_dispatch)
2799 return 0;
2800 }
2801
2802 if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
2803 return 0;
2804
2805 bfq_clear_bfqq_wait_request(bfqq);
2806 BUG_ON(timer_pending(&bfqd->idle_slice_timer));
2807
2808 if (!bfq_dispatch_request(bfqd, bfqq))
2809 return 0;
2810
2811 bfq_log_bfqq(bfqd, bfqq, "dispatched %s request",
2812 bfq_bfqq_sync(bfqq) ? "sync" : "async");
2813
2814 return 1;
2815}
2816
2817/*
2818 * Task holds one reference to the queue, dropped when task exits. Each rq
2819 * in-flight on this queue also holds a reference, dropped when rq is freed.
2820 *
2821 * Queue lock must be held here.
2822 */
2823static void bfq_put_queue(struct bfq_queue *bfqq)
2824{
2825 struct bfq_data *bfqd = bfqq->bfqd;
2826
2827 BUG_ON(atomic_read(&bfqq->ref) <= 0);
2828
2829 bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
2830 atomic_read(&bfqq->ref));
2831 if (!atomic_dec_and_test(&bfqq->ref))
2832 return;
2833
2834 BUG_ON(rb_first(&bfqq->sort_list) != NULL);
2835 BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
2836 BUG_ON(bfqq->entity.tree != NULL);
2837 BUG_ON(bfq_bfqq_busy(bfqq));
2838 BUG_ON(bfqd->in_service_queue == bfqq);
2839
2840 if (bfq_bfqq_sync(bfqq))
2841 /*
2842 * The fact that this queue is being destroyed does not
2843 * invalidate the fact that this queue may have been
2844 * activated during the current burst. As a consequence,
2845 * although the queue does not exist anymore, and hence
2846 * needs to be removed from the burst list if there,
2847		 * the burst size must not be decremented.
2848 */
2849 hlist_del_init(&bfqq->burst_list_node);
2850
2851 bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
2852
2853 kmem_cache_free(bfq_pool, bfqq);
2854}
2855
2856static void bfq_put_cooperator(struct bfq_queue *bfqq)
2857{
2858 struct bfq_queue *__bfqq, *next;
2859
2860 /*
2861 * If this queue was scheduled to merge with another queue, be
2862 * sure to drop the reference taken on that queue (and others in
2863 * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
2864 */
2865 __bfqq = bfqq->new_bfqq;
2866 while (__bfqq) {
2867 if (__bfqq == bfqq)
2868 break;
2869 next = __bfqq->new_bfqq;
2870 bfq_put_queue(__bfqq);
2871 __bfqq = next;
2872 }
2873}
2874
2875static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2876{
2877 if (bfqq == bfqd->in_service_queue) {
2878 __bfq_bfqq_expire(bfqd, bfqq);
2879 bfq_schedule_dispatch(bfqd);
2880 }
2881
2882 bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
2883 atomic_read(&bfqq->ref));
2884
2885 bfq_put_cooperator(bfqq);
2886
2887 bfq_put_queue(bfqq);
2888}
2889
2890static inline void bfq_init_icq(struct io_cq *icq)
2891{
2892 struct bfq_io_cq *bic = icq_to_bic(icq);
2893
2894 bic->ttime.last_end_request = jiffies;
Mauro Andreolini0b335882015-06-07 02:34:22 +02002895 /*
2896 * A newly created bic indicates that the process has just
2897 * started doing I/O, and is probably mapping into memory its
2898 * executable and libraries: it definitely needs weight raising.
2899 * There is however the possibility that the process performs,
2900 * for a while, I/O close to some other process. EQM intercepts
2901 * this behavior and may merge the queue corresponding to the
2902 * process with some other queue, BEFORE the weight of the queue
2903 * is raised. Merged queues are not weight-raised (they are assumed
2904 * to belong to processes that benefit only from high throughput).
2905 * If the merge is basically the consequence of an accident, then
2906 * the queue will be split soon and will get back its old weight.
2907 * It is then important to write down somewhere that this queue
2908 * does need weight raising, even if it did not make it to get its
2909 * weight raised before being merged. To this purpose, we overload
2910	 * the field wr_time_left and assign 1 to it, to mark the queue
2911 * as needing weight raising.
2912 */
2913 bic->wr_time_left = 1;
Paolo Valente70f28712013-05-09 19:10:02 +02002914}
2915
2916static void bfq_exit_icq(struct io_cq *icq)
2917{
2918 struct bfq_io_cq *bic = icq_to_bic(icq);
2919 struct bfq_data *bfqd = bic_to_bfqd(bic);
2920
2921 if (bic->bfqq[BLK_RW_ASYNC]) {
2922 bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
2923 bic->bfqq[BLK_RW_ASYNC] = NULL;
2924 }
2925
2926 if (bic->bfqq[BLK_RW_SYNC]) {
Mauro Andreolini0b335882015-06-07 02:34:22 +02002927 /*
2928 * If the bic is using a shared queue, put the reference
2929 * taken on the io_context when the bic started using a
2930 * shared bfq_queue.
2931 */
2932 if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))
2933 put_io_context(icq->ioc);
Paolo Valente70f28712013-05-09 19:10:02 +02002934 bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
2935 bic->bfqq[BLK_RW_SYNC] = NULL;
2936 }
2937}
2938
2939/*
2940 * Update the entity prio values; note that the new values will not
2941 * be used until the next (re)activation.
2942 */
2943static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
2944{
2945 struct task_struct *tsk = current;
2946 struct io_context *ioc = bic->icq.ioc;
2947 int ioprio_class;
2948
2949 ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
2950 switch (ioprio_class) {
2951 default:
2952 dev_err(bfqq->bfqd->queue->backing_dev_info.dev,
2953 "bfq: bad prio class %d\n", ioprio_class);
2954 case IOPRIO_CLASS_NONE:
2955 /*
2956 * No prio set, inherit CPU scheduling settings.
2957 */
2958 bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
2959 bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
2960 break;
2961 case IOPRIO_CLASS_RT:
2962 bfqq->entity.new_ioprio = task_ioprio(ioc);
2963 bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
2964 break;
2965 case IOPRIO_CLASS_BE:
2966 bfqq->entity.new_ioprio = task_ioprio(ioc);
2967 bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
2968 break;
2969 case IOPRIO_CLASS_IDLE:
2970 bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
2971 bfqq->entity.new_ioprio = 7;
2972 bfq_clear_bfqq_idle_window(bfqq);
2973 break;
2974 }
2975
2976 if (bfqq->entity.new_ioprio < 0 ||
2977 bfqq->entity.new_ioprio >= IOPRIO_BE_NR) {
2978 printk(KERN_CRIT "bfq_set_next_ioprio_data: new_ioprio %d\n",
2979 bfqq->entity.new_ioprio);
2980 BUG();
2981 }
2982
2983 bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->entity.new_ioprio);
2984 bfqq->entity.ioprio_changed = 1;
2985}
2986
2987static void bfq_check_ioprio_change(struct io_context *ioc,
2988 struct bfq_io_cq *bic)
2989{
2990 struct bfq_data *bfqd;
2991 struct bfq_queue *bfqq, *new_bfqq;
2992 struct bfq_group *bfqg;
2993 unsigned long uninitialized_var(flags);
2994 int ioprio = bic->icq.ioc->ioprio;
2995
2996 bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
2997 &flags);
2998 if (unlikely(bfqd == NULL))
2999 return;
3000
3001 bic->ioprio = ioprio;
3002
3003 bfqq = bic->bfqq[BLK_RW_ASYNC];
3004 if (bfqq != NULL) {
3005 bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
3006 sched_data);
3007 new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic->icq.ioc,
3008 GFP_ATOMIC);
3009 if (new_bfqq != NULL) {
3010 bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
3011 bfq_log_bfqq(bfqd, bfqq,
3012 "check_ioprio_change: bfqq %p %d",
3013 bfqq, atomic_read(&bfqq->ref));
3014 bfq_put_queue(bfqq);
3015 }
3016 }
3017
3018 bfqq = bic->bfqq[BLK_RW_SYNC];
3019 if (bfqq != NULL)
3020 bfq_set_next_ioprio_data(bfqq, bic);
3021
3022 bfq_put_bfqd_unlock(bfqd, &flags);
3023}
3024
3025static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3026 struct bfq_io_cq *bic, pid_t pid, int is_sync)
3027{
3028 RB_CLEAR_NODE(&bfqq->entity.rb_node);
3029 INIT_LIST_HEAD(&bfqq->fifo);
3030 INIT_HLIST_NODE(&bfqq->burst_list_node);
3031
3032 atomic_set(&bfqq->ref, 0);
3033 bfqq->bfqd = bfqd;
3034
3035 if (bic)
3036 bfq_set_next_ioprio_data(bfqq, bic);
3037
3038 if (is_sync) {
3039 if (!bfq_class_idle(bfqq))
3040 bfq_mark_bfqq_idle_window(bfqq);
3041 bfq_mark_bfqq_sync(bfqq);
3042 }
3043 bfq_mark_bfqq_IO_bound(bfqq);
3044
3045 /* Tentative initial value to trade off between thr and lat */
3046 bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
3047 bfqq->pid = pid;
3048
3049 bfqq->wr_coeff = 1;
3050 bfqq->last_wr_start_finish = 0;
3051 /*
3052 * Set to the value for which bfqq will not be deemed as
3053 * soft rt when it becomes backlogged.
3054 */
3055 bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies);
3056}
3057
3058static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
3059 struct bfq_group *bfqg,
3060 int is_sync,
3061 struct io_context *ioc,
3062 gfp_t gfp_mask)
3063{
3064 struct bfq_queue *bfqq, *new_bfqq = NULL;
3065 struct bfq_io_cq *bic;
3066
3067retry:
3068 bic = bfq_bic_lookup(bfqd, ioc);
3069 /* bic always exists here */
3070 bfqq = bic_to_bfqq(bic, is_sync);
3071
3072 /*
3073 * Always try a new alloc if we fall back to the OOM bfqq
3074 * originally, since it should just be a temporary situation.
3075 */
3076 if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
3077 bfqq = NULL;
3078 if (new_bfqq != NULL) {
3079 bfqq = new_bfqq;
3080 new_bfqq = NULL;
3081 } else if (gfp_mask & __GFP_WAIT) {
3082 spin_unlock_irq(bfqd->queue->queue_lock);
3083 new_bfqq = kmem_cache_alloc_node(bfq_pool,
3084 gfp_mask | __GFP_ZERO,
3085 bfqd->queue->node);
3086 spin_lock_irq(bfqd->queue->queue_lock);
3087 if (new_bfqq != NULL)
3088 goto retry;
3089 } else {
3090 bfqq = kmem_cache_alloc_node(bfq_pool,
3091 gfp_mask | __GFP_ZERO,
3092 bfqd->queue->node);
3093 }
3094
3095 if (bfqq != NULL) {
3096 bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
3097 is_sync);
3098 bfq_init_entity(&bfqq->entity, bfqg);
3099 bfq_log_bfqq(bfqd, bfqq, "allocated");
3100 } else {
3101 bfqq = &bfqd->oom_bfqq;
3102 bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
3103 }
3104 }
3105
3106 if (new_bfqq != NULL)
3107 kmem_cache_free(bfq_pool, new_bfqq);
3108
3109 return bfqq;
3110}
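/*
 * Allocation pattern used above: if the caller may sleep, the
 * queue_lock is dropped around kmem_cache_alloc_node() and the lookup
 * is retried afterwards, because another context may have installed a
 * queue for this bic in the meantime. If the allocation is not allowed
 * to sleep and fails, the request is parked on the per-device oom_bfqq,
 * which always exists, so the I/O can still make progress.
 */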
3111
3112static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
3113 struct bfq_group *bfqg,
3114 int ioprio_class, int ioprio)
3115{
3116 switch (ioprio_class) {
3117 case IOPRIO_CLASS_RT:
3118 return &bfqg->async_bfqq[0][ioprio];
3119 case IOPRIO_CLASS_BE:
3120 return &bfqg->async_bfqq[1][ioprio];
3121 case IOPRIO_CLASS_IDLE:
3122 return &bfqg->async_idle_bfqq;
3123 default:
3124 BUG();
3125 }
3126}
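/*
 * Layout of the async queue table used above: row 0 holds the RT
 * queues and row 1 the BE queues, one per ioprio level, while a single
 * queue serves the whole IDLE class.
 */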
3127
3128static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
3129 struct bfq_group *bfqg, int is_sync,
3130 struct io_context *ioc, gfp_t gfp_mask)
3131{
3132 const int ioprio = task_ioprio(ioc);
3133 const int ioprio_class = task_ioprio_class(ioc);
3134 struct bfq_queue **async_bfqq = NULL;
3135 struct bfq_queue *bfqq = NULL;
3136
3137 if (!is_sync) {
3138 async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
3139 ioprio);
3140 bfqq = *async_bfqq;
3141 }
3142
3143 if (bfqq == NULL)
3144 bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, ioc, gfp_mask);
3145
3146 /*
3147 * Pin the queue now that it's allocated, scheduler exit will
3148 * prune it.
3149 */
3150 if (!is_sync && *async_bfqq == NULL) {
3151 atomic_inc(&bfqq->ref);
3152 bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
3153 bfqq, atomic_read(&bfqq->ref));
3154 *async_bfqq = bfqq;
3155 }
3156
3157 atomic_inc(&bfqq->ref);
3158 bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
3159 atomic_read(&bfqq->ref));
3160 return bfqq;
3161}
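/*
 * Reference counting above: an async queue gets one reference for the
 * slot in the group's async table (released only when the scheduler
 * tears the group down) plus one reference for the caller; a sync queue
 * gets only the caller's reference.
 */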
3162
3163static void bfq_update_io_thinktime(struct bfq_data *bfqd,
3164 struct bfq_io_cq *bic)
3165{
3166 unsigned long elapsed = jiffies - bic->ttime.last_end_request;
3167 unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
3168
3169 bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
3170 bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
3171 bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) /
3172 bic->ttime.ttime_samples;
3173}
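/*
 * The update above is an exponentially weighted moving average with a
 * 7/8 decay factor, kept in fixed point: both ttime_samples and
 * ttime_total carry a scale factor of 256, which cancels out in the
 * final division
 *
 *   ttime_mean = (ttime_total + 128) / ttime_samples
 *
 * where the +128 merely rounds the result. Each new think time is
 * clamped to twice bfq_slice_idle, so one long pause cannot dominate
 * the mean.
 */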
3174
3175static void bfq_update_io_seektime(struct bfq_data *bfqd,
3176 struct bfq_queue *bfqq,
3177 struct request *rq)
3178{
3179 sector_t sdist;
3180 u64 total;
3181
3182 if (bfqq->last_request_pos < blk_rq_pos(rq))
3183 sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
3184 else
3185 sdist = bfqq->last_request_pos - blk_rq_pos(rq);
3186
3187 /*
3188 * Don't allow the seek distance to get too large from the
3189 * odd fragment, pagein, etc.
3190 */
3191 if (bfqq->seek_samples == 0) /* first request, not really a seek */
3192 sdist = 0;
3193 else if (bfqq->seek_samples <= 60) /* second & third seek */
3194 sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
3195 else
3196 sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
3197
3198 bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
3199 bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
3200 total = bfqq->seek_total + (bfqq->seek_samples/2);
3201 do_div(total, bfqq->seek_samples);
3202 bfqq->seek_mean = (sector_t)total;
3203
3204 bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
3205 (u64)bfqq->seek_mean);
3206}
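/*
 * Seek statistics follow the same fixed-point EWMA scheme as the think
 * time above: seek_samples and seek_total decay by 7/8 per request, and
 * seek_mean = seek_total / seek_samples, rounded by the +samples/2
 * term. The very first request is not counted as a seek, and later raw
 * distances are capped to roughly four times the current mean plus some
 * slack, so that a single huge seek (e.g. a far-away page-in) does not
 * distort the estimate.
 */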
3207
3208/*
3209 * Disable idle window if the process thinks too long or seeks so much that
3210 * it doesn't matter.
3211 */
3212static void bfq_update_idle_window(struct bfq_data *bfqd,
3213 struct bfq_queue *bfqq,
3214 struct bfq_io_cq *bic)
3215{
3216 int enable_idle;
3217
3218 /* Don't idle for async or idle io prio class. */
3219 if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
3220 return;
3221
Mauro Andreolini0b335882015-06-07 02:34:22 +02003222 /* Idle window just restored, statistics are meaningless. */
3223 if (bfq_bfqq_just_split(bfqq))
3224 return;
3225
Paolo Valente70f28712013-05-09 19:10:02 +02003226 enable_idle = bfq_bfqq_idle_window(bfqq);
3227
3228 if (atomic_read(&bic->icq.ioc->nr_tasks) == 0 ||
3229 bfqd->bfq_slice_idle == 0 ||
3230 (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
3231 bfqq->wr_coeff == 1))
3232 enable_idle = 0;
3233 else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
3234 if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
3235 bfqq->wr_coeff == 1)
3236 enable_idle = 0;
3237 else
3238 enable_idle = 1;
3239 }
3240 bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
3241 enable_idle);
3242
3243 if (enable_idle)
3244 bfq_mark_bfqq_idle_window(bfqq);
3245 else
3246 bfq_clear_bfqq_idle_window(bfqq);
3247}
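/*
 * Summary of the decision above: idling is disabled when no task is
 * attached to the queue's io_context any more, when slice idling is
 * administratively disabled (bfq_slice_idle == 0), or when the device
 * does internal queueing (hw_tag) and the queue is seeky and not being
 * weight-raised. If enough think-time samples exist, idling is also
 * disabled for queues whose mean think time exceeds bfq_slice_idle,
 * again unless the queue is weight-raised, in which case idling is kept
 * to preserve its latency guarantees.
 */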
3248
3249/*
3250 * Called when a new fs request (rq) is added to bfqq. Check if there's
3251 * something we should do about it.
3252 */
3253static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3254 struct request *rq)
3255{
3256 struct bfq_io_cq *bic = RQ_BIC(rq);
3257
3258 if (rq->cmd_flags & REQ_META)
3259 bfqq->meta_pending++;
3260
3261 bfq_update_io_thinktime(bfqd, bic);
3262 bfq_update_io_seektime(bfqd, bfqq, rq);
3263 if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) {
3264 bfq_clear_bfqq_constantly_seeky(bfqq);
3265 if (!blk_queue_nonrot(bfqd->queue)) {
3266 BUG_ON(!bfqd->const_seeky_busy_in_flight_queues);
3267 bfqd->const_seeky_busy_in_flight_queues--;
3268 }
3269 }
3270 if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
3271 !BFQQ_SEEKY(bfqq))
3272 bfq_update_idle_window(bfqd, bfqq, bic);
Mauro Andreolini0b335882015-06-07 02:34:22 +02003273 bfq_clear_bfqq_just_split(bfqq);
Paolo Valente70f28712013-05-09 19:10:02 +02003274
3275 bfq_log_bfqq(bfqd, bfqq,
3276 "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
3277 bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
3278 (long long unsigned)bfqq->seek_mean);
3279
3280 bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
3281
3282 if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
3283 int small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
3284 blk_rq_sectors(rq) < 32;
3285 int budget_timeout = bfq_bfqq_budget_timeout(bfqq);
3286
3287 /*
3288 * There is just this request queued: if the request
3289 * is small and the queue is not to be expired, then
3290 * just exit.
3291 *
3292 * In this way, if the disk is being idled to wait for
3293 * a new request from the in-service queue, we avoid
3294 * unplugging the device and committing the disk to serve
3295 * just a small request. On the contrary, we wait for
3296 * the block layer to decide when to unplug the device:
3297 * hopefully, new requests will be merged to this one
3298 * quickly, then the device will be unplugged and
3299 * larger requests will be dispatched.
3300 */
3301 if (small_req && !budget_timeout)
3302 return;
3303
3304 /*
3305 * A large enough request arrived, or the queue is to
3306 * be expired: in both cases disk idling is to be
3307 * stopped, so clear wait_request flag and reset
3308 * timer.
3309 */
3310 bfq_clear_bfqq_wait_request(bfqq);
3311 del_timer(&bfqd->idle_slice_timer);
3312
3313 /*
3314 * The queue is not empty, because a new request just
3315 * arrived. Hence we can safely expire the queue, in
3316 * case of budget timeout, without risking that the
3317 * timestamps of the queue are not updated correctly.
3318 * See [1] for more details.
3319 */
3320 if (budget_timeout)
3321 bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
3322
3323 /*
3324 * Let the request rip immediately, or let a new queue be
3325 * selected if bfqq has just been expired.
3326 */
3327 __blk_run_queue(bfqd->queue);
3328 }
3329}
3330
3331static void bfq_insert_request(struct request_queue *q, struct request *rq)
3332{
3333 struct bfq_data *bfqd = q->elevator->elevator_data;
Mauro Andreolini0b335882015-06-07 02:34:22 +02003334 struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
Paolo Valente70f28712013-05-09 19:10:02 +02003335
3336 assert_spin_locked(bfqd->queue->queue_lock);
3337
Mauro Andreolini0b335882015-06-07 02:34:22 +02003338 /*
3339 * An unplug may trigger a requeue of a request from the device
3340 * driver: make sure we are in process context while trying to
3341 * merge two bfq_queues.
3342 */
3343 if (!in_interrupt()) {
3344 new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
3345 if (new_bfqq != NULL) {
3346 if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
3347 new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
3348 /*
3349 * Release the request's reference to the old bfqq
3350 * and make sure one is taken to the shared queue.
3351 */
3352 new_bfqq->allocated[rq_data_dir(rq)]++;
3353 bfqq->allocated[rq_data_dir(rq)]--;
3354 atomic_inc(&new_bfqq->ref);
3355 bfq_put_queue(bfqq);
3356 if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
3357 bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
3358 bfqq, new_bfqq);
3359 rq->elv.priv[1] = new_bfqq;
3360 bfqq = new_bfqq;
3361 } else
3362 bfq_bfqq_increase_failed_cooperations(bfqq);
3363 }
3364
Paolo Valente70f28712013-05-09 19:10:02 +02003365 bfq_add_request(rq);
3366
Mauro Andreolini0b335882015-06-07 02:34:22 +02003367 /*
3368 * Here a newly-created bfq_queue has already started a weight-raising
3369 * period: clear raising_time_left to prevent bfq_bfqq_save_state()
3370 * from assigning it a full weight-raising period. See the detailed
3371 * comments about this field in bfq_init_icq().
3372 */
3373 if (bfqq->bic != NULL)
3374 bfqq->bic->wr_time_left = 0;
Paolo Valente70f28712013-05-09 19:10:02 +02003375 rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
3376 list_add_tail(&rq->queuelist, &bfqq->fifo);
3377
3378 bfq_rq_enqueued(bfqd, bfqq, rq);
3379}
3380
3381static void bfq_update_hw_tag(struct bfq_data *bfqd)
3382{
3383 bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
3384 bfqd->rq_in_driver);
3385
3386 if (bfqd->hw_tag == 1)
3387 return;
3388
3389 /*
3390	 * This sample is valid only if the number of outstanding requests
3391	 * is large enough to allow queueing behavior. Note that the sum
3392	 * is not exact, as it does not take deactivated requests into
3393	 * account.
3394 */
3395 if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
3396 return;
3397
3398 if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
3399 return;
3400
3401 bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
3402 bfqd->max_rq_in_driver = 0;
3403 bfqd->hw_tag_samples = 0;
3404}
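/*
 * hw_tag detection above: samples are taken only when enough requests
 * are outstanding (rq_in_driver + queued) for internal queueing to be
 * observable; after BFQ_HW_QUEUE_SAMPLES such samples, the device is
 * flagged as doing internal command queueing (e.g. NCQ) when the
 * maximum number of requests observed in the driver exceeds
 * BFQ_HW_QUEUE_THRESHOLD, and the counters are reset for the next
 * round.
 */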
3405
3406static void bfq_completed_request(struct request_queue *q, struct request *rq)
3407{
3408 struct bfq_queue *bfqq = RQ_BFQQ(rq);
3409 struct bfq_data *bfqd = bfqq->bfqd;
3410 bool sync = bfq_bfqq_sync(bfqq);
3411
3412 bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)",
3413 blk_rq_sectors(rq), sync);
3414
3415 bfq_update_hw_tag(bfqd);
3416
3417 BUG_ON(!bfqd->rq_in_driver);
3418 BUG_ON(!bfqq->dispatched);
3419 bfqd->rq_in_driver--;
3420 bfqq->dispatched--;
3421
3422 if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
3423 bfq_weights_tree_remove(bfqd, &bfqq->entity,
3424 &bfqd->queue_weights_tree);
3425 if (!blk_queue_nonrot(bfqd->queue)) {
3426 BUG_ON(!bfqd->busy_in_flight_queues);
3427 bfqd->busy_in_flight_queues--;
3428 if (bfq_bfqq_constantly_seeky(bfqq)) {
3429 BUG_ON(!bfqd->
3430 const_seeky_busy_in_flight_queues);
3431 bfqd->const_seeky_busy_in_flight_queues--;
3432 }
3433 }
3434 }
3435
3436 if (sync) {
3437 bfqd->sync_flight--;
3438 RQ_BIC(rq)->ttime.last_end_request = jiffies;
3439 }
3440
3441 /*
3442 * If we are waiting to discover whether the request pattern of the
3443 * task associated with the queue is actually isochronous, and
3444	 * both requirements for this condition to hold are satisfied, then
3445 * compute soft_rt_next_start (see the comments to the function
3446 * bfq_bfqq_softrt_next_start()).
3447 */
3448 if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
3449 RB_EMPTY_ROOT(&bfqq->sort_list))
3450 bfqq->soft_rt_next_start =
3451 bfq_bfqq_softrt_next_start(bfqd, bfqq);
3452
3453 /*
3454 * If this is the in-service queue, check if it needs to be expired,
3455 * or if we want to idle in case it has no pending requests.
3456 */
3457 if (bfqd->in_service_queue == bfqq) {
3458 if (bfq_bfqq_budget_new(bfqq))
3459 bfq_set_budget_timeout(bfqd);
3460
3461 if (bfq_bfqq_must_idle(bfqq)) {
3462 bfq_arm_slice_timer(bfqd);
3463 goto out;
3464 } else if (bfq_may_expire_for_budg_timeout(bfqq))
3465 bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
3466 else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
3467 (bfqq->dispatched == 0 ||
3468 !bfq_bfqq_must_not_expire(bfqq)))
3469 bfq_bfqq_expire(bfqd, bfqq, 0,
3470 BFQ_BFQQ_NO_MORE_REQUESTS);
3471 }
3472
3473 if (!bfqd->rq_in_driver)
3474 bfq_schedule_dispatch(bfqd);
3475
3476out:
3477 return;
3478}
3479
3480static inline int __bfq_may_queue(struct bfq_queue *bfqq)
3481{
3482 if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
3483 bfq_clear_bfqq_must_alloc(bfqq);
3484 return ELV_MQUEUE_MUST;
3485 }
3486
3487 return ELV_MQUEUE_MAY;
3488}
3489
3490static int bfq_may_queue(struct request_queue *q, int rw)
3491{
3492 struct bfq_data *bfqd = q->elevator->elevator_data;
3493 struct task_struct *tsk = current;
3494 struct bfq_io_cq *bic;
3495 struct bfq_queue *bfqq;
3496
3497 /*
3498 * Don't force setup of a queue from here, as a call to may_queue
3499 * does not necessarily imply that a request actually will be
3500 * queued. So just lookup a possibly existing queue, or return
3501 * 'may queue' if that fails.
3502 */
3503 bic = bfq_bic_lookup(bfqd, tsk->io_context);
3504 if (bic == NULL)
3505 return ELV_MQUEUE_MAY;
3506
3507 bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
3508 if (bfqq != NULL)
3509 return __bfq_may_queue(bfqq);
3510
3511 return ELV_MQUEUE_MAY;
3512}
3513
3514/*
3515 * Queue lock held here.
3516 */
3517static void bfq_put_request(struct request *rq)
3518{
3519 struct bfq_queue *bfqq = RQ_BFQQ(rq);
3520
3521 if (bfqq != NULL) {
3522 const int rw = rq_data_dir(rq);
3523
3524 BUG_ON(!bfqq->allocated[rw]);
3525 bfqq->allocated[rw]--;
3526
3527 rq->elv.priv[0] = NULL;
3528 rq->elv.priv[1] = NULL;
3529
3530 bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
3531 bfqq, atomic_read(&bfqq->ref));
3532 bfq_put_queue(bfqq);
3533 }
3534}
3535
Paolo Valente70f28712013-05-09 19:10:02 +02003536/*
3537 * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
3538 * was the last process referring to said bfqq.
3539 */
3540static struct bfq_queue *
3541bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
3542{
3543 bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
Mauro Andreolini0b335882015-06-07 02:34:22 +02003544
3545 put_io_context(bic->icq.ioc);
3546
Paolo Valente70f28712013-05-09 19:10:02 +02003547 if (bfqq_process_refs(bfqq) == 1) {
3548 bfqq->pid = current->pid;
3549 bfq_clear_bfqq_coop(bfqq);
3550 bfq_clear_bfqq_split_coop(bfqq);
3551 return bfqq;
3552 }
3553
3554 bic_set_bfqq(bic, NULL, 1);
3555
3556 bfq_put_cooperator(bfqq);
3557
3558 bfq_put_queue(bfqq);
3559 return NULL;
3560}
3561
3562/*
3563 * Allocate bfq data structures associated with this request.
3564 */
3565static int bfq_set_request(struct request_queue *q, struct request *rq,
3566 gfp_t gfp_mask)
3567{
3568 struct bfq_data *bfqd = q->elevator->elevator_data;
3569 struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
3570 const int rw = rq_data_dir(rq);
3571 const int is_sync = rq_is_sync(rq);
3572 struct bfq_queue *bfqq;
3573 struct bfq_group *bfqg;
3574 unsigned long flags;
Mauro Andreolini0b335882015-06-07 02:34:22 +02003575 bool split = false;
Paolo Valente70f28712013-05-09 19:10:02 +02003576
3577 /* handle changed prio notifications; cgroup change is handled separately */
3578 if (unlikely(icq_get_changed(&bic->icq) & ICQ_IOPRIO_CHANGED))
3579 bfq_check_ioprio_change(bic->icq.ioc, bic);
3580
3581 might_sleep_if(gfp_mask & __GFP_WAIT);
3582
3583 spin_lock_irqsave(q->queue_lock, flags);
3584
3585 if (bic == NULL)
3586 goto queue_fail;
3587
3588 bfqg = bfq_bic_update_cgroup(bic);
3589
3590new_queue:
3591 bfqq = bic_to_bfqq(bic, is_sync);
3592 if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
3593 bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic->icq.ioc, gfp_mask);
3594 bic_set_bfqq(bic, bfqq, is_sync);
Mauro Andreolini0b335882015-06-07 02:34:22 +02003595 if (split && is_sync) {
3596 if ((bic->was_in_burst_list && bfqd->large_burst) ||
3597 bic->saved_in_large_burst)
3598 bfq_mark_bfqq_in_large_burst(bfqq);
3599 else {
3600 bfq_clear_bfqq_in_large_burst(bfqq);
3601 if (bic->was_in_burst_list)
3602 hlist_add_head(&bfqq->burst_list_node,
3603 &bfqd->burst_list);
3604 }
3605 }
Paolo Valente70f28712013-05-09 19:10:02 +02003606 } else {
Mauro Andreolini0b335882015-06-07 02:34:22 +02003607 /* If the queue was seeky for too long, break it apart. */
Paolo Valente70f28712013-05-09 19:10:02 +02003608 if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
3609 bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
3610 bfqq = bfq_split_bfqq(bic, bfqq);
Mauro Andreolini0b335882015-06-07 02:34:22 +02003611 split = true;
Paolo Valente70f28712013-05-09 19:10:02 +02003612 if (!bfqq)
3613 goto new_queue;
3614 }
Paolo Valente70f28712013-05-09 19:10:02 +02003615 }
3616
3617 bfqq->allocated[rw]++;
3618 atomic_inc(&bfqq->ref);
3619 bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
3620 atomic_read(&bfqq->ref));
3621
3622 rq->elv.priv[0] = bic;
3623 rq->elv.priv[1] = bfqq;
3624
Mauro Andreolini0b335882015-06-07 02:34:22 +02003625 /*
3626 * If a bfq_queue has only one process reference, it is owned
3627 * by only one bfq_io_cq: we can set the bic field of the
3628 * bfq_queue to the address of that structure. Also, if the
3629 * queue has just been split, mark a flag so that the
3630 * information is available to the other scheduler hooks.
3631 */
3632 if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {
3633 bfqq->bic = bic;
3634 if (split) {
3635 bfq_mark_bfqq_just_split(bfqq);
3636 /*
3637 * If the queue has just been split from a shared
3638 * queue, restore the idle window and the possible
3639 * weight raising period.
3640 */
3641 bfq_bfqq_resume_state(bfqq, bic);
3642 }
3643 }
3644
Paolo Valente70f28712013-05-09 19:10:02 +02003645 spin_unlock_irqrestore(q->queue_lock, flags);
3646
3647 return 0;
3648
3649queue_fail:
3650 bfq_schedule_dispatch(bfqd);
3651 spin_unlock_irqrestore(q->queue_lock, flags);
3652
3653 return 1;
3654}
3655
3656static void bfq_kick_queue(struct work_struct *work)
3657{
3658 struct bfq_data *bfqd =
3659 container_of(work, struct bfq_data, unplug_work);
3660 struct request_queue *q = bfqd->queue;
3661
3662 spin_lock_irq(q->queue_lock);
3663 __blk_run_queue(q);
3664 spin_unlock_irq(q->queue_lock);
3665}
3666
3667/*
3668 * Handler of the expiration of the timer running if the in-service queue
3669 * is idling inside its time slice.
3670 */
3671static void bfq_idle_slice_timer(unsigned long data)
3672{
3673 struct bfq_data *bfqd = (struct bfq_data *)data;
3674 struct bfq_queue *bfqq;
3675 unsigned long flags;
3676 enum bfqq_expiration reason;
3677
3678 spin_lock_irqsave(bfqd->queue->queue_lock, flags);
3679
3680 bfqq = bfqd->in_service_queue;
3681 /*
3682 * Theoretical race here: the in-service queue can be NULL or
3683 * different from the queue that was idling if the timer handler
3684 * spins on the queue_lock and a new request arrives for the
3685 * current queue and there is a full dispatch cycle that changes
3686 * the in-service queue. This can hardly happen, but in the worst
3687 * case we just expire a queue too early.
3688 */
3689 if (bfqq != NULL) {
3690 bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
3691 if (bfq_bfqq_budget_timeout(bfqq))
3692 /*
3693 * Also here the queue can be safely expired
3694 * for budget timeout without wasting
3695 * guarantees
3696 */
3697 reason = BFQ_BFQQ_BUDGET_TIMEOUT;
3698 else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
3699 /*
3700 * The queue may not be empty upon timer expiration,
3701 * because we may not disable the timer when the
3702 * first request of the in-service queue arrives
3703 * during disk idling.
3704 */
3705 reason = BFQ_BFQQ_TOO_IDLE;
3706 else
3707 goto schedule_dispatch;
3708
3709 bfq_bfqq_expire(bfqd, bfqq, 1, reason);
3710 }
3711
3712schedule_dispatch:
3713 bfq_schedule_dispatch(bfqd);
3714
3715 spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
3716}
3717
3718static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
3719{
3720 del_timer_sync(&bfqd->idle_slice_timer);
3721 cancel_work_sync(&bfqd->unplug_work);
3722}
3723
3724static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
3725 struct bfq_queue **bfqq_ptr)
3726{
3727 struct bfq_group *root_group = bfqd->root_group;
3728 struct bfq_queue *bfqq = *bfqq_ptr;
3729
3730 bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
3731 if (bfqq != NULL) {
3732 bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
3733 bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
3734 bfqq, atomic_read(&bfqq->ref));
3735 bfq_put_queue(bfqq);
3736 *bfqq_ptr = NULL;
3737 }
3738}
3739
3740/*
3741 * Release all the bfqg references to its async queues. If we are
3742 * deallocating the group these queues may still contain requests, so
3743 * we reparent them to the root cgroup (i.e., the only one that will
3744 * exist for sure until all the requests on a device are gone).
3745 */
3746static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
3747{
3748 int i, j;
3749
3750 for (i = 0; i < 2; i++)
3751 for (j = 0; j < IOPRIO_BE_NR; j++)
3752 __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
3753
3754 __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
3755}
3756
3757static void bfq_exit_queue(struct elevator_queue *e)
3758{
3759 struct bfq_data *bfqd = e->elevator_data;
3760 struct request_queue *q = bfqd->queue;
3761 struct bfq_queue *bfqq, *n;
3762
3763 bfq_shutdown_timer_wq(bfqd);
3764
3765 spin_lock_irq(q->queue_lock);
3766
3767 BUG_ON(bfqd->in_service_queue != NULL);
3768 list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
3769 bfq_deactivate_bfqq(bfqd, bfqq, 0);
3770
3771 bfq_disconnect_groups(bfqd);
3772 spin_unlock_irq(q->queue_lock);
3773
3774 bfq_shutdown_timer_wq(bfqd);
3775
3776 synchronize_rcu();
3777
3778 BUG_ON(timer_pending(&bfqd->idle_slice_timer));
3779
3780 bfq_free_root_group(bfqd);
3781 kfree(bfqd);
3782}
3783
3784static void *bfq_init_queue(struct request_queue *q)
3785{
3786 struct bfq_group *bfqg;
3787 struct bfq_data *bfqd;
3788
3789 bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
3790 if (bfqd == NULL)
3791 return NULL;
3792
3793 /*
3794 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
3795 * Grab a permanent reference to it, so that the normal code flow
3796 * will not attempt to free it.
3797 */
3798 bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
3799 atomic_inc(&bfqd->oom_bfqq.ref);
3800 bfqd->oom_bfqq.entity.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
3801 bfqd->oom_bfqq.entity.new_ioprio_class = IOPRIO_CLASS_BE;
3802 bfqd->oom_bfqq.entity.new_weight =
3803 bfq_ioprio_to_weight(bfqd->oom_bfqq.entity.new_ioprio);
3804 /*
3805 * Trigger weight initialization, according to ioprio, at the
3806 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
3807 * class won't be changed any more.
3808 */
3809 bfqd->oom_bfqq.entity.ioprio_changed = 1;
3810
3811 bfqd->queue = q;
3812
3813 bfqg = bfq_alloc_root_group(bfqd, q->node);
3814 if (bfqg == NULL) {
3815 kfree(bfqd);
3816 return NULL;
3817 }
3818
3819 bfqd->root_group = bfqg;
3820 bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
3821#ifdef CONFIG_CGROUP_BFQIO
3822 bfqd->active_numerous_groups = 0;
3823#endif
3824
3825 init_timer(&bfqd->idle_slice_timer);
3826 bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
3827 bfqd->idle_slice_timer.data = (unsigned long)bfqd;
3828
3829 bfqd->rq_pos_tree = RB_ROOT;
3830 bfqd->queue_weights_tree = RB_ROOT;
3831 bfqd->group_weights_tree = RB_ROOT;
3832
3833 INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
3834
3835 INIT_LIST_HEAD(&bfqd->active_list);
3836 INIT_LIST_HEAD(&bfqd->idle_list);
3837 INIT_HLIST_HEAD(&bfqd->burst_list);
3838
3839 bfqd->hw_tag = -1;
3840
3841 bfqd->bfq_max_budget = bfq_default_max_budget;
3842
3843 bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
3844 bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
3845 bfqd->bfq_back_max = bfq_back_max;
3846 bfqd->bfq_back_penalty = bfq_back_penalty;
3847 bfqd->bfq_slice_idle = bfq_slice_idle;
3848 bfqd->bfq_class_idle_last_service = 0;
3849 bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
3850 bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
3851 bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
3852
3853 bfqd->bfq_coop_thresh = 2;
3854 bfqd->bfq_failed_cooperations = 7000;
3855 bfqd->bfq_requests_within_timer = 120;
3856
3857 bfqd->bfq_large_burst_thresh = 11;
3858 bfqd->bfq_burst_interval = msecs_to_jiffies(500);
3859
3860 bfqd->low_latency = true;
3861
3862 bfqd->bfq_wr_coeff = 20;
3863 bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);
3864 bfqd->bfq_wr_max_time = 0;
3865 bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
3866 bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);
3867 bfqd->bfq_wr_max_softrt_rate = 7000; /*
3868 * Approximate rate required
3869 * to playback or record a
3870 * high-definition compressed
3871 * video.
3872 */
3873 bfqd->wr_busy_queues = 0;
3874 bfqd->busy_in_flight_queues = 0;
3875 bfqd->const_seeky_busy_in_flight_queues = 0;
3876
3877 /*
3878 * Begin by assuming, optimistically, that the device peak rate is
3879 * equal to the highest reference rate.
3880 */
3881 bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
3882 T_fast[blk_queue_nonrot(bfqd->queue)];
3883 bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)];
3884 bfqd->device_speed = BFQ_BFQD_FAST;
3885
3886 return bfqd;
3887}
3888
3889static void bfq_slab_kill(void)
3890{
3891 if (bfq_pool != NULL)
3892 kmem_cache_destroy(bfq_pool);
3893}
3894
3895static int __init bfq_slab_setup(void)
3896{
3897 bfq_pool = KMEM_CACHE(bfq_queue, 0);
3898 if (bfq_pool == NULL)
3899 return -ENOMEM;
3900 return 0;
3901}
3902
3903static ssize_t bfq_var_show(unsigned int var, char *page)
3904{
3905 return sprintf(page, "%d\n", var);
3906}
3907
3908static ssize_t bfq_var_store(unsigned long *var, const char *page,
3909 size_t count)
3910{
3911 unsigned long new_val;
3912 int ret = kstrtoul(page, 10, &new_val);
3913
3914 if (ret == 0)
3915 *var = new_val;
3916
3917 return count;
3918}
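/*
 * Note: if kstrtoul() fails, *var is simply left unchanged and the full
 * count is still returned, so malformed sysfs writes are silently
 * ignored rather than reported as errors.
 */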
3919
3920static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page)
3921{
3922 struct bfq_data *bfqd = e->elevator_data;
3923 return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ?
3924 jiffies_to_msecs(bfqd->bfq_wr_max_time) :
3925 jiffies_to_msecs(bfq_wr_duration(bfqd)));
3926}
3927
3928static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
3929{
3930 struct bfq_queue *bfqq;
3931 struct bfq_data *bfqd = e->elevator_data;
3932 ssize_t num_char = 0;
3933
3934 num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
3935 bfqd->queued);
3936
3937 spin_lock_irq(bfqd->queue->queue_lock);
3938
3939 num_char += sprintf(page + num_char, "Active:\n");
3940 list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
3941 num_char += sprintf(page + num_char,
3942 "pid%d: weight %hu, nr_queued %d %d, dur %d/%u\n",
3943 bfqq->pid,
3944 bfqq->entity.weight,
3945 bfqq->queued[0],
3946 bfqq->queued[1],
3947 jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
3948 jiffies_to_msecs(bfqq->wr_cur_max_time));
3949 }
3950
3951 num_char += sprintf(page + num_char, "Idle:\n");
3952 list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
3953 num_char += sprintf(page + num_char,
3954 "pid%d: weight %hu, dur %d/%u\n",
3955 bfqq->pid,
3956 bfqq->entity.weight,
3957 jiffies_to_msecs(jiffies -
3958 bfqq->last_wr_start_finish),
3959 jiffies_to_msecs(bfqq->wr_cur_max_time));
3960 }
3961
3962 spin_unlock_irq(bfqd->queue->queue_lock);
3963
3964 return num_char;
3965}
3966
3967#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
3968static ssize_t __FUNC(struct elevator_queue *e, char *page) \
3969{ \
3970 struct bfq_data *bfqd = e->elevator_data; \
3971 unsigned int __data = __VAR; \
3972 if (__CONV) \
3973 __data = jiffies_to_msecs(__data); \
3974 return bfq_var_show(__data, (page)); \
3975}
3976SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
3977SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
3978SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
3979SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
3980SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
3981SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
3982SHOW_FUNCTION(bfq_max_budget_async_rq_show,
3983 bfqd->bfq_max_budget_async_rq, 0);
3984SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
3985SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
3986SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
3987SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0);
3988SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1);
3989SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1);
3990SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async,
3991 1);
3992SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0);
3993#undef SHOW_FUNCTION
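/*
 * Each SHOW_FUNCTION() instance above backs one sysfs attribute under
 * /sys/block/<dev>/queue/iosched/. For example, reading slice_idle
 * invokes bfq_slice_idle_show(), which converts bfqd->bfq_slice_idle
 * from jiffies to milliseconds (because __CONV is 1) before printing it
 * with bfq_var_show().
 */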
3994
3995#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
3996static ssize_t \
3997__FUNC(struct elevator_queue *e, const char *page, size_t count) \
3998{ \
3999 struct bfq_data *bfqd = e->elevator_data; \
4000 unsigned long uninitialized_var(__data); \
4001 int ret = bfq_var_store(&__data, (page), count); \
4002 if (__data < (MIN)) \
4003 __data = (MIN); \
4004 else if (__data > (MAX)) \
4005 __data = (MAX); \
4006 if (__CONV) \
4007 *(__PTR) = msecs_to_jiffies(__data); \
4008 else \
4009 *(__PTR) = __data; \
4010 return ret; \
4011}
4012STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
4013 INT_MAX, 1);
4014STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
4015 INT_MAX, 1);
4016STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
4017STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
4018 INT_MAX, 0);
4019STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
4020STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
4021 1, INT_MAX, 0);
4022STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
4023 INT_MAX, 1);
4024STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0);
4025STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1);
4026STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX,
4027 1);
4028STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0,
4029 INT_MAX, 1);
4030STORE_FUNCTION(bfq_wr_min_inter_arr_async_store,
4031 &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1);
4032STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0,
4033 INT_MAX, 0);
4034#undef STORE_FUNCTION
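/*
 * The STORE_FUNCTION() instances are the writable counterparts: the
 * value written is parsed with bfq_var_store(), clamped to [MIN, MAX]
 * and, when __CONV is 1, interpreted as milliseconds and converted to
 * jiffies. Writing "8" to slice_idle, for instance, sets
 * bfqd->bfq_slice_idle to msecs_to_jiffies(8).
 */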
4035
4036/* do nothing for the moment */
4037static ssize_t bfq_weights_store(struct elevator_queue *e,
4038 const char *page, size_t count)
4039{
4040 return count;
4041}
4042
4043static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
4044{
4045 u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
4046
4047 if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
4048 return bfq_calc_max_budget(bfqd->peak_rate, timeout);
4049 else
4050 return bfq_default_max_budget;
4051}
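/*
 * In other words: once enough peak-rate samples have been collected,
 * the maximum budget is sized so that a queue can consume it within the
 * sync timeout at the estimated peak rate; until then the conservative
 * compile-time default is used.
 */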
4052
4053static ssize_t bfq_max_budget_store(struct elevator_queue *e,
4054 const char *page, size_t count)
4055{
4056 struct bfq_data *bfqd = e->elevator_data;
4057 unsigned long uninitialized_var(__data);
4058 int ret = bfq_var_store(&__data, (page), count);
4059
4060 if (__data == 0)
4061 bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4062 else {
4063 if (__data > INT_MAX)
4064 __data = INT_MAX;
4065 bfqd->bfq_max_budget = __data;
4066 }
4067
4068 bfqd->bfq_user_max_budget = __data;
4069
4070 return ret;
4071}
4072
4073static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
4074 const char *page, size_t count)
4075{
4076 struct bfq_data *bfqd = e->elevator_data;
4077 unsigned long uninitialized_var(__data);
4078 int ret = bfq_var_store(&__data, (page), count);
4079
4080 if (__data < 1)
4081 __data = 1;
4082 else if (__data > INT_MAX)
4083 __data = INT_MAX;
4084
4085 bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
4086 if (bfqd->bfq_user_max_budget == 0)
4087 bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4088
4089 return ret;
4090}
4091
4092static ssize_t bfq_low_latency_store(struct elevator_queue *e,
4093 const char *page, size_t count)
4094{
4095 struct bfq_data *bfqd = e->elevator_data;
4096 unsigned long uninitialized_var(__data);
4097 int ret = bfq_var_store(&__data, (page), count);
4098
4099 if (__data > 1)
4100 __data = 1;
4101 if (__data == 0 && bfqd->low_latency != 0)
4102 bfq_end_wr(bfqd);
4103 bfqd->low_latency = __data;
4104
4105 return ret;
4106}
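/*
 * When low_latency is switched off, the store hook above calls
 * bfq_end_wr() so that any ongoing weight-raising periods are
 * terminated immediately, instead of lingering until their natural
 * expiration while the feature is disabled.
 */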
4107
4108#define BFQ_ATTR(name) \
4109 __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
4110
4111static struct elv_fs_entry bfq_attrs[] = {
4112 BFQ_ATTR(fifo_expire_sync),
4113 BFQ_ATTR(fifo_expire_async),
4114 BFQ_ATTR(back_seek_max),
4115 BFQ_ATTR(back_seek_penalty),
4116 BFQ_ATTR(slice_idle),
4117 BFQ_ATTR(max_budget),
4118 BFQ_ATTR(max_budget_async_rq),
4119 BFQ_ATTR(timeout_sync),
4120 BFQ_ATTR(timeout_async),
4121 BFQ_ATTR(low_latency),
4122 BFQ_ATTR(wr_coeff),
4123 BFQ_ATTR(wr_max_time),
4124 BFQ_ATTR(wr_rt_max_time),
4125 BFQ_ATTR(wr_min_idle_time),
4126 BFQ_ATTR(wr_min_inter_arr_async),
4127 BFQ_ATTR(wr_max_softrt_rate),
4128 BFQ_ATTR(weights),
4129 __ATTR_NULL
4130};
4131
4132static struct elevator_type iosched_bfq = {
4133 .ops = {
4134 .elevator_merge_fn = bfq_merge,
4135 .elevator_merged_fn = bfq_merged_request,
4136 .elevator_merge_req_fn = bfq_merged_requests,
4137 .elevator_allow_merge_fn = bfq_allow_merge,
4138 .elevator_dispatch_fn = bfq_dispatch_requests,
4139 .elevator_add_req_fn = bfq_insert_request,
4140 .elevator_activate_req_fn = bfq_activate_request,
4141 .elevator_deactivate_req_fn = bfq_deactivate_request,
4142 .elevator_completed_req_fn = bfq_completed_request,
4143 .elevator_former_req_fn = elv_rb_former_request,
4144 .elevator_latter_req_fn = elv_rb_latter_request,
4145 .elevator_init_icq_fn = bfq_init_icq,
4146 .elevator_exit_icq_fn = bfq_exit_icq,
4147 .elevator_set_req_fn = bfq_set_request,
4148 .elevator_put_req_fn = bfq_put_request,
4149 .elevator_may_queue_fn = bfq_may_queue,
4150 .elevator_init_fn = bfq_init_queue,
4151 .elevator_exit_fn = bfq_exit_queue,
4152 },
4153 .icq_size = sizeof(struct bfq_io_cq),
4154 .icq_align = __alignof__(struct bfq_io_cq),
4155 .elevator_attrs = bfq_attrs,
4156 .elevator_name = "bfq",
4157 .elevator_owner = THIS_MODULE,
4158};
4159
4160static int __init bfq_init(void)
4161{
4162 /*
4163 * Can be 0 on HZ < 1000 setups.
4164 */
4165 if (bfq_slice_idle == 0)
4166 bfq_slice_idle = 1;
4167
4168 if (bfq_timeout_async == 0)
4169 bfq_timeout_async = 1;
4170
4171 if (bfq_slab_setup())
4172 return -ENOMEM;
4173
4174 /*
4175 * Times to load large popular applications for the typical systems
4176 * installed on the reference devices (see the comments before the
4177 * definitions of the two arrays).
4178 */
4179 T_slow[0] = msecs_to_jiffies(2600);
4180 T_slow[1] = msecs_to_jiffies(1000);
4181 T_fast[0] = msecs_to_jiffies(5500);
4182 T_fast[1] = msecs_to_jiffies(2000);
4183
4184 /*
4185 * Thresholds that determine the switch between speed classes (see
4186 * the comments before the definition of the array).
4187 */
4188 device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2;
4189 device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2;
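	/*
	 * Each threshold is the midpoint between the slow and fast
	 * reference rates for the corresponding device class: index 0
	 * for rotational devices, index 1 for non-rotational ones,
	 * matching the blk_queue_nonrot() indexing used in
	 * bfq_init_queue().
	 */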
4190
4191 elv_register(&iosched_bfq);
4192 pr_info("BFQ I/O-scheduler: v7r8");
4193
4194 return 0;
4195}
4196
4197static void __exit bfq_exit(void)
4198{
4199 elv_unregister(&iosched_bfq);
4200 bfq_slab_kill();
4201}
4202
4203module_init(bfq_init);
4204module_exit(bfq_exit);
4205
4206MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
4207MODULE_LICENSE("GPL");