Blame - block/blk-mq.c - kernel/msm-4.19

blob: f21ec964e411b0b20a29f17ffd5068a606e17a86 [file] [log] [blame]

Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1	#include <linux/kernel.h>
				2	#include <linux/module.h>
				3	#include <linux/backing-dev.h>
				4	#include <linux/bio.h>
				5	#include <linux/blkdev.h>
				6	#include <linux/mm.h>
				7	#include <linux/init.h>
				8	#include <linux/slab.h>
				9	#include <linux/workqueue.h>
				10	#include <linux/smp.h>
				11	#include <linux/llist.h>
				12	#include <linux/list_sort.h>
				13	#include <linux/cpu.h>
				14	#include <linux/cache.h>
				15	#include <linux/sched/sysctl.h>
				16	#include <linux/delay.h>
				17
				18	#include <trace/events/block.h>
				19
				20	#include <linux/blk-mq.h>
				21	#include "blk.h"
				22	#include "blk-mq.h"
				23	#include "blk-mq-tag.h"
				24
				25	static DEFINE_MUTEX(all_q_mutex);
				26	static LIST_HEAD(all_q_list);
				27
				28	static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
				29
				30	DEFINE_PER_CPU(struct llist_head, ipi_lists);
				31
				32	static struct blk_mq_ctx __blk_mq_get_ctx(struct request_queue q,
				33	unsigned int cpu)
				34	{
				35	return per_cpu_ptr(q->queue_ctx, cpu);
				36	}
				37
				38	/*
				39	* This assumes per-cpu software queueing queues. They could be per-node
				40	* as well, for instance. For now this is hardcoded as-is. Note that we don't
				41	* care about preemption, since we know the ctx's are persistent. This does
				42	* mean that we can't rely on ctx always matching the currently running CPU.
				43	*/
				44	static struct blk_mq_ctx blk_mq_get_ctx(struct request_queue q)
				45	{
				46	return __blk_mq_get_ctx(q, get_cpu());
				47	}
				48
				49	static void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
				50	{
				51	put_cpu();
				52	}
				53
				54	/*
				55	* Check if any of the ctx's have pending work in this hardware queue
				56	*/
				57	static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
				58	{
				59	unsigned int i;
				60
				61	for (i = 0; i < hctx->nr_ctx_map; i++)
				62	if (hctx->ctx_map[i])
				63	return true;
				64
				65	return false;
				66	}
				67
				68	/*
				69	* Mark this ctx as having pending work in this hardware queue
				70	*/
				71	static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
				72	struct blk_mq_ctx *ctx)
				73	{
				74	if (!test_bit(ctx->index_hw, hctx->ctx_map))
				75	set_bit(ctx->index_hw, hctx->ctx_map);
				76	}
				77
				78	static struct request blk_mq_alloc_rq(struct blk_mq_hw_ctx hctx, gfp_t gfp,
				79	bool reserved)
				80	{
				81	struct request *rq;
				82	unsigned int tag;
				83
				84	tag = blk_mq_get_tag(hctx->tags, gfp, reserved);
				85	if (tag != BLK_MQ_TAG_FAIL) {
				86	rq = hctx->rqs[tag];
				87	rq->tag = tag;
				88
				89	return rq;
				90	}
				91
				92	return NULL;
				93	}
				94
				95	static int blk_mq_queue_enter(struct request_queue *q)
				96	{
				97	int ret;
				98
				99	__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
				100	smp_wmb();
				101	/* we have problems to freeze the queue if it's initializing */
				102	if (!blk_queue_bypass(q) \|\| !blk_queue_init_done(q))
				103	return 0;
				104
				105	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
				106
				107	spin_lock_irq(q->queue_lock);
				108	ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq,
				109	!blk_queue_bypass(q), *q->queue_lock);
				110	/* inc usage with lock hold to avoid freeze_queue runs here */
				111	if (!ret)
				112	__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
				113	spin_unlock_irq(q->queue_lock);
				114
				115	return ret;
				116	}
				117
				118	static void blk_mq_queue_exit(struct request_queue *q)
				119	{
				120	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
				121	}
				122
				123	/*
				124	* Guarantee no request is in use, so we can change any data structure of
				125	* the queue afterward.
				126	*/
				127	static void blk_mq_freeze_queue(struct request_queue *q)
				128	{
				129	bool drain;
				130
				131	spin_lock_irq(q->queue_lock);
				132	drain = !q->bypass_depth++;
				133	queue_flag_set(QUEUE_FLAG_BYPASS, q);
				134	spin_unlock_irq(q->queue_lock);
				135
				136	if (!drain)
				137	return;
				138
				139	while (true) {
				140	s64 count;
				141
				142	spin_lock_irq(q->queue_lock);
				143	count = percpu_counter_sum(&q->mq_usage_counter);
				144	spin_unlock_irq(q->queue_lock);
				145
				146	if (count == 0)
				147	break;
				148	blk_mq_run_queues(q, false);
				149	msleep(10);
				150	}
				151	}
				152
				153	static void blk_mq_unfreeze_queue(struct request_queue *q)
				154	{
				155	bool wake = false;
				156
				157	spin_lock_irq(q->queue_lock);
				158	if (!--q->bypass_depth) {
				159	queue_flag_clear(QUEUE_FLAG_BYPASS, q);
				160	wake = true;
				161	}
				162	WARN_ON_ONCE(q->bypass_depth < 0);
				163	spin_unlock_irq(q->queue_lock);
				164	if (wake)
				165	wake_up_all(&q->mq_freeze_wq);
				166	}
				167
				168	bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
				169	{
				170	return blk_mq_has_free_tags(hctx->tags);
				171	}
				172	EXPORT_SYMBOL(blk_mq_can_queue);
				173
				174	static void blk_mq_rq_ctx_init(struct blk_mq_ctx ctx, struct request rq,
				175	unsigned int rw_flags)
				176	{
				177	rq->mq_ctx = ctx;
				178	rq->cmd_flags = rw_flags;
				179	ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
				180	}
				181
				182	static struct request __blk_mq_alloc_request(struct blk_mq_hw_ctx hctx,
				183	gfp_t gfp, bool reserved)
				184	{
				185	return blk_mq_alloc_rq(hctx, gfp, reserved);
				186	}
				187
				188	static struct request blk_mq_alloc_request_pinned(struct request_queue q,
				189	int rw, gfp_t gfp,
				190	bool reserved)
				191	{
				192	struct request *rq;
				193
				194	do {
				195	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
				196	struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
				197
				198	rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved);
				199	if (rq) {
				200	blk_mq_rq_ctx_init(ctx, rq, rw);
				201	break;
				202	} else if (!(gfp & __GFP_WAIT))
				203	break;
				204
				205	blk_mq_put_ctx(ctx);
				206	__blk_mq_run_hw_queue(hctx);
				207	blk_mq_wait_for_tags(hctx->tags);
				208	} while (1);
				209
				210	return rq;
				211	}
				212
				213	struct request blk_mq_alloc_request(struct request_queue q, int rw, gfp_t gfp)
				214	{
				215	struct request *rq;
				216
				217	if (blk_mq_queue_enter(q))
				218	return NULL;
				219
				220	rq = blk_mq_alloc_request_pinned(q, rw, gfp, false);
				221	blk_mq_put_ctx(rq->mq_ctx);
				222	return rq;
				223	}
				224
				225	struct request blk_mq_alloc_reserved_request(struct request_queue q, int rw,
				226	gfp_t gfp)
				227	{
				228	struct request *rq;
				229
				230	if (blk_mq_queue_enter(q))
				231	return NULL;
				232
				233	rq = blk_mq_alloc_request_pinned(q, rw, gfp, true);
				234	blk_mq_put_ctx(rq->mq_ctx);
				235	return rq;
				236	}
				237	EXPORT_SYMBOL(blk_mq_alloc_reserved_request);
				238
				239	/*
				240	* Re-init and set pdu, if we have it
				241	*/
				242	static void blk_mq_rq_init(struct blk_mq_hw_ctx hctx, struct request rq)
				243	{
				244	blk_rq_init(hctx->queue, rq);
				245
				246	if (hctx->cmd_size)
				247	rq->special = blk_mq_rq_to_pdu(rq);
				248	}
				249
				250	static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
				251	struct blk_mq_ctx ctx, struct request rq)
				252	{
				253	const int tag = rq->tag;
				254	struct request_queue *q = rq->q;
				255
				256	blk_mq_rq_init(hctx, rq);
				257	blk_mq_put_tag(hctx->tags, tag);
				258
				259	blk_mq_queue_exit(q);
				260	}
				261
				262	void blk_mq_free_request(struct request *rq)
				263	{
				264	struct blk_mq_ctx *ctx = rq->mq_ctx;
				265	struct blk_mq_hw_ctx *hctx;
				266	struct request_queue *q = rq->q;
				267
				268	ctx->rq_completed[rq_is_sync(rq)]++;
				269
				270	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				271	__blk_mq_free_request(hctx, ctx, rq);
				272	}
				273
				274	static void blk_mq_bio_endio(struct request rq, struct bio bio, int error)
				275	{
				276	if (error)
				277	clear_bit(BIO_UPTODATE, &bio->bi_flags);
				278	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
				279	error = -EIO;
				280
				281	if (unlikely(rq->cmd_flags & REQ_QUIET))
				282	set_bit(BIO_QUIET, &bio->bi_flags);
				283
				284	/* don't actually finish bio if it's part of flush sequence */
				285	if (!(rq->cmd_flags & REQ_FLUSH_SEQ))
				286	bio_endio(bio, error);
				287	}
				288
				289	void blk_mq_complete_request(struct request *rq, int error)
				290	{
				291	struct bio *bio = rq->bio;
				292	unsigned int bytes = 0;
				293
				294	trace_block_rq_complete(rq->q, rq);
				295
				296	while (bio) {
				297	struct bio *next = bio->bi_next;
				298
				299	bio->bi_next = NULL;
				300	bytes += bio->bi_size;
				301	blk_mq_bio_endio(rq, bio, error);
				302	bio = next;
				303	}
				304
				305	blk_account_io_completion(rq, bytes);
				306
				307	if (rq->end_io)
				308	rq->end_io(rq, error);
				309	else
				310	blk_mq_free_request(rq);
				311
				312	blk_account_io_done(rq);
				313	}
				314
				315	void __blk_mq_end_io(struct request *rq, int error)
				316	{
				317	if (!blk_mark_rq_complete(rq))
				318	blk_mq_complete_request(rq, error);
				319	}
				320
				321	#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS)
				322
				323	/*
				324	* Called with interrupts disabled.
				325	*/
				326	static void ipi_end_io(void *data)
				327	{
				328	struct llist_head *list = &per_cpu(ipi_lists, smp_processor_id());
				329	struct llist_node entry, next;
				330	struct request *rq;
				331
				332	entry = llist_del_all(list);
				333
				334	while (entry) {
				335	next = entry->next;
				336	rq = llist_entry(entry, struct request, ll_list);
				337	__blk_mq_end_io(rq, rq->errors);
				338	entry = next;
				339	}
				340	}
				341
				342	static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu,
				343	struct request *rq, const int error)
				344	{
				345	struct call_single_data *data = &rq->csd;
				346
				347	rq->errors = error;
				348	rq->ll_list.next = NULL;
				349
				350	/*
				351	* If the list is non-empty, an existing IPI must already
				352	* be "in flight". If that is the case, we need not schedule
				353	* a new one.
				354	*/
				355	if (llist_add(&rq->ll_list, &per_cpu(ipi_lists, ctx->cpu))) {
				356	data->func = ipi_end_io;
				357	data->flags = 0;
				358	__smp_call_function_single(ctx->cpu, data, 0);
				359	}
				360
				361	return true;
				362	}
				363	#else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */
				364	static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu,
				365	struct request *rq, const int error)
				366	{
				367	return false;
				368	}
				369	#endif
				370
				371	/*
				372	* End IO on this request on a multiqueue enabled driver. We'll either do
				373	* it directly inline, or punt to a local IPI handler on the matching
				374	* remote CPU.
				375	*/
				376	void blk_mq_end_io(struct request *rq, int error)
				377	{
				378	struct blk_mq_ctx *ctx = rq->mq_ctx;
				379	int cpu;
				380
				381	if (!ctx->ipi_redirect)
				382	return __blk_mq_end_io(rq, error);
				383
				384	cpu = get_cpu();
				385
				386	if (cpu == ctx->cpu \|\| !cpu_online(ctx->cpu) \|\|
				387	!ipi_remote_cpu(ctx, cpu, rq, error))
				388	__blk_mq_end_io(rq, error);
				389
				390	put_cpu();
				391	}
				392	EXPORT_SYMBOL(blk_mq_end_io);
				393
				394	static void blk_mq_start_request(struct request *rq)
				395	{
				396	struct request_queue *q = rq->q;
				397
				398	trace_block_rq_issue(q, rq);
				399
				400	/*
				401	* Just mark start time and set the started bit. Due to memory
				402	* ordering, we know we'll see the correct deadline as long as
				403	* REQ_ATOMIC_STARTED is seen.
				404	*/
				405	rq->deadline = jiffies + q->rq_timeout;
				406	set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
				407	}
				408
				409	static void blk_mq_requeue_request(struct request *rq)
				410	{
				411	struct request_queue *q = rq->q;
				412
				413	trace_block_rq_requeue(q, rq);
				414	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
				415	}
				416
				417	struct blk_mq_timeout_data {
				418	struct blk_mq_hw_ctx *hctx;
				419	unsigned long *next;
				420	unsigned int *next_set;
				421	};
				422
				423	static void blk_mq_timeout_check(void __data, unsigned long free_tags)
				424	{
				425	struct blk_mq_timeout_data *data = __data;
				426	struct blk_mq_hw_ctx *hctx = data->hctx;
				427	unsigned int tag;
				428
				429	/* It may not be in flight yet (this is where
				430	* the REQ_ATOMIC_STARTED flag comes in). The requests are
				431	* statically allocated, so we know it's always safe to access the
				432	* memory associated with a bit offset into ->rqs[].
				433	*/
				434	tag = 0;
				435	do {
				436	struct request *rq;
				437
				438	tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag);
				439	if (tag >= hctx->queue_depth)
				440	break;
				441
				442	rq = hctx->rqs[tag++];
				443
				444	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
				445	continue;
				446
				447	blk_rq_check_expired(rq, data->next, data->next_set);
				448	} while (1);
				449	}
				450
				451	static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx,
				452	unsigned long *next,
				453	unsigned int *next_set)
				454	{
				455	struct blk_mq_timeout_data data = {
				456	.hctx = hctx,
				457	.next = next,
				458	.next_set = next_set,
				459	};
				460
				461	/*
				462	* Ask the tagging code to iterate busy requests, so we can
				463	* check them for timeout.
				464	*/
				465	blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data);
				466	}
				467
				468	static void blk_mq_rq_timer(unsigned long data)
				469	{
				470	struct request_queue q = (struct request_queue ) data;
				471	struct blk_mq_hw_ctx *hctx;
				472	unsigned long next = 0;
				473	int i, next_set = 0;
				474
				475	queue_for_each_hw_ctx(q, hctx, i)
				476	blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
				477
				478	if (next_set)
				479	mod_timer(&q->timeout, round_jiffies_up(next));
				480	}
				481
				482	/*
				483	* Reverse check our software queue for entries that we could potentially
				484	* merge with. Currently includes a hand-wavy stop count of 8, to not spend
				485	* too much time checking for merges.
				486	*/
				487	static bool blk_mq_attempt_merge(struct request_queue *q,
				488	struct blk_mq_ctx ctx, struct bio bio)
				489	{
				490	struct request *rq;
				491	int checked = 8;
				492
				493	list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
				494	int el_ret;
				495
				496	if (!checked--)
				497	break;
				498
				499	if (!blk_rq_merge_ok(rq, bio))
				500	continue;
				501
				502	el_ret = blk_try_merge(rq, bio);
				503	if (el_ret == ELEVATOR_BACK_MERGE) {
				504	if (bio_attempt_back_merge(q, rq, bio)) {
				505	ctx->rq_merged++;
				506	return true;
				507	}
				508	break;
				509	} else if (el_ret == ELEVATOR_FRONT_MERGE) {
				510	if (bio_attempt_front_merge(q, rq, bio)) {
				511	ctx->rq_merged++;
				512	return true;
				513	}
				514	break;
				515	}
				516	}
				517
				518	return false;
				519	}
				520
				521	void blk_mq_add_timer(struct request *rq)
				522	{
				523	__blk_add_timer(rq, NULL);
				524	}
				525
				526	/*
				527	* Run this hardware queue, pulling any software queues mapped to it in.
				528	* Note that this function currently has various problems around ordering
				529	* of IO. In particular, we'd like FIFO behaviour on handling existing
				530	* items on the hctx->dispatch list. Ignore that for now.
				531	*/
				532	static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
				533	{
				534	struct request_queue *q = hctx->queue;
				535	struct blk_mq_ctx *ctx;
				536	struct request *rq;
				537	LIST_HEAD(rq_list);
				538	int bit, queued;
				539
				540	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->flags)))
				541	return;
				542
				543	hctx->run++;
				544
				545	/*
				546	* Touch any software queue that has pending entries.
				547	*/
				548	for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) {
				549	clear_bit(bit, hctx->ctx_map);
				550	ctx = hctx->ctxs[bit];
				551	BUG_ON(bit != ctx->index_hw);
				552
				553	spin_lock(&ctx->lock);
				554	list_splice_tail_init(&ctx->rq_list, &rq_list);
				555	spin_unlock(&ctx->lock);
				556	}
				557
				558	/*
				559	* If we have previous entries on our dispatch list, grab them
				560	* and stuff them at the front for more fair dispatch.
				561	*/
				562	if (!list_empty_careful(&hctx->dispatch)) {
				563	spin_lock(&hctx->lock);
				564	if (!list_empty(&hctx->dispatch))
				565	list_splice_init(&hctx->dispatch, &rq_list);
				566	spin_unlock(&hctx->lock);
				567	}
				568
				569	/*
				570	* Delete and return all entries from our dispatch list
				571	*/
				572	queued = 0;
				573
				574	/*
				575	* Now process all the entries, sending them to the driver.
				576	*/
				577	while (!list_empty(&rq_list)) {
				578	int ret;
				579
				580	rq = list_first_entry(&rq_list, struct request, queuelist);
				581	list_del_init(&rq->queuelist);
				582	blk_mq_start_request(rq);
				583
				584	/*
				585	* Last request in the series. Flag it as such, this
				586	* enables drivers to know when IO should be kicked off,
				587	* if they don't do it on a per-request basis.
				588	*
				589	* Note: the flag isn't the only condition drivers
				590	* should do kick off. If drive is busy, the last
				591	* request might not have the bit set.
				592	*/
				593	if (list_empty(&rq_list))
				594	rq->cmd_flags \|= REQ_END;
				595
				596	ret = q->mq_ops->queue_rq(hctx, rq);
				597	switch (ret) {
				598	case BLK_MQ_RQ_QUEUE_OK:
				599	queued++;
				600	continue;
				601	case BLK_MQ_RQ_QUEUE_BUSY:
				602	/*
				603	* FIXME: we should have a mechanism to stop the queue
				604	* like blk_stop_queue, otherwise we will waste cpu
				605	* time
				606	*/
				607	list_add(&rq->queuelist, &rq_list);
				608	blk_mq_requeue_request(rq);
				609	break;
				610	default:
				611	pr_err("blk-mq: bad return on queue: %d\n", ret);
				612	rq->errors = -EIO;
				613	case BLK_MQ_RQ_QUEUE_ERROR:
				614	blk_mq_end_io(rq, rq->errors);
				615	break;
				616	}
				617
				618	if (ret == BLK_MQ_RQ_QUEUE_BUSY)
				619	break;
				620	}
				621
				622	if (!queued)
				623	hctx->dispatched[0]++;
				624	else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
				625	hctx->dispatched[ilog2(queued) + 1]++;
				626
				627	/*
				628	* Any items that need requeuing? Stuff them into hctx->dispatch,
				629	* that is where we will continue on next queue run.
				630	*/
				631	if (!list_empty(&rq_list)) {
				632	spin_lock(&hctx->lock);
				633	list_splice(&rq_list, &hctx->dispatch);
				634	spin_unlock(&hctx->lock);
				635	}
				636	}
				637
				638	void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
				639	{
				640	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->flags)))
				641	return;
				642
				643	if (!async)
				644	__blk_mq_run_hw_queue(hctx);
				645	else {
				646	struct request_queue *q = hctx->queue;
				647
				648	kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0);
				649	}
				650	}
				651
				652	void blk_mq_run_queues(struct request_queue *q, bool async)
				653	{
				654	struct blk_mq_hw_ctx *hctx;
				655	int i;
				656
				657	queue_for_each_hw_ctx(q, hctx, i) {
				658	if ((!blk_mq_hctx_has_pending(hctx) &&
				659	list_empty_careful(&hctx->dispatch)) \|\|
				660	test_bit(BLK_MQ_S_STOPPED, &hctx->flags))
				661	continue;
				662
				663	blk_mq_run_hw_queue(hctx, async);
				664	}
				665	}
				666	EXPORT_SYMBOL(blk_mq_run_queues);
				667
				668	void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
				669	{
				670	cancel_delayed_work(&hctx->delayed_work);
				671	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
				672	}
				673	EXPORT_SYMBOL(blk_mq_stop_hw_queue);
				674
				675	void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
				676	{
				677	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
				678	__blk_mq_run_hw_queue(hctx);
				679	}
				680	EXPORT_SYMBOL(blk_mq_start_hw_queue);
				681
				682	void blk_mq_start_stopped_hw_queues(struct request_queue *q)
				683	{
				684	struct blk_mq_hw_ctx *hctx;
				685	int i;
				686
				687	queue_for_each_hw_ctx(q, hctx, i) {
				688	if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
				689	continue;
				690
				691	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
				692	blk_mq_run_hw_queue(hctx, true);
				693	}
				694	}
				695	EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
				696
				697	static void blk_mq_work_fn(struct work_struct *work)
				698	{
				699	struct blk_mq_hw_ctx *hctx;
				700
				701	hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work);
				702	__blk_mq_run_hw_queue(hctx);
				703	}
				704
				705	static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
				706	struct request *rq)
				707	{
				708	struct blk_mq_ctx *ctx = rq->mq_ctx;
				709
				710	list_add_tail(&rq->queuelist, &ctx->rq_list);
				711	blk_mq_hctx_mark_pending(hctx, ctx);
				712
				713	/*
				714	* We do this early, to ensure we are on the right CPU.
				715	*/
				716	blk_mq_add_timer(rq);
				717	}
				718
				719	void blk_mq_insert_request(struct request_queue q, struct request rq,
				720	bool run_queue)
				721	{
				722	struct blk_mq_hw_ctx *hctx;
				723	struct blk_mq_ctx ctx, current_ctx;
				724
				725	ctx = rq->mq_ctx;
				726	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				727
				728	if (rq->cmd_flags & (REQ_FLUSH \| REQ_FUA)) {
				729	blk_insert_flush(rq);
				730	} else {
				731	current_ctx = blk_mq_get_ctx(q);
				732
				733	if (!cpu_online(ctx->cpu)) {
				734	ctx = current_ctx;
				735	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				736	rq->mq_ctx = ctx;
				737	}
				738	spin_lock(&ctx->lock);
				739	__blk_mq_insert_request(hctx, rq);
				740	spin_unlock(&ctx->lock);
				741
				742	blk_mq_put_ctx(current_ctx);
				743	}
				744
				745	if (run_queue)
				746	__blk_mq_run_hw_queue(hctx);
				747	}
				748	EXPORT_SYMBOL(blk_mq_insert_request);
				749
				750	/*
				751	* This is a special version of blk_mq_insert_request to bypass FLUSH request
				752	* check. Should only be used internally.
				753	*/
				754	void blk_mq_run_request(struct request *rq, bool run_queue, bool async)
				755	{
				756	struct request_queue *q = rq->q;
				757	struct blk_mq_hw_ctx *hctx;
				758	struct blk_mq_ctx ctx, current_ctx;
				759
				760	current_ctx = blk_mq_get_ctx(q);
				761
				762	ctx = rq->mq_ctx;
				763	if (!cpu_online(ctx->cpu)) {
				764	ctx = current_ctx;
				765	rq->mq_ctx = ctx;
				766	}
				767	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				768
				769	/* ctx->cpu might be offline */
				770	spin_lock(&ctx->lock);
				771	__blk_mq_insert_request(hctx, rq);
				772	spin_unlock(&ctx->lock);
				773
				774	blk_mq_put_ctx(current_ctx);
				775
				776	if (run_queue)
				777	blk_mq_run_hw_queue(hctx, async);
				778	}
				779
				780	static void blk_mq_insert_requests(struct request_queue *q,
				781	struct blk_mq_ctx *ctx,
				782	struct list_head *list,
				783	int depth,
				784	bool from_schedule)
				785
				786	{
				787	struct blk_mq_hw_ctx *hctx;
				788	struct blk_mq_ctx *current_ctx;
				789
				790	trace_block_unplug(q, depth, !from_schedule);
				791
				792	current_ctx = blk_mq_get_ctx(q);
				793
				794	if (!cpu_online(ctx->cpu))
				795	ctx = current_ctx;
				796	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				797
				798	/*
				799	* preemption doesn't flush plug list, so it's possible ctx->cpu is
				800	* offline now
				801	*/
				802	spin_lock(&ctx->lock);
				803	while (!list_empty(list)) {
				804	struct request *rq;
				805
				806	rq = list_first_entry(list, struct request, queuelist);
				807	list_del_init(&rq->queuelist);
				808	rq->mq_ctx = ctx;
				809	__blk_mq_insert_request(hctx, rq);
				810	}
				811	spin_unlock(&ctx->lock);
				812
				813	blk_mq_put_ctx(current_ctx);
				814
				815	blk_mq_run_hw_queue(hctx, from_schedule);
				816	}
				817
				818	static int plug_ctx_cmp(void priv, struct list_head a, struct list_head *b)
				819	{
				820	struct request *rqa = container_of(a, struct request, queuelist);
				821	struct request *rqb = container_of(b, struct request, queuelist);
				822
				823	return !(rqa->mq_ctx < rqb->mq_ctx \|\|
				824	(rqa->mq_ctx == rqb->mq_ctx &&
				825	blk_rq_pos(rqa) < blk_rq_pos(rqb)));
				826	}
				827
				828	void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
				829	{
				830	struct blk_mq_ctx *this_ctx;
				831	struct request_queue *this_q;
				832	struct request *rq;
				833	LIST_HEAD(list);
				834	LIST_HEAD(ctx_list);
				835	unsigned int depth;
				836
				837	list_splice_init(&plug->mq_list, &list);
				838
				839	list_sort(NULL, &list, plug_ctx_cmp);
				840
				841	this_q = NULL;
				842	this_ctx = NULL;
				843	depth = 0;
				844
				845	while (!list_empty(&list)) {
				846	rq = list_entry_rq(list.next);
				847	list_del_init(&rq->queuelist);
				848	BUG_ON(!rq->q);
				849	if (rq->mq_ctx != this_ctx) {
				850	if (this_ctx) {
				851	blk_mq_insert_requests(this_q, this_ctx,
				852	&ctx_list, depth,
				853	from_schedule);
				854	}
				855
				856	this_ctx = rq->mq_ctx;
				857	this_q = rq->q;
				858	depth = 0;
				859	}
				860
				861	depth++;
				862	list_add_tail(&rq->queuelist, &ctx_list);
				863	}
				864
				865	/*
				866	* If 'this_ctx' is set, we know we have entries to complete
				867	* on 'ctx_list'. Do those.
				868	*/
				869	if (this_ctx) {
				870	blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
				871	from_schedule);
				872	}
				873	}
				874
				875	static void blk_mq_bio_to_request(struct request rq, struct bio bio)
				876	{
				877	init_request_from_bio(rq, bio);
				878	blk_account_io_start(rq, 1);
				879	}
				880
				881	static void blk_mq_make_request(struct request_queue q, struct bio bio)
				882	{
				883	struct blk_mq_hw_ctx *hctx;
				884	struct blk_mq_ctx *ctx;
				885	const int is_sync = rw_is_sync(bio->bi_rw);
				886	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH \| REQ_FUA);
				887	int rw = bio_data_dir(bio);
				888	struct request *rq;
				889	unsigned int use_plug, request_count = 0;
				890
				891	/*
				892	* If we have multiple hardware queues, just go directly to
				893	* one of those for sync IO.
				894	*/
				895	use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) \|\| !is_sync);
				896
				897	blk_queue_bounce(q, &bio);
				898
				899	if (use_plug && blk_attempt_plug_merge(q, bio, &request_count))
				900	return;
				901
				902	if (blk_mq_queue_enter(q)) {
				903	bio_endio(bio, -EIO);
				904	return;
				905	}
				906
				907	ctx = blk_mq_get_ctx(q);
				908	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				909
				910	trace_block_getrq(q, bio, rw);
				911	rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false);
				912	if (likely(rq))
				913	blk_mq_rq_ctx_init(ctx, rq, rw);
				914	else {
				915	blk_mq_put_ctx(ctx);
				916	trace_block_sleeprq(q, bio, rw);
				917	rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT\|GFP_ATOMIC,
				918	false);
				919	ctx = rq->mq_ctx;
				920	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				921	}
				922
				923	hctx->queued++;
				924
				925	if (unlikely(is_flush_fua)) {
				926	blk_mq_bio_to_request(rq, bio);
				927	blk_mq_put_ctx(ctx);
				928	blk_insert_flush(rq);
				929	goto run_queue;
				930	}
				931
				932	/*
				933	* A task plug currently exists. Since this is completely lockless,
				934	* utilize that to temporarily store requests until the task is
				935	* either done or scheduled away.
				936	*/
				937	if (use_plug) {
				938	struct blk_plug *plug = current->plug;
				939
				940	if (plug) {
				941	blk_mq_bio_to_request(rq, bio);
				942	if (list_empty(&plug->list))
				943	trace_block_plug(q);
				944	else if (request_count >= BLK_MAX_REQUEST_COUNT) {
				945	blk_flush_plug_list(plug, false);
				946	trace_block_plug(q);
				947	}
				948	list_add_tail(&rq->queuelist, &plug->mq_list);
				949	blk_mq_put_ctx(ctx);
				950	return;
				951	}
				952	}
				953
				954	spin_lock(&ctx->lock);
				955
				956	if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
				957	blk_mq_attempt_merge(q, ctx, bio))
				958	__blk_mq_free_request(hctx, ctx, rq);
				959	else {
				960	blk_mq_bio_to_request(rq, bio);
				961	__blk_mq_insert_request(hctx, rq);
				962	}
				963
				964	spin_unlock(&ctx->lock);
				965	blk_mq_put_ctx(ctx);
				966
				967	/*
				968	* For a SYNC request, send it to the hardware immediately. For an
				969	* ASYNC request, just ensure that we run it later on. The latter
				970	* allows for merging opportunities and more efficient dispatching.
				971	*/
				972	run_queue:
				973	blk_mq_run_hw_queue(hctx, !is_sync \|\| is_flush_fua);
				974	}
				975
				976	/*
				977	* Default mapping to a software queue, since we use one per CPU.
				978	*/
				979	struct blk_mq_hw_ctx blk_mq_map_queue(struct request_queue q, const int cpu)
				980	{
				981	return q->queue_hw_ctx[q->mq_map[cpu]];
				982	}
				983	EXPORT_SYMBOL(blk_mq_map_queue);
				984
				985	struct blk_mq_hw_ctx blk_mq_alloc_single_hw_queue(struct blk_mq_reg reg,
				986	unsigned int hctx_index)
				987	{
				988	return kmalloc_node(sizeof(struct blk_mq_hw_ctx),
				989	GFP_KERNEL \| __GFP_ZERO, reg->numa_node);
				990	}
				991	EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);
				992
				993	void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx,
				994	unsigned int hctx_index)
				995	{
				996	kfree(hctx);
				997	}
				998	EXPORT_SYMBOL(blk_mq_free_single_hw_queue);
				999
				1000	static void blk_mq_hctx_notify(void *data, unsigned long action,
				1001	unsigned int cpu)
				1002	{
				1003	struct blk_mq_hw_ctx *hctx = data;
				1004	struct blk_mq_ctx *ctx;
				1005	LIST_HEAD(tmp);
				1006
				1007	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
				1008	return;
				1009
				1010	/*
				1011	* Move ctx entries to new CPU, if this one is going away.
				1012	*/
				1013	ctx = __blk_mq_get_ctx(hctx->queue, cpu);
				1014
				1015	spin_lock(&ctx->lock);
				1016	if (!list_empty(&ctx->rq_list)) {
				1017	list_splice_init(&ctx->rq_list, &tmp);
				1018	clear_bit(ctx->index_hw, hctx->ctx_map);
				1019	}
				1020	spin_unlock(&ctx->lock);
				1021
				1022	if (list_empty(&tmp))
				1023	return;
				1024
				1025	ctx = blk_mq_get_ctx(hctx->queue);
				1026	spin_lock(&ctx->lock);
				1027
				1028	while (!list_empty(&tmp)) {
				1029	struct request *rq;
				1030
				1031	rq = list_first_entry(&tmp, struct request, queuelist);
				1032	rq->mq_ctx = ctx;
				1033	list_move_tail(&rq->queuelist, &ctx->rq_list);
				1034	}
				1035
				1036	blk_mq_hctx_mark_pending(hctx, ctx);
				1037
				1038	spin_unlock(&ctx->lock);
				1039	blk_mq_put_ctx(ctx);
				1040	}
				1041
				1042	static void blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx,
				1043	void (init)(void , struct blk_mq_hw_ctx *,
				1044	struct request *, unsigned int),
				1045	void *data)
				1046	{
				1047	unsigned int i;
				1048
				1049	for (i = 0; i < hctx->queue_depth; i++) {
				1050	struct request *rq = hctx->rqs[i];
				1051
				1052	init(data, hctx, rq, i);
				1053	}
				1054	}
				1055
				1056	void blk_mq_init_commands(struct request_queue *q,
				1057	void (init)(void , struct blk_mq_hw_ctx *,
				1058	struct request *, unsigned int),
				1059	void *data)
				1060	{
				1061	struct blk_mq_hw_ctx *hctx;
				1062	unsigned int i;
				1063
				1064	queue_for_each_hw_ctx(q, hctx, i)
				1065	blk_mq_init_hw_commands(hctx, init, data);
				1066	}
				1067	EXPORT_SYMBOL(blk_mq_init_commands);
				1068
				1069	static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx)
				1070	{
				1071	struct page *page;
				1072
				1073	while (!list_empty(&hctx->page_list)) {
				1074	page = list_first_entry(&hctx->page_list, struct page, list);
				1075	list_del_init(&page->list);
				1076	__free_pages(page, page->private);
				1077	}
				1078
				1079	kfree(hctx->rqs);
				1080
				1081	if (hctx->tags)
				1082	blk_mq_free_tags(hctx->tags);
				1083	}
				1084
				1085	static size_t order_to_size(unsigned int order)
				1086	{
				1087	size_t ret = PAGE_SIZE;
				1088
				1089	while (order--)
				1090	ret *= 2;
				1091
				1092	return ret;
				1093	}
				1094
				1095	static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
				1096	unsigned int reserved_tags, int node)
				1097	{
				1098	unsigned int i, j, entries_per_page, max_order = 4;
				1099	size_t rq_size, left;
				1100
				1101	INIT_LIST_HEAD(&hctx->page_list);
				1102
				1103	hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *),
				1104	GFP_KERNEL, node);
				1105	if (!hctx->rqs)
				1106	return -ENOMEM;
				1107
				1108	/*
				1109	* rq_size is the size of the request plus driver payload, rounded
				1110	* to the cacheline size
				1111	*/
				1112	rq_size = round_up(sizeof(struct request) + hctx->cmd_size,
				1113	cache_line_size());
				1114	left = rq_size * hctx->queue_depth;
				1115
				1116	for (i = 0; i < hctx->queue_depth;) {
				1117	int this_order = max_order;
				1118	struct page *page;
				1119	int to_do;
				1120	void *p;
				1121
				1122	while (left < order_to_size(this_order - 1) && this_order)
				1123	this_order--;
				1124
				1125	do {
				1126	page = alloc_pages_node(node, GFP_KERNEL, this_order);
				1127	if (page)
				1128	break;
				1129	if (!this_order--)
				1130	break;
				1131	if (order_to_size(this_order) < rq_size)
				1132	break;
				1133	} while (1);
				1134
				1135	if (!page)
				1136	break;
				1137
				1138	page->private = this_order;
				1139	list_add_tail(&page->list, &hctx->page_list);
				1140
				1141	p = page_address(page);
				1142	entries_per_page = order_to_size(this_order) / rq_size;
				1143	to_do = min(entries_per_page, hctx->queue_depth - i);
				1144	left -= to_do * rq_size;
				1145	for (j = 0; j < to_do; j++) {
				1146	hctx->rqs[i] = p;
				1147	blk_mq_rq_init(hctx, hctx->rqs[i]);
				1148	p += rq_size;
				1149	i++;
				1150	}
				1151	}
				1152
				1153	if (i < (reserved_tags + BLK_MQ_TAG_MIN))
				1154	goto err_rq_map;
				1155	else if (i != hctx->queue_depth) {
				1156	hctx->queue_depth = i;
				1157	pr_warn("%s: queue depth set to %u because of low memory\n",
				1158	__func__, i);
				1159	}
				1160
				1161	hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node);
				1162	if (!hctx->tags) {
				1163	err_rq_map:
				1164	blk_mq_free_rq_map(hctx);
				1165	return -ENOMEM;
				1166	}
				1167
				1168	return 0;
				1169	}
				1170
				1171	static int blk_mq_init_hw_queues(struct request_queue *q,
				1172	struct blk_mq_reg reg, void driver_data)
				1173	{
				1174	struct blk_mq_hw_ctx *hctx;
				1175	unsigned int i, j;
				1176
				1177	/*
				1178	* Initialize hardware queues
				1179	*/
				1180	queue_for_each_hw_ctx(q, hctx, i) {
				1181	unsigned int num_maps;
				1182	int node;
				1183
				1184	node = hctx->numa_node;
				1185	if (node == NUMA_NO_NODE)
				1186	node = hctx->numa_node = reg->numa_node;
				1187
				1188	INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn);
				1189	spin_lock_init(&hctx->lock);
				1190	INIT_LIST_HEAD(&hctx->dispatch);
				1191	hctx->queue = q;
				1192	hctx->queue_num = i;
				1193	hctx->flags = reg->flags;
				1194	hctx->queue_depth = reg->queue_depth;
				1195	hctx->cmd_size = reg->cmd_size;
				1196
				1197	blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
				1198	blk_mq_hctx_notify, hctx);
				1199	blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
				1200
				1201	if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node))
				1202	break;
				1203
				1204	/*
				1205	* Allocate space for all possible cpus to avoid allocation in
				1206	* runtime
				1207	*/
				1208	hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
				1209	GFP_KERNEL, node);
				1210	if (!hctx->ctxs)
				1211	break;
				1212
				1213	num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG;
				1214	hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long),
				1215	GFP_KERNEL, node);
				1216	if (!hctx->ctx_map)
				1217	break;
				1218
				1219	hctx->nr_ctx_map = num_maps;
				1220	hctx->nr_ctx = 0;
				1221
				1222	if (reg->ops->init_hctx &&
				1223	reg->ops->init_hctx(hctx, driver_data, i))
				1224	break;
				1225	}
				1226
				1227	if (i == q->nr_hw_queues)
				1228	return 0;
				1229
				1230	/*
				1231	* Init failed
				1232	*/
				1233	queue_for_each_hw_ctx(q, hctx, j) {
				1234	if (i == j)
				1235	break;
				1236
				1237	if (reg->ops->exit_hctx)
				1238	reg->ops->exit_hctx(hctx, j);
				1239
				1240	blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
				1241	blk_mq_free_rq_map(hctx);
				1242	kfree(hctx->ctxs);
				1243	}
				1244
				1245	return 1;
				1246	}
				1247
				1248	static void blk_mq_init_cpu_queues(struct request_queue *q,
				1249	unsigned int nr_hw_queues)
				1250	{
				1251	unsigned int i;
				1252
				1253	for_each_possible_cpu(i) {
				1254	struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
				1255	struct blk_mq_hw_ctx *hctx;
				1256
				1257	memset(__ctx, 0, sizeof(*__ctx));
				1258	__ctx->cpu = i;
				1259	spin_lock_init(&__ctx->lock);
				1260	INIT_LIST_HEAD(&__ctx->rq_list);
				1261	__ctx->queue = q;
				1262
				1263	/* If the cpu isn't online, the cpu is mapped to first hctx */
				1264	hctx = q->mq_ops->map_queue(q, i);
				1265	hctx->nr_ctx++;
				1266
				1267	if (!cpu_online(i))
				1268	continue;
				1269
				1270	/*
				1271	* Set local node, IFF we have more than one hw queue. If
				1272	* not, we remain on the home node of the device
				1273	*/
				1274	if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
				1275	hctx->numa_node = cpu_to_node(i);
				1276	}
				1277	}
				1278
				1279	static void blk_mq_map_swqueue(struct request_queue *q)
				1280	{
				1281	unsigned int i;
				1282	struct blk_mq_hw_ctx *hctx;
				1283	struct blk_mq_ctx *ctx;
				1284
				1285	queue_for_each_hw_ctx(q, hctx, i) {
				1286	hctx->nr_ctx = 0;
				1287	}
				1288
				1289	/*
				1290	* Map software to hardware queues
				1291	*/
				1292	queue_for_each_ctx(q, ctx, i) {
				1293	/* If the cpu isn't online, the cpu is mapped to first hctx */
				1294	hctx = q->mq_ops->map_queue(q, i);
				1295	ctx->index_hw = hctx->nr_ctx;
				1296	hctx->ctxs[hctx->nr_ctx++] = ctx;
				1297	}
				1298	}
				1299
				1300	struct request_queue blk_mq_init_queue(struct blk_mq_reg reg,
				1301	void *driver_data)
				1302	{
				1303	struct blk_mq_hw_ctx **hctxs;
				1304	struct blk_mq_ctx *ctx;
				1305	struct request_queue *q;
				1306	int i;
				1307
				1308	if (!reg->nr_hw_queues \|\|
				1309	!reg->ops->queue_rq \|\| !reg->ops->map_queue \|\|
				1310	!reg->ops->alloc_hctx \|\| !reg->ops->free_hctx)
				1311	return ERR_PTR(-EINVAL);
				1312
				1313	if (!reg->queue_depth)
				1314	reg->queue_depth = BLK_MQ_MAX_DEPTH;
				1315	else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) {
				1316	pr_err("blk-mq: queuedepth too large (%u)\n", reg->queue_depth);
				1317	reg->queue_depth = BLK_MQ_MAX_DEPTH;
				1318	}
				1319
				1320	if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN))
				1321	return ERR_PTR(-EINVAL);
				1322
				1323	ctx = alloc_percpu(struct blk_mq_ctx);
				1324	if (!ctx)
				1325	return ERR_PTR(-ENOMEM);
				1326
				1327	hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
				1328	reg->numa_node);
				1329
				1330	if (!hctxs)
				1331	goto err_percpu;
				1332
				1333	for (i = 0; i < reg->nr_hw_queues; i++) {
				1334	hctxs[i] = reg->ops->alloc_hctx(reg, i);
				1335	if (!hctxs[i])
				1336	goto err_hctxs;
				1337
				1338	hctxs[i]->numa_node = NUMA_NO_NODE;
				1339	hctxs[i]->queue_num = i;
				1340	}
				1341
				1342	q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node);
				1343	if (!q)
				1344	goto err_hctxs;
				1345
				1346	q->mq_map = blk_mq_make_queue_map(reg);
				1347	if (!q->mq_map)
				1348	goto err_map;
				1349
				1350	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
				1351	blk_queue_rq_timeout(q, 30000);
				1352
				1353	q->nr_queues = nr_cpu_ids;
				1354	q->nr_hw_queues = reg->nr_hw_queues;
				1355
				1356	q->queue_ctx = ctx;
				1357	q->queue_hw_ctx = hctxs;
				1358
				1359	q->mq_ops = reg->ops;
				1360
				1361	blk_queue_make_request(q, blk_mq_make_request);
				1362	blk_queue_rq_timed_out(q, reg->ops->timeout);
				1363	if (reg->timeout)
				1364	blk_queue_rq_timeout(q, reg->timeout);
				1365
				1366	blk_mq_init_flush(q);
				1367	blk_mq_init_cpu_queues(q, reg->nr_hw_queues);
				1368
				1369	if (blk_mq_init_hw_queues(q, reg, driver_data))
				1370	goto err_hw;
				1371
				1372	blk_mq_map_swqueue(q);
				1373
				1374	mutex_lock(&all_q_mutex);
				1375	list_add_tail(&q->all_q_node, &all_q_list);
				1376	mutex_unlock(&all_q_mutex);
				1377
				1378	return q;
				1379	err_hw:
				1380	kfree(q->mq_map);
				1381	err_map:
				1382	blk_cleanup_queue(q);
				1383	err_hctxs:
				1384	for (i = 0; i < reg->nr_hw_queues; i++) {
				1385	if (!hctxs[i])
				1386	break;
				1387	reg->ops->free_hctx(hctxs[i], i);
				1388	}
				1389	kfree(hctxs);
				1390	err_percpu:
				1391	free_percpu(ctx);
				1392	return ERR_PTR(-ENOMEM);
				1393	}
				1394	EXPORT_SYMBOL(blk_mq_init_queue);
				1395
				1396	void blk_mq_free_queue(struct request_queue *q)
				1397	{
				1398	struct blk_mq_hw_ctx *hctx;
				1399	int i;
				1400
				1401	queue_for_each_hw_ctx(q, hctx, i) {
				1402	cancel_delayed_work_sync(&hctx->delayed_work);
				1403	kfree(hctx->ctx_map);
				1404	kfree(hctx->ctxs);
				1405	blk_mq_free_rq_map(hctx);
				1406	blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
				1407	if (q->mq_ops->exit_hctx)
				1408	q->mq_ops->exit_hctx(hctx, i);
				1409	q->mq_ops->free_hctx(hctx, i);
				1410	}
				1411
				1412	free_percpu(q->queue_ctx);
				1413	kfree(q->queue_hw_ctx);
				1414	kfree(q->mq_map);
				1415
				1416	q->queue_ctx = NULL;
				1417	q->queue_hw_ctx = NULL;
				1418	q->mq_map = NULL;
				1419
				1420	mutex_lock(&all_q_mutex);
				1421	list_del_init(&q->all_q_node);
				1422	mutex_unlock(&all_q_mutex);
				1423	}
				1424	EXPORT_SYMBOL(blk_mq_free_queue);
				1425
				1426	/* Basically redo blk_mq_init_queue with queue frozen */
				1427	static void __cpuinit blk_mq_queue_reinit(struct request_queue *q)
				1428	{
				1429	blk_mq_freeze_queue(q);
				1430
				1431	blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
				1432
				1433	/*
				1434	* redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
				1435	* we should change hctx numa_node according to new topology (this
				1436	* involves free and re-allocate memory, worthy doing?)
				1437	*/
				1438
				1439	blk_mq_map_swqueue(q);
				1440
				1441	blk_mq_unfreeze_queue(q);
				1442	}
				1443
				1444	static int __cpuinit blk_mq_queue_reinit_notify(struct notifier_block *nb,
				1445	unsigned long action, void *hcpu)
				1446	{
				1447	struct request_queue *q;
				1448
				1449	/*
				1450	* Before new mapping is established, hotadded cpu might already start
				1451	* handling requests. This doesn't break anything as we map offline
				1452	* CPUs to first hardware queue. We will re-init queue below to get
				1453	* optimal settings.
				1454	*/
				1455	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
				1456	action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
				1457	return NOTIFY_OK;
				1458
				1459	mutex_lock(&all_q_mutex);
				1460	list_for_each_entry(q, &all_q_list, all_q_node)
				1461	blk_mq_queue_reinit(q);
				1462	mutex_unlock(&all_q_mutex);
				1463	return NOTIFY_OK;
				1464	}
				1465
				1466	static int __init blk_mq_init(void)
				1467	{
				1468	unsigned int i;
				1469
				1470	for_each_possible_cpu(i)
				1471	init_llist_head(&per_cpu(ipi_lists, i));
				1472
				1473	blk_mq_cpu_init();
				1474
				1475	/* Must be called after percpu_counter_hotcpu_callback() */
				1476	hotcpu_notifier(blk_mq_queue_reinit_notify, -10);
				1477
				1478	return 0;
				1479	}
				1480	subsys_initcall(blk_mq_init);