Blame - block/blk-mq.c - kernel/msm-4.19

blob: ac804c6350409241f9cd764c8f097dfe02f0fe0b [file] [log] [blame]

Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1	#include <linux/kernel.h>
				2	#include <linux/module.h>
				3	#include <linux/backing-dev.h>
				4	#include <linux/bio.h>
				5	#include <linux/blkdev.h>
				6	#include <linux/mm.h>
				7	#include <linux/init.h>
				8	#include <linux/slab.h>
				9	#include <linux/workqueue.h>
				10	#include <linux/smp.h>
				11	#include <linux/llist.h>
				12	#include <linux/list_sort.h>
				13	#include <linux/cpu.h>
				14	#include <linux/cache.h>
				15	#include <linux/sched/sysctl.h>
				16	#include <linux/delay.h>
				17
				18	#include <trace/events/block.h>
				19
				20	#include <linux/blk-mq.h>
				21	#include "blk.h"
				22	#include "blk-mq.h"
				23	#include "blk-mq-tag.h"
				24
				25	static DEFINE_MUTEX(all_q_mutex);	/* NOTE(review): not referenced in this part of the file; presumably serialises all_q_list -- confirm */
				26	static LIST_HEAD(all_q_list);	/* NOTE(review): not referenced in this part of the file; presumably the list of all blk-mq queues -- confirm */
				27
				28	static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);	/* forward declaration: called before its definition further down */
				29
				30	DEFINE_PER_CPU(struct llist_head, ipi_lists);	/* per-CPU lockless lists of requests awaiting completion via IPI (see ipi_end_io / ipi_remote_cpu) */
				31
				32	static struct blk_mq_ctx __blk_mq_get_ctx(struct request_queue q,
				33	unsigned int cpu)
				34	{
				35	return per_cpu_ptr(q->queue_ctx, cpu);
				36	}
				37
				/*
				 * This assumes per-cpu software queueing queues. They could be per-node
				 * as well, for instance. For now this is hardcoded as-is. Note that we don't
				 * care about preemption, since we know the ctx's are persistent. This does
				 * mean that we can't rely on ctx always matching the currently running CPU.
				 *
				 * The CPU is obtained with get_cpu(); the matching put_cpu() is issued by
				 * blk_mq_put_ctx().
				 */
				static struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
				{
					return __blk_mq_get_ctx(q, get_cpu());
				}
				48
				/*
				 * Counterpart of blk_mq_get_ctx(). @ctx itself is not inspected; the only
				 * effect is the put_cpu() that pairs with the get_cpu() done on lookup.
				 */
				static void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
				{
					put_cpu();
				}
				53
				54	/*
				55	* Check if any of the ctx's have pending work in this hardware queue
				56	*/
				57	static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
				58	{
				59	unsigned int i;
				60
				61	for (i = 0; i < hctx->nr_ctx_map; i++)
				62	if (hctx->ctx_map[i])
				63	return true;
				64
				65	return false;
				66	}
				67
				68	/*
				69	* Mark this ctx as having pending work in this hardware queue
				70	*/
				71	static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
				72	struct blk_mq_ctx *ctx)
				73	{
				74	if (!test_bit(ctx->index_hw, hctx->ctx_map))
				75	set_bit(ctx->index_hw, hctx->ctx_map);
				76	}
				77
				78	static struct request blk_mq_alloc_rq(struct blk_mq_hw_ctx hctx, gfp_t gfp,
				79	bool reserved)
				80	{
				81	struct request *rq;
				82	unsigned int tag;
				83
				84	tag = blk_mq_get_tag(hctx->tags, gfp, reserved);
				85	if (tag != BLK_MQ_TAG_FAIL) {
				86	rq = hctx->rqs[tag];
				87	rq->tag = tag;
				88
				89	return rq;
				90	}
				91
				92	return NULL;
				93	}
				94
				95	static int blk_mq_queue_enter(struct request_queue *q)
				96	{
				97	int ret;
				98
				99	__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
				100	smp_wmb();
				101	/* we have problems to freeze the queue if it's initializing */
				102	if (!blk_queue_bypass(q) \|\| !blk_queue_init_done(q))
				103	return 0;
				104
				105	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
				106
				107	spin_lock_irq(q->queue_lock);
				108	ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq,
				109	!blk_queue_bypass(q), *q->queue_lock);
				110	/* inc usage with lock hold to avoid freeze_queue runs here */
				111	if (!ret)
				112	__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
				113	spin_unlock_irq(q->queue_lock);
				114
				115	return ret;
				116	}
				117
				118	static void blk_mq_queue_exit(struct request_queue *q)
				119	{
				120	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
				121	}
				122
				123	/*
				124	* Guarantee no request is in use, so we can change any data structure of
				125	* the queue afterward.
				126	*/
				127	static void blk_mq_freeze_queue(struct request_queue *q)
				128	{
				129	bool drain;
				130
				131	spin_lock_irq(q->queue_lock);
				132	drain = !q->bypass_depth++;
				133	queue_flag_set(QUEUE_FLAG_BYPASS, q);
				134	spin_unlock_irq(q->queue_lock);
				135
				136	if (!drain)
				137	return;
				138
				139	while (true) {
				140	s64 count;
				141
				142	spin_lock_irq(q->queue_lock);
				143	count = percpu_counter_sum(&q->mq_usage_counter);
				144	spin_unlock_irq(q->queue_lock);
				145
				146	if (count == 0)
				147	break;
				148	blk_mq_run_queues(q, false);
				149	msleep(10);
				150	}
				151	}
				152
				153	static void blk_mq_unfreeze_queue(struct request_queue *q)
				154	{
				155	bool wake = false;
				156
				157	spin_lock_irq(q->queue_lock);
				158	if (!--q->bypass_depth) {
				159	queue_flag_clear(QUEUE_FLAG_BYPASS, q);
				160	wake = true;
				161	}
				162	WARN_ON_ONCE(q->bypass_depth < 0);
				163	spin_unlock_irq(q->queue_lock);
				164	if (wake)
				165	wake_up_all(&q->mq_freeze_wq);
				166	}
				167
				168	bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
				169	{
				170	return blk_mq_has_free_tags(hctx->tags);
				171	}
				172	EXPORT_SYMBOL(blk_mq_can_queue);
				173
				174	static void blk_mq_rq_ctx_init(struct blk_mq_ctx ctx, struct request rq,
				175	unsigned int rw_flags)
				176	{
				177	rq->mq_ctx = ctx;
				178	rq->cmd_flags = rw_flags;
				179	ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
				180	}
				181
				182	static struct request __blk_mq_alloc_request(struct blk_mq_hw_ctx hctx,
				183	gfp_t gfp, bool reserved)
				184	{
				185	return blk_mq_alloc_rq(hctx, gfp, reserved);
				186	}
				187
				188	static struct request blk_mq_alloc_request_pinned(struct request_queue q,
				189	int rw, gfp_t gfp,
				190	bool reserved)
				191	{
				192	struct request *rq;
				193
				194	do {
				195	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
				196	struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
				197
				198	rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved);
				199	if (rq) {
				200	blk_mq_rq_ctx_init(ctx, rq, rw);
				201	break;
				202	} else if (!(gfp & __GFP_WAIT))
				203	break;
				204
				205	blk_mq_put_ctx(ctx);
				206	__blk_mq_run_hw_queue(hctx);
				207	blk_mq_wait_for_tags(hctx->tags);
				208	} while (1);
				209
				210	return rq;
				211	}
				212
				213	struct request blk_mq_alloc_request(struct request_queue q, int rw, gfp_t gfp)
				214	{
				215	struct request *rq;
				216
				217	if (blk_mq_queue_enter(q))
				218	return NULL;
				219
				220	rq = blk_mq_alloc_request_pinned(q, rw, gfp, false);
				221	blk_mq_put_ctx(rq->mq_ctx);
				222	return rq;
				223	}
				224
				225	struct request blk_mq_alloc_reserved_request(struct request_queue q, int rw,
				226	gfp_t gfp)
				227	{
				228	struct request *rq;
				229
				230	if (blk_mq_queue_enter(q))
				231	return NULL;
				232
				233	rq = blk_mq_alloc_request_pinned(q, rw, gfp, true);
				234	blk_mq_put_ctx(rq->mq_ctx);
				235	return rq;
				236	}
				237	EXPORT_SYMBOL(blk_mq_alloc_reserved_request);
				238
				239	/*
				240	* Re-init and set pdu, if we have it
				241	*/
				242	static void blk_mq_rq_init(struct blk_mq_hw_ctx hctx, struct request rq)
				243	{
				244	blk_rq_init(hctx->queue, rq);
				245
				246	if (hctx->cmd_size)
				247	rq->special = blk_mq_rq_to_pdu(rq);
				248	}
				249
				250	static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
				251	struct blk_mq_ctx ctx, struct request rq)
				252	{
				253	const int tag = rq->tag;
				254	struct request_queue *q = rq->q;
				255
				256	blk_mq_rq_init(hctx, rq);
				257	blk_mq_put_tag(hctx->tags, tag);
				258
				259	blk_mq_queue_exit(q);
				260	}
				261
				262	void blk_mq_free_request(struct request *rq)
				263	{
				264	struct blk_mq_ctx *ctx = rq->mq_ctx;
				265	struct blk_mq_hw_ctx *hctx;
				266	struct request_queue *q = rq->q;
				267
				268	ctx->rq_completed[rq_is_sync(rq)]++;
				269
				270	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				271	__blk_mq_free_request(hctx, ctx, rq);
				272	}
				273
				274	static void blk_mq_bio_endio(struct request rq, struct bio bio, int error)
				275	{
				276	if (error)
				277	clear_bit(BIO_UPTODATE, &bio->bi_flags);
				278	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
				279	error = -EIO;
				280
				281	if (unlikely(rq->cmd_flags & REQ_QUIET))
				282	set_bit(BIO_QUIET, &bio->bi_flags);
				283
				284	/* don't actually finish bio if it's part of flush sequence */
				285	if (!(rq->cmd_flags & REQ_FLUSH_SEQ))
				286	bio_endio(bio, error);
				287	}
				288
				289	void blk_mq_complete_request(struct request *rq, int error)
				290	{
				291	struct bio *bio = rq->bio;
				292	unsigned int bytes = 0;
				293
				294	trace_block_rq_complete(rq->q, rq);
				295
				296	while (bio) {
				297	struct bio *next = bio->bi_next;
				298
				299	bio->bi_next = NULL;
				300	bytes += bio->bi_size;
				301	blk_mq_bio_endio(rq, bio, error);
				302	bio = next;
				303	}
				304
				305	blk_account_io_completion(rq, bytes);
				306
				307	if (rq->end_io)
				308	rq->end_io(rq, error);
				309	else
				310	blk_mq_free_request(rq);
				311
				312	blk_account_io_done(rq);
				313	}
				314
				/*
				 * Complete @rq on the current CPU, unless blk_mark_rq_complete() reports
				 * (by returning non-zero) that it must not be completed here.
				 */
				void __blk_mq_end_io(struct request *rq, int error)
				{
					if (blk_mark_rq_complete(rq))
						return;

					blk_mq_complete_request(rq, error);
				}
				320
				321	#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS)
				322
				323	/*
				324	* Called with interrupts disabled.
				325	*/
				326	static void ipi_end_io(void *data)
				327	{
				328	struct llist_head *list = &per_cpu(ipi_lists, smp_processor_id());
				329	struct llist_node entry, next;
				330	struct request *rq;
				331
				332	entry = llist_del_all(list);
				333
				334	while (entry) {
				335	next = entry->next;
				336	rq = llist_entry(entry, struct request, ll_list);
				337	__blk_mq_end_io(rq, rq->errors);
				338	entry = next;
				339	}
				340	}
				341
				342	static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu,
				343	struct request *rq, const int error)
				344	{
				345	struct call_single_data *data = &rq->csd;
				346
				347	rq->errors = error;
				348	rq->ll_list.next = NULL;
				349
				350	/*
				351	* If the list is non-empty, an existing IPI must already
				352	* be "in flight". If that is the case, we need not schedule
				353	* a new one.
				354	*/
				355	if (llist_add(&rq->ll_list, &per_cpu(ipi_lists, ctx->cpu))) {
				356	data->func = ipi_end_io;
				357	data->flags = 0;
				358	__smp_call_function_single(ctx->cpu, data, 0);
				359	}
				360
				361	return true;
				362	}
				363	#else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */
				364	static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu,
				365	struct request *rq, const int error)
				366	{
				367	return false;
				368	}
				369	#endif
				370
				371	/*
				372	* End IO on this request on a multiqueue enabled driver. We'll either do
				373	* it directly inline, or punt to a local IPI handler on the matching
				374	* remote CPU.
				375	*/
				376	void blk_mq_end_io(struct request *rq, int error)
				377	{
				378	struct blk_mq_ctx *ctx = rq->mq_ctx;
				379	int cpu;
				380
				381	if (!ctx->ipi_redirect)
				382	return __blk_mq_end_io(rq, error);
				383
				384	cpu = get_cpu();
				385
				386	if (cpu == ctx->cpu \|\| !cpu_online(ctx->cpu) \|\|
				387	!ipi_remote_cpu(ctx, cpu, rq, error))
				388	__blk_mq_end_io(rq, error);
				389
				390	put_cpu();
				391	}
				392	EXPORT_SYMBOL(blk_mq_end_io);
				393
				394	static void blk_mq_start_request(struct request *rq)
				395	{
				396	struct request_queue *q = rq->q;
				397
				398	trace_block_rq_issue(q, rq);
				399
				400	/*
				401	* Just mark start time and set the started bit. Due to memory
				402	* ordering, we know we'll see the correct deadline as long as
				403	* REQ_ATOMIC_STARTED is seen.
				404	*/
				405	rq->deadline = jiffies + q->rq_timeout;
				406	set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
				407	}
				408
				409	static void blk_mq_requeue_request(struct request *rq)
				410	{
				411	struct request_queue *q = rq->q;
				412
				413	trace_block_rq_requeue(q, rq);
				414	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
				415	}
				416
				/*
				 * Arguments handed to blk_mq_timeout_check() through the busy-tag iterator.
				 */
				struct blk_mq_timeout_data {
					struct blk_mq_hw_ctx *hctx;	/* hardware queue being scanned */
					unsigned long *next;		/* forwarded to blk_rq_check_expired() */
					unsigned int *next_set;		/* forwarded to blk_rq_check_expired() */
				};
				422
				423	static void blk_mq_timeout_check(void __data, unsigned long free_tags)
				424	{
				425	struct blk_mq_timeout_data *data = __data;
				426	struct blk_mq_hw_ctx *hctx = data->hctx;
				427	unsigned int tag;
				428
				429	/* It may not be in flight yet (this is where
				430	* the REQ_ATOMIC_STARTED flag comes in). The requests are
				431	* statically allocated, so we know it's always safe to access the
				432	* memory associated with a bit offset into ->rqs[].
				433	*/
				434	tag = 0;
				435	do {
				436	struct request *rq;
				437
				438	tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag);
				439	if (tag >= hctx->queue_depth)
				440	break;
				441
				442	rq = hctx->rqs[tag++];
				443
				444	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
				445	continue;
				446
				447	blk_rq_check_expired(rq, data->next, data->next_set);
				448	} while (1);
				449	}
				450
				451	static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx,
				452	unsigned long *next,
				453	unsigned int *next_set)
				454	{
				455	struct blk_mq_timeout_data data = {
				456	.hctx = hctx,
				457	.next = next,
				458	.next_set = next_set,
				459	};
				460
				461	/*
				462	* Ask the tagging code to iterate busy requests, so we can
				463	* check them for timeout.
				464	*/
				465	blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data);
				466	}
				467
				468	static void blk_mq_rq_timer(unsigned long data)
				469	{
				470	struct request_queue q = (struct request_queue ) data;
				471	struct blk_mq_hw_ctx *hctx;
				472	unsigned long next = 0;
				473	int i, next_set = 0;
				474
				475	queue_for_each_hw_ctx(q, hctx, i)
				476	blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
				477
				478	if (next_set)
				479	mod_timer(&q->timeout, round_jiffies_up(next));
				480	}
				481
				482	/*
				483	* Reverse check our software queue for entries that we could potentially
				484	* merge with. Currently includes a hand-wavy stop count of 8, to not spend
				485	* too much time checking for merges.
				486	*/
				487	static bool blk_mq_attempt_merge(struct request_queue *q,
				488	struct blk_mq_ctx ctx, struct bio bio)
				489	{
				490	struct request *rq;
				491	int checked = 8;
				492
				493	list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
				494	int el_ret;
				495
				496	if (!checked--)
				497	break;
				498
				499	if (!blk_rq_merge_ok(rq, bio))
				500	continue;
				501
				502	el_ret = blk_try_merge(rq, bio);
				503	if (el_ret == ELEVATOR_BACK_MERGE) {
				504	if (bio_attempt_back_merge(q, rq, bio)) {
				505	ctx->rq_merged++;
				506	return true;
				507	}
				508	break;
				509	} else if (el_ret == ELEVATOR_FRONT_MERGE) {
				510	if (bio_attempt_front_merge(q, rq, bio)) {
				511	ctx->rq_merged++;
				512	return true;
				513	}
				514	break;
				515	}
				516	}
				517
				518	return false;
				519	}
				520
				521	void blk_mq_add_timer(struct request *rq)
				522	{
				523	__blk_add_timer(rq, NULL);
				524	}
				525
				526	/*
				527	* Run this hardware queue, pulling any software queues mapped to it in.
				528	* Note that this function currently has various problems around ordering
				529	* of IO. In particular, we'd like FIFO behaviour on handling existing
				530	* items on the hctx->dispatch list. Ignore that for now.
				531	*/
				532	static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
				533	{
				534	struct request_queue *q = hctx->queue;
				535	struct blk_mq_ctx *ctx;
				536	struct request *rq;
				537	LIST_HEAD(rq_list);
				538	int bit, queued;
				539
				540	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->flags)))
				541	return;
				542
				543	hctx->run++;
				544
				545	/*
				546	* Touch any software queue that has pending entries.
				547	*/
				548	for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) {
				549	clear_bit(bit, hctx->ctx_map);
				550	ctx = hctx->ctxs[bit];
				551	BUG_ON(bit != ctx->index_hw);
				552
				553	spin_lock(&ctx->lock);
				554	list_splice_tail_init(&ctx->rq_list, &rq_list);
				555	spin_unlock(&ctx->lock);
				556	}
				557
				558	/*
				559	* If we have previous entries on our dispatch list, grab them
				560	* and stuff them at the front for more fair dispatch.
				561	*/
				562	if (!list_empty_careful(&hctx->dispatch)) {
				563	spin_lock(&hctx->lock);
				564	if (!list_empty(&hctx->dispatch))
				565	list_splice_init(&hctx->dispatch, &rq_list);
				566	spin_unlock(&hctx->lock);
				567	}
				568
				569	/*
				570	* Delete and return all entries from our dispatch list
				571	*/
				572	queued = 0;
				573
				574	/*
				575	* Now process all the entries, sending them to the driver.
				576	*/
				577	while (!list_empty(&rq_list)) {
				578	int ret;
				579
				580	rq = list_first_entry(&rq_list, struct request, queuelist);
				581	list_del_init(&rq->queuelist);
				582	blk_mq_start_request(rq);
				583
				584	/*
				585	* Last request in the series. Flag it as such, this
				586	* enables drivers to know when IO should be kicked off,
				587	* if they don't do it on a per-request basis.
				588	*
				589	* Note: the flag isn't the only condition drivers
				590	* should do kick off. If drive is busy, the last
				591	* request might not have the bit set.
				592	*/
				593	if (list_empty(&rq_list))
				594	rq->cmd_flags \|= REQ_END;
				595
				596	ret = q->mq_ops->queue_rq(hctx, rq);
				597	switch (ret) {
				598	case BLK_MQ_RQ_QUEUE_OK:
				599	queued++;
				600	continue;
				601	case BLK_MQ_RQ_QUEUE_BUSY:
				602	/*
				603	* FIXME: we should have a mechanism to stop the queue
				604	* like blk_stop_queue, otherwise we will waste cpu
				605	* time
				606	*/
				607	list_add(&rq->queuelist, &rq_list);
				608	blk_mq_requeue_request(rq);
				609	break;
				610	default:
				611	pr_err("blk-mq: bad return on queue: %d\n", ret);
				612	rq->errors = -EIO;
				613	case BLK_MQ_RQ_QUEUE_ERROR:
				614	blk_mq_end_io(rq, rq->errors);
				615	break;
				616	}
				617
				618	if (ret == BLK_MQ_RQ_QUEUE_BUSY)
				619	break;
				620	}
				621
				622	if (!queued)
				623	hctx->dispatched[0]++;
				624	else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
				625	hctx->dispatched[ilog2(queued) + 1]++;
				626
				627	/*
				628	* Any items that need requeuing? Stuff them into hctx->dispatch,
				629	* that is where we will continue on next queue run.
				630	*/
				631	if (!list_empty(&rq_list)) {
				632	spin_lock(&hctx->lock);
				633	list_splice(&rq_list, &hctx->dispatch);
				634	spin_unlock(&hctx->lock);
				635	}
				636	}
				637
				638	void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
				639	{
				640	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->flags)))
				641	return;
				642
				643	if (!async)
				644	__blk_mq_run_hw_queue(hctx);
				645	else {
				646	struct request_queue *q = hctx->queue;
				647
				648	kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0);
				649	}
				650	}
				651
				652	void blk_mq_run_queues(struct request_queue *q, bool async)
				653	{
				654	struct blk_mq_hw_ctx *hctx;
				655	int i;
				656
				657	queue_for_each_hw_ctx(q, hctx, i) {
				658	if ((!blk_mq_hctx_has_pending(hctx) &&
				659	list_empty_careful(&hctx->dispatch)) \|\|
				660	test_bit(BLK_MQ_S_STOPPED, &hctx->flags))
				661	continue;
				662
				663	blk_mq_run_hw_queue(hctx, async);
				664	}
				665	}
				666	EXPORT_SYMBOL(blk_mq_run_queues);
				667
				668	void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
				669	{
				670	cancel_delayed_work(&hctx->delayed_work);
				671	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
				672	}
				673	EXPORT_SYMBOL(blk_mq_stop_hw_queue);
				674
Christoph Hellwig	280d45f	2013-10-25 14:45:58 +0100	[diff] [blame^]	675	void blk_mq_stop_hw_queues(struct request_queue *q)
				676	{
				677	struct blk_mq_hw_ctx *hctx;
				678	int i;
				679
				680	queue_for_each_hw_ctx(q, hctx, i)
				681	blk_mq_stop_hw_queue(hctx);
				682	}
				683	EXPORT_SYMBOL(blk_mq_stop_hw_queues);
				684
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	685	void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
				686	{
				687	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
				688	__blk_mq_run_hw_queue(hctx);
				689	}
				690	EXPORT_SYMBOL(blk_mq_start_hw_queue);
				691
				692	void blk_mq_start_stopped_hw_queues(struct request_queue *q)
				693	{
				694	struct blk_mq_hw_ctx *hctx;
				695	int i;
				696
				697	queue_for_each_hw_ctx(q, hctx, i) {
				698	if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
				699	continue;
				700
				701	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
				702	blk_mq_run_hw_queue(hctx, true);
				703	}
				704	}
				705	EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
				706
				707	static void blk_mq_work_fn(struct work_struct *work)
				708	{
				709	struct blk_mq_hw_ctx *hctx;
				710
				711	hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work);
				712	__blk_mq_run_hw_queue(hctx);
				713	}
				714
				715	static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
				716	struct request *rq)
				717	{
				718	struct blk_mq_ctx *ctx = rq->mq_ctx;
				719
				720	list_add_tail(&rq->queuelist, &ctx->rq_list);
				721	blk_mq_hctx_mark_pending(hctx, ctx);
				722
				723	/*
				724	* We do this early, to ensure we are on the right CPU.
				725	*/
				726	blk_mq_add_timer(rq);
				727	}
				728
				729	void blk_mq_insert_request(struct request_queue q, struct request rq,
				730	bool run_queue)
				731	{
				732	struct blk_mq_hw_ctx *hctx;
				733	struct blk_mq_ctx ctx, current_ctx;
				734
				735	ctx = rq->mq_ctx;
				736	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				737
				738	if (rq->cmd_flags & (REQ_FLUSH \| REQ_FUA)) {
				739	blk_insert_flush(rq);
				740	} else {
				741	current_ctx = blk_mq_get_ctx(q);
				742
				743	if (!cpu_online(ctx->cpu)) {
				744	ctx = current_ctx;
				745	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				746	rq->mq_ctx = ctx;
				747	}
				748	spin_lock(&ctx->lock);
				749	__blk_mq_insert_request(hctx, rq);
				750	spin_unlock(&ctx->lock);
				751
				752	blk_mq_put_ctx(current_ctx);
				753	}
				754
				755	if (run_queue)
				756	__blk_mq_run_hw_queue(hctx);
				757	}
				758	EXPORT_SYMBOL(blk_mq_insert_request);
				759
				760	/*
				761	* This is a special version of blk_mq_insert_request to bypass FLUSH request
				762	* check. Should only be used internally.
				763	*/
				764	void blk_mq_run_request(struct request *rq, bool run_queue, bool async)
				765	{
				766	struct request_queue *q = rq->q;
				767	struct blk_mq_hw_ctx *hctx;
				768	struct blk_mq_ctx ctx, current_ctx;
				769
				770	current_ctx = blk_mq_get_ctx(q);
				771
				772	ctx = rq->mq_ctx;
				773	if (!cpu_online(ctx->cpu)) {
				774	ctx = current_ctx;
				775	rq->mq_ctx = ctx;
				776	}
				777	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				778
				779	/* ctx->cpu might be offline */
				780	spin_lock(&ctx->lock);
				781	__blk_mq_insert_request(hctx, rq);
				782	spin_unlock(&ctx->lock);
				783
				784	blk_mq_put_ctx(current_ctx);
				785
				786	if (run_queue)
				787	blk_mq_run_hw_queue(hctx, async);
				788	}
				789
				790	static void blk_mq_insert_requests(struct request_queue *q,
				791	struct blk_mq_ctx *ctx,
				792	struct list_head *list,
				793	int depth,
				794	bool from_schedule)
				795
				796	{
				797	struct blk_mq_hw_ctx *hctx;
				798	struct blk_mq_ctx *current_ctx;
				799
				800	trace_block_unplug(q, depth, !from_schedule);
				801
				802	current_ctx = blk_mq_get_ctx(q);
				803
				804	if (!cpu_online(ctx->cpu))
				805	ctx = current_ctx;
				806	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				807
				808	/*
				809	* preemption doesn't flush plug list, so it's possible ctx->cpu is
				810	* offline now
				811	*/
				812	spin_lock(&ctx->lock);
				813	while (!list_empty(list)) {
				814	struct request *rq;
				815
				816	rq = list_first_entry(list, struct request, queuelist);
				817	list_del_init(&rq->queuelist);
				818	rq->mq_ctx = ctx;
				819	__blk_mq_insert_request(hctx, rq);
				820	}
				821	spin_unlock(&ctx->lock);
				822
				823	blk_mq_put_ctx(current_ctx);
				824
				825	blk_mq_run_hw_queue(hctx, from_schedule);
				826	}
				827
				828	static int plug_ctx_cmp(void priv, struct list_head a, struct list_head *b)
				829	{
				830	struct request *rqa = container_of(a, struct request, queuelist);
				831	struct request *rqb = container_of(b, struct request, queuelist);
				832
				833	return !(rqa->mq_ctx < rqb->mq_ctx \|\|
				834	(rqa->mq_ctx == rqb->mq_ctx &&
				835	blk_rq_pos(rqa) < blk_rq_pos(rqb)));
				836	}
				837
				838	void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
				839	{
				840	struct blk_mq_ctx *this_ctx;
				841	struct request_queue *this_q;
				842	struct request *rq;
				843	LIST_HEAD(list);
				844	LIST_HEAD(ctx_list);
				845	unsigned int depth;
				846
				847	list_splice_init(&plug->mq_list, &list);
				848
				849	list_sort(NULL, &list, plug_ctx_cmp);
				850
				851	this_q = NULL;
				852	this_ctx = NULL;
				853	depth = 0;
				854
				855	while (!list_empty(&list)) {
				856	rq = list_entry_rq(list.next);
				857	list_del_init(&rq->queuelist);
				858	BUG_ON(!rq->q);
				859	if (rq->mq_ctx != this_ctx) {
				860	if (this_ctx) {
				861	blk_mq_insert_requests(this_q, this_ctx,
				862	&ctx_list, depth,
				863	from_schedule);
				864	}
				865
				866	this_ctx = rq->mq_ctx;
				867	this_q = rq->q;
				868	depth = 0;
				869	}
				870
				871	depth++;
				872	list_add_tail(&rq->queuelist, &ctx_list);
				873	}
				874
				875	/*
				876	* If 'this_ctx' is set, we know we have entries to complete
				877	* on 'ctx_list'. Do those.
				878	*/
				879	if (this_ctx) {
				880	blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
				881	from_schedule);
				882	}
				883	}
				884
				/*
				 * Initialise @rq from @bio and start IO accounting for it.
				 */
				static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
				{
					init_request_from_bio(rq, bio);
					blk_account_io_start(rq, 1);
				}
				890
				891	static void blk_mq_make_request(struct request_queue q, struct bio bio)
				892	{
				893	struct blk_mq_hw_ctx *hctx;
				894	struct blk_mq_ctx *ctx;
				895	const int is_sync = rw_is_sync(bio->bi_rw);
				896	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH \| REQ_FUA);
				897	int rw = bio_data_dir(bio);
				898	struct request *rq;
				899	unsigned int use_plug, request_count = 0;
				900
				901	/*
				902	* If we have multiple hardware queues, just go directly to
				903	* one of those for sync IO.
				904	*/
				905	use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) \|\| !is_sync);
				906
				907	blk_queue_bounce(q, &bio);
				908
				909	if (use_plug && blk_attempt_plug_merge(q, bio, &request_count))
				910	return;
				911
				912	if (blk_mq_queue_enter(q)) {
				913	bio_endio(bio, -EIO);
				914	return;
				915	}
				916
				917	ctx = blk_mq_get_ctx(q);
				918	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				919
				920	trace_block_getrq(q, bio, rw);
				921	rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false);
				922	if (likely(rq))
				923	blk_mq_rq_ctx_init(ctx, rq, rw);
				924	else {
				925	blk_mq_put_ctx(ctx);
				926	trace_block_sleeprq(q, bio, rw);
				927	rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT\|GFP_ATOMIC,
				928	false);
				929	ctx = rq->mq_ctx;
				930	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				931	}
				932
				933	hctx->queued++;
				934
				935	if (unlikely(is_flush_fua)) {
				936	blk_mq_bio_to_request(rq, bio);
				937	blk_mq_put_ctx(ctx);
				938	blk_insert_flush(rq);
				939	goto run_queue;
				940	}
				941
				942	/*
				943	* A task plug currently exists. Since this is completely lockless,
				944	* utilize that to temporarily store requests until the task is
				945	* either done or scheduled away.
				946	*/
				947	if (use_plug) {
				948	struct blk_plug *plug = current->plug;
				949
				950	if (plug) {
				951	blk_mq_bio_to_request(rq, bio);
				952	if (list_empty(&plug->list))
				953	trace_block_plug(q);
				954	else if (request_count >= BLK_MAX_REQUEST_COUNT) {
				955	blk_flush_plug_list(plug, false);
				956	trace_block_plug(q);
				957	}
				958	list_add_tail(&rq->queuelist, &plug->mq_list);
				959	blk_mq_put_ctx(ctx);
				960	return;
				961	}
				962	}
				963
				964	spin_lock(&ctx->lock);
				965
				966	if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
				967	blk_mq_attempt_merge(q, ctx, bio))
				968	__blk_mq_free_request(hctx, ctx, rq);
				969	else {
				970	blk_mq_bio_to_request(rq, bio);
				971	__blk_mq_insert_request(hctx, rq);
				972	}
				973
				974	spin_unlock(&ctx->lock);
				975	blk_mq_put_ctx(ctx);
				976
				977	/*
				978	* For a SYNC request, send it to the hardware immediately. For an
				979	* ASYNC request, just ensure that we run it later on. The latter
				980	* allows for merging opportunities and more efficient dispatching.
				981	*/
				982	run_queue:
				983	blk_mq_run_hw_queue(hctx, !is_sync \|\| is_flush_fua);
				984	}
				985
				986	/*
				987	* Default mapping to a software queue, since we use one per CPU.
				988	*/
				989	struct blk_mq_hw_ctx blk_mq_map_queue(struct request_queue q, const int cpu)
				990	{
				991	return q->queue_hw_ctx[q->mq_map[cpu]];
				992	}
				993	EXPORT_SYMBOL(blk_mq_map_queue);
				994
				995	struct blk_mq_hw_ctx blk_mq_alloc_single_hw_queue(struct blk_mq_reg reg,
				996	unsigned int hctx_index)
				997	{
				998	return kmalloc_node(sizeof(struct blk_mq_hw_ctx),
				999	GFP_KERNEL \| __GFP_ZERO, reg->numa_node);
				1000	}
				1001	EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);
				1002
				/*
				 * Release a hardware queue context with kfree(). @hctx_index is unused.
				 */
				void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx,
								 unsigned int hctx_index)
				{
					kfree(hctx);
				}
				EXPORT_SYMBOL(blk_mq_free_single_hw_queue);
				1009
				1010	static void blk_mq_hctx_notify(void *data, unsigned long action,
				1011	unsigned int cpu)
				1012	{
				1013	struct blk_mq_hw_ctx *hctx = data;
				1014	struct blk_mq_ctx *ctx;
				1015	LIST_HEAD(tmp);
				1016
				1017	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
				1018	return;
				1019
				1020	/*
				1021	* Move ctx entries to new CPU, if this one is going away.
				1022	*/
				1023	ctx = __blk_mq_get_ctx(hctx->queue, cpu);
				1024
				1025	spin_lock(&ctx->lock);
				1026	if (!list_empty(&ctx->rq_list)) {
				1027	list_splice_init(&ctx->rq_list, &tmp);
				1028	clear_bit(ctx->index_hw, hctx->ctx_map);
				1029	}
				1030	spin_unlock(&ctx->lock);
				1031
				1032	if (list_empty(&tmp))
				1033	return;
				1034
				1035	ctx = blk_mq_get_ctx(hctx->queue);
				1036	spin_lock(&ctx->lock);
				1037
				1038	while (!list_empty(&tmp)) {
				1039	struct request *rq;
				1040
				1041	rq = list_first_entry(&tmp, struct request, queuelist);
				1042	rq->mq_ctx = ctx;
				1043	list_move_tail(&rq->queuelist, &ctx->rq_list);
				1044	}
				1045
				1046	blk_mq_hctx_mark_pending(hctx, ctx);
				1047
				1048	spin_unlock(&ctx->lock);
				1049	blk_mq_put_ctx(ctx);
				1050	}
				1051
				1052	static void blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx,
				1053	void (init)(void , struct blk_mq_hw_ctx *,
				1054	struct request *, unsigned int),
				1055	void *data)
				1056	{
				1057	unsigned int i;
				1058
				1059	for (i = 0; i < hctx->queue_depth; i++) {
				1060	struct request *rq = hctx->rqs[i];
				1061
				1062	init(data, hctx, rq, i);
				1063	}
				1064	}
				1065
				1066	void blk_mq_init_commands(struct request_queue *q,
				1067	void (init)(void , struct blk_mq_hw_ctx *,
				1068	struct request *, unsigned int),
				1069	void *data)
				1070	{
				1071	struct blk_mq_hw_ctx *hctx;
				1072	unsigned int i;
				1073
				1074	queue_for_each_hw_ctx(q, hctx, i)
				1075	blk_mq_init_hw_commands(hctx, init, data);
				1076	}
				1077	EXPORT_SYMBOL(blk_mq_init_commands);
				1078
				1079	static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx)
				1080	{
				1081	struct page *page;
				1082
				1083	while (!list_empty(&hctx->page_list)) {
				1084	page = list_first_entry(&hctx->page_list, struct page, list);
				1085	list_del_init(&page->list);
				1086	__free_pages(page, page->private);
				1087	}
				1088
				1089	kfree(hctx->rqs);
				1090
				1091	if (hctx->tags)
				1092	blk_mq_free_tags(hctx->tags);
				1093	}
				1094
				1095	static size_t order_to_size(unsigned int order)
				1096	{
				1097	size_t ret = PAGE_SIZE;
				1098
				1099	while (order--)
				1100	ret *= 2;
				1101
				1102	return ret;
				1103	}
				1104
				1105	static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
				1106	unsigned int reserved_tags, int node)
				1107	{
				1108	unsigned int i, j, entries_per_page, max_order = 4;
				1109	size_t rq_size, left;
				1110
				1111	INIT_LIST_HEAD(&hctx->page_list);
				1112
				1113	hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *),
				1114	GFP_KERNEL, node);
				1115	if (!hctx->rqs)
				1116	return -ENOMEM;
				1117
				1118	/*
				1119	* rq_size is the size of the request plus driver payload, rounded
				1120	* to the cacheline size
				1121	*/
				1122	rq_size = round_up(sizeof(struct request) + hctx->cmd_size,
				1123	cache_line_size());
				1124	left = rq_size * hctx->queue_depth;
				1125
				1126	for (i = 0; i < hctx->queue_depth;) {
				1127	int this_order = max_order;
				1128	struct page *page;
				1129	int to_do;
				1130	void *p;
				1131
				1132	while (left < order_to_size(this_order - 1) && this_order)
				1133	this_order--;
				1134
				1135	do {
				1136	page = alloc_pages_node(node, GFP_KERNEL, this_order);
				1137	if (page)
				1138	break;
				1139	if (!this_order--)
				1140	break;
				1141	if (order_to_size(this_order) < rq_size)
				1142	break;
				1143	} while (1);
				1144
				1145	if (!page)
				1146	break;
				1147
				1148	page->private = this_order;
				1149	list_add_tail(&page->list, &hctx->page_list);
				1150
				1151	p = page_address(page);
				1152	entries_per_page = order_to_size(this_order) / rq_size;
				1153	to_do = min(entries_per_page, hctx->queue_depth - i);
				1154	left -= to_do * rq_size;
				1155	for (j = 0; j < to_do; j++) {
				1156	hctx->rqs[i] = p;
				1157	blk_mq_rq_init(hctx, hctx->rqs[i]);
				1158	p += rq_size;
				1159	i++;
				1160	}
				1161	}
				1162
				1163	if (i < (reserved_tags + BLK_MQ_TAG_MIN))
				1164	goto err_rq_map;
				1165	else if (i != hctx->queue_depth) {
				1166	hctx->queue_depth = i;
				1167	pr_warn("%s: queue depth set to %u because of low memory\n",
				1168	__func__, i);
				1169	}
				1170
				1171	hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node);
				1172	if (!hctx->tags) {
				1173	err_rq_map:
				1174	blk_mq_free_rq_map(hctx);
				1175	return -ENOMEM;
				1176	}
				1177
				1178	return 0;
				1179	}
				1180
				1181	static int blk_mq_init_hw_queues(struct request_queue *q,
				1182	struct blk_mq_reg reg, void driver_data)
				1183	{
				1184	struct blk_mq_hw_ctx *hctx;
				1185	unsigned int i, j;
				1186
				1187	/*
				1188	* Initialize hardware queues
				1189	*/
				1190	queue_for_each_hw_ctx(q, hctx, i) {
				1191	unsigned int num_maps;
				1192	int node;
				1193
				1194	node = hctx->numa_node;
				1195	if (node == NUMA_NO_NODE)
				1196	node = hctx->numa_node = reg->numa_node;
				1197
				1198	INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn);
				1199	spin_lock_init(&hctx->lock);
				1200	INIT_LIST_HEAD(&hctx->dispatch);
				1201	hctx->queue = q;
				1202	hctx->queue_num = i;
				1203	hctx->flags = reg->flags;
				1204	hctx->queue_depth = reg->queue_depth;
				1205	hctx->cmd_size = reg->cmd_size;
				1206
				1207	blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
				1208	blk_mq_hctx_notify, hctx);
				1209	blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
				1210
				1211	if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node))
				1212	break;
				1213
				1214	/*
				1215	* Allocate space for all possible cpus to avoid allocation in
				1216	* runtime
				1217	*/
				1218	hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
				1219	GFP_KERNEL, node);
				1220	if (!hctx->ctxs)
				1221	break;
				1222
				1223	num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG;
				1224	hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long),
				1225	GFP_KERNEL, node);
				1226	if (!hctx->ctx_map)
				1227	break;
				1228
				1229	hctx->nr_ctx_map = num_maps;
				1230	hctx->nr_ctx = 0;
				1231
				1232	if (reg->ops->init_hctx &&
				1233	reg->ops->init_hctx(hctx, driver_data, i))
				1234	break;
				1235	}
				1236
				1237	if (i == q->nr_hw_queues)
				1238	return 0;
				1239
				1240	/*
				1241	* Init failed
				1242	*/
				1243	queue_for_each_hw_ctx(q, hctx, j) {
				1244	if (i == j)
				1245	break;
				1246
				1247	if (reg->ops->exit_hctx)
				1248	reg->ops->exit_hctx(hctx, j);
				1249
				1250	blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
				1251	blk_mq_free_rq_map(hctx);
				1252	kfree(hctx->ctxs);
				1253	}
				1254
				1255	return 1;
				1256	}
				1257
				1258	static void blk_mq_init_cpu_queues(struct request_queue *q,
				1259	unsigned int nr_hw_queues)
				1260	{
				1261	unsigned int i;
				1262
				1263	for_each_possible_cpu(i) {
				1264	struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
				1265	struct blk_mq_hw_ctx *hctx;
				1266
				1267	memset(__ctx, 0, sizeof(*__ctx));
				1268	__ctx->cpu = i;
				1269	spin_lock_init(&__ctx->lock);
				1270	INIT_LIST_HEAD(&__ctx->rq_list);
				1271	__ctx->queue = q;
				1272
				1273	/* If the cpu isn't online, the cpu is mapped to first hctx */
				1274	hctx = q->mq_ops->map_queue(q, i);
				1275	hctx->nr_ctx++;
				1276
				1277	if (!cpu_online(i))
				1278	continue;
				1279
				1280	/*
				1281	* Set local node, IFF we have more than one hw queue. If
				1282	* not, we remain on the home node of the device
				1283	*/
				1284	if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
				1285	hctx->numa_node = cpu_to_node(i);
				1286	}
				1287	}
				1288
				1289	static void blk_mq_map_swqueue(struct request_queue *q)
				1290	{
				1291	unsigned int i;
				1292	struct blk_mq_hw_ctx *hctx;
				1293	struct blk_mq_ctx *ctx;
				1294
				1295	queue_for_each_hw_ctx(q, hctx, i) {
				1296	hctx->nr_ctx = 0;
				1297	}
				1298
				1299	/*
				1300	* Map software to hardware queues
				1301	*/
				1302	queue_for_each_ctx(q, ctx, i) {
				1303	/* If the cpu isn't online, the cpu is mapped to first hctx */
				1304	hctx = q->mq_ops->map_queue(q, i);
				1305	ctx->index_hw = hctx->nr_ctx;
				1306	hctx->ctxs[hctx->nr_ctx++] = ctx;
				1307	}
				1308	}
				1309
				1310	struct request_queue blk_mq_init_queue(struct blk_mq_reg reg,
				1311	void *driver_data)
				1312	{
				1313	struct blk_mq_hw_ctx **hctxs;
				1314	struct blk_mq_ctx *ctx;
				1315	struct request_queue *q;
				1316	int i;
				1317
				1318	if (!reg->nr_hw_queues \|\|
				1319	!reg->ops->queue_rq \|\| !reg->ops->map_queue \|\|
				1320	!reg->ops->alloc_hctx \|\| !reg->ops->free_hctx)
				1321	return ERR_PTR(-EINVAL);
				1322
				1323	if (!reg->queue_depth)
				1324	reg->queue_depth = BLK_MQ_MAX_DEPTH;
				1325	else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) {
				1326	pr_err("blk-mq: queuedepth too large (%u)\n", reg->queue_depth);
				1327	reg->queue_depth = BLK_MQ_MAX_DEPTH;
				1328	}
				1329
				1330	if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN))
				1331	return ERR_PTR(-EINVAL);
				1332
				1333	ctx = alloc_percpu(struct blk_mq_ctx);
				1334	if (!ctx)
				1335	return ERR_PTR(-ENOMEM);
				1336
				1337	hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
				1338	reg->numa_node);
				1339
				1340	if (!hctxs)
				1341	goto err_percpu;
				1342
				1343	for (i = 0; i < reg->nr_hw_queues; i++) {
				1344	hctxs[i] = reg->ops->alloc_hctx(reg, i);
				1345	if (!hctxs[i])
				1346	goto err_hctxs;
				1347
				1348	hctxs[i]->numa_node = NUMA_NO_NODE;
				1349	hctxs[i]->queue_num = i;
				1350	}
				1351
				1352	q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node);
				1353	if (!q)
				1354	goto err_hctxs;
				1355
				1356	q->mq_map = blk_mq_make_queue_map(reg);
				1357	if (!q->mq_map)
				1358	goto err_map;
				1359
				1360	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
				1361	blk_queue_rq_timeout(q, 30000);
				1362
				1363	q->nr_queues = nr_cpu_ids;
				1364	q->nr_hw_queues = reg->nr_hw_queues;
				1365
				1366	q->queue_ctx = ctx;
				1367	q->queue_hw_ctx = hctxs;
				1368
				1369	q->mq_ops = reg->ops;
				1370
				1371	blk_queue_make_request(q, blk_mq_make_request);
				1372	blk_queue_rq_timed_out(q, reg->ops->timeout);
				1373	if (reg->timeout)
				1374	blk_queue_rq_timeout(q, reg->timeout);
				1375
				1376	blk_mq_init_flush(q);
				1377	blk_mq_init_cpu_queues(q, reg->nr_hw_queues);
				1378
				1379	if (blk_mq_init_hw_queues(q, reg, driver_data))
				1380	goto err_hw;
				1381
				1382	blk_mq_map_swqueue(q);
				1383
				1384	mutex_lock(&all_q_mutex);
				1385	list_add_tail(&q->all_q_node, &all_q_list);
				1386	mutex_unlock(&all_q_mutex);
				1387
				1388	return q;
				1389	err_hw:
				1390	kfree(q->mq_map);
				1391	err_map:
				1392	blk_cleanup_queue(q);
				1393	err_hctxs:
				1394	for (i = 0; i < reg->nr_hw_queues; i++) {
				1395	if (!hctxs[i])
				1396	break;
				1397	reg->ops->free_hctx(hctxs[i], i);
				1398	}
				1399	kfree(hctxs);
				1400	err_percpu:
				1401	free_percpu(ctx);
				1402	return ERR_PTR(-ENOMEM);
				1403	}
				1404	EXPORT_SYMBOL(blk_mq_init_queue);
				1405
				1406	void blk_mq_free_queue(struct request_queue *q)
				1407	{
				1408	struct blk_mq_hw_ctx *hctx;
				1409	int i;
				1410
				1411	queue_for_each_hw_ctx(q, hctx, i) {
				1412	cancel_delayed_work_sync(&hctx->delayed_work);
				1413	kfree(hctx->ctx_map);
				1414	kfree(hctx->ctxs);
				1415	blk_mq_free_rq_map(hctx);
				1416	blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
				1417	if (q->mq_ops->exit_hctx)
				1418	q->mq_ops->exit_hctx(hctx, i);
				1419	q->mq_ops->free_hctx(hctx, i);
				1420	}
				1421
				1422	free_percpu(q->queue_ctx);
				1423	kfree(q->queue_hw_ctx);
				1424	kfree(q->mq_map);
				1425
				1426	q->queue_ctx = NULL;
				1427	q->queue_hw_ctx = NULL;
				1428	q->mq_map = NULL;
				1429
				1430	mutex_lock(&all_q_mutex);
				1431	list_del_init(&q->all_q_node);
				1432	mutex_unlock(&all_q_mutex);
				1433	}
				1434	EXPORT_SYMBOL(blk_mq_free_queue);
				1435
				1436	/* Basically redo blk_mq_init_queue with queue frozen */
				1437	static void __cpuinit blk_mq_queue_reinit(struct request_queue *q)
				1438	{
				1439	blk_mq_freeze_queue(q);
				1440
				1441	blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
				1442
				1443	/*
				1444	* redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
				1445	* we should change hctx numa_node according to new topology (this
				1446	* involves free and re-allocate memory, worthy doing?)
				1447	*/
				1448
				1449	blk_mq_map_swqueue(q);
				1450
				1451	blk_mq_unfreeze_queue(q);
				1452	}
				1453
				1454	static int __cpuinit blk_mq_queue_reinit_notify(struct notifier_block *nb,
				1455	unsigned long action, void *hcpu)
				1456	{
				1457	struct request_queue *q;
				1458
				1459	/*
				1460	* Before new mapping is established, hotadded cpu might already start
				1461	* handling requests. This doesn't break anything as we map offline
				1462	* CPUs to first hardware queue. We will re-init queue below to get
				1463	* optimal settings.
				1464	*/
				1465	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
				1466	action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
				1467	return NOTIFY_OK;
				1468
				1469	mutex_lock(&all_q_mutex);
				1470	list_for_each_entry(q, &all_q_list, all_q_node)
				1471	blk_mq_queue_reinit(q);
				1472	mutex_unlock(&all_q_mutex);
				1473	return NOTIFY_OK;
				1474	}
				1475
				1476	static int __init blk_mq_init(void)
				1477	{
				1478	unsigned int i;
				1479
				1480	for_each_possible_cpu(i)
				1481	init_llist_head(&per_cpu(ipi_lists, i));
				1482
				1483	blk_mq_cpu_init();
				1484
				1485	/* Must be called after percpu_counter_hotcpu_callback() */
				1486	hotcpu_notifier(blk_mq_queue_reinit_notify, -10);
				1487
				1488	return 0;
				1489	}
				1490	subsys_initcall(blk_mq_init);