Blame - fs/bio.c - kernel/msm-4.9

blob: 711cee10360273cd76c535ed346060a7a32f7c29 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
Jens Axboe	0fe2347	2006-09-04 15:41:16 +0200	[diff] [blame]	2	* Copyright (C) 2001 Jens Axboe <axboe@kernel.dk>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3	*
				4	* This program is free software; you can redistribute it and/or modify
				5	* it under the terms of the GNU General Public License version 2 as
				6	* published by the Free Software Foundation.
				7	*
				8	* This program is distributed in the hope that it will be useful,
				9	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				10	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				11	* GNU General Public License for more details.
				12	*
				13	* You should have received a copy of the GNU General Public License
				14	* along with this program; if not, write to the Free Software
				15	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
				16	*
				17	*/
				18	#include <linux/mm.h>
				19	#include <linux/swap.h>
				20	#include <linux/bio.h>
				21	#include <linux/blkdev.h>
				22	#include <linux/slab.h>
				23	#include <linux/init.h>
				24	#include <linux/kernel.h>
				25	#include <linux/module.h>
				26	#include <linux/mempool.h>
				27	#include <linux/workqueue.h>
Jens Axboe	2056a78	2006-03-23 20:00:26 +0100	[diff] [blame]	28	#include <linux/blktrace_api.h>
Arnaldo Carvalho de Melo	5f3ea37	2008-10-30 08:34:33 +0100	[diff] [blame]	29	#include <trace/block.h>
James Bottomley	f1970ba	2005-06-20 14:06:52 +0200	[diff] [blame]	30	#include <scsi/sg.h> /* for struct sg_iovec */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	31
Ingo Molnar	0bfc245	2008-11-26 11:59:56 +0100	[diff] [blame]	32	DEFINE_TRACE(block_split);
				33
Jens Axboe	392ddc3	2008-12-23 12:42:54 +0100	[diff] [blame]	34	/*
				35	* Test patch to inline a certain number of bi_io_vec's inside the bio
				36	* itself, to shrink a bio data allocation from two mempool calls to one
				37	*/
				38	#define BIO_INLINE_VECS 4
				39
Denis ChengRq	6feef53	2008-10-09 08:57:05 +0200	[diff] [blame]	40	static mempool_t *bio_split_pool __read_mostly;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	41
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	42	/*
				43	* if you change this list, also change bvec_alloc or things will
				44	* break badly! cannot be bigger than what you can fit into an
				45	* unsigned short
				46	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	47	#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
Jens Axboe	bb799ca	2008-12-10 15:35:05 +0100	[diff] [blame]	48	struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	49	BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
				50	};
				51	#undef BV
				52
				53	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	54	* fs_bio_set is the bio_set containing bio and iovec memory pools used by
				55	* IO code that does not need private memory pools.
				56	*/
Martin K. Petersen	51d654e	2008-06-17 18:59:56 +0200	[diff] [blame]	57	struct bio_set *fs_bio_set;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	58
/*
 * Bookkeeping for the bio slab caches.
 *
 * Each entry describes one kmem_cache of a given object size together with
 * the number of users currently holding it. An entry whose ->slab is NULL
 * is free and may be reused by bio_find_or_create_slab().
 */
struct bio_slab {
	struct kmem_cache *slab;	/* the cache itself, NULL if slot unused */
	unsigned int slab_ref;		/* number of references on this cache */
	unsigned int slab_size;		/* object size the cache was created with */
	char name[8];			/* cache name, formatted as "bio-%d" */
};

/* Serialises every access to bio_slabs, bio_slab_nr and bio_slab_max. */
static DEFINE_MUTEX(bio_slab_lock);
/* Array of slab descriptors; grown with krealloc() when full. */
static struct bio_slab *bio_slabs;
/* Entries in use (including freed slots) and current array capacity. */
static unsigned int bio_slab_nr, bio_slab_max;
				71
				72	static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
				73	{
				74	unsigned int sz = sizeof(struct bio) + extra_size;
				75	struct kmem_cache *slab = NULL;
				76	struct bio_slab *bslab;
				77	unsigned int i, entry = -1;
				78
				79	mutex_lock(&bio_slab_lock);
				80
				81	i = 0;
				82	while (i < bio_slab_nr) {
				83	struct bio_slab *bslab = &bio_slabs[i];
				84
				85	if (!bslab->slab && entry == -1)
				86	entry = i;
				87	else if (bslab->slab_size == sz) {
				88	slab = bslab->slab;
				89	bslab->slab_ref++;
				90	break;
				91	}
				92	i++;
				93	}
				94
				95	if (slab)
				96	goto out_unlock;
				97
				98	if (bio_slab_nr == bio_slab_max && entry == -1) {
				99	bio_slab_max <<= 1;
				100	bio_slabs = krealloc(bio_slabs,
				101	bio_slab_max * sizeof(struct bio_slab),
				102	GFP_KERNEL);
				103	if (!bio_slabs)
				104	goto out_unlock;
				105	}
				106	if (entry == -1)
				107	entry = bio_slab_nr++;
				108
				109	bslab = &bio_slabs[entry];
				110
				111	snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
				112	slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL);
				113	if (!slab)
				114	goto out_unlock;
				115
				116	printk("bio: create slab <%s> at %d\n", bslab->name, entry);
				117	bslab->slab = slab;
				118	bslab->slab_ref = 1;
				119	bslab->slab_size = sz;
				120	out_unlock:
				121	mutex_unlock(&bio_slab_lock);
				122	return slab;
				123	}
				124
				125	static void bio_put_slab(struct bio_set *bs)
				126	{
				127	struct bio_slab *bslab = NULL;
				128	unsigned int i;
				129
				130	mutex_lock(&bio_slab_lock);
				131
				132	for (i = 0; i < bio_slab_nr; i++) {
				133	if (bs->bio_slab == bio_slabs[i].slab) {
				134	bslab = &bio_slabs[i];
				135	break;
				136	}
				137	}
				138
				139	if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n"))
				140	goto out;
				141
				142	WARN_ON(!bslab->slab_ref);
				143
				144	if (--bslab->slab_ref)
				145	goto out;
				146
				147	kmem_cache_destroy(bslab->slab);
				148	bslab->slab = NULL;
				149
				150	out:
				151	mutex_unlock(&bio_slab_lock);
				152	}
				153
Martin K. Petersen	7ba1ba1	2008-06-30 20:04:41 +0200	[diff] [blame]	154	unsigned int bvec_nr_vecs(unsigned short idx)
				155	{
				156	return bvec_slabs[idx].nr_vecs;
				157	}
				158
Jens Axboe	bb799ca	2008-12-10 15:35:05 +0100	[diff] [blame]	159	void bvec_free_bs(struct bio_set bs, struct bio_vec bv, unsigned int idx)
				160	{
				161	BIO_BUG_ON(idx >= BIOVEC_NR_POOLS);
				162
				163	if (idx == BIOVEC_MAX_IDX)
				164	mempool_free(bv, bs->bvec_pool);
				165	else {
				166	struct biovec_slab *bvs = bvec_slabs + idx;
				167
				168	kmem_cache_free(bvs->slab, bv);
				169	}
				170	}
				171
Jens Axboe	7ff9345	2008-12-11 11:53:43 +0100	[diff] [blame]	172	struct bio_vec bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long idx,
				173	struct bio_set *bs)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	174	{
				175	struct bio_vec *bvl;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	176
				177	/*
Jens Axboe	0a0d96b	2008-09-11 13:17:37 +0200	[diff] [blame]	178	* If 'bs' is given, lookup the pool and do the mempool alloc.
				179	* If not, this is a bio_kmalloc() allocation and just do a
				180	* kzalloc() for the exact number of vecs right away.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	181	*/
Jens Axboe	7ff9345	2008-12-11 11:53:43 +0100	[diff] [blame]	182	if (!bs)
Jens Axboe	d3f7611	2008-12-23 12:46:21 +0100	[diff] [blame]	183	bvl = kmalloc(nr * sizeof(struct bio_vec), gfp_mask);
Jens Axboe	7ff9345	2008-12-11 11:53:43 +0100	[diff] [blame]	184
				185	/*
				186	* see comment near bvec_array define!
				187	*/
				188	switch (nr) {
				189	case 1:
				190	*idx = 0;
				191	break;
				192	case 2 ... 4:
				193	*idx = 1;
				194	break;
				195	case 5 ... 16:
				196	*idx = 2;
				197	break;
				198	case 17 ... 64:
				199	*idx = 3;
				200	break;
				201	case 65 ... 128:
				202	*idx = 4;
				203	break;
				204	case 129 ... BIO_MAX_PAGES:
				205	*idx = 5;
				206	break;
				207	default:
				208	return NULL;
				209	}
				210
				211	/*
				212	* idx now points to the pool we want to allocate from. only the
				213	* 1-vec entry pool is mempool backed.
				214	*/
				215	if (*idx == BIOVEC_MAX_IDX) {
				216	fallback:
				217	bvl = mempool_alloc(bs->bvec_pool, gfp_mask);
				218	} else {
				219	struct biovec_slab bvs = bvec_slabs + idx;
				220	gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT \| __GFP_IO);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	221
Jens Axboe	0a0d96b	2008-09-11 13:17:37 +0200	[diff] [blame]	222	/*
Jens Axboe	7ff9345	2008-12-11 11:53:43 +0100	[diff] [blame]	223	* Make this allocation restricted and don't dump info on
				224	* allocation failures, since we'll fallback to the mempool
				225	* in case of failure.
Jens Axboe	0a0d96b	2008-09-11 13:17:37 +0200	[diff] [blame]	226	*/
Jens Axboe	7ff9345	2008-12-11 11:53:43 +0100	[diff] [blame]	227	__gfp_mask \|= __GFP_NOMEMALLOC \| __GFP_NORETRY \| __GFP_NOWARN;
				228
				229	/*
				230	* Try a slab allocation. If this fails and __GFP_WAIT
				231	* is set, retry with the 1-entry mempool
				232	*/
				233	bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
				234	if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) {
				235	*idx = BIOVEC_MAX_IDX;
				236	goto fallback;
				237	}
				238	}
				239
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	240	return bvl;
				241	}
				242
Jens Axboe	7ff9345	2008-12-11 11:53:43 +0100	[diff] [blame]	243	void bio_free(struct bio bio, struct bio_set bs)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	244	{
Jens Axboe	bb799ca	2008-12-10 15:35:05 +0100	[diff] [blame]	245	void *p;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	246
Jens Axboe	392ddc3	2008-12-23 12:42:54 +0100	[diff] [blame]	247	if (bio_has_allocated_vec(bio))
Jens Axboe	bb799ca	2008-12-10 15:35:05 +0100	[diff] [blame]	248	bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
Jens Axboe	992c5dd	2007-07-18 13:18:08 +0200	[diff] [blame]	249
Martin K. Petersen	7ba1ba1	2008-06-30 20:04:41 +0200	[diff] [blame]	250	if (bio_integrity(bio))
Jens Axboe	7ff9345	2008-12-11 11:53:43 +0100	[diff] [blame]	251	bio_integrity_free(bio, bs);
Martin K. Petersen	7ba1ba1	2008-06-30 20:04:41 +0200	[diff] [blame]	252
Jens Axboe	bb799ca	2008-12-10 15:35:05 +0100	[diff] [blame]	253	/*
				254	* If we have front padding, adjust the bio pointer before freeing
				255	*/
				256	p = bio;
				257	if (bs->front_pad)
				258	p -= bs->front_pad;
				259
				260	mempool_free(p, bs->bio_pool);
Peter Osterlund	3676347	2005-09-06 15:16:42 -0700	[diff] [blame]	261	}
				262
				263	/*
				264	* default destructor for a bio allocated with bio_alloc_bioset()
				265	*/
				266	static void bio_fs_destructor(struct bio *bio)
				267	{
				268	bio_free(bio, fs_bio_set);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	269	}
				270
Jens Axboe	0a0d96b	2008-09-11 13:17:37 +0200	[diff] [blame]	271	static void bio_kmalloc_destructor(struct bio *bio)
				272	{
Jens Axboe	392ddc3	2008-12-23 12:42:54 +0100	[diff] [blame]	273	if (bio_has_allocated_vec(bio))
				274	kfree(bio->bi_io_vec);
Jens Axboe	0a0d96b	2008-09-11 13:17:37 +0200	[diff] [blame]	275	kfree(bio);
				276	}
				277
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	278	void bio_init(struct bio *bio)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	279	{
Jens Axboe	2b94de5	2007-07-18 13:14:03 +0200	[diff] [blame]	280	memset(bio, 0, sizeof(*bio));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	281	bio->bi_flags = 1 << BIO_UPTODATE;
Jens Axboe	c7c22e4	2008-09-13 20:26:01 +0200	[diff] [blame]	282	bio->bi_comp_cpu = -1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	283	atomic_set(&bio->bi_cnt, 1);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	284	}
				285
				286	/**
				287	* bio_alloc_bioset - allocate a bio for I/O
				288	* @gfp_mask: the GFP_ mask given to the slab allocator
				289	* @nr_iovecs: number of iovecs to pre-allocate
Jens Axboe	0a0d96b	2008-09-11 13:17:37 +0200	[diff] [blame]	290	* @bs: the bio_set to allocate from. If %NULL, just use kmalloc
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	291	*
				292	* Description:
Jens Axboe	0a0d96b	2008-09-11 13:17:37 +0200	[diff] [blame]	293	* bio_alloc_bioset will first try its own mempool to satisfy the allocation.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	294	* If %__GFP_WAIT is set then we will block on the internal pool waiting
Jens Axboe	0a0d96b	2008-09-11 13:17:37 +0200	[diff] [blame]	295	* for a &struct bio to become free. If a %NULL @bs is passed in, we will
				296	* fall back to just using @kmalloc to allocate the required memory.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	297	*
Jens Axboe	bb799ca	2008-12-10 15:35:05 +0100	[diff] [blame]	298	* Note that the caller must set ->bi_destructor on succesful return
				299	* of a bio, to do the appropriate freeing of the bio once the reference
				300	* count drops to zero.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	301	**/
Al Viro	dd0fc66	2005-10-07 07:46:04 +0100	[diff] [blame]	302	struct bio bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set bs)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	303	{
Jens Axboe	bb799ca	2008-12-10 15:35:05 +0100	[diff] [blame]	304	struct bio *bio = NULL;
Jens Axboe	0a0d96b	2008-09-11 13:17:37 +0200	[diff] [blame]	305
Jens Axboe	bb799ca	2008-12-10 15:35:05 +0100	[diff] [blame]	306	if (bs) {
				307	void *p = mempool_alloc(bs->bio_pool, gfp_mask);
				308
				309	if (p)
				310	bio = p + bs->front_pad;
				311	} else
Jens Axboe	0a0d96b	2008-09-11 13:17:37 +0200	[diff] [blame]	312	bio = kmalloc(sizeof(*bio), gfp_mask);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	313
				314	if (likely(bio)) {
				315	struct bio_vec *bvl = NULL;
				316
				317	bio_init(bio);
				318	if (likely(nr_iovecs)) {
Jens Axboe	eeae1d4	2008-05-07 13:26:27 +0200	[diff] [blame]	319	unsigned long uninitialized_var(idx);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	320
Jens Axboe	392ddc3	2008-12-23 12:42:54 +0100	[diff] [blame]	321	if (nr_iovecs <= BIO_INLINE_VECS) {
				322	idx = 0;
				323	bvl = bio->bi_inline_vecs;
				324	nr_iovecs = BIO_INLINE_VECS;
Jens Axboe	392ddc3	2008-12-23 12:42:54 +0100	[diff] [blame]	325	} else {
				326	bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx,
				327	bs);
				328	nr_iovecs = bvec_nr_vecs(idx);
				329	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	330	if (unlikely(!bvl)) {
Jens Axboe	0a0d96b	2008-09-11 13:17:37 +0200	[diff] [blame]	331	if (bs)
				332	mempool_free(bio, bs->bio_pool);
				333	else
				334	kfree(bio);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	335	bio = NULL;
				336	goto out;
				337	}
				338	bio->bi_flags \|= idx << BIO_POOL_OFFSET;
Jens Axboe	392ddc3	2008-12-23 12:42:54 +0100	[diff] [blame]	339	bio->bi_max_vecs = nr_iovecs;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	340	}
				341	bio->bi_io_vec = bvl;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	342	}
				343	out:
				344	return bio;
				345	}
				346
Al Viro	dd0fc66	2005-10-07 07:46:04 +0100	[diff] [blame]	347	struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	348	{
Peter Osterlund	3676347	2005-09-06 15:16:42 -0700	[diff] [blame]	349	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
				350
				351	if (bio)
				352	bio->bi_destructor = bio_fs_destructor;
				353
				354	return bio;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	355	}
				356
Jens Axboe	0a0d96b	2008-09-11 13:17:37 +0200	[diff] [blame]	357	/*
				358	* Like bio_alloc(), but doesn't use a mempool backing. This means that
				359	* it CAN fail, but while bio_alloc() can only be used for allocations
				360	* that have a short (finite) life span, bio_kmalloc() should be used
				361	* for more permanent bio allocations (like allocating some bio's for
				362	* initalization or setup purposes).
				363	*/
				364	struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
				365	{
				366	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
				367
				368	if (bio)
				369	bio->bi_destructor = bio_kmalloc_destructor;
				370
				371	return bio;
				372	}
				373
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	374	void zero_fill_bio(struct bio *bio)
				375	{
				376	unsigned long flags;
				377	struct bio_vec *bv;
				378	int i;
				379
				380	bio_for_each_segment(bv, bio, i) {
				381	char *data = bvec_kmap_irq(bv, &flags);
				382	memset(data, 0, bv->bv_len);
				383	flush_dcache_page(bv->bv_page);
				384	bvec_kunmap_irq(data, &flags);
				385	}
				386	}
				387	EXPORT_SYMBOL(zero_fill_bio);
				388
				389	/**
				390	* bio_put - release a reference to a bio
				391	* @bio: bio to release reference to
				392	*
				393	* Description:
				394	* Put a reference to a &struct bio, either one you have gotten with
				395	* bio_alloc or bio_get. The last put of a bio will free it.
				396	**/
				397	void bio_put(struct bio *bio)
				398	{
				399	BIO_BUG_ON(!atomic_read(&bio->bi_cnt));
				400
				401	/*
				402	* last put frees it
				403	*/
				404	if (atomic_dec_and_test(&bio->bi_cnt)) {
				405	bio->bi_next = NULL;
				406	bio->bi_destructor(bio);
				407	}
				408	}
				409
Jens Axboe	165125e	2007-07-24 09:28:11 +0200	[diff] [blame]	410	inline int bio_phys_segments(struct request_queue q, struct bio bio)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	411	{
				412	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
				413	blk_recount_segments(q, bio);
				414
				415	return bio->bi_phys_segments;
				416	}
				417
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	418	/**
				419	* __bio_clone - clone a bio
				420	* @bio: destination bio
				421	* @bio_src: bio to clone
				422	*
				423	* Clone a &bio. Caller will own the returned bio, but not
				424	* the actual data it points to. Reference count of returned
				425	* bio will be one.
				426	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	427	void __bio_clone(struct bio bio, struct bio bio_src)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	428	{
Andrew Morton	e525e15	2005-08-07 09:42:12 -0700	[diff] [blame]	429	memcpy(bio->bi_io_vec, bio_src->bi_io_vec,
				430	bio_src->bi_max_vecs * sizeof(struct bio_vec));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	431
Jens Axboe	5d84070	2008-01-25 12:44:44 +0100	[diff] [blame]	432	/*
				433	* most users will be overriding ->bi_bdev with a new target,
				434	* so we don't set nor calculate new physical/hw segment counts here
				435	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	436	bio->bi_sector = bio_src->bi_sector;
				437	bio->bi_bdev = bio_src->bi_bdev;
				438	bio->bi_flags \|= 1 << BIO_CLONED;
				439	bio->bi_rw = bio_src->bi_rw;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	440	bio->bi_vcnt = bio_src->bi_vcnt;
				441	bio->bi_size = bio_src->bi_size;
Andrew Morton	a5453be	2005-07-28 01:07:18 -0700	[diff] [blame]	442	bio->bi_idx = bio_src->bi_idx;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	443	}
				444
				445	/**
				446	* bio_clone - clone a bio
				447	* @bio: bio to clone
				448	* @gfp_mask: allocation priority
				449	*
				450	* Like __bio_clone, only also allocates the returned bio
				451	*/
Al Viro	dd0fc66	2005-10-07 07:46:04 +0100	[diff] [blame]	452	struct bio bio_clone(struct bio bio, gfp_t gfp_mask)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	453	{
				454	struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);
				455
Martin K. Petersen	7ba1ba1	2008-06-30 20:04:41 +0200	[diff] [blame]	456	if (!b)
				457	return NULL;
				458
				459	b->bi_destructor = bio_fs_destructor;
				460	__bio_clone(b, bio);
				461
				462	if (bio_integrity(bio)) {
				463	int ret;
				464
				465	ret = bio_integrity_clone(b, bio, fs_bio_set);
				466
				467	if (ret < 0)
				468	return NULL;
Peter Osterlund	3676347	2005-09-06 15:16:42 -0700	[diff] [blame]	469	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	470
				471	return b;
				472	}
				473
				474	/**
				475	* bio_get_nr_vecs - return approx number of vecs
				476	* @bdev: I/O target
				477	*
				478	* Return the approximate number of pages we can send to this target.
				479	* There's no guarantee that you will be able to fit this number of pages
				480	* into a bio, it does not account for dynamic restrictions that vary
				481	* on offset.
				482	*/
				483	int bio_get_nr_vecs(struct block_device *bdev)
				484	{
Jens Axboe	165125e	2007-07-24 09:28:11 +0200	[diff] [blame]	485	struct request_queue *q = bdev_get_queue(bdev);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	486	int nr_pages;
				487
				488	nr_pages = ((q->max_sectors << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
				489	if (nr_pages > q->max_phys_segments)
				490	nr_pages = q->max_phys_segments;
				491	if (nr_pages > q->max_hw_segments)
				492	nr_pages = q->max_hw_segments;
				493
				494	return nr_pages;
				495	}
				496
Jens Axboe	165125e	2007-07-24 09:28:11 +0200	[diff] [blame]	497	static int __bio_add_page(struct request_queue q, struct bio bio, struct page
Mike Christie	defd94b	2005-12-05 02:37:06 -0600	[diff] [blame]	498	*page, unsigned int len, unsigned int offset,
				499	unsigned short max_sectors)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	500	{
				501	int retried_segments = 0;
				502	struct bio_vec *bvec;
				503
				504	/*
				505	* cloned bio must not modify vec list
				506	*/
				507	if (unlikely(bio_flagged(bio, BIO_CLONED)))
				508	return 0;
				509
Jens Axboe	80cfd54	2006-01-06 09:43:28 +0100	[diff] [blame]	510	if (((bio->bi_size + len) >> 9) > max_sectors)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	511	return 0;
				512
Jens Axboe	80cfd54	2006-01-06 09:43:28 +0100	[diff] [blame]	513	/*
				514	* For filesystems with a blocksize smaller than the pagesize
				515	* we will often be called with the same page as last time and
				516	* a consecutive offset. Optimize this special case.
				517	*/
				518	if (bio->bi_vcnt > 0) {
				519	struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
				520
				521	if (page == prev->bv_page &&
				522	offset == prev->bv_offset + prev->bv_len) {
				523	prev->bv_len += len;
Alasdair G Kergon	cc371e6	2008-07-03 09:53:43 +0200	[diff] [blame]	524
				525	if (q->merge_bvec_fn) {
				526	struct bvec_merge_data bvm = {
				527	.bi_bdev = bio->bi_bdev,
				528	.bi_sector = bio->bi_sector,
				529	.bi_size = bio->bi_size,
				530	.bi_rw = bio->bi_rw,
				531	};
				532
				533	if (q->merge_bvec_fn(q, &bvm, prev) < len) {
				534	prev->bv_len -= len;
				535	return 0;
				536	}
Jens Axboe	80cfd54	2006-01-06 09:43:28 +0100	[diff] [blame]	537	}
				538
				539	goto done;
				540	}
				541	}
				542
				543	if (bio->bi_vcnt >= bio->bi_max_vecs)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	544	return 0;
				545
				546	/*
				547	* we might lose a segment or two here, but rather that than
				548	* make this too complex.
				549	*/
				550
				551	while (bio->bi_phys_segments >= q->max_phys_segments
Mikulas Patocka	5df97b9	2008-08-15 10:20:02 +0200	[diff] [blame]	552	\|\| bio->bi_phys_segments >= q->max_hw_segments) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	553
				554	if (retried_segments)
				555	return 0;
				556
				557	retried_segments = 1;
				558	blk_recount_segments(q, bio);
				559	}
				560
				561	/*
				562	* setup the new entry, we might clear it again later if we
				563	* cannot add the page
				564	*/
				565	bvec = &bio->bi_io_vec[bio->bi_vcnt];
				566	bvec->bv_page = page;
				567	bvec->bv_len = len;
				568	bvec->bv_offset = offset;
				569
				570	/*
				571	* if queue has other restrictions (eg varying max sector size
				572	* depending on offset), it can specify a merge_bvec_fn in the
				573	* queue to get further control
				574	*/
				575	if (q->merge_bvec_fn) {
Alasdair G Kergon	cc371e6	2008-07-03 09:53:43 +0200	[diff] [blame]	576	struct bvec_merge_data bvm = {
				577	.bi_bdev = bio->bi_bdev,
				578	.bi_sector = bio->bi_sector,
				579	.bi_size = bio->bi_size,
				580	.bi_rw = bio->bi_rw,
				581	};
				582
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	583	/*
				584	* merge_bvec_fn() returns number of bytes it can accept
				585	* at this offset
				586	*/
Alasdair G Kergon	cc371e6	2008-07-03 09:53:43 +0200	[diff] [blame]	587	if (q->merge_bvec_fn(q, &bvm, bvec) < len) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	588	bvec->bv_page = NULL;
				589	bvec->bv_len = 0;
				590	bvec->bv_offset = 0;
				591	return 0;
				592	}
				593	}
				594
				595	/* If we may be able to merge these biovecs, force a recount */
Mikulas Patocka	b8b3e16	2008-08-15 10:15:19 +0200	[diff] [blame]	596	if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	597	bio->bi_flags &= ~(1 << BIO_SEG_VALID);
				598
				599	bio->bi_vcnt++;
				600	bio->bi_phys_segments++;
Jens Axboe	80cfd54	2006-01-06 09:43:28 +0100	[diff] [blame]	601	done:
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	602	bio->bi_size += len;
				603	return len;
				604	}
				605
				606	/**
Mike Christie	6e68af6	2005-11-11 05:30:27 -0600	[diff] [blame]	607	* bio_add_pc_page - attempt to add page to bio
Jens Axboe	fddfdea	2006-01-31 15:24:34 +0100	[diff] [blame]	608	* @q: the target queue
Mike Christie	6e68af6	2005-11-11 05:30:27 -0600	[diff] [blame]	609	* @bio: destination bio
				610	* @page: page to add
				611	* @len: vec entry length
				612	* @offset: vec entry offset
				613	*
				614	* Attempt to add a page to the bio_vec maplist. This can fail for a
				615	* number of reasons, such as the bio being full or target block
				616	* device limitations. The target block device must allow bio's
				617	* smaller than PAGE_SIZE, so it is always possible to add a single
				618	* page to an empty bio. This should only be used by REQ_PC bios.
				619	*/
Jens Axboe	165125e	2007-07-24 09:28:11 +0200	[diff] [blame]	620	int bio_add_pc_page(struct request_queue q, struct bio bio, struct page *page,
Mike Christie	6e68af6	2005-11-11 05:30:27 -0600	[diff] [blame]	621	unsigned int len, unsigned int offset)
				622	{
Mike Christie	defd94b	2005-12-05 02:37:06 -0600	[diff] [blame]	623	return __bio_add_page(q, bio, page, len, offset, q->max_hw_sectors);
Mike Christie	6e68af6	2005-11-11 05:30:27 -0600	[diff] [blame]	624	}
				625
				626	/**
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	627	* bio_add_page - attempt to add page to bio
				628	* @bio: destination bio
				629	* @page: page to add
				630	* @len: vec entry length
				631	* @offset: vec entry offset
				632	*
				633	* Attempt to add a page to the bio_vec maplist. This can fail for a
				634	* number of reasons, such as the bio being full or target block
				635	* device limitations. The target block device must allow bio's
				636	* smaller than PAGE_SIZE, so it is always possible to add a single
				637	* page to an empty bio.
				638	*/
				639	int bio_add_page(struct bio bio, struct page page, unsigned int len,
				640	unsigned int offset)
				641	{
Mike Christie	defd94b	2005-12-05 02:37:06 -0600	[diff] [blame]	642	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
				643	return __bio_add_page(q, bio, page, len, offset, q->max_sectors);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	644	}
				645
				646	struct bio_map_data {
				647	struct bio_vec *iovecs;
FUJITA Tomonori	c5dec1c	2008-04-11 12:56:49 +0200	[diff] [blame]	648	struct sg_iovec *sgvecs;
FUJITA Tomonori	152e283	2008-08-28 16:17:06 +0900	[diff] [blame]	649	int nr_sgvecs;
				650	int is_our_pages;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	651	};
				652
FUJITA Tomonori	c5dec1c	2008-04-11 12:56:49 +0200	[diff] [blame]	653	static void bio_set_map_data(struct bio_map_data bmd, struct bio bio,
FUJITA Tomonori	152e283	2008-08-28 16:17:06 +0900	[diff] [blame]	654	struct sg_iovec *iov, int iov_count,
				655	int is_our_pages)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	656	{
				657	memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt);
FUJITA Tomonori	c5dec1c	2008-04-11 12:56:49 +0200	[diff] [blame]	658	memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
				659	bmd->nr_sgvecs = iov_count;
FUJITA Tomonori	152e283	2008-08-28 16:17:06 +0900	[diff] [blame]	660	bmd->is_our_pages = is_our_pages;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	661	bio->bi_private = bmd;
				662	}
				663
				664	static void bio_free_map_data(struct bio_map_data *bmd)
				665	{
				666	kfree(bmd->iovecs);
FUJITA Tomonori	c5dec1c	2008-04-11 12:56:49 +0200	[diff] [blame]	667	kfree(bmd->sgvecs);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	668	kfree(bmd);
				669	}
				670
FUJITA Tomonori	76029ff	2008-08-25 20:36:08 +0200	[diff] [blame]	671	static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
				672	gfp_t gfp_mask)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	673	{
FUJITA Tomonori	76029ff	2008-08-25 20:36:08 +0200	[diff] [blame]	674	struct bio_map_data bmd = kmalloc(sizeof(bmd), gfp_mask);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	675
				676	if (!bmd)
				677	return NULL;
				678
FUJITA Tomonori	76029ff	2008-08-25 20:36:08 +0200	[diff] [blame]	679	bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, gfp_mask);
FUJITA Tomonori	c5dec1c	2008-04-11 12:56:49 +0200	[diff] [blame]	680	if (!bmd->iovecs) {
				681	kfree(bmd);
				682	return NULL;
				683	}
				684
FUJITA Tomonori	76029ff	2008-08-25 20:36:08 +0200	[diff] [blame]	685	bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, gfp_mask);
FUJITA Tomonori	c5dec1c	2008-04-11 12:56:49 +0200	[diff] [blame]	686	if (bmd->sgvecs)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	687	return bmd;
				688
FUJITA Tomonori	c5dec1c	2008-04-11 12:56:49 +0200	[diff] [blame]	689	kfree(bmd->iovecs);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	690	kfree(bmd);
				691	return NULL;
				692	}
				693
FUJITA Tomonori	aefcc28	2008-08-25 20:36:08 +0200	[diff] [blame]	694	static int __bio_copy_iov(struct bio bio, struct bio_vec iovecs,
FUJITA Tomonori	152e283	2008-08-28 16:17:06 +0900	[diff] [blame]	695	struct sg_iovec *iov, int iov_count, int uncopy,
				696	int do_free_page)
FUJITA Tomonori	c5dec1c	2008-04-11 12:56:49 +0200	[diff] [blame]	697	{
				698	int ret = 0, i;
				699	struct bio_vec *bvec;
				700	int iov_idx = 0;
				701	unsigned int iov_off = 0;
				702	int read = bio_data_dir(bio) == READ;
				703
				704	__bio_for_each_segment(bvec, bio, i, 0) {
				705	char *bv_addr = page_address(bvec->bv_page);
FUJITA Tomonori	aefcc28	2008-08-25 20:36:08 +0200	[diff] [blame]	706	unsigned int bv_len = iovecs[i].bv_len;
FUJITA Tomonori	c5dec1c	2008-04-11 12:56:49 +0200	[diff] [blame]	707
				708	while (bv_len && iov_idx < iov_count) {
				709	unsigned int bytes;
				710	char *iov_addr;
				711
				712	bytes = min_t(unsigned int,
				713	iov[iov_idx].iov_len - iov_off, bv_len);
				714	iov_addr = iov[iov_idx].iov_base + iov_off;
				715
				716	if (!ret) {
				717	if (!read && !uncopy)
				718	ret = copy_from_user(bv_addr, iov_addr,
				719	bytes);
				720	if (read && uncopy)
				721	ret = copy_to_user(iov_addr, bv_addr,
				722	bytes);
				723
				724	if (ret)
				725	ret = -EFAULT;
				726	}
				727
				728	bv_len -= bytes;
				729	bv_addr += bytes;
				730	iov_addr += bytes;
				731	iov_off += bytes;
				732
				733	if (iov[iov_idx].iov_len == iov_off) {
				734	iov_idx++;
				735	iov_off = 0;
				736	}
				737	}
				738
FUJITA Tomonori	152e283	2008-08-28 16:17:06 +0900	[diff] [blame]	739	if (do_free_page)
FUJITA Tomonori	c5dec1c	2008-04-11 12:56:49 +0200	[diff] [blame]	740	__free_page(bvec->bv_page);
				741	}
				742
				743	return ret;
				744	}
				745
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	746	/**
				747	* bio_uncopy_user - finish previously mapped bio
				748	* @bio: bio being terminated
				749	*
				750	* Free pages allocated from bio_copy_user() and write back data
				751	* to user space in case of a read.
				752	*/
				753	int bio_uncopy_user(struct bio *bio)
				754	{
				755	struct bio_map_data *bmd = bio->bi_private;
FUJITA Tomonori	8188276	2008-09-02 16:20:19 +0900	[diff] [blame]	756	int ret = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	757
FUJITA Tomonori	8188276	2008-09-02 16:20:19 +0900	[diff] [blame]	758	if (!bio_flagged(bio, BIO_NULL_MAPPED))
				759	ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs,
				760	bmd->nr_sgvecs, 1, bmd->is_our_pages);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	761	bio_free_map_data(bmd);
				762	bio_put(bio);
				763	return ret;
				764	}
				765
				766	/**
FUJITA Tomonori	c5dec1c	2008-04-11 12:56:49 +0200	[diff] [blame]	767	* bio_copy_user_iov - copy user data to bio
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	768	* @q: destination block queue
FUJITA Tomonori	152e283	2008-08-28 16:17:06 +0900	[diff] [blame]	769	* @map_data: pointer to the rq_map_data holding pages (if necessary)
FUJITA Tomonori	c5dec1c	2008-04-11 12:56:49 +0200	[diff] [blame]	770	* @iov: the iovec.
				771	* @iov_count: number of elements in the iovec
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	772	* @write_to_vm: bool indicating writing to pages or not
FUJITA Tomonori	a3bce90	2008-08-28 16:17:05 +0900	[diff] [blame]	773	* @gfp_mask: memory allocation flags
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	774	*
				775	* Prepares and returns a bio for indirect user io, bouncing data
				776	* to/from kernel pages as necessary. Must be paired with
				777	* call bio_uncopy_user() on io completion.
				778	*/
FUJITA Tomonori	152e283	2008-08-28 16:17:06 +0900	[diff] [blame]	779	struct bio bio_copy_user_iov(struct request_queue q,
				780	struct rq_map_data *map_data,
				781	struct sg_iovec *iov, int iov_count,
				782	int write_to_vm, gfp_t gfp_mask)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	783	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	784	struct bio_map_data *bmd;
				785	struct bio_vec *bvec;
				786	struct page *page;
				787	struct bio *bio;
				788	int i, ret;
FUJITA Tomonori	c5dec1c	2008-04-11 12:56:49 +0200	[diff] [blame]	789	int nr_pages = 0;
				790	unsigned int len = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	791
FUJITA Tomonori	c5dec1c	2008-04-11 12:56:49 +0200	[diff] [blame]	792	for (i = 0; i < iov_count; i++) {
				793	unsigned long uaddr;
				794	unsigned long end;
				795	unsigned long start;
				796
				797	uaddr = (unsigned long)iov[i].iov_base;
				798	end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
				799	start = uaddr >> PAGE_SHIFT;
				800
				801	nr_pages += end - start;
				802	len += iov[i].iov_len;
				803	}
				804
FUJITA Tomonori	a3bce90	2008-08-28 16:17:05 +0900	[diff] [blame]	805	bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	806	if (!bmd)
				807	return ERR_PTR(-ENOMEM);
				808
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	809	ret = -ENOMEM;
FUJITA Tomonori	a3bce90	2008-08-28 16:17:05 +0900	[diff] [blame]	810	bio = bio_alloc(gfp_mask, nr_pages);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	811	if (!bio)
				812	goto out_bmd;
				813
				814	bio->bi_rw \|= (!write_to_vm << BIO_RW);
				815
				816	ret = 0;
FUJITA Tomonori	152e283	2008-08-28 16:17:06 +0900	[diff] [blame]	817	i = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	818	while (len) {
FUJITA Tomonori	152e283	2008-08-28 16:17:06 +0900	[diff] [blame]	819	unsigned int bytes;
				820
				821	if (map_data)
				822	bytes = 1U << (PAGE_SHIFT + map_data->page_order);
				823	else
				824	bytes = PAGE_SIZE;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	825
				826	if (bytes > len)
				827	bytes = len;
				828
FUJITA Tomonori	152e283	2008-08-28 16:17:06 +0900	[diff] [blame]	829	if (map_data) {
				830	if (i == map_data->nr_entries) {
				831	ret = -ENOMEM;
				832	break;
				833	}
				834	page = map_data->pages[i++];
				835	} else
				836	page = alloc_page(q->bounce_gfp \| gfp_mask);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	837	if (!page) {
				838	ret = -ENOMEM;
				839	break;
				840	}
				841
Mike Christie	0e75f90	2006-12-01 10:40:55 +0100	[diff] [blame]	842	if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	843	break;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	844
				845	len -= bytes;
				846	}
				847
				848	if (ret)
				849	goto cleanup;
				850
				851	/*
				852	* success
				853	*/
				854	if (!write_to_vm) {
FUJITA Tomonori	152e283	2008-08-28 16:17:06 +0900	[diff] [blame]	855	ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0);
FUJITA Tomonori	c5dec1c	2008-04-11 12:56:49 +0200	[diff] [blame]	856	if (ret)
				857	goto cleanup;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	858	}
				859
FUJITA Tomonori	152e283	2008-08-28 16:17:06 +0900	[diff] [blame]	860	bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	861	return bio;
				862	cleanup:
FUJITA Tomonori	152e283	2008-08-28 16:17:06 +0900	[diff] [blame]	863	if (!map_data)
				864	bio_for_each_segment(bvec, bio, i)
				865	__free_page(bvec->bv_page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	866
				867	bio_put(bio);
				868	out_bmd:
				869	bio_free_map_data(bmd);
				870	return ERR_PTR(ret);
				871	}
				872
FUJITA Tomonori	c5dec1c	2008-04-11 12:56:49 +0200	[diff] [blame]	873	/**
				874	* bio_copy_user - copy user data to bio
				875	* @q: destination block queue
FUJITA Tomonori	152e283	2008-08-28 16:17:06 +0900	[diff] [blame]	876	* @map_data: pointer to the rq_map_data holding pages (if necessary)
FUJITA Tomonori	c5dec1c	2008-04-11 12:56:49 +0200	[diff] [blame]	877	* @uaddr: start of user address
				878	* @len: length in bytes
				879	* @write_to_vm: bool indicating writing to pages or not
FUJITA Tomonori	a3bce90	2008-08-28 16:17:05 +0900	[diff] [blame]	880	* @gfp_mask: memory allocation flags
FUJITA Tomonori	c5dec1c	2008-04-11 12:56:49 +0200	[diff] [blame]	881	*
				882	* Prepares and returns a bio for indirect user io, bouncing data
				883	* to/from kernel pages as necessary. Must be paired with
				884	* call bio_uncopy_user() on io completion.
				885	*/
FUJITA Tomonori	152e283	2008-08-28 16:17:06 +0900	[diff] [blame]	886	struct bio bio_copy_user(struct request_queue q, struct rq_map_data *map_data,
				887	unsigned long uaddr, unsigned int len,
				888	int write_to_vm, gfp_t gfp_mask)
FUJITA Tomonori	c5dec1c	2008-04-11 12:56:49 +0200	[diff] [blame]	889	{
				890	struct sg_iovec iov;
				891
				892	iov.iov_base = (void __user *)uaddr;
				893	iov.iov_len = len;
				894
FUJITA Tomonori	152e283	2008-08-28 16:17:06 +0900	[diff] [blame]	895	return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask);
FUJITA Tomonori	c5dec1c	2008-04-11 12:56:49 +0200	[diff] [blame]	896	}
				897
Jens Axboe	165125e	2007-07-24 09:28:11 +0200	[diff] [blame]	898	static struct bio __bio_map_user_iov(struct request_queue q,
James Bottomley	f1970ba	2005-06-20 14:06:52 +0200	[diff] [blame]	899	struct block_device *bdev,
				900	struct sg_iovec *iov, int iov_count,
FUJITA Tomonori	a3bce90	2008-08-28 16:17:05 +0900	[diff] [blame]	901	int write_to_vm, gfp_t gfp_mask)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	902	{
James Bottomley	f1970ba	2005-06-20 14:06:52 +0200	[diff] [blame]	903	int i, j;
				904	int nr_pages = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	905	struct page **pages;
				906	struct bio *bio;
James Bottomley	f1970ba	2005-06-20 14:06:52 +0200	[diff] [blame]	907	int cur_page = 0;
				908	int ret, offset;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	909
James Bottomley	f1970ba	2005-06-20 14:06:52 +0200	[diff] [blame]	910	for (i = 0; i < iov_count; i++) {
				911	unsigned long uaddr = (unsigned long)iov[i].iov_base;
				912	unsigned long len = iov[i].iov_len;
				913	unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
				914	unsigned long start = uaddr >> PAGE_SHIFT;
				915
				916	nr_pages += end - start;
				917	/*
Mike Christie	ad2d722	2006-12-01 10:40:20 +0100	[diff] [blame]	918	* buffer must be aligned to at least hardsector size for now
James Bottomley	f1970ba	2005-06-20 14:06:52 +0200	[diff] [blame]	919	*/
Mike Christie	ad2d722	2006-12-01 10:40:20 +0100	[diff] [blame]	920	if (uaddr & queue_dma_alignment(q))
James Bottomley	f1970ba	2005-06-20 14:06:52 +0200	[diff] [blame]	921	return ERR_PTR(-EINVAL);
				922	}
				923
				924	if (!nr_pages)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	925	return ERR_PTR(-EINVAL);
				926
FUJITA Tomonori	a3bce90	2008-08-28 16:17:05 +0900	[diff] [blame]	927	bio = bio_alloc(gfp_mask, nr_pages);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	928	if (!bio)
				929	return ERR_PTR(-ENOMEM);
				930
				931	ret = -ENOMEM;
FUJITA Tomonori	a3bce90	2008-08-28 16:17:05 +0900	[diff] [blame]	932	pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	933	if (!pages)
				934	goto out;
				935
James Bottomley	f1970ba	2005-06-20 14:06:52 +0200	[diff] [blame]	936	for (i = 0; i < iov_count; i++) {
				937	unsigned long uaddr = (unsigned long)iov[i].iov_base;
				938	unsigned long len = iov[i].iov_len;
				939	unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
				940	unsigned long start = uaddr >> PAGE_SHIFT;
				941	const int local_nr_pages = end - start;
				942	const int page_limit = cur_page + local_nr_pages;
				943
Nick Piggin	f5dd33c	2008-07-25 19:45:25 -0700	[diff] [blame]	944	ret = get_user_pages_fast(uaddr, local_nr_pages,
				945	write_to_vm, &pages[cur_page]);
Jens Axboe	9917215	2006-06-16 13:02:29 +0200	[diff] [blame]	946	if (ret < local_nr_pages) {
				947	ret = -EFAULT;
James Bottomley	f1970ba	2005-06-20 14:06:52 +0200	[diff] [blame]	948	goto out_unmap;
Jens Axboe	9917215	2006-06-16 13:02:29 +0200	[diff] [blame]	949	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	950
James Bottomley	f1970ba	2005-06-20 14:06:52 +0200	[diff] [blame]	951	offset = uaddr & ~PAGE_MASK;
				952	for (j = cur_page; j < page_limit; j++) {
				953	unsigned int bytes = PAGE_SIZE - offset;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	954
James Bottomley	f1970ba	2005-06-20 14:06:52 +0200	[diff] [blame]	955	if (len <= 0)
				956	break;
				957
				958	if (bytes > len)
				959	bytes = len;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	960
James Bottomley	f1970ba	2005-06-20 14:06:52 +0200	[diff] [blame]	961	/*
				962	* sorry...
				963	*/
Mike Christie	defd94b	2005-12-05 02:37:06 -0600	[diff] [blame]	964	if (bio_add_pc_page(q, bio, pages[j], bytes, offset) <
				965	bytes)
James Bottomley	f1970ba	2005-06-20 14:06:52 +0200	[diff] [blame]	966	break;
				967
				968	len -= bytes;
				969	offset = 0;
				970	}
				971
				972	cur_page = j;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	973	/*
James Bottomley	f1970ba	2005-06-20 14:06:52 +0200	[diff] [blame]	974	* release the pages we didn't map into the bio, if any
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	975	*/
James Bottomley	f1970ba	2005-06-20 14:06:52 +0200	[diff] [blame]	976	while (j < page_limit)
				977	page_cache_release(pages[j++]);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	978	}
				979
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	980	kfree(pages);
				981
				982	/*
				983	* set data direction, and check if mapped pages need bouncing
				984	*/
				985	if (!write_to_vm)
				986	bio->bi_rw \|= (1 << BIO_RW);
				987
James Bottomley	f1970ba	2005-06-20 14:06:52 +0200	[diff] [blame]	988	bio->bi_bdev = bdev;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	989	bio->bi_flags \|= (1 << BIO_USER_MAPPED);
				990	return bio;
James Bottomley	f1970ba	2005-06-20 14:06:52 +0200	[diff] [blame]	991
				992	out_unmap:
				993	for (i = 0; i < nr_pages; i++) {
				994	if(!pages[i])
				995	break;
				996	page_cache_release(pages[i]);
				997	}
				998	out:
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	999	kfree(pages);
				1000	bio_put(bio);
				1001	return ERR_PTR(ret);
				1002	}
				1003
				1004	/**
				1005	* bio_map_user - map user address into bio
Jens Axboe	165125e	2007-07-24 09:28:11 +0200	[diff] [blame]	1006	* @q: the struct request_queue for the bio
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1007	* @bdev: destination block device
				1008	* @uaddr: start of user address
				1009	* @len: length in bytes
				1010	* @write_to_vm: bool indicating writing to pages or not
FUJITA Tomonori	a3bce90	2008-08-28 16:17:05 +0900	[diff] [blame]	1011	* @gfp_mask: memory allocation flags
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1012	*
				1013	* Map the user space address into a bio suitable for io to a block
				1014	* device. Returns an error pointer in case of error.
				1015	*/
Jens Axboe	165125e	2007-07-24 09:28:11 +0200	[diff] [blame]	1016	struct bio bio_map_user(struct request_queue q, struct block_device *bdev,
FUJITA Tomonori	a3bce90	2008-08-28 16:17:05 +0900	[diff] [blame]	1017	unsigned long uaddr, unsigned int len, int write_to_vm,
				1018	gfp_t gfp_mask)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1019	{
James Bottomley	f1970ba	2005-06-20 14:06:52 +0200	[diff] [blame]	1020	struct sg_iovec iov;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1021
viro@ZenIV.linux.org.uk	3f70353	2005-09-09 16:53:56 +0100	[diff] [blame]	1022	iov.iov_base = (void __user *)uaddr;
James Bottomley	f1970ba	2005-06-20 14:06:52 +0200	[diff] [blame]	1023	iov.iov_len = len;
				1024
FUJITA Tomonori	a3bce90	2008-08-28 16:17:05 +0900	[diff] [blame]	1025	return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask);
James Bottomley	f1970ba	2005-06-20 14:06:52 +0200	[diff] [blame]	1026	}
				1027
				1028	/**
				1029	* bio_map_user_iov - map user sg_iovec table into bio
Jens Axboe	165125e	2007-07-24 09:28:11 +0200	[diff] [blame]	1030	* @q: the struct request_queue for the bio
James Bottomley	f1970ba	2005-06-20 14:06:52 +0200	[diff] [blame]	1031	* @bdev: destination block device
				1032	* @iov: the iovec.
				1033	* @iov_count: number of elements in the iovec
				1034	* @write_to_vm: bool indicating writing to pages or not
FUJITA Tomonori	a3bce90	2008-08-28 16:17:05 +0900	[diff] [blame]	1035	* @gfp_mask: memory allocation flags
James Bottomley	f1970ba	2005-06-20 14:06:52 +0200	[diff] [blame]	1036	*
				1037	* Map the user space address into a bio suitable for io to a block
				1038	* device. Returns an error pointer in case of error.
				1039	*/
Jens Axboe	165125e	2007-07-24 09:28:11 +0200	[diff] [blame]	1040	struct bio bio_map_user_iov(struct request_queue q, struct block_device *bdev,
James Bottomley	f1970ba	2005-06-20 14:06:52 +0200	[diff] [blame]	1041	struct sg_iovec *iov, int iov_count,
FUJITA Tomonori	a3bce90	2008-08-28 16:17:05 +0900	[diff] [blame]	1042	int write_to_vm, gfp_t gfp_mask)
James Bottomley	f1970ba	2005-06-20 14:06:52 +0200	[diff] [blame]	1043	{
				1044	struct bio *bio;
James Bottomley	f1970ba	2005-06-20 14:06:52 +0200	[diff] [blame]	1045
FUJITA Tomonori	a3bce90	2008-08-28 16:17:05 +0900	[diff] [blame]	1046	bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm,
				1047	gfp_mask);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1048	if (IS_ERR(bio))
				1049	return bio;
				1050
				1051	/*
				1052	* subtle -- if __bio_map_user() ended up bouncing a bio,
				1053	* it would normally disappear when its bi_end_io is run.
				1054	* however, we need it for the unmap, so grab an extra
				1055	* reference to it
				1056	*/
				1057	bio_get(bio);
				1058
Mike Christie	0e75f90	2006-12-01 10:40:55 +0100	[diff] [blame]	1059	return bio;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1060	}
				1061
				1062	static void __bio_unmap_user(struct bio *bio)
				1063	{
				1064	struct bio_vec *bvec;
				1065	int i;
				1066
				1067	/*
				1068	* make sure we dirty pages we wrote to
				1069	*/
				1070	__bio_for_each_segment(bvec, bio, i, 0) {
				1071	if (bio_data_dir(bio) == READ)
				1072	set_page_dirty_lock(bvec->bv_page);
				1073
				1074	page_cache_release(bvec->bv_page);
				1075	}
				1076
				1077	bio_put(bio);
				1078	}
				1079
				1080	/**
				1081	* bio_unmap_user - unmap a bio
				1082	* @bio: the bio being unmapped
				1083	*
				1084	* Unmap a bio previously mapped by bio_map_user(). Must be called with
				1085	* a process context.
				1086	*
				1087	* bio_unmap_user() may sleep.
				1088	*/
				1089	void bio_unmap_user(struct bio *bio)
				1090	{
				1091	__bio_unmap_user(bio);
				1092	bio_put(bio);
				1093	}
				1094
/*
 * Completion callback installed by __bio_map_kern(): drops a reference on
 * the bio. The completion status in @err is not looked at.
 */
static void bio_map_kern_endio(struct bio *bio, int err)
{
	bio_put(bio);
}
				1099
				1100
Jens Axboe	165125e	2007-07-24 09:28:11 +0200	[diff] [blame]	1101	static struct bio __bio_map_kern(struct request_queue q, void *data,
Al Viro	27496a8	2005-10-21 03:20:48 -0400	[diff] [blame]	1102	unsigned int len, gfp_t gfp_mask)
Mike Christie	df46b9a	2005-06-20 14:04:44 +0200	[diff] [blame]	1103	{
				1104	unsigned long kaddr = (unsigned long)data;
				1105	unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
				1106	unsigned long start = kaddr >> PAGE_SHIFT;
				1107	const int nr_pages = end - start;
				1108	int offset, i;
				1109	struct bio *bio;
				1110
				1111	bio = bio_alloc(gfp_mask, nr_pages);
				1112	if (!bio)
				1113	return ERR_PTR(-ENOMEM);
				1114
				1115	offset = offset_in_page(kaddr);
				1116	for (i = 0; i < nr_pages; i++) {
				1117	unsigned int bytes = PAGE_SIZE - offset;
				1118
				1119	if (len <= 0)
				1120	break;
				1121
				1122	if (bytes > len)
				1123	bytes = len;
				1124
Mike Christie	defd94b	2005-12-05 02:37:06 -0600	[diff] [blame]	1125	if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
				1126	offset) < bytes)
Mike Christie	df46b9a	2005-06-20 14:04:44 +0200	[diff] [blame]	1127	break;
				1128
				1129	data += bytes;
				1130	len -= bytes;
				1131	offset = 0;
				1132	}
				1133
Jens Axboe	b823825	2005-06-20 14:05:27 +0200	[diff] [blame]	1134	bio->bi_end_io = bio_map_kern_endio;
Mike Christie	df46b9a	2005-06-20 14:04:44 +0200	[diff] [blame]	1135	return bio;
				1136	}
				1137
				1138	/**
				1139	* bio_map_kern - map kernel address into bio
Jens Axboe	165125e	2007-07-24 09:28:11 +0200	[diff] [blame]	1140	* @q: the struct request_queue for the bio
Mike Christie	df46b9a	2005-06-20 14:04:44 +0200	[diff] [blame]	1141	* @data: pointer to buffer to map
				1142	* @len: length in bytes
				1143	* @gfp_mask: allocation flags for bio allocation
				1144	*
				1145	* Map the kernel address into a bio suitable for io to a block
				1146	* device. Returns an error pointer in case of error.
				1147	*/
Jens Axboe	165125e	2007-07-24 09:28:11 +0200	[diff] [blame]	1148	struct bio bio_map_kern(struct request_queue q, void *data, unsigned int len,
Al Viro	27496a8	2005-10-21 03:20:48 -0400	[diff] [blame]	1149	gfp_t gfp_mask)
Mike Christie	df46b9a	2005-06-20 14:04:44 +0200	[diff] [blame]	1150	{
				1151	struct bio *bio;
				1152
				1153	bio = __bio_map_kern(q, data, len, gfp_mask);
				1154	if (IS_ERR(bio))
				1155	return bio;
				1156
				1157	if (bio->bi_size == len)
				1158	return bio;
				1159
				1160	/*
				1161	* Don't support partial mappings.
				1162	*/
				1163	bio_put(bio);
				1164	return ERR_PTR(-EINVAL);
				1165	}
				1166
FUJITA Tomonori	68154e9	2008-04-25 12:47:50 +0200	[diff] [blame]	1167	static void bio_copy_kern_endio(struct bio *bio, int err)
				1168	{
				1169	struct bio_vec *bvec;
				1170	const int read = bio_data_dir(bio) == READ;
FUJITA Tomonori	76029ff	2008-08-25 20:36:08 +0200	[diff] [blame]	1171	struct bio_map_data *bmd = bio->bi_private;
FUJITA Tomonori	68154e9	2008-04-25 12:47:50 +0200	[diff] [blame]	1172	int i;
FUJITA Tomonori	76029ff	2008-08-25 20:36:08 +0200	[diff] [blame]	1173	char *p = bmd->sgvecs[0].iov_base;
FUJITA Tomonori	68154e9	2008-04-25 12:47:50 +0200	[diff] [blame]	1174
				1175	__bio_for_each_segment(bvec, bio, i, 0) {
				1176	char *addr = page_address(bvec->bv_page);
FUJITA Tomonori	76029ff	2008-08-25 20:36:08 +0200	[diff] [blame]	1177	int len = bmd->iovecs[i].bv_len;
FUJITA Tomonori	68154e9	2008-04-25 12:47:50 +0200	[diff] [blame]	1178
				1179	if (read && !err)
FUJITA Tomonori	76029ff	2008-08-25 20:36:08 +0200	[diff] [blame]	1180	memcpy(p, addr, len);
FUJITA Tomonori	68154e9	2008-04-25 12:47:50 +0200	[diff] [blame]	1181
				1182	__free_page(bvec->bv_page);
FUJITA Tomonori	76029ff	2008-08-25 20:36:08 +0200	[diff] [blame]	1183	p += len;
FUJITA Tomonori	68154e9	2008-04-25 12:47:50 +0200	[diff] [blame]	1184	}
				1185
FUJITA Tomonori	76029ff	2008-08-25 20:36:08 +0200	[diff] [blame]	1186	bio_free_map_data(bmd);
FUJITA Tomonori	68154e9	2008-04-25 12:47:50 +0200	[diff] [blame]	1187	bio_put(bio);
				1188	}
				1189
				1190	/**
				1191	* bio_copy_kern - copy kernel address into bio
				1192	* @q: the struct request_queue for the bio
				1193	* @data: pointer to buffer to copy
				1194	* @len: length in bytes
				1195	* @gfp_mask: allocation flags for bio and page allocation
Randy Dunlap	ffee025	2008-04-30 09:08:54 +0200	[diff] [blame]	1196	* @reading: data direction is READ
FUJITA Tomonori	68154e9	2008-04-25 12:47:50 +0200	[diff] [blame]	1197	*
				1198	* copy the kernel address into a bio suitable for io to a block
				1199	* device. Returns an error pointer in case of error.
				1200	*/
				1201	struct bio bio_copy_kern(struct request_queue q, void *data, unsigned int len,
				1202	gfp_t gfp_mask, int reading)
				1203	{
FUJITA Tomonori	68154e9	2008-04-25 12:47:50 +0200	[diff] [blame]	1204	struct bio *bio;
				1205	struct bio_vec *bvec;
FUJITA Tomonori	4d8ab62	2008-08-28 15:05:57 +0900	[diff] [blame]	1206	int i;
FUJITA Tomonori	68154e9	2008-04-25 12:47:50 +0200	[diff] [blame]	1207
FUJITA Tomonori	4d8ab62	2008-08-28 15:05:57 +0900	[diff] [blame]	1208	bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask);
				1209	if (IS_ERR(bio))
				1210	return bio;
FUJITA Tomonori	68154e9	2008-04-25 12:47:50 +0200	[diff] [blame]	1211
				1212	if (!reading) {
				1213	void *p = data;
				1214
				1215	bio_for_each_segment(bvec, bio, i) {
				1216	char *addr = page_address(bvec->bv_page);
				1217
				1218	memcpy(addr, p, bvec->bv_len);
				1219	p += bvec->bv_len;
				1220	}
				1221	}
				1222
FUJITA Tomonori	68154e9	2008-04-25 12:47:50 +0200	[diff] [blame]	1223	bio->bi_end_io = bio_copy_kern_endio;
FUJITA Tomonori	76029ff	2008-08-25 20:36:08 +0200	[diff] [blame]	1224
FUJITA Tomonori	68154e9	2008-04-25 12:47:50 +0200	[diff] [blame]	1225	return bio;
FUJITA Tomonori	68154e9	2008-04-25 12:47:50 +0200	[diff] [blame]	1226	}
				1227
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1228	/*
				1229	* bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
				1230	* for performing direct-IO in BIOs.
				1231	*
				1232	* The problem is that we cannot run set_page_dirty() from interrupt context
				1233	* because the required locks are not interrupt-safe. So what we can do is to
				1234	* mark the pages dirty _before_ performing IO. And in interrupt context,
				1235	* check that the pages are still dirty. If so, fine. If not, redirty them
				1236	* in process context.
				1237	*
				1238	* We special-case compound pages here: normally this means reads into hugetlb
				1239	* pages. The logic in here doesn't really work right for compound pages
				1240	* because the VM does not uniformly chase down the head page in all cases.
				1241	* But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't
				1242	* handle them at all. So we skip compound pages here at an early stage.
				1243	*
				1244	* Note that this code is very hard to test under normal circumstances because
				1245	* direct-io pins the pages with get_user_pages(). This makes
				1246	* is_page_cache_freeable return false, and the VM will not clean the pages.
				1247	* But other code (eg, pdflush) could clean the pages if they are mapped
				1248	* pagecache.
				1249	*
				1250	* Simply disabling the call to bio_set_pages_dirty() is a good way to test the
				1251	* deferred bio dirtying paths.
				1252	*/
				1253
				1254	/*
				1255	* bio_set_pages_dirty() will mark all the bio's pages as dirty.
				1256	*/
				1257	void bio_set_pages_dirty(struct bio *bio)
				1258	{
				1259	struct bio_vec *bvec = bio->bi_io_vec;
				1260	int i;
				1261
				1262	for (i = 0; i < bio->bi_vcnt; i++) {
				1263	struct page *page = bvec[i].bv_page;
				1264
				1265	if (page && !PageCompound(page))
				1266	set_page_dirty_lock(page);
				1267	}
				1268	}
				1269
Adrian Bunk	86b6c7a	2008-02-18 13:48:32 +0100	[diff] [blame]	1270	static void bio_release_pages(struct bio *bio)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1271	{
				1272	struct bio_vec *bvec = bio->bi_io_vec;
				1273	int i;
				1274
				1275	for (i = 0; i < bio->bi_vcnt; i++) {
				1276	struct page *page = bvec[i].bv_page;
				1277
				1278	if (page)
				1279	put_page(page);
				1280	}
				1281	}
				1282
				1283	/*
				1284	* bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
				1285	* If they are, then fine. If, however, some pages are clean then they must
				1286	* have been written out during the direct-IO read. So we take another ref on
				1287	* the BIO and the offending pages and re-dirty the pages in process context.
				1288	*
				1289	* It is expected that bio_check_pages_dirty() will wholly own the BIO from
				1290	* here on. It will run one page_cache_release() against each page and will
				1291	* run one bio_put() against the BIO.
				1292	*/
				1293
David Howells	65f27f3	2006-11-22 14:55:48 +0000	[diff] [blame]	1294	static void bio_dirty_fn(struct work_struct *work);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1295
David Howells	65f27f3	2006-11-22 14:55:48 +0000	[diff] [blame]	1296	static DECLARE_WORK(bio_dirty_work, bio_dirty_fn);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1297	static DEFINE_SPINLOCK(bio_dirty_lock);
				1298	static struct bio *bio_dirty_list;
				1299
				1300	/*
				1301	* This runs in process context
				1302	*/
David Howells	65f27f3	2006-11-22 14:55:48 +0000	[diff] [blame]	1303	static void bio_dirty_fn(struct work_struct *work)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1304	{
				1305	unsigned long flags;
				1306	struct bio *bio;
				1307
				1308	spin_lock_irqsave(&bio_dirty_lock, flags);
				1309	bio = bio_dirty_list;
				1310	bio_dirty_list = NULL;
				1311	spin_unlock_irqrestore(&bio_dirty_lock, flags);
				1312
				1313	while (bio) {
				1314	struct bio *next = bio->bi_private;
				1315
				1316	bio_set_pages_dirty(bio);
				1317	bio_release_pages(bio);
				1318	bio_put(bio);
				1319	bio = next;
				1320	}
				1321	}
				1322
				1323	void bio_check_pages_dirty(struct bio *bio)
				1324	{
				1325	struct bio_vec *bvec = bio->bi_io_vec;
				1326	int nr_clean_pages = 0;
				1327	int i;
				1328
				1329	for (i = 0; i < bio->bi_vcnt; i++) {
				1330	struct page *page = bvec[i].bv_page;
				1331
				1332	if (PageDirty(page) \|\| PageCompound(page)) {
				1333	page_cache_release(page);
				1334	bvec[i].bv_page = NULL;
				1335	} else {
				1336	nr_clean_pages++;
				1337	}
				1338	}
				1339
				1340	if (nr_clean_pages) {
				1341	unsigned long flags;
				1342
				1343	spin_lock_irqsave(&bio_dirty_lock, flags);
				1344	bio->bi_private = bio_dirty_list;
				1345	bio_dirty_list = bio;
				1346	spin_unlock_irqrestore(&bio_dirty_lock, flags);
				1347	schedule_work(&bio_dirty_work);
				1348	} else {
				1349	bio_put(bio);
				1350	}
				1351	}
				1352
				1353	/**
				1354	* bio_endio - end I/O on a bio
				1355	* @bio: bio
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1356	* @error: error, if any
				1357	*
				1358	* Description:
NeilBrown	6712ecf	2007-09-27 12:47:43 +0200	[diff] [blame]	1359	* bio_endio() will end I/O on the whole bio. bio_endio() is the
NeilBrown	5bb23a6	2007-09-27 12:46:13 +0200	[diff] [blame]	1360	* preferred way to end I/O on a bio, it takes care of clearing
				1361	* BIO_UPTODATE on error. @error is 0 on success, and and one of the
				1362	* established -Exxxx (-EIO, for instance) error values in case
				1363	* something went wrong. Noone should call bi_end_io() directly on a
				1364	* bio unless they own it and thus know that it has an end_io
				1365	* function.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1366	**/
NeilBrown	6712ecf	2007-09-27 12:47:43 +0200	[diff] [blame]	1367	void bio_endio(struct bio *bio, int error)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1368	{
				1369	if (error)
				1370	clear_bit(BIO_UPTODATE, &bio->bi_flags);
NeilBrown	9cc54d4	2007-09-27 12:46:12 +0200	[diff] [blame]	1371	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
				1372	error = -EIO;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1373
NeilBrown	5bb23a6	2007-09-27 12:46:13 +0200	[diff] [blame]	1374	if (bio->bi_end_io)
NeilBrown	6712ecf	2007-09-27 12:47:43 +0200	[diff] [blame]	1375	bio->bi_end_io(bio, error);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1376	}
				1377
				1378	void bio_pair_release(struct bio_pair *bp)
				1379	{
				1380	if (atomic_dec_and_test(&bp->cnt)) {
				1381	struct bio *master = bp->bio1.bi_private;
				1382
NeilBrown	6712ecf	2007-09-27 12:47:43 +0200	[diff] [blame]	1383	bio_endio(master, bp->error);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1384	mempool_free(bp, bp->bio2.bi_private);
				1385	}
				1386	}
				1387
NeilBrown	6712ecf	2007-09-27 12:47:43 +0200	[diff] [blame]	1388	static void bio_pair_end_1(struct bio *bi, int err)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1389	{
				1390	struct bio_pair *bp = container_of(bi, struct bio_pair, bio1);
				1391
				1392	if (err)
				1393	bp->error = err;
				1394
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1395	bio_pair_release(bp);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1396	}
				1397
NeilBrown	6712ecf	2007-09-27 12:47:43 +0200	[diff] [blame]	1398	static void bio_pair_end_2(struct bio *bi, int err)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1399	{
				1400	struct bio_pair *bp = container_of(bi, struct bio_pair, bio2);
				1401
				1402	if (err)
				1403	bp->error = err;
				1404
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1405	bio_pair_release(bp);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1406	}
				1407
				1408	/*
				1409	* split a bio - only worry about a bio with a single page
				1410	* in its iovec
				1411	*/
Denis ChengRq	6feef53	2008-10-09 08:57:05 +0200	[diff] [blame]	1412	struct bio_pair bio_split(struct bio bi, int first_sectors)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1413	{
Denis ChengRq	6feef53	2008-10-09 08:57:05 +0200	[diff] [blame]	1414	struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1415
				1416	if (!bp)
				1417	return bp;
				1418
Arnaldo Carvalho de Melo	5f3ea37	2008-10-30 08:34:33 +0100	[diff] [blame]	1419	trace_block_split(bdev_get_queue(bi->bi_bdev), bi,
Jens Axboe	2056a78	2006-03-23 20:00:26 +0100	[diff] [blame]	1420	bi->bi_sector + first_sectors);
				1421
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1422	BUG_ON(bi->bi_vcnt != 1);
				1423	BUG_ON(bi->bi_idx != 0);
				1424	atomic_set(&bp->cnt, 3);
				1425	bp->error = 0;
				1426	bp->bio1 = *bi;
				1427	bp->bio2 = *bi;
				1428	bp->bio2.bi_sector += first_sectors;
				1429	bp->bio2.bi_size -= first_sectors << 9;
				1430	bp->bio1.bi_size = first_sectors << 9;
				1431
				1432	bp->bv1 = bi->bi_io_vec[0];
				1433	bp->bv2 = bi->bi_io_vec[0];
				1434	bp->bv2.bv_offset += first_sectors << 9;
				1435	bp->bv2.bv_len -= first_sectors << 9;
				1436	bp->bv1.bv_len = first_sectors << 9;
				1437
				1438	bp->bio1.bi_io_vec = &bp->bv1;
				1439	bp->bio2.bi_io_vec = &bp->bv2;
				1440
NeilBrown	a2eb0c1	2006-05-22 22:35:27 -0700	[diff] [blame]	1441	bp->bio1.bi_max_vecs = 1;
				1442	bp->bio2.bi_max_vecs = 1;
				1443
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1444	bp->bio1.bi_end_io = bio_pair_end_1;
				1445	bp->bio2.bi_end_io = bio_pair_end_2;
				1446
				1447	bp->bio1.bi_private = bi;
Denis ChengRq	6feef53	2008-10-09 08:57:05 +0200	[diff] [blame]	1448	bp->bio2.bi_private = bio_split_pool;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1449
Martin K. Petersen	7ba1ba1	2008-06-30 20:04:41 +0200	[diff] [blame]	1450	if (bio_integrity(bi))
				1451	bio_integrity_split(bi, bp, first_sectors);
				1452
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1453	return bp;
				1454	}
				1455
Martin K. Petersen	ad3316b	2008-10-01 22:42:53 -0400	[diff] [blame]	1456	/**
				1457	* bio_sector_offset - Find hardware sector offset in bio
				1458	* @bio: bio to inspect
				1459	* @index: bio_vec index
				1460	* @offset: offset in bv_page
				1461	*
				1462	* Return the number of hardware sectors between beginning of bio
				1463	* and an end point indicated by a bio_vec index and an offset
				1464	* within that vector's page.
				1465	*/
				1466	sector_t bio_sector_offset(struct bio *bio, unsigned short index,
				1467	unsigned int offset)
				1468	{
				1469	unsigned int sector_sz = queue_hardsect_size(bio->bi_bdev->bd_disk->queue);
				1470	struct bio_vec *bv;
				1471	sector_t sectors;
				1472	int i;
				1473
				1474	sectors = 0;
				1475
				1476	if (index >= bio->bi_idx)
				1477	index = bio->bi_vcnt - 1;
				1478
				1479	__bio_for_each_segment(bv, bio, i, 0) {
				1480	if (i == index) {
				1481	if (offset > bv->bv_offset)
				1482	sectors += (offset - bv->bv_offset) / sector_sz;
				1483	break;
				1484	}
				1485
				1486	sectors += bv->bv_len / sector_sz;
				1487	}
				1488
				1489	return sectors;
				1490	}
				1491	EXPORT_SYMBOL(bio_sector_offset);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1492
				1493	/*
				1494	* create memory pools for biovec's in a bio_set.
				1495	* use the global biovec slabs created for general use.
				1496	*/
Jens Axboe	5972511	2007-04-02 10:06:42 +0200	[diff] [blame]	1497	static int biovec_create_pools(struct bio_set *bs, int pool_entries)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1498	{
Jens Axboe	7ff9345	2008-12-11 11:53:43 +0100	[diff] [blame]	1499	struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1500
Jens Axboe	7ff9345	2008-12-11 11:53:43 +0100	[diff] [blame]	1501	bs->bvec_pool = mempool_create_slab_pool(pool_entries, bp->slab);
				1502	if (!bs->bvec_pool)
				1503	return -ENOMEM;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1504
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1505	return 0;
				1506	}
				1507
				1508	static void biovec_free_pools(struct bio_set *bs)
				1509	{
Jens Axboe	7ff9345	2008-12-11 11:53:43 +0100	[diff] [blame]	1510	mempool_destroy(bs->bvec_pool);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1511	}
				1512
				1513	void bioset_free(struct bio_set *bs)
				1514	{
				1515	if (bs->bio_pool)
				1516	mempool_destroy(bs->bio_pool);
				1517
Martin K. Petersen	7ba1ba1	2008-06-30 20:04:41 +0200	[diff] [blame]	1518	bioset_integrity_free(bs);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1519	biovec_free_pools(bs);
Jens Axboe	bb799ca	2008-12-10 15:35:05 +0100	[diff] [blame]	1520	bio_put_slab(bs);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1521
				1522	kfree(bs);
				1523	}
				1524
Jens Axboe	bb799ca	2008-12-10 15:35:05 +0100	[diff] [blame]	1525	/**
				1526	* bioset_create - Create a bio_set
				1527	* @pool_size: Number of bio and bio_vecs to cache in the mempool
				1528	* @front_pad: Number of bytes to allocate in front of the returned bio
				1529	*
				1530	* Description:
				1531	* Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller
				1532	* to ask for a number of bytes to be allocated in front of the bio.
				1533	* Front pad allocation is useful for embedding the bio inside
				1534	* another structure, to avoid allocating extra data to go with the bio.
				1535	* Note that the bio must be embedded at the END of that structure always,
				1536	* or things will break badly.
				1537	*/
				1538	struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1539	{
Jens Axboe	392ddc3	2008-12-23 12:42:54 +0100	[diff] [blame]	1540	unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
Jens Axboe	1b43449	2008-10-22 20:32:58 +0200	[diff] [blame]	1541	struct bio_set *bs;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1542
Jens Axboe	1b43449	2008-10-22 20:32:58 +0200	[diff] [blame]	1543	bs = kzalloc(sizeof(*bs), GFP_KERNEL);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1544	if (!bs)
				1545	return NULL;
				1546
Jens Axboe	bb799ca	2008-12-10 15:35:05 +0100	[diff] [blame]	1547	bs->front_pad = front_pad;
Jens Axboe	1b43449	2008-10-22 20:32:58 +0200	[diff] [blame]	1548
Jens Axboe	392ddc3	2008-12-23 12:42:54 +0100	[diff] [blame]	1549	bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
Jens Axboe	bb799ca	2008-12-10 15:35:05 +0100	[diff] [blame]	1550	if (!bs->bio_slab) {
				1551	kfree(bs);
				1552	return NULL;
				1553	}
				1554
				1555	bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1556	if (!bs->bio_pool)
				1557	goto bad;
				1558
Jens Axboe	bb799ca	2008-12-10 15:35:05 +0100	[diff] [blame]	1559	if (bioset_integrity_create(bs, pool_size))
Martin K. Petersen	7ba1ba1	2008-06-30 20:04:41 +0200	[diff] [blame]	1560	goto bad;
				1561
Jens Axboe	bb799ca	2008-12-10 15:35:05 +0100	[diff] [blame]	1562	if (!biovec_create_pools(bs, pool_size))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1563	return bs;
				1564
				1565	bad:
				1566	bioset_free(bs);
				1567	return NULL;
				1568	}
				1569
				1570	static void __init biovec_init_slabs(void)
				1571	{
				1572	int i;
				1573
				1574	for (i = 0; i < BIOVEC_NR_POOLS; i++) {
				1575	int size;
				1576	struct biovec_slab *bvs = bvec_slabs + i;
				1577
				1578	size = bvs->nr_vecs * sizeof(struct bio_vec);
				1579	bvs->slab = kmem_cache_create(bvs->name, size, 0,
Paul Mundt	20c2df8	2007-07-20 10:11:58 +0900	[diff] [blame]	1580	SLAB_HWCACHE_ALIGN\|SLAB_PANIC, NULL);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1581	}
				1582	}
				1583
				1584	static int __init init_bio(void)
				1585	{
Jens Axboe	bb799ca	2008-12-10 15:35:05 +0100	[diff] [blame]	1586	bio_slab_max = 2;
				1587	bio_slab_nr = 0;
				1588	bio_slabs = kzalloc(bio_slab_max * sizeof(struct bio_slab), GFP_KERNEL);
				1589	if (!bio_slabs)
				1590	panic("bio: can't allocate bios\n");
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1591
Martin K. Petersen	7ba1ba1	2008-06-30 20:04:41 +0200	[diff] [blame]	1592	bio_integrity_init_slab();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1593	biovec_init_slabs();
				1594
Jens Axboe	bb799ca	2008-12-10 15:35:05 +0100	[diff] [blame]	1595	fs_bio_set = bioset_create(BIO_POOL_SIZE, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1596	if (!fs_bio_set)
				1597	panic("bio: can't allocate bios\n");
				1598
Matthew Dobson	0eaae62a	2006-03-26 01:37:47 -0800	[diff] [blame]	1599	bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES,
				1600	sizeof(struct bio_pair));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1601	if (!bio_split_pool)
				1602	panic("bio: can't create split pool\n");
				1603
				1604	return 0;
				1605	}
				1606
				1607	subsys_initcall(init_bio);
				1608
				1609	EXPORT_SYMBOL(bio_alloc);
Jens Axboe	0a0d96b	2008-09-11 13:17:37 +0200	[diff] [blame]	1610	EXPORT_SYMBOL(bio_kmalloc);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1611	EXPORT_SYMBOL(bio_put);
Peter Osterlund	3676347	2005-09-06 15:16:42 -0700	[diff] [blame]	1612	EXPORT_SYMBOL(bio_free);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1613	EXPORT_SYMBOL(bio_endio);
				1614	EXPORT_SYMBOL(bio_init);
				1615	EXPORT_SYMBOL(__bio_clone);
				1616	EXPORT_SYMBOL(bio_clone);
				1617	EXPORT_SYMBOL(bio_phys_segments);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1618	EXPORT_SYMBOL(bio_add_page);
Mike Christie	6e68af6	2005-11-11 05:30:27 -0600	[diff] [blame]	1619	EXPORT_SYMBOL(bio_add_pc_page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1620	EXPORT_SYMBOL(bio_get_nr_vecs);
Jens Axboe	40044ce	2008-03-17 21:14:40 +0100	[diff] [blame]	1621	EXPORT_SYMBOL(bio_map_user);
				1622	EXPORT_SYMBOL(bio_unmap_user);
Mike Christie	df46b9a	2005-06-20 14:04:44 +0200	[diff] [blame]	1623	EXPORT_SYMBOL(bio_map_kern);
FUJITA Tomonori	68154e9	2008-04-25 12:47:50 +0200	[diff] [blame]	1624	EXPORT_SYMBOL(bio_copy_kern);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1625	EXPORT_SYMBOL(bio_pair_release);
				1626	EXPORT_SYMBOL(bio_split);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1627	EXPORT_SYMBOL(bio_copy_user);
				1628	EXPORT_SYMBOL(bio_uncopy_user);
				1629	EXPORT_SYMBOL(bioset_create);
				1630	EXPORT_SYMBOL(bioset_free);
				1631	EXPORT_SYMBOL(bio_alloc_bioset);