Joe Thornber | 991d9fa | 2011-10-31 20:21:18 +0000 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (C) 2011 Red Hat UK. |
| 3 | * |
| 4 | * This file is released under the GPL. |
| 5 | */ |
| 6 | |
| 7 | #include "dm-thin-metadata.h" |
| 8 | |
| 9 | #include <linux/device-mapper.h> |
| 10 | #include <linux/dm-io.h> |
| 11 | #include <linux/dm-kcopyd.h> |
| 12 | #include <linux/list.h> |
| 13 | #include <linux/init.h> |
| 14 | #include <linux/module.h> |
| 15 | #include <linux/slab.h> |
| 16 | |
| 17 | #define DM_MSG_PREFIX "thin" |
| 18 | |
| 19 | /* |
| 20 | * Tunable constants |
| 21 | */ |
| 22 | #define ENDIO_HOOK_POOL_SIZE 10240 |
| 23 | #define DEFERRED_SET_SIZE 64 |
| 24 | #define MAPPING_POOL_SIZE 1024 |
| 25 | #define PRISON_CELLS 1024 |
| 26 | |
| 27 | /* |
| 28 | * The block size of the device holding pool data must be |
| 29 | * between 64KB and 1GB. |
| 30 | */ |
| 31 | #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT) |
| 32 | #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT) |
| 33 | |
| 34 | /* |
| 35 | * The metadata device is currently limited in size. The limitation is |
| 36 | * checked lower down in dm-space-map-metadata, but we also check it here |
| 37 | * so we can fail early. |
| 38 | * |
| 39 | * We have one block of index, which can hold 255 index entries. Each |
| 40 | * index entry contains allocation info about 16k metadata blocks. |
| 41 | */ |
| 42 | #define METADATA_DEV_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT))) |
| 43 | |
| 44 | /* |
| 45 | * Device id is restricted to 24 bits. |
| 46 | */ |
| 47 | #define MAX_DEV_ID ((1 << 24) - 1) |
| 48 | |
| 49 | /* |
| 50 | * How do we handle breaking sharing of data blocks? |
| 51 | * ================================================= |
| 52 | * |
| 53 | * We use a standard copy-on-write btree to store the mappings for the |
| 54 | * devices (note I'm talking about copy-on-write of the metadata here, not |
| 55 | * the data). When you take an internal snapshot you clone the root node |
| 56 | * of the origin btree. After this there is no concept of an origin or a |
| 57 | * snapshot. They are just two device trees that happen to point to the |
| 58 | * same data blocks. |
| 59 | * |
| 60 | * When we get a write in we decide if it's to a shared data block using |
| 61 | * some timestamp magic. If it is, we have to break sharing. |
| 62 | * |
| 63 | * Let's say we write to a shared block in what was the origin. The |
| 64 | * steps are: |
| 65 | * |
| 66 | * i) plug io further to this physical block. (see bio_prison code). |
| 67 | * |
| 68 | * ii) quiesce any read io to that shared data block. Obviously |
| 69 | * including all devices that share this block. (see deferred_set code) |
| 70 | * |
| 71 | * iii) copy the data block to a newly allocate block. This step can be |
| 72 | * missed out if the io covers the block. (schedule_copy). |
| 73 | * |
| 74 | * iv) insert the new mapping into the origin's btree |
| 75 | * (process_prepared_mappings). This act of inserting breaks some |
| 76 | * sharing of btree nodes between the two devices. Breaking sharing only |
| 77 | * effects the btree of that specific device. Btrees for the other |
| 78 | * devices that share the block never change. The btree for the origin |
| 79 | * device as it was after the last commit is untouched, ie. we're using |
| 80 | * persistent data structures in the functional programming sense. |
| 81 | * |
| 82 | * v) unplug io to this physical block, including the io that triggered |
| 83 | * the breaking of sharing. |
| 84 | * |
| 85 | * Steps (ii) and (iii) occur in parallel. |
| 86 | * |
| 87 | * The metadata _doesn't_ need to be committed before the io continues. We |
| 88 | * get away with this because the io is always written to a _new_ block. |
| 89 | * If there's a crash, then: |
| 90 | * |
| 91 | * - The origin mapping will point to the old origin block (the shared |
| 92 | * one). This will contain the data as it was before the io that triggered |
| 93 | * the breaking of sharing came in. |
| 94 | * |
| 95 | * - The snap mapping still points to the old block. As it would after |
| 96 | * the commit. |
| 97 | * |
| 98 | * The downside of this scheme is the timestamp magic isn't perfect, and |
| 99 | * will continue to think that data block in the snapshot device is shared |
| 100 | * even after the write to the origin has broken sharing. I suspect data |
| 101 | * blocks will typically be shared by many different devices, so we're |
| 102 | * breaking sharing n + 1 times, rather than n, where n is the number of |
| 103 | * devices that reference this data block. At the moment I think the |
| 104 | * benefits far, far outweigh the disadvantages. |
| 105 | */ |
| 106 | |
| 107 | /*----------------------------------------------------------------*/ |
| 108 | |
| 109 | /* |
| 110 | * Sometimes we can't deal with a bio straight away. We put them in prison |
| 111 | * where they can't cause any mischief. Bios are put in a cell identified |
| 112 | * by a key, multiple bios can be in the same cell. When the cell is |
| 113 | * subsequently unlocked the bios become available. |
| 114 | */ |
| 115 | struct bio_prison; |
| 116 | |
| 117 | struct cell_key { |
| 118 | int virtual; |
| 119 | dm_thin_id dev; |
| 120 | dm_block_t block; |
| 121 | }; |
| 122 | |
| 123 | struct cell { |
| 124 | struct hlist_node list; |
| 125 | struct bio_prison *prison; |
| 126 | struct cell_key key; |
| 127 | unsigned count; |
| 128 | struct bio_list bios; |
| 129 | }; |
| 130 | |
| 131 | struct bio_prison { |
| 132 | spinlock_t lock; |
| 133 | mempool_t *cell_pool; |
| 134 | |
| 135 | unsigned nr_buckets; |
| 136 | unsigned hash_mask; |
| 137 | struct hlist_head *cells; |
| 138 | }; |
| 139 | |
| 140 | static uint32_t calc_nr_buckets(unsigned nr_cells) |
| 141 | { |
| 142 | uint32_t n = 128; |
| 143 | |
| 144 | nr_cells /= 4; |
| 145 | nr_cells = min(nr_cells, 8192u); |
| 146 | |
| 147 | while (n < nr_cells) |
| 148 | n <<= 1; |
| 149 | |
| 150 | return n; |
| 151 | } |
| 152 | |
| 153 | /* |
| 154 | * @nr_cells should be the number of cells you want in use _concurrently_. |
| 155 | * Don't confuse it with the number of distinct keys. |
| 156 | */ |
| 157 | static struct bio_prison *prison_create(unsigned nr_cells) |
| 158 | { |
| 159 | unsigned i; |
| 160 | uint32_t nr_buckets = calc_nr_buckets(nr_cells); |
| 161 | size_t len = sizeof(struct bio_prison) + |
| 162 | (sizeof(struct hlist_head) * nr_buckets); |
| 163 | struct bio_prison *prison = kmalloc(len, GFP_KERNEL); |
| 164 | |
| 165 | if (!prison) |
| 166 | return NULL; |
| 167 | |
| 168 | spin_lock_init(&prison->lock); |
| 169 | prison->cell_pool = mempool_create_kmalloc_pool(nr_cells, |
| 170 | sizeof(struct cell)); |
| 171 | if (!prison->cell_pool) { |
| 172 | kfree(prison); |
| 173 | return NULL; |
| 174 | } |
| 175 | |
| 176 | prison->nr_buckets = nr_buckets; |
| 177 | prison->hash_mask = nr_buckets - 1; |
| 178 | prison->cells = (struct hlist_head *) (prison + 1); |
| 179 | for (i = 0; i < nr_buckets; i++) |
| 180 | INIT_HLIST_HEAD(prison->cells + i); |
| 181 | |
| 182 | return prison; |
| 183 | } |
| 184 | |
| 185 | static void prison_destroy(struct bio_prison *prison) |
| 186 | { |
| 187 | mempool_destroy(prison->cell_pool); |
| 188 | kfree(prison); |
| 189 | } |
| 190 | |
| 191 | static uint32_t hash_key(struct bio_prison *prison, struct cell_key *key) |
| 192 | { |
| 193 | const unsigned long BIG_PRIME = 4294967291UL; |
| 194 | uint64_t hash = key->block * BIG_PRIME; |
| 195 | |
| 196 | return (uint32_t) (hash & prison->hash_mask); |
| 197 | } |
| 198 | |
| 199 | static int keys_equal(struct cell_key *lhs, struct cell_key *rhs) |
| 200 | { |
| 201 | return (lhs->virtual == rhs->virtual) && |
| 202 | (lhs->dev == rhs->dev) && |
| 203 | (lhs->block == rhs->block); |
| 204 | } |
| 205 | |
| 206 | static struct cell *__search_bucket(struct hlist_head *bucket, |
| 207 | struct cell_key *key) |
| 208 | { |
| 209 | struct cell *cell; |
| 210 | struct hlist_node *tmp; |
| 211 | |
| 212 | hlist_for_each_entry(cell, tmp, bucket, list) |
| 213 | if (keys_equal(&cell->key, key)) |
| 214 | return cell; |
| 215 | |
| 216 | return NULL; |
| 217 | } |
| 218 | |
| 219 | /* |
| 220 | * This may block if a new cell needs allocating. You must ensure that |
| 221 | * cells will be unlocked even if the calling thread is blocked. |
| 222 | * |
| 223 | * Returns the number of entries in the cell prior to the new addition |
| 224 | * or < 0 on failure. |
| 225 | */ |
| 226 | static int bio_detain(struct bio_prison *prison, struct cell_key *key, |
| 227 | struct bio *inmate, struct cell **ref) |
| 228 | { |
| 229 | int r; |
| 230 | unsigned long flags; |
| 231 | uint32_t hash = hash_key(prison, key); |
| 232 | struct cell *uninitialized_var(cell), *cell2 = NULL; |
| 233 | |
| 234 | BUG_ON(hash > prison->nr_buckets); |
| 235 | |
| 236 | spin_lock_irqsave(&prison->lock, flags); |
| 237 | cell = __search_bucket(prison->cells + hash, key); |
| 238 | |
| 239 | if (!cell) { |
| 240 | /* |
| 241 | * Allocate a new cell |
| 242 | */ |
| 243 | spin_unlock_irqrestore(&prison->lock, flags); |
| 244 | cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO); |
| 245 | spin_lock_irqsave(&prison->lock, flags); |
| 246 | |
| 247 | /* |
| 248 | * We've been unlocked, so we have to double check that |
| 249 | * nobody else has inserted this cell in the meantime. |
| 250 | */ |
| 251 | cell = __search_bucket(prison->cells + hash, key); |
| 252 | |
| 253 | if (!cell) { |
| 254 | cell = cell2; |
| 255 | cell2 = NULL; |
| 256 | |
| 257 | cell->prison = prison; |
| 258 | memcpy(&cell->key, key, sizeof(cell->key)); |
| 259 | cell->count = 0; |
| 260 | bio_list_init(&cell->bios); |
| 261 | hlist_add_head(&cell->list, prison->cells + hash); |
| 262 | } |
| 263 | } |
| 264 | |
| 265 | r = cell->count++; |
| 266 | bio_list_add(&cell->bios, inmate); |
| 267 | spin_unlock_irqrestore(&prison->lock, flags); |
| 268 | |
| 269 | if (cell2) |
| 270 | mempool_free(cell2, prison->cell_pool); |
| 271 | |
| 272 | *ref = cell; |
| 273 | |
| 274 | return r; |
| 275 | } |
| 276 | |
| 277 | /* |
| 278 | * @inmates must have been initialised prior to this call |
| 279 | */ |
| 280 | static void __cell_release(struct cell *cell, struct bio_list *inmates) |
| 281 | { |
| 282 | struct bio_prison *prison = cell->prison; |
| 283 | |
| 284 | hlist_del(&cell->list); |
| 285 | |
| 286 | if (inmates) |
| 287 | bio_list_merge(inmates, &cell->bios); |
| 288 | |
| 289 | mempool_free(cell, prison->cell_pool); |
| 290 | } |
| 291 | |
| 292 | static void cell_release(struct cell *cell, struct bio_list *bios) |
| 293 | { |
| 294 | unsigned long flags; |
| 295 | struct bio_prison *prison = cell->prison; |
| 296 | |
| 297 | spin_lock_irqsave(&prison->lock, flags); |
| 298 | __cell_release(cell, bios); |
| 299 | spin_unlock_irqrestore(&prison->lock, flags); |
| 300 | } |
| 301 | |
| 302 | /* |
| 303 | * There are a couple of places where we put a bio into a cell briefly |
| 304 | * before taking it out again. In these situations we know that no other |
| 305 | * bio may be in the cell. This function releases the cell, and also does |
| 306 | * a sanity check. |
| 307 | */ |
| 308 | static void cell_release_singleton(struct cell *cell, struct bio *bio) |
| 309 | { |
| 310 | struct bio_prison *prison = cell->prison; |
| 311 | struct bio_list bios; |
| 312 | struct bio *b; |
| 313 | unsigned long flags; |
| 314 | |
| 315 | bio_list_init(&bios); |
| 316 | |
| 317 | spin_lock_irqsave(&prison->lock, flags); |
| 318 | __cell_release(cell, &bios); |
| 319 | spin_unlock_irqrestore(&prison->lock, flags); |
| 320 | |
| 321 | b = bio_list_pop(&bios); |
| 322 | BUG_ON(b != bio); |
| 323 | BUG_ON(!bio_list_empty(&bios)); |
| 324 | } |
| 325 | |
| 326 | static void cell_error(struct cell *cell) |
| 327 | { |
| 328 | struct bio_prison *prison = cell->prison; |
| 329 | struct bio_list bios; |
| 330 | struct bio *bio; |
| 331 | unsigned long flags; |
| 332 | |
| 333 | bio_list_init(&bios); |
| 334 | |
| 335 | spin_lock_irqsave(&prison->lock, flags); |
| 336 | __cell_release(cell, &bios); |
| 337 | spin_unlock_irqrestore(&prison->lock, flags); |
| 338 | |
| 339 | while ((bio = bio_list_pop(&bios))) |
| 340 | bio_io_error(bio); |
| 341 | } |
| 342 | |
| 343 | /*----------------------------------------------------------------*/ |
| 344 | |
| 345 | /* |
| 346 | * We use the deferred set to keep track of pending reads to shared blocks. |
| 347 | * We do this to ensure the new mapping caused by a write isn't performed |
| 348 | * until these prior reads have completed. Otherwise the insertion of the |
| 349 | * new mapping could free the old block that the read bios are mapped to. |
| 350 | */ |
| 351 | |
| 352 | struct deferred_set; |
| 353 | struct deferred_entry { |
| 354 | struct deferred_set *ds; |
| 355 | unsigned count; |
| 356 | struct list_head work_items; |
| 357 | }; |
| 358 | |
| 359 | struct deferred_set { |
| 360 | spinlock_t lock; |
| 361 | unsigned current_entry; |
| 362 | unsigned sweeper; |
| 363 | struct deferred_entry entries[DEFERRED_SET_SIZE]; |
| 364 | }; |
| 365 | |
| 366 | static void ds_init(struct deferred_set *ds) |
| 367 | { |
| 368 | int i; |
| 369 | |
| 370 | spin_lock_init(&ds->lock); |
| 371 | ds->current_entry = 0; |
| 372 | ds->sweeper = 0; |
| 373 | for (i = 0; i < DEFERRED_SET_SIZE; i++) { |
| 374 | ds->entries[i].ds = ds; |
| 375 | ds->entries[i].count = 0; |
| 376 | INIT_LIST_HEAD(&ds->entries[i].work_items); |
| 377 | } |
| 378 | } |
| 379 | |
| 380 | static struct deferred_entry *ds_inc(struct deferred_set *ds) |
| 381 | { |
| 382 | unsigned long flags; |
| 383 | struct deferred_entry *entry; |
| 384 | |
| 385 | spin_lock_irqsave(&ds->lock, flags); |
| 386 | entry = ds->entries + ds->current_entry; |
| 387 | entry->count++; |
| 388 | spin_unlock_irqrestore(&ds->lock, flags); |
| 389 | |
| 390 | return entry; |
| 391 | } |
| 392 | |
| 393 | static unsigned ds_next(unsigned index) |
| 394 | { |
| 395 | return (index + 1) % DEFERRED_SET_SIZE; |
| 396 | } |
| 397 | |
| 398 | static void __sweep(struct deferred_set *ds, struct list_head *head) |
| 399 | { |
| 400 | while ((ds->sweeper != ds->current_entry) && |
| 401 | !ds->entries[ds->sweeper].count) { |
| 402 | list_splice_init(&ds->entries[ds->sweeper].work_items, head); |
| 403 | ds->sweeper = ds_next(ds->sweeper); |
| 404 | } |
| 405 | |
| 406 | if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count) |
| 407 | list_splice_init(&ds->entries[ds->sweeper].work_items, head); |
| 408 | } |
| 409 | |
| 410 | static void ds_dec(struct deferred_entry *entry, struct list_head *head) |
| 411 | { |
| 412 | unsigned long flags; |
| 413 | |
| 414 | spin_lock_irqsave(&entry->ds->lock, flags); |
| 415 | BUG_ON(!entry->count); |
| 416 | --entry->count; |
| 417 | __sweep(entry->ds, head); |
| 418 | spin_unlock_irqrestore(&entry->ds->lock, flags); |
| 419 | } |
| 420 | |
| 421 | /* |
| 422 | * Returns 1 if deferred or 0 if no pending items to delay job. |
| 423 | */ |
| 424 | static int ds_add_work(struct deferred_set *ds, struct list_head *work) |
| 425 | { |
| 426 | int r = 1; |
| 427 | unsigned long flags; |
| 428 | unsigned next_entry; |
| 429 | |
| 430 | spin_lock_irqsave(&ds->lock, flags); |
| 431 | if ((ds->sweeper == ds->current_entry) && |
| 432 | !ds->entries[ds->current_entry].count) |
| 433 | r = 0; |
| 434 | else { |
| 435 | list_add(work, &ds->entries[ds->current_entry].work_items); |
| 436 | next_entry = ds_next(ds->current_entry); |
| 437 | if (!ds->entries[next_entry].count) |
| 438 | ds->current_entry = next_entry; |
| 439 | } |
| 440 | spin_unlock_irqrestore(&ds->lock, flags); |
| 441 | |
| 442 | return r; |
| 443 | } |
| 444 | |
| 445 | /*----------------------------------------------------------------*/ |
| 446 | |
| 447 | /* |
| 448 | * Key building. |
| 449 | */ |
| 450 | static void build_data_key(struct dm_thin_device *td, |
| 451 | dm_block_t b, struct cell_key *key) |
| 452 | { |
| 453 | key->virtual = 0; |
| 454 | key->dev = dm_thin_dev_id(td); |
| 455 | key->block = b; |
| 456 | } |
| 457 | |
| 458 | static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, |
| 459 | struct cell_key *key) |
| 460 | { |
| 461 | key->virtual = 1; |
| 462 | key->dev = dm_thin_dev_id(td); |
| 463 | key->block = b; |
| 464 | } |
| 465 | |
| 466 | /*----------------------------------------------------------------*/ |
| 467 | |
| 468 | /* |
| 469 | * A pool device ties together a metadata device and a data device. It |
| 470 | * also provides the interface for creating and destroying internal |
| 471 | * devices. |
| 472 | */ |
| 473 | struct new_mapping; |
| 474 | struct pool { |
| 475 | struct list_head list; |
| 476 | struct dm_target *ti; /* Only set if a pool target is bound */ |
| 477 | |
| 478 | struct mapped_device *pool_md; |
| 479 | struct block_device *md_dev; |
| 480 | struct dm_pool_metadata *pmd; |
| 481 | |
| 482 | uint32_t sectors_per_block; |
| 483 | unsigned block_shift; |
| 484 | dm_block_t offset_mask; |
| 485 | dm_block_t low_water_blocks; |
| 486 | |
| 487 | unsigned zero_new_blocks:1; |
| 488 | unsigned low_water_triggered:1; /* A dm event has been sent */ |
| 489 | unsigned no_free_space:1; /* A -ENOSPC warning has been issued */ |
| 490 | |
| 491 | struct bio_prison *prison; |
| 492 | struct dm_kcopyd_client *copier; |
| 493 | |
| 494 | struct workqueue_struct *wq; |
| 495 | struct work_struct worker; |
| 496 | |
| 497 | unsigned ref_count; |
| 498 | |
| 499 | spinlock_t lock; |
| 500 | struct bio_list deferred_bios; |
| 501 | struct bio_list deferred_flush_bios; |
| 502 | struct list_head prepared_mappings; |
| 503 | |
| 504 | struct bio_list retry_on_resume_list; |
| 505 | |
| 506 | struct deferred_set ds; /* FIXME: move to thin_c */ |
| 507 | |
| 508 | struct new_mapping *next_mapping; |
| 509 | mempool_t *mapping_pool; |
| 510 | mempool_t *endio_hook_pool; |
| 511 | }; |
| 512 | |
| 513 | /* |
| 514 | * Target context for a pool. |
| 515 | */ |
| 516 | struct pool_c { |
| 517 | struct dm_target *ti; |
| 518 | struct pool *pool; |
| 519 | struct dm_dev *data_dev; |
| 520 | struct dm_dev *metadata_dev; |
| 521 | struct dm_target_callbacks callbacks; |
| 522 | |
| 523 | dm_block_t low_water_blocks; |
| 524 | unsigned zero_new_blocks:1; |
| 525 | }; |
| 526 | |
| 527 | /* |
| 528 | * Target context for a thin. |
| 529 | */ |
| 530 | struct thin_c { |
| 531 | struct dm_dev *pool_dev; |
| 532 | dm_thin_id dev_id; |
| 533 | |
| 534 | struct pool *pool; |
| 535 | struct dm_thin_device *td; |
| 536 | }; |
| 537 | |
| 538 | /*----------------------------------------------------------------*/ |
| 539 | |
| 540 | /* |
| 541 | * A global list of pools that uses a struct mapped_device as a key. |
| 542 | */ |
| 543 | static struct dm_thin_pool_table { |
| 544 | struct mutex mutex; |
| 545 | struct list_head pools; |
| 546 | } dm_thin_pool_table; |
| 547 | |
| 548 | static void pool_table_init(void) |
| 549 | { |
| 550 | mutex_init(&dm_thin_pool_table.mutex); |
| 551 | INIT_LIST_HEAD(&dm_thin_pool_table.pools); |
| 552 | } |
| 553 | |
| 554 | static void __pool_table_insert(struct pool *pool) |
| 555 | { |
| 556 | BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); |
| 557 | list_add(&pool->list, &dm_thin_pool_table.pools); |
| 558 | } |
| 559 | |
| 560 | static void __pool_table_remove(struct pool *pool) |
| 561 | { |
| 562 | BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); |
| 563 | list_del(&pool->list); |
| 564 | } |
| 565 | |
| 566 | static struct pool *__pool_table_lookup(struct mapped_device *md) |
| 567 | { |
| 568 | struct pool *pool = NULL, *tmp; |
| 569 | |
| 570 | BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); |
| 571 | |
| 572 | list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) { |
| 573 | if (tmp->pool_md == md) { |
| 574 | pool = tmp; |
| 575 | break; |
| 576 | } |
| 577 | } |
| 578 | |
| 579 | return pool; |
| 580 | } |
| 581 | |
| 582 | static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev) |
| 583 | { |
| 584 | struct pool *pool = NULL, *tmp; |
| 585 | |
| 586 | BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); |
| 587 | |
| 588 | list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) { |
| 589 | if (tmp->md_dev == md_dev) { |
| 590 | pool = tmp; |
| 591 | break; |
| 592 | } |
| 593 | } |
| 594 | |
| 595 | return pool; |
| 596 | } |
| 597 | |
| 598 | /*----------------------------------------------------------------*/ |
| 599 | |
| 600 | static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) |
| 601 | { |
| 602 | struct bio *bio; |
| 603 | struct bio_list bios; |
| 604 | |
| 605 | bio_list_init(&bios); |
| 606 | bio_list_merge(&bios, master); |
| 607 | bio_list_init(master); |
| 608 | |
| 609 | while ((bio = bio_list_pop(&bios))) { |
| 610 | if (dm_get_mapinfo(bio)->ptr == tc) |
| 611 | bio_endio(bio, DM_ENDIO_REQUEUE); |
| 612 | else |
| 613 | bio_list_add(master, bio); |
| 614 | } |
| 615 | } |
| 616 | |
| 617 | static void requeue_io(struct thin_c *tc) |
| 618 | { |
| 619 | struct pool *pool = tc->pool; |
| 620 | unsigned long flags; |
| 621 | |
| 622 | spin_lock_irqsave(&pool->lock, flags); |
| 623 | __requeue_bio_list(tc, &pool->deferred_bios); |
| 624 | __requeue_bio_list(tc, &pool->retry_on_resume_list); |
| 625 | spin_unlock_irqrestore(&pool->lock, flags); |
| 626 | } |
| 627 | |
| 628 | /* |
| 629 | * This section of code contains the logic for processing a thin device's IO. |
| 630 | * Much of the code depends on pool object resources (lists, workqueues, etc) |
| 631 | * but most is exclusively called from the thin target rather than the thin-pool |
| 632 | * target. |
| 633 | */ |
| 634 | |
| 635 | static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) |
| 636 | { |
| 637 | return bio->bi_sector >> tc->pool->block_shift; |
| 638 | } |
| 639 | |
| 640 | static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) |
| 641 | { |
| 642 | struct pool *pool = tc->pool; |
| 643 | |
| 644 | bio->bi_bdev = tc->pool_dev->bdev; |
| 645 | bio->bi_sector = (block << pool->block_shift) + |
| 646 | (bio->bi_sector & pool->offset_mask); |
| 647 | } |
| 648 | |
| 649 | static void remap_and_issue(struct thin_c *tc, struct bio *bio, |
| 650 | dm_block_t block) |
| 651 | { |
| 652 | struct pool *pool = tc->pool; |
| 653 | unsigned long flags; |
| 654 | |
| 655 | remap(tc, bio, block); |
| 656 | |
| 657 | /* |
| 658 | * Batch together any FUA/FLUSH bios we find and then issue |
| 659 | * a single commit for them in process_deferred_bios(). |
| 660 | */ |
| 661 | if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { |
| 662 | spin_lock_irqsave(&pool->lock, flags); |
| 663 | bio_list_add(&pool->deferred_flush_bios, bio); |
| 664 | spin_unlock_irqrestore(&pool->lock, flags); |
| 665 | } else |
| 666 | generic_make_request(bio); |
| 667 | } |
| 668 | |
| 669 | /* |
| 670 | * wake_worker() is used when new work is queued and when pool_resume is |
| 671 | * ready to continue deferred IO processing. |
| 672 | */ |
| 673 | static void wake_worker(struct pool *pool) |
| 674 | { |
| 675 | queue_work(pool->wq, &pool->worker); |
| 676 | } |
| 677 | |
| 678 | /*----------------------------------------------------------------*/ |
| 679 | |
| 680 | /* |
| 681 | * Bio endio functions. |
| 682 | */ |
| 683 | struct endio_hook { |
| 684 | struct thin_c *tc; |
| 685 | bio_end_io_t *saved_bi_end_io; |
| 686 | struct deferred_entry *entry; |
| 687 | }; |
| 688 | |
| 689 | struct new_mapping { |
| 690 | struct list_head list; |
| 691 | |
| 692 | int prepared; |
| 693 | |
| 694 | struct thin_c *tc; |
| 695 | dm_block_t virt_block; |
| 696 | dm_block_t data_block; |
| 697 | struct cell *cell; |
| 698 | int err; |
| 699 | |
| 700 | /* |
| 701 | * If the bio covers the whole area of a block then we can avoid |
| 702 | * zeroing or copying. Instead this bio is hooked. The bio will |
| 703 | * still be in the cell, so care has to be taken to avoid issuing |
| 704 | * the bio twice. |
| 705 | */ |
| 706 | struct bio *bio; |
| 707 | bio_end_io_t *saved_bi_end_io; |
| 708 | }; |
| 709 | |
| 710 | static void __maybe_add_mapping(struct new_mapping *m) |
| 711 | { |
| 712 | struct pool *pool = m->tc->pool; |
| 713 | |
| 714 | if (list_empty(&m->list) && m->prepared) { |
| 715 | list_add(&m->list, &pool->prepared_mappings); |
| 716 | wake_worker(pool); |
| 717 | } |
| 718 | } |
| 719 | |
| 720 | static void copy_complete(int read_err, unsigned long write_err, void *context) |
| 721 | { |
| 722 | unsigned long flags; |
| 723 | struct new_mapping *m = context; |
| 724 | struct pool *pool = m->tc->pool; |
| 725 | |
| 726 | m->err = read_err || write_err ? -EIO : 0; |
| 727 | |
| 728 | spin_lock_irqsave(&pool->lock, flags); |
| 729 | m->prepared = 1; |
| 730 | __maybe_add_mapping(m); |
| 731 | spin_unlock_irqrestore(&pool->lock, flags); |
| 732 | } |
| 733 | |
| 734 | static void overwrite_endio(struct bio *bio, int err) |
| 735 | { |
| 736 | unsigned long flags; |
| 737 | struct new_mapping *m = dm_get_mapinfo(bio)->ptr; |
| 738 | struct pool *pool = m->tc->pool; |
| 739 | |
| 740 | m->err = err; |
| 741 | |
| 742 | spin_lock_irqsave(&pool->lock, flags); |
| 743 | m->prepared = 1; |
| 744 | __maybe_add_mapping(m); |
| 745 | spin_unlock_irqrestore(&pool->lock, flags); |
| 746 | } |
| 747 | |
| 748 | static void shared_read_endio(struct bio *bio, int err) |
| 749 | { |
| 750 | struct list_head mappings; |
| 751 | struct new_mapping *m, *tmp; |
| 752 | struct endio_hook *h = dm_get_mapinfo(bio)->ptr; |
| 753 | unsigned long flags; |
| 754 | struct pool *pool = h->tc->pool; |
| 755 | |
| 756 | bio->bi_end_io = h->saved_bi_end_io; |
| 757 | bio_endio(bio, err); |
| 758 | |
| 759 | INIT_LIST_HEAD(&mappings); |
| 760 | ds_dec(h->entry, &mappings); |
| 761 | |
| 762 | spin_lock_irqsave(&pool->lock, flags); |
| 763 | list_for_each_entry_safe(m, tmp, &mappings, list) { |
| 764 | list_del(&m->list); |
| 765 | INIT_LIST_HEAD(&m->list); |
| 766 | __maybe_add_mapping(m); |
| 767 | } |
| 768 | spin_unlock_irqrestore(&pool->lock, flags); |
| 769 | |
| 770 | mempool_free(h, pool->endio_hook_pool); |
| 771 | } |
| 772 | |
| 773 | /*----------------------------------------------------------------*/ |
| 774 | |
| 775 | /* |
| 776 | * Workqueue. |
| 777 | */ |
| 778 | |
| 779 | /* |
| 780 | * Prepared mapping jobs. |
| 781 | */ |
| 782 | |
| 783 | /* |
| 784 | * This sends the bios in the cell back to the deferred_bios list. |
| 785 | */ |
| 786 | static void cell_defer(struct thin_c *tc, struct cell *cell, |
| 787 | dm_block_t data_block) |
| 788 | { |
| 789 | struct pool *pool = tc->pool; |
| 790 | unsigned long flags; |
| 791 | |
| 792 | spin_lock_irqsave(&pool->lock, flags); |
| 793 | cell_release(cell, &pool->deferred_bios); |
| 794 | spin_unlock_irqrestore(&tc->pool->lock, flags); |
| 795 | |
| 796 | wake_worker(pool); |
| 797 | } |
| 798 | |
| 799 | /* |
| 800 | * Same as cell_defer above, except it omits one particular detainee, |
| 801 | * a write bio that covers the block and has already been processed. |
| 802 | */ |
| 803 | static void cell_defer_except(struct thin_c *tc, struct cell *cell, |
| 804 | struct bio *exception) |
| 805 | { |
| 806 | struct bio_list bios; |
| 807 | struct bio *bio; |
| 808 | struct pool *pool = tc->pool; |
| 809 | unsigned long flags; |
| 810 | |
| 811 | bio_list_init(&bios); |
| 812 | cell_release(cell, &bios); |
| 813 | |
| 814 | spin_lock_irqsave(&pool->lock, flags); |
| 815 | while ((bio = bio_list_pop(&bios))) |
| 816 | if (bio != exception) |
| 817 | bio_list_add(&pool->deferred_bios, bio); |
| 818 | spin_unlock_irqrestore(&pool->lock, flags); |
| 819 | |
| 820 | wake_worker(pool); |
| 821 | } |
| 822 | |
| 823 | static void process_prepared_mapping(struct new_mapping *m) |
| 824 | { |
| 825 | struct thin_c *tc = m->tc; |
| 826 | struct bio *bio; |
| 827 | int r; |
| 828 | |
| 829 | bio = m->bio; |
| 830 | if (bio) |
| 831 | bio->bi_end_io = m->saved_bi_end_io; |
| 832 | |
| 833 | if (m->err) { |
| 834 | cell_error(m->cell); |
| 835 | return; |
| 836 | } |
| 837 | |
| 838 | /* |
| 839 | * Commit the prepared block into the mapping btree. |
| 840 | * Any I/O for this block arriving after this point will get |
| 841 | * remapped to it directly. |
| 842 | */ |
| 843 | r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); |
| 844 | if (r) { |
| 845 | DMERR("dm_thin_insert_block() failed"); |
| 846 | cell_error(m->cell); |
| 847 | return; |
| 848 | } |
| 849 | |
| 850 | /* |
| 851 | * Release any bios held while the block was being provisioned. |
| 852 | * If we are processing a write bio that completely covers the block, |
| 853 | * we already processed it so can ignore it now when processing |
| 854 | * the bios in the cell. |
| 855 | */ |
| 856 | if (bio) { |
| 857 | cell_defer_except(tc, m->cell, bio); |
| 858 | bio_endio(bio, 0); |
| 859 | } else |
| 860 | cell_defer(tc, m->cell, m->data_block); |
| 861 | |
| 862 | list_del(&m->list); |
| 863 | mempool_free(m, tc->pool->mapping_pool); |
| 864 | } |
| 865 | |
| 866 | static void process_prepared_mappings(struct pool *pool) |
| 867 | { |
| 868 | unsigned long flags; |
| 869 | struct list_head maps; |
| 870 | struct new_mapping *m, *tmp; |
| 871 | |
| 872 | INIT_LIST_HEAD(&maps); |
| 873 | spin_lock_irqsave(&pool->lock, flags); |
| 874 | list_splice_init(&pool->prepared_mappings, &maps); |
| 875 | spin_unlock_irqrestore(&pool->lock, flags); |
| 876 | |
| 877 | list_for_each_entry_safe(m, tmp, &maps, list) |
| 878 | process_prepared_mapping(m); |
| 879 | } |
| 880 | |
| 881 | /* |
| 882 | * Deferred bio jobs. |
| 883 | */ |
| 884 | static int io_overwrites_block(struct pool *pool, struct bio *bio) |
| 885 | { |
| 886 | return ((bio_data_dir(bio) == WRITE) && |
| 887 | !(bio->bi_sector & pool->offset_mask)) && |
| 888 | (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT)); |
| 889 | } |
| 890 | |
| 891 | static void save_and_set_endio(struct bio *bio, bio_end_io_t **save, |
| 892 | bio_end_io_t *fn) |
| 893 | { |
| 894 | *save = bio->bi_end_io; |
| 895 | bio->bi_end_io = fn; |
| 896 | } |
| 897 | |
| 898 | static int ensure_next_mapping(struct pool *pool) |
| 899 | { |
| 900 | if (pool->next_mapping) |
| 901 | return 0; |
| 902 | |
| 903 | pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC); |
| 904 | |
| 905 | return pool->next_mapping ? 0 : -ENOMEM; |
| 906 | } |
| 907 | |
| 908 | static struct new_mapping *get_next_mapping(struct pool *pool) |
| 909 | { |
| 910 | struct new_mapping *r = pool->next_mapping; |
| 911 | |
| 912 | BUG_ON(!pool->next_mapping); |
| 913 | |
| 914 | pool->next_mapping = NULL; |
| 915 | |
| 916 | return r; |
| 917 | } |
| 918 | |
| 919 | static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, |
| 920 | dm_block_t data_origin, dm_block_t data_dest, |
| 921 | struct cell *cell, struct bio *bio) |
| 922 | { |
| 923 | int r; |
| 924 | struct pool *pool = tc->pool; |
| 925 | struct new_mapping *m = get_next_mapping(pool); |
| 926 | |
| 927 | INIT_LIST_HEAD(&m->list); |
| 928 | m->prepared = 0; |
| 929 | m->tc = tc; |
| 930 | m->virt_block = virt_block; |
| 931 | m->data_block = data_dest; |
| 932 | m->cell = cell; |
| 933 | m->err = 0; |
| 934 | m->bio = NULL; |
| 935 | |
| 936 | ds_add_work(&pool->ds, &m->list); |
| 937 | |
| 938 | /* |
| 939 | * IO to pool_dev remaps to the pool target's data_dev. |
| 940 | * |
| 941 | * If the whole block of data is being overwritten, we can issue the |
| 942 | * bio immediately. Otherwise we use kcopyd to clone the data first. |
| 943 | */ |
| 944 | if (io_overwrites_block(pool, bio)) { |
| 945 | m->bio = bio; |
| 946 | save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); |
| 947 | dm_get_mapinfo(bio)->ptr = m; |
| 948 | remap_and_issue(tc, bio, data_dest); |
| 949 | } else { |
| 950 | struct dm_io_region from, to; |
| 951 | |
| 952 | from.bdev = tc->pool_dev->bdev; |
| 953 | from.sector = data_origin * pool->sectors_per_block; |
| 954 | from.count = pool->sectors_per_block; |
| 955 | |
| 956 | to.bdev = tc->pool_dev->bdev; |
| 957 | to.sector = data_dest * pool->sectors_per_block; |
| 958 | to.count = pool->sectors_per_block; |
| 959 | |
| 960 | r = dm_kcopyd_copy(pool->copier, &from, 1, &to, |
| 961 | 0, copy_complete, m); |
| 962 | if (r < 0) { |
| 963 | mempool_free(m, pool->mapping_pool); |
| 964 | DMERR("dm_kcopyd_copy() failed"); |
| 965 | cell_error(cell); |
| 966 | } |
| 967 | } |
| 968 | } |
| 969 | |
| 970 | static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, |
| 971 | dm_block_t data_block, struct cell *cell, |
| 972 | struct bio *bio) |
| 973 | { |
| 974 | struct pool *pool = tc->pool; |
| 975 | struct new_mapping *m = get_next_mapping(pool); |
| 976 | |
| 977 | INIT_LIST_HEAD(&m->list); |
| 978 | m->prepared = 0; |
| 979 | m->tc = tc; |
| 980 | m->virt_block = virt_block; |
| 981 | m->data_block = data_block; |
| 982 | m->cell = cell; |
| 983 | m->err = 0; |
| 984 | m->bio = NULL; |
| 985 | |
| 986 | /* |
| 987 | * If the whole block of data is being overwritten or we are not |
| 988 | * zeroing pre-existing data, we can issue the bio immediately. |
| 989 | * Otherwise we use kcopyd to zero the data first. |
| 990 | */ |
| 991 | if (!pool->zero_new_blocks) |
| 992 | process_prepared_mapping(m); |
| 993 | |
| 994 | else if (io_overwrites_block(pool, bio)) { |
| 995 | m->bio = bio; |
| 996 | save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); |
| 997 | dm_get_mapinfo(bio)->ptr = m; |
| 998 | remap_and_issue(tc, bio, data_block); |
| 999 | |
| 1000 | } else { |
| 1001 | int r; |
| 1002 | struct dm_io_region to; |
| 1003 | |
| 1004 | to.bdev = tc->pool_dev->bdev; |
| 1005 | to.sector = data_block * pool->sectors_per_block; |
| 1006 | to.count = pool->sectors_per_block; |
| 1007 | |
| 1008 | r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m); |
| 1009 | if (r < 0) { |
| 1010 | mempool_free(m, pool->mapping_pool); |
| 1011 | DMERR("dm_kcopyd_zero() failed"); |
| 1012 | cell_error(cell); |
| 1013 | } |
| 1014 | } |
| 1015 | } |
| 1016 | |
| 1017 | static int alloc_data_block(struct thin_c *tc, dm_block_t *result) |
| 1018 | { |
| 1019 | int r; |
| 1020 | dm_block_t free_blocks; |
| 1021 | unsigned long flags; |
| 1022 | struct pool *pool = tc->pool; |
| 1023 | |
| 1024 | r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); |
| 1025 | if (r) |
| 1026 | return r; |
| 1027 | |
| 1028 | if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) { |
| 1029 | DMWARN("%s: reached low water mark, sending event.", |
| 1030 | dm_device_name(pool->pool_md)); |
| 1031 | spin_lock_irqsave(&pool->lock, flags); |
| 1032 | pool->low_water_triggered = 1; |
| 1033 | spin_unlock_irqrestore(&pool->lock, flags); |
| 1034 | dm_table_event(pool->ti->table); |
| 1035 | } |
| 1036 | |
| 1037 | if (!free_blocks) { |
| 1038 | if (pool->no_free_space) |
| 1039 | return -ENOSPC; |
| 1040 | else { |
| 1041 | /* |
| 1042 | * Try to commit to see if that will free up some |
| 1043 | * more space. |
| 1044 | */ |
| 1045 | r = dm_pool_commit_metadata(pool->pmd); |
| 1046 | if (r) { |
| 1047 | DMERR("%s: dm_pool_commit_metadata() failed, error = %d", |
| 1048 | __func__, r); |
| 1049 | return r; |
| 1050 | } |
| 1051 | |
| 1052 | r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); |
| 1053 | if (r) |
| 1054 | return r; |
| 1055 | |
| 1056 | /* |
| 1057 | * If we still have no space we set a flag to avoid |
| 1058 | * doing all this checking and return -ENOSPC. |
| 1059 | */ |
| 1060 | if (!free_blocks) { |
| 1061 | DMWARN("%s: no free space available.", |
| 1062 | dm_device_name(pool->pool_md)); |
| 1063 | spin_lock_irqsave(&pool->lock, flags); |
| 1064 | pool->no_free_space = 1; |
| 1065 | spin_unlock_irqrestore(&pool->lock, flags); |
| 1066 | return -ENOSPC; |
| 1067 | } |
| 1068 | } |
| 1069 | } |
| 1070 | |
| 1071 | r = dm_pool_alloc_data_block(pool->pmd, result); |
| 1072 | if (r) |
| 1073 | return r; |
| 1074 | |
| 1075 | return 0; |
| 1076 | } |
| 1077 | |
| 1078 | /* |
| 1079 | * If we have run out of space, queue bios until the device is |
| 1080 | * resumed, presumably after having been reloaded with more space. |
| 1081 | */ |
| 1082 | static void retry_on_resume(struct bio *bio) |
| 1083 | { |
| 1084 | struct thin_c *tc = dm_get_mapinfo(bio)->ptr; |
| 1085 | struct pool *pool = tc->pool; |
| 1086 | unsigned long flags; |
| 1087 | |
| 1088 | spin_lock_irqsave(&pool->lock, flags); |
| 1089 | bio_list_add(&pool->retry_on_resume_list, bio); |
| 1090 | spin_unlock_irqrestore(&pool->lock, flags); |
| 1091 | } |
| 1092 | |
| 1093 | static void no_space(struct cell *cell) |
| 1094 | { |
| 1095 | struct bio *bio; |
| 1096 | struct bio_list bios; |
| 1097 | |
| 1098 | bio_list_init(&bios); |
| 1099 | cell_release(cell, &bios); |
| 1100 | |
| 1101 | while ((bio = bio_list_pop(&bios))) |
| 1102 | retry_on_resume(bio); |
| 1103 | } |
| 1104 | |
| 1105 | static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, |
| 1106 | struct cell_key *key, |
| 1107 | struct dm_thin_lookup_result *lookup_result, |
| 1108 | struct cell *cell) |
| 1109 | { |
| 1110 | int r; |
| 1111 | dm_block_t data_block; |
| 1112 | |
| 1113 | r = alloc_data_block(tc, &data_block); |
| 1114 | switch (r) { |
| 1115 | case 0: |
| 1116 | schedule_copy(tc, block, lookup_result->block, |
| 1117 | data_block, cell, bio); |
| 1118 | break; |
| 1119 | |
| 1120 | case -ENOSPC: |
| 1121 | no_space(cell); |
| 1122 | break; |
| 1123 | |
| 1124 | default: |
| 1125 | DMERR("%s: alloc_data_block() failed, error = %d", __func__, r); |
| 1126 | cell_error(cell); |
| 1127 | break; |
| 1128 | } |
| 1129 | } |
| 1130 | |
| 1131 | static void process_shared_bio(struct thin_c *tc, struct bio *bio, |
| 1132 | dm_block_t block, |
| 1133 | struct dm_thin_lookup_result *lookup_result) |
| 1134 | { |
| 1135 | struct cell *cell; |
| 1136 | struct pool *pool = tc->pool; |
| 1137 | struct cell_key key; |
| 1138 | |
| 1139 | /* |
| 1140 | * If cell is already occupied, then sharing is already in the process |
| 1141 | * of being broken so we have nothing further to do here. |
| 1142 | */ |
| 1143 | build_data_key(tc->td, lookup_result->block, &key); |
| 1144 | if (bio_detain(pool->prison, &key, bio, &cell)) |
| 1145 | return; |
| 1146 | |
| 1147 | if (bio_data_dir(bio) == WRITE) |
| 1148 | break_sharing(tc, bio, block, &key, lookup_result, cell); |
| 1149 | else { |
| 1150 | struct endio_hook *h; |
| 1151 | h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO); |
| 1152 | |
| 1153 | h->tc = tc; |
| 1154 | h->entry = ds_inc(&pool->ds); |
| 1155 | save_and_set_endio(bio, &h->saved_bi_end_io, shared_read_endio); |
| 1156 | dm_get_mapinfo(bio)->ptr = h; |
| 1157 | |
| 1158 | cell_release_singleton(cell, bio); |
| 1159 | remap_and_issue(tc, bio, lookup_result->block); |
| 1160 | } |
| 1161 | } |
| 1162 | |
| 1163 | static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block, |
| 1164 | struct cell *cell) |
| 1165 | { |
| 1166 | int r; |
| 1167 | dm_block_t data_block; |
| 1168 | |
| 1169 | /* |
| 1170 | * Remap empty bios (flushes) immediately, without provisioning. |
| 1171 | */ |
| 1172 | if (!bio->bi_size) { |
| 1173 | cell_release_singleton(cell, bio); |
| 1174 | remap_and_issue(tc, bio, 0); |
| 1175 | return; |
| 1176 | } |
| 1177 | |
| 1178 | /* |
| 1179 | * Fill read bios with zeroes and complete them immediately. |
| 1180 | */ |
| 1181 | if (bio_data_dir(bio) == READ) { |
| 1182 | zero_fill_bio(bio); |
| 1183 | cell_release_singleton(cell, bio); |
| 1184 | bio_endio(bio, 0); |
| 1185 | return; |
| 1186 | } |
| 1187 | |
| 1188 | r = alloc_data_block(tc, &data_block); |
| 1189 | switch (r) { |
| 1190 | case 0: |
| 1191 | schedule_zero(tc, block, data_block, cell, bio); |
| 1192 | break; |
| 1193 | |
| 1194 | case -ENOSPC: |
| 1195 | no_space(cell); |
| 1196 | break; |
| 1197 | |
| 1198 | default: |
| 1199 | DMERR("%s: alloc_data_block() failed, error = %d", __func__, r); |
| 1200 | cell_error(cell); |
| 1201 | break; |
| 1202 | } |
| 1203 | } |
| 1204 | |
| 1205 | static void process_bio(struct thin_c *tc, struct bio *bio) |
| 1206 | { |
| 1207 | int r; |
| 1208 | dm_block_t block = get_bio_block(tc, bio); |
| 1209 | struct cell *cell; |
| 1210 | struct cell_key key; |
| 1211 | struct dm_thin_lookup_result lookup_result; |
| 1212 | |
| 1213 | /* |
| 1214 | * If cell is already occupied, then the block is already |
| 1215 | * being provisioned so we have nothing further to do here. |
| 1216 | */ |
| 1217 | build_virtual_key(tc->td, block, &key); |
| 1218 | if (bio_detain(tc->pool->prison, &key, bio, &cell)) |
| 1219 | return; |
| 1220 | |
| 1221 | r = dm_thin_find_block(tc->td, block, 1, &lookup_result); |
| 1222 | switch (r) { |
| 1223 | case 0: |
| 1224 | /* |
| 1225 | * We can release this cell now. This thread is the only |
| 1226 | * one that puts bios into a cell, and we know there were |
| 1227 | * no preceding bios. |
| 1228 | */ |
| 1229 | /* |
| 1230 | * TODO: this will probably have to change when discard goes |
| 1231 | * back in. |
| 1232 | */ |
| 1233 | cell_release_singleton(cell, bio); |
| 1234 | |
| 1235 | if (lookup_result.shared) |
| 1236 | process_shared_bio(tc, bio, block, &lookup_result); |
| 1237 | else |
| 1238 | remap_and_issue(tc, bio, lookup_result.block); |
| 1239 | break; |
| 1240 | |
| 1241 | case -ENODATA: |
| 1242 | provision_block(tc, bio, block, cell); |
| 1243 | break; |
| 1244 | |
| 1245 | default: |
| 1246 | DMERR("dm_thin_find_block() failed, error = %d", r); |
| 1247 | bio_io_error(bio); |
| 1248 | break; |
| 1249 | } |
| 1250 | } |
| 1251 | |
| 1252 | static void process_deferred_bios(struct pool *pool) |
| 1253 | { |
| 1254 | unsigned long flags; |
| 1255 | struct bio *bio; |
| 1256 | struct bio_list bios; |
| 1257 | int r; |
| 1258 | |
| 1259 | bio_list_init(&bios); |
| 1260 | |
| 1261 | spin_lock_irqsave(&pool->lock, flags); |
| 1262 | bio_list_merge(&bios, &pool->deferred_bios); |
| 1263 | bio_list_init(&pool->deferred_bios); |
| 1264 | spin_unlock_irqrestore(&pool->lock, flags); |
| 1265 | |
| 1266 | while ((bio = bio_list_pop(&bios))) { |
| 1267 | struct thin_c *tc = dm_get_mapinfo(bio)->ptr; |
| 1268 | /* |
| 1269 | * If we've got no free new_mapping structs, and processing |
| 1270 | * this bio might require one, we pause until there are some |
| 1271 | * prepared mappings to process. |
| 1272 | */ |
| 1273 | if (ensure_next_mapping(pool)) { |
| 1274 | spin_lock_irqsave(&pool->lock, flags); |
| 1275 | bio_list_merge(&pool->deferred_bios, &bios); |
| 1276 | spin_unlock_irqrestore(&pool->lock, flags); |
| 1277 | |
| 1278 | break; |
| 1279 | } |
| 1280 | process_bio(tc, bio); |
| 1281 | } |
| 1282 | |
| 1283 | /* |
| 1284 | * If there are any deferred flush bios, we must commit |
| 1285 | * the metadata before issuing them. |
| 1286 | */ |
| 1287 | bio_list_init(&bios); |
| 1288 | spin_lock_irqsave(&pool->lock, flags); |
| 1289 | bio_list_merge(&bios, &pool->deferred_flush_bios); |
| 1290 | bio_list_init(&pool->deferred_flush_bios); |
| 1291 | spin_unlock_irqrestore(&pool->lock, flags); |
| 1292 | |
| 1293 | if (bio_list_empty(&bios)) |
| 1294 | return; |
| 1295 | |
| 1296 | r = dm_pool_commit_metadata(pool->pmd); |
| 1297 | if (r) { |
| 1298 | DMERR("%s: dm_pool_commit_metadata() failed, error = %d", |
| 1299 | __func__, r); |
| 1300 | while ((bio = bio_list_pop(&bios))) |
| 1301 | bio_io_error(bio); |
| 1302 | return; |
| 1303 | } |
| 1304 | |
| 1305 | while ((bio = bio_list_pop(&bios))) |
| 1306 | generic_make_request(bio); |
| 1307 | } |
| 1308 | |
| 1309 | static void do_worker(struct work_struct *ws) |
| 1310 | { |
| 1311 | struct pool *pool = container_of(ws, struct pool, worker); |
| 1312 | |
| 1313 | process_prepared_mappings(pool); |
| 1314 | process_deferred_bios(pool); |
| 1315 | } |
| 1316 | |
| 1317 | /*----------------------------------------------------------------*/ |
| 1318 | |
| 1319 | /* |
| 1320 | * Mapping functions. |
| 1321 | */ |
| 1322 | |
| 1323 | /* |
| 1324 | * Called only while mapping a thin bio to hand it over to the workqueue. |
| 1325 | */ |
| 1326 | static void thin_defer_bio(struct thin_c *tc, struct bio *bio) |
| 1327 | { |
| 1328 | unsigned long flags; |
| 1329 | struct pool *pool = tc->pool; |
| 1330 | |
| 1331 | spin_lock_irqsave(&pool->lock, flags); |
| 1332 | bio_list_add(&pool->deferred_bios, bio); |
| 1333 | spin_unlock_irqrestore(&pool->lock, flags); |
| 1334 | |
| 1335 | wake_worker(pool); |
| 1336 | } |
| 1337 | |
| 1338 | /* |
| 1339 | * Non-blocking function called from the thin target's map function. |
| 1340 | */ |
| 1341 | static int thin_bio_map(struct dm_target *ti, struct bio *bio, |
| 1342 | union map_info *map_context) |
| 1343 | { |
| 1344 | int r; |
| 1345 | struct thin_c *tc = ti->private; |
| 1346 | dm_block_t block = get_bio_block(tc, bio); |
| 1347 | struct dm_thin_device *td = tc->td; |
| 1348 | struct dm_thin_lookup_result result; |
| 1349 | |
| 1350 | /* |
| 1351 | * Save the thin context for easy access from the deferred bio later. |
| 1352 | */ |
| 1353 | map_context->ptr = tc; |
| 1354 | |
| 1355 | if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { |
| 1356 | thin_defer_bio(tc, bio); |
| 1357 | return DM_MAPIO_SUBMITTED; |
| 1358 | } |
| 1359 | |
| 1360 | r = dm_thin_find_block(td, block, 0, &result); |
| 1361 | |
| 1362 | /* |
| 1363 | * Note that we defer readahead too. |
| 1364 | */ |
| 1365 | switch (r) { |
| 1366 | case 0: |
| 1367 | if (unlikely(result.shared)) { |
| 1368 | /* |
| 1369 | * We have a race condition here between the |
| 1370 | * result.shared value returned by the lookup and |
| 1371 | * snapshot creation, which may cause new |
| 1372 | * sharing. |
| 1373 | * |
| 1374 | * To avoid this always quiesce the origin before |
| 1375 | * taking the snap. You want to do this anyway to |
| 1376 | * ensure a consistent application view |
| 1377 | * (i.e. lockfs). |
| 1378 | * |
| 1379 | * More distant ancestors are irrelevant. The |
| 1380 | * shared flag will be set in their case. |
| 1381 | */ |
| 1382 | thin_defer_bio(tc, bio); |
| 1383 | r = DM_MAPIO_SUBMITTED; |
| 1384 | } else { |
| 1385 | remap(tc, bio, result.block); |
| 1386 | r = DM_MAPIO_REMAPPED; |
| 1387 | } |
| 1388 | break; |
| 1389 | |
| 1390 | case -ENODATA: |
| 1391 | /* |
| 1392 | * In future, the failed dm_thin_find_block above could |
| 1393 | * provide the hint to load the metadata into cache. |
| 1394 | */ |
| 1395 | case -EWOULDBLOCK: |
| 1396 | thin_defer_bio(tc, bio); |
| 1397 | r = DM_MAPIO_SUBMITTED; |
| 1398 | break; |
| 1399 | } |
| 1400 | |
| 1401 | return r; |
| 1402 | } |
| 1403 | |
| 1404 | static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits) |
| 1405 | { |
| 1406 | int r; |
| 1407 | unsigned long flags; |
| 1408 | struct pool_c *pt = container_of(cb, struct pool_c, callbacks); |
| 1409 | |
| 1410 | spin_lock_irqsave(&pt->pool->lock, flags); |
| 1411 | r = !bio_list_empty(&pt->pool->retry_on_resume_list); |
| 1412 | spin_unlock_irqrestore(&pt->pool->lock, flags); |
| 1413 | |
| 1414 | if (!r) { |
| 1415 | struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); |
| 1416 | r = bdi_congested(&q->backing_dev_info, bdi_bits); |
| 1417 | } |
| 1418 | |
| 1419 | return r; |
| 1420 | } |
| 1421 | |
| 1422 | static void __requeue_bios(struct pool *pool) |
| 1423 | { |
| 1424 | bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list); |
| 1425 | bio_list_init(&pool->retry_on_resume_list); |
| 1426 | } |
| 1427 | |
| 1428 | /*---------------------------------------------------------------- |
| 1429 | * Binding of control targets to a pool object |
| 1430 | *--------------------------------------------------------------*/ |
| 1431 | static int bind_control_target(struct pool *pool, struct dm_target *ti) |
| 1432 | { |
| 1433 | struct pool_c *pt = ti->private; |
| 1434 | |
| 1435 | pool->ti = ti; |
| 1436 | pool->low_water_blocks = pt->low_water_blocks; |
| 1437 | pool->zero_new_blocks = pt->zero_new_blocks; |
| 1438 | |
| 1439 | return 0; |
| 1440 | } |
| 1441 | |
| 1442 | static void unbind_control_target(struct pool *pool, struct dm_target *ti) |
| 1443 | { |
| 1444 | if (pool->ti == ti) |
| 1445 | pool->ti = NULL; |
| 1446 | } |
| 1447 | |
| 1448 | /*---------------------------------------------------------------- |
| 1449 | * Pool creation |
| 1450 | *--------------------------------------------------------------*/ |
| 1451 | static void __pool_destroy(struct pool *pool) |
| 1452 | { |
| 1453 | __pool_table_remove(pool); |
| 1454 | |
| 1455 | if (dm_pool_metadata_close(pool->pmd) < 0) |
| 1456 | DMWARN("%s: dm_pool_metadata_close() failed.", __func__); |
| 1457 | |
| 1458 | prison_destroy(pool->prison); |
| 1459 | dm_kcopyd_client_destroy(pool->copier); |
| 1460 | |
| 1461 | if (pool->wq) |
| 1462 | destroy_workqueue(pool->wq); |
| 1463 | |
| 1464 | if (pool->next_mapping) |
| 1465 | mempool_free(pool->next_mapping, pool->mapping_pool); |
| 1466 | mempool_destroy(pool->mapping_pool); |
| 1467 | mempool_destroy(pool->endio_hook_pool); |
| 1468 | kfree(pool); |
| 1469 | } |
| 1470 | |
| 1471 | static struct pool *pool_create(struct mapped_device *pool_md, |
| 1472 | struct block_device *metadata_dev, |
| 1473 | unsigned long block_size, char **error) |
| 1474 | { |
| 1475 | int r; |
| 1476 | void *err_p; |
| 1477 | struct pool *pool; |
| 1478 | struct dm_pool_metadata *pmd; |
| 1479 | |
| 1480 | pmd = dm_pool_metadata_open(metadata_dev, block_size); |
| 1481 | if (IS_ERR(pmd)) { |
| 1482 | *error = "Error creating metadata object"; |
| 1483 | return (struct pool *)pmd; |
| 1484 | } |
| 1485 | |
| 1486 | pool = kmalloc(sizeof(*pool), GFP_KERNEL); |
| 1487 | if (!pool) { |
| 1488 | *error = "Error allocating memory for pool"; |
| 1489 | err_p = ERR_PTR(-ENOMEM); |
| 1490 | goto bad_pool; |
| 1491 | } |
| 1492 | |
| 1493 | pool->pmd = pmd; |
| 1494 | pool->sectors_per_block = block_size; |
| 1495 | pool->block_shift = ffs(block_size) - 1; |
| 1496 | pool->offset_mask = block_size - 1; |
| 1497 | pool->low_water_blocks = 0; |
| 1498 | pool->zero_new_blocks = 1; |
| 1499 | pool->prison = prison_create(PRISON_CELLS); |
| 1500 | if (!pool->prison) { |
| 1501 | *error = "Error creating pool's bio prison"; |
| 1502 | err_p = ERR_PTR(-ENOMEM); |
| 1503 | goto bad_prison; |
| 1504 | } |
| 1505 | |
| 1506 | pool->copier = dm_kcopyd_client_create(); |
| 1507 | if (IS_ERR(pool->copier)) { |
| 1508 | r = PTR_ERR(pool->copier); |
| 1509 | *error = "Error creating pool's kcopyd client"; |
| 1510 | err_p = ERR_PTR(r); |
| 1511 | goto bad_kcopyd_client; |
| 1512 | } |
| 1513 | |
| 1514 | /* |
| 1515 | * Create singlethreaded workqueue that will service all devices |
| 1516 | * that use this metadata. |
| 1517 | */ |
| 1518 | pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); |
| 1519 | if (!pool->wq) { |
| 1520 | *error = "Error creating pool's workqueue"; |
| 1521 | err_p = ERR_PTR(-ENOMEM); |
| 1522 | goto bad_wq; |
| 1523 | } |
| 1524 | |
| 1525 | INIT_WORK(&pool->worker, do_worker); |
| 1526 | spin_lock_init(&pool->lock); |
| 1527 | bio_list_init(&pool->deferred_bios); |
| 1528 | bio_list_init(&pool->deferred_flush_bios); |
| 1529 | INIT_LIST_HEAD(&pool->prepared_mappings); |
| 1530 | pool->low_water_triggered = 0; |
| 1531 | pool->no_free_space = 0; |
| 1532 | bio_list_init(&pool->retry_on_resume_list); |
| 1533 | ds_init(&pool->ds); |
| 1534 | |
| 1535 | pool->next_mapping = NULL; |
| 1536 | pool->mapping_pool = |
| 1537 | mempool_create_kmalloc_pool(MAPPING_POOL_SIZE, sizeof(struct new_mapping)); |
| 1538 | if (!pool->mapping_pool) { |
| 1539 | *error = "Error creating pool's mapping mempool"; |
| 1540 | err_p = ERR_PTR(-ENOMEM); |
| 1541 | goto bad_mapping_pool; |
| 1542 | } |
| 1543 | |
| 1544 | pool->endio_hook_pool = |
| 1545 | mempool_create_kmalloc_pool(ENDIO_HOOK_POOL_SIZE, sizeof(struct endio_hook)); |
| 1546 | if (!pool->endio_hook_pool) { |
| 1547 | *error = "Error creating pool's endio_hook mempool"; |
| 1548 | err_p = ERR_PTR(-ENOMEM); |
| 1549 | goto bad_endio_hook_pool; |
| 1550 | } |
| 1551 | pool->ref_count = 1; |
| 1552 | pool->pool_md = pool_md; |
| 1553 | pool->md_dev = metadata_dev; |
| 1554 | __pool_table_insert(pool); |
| 1555 | |
| 1556 | return pool; |
| 1557 | |
| 1558 | bad_endio_hook_pool: |
| 1559 | mempool_destroy(pool->mapping_pool); |
| 1560 | bad_mapping_pool: |
| 1561 | destroy_workqueue(pool->wq); |
| 1562 | bad_wq: |
| 1563 | dm_kcopyd_client_destroy(pool->copier); |
| 1564 | bad_kcopyd_client: |
| 1565 | prison_destroy(pool->prison); |
| 1566 | bad_prison: |
| 1567 | kfree(pool); |
| 1568 | bad_pool: |
| 1569 | if (dm_pool_metadata_close(pmd)) |
| 1570 | DMWARN("%s: dm_pool_metadata_close() failed.", __func__); |
| 1571 | |
| 1572 | return err_p; |
| 1573 | } |
| 1574 | |
| 1575 | static void __pool_inc(struct pool *pool) |
| 1576 | { |
| 1577 | BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); |
| 1578 | pool->ref_count++; |
| 1579 | } |
| 1580 | |
| 1581 | static void __pool_dec(struct pool *pool) |
| 1582 | { |
| 1583 | BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); |
| 1584 | BUG_ON(!pool->ref_count); |
| 1585 | if (!--pool->ref_count) |
| 1586 | __pool_destroy(pool); |
| 1587 | } |
| 1588 | |
| 1589 | static struct pool *__pool_find(struct mapped_device *pool_md, |
| 1590 | struct block_device *metadata_dev, |
| 1591 | unsigned long block_size, char **error) |
| 1592 | { |
| 1593 | struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); |
| 1594 | |
| 1595 | if (pool) { |
| 1596 | if (pool->pool_md != pool_md) |
| 1597 | return ERR_PTR(-EBUSY); |
| 1598 | __pool_inc(pool); |
| 1599 | |
| 1600 | } else { |
| 1601 | pool = __pool_table_lookup(pool_md); |
| 1602 | if (pool) { |
| 1603 | if (pool->md_dev != metadata_dev) |
| 1604 | return ERR_PTR(-EINVAL); |
| 1605 | __pool_inc(pool); |
| 1606 | |
| 1607 | } else |
| 1608 | pool = pool_create(pool_md, metadata_dev, block_size, error); |
| 1609 | } |
| 1610 | |
| 1611 | return pool; |
| 1612 | } |
| 1613 | |
| 1614 | /*---------------------------------------------------------------- |
| 1615 | * Pool target methods |
| 1616 | *--------------------------------------------------------------*/ |
| 1617 | static void pool_dtr(struct dm_target *ti) |
| 1618 | { |
| 1619 | struct pool_c *pt = ti->private; |
| 1620 | |
| 1621 | mutex_lock(&dm_thin_pool_table.mutex); |
| 1622 | |
| 1623 | unbind_control_target(pt->pool, ti); |
| 1624 | __pool_dec(pt->pool); |
| 1625 | dm_put_device(ti, pt->metadata_dev); |
| 1626 | dm_put_device(ti, pt->data_dev); |
| 1627 | kfree(pt); |
| 1628 | |
| 1629 | mutex_unlock(&dm_thin_pool_table.mutex); |
| 1630 | } |
| 1631 | |
| 1632 | struct pool_features { |
| 1633 | unsigned zero_new_blocks:1; |
| 1634 | }; |
| 1635 | |
| 1636 | static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, |
| 1637 | struct dm_target *ti) |
| 1638 | { |
| 1639 | int r; |
| 1640 | unsigned argc; |
| 1641 | const char *arg_name; |
| 1642 | |
| 1643 | static struct dm_arg _args[] = { |
| 1644 | {0, 1, "Invalid number of pool feature arguments"}, |
| 1645 | }; |
| 1646 | |
| 1647 | /* |
| 1648 | * No feature arguments supplied. |
| 1649 | */ |
| 1650 | if (!as->argc) |
| 1651 | return 0; |
| 1652 | |
| 1653 | r = dm_read_arg_group(_args, as, &argc, &ti->error); |
| 1654 | if (r) |
| 1655 | return -EINVAL; |
| 1656 | |
| 1657 | while (argc && !r) { |
| 1658 | arg_name = dm_shift_arg(as); |
| 1659 | argc--; |
| 1660 | |
| 1661 | if (!strcasecmp(arg_name, "skip_block_zeroing")) { |
| 1662 | pf->zero_new_blocks = 0; |
| 1663 | continue; |
| 1664 | } |
| 1665 | |
| 1666 | ti->error = "Unrecognised pool feature requested"; |
| 1667 | r = -EINVAL; |
| 1668 | } |
| 1669 | |
| 1670 | return r; |
| 1671 | } |
| 1672 | |
| 1673 | /* |
| 1674 | * thin-pool <metadata dev> <data dev> |
| 1675 | * <data block size (sectors)> |
| 1676 | * <low water mark (blocks)> |
| 1677 | * [<#feature args> [<arg>]*] |
| 1678 | * |
| 1679 | * Optional feature arguments are: |
| 1680 | * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. |
| 1681 | */ |
| 1682 | static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) |
| 1683 | { |
| 1684 | int r; |
| 1685 | struct pool_c *pt; |
| 1686 | struct pool *pool; |
| 1687 | struct pool_features pf; |
| 1688 | struct dm_arg_set as; |
| 1689 | struct dm_dev *data_dev; |
| 1690 | unsigned long block_size; |
| 1691 | dm_block_t low_water_blocks; |
| 1692 | struct dm_dev *metadata_dev; |
| 1693 | sector_t metadata_dev_size; |
| 1694 | |
| 1695 | /* |
| 1696 | * FIXME Remove validation from scope of lock. |
| 1697 | */ |
| 1698 | mutex_lock(&dm_thin_pool_table.mutex); |
| 1699 | |
| 1700 | if (argc < 4) { |
| 1701 | ti->error = "Invalid argument count"; |
| 1702 | r = -EINVAL; |
| 1703 | goto out_unlock; |
| 1704 | } |
| 1705 | as.argc = argc; |
| 1706 | as.argv = argv; |
| 1707 | |
| 1708 | r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev); |
| 1709 | if (r) { |
| 1710 | ti->error = "Error opening metadata block device"; |
| 1711 | goto out_unlock; |
| 1712 | } |
| 1713 | |
| 1714 | metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT; |
| 1715 | if (metadata_dev_size > METADATA_DEV_MAX_SECTORS) { |
| 1716 | ti->error = "Metadata device is too large"; |
| 1717 | r = -EINVAL; |
| 1718 | goto out_metadata; |
| 1719 | } |
| 1720 | |
| 1721 | r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); |
| 1722 | if (r) { |
| 1723 | ti->error = "Error getting data device"; |
| 1724 | goto out_metadata; |
| 1725 | } |
| 1726 | |
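| | /* |
| | * The data block size is given in 512-byte sectors and must be a power |
| | * of two between 128 sectors (64KB) and 2097152 sectors (1GB). |
| | */ |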
| 1727 | if (kstrtoul(argv[2], 10, &block_size) || !block_size || |
| 1728 | block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || |
| 1729 | block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || |
| 1730 | !is_power_of_2(block_size)) { |
| 1731 | ti->error = "Invalid block size"; |
| 1732 | r = -EINVAL; |
| 1733 | goto out; |
| 1734 | } |
| 1735 | |
| 1736 | if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) { |
| 1737 | ti->error = "Invalid low water mark"; |
| 1738 | r = -EINVAL; |
| 1739 | goto out; |
| 1740 | } |
| 1741 | |
| 1742 | /* |
| 1743 | * Set default pool features. |
| 1744 | */ |
| 1745 | memset(&pf, 0, sizeof(pf)); |
| 1746 | pf.zero_new_blocks = 1; |
| 1747 | |
| 1748 | dm_consume_args(&as, 4); |
| 1749 | r = parse_pool_features(&as, &pf, ti); |
| 1750 | if (r) |
| 1751 | goto out; |
| 1752 | |
| 1753 | pt = kzalloc(sizeof(*pt), GFP_KERNEL); |
| 1754 | if (!pt) { |
| 1755 | r = -ENOMEM; |
| 1756 | goto out; |
| 1757 | } |
| 1758 | |
| 1759 | pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, |
| 1760 | block_size, &ti->error); |
| 1761 | if (IS_ERR(pool)) { |
| 1762 | r = PTR_ERR(pool); |
| 1763 | goto out_free_pt; |
| 1764 | } |
| 1765 | |
| 1766 | pt->pool = pool; |
| 1767 | pt->ti = ti; |
| 1768 | pt->metadata_dev = metadata_dev; |
| 1769 | pt->data_dev = data_dev; |
| 1770 | pt->low_water_blocks = low_water_blocks; |
| 1771 | pt->zero_new_blocks = pf.zero_new_blocks; |
| 1772 | ti->num_flush_requests = 1; |
| 1773 | ti->num_discard_requests = 0; |
| 1774 | ti->private = pt; |
| 1775 | |
| 1776 | pt->callbacks.congested_fn = pool_is_congested; |
| 1777 | dm_table_add_target_callbacks(ti->table, &pt->callbacks); |
| 1778 | |
| 1779 | mutex_unlock(&dm_thin_pool_table.mutex); |
| 1780 | |
| 1781 | return 0; |
| 1782 | |
| 1783 | out_free_pt: |
| 1784 | kfree(pt); |
| 1785 | out: |
| 1786 | dm_put_device(ti, data_dev); |
| 1787 | out_metadata: |
| 1788 | dm_put_device(ti, metadata_dev); |
| 1789 | out_unlock: |
| 1790 | mutex_unlock(&dm_thin_pool_table.mutex); |
| 1791 | |
| 1792 | return r; |
| 1793 | } |
| 1794 | |
| 1795 | static int pool_map(struct dm_target *ti, struct bio *bio, |
| 1796 | union map_info *map_context) |
| 1797 | { |
| 1798 | int r; |
| 1799 | struct pool_c *pt = ti->private; |
| 1800 | struct pool *pool = pt->pool; |
| 1801 | unsigned long flags; |
| 1802 | |
| 1803 | /* |
| 1804 | * As this is a singleton target, ti->begin is always zero. |
| 1805 | */ |
| 1806 | spin_lock_irqsave(&pool->lock, flags); |
| 1807 | bio->bi_bdev = pt->data_dev->bdev; |
| 1808 | r = DM_MAPIO_REMAPPED; |
| 1809 | spin_unlock_irqrestore(&pool->lock, flags); |
| 1810 | |
| 1811 | return r; |
| 1812 | } |
| 1813 | |
| 1814 | /* |
| 1815 | * Retrieves the number of blocks of the data device from |
| 1816 | * the superblock and compares it to the actual device size, |
| 1817 | * resizing the pool's record of the data device if it has grown. |
| 1818 | * |
| 1819 | * This copes both with the case where a preallocated data device is |
| 1820 | * opened in the ctr and then resumed, |
| 1821 | * -and- |
| 1822 | * with the case where the resume method is called on its own after |
| 1823 | * userspace has grown the data device in reaction to a table event. |
| 1824 | */ |
| 1825 | static int pool_preresume(struct dm_target *ti) |
| 1826 | { |
| 1827 | int r; |
| 1828 | struct pool_c *pt = ti->private; |
| 1829 | struct pool *pool = pt->pool; |
| 1830 | dm_block_t data_size, sb_data_size; |
| 1831 | |
| 1832 | /* |
| 1833 | * Take control of the pool object. |
| 1834 | */ |
| 1835 | r = bind_control_target(pool, ti); |
| 1836 | if (r) |
| 1837 | return r; |
| 1838 | |
| 1839 | data_size = ti->len >> pool->block_shift; |
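| | /* |
| | * Worked example (illustrative, assuming block_shift is log2 of the data |
| | * block size in sectors): a 1GiB pool target (2097152 sectors) with |
| | * 128-sector blocks (block_shift == 7) gives data_size == 16384 blocks. |
| | */ |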
| 1840 | r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size); |
| 1841 | if (r) { |
| 1842 | DMERR("failed to retrieve data device size"); |
| 1843 | return r; |
| 1844 | } |
| 1845 | |
| 1846 | if (data_size < sb_data_size) { |
| 1847 | DMERR("pool target too small, is %llu blocks (expected %llu)", |
| 1848 | data_size, sb_data_size); |
| 1849 | return -EINVAL; |
| 1850 | |
| 1851 | } else if (data_size > sb_data_size) { |
| 1852 | r = dm_pool_resize_data_dev(pool->pmd, data_size); |
| 1853 | if (r) { |
| 1854 | DMERR("failed to resize data device"); |
| 1855 | return r; |
| 1856 | } |
| 1857 | |
| 1858 | r = dm_pool_commit_metadata(pool->pmd); |
| 1859 | if (r) { |
| 1860 | DMERR("%s: dm_pool_commit_metadata() failed, error = %d", |
| 1861 | __func__, r); |
| 1862 | return r; |
| 1863 | } |
| 1864 | } |
| 1865 | |
| 1866 | return 0; |
| 1867 | } |
| 1868 | |
| 1869 | static void pool_resume(struct dm_target *ti) |
| 1870 | { |
| 1871 | struct pool_c *pt = ti->private; |
| 1872 | struct pool *pool = pt->pool; |
| 1873 | unsigned long flags; |
| 1874 | |
| 1875 | spin_lock_irqsave(&pool->lock, flags); |
| 1876 | pool->low_water_triggered = 0; |
| 1877 | pool->no_free_space = 0; |
| 1878 | __requeue_bios(pool); |
| 1879 | spin_unlock_irqrestore(&pool->lock, flags); |
| 1880 | |
| 1881 | wake_worker(pool); |
| 1882 | } |
| 1883 | |
| 1884 | static void pool_postsuspend(struct dm_target *ti) |
| 1885 | { |
| 1886 | int r; |
| 1887 | struct pool_c *pt = ti->private; |
| 1888 | struct pool *pool = pt->pool; |
| 1889 | |
| 1890 | flush_workqueue(pool->wq); |
| 1891 | |
| 1892 | r = dm_pool_commit_metadata(pool->pmd); |
| 1893 | if (r < 0) { |
| 1894 | DMERR("%s: dm_pool_commit_metadata() failed, error = %d", |
| 1895 | __func__, r); |
| 1896 | /* FIXME: invalidate device? error the next FUA or FLUSH bio ?*/ |
| 1897 | } |
| 1898 | } |
| 1899 | |
| 1900 | static int check_arg_count(unsigned argc, unsigned args_required) |
| 1901 | { |
| 1902 | if (argc != args_required) { |
| 1903 | DMWARN("Message received with %u arguments instead of %u.", |
| 1904 | argc, args_required); |
| 1905 | return -EINVAL; |
| 1906 | } |
| 1907 | |
| 1908 | return 0; |
| 1909 | } |
| 1910 | |
| 1911 | static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning) |
| 1912 | { |
| 1913 | if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) && |
| 1914 | *dev_id <= MAX_DEV_ID) |
| 1915 | return 0; |
| 1916 | |
| 1917 | if (warning) |
| 1918 | DMWARN("Message received with invalid device id: %s", arg); |
| 1919 | |
| 1920 | return -EINVAL; |
| 1921 | } |
| 1922 | |
| 1923 | static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool) |
| 1924 | { |
| 1925 | dm_thin_id dev_id; |
| 1926 | int r; |
| 1927 | |
| 1928 | r = check_arg_count(argc, 2); |
| 1929 | if (r) |
| 1930 | return r; |
| 1931 | |
| 1932 | r = read_dev_id(argv[1], &dev_id, 1); |
| 1933 | if (r) |
| 1934 | return r; |
| 1935 | |
| 1936 | r = dm_pool_create_thin(pool->pmd, dev_id); |
| 1937 | if (r) { |
| 1938 | DMWARN("Creation of new thinly-provisioned device with id %s failed.", |
| 1939 | argv[1]); |
| 1940 | return r; |
| 1941 | } |
| 1942 | |
| 1943 | return 0; |
| 1944 | } |
| 1945 | |
| 1946 | static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool) |
| 1947 | { |
| 1948 | dm_thin_id dev_id; |
| 1949 | dm_thin_id origin_dev_id; |
| 1950 | int r; |
| 1951 | |
| 1952 | r = check_arg_count(argc, 3); |
| 1953 | if (r) |
| 1954 | return r; |
| 1955 | |
| 1956 | r = read_dev_id(argv[1], &dev_id, 1); |
| 1957 | if (r) |
| 1958 | return r; |
| 1959 | |
| 1960 | r = read_dev_id(argv[2], &origin_dev_id, 1); |
| 1961 | if (r) |
| 1962 | return r; |
| 1963 | |
| 1964 | r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id); |
| 1965 | if (r) { |
| 1966 | DMWARN("Creation of new snapshot %s of device %s failed.", |
| 1967 | argv[1], argv[2]); |
| 1968 | return r; |
| 1969 | } |
| 1970 | |
| 1971 | return 0; |
| 1972 | } |
| 1973 | |
| 1974 | static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool) |
| 1975 | { |
| 1976 | dm_thin_id dev_id; |
| 1977 | int r; |
| 1978 | |
| 1979 | r = check_arg_count(argc, 2); |
| 1980 | if (r) |
| 1981 | return r; |
| 1982 | |
| 1983 | r = read_dev_id(argv[1], &dev_id, 1); |
| 1984 | if (r) |
| 1985 | return r; |
| 1986 | |
| 1987 | r = dm_pool_delete_thin_device(pool->pmd, dev_id); |
| 1988 | if (r) |
| 1989 | DMWARN("Deletion of thin device %s failed.", argv[1]); |
| 1990 | |
| 1991 | return r; |
| 1992 | } |
| 1993 | |
| 1994 | static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool) |
| 1995 | { |
| 1996 | dm_thin_id old_id, new_id; |
| 1997 | int r; |
| 1998 | |
| 1999 | r = check_arg_count(argc, 3); |
| 2000 | if (r) |
| 2001 | return r; |
| 2002 | |
| 2003 | if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) { |
| 2004 | DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]); |
| 2005 | return -EINVAL; |
| 2006 | } |
| 2007 | |
| 2008 | if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) { |
| 2009 | DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]); |
| 2010 | return -EINVAL; |
| 2011 | } |
| 2012 | |
| 2013 | r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id); |
| 2014 | if (r) { |
| 2015 | DMWARN("Failed to change transaction id from %s to %s.", |
| 2016 | argv[1], argv[2]); |
| 2017 | return r; |
| 2018 | } |
| 2019 | |
| 2020 | return 0; |
| 2021 | } |
| 2022 | |
| 2023 | /* |
| 2024 | * Messages supported: |
| 2025 | * create_thin <dev_id> |
| 2026 | * create_snap <dev_id> <origin_id> |
| 2027 | * delete <dev_id> |
| 2029 | * set_transaction_id <current_trans_id> <new_trans_id> |
| 2030 | */ |
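| | /* |
| | * Illustrative dmsetup invocations (the pool device name and the ids are |
| | * made up), e.g.: |
| | * |
| | *   dmsetup message /dev/mapper/pool 0 "create_thin 0" |
| | *   dmsetup message /dev/mapper/pool 0 "create_snap 1 0" |
| | *   dmsetup message /dev/mapper/pool 0 "delete 1" |
| | *   dmsetup message /dev/mapper/pool 0 "set_transaction_id 0 1" |
| | */ |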
| 2031 | static int pool_message(struct dm_target *ti, unsigned argc, char **argv) |
| 2032 | { |
| 2033 | int r = -EINVAL; |
| 2034 | struct pool_c *pt = ti->private; |
| 2035 | struct pool *pool = pt->pool; |
| 2036 | |
| 2037 | if (!strcasecmp(argv[0], "create_thin")) |
| 2038 | r = process_create_thin_mesg(argc, argv, pool); |
| 2039 | |
| 2040 | else if (!strcasecmp(argv[0], "create_snap")) |
| 2041 | r = process_create_snap_mesg(argc, argv, pool); |
| 2042 | |
| 2043 | else if (!strcasecmp(argv[0], "delete")) |
| 2044 | r = process_delete_mesg(argc, argv, pool); |
| 2045 | |
| 2046 | else if (!strcasecmp(argv[0], "set_transaction_id")) |
| 2047 | r = process_set_transaction_id_mesg(argc, argv, pool); |
| 2048 | |
| 2049 | else |
| 2050 | DMWARN("Unrecognised thin pool target message received: %s", argv[0]); |
| 2051 | |
| 2052 | if (!r) { |
| 2053 | r = dm_pool_commit_metadata(pool->pmd); |
| 2054 | if (r) |
| 2055 | DMERR("%s message: dm_pool_commit_metadata() failed, error = %d", |
| 2056 | argv[0], r); |
| 2057 | } |
| 2058 | |
| 2059 | return r; |
| 2060 | } |
| 2061 | |
| 2062 | /* |
| 2063 | * Status line is: |
| 2064 | * <transaction id> <used metadata blocks>/<total metadata blocks> |
| 2065 | * <used data blocks>/<total data blocks> <held metadata root> |
| 2066 | */ |
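| | /* |
| | * An illustrative STATUSTYPE_INFO line (made-up values): transaction id 0, |
| | * 288 of 4096 metadata blocks used, 10240 of 204800 data blocks used and |
| | * no held metadata root: |
| | * |
| | *   0 288/4096 10240/204800 - |
| | */ |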
| 2067 | static int pool_status(struct dm_target *ti, status_type_t type, |
| 2068 | char *result, unsigned maxlen) |
| 2069 | { |
| 2070 | int r; |
| 2071 | unsigned sz = 0; |
| 2072 | uint64_t transaction_id; |
| 2073 | dm_block_t nr_free_blocks_data; |
| 2074 | dm_block_t nr_free_blocks_metadata; |
| 2075 | dm_block_t nr_blocks_data; |
| 2076 | dm_block_t nr_blocks_metadata; |
| 2077 | dm_block_t held_root; |
| 2078 | char buf[BDEVNAME_SIZE]; |
| 2079 | char buf2[BDEVNAME_SIZE]; |
| 2080 | struct pool_c *pt = ti->private; |
| 2081 | struct pool *pool = pt->pool; |
| 2082 | |
| 2083 | switch (type) { |
| 2084 | case STATUSTYPE_INFO: |
| 2085 | r = dm_pool_get_metadata_transaction_id(pool->pmd, |
| 2086 | &transaction_id); |
| 2087 | if (r) |
| 2088 | return r; |
| 2089 | |
| 2090 | r = dm_pool_get_free_metadata_block_count(pool->pmd, |
| 2091 | &nr_free_blocks_metadata); |
| 2092 | if (r) |
| 2093 | return r; |
| 2094 | |
| 2095 | r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata); |
| 2096 | if (r) |
| 2097 | return r; |
| 2098 | |
| 2099 | r = dm_pool_get_free_block_count(pool->pmd, |
| 2100 | &nr_free_blocks_data); |
| 2101 | if (r) |
| 2102 | return r; |
| 2103 | |
| 2104 | r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data); |
| 2105 | if (r) |
| 2106 | return r; |
| 2107 | |
| 2108 | r = dm_pool_get_held_metadata_root(pool->pmd, &held_root); |
| 2109 | if (r) |
| 2110 | return r; |
| 2111 | |
| 2112 | DMEMIT("%llu %llu/%llu %llu/%llu ", |
| 2113 | (unsigned long long)transaction_id, |
| 2114 | (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), |
| 2115 | (unsigned long long)nr_blocks_metadata, |
| 2116 | (unsigned long long)(nr_blocks_data - nr_free_blocks_data), |
| 2117 | (unsigned long long)nr_blocks_data); |
| 2118 | |
| 2119 | if (held_root) |
| 2120 | DMEMIT("%llu", held_root); |
| 2121 | else |
| 2122 | DMEMIT("-"); |
| 2123 | |
| 2124 | break; |
| 2125 | |
| 2126 | case STATUSTYPE_TABLE: |
| 2127 | DMEMIT("%s %s %lu %llu ", |
| 2128 | format_dev_t(buf, pt->metadata_dev->bdev->bd_dev), |
| 2129 | format_dev_t(buf2, pt->data_dev->bdev->bd_dev), |
| 2130 | (unsigned long)pool->sectors_per_block, |
| 2131 | (unsigned long long)pt->low_water_blocks); |
| 2132 | |
| 2133 | DMEMIT("%u ", !pool->zero_new_blocks); |
| 2134 | |
| 2135 | if (!pool->zero_new_blocks) |
| 2136 | DMEMIT("skip_block_zeroing "); |
| 2137 | break; |
| 2138 | } |
| 2139 | |
| 2140 | return 0; |
| 2141 | } |
| 2142 | |
| 2143 | static int pool_iterate_devices(struct dm_target *ti, |
| 2144 | iterate_devices_callout_fn fn, void *data) |
| 2145 | { |
| 2146 | struct pool_c *pt = ti->private; |
| 2147 | |
| 2148 | return fn(ti, pt->data_dev, 0, ti->len, data); |
| 2149 | } |
| 2150 | |
| 2151 | static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm, |
| 2152 | struct bio_vec *biovec, int max_size) |
| 2153 | { |
| 2154 | struct pool_c *pt = ti->private; |
| 2155 | struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); |
| 2156 | |
| 2157 | if (!q->merge_bvec_fn) |
| 2158 | return max_size; |
| 2159 | |
| 2160 | bvm->bi_bdev = pt->data_dev->bdev; |
| 2161 | |
| 2162 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); |
| 2163 | } |
| 2164 | |
| 2165 | static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) |
| 2166 | { |
| 2167 | struct pool_c *pt = ti->private; |
| 2168 | struct pool *pool = pt->pool; |
| 2169 | |
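| | /* Advertise the pool's data block size as the optimal I/O size to upper layers. */ |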
| 2170 | blk_limits_io_min(limits, 0); |
| 2171 | blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); |
| 2172 | } |
| 2173 | |
| 2174 | static struct target_type pool_target = { |
| 2175 | .name = "thin-pool", |
| 2176 | .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | |
| 2177 | DM_TARGET_IMMUTABLE, |
| 2178 | .version = {1, 0, 0}, |
| 2179 | .module = THIS_MODULE, |
| 2180 | .ctr = pool_ctr, |
| 2181 | .dtr = pool_dtr, |
| 2182 | .map = pool_map, |
| 2183 | .postsuspend = pool_postsuspend, |
| 2184 | .preresume = pool_preresume, |
| 2185 | .resume = pool_resume, |
| 2186 | .message = pool_message, |
| 2187 | .status = pool_status, |
| 2188 | .merge = pool_merge, |
| 2189 | .iterate_devices = pool_iterate_devices, |
| 2190 | .io_hints = pool_io_hints, |
| 2191 | }; |
| 2192 | |
| 2193 | /*---------------------------------------------------------------- |
| 2194 | * Thin target methods |
| 2195 | *--------------------------------------------------------------*/ |
| 2196 | static void thin_dtr(struct dm_target *ti) |
| 2197 | { |
| 2198 | struct thin_c *tc = ti->private; |
| 2199 | |
| 2200 | mutex_lock(&dm_thin_pool_table.mutex); |
| 2201 | |
| 2202 | __pool_dec(tc->pool); |
| 2203 | dm_pool_close_thin_device(tc->td); |
| 2204 | dm_put_device(ti, tc->pool_dev); |
| 2205 | kfree(tc); |
| 2206 | |
| 2207 | mutex_unlock(&dm_thin_pool_table.mutex); |
| 2208 | } |
| 2209 | |
| 2210 | /* |
| 2211 | * Thin target parameters: |
| 2212 | * |
| 2213 | * <pool_dev> <dev_id> |
| 2214 | * |
| 2215 | * pool_dev: the path to the pool (eg, /dev/mapper/my_pool) |
| 2216 | * dev_id: the internal device identifier |
| 2217 | */ |
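| | /* |
| | * For illustration only: activating device id 0 (previously created with a |
| | * create_thin message) as a 1GiB thin volume, assuming the pool is at |
| | * /dev/mapper/pool: |
| | * |
| | *   dmsetup create thin --table "0 2097152 thin /dev/mapper/pool 0" |
| | */ |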
| 2218 | static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) |
| 2219 | { |
| 2220 | int r; |
| 2221 | struct thin_c *tc; |
| 2222 | struct dm_dev *pool_dev; |
| 2223 | struct mapped_device *pool_md; |
| 2224 | |
| 2225 | mutex_lock(&dm_thin_pool_table.mutex); |
| 2226 | |
| 2227 | if (argc != 2) { |
| 2228 | ti->error = "Invalid argument count"; |
| 2229 | r = -EINVAL; |
| 2230 | goto out_unlock; |
| 2231 | } |
| 2232 | |
| 2233 | tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL); |
| 2234 | if (!tc) { |
| 2235 | ti->error = "Out of memory"; |
| 2236 | r = -ENOMEM; |
| 2237 | goto out_unlock; |
| 2238 | } |
| 2239 | |
| 2240 | r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev); |
| 2241 | if (r) { |
| 2242 | ti->error = "Error opening pool device"; |
| 2243 | goto bad_pool_dev; |
| 2244 | } |
| 2245 | tc->pool_dev = pool_dev; |
| 2246 | |
| 2247 | if (read_dev_id(argv[1], &tc->dev_id, 0)) { |
| 2248 | ti->error = "Invalid device id"; |
| 2249 | r = -EINVAL; |
| 2250 | goto bad_common; |
| 2251 | } |
| 2252 | |
| 2253 | pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev); |
| 2254 | if (!pool_md) { |
| 2255 | ti->error = "Couldn't get pool mapped device"; |
| 2256 | r = -EINVAL; |
| 2257 | goto bad_common; |
| 2258 | } |
| 2259 | |
| 2260 | tc->pool = __pool_table_lookup(pool_md); |
| 2261 | if (!tc->pool) { |
| 2262 | ti->error = "Couldn't find pool object"; |
| 2263 | r = -EINVAL; |
| 2264 | goto bad_pool_lookup; |
| 2265 | } |
| 2266 | __pool_inc(tc->pool); |
| 2267 | |
| 2268 | r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td); |
| 2269 | if (r) { |
| 2270 | ti->error = "Couldn't open thin internal device"; |
| 2271 | goto bad_thin_open; |
| 2272 | } |
| 2273 | |
| 2274 | ti->split_io = tc->pool->sectors_per_block; |
| 2275 | ti->num_flush_requests = 1; |
| 2276 | ti->num_discard_requests = 0; |
| 2277 | ti->discards_supported = 0; |
| 2278 | |
| 2279 | dm_put(pool_md); |
| 2280 | |
| 2281 | mutex_unlock(&dm_thin_pool_table.mutex); |
| 2282 | |
| 2283 | return 0; |
| 2284 | |
| 2285 | bad_thin_open: |
| 2286 | __pool_dec(tc->pool); |
| 2287 | bad_pool_lookup: |
| 2288 | dm_put(pool_md); |
| 2289 | bad_common: |
| 2290 | dm_put_device(ti, tc->pool_dev); |
| 2291 | bad_pool_dev: |
| 2292 | kfree(tc); |
| 2293 | out_unlock: |
| 2294 | mutex_unlock(&dm_thin_pool_table.mutex); |
| 2295 | |
| 2296 | return r; |
| 2297 | } |
| 2298 | |
| 2299 | static int thin_map(struct dm_target *ti, struct bio *bio, |
| 2300 | union map_info *map_context) |
| 2301 | { |
| 2302 | bio->bi_sector -= ti->begin; |
| 2303 | |
| 2304 | return thin_bio_map(ti, bio, map_context); |
| 2305 | } |
| 2306 | |
| 2307 | static void thin_postsuspend(struct dm_target *ti) |
| 2308 | { |
| 2309 | if (dm_noflush_suspending(ti)) |
| 2310 | requeue_io((struct thin_c *)ti->private); |
| 2311 | } |
| 2312 | |
| 2313 | /* |
| 2314 | * <nr mapped sectors> <highest mapped sector> |
| 2315 | */ |
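| | /* |
| | * Illustrative STATUSTYPE_INFO output (made-up values): a device with 1024 |
| | * mapped 128-sector blocks whose highest mapped block is 2047 reports: |
| | * |
| | *   131072 262143 |
| | */ |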
| 2316 | static int thin_status(struct dm_target *ti, status_type_t type, |
| 2317 | char *result, unsigned maxlen) |
| 2318 | { |
| 2319 | int r; |
| 2320 | ssize_t sz = 0; |
| 2321 | dm_block_t mapped, highest; |
| 2322 | char buf[BDEVNAME_SIZE]; |
| 2323 | struct thin_c *tc = ti->private; |
| 2324 | |
| 2325 | if (!tc->td) |
| 2326 | DMEMIT("-"); |
| 2327 | else { |
| 2328 | switch (type) { |
| 2329 | case STATUSTYPE_INFO: |
| 2330 | r = dm_thin_get_mapped_count(tc->td, &mapped); |
| 2331 | if (r) |
| 2332 | return r; |
| 2333 | |
| 2334 | r = dm_thin_get_highest_mapped_block(tc->td, &highest); |
| 2335 | if (r < 0) |
| 2336 | return r; |
| 2337 | |
| 2338 | DMEMIT("%llu ", mapped * tc->pool->sectors_per_block); |
| 2339 | if (r) |
| 2340 | DMEMIT("%llu", ((highest + 1) * |
| 2341 | tc->pool->sectors_per_block) - 1); |
| 2342 | else |
| 2343 | DMEMIT("-"); |
| 2344 | break; |
| 2345 | |
| 2346 | case STATUSTYPE_TABLE: |
| 2347 | DMEMIT("%s %lu", |
| 2348 | format_dev_t(buf, tc->pool_dev->bdev->bd_dev), |
| 2349 | (unsigned long) tc->dev_id); |
| 2350 | break; |
| 2351 | } |
| 2352 | } |
| 2353 | |
| 2354 | return 0; |
| 2355 | } |
| 2356 | |
| 2357 | static int thin_iterate_devices(struct dm_target *ti, |
| 2358 | iterate_devices_callout_fn fn, void *data) |
| 2359 | { |
| 2360 | dm_block_t blocks; |
| 2361 | struct thin_c *tc = ti->private; |
| 2362 | |
| 2363 | /* |
| 2364 | * We can't call dm_pool_get_data_dev_size() since that blocks. So |
| 2365 | * we follow a more convoluted path through to the pool's target. |
| 2366 | */ |
| 2367 | if (!tc->pool->ti) |
| 2368 | return 0; /* nothing is bound */ |
| 2369 | |
| 2370 | blocks = tc->pool->ti->len >> tc->pool->block_shift; |
| 2371 | if (blocks) |
| 2372 | return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block * blocks, data); |
| 2373 | |
| 2374 | return 0; |
| 2375 | } |
| 2376 | |
| 2377 | static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) |
| 2378 | { |
| 2379 | struct thin_c *tc = ti->private; |
| 2380 | |
| 2381 | blk_limits_io_min(limits, 0); |
| 2382 | blk_limits_io_opt(limits, tc->pool->sectors_per_block << SECTOR_SHIFT); |
| 2383 | } |
| 2384 | |
| 2385 | static struct target_type thin_target = { |
| 2386 | .name = "thin", |
| 2387 | .version = {1, 0, 0}, |
| 2388 | .module = THIS_MODULE, |
| 2389 | .ctr = thin_ctr, |
| 2390 | .dtr = thin_dtr, |
| 2391 | .map = thin_map, |
| 2392 | .postsuspend = thin_postsuspend, |
| 2393 | .status = thin_status, |
| 2394 | .iterate_devices = thin_iterate_devices, |
| 2395 | .io_hints = thin_io_hints, |
| 2396 | }; |
| 2397 | |
| 2398 | /*----------------------------------------------------------------*/ |
| 2399 | |
| 2400 | static int __init dm_thin_init(void) |
| 2401 | { |
| 2402 | int r; |
| 2403 | |
| 2404 | pool_table_init(); |
| 2405 | |
| 2406 | r = dm_register_target(&thin_target); |
| 2407 | if (r) |
| 2408 | return r; |
| 2409 | |
| 2410 | r = dm_register_target(&pool_target); |
| 2411 | if (r) |
| 2412 | dm_unregister_target(&thin_target); |
| 2413 | |
| 2414 | return r; |
| 2415 | } |
| 2416 | |
| 2417 | static void dm_thin_exit(void) |
| 2418 | { |
| 2419 | dm_unregister_target(&thin_target); |
| 2420 | dm_unregister_target(&pool_target); |
| 2421 | } |
| 2422 | |
| 2423 | module_init(dm_thin_init); |
| 2424 | module_exit(dm_thin_exit); |
| 2425 | |
| 2426 | MODULE_DESCRIPTION(DM_NAME " thin provisioning target"); |
| 2427 | MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); |
| 2428 | MODULE_LICENSE("GPL"); |