Blame - drivers/md/dm-thin.c - kernel/msm-4.9

blob: bcb143396fe004b609392610ab5df5fa3d9b9748 [file] [log] [blame]

Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1	/*
				2	* Copyright (C) 2011 Red Hat UK.
				3	*
				4	* This file is released under the GPL.
				5	*/
				6
				7	#include "dm-thin-metadata.h"
				8
				9	#include <linux/device-mapper.h>
				10	#include <linux/dm-io.h>
				11	#include <linux/dm-kcopyd.h>
				12	#include <linux/list.h>
				13	#include <linux/init.h>
				14	#include <linux/module.h>
				15	#include <linux/slab.h>
				16
				17	#define DM_MSG_PREFIX "thin"
				18
				19	/*
				20	* Tunable constants
				21	*/
				22	#define ENDIO_HOOK_POOL_SIZE 10240
				23	#define DEFERRED_SET_SIZE 64
				24	#define MAPPING_POOL_SIZE 1024
				25	#define PRISON_CELLS 1024
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame^]	26	#define COMMIT_PERIOD HZ
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	27
				28	/*
				29	* The block size of the device holding pool data must be
				30	* between 64KB and 1GB.
				31	*/
				32	#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
				33	#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
				34
				35	/*
				36	* The metadata device is currently limited in size. The limitation is
				37	* checked lower down in dm-space-map-metadata, but we also check it here
				38	* so we can fail early.
				39	*
				40	* We have one block of index, which can hold 255 index entries. Each
				41	* index entry contains allocation info about 16k metadata blocks.
				42	*/
				43	#define METADATA_DEV_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
				44
				45	/*
				46	* Device id is restricted to 24 bits.
				47	*/
				48	#define MAX_DEV_ID ((1 << 24) - 1)
				49
				50	/*
				51	* How do we handle breaking sharing of data blocks?
				52	* =================================================
				53	*
				54	* We use a standard copy-on-write btree to store the mappings for the
				55	* devices (note I'm talking about copy-on-write of the metadata here, not
				56	* the data). When you take an internal snapshot you clone the root node
				57	* of the origin btree. After this there is no concept of an origin or a
				58	* snapshot. They are just two device trees that happen to point to the
				59	* same data blocks.
				60	*
				61	* When we get a write in we decide if it's to a shared data block using
				62	* some timestamp magic. If it is, we have to break sharing.
				63	*
				64	* Let's say we write to a shared block in what was the origin. The
				65	* steps are:
				66	*
				67	* i) plug io further to this physical block. (see bio_prison code).
				68	*
				69	* ii) quiesce any read io to that shared data block. Obviously
				70	* including all devices that share this block. (see deferred_set code)
				71	*
				72	* iii) copy the data block to a newly allocated block. This step can be
				73	* missed out if the io covers the block. (schedule_copy).
				74	*
				75	* iv) insert the new mapping into the origin's btree
Joe Thornber	fe878f3	2012-03-28 18:41:24 +0100	[diff] [blame]	76	* (process_prepared_mapping). This act of inserting breaks some
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	77	* sharing of btree nodes between the two devices. Breaking sharing only
				78	* affects the btree of that specific device. Btrees for the other
				79	* devices that share the block never change. The btree for the origin
				80	* device as it was after the last commit is untouched, ie. we're using
				81	* persistent data structures in the functional programming sense.
				82	*
				83	* v) unplug io to this physical block, including the io that triggered
				84	* the breaking of sharing.
				85	*
				86	* Steps (ii) and (iii) occur in parallel.
				87	*
				88	* The metadata _doesn't_ need to be committed before the io continues. We
				89	* get away with this because the io is always written to a _new_ block.
				90	* If there's a crash, then:
				91	*
				92	* - The origin mapping will point to the old origin block (the shared
				93	* one). This will contain the data as it was before the io that triggered
				94	* the breaking of sharing came in.
				95	*
				96	* - The snap mapping still points to the old block. As it would after
				97	* the commit.
				98	*
				99	* The downside of this scheme is the timestamp magic isn't perfect, and
				100	* will continue to think that data block in the snapshot device is shared
				101	* even after the write to the origin has broken sharing. I suspect data
				102	* blocks will typically be shared by many different devices, so we're
				103	* breaking sharing n + 1 times, rather than n, where n is the number of
				104	* devices that reference this data block. At the moment I think the
				105	* benefits far, far outweigh the disadvantages.
				106	*/
				107
				108	/*----------------------------------------------------------------*/
				109
				110	/*
				111	* Sometimes we can't deal with a bio straight away. We put them in prison
				112	* where they can't cause any mischief. Bios are put in a cell identified
				113	* by a key, multiple bios can be in the same cell. When the cell is
				114	* subsequently unlocked the bios become available.
				115	*/
				116	struct bio_prison;
				117
				118	struct cell_key {
				119	int virtual;
				120	dm_thin_id dev;
				121	dm_block_t block;
				122	};
				123
				124	struct cell {
				125	struct hlist_node list;
				126	struct bio_prison *prison;
				127	struct cell_key key;
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	128	struct bio *holder;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	129	struct bio_list bios;
				130	};
				131
				132	struct bio_prison {
				133	spinlock_t lock;
				134	mempool_t *cell_pool;
				135
				136	unsigned nr_buckets;
				137	unsigned hash_mask;
				138	struct hlist_head *cells;
				139	};
				140
				141	static uint32_t calc_nr_buckets(unsigned nr_cells)
				142	{
				143	uint32_t n = 128;
				144
				145	nr_cells /= 4;
				146	nr_cells = min(nr_cells, 8192u);
				147
				148	while (n < nr_cells)
				149	n <<= 1;
				150
				151	return n;
				152	}
				153
				154	/*
				155	* @nr_cells should be the number of cells you want in use _concurrently_.
				156	* Don't confuse it with the number of distinct keys.
				157	*/
				158	static struct bio_prison *prison_create(unsigned nr_cells)
				159	{
				160	unsigned i;
				161	uint32_t nr_buckets = calc_nr_buckets(nr_cells);
				162	size_t len = sizeof(struct bio_prison) +
				163	(sizeof(struct hlist_head) * nr_buckets);
				164	struct bio_prison *prison = kmalloc(len, GFP_KERNEL);
				165
				166	if (!prison)
				167	return NULL;
				168
				169	spin_lock_init(&prison->lock);
				170	prison->cell_pool = mempool_create_kmalloc_pool(nr_cells,
				171	sizeof(struct cell));
				172	if (!prison->cell_pool) {
				173	kfree(prison);
				174	return NULL;
				175	}
				176
				177	prison->nr_buckets = nr_buckets;
				178	prison->hash_mask = nr_buckets - 1;
				179	prison->cells = (struct hlist_head *) (prison + 1);
				180	for (i = 0; i < nr_buckets; i++)
				181	INIT_HLIST_HEAD(prison->cells + i);
				182
				183	return prison;
				184	}
				185
				186	static void prison_destroy(struct bio_prison *prison)
				187	{
				188	mempool_destroy(prison->cell_pool);
				189	kfree(prison);
				190	}
				191
				192	static uint32_t hash_key(struct bio_prison prison, struct cell_key key)
				193	{
				194	const unsigned long BIG_PRIME = 4294967291UL;
				195	uint64_t hash = key->block * BIG_PRIME;
				196
				197	return (uint32_t) (hash & prison->hash_mask);
				198	}
				199
				200	static int keys_equal(struct cell_key lhs, struct cell_key rhs)
				201	{
				202	return (lhs->virtual == rhs->virtual) &&
				203	(lhs->dev == rhs->dev) &&
				204	(lhs->block == rhs->block);
				205	}
				206
				207	static struct cell __search_bucket(struct hlist_head bucket,
				208	struct cell_key *key)
				209	{
				210	struct cell *cell;
				211	struct hlist_node *tmp;
				212
				213	hlist_for_each_entry(cell, tmp, bucket, list)
				214	if (keys_equal(&cell->key, key))
				215	return cell;
				216
				217	return NULL;
				218	}
				219
				220	/*
				221	* This may block if a new cell needs allocating. You must ensure that
				222	* cells will be unlocked even if the calling thread is blocked.
				223	*
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	224	* Returns 1 if the cell was already held, 0 if @inmate is the new holder.
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	225	*/
				226	static int bio_detain(struct bio_prison prison, struct cell_key key,
				227	struct bio inmate, struct cell *ref)
				228	{
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	229	int r = 1;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	230	unsigned long flags;
				231	uint32_t hash = hash_key(prison, key);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	232	struct cell cell, cell2;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	233
				234	BUG_ON(hash > prison->nr_buckets);
				235
				236	spin_lock_irqsave(&prison->lock, flags);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	237
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	238	cell = __search_bucket(prison->cells + hash, key);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	239	if (cell) {
				240	bio_list_add(&cell->bios, inmate);
				241	goto out;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	242	}
				243
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	244	/*
				245	* Allocate a new cell
				246	*/
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	247	spin_unlock_irqrestore(&prison->lock, flags);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	248	cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
				249	spin_lock_irqsave(&prison->lock, flags);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	250
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	251	/*
				252	* We've been unlocked, so we have to double check that
				253	* nobody else has inserted this cell in the meantime.
				254	*/
				255	cell = __search_bucket(prison->cells + hash, key);
				256	if (cell) {
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	257	mempool_free(cell2, prison->cell_pool);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	258	bio_list_add(&cell->bios, inmate);
				259	goto out;
				260	}
				261
				262	/*
				263	* Use new cell.
				264	*/
				265	cell = cell2;
				266
				267	cell->prison = prison;
				268	memcpy(&cell->key, key, sizeof(cell->key));
				269	cell->holder = inmate;
				270	bio_list_init(&cell->bios);
				271	hlist_add_head(&cell->list, prison->cells + hash);
				272
				273	r = 0;
				274
				275	out:
				276	spin_unlock_irqrestore(&prison->lock, flags);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	277
				278	*ref = cell;
				279
				280	return r;
				281	}
				282
				283	/*
				284	* @inmates must have been initialised prior to this call
				285	*/
				286	static void __cell_release(struct cell cell, struct bio_list inmates)
				287	{
				288	struct bio_prison *prison = cell->prison;
				289
				290	hlist_del(&cell->list);
				291
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	292	bio_list_add(inmates, cell->holder);
				293	bio_list_merge(inmates, &cell->bios);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	294
				295	mempool_free(cell, prison->cell_pool);
				296	}
				297
				298	static void cell_release(struct cell cell, struct bio_list bios)
				299	{
				300	unsigned long flags;
				301	struct bio_prison *prison = cell->prison;
				302
				303	spin_lock_irqsave(&prison->lock, flags);
				304	__cell_release(cell, bios);
				305	spin_unlock_irqrestore(&prison->lock, flags);
				306	}
				307
				308	/*
				309	* There are a couple of places where we put a bio into a cell briefly
				310	* before taking it out again. In these situations we know that no other
				311	* bio may be in the cell. This function releases the cell, and also does
				312	* a sanity check.
				313	*/
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	314	static void __cell_release_singleton(struct cell cell, struct bio bio)
				315	{
				316	hlist_del(&cell->list);
				317	BUG_ON(cell->holder != bio);
				318	BUG_ON(!bio_list_empty(&cell->bios));
				319	}
				320
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	321	static void cell_release_singleton(struct cell cell, struct bio bio)
				322	{
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	323	unsigned long flags;
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	324	struct bio_prison *prison = cell->prison;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	325
				326	spin_lock_irqsave(&prison->lock, flags);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	327	__cell_release_singleton(cell, bio);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	328	spin_unlock_irqrestore(&prison->lock, flags);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	329	}
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	330
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	331	/*
				332	* Sometimes we don't want the holder, just the additional bios.
				333	*/
				334	static void __cell_release_no_holder(struct cell cell, struct bio_list inmates)
				335	{
				336	struct bio_prison *prison = cell->prison;
				337
				338	hlist_del(&cell->list);
				339	bio_list_merge(inmates, &cell->bios);
				340
				341	mempool_free(cell, prison->cell_pool);
				342	}
				343
				344	static void cell_release_no_holder(struct cell cell, struct bio_list inmates)
				345	{
				346	unsigned long flags;
				347	struct bio_prison *prison = cell->prison;
				348
				349	spin_lock_irqsave(&prison->lock, flags);
				350	__cell_release_no_holder(cell, inmates);
				351	spin_unlock_irqrestore(&prison->lock, flags);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	352	}
				353
				354	static void cell_error(struct cell *cell)
				355	{
				356	struct bio_prison *prison = cell->prison;
				357	struct bio_list bios;
				358	struct bio *bio;
				359	unsigned long flags;
				360
				361	bio_list_init(&bios);
				362
				363	spin_lock_irqsave(&prison->lock, flags);
				364	__cell_release(cell, &bios);
				365	spin_unlock_irqrestore(&prison->lock, flags);
				366
				367	while ((bio = bio_list_pop(&bios)))
				368	bio_io_error(bio);
				369	}
				370
				371	/*----------------------------------------------------------------*/
				372
				373	/*
				374	* We use the deferred set to keep track of pending reads to shared blocks.
				375	* We do this to ensure the new mapping caused by a write isn't performed
				376	* until these prior reads have completed. Otherwise the insertion of the
				377	* new mapping could free the old block that the read bios are mapped to.
				378	*/
				379
				380	struct deferred_set;
				381	struct deferred_entry {
				382	struct deferred_set *ds;
				383	unsigned count;
				384	struct list_head work_items;
				385	};
				386
				387	struct deferred_set {
				388	spinlock_t lock;
				389	unsigned current_entry;
				390	unsigned sweeper;
				391	struct deferred_entry entries[DEFERRED_SET_SIZE];
				392	};
				393
				394	static void ds_init(struct deferred_set *ds)
				395	{
				396	int i;
				397
				398	spin_lock_init(&ds->lock);
				399	ds->current_entry = 0;
				400	ds->sweeper = 0;
				401	for (i = 0; i < DEFERRED_SET_SIZE; i++) {
				402	ds->entries[i].ds = ds;
				403	ds->entries[i].count = 0;
				404	INIT_LIST_HEAD(&ds->entries[i].work_items);
				405	}
				406	}
				407
				408	static struct deferred_entry ds_inc(struct deferred_set ds)
				409	{
				410	unsigned long flags;
				411	struct deferred_entry *entry;
				412
				413	spin_lock_irqsave(&ds->lock, flags);
				414	entry = ds->entries + ds->current_entry;
				415	entry->count++;
				416	spin_unlock_irqrestore(&ds->lock, flags);
				417
				418	return entry;
				419	}
				420
				421	static unsigned ds_next(unsigned index)
				422	{
				423	return (index + 1) % DEFERRED_SET_SIZE;
				424	}
				425
				426	static void __sweep(struct deferred_set ds, struct list_head head)
				427	{
				428	while ((ds->sweeper != ds->current_entry) &&
				429	!ds->entries[ds->sweeper].count) {
				430	list_splice_init(&ds->entries[ds->sweeper].work_items, head);
				431	ds->sweeper = ds_next(ds->sweeper);
				432	}
				433
				434	if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count)
				435	list_splice_init(&ds->entries[ds->sweeper].work_items, head);
				436	}
				437
				438	static void ds_dec(struct deferred_entry entry, struct list_head head)
				439	{
				440	unsigned long flags;
				441
				442	spin_lock_irqsave(&entry->ds->lock, flags);
				443	BUG_ON(!entry->count);
				444	--entry->count;
				445	__sweep(entry->ds, head);
				446	spin_unlock_irqrestore(&entry->ds->lock, flags);
				447	}
				448
				449	/*
				450	* Returns 1 if deferred or 0 if no pending items to delay job.
				451	*/
				452	static int ds_add_work(struct deferred_set ds, struct list_head work)
				453	{
				454	int r = 1;
				455	unsigned long flags;
				456	unsigned next_entry;
				457
				458	spin_lock_irqsave(&ds->lock, flags);
				459	if ((ds->sweeper == ds->current_entry) &&
				460	!ds->entries[ds->current_entry].count)
				461	r = 0;
				462	else {
				463	list_add(work, &ds->entries[ds->current_entry].work_items);
				464	next_entry = ds_next(ds->current_entry);
				465	if (!ds->entries[next_entry].count)
				466	ds->current_entry = next_entry;
				467	}
				468	spin_unlock_irqrestore(&ds->lock, flags);
				469
				470	return r;
				471	}
				472
				473	/*----------------------------------------------------------------*/
				474
				475	/*
				476	* Key building.
				477	*/
				478	static void build_data_key(struct dm_thin_device *td,
				479	dm_block_t b, struct cell_key *key)
				480	{
				481	key->virtual = 0;
				482	key->dev = dm_thin_dev_id(td);
				483	key->block = b;
				484	}
				485
				486	static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
				487	struct cell_key *key)
				488	{
				489	key->virtual = 1;
				490	key->dev = dm_thin_dev_id(td);
				491	key->block = b;
				492	}
				493
				494	/*----------------------------------------------------------------*/
				495
				496	/*
				497	* A pool device ties together a metadata device and a data device. It
				498	* also provides the interface for creating and destroying internal
				499	* devices.
				500	*/
				501	struct new_mapping;
				502	struct pool {
				503	struct list_head list;
				504	struct dm_target ti; / Only set if a pool target is bound */
				505
				506	struct mapped_device *pool_md;
				507	struct block_device *md_dev;
				508	struct dm_pool_metadata *pmd;
				509
				510	uint32_t sectors_per_block;
				511	unsigned block_shift;
				512	dm_block_t offset_mask;
				513	dm_block_t low_water_blocks;
				514
				515	unsigned zero_new_blocks:1;
				516	unsigned low_water_triggered:1; /* A dm event has been sent */
				517	unsigned no_free_space:1; /* A -ENOSPC warning has been issued */
				518
				519	struct bio_prison *prison;
				520	struct dm_kcopyd_client *copier;
				521
				522	struct workqueue_struct *wq;
				523	struct work_struct worker;
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame^]	524	struct delayed_work waker;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	525
				526	unsigned ref_count;
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame^]	527	unsigned long last_commit_jiffies;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	528
				529	spinlock_t lock;
				530	struct bio_list deferred_bios;
				531	struct bio_list deferred_flush_bios;
				532	struct list_head prepared_mappings;
				533
				534	struct bio_list retry_on_resume_list;
				535
				536	struct deferred_set ds; /* FIXME: move to thin_c */
				537
				538	struct new_mapping *next_mapping;
				539	mempool_t *mapping_pool;
				540	mempool_t *endio_hook_pool;
				541	};
				542
				543	/*
				544	* Target context for a pool.
				545	*/
				546	struct pool_c {
				547	struct dm_target *ti;
				548	struct pool *pool;
				549	struct dm_dev *data_dev;
				550	struct dm_dev *metadata_dev;
				551	struct dm_target_callbacks callbacks;
				552
				553	dm_block_t low_water_blocks;
				554	unsigned zero_new_blocks:1;
				555	};
				556
				557	/*
				558	* Target context for a thin.
				559	*/
				560	struct thin_c {
				561	struct dm_dev *pool_dev;
				562	dm_thin_id dev_id;
				563
				564	struct pool *pool;
				565	struct dm_thin_device *td;
				566	};
				567
				568	/*----------------------------------------------------------------*/
				569
				570	/*
				571	* A global list of pools that uses a struct mapped_device as a key.
				572	*/
				573	static struct dm_thin_pool_table {
				574	struct mutex mutex;
				575	struct list_head pools;
				576	} dm_thin_pool_table;
				577
				578	static void pool_table_init(void)
				579	{
				580	mutex_init(&dm_thin_pool_table.mutex);
				581	INIT_LIST_HEAD(&dm_thin_pool_table.pools);
				582	}
				583
				584	static void __pool_table_insert(struct pool *pool)
				585	{
				586	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
				587	list_add(&pool->list, &dm_thin_pool_table.pools);
				588	}
				589
				590	static void __pool_table_remove(struct pool *pool)
				591	{
				592	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
				593	list_del(&pool->list);
				594	}
				595
				596	static struct pool __pool_table_lookup(struct mapped_device md)
				597	{
				598	struct pool pool = NULL, tmp;
				599
				600	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
				601
				602	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
				603	if (tmp->pool_md == md) {
				604	pool = tmp;
				605	break;
				606	}
				607	}
				608
				609	return pool;
				610	}
				611
				612	static struct pool __pool_table_lookup_metadata_dev(struct block_device md_dev)
				613	{
				614	struct pool pool = NULL, tmp;
				615
				616	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
				617
				618	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
				619	if (tmp->md_dev == md_dev) {
				620	pool = tmp;
				621	break;
				622	}
				623	}
				624
				625	return pool;
				626	}
				627
				628	/*----------------------------------------------------------------*/
				629
				630	static void __requeue_bio_list(struct thin_c tc, struct bio_list master)
				631	{
				632	struct bio *bio;
				633	struct bio_list bios;
				634
				635	bio_list_init(&bios);
				636	bio_list_merge(&bios, master);
				637	bio_list_init(master);
				638
				639	while ((bio = bio_list_pop(&bios))) {
				640	if (dm_get_mapinfo(bio)->ptr == tc)
				641	bio_endio(bio, DM_ENDIO_REQUEUE);
				642	else
				643	bio_list_add(master, bio);
				644	}
				645	}
				646
				647	static void requeue_io(struct thin_c *tc)
				648	{
				649	struct pool *pool = tc->pool;
				650	unsigned long flags;
				651
				652	spin_lock_irqsave(&pool->lock, flags);
				653	__requeue_bio_list(tc, &pool->deferred_bios);
				654	__requeue_bio_list(tc, &pool->retry_on_resume_list);
				655	spin_unlock_irqrestore(&pool->lock, flags);
				656	}
				657
				658	/*
				659	* This section of code contains the logic for processing a thin device's IO.
				660	* Much of the code depends on pool object resources (lists, workqueues, etc)
				661	* but most is exclusively called from the thin target rather than the thin-pool
				662	* target.
				663	*/
				664
				665	static dm_block_t get_bio_block(struct thin_c tc, struct bio bio)
				666	{
				667	return bio->bi_sector >> tc->pool->block_shift;
				668	}
				669
				670	static void remap(struct thin_c tc, struct bio bio, dm_block_t block)
				671	{
				672	struct pool *pool = tc->pool;
				673
				674	bio->bi_bdev = tc->pool_dev->bdev;
				675	bio->bi_sector = (block << pool->block_shift) +
				676	(bio->bi_sector & pool->offset_mask);
				677	}
				678
				679	static void remap_and_issue(struct thin_c tc, struct bio bio,
				680	dm_block_t block)
				681	{
				682	struct pool *pool = tc->pool;
				683	unsigned long flags;
				684
				685	remap(tc, bio, block);
				686
				687	/*
				688	* Batch together any FUA/FLUSH bios we find and then issue
				689	* a single commit for them in process_deferred_bios().
				690	*/
				691	if (bio->bi_rw & (REQ_FLUSH \| REQ_FUA)) {
				692	spin_lock_irqsave(&pool->lock, flags);
				693	bio_list_add(&pool->deferred_flush_bios, bio);
				694	spin_unlock_irqrestore(&pool->lock, flags);
				695	} else
				696	generic_make_request(bio);
				697	}
				698
				699	/*
				700	* wake_worker() is used when new work is queued and when pool_resume is
				701	* ready to continue deferred IO processing.
				702	*/
				703	static void wake_worker(struct pool *pool)
				704	{
				705	queue_work(pool->wq, &pool->worker);
				706	}
				707
				708	/*----------------------------------------------------------------*/
				709
				710	/*
				711	* Bio endio functions.
				712	*/
				713	struct endio_hook {
				714	struct thin_c *tc;
				715	bio_end_io_t *saved_bi_end_io;
				716	struct deferred_entry *entry;
				717	};
				718
				719	struct new_mapping {
				720	struct list_head list;
				721
				722	int prepared;
				723
				724	struct thin_c *tc;
				725	dm_block_t virt_block;
				726	dm_block_t data_block;
				727	struct cell *cell;
				728	int err;
				729
				730	/*
				731	* If the bio covers the whole area of a block then we can avoid
				732	* zeroing or copying. Instead this bio is hooked. The bio will
				733	* still be in the cell, so care has to be taken to avoid issuing
				734	* the bio twice.
				735	*/
				736	struct bio *bio;
				737	bio_end_io_t *saved_bi_end_io;
				738	};
				739
				740	static void __maybe_add_mapping(struct new_mapping *m)
				741	{
				742	struct pool *pool = m->tc->pool;
				743
				744	if (list_empty(&m->list) && m->prepared) {
				745	list_add(&m->list, &pool->prepared_mappings);
				746	wake_worker(pool);
				747	}
				748	}
				749
				750	static void copy_complete(int read_err, unsigned long write_err, void *context)
				751	{
				752	unsigned long flags;
				753	struct new_mapping *m = context;
				754	struct pool *pool = m->tc->pool;
				755
				756	m->err = read_err \|\| write_err ? -EIO : 0;
				757
				758	spin_lock_irqsave(&pool->lock, flags);
				759	m->prepared = 1;
				760	__maybe_add_mapping(m);
				761	spin_unlock_irqrestore(&pool->lock, flags);
				762	}
				763
				764	static void overwrite_endio(struct bio *bio, int err)
				765	{
				766	unsigned long flags;
				767	struct new_mapping *m = dm_get_mapinfo(bio)->ptr;
				768	struct pool *pool = m->tc->pool;
				769
				770	m->err = err;
				771
				772	spin_lock_irqsave(&pool->lock, flags);
				773	m->prepared = 1;
				774	__maybe_add_mapping(m);
				775	spin_unlock_irqrestore(&pool->lock, flags);
				776	}
				777
				778	static void shared_read_endio(struct bio *bio, int err)
				779	{
				780	struct list_head mappings;
				781	struct new_mapping m, tmp;
				782	struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
				783	unsigned long flags;
				784	struct pool *pool = h->tc->pool;
				785
				786	bio->bi_end_io = h->saved_bi_end_io;
				787	bio_endio(bio, err);
				788
				789	INIT_LIST_HEAD(&mappings);
				790	ds_dec(h->entry, &mappings);
				791
				792	spin_lock_irqsave(&pool->lock, flags);
				793	list_for_each_entry_safe(m, tmp, &mappings, list) {
				794	list_del(&m->list);
				795	INIT_LIST_HEAD(&m->list);
				796	__maybe_add_mapping(m);
				797	}
				798	spin_unlock_irqrestore(&pool->lock, flags);
				799
				800	mempool_free(h, pool->endio_hook_pool);
				801	}
				802
				803	/*----------------------------------------------------------------*/
				804
				805	/*
				806	* Workqueue.
				807	*/
				808
				809	/*
				810	* Prepared mapping jobs.
				811	*/
				812
				813	/*
				814	* This sends the bios in the cell back to the deferred_bios list.
				815	*/
				816	static void cell_defer(struct thin_c tc, struct cell cell,
				817	dm_block_t data_block)
				818	{
				819	struct pool *pool = tc->pool;
				820	unsigned long flags;
				821
				822	spin_lock_irqsave(&pool->lock, flags);
				823	cell_release(cell, &pool->deferred_bios);
				824	spin_unlock_irqrestore(&tc->pool->lock, flags);
				825
				826	wake_worker(pool);
				827	}
				828
				829	/*
				830	* Same as cell_defer above, except it omits one particular detainee,
				831	* a write bio that covers the block and has already been processed.
				832	*/
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	833	static void cell_defer_except(struct thin_c tc, struct cell cell)
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	834	{
				835	struct bio_list bios;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	836	struct pool *pool = tc->pool;
				837	unsigned long flags;
				838
				839	bio_list_init(&bios);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	840
				841	spin_lock_irqsave(&pool->lock, flags);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	842	cell_release_no_holder(cell, &pool->deferred_bios);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	843	spin_unlock_irqrestore(&pool->lock, flags);
				844
				845	wake_worker(pool);
				846	}
				847
				848	static void process_prepared_mapping(struct new_mapping *m)
				849	{
				850	struct thin_c *tc = m->tc;
				851	struct bio *bio;
				852	int r;
				853
				854	bio = m->bio;
				855	if (bio)
				856	bio->bi_end_io = m->saved_bi_end_io;
				857
				858	if (m->err) {
				859	cell_error(m->cell);
				860	return;
				861	}
				862
				863	/*
				864	* Commit the prepared block into the mapping btree.
				865	* Any I/O for this block arriving after this point will get
				866	* remapped to it directly.
				867	*/
				868	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
				869	if (r) {
				870	DMERR("dm_thin_insert_block() failed");
				871	cell_error(m->cell);
				872	return;
				873	}
				874
				875	/*
				876	* Release any bios held while the block was being provisioned.
				877	* If we are processing a write bio that completely covers the block,
				878	* we already processed it so can ignore it now when processing
				879	* the bios in the cell.
				880	*/
				881	if (bio) {
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	882	cell_defer_except(tc, m->cell);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	883	bio_endio(bio, 0);
				884	} else
				885	cell_defer(tc, m->cell, m->data_block);
				886
				887	list_del(&m->list);
				888	mempool_free(m, tc->pool->mapping_pool);
				889	}
				890
				891	static void process_prepared_mappings(struct pool *pool)
				892	{
				893	unsigned long flags;
				894	struct list_head maps;
				895	struct new_mapping m, tmp;
				896
				897	INIT_LIST_HEAD(&maps);
				898	spin_lock_irqsave(&pool->lock, flags);
				899	list_splice_init(&pool->prepared_mappings, &maps);
				900	spin_unlock_irqrestore(&pool->lock, flags);
				901
				902	list_for_each_entry_safe(m, tmp, &maps, list)
				903	process_prepared_mapping(m);
				904	}
				905
				906	/*
				907	* Deferred bio jobs.
				908	*/
				909	static int io_overwrites_block(struct pool pool, struct bio bio)
				910	{
				911	return ((bio_data_dir(bio) == WRITE) &&
				912	!(bio->bi_sector & pool->offset_mask)) &&
				913	(bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
				914	}
				915
				916	static void save_and_set_endio(struct bio bio, bio_end_io_t *save,
				917	bio_end_io_t *fn)
				918	{
				919	*save = bio->bi_end_io;
				920	bio->bi_end_io = fn;
				921	}
				922
				923	static int ensure_next_mapping(struct pool *pool)
				924	{
				925	if (pool->next_mapping)
				926	return 0;
				927
				928	pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);
				929
				930	return pool->next_mapping ? 0 : -ENOMEM;
				931	}
				932
				933	static struct new_mapping get_next_mapping(struct pool pool)
				934	{
				935	struct new_mapping *r = pool->next_mapping;
				936
				937	BUG_ON(!pool->next_mapping);
				938
				939	pool->next_mapping = NULL;
				940
				941	return r;
				942	}
				943
				944	static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
				945	dm_block_t data_origin, dm_block_t data_dest,
				946	struct cell cell, struct bio bio)
				947	{
				948	int r;
				949	struct pool *pool = tc->pool;
				950	struct new_mapping *m = get_next_mapping(pool);
				951
				952	INIT_LIST_HEAD(&m->list);
				953	m->prepared = 0;
				954	m->tc = tc;
				955	m->virt_block = virt_block;
				956	m->data_block = data_dest;
				957	m->cell = cell;
				958	m->err = 0;
				959	m->bio = NULL;
				960
				961	ds_add_work(&pool->ds, &m->list);
				962
				963	/*
				964	* IO to pool_dev remaps to the pool target's data_dev.
				965	*
				966	* If the whole block of data is being overwritten, we can issue the
				967	* bio immediately. Otherwise we use kcopyd to clone the data first.
				968	*/
				969	if (io_overwrites_block(pool, bio)) {
				970	m->bio = bio;
				971	save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
				972	dm_get_mapinfo(bio)->ptr = m;
				973	remap_and_issue(tc, bio, data_dest);
				974	} else {
				975	struct dm_io_region from, to;
				976
				977	from.bdev = tc->pool_dev->bdev;
				978	from.sector = data_origin * pool->sectors_per_block;
				979	from.count = pool->sectors_per_block;
				980
				981	to.bdev = tc->pool_dev->bdev;
				982	to.sector = data_dest * pool->sectors_per_block;
				983	to.count = pool->sectors_per_block;
				984
				985	r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
				986	0, copy_complete, m);
				987	if (r < 0) {
				988	mempool_free(m, pool->mapping_pool);
				989	DMERR("dm_kcopyd_copy() failed");
				990	cell_error(cell);
				991	}
				992	}
				993	}
				994
				995	static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
				996	dm_block_t data_block, struct cell *cell,
				997	struct bio *bio)
				998	{
				999	struct pool *pool = tc->pool;
				1000	struct new_mapping *m = get_next_mapping(pool);
				1001
				1002	INIT_LIST_HEAD(&m->list);
				1003	m->prepared = 0;
				1004	m->tc = tc;
				1005	m->virt_block = virt_block;
				1006	m->data_block = data_block;
				1007	m->cell = cell;
				1008	m->err = 0;
				1009	m->bio = NULL;
				1010
				1011	/*
				1012	* If the whole block of data is being overwritten or we are not
				1013	* zeroing pre-existing data, we can issue the bio immediately.
				1014	* Otherwise we use kcopyd to zero the data first.
				1015	*/
				1016	if (!pool->zero_new_blocks)
				1017	process_prepared_mapping(m);
				1018
				1019	else if (io_overwrites_block(pool, bio)) {
				1020	m->bio = bio;
				1021	save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
				1022	dm_get_mapinfo(bio)->ptr = m;
				1023	remap_and_issue(tc, bio, data_block);
				1024
				1025	} else {
				1026	int r;
				1027	struct dm_io_region to;
				1028
				1029	to.bdev = tc->pool_dev->bdev;
				1030	to.sector = data_block * pool->sectors_per_block;
				1031	to.count = pool->sectors_per_block;
				1032
				1033	r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
				1034	if (r < 0) {
				1035	mempool_free(m, pool->mapping_pool);
				1036	DMERR("dm_kcopyd_zero() failed");
				1037	cell_error(cell);
				1038	}
				1039	}
				1040	}
				1041
				1042	static int alloc_data_block(struct thin_c tc, dm_block_t result)
				1043	{
				1044	int r;
				1045	dm_block_t free_blocks;
				1046	unsigned long flags;
				1047	struct pool *pool = tc->pool;
				1048
				1049	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
				1050	if (r)
				1051	return r;
				1052
				1053	if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
				1054	DMWARN("%s: reached low water mark, sending event.",
				1055	dm_device_name(pool->pool_md));
				1056	spin_lock_irqsave(&pool->lock, flags);
				1057	pool->low_water_triggered = 1;
				1058	spin_unlock_irqrestore(&pool->lock, flags);
				1059	dm_table_event(pool->ti->table);
				1060	}
				1061
				1062	if (!free_blocks) {
				1063	if (pool->no_free_space)
				1064	return -ENOSPC;
				1065	else {
				1066	/*
				1067	* Try to commit to see if that will free up some
				1068	* more space.
				1069	*/
				1070	r = dm_pool_commit_metadata(pool->pmd);
				1071	if (r) {
				1072	DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
				1073	__func__, r);
				1074	return r;
				1075	}
				1076
				1077	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
				1078	if (r)
				1079	return r;
				1080
				1081	/*
				1082	* If we still have no space we set a flag to avoid
				1083	* doing all this checking and return -ENOSPC.
				1084	*/
				1085	if (!free_blocks) {
				1086	DMWARN("%s: no free space available.",
				1087	dm_device_name(pool->pool_md));
				1088	spin_lock_irqsave(&pool->lock, flags);
				1089	pool->no_free_space = 1;
				1090	spin_unlock_irqrestore(&pool->lock, flags);
				1091	return -ENOSPC;
				1092	}
				1093	}
				1094	}
				1095
				1096	r = dm_pool_alloc_data_block(pool->pmd, result);
				1097	if (r)
				1098	return r;
				1099
				1100	return 0;
				1101	}
				1102
				1103	/*
				1104	* If we have run out of space, queue bios until the device is
				1105	* resumed, presumably after having been reloaded with more space.
				1106	*/
				1107	static void retry_on_resume(struct bio *bio)
				1108	{
				1109	struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
				1110	struct pool *pool = tc->pool;
				1111	unsigned long flags;
				1112
				1113	spin_lock_irqsave(&pool->lock, flags);
				1114	bio_list_add(&pool->retry_on_resume_list, bio);
				1115	spin_unlock_irqrestore(&pool->lock, flags);
				1116	}
				1117
				1118	static void no_space(struct cell *cell)
				1119	{
				1120	struct bio *bio;
				1121	struct bio_list bios;
				1122
				1123	bio_list_init(&bios);
				1124	cell_release(cell, &bios);
				1125
				1126	while ((bio = bio_list_pop(&bios)))
				1127	retry_on_resume(bio);
				1128	}
				1129
				1130	static void break_sharing(struct thin_c tc, struct bio bio, dm_block_t block,
				1131	struct cell_key *key,
				1132	struct dm_thin_lookup_result *lookup_result,
				1133	struct cell *cell)
				1134	{
				1135	int r;
				1136	dm_block_t data_block;
				1137
				1138	r = alloc_data_block(tc, &data_block);
				1139	switch (r) {
				1140	case 0:
				1141	schedule_copy(tc, block, lookup_result->block,
				1142	data_block, cell, bio);
				1143	break;
				1144
				1145	case -ENOSPC:
				1146	no_space(cell);
				1147	break;
				1148
				1149	default:
				1150	DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
				1151	cell_error(cell);
				1152	break;
				1153	}
				1154	}
				1155
				1156	static void process_shared_bio(struct thin_c tc, struct bio bio,
				1157	dm_block_t block,
				1158	struct dm_thin_lookup_result *lookup_result)
				1159	{
				1160	struct cell *cell;
				1161	struct pool *pool = tc->pool;
				1162	struct cell_key key;
				1163
				1164	/*
				1165	* If cell is already occupied, then sharing is already in the process
				1166	* of being broken so we have nothing further to do here.
				1167	*/
				1168	build_data_key(tc->td, lookup_result->block, &key);
				1169	if (bio_detain(pool->prison, &key, bio, &cell))
				1170	return;
				1171
				1172	if (bio_data_dir(bio) == WRITE)
				1173	break_sharing(tc, bio, block, &key, lookup_result, cell);
				1174	else {
				1175	struct endio_hook *h;
				1176	h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
				1177
				1178	h->tc = tc;
				1179	h->entry = ds_inc(&pool->ds);
				1180	save_and_set_endio(bio, &h->saved_bi_end_io, shared_read_endio);
				1181	dm_get_mapinfo(bio)->ptr = h;
				1182
				1183	cell_release_singleton(cell, bio);
				1184	remap_and_issue(tc, bio, lookup_result->block);
				1185	}
				1186	}
				1187
				1188	static void provision_block(struct thin_c tc, struct bio bio, dm_block_t block,
				1189	struct cell *cell)
				1190	{
				1191	int r;
				1192	dm_block_t data_block;
				1193
				1194	/*
				1195	* Remap empty bios (flushes) immediately, without provisioning.
				1196	*/
				1197	if (!bio->bi_size) {
				1198	cell_release_singleton(cell, bio);
				1199	remap_and_issue(tc, bio, 0);
				1200	return;
				1201	}
				1202
				1203	/*
				1204	* Fill read bios with zeroes and complete them immediately.
				1205	*/
				1206	if (bio_data_dir(bio) == READ) {
				1207	zero_fill_bio(bio);
				1208	cell_release_singleton(cell, bio);
				1209	bio_endio(bio, 0);
				1210	return;
				1211	}
				1212
				1213	r = alloc_data_block(tc, &data_block);
				1214	switch (r) {
				1215	case 0:
				1216	schedule_zero(tc, block, data_block, cell, bio);
				1217	break;
				1218
				1219	case -ENOSPC:
				1220	no_space(cell);
				1221	break;
				1222
				1223	default:
				1224	DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
				1225	cell_error(cell);
				1226	break;
				1227	}
				1228	}
				1229
				1230	static void process_bio(struct thin_c tc, struct bio bio)
				1231	{
				1232	int r;
				1233	dm_block_t block = get_bio_block(tc, bio);
				1234	struct cell *cell;
				1235	struct cell_key key;
				1236	struct dm_thin_lookup_result lookup_result;
				1237
				1238	/*
				1239	* If cell is already occupied, then the block is already
				1240	* being provisioned so we have nothing further to do here.
				1241	*/
				1242	build_virtual_key(tc->td, block, &key);
				1243	if (bio_detain(tc->pool->prison, &key, bio, &cell))
				1244	return;
				1245
				1246	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
				1247	switch (r) {
				1248	case 0:
				1249	/*
				1250	* We can release this cell now. This thread is the only
				1251	* one that puts bios into a cell, and we know there were
				1252	* no preceding bios.
				1253	*/
				1254	/*
				1255	* TODO: this will probably have to change when discard goes
				1256	* back in.
				1257	*/
				1258	cell_release_singleton(cell, bio);
				1259
				1260	if (lookup_result.shared)
				1261	process_shared_bio(tc, bio, block, &lookup_result);
				1262	else
				1263	remap_and_issue(tc, bio, lookup_result.block);
				1264	break;
				1265
				1266	case -ENODATA:
				1267	provision_block(tc, bio, block, cell);
				1268	break;
				1269
				1270	default:
				1271	DMERR("dm_thin_find_block() failed, error = %d", r);
				1272	bio_io_error(bio);
				1273	break;
				1274	}
				1275	}
				1276
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame^]	1277	static int need_commit_due_to_time(struct pool *pool)
				1278	{
				1279	return jiffies < pool->last_commit_jiffies \|\|
				1280	jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
				1281	}
				1282
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1283	static void process_deferred_bios(struct pool *pool)
				1284	{
				1285	unsigned long flags;
				1286	struct bio *bio;
				1287	struct bio_list bios;
				1288	int r;
				1289
				1290	bio_list_init(&bios);
				1291
				1292	spin_lock_irqsave(&pool->lock, flags);
				1293	bio_list_merge(&bios, &pool->deferred_bios);
				1294	bio_list_init(&pool->deferred_bios);
				1295	spin_unlock_irqrestore(&pool->lock, flags);
				1296
				1297	while ((bio = bio_list_pop(&bios))) {
				1298	struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
				1299	/*
				1300	* If we've got no free new_mapping structs, and processing
				1301	* this bio might require one, we pause until there are some
				1302	* prepared mappings to process.
				1303	*/
				1304	if (ensure_next_mapping(pool)) {
				1305	spin_lock_irqsave(&pool->lock, flags);
				1306	bio_list_merge(&pool->deferred_bios, &bios);
				1307	spin_unlock_irqrestore(&pool->lock, flags);
				1308
				1309	break;
				1310	}
				1311	process_bio(tc, bio);
				1312	}
				1313
				1314	/*
				1315	* If there are any deferred flush bios, we must commit
				1316	* the metadata before issuing them.
				1317	*/
				1318	bio_list_init(&bios);
				1319	spin_lock_irqsave(&pool->lock, flags);
				1320	bio_list_merge(&bios, &pool->deferred_flush_bios);
				1321	bio_list_init(&pool->deferred_flush_bios);
				1322	spin_unlock_irqrestore(&pool->lock, flags);
				1323
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame^]	1324	if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1325	return;
				1326
				1327	r = dm_pool_commit_metadata(pool->pmd);
				1328	if (r) {
				1329	DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
				1330	__func__, r);
				1331	while ((bio = bio_list_pop(&bios)))
				1332	bio_io_error(bio);
				1333	return;
				1334	}
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame^]	1335	pool->last_commit_jiffies = jiffies;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1336
				1337	while ((bio = bio_list_pop(&bios)))
				1338	generic_make_request(bio);
				1339	}
				1340
				1341	static void do_worker(struct work_struct *ws)
				1342	{
				1343	struct pool *pool = container_of(ws, struct pool, worker);
				1344
				1345	process_prepared_mappings(pool);
				1346	process_deferred_bios(pool);
				1347	}
				1348
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame^]	1349	/*
				1350	* We want to commit periodically so that not too much
				1351	* unwritten data builds up.
				1352	*/
				1353	static void do_waker(struct work_struct *ws)
				1354	{
				1355	struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
				1356	wake_worker(pool);
				1357	queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
				1358	}
				1359
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1360	/*----------------------------------------------------------------*/
				1361
				1362	/*
				1363	* Mapping functions.
				1364	*/
				1365
				1366	/*
				1367	* Called only while mapping a thin bio to hand it over to the workqueue.
				1368	*/
				1369	static void thin_defer_bio(struct thin_c tc, struct bio bio)
				1370	{
				1371	unsigned long flags;
				1372	struct pool *pool = tc->pool;
				1373
				1374	spin_lock_irqsave(&pool->lock, flags);
				1375	bio_list_add(&pool->deferred_bios, bio);
				1376	spin_unlock_irqrestore(&pool->lock, flags);
				1377
				1378	wake_worker(pool);
				1379	}
				1380
				1381	/*
				1382	* Non-blocking function called from the thin target's map function.
				1383	*/
				1384	static int thin_bio_map(struct dm_target ti, struct bio bio,
				1385	union map_info *map_context)
				1386	{
				1387	int r;
				1388	struct thin_c *tc = ti->private;
				1389	dm_block_t block = get_bio_block(tc, bio);
				1390	struct dm_thin_device *td = tc->td;
				1391	struct dm_thin_lookup_result result;
				1392
				1393	/*
				1394	* Save the thin context for easy access from the deferred bio later.
				1395	*/
				1396	map_context->ptr = tc;
				1397
				1398	if (bio->bi_rw & (REQ_FLUSH \| REQ_FUA)) {
				1399	thin_defer_bio(tc, bio);
				1400	return DM_MAPIO_SUBMITTED;
				1401	}
				1402
				1403	r = dm_thin_find_block(td, block, 0, &result);
				1404
				1405	/*
				1406	* Note that we defer readahead too.
				1407	*/
				1408	switch (r) {
				1409	case 0:
				1410	if (unlikely(result.shared)) {
				1411	/*
				1412	* We have a race condition here between the
				1413	* result.shared value returned by the lookup and
				1414	* snapshot creation, which may cause new
				1415	* sharing.
				1416	*
				1417	* To avoid this always quiesce the origin before
				1418	* taking the snap. You want to do this anyway to
				1419	* ensure a consistent application view
				1420	* (i.e. lockfs).
				1421	*
				1422	* More distant ancestors are irrelevant. The
				1423	* shared flag will be set in their case.
				1424	*/
				1425	thin_defer_bio(tc, bio);
				1426	r = DM_MAPIO_SUBMITTED;
				1427	} else {
				1428	remap(tc, bio, result.block);
				1429	r = DM_MAPIO_REMAPPED;
				1430	}
				1431	break;
				1432
				1433	case -ENODATA:
				1434	/*
				1435	* In future, the failed dm_thin_find_block above could
				1436	* provide the hint to load the metadata into cache.
				1437	*/
				1438	case -EWOULDBLOCK:
				1439	thin_defer_bio(tc, bio);
				1440	r = DM_MAPIO_SUBMITTED;
				1441	break;
				1442	}
				1443
				1444	return r;
				1445	}
				1446
				1447	static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
				1448	{
				1449	int r;
				1450	unsigned long flags;
				1451	struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
				1452
				1453	spin_lock_irqsave(&pt->pool->lock, flags);
				1454	r = !bio_list_empty(&pt->pool->retry_on_resume_list);
				1455	spin_unlock_irqrestore(&pt->pool->lock, flags);
				1456
				1457	if (!r) {
				1458	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
				1459	r = bdi_congested(&q->backing_dev_info, bdi_bits);
				1460	}
				1461
				1462	return r;
				1463	}
				1464
				1465	static void __requeue_bios(struct pool *pool)
				1466	{
				1467	bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list);
				1468	bio_list_init(&pool->retry_on_resume_list);
				1469	}
				1470
				1471	/*----------------------------------------------------------------
				1472	* Binding of control targets to a pool object
				1473	*--------------------------------------------------------------*/
				1474	static int bind_control_target(struct pool pool, struct dm_target ti)
				1475	{
				1476	struct pool_c *pt = ti->private;
				1477
				1478	pool->ti = ti;
				1479	pool->low_water_blocks = pt->low_water_blocks;
				1480	pool->zero_new_blocks = pt->zero_new_blocks;
				1481
				1482	return 0;
				1483	}
				1484
				1485	static void unbind_control_target(struct pool pool, struct dm_target ti)
				1486	{
				1487	if (pool->ti == ti)
				1488	pool->ti = NULL;
				1489	}
				1490
				1491	/*----------------------------------------------------------------
				1492	* Pool creation
				1493	*--------------------------------------------------------------*/
				1494	static void __pool_destroy(struct pool *pool)
				1495	{
				1496	__pool_table_remove(pool);
				1497
				1498	if (dm_pool_metadata_close(pool->pmd) < 0)
				1499	DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
				1500
				1501	prison_destroy(pool->prison);
				1502	dm_kcopyd_client_destroy(pool->copier);
				1503
				1504	if (pool->wq)
				1505	destroy_workqueue(pool->wq);
				1506
				1507	if (pool->next_mapping)
				1508	mempool_free(pool->next_mapping, pool->mapping_pool);
				1509	mempool_destroy(pool->mapping_pool);
				1510	mempool_destroy(pool->endio_hook_pool);
				1511	kfree(pool);
				1512	}
				1513
				1514	static struct pool pool_create(struct mapped_device pool_md,
				1515	struct block_device *metadata_dev,
				1516	unsigned long block_size, char **error)
				1517	{
				1518	int r;
				1519	void *err_p;
				1520	struct pool *pool;
				1521	struct dm_pool_metadata *pmd;
				1522
				1523	pmd = dm_pool_metadata_open(metadata_dev, block_size);
				1524	if (IS_ERR(pmd)) {
				1525	*error = "Error creating metadata object";
				1526	return (struct pool *)pmd;
				1527	}
				1528
				1529	pool = kmalloc(sizeof(*pool), GFP_KERNEL);
				1530	if (!pool) {
				1531	*error = "Error allocating memory for pool";
				1532	err_p = ERR_PTR(-ENOMEM);
				1533	goto bad_pool;
				1534	}
				1535
				1536	pool->pmd = pmd;
				1537	pool->sectors_per_block = block_size;
				1538	pool->block_shift = ffs(block_size) - 1;
				1539	pool->offset_mask = block_size - 1;
				1540	pool->low_water_blocks = 0;
				1541	pool->zero_new_blocks = 1;
				1542	pool->prison = prison_create(PRISON_CELLS);
				1543	if (!pool->prison) {
				1544	*error = "Error creating pool's bio prison";
				1545	err_p = ERR_PTR(-ENOMEM);
				1546	goto bad_prison;
				1547	}
				1548
				1549	pool->copier = dm_kcopyd_client_create();
				1550	if (IS_ERR(pool->copier)) {
				1551	r = PTR_ERR(pool->copier);
				1552	*error = "Error creating pool's kcopyd client";
				1553	err_p = ERR_PTR(r);
				1554	goto bad_kcopyd_client;
				1555	}
				1556
				1557	/*
				1558	* Create singlethreaded workqueue that will service all devices
				1559	* that use this metadata.
				1560	*/
				1561	pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
				1562	if (!pool->wq) {
				1563	*error = "Error creating pool's workqueue";
				1564	err_p = ERR_PTR(-ENOMEM);
				1565	goto bad_wq;
				1566	}
				1567
				1568	INIT_WORK(&pool->worker, do_worker);
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame^]	1569	INIT_DELAYED_WORK(&pool->waker, do_waker);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1570	spin_lock_init(&pool->lock);
				1571	bio_list_init(&pool->deferred_bios);
				1572	bio_list_init(&pool->deferred_flush_bios);
				1573	INIT_LIST_HEAD(&pool->prepared_mappings);
				1574	pool->low_water_triggered = 0;
				1575	pool->no_free_space = 0;
				1576	bio_list_init(&pool->retry_on_resume_list);
				1577	ds_init(&pool->ds);
				1578
				1579	pool->next_mapping = NULL;
				1580	pool->mapping_pool =
				1581	mempool_create_kmalloc_pool(MAPPING_POOL_SIZE, sizeof(struct new_mapping));
				1582	if (!pool->mapping_pool) {
				1583	*error = "Error creating pool's mapping mempool";
				1584	err_p = ERR_PTR(-ENOMEM);
				1585	goto bad_mapping_pool;
				1586	}
				1587
				1588	pool->endio_hook_pool =
				1589	mempool_create_kmalloc_pool(ENDIO_HOOK_POOL_SIZE, sizeof(struct endio_hook));
				1590	if (!pool->endio_hook_pool) {
				1591	*error = "Error creating pool's endio_hook mempool";
				1592	err_p = ERR_PTR(-ENOMEM);
				1593	goto bad_endio_hook_pool;
				1594	}
				1595	pool->ref_count = 1;
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame^]	1596	pool->last_commit_jiffies = jiffies;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1597	pool->pool_md = pool_md;
				1598	pool->md_dev = metadata_dev;
				1599	__pool_table_insert(pool);
				1600
				1601	return pool;
				1602
				1603	bad_endio_hook_pool:
				1604	mempool_destroy(pool->mapping_pool);
				1605	bad_mapping_pool:
				1606	destroy_workqueue(pool->wq);
				1607	bad_wq:
				1608	dm_kcopyd_client_destroy(pool->copier);
				1609	bad_kcopyd_client:
				1610	prison_destroy(pool->prison);
				1611	bad_prison:
				1612	kfree(pool);
				1613	bad_pool:
				1614	if (dm_pool_metadata_close(pmd))
				1615	DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
				1616
				1617	return err_p;
				1618	}
				1619
				1620	static void __pool_inc(struct pool *pool)
				1621	{
				1622	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
				1623	pool->ref_count++;
				1624	}
				1625
				1626	static void __pool_dec(struct pool *pool)
				1627	{
				1628	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
				1629	BUG_ON(!pool->ref_count);
				1630	if (!--pool->ref_count)
				1631	__pool_destroy(pool);
				1632	}
				1633
				1634	static struct pool __pool_find(struct mapped_device pool_md,
				1635	struct block_device *metadata_dev,
				1636	unsigned long block_size, char **error)
				1637	{
				1638	struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
				1639
				1640	if (pool) {
				1641	if (pool->pool_md != pool_md)
				1642	return ERR_PTR(-EBUSY);
				1643	__pool_inc(pool);
				1644
				1645	} else {
				1646	pool = __pool_table_lookup(pool_md);
				1647	if (pool) {
				1648	if (pool->md_dev != metadata_dev)
				1649	return ERR_PTR(-EINVAL);
				1650	__pool_inc(pool);
				1651
				1652	} else
				1653	pool = pool_create(pool_md, metadata_dev, block_size, error);
				1654	}
				1655
				1656	return pool;
				1657	}
				1658
				1659	/*----------------------------------------------------------------
				1660	* Pool target methods
				1661	*--------------------------------------------------------------*/
				1662	static void pool_dtr(struct dm_target *ti)
				1663	{
				1664	struct pool_c *pt = ti->private;
				1665
				1666	mutex_lock(&dm_thin_pool_table.mutex);
				1667
				1668	unbind_control_target(pt->pool, ti);
				1669	__pool_dec(pt->pool);
				1670	dm_put_device(ti, pt->metadata_dev);
				1671	dm_put_device(ti, pt->data_dev);
				1672	kfree(pt);
				1673
				1674	mutex_unlock(&dm_thin_pool_table.mutex);
				1675	}
				1676
				1677	struct pool_features {
				1678	unsigned zero_new_blocks:1;
				1679	};
				1680
				1681	static int parse_pool_features(struct dm_arg_set as, struct pool_features pf,
				1682	struct dm_target *ti)
				1683	{
				1684	int r;
				1685	unsigned argc;
				1686	const char *arg_name;
				1687
				1688	static struct dm_arg _args[] = {
				1689	{0, 1, "Invalid number of pool feature arguments"},
				1690	};
				1691
				1692	/*
				1693	* No feature arguments supplied.
				1694	*/
				1695	if (!as->argc)
				1696	return 0;
				1697
				1698	r = dm_read_arg_group(_args, as, &argc, &ti->error);
				1699	if (r)
				1700	return -EINVAL;
				1701
				1702	while (argc && !r) {
				1703	arg_name = dm_shift_arg(as);
				1704	argc--;
				1705
				1706	if (!strcasecmp(arg_name, "skip_block_zeroing")) {
				1707	pf->zero_new_blocks = 0;
				1708	continue;
				1709	}
				1710
				1711	ti->error = "Unrecognised pool feature requested";
				1712	r = -EINVAL;
				1713	}
				1714
				1715	return r;
				1716	}
				1717
				1718	/*
				1719	* thin-pool <metadata dev> <data dev>
				1720	* <data block size (sectors)>
				1721	* <low water mark (blocks)>
				1722	* [<#feature args> [<arg>]*]
				1723	*
				1724	* Optional feature arguments are:
				1725	* skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
				1726	*/
				1727	static int pool_ctr(struct dm_target ti, unsigned argc, char *argv)
				1728	{
				1729	int r;
				1730	struct pool_c *pt;
				1731	struct pool *pool;
				1732	struct pool_features pf;
				1733	struct dm_arg_set as;
				1734	struct dm_dev *data_dev;
				1735	unsigned long block_size;
				1736	dm_block_t low_water_blocks;
				1737	struct dm_dev *metadata_dev;
				1738	sector_t metadata_dev_size;
				1739
				1740	/*
				1741	* FIXME Remove validation from scope of lock.
				1742	*/
				1743	mutex_lock(&dm_thin_pool_table.mutex);
				1744
				1745	if (argc < 4) {
				1746	ti->error = "Invalid argument count";
				1747	r = -EINVAL;
				1748	goto out_unlock;
				1749	}
				1750	as.argc = argc;
				1751	as.argv = argv;
				1752
				1753	r = dm_get_device(ti, argv[0], FMODE_READ \| FMODE_WRITE, &metadata_dev);
				1754	if (r) {
				1755	ti->error = "Error opening metadata block device";
				1756	goto out_unlock;
				1757	}
				1758
				1759	metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
				1760	if (metadata_dev_size > METADATA_DEV_MAX_SECTORS) {
				1761	ti->error = "Metadata device is too large";
				1762	r = -EINVAL;
				1763	goto out_metadata;
				1764	}
				1765
				1766	r = dm_get_device(ti, argv[1], FMODE_READ \| FMODE_WRITE, &data_dev);
				1767	if (r) {
				1768	ti->error = "Error getting data device";
				1769	goto out_metadata;
				1770	}
				1771
				1772	if (kstrtoul(argv[2], 10, &block_size) \|\| !block_size \|\|
				1773	block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS \|\|
				1774	block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS \|\|
				1775	!is_power_of_2(block_size)) {
				1776	ti->error = "Invalid block size";
				1777	r = -EINVAL;
				1778	goto out;
				1779	}
				1780
				1781	if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
				1782	ti->error = "Invalid low water mark";
				1783	r = -EINVAL;
				1784	goto out;
				1785	}
				1786
				1787	/*
				1788	* Set default pool features.
				1789	*/
				1790	memset(&pf, 0, sizeof(pf));
				1791	pf.zero_new_blocks = 1;
				1792
				1793	dm_consume_args(&as, 4);
				1794	r = parse_pool_features(&as, &pf, ti);
				1795	if (r)
				1796	goto out;
				1797
				1798	pt = kzalloc(sizeof(*pt), GFP_KERNEL);
				1799	if (!pt) {
				1800	r = -ENOMEM;
				1801	goto out;
				1802	}
				1803
				1804	pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
				1805	block_size, &ti->error);
				1806	if (IS_ERR(pool)) {
				1807	r = PTR_ERR(pool);
				1808	goto out_free_pt;
				1809	}
				1810
				1811	pt->pool = pool;
				1812	pt->ti = ti;
				1813	pt->metadata_dev = metadata_dev;
				1814	pt->data_dev = data_dev;
				1815	pt->low_water_blocks = low_water_blocks;
				1816	pt->zero_new_blocks = pf.zero_new_blocks;
				1817	ti->num_flush_requests = 1;
				1818	ti->num_discard_requests = 0;
				1819	ti->private = pt;
				1820
				1821	pt->callbacks.congested_fn = pool_is_congested;
				1822	dm_table_add_target_callbacks(ti->table, &pt->callbacks);
				1823
				1824	mutex_unlock(&dm_thin_pool_table.mutex);
				1825
				1826	return 0;
				1827
				1828	out_free_pt:
				1829	kfree(pt);
				1830	out:
				1831	dm_put_device(ti, data_dev);
				1832	out_metadata:
				1833	dm_put_device(ti, metadata_dev);
				1834	out_unlock:
				1835	mutex_unlock(&dm_thin_pool_table.mutex);
				1836
				1837	return r;
				1838	}
				1839
				1840	static int pool_map(struct dm_target ti, struct bio bio,
				1841	union map_info *map_context)
				1842	{
				1843	int r;
				1844	struct pool_c *pt = ti->private;
				1845	struct pool *pool = pt->pool;
				1846	unsigned long flags;
				1847
				1848	/*
				1849	* As this is a singleton target, ti->begin is always zero.
				1850	*/
				1851	spin_lock_irqsave(&pool->lock, flags);
				1852	bio->bi_bdev = pt->data_dev->bdev;
				1853	r = DM_MAPIO_REMAPPED;
				1854	spin_unlock_irqrestore(&pool->lock, flags);
				1855
				1856	return r;
				1857	}
				1858
				1859	/*
				1860	* Retrieves the number of blocks of the data device from
				1861	* the superblock and compares it to the actual device size,
				1862	* thus resizing the data device in case it has grown.
				1863	*
				1864	* This both copes with opening preallocated data devices in the ctr
				1865	* being followed by a resume
				1866	* -and-
				1867	* calling the resume method individually after userspace has
				1868	* grown the data device in reaction to a table event.
				1869	*/
				1870	static int pool_preresume(struct dm_target *ti)
				1871	{
				1872	int r;
				1873	struct pool_c *pt = ti->private;
				1874	struct pool *pool = pt->pool;
				1875	dm_block_t data_size, sb_data_size;
				1876
				1877	/*
				1878	* Take control of the pool object.
				1879	*/
				1880	r = bind_control_target(pool, ti);
				1881	if (r)
				1882	return r;
				1883
				1884	data_size = ti->len >> pool->block_shift;
				1885	r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
				1886	if (r) {
				1887	DMERR("failed to retrieve data device size");
				1888	return r;
				1889	}
				1890
				1891	if (data_size < sb_data_size) {
				1892	DMERR("pool target too small, is %llu blocks (expected %llu)",
				1893	data_size, sb_data_size);
				1894	return -EINVAL;
				1895
				1896	} else if (data_size > sb_data_size) {
				1897	r = dm_pool_resize_data_dev(pool->pmd, data_size);
				1898	if (r) {
				1899	DMERR("failed to resize data device");
				1900	return r;
				1901	}
				1902
				1903	r = dm_pool_commit_metadata(pool->pmd);
				1904	if (r) {
				1905	DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
				1906	__func__, r);
				1907	return r;
				1908	}
				1909	}
				1910
				1911	return 0;
				1912	}
				1913
				1914	static void pool_resume(struct dm_target *ti)
				1915	{
				1916	struct pool_c *pt = ti->private;
				1917	struct pool *pool = pt->pool;
				1918	unsigned long flags;
				1919
				1920	spin_lock_irqsave(&pool->lock, flags);
				1921	pool->low_water_triggered = 0;
				1922	pool->no_free_space = 0;
				1923	__requeue_bios(pool);
				1924	spin_unlock_irqrestore(&pool->lock, flags);
				1925
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame^]	1926	do_waker(&pool->waker.work);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1927	}
				1928
				1929	static void pool_postsuspend(struct dm_target *ti)
				1930	{
				1931	int r;
				1932	struct pool_c *pt = ti->private;
				1933	struct pool *pool = pt->pool;
				1934
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame^]	1935	cancel_delayed_work(&pool->waker);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1936	flush_workqueue(pool->wq);
				1937
				1938	r = dm_pool_commit_metadata(pool->pmd);
				1939	if (r < 0) {
				1940	DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
				1941	__func__, r);
				1942	/* FIXME: invalidate device? error the next FUA or FLUSH bio ?*/
				1943	}
				1944	}
				1945
				1946	static int check_arg_count(unsigned argc, unsigned args_required)
				1947	{
				1948	if (argc != args_required) {
				1949	DMWARN("Message received with %u arguments instead of %u.",
				1950	argc, args_required);
				1951	return -EINVAL;
				1952	}
				1953
				1954	return 0;
				1955	}
				1956
				1957	static int read_dev_id(char arg, dm_thin_id dev_id, int warning)
				1958	{
				1959	if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
				1960	*dev_id <= MAX_DEV_ID)
				1961	return 0;
				1962
				1963	if (warning)
				1964	DMWARN("Message received with invalid device id: %s", arg);
				1965
				1966	return -EINVAL;
				1967	}
				1968
				1969	static int process_create_thin_mesg(unsigned argc, char *argv, struct pool pool)
				1970	{
				1971	dm_thin_id dev_id;
				1972	int r;
				1973
				1974	r = check_arg_count(argc, 2);
				1975	if (r)
				1976	return r;
				1977
				1978	r = read_dev_id(argv[1], &dev_id, 1);
				1979	if (r)
				1980	return r;
				1981
				1982	r = dm_pool_create_thin(pool->pmd, dev_id);
				1983	if (r) {
				1984	DMWARN("Creation of new thinly-provisioned device with id %s failed.",
				1985	argv[1]);
				1986	return r;
				1987	}
				1988
				1989	return 0;
				1990	}
				1991
				1992	static int process_create_snap_mesg(unsigned argc, char *argv, struct pool pool)
				1993	{
				1994	dm_thin_id dev_id;
				1995	dm_thin_id origin_dev_id;
				1996	int r;
				1997
				1998	r = check_arg_count(argc, 3);
				1999	if (r)
				2000	return r;
				2001
				2002	r = read_dev_id(argv[1], &dev_id, 1);
				2003	if (r)
				2004	return r;
				2005
				2006	r = read_dev_id(argv[2], &origin_dev_id, 1);
				2007	if (r)
				2008	return r;
				2009
				2010	r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
				2011	if (r) {
				2012	DMWARN("Creation of new snapshot %s of device %s failed.",
				2013	argv[1], argv[2]);
				2014	return r;
				2015	}
				2016
				2017	return 0;
				2018	}
				2019
				2020	static int process_delete_mesg(unsigned argc, char *argv, struct pool pool)
				2021	{
				2022	dm_thin_id dev_id;
				2023	int r;
				2024
				2025	r = check_arg_count(argc, 2);
				2026	if (r)
				2027	return r;
				2028
				2029	r = read_dev_id(argv[1], &dev_id, 1);
				2030	if (r)
				2031	return r;
				2032
				2033	r = dm_pool_delete_thin_device(pool->pmd, dev_id);
				2034	if (r)
				2035	DMWARN("Deletion of thin device %s failed.", argv[1]);
				2036
				2037	return r;
				2038	}
				2039
				2040	static int process_set_transaction_id_mesg(unsigned argc, char *argv, struct pool pool)
				2041	{
				2042	dm_thin_id old_id, new_id;
				2043	int r;
				2044
				2045	r = check_arg_count(argc, 3);
				2046	if (r)
				2047	return r;
				2048
				2049	if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
				2050	DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
				2051	return -EINVAL;
				2052	}
				2053
				2054	if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
				2055	DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
				2056	return -EINVAL;
				2057	}
				2058
				2059	r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
				2060	if (r) {
				2061	DMWARN("Failed to change transaction id from %s to %s.",
				2062	argv[1], argv[2]);
				2063	return r;
				2064	}
				2065
				2066	return 0;
				2067	}
				2068
				2069	/*
				2070	* Messages supported:
				2071	* create_thin <dev_id>
				2072	* create_snap <dev_id> <origin_id>
				2073	* delete <dev_id>
				2074	* trim <dev_id> <new_size_in_sectors>
				2075	* set_transaction_id <current_trans_id> <new_trans_id>
				2076	*/
				2077	static int pool_message(struct dm_target ti, unsigned argc, char *argv)
				2078	{
				2079	int r = -EINVAL;
				2080	struct pool_c *pt = ti->private;
				2081	struct pool *pool = pt->pool;
				2082
				2083	if (!strcasecmp(argv[0], "create_thin"))
				2084	r = process_create_thin_mesg(argc, argv, pool);
				2085
				2086	else if (!strcasecmp(argv[0], "create_snap"))
				2087	r = process_create_snap_mesg(argc, argv, pool);
				2088
				2089	else if (!strcasecmp(argv[0], "delete"))
				2090	r = process_delete_mesg(argc, argv, pool);
				2091
				2092	else if (!strcasecmp(argv[0], "set_transaction_id"))
				2093	r = process_set_transaction_id_mesg(argc, argv, pool);
				2094
				2095	else
				2096	DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
				2097
				2098	if (!r) {
				2099	r = dm_pool_commit_metadata(pool->pmd);
				2100	if (r)
				2101	DMERR("%s message: dm_pool_commit_metadata() failed, error = %d",
				2102	argv[0], r);
				2103	}
				2104
				2105	return r;
				2106	}
				2107
				2108	/*
				2109	* Status line is:
				2110	* <transaction id> <used metadata sectors>/<total metadata sectors>
				2111	* <used data sectors>/<total data sectors> <held metadata root>
				2112	*/
				2113	static int pool_status(struct dm_target *ti, status_type_t type,
				2114	char *result, unsigned maxlen)
				2115	{
				2116	int r;
				2117	unsigned sz = 0;
				2118	uint64_t transaction_id;
				2119	dm_block_t nr_free_blocks_data;
				2120	dm_block_t nr_free_blocks_metadata;
				2121	dm_block_t nr_blocks_data;
				2122	dm_block_t nr_blocks_metadata;
				2123	dm_block_t held_root;
				2124	char buf[BDEVNAME_SIZE];
				2125	char buf2[BDEVNAME_SIZE];
				2126	struct pool_c *pt = ti->private;
				2127	struct pool *pool = pt->pool;
				2128
				2129	switch (type) {
				2130	case STATUSTYPE_INFO:
				2131	r = dm_pool_get_metadata_transaction_id(pool->pmd,
				2132	&transaction_id);
				2133	if (r)
				2134	return r;
				2135
				2136	r = dm_pool_get_free_metadata_block_count(pool->pmd,
				2137	&nr_free_blocks_metadata);
				2138	if (r)
				2139	return r;
				2140
				2141	r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
				2142	if (r)
				2143	return r;
				2144
				2145	r = dm_pool_get_free_block_count(pool->pmd,
				2146	&nr_free_blocks_data);
				2147	if (r)
				2148	return r;
				2149
				2150	r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
				2151	if (r)
				2152	return r;
				2153
				2154	r = dm_pool_get_held_metadata_root(pool->pmd, &held_root);
				2155	if (r)
				2156	return r;
				2157
				2158	DMEMIT("%llu %llu/%llu %llu/%llu ",
				2159	(unsigned long long)transaction_id,
				2160	(unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
				2161	(unsigned long long)nr_blocks_metadata,
				2162	(unsigned long long)(nr_blocks_data - nr_free_blocks_data),
				2163	(unsigned long long)nr_blocks_data);
				2164
				2165	if (held_root)
				2166	DMEMIT("%llu", held_root);
				2167	else
				2168	DMEMIT("-");
				2169
				2170	break;
				2171
				2172	case STATUSTYPE_TABLE:
				2173	DMEMIT("%s %s %lu %llu ",
				2174	format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
				2175	format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
				2176	(unsigned long)pool->sectors_per_block,
				2177	(unsigned long long)pt->low_water_blocks);
				2178
				2179	DMEMIT("%u ", !pool->zero_new_blocks);
				2180
				2181	if (!pool->zero_new_blocks)
				2182	DMEMIT("skip_block_zeroing ");
				2183	break;
				2184	}
				2185
				2186	return 0;
				2187	}
				2188
				2189	static int pool_iterate_devices(struct dm_target *ti,
				2190	iterate_devices_callout_fn fn, void *data)
				2191	{
				2192	struct pool_c *pt = ti->private;
				2193
				2194	return fn(ti, pt->data_dev, 0, ti->len, data);
				2195	}
				2196
				2197	static int pool_merge(struct dm_target ti, struct bvec_merge_data bvm,
				2198	struct bio_vec *biovec, int max_size)
				2199	{
				2200	struct pool_c *pt = ti->private;
				2201	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
				2202
				2203	if (!q->merge_bvec_fn)
				2204	return max_size;
				2205
				2206	bvm->bi_bdev = pt->data_dev->bdev;
				2207
				2208	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
				2209	}
				2210
				2211	static void pool_io_hints(struct dm_target ti, struct queue_limits limits)
				2212	{
				2213	struct pool_c *pt = ti->private;
				2214	struct pool *pool = pt->pool;
				2215
				2216	blk_limits_io_min(limits, 0);
				2217	blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
				2218	}
				2219
				2220	static struct target_type pool_target = {
				2221	.name = "thin-pool",
				2222	.features = DM_TARGET_SINGLETON \| DM_TARGET_ALWAYS_WRITEABLE \|
				2223	DM_TARGET_IMMUTABLE,
				2224	.version = {1, 0, 0},
				2225	.module = THIS_MODULE,
				2226	.ctr = pool_ctr,
				2227	.dtr = pool_dtr,
				2228	.map = pool_map,
				2229	.postsuspend = pool_postsuspend,
				2230	.preresume = pool_preresume,
				2231	.resume = pool_resume,
				2232	.message = pool_message,
				2233	.status = pool_status,
				2234	.merge = pool_merge,
				2235	.iterate_devices = pool_iterate_devices,
				2236	.io_hints = pool_io_hints,
				2237	};
				2238
/*----------------------------------------------------------------
 * Thin target methods
 *--------------------------------------------------------------*/
				2242	static void thin_dtr(struct dm_target *ti)
				2243	{
				2244	struct thin_c *tc = ti->private;
				2245
				2246	mutex_lock(&dm_thin_pool_table.mutex);
				2247
				2248	__pool_dec(tc->pool);
				2249	dm_pool_close_thin_device(tc->td);
				2250	dm_put_device(ti, tc->pool_dev);
				2251	kfree(tc);
				2252
				2253	mutex_unlock(&dm_thin_pool_table.mutex);
				2254	}
				2255
				2256	/*
				2257	* Thin target parameters:
				2258	*
				2259	* <pool_dev> <dev_id>
				2260	*
				2261	* pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
				2262	* dev_id: the internal device identifier
				2263	*/
				2264	static int thin_ctr(struct dm_target ti, unsigned argc, char *argv)
				2265	{
				2266	int r;
				2267	struct thin_c *tc;
				2268	struct dm_dev *pool_dev;
				2269	struct mapped_device *pool_md;
				2270
				2271	mutex_lock(&dm_thin_pool_table.mutex);
				2272
				2273	if (argc != 2) {
				2274	ti->error = "Invalid argument count";
				2275	r = -EINVAL;
				2276	goto out_unlock;
				2277	}
				2278
				2279	tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
				2280	if (!tc) {
				2281	ti->error = "Out of memory";
				2282	r = -ENOMEM;
				2283	goto out_unlock;
				2284	}
				2285
				2286	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
				2287	if (r) {
				2288	ti->error = "Error opening pool device";
				2289	goto bad_pool_dev;
				2290	}
				2291	tc->pool_dev = pool_dev;
				2292
				2293	if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
				2294	ti->error = "Invalid device id";
				2295	r = -EINVAL;
				2296	goto bad_common;
				2297	}
				2298
				2299	pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
				2300	if (!pool_md) {
				2301	ti->error = "Couldn't get pool mapped device";
				2302	r = -EINVAL;
				2303	goto bad_common;
				2304	}
				2305
				2306	tc->pool = __pool_table_lookup(pool_md);
				2307	if (!tc->pool) {
				2308	ti->error = "Couldn't find pool object";
				2309	r = -EINVAL;
				2310	goto bad_pool_lookup;
				2311	}
				2312	__pool_inc(tc->pool);
				2313
				2314	r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
				2315	if (r) {
				2316	ti->error = "Couldn't open thin internal device";
				2317	goto bad_thin_open;
				2318	}
				2319
				2320	ti->split_io = tc->pool->sectors_per_block;
				2321	ti->num_flush_requests = 1;
				2322	ti->num_discard_requests = 0;
				2323	ti->discards_supported = 0;
				2324
				2325	dm_put(pool_md);
				2326
				2327	mutex_unlock(&dm_thin_pool_table.mutex);
				2328
				2329	return 0;
				2330
				2331	bad_thin_open:
				2332	__pool_dec(tc->pool);
				2333	bad_pool_lookup:
				2334	dm_put(pool_md);
				2335	bad_common:
				2336	dm_put_device(ti, tc->pool_dev);
				2337	bad_pool_dev:
				2338	kfree(tc);
				2339	out_unlock:
				2340	mutex_unlock(&dm_thin_pool_table.mutex);
				2341
				2342	return r;
				2343	}
				2344
				2345	static int thin_map(struct dm_target ti, struct bio bio,
				2346	union map_info *map_context)
				2347	{
				2348	bio->bi_sector -= ti->begin;
				2349
				2350	return thin_bio_map(ti, bio, map_context);
				2351	}
				2352
				2353	static void thin_postsuspend(struct dm_target *ti)
				2354	{
				2355	if (dm_noflush_suspending(ti))
				2356	requeue_io((struct thin_c *)ti->private);
				2357	}
				2358
				2359	/*
				2360	* <nr mapped sectors> <highest mapped sector>
				2361	*/
				2362	static int thin_status(struct dm_target *ti, status_type_t type,
				2363	char *result, unsigned maxlen)
				2364	{
				2365	int r;
				2366	ssize_t sz = 0;
				2367	dm_block_t mapped, highest;
				2368	char buf[BDEVNAME_SIZE];
				2369	struct thin_c *tc = ti->private;
				2370
				2371	if (!tc->td)
				2372	DMEMIT("-");
				2373	else {
				2374	switch (type) {
				2375	case STATUSTYPE_INFO:
				2376	r = dm_thin_get_mapped_count(tc->td, &mapped);
				2377	if (r)
				2378	return r;
				2379
				2380	r = dm_thin_get_highest_mapped_block(tc->td, &highest);
				2381	if (r < 0)
				2382	return r;
				2383
				2384	DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
				2385	if (r)
				2386	DMEMIT("%llu", ((highest + 1) *
				2387	tc->pool->sectors_per_block) - 1);
				2388	else
				2389	DMEMIT("-");
				2390	break;
				2391
				2392	case STATUSTYPE_TABLE:
				2393	DMEMIT("%s %lu",
				2394	format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
				2395	(unsigned long) tc->dev_id);
				2396	break;
				2397	}
				2398	}
				2399
				2400	return 0;
				2401	}
				2402
				2403	static int thin_iterate_devices(struct dm_target *ti,
				2404	iterate_devices_callout_fn fn, void *data)
				2405	{
				2406	dm_block_t blocks;
				2407	struct thin_c *tc = ti->private;
				2408
				2409	/*
				2410	* We can't call dm_pool_get_data_dev_size() since that blocks. So
				2411	* we follow a more convoluted path through to the pool's target.
				2412	*/
				2413	if (!tc->pool->ti)
				2414	return 0; /* nothing is bound */
				2415
				2416	blocks = tc->pool->ti->len >> tc->pool->block_shift;
				2417	if (blocks)
				2418	return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block * blocks, data);
				2419
				2420	return 0;
				2421	}
				2422
				2423	static void thin_io_hints(struct dm_target ti, struct queue_limits limits)
				2424	{
				2425	struct thin_c *tc = ti->private;
				2426
				2427	blk_limits_io_min(limits, 0);
				2428	blk_limits_io_opt(limits, tc->pool->sectors_per_block << SECTOR_SHIFT);
				2429	}
				2430
				2431	static struct target_type thin_target = {
				2432	.name = "thin",
				2433	.version = {1, 0, 0},
				2434	.module = THIS_MODULE,
				2435	.ctr = thin_ctr,
				2436	.dtr = thin_dtr,
				2437	.map = thin_map,
				2438	.postsuspend = thin_postsuspend,
				2439	.status = thin_status,
				2440	.iterate_devices = thin_iterate_devices,
				2441	.io_hints = thin_io_hints,
				2442	};
				2443
/*----------------------------------------------------------------*/
				2445
				2446	static int __init dm_thin_init(void)
				2447	{
				2448	int r;
				2449
				2450	pool_table_init();
				2451
				2452	r = dm_register_target(&thin_target);
				2453	if (r)
				2454	return r;
				2455
				2456	r = dm_register_target(&pool_target);
				2457	if (r)
				2458	dm_unregister_target(&thin_target);
				2459
				2460	return r;
				2461	}
				2462
				2463	static void dm_thin_exit(void)
				2464	{
				2465	dm_unregister_target(&thin_target);
				2466	dm_unregister_target(&pool_target);
				2467	}
				2468
				2469	module_init(dm_thin_init);
				2470	module_exit(dm_thin_exit);
				2471
				2472	MODULE_DESCRIPTION(DM_NAME "device-mapper thin provisioning target");
				2473	MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
				2474	MODULE_LICENSE("GPL");