Blame - drivers/md/dm-thin.c - kernel/msm-4.19

blob: 703bbbc4f16f52923932cb7e90451df882402cd3 [file] [log] [blame]

Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1	/*
				2	* Copyright (C) 2011 Red Hat UK.
				3	*
				4	* This file is released under the GPL.
				5	*/
				6
				7	#include "dm-thin-metadata.h"
				8
				9	#include <linux/device-mapper.h>
				10	#include <linux/dm-io.h>
				11	#include <linux/dm-kcopyd.h>
				12	#include <linux/list.h>
				13	#include <linux/init.h>
				14	#include <linux/module.h>
				15	#include <linux/slab.h>
				16
				17	#define DM_MSG_PREFIX "thin"
				18
				19	/*
				20	* Tunable constants
				21	*/
				22	#define ENDIO_HOOK_POOL_SIZE 10240
				23	#define DEFERRED_SET_SIZE 64
				24	#define MAPPING_POOL_SIZE 1024
				25	#define PRISON_CELLS 1024
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	26	#define COMMIT_PERIOD HZ
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	27
				28	/*
				29	* The block size of the device holding pool data must be
				30	* between 64KB and 1GB.
				31	*/
				32	#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
				33	#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
				34
				35	/*
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	36	* Device id is restricted to 24 bits.
				37	*/
				38	#define MAX_DEV_ID ((1 << 24) - 1)
				39
				40	/*
				41	* How do we handle breaking sharing of data blocks?
				42	* =================================================
				43	*
				44	* We use a standard copy-on-write btree to store the mappings for the
				45	* devices (note I'm talking about copy-on-write of the metadata here, not
				46	* the data). When you take an internal snapshot you clone the root node
				47	* of the origin btree. After this there is no concept of an origin or a
				48	* snapshot. They are just two device trees that happen to point to the
				49	* same data blocks.
				50	*
				51	* When we get a write in we decide if it's to a shared data block using
				52	* some timestamp magic. If it is, we have to break sharing.
				53	*
				54	* Let's say we write to a shared block in what was the origin. The
				55	* steps are:
				56	*
				57	* i) plug io further to this physical block. (see bio_prison code).
				58	*
				59	* ii) quiesce any read io to that shared data block. Obviously
				60	* including all devices that share this block. (see deferred_set code)
				61	*
				62	* iii) copy the data block to a newly allocated block. This step can be
				63	* missed out if the io covers the block. (schedule_copy).
				64	*
				65	* iv) insert the new mapping into the origin's btree
Joe Thornber	fe878f3	2012-03-28 18:41:24 +0100	[diff] [blame]	66	* (process_prepared_mapping). This act of inserting breaks some
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	67	* sharing of btree nodes between the two devices. Breaking sharing only
				68	* affects the btree of that specific device. Btrees for the other
				69	* devices that share the block never change. The btree for the origin
				70	* device as it was after the last commit is untouched, ie. we're using
				71	* persistent data structures in the functional programming sense.
				72	*
				73	* v) unplug io to this physical block, including the io that triggered
				74	* the breaking of sharing.
				75	*
				76	* Steps (ii) and (iii) occur in parallel.
				77	*
				78	* The metadata _doesn't_ need to be committed before the io continues. We
				79	* get away with this because the io is always written to a _new_ block.
				80	* If there's a crash, then:
				81	*
				82	* - The origin mapping will point to the old origin block (the shared
				83	* one). This will contain the data as it was before the io that triggered
				84	* the breaking of sharing came in.
				85	*
				86	* - The snap mapping still points to the old block. As it would after
				87	* the commit.
				88	*
				89	* The downside of this scheme is the timestamp magic isn't perfect, and
				90	* will continue to think that data block in the snapshot device is shared
				91	* even after the write to the origin has broken sharing. I suspect data
				92	* blocks will typically be shared by many different devices, so we're
				93	* breaking sharing n + 1 times, rather than n, where n is the number of
				94	* devices that reference this data block. At the moment I think the
				95	* benefits far, far outweigh the disadvantages.
				96	*/
				97
				98	/*----------------------------------------------------------------*/
				99
				100	/*
				101	* Sometimes we can't deal with a bio straight away. We put them in prison
				102	* where they can't cause any mischief. Bios are put in a cell identified
				103	* by a key, multiple bios can be in the same cell. When the cell is
				104	* subsequently unlocked the bios become available.
				105	*/
				106	struct bio_prison;
				107
				108	struct cell_key {
				109	int virtual;
				110	dm_thin_id dev;
				111	dm_block_t block;
				112	};
				113
				114	struct cell {
				115	struct hlist_node list;
				116	struct bio_prison *prison;
				117	struct cell_key key;
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	118	struct bio *holder;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	119	struct bio_list bios;
				120	};
				121
				122	struct bio_prison {
				123	spinlock_t lock;
				124	mempool_t *cell_pool;
				125
				126	unsigned nr_buckets;
				127	unsigned hash_mask;
				128	struct hlist_head *cells;
				129	};
				130
				131	static uint32_t calc_nr_buckets(unsigned nr_cells)
				132	{
				133	uint32_t n = 128;
				134
				135	nr_cells /= 4;
				136	nr_cells = min(nr_cells, 8192u);
				137
				138	while (n < nr_cells)
				139	n <<= 1;
				140
				141	return n;
				142	}
				143
				144	/*
				145	* @nr_cells should be the number of cells you want in use _concurrently_.
				146	* Don't confuse it with the number of distinct keys.
				147	*/
				148	static struct bio_prison *prison_create(unsigned nr_cells)
				149	{
				150	unsigned i;
				151	uint32_t nr_buckets = calc_nr_buckets(nr_cells);
				152	size_t len = sizeof(struct bio_prison) +
				153	(sizeof(struct hlist_head) * nr_buckets);
				154	struct bio_prison *prison = kmalloc(len, GFP_KERNEL);
				155
				156	if (!prison)
				157	return NULL;
				158
				159	spin_lock_init(&prison->lock);
				160	prison->cell_pool = mempool_create_kmalloc_pool(nr_cells,
				161	sizeof(struct cell));
				162	if (!prison->cell_pool) {
				163	kfree(prison);
				164	return NULL;
				165	}
				166
				167	prison->nr_buckets = nr_buckets;
				168	prison->hash_mask = nr_buckets - 1;
				169	prison->cells = (struct hlist_head *) (prison + 1);
				170	for (i = 0; i < nr_buckets; i++)
				171	INIT_HLIST_HEAD(prison->cells + i);
				172
				173	return prison;
				174	}
				175
				176	static void prison_destroy(struct bio_prison *prison)
				177	{
				178	mempool_destroy(prison->cell_pool);
				179	kfree(prison);
				180	}
				181
				182	static uint32_t hash_key(struct bio_prison prison, struct cell_key key)
				183	{
				184	const unsigned long BIG_PRIME = 4294967291UL;
				185	uint64_t hash = key->block * BIG_PRIME;
				186
				187	return (uint32_t) (hash & prison->hash_mask);
				188	}
				189
				190	static int keys_equal(struct cell_key lhs, struct cell_key rhs)
				191	{
				192	return (lhs->virtual == rhs->virtual) &&
				193	(lhs->dev == rhs->dev) &&
				194	(lhs->block == rhs->block);
				195	}
				196
				197	static struct cell __search_bucket(struct hlist_head bucket,
				198	struct cell_key *key)
				199	{
				200	struct cell *cell;
				201	struct hlist_node *tmp;
				202
				203	hlist_for_each_entry(cell, tmp, bucket, list)
				204	if (keys_equal(&cell->key, key))
				205	return cell;
				206
				207	return NULL;
				208	}
				209
				210	/*
				211	* This may block if a new cell needs allocating. You must ensure that
				212	* cells will be unlocked even if the calling thread is blocked.
				213	*
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	214	* Returns 1 if the cell was already held, 0 if @inmate is the new holder.
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	215	*/
				216	static int bio_detain(struct bio_prison prison, struct cell_key key,
				217	struct bio inmate, struct cell *ref)
				218	{
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	219	int r = 1;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	220	unsigned long flags;
				221	uint32_t hash = hash_key(prison, key);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	222	struct cell cell, cell2;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	223
				224	BUG_ON(hash > prison->nr_buckets);
				225
				226	spin_lock_irqsave(&prison->lock, flags);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	227
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	228	cell = __search_bucket(prison->cells + hash, key);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	229	if (cell) {
				230	bio_list_add(&cell->bios, inmate);
				231	goto out;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	232	}
				233
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	234	/*
				235	* Allocate a new cell
				236	*/
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	237	spin_unlock_irqrestore(&prison->lock, flags);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	238	cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
				239	spin_lock_irqsave(&prison->lock, flags);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	240
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	241	/*
				242	* We've been unlocked, so we have to double check that
				243	* nobody else has inserted this cell in the meantime.
				244	*/
				245	cell = __search_bucket(prison->cells + hash, key);
				246	if (cell) {
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	247	mempool_free(cell2, prison->cell_pool);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	248	bio_list_add(&cell->bios, inmate);
				249	goto out;
				250	}
				251
				252	/*
				253	* Use new cell.
				254	*/
				255	cell = cell2;
				256
				257	cell->prison = prison;
				258	memcpy(&cell->key, key, sizeof(cell->key));
				259	cell->holder = inmate;
				260	bio_list_init(&cell->bios);
				261	hlist_add_head(&cell->list, prison->cells + hash);
				262
				263	r = 0;
				264
				265	out:
				266	spin_unlock_irqrestore(&prison->lock, flags);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	267
				268	*ref = cell;
				269
				270	return r;
				271	}
				272
				273	/*
				274	* @inmates must have been initialised prior to this call
				275	*/
				276	static void __cell_release(struct cell cell, struct bio_list inmates)
				277	{
				278	struct bio_prison *prison = cell->prison;
				279
				280	hlist_del(&cell->list);
				281
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	282	bio_list_add(inmates, cell->holder);
				283	bio_list_merge(inmates, &cell->bios);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	284
				285	mempool_free(cell, prison->cell_pool);
				286	}
				287
				288	static void cell_release(struct cell cell, struct bio_list bios)
				289	{
				290	unsigned long flags;
				291	struct bio_prison *prison = cell->prison;
				292
				293	spin_lock_irqsave(&prison->lock, flags);
				294	__cell_release(cell, bios);
				295	spin_unlock_irqrestore(&prison->lock, flags);
				296	}
				297
				298	/*
				299	* There are a couple of places where we put a bio into a cell briefly
				300	* before taking it out again. In these situations we know that no other
				301	* bio may be in the cell. This function releases the cell, and also does
				302	* a sanity check.
				303	*/
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	304	static void __cell_release_singleton(struct cell cell, struct bio bio)
				305	{
				306	hlist_del(&cell->list);
				307	BUG_ON(cell->holder != bio);
				308	BUG_ON(!bio_list_empty(&cell->bios));
				309	}
				310
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	311	static void cell_release_singleton(struct cell cell, struct bio bio)
				312	{
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	313	unsigned long flags;
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	314	struct bio_prison *prison = cell->prison;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	315
				316	spin_lock_irqsave(&prison->lock, flags);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	317	__cell_release_singleton(cell, bio);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	318	spin_unlock_irqrestore(&prison->lock, flags);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	319	}
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	320
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	321	/*
				322	* Sometimes we don't want the holder, just the additional bios.
				323	*/
				324	static void __cell_release_no_holder(struct cell cell, struct bio_list inmates)
				325	{
				326	struct bio_prison *prison = cell->prison;
				327
				328	hlist_del(&cell->list);
				329	bio_list_merge(inmates, &cell->bios);
				330
				331	mempool_free(cell, prison->cell_pool);
				332	}
				333
				334	static void cell_release_no_holder(struct cell cell, struct bio_list inmates)
				335	{
				336	unsigned long flags;
				337	struct bio_prison *prison = cell->prison;
				338
				339	spin_lock_irqsave(&prison->lock, flags);
				340	__cell_release_no_holder(cell, inmates);
				341	spin_unlock_irqrestore(&prison->lock, flags);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	342	}
				343
				344	static void cell_error(struct cell *cell)
				345	{
				346	struct bio_prison *prison = cell->prison;
				347	struct bio_list bios;
				348	struct bio *bio;
				349	unsigned long flags;
				350
				351	bio_list_init(&bios);
				352
				353	spin_lock_irqsave(&prison->lock, flags);
				354	__cell_release(cell, &bios);
				355	spin_unlock_irqrestore(&prison->lock, flags);
				356
				357	while ((bio = bio_list_pop(&bios)))
				358	bio_io_error(bio);
				359	}
				360
				361	/*----------------------------------------------------------------*/
				362
				363	/*
				364	* We use the deferred set to keep track of pending reads to shared blocks.
				365	* We do this to ensure the new mapping caused by a write isn't performed
				366	* until these prior reads have completed. Otherwise the insertion of the
				367	* new mapping could free the old block that the read bios are mapped to.
				368	*/
				369
				370	struct deferred_set;
				371	struct deferred_entry {
				372	struct deferred_set *ds;
				373	unsigned count;
				374	struct list_head work_items;
				375	};
				376
				377	struct deferred_set {
				378	spinlock_t lock;
				379	unsigned current_entry;
				380	unsigned sweeper;
				381	struct deferred_entry entries[DEFERRED_SET_SIZE];
				382	};
				383
				384	static void ds_init(struct deferred_set *ds)
				385	{
				386	int i;
				387
				388	spin_lock_init(&ds->lock);
				389	ds->current_entry = 0;
				390	ds->sweeper = 0;
				391	for (i = 0; i < DEFERRED_SET_SIZE; i++) {
				392	ds->entries[i].ds = ds;
				393	ds->entries[i].count = 0;
				394	INIT_LIST_HEAD(&ds->entries[i].work_items);
				395	}
				396	}
				397
				398	static struct deferred_entry ds_inc(struct deferred_set ds)
				399	{
				400	unsigned long flags;
				401	struct deferred_entry *entry;
				402
				403	spin_lock_irqsave(&ds->lock, flags);
				404	entry = ds->entries + ds->current_entry;
				405	entry->count++;
				406	spin_unlock_irqrestore(&ds->lock, flags);
				407
				408	return entry;
				409	}
				410
				411	static unsigned ds_next(unsigned index)
				412	{
				413	return (index + 1) % DEFERRED_SET_SIZE;
				414	}
				415
				416	static void __sweep(struct deferred_set ds, struct list_head head)
				417	{
				418	while ((ds->sweeper != ds->current_entry) &&
				419	!ds->entries[ds->sweeper].count) {
				420	list_splice_init(&ds->entries[ds->sweeper].work_items, head);
				421	ds->sweeper = ds_next(ds->sweeper);
				422	}
				423
				424	if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count)
				425	list_splice_init(&ds->entries[ds->sweeper].work_items, head);
				426	}
				427
				428	static void ds_dec(struct deferred_entry entry, struct list_head head)
				429	{
				430	unsigned long flags;
				431
				432	spin_lock_irqsave(&entry->ds->lock, flags);
				433	BUG_ON(!entry->count);
				434	--entry->count;
				435	__sweep(entry->ds, head);
				436	spin_unlock_irqrestore(&entry->ds->lock, flags);
				437	}
				438
				439	/*
				440	* Returns 1 if deferred or 0 if no pending items to delay job.
				441	*/
				442	static int ds_add_work(struct deferred_set ds, struct list_head work)
				443	{
				444	int r = 1;
				445	unsigned long flags;
				446	unsigned next_entry;
				447
				448	spin_lock_irqsave(&ds->lock, flags);
				449	if ((ds->sweeper == ds->current_entry) &&
				450	!ds->entries[ds->current_entry].count)
				451	r = 0;
				452	else {
				453	list_add(work, &ds->entries[ds->current_entry].work_items);
				454	next_entry = ds_next(ds->current_entry);
				455	if (!ds->entries[next_entry].count)
				456	ds->current_entry = next_entry;
				457	}
				458	spin_unlock_irqrestore(&ds->lock, flags);
				459
				460	return r;
				461	}
				462
				463	/*----------------------------------------------------------------*/
				464
				465	/*
				466	* Key building.
				467	*/
				468	static void build_data_key(struct dm_thin_device *td,
				469	dm_block_t b, struct cell_key *key)
				470	{
				471	key->virtual = 0;
				472	key->dev = dm_thin_dev_id(td);
				473	key->block = b;
				474	}
				475
				476	static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
				477	struct cell_key *key)
				478	{
				479	key->virtual = 1;
				480	key->dev = dm_thin_dev_id(td);
				481	key->block = b;
				482	}
				483
				484	/*----------------------------------------------------------------*/
				485
				486	/*
				487	* A pool device ties together a metadata device and a data device. It
				488	* also provides the interface for creating and destroying internal
				489	* devices.
				490	*/
				491	struct new_mapping;
				492	struct pool {
				493	struct list_head list;
				494	struct dm_target ti; / Only set if a pool target is bound */
				495
				496	struct mapped_device *pool_md;
				497	struct block_device *md_dev;
				498	struct dm_pool_metadata *pmd;
				499
				500	uint32_t sectors_per_block;
				501	unsigned block_shift;
				502	dm_block_t offset_mask;
				503	dm_block_t low_water_blocks;
				504
				505	unsigned zero_new_blocks:1;
				506	unsigned low_water_triggered:1; /* A dm event has been sent */
				507	unsigned no_free_space:1; /* A -ENOSPC warning has been issued */
				508
				509	struct bio_prison *prison;
				510	struct dm_kcopyd_client *copier;
				511
				512	struct workqueue_struct *wq;
				513	struct work_struct worker;
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	514	struct delayed_work waker;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	515
				516	unsigned ref_count;
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	517	unsigned long last_commit_jiffies;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	518
				519	spinlock_t lock;
				520	struct bio_list deferred_bios;
				521	struct bio_list deferred_flush_bios;
				522	struct list_head prepared_mappings;
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame^]	523	struct list_head prepared_discards;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	524
				525	struct bio_list retry_on_resume_list;
				526
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	527	struct deferred_set shared_read_ds;
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame^]	528	struct deferred_set all_io_ds;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	529
				530	struct new_mapping *next_mapping;
				531	mempool_t *mapping_pool;
				532	mempool_t *endio_hook_pool;
				533	};
				534
				535	/*
				536	* Target context for a pool.
				537	*/
				538	struct pool_c {
				539	struct dm_target *ti;
				540	struct pool *pool;
				541	struct dm_dev *data_dev;
				542	struct dm_dev *metadata_dev;
				543	struct dm_target_callbacks callbacks;
				544
				545	dm_block_t low_water_blocks;
				546	unsigned zero_new_blocks:1;
				547	};
				548
				549	/*
				550	* Target context for a thin.
				551	*/
				552	struct thin_c {
				553	struct dm_dev *pool_dev;
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	554	struct dm_dev *origin_dev;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	555	dm_thin_id dev_id;
				556
				557	struct pool *pool;
				558	struct dm_thin_device *td;
				559	};
				560
				561	/*----------------------------------------------------------------*/
				562
				563	/*
				564	* A global list of pools that uses a struct mapped_device as a key.
				565	*/
				566	static struct dm_thin_pool_table {
				567	struct mutex mutex;
				568	struct list_head pools;
				569	} dm_thin_pool_table;
				570
				571	static void pool_table_init(void)
				572	{
				573	mutex_init(&dm_thin_pool_table.mutex);
				574	INIT_LIST_HEAD(&dm_thin_pool_table.pools);
				575	}
				576
				577	static void __pool_table_insert(struct pool *pool)
				578	{
				579	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
				580	list_add(&pool->list, &dm_thin_pool_table.pools);
				581	}
				582
				583	static void __pool_table_remove(struct pool *pool)
				584	{
				585	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
				586	list_del(&pool->list);
				587	}
				588
				589	static struct pool __pool_table_lookup(struct mapped_device md)
				590	{
				591	struct pool pool = NULL, tmp;
				592
				593	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
				594
				595	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
				596	if (tmp->pool_md == md) {
				597	pool = tmp;
				598	break;
				599	}
				600	}
				601
				602	return pool;
				603	}
				604
				605	static struct pool __pool_table_lookup_metadata_dev(struct block_device md_dev)
				606	{
				607	struct pool pool = NULL, tmp;
				608
				609	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
				610
				611	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
				612	if (tmp->md_dev == md_dev) {
				613	pool = tmp;
				614	break;
				615	}
				616	}
				617
				618	return pool;
				619	}
				620
				621	/*----------------------------------------------------------------*/
				622
/*
 * Per-bio state, reached through dm_get_mapinfo(bio)->ptr.
 */
struct endio_hook {
	struct thin_c *tc;			/* thin target the bio belongs to */
	struct deferred_entry *shared_read_entry;
	struct deferred_entry *all_io_entry;
	struct new_mapping *overwrite_mapping;	/* read back by overwrite_endio() */
};
				629
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	630	static void __requeue_bio_list(struct thin_c tc, struct bio_list master)
				631	{
				632	struct bio *bio;
				633	struct bio_list bios;
				634
				635	bio_list_init(&bios);
				636	bio_list_merge(&bios, master);
				637	bio_list_init(master);
				638
				639	while ((bio = bio_list_pop(&bios))) {
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	640	struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
				641	if (h->tc == tc)
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	642	bio_endio(bio, DM_ENDIO_REQUEUE);
				643	else
				644	bio_list_add(master, bio);
				645	}
				646	}
				647
				648	static void requeue_io(struct thin_c *tc)
				649	{
				650	struct pool *pool = tc->pool;
				651	unsigned long flags;
				652
				653	spin_lock_irqsave(&pool->lock, flags);
				654	__requeue_bio_list(tc, &pool->deferred_bios);
				655	__requeue_bio_list(tc, &pool->retry_on_resume_list);
				656	spin_unlock_irqrestore(&pool->lock, flags);
				657	}
				658
				659	/*
				660	* This section of code contains the logic for processing a thin device's IO.
				661	* Much of the code depends on pool object resources (lists, workqueues, etc)
				662	* but most is exclusively called from the thin target rather than the thin-pool
				663	* target.
				664	*/
				665
				666	static dm_block_t get_bio_block(struct thin_c tc, struct bio bio)
				667	{
				668	return bio->bi_sector >> tc->pool->block_shift;
				669	}
				670
				671	static void remap(struct thin_c tc, struct bio bio, dm_block_t block)
				672	{
				673	struct pool *pool = tc->pool;
				674
				675	bio->bi_bdev = tc->pool_dev->bdev;
				676	bio->bi_sector = (block << pool->block_shift) +
				677	(bio->bi_sector & pool->offset_mask);
				678	}
				679
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	680	static void remap_to_origin(struct thin_c tc, struct bio bio)
				681	{
				682	bio->bi_bdev = tc->origin_dev->bdev;
				683	}
				684
				685	static void issue(struct thin_c tc, struct bio bio)
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	686	{
				687	struct pool *pool = tc->pool;
				688	unsigned long flags;
				689
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	690	/*
				691	* Batch together any FUA/FLUSH bios we find and then issue
				692	* a single commit for them in process_deferred_bios().
				693	*/
				694	if (bio->bi_rw & (REQ_FLUSH \| REQ_FUA)) {
				695	spin_lock_irqsave(&pool->lock, flags);
				696	bio_list_add(&pool->deferred_flush_bios, bio);
				697	spin_unlock_irqrestore(&pool->lock, flags);
				698	} else
				699	generic_make_request(bio);
				700	}
				701
/*
 * remap_to_origin() followed by issue().
 */
static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
{
	remap_to_origin(tc, bio);
	issue(tc, bio);
}
				707
				708	static void remap_and_issue(struct thin_c tc, struct bio bio,
				709	dm_block_t block)
				710	{
				711	remap(tc, bio, block);
				712	issue(tc, bio);
				713	}
				714
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	715	/*
				716	* wake_worker() is used when new work is queued and when pool_resume is
				717	* ready to continue deferred IO processing.
				718	*/
				719	static void wake_worker(struct pool *pool)
				720	{
				721	queue_work(pool->wq, &pool->worker);
				722	}
				723
				724	/*----------------------------------------------------------------*/
				725
				726	/*
				727	* Bio endio functions.
				728	*/
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	729	struct new_mapping {
				730	struct list_head list;
				731
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	732	unsigned quiesced:1;
				733	unsigned prepared:1;
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame^]	734	unsigned pass_discard:1;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	735
				736	struct thin_c *tc;
				737	dm_block_t virt_block;
				738	dm_block_t data_block;
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame^]	739	struct cell cell, cell2;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	740	int err;
				741
				742	/*
				743	* If the bio covers the whole area of a block then we can avoid
				744	* zeroing or copying. Instead this bio is hooked. The bio will
				745	* still be in the cell, so care has to be taken to avoid issuing
				746	* the bio twice.
				747	*/
				748	struct bio *bio;
				749	bio_end_io_t *saved_bi_end_io;
				750	};
				751
				752	static void __maybe_add_mapping(struct new_mapping *m)
				753	{
				754	struct pool *pool = m->tc->pool;
				755
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	756	if (m->quiesced && m->prepared) {
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	757	list_add(&m->list, &pool->prepared_mappings);
				758	wake_worker(pool);
				759	}
				760	}
				761
				762	static void copy_complete(int read_err, unsigned long write_err, void *context)
				763	{
				764	unsigned long flags;
				765	struct new_mapping *m = context;
				766	struct pool *pool = m->tc->pool;
				767
				768	m->err = read_err \|\| write_err ? -EIO : 0;
				769
				770	spin_lock_irqsave(&pool->lock, flags);
				771	m->prepared = 1;
				772	__maybe_add_mapping(m);
				773	spin_unlock_irqrestore(&pool->lock, flags);
				774	}
				775
				776	static void overwrite_endio(struct bio *bio, int err)
				777	{
				778	unsigned long flags;
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	779	struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
				780	struct new_mapping *m = h->overwrite_mapping;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	781	struct pool *pool = m->tc->pool;
				782
				783	m->err = err;
				784
				785	spin_lock_irqsave(&pool->lock, flags);
				786	m->prepared = 1;
				787	__maybe_add_mapping(m);
				788	spin_unlock_irqrestore(&pool->lock, flags);
				789	}
				790
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	791	/*----------------------------------------------------------------*/
				792
				793	/*
				794	* Workqueue.
				795	*/
				796
				797	/*
				798	* Prepared mapping jobs.
				799	*/
				800
				801	/*
				802	* This sends the bios in the cell back to the deferred_bios list.
				803	*/
				804	static void cell_defer(struct thin_c tc, struct cell cell,
				805	dm_block_t data_block)
				806	{
				807	struct pool *pool = tc->pool;
				808	unsigned long flags;
				809
				810	spin_lock_irqsave(&pool->lock, flags);
				811	cell_release(cell, &pool->deferred_bios);
				812	spin_unlock_irqrestore(&tc->pool->lock, flags);
				813
				814	wake_worker(pool);
				815	}
				816
				817	/*
				818	* Same as cell_defer above, except it omits one particular detainee,
				819	* a write bio that covers the block and has already been processed.
				820	*/
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	821	static void cell_defer_except(struct thin_c tc, struct cell cell)
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	822	{
				823	struct bio_list bios;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	824	struct pool *pool = tc->pool;
				825	unsigned long flags;
				826
				827	bio_list_init(&bios);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	828
				829	spin_lock_irqsave(&pool->lock, flags);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	830	cell_release_no_holder(cell, &pool->deferred_bios);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	831	spin_unlock_irqrestore(&pool->lock, flags);
				832
				833	wake_worker(pool);
				834	}
				835
				836	static void process_prepared_mapping(struct new_mapping *m)
				837	{
				838	struct thin_c *tc = m->tc;
				839	struct bio *bio;
				840	int r;
				841
				842	bio = m->bio;
				843	if (bio)
				844	bio->bi_end_io = m->saved_bi_end_io;
				845
				846	if (m->err) {
				847	cell_error(m->cell);
				848	return;
				849	}
				850
				851	/*
				852	* Commit the prepared block into the mapping btree.
				853	* Any I/O for this block arriving after this point will get
				854	* remapped to it directly.
				855	*/
				856	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
				857	if (r) {
				858	DMERR("dm_thin_insert_block() failed");
				859	cell_error(m->cell);
				860	return;
				861	}
				862
				863	/*
				864	* Release any bios held while the block was being provisioned.
				865	* If we are processing a write bio that completely covers the block,
				866	* we already processed it so can ignore it now when processing
				867	* the bios in the cell.
				868	*/
				869	if (bio) {
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	870	cell_defer_except(tc, m->cell);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	871	bio_endio(bio, 0);
				872	} else
				873	cell_defer(tc, m->cell, m->data_block);
				874
				875	list_del(&m->list);
				876	mempool_free(m, tc->pool->mapping_pool);
				877	}
				878
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame^]	879	static void process_prepared_discard(struct new_mapping *m)
				880	{
				881	int r;
				882	struct thin_c *tc = m->tc;
				883
				884	r = dm_thin_remove_block(tc->td, m->virt_block);
				885	if (r)
				886	DMERR("dm_thin_remove_block() failed");
				887
				888	/*
				889	* Pass the discard down to the underlying device?
				890	*/
				891	if (m->pass_discard)
				892	remap_and_issue(tc, m->bio, m->data_block);
				893	else
				894	bio_endio(m->bio, 0);
				895
				896	cell_defer_except(tc, m->cell);
				897	cell_defer_except(tc, m->cell2);
				898	mempool_free(m, tc->pool->mapping_pool);
				899	}
				900
				901	static void process_prepared(struct pool pool, struct list_head head,
				902	void (fn)(struct new_mapping ))
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	903	{
				904	unsigned long flags;
				905	struct list_head maps;
				906	struct new_mapping m, tmp;
				907
				908	INIT_LIST_HEAD(&maps);
				909	spin_lock_irqsave(&pool->lock, flags);
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame^]	910	list_splice_init(head, &maps);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	911	spin_unlock_irqrestore(&pool->lock, flags);
				912
				913	list_for_each_entry_safe(m, tmp, &maps, list)
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame^]	914	fn(m);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	915	}
				916
				917	/*
				918	* Deferred bio jobs.
				919	*/
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame^]	920	static int io_overlaps_block(struct pool pool, struct bio bio)
				921	{
				922	return !(bio->bi_sector & pool->offset_mask) &&
				923	(bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
				924
				925	}
				926
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	927	static int io_overwrites_block(struct pool pool, struct bio bio)
				928	{
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame^]	929	return (bio_data_dir(bio) == WRITE) &&
				930	io_overlaps_block(pool, bio);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	931	}
				932
				933	static void save_and_set_endio(struct bio bio, bio_end_io_t *save,
				934	bio_end_io_t *fn)
				935	{
				936	*save = bio->bi_end_io;
				937	bio->bi_end_io = fn;
				938	}
				939
				940	static int ensure_next_mapping(struct pool *pool)
				941	{
				942	if (pool->next_mapping)
				943	return 0;
				944
				945	pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);
				946
				947	return pool->next_mapping ? 0 : -ENOMEM;
				948	}
				949
				950	static struct new_mapping get_next_mapping(struct pool pool)
				951	{
				952	struct new_mapping *r = pool->next_mapping;
				953
				954	BUG_ON(!pool->next_mapping);
				955
				956	pool->next_mapping = NULL;
				957
				958	return r;
				959	}
				960
				961	static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	962	struct dm_dev *origin, dm_block_t data_origin,
				963	dm_block_t data_dest,
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	964	struct cell cell, struct bio bio)
				965	{
				966	int r;
				967	struct pool *pool = tc->pool;
				968	struct new_mapping *m = get_next_mapping(pool);
				969
				970	INIT_LIST_HEAD(&m->list);
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	971	m->quiesced = 0;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	972	m->prepared = 0;
				973	m->tc = tc;
				974	m->virt_block = virt_block;
				975	m->data_block = data_dest;
				976	m->cell = cell;
				977	m->err = 0;
				978	m->bio = NULL;
				979
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	980	if (!ds_add_work(&pool->shared_read_ds, &m->list))
				981	m->quiesced = 1;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	982
				983	/*
				984	* IO to pool_dev remaps to the pool target's data_dev.
				985	*
				986	* If the whole block of data is being overwritten, we can issue the
				987	* bio immediately. Otherwise we use kcopyd to clone the data first.
				988	*/
				989	if (io_overwrites_block(pool, bio)) {
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	990	struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
				991	h->overwrite_mapping = m;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	992	m->bio = bio;
				993	save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	994	remap_and_issue(tc, bio, data_dest);
				995	} else {
				996	struct dm_io_region from, to;
				997
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	998	from.bdev = origin->bdev;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	999	from.sector = data_origin * pool->sectors_per_block;
				1000	from.count = pool->sectors_per_block;
				1001
				1002	to.bdev = tc->pool_dev->bdev;
				1003	to.sector = data_dest * pool->sectors_per_block;
				1004	to.count = pool->sectors_per_block;
				1005
				1006	r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
				1007	0, copy_complete, m);
				1008	if (r < 0) {
				1009	mempool_free(m, pool->mapping_pool);
				1010	DMERR("dm_kcopyd_copy() failed");
				1011	cell_error(cell);
				1012	}
				1013	}
				1014	}
				1015
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	1016	static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
				1017	dm_block_t data_origin, dm_block_t data_dest,
				1018	struct cell cell, struct bio bio)
				1019	{
				1020	schedule_copy(tc, virt_block, tc->pool_dev,
				1021	data_origin, data_dest, cell, bio);
				1022	}
				1023
				1024	static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
				1025	dm_block_t data_dest,
				1026	struct cell cell, struct bio bio)
				1027	{
				1028	schedule_copy(tc, virt_block, tc->origin_dev,
				1029	virt_block, data_dest, cell, bio);
				1030	}
				1031
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1032	static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
				1033	dm_block_t data_block, struct cell *cell,
				1034	struct bio *bio)
				1035	{
				1036	struct pool *pool = tc->pool;
				1037	struct new_mapping *m = get_next_mapping(pool);
				1038
				1039	INIT_LIST_HEAD(&m->list);
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1040	m->quiesced = 1;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1041	m->prepared = 0;
				1042	m->tc = tc;
				1043	m->virt_block = virt_block;
				1044	m->data_block = data_block;
				1045	m->cell = cell;
				1046	m->err = 0;
				1047	m->bio = NULL;
				1048
				1049	/*
				1050	* If the whole block of data is being overwritten or we are not
				1051	* zeroing pre-existing data, we can issue the bio immediately.
				1052	* Otherwise we use kcopyd to zero the data first.
				1053	*/
				1054	if (!pool->zero_new_blocks)
				1055	process_prepared_mapping(m);
				1056
				1057	else if (io_overwrites_block(pool, bio)) {
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1058	struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
				1059	h->overwrite_mapping = m;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1060	m->bio = bio;
				1061	save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1062	remap_and_issue(tc, bio, data_block);
				1063
				1064	} else {
				1065	int r;
				1066	struct dm_io_region to;
				1067
				1068	to.bdev = tc->pool_dev->bdev;
				1069	to.sector = data_block * pool->sectors_per_block;
				1070	to.count = pool->sectors_per_block;
				1071
				1072	r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
				1073	if (r < 0) {
				1074	mempool_free(m, pool->mapping_pool);
				1075	DMERR("dm_kcopyd_zero() failed");
				1076	cell_error(cell);
				1077	}
				1078	}
				1079	}
				1080
				1081	static int alloc_data_block(struct thin_c tc, dm_block_t result)
				1082	{
				1083	int r;
				1084	dm_block_t free_blocks;
				1085	unsigned long flags;
				1086	struct pool *pool = tc->pool;
				1087
				1088	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
				1089	if (r)
				1090	return r;
				1091
				1092	if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
				1093	DMWARN("%s: reached low water mark, sending event.",
				1094	dm_device_name(pool->pool_md));
				1095	spin_lock_irqsave(&pool->lock, flags);
				1096	pool->low_water_triggered = 1;
				1097	spin_unlock_irqrestore(&pool->lock, flags);
				1098	dm_table_event(pool->ti->table);
				1099	}
				1100
				1101	if (!free_blocks) {
				1102	if (pool->no_free_space)
				1103	return -ENOSPC;
				1104	else {
				1105	/*
				1106	* Try to commit to see if that will free up some
				1107	* more space.
				1108	*/
				1109	r = dm_pool_commit_metadata(pool->pmd);
				1110	if (r) {
				1111	DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
				1112	__func__, r);
				1113	return r;
				1114	}
				1115
				1116	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
				1117	if (r)
				1118	return r;
				1119
				1120	/*
				1121	* If we still have no space we set a flag to avoid
				1122	* doing all this checking and return -ENOSPC.
				1123	*/
				1124	if (!free_blocks) {
				1125	DMWARN("%s: no free space available.",
				1126	dm_device_name(pool->pool_md));
				1127	spin_lock_irqsave(&pool->lock, flags);
				1128	pool->no_free_space = 1;
				1129	spin_unlock_irqrestore(&pool->lock, flags);
				1130	return -ENOSPC;
				1131	}
				1132	}
				1133	}
				1134
				1135	r = dm_pool_alloc_data_block(pool->pmd, result);
				1136	if (r)
				1137	return r;
				1138
				1139	return 0;
				1140	}
				1141
				1142	/*
				1143	* If we have run out of space, queue bios until the device is
				1144	* resumed, presumably after having been reloaded with more space.
				1145	*/
				1146	static void retry_on_resume(struct bio *bio)
				1147	{
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1148	struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
				1149	struct thin_c *tc = h->tc;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1150	struct pool *pool = tc->pool;
				1151	unsigned long flags;
				1152
				1153	spin_lock_irqsave(&pool->lock, flags);
				1154	bio_list_add(&pool->retry_on_resume_list, bio);
				1155	spin_unlock_irqrestore(&pool->lock, flags);
				1156	}
				1157
				1158	static void no_space(struct cell *cell)
				1159	{
				1160	struct bio *bio;
				1161	struct bio_list bios;
				1162
				1163	bio_list_init(&bios);
				1164	cell_release(cell, &bios);
				1165
				1166	while ((bio = bio_list_pop(&bios)))
				1167	retry_on_resume(bio);
				1168	}
				1169
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame^]	1170	static void process_discard(struct thin_c tc, struct bio bio)
				1171	{
				1172	int r;
				1173	struct pool *pool = tc->pool;
				1174	struct cell cell, cell2;
				1175	struct cell_key key, key2;
				1176	dm_block_t block = get_bio_block(tc, bio);
				1177	struct dm_thin_lookup_result lookup_result;
				1178	struct new_mapping *m;
				1179
				1180	build_virtual_key(tc->td, block, &key);
				1181	if (bio_detain(tc->pool->prison, &key, bio, &cell))
				1182	return;
				1183
				1184	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
				1185	switch (r) {
				1186	case 0:
				1187	/*
				1188	* Check nobody is fiddling with this pool block. This can
				1189	* happen if someone's in the process of breaking sharing
				1190	* on this block.
				1191	*/
				1192	build_data_key(tc->td, lookup_result.block, &key2);
				1193	if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
				1194	cell_release_singleton(cell, bio);
				1195	break;
				1196	}
				1197
				1198	if (io_overlaps_block(pool, bio)) {
				1199	/*
				1200	* IO may still be going to the destination block. We must
				1201	* quiesce before we can do the removal.
				1202	*/
				1203	m = get_next_mapping(pool);
				1204	m->tc = tc;
				1205	m->pass_discard = !lookup_result.shared;
				1206	m->virt_block = block;
				1207	m->data_block = lookup_result.block;
				1208	m->cell = cell;
				1209	m->cell2 = cell2;
				1210	m->err = 0;
				1211	m->bio = bio;
				1212
				1213	if (!ds_add_work(&pool->all_io_ds, &m->list)) {
				1214	list_add(&m->list, &pool->prepared_discards);
				1215	wake_worker(pool);
				1216	}
				1217	} else {
				1218	/*
				1219	* This path is hit if people are ignoring
				1220	* limits->discard_granularity. It ignores any
				1221	* part of the discard that is in a subsequent
				1222	* block.
				1223	*/
				1224	sector_t offset = bio->bi_sector - (block << pool->block_shift);
				1225	unsigned remaining = (pool->sectors_per_block - offset) << 9;
				1226	bio->bi_size = min(bio->bi_size, remaining);
				1227
				1228	cell_release_singleton(cell, bio);
				1229	cell_release_singleton(cell2, bio);
				1230	remap_and_issue(tc, bio, lookup_result.block);
				1231	}
				1232	break;
				1233
				1234	case -ENODATA:
				1235	/*
				1236	* It isn't provisioned, just forget it.
				1237	*/
				1238	cell_release_singleton(cell, bio);
				1239	bio_endio(bio, 0);
				1240	break;
				1241
				1242	default:
				1243	DMERR("discard: find block unexpectedly returned %d", r);
				1244	cell_release_singleton(cell, bio);
				1245	bio_io_error(bio);
				1246	break;
				1247	}
				1248	}
				1249
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1250	static void break_sharing(struct thin_c tc, struct bio bio, dm_block_t block,
				1251	struct cell_key *key,
				1252	struct dm_thin_lookup_result *lookup_result,
				1253	struct cell *cell)
				1254	{
				1255	int r;
				1256	dm_block_t data_block;
				1257
				1258	r = alloc_data_block(tc, &data_block);
				1259	switch (r) {
				1260	case 0:
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	1261	schedule_internal_copy(tc, block, lookup_result->block,
				1262	data_block, cell, bio);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1263	break;
				1264
				1265	case -ENOSPC:
				1266	no_space(cell);
				1267	break;
				1268
				1269	default:
				1270	DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
				1271	cell_error(cell);
				1272	break;
				1273	}
				1274	}
				1275
				1276	static void process_shared_bio(struct thin_c tc, struct bio bio,
				1277	dm_block_t block,
				1278	struct dm_thin_lookup_result *lookup_result)
				1279	{
				1280	struct cell *cell;
				1281	struct pool *pool = tc->pool;
				1282	struct cell_key key;
				1283
				1284	/*
				1285	* If cell is already occupied, then sharing is already in the process
				1286	* of being broken so we have nothing further to do here.
				1287	*/
				1288	build_data_key(tc->td, lookup_result->block, &key);
				1289	if (bio_detain(pool->prison, &key, bio, &cell))
				1290	return;
				1291
				1292	if (bio_data_dir(bio) == WRITE)
				1293	break_sharing(tc, bio, block, &key, lookup_result, cell);
				1294	else {
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1295	struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1296
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1297	h->shared_read_entry = ds_inc(&pool->shared_read_ds);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1298
				1299	cell_release_singleton(cell, bio);
				1300	remap_and_issue(tc, bio, lookup_result->block);
				1301	}
				1302	}
				1303
				1304	static void provision_block(struct thin_c tc, struct bio bio, dm_block_t block,
				1305	struct cell *cell)
				1306	{
				1307	int r;
				1308	dm_block_t data_block;
				1309
				1310	/*
				1311	* Remap empty bios (flushes) immediately, without provisioning.
				1312	*/
				1313	if (!bio->bi_size) {
				1314	cell_release_singleton(cell, bio);
				1315	remap_and_issue(tc, bio, 0);
				1316	return;
				1317	}
				1318
				1319	/*
				1320	* Fill read bios with zeroes and complete them immediately.
				1321	*/
				1322	if (bio_data_dir(bio) == READ) {
				1323	zero_fill_bio(bio);
				1324	cell_release_singleton(cell, bio);
				1325	bio_endio(bio, 0);
				1326	return;
				1327	}
				1328
				1329	r = alloc_data_block(tc, &data_block);
				1330	switch (r) {
				1331	case 0:
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	1332	if (tc->origin_dev)
				1333	schedule_external_copy(tc, block, data_block, cell, bio);
				1334	else
				1335	schedule_zero(tc, block, data_block, cell, bio);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1336	break;
				1337
				1338	case -ENOSPC:
				1339	no_space(cell);
				1340	break;
				1341
				1342	default:
				1343	DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
				1344	cell_error(cell);
				1345	break;
				1346	}
				1347	}
				1348
				1349	static void process_bio(struct thin_c tc, struct bio bio)
				1350	{
				1351	int r;
				1352	dm_block_t block = get_bio_block(tc, bio);
				1353	struct cell *cell;
				1354	struct cell_key key;
				1355	struct dm_thin_lookup_result lookup_result;
				1356
				1357	/*
				1358	* If cell is already occupied, then the block is already
				1359	* being provisioned so we have nothing further to do here.
				1360	*/
				1361	build_virtual_key(tc->td, block, &key);
				1362	if (bio_detain(tc->pool->prison, &key, bio, &cell))
				1363	return;
				1364
				1365	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
				1366	switch (r) {
				1367	case 0:
				1368	/*
				1369	* We can release this cell now. This thread is the only
				1370	* one that puts bios into a cell, and we know there were
				1371	* no preceding bios.
				1372	*/
				1373	/*
				1374	* TODO: this will probably have to change when discard goes
				1375	* back in.
				1376	*/
				1377	cell_release_singleton(cell, bio);
				1378
				1379	if (lookup_result.shared)
				1380	process_shared_bio(tc, bio, block, &lookup_result);
				1381	else
				1382	remap_and_issue(tc, bio, lookup_result.block);
				1383	break;
				1384
				1385	case -ENODATA:
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	1386	if (bio_data_dir(bio) == READ && tc->origin_dev) {
				1387	cell_release_singleton(cell, bio);
				1388	remap_to_origin_and_issue(tc, bio);
				1389	} else
				1390	provision_block(tc, bio, block, cell);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1391	break;
				1392
				1393	default:
				1394	DMERR("dm_thin_find_block() failed, error = %d", r);
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame^]	1395	cell_release_singleton(cell, bio);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1396	bio_io_error(bio);
				1397	break;
				1398	}
				1399	}
				1400
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	1401	static int need_commit_due_to_time(struct pool *pool)
				1402	{
				1403	return jiffies < pool->last_commit_jiffies \|\|
				1404	jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
				1405	}
				1406
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1407	static void process_deferred_bios(struct pool *pool)
				1408	{
				1409	unsigned long flags;
				1410	struct bio *bio;
				1411	struct bio_list bios;
				1412	int r;
				1413
				1414	bio_list_init(&bios);
				1415
				1416	spin_lock_irqsave(&pool->lock, flags);
				1417	bio_list_merge(&bios, &pool->deferred_bios);
				1418	bio_list_init(&pool->deferred_bios);
				1419	spin_unlock_irqrestore(&pool->lock, flags);
				1420
				1421	while ((bio = bio_list_pop(&bios))) {
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1422	struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
				1423	struct thin_c *tc = h->tc;
				1424
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1425	/*
				1426	* If we've got no free new_mapping structs, and processing
				1427	* this bio might require one, we pause until there are some
				1428	* prepared mappings to process.
				1429	*/
				1430	if (ensure_next_mapping(pool)) {
				1431	spin_lock_irqsave(&pool->lock, flags);
				1432	bio_list_merge(&pool->deferred_bios, &bios);
				1433	spin_unlock_irqrestore(&pool->lock, flags);
				1434
				1435	break;
				1436	}
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame^]	1437
				1438	if (bio->bi_rw & REQ_DISCARD)
				1439	process_discard(tc, bio);
				1440	else
				1441	process_bio(tc, bio);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1442	}
				1443
				1444	/*
				1445	* If there are any deferred flush bios, we must commit
				1446	* the metadata before issuing them.
				1447	*/
				1448	bio_list_init(&bios);
				1449	spin_lock_irqsave(&pool->lock, flags);
				1450	bio_list_merge(&bios, &pool->deferred_flush_bios);
				1451	bio_list_init(&pool->deferred_flush_bios);
				1452	spin_unlock_irqrestore(&pool->lock, flags);
				1453
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	1454	if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1455	return;
				1456
				1457	r = dm_pool_commit_metadata(pool->pmd);
				1458	if (r) {
				1459	DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
				1460	__func__, r);
				1461	while ((bio = bio_list_pop(&bios)))
				1462	bio_io_error(bio);
				1463	return;
				1464	}
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	1465	pool->last_commit_jiffies = jiffies;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1466
				1467	while ((bio = bio_list_pop(&bios)))
				1468	generic_make_request(bio);
				1469	}
				1470
				1471	static void do_worker(struct work_struct *ws)
				1472	{
				1473	struct pool *pool = container_of(ws, struct pool, worker);
				1474
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame^]	1475	process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping);
				1476	process_prepared(pool, &pool->prepared_discards, process_prepared_discard);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1477	process_deferred_bios(pool);
				1478	}
				1479
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	1480	/*
				1481	* We want to commit periodically so that not too much
				1482	* unwritten data builds up.
				1483	*/
				1484	static void do_waker(struct work_struct *ws)
				1485	{
				1486	struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
				1487	wake_worker(pool);
				1488	queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
				1489	}
				1490
/*----------------------------------------------------------------*/
				1492
				1493	/*
				1494	* Mapping functions.
				1495	*/
				1496
				1497	/*
				1498	* Called only while mapping a thin bio to hand it over to the workqueue.
				1499	*/
				1500	static void thin_defer_bio(struct thin_c tc, struct bio bio)
				1501	{
				1502	unsigned long flags;
				1503	struct pool *pool = tc->pool;
				1504
				1505	spin_lock_irqsave(&pool->lock, flags);
				1506	bio_list_add(&pool->deferred_bios, bio);
				1507	spin_unlock_irqrestore(&pool->lock, flags);
				1508
				1509	wake_worker(pool);
				1510	}
				1511
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1512	static struct endio_hook thin_hook_bio(struct thin_c tc, struct bio *bio)
				1513	{
				1514	struct pool *pool = tc->pool;
				1515	struct endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
				1516
				1517	h->tc = tc;
				1518	h->shared_read_entry = NULL;
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame^]	1519	h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds);
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1520	h->overwrite_mapping = NULL;
				1521
				1522	return h;
				1523	}
				1524
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1525	/*
				1526	* Non-blocking function called from the thin target's map function.
				1527	*/
				1528	static int thin_bio_map(struct dm_target ti, struct bio bio,
				1529	union map_info *map_context)
				1530	{
				1531	int r;
				1532	struct thin_c *tc = ti->private;
				1533	dm_block_t block = get_bio_block(tc, bio);
				1534	struct dm_thin_device *td = tc->td;
				1535	struct dm_thin_lookup_result result;
				1536
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1537	map_context->ptr = thin_hook_bio(tc, bio);
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame^]	1538	if (bio->bi_rw & (REQ_DISCARD \| REQ_FLUSH \| REQ_FUA)) {
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1539	thin_defer_bio(tc, bio);
				1540	return DM_MAPIO_SUBMITTED;
				1541	}
				1542
				1543	r = dm_thin_find_block(td, block, 0, &result);
				1544
				1545	/*
				1546	* Note that we defer readahead too.
				1547	*/
				1548	switch (r) {
				1549	case 0:
				1550	if (unlikely(result.shared)) {
				1551	/*
				1552	* We have a race condition here between the
				1553	* result.shared value returned by the lookup and
				1554	* snapshot creation, which may cause new
				1555	* sharing.
				1556	*
				1557	* To avoid this always quiesce the origin before
				1558	* taking the snap. You want to do this anyway to
				1559	* ensure a consistent application view
				1560	* (i.e. lockfs).
				1561	*
				1562	* More distant ancestors are irrelevant. The
				1563	* shared flag will be set in their case.
				1564	*/
				1565	thin_defer_bio(tc, bio);
				1566	r = DM_MAPIO_SUBMITTED;
				1567	} else {
				1568	remap(tc, bio, result.block);
				1569	r = DM_MAPIO_REMAPPED;
				1570	}
				1571	break;
				1572
				1573	case -ENODATA:
				1574	/*
				1575	* In future, the failed dm_thin_find_block above could
				1576	* provide the hint to load the metadata into cache.
				1577	*/
				1578	case -EWOULDBLOCK:
				1579	thin_defer_bio(tc, bio);
				1580	r = DM_MAPIO_SUBMITTED;
				1581	break;
				1582	}
				1583
				1584	return r;
				1585	}
				1586
				1587	static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
				1588	{
				1589	int r;
				1590	unsigned long flags;
				1591	struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
				1592
				1593	spin_lock_irqsave(&pt->pool->lock, flags);
				1594	r = !bio_list_empty(&pt->pool->retry_on_resume_list);
				1595	spin_unlock_irqrestore(&pt->pool->lock, flags);
				1596
				1597	if (!r) {
				1598	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
				1599	r = bdi_congested(&q->backing_dev_info, bdi_bits);
				1600	}
				1601
				1602	return r;
				1603	}
				1604
				1605	static void __requeue_bios(struct pool *pool)
				1606	{
				1607	bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list);
				1608	bio_list_init(&pool->retry_on_resume_list);
				1609	}
				1610
/*----------------------------------------------------------------
 * Binding of control targets to a pool object
 *--------------------------------------------------------------*/
				1614	static int bind_control_target(struct pool pool, struct dm_target ti)
				1615	{
				1616	struct pool_c *pt = ti->private;
				1617
				1618	pool->ti = ti;
				1619	pool->low_water_blocks = pt->low_water_blocks;
				1620	pool->zero_new_blocks = pt->zero_new_blocks;
				1621
				1622	return 0;
				1623	}
				1624
				1625	static void unbind_control_target(struct pool pool, struct dm_target ti)
				1626	{
				1627	if (pool->ti == ti)
				1628	pool->ti = NULL;
				1629	}
				1630
/*----------------------------------------------------------------
 * Pool creation
 *--------------------------------------------------------------*/
				1634	static void __pool_destroy(struct pool *pool)
				1635	{
				1636	__pool_table_remove(pool);
				1637
				1638	if (dm_pool_metadata_close(pool->pmd) < 0)
				1639	DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
				1640
				1641	prison_destroy(pool->prison);
				1642	dm_kcopyd_client_destroy(pool->copier);
				1643
				1644	if (pool->wq)
				1645	destroy_workqueue(pool->wq);
				1646
				1647	if (pool->next_mapping)
				1648	mempool_free(pool->next_mapping, pool->mapping_pool);
				1649	mempool_destroy(pool->mapping_pool);
				1650	mempool_destroy(pool->endio_hook_pool);
				1651	kfree(pool);
				1652	}
				1653
				1654	static struct pool pool_create(struct mapped_device pool_md,
				1655	struct block_device *metadata_dev,
				1656	unsigned long block_size, char **error)
				1657	{
				1658	int r;
				1659	void *err_p;
				1660	struct pool *pool;
				1661	struct dm_pool_metadata *pmd;
				1662
				1663	pmd = dm_pool_metadata_open(metadata_dev, block_size);
				1664	if (IS_ERR(pmd)) {
				1665	*error = "Error creating metadata object";
				1666	return (struct pool *)pmd;
				1667	}
				1668
				1669	pool = kmalloc(sizeof(*pool), GFP_KERNEL);
				1670	if (!pool) {
				1671	*error = "Error allocating memory for pool";
				1672	err_p = ERR_PTR(-ENOMEM);
				1673	goto bad_pool;
				1674	}
				1675
				1676	pool->pmd = pmd;
				1677	pool->sectors_per_block = block_size;
				1678	pool->block_shift = ffs(block_size) - 1;
				1679	pool->offset_mask = block_size - 1;
				1680	pool->low_water_blocks = 0;
				1681	pool->zero_new_blocks = 1;
				1682	pool->prison = prison_create(PRISON_CELLS);
				1683	if (!pool->prison) {
				1684	*error = "Error creating pool's bio prison";
				1685	err_p = ERR_PTR(-ENOMEM);
				1686	goto bad_prison;
				1687	}
				1688
				1689	pool->copier = dm_kcopyd_client_create();
				1690	if (IS_ERR(pool->copier)) {
				1691	r = PTR_ERR(pool->copier);
				1692	*error = "Error creating pool's kcopyd client";
				1693	err_p = ERR_PTR(r);
				1694	goto bad_kcopyd_client;
				1695	}
				1696
				1697	/*
				1698	* Create singlethreaded workqueue that will service all devices
				1699	* that use this metadata.
				1700	*/
				1701	pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
				1702	if (!pool->wq) {
				1703	*error = "Error creating pool's workqueue";
				1704	err_p = ERR_PTR(-ENOMEM);
				1705	goto bad_wq;
				1706	}
				1707
				1708	INIT_WORK(&pool->worker, do_worker);
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	1709	INIT_DELAYED_WORK(&pool->waker, do_waker);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1710	spin_lock_init(&pool->lock);
				1711	bio_list_init(&pool->deferred_bios);
				1712	bio_list_init(&pool->deferred_flush_bios);
				1713	INIT_LIST_HEAD(&pool->prepared_mappings);
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame^]	1714	INIT_LIST_HEAD(&pool->prepared_discards);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1715	pool->low_water_triggered = 0;
				1716	pool->no_free_space = 0;
				1717	bio_list_init(&pool->retry_on_resume_list);
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1718	ds_init(&pool->shared_read_ds);
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame^]	1719	ds_init(&pool->all_io_ds);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1720
				1721	pool->next_mapping = NULL;
				1722	pool->mapping_pool =
				1723	mempool_create_kmalloc_pool(MAPPING_POOL_SIZE, sizeof(struct new_mapping));
				1724	if (!pool->mapping_pool) {
				1725	*error = "Error creating pool's mapping mempool";
				1726	err_p = ERR_PTR(-ENOMEM);
				1727	goto bad_mapping_pool;
				1728	}
				1729
				1730	pool->endio_hook_pool =
				1731	mempool_create_kmalloc_pool(ENDIO_HOOK_POOL_SIZE, sizeof(struct endio_hook));
				1732	if (!pool->endio_hook_pool) {
				1733	*error = "Error creating pool's endio_hook mempool";
				1734	err_p = ERR_PTR(-ENOMEM);
				1735	goto bad_endio_hook_pool;
				1736	}
				1737	pool->ref_count = 1;
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	1738	pool->last_commit_jiffies = jiffies;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1739	pool->pool_md = pool_md;
				1740	pool->md_dev = metadata_dev;
				1741	__pool_table_insert(pool);
				1742
				1743	return pool;
				1744
				1745	bad_endio_hook_pool:
				1746	mempool_destroy(pool->mapping_pool);
				1747	bad_mapping_pool:
				1748	destroy_workqueue(pool->wq);
				1749	bad_wq:
				1750	dm_kcopyd_client_destroy(pool->copier);
				1751	bad_kcopyd_client:
				1752	prison_destroy(pool->prison);
				1753	bad_prison:
				1754	kfree(pool);
				1755	bad_pool:
				1756	if (dm_pool_metadata_close(pmd))
				1757	DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
				1758
				1759	return err_p;
				1760	}
				1761
				1762	static void __pool_inc(struct pool *pool)
				1763	{
				1764	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
				1765	pool->ref_count++;
				1766	}
				1767
				1768	static void __pool_dec(struct pool *pool)
				1769	{
				1770	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
				1771	BUG_ON(!pool->ref_count);
				1772	if (!--pool->ref_count)
				1773	__pool_destroy(pool);
				1774	}
				1775
				1776	static struct pool __pool_find(struct mapped_device pool_md,
				1777	struct block_device *metadata_dev,
				1778	unsigned long block_size, char **error)
				1779	{
				1780	struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
				1781
				1782	if (pool) {
				1783	if (pool->pool_md != pool_md)
				1784	return ERR_PTR(-EBUSY);
				1785	__pool_inc(pool);
				1786
				1787	} else {
				1788	pool = __pool_table_lookup(pool_md);
				1789	if (pool) {
				1790	if (pool->md_dev != metadata_dev)
				1791	return ERR_PTR(-EINVAL);
				1792	__pool_inc(pool);
				1793
				1794	} else
				1795	pool = pool_create(pool_md, metadata_dev, block_size, error);
				1796	}
				1797
				1798	return pool;
				1799	}
				1800
/*----------------------------------------------------------------
 * Pool target methods
 *--------------------------------------------------------------*/
				1804	static void pool_dtr(struct dm_target *ti)
				1805	{
				1806	struct pool_c *pt = ti->private;
				1807
				1808	mutex_lock(&dm_thin_pool_table.mutex);
				1809
				1810	unbind_control_target(pt->pool, ti);
				1811	__pool_dec(pt->pool);
				1812	dm_put_device(ti, pt->metadata_dev);
				1813	dm_put_device(ti, pt->data_dev);
				1814	kfree(pt);
				1815
				1816	mutex_unlock(&dm_thin_pool_table.mutex);
				1817	}
				1818
				1819	struct pool_features {
				1820	unsigned zero_new_blocks:1;
				1821	};
				1822
				1823	static int parse_pool_features(struct dm_arg_set as, struct pool_features pf,
				1824	struct dm_target *ti)
				1825	{
				1826	int r;
				1827	unsigned argc;
				1828	const char *arg_name;
				1829
				1830	static struct dm_arg _args[] = {
				1831	{0, 1, "Invalid number of pool feature arguments"},
				1832	};
				1833
				1834	/*
				1835	* No feature arguments supplied.
				1836	*/
				1837	if (!as->argc)
				1838	return 0;
				1839
				1840	r = dm_read_arg_group(_args, as, &argc, &ti->error);
				1841	if (r)
				1842	return -EINVAL;
				1843
				1844	while (argc && !r) {
				1845	arg_name = dm_shift_arg(as);
				1846	argc--;
				1847
				1848	if (!strcasecmp(arg_name, "skip_block_zeroing")) {
				1849	pf->zero_new_blocks = 0;
				1850	continue;
				1851	}
				1852
				1853	ti->error = "Unrecognised pool feature requested";
				1854	r = -EINVAL;
				1855	}
				1856
				1857	return r;
				1858	}
				1859
				1860	/*
				1861	* thin-pool <metadata dev> <data dev>
				1862	* <data block size (sectors)>
				1863	* <low water mark (blocks)>
				1864	* [<#feature args> [<arg>]*]
				1865	*
				1866	* Optional feature arguments are:
				1867	* skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
				1868	*/
				1869	static int pool_ctr(struct dm_target ti, unsigned argc, char *argv)
				1870	{
				1871	int r;
				1872	struct pool_c *pt;
				1873	struct pool *pool;
				1874	struct pool_features pf;
				1875	struct dm_arg_set as;
				1876	struct dm_dev *data_dev;
				1877	unsigned long block_size;
				1878	dm_block_t low_water_blocks;
				1879	struct dm_dev *metadata_dev;
				1880	sector_t metadata_dev_size;
Mike Snitzer	c4a69ec	2012-03-28 18:41:28 +0100	[diff] [blame]	1881	char b[BDEVNAME_SIZE];
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1882
				1883	/*
				1884	* FIXME Remove validation from scope of lock.
				1885	*/
				1886	mutex_lock(&dm_thin_pool_table.mutex);
				1887
				1888	if (argc < 4) {
				1889	ti->error = "Invalid argument count";
				1890	r = -EINVAL;
				1891	goto out_unlock;
				1892	}
				1893	as.argc = argc;
				1894	as.argv = argv;
				1895
				1896	r = dm_get_device(ti, argv[0], FMODE_READ \| FMODE_WRITE, &metadata_dev);
				1897	if (r) {
				1898	ti->error = "Error opening metadata block device";
				1899	goto out_unlock;
				1900	}
				1901
				1902	metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
Mike Snitzer	c4a69ec	2012-03-28 18:41:28 +0100	[diff] [blame]	1903	if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
				1904	DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
				1905	bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1906
				1907	r = dm_get_device(ti, argv[1], FMODE_READ \| FMODE_WRITE, &data_dev);
				1908	if (r) {
				1909	ti->error = "Error getting data device";
				1910	goto out_metadata;
				1911	}
				1912
				1913	if (kstrtoul(argv[2], 10, &block_size) \|\| !block_size \|\|
				1914	block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS \|\|
				1915	block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS \|\|
				1916	!is_power_of_2(block_size)) {
				1917	ti->error = "Invalid block size";
				1918	r = -EINVAL;
				1919	goto out;
				1920	}
				1921
				1922	if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
				1923	ti->error = "Invalid low water mark";
				1924	r = -EINVAL;
				1925	goto out;
				1926	}
				1927
				1928	/*
				1929	* Set default pool features.
				1930	*/
				1931	memset(&pf, 0, sizeof(pf));
				1932	pf.zero_new_blocks = 1;
				1933
				1934	dm_consume_args(&as, 4);
				1935	r = parse_pool_features(&as, &pf, ti);
				1936	if (r)
				1937	goto out;
				1938
				1939	pt = kzalloc(sizeof(*pt), GFP_KERNEL);
				1940	if (!pt) {
				1941	r = -ENOMEM;
				1942	goto out;
				1943	}
				1944
				1945	pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
				1946	block_size, &ti->error);
				1947	if (IS_ERR(pool)) {
				1948	r = PTR_ERR(pool);
				1949	goto out_free_pt;
				1950	}
				1951
				1952	pt->pool = pool;
				1953	pt->ti = ti;
				1954	pt->metadata_dev = metadata_dev;
				1955	pt->data_dev = data_dev;
				1956	pt->low_water_blocks = low_water_blocks;
				1957	pt->zero_new_blocks = pf.zero_new_blocks;
				1958	ti->num_flush_requests = 1;
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame^]	1959	ti->num_discard_requests = 1;
				1960	ti->discards_supported = 1;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1961	ti->private = pt;
				1962
				1963	pt->callbacks.congested_fn = pool_is_congested;
				1964	dm_table_add_target_callbacks(ti->table, &pt->callbacks);
				1965
				1966	mutex_unlock(&dm_thin_pool_table.mutex);
				1967
				1968	return 0;
				1969
				1970	out_free_pt:
				1971	kfree(pt);
				1972	out:
				1973	dm_put_device(ti, data_dev);
				1974	out_metadata:
				1975	dm_put_device(ti, metadata_dev);
				1976	out_unlock:
				1977	mutex_unlock(&dm_thin_pool_table.mutex);
				1978
				1979	return r;
				1980	}
				1981
				1982	static int pool_map(struct dm_target ti, struct bio bio,
				1983	union map_info *map_context)
				1984	{
				1985	int r;
				1986	struct pool_c *pt = ti->private;
				1987	struct pool *pool = pt->pool;
				1988	unsigned long flags;
				1989
				1990	/*
				1991	* As this is a singleton target, ti->begin is always zero.
				1992	*/
				1993	spin_lock_irqsave(&pool->lock, flags);
				1994	bio->bi_bdev = pt->data_dev->bdev;
				1995	r = DM_MAPIO_REMAPPED;
				1996	spin_unlock_irqrestore(&pool->lock, flags);
				1997
				1998	return r;
				1999	}
				2000
				2001	/*
				2002	* Retrieves the number of blocks of the data device from
				2003	* the superblock and compares it to the actual device size,
				2004	* thus resizing the data device in case it has grown.
				2005	*
				2006	* This both copes with opening preallocated data devices in the ctr
				2007	* being followed by a resume
				2008	* -and-
				2009	* calling the resume method individually after userspace has
				2010	* grown the data device in reaction to a table event.
				2011	*/
				2012	static int pool_preresume(struct dm_target *ti)
				2013	{
				2014	int r;
				2015	struct pool_c *pt = ti->private;
				2016	struct pool *pool = pt->pool;
				2017	dm_block_t data_size, sb_data_size;
				2018
				2019	/*
				2020	* Take control of the pool object.
				2021	*/
				2022	r = bind_control_target(pool, ti);
				2023	if (r)
				2024	return r;
				2025
				2026	data_size = ti->len >> pool->block_shift;
				2027	r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
				2028	if (r) {
				2029	DMERR("failed to retrieve data device size");
				2030	return r;
				2031	}
				2032
				2033	if (data_size < sb_data_size) {
				2034	DMERR("pool target too small, is %llu blocks (expected %llu)",
				2035	data_size, sb_data_size);
				2036	return -EINVAL;
				2037
				2038	} else if (data_size > sb_data_size) {
				2039	r = dm_pool_resize_data_dev(pool->pmd, data_size);
				2040	if (r) {
				2041	DMERR("failed to resize data device");
				2042	return r;
				2043	}
				2044
				2045	r = dm_pool_commit_metadata(pool->pmd);
				2046	if (r) {
				2047	DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
				2048	__func__, r);
				2049	return r;
				2050	}
				2051	}
				2052
				2053	return 0;
				2054	}
				2055
				2056	static void pool_resume(struct dm_target *ti)
				2057	{
				2058	struct pool_c *pt = ti->private;
				2059	struct pool *pool = pt->pool;
				2060	unsigned long flags;
				2061
				2062	spin_lock_irqsave(&pool->lock, flags);
				2063	pool->low_water_triggered = 0;
				2064	pool->no_free_space = 0;
				2065	__requeue_bios(pool);
				2066	spin_unlock_irqrestore(&pool->lock, flags);
				2067
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	2068	do_waker(&pool->waker.work);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2069	}
				2070
				2071	static void pool_postsuspend(struct dm_target *ti)
				2072	{
				2073	int r;
				2074	struct pool_c *pt = ti->private;
				2075	struct pool *pool = pt->pool;
				2076
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	2077	cancel_delayed_work(&pool->waker);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2078	flush_workqueue(pool->wq);
				2079
				2080	r = dm_pool_commit_metadata(pool->pmd);
				2081	if (r < 0) {
				2082	DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
				2083	__func__, r);
				2084	/* FIXME: invalidate device? error the next FUA or FLUSH bio ?*/
				2085	}
				2086	}
				2087
				2088	static int check_arg_count(unsigned argc, unsigned args_required)
				2089	{
				2090	if (argc != args_required) {
				2091	DMWARN("Message received with %u arguments instead of %u.",
				2092	argc, args_required);
				2093	return -EINVAL;
				2094	}
				2095
				2096	return 0;
				2097	}
				2098
				2099	static int read_dev_id(char arg, dm_thin_id dev_id, int warning)
				2100	{
				2101	if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
				2102	*dev_id <= MAX_DEV_ID)
				2103	return 0;
				2104
				2105	if (warning)
				2106	DMWARN("Message received with invalid device id: %s", arg);
				2107
				2108	return -EINVAL;
				2109	}
				2110
				2111	static int process_create_thin_mesg(unsigned argc, char *argv, struct pool pool)
				2112	{
				2113	dm_thin_id dev_id;
				2114	int r;
				2115
				2116	r = check_arg_count(argc, 2);
				2117	if (r)
				2118	return r;
				2119
				2120	r = read_dev_id(argv[1], &dev_id, 1);
				2121	if (r)
				2122	return r;
				2123
				2124	r = dm_pool_create_thin(pool->pmd, dev_id);
				2125	if (r) {
				2126	DMWARN("Creation of new thinly-provisioned device with id %s failed.",
				2127	argv[1]);
				2128	return r;
				2129	}
				2130
				2131	return 0;
				2132	}
				2133
				2134	static int process_create_snap_mesg(unsigned argc, char *argv, struct pool pool)
				2135	{
				2136	dm_thin_id dev_id;
				2137	dm_thin_id origin_dev_id;
				2138	int r;
				2139
				2140	r = check_arg_count(argc, 3);
				2141	if (r)
				2142	return r;
				2143
				2144	r = read_dev_id(argv[1], &dev_id, 1);
				2145	if (r)
				2146	return r;
				2147
				2148	r = read_dev_id(argv[2], &origin_dev_id, 1);
				2149	if (r)
				2150	return r;
				2151
				2152	r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
				2153	if (r) {
				2154	DMWARN("Creation of new snapshot %s of device %s failed.",
				2155	argv[1], argv[2]);
				2156	return r;
				2157	}
				2158
				2159	return 0;
				2160	}
				2161
				2162	static int process_delete_mesg(unsigned argc, char *argv, struct pool pool)
				2163	{
				2164	dm_thin_id dev_id;
				2165	int r;
				2166
				2167	r = check_arg_count(argc, 2);
				2168	if (r)
				2169	return r;
				2170
				2171	r = read_dev_id(argv[1], &dev_id, 1);
				2172	if (r)
				2173	return r;
				2174
				2175	r = dm_pool_delete_thin_device(pool->pmd, dev_id);
				2176	if (r)
				2177	DMWARN("Deletion of thin device %s failed.", argv[1]);
				2178
				2179	return r;
				2180	}
				2181
				2182	static int process_set_transaction_id_mesg(unsigned argc, char *argv, struct pool pool)
				2183	{
				2184	dm_thin_id old_id, new_id;
				2185	int r;
				2186
				2187	r = check_arg_count(argc, 3);
				2188	if (r)
				2189	return r;
				2190
				2191	if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
				2192	DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
				2193	return -EINVAL;
				2194	}
				2195
				2196	if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
				2197	DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
				2198	return -EINVAL;
				2199	}
				2200
				2201	r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
				2202	if (r) {
				2203	DMWARN("Failed to change transaction id from %s to %s.",
				2204	argv[1], argv[2]);
				2205	return r;
				2206	}
				2207
				2208	return 0;
				2209	}
				2210
				2211	/*
				2212	* Messages supported:
				2213	* create_thin <dev_id>
				2214	* create_snap <dev_id> <origin_id>
				2215	* delete <dev_id>
				2216	* trim <dev_id> <new_size_in_sectors>
				2217	* set_transaction_id <current_trans_id> <new_trans_id>
				2218	*/
				2219	static int pool_message(struct dm_target ti, unsigned argc, char *argv)
				2220	{
				2221	int r = -EINVAL;
				2222	struct pool_c *pt = ti->private;
				2223	struct pool *pool = pt->pool;
				2224
				2225	if (!strcasecmp(argv[0], "create_thin"))
				2226	r = process_create_thin_mesg(argc, argv, pool);
				2227
				2228	else if (!strcasecmp(argv[0], "create_snap"))
				2229	r = process_create_snap_mesg(argc, argv, pool);
				2230
				2231	else if (!strcasecmp(argv[0], "delete"))
				2232	r = process_delete_mesg(argc, argv, pool);
				2233
				2234	else if (!strcasecmp(argv[0], "set_transaction_id"))
				2235	r = process_set_transaction_id_mesg(argc, argv, pool);
				2236
				2237	else
				2238	DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
				2239
				2240	if (!r) {
				2241	r = dm_pool_commit_metadata(pool->pmd);
				2242	if (r)
				2243	DMERR("%s message: dm_pool_commit_metadata() failed, error = %d",
				2244	argv[0], r);
				2245	}
				2246
				2247	return r;
				2248	}
				2249
				2250	/*
				2251	* Status line is:
				2252	* <transaction id> <used metadata sectors>/<total metadata sectors>
				2253	* <used data sectors>/<total data sectors> <held metadata root>
				2254	*/
				2255	static int pool_status(struct dm_target *ti, status_type_t type,
				2256	char *result, unsigned maxlen)
				2257	{
				2258	int r;
				2259	unsigned sz = 0;
				2260	uint64_t transaction_id;
				2261	dm_block_t nr_free_blocks_data;
				2262	dm_block_t nr_free_blocks_metadata;
				2263	dm_block_t nr_blocks_data;
				2264	dm_block_t nr_blocks_metadata;
				2265	dm_block_t held_root;
				2266	char buf[BDEVNAME_SIZE];
				2267	char buf2[BDEVNAME_SIZE];
				2268	struct pool_c *pt = ti->private;
				2269	struct pool *pool = pt->pool;
				2270
				2271	switch (type) {
				2272	case STATUSTYPE_INFO:
				2273	r = dm_pool_get_metadata_transaction_id(pool->pmd,
				2274	&transaction_id);
				2275	if (r)
				2276	return r;
				2277
				2278	r = dm_pool_get_free_metadata_block_count(pool->pmd,
				2279	&nr_free_blocks_metadata);
				2280	if (r)
				2281	return r;
				2282
				2283	r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
				2284	if (r)
				2285	return r;
				2286
				2287	r = dm_pool_get_free_block_count(pool->pmd,
				2288	&nr_free_blocks_data);
				2289	if (r)
				2290	return r;
				2291
				2292	r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
				2293	if (r)
				2294	return r;
				2295
				2296	r = dm_pool_get_held_metadata_root(pool->pmd, &held_root);
				2297	if (r)
				2298	return r;
				2299
				2300	DMEMIT("%llu %llu/%llu %llu/%llu ",
				2301	(unsigned long long)transaction_id,
				2302	(unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
				2303	(unsigned long long)nr_blocks_metadata,
				2304	(unsigned long long)(nr_blocks_data - nr_free_blocks_data),
				2305	(unsigned long long)nr_blocks_data);
				2306
				2307	if (held_root)
				2308	DMEMIT("%llu", held_root);
				2309	else
				2310	DMEMIT("-");
				2311
				2312	break;
				2313
				2314	case STATUSTYPE_TABLE:
				2315	DMEMIT("%s %s %lu %llu ",
				2316	format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
				2317	format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
				2318	(unsigned long)pool->sectors_per_block,
				2319	(unsigned long long)pt->low_water_blocks);
				2320
				2321	DMEMIT("%u ", !pool->zero_new_blocks);
				2322
				2323	if (!pool->zero_new_blocks)
				2324	DMEMIT("skip_block_zeroing ");
				2325	break;
				2326	}
				2327
				2328	return 0;
				2329	}
				2330
				2331	static int pool_iterate_devices(struct dm_target *ti,
				2332	iterate_devices_callout_fn fn, void *data)
				2333	{
				2334	struct pool_c *pt = ti->private;
				2335
				2336	return fn(ti, pt->data_dev, 0, ti->len, data);
				2337	}
				2338
				2339	static int pool_merge(struct dm_target ti, struct bvec_merge_data bvm,
				2340	struct bio_vec *biovec, int max_size)
				2341	{
				2342	struct pool_c *pt = ti->private;
				2343	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
				2344
				2345	if (!q->merge_bvec_fn)
				2346	return max_size;
				2347
				2348	bvm->bi_bdev = pt->data_dev->bdev;
				2349
				2350	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
				2351	}
				2352
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame^]	2353	static void set_discard_limits(struct pool pool, struct queue_limits limits)
				2354	{
				2355	limits->max_discard_sectors = pool->sectors_per_block;
				2356
				2357	/*
				2358	* This is just a hint, and not enforced. We have to cope with
				2359	* bios that overlap 2 blocks.
				2360	*/
				2361	limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
				2362	}
				2363
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2364	static void pool_io_hints(struct dm_target ti, struct queue_limits limits)
				2365	{
				2366	struct pool_c *pt = ti->private;
				2367	struct pool *pool = pt->pool;
				2368
				2369	blk_limits_io_min(limits, 0);
				2370	blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame^]	2371	set_discard_limits(pool, limits);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2372	}
				2373
				2374	static struct target_type pool_target = {
				2375	.name = "thin-pool",
				2376	.features = DM_TARGET_SINGLETON \| DM_TARGET_ALWAYS_WRITEABLE \|
				2377	DM_TARGET_IMMUTABLE,
				2378	.version = {1, 0, 0},
				2379	.module = THIS_MODULE,
				2380	.ctr = pool_ctr,
				2381	.dtr = pool_dtr,
				2382	.map = pool_map,
				2383	.postsuspend = pool_postsuspend,
				2384	.preresume = pool_preresume,
				2385	.resume = pool_resume,
				2386	.message = pool_message,
				2387	.status = pool_status,
				2388	.merge = pool_merge,
				2389	.iterate_devices = pool_iterate_devices,
				2390	.io_hints = pool_io_hints,
				2391	};
				2392
/*----------------------------------------------------------------
 * Thin target methods
 *--------------------------------------------------------------*/
				2396	static void thin_dtr(struct dm_target *ti)
				2397	{
				2398	struct thin_c *tc = ti->private;
				2399
				2400	mutex_lock(&dm_thin_pool_table.mutex);
				2401
				2402	__pool_dec(tc->pool);
				2403	dm_pool_close_thin_device(tc->td);
				2404	dm_put_device(ti, tc->pool_dev);
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	2405	if (tc->origin_dev)
				2406	dm_put_device(ti, tc->origin_dev);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2407	kfree(tc);
				2408
				2409	mutex_unlock(&dm_thin_pool_table.mutex);
				2410	}
				2411
				2412	/*
				2413	* Thin target parameters:
				2414	*
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	2415	* <pool_dev> <dev_id> [origin_dev]
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2416	*
				2417	* pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
				2418	* dev_id: the internal device identifier
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	2419	* origin_dev: a device external to the pool that should act as the origin
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2420	*/
				2421	static int thin_ctr(struct dm_target ti, unsigned argc, char *argv)
				2422	{
				2423	int r;
				2424	struct thin_c *tc;
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	2425	struct dm_dev pool_dev, origin_dev;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2426	struct mapped_device *pool_md;
				2427
				2428	mutex_lock(&dm_thin_pool_table.mutex);
				2429
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	2430	if (argc != 2 && argc != 3) {
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2431	ti->error = "Invalid argument count";
				2432	r = -EINVAL;
				2433	goto out_unlock;
				2434	}
				2435
				2436	tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
				2437	if (!tc) {
				2438	ti->error = "Out of memory";
				2439	r = -ENOMEM;
				2440	goto out_unlock;
				2441	}
				2442
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	2443	if (argc == 3) {
				2444	r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
				2445	if (r) {
				2446	ti->error = "Error opening origin device";
				2447	goto bad_origin_dev;
				2448	}
				2449	tc->origin_dev = origin_dev;
				2450	}
				2451
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2452	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
				2453	if (r) {
				2454	ti->error = "Error opening pool device";
				2455	goto bad_pool_dev;
				2456	}
				2457	tc->pool_dev = pool_dev;
				2458
				2459	if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
				2460	ti->error = "Invalid device id";
				2461	r = -EINVAL;
				2462	goto bad_common;
				2463	}
				2464
				2465	pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
				2466	if (!pool_md) {
				2467	ti->error = "Couldn't get pool mapped device";
				2468	r = -EINVAL;
				2469	goto bad_common;
				2470	}
				2471
				2472	tc->pool = __pool_table_lookup(pool_md);
				2473	if (!tc->pool) {
				2474	ti->error = "Couldn't find pool object";
				2475	r = -EINVAL;
				2476	goto bad_pool_lookup;
				2477	}
				2478	__pool_inc(tc->pool);
				2479
				2480	r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
				2481	if (r) {
				2482	ti->error = "Couldn't open thin internal device";
				2483	goto bad_thin_open;
				2484	}
				2485
				2486	ti->split_io = tc->pool->sectors_per_block;
				2487	ti->num_flush_requests = 1;
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame^]	2488	ti->num_discard_requests = 1;
				2489	ti->discards_supported = 1;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2490
				2491	dm_put(pool_md);
				2492
				2493	mutex_unlock(&dm_thin_pool_table.mutex);
				2494
				2495	return 0;
				2496
				2497	bad_thin_open:
				2498	__pool_dec(tc->pool);
				2499	bad_pool_lookup:
				2500	dm_put(pool_md);
				2501	bad_common:
				2502	dm_put_device(ti, tc->pool_dev);
				2503	bad_pool_dev:
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	2504	if (tc->origin_dev)
				2505	dm_put_device(ti, tc->origin_dev);
				2506	bad_origin_dev:
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2507	kfree(tc);
				2508	out_unlock:
				2509	mutex_unlock(&dm_thin_pool_table.mutex);
				2510
				2511	return r;
				2512	}
				2513
				2514	static int thin_map(struct dm_target ti, struct bio bio,
				2515	union map_info *map_context)
				2516	{
Alasdair G Kergon	6efd6e8	2012-03-28 18:41:28 +0100	[diff] [blame]	2517	bio->bi_sector = dm_target_offset(ti, bio->bi_sector);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2518
				2519	return thin_bio_map(ti, bio, map_context);
				2520	}
				2521
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	2522	static int thin_endio(struct dm_target *ti,
				2523	struct bio *bio, int err,
				2524	union map_info *map_context)
				2525	{
				2526	unsigned long flags;
				2527	struct endio_hook *h = map_context->ptr;
				2528	struct list_head work;
				2529	struct new_mapping m, tmp;
				2530	struct pool *pool = h->tc->pool;
				2531
				2532	if (h->shared_read_entry) {
				2533	INIT_LIST_HEAD(&work);
				2534	ds_dec(h->shared_read_entry, &work);
				2535
				2536	spin_lock_irqsave(&pool->lock, flags);
				2537	list_for_each_entry_safe(m, tmp, &work, list) {
				2538	list_del(&m->list);
				2539	m->quiesced = 1;
				2540	__maybe_add_mapping(m);
				2541	}
				2542	spin_unlock_irqrestore(&pool->lock, flags);
				2543	}
				2544
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame^]	2545	if (h->all_io_entry) {
				2546	INIT_LIST_HEAD(&work);
				2547	ds_dec(h->all_io_entry, &work);
				2548	list_for_each_entry_safe(m, tmp, &work, list)
				2549	list_add(&m->list, &pool->prepared_discards);
				2550	}
				2551
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	2552	mempool_free(h, pool->endio_hook_pool);
				2553
				2554	return 0;
				2555	}
				2556
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2557	static void thin_postsuspend(struct dm_target *ti)
				2558	{
				2559	if (dm_noflush_suspending(ti))
				2560	requeue_io((struct thin_c *)ti->private);
				2561	}
				2562
				2563	/*
				2564	* <nr mapped sectors> <highest mapped sector>
				2565	*/
				2566	static int thin_status(struct dm_target *ti, status_type_t type,
				2567	char *result, unsigned maxlen)
				2568	{
				2569	int r;
				2570	ssize_t sz = 0;
				2571	dm_block_t mapped, highest;
				2572	char buf[BDEVNAME_SIZE];
				2573	struct thin_c *tc = ti->private;
				2574
				2575	if (!tc->td)
				2576	DMEMIT("-");
				2577	else {
				2578	switch (type) {
				2579	case STATUSTYPE_INFO:
				2580	r = dm_thin_get_mapped_count(tc->td, &mapped);
				2581	if (r)
				2582	return r;
				2583
				2584	r = dm_thin_get_highest_mapped_block(tc->td, &highest);
				2585	if (r < 0)
				2586	return r;
				2587
				2588	DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
				2589	if (r)
				2590	DMEMIT("%llu", ((highest + 1) *
				2591	tc->pool->sectors_per_block) - 1);
				2592	else
				2593	DMEMIT("-");
				2594	break;
				2595
				2596	case STATUSTYPE_TABLE:
				2597	DMEMIT("%s %lu",
				2598	format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
				2599	(unsigned long) tc->dev_id);
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	2600	if (tc->origin_dev)
				2601	DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2602	break;
				2603	}
				2604	}
				2605
				2606	return 0;
				2607	}
				2608
				2609	static int thin_iterate_devices(struct dm_target *ti,
				2610	iterate_devices_callout_fn fn, void *data)
				2611	{
				2612	dm_block_t blocks;
				2613	struct thin_c *tc = ti->private;
				2614
				2615	/*
				2616	* We can't call dm_pool_get_data_dev_size() since that blocks. So
				2617	* we follow a more convoluted path through to the pool's target.
				2618	*/
				2619	if (!tc->pool->ti)
				2620	return 0; /* nothing is bound */
				2621
				2622	blocks = tc->pool->ti->len >> tc->pool->block_shift;
				2623	if (blocks)
				2624	return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block * blocks, data);
				2625
				2626	return 0;
				2627	}
				2628
				2629	static void thin_io_hints(struct dm_target ti, struct queue_limits limits)
				2630	{
				2631	struct thin_c *tc = ti->private;
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame^]	2632	struct pool *pool = tc->pool;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2633
				2634	blk_limits_io_min(limits, 0);
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame^]	2635	blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
				2636	set_discard_limits(pool, limits);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2637	}
				2638
				2639	static struct target_type thin_target = {
				2640	.name = "thin",
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	2641	.version = {1, 1, 0},
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2642	.module = THIS_MODULE,
				2643	.ctr = thin_ctr,
				2644	.dtr = thin_dtr,
				2645	.map = thin_map,
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	2646	.end_io = thin_endio,
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2647	.postsuspend = thin_postsuspend,
				2648	.status = thin_status,
				2649	.iterate_devices = thin_iterate_devices,
				2650	.io_hints = thin_io_hints,
				2651	};
				2652
/*----------------------------------------------------------------*/
				2654
				2655	static int __init dm_thin_init(void)
				2656	{
				2657	int r;
				2658
				2659	pool_table_init();
				2660
				2661	r = dm_register_target(&thin_target);
				2662	if (r)
				2663	return r;
				2664
				2665	r = dm_register_target(&pool_target);
				2666	if (r)
				2667	dm_unregister_target(&thin_target);
				2668
				2669	return r;
				2670	}
				2671
				2672	static void dm_thin_exit(void)
				2673	{
				2674	dm_unregister_target(&thin_target);
				2675	dm_unregister_target(&pool_target);
				2676	}
				2677
				2678	module_init(dm_thin_init);
				2679	module_exit(dm_thin_exit);
				2680
				2681	MODULE_DESCRIPTION(DM_NAME "device-mapper thin provisioning target");
				2682	MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
				2683	MODULE_LICENSE("GPL");