Blame - drivers/md/dm-thin.c - kernel/msm-4.9

blob: db1b041ce97556e4668d05db61e09c01f64fc610 [file] [log] [blame]

Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1	/*
				2	* Copyright (C) 2011 Red Hat UK.
				3	*
				4	* This file is released under the GPL.
				5	*/
				6
				7	#include "dm-thin-metadata.h"
				8
				9	#include <linux/device-mapper.h>
				10	#include <linux/dm-io.h>
				11	#include <linux/dm-kcopyd.h>
				12	#include <linux/list.h>
				13	#include <linux/init.h>
				14	#include <linux/module.h>
				15	#include <linux/slab.h>
				16
				17	#define DM_MSG_PREFIX "thin"
				18
				19	/*
				20	* Tunable constants
				21	*/
				22	#define ENDIO_HOOK_POOL_SIZE 10240
				23	#define DEFERRED_SET_SIZE 64
				24	#define MAPPING_POOL_SIZE 1024
				25	#define PRISON_CELLS 1024
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	26	#define COMMIT_PERIOD HZ
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	27
				28	/*
				29	* The block size of the device holding pool data must be
				30	* between 64KB and 1GB.
				31	*/
				32	#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
				33	#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
				34
				35	/*
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	36	* Device id is restricted to 24 bits.
				37	*/
				38	#define MAX_DEV_ID ((1 << 24) - 1)
				39
				40	/*
				41	* How do we handle breaking sharing of data blocks?
				42	* =================================================
				43	*
				44	* We use a standard copy-on-write btree to store the mappings for the
				45	* devices (note I'm talking about copy-on-write of the metadata here, not
				46	* the data). When you take an internal snapshot you clone the root node
				47	* of the origin btree. After this there is no concept of an origin or a
				48	* snapshot. They are just two device trees that happen to point to the
				49	* same data blocks.
				50	*
				51	* When we get a write in we decide if it's to a shared data block using
				52	* some timestamp magic. If it is, we have to break sharing.
				53	*
				54	* Let's say we write to a shared block in what was the origin. The
				55	* steps are:
				56	*
				57	* i) plug io further to this physical block. (see bio_prison code).
				58	*
				59	* ii) quiesce any read io to that shared data block. Obviously
				60	* including all devices that share this block. (see deferred_set code)
				61	*
				62	* iii) copy the data block to a newly allocated block. This step can be
				63	* missed out if the io covers the block. (schedule_copy).
				64	*
				65	* iv) insert the new mapping into the origin's btree
Joe Thornber	fe878f3	2012-03-28 18:41:24 +0100	[diff] [blame]	66	* (process_prepared_mapping). This act of inserting breaks some
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	67	* sharing of btree nodes between the two devices. Breaking sharing only
				68	* affects the btree of that specific device. Btrees for the other
				69	* devices that share the block never change. The btree for the origin
				70	* device as it was after the last commit is untouched, ie. we're using
				71	* persistent data structures in the functional programming sense.
				72	*
				73	* v) unplug io to this physical block, including the io that triggered
				74	* the breaking of sharing.
				75	*
				76	* Steps (ii) and (iii) occur in parallel.
				77	*
				78	* The metadata _doesn't_ need to be committed before the io continues. We
				79	* get away with this because the io is always written to a _new_ block.
				80	* If there's a crash, then:
				81	*
				82	* - The origin mapping will point to the old origin block (the shared
				83	* one). This will contain the data as it was before the io that triggered
				84	* the breaking of sharing came in.
				85	*
				86	* - The snap mapping still points to the old block. As it would after
				87	* the commit.
				88	*
				89	* The downside of this scheme is the timestamp magic isn't perfect, and
				90	* will continue to think that data block in the snapshot device is shared
				91	* even after the write to the origin has broken sharing. I suspect data
				92	* blocks will typically be shared by many different devices, so we're
				93	* breaking sharing n + 1 times, rather than n, where n is the number of
				94	* devices that reference this data block. At the moment I think the
				95	* benefits far, far outweigh the disadvantages.
				96	*/
				97
				98	/*----------------------------------------------------------------*/
				99
				100	/*
				101	* Sometimes we can't deal with a bio straight away. We put them in prison
				102	* where they can't cause any mischief. Bios are put in a cell identified
				103	* by a key, multiple bios can be in the same cell. When the cell is
				104	* subsequently unlocked the bios become available.
				105	*/
				106	struct bio_prison;
				107
				108	struct cell_key {
				109	int virtual;
				110	dm_thin_id dev;
				111	dm_block_t block;
				112	};
				113
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	114	struct dm_bio_prison_cell {
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	115	struct hlist_node list;
				116	struct bio_prison *prison;
				117	struct cell_key key;
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	118	struct bio *holder;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	119	struct bio_list bios;
				120	};
				121
				122	struct bio_prison {
				123	spinlock_t lock;
				124	mempool_t *cell_pool;
				125
				126	unsigned nr_buckets;
				127	unsigned hash_mask;
				128	struct hlist_head *cells;
				129	};
				130
				131	static uint32_t calc_nr_buckets(unsigned nr_cells)
				132	{
				133	uint32_t n = 128;
				134
				135	nr_cells /= 4;
				136	nr_cells = min(nr_cells, 8192u);
				137
				138	while (n < nr_cells)
				139	n <<= 1;
				140
				141	return n;
				142	}
				143
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	144	static struct kmem_cache *_cell_cache;
				145
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	146	/*
				147	* @nr_cells should be the number of cells you want in use _concurrently_.
				148	* Don't confuse it with the number of distinct keys.
				149	*/
				150	static struct bio_prison *prison_create(unsigned nr_cells)
				151	{
				152	unsigned i;
				153	uint32_t nr_buckets = calc_nr_buckets(nr_cells);
				154	size_t len = sizeof(struct bio_prison) +
				155	(sizeof(struct hlist_head) * nr_buckets);
				156	struct bio_prison *prison = kmalloc(len, GFP_KERNEL);
				157
				158	if (!prison)
				159	return NULL;
				160
				161	spin_lock_init(&prison->lock);
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	162	prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	163	if (!prison->cell_pool) {
				164	kfree(prison);
				165	return NULL;
				166	}
				167
				168	prison->nr_buckets = nr_buckets;
				169	prison->hash_mask = nr_buckets - 1;
				170	prison->cells = (struct hlist_head *) (prison + 1);
				171	for (i = 0; i < nr_buckets; i++)
				172	INIT_HLIST_HEAD(prison->cells + i);
				173
				174	return prison;
				175	}
				176
				177	static void prison_destroy(struct bio_prison *prison)
				178	{
				179	mempool_destroy(prison->cell_pool);
				180	kfree(prison);
				181	}
				182
				183	static uint32_t hash_key(struct bio_prison prison, struct cell_key key)
				184	{
				185	const unsigned long BIG_PRIME = 4294967291UL;
				186	uint64_t hash = key->block * BIG_PRIME;
				187
				188	return (uint32_t) (hash & prison->hash_mask);
				189	}
				190
				191	static int keys_equal(struct cell_key lhs, struct cell_key rhs)
				192	{
				193	return (lhs->virtual == rhs->virtual) &&
				194	(lhs->dev == rhs->dev) &&
				195	(lhs->block == rhs->block);
				196	}
				197
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	198	static struct dm_bio_prison_cell __search_bucket(struct hlist_head bucket,
				199	struct cell_key *key)
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	200	{
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	201	struct dm_bio_prison_cell *cell;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	202	struct hlist_node *tmp;
				203
				204	hlist_for_each_entry(cell, tmp, bucket, list)
				205	if (keys_equal(&cell->key, key))
				206	return cell;
				207
				208	return NULL;
				209	}
				210
				211	/*
				212	* This may block if a new cell needs allocating. You must ensure that
				213	* cells will be unlocked even if the calling thread is blocked.
				214	*
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	215	* Returns 1 if the cell was already held, 0 if @inmate is the new holder.
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	216	*/
				217	static int bio_detain(struct bio_prison prison, struct cell_key key,
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	218	struct bio inmate, struct dm_bio_prison_cell *ref)
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	219	{
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	220	int r = 1;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	221	unsigned long flags;
				222	uint32_t hash = hash_key(prison, key);
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	223	struct dm_bio_prison_cell cell, cell2;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	224
				225	BUG_ON(hash > prison->nr_buckets);
				226
				227	spin_lock_irqsave(&prison->lock, flags);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	228
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	229	cell = __search_bucket(prison->cells + hash, key);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	230	if (cell) {
				231	bio_list_add(&cell->bios, inmate);
				232	goto out;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	233	}
				234
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	235	/*
				236	* Allocate a new cell
				237	*/
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	238	spin_unlock_irqrestore(&prison->lock, flags);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	239	cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
				240	spin_lock_irqsave(&prison->lock, flags);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	241
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	242	/*
				243	* We've been unlocked, so we have to double check that
				244	* nobody else has inserted this cell in the meantime.
				245	*/
				246	cell = __search_bucket(prison->cells + hash, key);
				247	if (cell) {
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	248	mempool_free(cell2, prison->cell_pool);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	249	bio_list_add(&cell->bios, inmate);
				250	goto out;
				251	}
				252
				253	/*
				254	* Use new cell.
				255	*/
				256	cell = cell2;
				257
				258	cell->prison = prison;
				259	memcpy(&cell->key, key, sizeof(cell->key));
				260	cell->holder = inmate;
				261	bio_list_init(&cell->bios);
				262	hlist_add_head(&cell->list, prison->cells + hash);
				263
				264	r = 0;
				265
				266	out:
				267	spin_unlock_irqrestore(&prison->lock, flags);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	268
				269	*ref = cell;
				270
				271	return r;
				272	}
				273
				274	/*
				275	* @inmates must have been initialised prior to this call
				276	*/
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	277	static void __cell_release(struct dm_bio_prison_cell cell, struct bio_list inmates)
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	278	{
				279	struct bio_prison *prison = cell->prison;
				280
				281	hlist_del(&cell->list);
				282
Mike Snitzer	03aaae7	2012-05-12 01:43:12 +0100	[diff] [blame]	283	if (inmates) {
				284	bio_list_add(inmates, cell->holder);
				285	bio_list_merge(inmates, &cell->bios);
				286	}
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	287
				288	mempool_free(cell, prison->cell_pool);
				289	}
				290
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	291	static void cell_release(struct dm_bio_prison_cell cell, struct bio_list bios)
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	292	{
				293	unsigned long flags;
				294	struct bio_prison *prison = cell->prison;
				295
				296	spin_lock_irqsave(&prison->lock, flags);
				297	__cell_release(cell, bios);
				298	spin_unlock_irqrestore(&prison->lock, flags);
				299	}
				300
				301	/*
				302	* There are a couple of places where we put a bio into a cell briefly
				303	* before taking it out again. In these situations we know that no other
				304	* bio may be in the cell. This function releases the cell, and also does
				305	* a sanity check.
				306	*/
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	307	static void __cell_release_singleton(struct dm_bio_prison_cell cell, struct bio bio)
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	308	{
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	309	BUG_ON(cell->holder != bio);
				310	BUG_ON(!bio_list_empty(&cell->bios));
Mike Snitzer	03aaae7	2012-05-12 01:43:12 +0100	[diff] [blame]	311
				312	__cell_release(cell, NULL);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	313	}
				314
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	315	static void cell_release_singleton(struct dm_bio_prison_cell cell, struct bio bio)
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	316	{
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	317	unsigned long flags;
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	318	struct bio_prison *prison = cell->prison;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	319
				320	spin_lock_irqsave(&prison->lock, flags);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	321	__cell_release_singleton(cell, bio);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	322	spin_unlock_irqrestore(&prison->lock, flags);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	323	}
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	324
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	325	/*
				326	* Sometimes we don't want the holder, just the additional bios.
				327	*/
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	328	static void __cell_release_no_holder(struct dm_bio_prison_cell *cell,
				329	struct bio_list *inmates)
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	330	{
				331	struct bio_prison *prison = cell->prison;
				332
				333	hlist_del(&cell->list);
				334	bio_list_merge(inmates, &cell->bios);
				335
				336	mempool_free(cell, prison->cell_pool);
				337	}
				338
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	339	static void cell_release_no_holder(struct dm_bio_prison_cell *cell,
				340	struct bio_list *inmates)
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	341	{
				342	unsigned long flags;
				343	struct bio_prison *prison = cell->prison;
				344
				345	spin_lock_irqsave(&prison->lock, flags);
				346	__cell_release_no_holder(cell, inmates);
				347	spin_unlock_irqrestore(&prison->lock, flags);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	348	}
				349
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	350	static void cell_error(struct dm_bio_prison_cell *cell)
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	351	{
				352	struct bio_prison *prison = cell->prison;
				353	struct bio_list bios;
				354	struct bio *bio;
				355	unsigned long flags;
				356
				357	bio_list_init(&bios);
				358
				359	spin_lock_irqsave(&prison->lock, flags);
				360	__cell_release(cell, &bios);
				361	spin_unlock_irqrestore(&prison->lock, flags);
				362
				363	while ((bio = bio_list_pop(&bios)))
				364	bio_io_error(bio);
				365	}
				366
				367	/*----------------------------------------------------------------*/
				368
				369	/*
				370	* We use the deferred set to keep track of pending reads to shared blocks.
				371	* We do this to ensure the new mapping caused by a write isn't performed
				372	* until these prior reads have completed. Otherwise the insertion of the
				373	* new mapping could free the old block that the read bios are mapped to.
				374	*/
				375
				376	struct deferred_set;
				377	struct deferred_entry {
				378	struct deferred_set *ds;
				379	unsigned count;
				380	struct list_head work_items;
				381	};
				382
				383	struct deferred_set {
				384	spinlock_t lock;
				385	unsigned current_entry;
				386	unsigned sweeper;
				387	struct deferred_entry entries[DEFERRED_SET_SIZE];
				388	};
				389
				390	static void ds_init(struct deferred_set *ds)
				391	{
				392	int i;
				393
				394	spin_lock_init(&ds->lock);
				395	ds->current_entry = 0;
				396	ds->sweeper = 0;
				397	for (i = 0; i < DEFERRED_SET_SIZE; i++) {
				398	ds->entries[i].ds = ds;
				399	ds->entries[i].count = 0;
				400	INIT_LIST_HEAD(&ds->entries[i].work_items);
				401	}
				402	}
				403
				404	static struct deferred_entry ds_inc(struct deferred_set ds)
				405	{
				406	unsigned long flags;
				407	struct deferred_entry *entry;
				408
				409	spin_lock_irqsave(&ds->lock, flags);
				410	entry = ds->entries + ds->current_entry;
				411	entry->count++;
				412	spin_unlock_irqrestore(&ds->lock, flags);
				413
				414	return entry;
				415	}
				416
				417	static unsigned ds_next(unsigned index)
				418	{
				419	return (index + 1) % DEFERRED_SET_SIZE;
				420	}
				421
				422	static void __sweep(struct deferred_set ds, struct list_head head)
				423	{
				424	while ((ds->sweeper != ds->current_entry) &&
				425	!ds->entries[ds->sweeper].count) {
				426	list_splice_init(&ds->entries[ds->sweeper].work_items, head);
				427	ds->sweeper = ds_next(ds->sweeper);
				428	}
				429
				430	if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count)
				431	list_splice_init(&ds->entries[ds->sweeper].work_items, head);
				432	}
				433
				434	static void ds_dec(struct deferred_entry entry, struct list_head head)
				435	{
				436	unsigned long flags;
				437
				438	spin_lock_irqsave(&entry->ds->lock, flags);
				439	BUG_ON(!entry->count);
				440	--entry->count;
				441	__sweep(entry->ds, head);
				442	spin_unlock_irqrestore(&entry->ds->lock, flags);
				443	}
				444
				445	/*
				446	* Returns 1 if deferred or 0 if no pending items to delay job.
				447	*/
				448	static int ds_add_work(struct deferred_set ds, struct list_head work)
				449	{
				450	int r = 1;
				451	unsigned long flags;
				452	unsigned next_entry;
				453
				454	spin_lock_irqsave(&ds->lock, flags);
				455	if ((ds->sweeper == ds->current_entry) &&
				456	!ds->entries[ds->current_entry].count)
				457	r = 0;
				458	else {
				459	list_add(work, &ds->entries[ds->current_entry].work_items);
				460	next_entry = ds_next(ds->current_entry);
				461	if (!ds->entries[next_entry].count)
				462	ds->current_entry = next_entry;
				463	}
				464	spin_unlock_irqrestore(&ds->lock, flags);
				465
				466	return r;
				467	}
				468
				469	/*----------------------------------------------------------------*/
				470
				471	/*
				472	* Key building.
				473	*/
				474	static void build_data_key(struct dm_thin_device *td,
				475	dm_block_t b, struct cell_key *key)
				476	{
				477	key->virtual = 0;
				478	key->dev = dm_thin_dev_id(td);
				479	key->block = b;
				480	}
				481
				482	static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
				483	struct cell_key *key)
				484	{
				485	key->virtual = 1;
				486	key->dev = dm_thin_dev_id(td);
				487	key->block = b;
				488	}
				489
				490	/*----------------------------------------------------------------*/
				491
				492	/*
				493	* A pool device ties together a metadata device and a data device. It
				494	* also provides the interface for creating and destroying internal
				495	* devices.
				496	*/
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	497	struct dm_thin_new_mapping;
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	498
				499	struct pool_features {
				500	unsigned zero_new_blocks:1;
				501	unsigned discard_enabled:1;
				502	unsigned discard_passdown:1;
				503	};
				504
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	505	struct pool {
				506	struct list_head list;
				507	struct dm_target ti; / Only set if a pool target is bound */
				508
				509	struct mapped_device *pool_md;
				510	struct block_device *md_dev;
				511	struct dm_pool_metadata *pmd;
				512
				513	uint32_t sectors_per_block;
				514	unsigned block_shift;
				515	dm_block_t offset_mask;
				516	dm_block_t low_water_blocks;
				517
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	518	struct pool_features pf;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	519	unsigned low_water_triggered:1; /* A dm event has been sent */
				520	unsigned no_free_space:1; /* A -ENOSPC warning has been issued */
				521
				522	struct bio_prison *prison;
				523	struct dm_kcopyd_client *copier;
				524
				525	struct workqueue_struct *wq;
				526	struct work_struct worker;
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	527	struct delayed_work waker;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	528
				529	unsigned ref_count;
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	530	unsigned long last_commit_jiffies;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	531
				532	spinlock_t lock;
				533	struct bio_list deferred_bios;
				534	struct bio_list deferred_flush_bios;
				535	struct list_head prepared_mappings;
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	536	struct list_head prepared_discards;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	537
				538	struct bio_list retry_on_resume_list;
				539
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	540	struct deferred_set shared_read_ds;
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	541	struct deferred_set all_io_ds;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	542
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	543	struct dm_thin_new_mapping *next_mapping;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	544	mempool_t *mapping_pool;
				545	mempool_t *endio_hook_pool;
				546	};
				547
				548	/*
				549	* Target context for a pool.
				550	*/
				551	struct pool_c {
				552	struct dm_target *ti;
				553	struct pool *pool;
				554	struct dm_dev *data_dev;
				555	struct dm_dev *metadata_dev;
				556	struct dm_target_callbacks callbacks;
				557
				558	dm_block_t low_water_blocks;
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	559	struct pool_features pf;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	560	};
				561
				562	/*
				563	* Target context for a thin.
				564	*/
				565	struct thin_c {
				566	struct dm_dev *pool_dev;
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	567	struct dm_dev *origin_dev;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	568	dm_thin_id dev_id;
				569
				570	struct pool *pool;
				571	struct dm_thin_device *td;
				572	};
				573
				574	/*----------------------------------------------------------------*/
				575
				576	/*
				577	* A global list of pools that uses a struct mapped_device as a key.
				578	*/
				579	static struct dm_thin_pool_table {
				580	struct mutex mutex;
				581	struct list_head pools;
				582	} dm_thin_pool_table;
				583
				584	static void pool_table_init(void)
				585	{
				586	mutex_init(&dm_thin_pool_table.mutex);
				587	INIT_LIST_HEAD(&dm_thin_pool_table.pools);
				588	}
				589
				590	static void __pool_table_insert(struct pool *pool)
				591	{
				592	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
				593	list_add(&pool->list, &dm_thin_pool_table.pools);
				594	}
				595
				596	static void __pool_table_remove(struct pool *pool)
				597	{
				598	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
				599	list_del(&pool->list);
				600	}
				601
				602	static struct pool __pool_table_lookup(struct mapped_device md)
				603	{
				604	struct pool pool = NULL, tmp;
				605
				606	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
				607
				608	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
				609	if (tmp->pool_md == md) {
				610	pool = tmp;
				611	break;
				612	}
				613	}
				614
				615	return pool;
				616	}
				617
				618	static struct pool __pool_table_lookup_metadata_dev(struct block_device md_dev)
				619	{
				620	struct pool pool = NULL, tmp;
				621
				622	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
				623
				624	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
				625	if (tmp->md_dev == md_dev) {
				626	pool = tmp;
				627	break;
				628	}
				629	}
				630
				631	return pool;
				632	}
				633
				634	/*----------------------------------------------------------------*/
				635
/*
 * Per-bio state, reached through dm_get_mapinfo(bio)->ptr.
 */
struct dm_thin_endio_hook {
	struct thin_c *tc;			/* thin device the bio belongs to */
	/* presumably entries returned by ds_inc() -- confirm against users */
	struct deferred_entry *shared_read_entry;
	struct deferred_entry *all_io_entry;
	struct dm_thin_new_mapping *overwrite_mapping;	/* read by overwrite_endio() */
};
				642
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	643	static void __requeue_bio_list(struct thin_c tc, struct bio_list master)
				644	{
				645	struct bio *bio;
				646	struct bio_list bios;
				647
				648	bio_list_init(&bios);
				649	bio_list_merge(&bios, master);
				650	bio_list_init(master);
				651
				652	while ((bio = bio_list_pop(&bios))) {
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	653	struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
				654
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	655	if (h->tc == tc)
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	656	bio_endio(bio, DM_ENDIO_REQUEUE);
				657	else
				658	bio_list_add(master, bio);
				659	}
				660	}
				661
				662	static void requeue_io(struct thin_c *tc)
				663	{
				664	struct pool *pool = tc->pool;
				665	unsigned long flags;
				666
				667	spin_lock_irqsave(&pool->lock, flags);
				668	__requeue_bio_list(tc, &pool->deferred_bios);
				669	__requeue_bio_list(tc, &pool->retry_on_resume_list);
				670	spin_unlock_irqrestore(&pool->lock, flags);
				671	}
				672
				673	/*
				674	* This section of code contains the logic for processing a thin device's IO.
				675	* Much of the code depends on pool object resources (lists, workqueues, etc)
				676	* but most is exclusively called from the thin target rather than the thin-pool
				677	* target.
				678	*/
				679
				680	static dm_block_t get_bio_block(struct thin_c tc, struct bio bio)
				681	{
				682	return bio->bi_sector >> tc->pool->block_shift;
				683	}
				684
				685	static void remap(struct thin_c tc, struct bio bio, dm_block_t block)
				686	{
				687	struct pool *pool = tc->pool;
				688
				689	bio->bi_bdev = tc->pool_dev->bdev;
				690	bio->bi_sector = (block << pool->block_shift) +
				691	(bio->bi_sector & pool->offset_mask);
				692	}
				693
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	694	static void remap_to_origin(struct thin_c tc, struct bio bio)
				695	{
				696	bio->bi_bdev = tc->origin_dev->bdev;
				697	}
				698
				699	static void issue(struct thin_c tc, struct bio bio)
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	700	{
				701	struct pool *pool = tc->pool;
				702	unsigned long flags;
				703
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	704	/*
				705	* Batch together any FUA/FLUSH bios we find and then issue
				706	* a single commit for them in process_deferred_bios().
				707	*/
				708	if (bio->bi_rw & (REQ_FLUSH \| REQ_FUA)) {
				709	spin_lock_irqsave(&pool->lock, flags);
				710	bio_list_add(&pool->deferred_flush_bios, bio);
				711	spin_unlock_irqrestore(&pool->lock, flags);
				712	} else
				713	generic_make_request(bio);
				714	}
				715
/* Point @bio at the origin device, then submit it via issue(). */
static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
{
	remap_to_origin(tc, bio);
	issue(tc, bio);
}
				721
				722	static void remap_and_issue(struct thin_c tc, struct bio bio,
				723	dm_block_t block)
				724	{
				725	remap(tc, bio, block);
				726	issue(tc, bio);
				727	}
				728
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	729	/*
				730	* wake_worker() is used when new work is queued and when pool_resume is
				731	* ready to continue deferred IO processing.
				732	*/
				733	static void wake_worker(struct pool *pool)
				734	{
				735	queue_work(pool->wq, &pool->worker);
				736	}
				737
				738	/*----------------------------------------------------------------*/
				739
				740	/*
				741	* Bio endio functions.
				742	*/
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	743	struct dm_thin_new_mapping {
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	744	struct list_head list;
				745
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	746	unsigned quiesced:1;
				747	unsigned prepared:1;
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	748	unsigned pass_discard:1;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	749
				750	struct thin_c *tc;
				751	dm_block_t virt_block;
				752	dm_block_t data_block;
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	753	struct dm_bio_prison_cell cell, cell2;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	754	int err;
				755
				756	/*
				757	* If the bio covers the whole area of a block then we can avoid
				758	* zeroing or copying. Instead this bio is hooked. The bio will
				759	* still be in the cell, so care has to be taken to avoid issuing
				760	* the bio twice.
				761	*/
				762	struct bio *bio;
				763	bio_end_io_t *saved_bi_end_io;
				764	};
				765
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	766	static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	767	{
				768	struct pool *pool = m->tc->pool;
				769
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	770	if (m->quiesced && m->prepared) {
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	771	list_add(&m->list, &pool->prepared_mappings);
				772	wake_worker(pool);
				773	}
				774	}
				775
				776	static void copy_complete(int read_err, unsigned long write_err, void *context)
				777	{
				778	unsigned long flags;
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	779	struct dm_thin_new_mapping *m = context;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	780	struct pool *pool = m->tc->pool;
				781
				782	m->err = read_err \|\| write_err ? -EIO : 0;
				783
				784	spin_lock_irqsave(&pool->lock, flags);
				785	m->prepared = 1;
				786	__maybe_add_mapping(m);
				787	spin_unlock_irqrestore(&pool->lock, flags);
				788	}
				789
				790	static void overwrite_endio(struct bio *bio, int err)
				791	{
				792	unsigned long flags;
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	793	struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
				794	struct dm_thin_new_mapping *m = h->overwrite_mapping;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	795	struct pool *pool = m->tc->pool;
				796
				797	m->err = err;
				798
				799	spin_lock_irqsave(&pool->lock, flags);
				800	m->prepared = 1;
				801	__maybe_add_mapping(m);
				802	spin_unlock_irqrestore(&pool->lock, flags);
				803	}
				804
/*----------------------------------------------------------------*/
				806
				807	/*
				808	* Workqueue.
				809	*/
				810
				811	/*
				812	* Prepared mapping jobs.
				813	*/
				814
				815	/*
				816	* This sends the bios in the cell back to the deferred_bios list.
				817	*/
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	818	static void cell_defer(struct thin_c tc, struct dm_bio_prison_cell cell,
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	819	dm_block_t data_block)
				820	{
				821	struct pool *pool = tc->pool;
				822	unsigned long flags;
				823
				824	spin_lock_irqsave(&pool->lock, flags);
				825	cell_release(cell, &pool->deferred_bios);
				826	spin_unlock_irqrestore(&tc->pool->lock, flags);
				827
				828	wake_worker(pool);
				829	}
				830
				831	/*
				832	* Same as cell_defer above, except it omits one particular detainee,
				833	* a write bio that covers the block and has already been processed.
				834	*/
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	835	static void cell_defer_except(struct thin_c tc, struct dm_bio_prison_cell cell)
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	836	{
				837	struct bio_list bios;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	838	struct pool *pool = tc->pool;
				839	unsigned long flags;
				840
				841	bio_list_init(&bios);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	842
				843	spin_lock_irqsave(&pool->lock, flags);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	844	cell_release_no_holder(cell, &pool->deferred_bios);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	845	spin_unlock_irqrestore(&pool->lock, flags);
				846
				847	wake_worker(pool);
				848	}
				849
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	850	static void process_prepared_mapping(struct dm_thin_new_mapping *m)
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	851	{
				852	struct thin_c *tc = m->tc;
				853	struct bio *bio;
				854	int r;
				855
				856	bio = m->bio;
				857	if (bio)
				858	bio->bi_end_io = m->saved_bi_end_io;
				859
				860	if (m->err) {
				861	cell_error(m->cell);
				862	return;
				863	}
				864
				865	/*
				866	* Commit the prepared block into the mapping btree.
				867	* Any I/O for this block arriving after this point will get
				868	* remapped to it directly.
				869	*/
				870	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
				871	if (r) {
				872	DMERR("dm_thin_insert_block() failed");
				873	cell_error(m->cell);
				874	return;
				875	}
				876
				877	/*
				878	* Release any bios held while the block was being provisioned.
				879	* If we are processing a write bio that completely covers the block,
				880	* we already processed it so can ignore it now when processing
				881	* the bios in the cell.
				882	*/
				883	if (bio) {
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	884	cell_defer_except(tc, m->cell);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	885	bio_endio(bio, 0);
				886	} else
				887	cell_defer(tc, m->cell, m->data_block);
				888
				889	list_del(&m->list);
				890	mempool_free(m, tc->pool->mapping_pool);
				891	}
				892
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	893	static void process_prepared_discard(struct dm_thin_new_mapping *m)
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	894	{
				895	int r;
				896	struct thin_c *tc = m->tc;
				897
				898	r = dm_thin_remove_block(tc->td, m->virt_block);
				899	if (r)
				900	DMERR("dm_thin_remove_block() failed");
				901
				902	/*
				903	* Pass the discard down to the underlying device?
				904	*/
				905	if (m->pass_discard)
				906	remap_and_issue(tc, m->bio, m->data_block);
				907	else
				908	bio_endio(m->bio, 0);
				909
				910	cell_defer_except(tc, m->cell);
				911	cell_defer_except(tc, m->cell2);
				912	mempool_free(m, tc->pool->mapping_pool);
				913	}
				914
				915	static void process_prepared(struct pool pool, struct list_head head,
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	916	void (fn)(struct dm_thin_new_mapping ))
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	917	{
				918	unsigned long flags;
				919	struct list_head maps;
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	920	struct dm_thin_new_mapping m, tmp;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	921
				922	INIT_LIST_HEAD(&maps);
				923	spin_lock_irqsave(&pool->lock, flags);
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	924	list_splice_init(head, &maps);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	925	spin_unlock_irqrestore(&pool->lock, flags);
				926
				927	list_for_each_entry_safe(m, tmp, &maps, list)
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	928	fn(m);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	929	}
				930
				931	/*
				932	* Deferred bio jobs.
				933	*/
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	934	static int io_overlaps_block(struct pool pool, struct bio bio)
				935	{
				936	return !(bio->bi_sector & pool->offset_mask) &&
				937	(bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
				938
				939	}
				940
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	941	static int io_overwrites_block(struct pool pool, struct bio bio)
				942	{
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	943	return (bio_data_dir(bio) == WRITE) &&
				944	io_overlaps_block(pool, bio);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	945	}
				946
				947	static void save_and_set_endio(struct bio bio, bio_end_io_t *save,
				948	bio_end_io_t *fn)
				949	{
				950	*save = bio->bi_end_io;
				951	bio->bi_end_io = fn;
				952	}
				953
				954	static int ensure_next_mapping(struct pool *pool)
				955	{
				956	if (pool->next_mapping)
				957	return 0;
				958
				959	pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);
				960
				961	return pool->next_mapping ? 0 : -ENOMEM;
				962	}
				963
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	964	static struct dm_thin_new_mapping get_next_mapping(struct pool pool)
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	965	{
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	966	struct dm_thin_new_mapping *r = pool->next_mapping;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	967
				968	BUG_ON(!pool->next_mapping);
				969
				970	pool->next_mapping = NULL;
				971
				972	return r;
				973	}
				974
				975	static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	976	struct dm_dev *origin, dm_block_t data_origin,
				977	dm_block_t data_dest,
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	978	struct dm_bio_prison_cell cell, struct bio bio)
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	979	{
				980	int r;
				981	struct pool *pool = tc->pool;
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	982	struct dm_thin_new_mapping *m = get_next_mapping(pool);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	983
				984	INIT_LIST_HEAD(&m->list);
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	985	m->quiesced = 0;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	986	m->prepared = 0;
				987	m->tc = tc;
				988	m->virt_block = virt_block;
				989	m->data_block = data_dest;
				990	m->cell = cell;
				991	m->err = 0;
				992	m->bio = NULL;
				993
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	994	if (!ds_add_work(&pool->shared_read_ds, &m->list))
				995	m->quiesced = 1;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	996
				997	/*
				998	* IO to pool_dev remaps to the pool target's data_dev.
				999	*
				1000	* If the whole block of data is being overwritten, we can issue the
				1001	* bio immediately. Otherwise we use kcopyd to clone the data first.
				1002	*/
				1003	if (io_overwrites_block(pool, bio)) {
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	1004	struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
				1005
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1006	h->overwrite_mapping = m;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1007	m->bio = bio;
				1008	save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1009	remap_and_issue(tc, bio, data_dest);
				1010	} else {
				1011	struct dm_io_region from, to;
				1012
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	1013	from.bdev = origin->bdev;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1014	from.sector = data_origin * pool->sectors_per_block;
				1015	from.count = pool->sectors_per_block;
				1016
				1017	to.bdev = tc->pool_dev->bdev;
				1018	to.sector = data_dest * pool->sectors_per_block;
				1019	to.count = pool->sectors_per_block;
				1020
				1021	r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
				1022	0, copy_complete, m);
				1023	if (r < 0) {
				1024	mempool_free(m, pool->mapping_pool);
				1025	DMERR("dm_kcopyd_copy() failed");
				1026	cell_error(cell);
				1027	}
				1028	}
				1029	}
				1030
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	1031	static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
				1032	dm_block_t data_origin, dm_block_t data_dest,
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	1033	struct dm_bio_prison_cell cell, struct bio bio)
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	1034	{
				1035	schedule_copy(tc, virt_block, tc->pool_dev,
				1036	data_origin, data_dest, cell, bio);
				1037	}
				1038
				1039	static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
				1040	dm_block_t data_dest,
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	1041	struct dm_bio_prison_cell cell, struct bio bio)
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	1042	{
				1043	schedule_copy(tc, virt_block, tc->origin_dev,
				1044	virt_block, data_dest, cell, bio);
				1045	}
				1046
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1047	static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	1048	dm_block_t data_block, struct dm_bio_prison_cell *cell,
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1049	struct bio *bio)
				1050	{
				1051	struct pool *pool = tc->pool;
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	1052	struct dm_thin_new_mapping *m = get_next_mapping(pool);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1053
				1054	INIT_LIST_HEAD(&m->list);
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1055	m->quiesced = 1;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1056	m->prepared = 0;
				1057	m->tc = tc;
				1058	m->virt_block = virt_block;
				1059	m->data_block = data_block;
				1060	m->cell = cell;
				1061	m->err = 0;
				1062	m->bio = NULL;
				1063
				1064	/*
				1065	* If the whole block of data is being overwritten or we are not
				1066	* zeroing pre-existing data, we can issue the bio immediately.
				1067	* Otherwise we use kcopyd to zero the data first.
				1068	*/
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	1069	if (!pool->pf.zero_new_blocks)
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1070	process_prepared_mapping(m);
				1071
				1072	else if (io_overwrites_block(pool, bio)) {
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	1073	struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
				1074
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1075	h->overwrite_mapping = m;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1076	m->bio = bio;
				1077	save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1078	remap_and_issue(tc, bio, data_block);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1079	} else {
				1080	int r;
				1081	struct dm_io_region to;
				1082
				1083	to.bdev = tc->pool_dev->bdev;
				1084	to.sector = data_block * pool->sectors_per_block;
				1085	to.count = pool->sectors_per_block;
				1086
				1087	r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
				1088	if (r < 0) {
				1089	mempool_free(m, pool->mapping_pool);
				1090	DMERR("dm_kcopyd_zero() failed");
				1091	cell_error(cell);
				1092	}
				1093	}
				1094	}
				1095
				1096	static int alloc_data_block(struct thin_c tc, dm_block_t result)
				1097	{
				1098	int r;
				1099	dm_block_t free_blocks;
				1100	unsigned long flags;
				1101	struct pool *pool = tc->pool;
				1102
				1103	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
				1104	if (r)
				1105	return r;
				1106
				1107	if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
				1108	DMWARN("%s: reached low water mark, sending event.",
				1109	dm_device_name(pool->pool_md));
				1110	spin_lock_irqsave(&pool->lock, flags);
				1111	pool->low_water_triggered = 1;
				1112	spin_unlock_irqrestore(&pool->lock, flags);
				1113	dm_table_event(pool->ti->table);
				1114	}
				1115
				1116	if (!free_blocks) {
				1117	if (pool->no_free_space)
				1118	return -ENOSPC;
				1119	else {
				1120	/*
				1121	* Try to commit to see if that will free up some
				1122	* more space.
				1123	*/
				1124	r = dm_pool_commit_metadata(pool->pmd);
				1125	if (r) {
				1126	DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
				1127	__func__, r);
				1128	return r;
				1129	}
				1130
				1131	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
				1132	if (r)
				1133	return r;
				1134
				1135	/*
				1136	* If we still have no space we set a flag to avoid
				1137	* doing all this checking and return -ENOSPC.
				1138	*/
				1139	if (!free_blocks) {
				1140	DMWARN("%s: no free space available.",
				1141	dm_device_name(pool->pool_md));
				1142	spin_lock_irqsave(&pool->lock, flags);
				1143	pool->no_free_space = 1;
				1144	spin_unlock_irqrestore(&pool->lock, flags);
				1145	return -ENOSPC;
				1146	}
				1147	}
				1148	}
				1149
				1150	r = dm_pool_alloc_data_block(pool->pmd, result);
				1151	if (r)
				1152	return r;
				1153
				1154	return 0;
				1155	}
				1156
				1157	/*
				1158	* If we have run out of space, queue bios until the device is
				1159	* resumed, presumably after having been reloaded with more space.
				1160	*/
				1161	static void retry_on_resume(struct bio *bio)
				1162	{
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	1163	struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1164	struct thin_c *tc = h->tc;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1165	struct pool *pool = tc->pool;
				1166	unsigned long flags;
				1167
				1168	spin_lock_irqsave(&pool->lock, flags);
				1169	bio_list_add(&pool->retry_on_resume_list, bio);
				1170	spin_unlock_irqrestore(&pool->lock, flags);
				1171	}
				1172
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	1173	static void no_space(struct dm_bio_prison_cell *cell)
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1174	{
				1175	struct bio *bio;
				1176	struct bio_list bios;
				1177
				1178	bio_list_init(&bios);
				1179	cell_release(cell, &bios);
				1180
				1181	while ((bio = bio_list_pop(&bios)))
				1182	retry_on_resume(bio);
				1183	}
				1184
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	1185	static void process_discard(struct thin_c tc, struct bio bio)
				1186	{
				1187	int r;
Mike Snitzer	c3a0ce2	2012-05-12 01:43:16 +0100	[diff] [blame]	1188	unsigned long flags;
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	1189	struct pool *pool = tc->pool;
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	1190	struct dm_bio_prison_cell cell, cell2;
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	1191	struct cell_key key, key2;
				1192	dm_block_t block = get_bio_block(tc, bio);
				1193	struct dm_thin_lookup_result lookup_result;
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	1194	struct dm_thin_new_mapping *m;
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	1195
				1196	build_virtual_key(tc->td, block, &key);
				1197	if (bio_detain(tc->pool->prison, &key, bio, &cell))
				1198	return;
				1199
				1200	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
				1201	switch (r) {
				1202	case 0:
				1203	/*
				1204	* Check nobody is fiddling with this pool block. This can
				1205	* happen if someone's in the process of breaking sharing
				1206	* on this block.
				1207	*/
				1208	build_data_key(tc->td, lookup_result.block, &key2);
				1209	if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
				1210	cell_release_singleton(cell, bio);
				1211	break;
				1212	}
				1213
				1214	if (io_overlaps_block(pool, bio)) {
				1215	/*
				1216	* IO may still be going to the destination block. We must
				1217	* quiesce before we can do the removal.
				1218	*/
				1219	m = get_next_mapping(pool);
				1220	m->tc = tc;
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	1221	m->pass_discard = (!lookup_result.shared) & pool->pf.discard_passdown;
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	1222	m->virt_block = block;
				1223	m->data_block = lookup_result.block;
				1224	m->cell = cell;
				1225	m->cell2 = cell2;
				1226	m->err = 0;
				1227	m->bio = bio;
				1228
				1229	if (!ds_add_work(&pool->all_io_ds, &m->list)) {
Mike Snitzer	c3a0ce2	2012-05-12 01:43:16 +0100	[diff] [blame]	1230	spin_lock_irqsave(&pool->lock, flags);
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	1231	list_add(&m->list, &pool->prepared_discards);
Mike Snitzer	c3a0ce2	2012-05-12 01:43:16 +0100	[diff] [blame]	1232	spin_unlock_irqrestore(&pool->lock, flags);
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	1233	wake_worker(pool);
				1234	}
				1235	} else {
				1236	/*
				1237	* This path is hit if people are ignoring
				1238	* limits->discard_granularity. It ignores any
				1239	* part of the discard that is in a subsequent
				1240	* block.
				1241	*/
				1242	sector_t offset = bio->bi_sector - (block << pool->block_shift);
				1243	unsigned remaining = (pool->sectors_per_block - offset) << 9;
				1244	bio->bi_size = min(bio->bi_size, remaining);
				1245
				1246	cell_release_singleton(cell, bio);
				1247	cell_release_singleton(cell2, bio);
				1248	remap_and_issue(tc, bio, lookup_result.block);
				1249	}
				1250	break;
				1251
				1252	case -ENODATA:
				1253	/*
				1254	* It isn't provisioned, just forget it.
				1255	*/
				1256	cell_release_singleton(cell, bio);
				1257	bio_endio(bio, 0);
				1258	break;
				1259
				1260	default:
				1261	DMERR("discard: find block unexpectedly returned %d", r);
				1262	cell_release_singleton(cell, bio);
				1263	bio_io_error(bio);
				1264	break;
				1265	}
				1266	}
				1267
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1268	static void break_sharing(struct thin_c tc, struct bio bio, dm_block_t block,
				1269	struct cell_key *key,
				1270	struct dm_thin_lookup_result *lookup_result,
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	1271	struct dm_bio_prison_cell *cell)
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1272	{
				1273	int r;
				1274	dm_block_t data_block;
				1275
				1276	r = alloc_data_block(tc, &data_block);
				1277	switch (r) {
				1278	case 0:
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	1279	schedule_internal_copy(tc, block, lookup_result->block,
				1280	data_block, cell, bio);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1281	break;
				1282
				1283	case -ENOSPC:
				1284	no_space(cell);
				1285	break;
				1286
				1287	default:
				1288	DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
				1289	cell_error(cell);
				1290	break;
				1291	}
				1292	}
				1293
				1294	static void process_shared_bio(struct thin_c tc, struct bio bio,
				1295	dm_block_t block,
				1296	struct dm_thin_lookup_result *lookup_result)
				1297	{
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	1298	struct dm_bio_prison_cell *cell;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1299	struct pool *pool = tc->pool;
				1300	struct cell_key key;
				1301
				1302	/*
				1303	* If cell is already occupied, then sharing is already in the process
				1304	* of being broken so we have nothing further to do here.
				1305	*/
				1306	build_data_key(tc->td, lookup_result->block, &key);
				1307	if (bio_detain(pool->prison, &key, bio, &cell))
				1308	return;
				1309
				1310	if (bio_data_dir(bio) == WRITE)
				1311	break_sharing(tc, bio, block, &key, lookup_result, cell);
				1312	else {
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	1313	struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1314
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1315	h->shared_read_entry = ds_inc(&pool->shared_read_ds);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1316
				1317	cell_release_singleton(cell, bio);
				1318	remap_and_issue(tc, bio, lookup_result->block);
				1319	}
				1320	}
				1321
				1322	static void provision_block(struct thin_c tc, struct bio bio, dm_block_t block,
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	1323	struct dm_bio_prison_cell *cell)
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1324	{
				1325	int r;
				1326	dm_block_t data_block;
				1327
				1328	/*
				1329	* Remap empty bios (flushes) immediately, without provisioning.
				1330	*/
				1331	if (!bio->bi_size) {
				1332	cell_release_singleton(cell, bio);
				1333	remap_and_issue(tc, bio, 0);
				1334	return;
				1335	}
				1336
				1337	/*
				1338	* Fill read bios with zeroes and complete them immediately.
				1339	*/
				1340	if (bio_data_dir(bio) == READ) {
				1341	zero_fill_bio(bio);
				1342	cell_release_singleton(cell, bio);
				1343	bio_endio(bio, 0);
				1344	return;
				1345	}
				1346
				1347	r = alloc_data_block(tc, &data_block);
				1348	switch (r) {
				1349	case 0:
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	1350	if (tc->origin_dev)
				1351	schedule_external_copy(tc, block, data_block, cell, bio);
				1352	else
				1353	schedule_zero(tc, block, data_block, cell, bio);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1354	break;
				1355
				1356	case -ENOSPC:
				1357	no_space(cell);
				1358	break;
				1359
				1360	default:
				1361	DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
				1362	cell_error(cell);
				1363	break;
				1364	}
				1365	}
				1366
				1367	static void process_bio(struct thin_c tc, struct bio bio)
				1368	{
				1369	int r;
				1370	dm_block_t block = get_bio_block(tc, bio);
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	1371	struct dm_bio_prison_cell *cell;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1372	struct cell_key key;
				1373	struct dm_thin_lookup_result lookup_result;
				1374
				1375	/*
				1376	* If cell is already occupied, then the block is already
				1377	* being provisioned so we have nothing further to do here.
				1378	*/
				1379	build_virtual_key(tc->td, block, &key);
				1380	if (bio_detain(tc->pool->prison, &key, bio, &cell))
				1381	return;
				1382
				1383	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
				1384	switch (r) {
				1385	case 0:
				1386	/*
				1387	* We can release this cell now. This thread is the only
				1388	* one that puts bios into a cell, and we know there were
				1389	* no preceding bios.
				1390	*/
				1391	/*
				1392	* TODO: this will probably have to change when discard goes
				1393	* back in.
				1394	*/
				1395	cell_release_singleton(cell, bio);
				1396
				1397	if (lookup_result.shared)
				1398	process_shared_bio(tc, bio, block, &lookup_result);
				1399	else
				1400	remap_and_issue(tc, bio, lookup_result.block);
				1401	break;
				1402
				1403	case -ENODATA:
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	1404	if (bio_data_dir(bio) == READ && tc->origin_dev) {
				1405	cell_release_singleton(cell, bio);
				1406	remap_to_origin_and_issue(tc, bio);
				1407	} else
				1408	provision_block(tc, bio, block, cell);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1409	break;
				1410
				1411	default:
				1412	DMERR("dm_thin_find_block() failed, error = %d", r);
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	1413	cell_release_singleton(cell, bio);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1414	bio_io_error(bio);
				1415	break;
				1416	}
				1417	}
				1418
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	1419	static int need_commit_due_to_time(struct pool *pool)
				1420	{
				1421	return jiffies < pool->last_commit_jiffies \|\|
				1422	jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
				1423	}
				1424
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1425	static void process_deferred_bios(struct pool *pool)
				1426	{
				1427	unsigned long flags;
				1428	struct bio *bio;
				1429	struct bio_list bios;
				1430	int r;
				1431
				1432	bio_list_init(&bios);
				1433
				1434	spin_lock_irqsave(&pool->lock, flags);
				1435	bio_list_merge(&bios, &pool->deferred_bios);
				1436	bio_list_init(&pool->deferred_bios);
				1437	spin_unlock_irqrestore(&pool->lock, flags);
				1438
				1439	while ((bio = bio_list_pop(&bios))) {
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	1440	struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1441	struct thin_c *tc = h->tc;
				1442
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1443	/*
				1444	* If we've got no free new_mapping structs, and processing
				1445	* this bio might require one, we pause until there are some
				1446	* prepared mappings to process.
				1447	*/
				1448	if (ensure_next_mapping(pool)) {
				1449	spin_lock_irqsave(&pool->lock, flags);
				1450	bio_list_merge(&pool->deferred_bios, &bios);
				1451	spin_unlock_irqrestore(&pool->lock, flags);
				1452
				1453	break;
				1454	}
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	1455
				1456	if (bio->bi_rw & REQ_DISCARD)
				1457	process_discard(tc, bio);
				1458	else
				1459	process_bio(tc, bio);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1460	}
				1461
				1462	/*
				1463	* If there are any deferred flush bios, we must commit
				1464	* the metadata before issuing them.
				1465	*/
				1466	bio_list_init(&bios);
				1467	spin_lock_irqsave(&pool->lock, flags);
				1468	bio_list_merge(&bios, &pool->deferred_flush_bios);
				1469	bio_list_init(&pool->deferred_flush_bios);
				1470	spin_unlock_irqrestore(&pool->lock, flags);
				1471
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	1472	if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1473	return;
				1474
				1475	r = dm_pool_commit_metadata(pool->pmd);
				1476	if (r) {
				1477	DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
				1478	__func__, r);
				1479	while ((bio = bio_list_pop(&bios)))
				1480	bio_io_error(bio);
				1481	return;
				1482	}
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	1483	pool->last_commit_jiffies = jiffies;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1484
				1485	while ((bio = bio_list_pop(&bios)))
				1486	generic_make_request(bio);
				1487	}
				1488
				1489	static void do_worker(struct work_struct *ws)
				1490	{
				1491	struct pool *pool = container_of(ws, struct pool, worker);
				1492
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	1493	process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping);
				1494	process_prepared(pool, &pool->prepared_discards, process_prepared_discard);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1495	process_deferred_bios(pool);
				1496	}
				1497
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	1498	/*
				1499	* We want to commit periodically so that not too much
				1500	* unwritten data builds up.
				1501	*/
				1502	static void do_waker(struct work_struct *ws)
				1503	{
				1504	struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
				1505	wake_worker(pool);
				1506	queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
				1507	}
				1508
/*----------------------------------------------------------------*/
				1510
				1511	/*
				1512	* Mapping functions.
				1513	*/
				1514
				1515	/*
				1516	* Called only while mapping a thin bio to hand it over to the workqueue.
				1517	*/
				1518	static void thin_defer_bio(struct thin_c tc, struct bio bio)
				1519	{
				1520	unsigned long flags;
				1521	struct pool *pool = tc->pool;
				1522
				1523	spin_lock_irqsave(&pool->lock, flags);
				1524	bio_list_add(&pool->deferred_bios, bio);
				1525	spin_unlock_irqrestore(&pool->lock, flags);
				1526
				1527	wake_worker(pool);
				1528	}
				1529
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	1530	static struct dm_thin_endio_hook thin_hook_bio(struct thin_c tc, struct bio *bio)
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1531	{
				1532	struct pool *pool = tc->pool;
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	1533	struct dm_thin_endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1534
				1535	h->tc = tc;
				1536	h->shared_read_entry = NULL;
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	1537	h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds);
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1538	h->overwrite_mapping = NULL;
				1539
				1540	return h;
				1541	}
				1542
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1543	/*
				1544	* Non-blocking function called from the thin target's map function.
				1545	*/
				1546	static int thin_bio_map(struct dm_target ti, struct bio bio,
				1547	union map_info *map_context)
				1548	{
				1549	int r;
				1550	struct thin_c *tc = ti->private;
				1551	dm_block_t block = get_bio_block(tc, bio);
				1552	struct dm_thin_device *td = tc->td;
				1553	struct dm_thin_lookup_result result;
				1554
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1555	map_context->ptr = thin_hook_bio(tc, bio);
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	1556	if (bio->bi_rw & (REQ_DISCARD \| REQ_FLUSH \| REQ_FUA)) {
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1557	thin_defer_bio(tc, bio);
				1558	return DM_MAPIO_SUBMITTED;
				1559	}
				1560
				1561	r = dm_thin_find_block(td, block, 0, &result);
				1562
				1563	/*
				1564	* Note that we defer readahead too.
				1565	*/
				1566	switch (r) {
				1567	case 0:
				1568	if (unlikely(result.shared)) {
				1569	/*
				1570	* We have a race condition here between the
				1571	* result.shared value returned by the lookup and
				1572	* snapshot creation, which may cause new
				1573	* sharing.
				1574	*
				1575	* To avoid this always quiesce the origin before
				1576	* taking the snap. You want to do this anyway to
				1577	* ensure a consistent application view
				1578	* (i.e. lockfs).
				1579	*
				1580	* More distant ancestors are irrelevant. The
				1581	* shared flag will be set in their case.
				1582	*/
				1583	thin_defer_bio(tc, bio);
				1584	r = DM_MAPIO_SUBMITTED;
				1585	} else {
				1586	remap(tc, bio, result.block);
				1587	r = DM_MAPIO_REMAPPED;
				1588	}
				1589	break;
				1590
				1591	case -ENODATA:
				1592	/*
				1593	* In future, the failed dm_thin_find_block above could
				1594	* provide the hint to load the metadata into cache.
				1595	*/
				1596	case -EWOULDBLOCK:
				1597	thin_defer_bio(tc, bio);
				1598	r = DM_MAPIO_SUBMITTED;
				1599	break;
				1600	}
				1601
				1602	return r;
				1603	}
				1604
				1605	static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
				1606	{
				1607	int r;
				1608	unsigned long flags;
				1609	struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
				1610
				1611	spin_lock_irqsave(&pt->pool->lock, flags);
				1612	r = !bio_list_empty(&pt->pool->retry_on_resume_list);
				1613	spin_unlock_irqrestore(&pt->pool->lock, flags);
				1614
				1615	if (!r) {
				1616	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
				1617	r = bdi_congested(&q->backing_dev_info, bdi_bits);
				1618	}
				1619
				1620	return r;
				1621	}
				1622
				1623	static void __requeue_bios(struct pool *pool)
				1624	{
				1625	bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list);
				1626	bio_list_init(&pool->retry_on_resume_list);
				1627	}
				1628
/*----------------------------------------------------------------
 * Binding of control targets to a pool object
 *--------------------------------------------------------------*/
				1632	static int bind_control_target(struct pool pool, struct dm_target ti)
				1633	{
				1634	struct pool_c *pt = ti->private;
				1635
				1636	pool->ti = ti;
				1637	pool->low_water_blocks = pt->low_water_blocks;
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	1638	pool->pf = pt->pf;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1639
Mike Snitzer	f402693	2012-05-19 01:01:01 +0100	[diff] [blame]	1640	/*
				1641	* If discard_passdown was enabled verify that the data device
				1642	* supports discards. Disable discard_passdown if not; otherwise
				1643	* -EOPNOTSUPP will be returned.
				1644	*/
				1645	if (pt->pf.discard_passdown) {
				1646	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
				1647	if (!q \|\| !blk_queue_discard(q)) {
				1648	char buf[BDEVNAME_SIZE];
				1649	DMWARN("Discard unsupported by data device (%s): Disabling discard passdown.",
				1650	bdevname(pt->data_dev->bdev, buf));
				1651	pool->pf.discard_passdown = 0;
				1652	}
				1653	}
				1654
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1655	return 0;
				1656	}
				1657
				1658	static void unbind_control_target(struct pool pool, struct dm_target ti)
				1659	{
				1660	if (pool->ti == ti)
				1661	pool->ti = NULL;
				1662	}
				1663
/*----------------------------------------------------------------
 * Pool creation
 *--------------------------------------------------------------*/
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	1667	/* Initialize pool features. */
				1668	static void pool_features_init(struct pool_features *pf)
				1669	{
				1670	pf->zero_new_blocks = 1;
				1671	pf->discard_enabled = 1;
				1672	pf->discard_passdown = 1;
				1673	}
				1674
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1675	static void __pool_destroy(struct pool *pool)
				1676	{
				1677	__pool_table_remove(pool);
				1678
				1679	if (dm_pool_metadata_close(pool->pmd) < 0)
				1680	DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
				1681
				1682	prison_destroy(pool->prison);
				1683	dm_kcopyd_client_destroy(pool->copier);
				1684
				1685	if (pool->wq)
				1686	destroy_workqueue(pool->wq);
				1687
				1688	if (pool->next_mapping)
				1689	mempool_free(pool->next_mapping, pool->mapping_pool);
				1690	mempool_destroy(pool->mapping_pool);
				1691	mempool_destroy(pool->endio_hook_pool);
				1692	kfree(pool);
				1693	}
				1694
/*
 * Slab caches passed to mempool_create_slab_pool() in pool_create() to
 * back each pool's mapping_pool and endio_hook_pool respectively.
 */
static struct kmem_cache *_new_mapping_cache;
static struct kmem_cache *_endio_hook_cache;
				1697
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1698	static struct pool pool_create(struct mapped_device pool_md,
				1699	struct block_device *metadata_dev,
				1700	unsigned long block_size, char **error)
				1701	{
				1702	int r;
				1703	void *err_p;
				1704	struct pool *pool;
				1705	struct dm_pool_metadata *pmd;
				1706
				1707	pmd = dm_pool_metadata_open(metadata_dev, block_size);
				1708	if (IS_ERR(pmd)) {
				1709	*error = "Error creating metadata object";
				1710	return (struct pool *)pmd;
				1711	}
				1712
				1713	pool = kmalloc(sizeof(*pool), GFP_KERNEL);
				1714	if (!pool) {
				1715	*error = "Error allocating memory for pool";
				1716	err_p = ERR_PTR(-ENOMEM);
				1717	goto bad_pool;
				1718	}
				1719
				1720	pool->pmd = pmd;
				1721	pool->sectors_per_block = block_size;
				1722	pool->block_shift = ffs(block_size) - 1;
				1723	pool->offset_mask = block_size - 1;
				1724	pool->low_water_blocks = 0;
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	1725	pool_features_init(&pool->pf);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1726	pool->prison = prison_create(PRISON_CELLS);
				1727	if (!pool->prison) {
				1728	*error = "Error creating pool's bio prison";
				1729	err_p = ERR_PTR(-ENOMEM);
				1730	goto bad_prison;
				1731	}
				1732
				1733	pool->copier = dm_kcopyd_client_create();
				1734	if (IS_ERR(pool->copier)) {
				1735	r = PTR_ERR(pool->copier);
				1736	*error = "Error creating pool's kcopyd client";
				1737	err_p = ERR_PTR(r);
				1738	goto bad_kcopyd_client;
				1739	}
				1740
				1741	/*
				1742	* Create singlethreaded workqueue that will service all devices
				1743	* that use this metadata.
				1744	*/
				1745	pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
				1746	if (!pool->wq) {
				1747	*error = "Error creating pool's workqueue";
				1748	err_p = ERR_PTR(-ENOMEM);
				1749	goto bad_wq;
				1750	}
				1751
				1752	INIT_WORK(&pool->worker, do_worker);
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	1753	INIT_DELAYED_WORK(&pool->waker, do_waker);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1754	spin_lock_init(&pool->lock);
				1755	bio_list_init(&pool->deferred_bios);
				1756	bio_list_init(&pool->deferred_flush_bios);
				1757	INIT_LIST_HEAD(&pool->prepared_mappings);
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	1758	INIT_LIST_HEAD(&pool->prepared_discards);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1759	pool->low_water_triggered = 0;
				1760	pool->no_free_space = 0;
				1761	bio_list_init(&pool->retry_on_resume_list);
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1762	ds_init(&pool->shared_read_ds);
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	1763	ds_init(&pool->all_io_ds);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1764
				1765	pool->next_mapping = NULL;
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	1766	pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE,
				1767	_new_mapping_cache);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1768	if (!pool->mapping_pool) {
				1769	*error = "Error creating pool's mapping mempool";
				1770	err_p = ERR_PTR(-ENOMEM);
				1771	goto bad_mapping_pool;
				1772	}
				1773
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	1774	pool->endio_hook_pool = mempool_create_slab_pool(ENDIO_HOOK_POOL_SIZE,
				1775	_endio_hook_cache);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1776	if (!pool->endio_hook_pool) {
				1777	*error = "Error creating pool's endio_hook mempool";
				1778	err_p = ERR_PTR(-ENOMEM);
				1779	goto bad_endio_hook_pool;
				1780	}
				1781	pool->ref_count = 1;
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	1782	pool->last_commit_jiffies = jiffies;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1783	pool->pool_md = pool_md;
				1784	pool->md_dev = metadata_dev;
				1785	__pool_table_insert(pool);
				1786
				1787	return pool;
				1788
				1789	bad_endio_hook_pool:
				1790	mempool_destroy(pool->mapping_pool);
				1791	bad_mapping_pool:
				1792	destroy_workqueue(pool->wq);
				1793	bad_wq:
				1794	dm_kcopyd_client_destroy(pool->copier);
				1795	bad_kcopyd_client:
				1796	prison_destroy(pool->prison);
				1797	bad_prison:
				1798	kfree(pool);
				1799	bad_pool:
				1800	if (dm_pool_metadata_close(pmd))
				1801	DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
				1802
				1803	return err_p;
				1804	}
				1805
				1806	static void __pool_inc(struct pool *pool)
				1807	{
				1808	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
				1809	pool->ref_count++;
				1810	}
				1811
				1812	static void __pool_dec(struct pool *pool)
				1813	{
				1814	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
				1815	BUG_ON(!pool->ref_count);
				1816	if (!--pool->ref_count)
				1817	__pool_destroy(pool);
				1818	}
				1819
				1820	static struct pool __pool_find(struct mapped_device pool_md,
				1821	struct block_device *metadata_dev,
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	1822	unsigned long block_size, char **error,
				1823	int *created)
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1824	{
				1825	struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
				1826
				1827	if (pool) {
				1828	if (pool->pool_md != pool_md)
				1829	return ERR_PTR(-EBUSY);
				1830	__pool_inc(pool);
				1831
				1832	} else {
				1833	pool = __pool_table_lookup(pool_md);
				1834	if (pool) {
				1835	if (pool->md_dev != metadata_dev)
				1836	return ERR_PTR(-EINVAL);
				1837	__pool_inc(pool);
				1838
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	1839	} else {
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1840	pool = pool_create(pool_md, metadata_dev, block_size, error);
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	1841	*created = 1;
				1842	}
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1843	}
				1844
				1845	return pool;
				1846	}
				1847
/*----------------------------------------------------------------
 * Pool target methods
 *--------------------------------------------------------------*/
				1851	static void pool_dtr(struct dm_target *ti)
				1852	{
				1853	struct pool_c *pt = ti->private;
				1854
				1855	mutex_lock(&dm_thin_pool_table.mutex);
				1856
				1857	unbind_control_target(pt->pool, ti);
				1858	__pool_dec(pt->pool);
				1859	dm_put_device(ti, pt->metadata_dev);
				1860	dm_put_device(ti, pt->data_dev);
				1861	kfree(pt);
				1862
				1863	mutex_unlock(&dm_thin_pool_table.mutex);
				1864	}
				1865
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1866	static int parse_pool_features(struct dm_arg_set as, struct pool_features pf,
				1867	struct dm_target *ti)
				1868	{
				1869	int r;
				1870	unsigned argc;
				1871	const char *arg_name;
				1872
				1873	static struct dm_arg _args[] = {
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	1874	{0, 3, "Invalid number of pool feature arguments"},
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1875	};
				1876
				1877	/*
				1878	* No feature arguments supplied.
				1879	*/
				1880	if (!as->argc)
				1881	return 0;
				1882
				1883	r = dm_read_arg_group(_args, as, &argc, &ti->error);
				1884	if (r)
				1885	return -EINVAL;
				1886
				1887	while (argc && !r) {
				1888	arg_name = dm_shift_arg(as);
				1889	argc--;
				1890
				1891	if (!strcasecmp(arg_name, "skip_block_zeroing")) {
				1892	pf->zero_new_blocks = 0;
				1893	continue;
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	1894	} else if (!strcasecmp(arg_name, "ignore_discard")) {
				1895	pf->discard_enabled = 0;
				1896	continue;
				1897	} else if (!strcasecmp(arg_name, "no_discard_passdown")) {
				1898	pf->discard_passdown = 0;
				1899	continue;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1900	}
				1901
				1902	ti->error = "Unrecognised pool feature requested";
				1903	r = -EINVAL;
				1904	}
				1905
				1906	return r;
				1907	}
				1908
				1909	/*
				1910	* thin-pool <metadata dev> <data dev>
				1911	* <data block size (sectors)>
				1912	* <low water mark (blocks)>
				1913	* [<#feature args> [<arg>]*]
				1914	*
				1915	* Optional feature arguments are:
				1916	* skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	1917	* ignore_discard: disable discard
				1918	* no_discard_passdown: don't pass discards down to the data device
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1919	*/
				1920	static int pool_ctr(struct dm_target ti, unsigned argc, char *argv)
				1921	{
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	1922	int r, pool_created = 0;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1923	struct pool_c *pt;
				1924	struct pool *pool;
				1925	struct pool_features pf;
				1926	struct dm_arg_set as;
				1927	struct dm_dev *data_dev;
				1928	unsigned long block_size;
				1929	dm_block_t low_water_blocks;
				1930	struct dm_dev *metadata_dev;
				1931	sector_t metadata_dev_size;
Mike Snitzer	c4a69ec	2012-03-28 18:41:28 +0100	[diff] [blame]	1932	char b[BDEVNAME_SIZE];
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1933
				1934	/*
				1935	* FIXME Remove validation from scope of lock.
				1936	*/
				1937	mutex_lock(&dm_thin_pool_table.mutex);
				1938
				1939	if (argc < 4) {
				1940	ti->error = "Invalid argument count";
				1941	r = -EINVAL;
				1942	goto out_unlock;
				1943	}
				1944	as.argc = argc;
				1945	as.argv = argv;
				1946
				1947	r = dm_get_device(ti, argv[0], FMODE_READ \| FMODE_WRITE, &metadata_dev);
				1948	if (r) {
				1949	ti->error = "Error opening metadata block device";
				1950	goto out_unlock;
				1951	}
				1952
				1953	metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
Mike Snitzer	c4a69ec	2012-03-28 18:41:28 +0100	[diff] [blame]	1954	if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
				1955	DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
				1956	bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1957
				1958	r = dm_get_device(ti, argv[1], FMODE_READ \| FMODE_WRITE, &data_dev);
				1959	if (r) {
				1960	ti->error = "Error getting data device";
				1961	goto out_metadata;
				1962	}
				1963
				1964	if (kstrtoul(argv[2], 10, &block_size) \|\| !block_size \|\|
				1965	block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS \|\|
				1966	block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS \|\|
				1967	!is_power_of_2(block_size)) {
				1968	ti->error = "Invalid block size";
				1969	r = -EINVAL;
				1970	goto out;
				1971	}
				1972
				1973	if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
				1974	ti->error = "Invalid low water mark";
				1975	r = -EINVAL;
				1976	goto out;
				1977	}
				1978
				1979	/*
				1980	* Set default pool features.
				1981	*/
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	1982	pool_features_init(&pf);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1983
				1984	dm_consume_args(&as, 4);
				1985	r = parse_pool_features(&as, &pf, ti);
				1986	if (r)
				1987	goto out;
				1988
				1989	pt = kzalloc(sizeof(*pt), GFP_KERNEL);
				1990	if (!pt) {
				1991	r = -ENOMEM;
				1992	goto out;
				1993	}
				1994
				1995	pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	1996	block_size, &ti->error, &pool_created);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1997	if (IS_ERR(pool)) {
				1998	r = PTR_ERR(pool);
				1999	goto out_free_pt;
				2000	}
				2001
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	2002	/*
				2003	* 'pool_created' reflects whether this is the first table load.
				2004	* Top level discard support is not allowed to be changed after
				2005	* initial load. This would require a pool reload to trigger thin
				2006	* device changes.
				2007	*/
				2008	if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
				2009	ti->error = "Discard support cannot be disabled once enabled";
				2010	r = -EINVAL;
				2011	goto out_flags_changed;
				2012	}
				2013
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2014	pt->pool = pool;
				2015	pt->ti = ti;
				2016	pt->metadata_dev = metadata_dev;
				2017	pt->data_dev = data_dev;
				2018	pt->low_water_blocks = low_water_blocks;
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	2019	pt->pf = pf;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2020	ti->num_flush_requests = 1;
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	2021	/*
				2022	* Only need to enable discards if the pool should pass
				2023	* them down to the data device. The thin device's discard
				2024	* processing will cause mappings to be removed from the btree.
				2025	*/
				2026	if (pf.discard_enabled && pf.discard_passdown) {
				2027	ti->num_discard_requests = 1;
				2028	/*
				2029	* Setting 'discards_supported' circumvents the normal
				2030	* stacking of discard limits (this keeps the pool and
				2031	* thin devices' discard limits consistent).
				2032	*/
				2033	ti->discards_supported = 1;
				2034	}
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2035	ti->private = pt;
				2036
				2037	pt->callbacks.congested_fn = pool_is_congested;
				2038	dm_table_add_target_callbacks(ti->table, &pt->callbacks);
				2039
				2040	mutex_unlock(&dm_thin_pool_table.mutex);
				2041
				2042	return 0;
				2043
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	2044	out_flags_changed:
				2045	__pool_dec(pool);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2046	out_free_pt:
				2047	kfree(pt);
				2048	out:
				2049	dm_put_device(ti, data_dev);
				2050	out_metadata:
				2051	dm_put_device(ti, metadata_dev);
				2052	out_unlock:
				2053	mutex_unlock(&dm_thin_pool_table.mutex);
				2054
				2055	return r;
				2056	}
				2057
				2058	static int pool_map(struct dm_target ti, struct bio bio,
				2059	union map_info *map_context)
				2060	{
				2061	int r;
				2062	struct pool_c *pt = ti->private;
				2063	struct pool *pool = pt->pool;
				2064	unsigned long flags;
				2065
				2066	/*
				2067	* As this is a singleton target, ti->begin is always zero.
				2068	*/
				2069	spin_lock_irqsave(&pool->lock, flags);
				2070	bio->bi_bdev = pt->data_dev->bdev;
				2071	r = DM_MAPIO_REMAPPED;
				2072	spin_unlock_irqrestore(&pool->lock, flags);
				2073
				2074	return r;
				2075	}
				2076
				2077	/*
				2078	* Retrieves the number of blocks of the data device from
				2079	* the superblock and compares it to the actual device size,
				2080	* thus resizing the data device in case it has grown.
				2081	*
				2082	* This both copes with opening preallocated data devices in the ctr
				2083	* being followed by a resume
				2084	* -and-
				2085	* calling the resume method individually after userspace has
				2086	* grown the data device in reaction to a table event.
				2087	*/
				2088	static int pool_preresume(struct dm_target *ti)
				2089	{
				2090	int r;
				2091	struct pool_c *pt = ti->private;
				2092	struct pool *pool = pt->pool;
				2093	dm_block_t data_size, sb_data_size;
				2094
				2095	/*
				2096	* Take control of the pool object.
				2097	*/
				2098	r = bind_control_target(pool, ti);
				2099	if (r)
				2100	return r;
				2101
				2102	data_size = ti->len >> pool->block_shift;
				2103	r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
				2104	if (r) {
				2105	DMERR("failed to retrieve data device size");
				2106	return r;
				2107	}
				2108
				2109	if (data_size < sb_data_size) {
				2110	DMERR("pool target too small, is %llu blocks (expected %llu)",
				2111	data_size, sb_data_size);
				2112	return -EINVAL;
				2113
				2114	} else if (data_size > sb_data_size) {
				2115	r = dm_pool_resize_data_dev(pool->pmd, data_size);
				2116	if (r) {
				2117	DMERR("failed to resize data device");
				2118	return r;
				2119	}
				2120
				2121	r = dm_pool_commit_metadata(pool->pmd);
				2122	if (r) {
				2123	DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
				2124	__func__, r);
				2125	return r;
				2126	}
				2127	}
				2128
				2129	return 0;
				2130	}
				2131
				2132	static void pool_resume(struct dm_target *ti)
				2133	{
				2134	struct pool_c *pt = ti->private;
				2135	struct pool *pool = pt->pool;
				2136	unsigned long flags;
				2137
				2138	spin_lock_irqsave(&pool->lock, flags);
				2139	pool->low_water_triggered = 0;
				2140	pool->no_free_space = 0;
				2141	__requeue_bios(pool);
				2142	spin_unlock_irqrestore(&pool->lock, flags);
				2143
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	2144	do_waker(&pool->waker.work);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2145	}
				2146
				2147	static void pool_postsuspend(struct dm_target *ti)
				2148	{
				2149	int r;
				2150	struct pool_c *pt = ti->private;
				2151	struct pool *pool = pt->pool;
				2152
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	2153	cancel_delayed_work(&pool->waker);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2154	flush_workqueue(pool->wq);
				2155
				2156	r = dm_pool_commit_metadata(pool->pmd);
				2157	if (r < 0) {
				2158	DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
				2159	__func__, r);
				2160	/* FIXME: invalidate device? error the next FUA or FLUSH bio ?*/
				2161	}
				2162	}
				2163
				2164	static int check_arg_count(unsigned argc, unsigned args_required)
				2165	{
				2166	if (argc != args_required) {
				2167	DMWARN("Message received with %u arguments instead of %u.",
				2168	argc, args_required);
				2169	return -EINVAL;
				2170	}
				2171
				2172	return 0;
				2173	}
				2174
				2175	static int read_dev_id(char arg, dm_thin_id dev_id, int warning)
				2176	{
				2177	if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
				2178	*dev_id <= MAX_DEV_ID)
				2179	return 0;
				2180
				2181	if (warning)
				2182	DMWARN("Message received with invalid device id: %s", arg);
				2183
				2184	return -EINVAL;
				2185	}
				2186
				2187	static int process_create_thin_mesg(unsigned argc, char *argv, struct pool pool)
				2188	{
				2189	dm_thin_id dev_id;
				2190	int r;
				2191
				2192	r = check_arg_count(argc, 2);
				2193	if (r)
				2194	return r;
				2195
				2196	r = read_dev_id(argv[1], &dev_id, 1);
				2197	if (r)
				2198	return r;
				2199
				2200	r = dm_pool_create_thin(pool->pmd, dev_id);
				2201	if (r) {
				2202	DMWARN("Creation of new thinly-provisioned device with id %s failed.",
				2203	argv[1]);
				2204	return r;
				2205	}
				2206
				2207	return 0;
				2208	}
				2209
				2210	static int process_create_snap_mesg(unsigned argc, char *argv, struct pool pool)
				2211	{
				2212	dm_thin_id dev_id;
				2213	dm_thin_id origin_dev_id;
				2214	int r;
				2215
				2216	r = check_arg_count(argc, 3);
				2217	if (r)
				2218	return r;
				2219
				2220	r = read_dev_id(argv[1], &dev_id, 1);
				2221	if (r)
				2222	return r;
				2223
				2224	r = read_dev_id(argv[2], &origin_dev_id, 1);
				2225	if (r)
				2226	return r;
				2227
				2228	r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
				2229	if (r) {
				2230	DMWARN("Creation of new snapshot %s of device %s failed.",
				2231	argv[1], argv[2]);
				2232	return r;
				2233	}
				2234
				2235	return 0;
				2236	}
				2237
				2238	static int process_delete_mesg(unsigned argc, char *argv, struct pool pool)
				2239	{
				2240	dm_thin_id dev_id;
				2241	int r;
				2242
				2243	r = check_arg_count(argc, 2);
				2244	if (r)
				2245	return r;
				2246
				2247	r = read_dev_id(argv[1], &dev_id, 1);
				2248	if (r)
				2249	return r;
				2250
				2251	r = dm_pool_delete_thin_device(pool->pmd, dev_id);
				2252	if (r)
				2253	DMWARN("Deletion of thin device %s failed.", argv[1]);
				2254
				2255	return r;
				2256	}
				2257
				2258	static int process_set_transaction_id_mesg(unsigned argc, char *argv, struct pool pool)
				2259	{
				2260	dm_thin_id old_id, new_id;
				2261	int r;
				2262
				2263	r = check_arg_count(argc, 3);
				2264	if (r)
				2265	return r;
				2266
				2267	if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
				2268	DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
				2269	return -EINVAL;
				2270	}
				2271
				2272	if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
				2273	DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
				2274	return -EINVAL;
				2275	}
				2276
				2277	r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
				2278	if (r) {
				2279	DMWARN("Failed to change transaction id from %s to %s.",
				2280	argv[1], argv[2]);
				2281	return r;
				2282	}
				2283
				2284	return 0;
				2285	}
				2286
				2287	/*
				2288	* Messages supported:
				2289	* create_thin <dev_id>
				2290	* create_snap <dev_id> <origin_id>
				2291	* delete <dev_id>
				2292	* trim <dev_id> <new_size_in_sectors>
				2293	* set_transaction_id <current_trans_id> <new_trans_id>
				2294	*/
				2295	static int pool_message(struct dm_target ti, unsigned argc, char *argv)
				2296	{
				2297	int r = -EINVAL;
				2298	struct pool_c *pt = ti->private;
				2299	struct pool *pool = pt->pool;
				2300
				2301	if (!strcasecmp(argv[0], "create_thin"))
				2302	r = process_create_thin_mesg(argc, argv, pool);
				2303
				2304	else if (!strcasecmp(argv[0], "create_snap"))
				2305	r = process_create_snap_mesg(argc, argv, pool);
				2306
				2307	else if (!strcasecmp(argv[0], "delete"))
				2308	r = process_delete_mesg(argc, argv, pool);
				2309
				2310	else if (!strcasecmp(argv[0], "set_transaction_id"))
				2311	r = process_set_transaction_id_mesg(argc, argv, pool);
				2312
				2313	else
				2314	DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
				2315
				2316	if (!r) {
				2317	r = dm_pool_commit_metadata(pool->pmd);
				2318	if (r)
				2319	DMERR("%s message: dm_pool_commit_metadata() failed, error = %d",
				2320	argv[0], r);
				2321	}
				2322
				2323	return r;
				2324	}
				2325
				2326	/*
				2327	* Status line is:
				2328	* <transaction id> <used metadata sectors>/<total metadata sectors>
				2329	* <used data sectors>/<total data sectors> <held metadata root>
				2330	*/
				2331	static int pool_status(struct dm_target *ti, status_type_t type,
				2332	char *result, unsigned maxlen)
				2333	{
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	2334	int r, count;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2335	unsigned sz = 0;
				2336	uint64_t transaction_id;
				2337	dm_block_t nr_free_blocks_data;
				2338	dm_block_t nr_free_blocks_metadata;
				2339	dm_block_t nr_blocks_data;
				2340	dm_block_t nr_blocks_metadata;
				2341	dm_block_t held_root;
				2342	char buf[BDEVNAME_SIZE];
				2343	char buf2[BDEVNAME_SIZE];
				2344	struct pool_c *pt = ti->private;
				2345	struct pool *pool = pt->pool;
				2346
				2347	switch (type) {
				2348	case STATUSTYPE_INFO:
				2349	r = dm_pool_get_metadata_transaction_id(pool->pmd,
				2350	&transaction_id);
				2351	if (r)
				2352	return r;
				2353
				2354	r = dm_pool_get_free_metadata_block_count(pool->pmd,
				2355	&nr_free_blocks_metadata);
				2356	if (r)
				2357	return r;
				2358
				2359	r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
				2360	if (r)
				2361	return r;
				2362
				2363	r = dm_pool_get_free_block_count(pool->pmd,
				2364	&nr_free_blocks_data);
				2365	if (r)
				2366	return r;
				2367
				2368	r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
				2369	if (r)
				2370	return r;
				2371
				2372	r = dm_pool_get_held_metadata_root(pool->pmd, &held_root);
				2373	if (r)
				2374	return r;
				2375
				2376	DMEMIT("%llu %llu/%llu %llu/%llu ",
				2377	(unsigned long long)transaction_id,
				2378	(unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
				2379	(unsigned long long)nr_blocks_metadata,
				2380	(unsigned long long)(nr_blocks_data - nr_free_blocks_data),
				2381	(unsigned long long)nr_blocks_data);
				2382
				2383	if (held_root)
				2384	DMEMIT("%llu", held_root);
				2385	else
				2386	DMEMIT("-");
				2387
				2388	break;
				2389
				2390	case STATUSTYPE_TABLE:
				2391	DMEMIT("%s %s %lu %llu ",
				2392	format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
				2393	format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
				2394	(unsigned long)pool->sectors_per_block,
				2395	(unsigned long long)pt->low_water_blocks);
				2396
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	2397	count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled +
Mike Snitzer	f402693	2012-05-19 01:01:01 +0100	[diff] [blame]	2398	!pt->pf.discard_passdown;
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	2399	DMEMIT("%u ", count);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2400
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	2401	if (!pool->pf.zero_new_blocks)
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2402	DMEMIT("skip_block_zeroing ");
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	2403
				2404	if (!pool->pf.discard_enabled)
				2405	DMEMIT("ignore_discard ");
				2406
Mike Snitzer	f402693	2012-05-19 01:01:01 +0100	[diff] [blame]	2407	if (!pt->pf.discard_passdown)
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	2408	DMEMIT("no_discard_passdown ");
				2409
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2410	break;
				2411	}
				2412
				2413	return 0;
				2414	}
				2415
				2416	static int pool_iterate_devices(struct dm_target *ti,
				2417	iterate_devices_callout_fn fn, void *data)
				2418	{
				2419	struct pool_c *pt = ti->private;
				2420
				2421	return fn(ti, pt->data_dev, 0, ti->len, data);
				2422	}
				2423
				2424	static int pool_merge(struct dm_target ti, struct bvec_merge_data bvm,
				2425	struct bio_vec *biovec, int max_size)
				2426	{
				2427	struct pool_c *pt = ti->private;
				2428	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
				2429
				2430	if (!q->merge_bvec_fn)
				2431	return max_size;
				2432
				2433	bvm->bi_bdev = pt->data_dev->bdev;
				2434
				2435	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
				2436	}
				2437
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	2438	static void set_discard_limits(struct pool pool, struct queue_limits limits)
				2439	{
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	2440	/*
				2441	* FIXME: these limits may be incompatible with the pool's data device
				2442	*/
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	2443	limits->max_discard_sectors = pool->sectors_per_block;
				2444
				2445	/*
				2446	* This is just a hint, and not enforced. We have to cope with
				2447	* bios that overlap 2 blocks.
				2448	*/
				2449	limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	2450	limits->discard_zeroes_data = pool->pf.zero_new_blocks;
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	2451	}
				2452
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2453	static void pool_io_hints(struct dm_target ti, struct queue_limits limits)
				2454	{
				2455	struct pool_c *pt = ti->private;
				2456	struct pool *pool = pt->pool;
				2457
				2458	blk_limits_io_min(limits, 0);
				2459	blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	2460	if (pool->pf.discard_enabled)
				2461	set_discard_limits(pool, limits);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2462	}
				2463
				2464	static struct target_type pool_target = {
				2465	.name = "thin-pool",
				2466	.features = DM_TARGET_SINGLETON \| DM_TARGET_ALWAYS_WRITEABLE \|
				2467	DM_TARGET_IMMUTABLE,
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	2468	.version = {1, 1, 0},
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2469	.module = THIS_MODULE,
				2470	.ctr = pool_ctr,
				2471	.dtr = pool_dtr,
				2472	.map = pool_map,
				2473	.postsuspend = pool_postsuspend,
				2474	.preresume = pool_preresume,
				2475	.resume = pool_resume,
				2476	.message = pool_message,
				2477	.status = pool_status,
				2478	.merge = pool_merge,
				2479	.iterate_devices = pool_iterate_devices,
				2480	.io_hints = pool_io_hints,
				2481	};
				2482
/*----------------------------------------------------------------
 * Thin target methods
 *--------------------------------------------------------------*/
				2486	static void thin_dtr(struct dm_target *ti)
				2487	{
				2488	struct thin_c *tc = ti->private;
				2489
				2490	mutex_lock(&dm_thin_pool_table.mutex);
				2491
				2492	__pool_dec(tc->pool);
				2493	dm_pool_close_thin_device(tc->td);
				2494	dm_put_device(ti, tc->pool_dev);
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	2495	if (tc->origin_dev)
				2496	dm_put_device(ti, tc->origin_dev);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2497	kfree(tc);
				2498
				2499	mutex_unlock(&dm_thin_pool_table.mutex);
				2500	}
				2501
				2502	/*
				2503	* Thin target parameters:
				2504	*
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	2505	* <pool_dev> <dev_id> [origin_dev]
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2506	*
				2507	* pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
				2508	* dev_id: the internal device identifier
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	2509	* origin_dev: a device external to the pool that should act as the origin
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	2510	*
				2511	* If the pool device has discards disabled, they get disabled for the thin
				2512	* device as well.
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2513	*/
				2514	static int thin_ctr(struct dm_target ti, unsigned argc, char *argv)
				2515	{
				2516	int r;
				2517	struct thin_c *tc;
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	2518	struct dm_dev pool_dev, origin_dev;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2519	struct mapped_device *pool_md;
				2520
				2521	mutex_lock(&dm_thin_pool_table.mutex);
				2522
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	2523	if (argc != 2 && argc != 3) {
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2524	ti->error = "Invalid argument count";
				2525	r = -EINVAL;
				2526	goto out_unlock;
				2527	}
				2528
				2529	tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
				2530	if (!tc) {
				2531	ti->error = "Out of memory";
				2532	r = -ENOMEM;
				2533	goto out_unlock;
				2534	}
				2535
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	2536	if (argc == 3) {
				2537	r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
				2538	if (r) {
				2539	ti->error = "Error opening origin device";
				2540	goto bad_origin_dev;
				2541	}
				2542	tc->origin_dev = origin_dev;
				2543	}
				2544
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2545	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
				2546	if (r) {
				2547	ti->error = "Error opening pool device";
				2548	goto bad_pool_dev;
				2549	}
				2550	tc->pool_dev = pool_dev;
				2551
				2552	if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
				2553	ti->error = "Invalid device id";
				2554	r = -EINVAL;
				2555	goto bad_common;
				2556	}
				2557
				2558	pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
				2559	if (!pool_md) {
				2560	ti->error = "Couldn't get pool mapped device";
				2561	r = -EINVAL;
				2562	goto bad_common;
				2563	}
				2564
				2565	tc->pool = __pool_table_lookup(pool_md);
				2566	if (!tc->pool) {
				2567	ti->error = "Couldn't find pool object";
				2568	r = -EINVAL;
				2569	goto bad_pool_lookup;
				2570	}
				2571	__pool_inc(tc->pool);
				2572
				2573	r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
				2574	if (r) {
				2575	ti->error = "Couldn't open thin internal device";
				2576	goto bad_thin_open;
				2577	}
				2578
				2579	ti->split_io = tc->pool->sectors_per_block;
				2580	ti->num_flush_requests = 1;
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	2581
				2582	/* In case the pool supports discards, pass them on. */
				2583	if (tc->pool->pf.discard_enabled) {
				2584	ti->discards_supported = 1;
				2585	ti->num_discard_requests = 1;
				2586	}
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2587
				2588	dm_put(pool_md);
				2589
				2590	mutex_unlock(&dm_thin_pool_table.mutex);
				2591
				2592	return 0;
				2593
				2594	bad_thin_open:
				2595	__pool_dec(tc->pool);
				2596	bad_pool_lookup:
				2597	dm_put(pool_md);
				2598	bad_common:
				2599	dm_put_device(ti, tc->pool_dev);
				2600	bad_pool_dev:
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	2601	if (tc->origin_dev)
				2602	dm_put_device(ti, tc->origin_dev);
				2603	bad_origin_dev:
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2604	kfree(tc);
				2605	out_unlock:
				2606	mutex_unlock(&dm_thin_pool_table.mutex);
				2607
				2608	return r;
				2609	}
				2610
				2611	static int thin_map(struct dm_target ti, struct bio bio,
				2612	union map_info *map_context)
				2613	{
Alasdair G Kergon	6efd6e8	2012-03-28 18:41:28 +0100	[diff] [blame]	2614	bio->bi_sector = dm_target_offset(ti, bio->bi_sector);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2615
				2616	return thin_bio_map(ti, bio, map_context);
				2617	}
				2618
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	2619	static int thin_endio(struct dm_target *ti,
				2620	struct bio *bio, int err,
				2621	union map_info *map_context)
				2622	{
				2623	unsigned long flags;
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	2624	struct dm_thin_endio_hook *h = map_context->ptr;
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	2625	struct list_head work;
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	2626	struct dm_thin_new_mapping m, tmp;
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	2627	struct pool *pool = h->tc->pool;
				2628
				2629	if (h->shared_read_entry) {
				2630	INIT_LIST_HEAD(&work);
				2631	ds_dec(h->shared_read_entry, &work);
				2632
				2633	spin_lock_irqsave(&pool->lock, flags);
				2634	list_for_each_entry_safe(m, tmp, &work, list) {
				2635	list_del(&m->list);
				2636	m->quiesced = 1;
				2637	__maybe_add_mapping(m);
				2638	}
				2639	spin_unlock_irqrestore(&pool->lock, flags);
				2640	}
				2641
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	2642	if (h->all_io_entry) {
				2643	INIT_LIST_HEAD(&work);
				2644	ds_dec(h->all_io_entry, &work);
Mike Snitzer	c3a0ce2	2012-05-12 01:43:16 +0100	[diff] [blame]	2645	spin_lock_irqsave(&pool->lock, flags);
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	2646	list_for_each_entry_safe(m, tmp, &work, list)
				2647	list_add(&m->list, &pool->prepared_discards);
Mike Snitzer	c3a0ce2	2012-05-12 01:43:16 +0100	[diff] [blame]	2648	spin_unlock_irqrestore(&pool->lock, flags);
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	2649	}
				2650
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	2651	mempool_free(h, pool->endio_hook_pool);
				2652
				2653	return 0;
				2654	}
				2655
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2656	static void thin_postsuspend(struct dm_target *ti)
				2657	{
				2658	if (dm_noflush_suspending(ti))
				2659	requeue_io((struct thin_c *)ti->private);
				2660	}
				2661
				2662	/*
				2663	* <nr mapped sectors> <highest mapped sector>
				2664	*/
				2665	static int thin_status(struct dm_target *ti, status_type_t type,
				2666	char *result, unsigned maxlen)
				2667	{
				2668	int r;
				2669	ssize_t sz = 0;
				2670	dm_block_t mapped, highest;
				2671	char buf[BDEVNAME_SIZE];
				2672	struct thin_c *tc = ti->private;
				2673
				2674	if (!tc->td)
				2675	DMEMIT("-");
				2676	else {
				2677	switch (type) {
				2678	case STATUSTYPE_INFO:
				2679	r = dm_thin_get_mapped_count(tc->td, &mapped);
				2680	if (r)
				2681	return r;
				2682
				2683	r = dm_thin_get_highest_mapped_block(tc->td, &highest);
				2684	if (r < 0)
				2685	return r;
				2686
				2687	DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
				2688	if (r)
				2689	DMEMIT("%llu", ((highest + 1) *
				2690	tc->pool->sectors_per_block) - 1);
				2691	else
				2692	DMEMIT("-");
				2693	break;
				2694
				2695	case STATUSTYPE_TABLE:
				2696	DMEMIT("%s %lu",
				2697	format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
				2698	(unsigned long) tc->dev_id);
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	2699	if (tc->origin_dev)
				2700	DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2701	break;
				2702	}
				2703	}
				2704
				2705	return 0;
				2706	}
				2707
				2708	static int thin_iterate_devices(struct dm_target *ti,
				2709	iterate_devices_callout_fn fn, void *data)
				2710	{
				2711	dm_block_t blocks;
				2712	struct thin_c *tc = ti->private;
				2713
				2714	/*
				2715	* We can't call dm_pool_get_data_dev_size() since that blocks. So
				2716	* we follow a more convoluted path through to the pool's target.
				2717	*/
				2718	if (!tc->pool->ti)
				2719	return 0; /* nothing is bound */
				2720
				2721	blocks = tc->pool->ti->len >> tc->pool->block_shift;
				2722	if (blocks)
				2723	return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block * blocks, data);
				2724
				2725	return 0;
				2726	}
				2727
				2728	static void thin_io_hints(struct dm_target ti, struct queue_limits limits)
				2729	{
				2730	struct thin_c *tc = ti->private;
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	2731	struct pool *pool = tc->pool;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2732
				2733	blk_limits_io_min(limits, 0);
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	2734	blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
				2735	set_discard_limits(pool, limits);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2736	}
				2737
				2738	static struct target_type thin_target = {
				2739	.name = "thin",
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	2740	.version = {1, 1, 0},
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2741	.module = THIS_MODULE,
				2742	.ctr = thin_ctr,
				2743	.dtr = thin_dtr,
				2744	.map = thin_map,
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	2745	.end_io = thin_endio,
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2746	.postsuspend = thin_postsuspend,
				2747	.status = thin_status,
				2748	.iterate_devices = thin_iterate_devices,
				2749	.io_hints = thin_io_hints,
				2750	};
				2751
/*----------------------------------------------------------------*/
				2753
				2754	static int __init dm_thin_init(void)
				2755	{
				2756	int r;
				2757
				2758	pool_table_init();
				2759
				2760	r = dm_register_target(&thin_target);
				2761	if (r)
				2762	return r;
				2763
				2764	r = dm_register_target(&pool_target);
				2765	if (r)
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	2766	goto bad_pool_target;
				2767
				2768	r = -ENOMEM;
				2769
				2770	_cell_cache = KMEM_CACHE(dm_bio_prison_cell, 0);
				2771	if (!_cell_cache)
				2772	goto bad_cell_cache;
				2773
				2774	_new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
				2775	if (!_new_mapping_cache)
				2776	goto bad_new_mapping_cache;
				2777
				2778	_endio_hook_cache = KMEM_CACHE(dm_thin_endio_hook, 0);
				2779	if (!_endio_hook_cache)
				2780	goto bad_endio_hook_cache;
				2781
				2782	return 0;
				2783
				2784	bad_endio_hook_cache:
				2785	kmem_cache_destroy(_new_mapping_cache);
				2786	bad_new_mapping_cache:
				2787	kmem_cache_destroy(_cell_cache);
				2788	bad_cell_cache:
				2789	dm_unregister_target(&pool_target);
				2790	bad_pool_target:
				2791	dm_unregister_target(&thin_target);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2792
				2793	return r;
				2794	}
				2795
				2796	static void dm_thin_exit(void)
				2797	{
				2798	dm_unregister_target(&thin_target);
				2799	dm_unregister_target(&pool_target);
Mike Snitzer	a24c256	2012-06-03 00:30:00 +0100	[diff] [blame^]	2800
				2801	kmem_cache_destroy(_cell_cache);
				2802	kmem_cache_destroy(_new_mapping_cache);
				2803	kmem_cache_destroy(_endio_hook_cache);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2804	}
				2805
				2806	module_init(dm_thin_init);
				2807	module_exit(dm_thin_exit);
				2808
Alasdair G Kergon	7cab8bf	2012-05-12 01:43:19 +0100	[diff] [blame]	2809	MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2810	MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
				2811	MODULE_LICENSE("GPL");