/*
 * Copyright (C) 2011-2012 Red Hat UK.
 *
 * This file is released under the GPL.
 */

#include "dm-thin-metadata.h"
#include "dm.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>

#define DM_MSG_PREFIX	"thin"

/*
 * Tunable constants
 */
#define ENDIO_HOOK_POOL_SIZE 1024
#define DEFERRED_SET_SIZE 64
#define MAPPING_POOL_SIZE 1024
#define PRISON_CELLS 1024
#define COMMIT_PERIOD HZ

/*
 * The block size of the device holding pool data must be
 * between 64KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

/*
 * Device id is restricted to 24 bits.
 */
#define MAX_DEV_ID ((1 << 24) - 1)

/*
 * How do we handle breaking sharing of data blocks?
 * =================================================
 *
 * We use a standard copy-on-write btree to store the mappings for the
 * devices (note I'm talking about copy-on-write of the metadata here, not
 * the data). When you take an internal snapshot you clone the root node
 * of the origin btree. After this there is no concept of an origin or a
 * snapshot. They are just two device trees that happen to point to the
 * same data blocks.
 *
 * When we get a write in we decide if it's to a shared data block using
 * some timestamp magic. If it is, we have to break sharing.
 *
 * Let's say we write to a shared block in what was the origin. The
 * steps are:
 *
 * i) plug further io to this physical block (see bio_prison code).
 *
 * ii) quiesce any read io to that shared data block. Obviously
 * including all devices that share this block. (see dm_deferred_set code)
 *
 * iii) copy the data block to a newly allocated block. This step can be
 * skipped if the io covers the whole block (schedule_copy).
 *
 * iv) insert the new mapping into the origin's btree
 * (process_prepared_mapping). This act of inserting breaks some
 * sharing of btree nodes between the two devices. Breaking sharing only
 * affects the btree of that specific device. Btrees for the other
 * devices that share the block never change. The btree for the origin
 * device as it was after the last commit is untouched, i.e. we're using
 * persistent data structures in the functional programming sense.
 *
 * v) unplug io to this physical block, including the io that triggered
 * the breaking of sharing.
 *
 * Steps (ii) and (iii) occur in parallel.
 *
 * The metadata _doesn't_ need to be committed before the io continues. We
 * get away with this because the io is always written to a _new_ block.
 * If there's a crash, then:
 *
 * - The origin mapping will point to the old origin block (the shared
 * one). This will contain the data as it was before the io that triggered
 * the breaking of sharing came in.
 *
 * - The snap mapping still points to the old block, as it would after
 * the commit.
 *
 * The downside of this scheme is that the timestamp magic isn't perfect, and
 * will continue to think that the data block in the snapshot device is shared
 * even after the write to the origin has broken sharing. I suspect data
 * blocks will typically be shared by many different devices, so we're
 * breaking sharing n + 1 times, rather than n, where n is the number of
 * devices that reference this data block. At the moment I think the
 * benefits far, far outweigh the disadvantages.
 */

/*----------------------------------------------------------------*/

/*
 * Sometimes we can't deal with a bio straight away, so we put it in
 * prison where it can't cause any mischief. Bios are put in a cell
 * identified by a key; multiple bios can be in the same cell. When the
 * cell is subsequently unlocked the bios become available.
 */
struct dm_bio_prison;

struct dm_cell_key {
	int virtual;
	dm_thin_id dev;
	dm_block_t block;
};

struct dm_bio_prison_cell {
	struct hlist_node list;
	struct dm_bio_prison *prison;
	struct dm_cell_key key;
	struct bio *holder;
	struct bio_list bios;
};

struct dm_bio_prison {
	spinlock_t lock;
	mempool_t *cell_pool;

	unsigned nr_buckets;
	unsigned hash_mask;
	struct hlist_head *cells;
};

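/*
 * Choose a power-of-two number of hash buckets for the prison: roughly a
 * quarter of the expected number of concurrently held cells, rounded up
 * to a power of two and clamped to the range [128, 8192].
 */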
static uint32_t calc_nr_buckets(unsigned nr_cells)
{
	uint32_t n = 128;

	nr_cells /= 4;
	nr_cells = min(nr_cells, 8192u);

	while (n < nr_cells)
		n <<= 1;

	return n;
}

static struct kmem_cache *_cell_cache;

/*
 * @nr_cells should be the number of cells you want in use _concurrently_.
 * Don't confuse it with the number of distinct keys.
 */
static struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells)
{
	unsigned i;
	uint32_t nr_buckets = calc_nr_buckets(nr_cells);
	size_t len = sizeof(struct dm_bio_prison) +
		(sizeof(struct hlist_head) * nr_buckets);
	struct dm_bio_prison *prison = kmalloc(len, GFP_KERNEL);

	if (!prison)
		return NULL;

	spin_lock_init(&prison->lock);
	prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache);
	if (!prison->cell_pool) {
		kfree(prison);
		return NULL;
	}

	prison->nr_buckets = nr_buckets;
	prison->hash_mask = nr_buckets - 1;
	prison->cells = (struct hlist_head *) (prison + 1);
	for (i = 0; i < nr_buckets; i++)
		INIT_HLIST_HEAD(prison->cells + i);

	return prison;
}

static void dm_bio_prison_destroy(struct dm_bio_prison *prison)
{
	mempool_destroy(prison->cell_pool);
	kfree(prison);
}

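/*
 * Hash a key into a bucket index by multiplying the block number by a
 * large prime and masking with the (power-of-two) bucket count.
 */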
static uint32_t hash_key(struct dm_bio_prison *prison, struct dm_cell_key *key)
{
	const unsigned long BIG_PRIME = 4294967291UL;
	uint64_t hash = key->block * BIG_PRIME;

	return (uint32_t) (hash & prison->hash_mask);
}

static int keys_equal(struct dm_cell_key *lhs, struct dm_cell_key *rhs)
{
	return (lhs->virtual == rhs->virtual) &&
		(lhs->dev == rhs->dev) &&
		(lhs->block == rhs->block);
}

static struct dm_bio_prison_cell *__search_bucket(struct hlist_head *bucket,
						  struct dm_cell_key *key)
{
	struct dm_bio_prison_cell *cell;
	struct hlist_node *tmp;

	hlist_for_each_entry(cell, tmp, bucket, list)
		if (keys_equal(&cell->key, key))
			return cell;

	return NULL;
}

/*
 * This may block if a new cell needs allocating. You must ensure that
 * cells will be unlocked even if the calling thread is blocked.
 *
 * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
 */
static int dm_bio_detain(struct dm_bio_prison *prison, struct dm_cell_key *key,
			 struct bio *inmate, struct dm_bio_prison_cell **ref)
{
	int r = 1;
	unsigned long flags;
	uint32_t hash = hash_key(prison, key);
	struct dm_bio_prison_cell *cell, *cell2;

	BUG_ON(hash > prison->nr_buckets);

	spin_lock_irqsave(&prison->lock, flags);

	cell = __search_bucket(prison->cells + hash, key);
	if (cell) {
		bio_list_add(&cell->bios, inmate);
		goto out;
	}

	/*
	 * Allocate a new cell
	 */
	spin_unlock_irqrestore(&prison->lock, flags);
	cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
	spin_lock_irqsave(&prison->lock, flags);

	/*
	 * We've been unlocked, so we have to double check that
	 * nobody else has inserted this cell in the meantime.
	 */
	cell = __search_bucket(prison->cells + hash, key);
	if (cell) {
		mempool_free(cell2, prison->cell_pool);
		bio_list_add(&cell->bios, inmate);
		goto out;
	}

	/*
	 * Use new cell.
	 */
	cell = cell2;

	cell->prison = prison;
	memcpy(&cell->key, key, sizeof(cell->key));
	cell->holder = inmate;
	bio_list_init(&cell->bios);
	hlist_add_head(&cell->list, prison->cells + hash);

	r = 0;

out:
	spin_unlock_irqrestore(&prison->lock, flags);

	*ref = cell;

	return r;
}

/*
 * @inmates must have been initialised prior to this call
 */
static void __cell_release(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
{
	struct dm_bio_prison *prison = cell->prison;

	hlist_del(&cell->list);

	if (inmates) {
		bio_list_add(inmates, cell->holder);
		bio_list_merge(inmates, &cell->bios);
	}

	mempool_free(cell, prison->cell_pool);
}

static void dm_cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios)
{
	unsigned long flags;
	struct dm_bio_prison *prison = cell->prison;

	spin_lock_irqsave(&prison->lock, flags);
	__cell_release(cell, bios);
	spin_unlock_irqrestore(&prison->lock, flags);
}

/*
 * There are a couple of places where we put a bio into a cell briefly
 * before taking it out again. In these situations we know that no other
 * bio may be in the cell. This function releases the cell, and also does
 * a sanity check.
 */
static void __cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio)
{
	BUG_ON(cell->holder != bio);
	BUG_ON(!bio_list_empty(&cell->bios));

	__cell_release(cell, NULL);
}

static void dm_cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio)
{
	unsigned long flags;
	struct dm_bio_prison *prison = cell->prison;

	spin_lock_irqsave(&prison->lock, flags);
	__cell_release_singleton(cell, bio);
	spin_unlock_irqrestore(&prison->lock, flags);
}

/*
 * Sometimes we don't want the holder, just the additional bios.
 */
static void __cell_release_no_holder(struct dm_bio_prison_cell *cell,
				     struct bio_list *inmates)
{
	struct dm_bio_prison *prison = cell->prison;

	hlist_del(&cell->list);
	bio_list_merge(inmates, &cell->bios);

	mempool_free(cell, prison->cell_pool);
}

static void dm_cell_release_no_holder(struct dm_bio_prison_cell *cell,
				      struct bio_list *inmates)
{
	unsigned long flags;
	struct dm_bio_prison *prison = cell->prison;

	spin_lock_irqsave(&prison->lock, flags);
	__cell_release_no_holder(cell, inmates);
	spin_unlock_irqrestore(&prison->lock, flags);
}

static void dm_cell_error(struct dm_bio_prison_cell *cell)
{
	struct dm_bio_prison *prison = cell->prison;
	struct bio_list bios;
	struct bio *bio;
	unsigned long flags;

	bio_list_init(&bios);

	spin_lock_irqsave(&prison->lock, flags);
	__cell_release(cell, &bios);
	spin_unlock_irqrestore(&prison->lock, flags);

	while ((bio = bio_list_pop(&bios)))
		bio_io_error(bio);
}

/*----------------------------------------------------------------*/

/*
 * We use the deferred set to keep track of pending reads to shared blocks.
 * We do this to ensure the new mapping caused by a write isn't performed
 * until these prior reads have completed. Otherwise the insertion of the
 * new mapping could free the old block that the read bios are mapped to.
 */

struct dm_deferred_set;
struct dm_deferred_entry {
	struct dm_deferred_set *ds;
	unsigned count;
	struct list_head work_items;
};

struct dm_deferred_set {
	spinlock_t lock;
	unsigned current_entry;
	unsigned sweeper;
	struct dm_deferred_entry entries[DEFERRED_SET_SIZE];
};

static struct dm_deferred_set *dm_deferred_set_create(void)
{
	int i;
	struct dm_deferred_set *ds;

	ds = kmalloc(sizeof(*ds), GFP_KERNEL);
	if (!ds)
		return NULL;

	spin_lock_init(&ds->lock);
	ds->current_entry = 0;
	ds->sweeper = 0;
	for (i = 0; i < DEFERRED_SET_SIZE; i++) {
		ds->entries[i].ds = ds;
		ds->entries[i].count = 0;
		INIT_LIST_HEAD(&ds->entries[i].work_items);
	}

	return ds;
}

static void dm_deferred_set_destroy(struct dm_deferred_set *ds)
{
	kfree(ds);
}

static struct dm_deferred_entry *dm_deferred_entry_inc(struct dm_deferred_set *ds)
{
	unsigned long flags;
	struct dm_deferred_entry *entry;

	spin_lock_irqsave(&ds->lock, flags);
	entry = ds->entries + ds->current_entry;
	entry->count++;
	spin_unlock_irqrestore(&ds->lock, flags);

	return entry;
}

static unsigned ds_next(unsigned index)
{
	return (index + 1) % DEFERRED_SET_SIZE;
}

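/*
 * Walk the entries from the sweeper towards the current entry, splicing
 * the work items of each fully drained entry onto @head. Must be called
 * with ds->lock held.
 */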
static void __sweep(struct dm_deferred_set *ds, struct list_head *head)
{
	while ((ds->sweeper != ds->current_entry) &&
	       !ds->entries[ds->sweeper].count) {
		list_splice_init(&ds->entries[ds->sweeper].work_items, head);
		ds->sweeper = ds_next(ds->sweeper);
	}

	if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count)
		list_splice_init(&ds->entries[ds->sweeper].work_items, head);
}

static void dm_deferred_entry_dec(struct dm_deferred_entry *entry, struct list_head *head)
{
	unsigned long flags;

	spin_lock_irqsave(&entry->ds->lock, flags);
	BUG_ON(!entry->count);
	--entry->count;
	__sweep(entry->ds, head);
	spin_unlock_irqrestore(&entry->ds->lock, flags);
}

/*
 * Returns 1 if the work was deferred, or 0 if there are no pending items
 * to delay the job.
 */
static int dm_deferred_set_add_work(struct dm_deferred_set *ds, struct list_head *work)
{
	int r = 1;
	unsigned long flags;
	unsigned next_entry;

	spin_lock_irqsave(&ds->lock, flags);
	if ((ds->sweeper == ds->current_entry) &&
	    !ds->entries[ds->current_entry].count)
		r = 0;
	else {
		list_add(work, &ds->entries[ds->current_entry].work_items);
		next_entry = ds_next(ds->current_entry);
		if (!ds->entries[next_entry].count)
			ds->current_entry = next_entry;
	}
	spin_unlock_irqrestore(&ds->lock, flags);

	return r;
}

static int __init dm_bio_prison_init(void)
{
	_cell_cache = KMEM_CACHE(dm_bio_prison_cell, 0);
	if (!_cell_cache)
		return -ENOMEM;

	return 0;
}

static void __exit dm_bio_prison_exit(void)
{
	kmem_cache_destroy(_cell_cache);
	_cell_cache = NULL;
}

/*----------------------------------------------------------------*/

/*
 * Key building.
 */
static void build_data_key(struct dm_thin_device *td,
			   dm_block_t b, struct dm_cell_key *key)
{
	key->virtual = 0;
	key->dev = dm_thin_dev_id(td);
	key->block = b;
}

static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
			      struct dm_cell_key *key)
{
	key->virtual = 1;
	key->dev = dm_thin_dev_id(td);
	key->block = b;
}

/*----------------------------------------------------------------*/

/*
 * A pool device ties together a metadata device and a data device. It
 * also provides the interface for creating and destroying internal
 * devices.
 */
struct dm_thin_new_mapping;

/*
 * The pool runs in three modes, ordered here from least to most degraded
 * so that the modes can be compared numerically.
 */
enum pool_mode {
	PM_WRITE,		/* metadata may be changed */
	PM_READ_ONLY,		/* metadata may not be changed */
	PM_FAIL,		/* all I/O fails */
};

struct pool_features {
	enum pool_mode mode;

	bool zero_new_blocks:1;
	bool discard_enabled:1;
	bool discard_passdown:1;
};

struct thin_c;
typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);

struct pool {
	struct list_head list;
	struct dm_target *ti;	/* Only set if a pool target is bound */

	struct mapped_device *pool_md;
	struct block_device *md_dev;
	struct dm_pool_metadata *pmd;

	dm_block_t low_water_blocks;
	uint32_t sectors_per_block;
	int sectors_per_block_shift;

	struct pool_features pf;
	unsigned low_water_triggered:1;	/* A dm event has been sent */
	unsigned no_free_space:1;	/* A -ENOSPC warning has been issued */

	struct dm_bio_prison *prison;
	struct dm_kcopyd_client *copier;

	struct workqueue_struct *wq;
	struct work_struct worker;
	struct delayed_work waker;

	unsigned long last_commit_jiffies;
	unsigned ref_count;

	spinlock_t lock;
	struct bio_list deferred_bios;
	struct bio_list deferred_flush_bios;
	struct list_head prepared_mappings;
	struct list_head prepared_discards;

	struct bio_list retry_on_resume_list;

	struct dm_deferred_set *shared_read_ds;
	struct dm_deferred_set *all_io_ds;

	struct dm_thin_new_mapping *next_mapping;
	mempool_t *mapping_pool;
	mempool_t *endio_hook_pool;

	process_bio_fn process_bio;
	process_bio_fn process_discard;

	process_mapping_fn process_prepared_mapping;
	process_mapping_fn process_prepared_discard;
};

static enum pool_mode get_pool_mode(struct pool *pool);
static void set_pool_mode(struct pool *pool, enum pool_mode mode);

/*
 * Target context for a pool.
 */
struct pool_c {
	struct dm_target *ti;
	struct pool *pool;
	struct dm_dev *data_dev;
	struct dm_dev *metadata_dev;
	struct dm_target_callbacks callbacks;

	dm_block_t low_water_blocks;
	struct pool_features requested_pf;	/* Features requested during table load */
	struct pool_features adjusted_pf;	/* Features used after adjusting for constituent devices */
};

/*
 * Target context for a thin.
 */
struct thin_c {
	struct dm_dev *pool_dev;
	struct dm_dev *origin_dev;
	dm_thin_id dev_id;

	struct pool *pool;
	struct dm_thin_device *td;
};

/*----------------------------------------------------------------*/

/*
 * A global list of pools that uses a struct mapped_device as a key.
 */
static struct dm_thin_pool_table {
	struct mutex mutex;
	struct list_head pools;
} dm_thin_pool_table;

static void pool_table_init(void)
{
	mutex_init(&dm_thin_pool_table.mutex);
	INIT_LIST_HEAD(&dm_thin_pool_table.pools);
}

static void __pool_table_insert(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	list_add(&pool->list, &dm_thin_pool_table.pools);
}

static void __pool_table_remove(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	list_del(&pool->list);
}

static struct pool *__pool_table_lookup(struct mapped_device *md)
{
	struct pool *pool = NULL, *tmp;

	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));

	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
		if (tmp->pool_md == md) {
			pool = tmp;
			break;
		}
	}

	return pool;
}

static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
{
	struct pool *pool = NULL, *tmp;

	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));

	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
		if (tmp->md_dev == md_dev) {
			pool = tmp;
			break;
		}
	}

	return pool;
}

/*----------------------------------------------------------------*/

struct dm_thin_endio_hook {
	struct thin_c *tc;
	struct dm_deferred_entry *shared_read_entry;
	struct dm_deferred_entry *all_io_entry;
	struct dm_thin_new_mapping *overwrite_mapping;
};

static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
{
	struct bio *bio;
	struct bio_list bios;

	bio_list_init(&bios);
	bio_list_merge(&bios, master);
	bio_list_init(master);

	while ((bio = bio_list_pop(&bios))) {
		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;

		if (h->tc == tc)
			bio_endio(bio, DM_ENDIO_REQUEUE);
		else
			bio_list_add(master, bio);
	}
}

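/*
 * Requeue all bios on the pool's deferred and retry lists that belong to
 * this thin device, completing them with DM_ENDIO_REQUEUE.
 */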
static void requeue_io(struct thin_c *tc)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	spin_lock_irqsave(&pool->lock, flags);
	__requeue_bio_list(tc, &pool->deferred_bios);
	__requeue_bio_list(tc, &pool->retry_on_resume_list);
	spin_unlock_irqrestore(&pool->lock, flags);
}

/*
 * This section of code contains the logic for processing a thin device's IO.
 * Much of the code depends on pool object resources (lists, workqueues, etc)
 * but most is exclusively called from the thin target rather than the thin-pool
 * target.
 */

static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
{
	sector_t block_nr = bio->bi_sector;

	if (tc->pool->sectors_per_block_shift < 0)
		(void) sector_div(block_nr, tc->pool->sectors_per_block);
	else
		block_nr >>= tc->pool->sectors_per_block_shift;

	return block_nr;
}

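/*
 * Remap a bio to the given data block on the pool's data device, coping
 * with both power-of-two and arbitrary block sizes.
 */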
static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
{
	struct pool *pool = tc->pool;
	sector_t bi_sector = bio->bi_sector;

	bio->bi_bdev = tc->pool_dev->bdev;
	if (tc->pool->sectors_per_block_shift < 0)
		bio->bi_sector = (block * pool->sectors_per_block) +
				 sector_div(bi_sector, pool->sectors_per_block);
	else
		bio->bi_sector = (block << pool->sectors_per_block_shift) |
				 (bi_sector & (pool->sectors_per_block - 1));
}

static void remap_to_origin(struct thin_c *tc, struct bio *bio)
{
	bio->bi_bdev = tc->origin_dev->bdev;
}

static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
{
	return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
		dm_thin_changed_this_transaction(tc->td);
}

static void issue(struct thin_c *tc, struct bio *bio)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	if (!bio_triggers_commit(tc, bio)) {
		generic_make_request(bio);
		return;
	}

	/*
	 * Complete bio with an error if earlier I/O caused changes to
	 * the metadata that can't be committed, e.g. due to I/O errors
	 * on the metadata device.
	 */
	if (dm_thin_aborted_changes(tc->td)) {
		bio_io_error(bio);
		return;
	}

	/*
	 * Batch together any bios that trigger commits and then issue a
	 * single commit for them in process_deferred_bios().
	 */
	spin_lock_irqsave(&pool->lock, flags);
	bio_list_add(&pool->deferred_flush_bios, bio);
	spin_unlock_irqrestore(&pool->lock, flags);
}

static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
{
	remap_to_origin(tc, bio);
	issue(tc, bio);
}

static void remap_and_issue(struct thin_c *tc, struct bio *bio,
			    dm_block_t block)
{
	remap(tc, bio, block);
	issue(tc, bio);
}

/*
 * wake_worker() is used when new work is queued and when pool_resume is
 * ready to continue deferred IO processing.
 */
static void wake_worker(struct pool *pool)
{
	queue_work(pool->wq, &pool->worker);
}

/*----------------------------------------------------------------*/

/*
 * Bio endio functions.
 */
struct dm_thin_new_mapping {
	struct list_head list;

	unsigned quiesced:1;
	unsigned prepared:1;
	unsigned pass_discard:1;

	struct thin_c *tc;
	dm_block_t virt_block;
	dm_block_t data_block;
	struct dm_bio_prison_cell *cell, *cell2;
	int err;

	/*
	 * If the bio covers the whole area of a block then we can avoid
	 * zeroing or copying. Instead this bio is hooked. The bio will
	 * still be in the cell, so care has to be taken to avoid issuing
	 * the bio twice.
	 */
	struct bio *bio;
	bio_end_io_t *saved_bi_end_io;
};

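/*
 * Called with pool->lock held. Once a mapping has both quiesced and been
 * prepared it is moved onto the pool's prepared_mappings list and the
 * worker is woken to process it.
 */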
static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
{
	struct pool *pool = m->tc->pool;

	if (m->quiesced && m->prepared) {
		list_add(&m->list, &pool->prepared_mappings);
		wake_worker(pool);
	}
}

static void copy_complete(int read_err, unsigned long write_err, void *context)
{
	unsigned long flags;
	struct dm_thin_new_mapping *m = context;
	struct pool *pool = m->tc->pool;

	m->err = read_err || write_err ? -EIO : 0;

	spin_lock_irqsave(&pool->lock, flags);
	m->prepared = 1;
	__maybe_add_mapping(m);
	spin_unlock_irqrestore(&pool->lock, flags);
}

static void overwrite_endio(struct bio *bio, int err)
{
	unsigned long flags;
	struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
	struct dm_thin_new_mapping *m = h->overwrite_mapping;
	struct pool *pool = m->tc->pool;

	m->err = err;

	spin_lock_irqsave(&pool->lock, flags);
	m->prepared = 1;
	__maybe_add_mapping(m);
	spin_unlock_irqrestore(&pool->lock, flags);
}

/*----------------------------------------------------------------*/

/*
 * Workqueue.
 */

/*
 * Prepared mapping jobs.
 */

/*
 * This sends the bios in the cell back to the deferred_bios list.
 */
static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell,
		       dm_block_t data_block)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	spin_lock_irqsave(&pool->lock, flags);
	dm_cell_release(cell, &pool->deferred_bios);
	spin_unlock_irqrestore(&tc->pool->lock, flags);

	wake_worker(pool);
}

/*
 * Same as cell_defer above, except it omits one particular detainee,
 * a write bio that covers the block and has already been processed.
 */
static void cell_defer_except(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
	struct bio_list bios;
	struct pool *pool = tc->pool;
	unsigned long flags;

	bio_list_init(&bios);

	spin_lock_irqsave(&pool->lock, flags);
	dm_cell_release_no_holder(cell, &pool->deferred_bios);
	spin_unlock_irqrestore(&pool->lock, flags);

	wake_worker(pool);
}

static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
{
	if (m->bio)
		m->bio->bi_end_io = m->saved_bi_end_io;
	dm_cell_error(m->cell);
	list_del(&m->list);
	mempool_free(m, m->tc->pool->mapping_pool);
}

static void process_prepared_mapping(struct dm_thin_new_mapping *m)
{
	struct thin_c *tc = m->tc;
	struct bio *bio;
	int r;

	bio = m->bio;
	if (bio)
		bio->bi_end_io = m->saved_bi_end_io;

	if (m->err) {
		dm_cell_error(m->cell);
		goto out;
	}

	/*
	 * Commit the prepared block into the mapping btree.
	 * Any I/O for this block arriving after this point will get
	 * remapped to it directly.
	 */
	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
	if (r) {
		DMERR("dm_thin_insert_block() failed");
		dm_cell_error(m->cell);
		goto out;
	}

	/*
	 * Release any bios held while the block was being provisioned.
	 * If we are processing a write bio that completely covers the block,
	 * we already processed it so can ignore it now when processing
	 * the bios in the cell.
	 */
	if (bio) {
		cell_defer_except(tc, m->cell);
		bio_endio(bio, 0);
	} else
		cell_defer(tc, m->cell, m->data_block);

out:
	list_del(&m->list);
	mempool_free(m, tc->pool->mapping_pool);
}

static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
{
	struct thin_c *tc = m->tc;

	bio_io_error(m->bio);
	cell_defer_except(tc, m->cell);
	cell_defer_except(tc, m->cell2);
	mempool_free(m, tc->pool->mapping_pool);
}

static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
{
	struct thin_c *tc = m->tc;

	if (m->pass_discard)
		remap_and_issue(tc, m->bio, m->data_block);
	else
		bio_endio(m->bio, 0);

	cell_defer_except(tc, m->cell);
	cell_defer_except(tc, m->cell2);
	mempool_free(m, tc->pool->mapping_pool);
}

static void process_prepared_discard(struct dm_thin_new_mapping *m)
{
	int r;
	struct thin_c *tc = m->tc;

	r = dm_thin_remove_block(tc->td, m->virt_block);
	if (r)
		DMERR("dm_thin_remove_block() failed");

	process_prepared_discard_passdown(m);
}

static void process_prepared(struct pool *pool, struct list_head *head,
			     process_mapping_fn *fn)
{
	unsigned long flags;
	struct list_head maps;
	struct dm_thin_new_mapping *m, *tmp;

	INIT_LIST_HEAD(&maps);
	spin_lock_irqsave(&pool->lock, flags);
	list_splice_init(head, &maps);
	spin_unlock_irqrestore(&pool->lock, flags);

	list_for_each_entry_safe(m, tmp, &maps, list)
		(*fn)(m);
}

/*
 * Deferred bio jobs.
 */
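/*
 * io_overlaps_block() checks whether a bio covers an entire block;
 * io_overwrites_block() additionally requires it to be a write, in which
 * case the block can simply be overwritten with the bio's data.
 */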
static int io_overlaps_block(struct pool *pool, struct bio *bio)
{
	return bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT);
}

static int io_overwrites_block(struct pool *pool, struct bio *bio)
{
	return (bio_data_dir(bio) == WRITE) &&
		io_overlaps_block(pool, bio);
}

static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
			       bio_end_io_t *fn)
{
	*save = bio->bi_end_io;
	bio->bi_end_io = fn;
}

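/*
 * Make sure pool->next_mapping is populated (allocating with GFP_ATOMIC
 * if necessary) so that a later get_next_mapping() cannot fail.
 */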
static int ensure_next_mapping(struct pool *pool)
{
	if (pool->next_mapping)
		return 0;

	pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);

	return pool->next_mapping ? 0 : -ENOMEM;
}

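/*
 * Consume the mapping previously set up by ensure_next_mapping().
 */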
static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
{
	struct dm_thin_new_mapping *r = pool->next_mapping;

	BUG_ON(!pool->next_mapping);

	pool->next_mapping = NULL;

	return r;
}

static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
			  struct dm_dev *origin, dm_block_t data_origin,
			  dm_block_t data_dest,
			  struct dm_bio_prison_cell *cell, struct bio *bio)
{
	int r;
	struct pool *pool = tc->pool;
	struct dm_thin_new_mapping *m = get_next_mapping(pool);

	INIT_LIST_HEAD(&m->list);
	m->quiesced = 0;
	m->prepared = 0;
	m->tc = tc;
	m->virt_block = virt_block;
	m->data_block = data_dest;
	m->cell = cell;
	m->err = 0;
	m->bio = NULL;

	if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
		m->quiesced = 1;

	/*
	 * IO to pool_dev remaps to the pool target's data_dev.
	 *
	 * If the whole block of data is being overwritten, we can issue the
	 * bio immediately. Otherwise we use kcopyd to clone the data first.
	 */
	if (io_overwrites_block(pool, bio)) {
		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;

		h->overwrite_mapping = m;
		m->bio = bio;
		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
		remap_and_issue(tc, bio, data_dest);
	} else {
		struct dm_io_region from, to;

		from.bdev = origin->bdev;
		from.sector = data_origin * pool->sectors_per_block;
		from.count = pool->sectors_per_block;

		to.bdev = tc->pool_dev->bdev;
		to.sector = data_dest * pool->sectors_per_block;
		to.count = pool->sectors_per_block;

		r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
				   0, copy_complete, m);
		if (r < 0) {
			mempool_free(m, pool->mapping_pool);
			DMERR("dm_kcopyd_copy() failed");
			dm_cell_error(cell);
		}
	}
}

static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
				   dm_block_t data_origin, dm_block_t data_dest,
				   struct dm_bio_prison_cell *cell, struct bio *bio)
{
	schedule_copy(tc, virt_block, tc->pool_dev,
		      data_origin, data_dest, cell, bio);
}

static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
				   dm_block_t data_dest,
				   struct dm_bio_prison_cell *cell, struct bio *bio)
{
	schedule_copy(tc, virt_block, tc->origin_dev,
		      virt_block, data_dest, cell, bio);
}

static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
			  dm_block_t data_block, struct dm_bio_prison_cell *cell,
			  struct bio *bio)
{
	struct pool *pool = tc->pool;
	struct dm_thin_new_mapping *m = get_next_mapping(pool);

	INIT_LIST_HEAD(&m->list);
	m->quiesced = 1;
	m->prepared = 0;
	m->tc = tc;
	m->virt_block = virt_block;
	m->data_block = data_block;
	m->cell = cell;
	m->err = 0;
	m->bio = NULL;

	/*
	 * If the whole block of data is being overwritten or we are not
	 * zeroing pre-existing data, we can issue the bio immediately.
	 * Otherwise we use kcopyd to zero the data first.
	 */
	if (!pool->pf.zero_new_blocks)
		process_prepared_mapping(m);

	else if (io_overwrites_block(pool, bio)) {
		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;

		h->overwrite_mapping = m;
		m->bio = bio;
		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
		remap_and_issue(tc, bio, data_block);
	} else {
		int r;
		struct dm_io_region to;

		to.bdev = tc->pool_dev->bdev;
		to.sector = data_block * pool->sectors_per_block;
		to.count = pool->sectors_per_block;

		r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
		if (r < 0) {
			mempool_free(m, pool->mapping_pool);
			DMERR("dm_kcopyd_zero() failed");
			dm_cell_error(cell);
		}
	}
}

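/*
 * Commit any outstanding metadata changes to the pool's metadata device.
 */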
static int commit(struct pool *pool)
{
	int r;

	r = dm_pool_commit_metadata(pool->pmd);
	if (r)
		DMERR("commit failed, error = %d", r);

	return r;
}

/*
 * A non-zero return indicates read_only or fail_io mode.
 * Many callers don't care about the return value.
 */
static int commit_or_fallback(struct pool *pool)
{
	int r;

	if (get_pool_mode(pool) != PM_WRITE)
		return -EINVAL;

	r = commit(pool);
	if (r)
		set_pool_mode(pool, PM_READ_ONLY);

	return r;
}

static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
{
	int r;
	dm_block_t free_blocks;
	unsigned long flags;
	struct pool *pool = tc->pool;

	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
	if (r)
		return r;

	if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
		DMWARN("%s: reached low water mark, sending event.",
		       dm_device_name(pool->pool_md));
		spin_lock_irqsave(&pool->lock, flags);
		pool->low_water_triggered = 1;
		spin_unlock_irqrestore(&pool->lock, flags);
		dm_table_event(pool->ti->table);
	}

	if (!free_blocks) {
		if (pool->no_free_space)
			return -ENOSPC;
		else {
			/*
			 * Try to commit to see if that will free up some
			 * more space.
			 */
			(void) commit_or_fallback(pool);

			r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
			if (r)
				return r;

			/*
			 * If we still have no space we set a flag to avoid
			 * doing all this checking and return -ENOSPC.
			 */
			if (!free_blocks) {
				DMWARN("%s: no free space available.",
				       dm_device_name(pool->pool_md));
				spin_lock_irqsave(&pool->lock, flags);
				pool->no_free_space = 1;
				spin_unlock_irqrestore(&pool->lock, flags);
				return -ENOSPC;
			}
		}
	}

	r = dm_pool_alloc_data_block(pool->pmd, result);
	if (r)
		return r;

	return 0;
}

/*
 * If we have run out of space, queue bios until the device is
 * resumed, presumably after having been reloaded with more space.
 */
static void retry_on_resume(struct bio *bio)
{
	struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
	struct thin_c *tc = h->tc;
	struct pool *pool = tc->pool;
	unsigned long flags;

	spin_lock_irqsave(&pool->lock, flags);
	bio_list_add(&pool->retry_on_resume_list, bio);
	spin_unlock_irqrestore(&pool->lock, flags);
}

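/*
 * The pool has run out of space: release every bio held in the cell onto
 * the retry_on_resume list so it can be retried once more space is added.
 */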
static void no_space(struct dm_bio_prison_cell *cell)
{
	struct bio *bio;
	struct bio_list bios;

	bio_list_init(&bios);
	dm_cell_release(cell, &bios);

	while ((bio = bio_list_pop(&bios)))
		retry_on_resume(bio);
}

static void process_discard(struct thin_c *tc, struct bio *bio)
{
	int r;
	unsigned long flags;
	struct pool *pool = tc->pool;
	struct dm_bio_prison_cell *cell, *cell2;
	struct dm_cell_key key, key2;
	dm_block_t block = get_bio_block(tc, bio);
	struct dm_thin_lookup_result lookup_result;
	struct dm_thin_new_mapping *m;

	build_virtual_key(tc->td, block, &key);
	if (dm_bio_detain(tc->pool->prison, &key, bio, &cell))
		return;

	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
	switch (r) {
	case 0:
		/*
		 * Check nobody is fiddling with this pool block. This can
		 * happen if someone's in the process of breaking sharing
		 * on this block.
		 */
		build_data_key(tc->td, lookup_result.block, &key2);
		if (dm_bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
			dm_cell_release_singleton(cell, bio);
			break;
		}

		if (io_overlaps_block(pool, bio)) {
			/*
			 * IO may still be going to the destination block. We must
			 * quiesce before we can do the removal.
			 */
			m = get_next_mapping(pool);
			m->tc = tc;
			m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown;
			m->virt_block = block;
			m->data_block = lookup_result.block;
			m->cell = cell;
			m->cell2 = cell2;
			m->err = 0;
			m->bio = bio;

			if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) {
				spin_lock_irqsave(&pool->lock, flags);
				list_add(&m->list, &pool->prepared_discards);
				spin_unlock_irqrestore(&pool->lock, flags);
				wake_worker(pool);
			}
		} else {
			/*
			 * The DM core makes sure that the discard doesn't span
			 * a block boundary. So we submit the discard of a
			 * partial block appropriately.
			 */
			dm_cell_release_singleton(cell, bio);
			dm_cell_release_singleton(cell2, bio);
			if ((!lookup_result.shared) && pool->pf.discard_passdown)
				remap_and_issue(tc, bio, lookup_result.block);
			else
				bio_endio(bio, 0);
		}
		break;

	case -ENODATA:
		/*
		 * It isn't provisioned, just forget it.
		 */
		dm_cell_release_singleton(cell, bio);
		bio_endio(bio, 0);
		break;

	default:
		DMERR("discard: find block unexpectedly returned %d", r);
		dm_cell_release_singleton(cell, bio);
		bio_io_error(bio);
		break;
	}
}

static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
			  struct dm_cell_key *key,
			  struct dm_thin_lookup_result *lookup_result,
			  struct dm_bio_prison_cell *cell)
{
	int r;
	dm_block_t data_block;

	r = alloc_data_block(tc, &data_block);
	switch (r) {
	case 0:
		schedule_internal_copy(tc, block, lookup_result->block,
				       data_block, cell, bio);
		break;

	case -ENOSPC:
		no_space(cell);
		break;

	default:
		DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
		dm_cell_error(cell);
		break;
	}
}

static void process_shared_bio(struct thin_c *tc, struct bio *bio,
			       dm_block_t block,
			       struct dm_thin_lookup_result *lookup_result)
{
	struct dm_bio_prison_cell *cell;
	struct pool *pool = tc->pool;
	struct dm_cell_key key;

	/*
	 * If cell is already occupied, then sharing is already in the process
	 * of being broken so we have nothing further to do here.
	 */
	build_data_key(tc->td, lookup_result->block, &key);
	if (dm_bio_detain(pool->prison, &key, bio, &cell))
		return;

	if (bio_data_dir(bio) == WRITE && bio->bi_size)
		break_sharing(tc, bio, block, &key, lookup_result, cell);
	else {
		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;

		h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);

		dm_cell_release_singleton(cell, bio);
		remap_and_issue(tc, bio, lookup_result->block);
	}
}

static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
			    struct dm_bio_prison_cell *cell)
{
	int r;
	dm_block_t data_block;

	/*
	 * Remap empty bios (flushes) immediately, without provisioning.
	 */
	if (!bio->bi_size) {
		dm_cell_release_singleton(cell, bio);
		remap_and_issue(tc, bio, 0);
		return;
	}

	/*
	 * Fill read bios with zeroes and complete them immediately.
	 */
	if (bio_data_dir(bio) == READ) {
		zero_fill_bio(bio);
		dm_cell_release_singleton(cell, bio);
		bio_endio(bio, 0);
		return;
	}

	r = alloc_data_block(tc, &data_block);
	switch (r) {
	case 0:
		if (tc->origin_dev)
			schedule_external_copy(tc, block, data_block, cell, bio);
		else
			schedule_zero(tc, block, data_block, cell, bio);
		break;

	case -ENOSPC:
		no_space(cell);
		break;

	default:
		DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
		set_pool_mode(tc->pool, PM_READ_ONLY);
		dm_cell_error(cell);
		break;
	}
}

static void process_bio(struct thin_c *tc, struct bio *bio)
{
	int r;
	dm_block_t block = get_bio_block(tc, bio);
	struct dm_bio_prison_cell *cell;
	struct dm_cell_key key;
	struct dm_thin_lookup_result lookup_result;

	/*
	 * If cell is already occupied, then the block is already
	 * being provisioned so we have nothing further to do here.
	 */
	build_virtual_key(tc->td, block, &key);
	if (dm_bio_detain(tc->pool->prison, &key, bio, &cell))
		return;

	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
	switch (r) {
	case 0:
		/*
		 * We can release this cell now. This thread is the only
		 * one that puts bios into a cell, and we know there were
		 * no preceding bios.
		 */
		/*
		 * TODO: this will probably have to change when discard goes
		 * back in.
		 */
		dm_cell_release_singleton(cell, bio);

		if (lookup_result.shared)
			process_shared_bio(tc, bio, block, &lookup_result);
		else
			remap_and_issue(tc, bio, lookup_result.block);
		break;

	case -ENODATA:
		if (bio_data_dir(bio) == READ && tc->origin_dev) {
			dm_cell_release_singleton(cell, bio);
			remap_to_origin_and_issue(tc, bio);
		} else
			provision_block(tc, bio, block, cell);
		break;

	default:
		DMERR("dm_thin_find_block() failed, error = %d", r);
		dm_cell_release_singleton(cell, bio);
		bio_io_error(bio);
		break;
	}
}

static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
{
	int r;
	int rw = bio_data_dir(bio);
	dm_block_t block = get_bio_block(tc, bio);
	struct dm_thin_lookup_result lookup_result;

	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
	switch (r) {
	case 0:
		if (lookup_result.shared && (rw == WRITE) && bio->bi_size)
			bio_io_error(bio);
		else
			remap_and_issue(tc, bio, lookup_result.block);
		break;

	case -ENODATA:
		if (rw != READ) {
			bio_io_error(bio);
			break;
1565 }
1566
1567 if (tc->origin_dev) {
1568 remap_to_origin_and_issue(tc, bio);
1569 break;
1570 }
1571
1572 zero_fill_bio(bio);
1573 bio_endio(bio, 0);
1574 break;
1575
1576 default:
1577 DMERR("dm_thin_find_block() failed, error = %d", r);
1578 bio_io_error(bio);
1579 break;
1580 }
1581}
1582
1583static void process_bio_fail(struct thin_c *tc, struct bio *bio)
1584{
1585 bio_io_error(bio);
1586}
1587
Joe Thornber905e51b2012-03-28 18:41:27 +01001588static int need_commit_due_to_time(struct pool *pool)
1589{
1590 return jiffies < pool->last_commit_jiffies ||
1591 jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
1592}
1593
Joe Thornber991d9fa2011-10-31 20:21:18 +00001594static void process_deferred_bios(struct pool *pool)
1595{
1596 unsigned long flags;
1597 struct bio *bio;
1598 struct bio_list bios;
Joe Thornber991d9fa2011-10-31 20:21:18 +00001599
1600 bio_list_init(&bios);
1601
1602 spin_lock_irqsave(&pool->lock, flags);
1603 bio_list_merge(&bios, &pool->deferred_bios);
1604 bio_list_init(&pool->deferred_bios);
1605 spin_unlock_irqrestore(&pool->lock, flags);
1606
1607 while ((bio = bio_list_pop(&bios))) {
Mike Snitzera24c2562012-06-03 00:30:00 +01001608 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
Joe Thornbereb2aa482012-03-28 18:41:28 +01001609 struct thin_c *tc = h->tc;
1610
Joe Thornber991d9fa2011-10-31 20:21:18 +00001611 /*
1612 * If we've got no free new_mapping structs, and processing
1613 * this bio might require one, we pause until there are some
1614 * prepared mappings to process.
1615 */
1616 if (ensure_next_mapping(pool)) {
1617 spin_lock_irqsave(&pool->lock, flags);
1618 bio_list_merge(&pool->deferred_bios, &bios);
1619 spin_unlock_irqrestore(&pool->lock, flags);
1620
1621 break;
1622 }
Joe Thornber104655f2012-03-28 18:41:28 +01001623
1624 if (bio->bi_rw & REQ_DISCARD)
Joe Thornbere49e5822012-07-27 15:08:16 +01001625 pool->process_discard(tc, bio);
Joe Thornber104655f2012-03-28 18:41:28 +01001626 else
Joe Thornbere49e5822012-07-27 15:08:16 +01001627 pool->process_bio(tc, bio);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001628 }
1629
1630 /*
1631 * If there are any deferred flush bios, we must commit
1632 * the metadata before issuing them.
1633 */
1634 bio_list_init(&bios);
1635 spin_lock_irqsave(&pool->lock, flags);
1636 bio_list_merge(&bios, &pool->deferred_flush_bios);
1637 bio_list_init(&pool->deferred_flush_bios);
1638 spin_unlock_irqrestore(&pool->lock, flags);
1639
Joe Thornber905e51b2012-03-28 18:41:27 +01001640 if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
Joe Thornber991d9fa2011-10-31 20:21:18 +00001641 return;
1642
Joe Thornbere49e5822012-07-27 15:08:16 +01001643 if (commit_or_fallback(pool)) {
Joe Thornber991d9fa2011-10-31 20:21:18 +00001644 while ((bio = bio_list_pop(&bios)))
1645 bio_io_error(bio);
1646 return;
1647 }
Joe Thornber905e51b2012-03-28 18:41:27 +01001648 pool->last_commit_jiffies = jiffies;
Joe Thornber991d9fa2011-10-31 20:21:18 +00001649
1650 while ((bio = bio_list_pop(&bios)))
1651 generic_make_request(bio);
1652}
1653
1654static void do_worker(struct work_struct *ws)
1655{
1656 struct pool *pool = container_of(ws, struct pool, worker);
1657
Joe Thornbere49e5822012-07-27 15:08:16 +01001658 process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
1659 process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001660 process_deferred_bios(pool);
1661}
1662
Joe Thornber905e51b2012-03-28 18:41:27 +01001663/*
1664 * We want to commit periodically so that not too much
1665 * unwritten data builds up.
1666 */
1667static void do_waker(struct work_struct *ws)
1668{
1669 struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
1670 wake_worker(pool);
1671 queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
1672}
1673
Joe Thornber991d9fa2011-10-31 20:21:18 +00001674/*----------------------------------------------------------------*/
1675
Joe Thornbere49e5822012-07-27 15:08:16 +01001676static enum pool_mode get_pool_mode(struct pool *pool)
1677{
1678 return pool->pf.mode;
1679}
1680
1681static void set_pool_mode(struct pool *pool, enum pool_mode mode)
1682{
1683 int r;
1684
1685 pool->pf.mode = mode;
1686
1687 switch (mode) {
1688 case PM_FAIL:
1689 DMERR("switching pool to failure mode");
1690 pool->process_bio = process_bio_fail;
1691 pool->process_discard = process_bio_fail;
1692 pool->process_prepared_mapping = process_prepared_mapping_fail;
1693 pool->process_prepared_discard = process_prepared_discard_fail;
1694 break;
1695
1696 case PM_READ_ONLY:
1697 DMERR("switching pool to read-only mode");
1698 r = dm_pool_abort_metadata(pool->pmd);
1699 if (r) {
1700 DMERR("aborting transaction failed");
1701 set_pool_mode(pool, PM_FAIL);
1702 } else {
1703 dm_pool_metadata_read_only(pool->pmd);
1704 pool->process_bio = process_bio_read_only;
1705 pool->process_discard = process_discard;
1706 pool->process_prepared_mapping = process_prepared_mapping_fail;
1707 pool->process_prepared_discard = process_prepared_discard_passdown;
1708 }
1709 break;
1710
1711 case PM_WRITE:
1712 pool->process_bio = process_bio;
1713 pool->process_discard = process_discard;
1714 pool->process_prepared_mapping = process_prepared_mapping;
1715 pool->process_prepared_discard = process_prepared_discard;
1716 break;
1717 }
1718}
1719
1720/*----------------------------------------------------------------*/
1721
Joe Thornber991d9fa2011-10-31 20:21:18 +00001722/*
1723 * Mapping functions.
1724 */
1725
1726/*
1727 * Called only while mapping a thin bio to hand it over to the workqueue.
1728 */
1729static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
1730{
1731 unsigned long flags;
1732 struct pool *pool = tc->pool;
1733
1734 spin_lock_irqsave(&pool->lock, flags);
1735 bio_list_add(&pool->deferred_bios, bio);
1736 spin_unlock_irqrestore(&pool->lock, flags);
1737
1738 wake_worker(pool);
1739}
1740
Mike Snitzera24c2562012-06-03 00:30:00 +01001741static struct dm_thin_endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio)
Joe Thornbereb2aa482012-03-28 18:41:28 +01001742{
1743 struct pool *pool = tc->pool;
Mike Snitzera24c2562012-06-03 00:30:00 +01001744 struct dm_thin_endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
Joe Thornbereb2aa482012-03-28 18:41:28 +01001745
1746 h->tc = tc;
1747 h->shared_read_entry = NULL;
Mike Snitzer44feb382012-10-12 21:02:10 +01001748 h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : dm_deferred_entry_inc(pool->all_io_ds);
Joe Thornbereb2aa482012-03-28 18:41:28 +01001749 h->overwrite_mapping = NULL;
1750
1751 return h;
1752}
1753
Joe Thornber991d9fa2011-10-31 20:21:18 +00001754/*
1755 * Non-blocking function called from the thin target's map function.
1756 */
1757static int thin_bio_map(struct dm_target *ti, struct bio *bio,
1758 union map_info *map_context)
1759{
1760 int r;
1761 struct thin_c *tc = ti->private;
1762 dm_block_t block = get_bio_block(tc, bio);
1763 struct dm_thin_device *td = tc->td;
1764 struct dm_thin_lookup_result result;
1765
Joe Thornbereb2aa482012-03-28 18:41:28 +01001766 map_context->ptr = thin_hook_bio(tc, bio);
Joe Thornbere49e5822012-07-27 15:08:16 +01001767
1768 if (get_pool_mode(tc->pool) == PM_FAIL) {
1769 bio_io_error(bio);
1770 return DM_MAPIO_SUBMITTED;
1771 }
1772
Joe Thornber104655f2012-03-28 18:41:28 +01001773 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
Joe Thornber991d9fa2011-10-31 20:21:18 +00001774 thin_defer_bio(tc, bio);
1775 return DM_MAPIO_SUBMITTED;
1776 }
1777
1778 r = dm_thin_find_block(td, block, 0, &result);
1779
1780 /*
1781 * Note that we defer readahead too.
1782 */
1783 switch (r) {
1784 case 0:
1785 if (unlikely(result.shared)) {
1786 /*
1787 * We have a race condition here between the
1788 * result.shared value returned by the lookup and
1789 * snapshot creation, which may cause new
1790 * sharing.
1791 *
1792 * To avoid this always quiesce the origin before
1793 * taking the snap. You want to do this anyway to
1794 * ensure a consistent application view
1795 * (i.e. lockfs).
1796 *
1797 * More distant ancestors are irrelevant. The
1798 * shared flag will be set in their case.
1799 */
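			/*
			 * Illustrative sequence only (the device names here are
			 * assumptions, not taken from this file): userspace
			 * typically quiesces the origin like this before taking
			 * the snapshot:
			 *
			 *    dmsetup suspend /dev/mapper/thin
			 *    dmsetup message /dev/mapper/pool 0 "create_snap 1 0"
			 *    dmsetup resume /dev/mapper/thin
			 */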
1800 thin_defer_bio(tc, bio);
1801 r = DM_MAPIO_SUBMITTED;
1802 } else {
1803 remap(tc, bio, result.block);
1804 r = DM_MAPIO_REMAPPED;
1805 }
1806 break;
1807
1808 case -ENODATA:
Joe Thornbere49e5822012-07-27 15:08:16 +01001809 if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
1810 /*
1811 * This block isn't provisioned, and we have no way
1812 * of doing so. Just error it.
1813 */
1814 bio_io_error(bio);
1815 r = DM_MAPIO_SUBMITTED;
1816 break;
1817 }
1818 /* fall through */
1819
1820 case -EWOULDBLOCK:
Joe Thornber991d9fa2011-10-31 20:21:18 +00001821 /*
1822 * In future, the failed dm_thin_find_block above could
1823 * provide the hint to load the metadata into cache.
1824 */
Joe Thornber991d9fa2011-10-31 20:21:18 +00001825 thin_defer_bio(tc, bio);
1826 r = DM_MAPIO_SUBMITTED;
1827 break;
Joe Thornbere49e5822012-07-27 15:08:16 +01001828
1829 default:
1830 /*
1831 * Must always call bio_io_error on failure.
1832 * dm_thin_find_block can fail with -EINVAL if the
1833 * pool is switched to fail-io mode.
1834 */
1835 bio_io_error(bio);
1836 r = DM_MAPIO_SUBMITTED;
1837 break;
Joe Thornber991d9fa2011-10-31 20:21:18 +00001838 }
1839
1840 return r;
1841}
1842
1843static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1844{
1845 int r;
1846 unsigned long flags;
1847 struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
1848
1849 spin_lock_irqsave(&pt->pool->lock, flags);
1850 r = !bio_list_empty(&pt->pool->retry_on_resume_list);
1851 spin_unlock_irqrestore(&pt->pool->lock, flags);
1852
1853 if (!r) {
1854 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1855 r = bdi_congested(&q->backing_dev_info, bdi_bits);
1856 }
1857
1858 return r;
1859}
1860
1861static void __requeue_bios(struct pool *pool)
1862{
1863 bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list);
1864 bio_list_init(&pool->retry_on_resume_list);
1865}
1866
1867/*----------------------------------------------------------------
1868 * Binding of control targets to a pool object
1869 *--------------------------------------------------------------*/
Mike Snitzer9bc142d2012-09-26 23:45:46 +01001870static bool data_dev_supports_discard(struct pool_c *pt)
1871{
1872 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1873
1874 return q && blk_queue_discard(q);
1875}
1876
1877/*
 1878 * If discard_passdown was enabled, verify that the data device
Mike Snitzer0424caa2012-09-26 23:45:47 +01001879 * supports discards; disable discard_passdown if not.
Mike Snitzer9bc142d2012-09-26 23:45:46 +01001880 */
Mike Snitzer0424caa2012-09-26 23:45:47 +01001881static void disable_passdown_if_not_supported(struct pool_c *pt)
Mike Snitzer9bc142d2012-09-26 23:45:46 +01001882{
Mike Snitzer0424caa2012-09-26 23:45:47 +01001883 struct pool *pool = pt->pool;
1884 struct block_device *data_bdev = pt->data_dev->bdev;
1885 struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
1886 sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
1887 const char *reason = NULL;
Mike Snitzer9bc142d2012-09-26 23:45:46 +01001888 char buf[BDEVNAME_SIZE];
1889
Mike Snitzer0424caa2012-09-26 23:45:47 +01001890 if (!pt->adjusted_pf.discard_passdown)
Mike Snitzer9bc142d2012-09-26 23:45:46 +01001891 return;
1892
Mike Snitzer0424caa2012-09-26 23:45:47 +01001893 if (!data_dev_supports_discard(pt))
1894 reason = "discard unsupported";
Mike Snitzer9bc142d2012-09-26 23:45:46 +01001895
Mike Snitzer0424caa2012-09-26 23:45:47 +01001896 else if (data_limits->max_discard_sectors < pool->sectors_per_block)
1897 reason = "max discard sectors smaller than a block";
1898
1899 else if (data_limits->discard_granularity > block_size)
1900 reason = "discard granularity larger than a block";
1901
1902 else if (block_size & (data_limits->discard_granularity - 1))
1903 reason = "discard granularity not a factor of block size";
1904
1905 if (reason) {
1906 DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
1907 pt->adjusted_pf.discard_passdown = false;
1908 }
Mike Snitzer9bc142d2012-09-26 23:45:46 +01001909}
1910
Joe Thornber991d9fa2011-10-31 20:21:18 +00001911static int bind_control_target(struct pool *pool, struct dm_target *ti)
1912{
1913 struct pool_c *pt = ti->private;
1914
Joe Thornbere49e5822012-07-27 15:08:16 +01001915 /*
1916 * We want to make sure that degraded pools are never upgraded.
1917 */
1918 enum pool_mode old_mode = pool->pf.mode;
Mike Snitzer0424caa2012-09-26 23:45:47 +01001919 enum pool_mode new_mode = pt->adjusted_pf.mode;
Joe Thornbere49e5822012-07-27 15:08:16 +01001920
1921 if (old_mode > new_mode)
1922 new_mode = old_mode;
1923
Joe Thornber991d9fa2011-10-31 20:21:18 +00001924 pool->ti = ti;
1925 pool->low_water_blocks = pt->low_water_blocks;
Mike Snitzer0424caa2012-09-26 23:45:47 +01001926 pool->pf = pt->adjusted_pf;
Joe Thornber991d9fa2011-10-31 20:21:18 +00001927
Mike Snitzer9bc142d2012-09-26 23:45:46 +01001928 set_pool_mode(pool, new_mode);
Mike Snitzerf4026932012-05-19 01:01:01 +01001929
Joe Thornber991d9fa2011-10-31 20:21:18 +00001930 return 0;
1931}
1932
1933static void unbind_control_target(struct pool *pool, struct dm_target *ti)
1934{
1935 if (pool->ti == ti)
1936 pool->ti = NULL;
1937}
1938
1939/*----------------------------------------------------------------
1940 * Pool creation
1941 *--------------------------------------------------------------*/
Joe Thornber67e2e2b2012-03-28 18:41:29 +01001942/* Initialize pool features. */
1943static void pool_features_init(struct pool_features *pf)
1944{
Joe Thornbere49e5822012-07-27 15:08:16 +01001945 pf->mode = PM_WRITE;
Mike Snitzer9bc142d2012-09-26 23:45:46 +01001946 pf->zero_new_blocks = true;
1947 pf->discard_enabled = true;
1948 pf->discard_passdown = true;
Joe Thornber67e2e2b2012-03-28 18:41:29 +01001949}
1950
Joe Thornber991d9fa2011-10-31 20:21:18 +00001951static void __pool_destroy(struct pool *pool)
1952{
1953 __pool_table_remove(pool);
1954
1955 if (dm_pool_metadata_close(pool->pmd) < 0)
1956 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1957
Mike Snitzer44feb382012-10-12 21:02:10 +01001958 dm_bio_prison_destroy(pool->prison);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001959 dm_kcopyd_client_destroy(pool->copier);
1960
1961 if (pool->wq)
1962 destroy_workqueue(pool->wq);
1963
1964 if (pool->next_mapping)
1965 mempool_free(pool->next_mapping, pool->mapping_pool);
1966 mempool_destroy(pool->mapping_pool);
1967 mempool_destroy(pool->endio_hook_pool);
Mike Snitzer44feb382012-10-12 21:02:10 +01001968 dm_deferred_set_destroy(pool->shared_read_ds);
1969 dm_deferred_set_destroy(pool->all_io_ds);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001970 kfree(pool);
1971}
1972
Mike Snitzera24c2562012-06-03 00:30:00 +01001973static struct kmem_cache *_new_mapping_cache;
1974static struct kmem_cache *_endio_hook_cache;
1975
Joe Thornber991d9fa2011-10-31 20:21:18 +00001976static struct pool *pool_create(struct mapped_device *pool_md,
1977 struct block_device *metadata_dev,
Joe Thornbere49e5822012-07-27 15:08:16 +01001978 unsigned long block_size,
1979 int read_only, char **error)
Joe Thornber991d9fa2011-10-31 20:21:18 +00001980{
1981 int r;
1982 void *err_p;
1983 struct pool *pool;
1984 struct dm_pool_metadata *pmd;
Joe Thornbere49e5822012-07-27 15:08:16 +01001985 bool format_device = read_only ? false : true;
Joe Thornber991d9fa2011-10-31 20:21:18 +00001986
Joe Thornbere49e5822012-07-27 15:08:16 +01001987 pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001988 if (IS_ERR(pmd)) {
1989 *error = "Error creating metadata object";
1990 return (struct pool *)pmd;
1991 }
1992
1993 pool = kmalloc(sizeof(*pool), GFP_KERNEL);
1994 if (!pool) {
1995 *error = "Error allocating memory for pool";
1996 err_p = ERR_PTR(-ENOMEM);
1997 goto bad_pool;
1998 }
1999
2000 pool->pmd = pmd;
2001 pool->sectors_per_block = block_size;
Mikulas Patockaf9a8e0c2012-07-27 15:08:03 +01002002 if (block_size & (block_size - 1))
2003 pool->sectors_per_block_shift = -1;
2004 else
2005 pool->sectors_per_block_shift = __ffs(block_size);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002006 pool->low_water_blocks = 0;
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002007 pool_features_init(&pool->pf);
Mike Snitzer44feb382012-10-12 21:02:10 +01002008 pool->prison = dm_bio_prison_create(PRISON_CELLS);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002009 if (!pool->prison) {
2010 *error = "Error creating pool's bio prison";
2011 err_p = ERR_PTR(-ENOMEM);
2012 goto bad_prison;
2013 }
2014
2015 pool->copier = dm_kcopyd_client_create();
2016 if (IS_ERR(pool->copier)) {
2017 r = PTR_ERR(pool->copier);
2018 *error = "Error creating pool's kcopyd client";
2019 err_p = ERR_PTR(r);
2020 goto bad_kcopyd_client;
2021 }
2022
2023 /*
2024 * Create singlethreaded workqueue that will service all devices
2025 * that use this metadata.
2026 */
2027 pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
2028 if (!pool->wq) {
2029 *error = "Error creating pool's workqueue";
2030 err_p = ERR_PTR(-ENOMEM);
2031 goto bad_wq;
2032 }
2033
2034 INIT_WORK(&pool->worker, do_worker);
Joe Thornber905e51b2012-03-28 18:41:27 +01002035 INIT_DELAYED_WORK(&pool->waker, do_waker);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002036 spin_lock_init(&pool->lock);
2037 bio_list_init(&pool->deferred_bios);
2038 bio_list_init(&pool->deferred_flush_bios);
2039 INIT_LIST_HEAD(&pool->prepared_mappings);
Joe Thornber104655f2012-03-28 18:41:28 +01002040 INIT_LIST_HEAD(&pool->prepared_discards);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002041 pool->low_water_triggered = 0;
2042 pool->no_free_space = 0;
2043 bio_list_init(&pool->retry_on_resume_list);
Mike Snitzer44feb382012-10-12 21:02:10 +01002044
2045 pool->shared_read_ds = dm_deferred_set_create();
2046 if (!pool->shared_read_ds) {
2047 *error = "Error creating pool's shared read deferred set";
2048 err_p = ERR_PTR(-ENOMEM);
2049 goto bad_shared_read_ds;
2050 }
2051
2052 pool->all_io_ds = dm_deferred_set_create();
2053 if (!pool->all_io_ds) {
2054 *error = "Error creating pool's all io deferred set";
2055 err_p = ERR_PTR(-ENOMEM);
2056 goto bad_all_io_ds;
2057 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00002058
2059 pool->next_mapping = NULL;
Mike Snitzera24c2562012-06-03 00:30:00 +01002060 pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE,
2061 _new_mapping_cache);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002062 if (!pool->mapping_pool) {
2063 *error = "Error creating pool's mapping mempool";
2064 err_p = ERR_PTR(-ENOMEM);
2065 goto bad_mapping_pool;
2066 }
2067
Mike Snitzera24c2562012-06-03 00:30:00 +01002068 pool->endio_hook_pool = mempool_create_slab_pool(ENDIO_HOOK_POOL_SIZE,
2069 _endio_hook_cache);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002070 if (!pool->endio_hook_pool) {
2071 *error = "Error creating pool's endio_hook mempool";
2072 err_p = ERR_PTR(-ENOMEM);
2073 goto bad_endio_hook_pool;
2074 }
2075 pool->ref_count = 1;
Joe Thornber905e51b2012-03-28 18:41:27 +01002076 pool->last_commit_jiffies = jiffies;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002077 pool->pool_md = pool_md;
2078 pool->md_dev = metadata_dev;
2079 __pool_table_insert(pool);
2080
2081 return pool;
2082
2083bad_endio_hook_pool:
2084 mempool_destroy(pool->mapping_pool);
2085bad_mapping_pool:
Mike Snitzer44feb382012-10-12 21:02:10 +01002086 dm_deferred_set_destroy(pool->all_io_ds);
2087bad_all_io_ds:
2088 dm_deferred_set_destroy(pool->shared_read_ds);
2089bad_shared_read_ds:
Joe Thornber991d9fa2011-10-31 20:21:18 +00002090 destroy_workqueue(pool->wq);
2091bad_wq:
2092 dm_kcopyd_client_destroy(pool->copier);
2093bad_kcopyd_client:
Mike Snitzer44feb382012-10-12 21:02:10 +01002094 dm_bio_prison_destroy(pool->prison);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002095bad_prison:
2096 kfree(pool);
2097bad_pool:
2098 if (dm_pool_metadata_close(pmd))
2099 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
2100
2101 return err_p;
2102}
2103
2104static void __pool_inc(struct pool *pool)
2105{
2106 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
2107 pool->ref_count++;
2108}
2109
2110static void __pool_dec(struct pool *pool)
2111{
2112 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
2113 BUG_ON(!pool->ref_count);
2114 if (!--pool->ref_count)
2115 __pool_destroy(pool);
2116}
2117
2118static struct pool *__pool_find(struct mapped_device *pool_md,
2119 struct block_device *metadata_dev,
Joe Thornbere49e5822012-07-27 15:08:16 +01002120 unsigned long block_size, int read_only,
2121 char **error, int *created)
Joe Thornber991d9fa2011-10-31 20:21:18 +00002122{
2123 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
2124
2125 if (pool) {
Mike Snitzerf09996c2012-07-27 15:07:59 +01002126 if (pool->pool_md != pool_md) {
2127 *error = "metadata device already in use by a pool";
Joe Thornber991d9fa2011-10-31 20:21:18 +00002128 return ERR_PTR(-EBUSY);
Mike Snitzerf09996c2012-07-27 15:07:59 +01002129 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00002130 __pool_inc(pool);
2131
2132 } else {
2133 pool = __pool_table_lookup(pool_md);
2134 if (pool) {
Mike Snitzerf09996c2012-07-27 15:07:59 +01002135 if (pool->md_dev != metadata_dev) {
2136 *error = "different pool cannot replace a pool";
Joe Thornber991d9fa2011-10-31 20:21:18 +00002137 return ERR_PTR(-EINVAL);
Mike Snitzerf09996c2012-07-27 15:07:59 +01002138 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00002139 __pool_inc(pool);
2140
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002141 } else {
Joe Thornbere49e5822012-07-27 15:08:16 +01002142 pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002143 *created = 1;
2144 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00002145 }
2146
2147 return pool;
2148}
2149
2150/*----------------------------------------------------------------
2151 * Pool target methods
2152 *--------------------------------------------------------------*/
2153static void pool_dtr(struct dm_target *ti)
2154{
2155 struct pool_c *pt = ti->private;
2156
2157 mutex_lock(&dm_thin_pool_table.mutex);
2158
2159 unbind_control_target(pt->pool, ti);
2160 __pool_dec(pt->pool);
2161 dm_put_device(ti, pt->metadata_dev);
2162 dm_put_device(ti, pt->data_dev);
2163 kfree(pt);
2164
2165 mutex_unlock(&dm_thin_pool_table.mutex);
2166}
2167
Joe Thornber991d9fa2011-10-31 20:21:18 +00002168static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
2169 struct dm_target *ti)
2170{
2171 int r;
2172 unsigned argc;
2173 const char *arg_name;
2174
2175 static struct dm_arg _args[] = {
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002176 {0, 4, "Invalid number of pool feature arguments"},
Joe Thornber991d9fa2011-10-31 20:21:18 +00002177 };
2178
2179 /*
2180 * No feature arguments supplied.
2181 */
2182 if (!as->argc)
2183 return 0;
2184
2185 r = dm_read_arg_group(_args, as, &argc, &ti->error);
2186 if (r)
2187 return -EINVAL;
2188
2189 while (argc && !r) {
2190 arg_name = dm_shift_arg(as);
2191 argc--;
2192
Joe Thornbere49e5822012-07-27 15:08:16 +01002193 if (!strcasecmp(arg_name, "skip_block_zeroing"))
Mike Snitzer9bc142d2012-09-26 23:45:46 +01002194 pf->zero_new_blocks = false;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002195
Joe Thornbere49e5822012-07-27 15:08:16 +01002196 else if (!strcasecmp(arg_name, "ignore_discard"))
Mike Snitzer9bc142d2012-09-26 23:45:46 +01002197 pf->discard_enabled = false;
Joe Thornbere49e5822012-07-27 15:08:16 +01002198
2199 else if (!strcasecmp(arg_name, "no_discard_passdown"))
Mike Snitzer9bc142d2012-09-26 23:45:46 +01002200 pf->discard_passdown = false;
Joe Thornbere49e5822012-07-27 15:08:16 +01002201
2202 else if (!strcasecmp(arg_name, "read_only"))
2203 pf->mode = PM_READ_ONLY;
2204
2205 else {
2206 ti->error = "Unrecognised pool feature requested";
2207 r = -EINVAL;
2208 break;
2209 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00002210 }
2211
2212 return r;
2213}
2214
2215/*
2216 * thin-pool <metadata dev> <data dev>
2217 * <data block size (sectors)>
2218 * <low water mark (blocks)>
2219 * [<#feature args> [<arg>]*]
2220 *
2221 * Optional feature arguments are:
2222 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002223 * ignore_discard: disable discard
2224 * no_discard_passdown: don't pass discards down to the data device
Joe Thornber991d9fa2011-10-31 20:21:18 +00002225 */
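/*
 * A sketch of a matching table line, for illustration only (the device
 * names and sizes below are assumptions, not taken from this file):
 *
 *    0 419430400 thin-pool /dev/mapper/meta /dev/mapper/data 2048 32768 1 skip_block_zeroing
 *
 * i.e. a 200GiB pool with 1MiB (2048-sector) data blocks, a low water mark
 * of 32768 blocks and block zeroing disabled, loaded with something like
 * "dmsetup create pool --table ...".
 */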
2226static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
2227{
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002228 int r, pool_created = 0;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002229 struct pool_c *pt;
2230 struct pool *pool;
2231 struct pool_features pf;
2232 struct dm_arg_set as;
2233 struct dm_dev *data_dev;
2234 unsigned long block_size;
2235 dm_block_t low_water_blocks;
2236 struct dm_dev *metadata_dev;
2237 sector_t metadata_dev_size;
Mike Snitzerc4a69ec2012-03-28 18:41:28 +01002238 char b[BDEVNAME_SIZE];
Joe Thornber991d9fa2011-10-31 20:21:18 +00002239
2240 /*
2241 * FIXME Remove validation from scope of lock.
2242 */
2243 mutex_lock(&dm_thin_pool_table.mutex);
2244
2245 if (argc < 4) {
2246 ti->error = "Invalid argument count";
2247 r = -EINVAL;
2248 goto out_unlock;
2249 }
2250 as.argc = argc;
2251 as.argv = argv;
2252
2253 r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev);
2254 if (r) {
2255 ti->error = "Error opening metadata block device";
2256 goto out_unlock;
2257 }
2258
2259 metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
Mike Snitzerc4a69ec2012-03-28 18:41:28 +01002260 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
2261 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
2262 bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002263
2264 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
2265 if (r) {
2266 ti->error = "Error getting data device";
2267 goto out_metadata;
2268 }
2269
2270 if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
2271 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
2272 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
Mike Snitzer55f2b8b2012-07-27 15:08:02 +01002273 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
Joe Thornber991d9fa2011-10-31 20:21:18 +00002274 ti->error = "Invalid block size";
2275 r = -EINVAL;
2276 goto out;
2277 }
2278
2279 if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
2280 ti->error = "Invalid low water mark";
2281 r = -EINVAL;
2282 goto out;
2283 }
2284
2285 /*
2286 * Set default pool features.
2287 */
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002288 pool_features_init(&pf);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002289
2290 dm_consume_args(&as, 4);
2291 r = parse_pool_features(&as, &pf, ti);
2292 if (r)
2293 goto out;
2294
2295 pt = kzalloc(sizeof(*pt), GFP_KERNEL);
2296 if (!pt) {
2297 r = -ENOMEM;
2298 goto out;
2299 }
2300
2301 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
Joe Thornbere49e5822012-07-27 15:08:16 +01002302 block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002303 if (IS_ERR(pool)) {
2304 r = PTR_ERR(pool);
2305 goto out_free_pt;
2306 }
2307
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002308 /*
2309 * 'pool_created' reflects whether this is the first table load.
2310 * Top level discard support is not allowed to be changed after
2311 * initial load. This would require a pool reload to trigger thin
2312 * device changes.
2313 */
2314 if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
2315 ti->error = "Discard support cannot be disabled once enabled";
2316 r = -EINVAL;
2317 goto out_flags_changed;
2318 }
2319
Joe Thornber991d9fa2011-10-31 20:21:18 +00002320 pt->pool = pool;
2321 pt->ti = ti;
2322 pt->metadata_dev = metadata_dev;
2323 pt->data_dev = data_dev;
2324 pt->low_water_blocks = low_water_blocks;
Mike Snitzer0424caa2012-09-26 23:45:47 +01002325 pt->adjusted_pf = pt->requested_pf = pf;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002326 ti->num_flush_requests = 1;
Mike Snitzer9bc142d2012-09-26 23:45:46 +01002327
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002328 /*
2329 * Only need to enable discards if the pool should pass
2330 * them down to the data device. The thin device's discard
2331 * processing will cause mappings to be removed from the btree.
2332 */
2333 if (pf.discard_enabled && pf.discard_passdown) {
2334 ti->num_discard_requests = 1;
Mike Snitzer9bc142d2012-09-26 23:45:46 +01002335
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002336 /*
2337 * Setting 'discards_supported' circumvents the normal
2338 * stacking of discard limits (this keeps the pool and
2339 * thin devices' discard limits consistent).
2340 */
Alasdair G Kergon0ac55482012-07-27 15:08:08 +01002341 ti->discards_supported = true;
Mike Snitzer307615a2012-09-26 23:45:39 +01002342 ti->discard_zeroes_data_unsupported = true;
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002343 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00002344 ti->private = pt;
2345
2346 pt->callbacks.congested_fn = pool_is_congested;
2347 dm_table_add_target_callbacks(ti->table, &pt->callbacks);
2348
2349 mutex_unlock(&dm_thin_pool_table.mutex);
2350
2351 return 0;
2352
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002353out_flags_changed:
2354 __pool_dec(pool);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002355out_free_pt:
2356 kfree(pt);
2357out:
2358 dm_put_device(ti, data_dev);
2359out_metadata:
2360 dm_put_device(ti, metadata_dev);
2361out_unlock:
2362 mutex_unlock(&dm_thin_pool_table.mutex);
2363
2364 return r;
2365}
2366
2367static int pool_map(struct dm_target *ti, struct bio *bio,
2368 union map_info *map_context)
2369{
2370 int r;
2371 struct pool_c *pt = ti->private;
2372 struct pool *pool = pt->pool;
2373 unsigned long flags;
2374
2375 /*
2376 * As this is a singleton target, ti->begin is always zero.
2377 */
2378 spin_lock_irqsave(&pool->lock, flags);
2379 bio->bi_bdev = pt->data_dev->bdev;
2380 r = DM_MAPIO_REMAPPED;
2381 spin_unlock_irqrestore(&pool->lock, flags);
2382
2383 return r;
2384}
2385
2386/*
2387 * Retrieves the number of blocks of the data device from
2388 * the superblock and compares it to the actual device size,
2389 * thus resizing the data device in case it has grown.
2390 *
2391 * This both copes with opening preallocated data devices in the ctr
2392 * being followed by a resume
2393 * -and-
2394 * calling the resume method individually after userspace has
2395 * grown the data device in reaction to a table event.
2396 */
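/*
 * Illustrative sequence only (the device and volume names are assumptions):
 * after growing the data device, userspace reloads the pool table with the
 * new length and resumes, at which point pool_preresume() resizes the pool:
 *
 *    lvextend -L +16G vg/pooldata
 *    dmsetup suspend pool
 *    dmsetup reload pool --table "0 <new data dev size in sectors> thin-pool ..."
 *    dmsetup resume pool
 */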
2397static int pool_preresume(struct dm_target *ti)
2398{
2399 int r;
2400 struct pool_c *pt = ti->private;
2401 struct pool *pool = pt->pool;
Mike Snitzer55f2b8b2012-07-27 15:08:02 +01002402 sector_t data_size = ti->len;
2403 dm_block_t sb_data_size;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002404
2405 /*
2406 * Take control of the pool object.
2407 */
2408 r = bind_control_target(pool, ti);
2409 if (r)
2410 return r;
2411
Mike Snitzer55f2b8b2012-07-27 15:08:02 +01002412 (void) sector_div(data_size, pool->sectors_per_block);
2413
Joe Thornber991d9fa2011-10-31 20:21:18 +00002414 r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
2415 if (r) {
2416 DMERR("failed to retrieve data device size");
2417 return r;
2418 }
2419
2420 if (data_size < sb_data_size) {
2421 DMERR("pool target too small, is %llu blocks (expected %llu)",
Mike Snitzer55f2b8b2012-07-27 15:08:02 +01002422 (unsigned long long)data_size, sb_data_size);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002423 return -EINVAL;
2424
2425 } else if (data_size > sb_data_size) {
2426 r = dm_pool_resize_data_dev(pool->pmd, data_size);
2427 if (r) {
2428 DMERR("failed to resize data device");
Joe Thornbere49e5822012-07-27 15:08:16 +01002429 /* FIXME Stricter than necessary: Rollback transaction instead here */
2430 set_pool_mode(pool, PM_READ_ONLY);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002431 return r;
2432 }
2433
Joe Thornbere49e5822012-07-27 15:08:16 +01002434 (void) commit_or_fallback(pool);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002435 }
2436
2437 return 0;
2438}
2439
2440static void pool_resume(struct dm_target *ti)
2441{
2442 struct pool_c *pt = ti->private;
2443 struct pool *pool = pt->pool;
2444 unsigned long flags;
2445
2446 spin_lock_irqsave(&pool->lock, flags);
2447 pool->low_water_triggered = 0;
2448 pool->no_free_space = 0;
2449 __requeue_bios(pool);
2450 spin_unlock_irqrestore(&pool->lock, flags);
2451
Joe Thornber905e51b2012-03-28 18:41:27 +01002452 do_waker(&pool->waker.work);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002453}
2454
2455static void pool_postsuspend(struct dm_target *ti)
2456{
Joe Thornber991d9fa2011-10-31 20:21:18 +00002457 struct pool_c *pt = ti->private;
2458 struct pool *pool = pt->pool;
2459
Joe Thornber905e51b2012-03-28 18:41:27 +01002460 cancel_delayed_work(&pool->waker);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002461 flush_workqueue(pool->wq);
Joe Thornbere49e5822012-07-27 15:08:16 +01002462 (void) commit_or_fallback(pool);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002463}
2464
2465static int check_arg_count(unsigned argc, unsigned args_required)
2466{
2467 if (argc != args_required) {
2468 DMWARN("Message received with %u arguments instead of %u.",
2469 argc, args_required);
2470 return -EINVAL;
2471 }
2472
2473 return 0;
2474}
2475
2476static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
2477{
2478 if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
2479 *dev_id <= MAX_DEV_ID)
2480 return 0;
2481
2482 if (warning)
2483 DMWARN("Message received with invalid device id: %s", arg);
2484
2485 return -EINVAL;
2486}
2487
2488static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
2489{
2490 dm_thin_id dev_id;
2491 int r;
2492
2493 r = check_arg_count(argc, 2);
2494 if (r)
2495 return r;
2496
2497 r = read_dev_id(argv[1], &dev_id, 1);
2498 if (r)
2499 return r;
2500
2501 r = dm_pool_create_thin(pool->pmd, dev_id);
2502 if (r) {
2503 DMWARN("Creation of new thinly-provisioned device with id %s failed.",
2504 argv[1]);
2505 return r;
2506 }
2507
2508 return 0;
2509}
2510
2511static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2512{
2513 dm_thin_id dev_id;
2514 dm_thin_id origin_dev_id;
2515 int r;
2516
2517 r = check_arg_count(argc, 3);
2518 if (r)
2519 return r;
2520
2521 r = read_dev_id(argv[1], &dev_id, 1);
2522 if (r)
2523 return r;
2524
2525 r = read_dev_id(argv[2], &origin_dev_id, 1);
2526 if (r)
2527 return r;
2528
2529 r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
2530 if (r) {
2531 DMWARN("Creation of new snapshot %s of device %s failed.",
2532 argv[1], argv[2]);
2533 return r;
2534 }
2535
2536 return 0;
2537}
2538
2539static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
2540{
2541 dm_thin_id dev_id;
2542 int r;
2543
2544 r = check_arg_count(argc, 2);
2545 if (r)
2546 return r;
2547
2548 r = read_dev_id(argv[1], &dev_id, 1);
2549 if (r)
2550 return r;
2551
2552 r = dm_pool_delete_thin_device(pool->pmd, dev_id);
2553 if (r)
2554 DMWARN("Deletion of thin device %s failed.", argv[1]);
2555
2556 return r;
2557}
2558
2559static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
2560{
2561 dm_thin_id old_id, new_id;
2562 int r;
2563
2564 r = check_arg_count(argc, 3);
2565 if (r)
2566 return r;
2567
2568 if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
2569 DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
2570 return -EINVAL;
2571 }
2572
2573 if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
2574 DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
2575 return -EINVAL;
2576 }
2577
2578 r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
2579 if (r) {
2580 DMWARN("Failed to change transaction id from %s to %s.",
2581 argv[1], argv[2]);
2582 return r;
2583 }
2584
2585 return 0;
2586}
2587
Joe Thornbercc8394d2012-06-03 00:30:01 +01002588static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2589{
2590 int r;
2591
2592 r = check_arg_count(argc, 1);
2593 if (r)
2594 return r;
2595
Joe Thornbere49e5822012-07-27 15:08:16 +01002596 (void) commit_or_fallback(pool);
Joe Thornber0d200ae2012-07-03 12:55:31 +01002597
Joe Thornbercc8394d2012-06-03 00:30:01 +01002598 r = dm_pool_reserve_metadata_snap(pool->pmd);
2599 if (r)
2600 DMWARN("reserve_metadata_snap message failed.");
2601
2602 return r;
2603}
2604
2605static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2606{
2607 int r;
2608
2609 r = check_arg_count(argc, 1);
2610 if (r)
2611 return r;
2612
2613 r = dm_pool_release_metadata_snap(pool->pmd);
2614 if (r)
2615 DMWARN("release_metadata_snap message failed.");
2616
2617 return r;
2618}
2619
Joe Thornber991d9fa2011-10-31 20:21:18 +00002620/*
2621 * Messages supported:
2622 * create_thin <dev_id>
2623 * create_snap <dev_id> <origin_id>
2624 * delete <dev_id>
2626 * set_transaction_id <current_trans_id> <new_trans_id>
Joe Thornbercc8394d2012-06-03 00:30:01 +01002627 * reserve_metadata_snap
2628 * release_metadata_snap
Joe Thornber991d9fa2011-10-31 20:21:18 +00002629 */
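/*
 * Illustrative usage only (the pool device name is an assumption): these
 * messages are sent to the pool target with dmsetup, e.g.
 *
 *    dmsetup message /dev/mapper/pool 0 "create_thin 0"
 *    dmsetup message /dev/mapper/pool 0 "create_snap 1 0"
 *    dmsetup message /dev/mapper/pool 0 "delete 1"
 */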
2630static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
2631{
2632 int r = -EINVAL;
2633 struct pool_c *pt = ti->private;
2634 struct pool *pool = pt->pool;
2635
2636 if (!strcasecmp(argv[0], "create_thin"))
2637 r = process_create_thin_mesg(argc, argv, pool);
2638
2639 else if (!strcasecmp(argv[0], "create_snap"))
2640 r = process_create_snap_mesg(argc, argv, pool);
2641
2642 else if (!strcasecmp(argv[0], "delete"))
2643 r = process_delete_mesg(argc, argv, pool);
2644
2645 else if (!strcasecmp(argv[0], "set_transaction_id"))
2646 r = process_set_transaction_id_mesg(argc, argv, pool);
2647
Joe Thornbercc8394d2012-06-03 00:30:01 +01002648 else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
2649 r = process_reserve_metadata_snap_mesg(argc, argv, pool);
2650
2651 else if (!strcasecmp(argv[0], "release_metadata_snap"))
2652 r = process_release_metadata_snap_mesg(argc, argv, pool);
2653
Joe Thornber991d9fa2011-10-31 20:21:18 +00002654 else
2655 DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
2656
Joe Thornbere49e5822012-07-27 15:08:16 +01002657 if (!r)
2658 (void) commit_or_fallback(pool);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002659
2660 return r;
2661}
2662
Joe Thornbere49e5822012-07-27 15:08:16 +01002663static void emit_flags(struct pool_features *pf, char *result,
2664 unsigned sz, unsigned maxlen)
2665{
2666 unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
2667 !pf->discard_passdown + (pf->mode == PM_READ_ONLY);
2668 DMEMIT("%u ", count);
2669
2670 if (!pf->zero_new_blocks)
2671 DMEMIT("skip_block_zeroing ");
2672
2673 if (!pf->discard_enabled)
2674 DMEMIT("ignore_discard ");
2675
2676 if (!pf->discard_passdown)
2677 DMEMIT("no_discard_passdown ");
2678
2679 if (pf->mode == PM_READ_ONLY)
2680 DMEMIT("read_only ");
2681}
2682
Joe Thornber991d9fa2011-10-31 20:21:18 +00002683/*
2684 * Status line is:
 2685 * <transaction id> <used metadata blocks>/<total metadata blocks>
 2686 * <used data blocks>/<total data blocks> <held metadata root>
 * ro|rw <discard_passdown|no_discard_passdown>
2687 */
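/*
 * e.g. (illustrative values only, not real output):
 *
 *    0 93/4161600 10672/1048576 - rw discard_passdown
 */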
2688static int pool_status(struct dm_target *ti, status_type_t type,
Alasdair G Kergon1f4e0ff2012-07-27 15:08:16 +01002689 unsigned status_flags, char *result, unsigned maxlen)
Joe Thornber991d9fa2011-10-31 20:21:18 +00002690{
Joe Thornbere49e5822012-07-27 15:08:16 +01002691 int r;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002692 unsigned sz = 0;
2693 uint64_t transaction_id;
2694 dm_block_t nr_free_blocks_data;
2695 dm_block_t nr_free_blocks_metadata;
2696 dm_block_t nr_blocks_data;
2697 dm_block_t nr_blocks_metadata;
2698 dm_block_t held_root;
2699 char buf[BDEVNAME_SIZE];
2700 char buf2[BDEVNAME_SIZE];
2701 struct pool_c *pt = ti->private;
2702 struct pool *pool = pt->pool;
2703
2704 switch (type) {
2705 case STATUSTYPE_INFO:
Joe Thornbere49e5822012-07-27 15:08:16 +01002706 if (get_pool_mode(pool) == PM_FAIL) {
2707 DMEMIT("Fail");
2708 break;
2709 }
2710
Alasdair G Kergon1f4e0ff2012-07-27 15:08:16 +01002711 /* Commit to ensure statistics aren't out-of-date */
2712 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
2713 (void) commit_or_fallback(pool);
2714
Joe Thornber991d9fa2011-10-31 20:21:18 +00002715 r = dm_pool_get_metadata_transaction_id(pool->pmd,
2716 &transaction_id);
2717 if (r)
2718 return r;
2719
2720 r = dm_pool_get_free_metadata_block_count(pool->pmd,
2721 &nr_free_blocks_metadata);
2722 if (r)
2723 return r;
2724
2725 r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
2726 if (r)
2727 return r;
2728
2729 r = dm_pool_get_free_block_count(pool->pmd,
2730 &nr_free_blocks_data);
2731 if (r)
2732 return r;
2733
2734 r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
2735 if (r)
2736 return r;
2737
Joe Thornbercc8394d2012-06-03 00:30:01 +01002738 r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002739 if (r)
2740 return r;
2741
2742 DMEMIT("%llu %llu/%llu %llu/%llu ",
2743 (unsigned long long)transaction_id,
2744 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2745 (unsigned long long)nr_blocks_metadata,
2746 (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
2747 (unsigned long long)nr_blocks_data);
2748
2749 if (held_root)
Joe Thornbere49e5822012-07-27 15:08:16 +01002750 DMEMIT("%llu ", held_root);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002751 else
Joe Thornbere49e5822012-07-27 15:08:16 +01002752 DMEMIT("- ");
2753
2754 if (pool->pf.mode == PM_READ_ONLY)
2755 DMEMIT("ro ");
2756 else
2757 DMEMIT("rw ");
2758
2759 if (pool->pf.discard_enabled && pool->pf.discard_passdown)
2760 DMEMIT("discard_passdown");
2761 else
2762 DMEMIT("no_discard_passdown");
Joe Thornber991d9fa2011-10-31 20:21:18 +00002763
2764 break;
2765
2766 case STATUSTYPE_TABLE:
2767 DMEMIT("%s %s %lu %llu ",
2768 format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
2769 format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
2770 (unsigned long)pool->sectors_per_block,
2771 (unsigned long long)pt->low_water_blocks);
Mike Snitzer0424caa2012-09-26 23:45:47 +01002772 emit_flags(&pt->requested_pf, result, sz, maxlen);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002773 break;
2774 }
2775
2776 return 0;
2777}
2778
2779static int pool_iterate_devices(struct dm_target *ti,
2780 iterate_devices_callout_fn fn, void *data)
2781{
2782 struct pool_c *pt = ti->private;
2783
2784 return fn(ti, pt->data_dev, 0, ti->len, data);
2785}
2786
2787static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
2788 struct bio_vec *biovec, int max_size)
2789{
2790 struct pool_c *pt = ti->private;
2791 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
2792
2793 if (!q->merge_bvec_fn)
2794 return max_size;
2795
2796 bvm->bi_bdev = pt->data_dev->bdev;
2797
2798 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2799}
2800
Mike Snitzer28eed342012-10-12 21:02:07 +01002801static bool block_size_is_power_of_two(struct pool *pool)
2802{
2803 return pool->sectors_per_block_shift >= 0;
2804}
2805
Mike Snitzer0424caa2012-09-26 23:45:47 +01002806static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
Joe Thornber104655f2012-03-28 18:41:28 +01002807{
Mike Snitzer0424caa2012-09-26 23:45:47 +01002808 struct pool *pool = pt->pool;
2809 struct queue_limits *data_limits;
2810
Joe Thornber104655f2012-03-28 18:41:28 +01002811 limits->max_discard_sectors = pool->sectors_per_block;
2812
2813 /*
Mike Snitzer0424caa2012-09-26 23:45:47 +01002814 * discard_granularity is just a hint, and not enforced.
Joe Thornber104655f2012-03-28 18:41:28 +01002815 */
Mike Snitzer0424caa2012-09-26 23:45:47 +01002816 if (pt->adjusted_pf.discard_passdown) {
2817 data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
2818 limits->discard_granularity = data_limits->discard_granularity;
Mike Snitzer28eed342012-10-12 21:02:07 +01002819 } else if (block_size_is_power_of_two(pool))
Mike Snitzer0424caa2012-09-26 23:45:47 +01002820 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
Mike Snitzer28eed342012-10-12 21:02:07 +01002821 else
2822 /*
2823 * Use largest power of 2 that is a factor of sectors_per_block
2824 * but at least DATA_DEV_BLOCK_SIZE_MIN_SECTORS.
2825 */
2826 limits->discard_granularity = max(1 << (ffs(pool->sectors_per_block) - 1),
2827 DATA_DEV_BLOCK_SIZE_MIN_SECTORS) << SECTOR_SHIFT;
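		/*
		 * Worked example (hypothetical numbers): sectors_per_block = 384
		 * (192KiB) gives ffs(384) - 1 = 7, so discard_granularity becomes
		 * 128 sectors (64KiB), the largest power-of-2 factor of 384.
		 */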
Joe Thornber104655f2012-03-28 18:41:28 +01002828}
2829
Joe Thornber991d9fa2011-10-31 20:21:18 +00002830static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
2831{
2832 struct pool_c *pt = ti->private;
2833 struct pool *pool = pt->pool;
2834
2835 blk_limits_io_min(limits, 0);
2836 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
Mike Snitzer0424caa2012-09-26 23:45:47 +01002837
2838 /*
2839 * pt->adjusted_pf is a staging area for the actual features to use.
2840 * They get transferred to the live pool in bind_control_target()
2841 * called from pool_preresume().
2842 */
2843 if (!pt->adjusted_pf.discard_enabled)
2844 return;
2845
2846 disable_passdown_if_not_supported(pt);
2847
2848 set_discard_limits(pt, limits);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002849}
2850
2851static struct target_type pool_target = {
2852 .name = "thin-pool",
2853 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2854 DM_TARGET_IMMUTABLE,
Mike Snitzer0424caa2012-09-26 23:45:47 +01002855 .version = {1, 4, 0},
Joe Thornber991d9fa2011-10-31 20:21:18 +00002856 .module = THIS_MODULE,
2857 .ctr = pool_ctr,
2858 .dtr = pool_dtr,
2859 .map = pool_map,
2860 .postsuspend = pool_postsuspend,
2861 .preresume = pool_preresume,
2862 .resume = pool_resume,
2863 .message = pool_message,
2864 .status = pool_status,
2865 .merge = pool_merge,
2866 .iterate_devices = pool_iterate_devices,
2867 .io_hints = pool_io_hints,
2868};
2869
2870/*----------------------------------------------------------------
2871 * Thin target methods
2872 *--------------------------------------------------------------*/
2873static void thin_dtr(struct dm_target *ti)
2874{
2875 struct thin_c *tc = ti->private;
2876
2877 mutex_lock(&dm_thin_pool_table.mutex);
2878
2879 __pool_dec(tc->pool);
2880 dm_pool_close_thin_device(tc->td);
2881 dm_put_device(ti, tc->pool_dev);
Joe Thornber2dd9c252012-03-28 18:41:28 +01002882 if (tc->origin_dev)
2883 dm_put_device(ti, tc->origin_dev);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002884 kfree(tc);
2885
2886 mutex_unlock(&dm_thin_pool_table.mutex);
2887}
2888
2889/*
2890 * Thin target parameters:
2891 *
Joe Thornber2dd9c252012-03-28 18:41:28 +01002892 * <pool_dev> <dev_id> [origin_dev]
Joe Thornber991d9fa2011-10-31 20:21:18 +00002893 *
2894 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
2895 * dev_id: the internal device identifier
Joe Thornber2dd9c252012-03-28 18:41:28 +01002896 * origin_dev: a device external to the pool that should act as the origin
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002897 *
2898 * If the pool device has discards disabled, they get disabled for the thin
2899 * device as well.
Joe Thornber991d9fa2011-10-31 20:21:18 +00002900 */
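/*
 * Sketch of matching table lines (device names and sizes are assumptions,
 * not taken from this file):
 *
 *    0 2097152 thin /dev/mapper/pool 0
 *    0 2097152 thin /dev/mapper/pool 1 /dev/mapper/external-origin
 *
 * i.e. a 1GiB thin device mapped to internal device id 0, and a second one
 * whose unprovisioned reads fall through to an external origin device.
 */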
2901static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2902{
2903 int r;
2904 struct thin_c *tc;
Joe Thornber2dd9c252012-03-28 18:41:28 +01002905 struct dm_dev *pool_dev, *origin_dev;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002906 struct mapped_device *pool_md;
2907
2908 mutex_lock(&dm_thin_pool_table.mutex);
2909
Joe Thornber2dd9c252012-03-28 18:41:28 +01002910 if (argc != 2 && argc != 3) {
Joe Thornber991d9fa2011-10-31 20:21:18 +00002911 ti->error = "Invalid argument count";
2912 r = -EINVAL;
2913 goto out_unlock;
2914 }
2915
2916 tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
2917 if (!tc) {
2918 ti->error = "Out of memory";
2919 r = -ENOMEM;
2920 goto out_unlock;
2921 }
2922
Joe Thornber2dd9c252012-03-28 18:41:28 +01002923 if (argc == 3) {
2924 r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
2925 if (r) {
2926 ti->error = "Error opening origin device";
2927 goto bad_origin_dev;
2928 }
2929 tc->origin_dev = origin_dev;
2930 }
2931
Joe Thornber991d9fa2011-10-31 20:21:18 +00002932 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
2933 if (r) {
2934 ti->error = "Error opening pool device";
2935 goto bad_pool_dev;
2936 }
2937 tc->pool_dev = pool_dev;
2938
2939 if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
2940 ti->error = "Invalid device id";
2941 r = -EINVAL;
2942 goto bad_common;
2943 }
2944
2945 pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
2946 if (!pool_md) {
2947 ti->error = "Couldn't get pool mapped device";
2948 r = -EINVAL;
2949 goto bad_common;
2950 }
2951
2952 tc->pool = __pool_table_lookup(pool_md);
2953 if (!tc->pool) {
2954 ti->error = "Couldn't find pool object";
2955 r = -EINVAL;
2956 goto bad_pool_lookup;
2957 }
2958 __pool_inc(tc->pool);
2959
Joe Thornbere49e5822012-07-27 15:08:16 +01002960 if (get_pool_mode(tc->pool) == PM_FAIL) {
 2961 ti->error = "Couldn't open thin device, Pool is in fail mode";
 r = -EINVAL;
 2962 goto bad_thin_open;
2963 }
2964
Joe Thornber991d9fa2011-10-31 20:21:18 +00002965 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
2966 if (r) {
2967 ti->error = "Couldn't open thin internal device";
2968 goto bad_thin_open;
2969 }
2970
Mike Snitzer542f9032012-07-27 15:08:00 +01002971 r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
2972 if (r)
2973 goto bad_thin_open;
2974
Joe Thornber991d9fa2011-10-31 20:21:18 +00002975 ti->num_flush_requests = 1;
Joe Thornber16ad3d12012-07-27 15:08:07 +01002976 ti->flush_supported = true;
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002977
2978 /* In case the pool supports discards, pass them on. */
2979 if (tc->pool->pf.discard_enabled) {
Alasdair G Kergon0ac55482012-07-27 15:08:08 +01002980 ti->discards_supported = true;
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002981 ti->num_discard_requests = 1;
Alasdair G Kergon0ac55482012-07-27 15:08:08 +01002982 ti->discard_zeroes_data_unsupported = true;
Mikulas Patocka49296302012-07-27 15:08:03 +01002983 /* Discard requests must be split on a block boundary */
Alasdair G Kergon0ac55482012-07-27 15:08:08 +01002984 ti->split_discard_requests = true;
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002985 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00002986
2987 dm_put(pool_md);
2988
2989 mutex_unlock(&dm_thin_pool_table.mutex);
2990
2991 return 0;
2992
2993bad_thin_open:
2994 __pool_dec(tc->pool);
2995bad_pool_lookup:
2996 dm_put(pool_md);
2997bad_common:
2998 dm_put_device(ti, tc->pool_dev);
2999bad_pool_dev:
Joe Thornber2dd9c252012-03-28 18:41:28 +01003000 if (tc->origin_dev)
3001 dm_put_device(ti, tc->origin_dev);
3002bad_origin_dev:
Joe Thornber991d9fa2011-10-31 20:21:18 +00003003 kfree(tc);
3004out_unlock:
3005 mutex_unlock(&dm_thin_pool_table.mutex);
3006
3007 return r;
3008}
3009
3010static int thin_map(struct dm_target *ti, struct bio *bio,
3011 union map_info *map_context)
3012{
Alasdair G Kergon6efd6e82012-03-28 18:41:28 +01003013 bio->bi_sector = dm_target_offset(ti, bio->bi_sector);
Joe Thornber991d9fa2011-10-31 20:21:18 +00003014
3015 return thin_bio_map(ti, bio, map_context);
3016}
3017
Joe Thornbereb2aa482012-03-28 18:41:28 +01003018static int thin_endio(struct dm_target *ti,
3019 struct bio *bio, int err,
3020 union map_info *map_context)
3021{
3022 unsigned long flags;
Mike Snitzera24c2562012-06-03 00:30:00 +01003023 struct dm_thin_endio_hook *h = map_context->ptr;
Joe Thornbereb2aa482012-03-28 18:41:28 +01003024 struct list_head work;
Mike Snitzera24c2562012-06-03 00:30:00 +01003025 struct dm_thin_new_mapping *m, *tmp;
Joe Thornbereb2aa482012-03-28 18:41:28 +01003026 struct pool *pool = h->tc->pool;
3027
3028 if (h->shared_read_entry) {
3029 INIT_LIST_HEAD(&work);
Mike Snitzer44feb382012-10-12 21:02:10 +01003030 dm_deferred_entry_dec(h->shared_read_entry, &work);
Joe Thornbereb2aa482012-03-28 18:41:28 +01003031
3032 spin_lock_irqsave(&pool->lock, flags);
3033 list_for_each_entry_safe(m, tmp, &work, list) {
3034 list_del(&m->list);
3035 m->quiesced = 1;
3036 __maybe_add_mapping(m);
3037 }
3038 spin_unlock_irqrestore(&pool->lock, flags);
3039 }
3040
	if (h->all_io_entry) {
		INIT_LIST_HEAD(&work);
		dm_deferred_entry_dec(h->all_io_entry, &work);
		spin_lock_irqsave(&pool->lock, flags);
		list_for_each_entry_safe(m, tmp, &work, list)
			list_add(&m->list, &pool->prepared_discards);
		spin_unlock_irqrestore(&pool->lock, flags);
	}

	mempool_free(h, pool->endio_hook_pool);

	return 0;
}

static void thin_postsuspend(struct dm_target *ti)
{
	if (dm_noflush_suspending(ti))
		requeue_io((struct thin_c *)ti->private);
}

/*
 * <nr mapped sectors> <highest mapped sector>
 */
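/*
 * Example (all values illustrative): a fully provisioned 1GiB thin device
 * on a pool with 64KiB blocks reports "2097152 2097151" for
 * STATUSTYPE_INFO, and something like "253:2 1" (pool device, then device
 * id, plus the origin device if one was given) for STATUSTYPE_TABLE.
 */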
static int thin_status(struct dm_target *ti, status_type_t type,
		       unsigned status_flags, char *result, unsigned maxlen)
{
	int r;
	ssize_t sz = 0;
	dm_block_t mapped, highest;
	char buf[BDEVNAME_SIZE];
	struct thin_c *tc = ti->private;

	if (get_pool_mode(tc->pool) == PM_FAIL) {
		DMEMIT("Fail");
		return 0;
	}

	if (!tc->td)
		DMEMIT("-");
	else {
		switch (type) {
		case STATUSTYPE_INFO:
			r = dm_thin_get_mapped_count(tc->td, &mapped);
			if (r)
				return r;

			r = dm_thin_get_highest_mapped_block(tc->td, &highest);
			if (r < 0)
				return r;

			DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
			if (r)
				DMEMIT("%llu", ((highest + 1) *
						tc->pool->sectors_per_block) - 1);
			else
				DMEMIT("-");
			break;

		case STATUSTYPE_TABLE:
			DMEMIT("%s %lu",
			       format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
			       (unsigned long) tc->dev_id);
			if (tc->origin_dev)
				DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
			break;
		}
	}

	return 0;
}

static int thin_iterate_devices(struct dm_target *ti,
				iterate_devices_callout_fn fn, void *data)
{
	sector_t blocks;
	struct thin_c *tc = ti->private;
	struct pool *pool = tc->pool;

	/*
	 * We can't call dm_pool_get_data_dev_size() since that blocks. So
	 * we follow a more convoluted path through to the pool's target.
	 */
	if (!pool->ti)
		return 0;	/* nothing is bound */

	blocks = pool->ti->len;
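	/*
	 * sector_t may be 64 bits wide even on 32-bit hosts, so use
	 * sector_div() rather than a plain '/' for the division.
	 */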
	(void) sector_div(blocks, pool->sectors_per_block);
	if (blocks)
		return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);

	return 0;
}

/*
 * A thin device always inherits its queue limits from its pool.
 */
static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct thin_c *tc = ti->private;

	*limits = bdev_get_queue(tc->pool_dev->bdev)->limits;
}

static struct target_type thin_target = {
	.name = "thin",
	.version = {1, 4, 0},
	.module = THIS_MODULE,
	.ctr = thin_ctr,
	.dtr = thin_dtr,
	.map = thin_map,
	.end_io = thin_endio,
	.postsuspend = thin_postsuspend,
	.status = thin_status,
	.iterate_devices = thin_iterate_devices,
	.io_hints = thin_io_hints,
};

/*----------------------------------------------------------------*/

static int __init dm_thin_init(void)
{
	int r;

	pool_table_init();

	r = dm_register_target(&thin_target);
	if (r)
		return r;

	r = dm_register_target(&pool_target);
	if (r)
		goto bad_pool_target;

	r = -ENOMEM;

	dm_bio_prison_init();

	_new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
	if (!_new_mapping_cache)
		goto bad_new_mapping_cache;

	_endio_hook_cache = KMEM_CACHE(dm_thin_endio_hook, 0);
	if (!_endio_hook_cache)
		goto bad_endio_hook_cache;

	return 0;

bad_endio_hook_cache:
	kmem_cache_destroy(_new_mapping_cache);
bad_new_mapping_cache:
	dm_unregister_target(&pool_target);
bad_pool_target:
	dm_unregister_target(&thin_target);

	return r;
}

static void dm_thin_exit(void)
{
	dm_unregister_target(&thin_target);
	dm_unregister_target(&pool_target);

	dm_bio_prison_exit();
	kmem_cache_destroy(_new_mapping_cache);
	kmem_cache_destroy(_endio_hook_cache);
}

module_init(dm_thin_init);
module_exit(dm_thin_exit);

MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");