/*
 * Copyright (C) 2011-2012 Red Hat UK.
 *
 * This file is released under the GPL.
 */

#include "dm-thin-metadata.h"
#include "dm-bio-prison.h"
#include "dm.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/rbtree.h>

#define DM_MSG_PREFIX "thin"

/*
 * Tunable constants
 */
#define ENDIO_HOOK_POOL_SIZE 1024
#define MAPPING_POOL_SIZE 1024
#define PRISON_CELLS 1024
#define COMMIT_PERIOD HZ
#define NO_SPACE_TIMEOUT (HZ * 60)

DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
		"A percentage of time allocated for copy on write");

/*
 * The block size of the device holding pool data must be
 * between 64KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

/*
 * Device id is restricted to 24 bits.
 */
#define MAX_DEV_ID ((1 << 24) - 1)

/*
 * How do we handle breaking sharing of data blocks?
 * =================================================
 *
 * We use a standard copy-on-write btree to store the mappings for the
 * devices (note I'm talking about copy-on-write of the metadata here, not
 * the data).  When you take an internal snapshot you clone the root node
 * of the origin btree.  After this there is no concept of an origin or a
 * snapshot.  They are just two device trees that happen to point to the
 * same data blocks.
 *
 * When we get a write in we decide if it's to a shared data block using
 * some timestamp magic.  If it is, we have to break sharing.
 *
 * Let's say we write to a shared block in what was the origin.  The
 * steps are:
 *
 * i) plug further io to this physical block. (see bio_prison code).
 *
 * ii) quiesce any read io to that shared data block.  Obviously
 * including all devices that share this block.  (see dm_deferred_set code)
 *
 * iii) copy the data block to a newly allocated block.  This step can be
 * skipped if the io covers the block. (schedule_copy).
 *
 * iv) insert the new mapping into the origin's btree
 * (process_prepared_mapping).  This act of inserting breaks some
 * sharing of btree nodes between the two devices.  Breaking sharing only
 * affects the btree of that specific device.  Btrees for the other
 * devices that share the block never change.  The btree for the origin
 * device as it was after the last commit is untouched, ie. we're using
 * persistent data structures in the functional programming sense.
 *
 * v) unplug io to this physical block, including the io that triggered
 * the breaking of sharing.
 *
 * Steps (ii) and (iii) occur in parallel.
 *
 * The metadata _doesn't_ need to be committed before the io continues.  We
 * get away with this because the io is always written to a _new_ block.
 * If there's a crash, then:
 *
 * - The origin mapping will point to the old origin block (the shared
 * one).  This will contain the data as it was before the io that triggered
 * the breaking of sharing came in.
 *
 * - The snap mapping still points to the old block.  As it would after
 * the commit.
 *
 * The downside of this scheme is the timestamp magic isn't perfect, and
 * will continue to think that the data block in the snapshot device is
 * shared even after the write to the origin has broken sharing.  I suspect
 * data blocks will typically be shared by many different devices, so we're
 * breaking sharing n + 1 times, rather than n, where n is the number of
 * devices that reference this data block.  At the moment I think the
 * benefits far, far outweigh the disadvantages.
 */

/*----------------------------------------------------------------*/

/*
 * Key building.
 */
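/*
 * A "data" key (virtual = 0) identifies a physical block on the pool's data
 * device; a "virtual" key (virtual = 1) identifies a block in a thin
 * device's logical address space.  Both are used to detain bios in the bio
 * prison while that block is being worked on.
 */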
static void build_data_key(struct dm_thin_device *td,
			   dm_block_t b, struct dm_cell_key *key)
{
	key->virtual = 0;
	key->dev = dm_thin_dev_id(td);
	key->block = b;
}

static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
			      struct dm_cell_key *key)
{
	key->virtual = 1;
	key->dev = dm_thin_dev_id(td);
	key->block = b;
}

/*----------------------------------------------------------------*/

/*
 * A pool device ties together a metadata device and a data device.  It
 * also provides the interface for creating and destroying internal
 * devices.
 */
struct dm_thin_new_mapping;

/*
 * The pool runs in 4 modes.  Ordered in degraded order for comparisons.
 */
enum pool_mode {
	PM_WRITE,		/* metadata may be changed */
	PM_OUT_OF_DATA_SPACE,	/* metadata may be changed, though data may not be allocated */
	PM_READ_ONLY,		/* metadata may not be changed */
	PM_FAIL,		/* all I/O fails */
};

struct pool_features {
	enum pool_mode mode;

	bool zero_new_blocks:1;
	bool discard_enabled:1;
	bool discard_passdown:1;
	bool error_if_no_space:1;
};

struct thin_c;
typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);

struct pool {
	struct list_head list;
	struct dm_target *ti;	/* Only set if a pool target is bound */

	struct mapped_device *pool_md;
	struct block_device *md_dev;
	struct dm_pool_metadata *pmd;

	dm_block_t low_water_blocks;
	uint32_t sectors_per_block;
	int sectors_per_block_shift;

	struct pool_features pf;
	bool low_water_triggered:1;	/* A dm event has been sent */

	struct dm_bio_prison *prison;
	struct dm_kcopyd_client *copier;

	struct workqueue_struct *wq;
	struct work_struct worker;
	struct delayed_work waker;
	struct delayed_work no_space_timeout;

	unsigned long last_commit_jiffies;
	unsigned ref_count;

	spinlock_t lock;
	struct bio_list deferred_flush_bios;
	struct list_head prepared_mappings;
	struct list_head prepared_discards;
	struct list_head active_thins;

	struct dm_deferred_set *shared_read_ds;
	struct dm_deferred_set *all_io_ds;

	struct dm_thin_new_mapping *next_mapping;
	mempool_t *mapping_pool;

	process_bio_fn process_bio;
	process_bio_fn process_discard;

	process_mapping_fn process_prepared_mapping;
	process_mapping_fn process_prepared_discard;
};

static enum pool_mode get_pool_mode(struct pool *pool);
static void metadata_operation_failed(struct pool *pool, const char *op, int r);

/*
 * Target context for a pool.
 */
struct pool_c {
	struct dm_target *ti;
	struct pool *pool;
	struct dm_dev *data_dev;
	struct dm_dev *metadata_dev;
	struct dm_target_callbacks callbacks;

	dm_block_t low_water_blocks;
	struct pool_features requested_pf; /* Features requested during table load */
	struct pool_features adjusted_pf;  /* Features used after adjusting for constituent devices */
};

/*
 * Target context for a thin.
 */
struct thin_c {
	struct list_head list;
	struct dm_dev *pool_dev;
	struct dm_dev *origin_dev;
	dm_thin_id dev_id;

	struct pool *pool;
	struct dm_thin_device *td;
	bool requeue_mode:1;
	spinlock_t lock;
	struct bio_list deferred_bio_list;
	struct bio_list retry_on_resume_list;
	struct rb_root sort_bio_list; /* sorted list of deferred bios */

	/*
	 * Ensures the thin is not destroyed until the worker has finished
	 * iterating the active_thins list.
	 */
	atomic_t refcount;
	struct completion can_destroy;
};

/*----------------------------------------------------------------*/

/*
 * wake_worker() is used when new work is queued and when pool_resume is
 * ready to continue deferred IO processing.
 */
static void wake_worker(struct pool *pool)
{
	queue_work(pool->wq, &pool->worker);
}

/*----------------------------------------------------------------*/

static int bio_detain(struct pool *pool, struct dm_cell_key *key, struct bio *bio,
		      struct dm_bio_prison_cell **cell_result)
{
	int r;
	struct dm_bio_prison_cell *cell_prealloc;

	/*
	 * Allocate a cell from the prison's mempool.
	 * This might block but it can't fail.
	 */
	cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO);

	r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result);
	if (r)
		/*
		 * We reused an old cell; we can get rid of
		 * the new one.
		 */
		dm_bio_prison_free_cell(pool->prison, cell_prealloc);

	return r;
}

static void cell_release(struct pool *pool,
			 struct dm_bio_prison_cell *cell,
			 struct bio_list *bios)
{
	dm_cell_release(pool->prison, cell, bios);
	dm_bio_prison_free_cell(pool->prison, cell);
}

static void cell_release_no_holder(struct pool *pool,
				   struct dm_bio_prison_cell *cell,
				   struct bio_list *bios)
{
	dm_cell_release_no_holder(pool->prison, cell, bios);
	dm_bio_prison_free_cell(pool->prison, cell);
}

static void cell_defer_no_holder_no_free(struct thin_c *tc,
					 struct dm_bio_prison_cell *cell)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	spin_lock_irqsave(&tc->lock, flags);
	dm_cell_release_no_holder(pool->prison, cell, &tc->deferred_bio_list);
	spin_unlock_irqrestore(&tc->lock, flags);

	wake_worker(pool);
}

static void cell_error(struct pool *pool,
		       struct dm_bio_prison_cell *cell)
{
	dm_cell_error(pool->prison, cell);
	dm_bio_prison_free_cell(pool->prison, cell);
}

/*----------------------------------------------------------------*/

/*
 * A global list of pools that uses a struct mapped_device as a key.
 */
static struct dm_thin_pool_table {
	struct mutex mutex;
	struct list_head pools;
} dm_thin_pool_table;

static void pool_table_init(void)
{
	mutex_init(&dm_thin_pool_table.mutex);
	INIT_LIST_HEAD(&dm_thin_pool_table.pools);
}

static void __pool_table_insert(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	list_add(&pool->list, &dm_thin_pool_table.pools);
}

static void __pool_table_remove(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	list_del(&pool->list);
}

static struct pool *__pool_table_lookup(struct mapped_device *md)
{
	struct pool *pool = NULL, *tmp;

	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));

	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
		if (tmp->pool_md == md) {
			pool = tmp;
			break;
		}
	}

	return pool;
}

static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
{
	struct pool *pool = NULL, *tmp;

	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));

	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
		if (tmp->md_dev == md_dev) {
			pool = tmp;
			break;
		}
	}

	return pool;
}

/*----------------------------------------------------------------*/

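/*
 * Per-bio state for the thin target, carried in the bio's per-bio data
 * (see the dm_per_bio_data() calls below).
 */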
struct dm_thin_endio_hook {
	struct thin_c *tc;
	struct dm_deferred_entry *shared_read_entry;
	struct dm_deferred_entry *all_io_entry;
	struct dm_thin_new_mapping *overwrite_mapping;
	struct rb_node rb_node;
};

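/*
 * Drain the given bio list, completing each bio with DM_ENDIO_REQUEUE so
 * the dm core will requeue it rather than report an error.
 */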
static void requeue_bio_list(struct thin_c *tc, struct bio_list *master)
{
	struct bio *bio;
	struct bio_list bios;
	unsigned long flags;

	bio_list_init(&bios);

	spin_lock_irqsave(&tc->lock, flags);
	bio_list_merge(&bios, master);
	bio_list_init(master);
	spin_unlock_irqrestore(&tc->lock, flags);

	while ((bio = bio_list_pop(&bios)))
		bio_endio(bio, DM_ENDIO_REQUEUE);
}

static void requeue_io(struct thin_c *tc)
{
	requeue_bio_list(tc, &tc->deferred_bio_list);
	requeue_bio_list(tc, &tc->retry_on_resume_list);
}

static void error_thin_retry_list(struct thin_c *tc)
{
	struct bio *bio;
	unsigned long flags;
	struct bio_list bios;

	bio_list_init(&bios);

	spin_lock_irqsave(&tc->lock, flags);
	bio_list_merge(&bios, &tc->retry_on_resume_list);
	bio_list_init(&tc->retry_on_resume_list);
	spin_unlock_irqrestore(&tc->lock, flags);

	while ((bio = bio_list_pop(&bios)))
		bio_io_error(bio);
}

static void error_retry_list(struct pool *pool)
{
	struct thin_c *tc;

	rcu_read_lock();
	list_for_each_entry_rcu(tc, &pool->active_thins, list)
		error_thin_retry_list(tc);
	rcu_read_unlock();
}

/*
 * This section of code contains the logic for processing a thin device's IO.
 * Much of the code depends on pool object resources (lists, workqueues, etc)
 * but most is exclusively called from the thin target rather than the thin-pool
 * target.
 */

static bool block_size_is_power_of_two(struct pool *pool)
{
	return pool->sectors_per_block_shift >= 0;
}

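/*
 * Convert a bio's starting sector into a pool block number, using a shift
 * when the pool's block size is a power of two and a division otherwise.
 */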
static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
{
	struct pool *pool = tc->pool;
	sector_t block_nr = bio->bi_iter.bi_sector;

	if (block_size_is_power_of_two(pool))
		block_nr >>= pool->sectors_per_block_shift;
	else
		(void) sector_div(block_nr, pool->sectors_per_block);

	return block_nr;
}

static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
{
	struct pool *pool = tc->pool;
	sector_t bi_sector = bio->bi_iter.bi_sector;

	bio->bi_bdev = tc->pool_dev->bdev;
	if (block_size_is_power_of_two(pool))
		bio->bi_iter.bi_sector =
			(block << pool->sectors_per_block_shift) |
			(bi_sector & (pool->sectors_per_block - 1));
	else
		bio->bi_iter.bi_sector = (block * pool->sectors_per_block) +
				sector_div(bi_sector, pool->sectors_per_block);
}

static void remap_to_origin(struct thin_c *tc, struct bio *bio)
{
	bio->bi_bdev = tc->origin_dev->bdev;
}

static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
{
	return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
		dm_thin_changed_this_transaction(tc->td);
}

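/*
 * Track this bio in the pool's "all io" deferred set so that discards can
 * be quiesced against in-flight io.  Discard bios are deliberately not
 * tracked here.
 */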
static void inc_all_io_entry(struct pool *pool, struct bio *bio)
{
	struct dm_thin_endio_hook *h;

	if (bio->bi_rw & REQ_DISCARD)
		return;

	h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
	h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
}

static void issue(struct thin_c *tc, struct bio *bio)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	if (!bio_triggers_commit(tc, bio)) {
		generic_make_request(bio);
		return;
	}

	/*
	 * Complete bio with an error if earlier I/O caused changes to
	 * the metadata that can't be committed, e.g. due to I/O errors
	 * on the metadata device.
	 */
	if (dm_thin_aborted_changes(tc->td)) {
		bio_io_error(bio);
		return;
	}

	/*
	 * Batch together any bios that trigger commits and then issue a
	 * single commit for them in process_deferred_bios().
	 */
	spin_lock_irqsave(&pool->lock, flags);
	bio_list_add(&pool->deferred_flush_bios, bio);
	spin_unlock_irqrestore(&pool->lock, flags);
}

static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
{
	remap_to_origin(tc, bio);
	issue(tc, bio);
}

static void remap_and_issue(struct thin_c *tc, struct bio *bio,
			    dm_block_t block)
{
	remap(tc, bio, block);
	issue(tc, bio);
}

/*----------------------------------------------------------------*/

/*
 * Bio endio functions.
 */
struct dm_thin_new_mapping {
	struct list_head list;

	bool quiesced:1;
	bool prepared:1;
	bool pass_discard:1;
	bool definitely_not_shared:1;

	int err;
	struct thin_c *tc;
	dm_block_t virt_block;
	dm_block_t data_block;
	struct dm_bio_prison_cell *cell, *cell2;

	/*
	 * If the bio covers the whole area of a block then we can avoid
	 * zeroing or copying.  Instead this bio is hooked.  The bio will
	 * still be in the cell, so care has to be taken to avoid issuing
	 * the bio twice.
	 */
	struct bio *bio;
	bio_end_io_t *saved_bi_end_io;
};

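/*
 * A new mapping is only handed to the worker once the data copy (or
 * overwrite) has completed (prepared) and any io to the shared origin
 * block has been quiesced (quiesced); the two can finish in either order.
 */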
static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
{
	struct pool *pool = m->tc->pool;

	if (m->quiesced && m->prepared) {
		list_add_tail(&m->list, &pool->prepared_mappings);
		wake_worker(pool);
	}
}

static void copy_complete(int read_err, unsigned long write_err, void *context)
{
	unsigned long flags;
	struct dm_thin_new_mapping *m = context;
	struct pool *pool = m->tc->pool;

	m->err = read_err || write_err ? -EIO : 0;

	spin_lock_irqsave(&pool->lock, flags);
	m->prepared = true;
	__maybe_add_mapping(m);
	spin_unlock_irqrestore(&pool->lock, flags);
}

static void overwrite_endio(struct bio *bio, int err)
{
	unsigned long flags;
	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
	struct dm_thin_new_mapping *m = h->overwrite_mapping;
	struct pool *pool = m->tc->pool;

	m->err = err;

	spin_lock_irqsave(&pool->lock, flags);
	m->prepared = true;
	__maybe_add_mapping(m);
	spin_unlock_irqrestore(&pool->lock, flags);
}

/*----------------------------------------------------------------*/

/*
 * Workqueue.
 */

/*
 * Prepared mapping jobs.
 */

/*
 * This sends the bios in the cell back to the thin's deferred_bio_list.
 */
static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	spin_lock_irqsave(&tc->lock, flags);
	cell_release(pool, cell, &tc->deferred_bio_list);
	spin_unlock_irqrestore(&tc->lock, flags);

	wake_worker(pool);
}

/*
 * Same as cell_defer above, except it omits the original holder of the cell.
 */
static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	spin_lock_irqsave(&tc->lock, flags);
	cell_release_no_holder(pool, cell, &tc->deferred_bio_list);
	spin_unlock_irqrestore(&tc->lock, flags);

	wake_worker(pool);
}

static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
{
	if (m->bio) {
		m->bio->bi_end_io = m->saved_bi_end_io;
		atomic_inc(&m->bio->bi_remaining);
	}
	cell_error(m->tc->pool, m->cell);
	list_del(&m->list);
	mempool_free(m, m->tc->pool->mapping_pool);
}

static void process_prepared_mapping(struct dm_thin_new_mapping *m)
{
	struct thin_c *tc = m->tc;
	struct pool *pool = tc->pool;
	struct bio *bio;
	int r;

	bio = m->bio;
	if (bio) {
		bio->bi_end_io = m->saved_bi_end_io;
		atomic_inc(&bio->bi_remaining);
	}

	if (m->err) {
		cell_error(pool, m->cell);
		goto out;
	}

	/*
	 * Commit the prepared block into the mapping btree.
	 * Any I/O for this block arriving after this point will get
	 * remapped to it directly.
	 */
	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
	if (r) {
		metadata_operation_failed(pool, "dm_thin_insert_block", r);
		cell_error(pool, m->cell);
		goto out;
	}

	/*
	 * Release any bios held while the block was being provisioned.
	 * If we are processing a write bio that completely covers the block,
	 * we already processed it so can ignore it now when processing
	 * the bios in the cell.
	 */
	if (bio) {
		cell_defer_no_holder(tc, m->cell);
		bio_endio(bio, 0);
	} else
		cell_defer(tc, m->cell);

out:
	list_del(&m->list);
	mempool_free(m, pool->mapping_pool);
}

static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
{
	struct thin_c *tc = m->tc;

	bio_io_error(m->bio);
	cell_defer_no_holder(tc, m->cell);
	cell_defer_no_holder(tc, m->cell2);
	mempool_free(m, tc->pool->mapping_pool);
}

static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
{
	struct thin_c *tc = m->tc;

	inc_all_io_entry(tc->pool, m->bio);
	cell_defer_no_holder(tc, m->cell);
	cell_defer_no_holder(tc, m->cell2);

	if (m->pass_discard)
		if (m->definitely_not_shared)
			remap_and_issue(tc, m->bio, m->data_block);
		else {
			bool used = false;
			if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used)
				bio_endio(m->bio, 0);
			else
				remap_and_issue(tc, m->bio, m->data_block);
		}
	else
		bio_endio(m->bio, 0);

	mempool_free(m, tc->pool->mapping_pool);
}

static void process_prepared_discard(struct dm_thin_new_mapping *m)
{
	int r;
	struct thin_c *tc = m->tc;

	r = dm_thin_remove_block(tc->td, m->virt_block);
	if (r)
		DMERR_LIMIT("dm_thin_remove_block() failed");

	process_prepared_discard_passdown(m);
}

static void process_prepared(struct pool *pool, struct list_head *head,
			     process_mapping_fn *fn)
{
	unsigned long flags;
	struct list_head maps;
	struct dm_thin_new_mapping *m, *tmp;

	INIT_LIST_HEAD(&maps);
	spin_lock_irqsave(&pool->lock, flags);
	list_splice_init(head, &maps);
	spin_unlock_irqrestore(&pool->lock, flags);

	list_for_each_entry_safe(m, tmp, &maps, list)
		(*fn)(m);
}

/*
 * Deferred bio jobs.
 */
static int io_overlaps_block(struct pool *pool, struct bio *bio)
{
	return bio->bi_iter.bi_size ==
		(pool->sectors_per_block << SECTOR_SHIFT);
}

static int io_overwrites_block(struct pool *pool, struct bio *bio)
{
	return (bio_data_dir(bio) == WRITE) &&
		io_overlaps_block(pool, bio);
}

static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
			       bio_end_io_t *fn)
{
	*save = bio->bi_end_io;
	bio->bi_end_io = fn;
}

static int ensure_next_mapping(struct pool *pool)
{
	if (pool->next_mapping)
		return 0;

	pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);

	return pool->next_mapping ? 0 : -ENOMEM;
}

static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
{
	struct dm_thin_new_mapping *m = pool->next_mapping;

	BUG_ON(!pool->next_mapping);

	memset(m, 0, sizeof(struct dm_thin_new_mapping));
	INIT_LIST_HEAD(&m->list);
	m->bio = NULL;

	pool->next_mapping = NULL;

	return m;
}

static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
			  struct dm_dev *origin, dm_block_t data_origin,
			  dm_block_t data_dest,
			  struct dm_bio_prison_cell *cell, struct bio *bio)
{
	int r;
	struct pool *pool = tc->pool;
	struct dm_thin_new_mapping *m = get_next_mapping(pool);

	m->tc = tc;
	m->virt_block = virt_block;
	m->data_block = data_dest;
	m->cell = cell;

	if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
		m->quiesced = true;

	/*
	 * IO to pool_dev remaps to the pool target's data_dev.
	 *
	 * If the whole block of data is being overwritten, we can issue the
	 * bio immediately. Otherwise we use kcopyd to clone the data first.
	 */
	if (io_overwrites_block(pool, bio)) {
		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));

		h->overwrite_mapping = m;
		m->bio = bio;
		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
		inc_all_io_entry(pool, bio);
		remap_and_issue(tc, bio, data_dest);
	} else {
		struct dm_io_region from, to;

		from.bdev = origin->bdev;
		from.sector = data_origin * pool->sectors_per_block;
		from.count = pool->sectors_per_block;

		to.bdev = tc->pool_dev->bdev;
		to.sector = data_dest * pool->sectors_per_block;
		to.count = pool->sectors_per_block;

		r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
				   0, copy_complete, m);
		if (r < 0) {
			mempool_free(m, pool->mapping_pool);
			DMERR_LIMIT("dm_kcopyd_copy() failed");
			cell_error(pool, cell);
		}
	}
}

static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
				   dm_block_t data_origin, dm_block_t data_dest,
				   struct dm_bio_prison_cell *cell, struct bio *bio)
{
	schedule_copy(tc, virt_block, tc->pool_dev,
		      data_origin, data_dest, cell, bio);
}

static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
				   dm_block_t data_dest,
				   struct dm_bio_prison_cell *cell, struct bio *bio)
{
	schedule_copy(tc, virt_block, tc->origin_dev,
		      virt_block, data_dest, cell, bio);
}

static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
			  dm_block_t data_block, struct dm_bio_prison_cell *cell,
			  struct bio *bio)
{
	struct pool *pool = tc->pool;
	struct dm_thin_new_mapping *m = get_next_mapping(pool);

	m->quiesced = true;
	m->prepared = false;
	m->tc = tc;
	m->virt_block = virt_block;
	m->data_block = data_block;
	m->cell = cell;

	/*
	 * If the whole block of data is being overwritten or we are not
	 * zeroing pre-existing data, we can issue the bio immediately.
	 * Otherwise we use kcopyd to zero the data first.
	 */
	if (!pool->pf.zero_new_blocks)
		process_prepared_mapping(m);

	else if (io_overwrites_block(pool, bio)) {
		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));

		h->overwrite_mapping = m;
		m->bio = bio;
		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
		inc_all_io_entry(pool, bio);
		remap_and_issue(tc, bio, data_block);
	} else {
		int r;
		struct dm_io_region to;

		to.bdev = tc->pool_dev->bdev;
		to.sector = data_block * pool->sectors_per_block;
		to.count = pool->sectors_per_block;

		r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
		if (r < 0) {
			mempool_free(m, pool->mapping_pool);
			DMERR_LIMIT("dm_kcopyd_zero() failed");
			cell_error(pool, cell);
		}
	}
}

/*
 * A non-zero return indicates read_only or fail_io mode.
 * Many callers don't care about the return value.
 */
static int commit(struct pool *pool)
{
	int r;

	if (get_pool_mode(pool) >= PM_READ_ONLY)
		return -EINVAL;

	r = dm_pool_commit_metadata(pool->pmd);
	if (r)
		metadata_operation_failed(pool, "dm_pool_commit_metadata", r);

	return r;
}

static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
{
	unsigned long flags;

	if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
		DMWARN("%s: reached low water mark for data device: sending event.",
		       dm_device_name(pool->pool_md));
		spin_lock_irqsave(&pool->lock, flags);
		pool->low_water_triggered = true;
		spin_unlock_irqrestore(&pool->lock, flags);
		dm_table_event(pool->ti->table);
	}
}

static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);

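/*
 * Allocate a new data block for a thin device.  If no blocks appear to be
 * free, a metadata commit is attempted first in case it releases space; if
 * the pool is genuinely out of data space it is switched to
 * PM_OUT_OF_DATA_SPACE and -ENOSPC is returned.
 */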
Joe Thornber991d9fa2011-10-31 20:21:18 +0000966static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
967{
968 int r;
969 dm_block_t free_blocks;
Joe Thornber991d9fa2011-10-31 20:21:18 +0000970 struct pool *pool = tc->pool;
971
Joe Thornber3e1a0692014-03-03 16:03:26 +0000972 if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
Joe Thornber8d30abf2013-12-04 19:16:11 -0500973 return -EINVAL;
974
Joe Thornber991d9fa2011-10-31 20:21:18 +0000975 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
Joe Thornberb5330652013-12-04 19:51:33 -0500976 if (r) {
977 metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
Joe Thornber991d9fa2011-10-31 20:21:18 +0000978 return r;
Joe Thornberb5330652013-12-04 19:51:33 -0500979 }
Joe Thornber991d9fa2011-10-31 20:21:18 +0000980
Joe Thornber88a66212013-12-04 20:16:12 -0500981 check_low_water_mark(pool, free_blocks);
Joe Thornber991d9fa2011-10-31 20:21:18 +0000982
983 if (!free_blocks) {
Mike Snitzer94563ba2013-08-22 09:56:18 -0400984 /*
985 * Try to commit to see if that will free up some
986 * more space.
987 */
Joe Thornber020cc3b2013-12-04 15:05:36 -0500988 r = commit(pool);
989 if (r)
990 return r;
Mike Snitzer94563ba2013-08-22 09:56:18 -0400991
992 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
Joe Thornberb5330652013-12-04 19:51:33 -0500993 if (r) {
994 metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
Mike Snitzer94563ba2013-08-22 09:56:18 -0400995 return r;
Joe Thornberb5330652013-12-04 19:51:33 -0500996 }
Mike Snitzer94563ba2013-08-22 09:56:18 -0400997
Mike Snitzer94563ba2013-08-22 09:56:18 -0400998 if (!free_blocks) {
Joe Thornber3e1a0692014-03-03 16:03:26 +0000999 set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001000 return -ENOSPC;
Joe Thornber991d9fa2011-10-31 20:21:18 +00001001 }
1002 }
1003
1004 r = dm_pool_alloc_data_block(pool->pmd, result);
Mike Snitzer4a02b342013-12-03 12:20:57 -05001005 if (r) {
Joe Thornberb5330652013-12-04 19:51:33 -05001006 metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001007 return r;
Mike Snitzer4a02b342013-12-03 12:20:57 -05001008 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00001009
1010 return 0;
1011}
1012
1013/*
1014 * If we have run out of space, queue bios until the device is
1015 * resumed, presumably after having been reloaded with more space.
1016 */
1017static void retry_on_resume(struct bio *bio)
1018{
Mikulas Patocka59c3d2c2012-12-21 20:23:40 +00001019 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
Joe Thornbereb2aa482012-03-28 18:41:28 +01001020 struct thin_c *tc = h->tc;
Joe Thornber991d9fa2011-10-31 20:21:18 +00001021 unsigned long flags;
1022
Mike Snitzerc140e1c2014-03-20 21:17:14 -04001023 spin_lock_irqsave(&tc->lock, flags);
1024 bio_list_add(&tc->retry_on_resume_list, bio);
1025 spin_unlock_irqrestore(&tc->lock, flags);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001026}
1027
Joe Thornber3e1a0692014-03-03 16:03:26 +00001028static bool should_error_unserviceable_bio(struct pool *pool)
1029{
1030 enum pool_mode m = get_pool_mode(pool);
1031
1032 switch (m) {
1033 case PM_WRITE:
1034 /* Shouldn't get here */
1035 DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
1036 return true;
1037
1038 case PM_OUT_OF_DATA_SPACE:
1039 return pool->pf.error_if_no_space;
1040
1041 case PM_READ_ONLY:
1042 case PM_FAIL:
1043 return true;
1044 default:
1045 /* Shouldn't get here */
1046 DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
1047 return true;
1048 }
1049}
1050
Mike Snitzer8c0f0e82013-12-05 15:47:24 -05001051static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
1052{
Joe Thornber3e1a0692014-03-03 16:03:26 +00001053 if (should_error_unserviceable_bio(pool))
Mike Snitzer8c0f0e82013-12-05 15:47:24 -05001054 bio_io_error(bio);
Mike Snitzer6d162022013-12-20 18:09:02 -05001055 else
1056 retry_on_resume(bio);
Mike Snitzer8c0f0e82013-12-05 15:47:24 -05001057}
1058
Mike Snitzer399cadd2013-12-05 16:03:33 -05001059static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell)
Joe Thornber991d9fa2011-10-31 20:21:18 +00001060{
1061 struct bio *bio;
1062 struct bio_list bios;
1063
Joe Thornber3e1a0692014-03-03 16:03:26 +00001064 if (should_error_unserviceable_bio(pool)) {
1065 cell_error(pool, cell);
1066 return;
1067 }
1068
Joe Thornber991d9fa2011-10-31 20:21:18 +00001069 bio_list_init(&bios);
Joe Thornber6beca5e2013-03-01 22:45:50 +00001070 cell_release(pool, cell, &bios);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001071
Joe Thornber3e1a0692014-03-03 16:03:26 +00001072 if (should_error_unserviceable_bio(pool))
1073 while ((bio = bio_list_pop(&bios)))
1074 bio_io_error(bio);
1075 else
1076 while ((bio = bio_list_pop(&bios)))
1077 retry_on_resume(bio);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001078}
1079
Joe Thornber104655f2012-03-28 18:41:28 +01001080static void process_discard(struct thin_c *tc, struct bio *bio)
1081{
1082 int r;
Mike Snitzerc3a0ce22012-05-12 01:43:16 +01001083 unsigned long flags;
Joe Thornber104655f2012-03-28 18:41:28 +01001084 struct pool *pool = tc->pool;
Mike Snitzera24c2562012-06-03 00:30:00 +01001085 struct dm_bio_prison_cell *cell, *cell2;
Mike Snitzer44feb382012-10-12 21:02:10 +01001086 struct dm_cell_key key, key2;
Joe Thornber104655f2012-03-28 18:41:28 +01001087 dm_block_t block = get_bio_block(tc, bio);
1088 struct dm_thin_lookup_result lookup_result;
Mike Snitzera24c2562012-06-03 00:30:00 +01001089 struct dm_thin_new_mapping *m;
Joe Thornber104655f2012-03-28 18:41:28 +01001090
1091 build_virtual_key(tc->td, block, &key);
Joe Thornber6beca5e2013-03-01 22:45:50 +00001092 if (bio_detain(tc->pool, &key, bio, &cell))
Joe Thornber104655f2012-03-28 18:41:28 +01001093 return;
1094
1095 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1096 switch (r) {
1097 case 0:
1098 /*
1099 * Check nobody is fiddling with this pool block. This can
1100 * happen if someone's in the process of breaking sharing
1101 * on this block.
1102 */
1103 build_data_key(tc->td, lookup_result.block, &key2);
Joe Thornber6beca5e2013-03-01 22:45:50 +00001104 if (bio_detain(tc->pool, &key2, bio, &cell2)) {
Joe Thornberf286ba02012-12-21 20:23:33 +00001105 cell_defer_no_holder(tc, cell);
Joe Thornber104655f2012-03-28 18:41:28 +01001106 break;
1107 }
1108
1109 if (io_overlaps_block(pool, bio)) {
1110 /*
1111 * IO may still be going to the destination block. We must
1112 * quiesce before we can do the removal.
1113 */
1114 m = get_next_mapping(pool);
1115 m->tc = tc;
Joe Thornber19fa1a62013-12-17 12:09:40 -05001116 m->pass_discard = pool->pf.discard_passdown;
1117 m->definitely_not_shared = !lookup_result.shared;
Joe Thornber104655f2012-03-28 18:41:28 +01001118 m->virt_block = block;
1119 m->data_block = lookup_result.block;
1120 m->cell = cell;
1121 m->cell2 = cell2;
Joe Thornber104655f2012-03-28 18:41:28 +01001122 m->bio = bio;
1123
Mike Snitzer44feb382012-10-12 21:02:10 +01001124 if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) {
Mike Snitzerc3a0ce22012-05-12 01:43:16 +01001125 spin_lock_irqsave(&pool->lock, flags);
Mike Snitzerdaec3382013-12-11 14:01:20 -05001126 list_add_tail(&m->list, &pool->prepared_discards);
Mike Snitzerc3a0ce22012-05-12 01:43:16 +01001127 spin_unlock_irqrestore(&pool->lock, flags);
Joe Thornber104655f2012-03-28 18:41:28 +01001128 wake_worker(pool);
1129 }
1130 } else {
Joe Thornbere8088072012-12-21 20:23:31 +00001131 inc_all_io_entry(pool, bio);
Joe Thornberf286ba02012-12-21 20:23:33 +00001132 cell_defer_no_holder(tc, cell);
1133 cell_defer_no_holder(tc, cell2);
Joe Thornbere8088072012-12-21 20:23:31 +00001134
Joe Thornber104655f2012-03-28 18:41:28 +01001135 /*
Mikulas Patocka49296302012-07-27 15:08:03 +01001136 * The DM core makes sure that the discard doesn't span
1137 * a block boundary. So we submit the discard of a
1138 * partial block appropriately.
Joe Thornber104655f2012-03-28 18:41:28 +01001139 */
Mikulas Patocka650d2a02012-07-20 14:25:05 +01001140 if ((!lookup_result.shared) && pool->pf.discard_passdown)
1141 remap_and_issue(tc, bio, lookup_result.block);
1142 else
1143 bio_endio(bio, 0);
Joe Thornber104655f2012-03-28 18:41:28 +01001144 }
1145 break;
1146
1147 case -ENODATA:
1148 /*
1149 * It isn't provisioned, just forget it.
1150 */
Joe Thornberf286ba02012-12-21 20:23:33 +00001151 cell_defer_no_holder(tc, cell);
Joe Thornber104655f2012-03-28 18:41:28 +01001152 bio_endio(bio, 0);
1153 break;
1154
1155 default:
Mike Snitzerc3977412012-12-21 20:23:34 +00001156 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1157 __func__, r);
Joe Thornberf286ba02012-12-21 20:23:33 +00001158 cell_defer_no_holder(tc, cell);
Joe Thornber104655f2012-03-28 18:41:28 +01001159 bio_io_error(bio);
1160 break;
1161 }
1162}
1163
Joe Thornber991d9fa2011-10-31 20:21:18 +00001164static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
Mike Snitzer44feb382012-10-12 21:02:10 +01001165 struct dm_cell_key *key,
Joe Thornber991d9fa2011-10-31 20:21:18 +00001166 struct dm_thin_lookup_result *lookup_result,
Mike Snitzera24c2562012-06-03 00:30:00 +01001167 struct dm_bio_prison_cell *cell)
Joe Thornber991d9fa2011-10-31 20:21:18 +00001168{
1169 int r;
1170 dm_block_t data_block;
Mike Snitzerd6fc2042013-08-21 17:40:11 -04001171 struct pool *pool = tc->pool;
Joe Thornber991d9fa2011-10-31 20:21:18 +00001172
1173 r = alloc_data_block(tc, &data_block);
1174 switch (r) {
1175 case 0:
Joe Thornber2dd9c252012-03-28 18:41:28 +01001176 schedule_internal_copy(tc, block, lookup_result->block,
1177 data_block, cell, bio);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001178 break;
1179
1180 case -ENOSPC:
Mike Snitzer399cadd2013-12-05 16:03:33 -05001181 retry_bios_on_resume(pool, cell);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001182 break;
1183
1184 default:
Mike Snitzerc3977412012-12-21 20:23:34 +00001185 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1186 __func__, r);
Mike Snitzerd6fc2042013-08-21 17:40:11 -04001187 cell_error(pool, cell);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001188 break;
1189 }
1190}
1191
1192static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1193 dm_block_t block,
1194 struct dm_thin_lookup_result *lookup_result)
1195{
Mike Snitzera24c2562012-06-03 00:30:00 +01001196 struct dm_bio_prison_cell *cell;
Joe Thornber991d9fa2011-10-31 20:21:18 +00001197 struct pool *pool = tc->pool;
Mike Snitzer44feb382012-10-12 21:02:10 +01001198 struct dm_cell_key key;
Joe Thornber991d9fa2011-10-31 20:21:18 +00001199
1200 /*
1201 * If cell is already occupied, then sharing is already in the process
1202 * of being broken so we have nothing further to do here.
1203 */
1204 build_data_key(tc->td, lookup_result->block, &key);
Joe Thornber6beca5e2013-03-01 22:45:50 +00001205 if (bio_detain(pool, &key, bio, &cell))
Joe Thornber991d9fa2011-10-31 20:21:18 +00001206 return;
1207
Kent Overstreet4f024f32013-10-11 15:44:27 -07001208 if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size)
Joe Thornber991d9fa2011-10-31 20:21:18 +00001209 break_sharing(tc, bio, block, &key, lookup_result, cell);
1210 else {
Mikulas Patocka59c3d2c2012-12-21 20:23:40 +00001211 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
Joe Thornber991d9fa2011-10-31 20:21:18 +00001212
Mike Snitzer44feb382012-10-12 21:02:10 +01001213 h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
Joe Thornbere8088072012-12-21 20:23:31 +00001214 inc_all_io_entry(pool, bio);
Joe Thornberf286ba02012-12-21 20:23:33 +00001215 cell_defer_no_holder(tc, cell);
Joe Thornbere8088072012-12-21 20:23:31 +00001216
Joe Thornber991d9fa2011-10-31 20:21:18 +00001217 remap_and_issue(tc, bio, lookup_result->block);
1218 }
1219}
1220
1221static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
Mike Snitzera24c2562012-06-03 00:30:00 +01001222 struct dm_bio_prison_cell *cell)
Joe Thornber991d9fa2011-10-31 20:21:18 +00001223{
1224 int r;
1225 dm_block_t data_block;
Joe Thornber6beca5e2013-03-01 22:45:50 +00001226 struct pool *pool = tc->pool;
Joe Thornber991d9fa2011-10-31 20:21:18 +00001227
1228 /*
1229 * Remap empty bios (flushes) immediately, without provisioning.
1230 */
Kent Overstreet4f024f32013-10-11 15:44:27 -07001231 if (!bio->bi_iter.bi_size) {
Joe Thornber6beca5e2013-03-01 22:45:50 +00001232 inc_all_io_entry(pool, bio);
Joe Thornberf286ba02012-12-21 20:23:33 +00001233 cell_defer_no_holder(tc, cell);
Joe Thornbere8088072012-12-21 20:23:31 +00001234
Joe Thornber991d9fa2011-10-31 20:21:18 +00001235 remap_and_issue(tc, bio, 0);
1236 return;
1237 }
1238
1239 /*
1240 * Fill read bios with zeroes and complete them immediately.
1241 */
1242 if (bio_data_dir(bio) == READ) {
1243 zero_fill_bio(bio);
Joe Thornberf286ba02012-12-21 20:23:33 +00001244 cell_defer_no_holder(tc, cell);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001245 bio_endio(bio, 0);
1246 return;
1247 }
1248
1249 r = alloc_data_block(tc, &data_block);
1250 switch (r) {
1251 case 0:
Joe Thornber2dd9c252012-03-28 18:41:28 +01001252 if (tc->origin_dev)
1253 schedule_external_copy(tc, block, data_block, cell, bio);
1254 else
1255 schedule_zero(tc, block, data_block, cell, bio);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001256 break;
1257
1258 case -ENOSPC:
Mike Snitzer399cadd2013-12-05 16:03:33 -05001259 retry_bios_on_resume(pool, cell);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001260 break;
1261
1262 default:
Mike Snitzerc3977412012-12-21 20:23:34 +00001263 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1264 __func__, r);
Joe Thornber6beca5e2013-03-01 22:45:50 +00001265 cell_error(pool, cell);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001266 break;
1267 }
1268}
1269
1270static void process_bio(struct thin_c *tc, struct bio *bio)
1271{
1272 int r;
Joe Thornber6beca5e2013-03-01 22:45:50 +00001273 struct pool *pool = tc->pool;
Joe Thornber991d9fa2011-10-31 20:21:18 +00001274 dm_block_t block = get_bio_block(tc, bio);
Mike Snitzera24c2562012-06-03 00:30:00 +01001275 struct dm_bio_prison_cell *cell;
Mike Snitzer44feb382012-10-12 21:02:10 +01001276 struct dm_cell_key key;
Joe Thornber991d9fa2011-10-31 20:21:18 +00001277 struct dm_thin_lookup_result lookup_result;
1278
1279 /*
1280 * If cell is already occupied, then the block is already
1281 * being provisioned so we have nothing further to do here.
1282 */
1283 build_virtual_key(tc->td, block, &key);
Joe Thornber6beca5e2013-03-01 22:45:50 +00001284 if (bio_detain(pool, &key, bio, &cell))
Joe Thornber991d9fa2011-10-31 20:21:18 +00001285 return;
1286
1287 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1288 switch (r) {
1289 case 0:
Joe Thornbere8088072012-12-21 20:23:31 +00001290 if (lookup_result.shared) {
Joe Thornber991d9fa2011-10-31 20:21:18 +00001291 process_shared_bio(tc, bio, block, &lookup_result);
Joe Thornber6beca5e2013-03-01 22:45:50 +00001292 cell_defer_no_holder(tc, cell); /* FIXME: pass this cell into process_shared? */
Joe Thornbere8088072012-12-21 20:23:31 +00001293 } else {
Joe Thornber6beca5e2013-03-01 22:45:50 +00001294 inc_all_io_entry(pool, bio);
Joe Thornberf286ba02012-12-21 20:23:33 +00001295 cell_defer_no_holder(tc, cell);
Joe Thornbere8088072012-12-21 20:23:31 +00001296
Joe Thornber991d9fa2011-10-31 20:21:18 +00001297 remap_and_issue(tc, bio, lookup_result.block);
Joe Thornbere8088072012-12-21 20:23:31 +00001298 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00001299 break;
1300
1301 case -ENODATA:
Joe Thornber2dd9c252012-03-28 18:41:28 +01001302 if (bio_data_dir(bio) == READ && tc->origin_dev) {
Joe Thornber6beca5e2013-03-01 22:45:50 +00001303 inc_all_io_entry(pool, bio);
Joe Thornberf286ba02012-12-21 20:23:33 +00001304 cell_defer_no_holder(tc, cell);
Joe Thornbere8088072012-12-21 20:23:31 +00001305
Joe Thornber2dd9c252012-03-28 18:41:28 +01001306 remap_to_origin_and_issue(tc, bio);
1307 } else
1308 provision_block(tc, bio, block, cell);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001309 break;
1310
1311 default:
Mike Snitzerc3977412012-12-21 20:23:34 +00001312 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1313 __func__, r);
Joe Thornberf286ba02012-12-21 20:23:33 +00001314 cell_defer_no_holder(tc, cell);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001315 bio_io_error(bio);
1316 break;
1317 }
1318}
1319
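/*
 * Read-only variant of process_bio: existing mappings are still remapped,
 * but any bio that would need a new block or would break sharing is handed
 * to handle_unserviceable_bio (which errors it or queues it for retry on
 * resume).
 */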
Joe Thornbere49e5822012-07-27 15:08:16 +01001320static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1321{
1322 int r;
1323 int rw = bio_data_dir(bio);
1324 dm_block_t block = get_bio_block(tc, bio);
1325 struct dm_thin_lookup_result lookup_result;
1326
1327 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1328 switch (r) {
1329 case 0:
Kent Overstreet4f024f32013-10-11 15:44:27 -07001330 if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size)
Mike Snitzer8c0f0e82013-12-05 15:47:24 -05001331 handle_unserviceable_bio(tc->pool, bio);
Joe Thornbere8088072012-12-21 20:23:31 +00001332 else {
1333 inc_all_io_entry(tc->pool, bio);
Joe Thornbere49e5822012-07-27 15:08:16 +01001334 remap_and_issue(tc, bio, lookup_result.block);
Joe Thornbere8088072012-12-21 20:23:31 +00001335 }
Joe Thornbere49e5822012-07-27 15:08:16 +01001336 break;
1337
1338 case -ENODATA:
1339 if (rw != READ) {
Mike Snitzer8c0f0e82013-12-05 15:47:24 -05001340 handle_unserviceable_bio(tc->pool, bio);
Joe Thornbere49e5822012-07-27 15:08:16 +01001341 break;
1342 }
1343
1344 if (tc->origin_dev) {
Joe Thornbere8088072012-12-21 20:23:31 +00001345 inc_all_io_entry(tc->pool, bio);
Joe Thornbere49e5822012-07-27 15:08:16 +01001346 remap_to_origin_and_issue(tc, bio);
1347 break;
1348 }
1349
1350 zero_fill_bio(bio);
1351 bio_endio(bio, 0);
1352 break;
1353
1354 default:
Mike Snitzerc3977412012-12-21 20:23:34 +00001355 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1356 __func__, r);
Joe Thornbere49e5822012-07-27 15:08:16 +01001357 bio_io_error(bio);
1358 break;
1359 }
1360}
1361
Joe Thornber3e1a0692014-03-03 16:03:26 +00001362static void process_bio_success(struct thin_c *tc, struct bio *bio)
1363{
1364 bio_endio(bio, 0);
1365}
1366
Joe Thornbere49e5822012-07-27 15:08:16 +01001367static void process_bio_fail(struct thin_c *tc, struct bio *bio)
1368{
1369 bio_io_error(bio);
1370}
1371
Joe Thornberac8c3f32013-05-10 14:37:21 +01001372/*
1373 * FIXME: should we also commit due to size of transaction, measured in
1374 * metadata blocks?
1375 */
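/*
 * Returns non-zero once at least COMMIT_PERIOD jiffies have passed since
 * the last commit; the first comparison also forces a commit if jiffies
 * has wrapped past last_commit_jiffies.
 */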
Joe Thornber905e51b2012-03-28 18:41:27 +01001376static int need_commit_due_to_time(struct pool *pool)
1377{
1378 return jiffies < pool->last_commit_jiffies ||
1379 jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
1380}
1381
Mike Snitzer67324ea2014-03-21 18:33:41 -04001382#define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node)
1383#define thin_bio(pbd) dm_bio_from_per_bio_data((pbd), sizeof(struct dm_thin_endio_hook))
1384
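/*
 * Insert a deferred bio into the per-thin rb-tree, keyed on bi_sector, so
 * the bios can later be extracted in ascending sector order.
 */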
1385static void __thin_bio_rb_add(struct thin_c *tc, struct bio *bio)
1386{
1387 struct rb_node **rbp, *parent;
1388 struct dm_thin_endio_hook *pbd;
1389 sector_t bi_sector = bio->bi_iter.bi_sector;
1390
1391 rbp = &tc->sort_bio_list.rb_node;
1392 parent = NULL;
1393 while (*rbp) {
1394 parent = *rbp;
1395 pbd = thin_pbd(parent);
1396
1397 if (bi_sector < thin_bio(pbd)->bi_iter.bi_sector)
1398 rbp = &(*rbp)->rb_left;
1399 else
1400 rbp = &(*rbp)->rb_right;
1401 }
1402
1403 pbd = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1404 rb_link_node(&pbd->rb_node, parent, rbp);
1405 rb_insert_color(&pbd->rb_node, &tc->sort_bio_list);
1406}
1407
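/*
 * Walk the rb-tree in order, moving each bio back onto deferred_bio_list
 * and leaving sort_bio_list empty.
 */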
1408static void __extract_sorted_bios(struct thin_c *tc)
1409{
1410 struct rb_node *node;
1411 struct dm_thin_endio_hook *pbd;
1412 struct bio *bio;
1413
1414 for (node = rb_first(&tc->sort_bio_list); node; node = rb_next(node)) {
1415 pbd = thin_pbd(node);
1416 bio = thin_bio(pbd);
1417
1418 bio_list_add(&tc->deferred_bio_list, bio);
1419 rb_erase(&pbd->rb_node, &tc->sort_bio_list);
1420 }
1421
1422 WARN_ON(!RB_EMPTY_ROOT(&tc->sort_bio_list));
1423}
1424
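/*
 * Sort the thin device's deferred bios by sector so they are issued in
 * ascending order, which should give the block layer a better chance of
 * merging them.
 */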
1425static void __sort_thin_deferred_bios(struct thin_c *tc)
1426{
1427 struct bio *bio;
1428 struct bio_list bios;
1429
1430 bio_list_init(&bios);
1431 bio_list_merge(&bios, &tc->deferred_bio_list);
1432 bio_list_init(&tc->deferred_bio_list);
1433
1434 /* Sort deferred_bio_list using rb-tree */
1435 while ((bio = bio_list_pop(&bios)))
1436 __thin_bio_rb_add(tc, bio);
1437
1438 /*
1439 * Transfer the sorted bios in sort_bio_list back to
1440 * deferred_bio_list to allow lockless submission of
1441 * all bios.
1442 */
1443 __extract_sorted_bios(tc);
1444}
1445
Mike Snitzerc140e1c2014-03-20 21:17:14 -04001446static void process_thin_deferred_bios(struct thin_c *tc)
Joe Thornber991d9fa2011-10-31 20:21:18 +00001447{
Mike Snitzerc140e1c2014-03-20 21:17:14 -04001448 struct pool *pool = tc->pool;
Joe Thornber991d9fa2011-10-31 20:21:18 +00001449 unsigned long flags;
1450 struct bio *bio;
1451 struct bio_list bios;
Mike Snitzer67324ea2014-03-21 18:33:41 -04001452 struct blk_plug plug;
Joe Thornber991d9fa2011-10-31 20:21:18 +00001453
Mike Snitzerc140e1c2014-03-20 21:17:14 -04001454 if (tc->requeue_mode) {
1455 requeue_bio_list(tc, &tc->deferred_bio_list);
1456 return;
1457 }
1458
Joe Thornber991d9fa2011-10-31 20:21:18 +00001459 bio_list_init(&bios);
1460
Mike Snitzerc140e1c2014-03-20 21:17:14 -04001461 spin_lock_irqsave(&tc->lock, flags);
Mike Snitzer67324ea2014-03-21 18:33:41 -04001462
1463 if (bio_list_empty(&tc->deferred_bio_list)) {
1464 spin_unlock_irqrestore(&tc->lock, flags);
1465 return;
1466 }
1467
1468 __sort_thin_deferred_bios(tc);
1469
Mike Snitzerc140e1c2014-03-20 21:17:14 -04001470 bio_list_merge(&bios, &tc->deferred_bio_list);
1471 bio_list_init(&tc->deferred_bio_list);
Mike Snitzer67324ea2014-03-21 18:33:41 -04001472
Mike Snitzerc140e1c2014-03-20 21:17:14 -04001473 spin_unlock_irqrestore(&tc->lock, flags);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001474
Mike Snitzer67324ea2014-03-21 18:33:41 -04001475 blk_start_plug(&plug);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001476 while ((bio = bio_list_pop(&bios))) {
Joe Thornber991d9fa2011-10-31 20:21:18 +00001477 /*
1478 * If we've got no free new_mapping structs, and processing
1479 * this bio might require one, we pause until there are some
1480 * prepared mappings to process.
1481 */
1482 if (ensure_next_mapping(pool)) {
Mike Snitzerc140e1c2014-03-20 21:17:14 -04001483 spin_lock_irqsave(&tc->lock, flags);
1484 bio_list_add(&tc->deferred_bio_list, bio);
1485 bio_list_merge(&tc->deferred_bio_list, &bios);
1486 spin_unlock_irqrestore(&tc->lock, flags);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001487 break;
1488 }
Joe Thornber104655f2012-03-28 18:41:28 +01001489
1490 if (bio->bi_rw & REQ_DISCARD)
Joe Thornbere49e5822012-07-27 15:08:16 +01001491 pool->process_discard(tc, bio);
Joe Thornber104655f2012-03-28 18:41:28 +01001492 else
Joe Thornbere49e5822012-07-27 15:08:16 +01001493 pool->process_bio(tc, bio);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001494 }
Mike Snitzer67324ea2014-03-21 18:33:41 -04001495 blk_finish_plug(&plug);
Mike Snitzerc140e1c2014-03-20 21:17:14 -04001496}
1497
Joe Thornberb10ebd32014-04-08 11:29:01 +01001498static void thin_get(struct thin_c *tc);
1499static void thin_put(struct thin_c *tc);
1500
1501/*
1502 * We can't hold rcu_read_lock() around code that can block. So we
1503 * find a thin with the rcu lock held; bump a refcount; then drop
1504 * the lock.
1505 */
1506static struct thin_c *get_first_thin(struct pool *pool)
1507{
1508 struct thin_c *tc = NULL;
1509
1510 rcu_read_lock();
1511 if (!list_empty(&pool->active_thins)) {
1512 tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list);
1513 thin_get(tc);
1514 }
1515 rcu_read_unlock();
1516
1517 return tc;
1518}
1519
1520static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc)
1521{
1522 struct thin_c *old_tc = tc;
1523
1524 rcu_read_lock();
1525 list_for_each_entry_continue_rcu(tc, &pool->active_thins, list) {
1526 thin_get(tc);
1527 thin_put(old_tc);
1528 rcu_read_unlock();
1529 return tc;
1530 }
1531 thin_put(old_tc);
1532 rcu_read_unlock();
1533
1534 return NULL;
1535}
1536
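/*
 * Process the deferred bios of every active thin device, then commit the
 * metadata (if anything changed) before issuing the deferred flush bios.
 */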
Mike Snitzerc140e1c2014-03-20 21:17:14 -04001537static void process_deferred_bios(struct pool *pool)
1538{
1539 unsigned long flags;
1540 struct bio *bio;
1541 struct bio_list bios;
1542 struct thin_c *tc;
1543
Joe Thornberb10ebd32014-04-08 11:29:01 +01001544 tc = get_first_thin(pool);
1545 while (tc) {
Mike Snitzerc140e1c2014-03-20 21:17:14 -04001546 process_thin_deferred_bios(tc);
Joe Thornberb10ebd32014-04-08 11:29:01 +01001547 tc = get_next_thin(pool, tc);
1548 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00001549
1550 /*
1551 * If there are any deferred flush bios, we must commit
1552 * the metadata before issuing them.
1553 */
1554 bio_list_init(&bios);
1555 spin_lock_irqsave(&pool->lock, flags);
1556 bio_list_merge(&bios, &pool->deferred_flush_bios);
1557 bio_list_init(&pool->deferred_flush_bios);
1558 spin_unlock_irqrestore(&pool->lock, flags);
1559
Mike Snitzer4d1662a2014-02-06 06:08:56 -05001560 if (bio_list_empty(&bios) &&
1561 !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
Joe Thornber991d9fa2011-10-31 20:21:18 +00001562 return;
1563
Joe Thornber020cc3b2013-12-04 15:05:36 -05001564 if (commit(pool)) {
Joe Thornber991d9fa2011-10-31 20:21:18 +00001565 while ((bio = bio_list_pop(&bios)))
1566 bio_io_error(bio);
1567 return;
1568 }
Joe Thornber905e51b2012-03-28 18:41:27 +01001569 pool->last_commit_jiffies = jiffies;
Joe Thornber991d9fa2011-10-31 20:21:18 +00001570
1571 while ((bio = bio_list_pop(&bios)))
1572 generic_make_request(bio);
1573}
1574
1575static void do_worker(struct work_struct *ws)
1576{
1577 struct pool *pool = container_of(ws, struct pool, worker);
1578
Joe Thornbere49e5822012-07-27 15:08:16 +01001579 process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
1580 process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001581 process_deferred_bios(pool);
1582}
1583
Joe Thornber905e51b2012-03-28 18:41:27 +01001584/*
1585 * We want to commit periodically so that not too much
1586 * unwritten data builds up.
1587 */
1588static void do_waker(struct work_struct *ws)
1589{
1590 struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
1591 wake_worker(pool);
1592 queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
1593}
1594
Joe Thornber85ad6432014-05-09 15:59:38 +01001595/*
1596	 * We're holding onto IO to allow userland time to react. After the
1597	 * timeout, either the pool will have been resized (and is thus back in
1598 * PM_WRITE mode), or we degrade to PM_READ_ONLY and start erroring IO.
1599 */
1600static void do_no_space_timeout(struct work_struct *ws)
1601{
1602 struct pool *pool = container_of(to_delayed_work(ws), struct pool,
1603 no_space_timeout);
1604
1605 if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space)
1606 set_pool_mode(pool, PM_READ_ONLY);
1607}
1608
Joe Thornber991d9fa2011-10-31 20:21:18 +00001609/*----------------------------------------------------------------*/
1610
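/*
 * noflush_work runs a small function on the pool's workqueue and waits for
 * it to finish; it is used to flip a thin device's requeue_mode flag
 * without racing against the worker thread.
 */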
Joe Thornber738211f2014-03-03 15:52:28 +00001611struct noflush_work {
1612 struct work_struct worker;
1613 struct thin_c *tc;
1614
1615 atomic_t complete;
1616 wait_queue_head_t wait;
1617};
1618
1619static void complete_noflush_work(struct noflush_work *w)
1620{
1621 atomic_set(&w->complete, 1);
1622 wake_up(&w->wait);
1623}
1624
1625static void do_noflush_start(struct work_struct *ws)
1626{
1627 struct noflush_work *w = container_of(ws, struct noflush_work, worker);
1628 w->tc->requeue_mode = true;
1629 requeue_io(w->tc);
1630 complete_noflush_work(w);
1631}
1632
1633static void do_noflush_stop(struct work_struct *ws)
1634{
1635 struct noflush_work *w = container_of(ws, struct noflush_work, worker);
1636 w->tc->requeue_mode = false;
1637 complete_noflush_work(w);
1638}
1639
1640static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
1641{
1642 struct noflush_work w;
1643
Mike Snitzerfbcde3d2014-04-29 11:22:04 -04001644 INIT_WORK_ONSTACK(&w.worker, fn);
Joe Thornber738211f2014-03-03 15:52:28 +00001645 w.tc = tc;
1646 atomic_set(&w.complete, 0);
1647 init_waitqueue_head(&w.wait);
1648
1649 queue_work(tc->pool->wq, &w.worker);
1650
1651 wait_event(w.wait, atomic_read(&w.complete));
1652}
1653
1654/*----------------------------------------------------------------*/
1655
Joe Thornbere49e5822012-07-27 15:08:16 +01001656static enum pool_mode get_pool_mode(struct pool *pool)
1657{
1658 return pool->pf.mode;
1659}
1660
Joe Thornber3e1a0692014-03-03 16:03:26 +00001661static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
1662{
1663 dm_table_event(pool->ti->table);
1664 DMINFO("%s: switching pool to %s mode",
1665 dm_device_name(pool->pool_md), new_mode);
1666}
1667
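/*
 * Switch the pool between PM_WRITE, PM_READ_ONLY, PM_OUT_OF_DATA_SPACE and
 * PM_FAIL, wiring up the process_* function pointers appropriate to the
 * new mode and notifying userland of the change.
 */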
Mike Snitzer8b64e882013-12-20 14:27:28 -05001668static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
Joe Thornbere49e5822012-07-27 15:08:16 +01001669{
Mike Snitzercdc2b412014-02-14 18:10:55 -05001670 struct pool_c *pt = pool->ti->private;
Mike Snitzer07f2b6e2014-02-14 11:58:41 -05001671 bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
1672 enum pool_mode old_mode = get_pool_mode(pool);
1673
1674 /*
1675 * Never allow the pool to transition to PM_WRITE mode if user
1676 * intervention is required to verify metadata and data consistency.
1677 */
1678 if (new_mode == PM_WRITE && needs_check) {
1679 DMERR("%s: unable to switch pool to write mode until repaired.",
1680 dm_device_name(pool->pool_md));
1681 if (old_mode != new_mode)
1682 new_mode = old_mode;
1683 else
1684 new_mode = PM_READ_ONLY;
1685 }
1686 /*
1687 * If we were in PM_FAIL mode, rollback of metadata failed. We're
1688 * not going to recover without a thin_repair. So we never let the
1689 * pool move out of the old mode.
1690 */
1691 if (old_mode == PM_FAIL)
1692 new_mode = old_mode;
Joe Thornbere49e5822012-07-27 15:08:16 +01001693
Mike Snitzer8b64e882013-12-20 14:27:28 -05001694 switch (new_mode) {
Joe Thornbere49e5822012-07-27 15:08:16 +01001695 case PM_FAIL:
Mike Snitzer8b64e882013-12-20 14:27:28 -05001696 if (old_mode != new_mode)
Joe Thornber3e1a0692014-03-03 16:03:26 +00001697 notify_of_pool_mode_change(pool, "failure");
Joe Thornber5383ef32013-12-04 16:30:01 -05001698 dm_pool_metadata_read_only(pool->pmd);
Joe Thornbere49e5822012-07-27 15:08:16 +01001699 pool->process_bio = process_bio_fail;
1700 pool->process_discard = process_bio_fail;
1701 pool->process_prepared_mapping = process_prepared_mapping_fail;
1702 pool->process_prepared_discard = process_prepared_discard_fail;
Joe Thornber3e1a0692014-03-03 16:03:26 +00001703
1704 error_retry_list(pool);
Joe Thornbere49e5822012-07-27 15:08:16 +01001705 break;
1706
1707 case PM_READ_ONLY:
Mike Snitzer8b64e882013-12-20 14:27:28 -05001708 if (old_mode != new_mode)
Joe Thornber3e1a0692014-03-03 16:03:26 +00001709 notify_of_pool_mode_change(pool, "read-only");
1710 dm_pool_metadata_read_only(pool->pmd);
1711 pool->process_bio = process_bio_read_only;
1712 pool->process_discard = process_bio_success;
1713 pool->process_prepared_mapping = process_prepared_mapping_fail;
1714 pool->process_prepared_discard = process_prepared_discard_passdown;
1715
1716 error_retry_list(pool);
1717 break;
1718
1719 case PM_OUT_OF_DATA_SPACE:
1720 /*
1721 * Ideally we'd never hit this state; the low water mark
1722 * would trigger userland to extend the pool before we
1723 * completely run out of data space. However, many small
1724 * IOs to unprovisioned space can consume data space at an
1725 * alarming rate. Adjust your low water mark if you're
1726 * frequently seeing this mode.
1727 */
1728 if (old_mode != new_mode)
1729 notify_of_pool_mode_change(pool, "out-of-data-space");
1730 pool->process_bio = process_bio_read_only;
1731 pool->process_discard = process_discard;
1732 pool->process_prepared_mapping = process_prepared_mapping;
1733 pool->process_prepared_discard = process_prepared_discard_passdown;
Joe Thornber85ad6432014-05-09 15:59:38 +01001734
1735 if (!pool->pf.error_if_no_space)
1736 queue_delayed_work(pool->wq, &pool->no_space_timeout, NO_SPACE_TIMEOUT);
Joe Thornbere49e5822012-07-27 15:08:16 +01001737 break;
1738
1739 case PM_WRITE:
Mike Snitzer8b64e882013-12-20 14:27:28 -05001740 if (old_mode != new_mode)
Joe Thornber3e1a0692014-03-03 16:03:26 +00001741 notify_of_pool_mode_change(pool, "write");
Joe Thornber9b7aaa62013-12-04 16:58:19 -05001742 dm_pool_metadata_read_write(pool->pmd);
Joe Thornbere49e5822012-07-27 15:08:16 +01001743 pool->process_bio = process_bio;
1744 pool->process_discard = process_discard;
1745 pool->process_prepared_mapping = process_prepared_mapping;
1746 pool->process_prepared_discard = process_prepared_discard;
1747 break;
1748 }
Mike Snitzer8b64e882013-12-20 14:27:28 -05001749
1750 pool->pf.mode = new_mode;
Mike Snitzercdc2b412014-02-14 18:10:55 -05001751 /*
1752 * The pool mode may have changed, sync it so bind_control_target()
1753 * doesn't cause an unexpected mode transition on resume.
1754 */
1755 pt->adjusted_pf.mode = new_mode;
Joe Thornbere49e5822012-07-27 15:08:16 +01001756}
1757
Mike Snitzer07f2b6e2014-02-14 11:58:41 -05001758static void abort_transaction(struct pool *pool)
1759{
1760 const char *dev_name = dm_device_name(pool->pool_md);
1761
1762 DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
1763 if (dm_pool_abort_metadata(pool->pmd)) {
1764 DMERR("%s: failed to abort metadata transaction", dev_name);
1765 set_pool_mode(pool, PM_FAIL);
1766 }
1767
1768 if (dm_pool_metadata_set_needs_check(pool->pmd)) {
1769 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
1770 set_pool_mode(pool, PM_FAIL);
1771 }
1772}
1773
Joe Thornberb5330652013-12-04 19:51:33 -05001774static void metadata_operation_failed(struct pool *pool, const char *op, int r)
1775{
1776 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
1777 dm_device_name(pool->pool_md), op, r);
1778
Mike Snitzer07f2b6e2014-02-14 11:58:41 -05001779 abort_transaction(pool);
Joe Thornberb5330652013-12-04 19:51:33 -05001780 set_pool_mode(pool, PM_READ_ONLY);
1781}
1782
Joe Thornbere49e5822012-07-27 15:08:16 +01001783/*----------------------------------------------------------------*/
1784
Joe Thornber991d9fa2011-10-31 20:21:18 +00001785/*
1786 * Mapping functions.
1787 */
1788
1789/*
1790 * Called only while mapping a thin bio to hand it over to the workqueue.
1791 */
1792static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
1793{
1794 unsigned long flags;
1795 struct pool *pool = tc->pool;
1796
Mike Snitzerc140e1c2014-03-20 21:17:14 -04001797 spin_lock_irqsave(&tc->lock, flags);
1798 bio_list_add(&tc->deferred_bio_list, bio);
1799 spin_unlock_irqrestore(&tc->lock, flags);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001800
1801 wake_worker(pool);
1802}
1803
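/*
 * Initialise the per-bio hook data that the endio path uses to find the
 * thin device and any deferred-set entries associated with this bio.
 */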
Mikulas Patocka59c3d2c2012-12-21 20:23:40 +00001804static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
Joe Thornbereb2aa482012-03-28 18:41:28 +01001805{
Mikulas Patocka59c3d2c2012-12-21 20:23:40 +00001806 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
Joe Thornbereb2aa482012-03-28 18:41:28 +01001807
1808 h->tc = tc;
1809 h->shared_read_entry = NULL;
Joe Thornbere8088072012-12-21 20:23:31 +00001810 h->all_io_entry = NULL;
Joe Thornbereb2aa482012-03-28 18:41:28 +01001811 h->overwrite_mapping = NULL;
Joe Thornbereb2aa482012-03-28 18:41:28 +01001812}
1813
Joe Thornber991d9fa2011-10-31 20:21:18 +00001814/*
1815 * Non-blocking function called from the thin target's map function.
1816 */
Mikulas Patocka7de3ee52012-12-21 20:23:41 +00001817static int thin_bio_map(struct dm_target *ti, struct bio *bio)
Joe Thornber991d9fa2011-10-31 20:21:18 +00001818{
1819 int r;
1820 struct thin_c *tc = ti->private;
1821 dm_block_t block = get_bio_block(tc, bio);
1822 struct dm_thin_device *td = tc->td;
1823 struct dm_thin_lookup_result result;
Joe Thornber025b9682013-03-01 22:45:50 +00001824 struct dm_bio_prison_cell cell1, cell2;
1825 struct dm_bio_prison_cell *cell_result;
Joe Thornbere8088072012-12-21 20:23:31 +00001826 struct dm_cell_key key;
Joe Thornber991d9fa2011-10-31 20:21:18 +00001827
Mikulas Patocka59c3d2c2012-12-21 20:23:40 +00001828 thin_hook_bio(tc, bio);
Joe Thornbere49e5822012-07-27 15:08:16 +01001829
Joe Thornber738211f2014-03-03 15:52:28 +00001830 if (tc->requeue_mode) {
1831 bio_endio(bio, DM_ENDIO_REQUEUE);
1832 return DM_MAPIO_SUBMITTED;
1833 }
1834
Joe Thornbere49e5822012-07-27 15:08:16 +01001835 if (get_pool_mode(tc->pool) == PM_FAIL) {
1836 bio_io_error(bio);
1837 return DM_MAPIO_SUBMITTED;
1838 }
1839
Joe Thornber104655f2012-03-28 18:41:28 +01001840 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
Joe Thornber991d9fa2011-10-31 20:21:18 +00001841 thin_defer_bio(tc, bio);
1842 return DM_MAPIO_SUBMITTED;
1843 }
1844
1845 r = dm_thin_find_block(td, block, 0, &result);
1846
1847 /*
1848 * Note that we defer readahead too.
1849 */
1850 switch (r) {
1851 case 0:
1852 if (unlikely(result.shared)) {
1853 /*
1854 * We have a race condition here between the
1855 * result.shared value returned by the lookup and
1856 * snapshot creation, which may cause new
1857 * sharing.
1858 *
1859			 * To avoid this, always quiesce the origin before
1860 * taking the snap. You want to do this anyway to
1861 * ensure a consistent application view
1862 * (i.e. lockfs).
1863 *
1864 * More distant ancestors are irrelevant. The
1865 * shared flag will be set in their case.
1866 */
1867 thin_defer_bio(tc, bio);
Joe Thornbere8088072012-12-21 20:23:31 +00001868 return DM_MAPIO_SUBMITTED;
Joe Thornber991d9fa2011-10-31 20:21:18 +00001869 }
Joe Thornbere8088072012-12-21 20:23:31 +00001870
1871 build_virtual_key(tc->td, block, &key);
Joe Thornber025b9682013-03-01 22:45:50 +00001872 if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result))
Joe Thornbere8088072012-12-21 20:23:31 +00001873 return DM_MAPIO_SUBMITTED;
1874
1875 build_data_key(tc->td, result.block, &key);
Joe Thornber025b9682013-03-01 22:45:50 +00001876 if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2, &cell_result)) {
1877 cell_defer_no_holder_no_free(tc, &cell1);
Joe Thornbere8088072012-12-21 20:23:31 +00001878 return DM_MAPIO_SUBMITTED;
1879 }
1880
1881 inc_all_io_entry(tc->pool, bio);
Joe Thornber025b9682013-03-01 22:45:50 +00001882 cell_defer_no_holder_no_free(tc, &cell2);
1883 cell_defer_no_holder_no_free(tc, &cell1);
Joe Thornbere8088072012-12-21 20:23:31 +00001884
1885 remap(tc, bio, result.block);
1886 return DM_MAPIO_REMAPPED;
Joe Thornber991d9fa2011-10-31 20:21:18 +00001887
1888 case -ENODATA:
Joe Thornbere49e5822012-07-27 15:08:16 +01001889 if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
1890 /*
1891 * This block isn't provisioned, and we have no way
Mike Snitzer8c0f0e82013-12-05 15:47:24 -05001892			 * of provisioning it.
Joe Thornbere49e5822012-07-27 15:08:16 +01001893 */
Mike Snitzer8c0f0e82013-12-05 15:47:24 -05001894 handle_unserviceable_bio(tc->pool, bio);
Joe Thornber2aab3852012-12-21 20:23:33 +00001895 return DM_MAPIO_SUBMITTED;
Joe Thornbere49e5822012-07-27 15:08:16 +01001896 }
1897 /* fall through */
1898
1899 case -EWOULDBLOCK:
Joe Thornber991d9fa2011-10-31 20:21:18 +00001900 /*
1901 * In future, the failed dm_thin_find_block above could
1902 * provide the hint to load the metadata into cache.
1903 */
Joe Thornber991d9fa2011-10-31 20:21:18 +00001904 thin_defer_bio(tc, bio);
Joe Thornber2aab3852012-12-21 20:23:33 +00001905 return DM_MAPIO_SUBMITTED;
Joe Thornbere49e5822012-07-27 15:08:16 +01001906
1907 default:
1908 /*
1909 * Must always call bio_io_error on failure.
1910 * dm_thin_find_block can fail with -EINVAL if the
1911 * pool is switched to fail-io mode.
1912 */
1913 bio_io_error(bio);
Joe Thornber2aab3852012-12-21 20:23:33 +00001914 return DM_MAPIO_SUBMITTED;
Joe Thornber991d9fa2011-10-31 20:21:18 +00001915 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00001916}
1917
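/*
 * Report the pool as congested while it is out of data space; otherwise
 * defer to the data device's backing_dev_info.
 */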
1918static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1919{
Joe Thornber991d9fa2011-10-31 20:21:18 +00001920 struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
Mike Snitzer760fe672014-03-20 08:36:47 -04001921 struct request_queue *q;
Joe Thornber991d9fa2011-10-31 20:21:18 +00001922
Mike Snitzer760fe672014-03-20 08:36:47 -04001923 if (get_pool_mode(pt->pool) == PM_OUT_OF_DATA_SPACE)
1924 return 1;
Joe Thornber991d9fa2011-10-31 20:21:18 +00001925
Mike Snitzer760fe672014-03-20 08:36:47 -04001926 q = bdev_get_queue(pt->data_dev->bdev);
1927 return bdi_congested(&q->backing_dev_info, bdi_bits);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001928}
1929
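/*
 * Move every active thin device's retry_on_resume_list back onto its
 * deferred list so those bios are reissued once the pool is resumed.
 */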
Mike Snitzerc140e1c2014-03-20 21:17:14 -04001930static void requeue_bios(struct pool *pool)
Joe Thornber991d9fa2011-10-31 20:21:18 +00001931{
Mike Snitzerc140e1c2014-03-20 21:17:14 -04001932 unsigned long flags;
1933 struct thin_c *tc;
1934
1935 rcu_read_lock();
1936 list_for_each_entry_rcu(tc, &pool->active_thins, list) {
1937 spin_lock_irqsave(&tc->lock, flags);
1938 bio_list_merge(&tc->deferred_bio_list, &tc->retry_on_resume_list);
1939 bio_list_init(&tc->retry_on_resume_list);
1940 spin_unlock_irqrestore(&tc->lock, flags);
1941 }
1942 rcu_read_unlock();
Joe Thornber991d9fa2011-10-31 20:21:18 +00001943}
1944
1945/*----------------------------------------------------------------
1946 * Binding of control targets to a pool object
1947 *--------------------------------------------------------------*/
Mike Snitzer9bc142d2012-09-26 23:45:46 +01001948static bool data_dev_supports_discard(struct pool_c *pt)
1949{
1950 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1951
1952 return q && blk_queue_discard(q);
1953}
1954
Joe Thornber58051b92013-03-20 17:21:25 +00001955static bool is_factor(sector_t block_size, uint32_t n)
1956{
1957 return !sector_div(block_size, n);
1958}
1959
Mike Snitzer9bc142d2012-09-26 23:45:46 +01001960/*
1961 * If discard_passdown was enabled verify that the data device
Mike Snitzer0424caa2012-09-26 23:45:47 +01001962 * supports discards. Disable discard_passdown if not.
Mike Snitzer9bc142d2012-09-26 23:45:46 +01001963 */
Mike Snitzer0424caa2012-09-26 23:45:47 +01001964static void disable_passdown_if_not_supported(struct pool_c *pt)
Mike Snitzer9bc142d2012-09-26 23:45:46 +01001965{
Mike Snitzer0424caa2012-09-26 23:45:47 +01001966 struct pool *pool = pt->pool;
1967 struct block_device *data_bdev = pt->data_dev->bdev;
1968 struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
1969 sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
1970 const char *reason = NULL;
Mike Snitzer9bc142d2012-09-26 23:45:46 +01001971 char buf[BDEVNAME_SIZE];
1972
Mike Snitzer0424caa2012-09-26 23:45:47 +01001973 if (!pt->adjusted_pf.discard_passdown)
Mike Snitzer9bc142d2012-09-26 23:45:46 +01001974 return;
1975
Mike Snitzer0424caa2012-09-26 23:45:47 +01001976 if (!data_dev_supports_discard(pt))
1977 reason = "discard unsupported";
Mike Snitzer9bc142d2012-09-26 23:45:46 +01001978
Mike Snitzer0424caa2012-09-26 23:45:47 +01001979 else if (data_limits->max_discard_sectors < pool->sectors_per_block)
1980 reason = "max discard sectors smaller than a block";
1981
1982 else if (data_limits->discard_granularity > block_size)
1983 reason = "discard granularity larger than a block";
1984
Joe Thornber58051b92013-03-20 17:21:25 +00001985 else if (!is_factor(block_size, data_limits->discard_granularity))
Mike Snitzer0424caa2012-09-26 23:45:47 +01001986 reason = "discard granularity not a factor of block size";
1987
1988 if (reason) {
1989 DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
1990 pt->adjusted_pf.discard_passdown = false;
1991 }
Mike Snitzer9bc142d2012-09-26 23:45:46 +01001992}
1993
Joe Thornber991d9fa2011-10-31 20:21:18 +00001994static int bind_control_target(struct pool *pool, struct dm_target *ti)
1995{
1996 struct pool_c *pt = ti->private;
1997
Joe Thornbere49e5822012-07-27 15:08:16 +01001998 /*
Joe Thornber9b7aaa62013-12-04 16:58:19 -05001999 * We want to make sure that a pool in PM_FAIL mode is never upgraded.
Joe Thornbere49e5822012-07-27 15:08:16 +01002000 */
Mike Snitzer07f2b6e2014-02-14 11:58:41 -05002001 enum pool_mode old_mode = get_pool_mode(pool);
Mike Snitzer0424caa2012-09-26 23:45:47 +01002002 enum pool_mode new_mode = pt->adjusted_pf.mode;
Joe Thornbere49e5822012-07-27 15:08:16 +01002003
Joe Thornber9b7aaa62013-12-04 16:58:19 -05002004 /*
Mike Snitzer8b64e882013-12-20 14:27:28 -05002005 * Don't change the pool's mode until set_pool_mode() below.
2006 * Otherwise the pool's process_* function pointers may
2007 * not match the desired pool mode.
2008 */
2009 pt->adjusted_pf.mode = old_mode;
2010
2011 pool->ti = ti;
2012 pool->pf = pt->adjusted_pf;
2013 pool->low_water_blocks = pt->low_water_blocks;
2014
Mike Snitzer9bc142d2012-09-26 23:45:46 +01002015 set_pool_mode(pool, new_mode);
Mike Snitzerf4026932012-05-19 01:01:01 +01002016
Joe Thornber991d9fa2011-10-31 20:21:18 +00002017 return 0;
2018}
2019
2020static void unbind_control_target(struct pool *pool, struct dm_target *ti)
2021{
2022 if (pool->ti == ti)
2023 pool->ti = NULL;
2024}
2025
2026/*----------------------------------------------------------------
2027 * Pool creation
2028 *--------------------------------------------------------------*/
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002029/* Initialize pool features. */
2030static void pool_features_init(struct pool_features *pf)
2031{
Joe Thornbere49e5822012-07-27 15:08:16 +01002032 pf->mode = PM_WRITE;
Mike Snitzer9bc142d2012-09-26 23:45:46 +01002033 pf->zero_new_blocks = true;
2034 pf->discard_enabled = true;
2035 pf->discard_passdown = true;
Mike Snitzer787a996c2013-12-06 16:21:43 -05002036 pf->error_if_no_space = false;
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002037}
2038
Joe Thornber991d9fa2011-10-31 20:21:18 +00002039static void __pool_destroy(struct pool *pool)
2040{
2041 __pool_table_remove(pool);
2042
2043 if (dm_pool_metadata_close(pool->pmd) < 0)
2044 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
2045
Mike Snitzer44feb382012-10-12 21:02:10 +01002046 dm_bio_prison_destroy(pool->prison);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002047 dm_kcopyd_client_destroy(pool->copier);
2048
2049 if (pool->wq)
2050 destroy_workqueue(pool->wq);
2051
2052 if (pool->next_mapping)
2053 mempool_free(pool->next_mapping, pool->mapping_pool);
2054 mempool_destroy(pool->mapping_pool);
Mike Snitzer44feb382012-10-12 21:02:10 +01002055 dm_deferred_set_destroy(pool->shared_read_ds);
2056 dm_deferred_set_destroy(pool->all_io_ds);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002057 kfree(pool);
2058}
2059
Mike Snitzera24c2562012-06-03 00:30:00 +01002060static struct kmem_cache *_new_mapping_cache;
Mike Snitzera24c2562012-06-03 00:30:00 +01002061
Joe Thornber991d9fa2011-10-31 20:21:18 +00002062static struct pool *pool_create(struct mapped_device *pool_md,
2063 struct block_device *metadata_dev,
Joe Thornbere49e5822012-07-27 15:08:16 +01002064 unsigned long block_size,
2065 int read_only, char **error)
Joe Thornber991d9fa2011-10-31 20:21:18 +00002066{
2067 int r;
2068 void *err_p;
2069 struct pool *pool;
2070 struct dm_pool_metadata *pmd;
Joe Thornbere49e5822012-07-27 15:08:16 +01002071 bool format_device = read_only ? false : true;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002072
Joe Thornbere49e5822012-07-27 15:08:16 +01002073 pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002074 if (IS_ERR(pmd)) {
2075 *error = "Error creating metadata object";
2076 return (struct pool *)pmd;
2077 }
2078
2079 pool = kmalloc(sizeof(*pool), GFP_KERNEL);
2080 if (!pool) {
2081 *error = "Error allocating memory for pool";
2082 err_p = ERR_PTR(-ENOMEM);
2083 goto bad_pool;
2084 }
2085
2086 pool->pmd = pmd;
2087 pool->sectors_per_block = block_size;
Mikulas Patockaf9a8e0c2012-07-27 15:08:03 +01002088 if (block_size & (block_size - 1))
2089 pool->sectors_per_block_shift = -1;
2090 else
2091 pool->sectors_per_block_shift = __ffs(block_size);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002092 pool->low_water_blocks = 0;
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002093 pool_features_init(&pool->pf);
Mike Snitzer44feb382012-10-12 21:02:10 +01002094 pool->prison = dm_bio_prison_create(PRISON_CELLS);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002095 if (!pool->prison) {
2096 *error = "Error creating pool's bio prison";
2097 err_p = ERR_PTR(-ENOMEM);
2098 goto bad_prison;
2099 }
2100
Mikulas Patockadf5d2e92013-03-01 22:45:49 +00002101 pool->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002102 if (IS_ERR(pool->copier)) {
2103 r = PTR_ERR(pool->copier);
2104 *error = "Error creating pool's kcopyd client";
2105 err_p = ERR_PTR(r);
2106 goto bad_kcopyd_client;
2107 }
2108
2109 /*
2110	 * Create a single-threaded workqueue that will service all devices
2111 * that use this metadata.
2112 */
2113 pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
2114 if (!pool->wq) {
2115 *error = "Error creating pool's workqueue";
2116 err_p = ERR_PTR(-ENOMEM);
2117 goto bad_wq;
2118 }
2119
2120 INIT_WORK(&pool->worker, do_worker);
Joe Thornber905e51b2012-03-28 18:41:27 +01002121 INIT_DELAYED_WORK(&pool->waker, do_waker);
Joe Thornber85ad6432014-05-09 15:59:38 +01002122 INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002123 spin_lock_init(&pool->lock);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002124 bio_list_init(&pool->deferred_flush_bios);
2125 INIT_LIST_HEAD(&pool->prepared_mappings);
Joe Thornber104655f2012-03-28 18:41:28 +01002126 INIT_LIST_HEAD(&pool->prepared_discards);
Mike Snitzerc140e1c2014-03-20 21:17:14 -04002127 INIT_LIST_HEAD(&pool->active_thins);
Joe Thornber88a66212013-12-04 20:16:12 -05002128 pool->low_water_triggered = false;
Mike Snitzer44feb382012-10-12 21:02:10 +01002129
2130 pool->shared_read_ds = dm_deferred_set_create();
2131 if (!pool->shared_read_ds) {
2132 *error = "Error creating pool's shared read deferred set";
2133 err_p = ERR_PTR(-ENOMEM);
2134 goto bad_shared_read_ds;
2135 }
2136
2137 pool->all_io_ds = dm_deferred_set_create();
2138 if (!pool->all_io_ds) {
2139 *error = "Error creating pool's all io deferred set";
2140 err_p = ERR_PTR(-ENOMEM);
2141 goto bad_all_io_ds;
2142 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00002143
2144 pool->next_mapping = NULL;
Mike Snitzera24c2562012-06-03 00:30:00 +01002145 pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE,
2146 _new_mapping_cache);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002147 if (!pool->mapping_pool) {
2148 *error = "Error creating pool's mapping mempool";
2149 err_p = ERR_PTR(-ENOMEM);
2150 goto bad_mapping_pool;
2151 }
2152
Joe Thornber991d9fa2011-10-31 20:21:18 +00002153 pool->ref_count = 1;
Joe Thornber905e51b2012-03-28 18:41:27 +01002154 pool->last_commit_jiffies = jiffies;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002155 pool->pool_md = pool_md;
2156 pool->md_dev = metadata_dev;
2157 __pool_table_insert(pool);
2158
2159 return pool;
2160
Joe Thornber991d9fa2011-10-31 20:21:18 +00002161bad_mapping_pool:
Mike Snitzer44feb382012-10-12 21:02:10 +01002162 dm_deferred_set_destroy(pool->all_io_ds);
2163bad_all_io_ds:
2164 dm_deferred_set_destroy(pool->shared_read_ds);
2165bad_shared_read_ds:
Joe Thornber991d9fa2011-10-31 20:21:18 +00002166 destroy_workqueue(pool->wq);
2167bad_wq:
2168 dm_kcopyd_client_destroy(pool->copier);
2169bad_kcopyd_client:
Mike Snitzer44feb382012-10-12 21:02:10 +01002170 dm_bio_prison_destroy(pool->prison);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002171bad_prison:
2172 kfree(pool);
2173bad_pool:
2174 if (dm_pool_metadata_close(pmd))
2175 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
2176
2177 return err_p;
2178}
2179
2180static void __pool_inc(struct pool *pool)
2181{
2182 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
2183 pool->ref_count++;
2184}
2185
2186static void __pool_dec(struct pool *pool)
2187{
2188 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
2189 BUG_ON(!pool->ref_count);
2190 if (!--pool->ref_count)
2191 __pool_destroy(pool);
2192}
2193
2194static struct pool *__pool_find(struct mapped_device *pool_md,
2195 struct block_device *metadata_dev,
Joe Thornbere49e5822012-07-27 15:08:16 +01002196 unsigned long block_size, int read_only,
2197 char **error, int *created)
Joe Thornber991d9fa2011-10-31 20:21:18 +00002198{
2199 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
2200
2201 if (pool) {
Mike Snitzerf09996c2012-07-27 15:07:59 +01002202 if (pool->pool_md != pool_md) {
2203 *error = "metadata device already in use by a pool";
Joe Thornber991d9fa2011-10-31 20:21:18 +00002204 return ERR_PTR(-EBUSY);
Mike Snitzerf09996c2012-07-27 15:07:59 +01002205 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00002206 __pool_inc(pool);
2207
2208 } else {
2209 pool = __pool_table_lookup(pool_md);
2210 if (pool) {
Mike Snitzerf09996c2012-07-27 15:07:59 +01002211 if (pool->md_dev != metadata_dev) {
2212 *error = "different pool cannot replace a pool";
Joe Thornber991d9fa2011-10-31 20:21:18 +00002213 return ERR_PTR(-EINVAL);
Mike Snitzerf09996c2012-07-27 15:07:59 +01002214 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00002215 __pool_inc(pool);
2216
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002217 } else {
Joe Thornbere49e5822012-07-27 15:08:16 +01002218 pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002219 *created = 1;
2220 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00002221 }
2222
2223 return pool;
2224}
2225
2226/*----------------------------------------------------------------
2227 * Pool target methods
2228 *--------------------------------------------------------------*/
2229static void pool_dtr(struct dm_target *ti)
2230{
2231 struct pool_c *pt = ti->private;
2232
2233 mutex_lock(&dm_thin_pool_table.mutex);
2234
2235 unbind_control_target(pt->pool, ti);
2236 __pool_dec(pt->pool);
2237 dm_put_device(ti, pt->metadata_dev);
2238 dm_put_device(ti, pt->data_dev);
2239 kfree(pt);
2240
2241 mutex_unlock(&dm_thin_pool_table.mutex);
2242}
2243
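/*
 * Parse the optional pool feature arguments (see the constructor
 * documentation further down), updating *pf accordingly.
 */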
Joe Thornber991d9fa2011-10-31 20:21:18 +00002244static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
2245 struct dm_target *ti)
2246{
2247 int r;
2248 unsigned argc;
2249 const char *arg_name;
2250
2251 static struct dm_arg _args[] = {
Mike Snitzer74aa45c2014-01-15 19:07:58 -05002252 {0, 4, "Invalid number of pool feature arguments"},
Joe Thornber991d9fa2011-10-31 20:21:18 +00002253 };
2254
2255 /*
2256 * No feature arguments supplied.
2257 */
2258 if (!as->argc)
2259 return 0;
2260
2261 r = dm_read_arg_group(_args, as, &argc, &ti->error);
2262 if (r)
2263 return -EINVAL;
2264
2265 while (argc && !r) {
2266 arg_name = dm_shift_arg(as);
2267 argc--;
2268
Joe Thornbere49e5822012-07-27 15:08:16 +01002269 if (!strcasecmp(arg_name, "skip_block_zeroing"))
Mike Snitzer9bc142d2012-09-26 23:45:46 +01002270 pf->zero_new_blocks = false;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002271
Joe Thornbere49e5822012-07-27 15:08:16 +01002272 else if (!strcasecmp(arg_name, "ignore_discard"))
Mike Snitzer9bc142d2012-09-26 23:45:46 +01002273 pf->discard_enabled = false;
Joe Thornbere49e5822012-07-27 15:08:16 +01002274
2275 else if (!strcasecmp(arg_name, "no_discard_passdown"))
Mike Snitzer9bc142d2012-09-26 23:45:46 +01002276 pf->discard_passdown = false;
Joe Thornbere49e5822012-07-27 15:08:16 +01002277
2278 else if (!strcasecmp(arg_name, "read_only"))
2279 pf->mode = PM_READ_ONLY;
2280
Mike Snitzer787a996c2013-12-06 16:21:43 -05002281 else if (!strcasecmp(arg_name, "error_if_no_space"))
2282 pf->error_if_no_space = true;
2283
Joe Thornbere49e5822012-07-27 15:08:16 +01002284 else {
2285 ti->error = "Unrecognised pool feature requested";
2286 r = -EINVAL;
2287 break;
2288 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00002289 }
2290
2291 return r;
2292}
2293
Joe Thornberac8c3f32013-05-10 14:37:21 +01002294static void metadata_low_callback(void *context)
2295{
2296 struct pool *pool = context;
2297
2298 DMWARN("%s: reached low water mark for metadata device: sending event.",
2299 dm_device_name(pool->pool_md));
2300
2301 dm_table_event(pool->ti->table);
2302}
2303
Mike Snitzer7d489352014-02-12 23:58:15 -05002304static sector_t get_dev_size(struct block_device *bdev)
Joe Thornberb17446d2013-05-10 14:37:18 +01002305{
Mike Snitzer7d489352014-02-12 23:58:15 -05002306 return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
2307}
2308
2309static void warn_if_metadata_device_too_big(struct block_device *bdev)
2310{
2311 sector_t metadata_dev_size = get_dev_size(bdev);
Joe Thornberb17446d2013-05-10 14:37:18 +01002312 char buffer[BDEVNAME_SIZE];
2313
Mike Snitzer7d489352014-02-12 23:58:15 -05002314 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
Joe Thornberb17446d2013-05-10 14:37:18 +01002315 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
2316 bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS);
Mike Snitzer7d489352014-02-12 23:58:15 -05002317}
2318
2319static sector_t get_metadata_dev_size(struct block_device *bdev)
2320{
2321 sector_t metadata_dev_size = get_dev_size(bdev);
2322
2323 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS)
2324 metadata_dev_size = THIN_METADATA_MAX_SECTORS;
Joe Thornberb17446d2013-05-10 14:37:18 +01002325
2326 return metadata_dev_size;
2327}
2328
Joe Thornber24347e92013-05-10 14:37:19 +01002329static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev)
2330{
2331 sector_t metadata_dev_size = get_metadata_dev_size(bdev);
2332
Mike Snitzer7d489352014-02-12 23:58:15 -05002333 sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE);
Joe Thornber24347e92013-05-10 14:37:19 +01002334
2335 return metadata_dev_size;
2336}
2337
Joe Thornber991d9fa2011-10-31 20:21:18 +00002338/*
Joe Thornberac8c3f32013-05-10 14:37:21 +01002339 * When a metadata threshold is crossed a dm event is triggered, and
2340 * userland should respond by growing the metadata device. We could let
2341 * userland set the threshold, like we do with the data threshold, but I'm
2342 * not sure they know enough to do this well.
2343 */
2344static dm_block_t calc_metadata_threshold(struct pool_c *pt)
2345{
2346 /*
2347 * 4M is ample for all ops with the possible exception of thin
2348	 * device deletion, which is harmless if it fails (just retry the
2349 * delete after you've grown the device).
2350 */
2351 dm_block_t quarter = get_metadata_dev_size_in_blocks(pt->metadata_dev->bdev) / 4;
2352 return min((dm_block_t)1024ULL /* 4M */, quarter);
2353}
2354
2355/*
Joe Thornber991d9fa2011-10-31 20:21:18 +00002356 * thin-pool <metadata dev> <data dev>
2357 * <data block size (sectors)>
2358 * <low water mark (blocks)>
2359 * [<#feature args> [<arg>]*]
2360 *
2361 * Optional feature arguments are:
2362 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002363 * ignore_discard: disable discard
2364 * no_discard_passdown: don't pass discards down to the data device
Mike Snitzer787a996c2013-12-06 16:21:43 -05002365 * read_only: Don't allow any changes to be made to the pool metadata.
2366 * error_if_no_space: error IOs, instead of queueing, if no space.
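 *
 * An example table line (device names and sizes are purely illustrative):
 *   0 4194304 thin-pool /dev/mapper/meta /dev/mapper/data 128 16384 1 skip_block_zeroing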
Joe Thornber991d9fa2011-10-31 20:21:18 +00002367 */
2368static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
2369{
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002370 int r, pool_created = 0;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002371 struct pool_c *pt;
2372 struct pool *pool;
2373 struct pool_features pf;
2374 struct dm_arg_set as;
2375 struct dm_dev *data_dev;
2376 unsigned long block_size;
2377 dm_block_t low_water_blocks;
2378 struct dm_dev *metadata_dev;
Joe Thornber5d0db962013-05-10 14:37:19 +01002379 fmode_t metadata_mode;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002380
2381 /*
2382 * FIXME Remove validation from scope of lock.
2383 */
2384 mutex_lock(&dm_thin_pool_table.mutex);
2385
2386 if (argc < 4) {
2387 ti->error = "Invalid argument count";
2388 r = -EINVAL;
2389 goto out_unlock;
2390 }
Joe Thornber5d0db962013-05-10 14:37:19 +01002391
Joe Thornber991d9fa2011-10-31 20:21:18 +00002392 as.argc = argc;
2393 as.argv = argv;
2394
Joe Thornber5d0db962013-05-10 14:37:19 +01002395 /*
2396 * Set default pool features.
2397 */
2398 pool_features_init(&pf);
2399
2400 dm_consume_args(&as, 4);
2401 r = parse_pool_features(&as, &pf, ti);
2402 if (r)
2403 goto out_unlock;
2404
2405 metadata_mode = FMODE_READ | ((pf.mode == PM_READ_ONLY) ? 0 : FMODE_WRITE);
2406 r = dm_get_device(ti, argv[0], metadata_mode, &metadata_dev);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002407 if (r) {
2408 ti->error = "Error opening metadata block device";
2409 goto out_unlock;
2410 }
Mike Snitzer7d489352014-02-12 23:58:15 -05002411 warn_if_metadata_device_too_big(metadata_dev->bdev);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002412
2413 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
2414 if (r) {
2415 ti->error = "Error getting data device";
2416 goto out_metadata;
2417 }
2418
2419 if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
2420 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
2421 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
Mike Snitzer55f2b8b2012-07-27 15:08:02 +01002422 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
Joe Thornber991d9fa2011-10-31 20:21:18 +00002423 ti->error = "Invalid block size";
2424 r = -EINVAL;
2425 goto out;
2426 }
2427
2428 if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
2429 ti->error = "Invalid low water mark";
2430 r = -EINVAL;
2431 goto out;
2432 }
2433
Joe Thornber991d9fa2011-10-31 20:21:18 +00002434 pt = kzalloc(sizeof(*pt), GFP_KERNEL);
2435 if (!pt) {
2436 r = -ENOMEM;
2437 goto out;
2438 }
2439
2440 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
Joe Thornbere49e5822012-07-27 15:08:16 +01002441 block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002442 if (IS_ERR(pool)) {
2443 r = PTR_ERR(pool);
2444 goto out_free_pt;
2445 }
2446
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002447 /*
2448 * 'pool_created' reflects whether this is the first table load.
2449 * Top level discard support is not allowed to be changed after
2450 * initial load. This would require a pool reload to trigger thin
2451 * device changes.
2452 */
2453 if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
2454 ti->error = "Discard support cannot be disabled once enabled";
2455 r = -EINVAL;
2456 goto out_flags_changed;
2457 }
2458
Joe Thornber991d9fa2011-10-31 20:21:18 +00002459 pt->pool = pool;
2460 pt->ti = ti;
2461 pt->metadata_dev = metadata_dev;
2462 pt->data_dev = data_dev;
2463 pt->low_water_blocks = low_water_blocks;
Mike Snitzer0424caa2012-09-26 23:45:47 +01002464 pt->adjusted_pf = pt->requested_pf = pf;
Alasdair G Kergon55a62ee2013-03-01 22:45:47 +00002465 ti->num_flush_bios = 1;
Mike Snitzer9bc142d2012-09-26 23:45:46 +01002466
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002467 /*
2468 * Only need to enable discards if the pool should pass
2469 * them down to the data device. The thin device's discard
2470 * processing will cause mappings to be removed from the btree.
2471 */
Mike Snitzerb60ab992013-09-19 18:49:11 -04002472 ti->discard_zeroes_data_unsupported = true;
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002473 if (pf.discard_enabled && pf.discard_passdown) {
Alasdair G Kergon55a62ee2013-03-01 22:45:47 +00002474 ti->num_discard_bios = 1;
Mike Snitzer9bc142d2012-09-26 23:45:46 +01002475
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002476 /*
2477 * Setting 'discards_supported' circumvents the normal
2478 * stacking of discard limits (this keeps the pool and
2479 * thin devices' discard limits consistent).
2480 */
Alasdair G Kergon0ac55482012-07-27 15:08:08 +01002481 ti->discards_supported = true;
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002482 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00002483 ti->private = pt;
2484
Joe Thornberac8c3f32013-05-10 14:37:21 +01002485 r = dm_pool_register_metadata_threshold(pt->pool->pmd,
2486 calc_metadata_threshold(pt),
2487 metadata_low_callback,
2488 pool);
2489 if (r)
2490 goto out_free_pt;
2491
Joe Thornber991d9fa2011-10-31 20:21:18 +00002492 pt->callbacks.congested_fn = pool_is_congested;
2493 dm_table_add_target_callbacks(ti->table, &pt->callbacks);
2494
2495 mutex_unlock(&dm_thin_pool_table.mutex);
2496
2497 return 0;
2498
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002499out_flags_changed:
2500 __pool_dec(pool);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002501out_free_pt:
2502 kfree(pt);
2503out:
2504 dm_put_device(ti, data_dev);
2505out_metadata:
2506 dm_put_device(ti, metadata_dev);
2507out_unlock:
2508 mutex_unlock(&dm_thin_pool_table.mutex);
2509
2510 return r;
2511}
2512
Mikulas Patocka7de3ee52012-12-21 20:23:41 +00002513static int pool_map(struct dm_target *ti, struct bio *bio)
Joe Thornber991d9fa2011-10-31 20:21:18 +00002514{
2515 int r;
2516 struct pool_c *pt = ti->private;
2517 struct pool *pool = pt->pool;
2518 unsigned long flags;
2519
2520 /*
2521 * As this is a singleton target, ti->begin is always zero.
2522 */
2523 spin_lock_irqsave(&pool->lock, flags);
2524 bio->bi_bdev = pt->data_dev->bdev;
2525 r = DM_MAPIO_REMAPPED;
2526 spin_unlock_irqrestore(&pool->lock, flags);
2527
2528 return r;
2529}
2530
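/*
 * Compare the data device size recorded in the superblock with ti->len,
 * growing the recorded size if the device has been extended.  Shrinking
 * is refused.
 */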
Joe Thornberb17446d2013-05-10 14:37:18 +01002531static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
2532{
2533 int r;
2534 struct pool_c *pt = ti->private;
2535 struct pool *pool = pt->pool;
2536 sector_t data_size = ti->len;
2537 dm_block_t sb_data_size;
2538
2539 *need_commit = false;
2540
2541 (void) sector_div(data_size, pool->sectors_per_block);
2542
2543 r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
2544 if (r) {
Mike Snitzer4fa59712013-08-21 17:30:40 -04002545 DMERR("%s: failed to retrieve data device size",
2546 dm_device_name(pool->pool_md));
Joe Thornberb17446d2013-05-10 14:37:18 +01002547 return r;
2548 }
2549
2550 if (data_size < sb_data_size) {
Mike Snitzer4fa59712013-08-21 17:30:40 -04002551 DMERR("%s: pool target (%llu blocks) too small: expected %llu",
2552 dm_device_name(pool->pool_md),
Joe Thornberb17446d2013-05-10 14:37:18 +01002553 (unsigned long long)data_size, sb_data_size);
2554 return -EINVAL;
2555
2556 } else if (data_size > sb_data_size) {
Mike Snitzer07f2b6e2014-02-14 11:58:41 -05002557 if (dm_pool_metadata_needs_check(pool->pmd)) {
2558 DMERR("%s: unable to grow the data device until repaired.",
2559 dm_device_name(pool->pool_md));
2560 return 0;
2561 }
2562
Mike Snitzer6f7f51d2013-12-04 10:25:53 -05002563 if (sb_data_size)
2564 DMINFO("%s: growing the data device from %llu to %llu blocks",
2565 dm_device_name(pool->pool_md),
2566 sb_data_size, (unsigned long long)data_size);
Joe Thornberb17446d2013-05-10 14:37:18 +01002567 r = dm_pool_resize_data_dev(pool->pmd, data_size);
2568 if (r) {
Joe Thornberb5330652013-12-04 19:51:33 -05002569 metadata_operation_failed(pool, "dm_pool_resize_data_dev", r);
Joe Thornberb17446d2013-05-10 14:37:18 +01002570 return r;
2571 }
2572
2573 *need_commit = true;
2574 }
2575
2576 return 0;
2577}
2578
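/*
 * Likewise for the metadata device: grow the recorded size if the device
 * itself has grown; refuse to shrink.
 */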
Joe Thornber24347e92013-05-10 14:37:19 +01002579static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
2580{
2581 int r;
2582 struct pool_c *pt = ti->private;
2583 struct pool *pool = pt->pool;
2584 dm_block_t metadata_dev_size, sb_metadata_dev_size;
2585
2586 *need_commit = false;
2587
Alasdair G Kergon610bba82013-05-19 18:57:50 +01002588 metadata_dev_size = get_metadata_dev_size_in_blocks(pool->md_dev);
Joe Thornber24347e92013-05-10 14:37:19 +01002589
2590 r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size);
2591 if (r) {
Mike Snitzer4fa59712013-08-21 17:30:40 -04002592 DMERR("%s: failed to retrieve metadata device size",
2593 dm_device_name(pool->pool_md));
Joe Thornber24347e92013-05-10 14:37:19 +01002594 return r;
2595 }
2596
2597 if (metadata_dev_size < sb_metadata_dev_size) {
Mike Snitzer4fa59712013-08-21 17:30:40 -04002598 DMERR("%s: metadata device (%llu blocks) too small: expected %llu",
2599 dm_device_name(pool->pool_md),
Joe Thornber24347e92013-05-10 14:37:19 +01002600 metadata_dev_size, sb_metadata_dev_size);
2601 return -EINVAL;
2602
2603 } else if (metadata_dev_size > sb_metadata_dev_size) {
Mike Snitzer07f2b6e2014-02-14 11:58:41 -05002604 if (dm_pool_metadata_needs_check(pool->pmd)) {
2605 DMERR("%s: unable to grow the metadata device until repaired.",
2606 dm_device_name(pool->pool_md));
2607 return 0;
2608 }
2609
Mike Snitzer7d489352014-02-12 23:58:15 -05002610 warn_if_metadata_device_too_big(pool->md_dev);
Mike Snitzer6f7f51d2013-12-04 10:25:53 -05002611 DMINFO("%s: growing the metadata device from %llu to %llu blocks",
2612 dm_device_name(pool->pool_md),
2613 sb_metadata_dev_size, metadata_dev_size);
Joe Thornber24347e92013-05-10 14:37:19 +01002614 r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
2615 if (r) {
Joe Thornberb5330652013-12-04 19:51:33 -05002616 metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
Joe Thornber24347e92013-05-10 14:37:19 +01002617 return r;
2618 }
2619
2620 *need_commit = true;
2621 }
2622
2623 return 0;
2624}
2625
Joe Thornber991d9fa2011-10-31 20:21:18 +00002626/*
2627 * Retrieves the number of blocks of the data device from
2628 * the superblock and compares it to the actual device size,
2629 * thus resizing the data device in case it has grown.
2630 *
2631 * This both copes with opening preallocated data devices in the ctr
2632 * being followed by a resume
2633 * -and-
2634 * calling the resume method individually after userspace has
2635 * grown the data device in reaction to a table event.
2636 */
2637static int pool_preresume(struct dm_target *ti)
2638{
2639 int r;
Joe Thornber24347e92013-05-10 14:37:19 +01002640 bool need_commit1, need_commit2;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002641 struct pool_c *pt = ti->private;
2642 struct pool *pool = pt->pool;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002643
2644 /*
2645 * Take control of the pool object.
2646 */
2647 r = bind_control_target(pool, ti);
2648 if (r)
2649 return r;
2650
Joe Thornberb17446d2013-05-10 14:37:18 +01002651 r = maybe_resize_data_dev(ti, &need_commit1);
2652 if (r)
Joe Thornber991d9fa2011-10-31 20:21:18 +00002653 return r;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002654
Joe Thornber24347e92013-05-10 14:37:19 +01002655 r = maybe_resize_metadata_dev(ti, &need_commit2);
2656 if (r)
2657 return r;
2658
2659 if (need_commit1 || need_commit2)
Joe Thornber020cc3b2013-12-04 15:05:36 -05002660 (void) commit(pool);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002661
2662 return 0;
2663}
2664
2665static void pool_resume(struct dm_target *ti)
2666{
2667 struct pool_c *pt = ti->private;
2668 struct pool *pool = pt->pool;
2669 unsigned long flags;
2670
2671 spin_lock_irqsave(&pool->lock, flags);
Joe Thornber88a66212013-12-04 20:16:12 -05002672 pool->low_water_triggered = false;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002673 spin_unlock_irqrestore(&pool->lock, flags);
Mike Snitzerc140e1c2014-03-20 21:17:14 -04002674 requeue_bios(pool);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002675
Joe Thornber905e51b2012-03-28 18:41:27 +01002676 do_waker(&pool->waker.work);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002677}
2678
2679static void pool_postsuspend(struct dm_target *ti)
2680{
Joe Thornber991d9fa2011-10-31 20:21:18 +00002681 struct pool_c *pt = ti->private;
2682 struct pool *pool = pt->pool;
2683
Joe Thornber905e51b2012-03-28 18:41:27 +01002684 cancel_delayed_work(&pool->waker);
Joe Thornber85ad6432014-05-09 15:59:38 +01002685 cancel_delayed_work(&pool->no_space_timeout);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002686 flush_workqueue(pool->wq);
Joe Thornber020cc3b2013-12-04 15:05:36 -05002687 (void) commit(pool);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002688}
2689
2690static int check_arg_count(unsigned argc, unsigned args_required)
2691{
2692 if (argc != args_required) {
2693 DMWARN("Message received with %u arguments instead of %u.",
2694 argc, args_required);
2695 return -EINVAL;
2696 }
2697
2698 return 0;
2699}
2700
2701static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
2702{
2703 if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
2704 *dev_id <= MAX_DEV_ID)
2705 return 0;
2706
2707 if (warning)
2708 DMWARN("Message received with invalid device id: %s", arg);
2709
2710 return -EINVAL;
2711}
2712
2713static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
2714{
2715 dm_thin_id dev_id;
2716 int r;
2717
2718 r = check_arg_count(argc, 2);
2719 if (r)
2720 return r;
2721
2722 r = read_dev_id(argv[1], &dev_id, 1);
2723 if (r)
2724 return r;
2725
2726 r = dm_pool_create_thin(pool->pmd, dev_id);
2727 if (r) {
2728 DMWARN("Creation of new thinly-provisioned device with id %s failed.",
2729 argv[1]);
2730 return r;
2731 }
2732
2733 return 0;
2734}
2735
2736static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2737{
2738 dm_thin_id dev_id;
2739 dm_thin_id origin_dev_id;
2740 int r;
2741
2742 r = check_arg_count(argc, 3);
2743 if (r)
2744 return r;
2745
2746 r = read_dev_id(argv[1], &dev_id, 1);
2747 if (r)
2748 return r;
2749
2750 r = read_dev_id(argv[2], &origin_dev_id, 1);
2751 if (r)
2752 return r;
2753
2754 r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
2755 if (r) {
2756 DMWARN("Creation of new snapshot %s of device %s failed.",
2757 argv[1], argv[2]);
2758 return r;
2759 }
2760
2761 return 0;
2762}
2763
2764static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
2765{
2766 dm_thin_id dev_id;
2767 int r;
2768
2769 r = check_arg_count(argc, 2);
2770 if (r)
2771 return r;
2772
2773 r = read_dev_id(argv[1], &dev_id, 1);
2774 if (r)
2775 return r;
2776
2777 r = dm_pool_delete_thin_device(pool->pmd, dev_id);
2778 if (r)
2779 DMWARN("Deletion of thin device %s failed.", argv[1]);
2780
2781 return r;
2782}
2783
2784static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
2785{
2786 dm_thin_id old_id, new_id;
2787 int r;
2788
2789 r = check_arg_count(argc, 3);
2790 if (r)
2791 return r;
2792
2793 if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
2794 DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
2795 return -EINVAL;
2796 }
2797
2798 if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
2799 DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
2800 return -EINVAL;
2801 }
2802
2803 r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
2804 if (r) {
2805 DMWARN("Failed to change transaction id from %s to %s.",
2806 argv[1], argv[2]);
2807 return r;
2808 }
2809
2810 return 0;
2811}
2812
Joe Thornbercc8394d2012-06-03 00:30:01 +01002813static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2814{
2815 int r;
2816
2817 r = check_arg_count(argc, 1);
2818 if (r)
2819 return r;
2820
Joe Thornber020cc3b2013-12-04 15:05:36 -05002821 (void) commit(pool);
Joe Thornber0d200ae2012-07-03 12:55:31 +01002822
Joe Thornbercc8394d2012-06-03 00:30:01 +01002823 r = dm_pool_reserve_metadata_snap(pool->pmd);
2824 if (r)
2825 DMWARN("reserve_metadata_snap message failed.");
2826
2827 return r;
2828}
2829
2830static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2831{
2832 int r;
2833
2834 r = check_arg_count(argc, 1);
2835 if (r)
2836 return r;
2837
2838 r = dm_pool_release_metadata_snap(pool->pmd);
2839 if (r)
2840 DMWARN("release_metadata_snap message failed.");
2841
2842 return r;
2843}
2844
Joe Thornber991d9fa2011-10-31 20:21:18 +00002845/*
2846 * Messages supported:
2847 * create_thin <dev_id>
2848 * create_snap <dev_id> <origin_id>
2849 * delete <dev_id>
2851 * set_transaction_id <current_trans_id> <new_trans_id>
Joe Thornbercc8394d2012-06-03 00:30:01 +01002852 * reserve_metadata_snap
2853 * release_metadata_snap
Joe Thornber991d9fa2011-10-31 20:21:18 +00002854 */
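/*
 * Illustrative usage from userspace (a sketch; the pool device name and
 * the device ids are examples only):
 *
 *   dmsetup message /dev/mapper/pool 0 "create_thin 0"
 *   dmsetup message /dev/mapper/pool 0 "create_snap 1 0"
 *   dmsetup message /dev/mapper/pool 0 "delete 1"
 *   dmsetup message /dev/mapper/pool 0 "set_transaction_id 0 1"
 *   dmsetup message /dev/mapper/pool 0 "reserve_metadata_snap"
 *   dmsetup message /dev/mapper/pool 0 "release_metadata_snap"
 *
 * On success the pool metadata is committed (see the end of
 * pool_message() below).
 */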
2855static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
2856{
2857 int r = -EINVAL;
2858 struct pool_c *pt = ti->private;
2859 struct pool *pool = pt->pool;
2860
2861 if (!strcasecmp(argv[0], "create_thin"))
2862 r = process_create_thin_mesg(argc, argv, pool);
2863
2864 else if (!strcasecmp(argv[0], "create_snap"))
2865 r = process_create_snap_mesg(argc, argv, pool);
2866
2867 else if (!strcasecmp(argv[0], "delete"))
2868 r = process_delete_mesg(argc, argv, pool);
2869
2870 else if (!strcasecmp(argv[0], "set_transaction_id"))
2871 r = process_set_transaction_id_mesg(argc, argv, pool);
2872
Joe Thornbercc8394d2012-06-03 00:30:01 +01002873 else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
2874 r = process_reserve_metadata_snap_mesg(argc, argv, pool);
2875
2876 else if (!strcasecmp(argv[0], "release_metadata_snap"))
2877 r = process_release_metadata_snap_mesg(argc, argv, pool);
2878
Joe Thornber991d9fa2011-10-31 20:21:18 +00002879 else
2880 DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
2881
Joe Thornbere49e5822012-07-27 15:08:16 +01002882 if (!r)
Joe Thornber020cc3b2013-12-04 15:05:36 -05002883 (void) commit(pool);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002884
2885 return r;
2886}
2887
Joe Thornbere49e5822012-07-27 15:08:16 +01002888static void emit_flags(struct pool_features *pf, char *result,
2889 unsigned sz, unsigned maxlen)
2890{
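	/*
	 * Each feature that differs from the default emits one word below;
	 * the logical negations and the equality test evaluate to 0 or 1,
	 * so their sum is the number of feature arguments about to be
	 * emitted.
	 */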
2891 unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
Mike Snitzer787a996c2013-12-06 16:21:43 -05002892 !pf->discard_passdown + (pf->mode == PM_READ_ONLY) +
2893 pf->error_if_no_space;
Joe Thornbere49e5822012-07-27 15:08:16 +01002894 DMEMIT("%u ", count);
2895
2896 if (!pf->zero_new_blocks)
2897 DMEMIT("skip_block_zeroing ");
2898
2899 if (!pf->discard_enabled)
2900 DMEMIT("ignore_discard ");
2901
2902 if (!pf->discard_passdown)
2903 DMEMIT("no_discard_passdown ");
2904
2905 if (pf->mode == PM_READ_ONLY)
2906 DMEMIT("read_only ");
Mike Snitzer787a996c2013-12-06 16:21:43 -05002907
2908 if (pf->error_if_no_space)
2909 DMEMIT("error_if_no_space ");
Joe Thornbere49e5822012-07-27 15:08:16 +01002910}
2911
Joe Thornber991d9fa2011-10-31 20:21:18 +00002912/*
2913 * Status line is:
2914 * <transaction id> <used metadata blocks>/<total metadata blocks>
2915 * <used data blocks>/<total data blocks> <held metadata root>
 * <pool mode> <discard config> <no space config>
2916 */
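/*
 * Example INFO output (values are illustrative only):
 *
 *   0 131/4161600 5120/204800 - rw discard_passdown queue_if_no_space
 *
 * i.e. transaction id 0, 131 of 4161600 metadata blocks used, 5120 of
 * 204800 data blocks used, no held metadata root, pool read-write,
 * discards passed down, and writes queued if the pool runs out of space.
 */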
Mikulas Patockafd7c0922013-03-01 22:45:44 +00002917static void pool_status(struct dm_target *ti, status_type_t type,
2918 unsigned status_flags, char *result, unsigned maxlen)
Joe Thornber991d9fa2011-10-31 20:21:18 +00002919{
Joe Thornbere49e5822012-07-27 15:08:16 +01002920 int r;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002921 unsigned sz = 0;
2922 uint64_t transaction_id;
2923 dm_block_t nr_free_blocks_data;
2924 dm_block_t nr_free_blocks_metadata;
2925 dm_block_t nr_blocks_data;
2926 dm_block_t nr_blocks_metadata;
2927 dm_block_t held_root;
2928 char buf[BDEVNAME_SIZE];
2929 char buf2[BDEVNAME_SIZE];
2930 struct pool_c *pt = ti->private;
2931 struct pool *pool = pt->pool;
2932
2933 switch (type) {
2934 case STATUSTYPE_INFO:
Joe Thornbere49e5822012-07-27 15:08:16 +01002935 if (get_pool_mode(pool) == PM_FAIL) {
2936 DMEMIT("Fail");
2937 break;
2938 }
2939
Alasdair G Kergon1f4e0ff2012-07-27 15:08:16 +01002940 /* Commit to ensure statistics aren't out-of-date */
2941 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
Joe Thornber020cc3b2013-12-04 15:05:36 -05002942 (void) commit(pool);
Alasdair G Kergon1f4e0ff2012-07-27 15:08:16 +01002943
Mikulas Patockafd7c0922013-03-01 22:45:44 +00002944 r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
2945 if (r) {
Mike Snitzer4fa59712013-08-21 17:30:40 -04002946 DMERR("%s: dm_pool_get_metadata_transaction_id returned %d",
2947 dm_device_name(pool->pool_md), r);
Mikulas Patockafd7c0922013-03-01 22:45:44 +00002948 goto err;
2949 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00002950
Mikulas Patockafd7c0922013-03-01 22:45:44 +00002951 r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata);
2952 if (r) {
Mike Snitzer4fa59712013-08-21 17:30:40 -04002953 DMERR("%s: dm_pool_get_free_metadata_block_count returned %d",
2954 dm_device_name(pool->pool_md), r);
Mikulas Patockafd7c0922013-03-01 22:45:44 +00002955 goto err;
2956 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00002957
2958 r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
Mikulas Patockafd7c0922013-03-01 22:45:44 +00002959 if (r) {
Mike Snitzer4fa59712013-08-21 17:30:40 -04002960 DMERR("%s: dm_pool_get_metadata_dev_size returned %d",
2961 dm_device_name(pool->pool_md), r);
Mikulas Patockafd7c0922013-03-01 22:45:44 +00002962 goto err;
2963 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00002964
Mikulas Patockafd7c0922013-03-01 22:45:44 +00002965 r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data);
2966 if (r) {
Mike Snitzer4fa59712013-08-21 17:30:40 -04002967 DMERR("%s: dm_pool_get_free_block_count returned %d",
2968 dm_device_name(pool->pool_md), r);
Mikulas Patockafd7c0922013-03-01 22:45:44 +00002969 goto err;
2970 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00002971
2972 r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
Mikulas Patockafd7c0922013-03-01 22:45:44 +00002973 if (r) {
Mike Snitzer4fa59712013-08-21 17:30:40 -04002974 DMERR("%s: dm_pool_get_data_dev_size returned %d",
2975 dm_device_name(pool->pool_md), r);
Mikulas Patockafd7c0922013-03-01 22:45:44 +00002976 goto err;
2977 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00002978
Joe Thornbercc8394d2012-06-03 00:30:01 +01002979 r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
Mikulas Patockafd7c0922013-03-01 22:45:44 +00002980 if (r) {
Mike Snitzer4fa59712013-08-21 17:30:40 -04002981 DMERR("%s: dm_pool_get_metadata_snap returned %d",
2982 dm_device_name(pool->pool_md), r);
Mikulas Patockafd7c0922013-03-01 22:45:44 +00002983 goto err;
2984 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00002985
2986 DMEMIT("%llu %llu/%llu %llu/%llu ",
2987 (unsigned long long)transaction_id,
2988 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2989 (unsigned long long)nr_blocks_metadata,
2990 (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
2991 (unsigned long long)nr_blocks_data);
2992
2993 if (held_root)
Joe Thornbere49e5822012-07-27 15:08:16 +01002994 DMEMIT("%llu ", held_root);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002995 else
Joe Thornbere49e5822012-07-27 15:08:16 +01002996 DMEMIT("- ");
2997
Joe Thornber3e1a0692014-03-03 16:03:26 +00002998 if (pool->pf.mode == PM_OUT_OF_DATA_SPACE)
2999 DMEMIT("out_of_data_space ");
3000 else if (pool->pf.mode == PM_READ_ONLY)
Joe Thornbere49e5822012-07-27 15:08:16 +01003001 DMEMIT("ro ");
3002 else
3003 DMEMIT("rw ");
3004
Mike Snitzer018debe2012-12-21 20:23:32 +00003005 if (!pool->pf.discard_enabled)
Mike Snitzer787a996c2013-12-06 16:21:43 -05003006 DMEMIT("ignore_discard ");
Mike Snitzer018debe2012-12-21 20:23:32 +00003007 else if (pool->pf.discard_passdown)
Mike Snitzer787a996c2013-12-06 16:21:43 -05003008 DMEMIT("discard_passdown ");
Joe Thornbere49e5822012-07-27 15:08:16 +01003009 else
Mike Snitzer787a996c2013-12-06 16:21:43 -05003010 DMEMIT("no_discard_passdown ");
3011
3012 if (pool->pf.error_if_no_space)
3013 DMEMIT("error_if_no_space ");
3014 else
3015 DMEMIT("queue_if_no_space ");
Joe Thornber991d9fa2011-10-31 20:21:18 +00003016
3017 break;
3018
3019 case STATUSTYPE_TABLE:
3020 DMEMIT("%s %s %lu %llu ",
3021 format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
3022 format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
3023 (unsigned long)pool->sectors_per_block,
3024 (unsigned long long)pt->low_water_blocks);
Mike Snitzer0424caa2012-09-26 23:45:47 +01003025 emit_flags(&pt->requested_pf, result, sz, maxlen);
Joe Thornber991d9fa2011-10-31 20:21:18 +00003026 break;
3027 }
Mikulas Patockafd7c0922013-03-01 22:45:44 +00003028 return;
Joe Thornber991d9fa2011-10-31 20:21:18 +00003029
Mikulas Patockafd7c0922013-03-01 22:45:44 +00003030err:
3031 DMEMIT("Error");
Joe Thornber991d9fa2011-10-31 20:21:18 +00003032}
3033
3034static int pool_iterate_devices(struct dm_target *ti,
3035 iterate_devices_callout_fn fn, void *data)
3036{
3037 struct pool_c *pt = ti->private;
3038
3039 return fn(ti, pt->data_dev, 0, ti->len, data);
3040}
3041
3042static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
3043 struct bio_vec *biovec, int max_size)
3044{
3045 struct pool_c *pt = ti->private;
3046 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
3047
3048 if (!q->merge_bvec_fn)
3049 return max_size;
3050
3051 bvm->bi_bdev = pt->data_dev->bdev;
3052
3053 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
3054}
3055
Mike Snitzer0424caa2012-09-26 23:45:47 +01003056static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
Joe Thornber104655f2012-03-28 18:41:28 +01003057{
Mike Snitzer0424caa2012-09-26 23:45:47 +01003058 struct pool *pool = pt->pool;
3059 struct queue_limits *data_limits;
3060
Joe Thornber104655f2012-03-28 18:41:28 +01003061 limits->max_discard_sectors = pool->sectors_per_block;
3062
3063 /*
Mike Snitzer0424caa2012-09-26 23:45:47 +01003064 * discard_granularity is just a hint, and not enforced.
Joe Thornber104655f2012-03-28 18:41:28 +01003065 */
Mike Snitzer0424caa2012-09-26 23:45:47 +01003066 if (pt->adjusted_pf.discard_passdown) {
3067 data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
3068 limits->discard_granularity = data_limits->discard_granularity;
Mike Snitzerf13945d2013-03-01 22:45:44 +00003069 } else
Mike Snitzer0424caa2012-09-26 23:45:47 +01003070 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
Joe Thornber104655f2012-03-28 18:41:28 +01003071}
3072
Joe Thornber991d9fa2011-10-31 20:21:18 +00003073static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
3074{
3075 struct pool_c *pt = ti->private;
3076 struct pool *pool = pt->pool;
Mike Snitzer0cc67cd2013-08-20 15:02:41 -04003077 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
Joe Thornber991d9fa2011-10-31 20:21:18 +00003078
Mike Snitzer0cc67cd2013-08-20 15:02:41 -04003079 /*
3080 * If the system-determined stacked limits are compatible with the
3081 * pool's blocksize (io_opt is a factor) do not override them.
3082 */
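	/*
	 * For example (illustrative): with 64KiB pool blocks (128 sectors),
	 * a stacked io_opt of 128KiB is an exact multiple and is left
	 * alone, whereas an io_opt of 96KiB is not and is overridden to
	 * the pool's block size below.
	 */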
3083 if (io_opt_sectors < pool->sectors_per_block ||
3084 do_div(io_opt_sectors, pool->sectors_per_block)) {
3085 blk_limits_io_min(limits, 0);
3086 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
3087 }
Mike Snitzer0424caa2012-09-26 23:45:47 +01003088
3089 /*
3090 * pt->adjusted_pf is a staging area for the actual features to use.
3091 * They get transferred to the live pool in bind_control_target()
3092 * called from pool_preresume().
3093 */
Mike Snitzerb60ab992013-09-19 18:49:11 -04003094 if (!pt->adjusted_pf.discard_enabled) {
3095 /*
3096 * Must explicitly disallow stacking discard limits otherwise the
3097 * block layer will stack them if the pool's data device has support.
3098 * QUEUE_FLAG_DISCARD wouldn't be set but there is no way for the
3099 * user to see that, so make sure to set all discard limits to 0.
3100 */
3101 limits->discard_granularity = 0;
Mike Snitzer0424caa2012-09-26 23:45:47 +01003102 return;
Mike Snitzerb60ab992013-09-19 18:49:11 -04003103 }
Mike Snitzer0424caa2012-09-26 23:45:47 +01003104
3105 disable_passdown_if_not_supported(pt);
3106
3107 set_discard_limits(pt, limits);
Joe Thornber991d9fa2011-10-31 20:21:18 +00003108}
3109
3110static struct target_type pool_target = {
3111 .name = "thin-pool",
3112 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
3113 DM_TARGET_IMMUTABLE,
Mike Snitzer67324ea2014-03-21 18:33:41 -04003114 .version = {1, 12, 0},
Joe Thornber991d9fa2011-10-31 20:21:18 +00003115 .module = THIS_MODULE,
3116 .ctr = pool_ctr,
3117 .dtr = pool_dtr,
3118 .map = pool_map,
3119 .postsuspend = pool_postsuspend,
3120 .preresume = pool_preresume,
3121 .resume = pool_resume,
3122 .message = pool_message,
3123 .status = pool_status,
3124 .merge = pool_merge,
3125 .iterate_devices = pool_iterate_devices,
3126 .io_hints = pool_io_hints,
3127};
3128
3129/*----------------------------------------------------------------
3130 * Thin target methods
3131 *--------------------------------------------------------------*/
Joe Thornberb10ebd32014-04-08 11:29:01 +01003132static void thin_get(struct thin_c *tc)
3133{
3134 atomic_inc(&tc->refcount);
3135}
3136
3137static void thin_put(struct thin_c *tc)
3138{
3139 if (atomic_dec_and_test(&tc->refcount))
3140 complete(&tc->can_destroy);
3141}
3142
Joe Thornber991d9fa2011-10-31 20:21:18 +00003143static void thin_dtr(struct dm_target *ti)
3144{
3145 struct thin_c *tc = ti->private;
Mike Snitzerc140e1c2014-03-20 21:17:14 -04003146 unsigned long flags;
3147
Joe Thornberb10ebd32014-04-08 11:29:01 +01003148 thin_put(tc);
3149 wait_for_completion(&tc->can_destroy);
3150
Mike Snitzerc140e1c2014-03-20 21:17:14 -04003151 spin_lock_irqsave(&tc->pool->lock, flags);
3152 list_del_rcu(&tc->list);
3153 spin_unlock_irqrestore(&tc->pool->lock, flags);
3154 synchronize_rcu();
Joe Thornber991d9fa2011-10-31 20:21:18 +00003155
3156 mutex_lock(&dm_thin_pool_table.mutex);
3157
3158 __pool_dec(tc->pool);
3159 dm_pool_close_thin_device(tc->td);
3160 dm_put_device(ti, tc->pool_dev);
Joe Thornber2dd9c252012-03-28 18:41:28 +01003161 if (tc->origin_dev)
3162 dm_put_device(ti, tc->origin_dev);
Joe Thornber991d9fa2011-10-31 20:21:18 +00003163 kfree(tc);
3164
3165 mutex_unlock(&dm_thin_pool_table.mutex);
3166}
3167
3168/*
3169 * Thin target parameters:
3170 *
Joe Thornber2dd9c252012-03-28 18:41:28 +01003171 * <pool_dev> <dev_id> [origin_dev]
Joe Thornber991d9fa2011-10-31 20:21:18 +00003172 *
3173 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
3174 * dev_id: the internal device identifier
Joe Thornber2dd9c252012-03-28 18:41:28 +01003175 * origin_dev: a device external to the pool that should act as the origin
Joe Thornber67e2e2b2012-03-28 18:41:29 +01003176 *
3177 * If the pool device has discards disabled, they get disabled for the thin
3178 * device as well.
Joe Thornber991d9fa2011-10-31 20:21:18 +00003179 */
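/*
 * Example table lines (a sketch; paths, sizes and ids are illustrative):
 *
 *   0 2097152 thin /dev/mapper/pool 0
 *   0 2097152 thin /dev/mapper/pool 1 /dev/vg/external_origin
 *
 * The second form binds a read-only external origin device to the thin
 * device, as used for external snapshots.
 */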
3180static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
3181{
3182 int r;
3183 struct thin_c *tc;
Joe Thornber2dd9c252012-03-28 18:41:28 +01003184 struct dm_dev *pool_dev, *origin_dev;
Joe Thornber991d9fa2011-10-31 20:21:18 +00003185 struct mapped_device *pool_md;
Joe Thornber5e3283e2014-04-08 11:08:41 +01003186 unsigned long flags;
Joe Thornber991d9fa2011-10-31 20:21:18 +00003187
3188 mutex_lock(&dm_thin_pool_table.mutex);
3189
Joe Thornber2dd9c252012-03-28 18:41:28 +01003190 if (argc != 2 && argc != 3) {
Joe Thornber991d9fa2011-10-31 20:21:18 +00003191 ti->error = "Invalid argument count";
3192 r = -EINVAL;
3193 goto out_unlock;
3194 }
3195
3196 tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
3197 if (!tc) {
3198 ti->error = "Out of memory";
3199 r = -ENOMEM;
3200 goto out_unlock;
3201 }
Mike Snitzerc140e1c2014-03-20 21:17:14 -04003202 spin_lock_init(&tc->lock);
3203 bio_list_init(&tc->deferred_bio_list);
3204 bio_list_init(&tc->retry_on_resume_list);
Mike Snitzer67324ea2014-03-21 18:33:41 -04003205 tc->sort_bio_list = RB_ROOT;
Joe Thornber991d9fa2011-10-31 20:21:18 +00003206
Joe Thornber2dd9c252012-03-28 18:41:28 +01003207 if (argc == 3) {
3208 r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
3209 if (r) {
3210 ti->error = "Error opening origin device";
3211 goto bad_origin_dev;
3212 }
3213 tc->origin_dev = origin_dev;
3214 }
3215
Joe Thornber991d9fa2011-10-31 20:21:18 +00003216 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
3217 if (r) {
3218 ti->error = "Error opening pool device";
3219 goto bad_pool_dev;
3220 }
3221 tc->pool_dev = pool_dev;
3222
3223 if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
3224 ti->error = "Invalid device id";
3225 r = -EINVAL;
3226 goto bad_common;
3227 }
3228
3229 pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
3230 if (!pool_md) {
3231 ti->error = "Couldn't get pool mapped device";
3232 r = -EINVAL;
3233 goto bad_common;
3234 }
3235
3236 tc->pool = __pool_table_lookup(pool_md);
3237 if (!tc->pool) {
3238 ti->error = "Couldn't find pool object";
3239 r = -EINVAL;
3240 goto bad_pool_lookup;
3241 }
3242 __pool_inc(tc->pool);
3243
Joe Thornbere49e5822012-07-27 15:08:16 +01003244 if (get_pool_mode(tc->pool) == PM_FAIL) {
3245 ti->error = "Couldn't open thin device: pool is in fail mode";
Mike Snitzer1acacc02014-02-19 20:32:33 -05003246 r = -EINVAL;
Joe Thornbere49e5822012-07-27 15:08:16 +01003247 goto bad_thin_open;
3248 }
3249
Joe Thornber991d9fa2011-10-31 20:21:18 +00003250 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
3251 if (r) {
3252 ti->error = "Couldn't open thin internal device";
3253 goto bad_thin_open;
3254 }
3255
Mike Snitzer542f9032012-07-27 15:08:00 +01003256 r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
3257 if (r)
Mike Snitzer1acacc02014-02-19 20:32:33 -05003258 goto bad_target_max_io_len;
Mike Snitzer542f9032012-07-27 15:08:00 +01003259
Alasdair G Kergon55a62ee2013-03-01 22:45:47 +00003260 ti->num_flush_bios = 1;
Joe Thornber16ad3d12012-07-27 15:08:07 +01003261 ti->flush_supported = true;
Mikulas Patocka59c3d2c2012-12-21 20:23:40 +00003262 ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook);
Joe Thornber67e2e2b2012-03-28 18:41:29 +01003263
3264 /* In case the pool supports discards, pass them on. */
Mike Snitzerb60ab992013-09-19 18:49:11 -04003265 ti->discard_zeroes_data_unsupported = true;
Joe Thornber67e2e2b2012-03-28 18:41:29 +01003266 if (tc->pool->pf.discard_enabled) {
Alasdair G Kergon0ac55482012-07-27 15:08:08 +01003267 ti->discards_supported = true;
Alasdair G Kergon55a62ee2013-03-01 22:45:47 +00003268 ti->num_discard_bios = 1;
Alasdair G Kergon55a62ee2013-03-01 22:45:47 +00003269 /* Discard bios must be split on a block boundary */
3270 ti->split_discard_bios = true;
Joe Thornber67e2e2b2012-03-28 18:41:29 +01003271 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00003272
3273 dm_put(pool_md);
3274
3275 mutex_unlock(&dm_thin_pool_table.mutex);
3276
Joe Thornberb10ebd32014-04-08 11:29:01 +01003277 atomic_set(&tc->refcount, 1);
3278 init_completion(&tc->can_destroy);
3279
Joe Thornber5e3283e2014-04-08 11:08:41 +01003280 spin_lock_irqsave(&tc->pool->lock, flags);
Mike Snitzerc140e1c2014-03-20 21:17:14 -04003281 list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
Joe Thornber5e3283e2014-04-08 11:08:41 +01003282 spin_unlock_irqrestore(&tc->pool->lock, flags);
Mike Snitzerc140e1c2014-03-20 21:17:14 -04003283 /*
3284 * This synchronize_rcu() call is needed here otherwise we risk a
3285 * wake_worker() call finding no bios to process (because the newly
3286 * added tc isn't yet visible). So this reduces latency since we
3287 * aren't then dependent on the periodic commit to wake_worker().
3288 */
3289 synchronize_rcu();
3290
Joe Thornber991d9fa2011-10-31 20:21:18 +00003291 return 0;
3292
Mike Snitzer1acacc02014-02-19 20:32:33 -05003293bad_target_max_io_len:
3294 dm_pool_close_thin_device(tc->td);
Joe Thornber991d9fa2011-10-31 20:21:18 +00003295bad_thin_open:
3296 __pool_dec(tc->pool);
3297bad_pool_lookup:
3298 dm_put(pool_md);
3299bad_common:
3300 dm_put_device(ti, tc->pool_dev);
3301bad_pool_dev:
Joe Thornber2dd9c252012-03-28 18:41:28 +01003302 if (tc->origin_dev)
3303 dm_put_device(ti, tc->origin_dev);
3304bad_origin_dev:
Joe Thornber991d9fa2011-10-31 20:21:18 +00003305 kfree(tc);
3306out_unlock:
3307 mutex_unlock(&dm_thin_pool_table.mutex);
3308
3309 return r;
3310}
3311
Mikulas Patocka7de3ee52012-12-21 20:23:41 +00003312static int thin_map(struct dm_target *ti, struct bio *bio)
Joe Thornber991d9fa2011-10-31 20:21:18 +00003313{
Kent Overstreet4f024f32013-10-11 15:44:27 -07003314 bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
Joe Thornber991d9fa2011-10-31 20:21:18 +00003315
Mikulas Patocka7de3ee52012-12-21 20:23:41 +00003316 return thin_bio_map(ti, bio);
Joe Thornber991d9fa2011-10-31 20:21:18 +00003317}
3318
Mikulas Patocka7de3ee52012-12-21 20:23:41 +00003319static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
Joe Thornbereb2aa482012-03-28 18:41:28 +01003320{
3321 unsigned long flags;
Mikulas Patocka59c3d2c2012-12-21 20:23:40 +00003322 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
Joe Thornbereb2aa482012-03-28 18:41:28 +01003323 struct list_head work;
Mike Snitzera24c2562012-06-03 00:30:00 +01003324 struct dm_thin_new_mapping *m, *tmp;
Joe Thornbereb2aa482012-03-28 18:41:28 +01003325 struct pool *pool = h->tc->pool;
3326
3327 if (h->shared_read_entry) {
3328 INIT_LIST_HEAD(&work);
Mike Snitzer44feb382012-10-12 21:02:10 +01003329 dm_deferred_entry_dec(h->shared_read_entry, &work);
Joe Thornbereb2aa482012-03-28 18:41:28 +01003330
3331 spin_lock_irqsave(&pool->lock, flags);
3332 list_for_each_entry_safe(m, tmp, &work, list) {
3333 list_del(&m->list);
Mike Snitzer7f214662013-12-17 13:43:31 -05003334 m->quiesced = true;
Joe Thornbereb2aa482012-03-28 18:41:28 +01003335 __maybe_add_mapping(m);
3336 }
3337 spin_unlock_irqrestore(&pool->lock, flags);
3338 }
3339
Joe Thornber104655f2012-03-28 18:41:28 +01003340 if (h->all_io_entry) {
3341 INIT_LIST_HEAD(&work);
Mike Snitzer44feb382012-10-12 21:02:10 +01003342 dm_deferred_entry_dec(h->all_io_entry, &work);
Joe Thornber563af182012-12-21 20:23:31 +00003343 if (!list_empty(&work)) {
3344 spin_lock_irqsave(&pool->lock, flags);
3345 list_for_each_entry_safe(m, tmp, &work, list)
Mike Snitzerdaec3382013-12-11 14:01:20 -05003346 list_add_tail(&m->list, &pool->prepared_discards);
Joe Thornber563af182012-12-21 20:23:31 +00003347 spin_unlock_irqrestore(&pool->lock, flags);
3348 wake_worker(pool);
3349 }
Joe Thornber104655f2012-03-28 18:41:28 +01003350 }
3351
Joe Thornbereb2aa482012-03-28 18:41:28 +01003352 return 0;
3353}
3354
Joe Thornber738211f2014-03-03 15:52:28 +00003355static void thin_presuspend(struct dm_target *ti)
3356{
3357 struct thin_c *tc = ti->private;
3358
3359 if (dm_noflush_suspending(ti))
3360 noflush_work(tc, do_noflush_start);
3361}
3362
Joe Thornber991d9fa2011-10-31 20:21:18 +00003363static void thin_postsuspend(struct dm_target *ti)
3364{
Joe Thornber738211f2014-03-03 15:52:28 +00003365 struct thin_c *tc = ti->private;
3366
3367 /*
3368 * The dm_noflush_suspending flag has been cleared by now, so
3369 * unfortunately we must always run this.
3370 */
3371 noflush_work(tc, do_noflush_stop);
Joe Thornber991d9fa2011-10-31 20:21:18 +00003372}
3373
3374/*
3375 * <nr mapped sectors> <highest mapped sector>
3376 */
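/*
 * Example INFO output (illustrative): "2048 4095" means 2048 sectors are
 * mapped and the highest mapped sector is 4095; the second field is "-"
 * when the thin device has no mapped blocks at all.
 */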
Mikulas Patockafd7c0922013-03-01 22:45:44 +00003377static void thin_status(struct dm_target *ti, status_type_t type,
3378 unsigned status_flags, char *result, unsigned maxlen)
Joe Thornber991d9fa2011-10-31 20:21:18 +00003379{
3380 int r;
3381 ssize_t sz = 0;
3382 dm_block_t mapped, highest;
3383 char buf[BDEVNAME_SIZE];
3384 struct thin_c *tc = ti->private;
3385
Joe Thornbere49e5822012-07-27 15:08:16 +01003386 if (get_pool_mode(tc->pool) == PM_FAIL) {
3387 DMEMIT("Fail");
Mikulas Patockafd7c0922013-03-01 22:45:44 +00003388 return;
Joe Thornbere49e5822012-07-27 15:08:16 +01003389 }
3390
Joe Thornber991d9fa2011-10-31 20:21:18 +00003391 if (!tc->td)
3392 DMEMIT("-");
3393 else {
3394 switch (type) {
3395 case STATUSTYPE_INFO:
3396 r = dm_thin_get_mapped_count(tc->td, &mapped);
Mikulas Patockafd7c0922013-03-01 22:45:44 +00003397 if (r) {
3398 DMERR("dm_thin_get_mapped_count returned %d", r);
3399 goto err;
3400 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00003401
3402 r = dm_thin_get_highest_mapped_block(tc->td, &highest);
Mikulas Patockafd7c0922013-03-01 22:45:44 +00003403 if (r < 0) {
3404 DMERR("dm_thin_get_highest_mapped_block returned %d", r);
3405 goto err;
3406 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00003407
3408 DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
3409 if (r)
3410 DMEMIT("%llu", ((highest + 1) *
3411 tc->pool->sectors_per_block) - 1);
3412 else
3413 DMEMIT("-");
3414 break;
3415
3416 case STATUSTYPE_TABLE:
3417 DMEMIT("%s %lu",
3418 format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
3419 (unsigned long) tc->dev_id);
Joe Thornber2dd9c252012-03-28 18:41:28 +01003420 if (tc->origin_dev)
3421 DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
Joe Thornber991d9fa2011-10-31 20:21:18 +00003422 break;
3423 }
3424 }
3425
Mikulas Patockafd7c0922013-03-01 22:45:44 +00003426 return;
3427
3428err:
3429 DMEMIT("Error");
Joe Thornber991d9fa2011-10-31 20:21:18 +00003430}
3431
3432static int thin_iterate_devices(struct dm_target *ti,
3433 iterate_devices_callout_fn fn, void *data)
3434{
Mike Snitzer55f2b8b2012-07-27 15:08:02 +01003435 sector_t blocks;
Joe Thornber991d9fa2011-10-31 20:21:18 +00003436 struct thin_c *tc = ti->private;
Mike Snitzer55f2b8b2012-07-27 15:08:02 +01003437 struct pool *pool = tc->pool;
Joe Thornber991d9fa2011-10-31 20:21:18 +00003438
3439 /*
3440 * We can't call dm_pool_get_data_dev_size() since that blocks. So
3441 * we follow a more convoluted path through to the pool's target.
3442 */
Mike Snitzer55f2b8b2012-07-27 15:08:02 +01003443 if (!pool->ti)
Joe Thornber991d9fa2011-10-31 20:21:18 +00003444 return 0; /* nothing is bound */
3445
Mike Snitzer55f2b8b2012-07-27 15:08:02 +01003446 blocks = pool->ti->len;
3447 (void) sector_div(blocks, pool->sectors_per_block);
Joe Thornber991d9fa2011-10-31 20:21:18 +00003448 if (blocks)
Mike Snitzer55f2b8b2012-07-27 15:08:02 +01003449 return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);
Joe Thornber991d9fa2011-10-31 20:21:18 +00003450
3451 return 0;
3452}
3453
Joe Thornber991d9fa2011-10-31 20:21:18 +00003454static struct target_type thin_target = {
3455 .name = "thin",
Mike Snitzer67324ea2014-03-21 18:33:41 -04003456 .version = {1, 12, 0},
Joe Thornber991d9fa2011-10-31 20:21:18 +00003457 .module = THIS_MODULE,
3458 .ctr = thin_ctr,
3459 .dtr = thin_dtr,
3460 .map = thin_map,
Joe Thornbereb2aa482012-03-28 18:41:28 +01003461 .end_io = thin_endio,
Joe Thornber738211f2014-03-03 15:52:28 +00003462 .presuspend = thin_presuspend,
Joe Thornber991d9fa2011-10-31 20:21:18 +00003463 .postsuspend = thin_postsuspend,
3464 .status = thin_status,
3465 .iterate_devices = thin_iterate_devices,
Joe Thornber991d9fa2011-10-31 20:21:18 +00003466};
3467
3468/*----------------------------------------------------------------*/
3469
3470static int __init dm_thin_init(void)
3471{
3472 int r;
3473
3474 pool_table_init();
3475
3476 r = dm_register_target(&thin_target);
3477 if (r)
3478 return r;
3479
3480 r = dm_register_target(&pool_target);
3481 if (r)
Mike Snitzera24c2562012-06-03 00:30:00 +01003482 goto bad_pool_target;
3483
3484 r = -ENOMEM;
3485
Mike Snitzera24c2562012-06-03 00:30:00 +01003486 _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
3487 if (!_new_mapping_cache)
3488 goto bad_new_mapping_cache;
3489
Mike Snitzera24c2562012-06-03 00:30:00 +01003490 return 0;
3491
Mike Snitzera24c2562012-06-03 00:30:00 +01003492bad_new_mapping_cache:
Mike Snitzera24c2562012-06-03 00:30:00 +01003493 dm_unregister_target(&pool_target);
3494bad_pool_target:
3495 dm_unregister_target(&thin_target);
Joe Thornber991d9fa2011-10-31 20:21:18 +00003496
3497 return r;
3498}
3499
3500static void dm_thin_exit(void)
3501{
3502 dm_unregister_target(&thin_target);
3503 dm_unregister_target(&pool_target);
Mike Snitzera24c2562012-06-03 00:30:00 +01003504
Mike Snitzera24c2562012-06-03 00:30:00 +01003505 kmem_cache_destroy(_new_mapping_cache);
Joe Thornber991d9fa2011-10-31 20:21:18 +00003506}
3507
3508module_init(dm_thin_init);
3509module_exit(dm_thin_exit);
3510
Alasdair G Kergon7cab8bf2012-05-12 01:43:19 +01003511MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
Joe Thornber991d9fa2011-10-31 20:21:18 +00003512MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
3513MODULE_LICENSE("GPL");