/*
 * Copyright (C) 2011-2012 Red Hat UK.
 *
 * This file is released under the GPL.
 */

#include "dm-thin-metadata.h"
#include "dm-bio-prison.h"
#include "dm.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/log2.h>
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/rbtree.h>

#define DM_MSG_PREFIX "thin"

/*
 * Tunable constants
 */
#define ENDIO_HOOK_POOL_SIZE 1024
#define MAPPING_POOL_SIZE 1024
#define COMMIT_PERIOD HZ
#define NO_SPACE_TIMEOUT_SECS 60

static unsigned no_space_timeout_secs = NO_SPACE_TIMEOUT_SECS;

DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
		"A percentage of time allocated for copy on write");

/*
 * The block size of the device holding pool data must be
 * between 64KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

/*
 * Device id is restricted to 24 bits.
 */
#define MAX_DEV_ID ((1 << 24) - 1)

/*
 * How do we handle breaking sharing of data blocks?
 * =================================================
 *
 * We use a standard copy-on-write btree to store the mappings for the
 * devices (note I'm talking about copy-on-write of the metadata here, not
 * the data).  When you take an internal snapshot you clone the root node
 * of the origin btree.  After this there is no concept of an origin or a
 * snapshot.  They are just two device trees that happen to point to the
 * same data blocks.
 *
 * When we get a write in we decide if it's to a shared data block using
 * some timestamp magic.  If it is, we have to break sharing.
 *
 * Let's say we write to a shared block in what was the origin.  The
 * steps are:
 *
 * i) plug io further to this physical block. (see bio_prison code).
 *
 * ii) quiesce any read io to that shared data block.  Obviously
 * including all devices that share this block.  (see dm_deferred_set code)
 *
 * iii) copy the data block to a newly allocated block.  This step can be
 * skipped if the io covers the whole block. (schedule_copy).
 *
 * iv) insert the new mapping into the origin's btree
 * (process_prepared_mapping).  This act of inserting breaks some
 * sharing of btree nodes between the two devices.  Breaking sharing only
 * affects the btree of that specific device.  Btrees for the other
 * devices that share the block never change.  The btree for the origin
 * device as it was after the last commit is untouched, i.e. we're using
 * persistent data structures in the functional programming sense.
 *
 * v) unplug io to this physical block, including the io that triggered
 * the breaking of sharing.
 *
 * Steps (ii) and (iii) occur in parallel.
 *
 * The metadata _doesn't_ need to be committed before the io continues.  We
 * get away with this because the io is always written to a _new_ block.
 * If there's a crash, then:
 *
 * - The origin mapping will point to the old origin block (the shared
 * one).  This will contain the data as it was before the io that triggered
 * the breaking of sharing came in.
 *
 * - The snap mapping still points to the old block.  As it would after
 * the commit.
 *
 * The downside of this scheme is that the timestamp magic isn't perfect,
 * and will continue to think that a data block in the snapshot device is
 * shared even after the write to the origin has broken sharing.  I suspect
 * data blocks will typically be shared by many different devices, so we're
 * breaking sharing n + 1 times, rather than n, where n is the number of
 * devices that reference this data block.  At the moment I think the
 * benefits far, far outweigh the disadvantages.
 */

/*----------------------------------------------------------------*/

/*
 * Key building.
 */
static void build_data_key(struct dm_thin_device *td,
			   dm_block_t b, struct dm_cell_key *key)
{
	key->virtual = 0;
	key->dev = dm_thin_dev_id(td);
	key->block = b;
}

static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
			      struct dm_cell_key *key)
{
	key->virtual = 1;
	key->dev = dm_thin_dev_id(td);
	key->block = b;
}

/*----------------------------------------------------------------*/

#define THROTTLE_THRESHOLD (1 * HZ)

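/*
 * A simple rw-semaphore based throttle.  I/O submitters take the lock for
 * reading (throttle_lock/throttle_unlock).  If a single pass of the worker
 * runs for longer than THROTTLE_THRESHOLD, the worker takes the lock for
 * writing, holding off new submitters until the current batch of work
 * completes (throttle_work_complete).
 */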
struct throttle {
	struct rw_semaphore lock;
	unsigned long threshold;
	bool throttle_applied;
};

static void throttle_init(struct throttle *t)
{
	init_rwsem(&t->lock);
	t->throttle_applied = false;
}

static void throttle_work_start(struct throttle *t)
{
	t->threshold = jiffies + THROTTLE_THRESHOLD;
}

static void throttle_work_update(struct throttle *t)
{
	if (!t->throttle_applied && jiffies > t->threshold) {
		down_write(&t->lock);
		t->throttle_applied = true;
	}
}

static void throttle_work_complete(struct throttle *t)
{
	if (t->throttle_applied) {
		t->throttle_applied = false;
		up_write(&t->lock);
	}
}

static void throttle_lock(struct throttle *t)
{
	down_read(&t->lock);
}

static void throttle_unlock(struct throttle *t)
{
	up_read(&t->lock);
}

/*----------------------------------------------------------------*/

/*
 * A pool device ties together a metadata device and a data device.  It
 * also provides the interface for creating and destroying internal
 * devices.
 */
struct dm_thin_new_mapping;

/*
 * The pool runs in 4 modes.  They are ordered here from least to most
 * degraded so that modes can be compared.
 */
enum pool_mode {
	PM_WRITE,		/* metadata may be changed */
	PM_OUT_OF_DATA_SPACE,	/* metadata may be changed, though data may not be allocated */
	PM_READ_ONLY,		/* metadata may not be changed */
	PM_FAIL,		/* all I/O fails */
};

struct pool_features {
	enum pool_mode mode;

	bool zero_new_blocks:1;
	bool discard_enabled:1;
	bool discard_passdown:1;
	bool error_if_no_space:1;
};

struct thin_c;
typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
typedef void (*process_cell_fn)(struct thin_c *tc, struct dm_bio_prison_cell *cell);
typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);

struct pool {
	struct list_head list;
	struct dm_target *ti;	/* Only set if a pool target is bound */

	struct mapped_device *pool_md;
	struct block_device *md_dev;
	struct dm_pool_metadata *pmd;

	dm_block_t low_water_blocks;
	uint32_t sectors_per_block;
	int sectors_per_block_shift;

	struct pool_features pf;
	bool low_water_triggered:1;	/* A dm event has been sent */

	struct dm_bio_prison *prison;
	struct dm_kcopyd_client *copier;

	struct workqueue_struct *wq;
	struct throttle throttle;
	struct work_struct worker;
	struct delayed_work waker;
	struct delayed_work no_space_timeout;

	unsigned long last_commit_jiffies;
	unsigned ref_count;

	spinlock_t lock;
	struct bio_list deferred_flush_bios;
	struct list_head prepared_mappings;
	struct list_head prepared_discards;
	struct list_head active_thins;

	struct dm_deferred_set *shared_read_ds;
	struct dm_deferred_set *all_io_ds;

	struct dm_thin_new_mapping *next_mapping;
	mempool_t *mapping_pool;

	process_bio_fn process_bio;
	process_bio_fn process_discard;

	process_cell_fn process_cell;
	process_cell_fn process_discard_cell;

	process_mapping_fn process_prepared_mapping;
	process_mapping_fn process_prepared_discard;
};

static enum pool_mode get_pool_mode(struct pool *pool);
static void metadata_operation_failed(struct pool *pool, const char *op, int r);

/*
 * Target context for a pool.
 */
struct pool_c {
	struct dm_target *ti;
	struct pool *pool;
	struct dm_dev *data_dev;
	struct dm_dev *metadata_dev;
	struct dm_target_callbacks callbacks;

	dm_block_t low_water_blocks;
	struct pool_features requested_pf; /* Features requested during table load */
	struct pool_features adjusted_pf;  /* Features used after adjusting for constituent devices */
};

/*
 * Target context for a thin.
 */
struct thin_c {
	struct list_head list;
	struct dm_dev *pool_dev;
	struct dm_dev *origin_dev;
	sector_t origin_size;
	dm_thin_id dev_id;

	struct pool *pool;
	struct dm_thin_device *td;
	bool requeue_mode:1;
	spinlock_t lock;
	struct list_head deferred_cells;
	struct bio_list deferred_bio_list;
	struct bio_list retry_on_resume_list;
	struct rb_root sort_bio_list; /* sorted list of deferred bios */

	/*
	 * Ensures the thin is not destroyed until the worker has finished
	 * iterating the active_thins list.
	 */
	atomic_t refcount;
	struct completion can_destroy;
};

/*----------------------------------------------------------------*/

/*
 * wake_worker() is used when new work is queued and when pool_resume is
 * ready to continue deferred IO processing.
 */
static void wake_worker(struct pool *pool)
{
	queue_work(pool->wq, &pool->worker);
}

/*----------------------------------------------------------------*/

static int bio_detain(struct pool *pool, struct dm_cell_key *key, struct bio *bio,
		      struct dm_bio_prison_cell **cell_result)
{
	int r;
	struct dm_bio_prison_cell *cell_prealloc;

	/*
	 * Allocate a cell from the prison's mempool.
	 * This might block but it can't fail.
	 */
	cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO);

	r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result);
	if (r)
		/*
		 * We reused an old cell; we can get rid of
		 * the new one.
		 */
		dm_bio_prison_free_cell(pool->prison, cell_prealloc);

	return r;
}

static void cell_release(struct pool *pool,
			 struct dm_bio_prison_cell *cell,
			 struct bio_list *bios)
{
	dm_cell_release(pool->prison, cell, bios);
	dm_bio_prison_free_cell(pool->prison, cell);
}

static void cell_release_no_holder(struct pool *pool,
				   struct dm_bio_prison_cell *cell,
				   struct bio_list *bios)
{
	dm_cell_release_no_holder(pool->prison, cell, bios);
	dm_bio_prison_free_cell(pool->prison, cell);
}

static void cell_error_with_code(struct pool *pool,
				 struct dm_bio_prison_cell *cell, int error_code)
{
	dm_cell_error(pool->prison, cell, error_code);
	dm_bio_prison_free_cell(pool->prison, cell);
}

static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell)
{
	cell_error_with_code(pool, cell, -EIO);
}

static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell)
{
	cell_error_with_code(pool, cell, 0);
}

static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell)
{
	cell_error_with_code(pool, cell, DM_ENDIO_REQUEUE);
}

/*----------------------------------------------------------------*/
377
Joe Thornber991d9fa2011-10-31 20:21:18 +0000378/*
379 * A global list of pools that uses a struct mapped_device as a key.
380 */
381static struct dm_thin_pool_table {
382 struct mutex mutex;
383 struct list_head pools;
384} dm_thin_pool_table;
385
386static void pool_table_init(void)
387{
388 mutex_init(&dm_thin_pool_table.mutex);
389 INIT_LIST_HEAD(&dm_thin_pool_table.pools);
390}
391
392static void __pool_table_insert(struct pool *pool)
393{
394 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
395 list_add(&pool->list, &dm_thin_pool_table.pools);
396}
397
398static void __pool_table_remove(struct pool *pool)
399{
400 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
401 list_del(&pool->list);
402}
403
404static struct pool *__pool_table_lookup(struct mapped_device *md)
405{
406 struct pool *pool = NULL, *tmp;
407
408 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
409
410 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
411 if (tmp->pool_md == md) {
412 pool = tmp;
413 break;
414 }
415 }
416
417 return pool;
418}
419
420static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
421{
422 struct pool *pool = NULL, *tmp;
423
424 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
425
426 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
427 if (tmp->md_dev == md_dev) {
428 pool = tmp;
429 break;
430 }
431 }
432
433 return pool;
434}
435
436/*----------------------------------------------------------------*/
437
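/*
 * Per-bio state stored in the bio's per-bio data.  It records the owning
 * thin device, any deferred-set entries the bio holds, the mapping it may
 * be overwriting, and the rb_node used when sorting deferred bios.
 */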
struct dm_thin_endio_hook {
	struct thin_c *tc;
	struct dm_deferred_entry *shared_read_entry;
	struct dm_deferred_entry *all_io_entry;
	struct dm_thin_new_mapping *overwrite_mapping;
	struct rb_node rb_node;
};

static void requeue_bio_list(struct thin_c *tc, struct bio_list *master)
{
	struct bio *bio;
	struct bio_list bios;
	unsigned long flags;

	bio_list_init(&bios);

	spin_lock_irqsave(&tc->lock, flags);
	bio_list_merge(&bios, master);
	bio_list_init(master);
	spin_unlock_irqrestore(&tc->lock, flags);

	while ((bio = bio_list_pop(&bios)))
		bio_endio(bio, DM_ENDIO_REQUEUE);
}

static void requeue_deferred_cells(struct thin_c *tc)
{
	struct pool *pool = tc->pool;
	unsigned long flags;
	struct list_head cells;
	struct dm_bio_prison_cell *cell, *tmp;

	INIT_LIST_HEAD(&cells);

	spin_lock_irqsave(&tc->lock, flags);
	list_splice_init(&tc->deferred_cells, &cells);
	spin_unlock_irqrestore(&tc->lock, flags);

	list_for_each_entry_safe(cell, tmp, &cells, user_list)
		cell_requeue(pool, cell);
}

static void requeue_io(struct thin_c *tc)
{
	requeue_bio_list(tc, &tc->deferred_bio_list);
	requeue_bio_list(tc, &tc->retry_on_resume_list);
	requeue_deferred_cells(tc);
}

static void error_thin_retry_list(struct thin_c *tc)
{
	struct bio *bio;
	unsigned long flags;
	struct bio_list bios;

	bio_list_init(&bios);

	spin_lock_irqsave(&tc->lock, flags);
	bio_list_merge(&bios, &tc->retry_on_resume_list);
	bio_list_init(&tc->retry_on_resume_list);
	spin_unlock_irqrestore(&tc->lock, flags);

	while ((bio = bio_list_pop(&bios)))
		bio_io_error(bio);
}

static void error_retry_list(struct pool *pool)
{
	struct thin_c *tc;

	rcu_read_lock();
	list_for_each_entry_rcu(tc, &pool->active_thins, list)
		error_thin_retry_list(tc);
	rcu_read_unlock();
}

/*
 * This section of code contains the logic for processing a thin device's IO.
 * Much of the code depends on pool object resources (lists, workqueues, etc)
 * but most is exclusively called from the thin target rather than the thin-pool
 * target.
 */

static bool block_size_is_power_of_two(struct pool *pool)
{
	return pool->sectors_per_block_shift >= 0;
}

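/*
 * Map a bio's starting sector to a pool block.  For example, with a 64KiB
 * block size (128 sectors, so sectors_per_block_shift == 7), a bio at
 * sector 300 lands in block 300 >> 7 == 2.
 */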
static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
{
	struct pool *pool = tc->pool;
	sector_t block_nr = bio->bi_iter.bi_sector;

	if (block_size_is_power_of_two(pool))
		block_nr >>= pool->sectors_per_block_shift;
	else
		(void) sector_div(block_nr, pool->sectors_per_block);

	return block_nr;
}

static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
{
	struct pool *pool = tc->pool;
	sector_t bi_sector = bio->bi_iter.bi_sector;

	bio->bi_bdev = tc->pool_dev->bdev;
	if (block_size_is_power_of_two(pool))
		bio->bi_iter.bi_sector =
			(block << pool->sectors_per_block_shift) |
			(bi_sector & (pool->sectors_per_block - 1));
	else
		bio->bi_iter.bi_sector = (block * pool->sectors_per_block) +
				 sector_div(bi_sector, pool->sectors_per_block);
}

static void remap_to_origin(struct thin_c *tc, struct bio *bio)
{
	bio->bi_bdev = tc->origin_dev->bdev;
}

static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
{
	return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
		dm_thin_changed_this_transaction(tc->td);
}

static void inc_all_io_entry(struct pool *pool, struct bio *bio)
{
	struct dm_thin_endio_hook *h;

	if (bio->bi_rw & REQ_DISCARD)
		return;

	h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
	h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
}

static void issue(struct thin_c *tc, struct bio *bio)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	if (!bio_triggers_commit(tc, bio)) {
		generic_make_request(bio);
		return;
	}

	/*
	 * Complete bio with an error if earlier I/O caused changes to
	 * the metadata that can't be committed, e.g. due to I/O errors
	 * on the metadata device.
	 */
	if (dm_thin_aborted_changes(tc->td)) {
		bio_io_error(bio);
		return;
	}

	/*
	 * Batch together any bios that trigger commits and then issue a
	 * single commit for them in process_deferred_bios().
	 */
	spin_lock_irqsave(&pool->lock, flags);
	bio_list_add(&pool->deferred_flush_bios, bio);
	spin_unlock_irqrestore(&pool->lock, flags);
}

static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
{
	remap_to_origin(tc, bio);
	issue(tc, bio);
}

static void remap_and_issue(struct thin_c *tc, struct bio *bio,
			    dm_block_t block)
{
	remap(tc, bio, block);
	issue(tc, bio);
}

/*----------------------------------------------------------------*/

/*
 * Bio endio functions.
 */
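/*
 * Describes a data block that is being prepared (quiesced, copied and/or
 * zeroed) before its mapping is inserted into the metadata btree by
 * process_prepared_mapping().
 */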
struct dm_thin_new_mapping {
	struct list_head list;

	bool pass_discard:1;
	bool definitely_not_shared:1;

	/*
	 * Track quiescing, copying and zeroing preparation actions.  When this
	 * counter hits zero the block is prepared and can be inserted into the
	 * btree.
	 */
	atomic_t prepare_actions;

	int err;
	struct thin_c *tc;
	dm_block_t virt_block;
	dm_block_t data_block;
	struct dm_bio_prison_cell *cell, *cell2;

	/*
	 * If the bio covers the whole area of a block then we can avoid
	 * zeroing or copying.  Instead this bio is hooked.  The bio will
	 * still be in the cell, so care has to be taken to avoid issuing
	 * the bio twice.
	 */
	struct bio *bio;
	bio_end_io_t *saved_bi_end_io;
};

static void __complete_mapping_preparation(struct dm_thin_new_mapping *m)
{
	struct pool *pool = m->tc->pool;

	if (atomic_dec_and_test(&m->prepare_actions)) {
		list_add_tail(&m->list, &pool->prepared_mappings);
		wake_worker(pool);
	}
}

static void complete_mapping_preparation(struct dm_thin_new_mapping *m)
{
	unsigned long flags;
	struct pool *pool = m->tc->pool;

	spin_lock_irqsave(&pool->lock, flags);
	__complete_mapping_preparation(m);
	spin_unlock_irqrestore(&pool->lock, flags);
}

static void copy_complete(int read_err, unsigned long write_err, void *context)
{
	struct dm_thin_new_mapping *m = context;

	m->err = read_err || write_err ? -EIO : 0;
	complete_mapping_preparation(m);
}

static void overwrite_endio(struct bio *bio, int err)
{
	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
	struct dm_thin_new_mapping *m = h->overwrite_mapping;

	m->err = err;
	complete_mapping_preparation(m);
}

/*----------------------------------------------------------------*/

/*
 * Workqueue.
 */

/*
 * Prepared mapping jobs.
 */

/*
 * This sends the bios in the cell back to the deferred_bios list.
 */
static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	spin_lock_irqsave(&tc->lock, flags);
	cell_release(pool, cell, &tc->deferred_bio_list);
	spin_unlock_irqrestore(&tc->lock, flags);

	wake_worker(pool);
}

/*
 * Same as cell_defer above, except it omits the original holder of the cell.
 */
static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	spin_lock_irqsave(&tc->lock, flags);
	cell_release_no_holder(pool, cell, &tc->deferred_bio_list);
	spin_unlock_irqrestore(&tc->lock, flags);

	wake_worker(pool);
}

static void thin_defer_bio(struct thin_c *tc, struct bio *bio);

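/*
 * Releases the cell's bios (other than the holder) and either defers them
 * (discards, flushes and FUA writes) or bumps the all_io deferred set and
 * issues them directly to the given data block.
 */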
static void inc_remap_and_issue_cell(struct thin_c *tc,
				     struct dm_bio_prison_cell *cell,
				     dm_block_t block)
{
	struct bio *bio;
	struct bio_list bios;

	bio_list_init(&bios);
	cell_release_no_holder(tc->pool, cell, &bios);

	while ((bio = bio_list_pop(&bios))) {
		if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA))
			thin_defer_bio(tc, bio);
		else {
			inc_all_io_entry(tc->pool, bio);
			remap_and_issue(tc, bio, block);
		}
	}
}

static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
{
	if (m->bio) {
		m->bio->bi_end_io = m->saved_bi_end_io;
		atomic_inc(&m->bio->bi_remaining);
	}
	cell_error(m->tc->pool, m->cell);
	list_del(&m->list);
	mempool_free(m, m->tc->pool->mapping_pool);
}

static void process_prepared_mapping(struct dm_thin_new_mapping *m)
{
	struct thin_c *tc = m->tc;
	struct pool *pool = tc->pool;
	struct bio *bio;
	int r;

	bio = m->bio;
	if (bio) {
		bio->bi_end_io = m->saved_bi_end_io;
		atomic_inc(&bio->bi_remaining);
	}

	if (m->err) {
		cell_error(pool, m->cell);
		goto out;
	}

	/*
	 * Commit the prepared block into the mapping btree.
	 * Any I/O for this block arriving after this point will get
	 * remapped to it directly.
	 */
	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
	if (r) {
		metadata_operation_failed(pool, "dm_thin_insert_block", r);
		cell_error(pool, m->cell);
		goto out;
	}

	/*
	 * Release any bios held while the block was being provisioned.
	 * If we are processing a write bio that completely covers the block,
	 * we already processed it so can ignore it now when processing
	 * the bios in the cell.
	 */
	if (bio) {
		cell_defer_no_holder(tc, m->cell);
		bio_endio(bio, 0);
	} else
		cell_defer(tc, m->cell);

out:
	list_del(&m->list);
	mempool_free(m, pool->mapping_pool);
}

static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
{
	struct thin_c *tc = m->tc;

	bio_io_error(m->bio);
	cell_defer_no_holder(tc, m->cell);
	cell_defer_no_holder(tc, m->cell2);
	mempool_free(m, tc->pool->mapping_pool);
}

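/*
 * Passdown: if discard_passdown is enabled and the block is definitely not
 * shared, the discard is remapped to the data device.  If the block might
 * be shared, it is only passed down once the metadata confirms it is no
 * longer in use; otherwise the bio is completed without passdown.
 */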
static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
{
	struct thin_c *tc = m->tc;

	inc_all_io_entry(tc->pool, m->bio);
	cell_defer_no_holder(tc, m->cell);
	cell_defer_no_holder(tc, m->cell2);

	if (m->pass_discard)
		if (m->definitely_not_shared)
			remap_and_issue(tc, m->bio, m->data_block);
		else {
			bool used = false;
			if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used)
				bio_endio(m->bio, 0);
			else
				remap_and_issue(tc, m->bio, m->data_block);
		}
	else
		bio_endio(m->bio, 0);

	mempool_free(m, tc->pool->mapping_pool);
}

static void process_prepared_discard(struct dm_thin_new_mapping *m)
{
	int r;
	struct thin_c *tc = m->tc;

	r = dm_thin_remove_block(tc->td, m->virt_block);
	if (r)
		DMERR_LIMIT("dm_thin_remove_block() failed");

	process_prepared_discard_passdown(m);
}

static void process_prepared(struct pool *pool, struct list_head *head,
			     process_mapping_fn *fn)
{
	unsigned long flags;
	struct list_head maps;
	struct dm_thin_new_mapping *m, *tmp;

	INIT_LIST_HEAD(&maps);
	spin_lock_irqsave(&pool->lock, flags);
	list_splice_init(head, &maps);
	spin_unlock_irqrestore(&pool->lock, flags);

	list_for_each_entry_safe(m, tmp, &maps, list)
		(*fn)(m);
}

/*
 * Deferred bio jobs.
 */
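/*
 * A bio "overlaps" a block if it is exactly one block in size, and
 * "overwrites" it if, in addition, it is a write.  Whole-block writes let
 * the provisioning path skip the copy/zero step.
 */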
static int io_overlaps_block(struct pool *pool, struct bio *bio)
{
	return bio->bi_iter.bi_size ==
		(pool->sectors_per_block << SECTOR_SHIFT);
}

static int io_overwrites_block(struct pool *pool, struct bio *bio)
{
	return (bio_data_dir(bio) == WRITE) &&
		io_overlaps_block(pool, bio);
}

static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
			       bio_end_io_t *fn)
{
	*save = bio->bi_end_io;
	bio->bi_end_io = fn;
}

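/*
 * The pool keeps a single pre-allocated mapping (next_mapping) so the
 * worker can make forward progress.  ensure_next_mapping() tops it up with
 * a GFP_ATOMIC allocation and returns -ENOMEM if none is available, so
 * callers can back off and retry later.
 */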
static int ensure_next_mapping(struct pool *pool)
{
	if (pool->next_mapping)
		return 0;

	pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);

	return pool->next_mapping ? 0 : -ENOMEM;
}

static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
{
	struct dm_thin_new_mapping *m = pool->next_mapping;

	BUG_ON(!pool->next_mapping);

	memset(m, 0, sizeof(struct dm_thin_new_mapping));
	INIT_LIST_HEAD(&m->list);
	m->bio = NULL;

	pool->next_mapping = NULL;

	return m;
}

static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
		    sector_t begin, sector_t end)
{
	int r;
	struct dm_io_region to;

	to.bdev = tc->pool_dev->bdev;
	to.sector = begin;
	to.count = end - begin;

	r = dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m);
	if (r < 0) {
		DMERR_LIMIT("dm_kcopyd_zero() failed");
		copy_complete(1, 1, m);
	}
}

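/*
 * Used when a write bio covers the whole block being provisioned: the bio
 * is hooked with overwrite_endio and issued straight to the new data
 * block, standing in for the copy/zero step.
 */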
static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
				      dm_block_t data_block,
				      struct dm_thin_new_mapping *m)
{
	struct pool *pool = tc->pool;
	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));

	h->overwrite_mapping = m;
	m->bio = bio;
	save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
	inc_all_io_entry(pool, bio);
	remap_and_issue(tc, bio, data_block);
}

/*
 * A partial copy also needs to zero the uncopied region.
 */
static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
			  struct dm_dev *origin, dm_block_t data_origin,
			  dm_block_t data_dest,
			  struct dm_bio_prison_cell *cell, struct bio *bio,
			  sector_t len)
{
	int r;
	struct pool *pool = tc->pool;
	struct dm_thin_new_mapping *m = get_next_mapping(pool);

	m->tc = tc;
	m->virt_block = virt_block;
	m->data_block = data_dest;
	m->cell = cell;

	/*
	 * quiesce action + copy action + an extra reference held for the
	 * duration of this function (we may need to inc later for a
	 * partial zero).
	 */
	atomic_set(&m->prepare_actions, 3);

	if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
		complete_mapping_preparation(m); /* already quiesced */

	/*
	 * IO to pool_dev remaps to the pool target's data_dev.
	 *
	 * If the whole block of data is being overwritten, we can issue the
	 * bio immediately. Otherwise we use kcopyd to clone the data first.
	 */
	if (io_overwrites_block(pool, bio))
		remap_and_issue_overwrite(tc, bio, data_dest, m);
	else {
		struct dm_io_region from, to;

		from.bdev = origin->bdev;
		from.sector = data_origin * pool->sectors_per_block;
		from.count = len;

		to.bdev = tc->pool_dev->bdev;
		to.sector = data_dest * pool->sectors_per_block;
		to.count = len;

		r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
				   0, copy_complete, m);
		if (r < 0) {
			DMERR_LIMIT("dm_kcopyd_copy() failed");
			copy_complete(1, 1, m);

			/*
			 * We allow the zero to be issued, to simplify the
			 * error path.  Otherwise we'd need to start
			 * worrying about decrementing the prepare_actions
			 * counter.
			 */
		}

		/*
		 * Do we need to zero a tail region?
		 */
		if (len < pool->sectors_per_block && pool->pf.zero_new_blocks) {
			atomic_inc(&m->prepare_actions);
			ll_zero(tc, m,
				data_dest * pool->sectors_per_block + len,
				(data_dest + 1) * pool->sectors_per_block);
		}
	}

	complete_mapping_preparation(m); /* drop our ref */
}

static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
				   dm_block_t data_origin, dm_block_t data_dest,
				   struct dm_bio_prison_cell *cell, struct bio *bio)
{
	schedule_copy(tc, virt_block, tc->pool_dev,
		      data_origin, data_dest, cell, bio,
		      tc->pool->sectors_per_block);
}

static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
			  dm_block_t data_block, struct dm_bio_prison_cell *cell,
			  struct bio *bio)
{
	struct pool *pool = tc->pool;
	struct dm_thin_new_mapping *m = get_next_mapping(pool);

	atomic_set(&m->prepare_actions, 1); /* no need to quiesce */
	m->tc = tc;
	m->virt_block = virt_block;
	m->data_block = data_block;
	m->cell = cell;

	/*
	 * If the whole block of data is being overwritten or we are not
	 * zeroing pre-existing data, we can issue the bio immediately.
	 * Otherwise we use kcopyd to zero the data first.
	 */
	if (!pool->pf.zero_new_blocks)
		process_prepared_mapping(m);

	else if (io_overwrites_block(pool, bio))
		remap_and_issue_overwrite(tc, bio, data_block, m);

	else
		ll_zero(tc, m,
			data_block * pool->sectors_per_block,
			(data_block + 1) * pool->sectors_per_block);
}

static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
				   dm_block_t data_dest,
				   struct dm_bio_prison_cell *cell, struct bio *bio)
{
	struct pool *pool = tc->pool;
	sector_t virt_block_begin = virt_block * pool->sectors_per_block;
	sector_t virt_block_end = (virt_block + 1) * pool->sectors_per_block;

	if (virt_block_end <= tc->origin_size)
		schedule_copy(tc, virt_block, tc->origin_dev,
			      virt_block, data_dest, cell, bio,
			      pool->sectors_per_block);

	else if (virt_block_begin < tc->origin_size)
		schedule_copy(tc, virt_block, tc->origin_dev,
			      virt_block, data_dest, cell, bio,
			      tc->origin_size - virt_block_begin);

	else
		schedule_zero(tc, virt_block, data_dest, cell, bio);
}

/*
 * A non-zero return indicates read_only or fail_io mode.
 * Many callers don't care about the return value.
 */
static int commit(struct pool *pool)
{
	int r;

	if (get_pool_mode(pool) >= PM_READ_ONLY)
		return -EINVAL;

	r = dm_pool_commit_metadata(pool->pmd);
	if (r)
		metadata_operation_failed(pool, "dm_pool_commit_metadata", r);

	return r;
}

static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
{
	unsigned long flags;

	if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
		DMWARN("%s: reached low water mark for data device: sending event.",
		       dm_device_name(pool->pool_md));
		spin_lock_irqsave(&pool->lock, flags);
		pool->low_water_triggered = true;
		spin_unlock_irqrestore(&pool->lock, flags);
		dm_table_event(pool->ti->table);
	}
}

static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);

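/*
 * Allocate a new data block.  If the free count has hit zero, a commit is
 * tried first in case it releases space; if the pool is still full it is
 * switched to PM_OUT_OF_DATA_SPACE mode and -ENOSPC is returned.
 */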
static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
{
	int r;
	dm_block_t free_blocks;
	struct pool *pool = tc->pool;

	if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
		return -EINVAL;

	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
	if (r) {
		metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
		return r;
	}

	check_low_water_mark(pool, free_blocks);

	if (!free_blocks) {
		/*
		 * Try to commit to see if that will free up some
		 * more space.
		 */
		r = commit(pool);
		if (r)
			return r;

		r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
		if (r) {
			metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
			return r;
		}

		if (!free_blocks) {
			set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
			return -ENOSPC;
		}
	}

	r = dm_pool_alloc_data_block(pool->pmd, result);
	if (r) {
		metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
		return r;
	}

	return 0;
}

/*
 * If we have run out of space, queue bios until the device is
 * resumed, presumably after having been reloaded with more space.
 */
static void retry_on_resume(struct bio *bio)
{
	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
	struct thin_c *tc = h->tc;
	unsigned long flags;

	spin_lock_irqsave(&tc->lock, flags);
	bio_list_add(&tc->retry_on_resume_list, bio);
	spin_unlock_irqrestore(&tc->lock, flags);
}

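/*
 * Returns the error with which a bio that cannot currently be serviced
 * should be completed, or 0 if it should instead be retried when the
 * pool is resumed.
 */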
static int should_error_unserviceable_bio(struct pool *pool)
{
	enum pool_mode m = get_pool_mode(pool);

	switch (m) {
	case PM_WRITE:
		/* Shouldn't get here */
		DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
		return -EIO;

	case PM_OUT_OF_DATA_SPACE:
		return pool->pf.error_if_no_space ? -ENOSPC : 0;

	case PM_READ_ONLY:
	case PM_FAIL:
		return -EIO;
	default:
		/* Shouldn't get here */
		DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
		return -EIO;
	}
}

static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
{
	int error = should_error_unserviceable_bio(pool);

	if (error)
		bio_endio(bio, error);
	else
		retry_on_resume(bio);
}

static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell)
{
	struct bio *bio;
	struct bio_list bios;
	int error;

	error = should_error_unserviceable_bio(pool);
	if (error) {
		cell_error_with_code(pool, cell, error);
		return;
	}

	bio_list_init(&bios);
	cell_release(pool, cell, &bios);

	error = should_error_unserviceable_bio(pool);
	if (error)
		while ((bio = bio_list_pop(&bios)))
			bio_endio(bio, error);
	else
		while ((bio = bio_list_pop(&bios)))
			retry_on_resume(bio);
}

static void process_discard_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
	int r;
	struct bio *bio = cell->holder;
	struct pool *pool = tc->pool;
	struct dm_bio_prison_cell *cell2;
	struct dm_cell_key key2;
	dm_block_t block = get_bio_block(tc, bio);
	struct dm_thin_lookup_result lookup_result;
	struct dm_thin_new_mapping *m;

	if (tc->requeue_mode) {
		cell_requeue(pool, cell);
		return;
	}

	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
	switch (r) {
	case 0:
		/*
		 * Check nobody is fiddling with this pool block.  This can
		 * happen if someone's in the process of breaking sharing
		 * on this block.
		 */
		build_data_key(tc->td, lookup_result.block, &key2);
		if (bio_detain(tc->pool, &key2, bio, &cell2)) {
			cell_defer_no_holder(tc, cell);
			break;
		}

		if (io_overlaps_block(pool, bio)) {
			/*
			 * IO may still be going to the destination block.  We must
			 * quiesce before we can do the removal.
			 */
			m = get_next_mapping(pool);
			m->tc = tc;
			m->pass_discard = pool->pf.discard_passdown;
			m->definitely_not_shared = !lookup_result.shared;
			m->virt_block = block;
			m->data_block = lookup_result.block;
			m->cell = cell;
			m->cell2 = cell2;
			m->bio = bio;

			if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
				pool->process_prepared_discard(m);

		} else {
			inc_all_io_entry(pool, bio);
			cell_defer_no_holder(tc, cell);
			cell_defer_no_holder(tc, cell2);

			/*
			 * The DM core makes sure that the discard doesn't span
			 * a block boundary.  So we submit the discard of a
			 * partial block appropriately.
			 */
			if ((!lookup_result.shared) && pool->pf.discard_passdown)
				remap_and_issue(tc, bio, lookup_result.block);
			else
				bio_endio(bio, 0);
		}
		break;

	case -ENODATA:
		/*
		 * It isn't provisioned, just forget it.
		 */
		cell_defer_no_holder(tc, cell);
		bio_endio(bio, 0);
		break;

	default:
		DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
			    __func__, r);
		cell_defer_no_holder(tc, cell);
		bio_io_error(bio);
		break;
	}
}

static void process_discard_bio(struct thin_c *tc, struct bio *bio)
{
	struct dm_bio_prison_cell *cell;
	struct dm_cell_key key;
	dm_block_t block = get_bio_block(tc, bio);

	build_virtual_key(tc->td, block, &key);
	if (bio_detain(tc->pool, &key, bio, &cell))
		return;

	process_discard_cell(tc, cell);
}

Joe Thornber991d9fa2011-10-31 20:21:18 +00001333static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
Mike Snitzer44feb382012-10-12 21:02:10 +01001334 struct dm_cell_key *key,
Joe Thornber991d9fa2011-10-31 20:21:18 +00001335 struct dm_thin_lookup_result *lookup_result,
Mike Snitzera24c2562012-06-03 00:30:00 +01001336 struct dm_bio_prison_cell *cell)
Joe Thornber991d9fa2011-10-31 20:21:18 +00001337{
1338 int r;
1339 dm_block_t data_block;
Mike Snitzerd6fc2042013-08-21 17:40:11 -04001340 struct pool *pool = tc->pool;
Joe Thornber991d9fa2011-10-31 20:21:18 +00001341
1342 r = alloc_data_block(tc, &data_block);
1343 switch (r) {
1344 case 0:
Joe Thornber2dd9c252012-03-28 18:41:28 +01001345 schedule_internal_copy(tc, block, lookup_result->block,
1346 data_block, cell, bio);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001347 break;
1348
1349 case -ENOSPC:
Mike Snitzer399cadd2013-12-05 16:03:33 -05001350 retry_bios_on_resume(pool, cell);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001351 break;
1352
1353 default:
Mike Snitzerc3977412012-12-21 20:23:34 +00001354 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1355 __func__, r);
Mike Snitzerd6fc2042013-08-21 17:40:11 -04001356 cell_error(pool, cell);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001357 break;
1358 }
1359}
1360
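/*
 * Handle a bio whose virtual block maps to a shared data block.
 * Writes must break the sharing (copy-on-write); reads join the
 * shared_read deferred set and are remapped directly.
 */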
1361static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1362 dm_block_t block,
1363 struct dm_thin_lookup_result *lookup_result)
1364{
Mike Snitzera24c2562012-06-03 00:30:00 +01001365 struct dm_bio_prison_cell *cell;
Joe Thornber991d9fa2011-10-31 20:21:18 +00001366 struct pool *pool = tc->pool;
Mike Snitzer44feb382012-10-12 21:02:10 +01001367 struct dm_cell_key key;
Joe Thornber991d9fa2011-10-31 20:21:18 +00001368
1369 /*
1370 * If cell is already occupied, then sharing is already in the process
1371 * of being broken so we have nothing further to do here.
1372 */
1373 build_data_key(tc->td, lookup_result->block, &key);
Joe Thornber6beca5e2013-03-01 22:45:50 +00001374 if (bio_detain(pool, &key, bio, &cell))
Joe Thornber991d9fa2011-10-31 20:21:18 +00001375 return;
1376
Kent Overstreet4f024f32013-10-11 15:44:27 -07001377 if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size)
Joe Thornber991d9fa2011-10-31 20:21:18 +00001378 break_sharing(tc, bio, block, &key, lookup_result, cell);
1379 else {
Mikulas Patocka59c3d2c2012-12-21 20:23:40 +00001380 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
Joe Thornber991d9fa2011-10-31 20:21:18 +00001381
Mike Snitzer44feb382012-10-12 21:02:10 +01001382 h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
Joe Thornbere8088072012-12-21 20:23:31 +00001383 inc_all_io_entry(pool, bio);
Joe Thornberf286ba02012-12-21 20:23:33 +00001384 cell_defer_no_holder(tc, cell);
Joe Thornbere8088072012-12-21 20:23:31 +00001385
Joe Thornber991d9fa2011-10-31 20:21:18 +00001386 remap_and_issue(tc, bio, lookup_result->block);
1387 }
1388}
1389
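/*
 * Provision a data block for an unmapped virtual block.  Empty
 * (flush) bios and reads are handled without allocating anything;
 * writes either copy from the external origin or get a zeroed block.
 */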
1390static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
Mike Snitzera24c2562012-06-03 00:30:00 +01001391 struct dm_bio_prison_cell *cell)
Joe Thornber991d9fa2011-10-31 20:21:18 +00001392{
1393 int r;
1394 dm_block_t data_block;
Joe Thornber6beca5e2013-03-01 22:45:50 +00001395 struct pool *pool = tc->pool;
Joe Thornber991d9fa2011-10-31 20:21:18 +00001396
1397 /*
1398 * Remap empty bios (flushes) immediately, without provisioning.
1399 */
Kent Overstreet4f024f32013-10-11 15:44:27 -07001400 if (!bio->bi_iter.bi_size) {
Joe Thornber6beca5e2013-03-01 22:45:50 +00001401 inc_all_io_entry(pool, bio);
Joe Thornberf286ba02012-12-21 20:23:33 +00001402 cell_defer_no_holder(tc, cell);
Joe Thornbere8088072012-12-21 20:23:31 +00001403
Joe Thornber991d9fa2011-10-31 20:21:18 +00001404 remap_and_issue(tc, bio, 0);
1405 return;
1406 }
1407
1408 /*
1409 * Fill read bios with zeroes and complete them immediately.
1410 */
1411 if (bio_data_dir(bio) == READ) {
1412 zero_fill_bio(bio);
Joe Thornberf286ba02012-12-21 20:23:33 +00001413 cell_defer_no_holder(tc, cell);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001414 bio_endio(bio, 0);
1415 return;
1416 }
1417
1418 r = alloc_data_block(tc, &data_block);
1419 switch (r) {
1420 case 0:
Joe Thornber2dd9c252012-03-28 18:41:28 +01001421 if (tc->origin_dev)
1422 schedule_external_copy(tc, block, data_block, cell, bio);
1423 else
1424 schedule_zero(tc, block, data_block, cell, bio);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001425 break;
1426
1427 case -ENOSPC:
Mike Snitzer399cadd2013-12-05 16:03:33 -05001428 retry_bios_on_resume(pool, cell);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001429 break;
1430
1431 default:
Mike Snitzerc3977412012-12-21 20:23:34 +00001432 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1433 __func__, r);
Joe Thornber6beca5e2013-03-01 22:45:50 +00001434 cell_error(pool, cell);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001435 break;
1436 }
1437}
1438
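/*
 * Main mapping path for a deferred bio that holds the virtual-block
 * cell: look the block up in the btree and then remap, break sharing,
 * fall back to the origin, or provision a new block as appropriate.
 */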
Joe Thornbera374bb22014-10-10 13:43:14 +01001439static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
Joe Thornber991d9fa2011-10-31 20:21:18 +00001440{
1441 int r;
Joe Thornber6beca5e2013-03-01 22:45:50 +00001442 struct pool *pool = tc->pool;
Joe Thornbera374bb22014-10-10 13:43:14 +01001443 struct bio *bio = cell->holder;
Joe Thornber991d9fa2011-10-31 20:21:18 +00001444 dm_block_t block = get_bio_block(tc, bio);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001445 struct dm_thin_lookup_result lookup_result;
1446
Joe Thornbera374bb22014-10-10 13:43:14 +01001447 if (tc->requeue_mode) {
1448 cell_requeue(pool, cell);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001449 return;
Joe Thornbera374bb22014-10-10 13:43:14 +01001450 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00001451
1452 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1453 switch (r) {
1454 case 0:
Joe Thornbere8088072012-12-21 20:23:31 +00001455 if (lookup_result.shared) {
Joe Thornber991d9fa2011-10-31 20:21:18 +00001456 process_shared_bio(tc, bio, block, &lookup_result);
Joe Thornbera374bb22014-10-10 13:43:14 +01001457 /* FIXME: we can't remap because we're waiting on a commit. */
Joe Thornber6beca5e2013-03-01 22:45:50 +00001458 cell_defer_no_holder(tc, cell); /* FIXME: pass this cell into process_shared? */
Joe Thornbere8088072012-12-21 20:23:31 +00001459 } else {
Joe Thornber6beca5e2013-03-01 22:45:50 +00001460 inc_all_io_entry(pool, bio);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001461 remap_and_issue(tc, bio, lookup_result.block);
Joe Thornbera374bb22014-10-10 13:43:14 +01001462 inc_remap_and_issue_cell(tc, cell, lookup_result.block);
Joe Thornbere8088072012-12-21 20:23:31 +00001463 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00001464 break;
1465
1466 case -ENODATA:
Joe Thornber2dd9c252012-03-28 18:41:28 +01001467 if (bio_data_dir(bio) == READ && tc->origin_dev) {
Joe Thornber6beca5e2013-03-01 22:45:50 +00001468 inc_all_io_entry(pool, bio);
Joe Thornberf286ba02012-12-21 20:23:33 +00001469 cell_defer_no_holder(tc, cell);
Joe Thornbere8088072012-12-21 20:23:31 +00001470
Joe Thornbere5aea7b2014-06-13 14:47:24 +01001471 if (bio_end_sector(bio) <= tc->origin_size)
1472 remap_to_origin_and_issue(tc, bio);
1473
1474 else if (bio->bi_iter.bi_sector < tc->origin_size) {
1475 zero_fill_bio(bio);
1476 bio->bi_iter.bi_size = (tc->origin_size - bio->bi_iter.bi_sector) << SECTOR_SHIFT;
1477 remap_to_origin_and_issue(tc, bio);
1478
1479 } else {
1480 zero_fill_bio(bio);
1481 bio_endio(bio, 0);
1482 }
Joe Thornber2dd9c252012-03-28 18:41:28 +01001483 } else
1484 provision_block(tc, bio, block, cell);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001485 break;
1486
1487 default:
Mike Snitzerc3977412012-12-21 20:23:34 +00001488 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1489 __func__, r);
Joe Thornberf286ba02012-12-21 20:23:33 +00001490 cell_defer_no_holder(tc, cell);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001491 bio_io_error(bio);
1492 break;
1493 }
1494}
1495
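/*
 * Used when a bio was deferred without a cell: detain the
 * virtual-block cell here and fall through to process_cell().
 */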
Joe Thornbera374bb22014-10-10 13:43:14 +01001496static void process_bio(struct thin_c *tc, struct bio *bio)
1497{
1498 struct pool *pool = tc->pool;
1499 dm_block_t block = get_bio_block(tc, bio);
1500 struct dm_bio_prison_cell *cell;
1501 struct dm_cell_key key;
1502
1503 /*
1504 * If cell is already occupied, then the block is already
1505 * being provisioned so we have nothing further to do here.
1506 */
1507 build_virtual_key(tc->td, block, &key);
1508 if (bio_detain(pool, &key, bio, &cell))
1509 return;
1510
1511 process_cell(tc, cell);
1512}
1513
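/*
 * Variant used in the read-only and out-of-data-space modes: it never
 * allocates or breaks sharing.  Writes that would need to do either
 * are handed to handle_unserviceable_bio(), which errors or requeues
 * them according to the current pool mode.
 */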
1514static void __process_bio_read_only(struct thin_c *tc, struct bio *bio,
1515 struct dm_bio_prison_cell *cell)
Joe Thornbere49e5822012-07-27 15:08:16 +01001516{
1517 int r;
1518 int rw = bio_data_dir(bio);
1519 dm_block_t block = get_bio_block(tc, bio);
1520 struct dm_thin_lookup_result lookup_result;
1521
1522 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1523 switch (r) {
1524 case 0:
Joe Thornbera374bb22014-10-10 13:43:14 +01001525 if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) {
Mike Snitzer8c0f0e82013-12-05 15:47:24 -05001526 handle_unserviceable_bio(tc->pool, bio);
Joe Thornbera374bb22014-10-10 13:43:14 +01001527 if (cell)
1528 cell_defer_no_holder(tc, cell);
1529 } else {
Joe Thornbere8088072012-12-21 20:23:31 +00001530 inc_all_io_entry(tc->pool, bio);
Joe Thornbere49e5822012-07-27 15:08:16 +01001531 remap_and_issue(tc, bio, lookup_result.block);
Joe Thornbera374bb22014-10-10 13:43:14 +01001532 if (cell)
1533 inc_remap_and_issue_cell(tc, cell, lookup_result.block);
Joe Thornbere8088072012-12-21 20:23:31 +00001534 }
Joe Thornbere49e5822012-07-27 15:08:16 +01001535 break;
1536
1537 case -ENODATA:
Joe Thornbera374bb22014-10-10 13:43:14 +01001538 if (cell)
1539 cell_defer_no_holder(tc, cell);
Joe Thornbere49e5822012-07-27 15:08:16 +01001540 if (rw != READ) {
Mike Snitzer8c0f0e82013-12-05 15:47:24 -05001541 handle_unserviceable_bio(tc->pool, bio);
Joe Thornbere49e5822012-07-27 15:08:16 +01001542 break;
1543 }
1544
1545 if (tc->origin_dev) {
Joe Thornbere8088072012-12-21 20:23:31 +00001546 inc_all_io_entry(tc->pool, bio);
Joe Thornbere49e5822012-07-27 15:08:16 +01001547 remap_to_origin_and_issue(tc, bio);
1548 break;
1549 }
1550
1551 zero_fill_bio(bio);
1552 bio_endio(bio, 0);
1553 break;
1554
1555 default:
Mike Snitzerc3977412012-12-21 20:23:34 +00001556 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1557 __func__, r);
Joe Thornbera374bb22014-10-10 13:43:14 +01001558 if (cell)
1559 cell_defer_no_holder(tc, cell);
Joe Thornbere49e5822012-07-27 15:08:16 +01001560 bio_io_error(bio);
1561 break;
1562 }
1563}
1564
Joe Thornbera374bb22014-10-10 13:43:14 +01001565static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1566{
1567 __process_bio_read_only(tc, bio, NULL);
1568}
1569
1570static void process_cell_read_only(struct thin_c *tc, struct dm_bio_prison_cell *cell)
1571{
1572 __process_bio_read_only(tc, cell->holder, cell);
1573}
1574
Joe Thornber3e1a0692014-03-03 16:03:26 +00001575static void process_bio_success(struct thin_c *tc, struct bio *bio)
1576{
1577 bio_endio(bio, 0);
1578}
1579
Joe Thornbere49e5822012-07-27 15:08:16 +01001580static void process_bio_fail(struct thin_c *tc, struct bio *bio)
1581{
1582 bio_io_error(bio);
1583}
1584
Joe Thornbera374bb22014-10-10 13:43:14 +01001585static void process_cell_success(struct thin_c *tc, struct dm_bio_prison_cell *cell)
1586{
1587 cell_success(tc->pool, cell);
1588}
1589
1590static void process_cell_fail(struct thin_c *tc, struct dm_bio_prison_cell *cell)
1591{
1592 cell_error(tc->pool, cell);
1593}
1594
Joe Thornberac8c3f32013-05-10 14:37:21 +01001595/*
1596 * FIXME: should we also commit due to size of transaction, measured in
1597 * metadata blocks?
1598 */
Joe Thornber905e51b2012-03-28 18:41:27 +01001599static int need_commit_due_to_time(struct pool *pool)
1600{
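	/*
	 * The first comparison handles jiffies wrapping around
	 * last_commit_jiffies; otherwise we ask for a commit once
	 * COMMIT_PERIOD has elapsed since the last one.
	 */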
1601 return jiffies < pool->last_commit_jiffies ||
1602 jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
1603}
1604
Mike Snitzer67324ea2014-03-21 18:33:41 -04001605#define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node)
1606#define thin_bio(pbd) dm_bio_from_per_bio_data((pbd), sizeof(struct dm_thin_endio_hook))
1607
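/*
 * Deferred bios for a thin device are kept in an rb-tree sorted by
 * start sector, so they can be resubmitted in roughly ascending order
 * and give the data device a more sequential IO pattern.
 */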
1608static void __thin_bio_rb_add(struct thin_c *tc, struct bio *bio)
1609{
1610 struct rb_node **rbp, *parent;
1611 struct dm_thin_endio_hook *pbd;
1612 sector_t bi_sector = bio->bi_iter.bi_sector;
1613
1614 rbp = &tc->sort_bio_list.rb_node;
1615 parent = NULL;
1616 while (*rbp) {
1617 parent = *rbp;
1618 pbd = thin_pbd(parent);
1619
1620 if (bi_sector < thin_bio(pbd)->bi_iter.bi_sector)
1621 rbp = &(*rbp)->rb_left;
1622 else
1623 rbp = &(*rbp)->rb_right;
1624 }
1625
1626 pbd = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1627 rb_link_node(&pbd->rb_node, parent, rbp);
1628 rb_insert_color(&pbd->rb_node, &tc->sort_bio_list);
1629}
1630
1631static void __extract_sorted_bios(struct thin_c *tc)
1632{
1633 struct rb_node *node;
1634 struct dm_thin_endio_hook *pbd;
1635 struct bio *bio;
1636
1637 for (node = rb_first(&tc->sort_bio_list); node; node = rb_next(node)) {
1638 pbd = thin_pbd(node);
1639 bio = thin_bio(pbd);
1640
1641 bio_list_add(&tc->deferred_bio_list, bio);
1642 rb_erase(&pbd->rb_node, &tc->sort_bio_list);
1643 }
1644
1645 WARN_ON(!RB_EMPTY_ROOT(&tc->sort_bio_list));
1646}
1647
1648static void __sort_thin_deferred_bios(struct thin_c *tc)
1649{
1650 struct bio *bio;
1651 struct bio_list bios;
1652
1653 bio_list_init(&bios);
1654 bio_list_merge(&bios, &tc->deferred_bio_list);
1655 bio_list_init(&tc->deferred_bio_list);
1656
1657 /* Sort deferred_bio_list using rb-tree */
1658 while ((bio = bio_list_pop(&bios)))
1659 __thin_bio_rb_add(tc, bio);
1660
1661 /*
1662 * Transfer the sorted bios in sort_bio_list back to
1663 * deferred_bio_list to allow lockless submission of
1664 * all bios.
1665 */
1666 __extract_sorted_bios(tc);
1667}
1668
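/*
 * Drain one thin device's deferred bio list: sort it, then process
 * each bio under a blk plug, periodically updating the throttle and
 * issuing metadata prefetches.
 */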
Mike Snitzerc140e1c2014-03-20 21:17:14 -04001669static void process_thin_deferred_bios(struct thin_c *tc)
Joe Thornber991d9fa2011-10-31 20:21:18 +00001670{
Mike Snitzerc140e1c2014-03-20 21:17:14 -04001671 struct pool *pool = tc->pool;
Joe Thornber991d9fa2011-10-31 20:21:18 +00001672 unsigned long flags;
1673 struct bio *bio;
1674 struct bio_list bios;
Mike Snitzer67324ea2014-03-21 18:33:41 -04001675 struct blk_plug plug;
Joe Thornber8a01a6a2014-10-06 15:28:30 +01001676 unsigned count = 0;
Joe Thornber991d9fa2011-10-31 20:21:18 +00001677
Mike Snitzerc140e1c2014-03-20 21:17:14 -04001678 if (tc->requeue_mode) {
1679 requeue_bio_list(tc, &tc->deferred_bio_list);
1680 return;
1681 }
1682
Joe Thornber991d9fa2011-10-31 20:21:18 +00001683 bio_list_init(&bios);
1684
Mike Snitzerc140e1c2014-03-20 21:17:14 -04001685 spin_lock_irqsave(&tc->lock, flags);
Mike Snitzer67324ea2014-03-21 18:33:41 -04001686
1687 if (bio_list_empty(&tc->deferred_bio_list)) {
1688 spin_unlock_irqrestore(&tc->lock, flags);
1689 return;
1690 }
1691
1692 __sort_thin_deferred_bios(tc);
1693
Mike Snitzerc140e1c2014-03-20 21:17:14 -04001694 bio_list_merge(&bios, &tc->deferred_bio_list);
1695 bio_list_init(&tc->deferred_bio_list);
Mike Snitzer67324ea2014-03-21 18:33:41 -04001696
Mike Snitzerc140e1c2014-03-20 21:17:14 -04001697 spin_unlock_irqrestore(&tc->lock, flags);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001698
Mike Snitzer67324ea2014-03-21 18:33:41 -04001699 blk_start_plug(&plug);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001700 while ((bio = bio_list_pop(&bios))) {
Joe Thornber991d9fa2011-10-31 20:21:18 +00001701 /*
1702 * If we've got no free new_mapping structs, and processing
1703 * this bio might require one, we pause until there are some
1704 * prepared mappings to process.
1705 */
1706 if (ensure_next_mapping(pool)) {
Mike Snitzerc140e1c2014-03-20 21:17:14 -04001707 spin_lock_irqsave(&tc->lock, flags);
1708 bio_list_add(&tc->deferred_bio_list, bio);
1709 bio_list_merge(&tc->deferred_bio_list, &bios);
1710 spin_unlock_irqrestore(&tc->lock, flags);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001711 break;
1712 }
Joe Thornber104655f2012-03-28 18:41:28 +01001713
1714 if (bio->bi_rw & REQ_DISCARD)
Joe Thornbere49e5822012-07-27 15:08:16 +01001715 pool->process_discard(tc, bio);
Joe Thornber104655f2012-03-28 18:41:28 +01001716 else
Joe Thornbere49e5822012-07-27 15:08:16 +01001717 pool->process_bio(tc, bio);
Joe Thornber8a01a6a2014-10-06 15:28:30 +01001718
1719 if ((count++ & 127) == 0) {
Joe Thornber7d327fe2014-10-06 15:45:59 +01001720 throttle_work_update(&pool->throttle);
Joe Thornber8a01a6a2014-10-06 15:28:30 +01001721 dm_pool_issue_prefetches(pool->pmd);
1722 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00001723 }
Mike Snitzer67324ea2014-03-21 18:33:41 -04001724 blk_finish_plug(&plug);
Mike Snitzerc140e1c2014-03-20 21:17:14 -04001725}
1726
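/*
 * Same idea as process_thin_deferred_bios(), but for whole prison
 * cells that were deferred from the map path along with their holder
 * bio.
 */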
Joe Thornbera374bb22014-10-10 13:43:14 +01001727static void process_thin_deferred_cells(struct thin_c *tc)
1728{
1729 struct pool *pool = tc->pool;
1730 unsigned long flags;
1731 struct list_head cells;
1732 struct dm_bio_prison_cell *cell, *tmp;
1733
1734 INIT_LIST_HEAD(&cells);
1735
1736 spin_lock_irqsave(&tc->lock, flags);
1737 list_splice_init(&tc->deferred_cells, &cells);
1738 spin_unlock_irqrestore(&tc->lock, flags);
1739
1740 if (list_empty(&cells))
1741 return;
1742
1743 list_for_each_entry_safe(cell, tmp, &cells, user_list) {
1744 BUG_ON(!cell->holder);
1745
1746 /*
1747 * If we've got no free new_mapping structs, and processing
1748 * this bio might require one, we pause until there are some
1749 * prepared mappings to process.
1750 */
1751 if (ensure_next_mapping(pool)) {
1752 spin_lock_irqsave(&tc->lock, flags);
1753 list_add(&cell->user_list, &tc->deferred_cells);
1754 list_splice(&cells, &tc->deferred_cells);
1755 spin_unlock_irqrestore(&tc->lock, flags);
1756 break;
1757 }
1758
1759 if (cell->holder->bi_rw & REQ_DISCARD)
1760 pool->process_discard_cell(tc, cell);
1761 else
1762 pool->process_cell(tc, cell);
1763 }
1764}
1765
Joe Thornberb10ebd32014-04-08 11:29:01 +01001766static void thin_get(struct thin_c *tc);
1767static void thin_put(struct thin_c *tc);
1768
1769/*
1770 * We can't hold rcu_read_lock() around code that can block. So we
1771 * find a thin with the rcu lock held; bump a refcount; then drop
1772 * the lock.
1773 */
1774static struct thin_c *get_first_thin(struct pool *pool)
1775{
1776 struct thin_c *tc = NULL;
1777
1778 rcu_read_lock();
1779 if (!list_empty(&pool->active_thins)) {
1780 tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list);
1781 thin_get(tc);
1782 }
1783 rcu_read_unlock();
1784
1785 return tc;
1786}
1787
1788static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc)
1789{
1790 struct thin_c *old_tc = tc;
1791
1792 rcu_read_lock();
1793 list_for_each_entry_continue_rcu(tc, &pool->active_thins, list) {
1794 thin_get(tc);
1795 thin_put(old_tc);
1796 rcu_read_unlock();
1797 return tc;
1798 }
1799 thin_put(old_tc);
1800 rcu_read_unlock();
1801
1802 return NULL;
1803}
1804
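/*
 * Worker-side processing: walk every active thin device, then commit
 * the metadata (when needed) before releasing any deferred flush bios.
 */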
Mike Snitzerc140e1c2014-03-20 21:17:14 -04001805static void process_deferred_bios(struct pool *pool)
1806{
1807 unsigned long flags;
1808 struct bio *bio;
1809 struct bio_list bios;
1810 struct thin_c *tc;
1811
Joe Thornberb10ebd32014-04-08 11:29:01 +01001812 tc = get_first_thin(pool);
1813 while (tc) {
Joe Thornbera374bb22014-10-10 13:43:14 +01001814 process_thin_deferred_cells(tc);
Mike Snitzerc140e1c2014-03-20 21:17:14 -04001815 process_thin_deferred_bios(tc);
Joe Thornberb10ebd32014-04-08 11:29:01 +01001816 tc = get_next_thin(pool, tc);
1817 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00001818
1819 /*
1820 * If there are any deferred flush bios, we must commit
1821 * the metadata before issuing them.
1822 */
1823 bio_list_init(&bios);
1824 spin_lock_irqsave(&pool->lock, flags);
1825 bio_list_merge(&bios, &pool->deferred_flush_bios);
1826 bio_list_init(&pool->deferred_flush_bios);
1827 spin_unlock_irqrestore(&pool->lock, flags);
1828
Mike Snitzer4d1662a2014-02-06 06:08:56 -05001829 if (bio_list_empty(&bios) &&
1830 !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
Joe Thornber991d9fa2011-10-31 20:21:18 +00001831 return;
1832
Joe Thornber020cc3b2013-12-04 15:05:36 -05001833 if (commit(pool)) {
Joe Thornber991d9fa2011-10-31 20:21:18 +00001834 while ((bio = bio_list_pop(&bios)))
1835 bio_io_error(bio);
1836 return;
1837 }
Joe Thornber905e51b2012-03-28 18:41:27 +01001838 pool->last_commit_jiffies = jiffies;
Joe Thornber991d9fa2011-10-31 20:21:18 +00001839
1840 while ((bio = bio_list_pop(&bios)))
1841 generic_make_request(bio);
1842}
1843
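/*
 * The pool's single worker function: issue metadata prefetches,
 * finish prepared mappings and discards, then handle the deferred
 * bios, updating the throttle between phases.
 */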
1844static void do_worker(struct work_struct *ws)
1845{
1846 struct pool *pool = container_of(ws, struct pool, worker);
1847
Joe Thornber7d327fe2014-10-06 15:45:59 +01001848 throttle_work_start(&pool->throttle);
Joe Thornber8a01a6a2014-10-06 15:28:30 +01001849 dm_pool_issue_prefetches(pool->pmd);
Joe Thornber7d327fe2014-10-06 15:45:59 +01001850 throttle_work_update(&pool->throttle);
Joe Thornbere49e5822012-07-27 15:08:16 +01001851 process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
Joe Thornber7d327fe2014-10-06 15:45:59 +01001852 throttle_work_update(&pool->throttle);
Joe Thornbere49e5822012-07-27 15:08:16 +01001853 process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
Joe Thornber7d327fe2014-10-06 15:45:59 +01001854 throttle_work_update(&pool->throttle);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001855 process_deferred_bios(pool);
Joe Thornber7d327fe2014-10-06 15:45:59 +01001856 throttle_work_complete(&pool->throttle);
Joe Thornber991d9fa2011-10-31 20:21:18 +00001857}
1858
Joe Thornber905e51b2012-03-28 18:41:27 +01001859/*
1860 * We want to commit periodically so that not too much
1861 * unwritten data builds up.
1862 */
1863static void do_waker(struct work_struct *ws)
1864{
1865 struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
1866 wake_worker(pool);
1867 queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
1868}
1869
Joe Thornber85ad643b2014-05-09 15:59:38 +01001870/*
1871 * We're holding onto IO to allow userland time to react. After the
1872 * timeout either the pool will have been resized (and thus back in
1873 * PM_WRITE mode), or we degrade to PM_READ_ONLY and start erroring IO.
1874 */
1875static void do_no_space_timeout(struct work_struct *ws)
1876{
1877 struct pool *pool = container_of(to_delayed_work(ws), struct pool,
1878 no_space_timeout);
1879
1880 if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space)
1881 set_pool_mode(pool, PM_READ_ONLY);
1882}
1883
Joe Thornber991d9fa2011-10-31 20:21:18 +00001884/*----------------------------------------------------------------*/
1885
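/*
 * pool_work lets a caller run a function on the pool's workqueue and
 * wait for it to complete, so the operation is serialised with the
 * worker.  noflush_work builds on this to flip a thin device's
 * requeue_mode from inside the worker thread.
 */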
Joe Thornbere7a3e872014-05-13 16:14:14 -04001886struct pool_work {
Joe Thornber738211f2014-03-03 15:52:28 +00001887 struct work_struct worker;
Joe Thornbere7a3e872014-05-13 16:14:14 -04001888 struct completion complete;
Joe Thornber738211f2014-03-03 15:52:28 +00001889};
1890
Joe Thornbere7a3e872014-05-13 16:14:14 -04001891static struct pool_work *to_pool_work(struct work_struct *ws)
Joe Thornber738211f2014-03-03 15:52:28 +00001892{
Joe Thornbere7a3e872014-05-13 16:14:14 -04001893 return container_of(ws, struct pool_work, worker);
1894}
1895
1896static void pool_work_complete(struct pool_work *pw)
1897{
1898 complete(&pw->complete);
1899}
1900
1901static void pool_work_wait(struct pool_work *pw, struct pool *pool,
1902 void (*fn)(struct work_struct *))
1903{
1904 INIT_WORK_ONSTACK(&pw->worker, fn);
1905 init_completion(&pw->complete);
1906 queue_work(pool->wq, &pw->worker);
1907 wait_for_completion(&pw->complete);
1908}
1909
1910/*----------------------------------------------------------------*/
1911
1912struct noflush_work {
1913 struct pool_work pw;
1914 struct thin_c *tc;
1915};
1916
1917static struct noflush_work *to_noflush(struct work_struct *ws)
1918{
1919 return container_of(to_pool_work(ws), struct noflush_work, pw);
Joe Thornber738211f2014-03-03 15:52:28 +00001920}
1921
1922static void do_noflush_start(struct work_struct *ws)
1923{
Joe Thornbere7a3e872014-05-13 16:14:14 -04001924 struct noflush_work *w = to_noflush(ws);
Joe Thornber738211f2014-03-03 15:52:28 +00001925 w->tc->requeue_mode = true;
1926 requeue_io(w->tc);
Joe Thornbere7a3e872014-05-13 16:14:14 -04001927 pool_work_complete(&w->pw);
Joe Thornber738211f2014-03-03 15:52:28 +00001928}
1929
1930static void do_noflush_stop(struct work_struct *ws)
1931{
Joe Thornbere7a3e872014-05-13 16:14:14 -04001932 struct noflush_work *w = to_noflush(ws);
Joe Thornber738211f2014-03-03 15:52:28 +00001933 w->tc->requeue_mode = false;
Joe Thornbere7a3e872014-05-13 16:14:14 -04001934 pool_work_complete(&w->pw);
Joe Thornber738211f2014-03-03 15:52:28 +00001935}
1936
1937static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
1938{
1939 struct noflush_work w;
1940
Joe Thornber738211f2014-03-03 15:52:28 +00001941 w.tc = tc;
Joe Thornbere7a3e872014-05-13 16:14:14 -04001942 pool_work_wait(&w.pw, tc->pool, fn);
Joe Thornber738211f2014-03-03 15:52:28 +00001943}
1944
1945/*----------------------------------------------------------------*/
1946
Joe Thornbere49e5822012-07-27 15:08:16 +01001947static enum pool_mode get_pool_mode(struct pool *pool)
1948{
1949 return pool->pf.mode;
1950}
1951
Joe Thornber3e1a0692014-03-03 16:03:26 +00001952static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
1953{
1954 dm_table_event(pool->ti->table);
1955 DMINFO("%s: switching pool to %s mode",
1956 dm_device_name(pool->pool_md), new_mode);
1957}
1958
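/*
 * Central switch for the pool's operating mode.  Each mode installs a
 * matching set of process_* callbacks.  Transitions out of PM_FAIL,
 * or into PM_WRITE while the metadata still needs checking, are
 * refused.
 */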
Mike Snitzer8b64e882013-12-20 14:27:28 -05001959static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
Joe Thornbere49e5822012-07-27 15:08:16 +01001960{
Mike Snitzercdc2b412014-02-14 18:10:55 -05001961 struct pool_c *pt = pool->ti->private;
Mike Snitzer07f2b6e2014-02-14 11:58:41 -05001962 bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
1963 enum pool_mode old_mode = get_pool_mode(pool);
Mike Snitzer80c57892014-05-20 13:38:33 -04001964 unsigned long no_space_timeout = ACCESS_ONCE(no_space_timeout_secs) * HZ;
Mike Snitzer07f2b6e2014-02-14 11:58:41 -05001965
1966 /*
1967 * Never allow the pool to transition to PM_WRITE mode if user
1968 * intervention is required to verify metadata and data consistency.
1969 */
1970 if (new_mode == PM_WRITE && needs_check) {
1971 DMERR("%s: unable to switch pool to write mode until repaired.",
1972 dm_device_name(pool->pool_md));
1973 if (old_mode != new_mode)
1974 new_mode = old_mode;
1975 else
1976 new_mode = PM_READ_ONLY;
1977 }
1978 /*
1979 * If we were in PM_FAIL mode, rollback of metadata failed. We're
1980 * not going to recover without a thin_repair. So we never let the
1981 * pool move out of the old mode.
1982 */
1983 if (old_mode == PM_FAIL)
1984 new_mode = old_mode;
Joe Thornbere49e5822012-07-27 15:08:16 +01001985
Mike Snitzer8b64e882013-12-20 14:27:28 -05001986 switch (new_mode) {
Joe Thornbere49e5822012-07-27 15:08:16 +01001987 case PM_FAIL:
Mike Snitzer8b64e882013-12-20 14:27:28 -05001988 if (old_mode != new_mode)
Joe Thornber3e1a0692014-03-03 16:03:26 +00001989 notify_of_pool_mode_change(pool, "failure");
Joe Thornber5383ef32013-12-04 16:30:01 -05001990 dm_pool_metadata_read_only(pool->pmd);
Joe Thornbere49e5822012-07-27 15:08:16 +01001991 pool->process_bio = process_bio_fail;
1992 pool->process_discard = process_bio_fail;
Joe Thornbera374bb22014-10-10 13:43:14 +01001993 pool->process_cell = process_cell_fail;
1994 pool->process_discard_cell = process_cell_fail;
Joe Thornbere49e5822012-07-27 15:08:16 +01001995 pool->process_prepared_mapping = process_prepared_mapping_fail;
1996 pool->process_prepared_discard = process_prepared_discard_fail;
Joe Thornber3e1a0692014-03-03 16:03:26 +00001997
1998 error_retry_list(pool);
Joe Thornbere49e5822012-07-27 15:08:16 +01001999 break;
2000
2001 case PM_READ_ONLY:
Mike Snitzer8b64e882013-12-20 14:27:28 -05002002 if (old_mode != new_mode)
Joe Thornber3e1a0692014-03-03 16:03:26 +00002003 notify_of_pool_mode_change(pool, "read-only");
2004 dm_pool_metadata_read_only(pool->pmd);
2005 pool->process_bio = process_bio_read_only;
2006 pool->process_discard = process_bio_success;
Joe Thornbera374bb22014-10-10 13:43:14 +01002007 pool->process_cell = process_cell_read_only;
2008 pool->process_discard_cell = process_cell_success;
Joe Thornber3e1a0692014-03-03 16:03:26 +00002009 pool->process_prepared_mapping = process_prepared_mapping_fail;
2010 pool->process_prepared_discard = process_prepared_discard_passdown;
2011
2012 error_retry_list(pool);
2013 break;
2014
2015 case PM_OUT_OF_DATA_SPACE:
2016 /*
2017 * Ideally we'd never hit this state; the low water mark
2018 * would trigger userland to extend the pool before we
2019 * completely run out of data space. However, many small
2020 * IOs to unprovisioned space can consume data space at an
2021 * alarming rate. Adjust your low water mark if you're
2022 * frequently seeing this mode.
2023 */
2024 if (old_mode != new_mode)
2025 notify_of_pool_mode_change(pool, "out-of-data-space");
2026 pool->process_bio = process_bio_read_only;
Joe Thornbera374bb22014-10-10 13:43:14 +01002027 pool->process_discard = process_discard_bio;
2028 pool->process_cell = process_cell_read_only;
2029 pool->process_discard_cell = process_discard_cell;
Joe Thornber3e1a0692014-03-03 16:03:26 +00002030 pool->process_prepared_mapping = process_prepared_mapping;
2031 pool->process_prepared_discard = process_prepared_discard_passdown;
Joe Thornber85ad643b2014-05-09 15:59:38 +01002032
Mike Snitzer80c57892014-05-20 13:38:33 -04002033 if (!pool->pf.error_if_no_space && no_space_timeout)
2034 queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout);
Joe Thornbere49e5822012-07-27 15:08:16 +01002035 break;
2036
2037 case PM_WRITE:
Mike Snitzer8b64e882013-12-20 14:27:28 -05002038 if (old_mode != new_mode)
Joe Thornber3e1a0692014-03-03 16:03:26 +00002039 notify_of_pool_mode_change(pool, "write");
Joe Thornber9b7aaa62013-12-04 16:58:19 -05002040 dm_pool_metadata_read_write(pool->pmd);
Joe Thornbere49e5822012-07-27 15:08:16 +01002041 pool->process_bio = process_bio;
Joe Thornbera374bb22014-10-10 13:43:14 +01002042 pool->process_discard = process_discard_bio;
2043 pool->process_cell = process_cell;
2044 pool->process_discard_cell = process_discard_cell;
Joe Thornbere49e5822012-07-27 15:08:16 +01002045 pool->process_prepared_mapping = process_prepared_mapping;
2046 pool->process_prepared_discard = process_prepared_discard;
2047 break;
2048 }
Mike Snitzer8b64e882013-12-20 14:27:28 -05002049
2050 pool->pf.mode = new_mode;
Mike Snitzercdc2b412014-02-14 18:10:55 -05002051 /*
2052 * The pool mode may have changed, sync it so bind_control_target()
2053 * doesn't cause an unexpected mode transition on resume.
2054 */
2055 pt->adjusted_pf.mode = new_mode;
Joe Thornbere49e5822012-07-27 15:08:16 +01002056}
2057
Mike Snitzer07f2b6e2014-02-14 11:58:41 -05002058static void abort_transaction(struct pool *pool)
2059{
2060 const char *dev_name = dm_device_name(pool->pool_md);
2061
2062 DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
2063 if (dm_pool_abort_metadata(pool->pmd)) {
2064 DMERR("%s: failed to abort metadata transaction", dev_name);
2065 set_pool_mode(pool, PM_FAIL);
2066 }
2067
2068 if (dm_pool_metadata_set_needs_check(pool->pmd)) {
2069 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
2070 set_pool_mode(pool, PM_FAIL);
2071 }
2072}
2073
Joe Thornberb5330652013-12-04 19:51:33 -05002074static void metadata_operation_failed(struct pool *pool, const char *op, int r)
2075{
2076 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
2077 dm_device_name(pool->pool_md), op, r);
2078
Mike Snitzer07f2b6e2014-02-14 11:58:41 -05002079 abort_transaction(pool);
Joe Thornberb5330652013-12-04 19:51:33 -05002080 set_pool_mode(pool, PM_READ_ONLY);
2081}
2082
Joe Thornbere49e5822012-07-27 15:08:16 +01002083/*----------------------------------------------------------------*/
2084
Joe Thornber991d9fa2011-10-31 20:21:18 +00002085/*
2086 * Mapping functions.
2087 */
2088
2089/*
2090 * Called only while mapping a thin bio to hand it over to the workqueue.
2091 */
2092static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
2093{
2094 unsigned long flags;
2095 struct pool *pool = tc->pool;
2096
Mike Snitzerc140e1c2014-03-20 21:17:14 -04002097 spin_lock_irqsave(&tc->lock, flags);
2098 bio_list_add(&tc->deferred_bio_list, bio);
2099 spin_unlock_irqrestore(&tc->lock, flags);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002100
2101 wake_worker(pool);
2102}
2103
Joe Thornber7d327fe2014-10-06 15:45:59 +01002104static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio)
2105{
2106 struct pool *pool = tc->pool;
2107
2108 throttle_lock(&pool->throttle);
2109 thin_defer_bio(tc, bio);
2110 throttle_unlock(&pool->throttle);
2111}
2112
Joe Thornbera374bb22014-10-10 13:43:14 +01002113static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
2114{
2115 unsigned long flags;
2116 struct pool *pool = tc->pool;
2117
2118 throttle_lock(&pool->throttle);
2119 spin_lock_irqsave(&tc->lock, flags);
2120 list_add_tail(&cell->user_list, &tc->deferred_cells);
2121 spin_unlock_irqrestore(&tc->lock, flags);
2122 throttle_unlock(&pool->throttle);
2123
2124 wake_worker(pool);
2125}
2126
Mikulas Patocka59c3d2c2012-12-21 20:23:40 +00002127static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
Joe Thornbereb2aa482012-03-28 18:41:28 +01002128{
Mikulas Patocka59c3d2c2012-12-21 20:23:40 +00002129 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
Joe Thornbereb2aa482012-03-28 18:41:28 +01002130
2131 h->tc = tc;
2132 h->shared_read_entry = NULL;
Joe Thornbere8088072012-12-21 20:23:31 +00002133 h->all_io_entry = NULL;
Joe Thornbereb2aa482012-03-28 18:41:28 +01002134 h->overwrite_mapping = NULL;
Joe Thornbereb2aa482012-03-28 18:41:28 +01002135}
2136
Joe Thornber991d9fa2011-10-31 20:21:18 +00002137/*
2138 * Non-blocking function called from the thin target's map function.
2139 */
Mikulas Patocka7de3ee52012-12-21 20:23:41 +00002140static int thin_bio_map(struct dm_target *ti, struct bio *bio)
Joe Thornber991d9fa2011-10-31 20:21:18 +00002141{
2142 int r;
2143 struct thin_c *tc = ti->private;
2144 dm_block_t block = get_bio_block(tc, bio);
2145 struct dm_thin_device *td = tc->td;
2146 struct dm_thin_lookup_result result;
Joe Thornbera374bb22014-10-10 13:43:14 +01002147 struct dm_bio_prison_cell *virt_cell, *data_cell;
Joe Thornbere8088072012-12-21 20:23:31 +00002148 struct dm_cell_key key;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002149
Mikulas Patocka59c3d2c2012-12-21 20:23:40 +00002150 thin_hook_bio(tc, bio);
Joe Thornbere49e5822012-07-27 15:08:16 +01002151
Joe Thornber738211f2014-03-03 15:52:28 +00002152 if (tc->requeue_mode) {
2153 bio_endio(bio, DM_ENDIO_REQUEUE);
2154 return DM_MAPIO_SUBMITTED;
2155 }
2156
Joe Thornbere49e5822012-07-27 15:08:16 +01002157 if (get_pool_mode(tc->pool) == PM_FAIL) {
2158 bio_io_error(bio);
2159 return DM_MAPIO_SUBMITTED;
2160 }
2161
Joe Thornber104655f2012-03-28 18:41:28 +01002162 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
Joe Thornber7d327fe2014-10-06 15:45:59 +01002163 thin_defer_bio_with_throttle(tc, bio);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002164 return DM_MAPIO_SUBMITTED;
2165 }
2166
Joe Thornberc822ed92014-10-10 09:41:09 +01002167 /*
2168 * We must hold the virtual cell before doing the lookup, otherwise
2169 * there's a race with discard.
2170 */
2171 build_virtual_key(tc->td, block, &key);
Joe Thornbera374bb22014-10-10 13:43:14 +01002172 if (bio_detain(tc->pool, &key, bio, &virt_cell))
Joe Thornberc822ed92014-10-10 09:41:09 +01002173 return DM_MAPIO_SUBMITTED;
2174
Joe Thornber991d9fa2011-10-31 20:21:18 +00002175 r = dm_thin_find_block(td, block, 0, &result);
2176
2177 /*
2178 * Note that we defer readahead too.
2179 */
2180 switch (r) {
2181 case 0:
2182 if (unlikely(result.shared)) {
2183 /*
2184 * We have a race condition here between the
2185 * result.shared value returned by the lookup and
2186 * snapshot creation, which may cause new
2187 * sharing.
2188 *
2189 * To avoid this, always quiesce the origin before
2190 * taking the snap. You want to do this anyway to
2191 * ensure a consistent application view
2192 * (i.e. lockfs).
2193 *
2194 * More distant ancestors are irrelevant. The
2195 * shared flag will be set in their case.
2196 */
Joe Thornbera374bb22014-10-10 13:43:14 +01002197 thin_defer_cell(tc, virt_cell);
Joe Thornbere8088072012-12-21 20:23:31 +00002198 return DM_MAPIO_SUBMITTED;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002199 }
Joe Thornbere8088072012-12-21 20:23:31 +00002200
Joe Thornbere8088072012-12-21 20:23:31 +00002201 build_data_key(tc->td, result.block, &key);
Joe Thornbera374bb22014-10-10 13:43:14 +01002202 if (bio_detain(tc->pool, &key, bio, &data_cell)) {
2203 cell_defer_no_holder(tc, virt_cell);
Joe Thornbere8088072012-12-21 20:23:31 +00002204 return DM_MAPIO_SUBMITTED;
2205 }
2206
2207 inc_all_io_entry(tc->pool, bio);
Joe Thornbera374bb22014-10-10 13:43:14 +01002208 cell_defer_no_holder(tc, data_cell);
2209 cell_defer_no_holder(tc, virt_cell);
Joe Thornbere8088072012-12-21 20:23:31 +00002210
2211 remap(tc, bio, result.block);
2212 return DM_MAPIO_REMAPPED;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002213
2214 case -ENODATA:
Joe Thornbere49e5822012-07-27 15:08:16 +01002215 if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
2216 /*
2217 * This block isn't provisioned, and we have no way
Mike Snitzer8c0f0e82013-12-05 15:47:24 -05002218 * of doing so.
Joe Thornbere49e5822012-07-27 15:08:16 +01002219 */
Mike Snitzer8c0f0e82013-12-05 15:47:24 -05002220 handle_unserviceable_bio(tc->pool, bio);
Joe Thornbera374bb22014-10-10 13:43:14 +01002221 cell_defer_no_holder(tc, virt_cell);
Joe Thornber2aab3852012-12-21 20:23:33 +00002222 return DM_MAPIO_SUBMITTED;
Joe Thornbere49e5822012-07-27 15:08:16 +01002223 }
2224 /* fall through */
2225
2226 case -EWOULDBLOCK:
Joe Thornbera374bb22014-10-10 13:43:14 +01002227 thin_defer_cell(tc, virt_cell);
Joe Thornber2aab3852012-12-21 20:23:33 +00002228 return DM_MAPIO_SUBMITTED;
Joe Thornbere49e5822012-07-27 15:08:16 +01002229
2230 default:
2231 /*
2232 * Must always call bio_io_error on failure.
2233 * dm_thin_find_block can fail with -EINVAL if the
2234 * pool is switched to fail-io mode.
2235 */
2236 bio_io_error(bio);
Joe Thornbera374bb22014-10-10 13:43:14 +01002237 cell_defer_no_holder(tc, virt_cell);
Joe Thornber2aab3852012-12-21 20:23:33 +00002238 return DM_MAPIO_SUBMITTED;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002239 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00002240}
2241
2242static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
2243{
Joe Thornber991d9fa2011-10-31 20:21:18 +00002244 struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
Mike Snitzer760fe672014-03-20 08:36:47 -04002245 struct request_queue *q;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002246
Mike Snitzer760fe672014-03-20 08:36:47 -04002247 if (get_pool_mode(pt->pool) == PM_OUT_OF_DATA_SPACE)
2248 return 1;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002249
Mike Snitzer760fe672014-03-20 08:36:47 -04002250 q = bdev_get_queue(pt->data_dev->bdev);
2251 return bdi_congested(&q->backing_dev_info, bdi_bits);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002252}
2253
Mike Snitzerc140e1c2014-03-20 21:17:14 -04002254static void requeue_bios(struct pool *pool)
Joe Thornber991d9fa2011-10-31 20:21:18 +00002255{
Mike Snitzerc140e1c2014-03-20 21:17:14 -04002256 unsigned long flags;
2257 struct thin_c *tc;
2258
2259 rcu_read_lock();
2260 list_for_each_entry_rcu(tc, &pool->active_thins, list) {
2261 spin_lock_irqsave(&tc->lock, flags);
2262 bio_list_merge(&tc->deferred_bio_list, &tc->retry_on_resume_list);
2263 bio_list_init(&tc->retry_on_resume_list);
2264 spin_unlock_irqrestore(&tc->lock, flags);
2265 }
2266 rcu_read_unlock();
Joe Thornber991d9fa2011-10-31 20:21:18 +00002267}
2268
2269/*----------------------------------------------------------------
2270 * Binding of control targets to a pool object
2271 *--------------------------------------------------------------*/
Mike Snitzer9bc142d2012-09-26 23:45:46 +01002272static bool data_dev_supports_discard(struct pool_c *pt)
2273{
2274 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
2275
2276 return q && blk_queue_discard(q);
2277}
2278
Joe Thornber58051b92013-03-20 17:21:25 +00002279static bool is_factor(sector_t block_size, uint32_t n)
2280{
2281 return !sector_div(block_size, n);
2282}
2283
Mike Snitzer9bc142d2012-09-26 23:45:46 +01002284/*
2285 * If discard_passdown was enabled verify that the data device
Mike Snitzer0424caa2012-09-26 23:45:47 +01002286 * supports discards. Disable discard_passdown if not.
Mike Snitzer9bc142d2012-09-26 23:45:46 +01002287 */
Mike Snitzer0424caa2012-09-26 23:45:47 +01002288static void disable_passdown_if_not_supported(struct pool_c *pt)
Mike Snitzer9bc142d2012-09-26 23:45:46 +01002289{
Mike Snitzer0424caa2012-09-26 23:45:47 +01002290 struct pool *pool = pt->pool;
2291 struct block_device *data_bdev = pt->data_dev->bdev;
2292 struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
2293 sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
2294 const char *reason = NULL;
Mike Snitzer9bc142d2012-09-26 23:45:46 +01002295 char buf[BDEVNAME_SIZE];
2296
Mike Snitzer0424caa2012-09-26 23:45:47 +01002297 if (!pt->adjusted_pf.discard_passdown)
Mike Snitzer9bc142d2012-09-26 23:45:46 +01002298 return;
2299
Mike Snitzer0424caa2012-09-26 23:45:47 +01002300 if (!data_dev_supports_discard(pt))
2301 reason = "discard unsupported";
Mike Snitzer9bc142d2012-09-26 23:45:46 +01002302
Mike Snitzer0424caa2012-09-26 23:45:47 +01002303 else if (data_limits->max_discard_sectors < pool->sectors_per_block)
2304 reason = "max discard sectors smaller than a block";
2305
2306 else if (data_limits->discard_granularity > block_size)
2307 reason = "discard granularity larger than a block";
2308
Joe Thornber58051b92013-03-20 17:21:25 +00002309 else if (!is_factor(block_size, data_limits->discard_granularity))
Mike Snitzer0424caa2012-09-26 23:45:47 +01002310 reason = "discard granularity not a factor of block size";
2311
2312 if (reason) {
2313 DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
2314 pt->adjusted_pf.discard_passdown = false;
2315 }
Mike Snitzer9bc142d2012-09-26 23:45:46 +01002316}
2317
Joe Thornber991d9fa2011-10-31 20:21:18 +00002318static int bind_control_target(struct pool *pool, struct dm_target *ti)
2319{
2320 struct pool_c *pt = ti->private;
2321
Joe Thornbere49e5822012-07-27 15:08:16 +01002322 /*
Joe Thornber9b7aaa62013-12-04 16:58:19 -05002323 * We want to make sure that a pool in PM_FAIL mode is never upgraded.
Joe Thornbere49e5822012-07-27 15:08:16 +01002324 */
Mike Snitzer07f2b6e2014-02-14 11:58:41 -05002325 enum pool_mode old_mode = get_pool_mode(pool);
Mike Snitzer0424caa2012-09-26 23:45:47 +01002326 enum pool_mode new_mode = pt->adjusted_pf.mode;
Joe Thornbere49e5822012-07-27 15:08:16 +01002327
Joe Thornber9b7aaa62013-12-04 16:58:19 -05002328 /*
Mike Snitzer8b64e882013-12-20 14:27:28 -05002329 * Don't change the pool's mode until set_pool_mode() below.
2330 * Otherwise the pool's process_* function pointers may
2331 * not match the desired pool mode.
2332 */
2333 pt->adjusted_pf.mode = old_mode;
2334
2335 pool->ti = ti;
2336 pool->pf = pt->adjusted_pf;
2337 pool->low_water_blocks = pt->low_water_blocks;
2338
Mike Snitzer9bc142d2012-09-26 23:45:46 +01002339 set_pool_mode(pool, new_mode);
Mike Snitzerf4026932012-05-19 01:01:01 +01002340
Joe Thornber991d9fa2011-10-31 20:21:18 +00002341 return 0;
2342}
2343
2344static void unbind_control_target(struct pool *pool, struct dm_target *ti)
2345{
2346 if (pool->ti == ti)
2347 pool->ti = NULL;
2348}
2349
2350/*----------------------------------------------------------------
2351 * Pool creation
2352 *--------------------------------------------------------------*/
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002353/* Initialize pool features. */
2354static void pool_features_init(struct pool_features *pf)
2355{
Joe Thornbere49e5822012-07-27 15:08:16 +01002356 pf->mode = PM_WRITE;
Mike Snitzer9bc142d2012-09-26 23:45:46 +01002357 pf->zero_new_blocks = true;
2358 pf->discard_enabled = true;
2359 pf->discard_passdown = true;
Mike Snitzer787a996c2013-12-06 16:21:43 -05002360 pf->error_if_no_space = false;
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002361}
2362
Joe Thornber991d9fa2011-10-31 20:21:18 +00002363static void __pool_destroy(struct pool *pool)
2364{
2365 __pool_table_remove(pool);
2366
2367 if (dm_pool_metadata_close(pool->pmd) < 0)
2368 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
2369
Mike Snitzer44feb382012-10-12 21:02:10 +01002370 dm_bio_prison_destroy(pool->prison);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002371 dm_kcopyd_client_destroy(pool->copier);
2372
2373 if (pool->wq)
2374 destroy_workqueue(pool->wq);
2375
2376 if (pool->next_mapping)
2377 mempool_free(pool->next_mapping, pool->mapping_pool);
2378 mempool_destroy(pool->mapping_pool);
Mike Snitzer44feb382012-10-12 21:02:10 +01002379 dm_deferred_set_destroy(pool->shared_read_ds);
2380 dm_deferred_set_destroy(pool->all_io_ds);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002381 kfree(pool);
2382}
2383
Mike Snitzera24c2562012-06-03 00:30:00 +01002384static struct kmem_cache *_new_mapping_cache;
Mike Snitzera24c2562012-06-03 00:30:00 +01002385
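/*
 * Build a brand new pool object: open the metadata, then create the
 * bio prison, kcopyd client, ordered workqueue, deferred sets and
 * mapping mempool, unwinding everything on failure.
 */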
Joe Thornber991d9fa2011-10-31 20:21:18 +00002386static struct pool *pool_create(struct mapped_device *pool_md,
2387 struct block_device *metadata_dev,
Joe Thornbere49e5822012-07-27 15:08:16 +01002388 unsigned long block_size,
2389 int read_only, char **error)
Joe Thornber991d9fa2011-10-31 20:21:18 +00002390{
2391 int r;
2392 void *err_p;
2393 struct pool *pool;
2394 struct dm_pool_metadata *pmd;
Joe Thornbere49e5822012-07-27 15:08:16 +01002395 bool format_device = read_only ? false : true;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002396
Joe Thornbere49e5822012-07-27 15:08:16 +01002397 pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002398 if (IS_ERR(pmd)) {
2399 *error = "Error creating metadata object";
2400 return (struct pool *)pmd;
2401 }
2402
2403 pool = kmalloc(sizeof(*pool), GFP_KERNEL);
2404 if (!pool) {
2405 *error = "Error allocating memory for pool";
2406 err_p = ERR_PTR(-ENOMEM);
2407 goto bad_pool;
2408 }
2409
2410 pool->pmd = pmd;
2411 pool->sectors_per_block = block_size;
Mikulas Patockaf9a8e0c2012-07-27 15:08:03 +01002412 if (block_size & (block_size - 1))
2413 pool->sectors_per_block_shift = -1;
2414 else
2415 pool->sectors_per_block_shift = __ffs(block_size);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002416 pool->low_water_blocks = 0;
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002417 pool_features_init(&pool->pf);
Joe Thornbera195db22014-10-06 16:30:06 -04002418 pool->prison = dm_bio_prison_create();
Joe Thornber991d9fa2011-10-31 20:21:18 +00002419 if (!pool->prison) {
2420 *error = "Error creating pool's bio prison";
2421 err_p = ERR_PTR(-ENOMEM);
2422 goto bad_prison;
2423 }
2424
Mikulas Patockadf5d2e92013-03-01 22:45:49 +00002425 pool->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002426 if (IS_ERR(pool->copier)) {
2427 r = PTR_ERR(pool->copier);
2428 *error = "Error creating pool's kcopyd client";
2429 err_p = ERR_PTR(r);
2430 goto bad_kcopyd_client;
2431 }
2432
2433 /*
2434 * Create a single-threaded workqueue that will service all devices
2435 * that use this metadata.
2436 */
2437 pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
2438 if (!pool->wq) {
2439 *error = "Error creating pool's workqueue";
2440 err_p = ERR_PTR(-ENOMEM);
2441 goto bad_wq;
2442 }
2443
Joe Thornber7d327fe2014-10-06 15:45:59 +01002444 throttle_init(&pool->throttle);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002445 INIT_WORK(&pool->worker, do_worker);
Joe Thornber905e51b2012-03-28 18:41:27 +01002446 INIT_DELAYED_WORK(&pool->waker, do_waker);
Joe Thornber85ad643b2014-05-09 15:59:38 +01002447 INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002448 spin_lock_init(&pool->lock);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002449 bio_list_init(&pool->deferred_flush_bios);
2450 INIT_LIST_HEAD(&pool->prepared_mappings);
Joe Thornber104655f2012-03-28 18:41:28 +01002451 INIT_LIST_HEAD(&pool->prepared_discards);
Mike Snitzerc140e1c2014-03-20 21:17:14 -04002452 INIT_LIST_HEAD(&pool->active_thins);
Joe Thornber88a66212013-12-04 20:16:12 -05002453 pool->low_water_triggered = false;
Mike Snitzer44feb382012-10-12 21:02:10 +01002454
2455 pool->shared_read_ds = dm_deferred_set_create();
2456 if (!pool->shared_read_ds) {
2457 *error = "Error creating pool's shared read deferred set";
2458 err_p = ERR_PTR(-ENOMEM);
2459 goto bad_shared_read_ds;
2460 }
2461
2462 pool->all_io_ds = dm_deferred_set_create();
2463 if (!pool->all_io_ds) {
2464 *error = "Error creating pool's all io deferred set";
2465 err_p = ERR_PTR(-ENOMEM);
2466 goto bad_all_io_ds;
2467 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00002468
2469 pool->next_mapping = NULL;
Mike Snitzera24c2562012-06-03 00:30:00 +01002470 pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE,
2471 _new_mapping_cache);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002472 if (!pool->mapping_pool) {
2473 *error = "Error creating pool's mapping mempool";
2474 err_p = ERR_PTR(-ENOMEM);
2475 goto bad_mapping_pool;
2476 }
2477
Joe Thornber991d9fa2011-10-31 20:21:18 +00002478 pool->ref_count = 1;
Joe Thornber905e51b2012-03-28 18:41:27 +01002479 pool->last_commit_jiffies = jiffies;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002480 pool->pool_md = pool_md;
2481 pool->md_dev = metadata_dev;
2482 __pool_table_insert(pool);
2483
2484 return pool;
2485
Joe Thornber991d9fa2011-10-31 20:21:18 +00002486bad_mapping_pool:
Mike Snitzer44feb382012-10-12 21:02:10 +01002487 dm_deferred_set_destroy(pool->all_io_ds);
2488bad_all_io_ds:
2489 dm_deferred_set_destroy(pool->shared_read_ds);
2490bad_shared_read_ds:
Joe Thornber991d9fa2011-10-31 20:21:18 +00002491 destroy_workqueue(pool->wq);
2492bad_wq:
2493 dm_kcopyd_client_destroy(pool->copier);
2494bad_kcopyd_client:
Mike Snitzer44feb382012-10-12 21:02:10 +01002495 dm_bio_prison_destroy(pool->prison);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002496bad_prison:
2497 kfree(pool);
2498bad_pool:
2499 if (dm_pool_metadata_close(pmd))
2500 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
2501
2502 return err_p;
2503}
2504
2505static void __pool_inc(struct pool *pool)
2506{
2507 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
2508 pool->ref_count++;
2509}
2510
2511static void __pool_dec(struct pool *pool)
2512{
2513 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
2514 BUG_ON(!pool->ref_count);
2515 if (!--pool->ref_count)
2516 __pool_destroy(pool);
2517}
2518
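/*
 * Find an existing pool by metadata device (the normal table reload
 * case) or by pool_md, creating a new one only if neither lookup
 * succeeds.  This also stops two different pools claiming the same
 * metadata device.
 */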
2519static struct pool *__pool_find(struct mapped_device *pool_md,
2520 struct block_device *metadata_dev,
Joe Thornbere49e5822012-07-27 15:08:16 +01002521 unsigned long block_size, int read_only,
2522 char **error, int *created)
Joe Thornber991d9fa2011-10-31 20:21:18 +00002523{
2524 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
2525
2526 if (pool) {
Mike Snitzerf09996c2012-07-27 15:07:59 +01002527 if (pool->pool_md != pool_md) {
2528 *error = "metadata device already in use by a pool";
Joe Thornber991d9fa2011-10-31 20:21:18 +00002529 return ERR_PTR(-EBUSY);
Mike Snitzerf09996c2012-07-27 15:07:59 +01002530 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00002531 __pool_inc(pool);
2532
2533 } else {
2534 pool = __pool_table_lookup(pool_md);
2535 if (pool) {
Mike Snitzerf09996c2012-07-27 15:07:59 +01002536 if (pool->md_dev != metadata_dev) {
2537 *error = "different pool cannot replace a pool";
Joe Thornber991d9fa2011-10-31 20:21:18 +00002538 return ERR_PTR(-EINVAL);
Mike Snitzerf09996c2012-07-27 15:07:59 +01002539 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00002540 __pool_inc(pool);
2541
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002542 } else {
Joe Thornbere49e5822012-07-27 15:08:16 +01002543 pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002544 *created = 1;
2545 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00002546 }
2547
2548 return pool;
2549}
2550
2551/*----------------------------------------------------------------
2552 * Pool target methods
2553 *--------------------------------------------------------------*/
2554static void pool_dtr(struct dm_target *ti)
2555{
2556 struct pool_c *pt = ti->private;
2557
2558 mutex_lock(&dm_thin_pool_table.mutex);
2559
2560 unbind_control_target(pt->pool, ti);
2561 __pool_dec(pt->pool);
2562 dm_put_device(ti, pt->metadata_dev);
2563 dm_put_device(ti, pt->data_dev);
2564 kfree(pt);
2565
2566 mutex_unlock(&dm_thin_pool_table.mutex);
2567}
2568
Joe Thornber991d9fa2011-10-31 20:21:18 +00002569static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
2570 struct dm_target *ti)
2571{
2572 int r;
2573 unsigned argc;
2574 const char *arg_name;
2575
2576 static struct dm_arg _args[] = {
Mike Snitzer74aa45c2014-01-15 19:07:58 -05002577 {0, 4, "Invalid number of pool feature arguments"},
Joe Thornber991d9fa2011-10-31 20:21:18 +00002578 };
2579
2580 /*
2581 * No feature arguments supplied.
2582 */
2583 if (!as->argc)
2584 return 0;
2585
2586 r = dm_read_arg_group(_args, as, &argc, &ti->error);
2587 if (r)
2588 return -EINVAL;
2589
2590 while (argc && !r) {
2591 arg_name = dm_shift_arg(as);
2592 argc--;
2593
Joe Thornbere49e5822012-07-27 15:08:16 +01002594 if (!strcasecmp(arg_name, "skip_block_zeroing"))
Mike Snitzer9bc142d2012-09-26 23:45:46 +01002595 pf->zero_new_blocks = false;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002596
Joe Thornbere49e5822012-07-27 15:08:16 +01002597 else if (!strcasecmp(arg_name, "ignore_discard"))
Mike Snitzer9bc142d2012-09-26 23:45:46 +01002598 pf->discard_enabled = false;
Joe Thornbere49e5822012-07-27 15:08:16 +01002599
2600 else if (!strcasecmp(arg_name, "no_discard_passdown"))
Mike Snitzer9bc142d2012-09-26 23:45:46 +01002601 pf->discard_passdown = false;
Joe Thornbere49e5822012-07-27 15:08:16 +01002602
2603 else if (!strcasecmp(arg_name, "read_only"))
2604 pf->mode = PM_READ_ONLY;
2605
Mike Snitzer787a996c2013-12-06 16:21:43 -05002606 else if (!strcasecmp(arg_name, "error_if_no_space"))
2607 pf->error_if_no_space = true;
2608
Joe Thornbere49e5822012-07-27 15:08:16 +01002609 else {
2610 ti->error = "Unrecognised pool feature requested";
2611 r = -EINVAL;
2612 break;
2613 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00002614 }
2615
2616 return r;
2617}
2618
Joe Thornberac8c3f32013-05-10 14:37:21 +01002619static void metadata_low_callback(void *context)
2620{
2621 struct pool *pool = context;
2622
2623 DMWARN("%s: reached low water mark for metadata device: sending event.",
2624 dm_device_name(pool->pool_md));
2625
2626 dm_table_event(pool->ti->table);
2627}
2628
Mike Snitzer7d489352014-02-12 23:58:15 -05002629static sector_t get_dev_size(struct block_device *bdev)
Joe Thornberb17446d2013-05-10 14:37:18 +01002630{
Mike Snitzer7d489352014-02-12 23:58:15 -05002631 return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
2632}
2633
2634static void warn_if_metadata_device_too_big(struct block_device *bdev)
2635{
2636 sector_t metadata_dev_size = get_dev_size(bdev);
Joe Thornberb17446d2013-05-10 14:37:18 +01002637 char buffer[BDEVNAME_SIZE];
2638
Mike Snitzer7d489352014-02-12 23:58:15 -05002639 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
Joe Thornberb17446d2013-05-10 14:37:18 +01002640 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
2641 bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS);
Mike Snitzer7d489352014-02-12 23:58:15 -05002642}
2643
2644static sector_t get_metadata_dev_size(struct block_device *bdev)
2645{
2646 sector_t metadata_dev_size = get_dev_size(bdev);
2647
2648 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS)
2649 metadata_dev_size = THIN_METADATA_MAX_SECTORS;
Joe Thornberb17446d2013-05-10 14:37:18 +01002650
2651 return metadata_dev_size;
2652}
2653
Joe Thornber24347e92013-05-10 14:37:19 +01002654static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev)
2655{
2656 sector_t metadata_dev_size = get_metadata_dev_size(bdev);
2657
Mike Snitzer7d489352014-02-12 23:58:15 -05002658 sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE);
Joe Thornber24347e92013-05-10 14:37:19 +01002659
2660 return metadata_dev_size;
2661}
2662
Joe Thornber991d9fa2011-10-31 20:21:18 +00002663/*
Joe Thornberac8c3f32013-05-10 14:37:21 +01002664 * When a metadata threshold is crossed a dm event is triggered, and
2665 * userland should respond by growing the metadata device. We could let
2666 * userland set the threshold, like we do with the data threshold, but I'm
2667 * not sure they know enough to do this well.
2668 */
2669static dm_block_t calc_metadata_threshold(struct pool_c *pt)
2670{
2671 /*
2672 * 4M is ample for all ops with the possible exception of thin
2673 * device deletion which is harmless if it fails (just retry the
2674 * delete after you've grown the device).
2675 */
2676 dm_block_t quarter = get_metadata_dev_size_in_blocks(pt->metadata_dev->bdev) / 4;
2677 return min((dm_block_t)1024ULL /* 4M */, quarter);
2678}
2679
2680/*
Joe Thornber991d9fa2011-10-31 20:21:18 +00002681 * thin-pool <metadata dev> <data dev>
2682 * <data block size (sectors)>
2683 * <low water mark (blocks)>
2684 * [<#feature args> [<arg>]*]
2685 *
2686 * Optional feature arguments are:
2687 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002688 * ignore_discard: disable discard
2689 * no_discard_passdown: don't pass discards down to the data device
Mike Snitzer787a996c2013-12-06 16:21:43 -05002690 * read_only: don't allow any changes to be made to the pool metadata.
2691 * error_if_no_space: error IOs, instead of queueing, if no space.
Joe Thornber991d9fa2011-10-31 20:21:18 +00002692 */
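/*
 * Example table line (device paths and sizes are hypothetical, shown only
 * to illustrate the argument order documented above):
 *
 *   dmsetup create pool --table \
 *     "0 20971520 thin-pool /dev/sdb1 /dev/sdb2 128 32768 1 skip_block_zeroing"
 *
 * i.e. a 10GiB pool using /dev/sdb1 for metadata and /dev/sdb2 for data,
 * 64KiB data blocks (128 sectors), a low water mark of 32768 blocks and a
 * single feature argument.
 */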
2693static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
2694{
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002695 int r, pool_created = 0;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002696 struct pool_c *pt;
2697 struct pool *pool;
2698 struct pool_features pf;
2699 struct dm_arg_set as;
2700 struct dm_dev *data_dev;
2701 unsigned long block_size;
2702 dm_block_t low_water_blocks;
2703 struct dm_dev *metadata_dev;
Joe Thornber5d0db962013-05-10 14:37:19 +01002704 fmode_t metadata_mode;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002705
2706 /*
2707 * FIXME Remove validation from scope of lock.
2708 */
2709 mutex_lock(&dm_thin_pool_table.mutex);
2710
2711 if (argc < 4) {
2712 ti->error = "Invalid argument count";
2713 r = -EINVAL;
2714 goto out_unlock;
2715 }
Joe Thornber5d0db962013-05-10 14:37:19 +01002716
Joe Thornber991d9fa2011-10-31 20:21:18 +00002717 as.argc = argc;
2718 as.argv = argv;
2719
Joe Thornber5d0db962013-05-10 14:37:19 +01002720 /*
2721 * Set default pool features.
2722 */
2723 pool_features_init(&pf);
2724
2725 dm_consume_args(&as, 4);
2726 r = parse_pool_features(&as, &pf, ti);
2727 if (r)
2728 goto out_unlock;
2729
2730 metadata_mode = FMODE_READ | ((pf.mode == PM_READ_ONLY) ? 0 : FMODE_WRITE);
2731 r = dm_get_device(ti, argv[0], metadata_mode, &metadata_dev);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002732 if (r) {
2733 ti->error = "Error opening metadata block device";
2734 goto out_unlock;
2735 }
Mike Snitzer7d489352014-02-12 23:58:15 -05002736 warn_if_metadata_device_too_big(metadata_dev->bdev);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002737
2738 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
2739 if (r) {
2740 ti->error = "Error getting data device";
2741 goto out_metadata;
2742 }
2743
2744 if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
2745 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
2746 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
Mike Snitzer55f2b8b2012-07-27 15:08:02 +01002747 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
Joe Thornber991d9fa2011-10-31 20:21:18 +00002748 ti->error = "Invalid block size";
2749 r = -EINVAL;
2750 goto out;
2751 }
2752
2753 if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
2754 ti->error = "Invalid low water mark";
2755 r = -EINVAL;
2756 goto out;
2757 }
2758
Joe Thornber991d9fa2011-10-31 20:21:18 +00002759 pt = kzalloc(sizeof(*pt), GFP_KERNEL);
2760 if (!pt) {
2761 r = -ENOMEM;
2762 goto out;
2763 }
2764
2765 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
Joe Thornbere49e5822012-07-27 15:08:16 +01002766 block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002767 if (IS_ERR(pool)) {
2768 r = PTR_ERR(pool);
2769 goto out_free_pt;
2770 }
2771
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002772 /*
2773 * 'pool_created' reflects whether this is the first table load.
2774 * Top level discard support is not allowed to be changed after
2775 * initial load. This would require a pool reload to trigger thin
2776 * device changes.
2777 */
2778 if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
2779 ti->error = "Discard support cannot be disabled once enabled";
2780 r = -EINVAL;
2781 goto out_flags_changed;
2782 }
2783
Joe Thornber991d9fa2011-10-31 20:21:18 +00002784 pt->pool = pool;
2785 pt->ti = ti;
2786 pt->metadata_dev = metadata_dev;
2787 pt->data_dev = data_dev;
2788 pt->low_water_blocks = low_water_blocks;
Mike Snitzer0424caa2012-09-26 23:45:47 +01002789 pt->adjusted_pf = pt->requested_pf = pf;
Alasdair G Kergon55a62ee2013-03-01 22:45:47 +00002790 ti->num_flush_bios = 1;
Mike Snitzer9bc142d2012-09-26 23:45:46 +01002791
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002792 /*
2793 * Only need to enable discards if the pool should pass
2794 * them down to the data device. The thin device's discard
2795 * processing will cause mappings to be removed from the btree.
2796 */
Mike Snitzerb60ab992013-09-19 18:49:11 -04002797 ti->discard_zeroes_data_unsupported = true;
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002798 if (pf.discard_enabled && pf.discard_passdown) {
Alasdair G Kergon55a62ee2013-03-01 22:45:47 +00002799 ti->num_discard_bios = 1;
Mike Snitzer9bc142d2012-09-26 23:45:46 +01002800
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002801 /*
2802 * Setting 'discards_supported' circumvents the normal
2803 * stacking of discard limits (this keeps the pool and
2804 * thin devices' discard limits consistent).
2805 */
Alasdair G Kergon0ac55482012-07-27 15:08:08 +01002806 ti->discards_supported = true;
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002807 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00002808 ti->private = pt;
2809
Joe Thornberac8c3f32013-05-10 14:37:21 +01002810 r = dm_pool_register_metadata_threshold(pt->pool->pmd,
2811 calc_metadata_threshold(pt),
2812 metadata_low_callback,
2813 pool);
2814 if (r)
2815 goto out_free_pt;
2816
Joe Thornber991d9fa2011-10-31 20:21:18 +00002817 pt->callbacks.congested_fn = pool_is_congested;
2818 dm_table_add_target_callbacks(ti->table, &pt->callbacks);
2819
2820 mutex_unlock(&dm_thin_pool_table.mutex);
2821
2822 return 0;
2823
Joe Thornber67e2e2b2012-03-28 18:41:29 +01002824out_flags_changed:
2825 __pool_dec(pool);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002826out_free_pt:
2827 kfree(pt);
2828out:
2829 dm_put_device(ti, data_dev);
2830out_metadata:
2831 dm_put_device(ti, metadata_dev);
2832out_unlock:
2833 mutex_unlock(&dm_thin_pool_table.mutex);
2834
2835 return r;
2836}
2837
Mikulas Patocka7de3ee52012-12-21 20:23:41 +00002838static int pool_map(struct dm_target *ti, struct bio *bio)
Joe Thornber991d9fa2011-10-31 20:21:18 +00002839{
2840 int r;
2841 struct pool_c *pt = ti->private;
2842 struct pool *pool = pt->pool;
2843 unsigned long flags;
2844
2845 /*
2846 * As this is a singleton target, ti->begin is always zero.
2847 */
2848 spin_lock_irqsave(&pool->lock, flags);
2849 bio->bi_bdev = pt->data_dev->bdev;
2850 r = DM_MAPIO_REMAPPED;
2851 spin_unlock_irqrestore(&pool->lock, flags);
2852
2853 return r;
2854}
2855
Joe Thornberb17446d2013-05-10 14:37:18 +01002856static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
2857{
2858 int r;
2859 struct pool_c *pt = ti->private;
2860 struct pool *pool = pt->pool;
2861 sector_t data_size = ti->len;
2862 dm_block_t sb_data_size;
2863
2864 *need_commit = false;
2865
2866 (void) sector_div(data_size, pool->sectors_per_block);
2867
2868 r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
2869 if (r) {
Mike Snitzer4fa59712013-08-21 17:30:40 -04002870 DMERR("%s: failed to retrieve data device size",
2871 dm_device_name(pool->pool_md));
Joe Thornberb17446d2013-05-10 14:37:18 +01002872 return r;
2873 }
2874
2875 if (data_size < sb_data_size) {
Mike Snitzer4fa59712013-08-21 17:30:40 -04002876 DMERR("%s: pool target (%llu blocks) too small: expected %llu",
2877 dm_device_name(pool->pool_md),
Joe Thornberb17446d2013-05-10 14:37:18 +01002878 (unsigned long long)data_size, sb_data_size);
2879 return -EINVAL;
2880
2881 } else if (data_size > sb_data_size) {
Mike Snitzer07f2b6e2014-02-14 11:58:41 -05002882 if (dm_pool_metadata_needs_check(pool->pmd)) {
2883 DMERR("%s: unable to grow the data device until repaired.",
2884 dm_device_name(pool->pool_md));
2885 return 0;
2886 }
2887
Mike Snitzer6f7f51d2013-12-04 10:25:53 -05002888 if (sb_data_size)
2889 DMINFO("%s: growing the data device from %llu to %llu blocks",
2890 dm_device_name(pool->pool_md),
2891 sb_data_size, (unsigned long long)data_size);
Joe Thornberb17446d2013-05-10 14:37:18 +01002892 r = dm_pool_resize_data_dev(pool->pmd, data_size);
2893 if (r) {
Joe Thornberb5330652013-12-04 19:51:33 -05002894 metadata_operation_failed(pool, "dm_pool_resize_data_dev", r);
Joe Thornberb17446d2013-05-10 14:37:18 +01002895 return r;
2896 }
2897
2898 *need_commit = true;
2899 }
2900
2901 return 0;
2902}
2903
Joe Thornber24347e92013-05-10 14:37:19 +01002904static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
2905{
2906 int r;
2907 struct pool_c *pt = ti->private;
2908 struct pool *pool = pt->pool;
2909 dm_block_t metadata_dev_size, sb_metadata_dev_size;
2910
2911 *need_commit = false;
2912
Alasdair G Kergon610bba82013-05-19 18:57:50 +01002913 metadata_dev_size = get_metadata_dev_size_in_blocks(pool->md_dev);
Joe Thornber24347e92013-05-10 14:37:19 +01002914
2915 r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size);
2916 if (r) {
Mike Snitzer4fa59712013-08-21 17:30:40 -04002917 DMERR("%s: failed to retrieve metadata device size",
2918 dm_device_name(pool->pool_md));
Joe Thornber24347e92013-05-10 14:37:19 +01002919 return r;
2920 }
2921
2922 if (metadata_dev_size < sb_metadata_dev_size) {
Mike Snitzer4fa59712013-08-21 17:30:40 -04002923 DMERR("%s: metadata device (%llu blocks) too small: expected %llu",
2924 dm_device_name(pool->pool_md),
Joe Thornber24347e92013-05-10 14:37:19 +01002925 metadata_dev_size, sb_metadata_dev_size);
2926 return -EINVAL;
2927
2928 } else if (metadata_dev_size > sb_metadata_dev_size) {
Mike Snitzer07f2b6e2014-02-14 11:58:41 -05002929 if (dm_pool_metadata_needs_check(pool->pmd)) {
2930 DMERR("%s: unable to grow the metadata device until repaired.",
2931 dm_device_name(pool->pool_md));
2932 return 0;
2933 }
2934
Mike Snitzer7d489352014-02-12 23:58:15 -05002935 warn_if_metadata_device_too_big(pool->md_dev);
Mike Snitzer6f7f51d2013-12-04 10:25:53 -05002936 DMINFO("%s: growing the metadata device from %llu to %llu blocks",
2937 dm_device_name(pool->pool_md),
2938 sb_metadata_dev_size, metadata_dev_size);
Joe Thornber24347e92013-05-10 14:37:19 +01002939 r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
2940 if (r) {
Joe Thornberb5330652013-12-04 19:51:33 -05002941 metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
Joe Thornber24347e92013-05-10 14:37:19 +01002942 return r;
2943 }
2944
2945 *need_commit = true;
2946 }
2947
2948 return 0;
2949}
2950
Joe Thornber991d9fa2011-10-31 20:21:18 +00002951/*
2952 * Retrieves the number of blocks of the data device from
2953 * the superblock and compares it to the actual device size,
2954 * thus resizing the data device in case it has grown.
2955 *
2956 * This both copes with opening preallocated data devices in the ctr
2957 * being followed by a resume
2958 * -and-
2959 * calling the resume method individually after userspace has
2960 * grown the data device in reaction to a table event.
2961 */
2962static int pool_preresume(struct dm_target *ti)
2963{
2964 int r;
Joe Thornber24347e92013-05-10 14:37:19 +01002965 bool need_commit1, need_commit2;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002966 struct pool_c *pt = ti->private;
2967 struct pool *pool = pt->pool;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002968
2969 /*
2970 * Take control of the pool object.
2971 */
2972 r = bind_control_target(pool, ti);
2973 if (r)
2974 return r;
2975
Joe Thornberb17446d2013-05-10 14:37:18 +01002976 r = maybe_resize_data_dev(ti, &need_commit1);
2977 if (r)
Joe Thornber991d9fa2011-10-31 20:21:18 +00002978 return r;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002979
Joe Thornber24347e92013-05-10 14:37:19 +01002980 r = maybe_resize_metadata_dev(ti, &need_commit2);
2981 if (r)
2982 return r;
2983
2984 if (need_commit1 || need_commit2)
Joe Thornber020cc3b2013-12-04 15:05:36 -05002985 (void) commit(pool);
Joe Thornber991d9fa2011-10-31 20:21:18 +00002986
2987 return 0;
2988}
2989
2990static void pool_resume(struct dm_target *ti)
2991{
2992 struct pool_c *pt = ti->private;
2993 struct pool *pool = pt->pool;
2994 unsigned long flags;
2995
2996 spin_lock_irqsave(&pool->lock, flags);
Joe Thornber88a66212013-12-04 20:16:12 -05002997 pool->low_water_triggered = false;
Joe Thornber991d9fa2011-10-31 20:21:18 +00002998 spin_unlock_irqrestore(&pool->lock, flags);
Mike Snitzerc140e1c2014-03-20 21:17:14 -04002999 requeue_bios(pool);
Joe Thornber991d9fa2011-10-31 20:21:18 +00003000
Joe Thornber905e51b2012-03-28 18:41:27 +01003001 do_waker(&pool->waker.work);
Joe Thornber991d9fa2011-10-31 20:21:18 +00003002}
3003
3004static void pool_postsuspend(struct dm_target *ti)
3005{
Joe Thornber991d9fa2011-10-31 20:21:18 +00003006 struct pool_c *pt = ti->private;
3007 struct pool *pool = pt->pool;
3008
Joe Thornber905e51b2012-03-28 18:41:27 +01003009 cancel_delayed_work(&pool->waker);
Joe Thornber85ad643b2014-05-09 15:59:38 +01003010 cancel_delayed_work(&pool->no_space_timeout);
Joe Thornber991d9fa2011-10-31 20:21:18 +00003011 flush_workqueue(pool->wq);
Joe Thornber020cc3b2013-12-04 15:05:36 -05003012 (void) commit(pool);
Joe Thornber991d9fa2011-10-31 20:21:18 +00003013}
3014
3015static int check_arg_count(unsigned argc, unsigned args_required)
3016{
3017 if (argc != args_required) {
3018 DMWARN("Message received with %u arguments instead of %u.",
3019 argc, args_required);
3020 return -EINVAL;
3021 }
3022
3023 return 0;
3024}
3025
3026static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
3027{
3028 if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
3029 *dev_id <= MAX_DEV_ID)
3030 return 0;
3031
3032 if (warning)
3033 DMWARN("Message received with invalid device id: %s", arg);
3034
3035 return -EINVAL;
3036}
3037
3038static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
3039{
3040 dm_thin_id dev_id;
3041 int r;
3042
3043 r = check_arg_count(argc, 2);
3044 if (r)
3045 return r;
3046
3047 r = read_dev_id(argv[1], &dev_id, 1);
3048 if (r)
3049 return r;
3050
3051 r = dm_pool_create_thin(pool->pmd, dev_id);
3052 if (r) {
3053 DMWARN("Creation of new thinly-provisioned device with id %s failed.",
3054 argv[1]);
3055 return r;
3056 }
3057
3058 return 0;
3059}
3060
3061static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
3062{
3063 dm_thin_id dev_id;
3064 dm_thin_id origin_dev_id;
3065 int r;
3066
3067 r = check_arg_count(argc, 3);
3068 if (r)
3069 return r;
3070
3071 r = read_dev_id(argv[1], &dev_id, 1);
3072 if (r)
3073 return r;
3074
3075 r = read_dev_id(argv[2], &origin_dev_id, 1);
3076 if (r)
3077 return r;
3078
3079 r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
3080 if (r) {
3081 DMWARN("Creation of new snapshot %s of device %s failed.",
3082 argv[1], argv[2]);
3083 return r;
3084 }
3085
3086 return 0;
3087}
3088
3089static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
3090{
3091 dm_thin_id dev_id;
3092 int r;
3093
3094 r = check_arg_count(argc, 2);
3095 if (r)
3096 return r;
3097
3098 r = read_dev_id(argv[1], &dev_id, 1);
3099 if (r)
3100 return r;
3101
3102 r = dm_pool_delete_thin_device(pool->pmd, dev_id);
3103 if (r)
3104 DMWARN("Deletion of thin device %s failed.", argv[1]);
3105
3106 return r;
3107}
3108
3109static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
3110{
3111 dm_thin_id old_id, new_id;
3112 int r;
3113
3114 r = check_arg_count(argc, 3);
3115 if (r)
3116 return r;
3117
3118 if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
3119 DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
3120 return -EINVAL;
3121 }
3122
3123 if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
3124 DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
3125 return -EINVAL;
3126 }
3127
3128 r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
3129 if (r) {
3130 DMWARN("Failed to change transaction id from %s to %s.",
3131 argv[1], argv[2]);
3132 return r;
3133 }
3134
3135 return 0;
3136}
3137
Joe Thornbercc8394d2012-06-03 00:30:01 +01003138static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
3139{
3140 int r;
3141
3142 r = check_arg_count(argc, 1);
3143 if (r)
3144 return r;
3145
Joe Thornber020cc3b2013-12-04 15:05:36 -05003146 (void) commit(pool);
Joe Thornber0d200ae2012-07-03 12:55:31 +01003147
Joe Thornbercc8394d2012-06-03 00:30:01 +01003148 r = dm_pool_reserve_metadata_snap(pool->pmd);
3149 if (r)
3150 DMWARN("reserve_metadata_snap message failed.");
3151
3152 return r;
3153}
3154
3155static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
3156{
3157 int r;
3158
3159 r = check_arg_count(argc, 1);
3160 if (r)
3161 return r;
3162
3163 r = dm_pool_release_metadata_snap(pool->pmd);
3164 if (r)
3165 DMWARN("release_metadata_snap message failed.");
3166
3167 return r;
3168}
3169
Joe Thornber991d9fa2011-10-31 20:21:18 +00003170/*
3171 * Messages supported:
3172 * create_thin <dev_id>
3173 * create_snap <dev_id> <origin_id>
3174 * delete <dev_id>
3176 * set_transaction_id <current_trans_id> <new_trans_id>
Joe Thornbercc8394d2012-06-03 00:30:01 +01003177 * reserve_metadata_snap
3178 * release_metadata_snap
Joe Thornber991d9fa2011-10-31 20:21:18 +00003179 */
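/*
 * Example (names are illustrative only): taking a snapshot of thin
 * device 0 as device id 1 on a pool named "pool":
 *
 *   dmsetup suspend /dev/mapper/thin
 *   dmsetup message /dev/mapper/pool 0 "create_snap 1 0"
 *   dmsetup resume /dev/mapper/thin
 *
 * The 0 after the pool name is the sector offset that dmsetup message
 * requires; the origin thin device is suspended so the snapshot is
 * consistent.
 */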
3180static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
3181{
3182 int r = -EINVAL;
3183 struct pool_c *pt = ti->private;
3184 struct pool *pool = pt->pool;
3185
3186 if (!strcasecmp(argv[0], "create_thin"))
3187 r = process_create_thin_mesg(argc, argv, pool);
3188
3189 else if (!strcasecmp(argv[0], "create_snap"))
3190 r = process_create_snap_mesg(argc, argv, pool);
3191
3192 else if (!strcasecmp(argv[0], "delete"))
3193 r = process_delete_mesg(argc, argv, pool);
3194
3195 else if (!strcasecmp(argv[0], "set_transaction_id"))
3196 r = process_set_transaction_id_mesg(argc, argv, pool);
3197
Joe Thornbercc8394d2012-06-03 00:30:01 +01003198 else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
3199 r = process_reserve_metadata_snap_mesg(argc, argv, pool);
3200
3201 else if (!strcasecmp(argv[0], "release_metadata_snap"))
3202 r = process_release_metadata_snap_mesg(argc, argv, pool);
3203
Joe Thornber991d9fa2011-10-31 20:21:18 +00003204 else
3205 DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
3206
Joe Thornbere49e5822012-07-27 15:08:16 +01003207 if (!r)
Joe Thornber020cc3b2013-12-04 15:05:36 -05003208 (void) commit(pool);
Joe Thornber991d9fa2011-10-31 20:21:18 +00003209
3210 return r;
3211}
3212
Joe Thornbere49e5822012-07-27 15:08:16 +01003213static void emit_flags(struct pool_features *pf, char *result,
3214 unsigned sz, unsigned maxlen)
3215{
3216 unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
Mike Snitzer787a996c2013-12-06 16:21:43 -05003217 !pf->discard_passdown + (pf->mode == PM_READ_ONLY) +
3218 pf->error_if_no_space;
Joe Thornbere49e5822012-07-27 15:08:16 +01003219 DMEMIT("%u ", count);
3220
3221 if (!pf->zero_new_blocks)
3222 DMEMIT("skip_block_zeroing ");
3223
3224 if (!pf->discard_enabled)
3225 DMEMIT("ignore_discard ");
3226
3227 if (!pf->discard_passdown)
3228 DMEMIT("no_discard_passdown ");
3229
3230 if (pf->mode == PM_READ_ONLY)
3231 DMEMIT("read_only ");
Mike Snitzer787a996c2013-12-06 16:21:43 -05003232
3233 if (pf->error_if_no_space)
3234 DMEMIT("error_if_no_space ");
Joe Thornbere49e5822012-07-27 15:08:16 +01003235}
3236
Joe Thornber991d9fa2011-10-31 20:21:18 +00003237/*
3238 * Status line is:
 3239 * <transaction id> <used metadata blocks>/<total metadata blocks>
 3240 * <used data blocks>/<total data blocks> <held metadata root> <pool mode> <discard config> <no space config>
3241 */
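/*
 * For example (all values purely illustrative):
 *
 *   0 141/32768 901/524288 - rw discard_passdown queue_if_no_space
 *
 * i.e. transaction id 0, 141 of 32768 metadata blocks used, 901 of 524288
 * data blocks used, no held metadata root, pool read-write, discards
 * passed down to the data device and writes queued when data space runs
 * out.
 */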
Mikulas Patockafd7c0922013-03-01 22:45:44 +00003242static void pool_status(struct dm_target *ti, status_type_t type,
3243 unsigned status_flags, char *result, unsigned maxlen)
Joe Thornber991d9fa2011-10-31 20:21:18 +00003244{
Joe Thornbere49e5822012-07-27 15:08:16 +01003245 int r;
Joe Thornber991d9fa2011-10-31 20:21:18 +00003246 unsigned sz = 0;
3247 uint64_t transaction_id;
3248 dm_block_t nr_free_blocks_data;
3249 dm_block_t nr_free_blocks_metadata;
3250 dm_block_t nr_blocks_data;
3251 dm_block_t nr_blocks_metadata;
3252 dm_block_t held_root;
3253 char buf[BDEVNAME_SIZE];
3254 char buf2[BDEVNAME_SIZE];
3255 struct pool_c *pt = ti->private;
3256 struct pool *pool = pt->pool;
3257
3258 switch (type) {
3259 case STATUSTYPE_INFO:
Joe Thornbere49e5822012-07-27 15:08:16 +01003260 if (get_pool_mode(pool) == PM_FAIL) {
3261 DMEMIT("Fail");
3262 break;
3263 }
3264
Alasdair G Kergon1f4e0ff2012-07-27 15:08:16 +01003265 /* Commit to ensure statistics aren't out-of-date */
3266 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
Joe Thornber020cc3b2013-12-04 15:05:36 -05003267 (void) commit(pool);
Alasdair G Kergon1f4e0ff2012-07-27 15:08:16 +01003268
Mikulas Patockafd7c0922013-03-01 22:45:44 +00003269 r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
3270 if (r) {
Mike Snitzer4fa59712013-08-21 17:30:40 -04003271 DMERR("%s: dm_pool_get_metadata_transaction_id returned %d",
3272 dm_device_name(pool->pool_md), r);
Mikulas Patockafd7c0922013-03-01 22:45:44 +00003273 goto err;
3274 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00003275
Mikulas Patockafd7c0922013-03-01 22:45:44 +00003276 r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata);
3277 if (r) {
Mike Snitzer4fa59712013-08-21 17:30:40 -04003278 DMERR("%s: dm_pool_get_free_metadata_block_count returned %d",
3279 dm_device_name(pool->pool_md), r);
Mikulas Patockafd7c0922013-03-01 22:45:44 +00003280 goto err;
3281 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00003282
3283 r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
Mikulas Patockafd7c0922013-03-01 22:45:44 +00003284 if (r) {
Mike Snitzer4fa59712013-08-21 17:30:40 -04003285 DMERR("%s: dm_pool_get_metadata_dev_size returned %d",
3286 dm_device_name(pool->pool_md), r);
Mikulas Patockafd7c0922013-03-01 22:45:44 +00003287 goto err;
3288 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00003289
Mikulas Patockafd7c0922013-03-01 22:45:44 +00003290 r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data);
3291 if (r) {
Mike Snitzer4fa59712013-08-21 17:30:40 -04003292 DMERR("%s: dm_pool_get_free_block_count returned %d",
3293 dm_device_name(pool->pool_md), r);
Mikulas Patockafd7c0922013-03-01 22:45:44 +00003294 goto err;
3295 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00003296
3297 r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
Mikulas Patockafd7c0922013-03-01 22:45:44 +00003298 if (r) {
Mike Snitzer4fa59712013-08-21 17:30:40 -04003299 DMERR("%s: dm_pool_get_data_dev_size returned %d",
3300 dm_device_name(pool->pool_md), r);
Mikulas Patockafd7c0922013-03-01 22:45:44 +00003301 goto err;
3302 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00003303
Joe Thornbercc8394d2012-06-03 00:30:01 +01003304 r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
Mikulas Patockafd7c0922013-03-01 22:45:44 +00003305 if (r) {
Mike Snitzer4fa59712013-08-21 17:30:40 -04003306 DMERR("%s: dm_pool_get_metadata_snap returned %d",
3307 dm_device_name(pool->pool_md), r);
Mikulas Patockafd7c0922013-03-01 22:45:44 +00003308 goto err;
3309 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00003310
3311 DMEMIT("%llu %llu/%llu %llu/%llu ",
3312 (unsigned long long)transaction_id,
3313 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
3314 (unsigned long long)nr_blocks_metadata,
3315 (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
3316 (unsigned long long)nr_blocks_data);
3317
3318 if (held_root)
Joe Thornbere49e5822012-07-27 15:08:16 +01003319 DMEMIT("%llu ", held_root);
Joe Thornber991d9fa2011-10-31 20:21:18 +00003320 else
Joe Thornbere49e5822012-07-27 15:08:16 +01003321 DMEMIT("- ");
3322
Joe Thornber3e1a0692014-03-03 16:03:26 +00003323 if (pool->pf.mode == PM_OUT_OF_DATA_SPACE)
3324 DMEMIT("out_of_data_space ");
3325 else if (pool->pf.mode == PM_READ_ONLY)
Joe Thornbere49e5822012-07-27 15:08:16 +01003326 DMEMIT("ro ");
3327 else
3328 DMEMIT("rw ");
3329
Mike Snitzer018debe2012-12-21 20:23:32 +00003330 if (!pool->pf.discard_enabled)
Mike Snitzer787a996c2013-12-06 16:21:43 -05003331 DMEMIT("ignore_discard ");
Mike Snitzer018debe2012-12-21 20:23:32 +00003332 else if (pool->pf.discard_passdown)
Mike Snitzer787a996c2013-12-06 16:21:43 -05003333 DMEMIT("discard_passdown ");
Joe Thornbere49e5822012-07-27 15:08:16 +01003334 else
Mike Snitzer787a996c2013-12-06 16:21:43 -05003335 DMEMIT("no_discard_passdown ");
3336
3337 if (pool->pf.error_if_no_space)
3338 DMEMIT("error_if_no_space ");
3339 else
3340 DMEMIT("queue_if_no_space ");
Joe Thornber991d9fa2011-10-31 20:21:18 +00003341
3342 break;
3343
3344 case STATUSTYPE_TABLE:
3345 DMEMIT("%s %s %lu %llu ",
3346 format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
3347 format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
3348 (unsigned long)pool->sectors_per_block,
3349 (unsigned long long)pt->low_water_blocks);
Mike Snitzer0424caa2012-09-26 23:45:47 +01003350 emit_flags(&pt->requested_pf, result, sz, maxlen);
Joe Thornber991d9fa2011-10-31 20:21:18 +00003351 break;
3352 }
Mikulas Patockafd7c0922013-03-01 22:45:44 +00003353 return;
Joe Thornber991d9fa2011-10-31 20:21:18 +00003354
Mikulas Patockafd7c0922013-03-01 22:45:44 +00003355err:
3356 DMEMIT("Error");
Joe Thornber991d9fa2011-10-31 20:21:18 +00003357}
3358
3359static int pool_iterate_devices(struct dm_target *ti,
3360 iterate_devices_callout_fn fn, void *data)
3361{
3362 struct pool_c *pt = ti->private;
3363
3364 return fn(ti, pt->data_dev, 0, ti->len, data);
3365}
3366
3367static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
3368 struct bio_vec *biovec, int max_size)
3369{
3370 struct pool_c *pt = ti->private;
3371 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
3372
3373 if (!q->merge_bvec_fn)
3374 return max_size;
3375
3376 bvm->bi_bdev = pt->data_dev->bdev;
3377
3378 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
3379}
3380
Mike Snitzer0424caa2012-09-26 23:45:47 +01003381static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
Joe Thornber104655f2012-03-28 18:41:28 +01003382{
Mike Snitzer0424caa2012-09-26 23:45:47 +01003383 struct pool *pool = pt->pool;
3384 struct queue_limits *data_limits;
3385
Joe Thornber104655f2012-03-28 18:41:28 +01003386 limits->max_discard_sectors = pool->sectors_per_block;
3387
3388 /*
Mike Snitzer0424caa2012-09-26 23:45:47 +01003389 * discard_granularity is just a hint, and not enforced.
Joe Thornber104655f2012-03-28 18:41:28 +01003390 */
Mike Snitzer0424caa2012-09-26 23:45:47 +01003391 if (pt->adjusted_pf.discard_passdown) {
3392 data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
Lukas Czerner09869de2014-06-11 12:28:43 -04003393 limits->discard_granularity = max(data_limits->discard_granularity,
3394 pool->sectors_per_block << SECTOR_SHIFT);
Mike Snitzerf13945d2013-03-01 22:45:44 +00003395 } else
Mike Snitzer0424caa2012-09-26 23:45:47 +01003396 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
Joe Thornber104655f2012-03-28 18:41:28 +01003397}
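/*
 * For instance, with 64KiB data blocks the limits set above work out to
 * max_discard_sectors == 128 and, without passdown, a discard_granularity
 * of 65536 bytes; with passdown the granularity is only raised if the data
 * device's own granularity happens to be larger (figures illustrative).
 */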
3398
Joe Thornber991d9fa2011-10-31 20:21:18 +00003399static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
3400{
3401 struct pool_c *pt = ti->private;
3402 struct pool *pool = pt->pool;
Mike Snitzer604ea902014-10-09 18:43:25 -04003403 sector_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
3404
3405 /*
3406 * Adjust max_sectors_kb to highest possible power-of-2
3407 * factor of pool->sectors_per_block.
3408 */
3409 if (limits->max_hw_sectors & (limits->max_hw_sectors - 1))
3410 limits->max_sectors = rounddown_pow_of_two(limits->max_hw_sectors);
3411 else
3412 limits->max_sectors = limits->max_hw_sectors;
3413
3414 if (limits->max_sectors < pool->sectors_per_block) {
3415 while (!is_factor(pool->sectors_per_block, limits->max_sectors)) {
3416 if ((limits->max_sectors & (limits->max_sectors - 1)) == 0)
3417 limits->max_sectors--;
3418 limits->max_sectors = rounddown_pow_of_two(limits->max_sectors);
3419 }
3420 } else if (block_size_is_power_of_two(pool)) {
3421 /* max_sectors_kb is >= power-of-2 thinp blocksize */
3422 while (!is_factor(limits->max_sectors, pool->sectors_per_block)) {
3423 if ((limits->max_sectors & (limits->max_sectors - 1)) == 0)
3424 limits->max_sectors--;
3425 limits->max_sectors = rounddown_pow_of_two(limits->max_sectors);
3426 }
3427 }
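	/*
	 * Worked example (illustrative): with a 192KiB pool block
	 * (sectors_per_block == 384, not a power of two) and
	 * max_hw_sectors == 256, the first branch runs: 256 does not
	 * divide 384 evenly, so max_sectors is stepped down to 128,
	 * which does.
	 */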
Joe Thornber991d9fa2011-10-31 20:21:18 +00003428
Mike Snitzer0cc67cd2013-08-20 15:02:41 -04003429 /*
3430 * If the system-determined stacked limits are compatible with the
3431 * pool's blocksize (io_opt is a factor) do not override them.
3432 */
3433 if (io_opt_sectors < pool->sectors_per_block ||
Mike Snitzer604ea902014-10-09 18:43:25 -04003434 !is_factor(io_opt_sectors, pool->sectors_per_block)) {
3435 if (is_factor(pool->sectors_per_block, limits->max_sectors))
3436 blk_limits_io_min(limits, limits->max_sectors << SECTOR_SHIFT);
3437 else
3438 blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT);
Mike Snitzer0cc67cd2013-08-20 15:02:41 -04003439 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
3440 }
Mike Snitzer0424caa2012-09-26 23:45:47 +01003441
3442 /*
3443 * pt->adjusted_pf is a staging area for the actual features to use.
3444 * They get transferred to the live pool in bind_control_target()
3445 * called from pool_preresume().
3446 */
Mike Snitzerb60ab992013-09-19 18:49:11 -04003447 if (!pt->adjusted_pf.discard_enabled) {
3448 /*
3449 * Must explicitly disallow stacking discard limits otherwise the
3450 * block layer will stack them if pool's data device has support.
3451 * QUEUE_FLAG_DISCARD wouldn't be set but there is no way for the
3452 * user to see that, so make sure to set all discard limits to 0.
3453 */
3454 limits->discard_granularity = 0;
Mike Snitzer0424caa2012-09-26 23:45:47 +01003455 return;
Mike Snitzerb60ab992013-09-19 18:49:11 -04003456 }
Mike Snitzer0424caa2012-09-26 23:45:47 +01003457
3458 disable_passdown_if_not_supported(pt);
3459
3460 set_discard_limits(pt, limits);
Joe Thornber991d9fa2011-10-31 20:21:18 +00003461}
3462
3463static struct target_type pool_target = {
3464 .name = "thin-pool",
3465 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
3466 DM_TARGET_IMMUTABLE,
Mike Snitzer36f12ae2014-10-09 15:24:12 -04003467 .version = {1, 14, 0},
Joe Thornber991d9fa2011-10-31 20:21:18 +00003468 .module = THIS_MODULE,
3469 .ctr = pool_ctr,
3470 .dtr = pool_dtr,
3471 .map = pool_map,
3472 .postsuspend = pool_postsuspend,
3473 .preresume = pool_preresume,
3474 .resume = pool_resume,
3475 .message = pool_message,
3476 .status = pool_status,
3477 .merge = pool_merge,
3478 .iterate_devices = pool_iterate_devices,
3479 .io_hints = pool_io_hints,
3480};
3481
3482/*----------------------------------------------------------------
3483 * Thin target methods
3484 *--------------------------------------------------------------*/
Joe Thornberb10ebd32014-04-08 11:29:01 +01003485static void thin_get(struct thin_c *tc)
3486{
3487 atomic_inc(&tc->refcount);
3488}
3489
3490static void thin_put(struct thin_c *tc)
3491{
3492 if (atomic_dec_and_test(&tc->refcount))
3493 complete(&tc->can_destroy);
3494}
3495
Joe Thornber991d9fa2011-10-31 20:21:18 +00003496static void thin_dtr(struct dm_target *ti)
3497{
3498 struct thin_c *tc = ti->private;
Mike Snitzerc140e1c2014-03-20 21:17:14 -04003499 unsigned long flags;
3500
Joe Thornberb10ebd32014-04-08 11:29:01 +01003501 thin_put(tc);
3502 wait_for_completion(&tc->can_destroy);
3503
Mike Snitzerc140e1c2014-03-20 21:17:14 -04003504 spin_lock_irqsave(&tc->pool->lock, flags);
3505 list_del_rcu(&tc->list);
3506 spin_unlock_irqrestore(&tc->pool->lock, flags);
3507 synchronize_rcu();
Joe Thornber991d9fa2011-10-31 20:21:18 +00003508
3509 mutex_lock(&dm_thin_pool_table.mutex);
3510
3511 __pool_dec(tc->pool);
3512 dm_pool_close_thin_device(tc->td);
3513 dm_put_device(ti, tc->pool_dev);
Joe Thornber2dd9c252012-03-28 18:41:28 +01003514 if (tc->origin_dev)
3515 dm_put_device(ti, tc->origin_dev);
Joe Thornber991d9fa2011-10-31 20:21:18 +00003516 kfree(tc);
3517
3518 mutex_unlock(&dm_thin_pool_table.mutex);
3519}
3520
3521/*
3522 * Thin target parameters:
3523 *
Joe Thornber2dd9c252012-03-28 18:41:28 +01003524 * <pool_dev> <dev_id> [origin_dev]
Joe Thornber991d9fa2011-10-31 20:21:18 +00003525 *
3526 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
3527 * dev_id: the internal device identifier
Joe Thornber2dd9c252012-03-28 18:41:28 +01003528 * origin_dev: a device external to the pool that should act as the origin
Joe Thornber67e2e2b2012-03-28 18:41:29 +01003529 *
3530 * If the pool device has discards disabled, they get disabled for the thin
3531 * device as well.
Joe Thornber991d9fa2011-10-31 20:21:18 +00003532 */
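/*
 * Example table line (names hypothetical, matching the parameters
 * described above):
 *
 *   dmsetup create thin --table "0 2097152 thin /dev/mapper/pool 0"
 *
 * i.e. a 1GiB thin device backed by dev_id 0 of the pool device
 * /dev/mapper/pool; dev_id 0 must already exist (via a create_thin
 * message to the pool), and an external origin device would be appended
 * as a third argument.
 */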
3533static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
3534{
3535 int r;
3536 struct thin_c *tc;
Joe Thornber2dd9c252012-03-28 18:41:28 +01003537 struct dm_dev *pool_dev, *origin_dev;
Joe Thornber991d9fa2011-10-31 20:21:18 +00003538 struct mapped_device *pool_md;
Joe Thornber5e3283e2014-04-08 11:08:41 +01003539 unsigned long flags;
Joe Thornber991d9fa2011-10-31 20:21:18 +00003540
3541 mutex_lock(&dm_thin_pool_table.mutex);
3542
Joe Thornber2dd9c252012-03-28 18:41:28 +01003543 if (argc != 2 && argc != 3) {
Joe Thornber991d9fa2011-10-31 20:21:18 +00003544 ti->error = "Invalid argument count";
3545 r = -EINVAL;
3546 goto out_unlock;
3547 }
3548
3549 tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
3550 if (!tc) {
3551 ti->error = "Out of memory";
3552 r = -ENOMEM;
3553 goto out_unlock;
3554 }
Mike Snitzerc140e1c2014-03-20 21:17:14 -04003555 spin_lock_init(&tc->lock);
Joe Thornbera374bb22014-10-10 13:43:14 +01003556 INIT_LIST_HEAD(&tc->deferred_cells);
Mike Snitzerc140e1c2014-03-20 21:17:14 -04003557 bio_list_init(&tc->deferred_bio_list);
3558 bio_list_init(&tc->retry_on_resume_list);
Mike Snitzer67324ea2014-03-21 18:33:41 -04003559 tc->sort_bio_list = RB_ROOT;
Joe Thornber991d9fa2011-10-31 20:21:18 +00003560
Joe Thornber2dd9c252012-03-28 18:41:28 +01003561 if (argc == 3) {
3562 r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
3563 if (r) {
3564 ti->error = "Error opening origin device";
3565 goto bad_origin_dev;
3566 }
3567 tc->origin_dev = origin_dev;
3568 }
3569
Joe Thornber991d9fa2011-10-31 20:21:18 +00003570 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
3571 if (r) {
3572 ti->error = "Error opening pool device";
3573 goto bad_pool_dev;
3574 }
3575 tc->pool_dev = pool_dev;
3576
3577 if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
3578 ti->error = "Invalid device id";
3579 r = -EINVAL;
3580 goto bad_common;
3581 }
3582
3583 pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
3584 if (!pool_md) {
3585 ti->error = "Couldn't get pool mapped device";
3586 r = -EINVAL;
3587 goto bad_common;
3588 }
3589
3590 tc->pool = __pool_table_lookup(pool_md);
3591 if (!tc->pool) {
3592 ti->error = "Couldn't find pool object";
3593 r = -EINVAL;
3594 goto bad_pool_lookup;
3595 }
3596 __pool_inc(tc->pool);
3597
Joe Thornbere49e5822012-07-27 15:08:16 +01003598 if (get_pool_mode(tc->pool) == PM_FAIL) {
 3599 ti->error = "Couldn't open thin device: pool is in fail mode";
Mike Snitzer1acacc02014-02-19 20:32:33 -05003600 r = -EINVAL;
Joe Thornbere49e5822012-07-27 15:08:16 +01003601 goto bad_thin_open;
3602 }
3603
Joe Thornber991d9fa2011-10-31 20:21:18 +00003604 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
3605 if (r) {
3606 ti->error = "Couldn't open thin internal device";
3607 goto bad_thin_open;
3608 }
3609
Mike Snitzer542f9032012-07-27 15:08:00 +01003610 r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
3611 if (r)
Mike Snitzer1acacc02014-02-19 20:32:33 -05003612 goto bad_target_max_io_len;
Mike Snitzer542f9032012-07-27 15:08:00 +01003613
Alasdair G Kergon55a62ee2013-03-01 22:45:47 +00003614 ti->num_flush_bios = 1;
Joe Thornber16ad3d12012-07-27 15:08:07 +01003615 ti->flush_supported = true;
Mikulas Patocka59c3d2c2012-12-21 20:23:40 +00003616 ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook);
Joe Thornber67e2e2b2012-03-28 18:41:29 +01003617
3618 /* In case the pool supports discards, pass them on. */
Mike Snitzerb60ab992013-09-19 18:49:11 -04003619 ti->discard_zeroes_data_unsupported = true;
Joe Thornber67e2e2b2012-03-28 18:41:29 +01003620 if (tc->pool->pf.discard_enabled) {
Alasdair G Kergon0ac55482012-07-27 15:08:08 +01003621 ti->discards_supported = true;
Alasdair G Kergon55a62ee2013-03-01 22:45:47 +00003622 ti->num_discard_bios = 1;
Alasdair G Kergon55a62ee2013-03-01 22:45:47 +00003623 /* Discard bios must be split on a block boundary */
3624 ti->split_discard_bios = true;
Joe Thornber67e2e2b2012-03-28 18:41:29 +01003625 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00003626
3627 dm_put(pool_md);
3628
3629 mutex_unlock(&dm_thin_pool_table.mutex);
3630
Joe Thornberb10ebd32014-04-08 11:29:01 +01003631 atomic_set(&tc->refcount, 1);
3632 init_completion(&tc->can_destroy);
3633
Joe Thornber5e3283e2014-04-08 11:08:41 +01003634 spin_lock_irqsave(&tc->pool->lock, flags);
Mike Snitzerc140e1c2014-03-20 21:17:14 -04003635 list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
Joe Thornber5e3283e2014-04-08 11:08:41 +01003636 spin_unlock_irqrestore(&tc->pool->lock, flags);
Mike Snitzerc140e1c2014-03-20 21:17:14 -04003637 /*
3638 * This synchronize_rcu() call is needed here otherwise we risk a
3639 * wake_worker() call finding no bios to process (because the newly
3640 * added tc isn't yet visible). So this reduces latency since we
3641 * aren't then dependent on the periodic commit to wake_worker().
3642 */
3643 synchronize_rcu();
3644
Joe Thornber991d9fa2011-10-31 20:21:18 +00003645 return 0;
3646
Mike Snitzer1acacc02014-02-19 20:32:33 -05003647bad_target_max_io_len:
3648 dm_pool_close_thin_device(tc->td);
Joe Thornber991d9fa2011-10-31 20:21:18 +00003649bad_thin_open:
3650 __pool_dec(tc->pool);
3651bad_pool_lookup:
3652 dm_put(pool_md);
3653bad_common:
3654 dm_put_device(ti, tc->pool_dev);
3655bad_pool_dev:
Joe Thornber2dd9c252012-03-28 18:41:28 +01003656 if (tc->origin_dev)
3657 dm_put_device(ti, tc->origin_dev);
3658bad_origin_dev:
Joe Thornber991d9fa2011-10-31 20:21:18 +00003659 kfree(tc);
3660out_unlock:
3661 mutex_unlock(&dm_thin_pool_table.mutex);
3662
3663 return r;
3664}
3665
Mikulas Patocka7de3ee52012-12-21 20:23:41 +00003666static int thin_map(struct dm_target *ti, struct bio *bio)
Joe Thornber991d9fa2011-10-31 20:21:18 +00003667{
Kent Overstreet4f024f32013-10-11 15:44:27 -07003668 bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
Joe Thornber991d9fa2011-10-31 20:21:18 +00003669
Mikulas Patocka7de3ee52012-12-21 20:23:41 +00003670 return thin_bio_map(ti, bio);
Joe Thornber991d9fa2011-10-31 20:21:18 +00003671}
3672
Mikulas Patocka7de3ee52012-12-21 20:23:41 +00003673static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
Joe Thornbereb2aa482012-03-28 18:41:28 +01003674{
3675 unsigned long flags;
Mikulas Patocka59c3d2c2012-12-21 20:23:40 +00003676 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
Joe Thornbereb2aa482012-03-28 18:41:28 +01003677 struct list_head work;
Mike Snitzera24c2562012-06-03 00:30:00 +01003678 struct dm_thin_new_mapping *m, *tmp;
Joe Thornbereb2aa482012-03-28 18:41:28 +01003679 struct pool *pool = h->tc->pool;
3680
3681 if (h->shared_read_entry) {
3682 INIT_LIST_HEAD(&work);
Mike Snitzer44feb382012-10-12 21:02:10 +01003683 dm_deferred_entry_dec(h->shared_read_entry, &work);
Joe Thornbereb2aa482012-03-28 18:41:28 +01003684
3685 spin_lock_irqsave(&pool->lock, flags);
3686 list_for_each_entry_safe(m, tmp, &work, list) {
3687 list_del(&m->list);
Joe Thornber50f3c3e2014-06-13 13:57:09 +01003688 __complete_mapping_preparation(m);
Joe Thornbereb2aa482012-03-28 18:41:28 +01003689 }
3690 spin_unlock_irqrestore(&pool->lock, flags);
3691 }
3692
Joe Thornber104655f2012-03-28 18:41:28 +01003693 if (h->all_io_entry) {
3694 INIT_LIST_HEAD(&work);
Mike Snitzer44feb382012-10-12 21:02:10 +01003695 dm_deferred_entry_dec(h->all_io_entry, &work);
Joe Thornber563af182012-12-21 20:23:31 +00003696 if (!list_empty(&work)) {
3697 spin_lock_irqsave(&pool->lock, flags);
3698 list_for_each_entry_safe(m, tmp, &work, list)
Mike Snitzerdaec3382013-12-11 14:01:20 -05003699 list_add_tail(&m->list, &pool->prepared_discards);
Joe Thornber563af182012-12-21 20:23:31 +00003700 spin_unlock_irqrestore(&pool->lock, flags);
3701 wake_worker(pool);
3702 }
Joe Thornber104655f2012-03-28 18:41:28 +01003703 }
3704
Joe Thornbereb2aa482012-03-28 18:41:28 +01003705 return 0;
3706}
3707
Joe Thornber738211f2014-03-03 15:52:28 +00003708static void thin_presuspend(struct dm_target *ti)
3709{
3710 struct thin_c *tc = ti->private;
3711
3712 if (dm_noflush_suspending(ti))
3713 noflush_work(tc, do_noflush_start);
3714}
3715
Joe Thornber991d9fa2011-10-31 20:21:18 +00003716static void thin_postsuspend(struct dm_target *ti)
3717{
Joe Thornber738211f2014-03-03 15:52:28 +00003718 struct thin_c *tc = ti->private;
3719
3720 /*
3721 * The dm_noflush_suspending flag has been cleared by now, so
3722 * unfortunately we must always run this.
3723 */
3724 noflush_work(tc, do_noflush_stop);
Joe Thornber991d9fa2011-10-31 20:21:18 +00003725}
3726
Joe Thornbere5aea7b2014-06-13 14:47:24 +01003727static int thin_preresume(struct dm_target *ti)
3728{
3729 struct thin_c *tc = ti->private;
3730
3731 if (tc->origin_dev)
3732 tc->origin_size = get_dev_size(tc->origin_dev->bdev);
3733
3734 return 0;
3735}
3736
Joe Thornber991d9fa2011-10-31 20:21:18 +00003737/*
3738 * <nr mapped sectors> <highest mapped sector>
3739 */
Mikulas Patockafd7c0922013-03-01 22:45:44 +00003740static void thin_status(struct dm_target *ti, status_type_t type,
3741 unsigned status_flags, char *result, unsigned maxlen)
Joe Thornber991d9fa2011-10-31 20:21:18 +00003742{
3743 int r;
3744 ssize_t sz = 0;
3745 dm_block_t mapped, highest;
3746 char buf[BDEVNAME_SIZE];
3747 struct thin_c *tc = ti->private;
3748
Joe Thornbere49e5822012-07-27 15:08:16 +01003749 if (get_pool_mode(tc->pool) == PM_FAIL) {
3750 DMEMIT("Fail");
Mikulas Patockafd7c0922013-03-01 22:45:44 +00003751 return;
Joe Thornbere49e5822012-07-27 15:08:16 +01003752 }
3753
Joe Thornber991d9fa2011-10-31 20:21:18 +00003754 if (!tc->td)
3755 DMEMIT("-");
3756 else {
3757 switch (type) {
3758 case STATUSTYPE_INFO:
3759 r = dm_thin_get_mapped_count(tc->td, &mapped);
Mikulas Patockafd7c0922013-03-01 22:45:44 +00003760 if (r) {
3761 DMERR("dm_thin_get_mapped_count returned %d", r);
3762 goto err;
3763 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00003764
3765 r = dm_thin_get_highest_mapped_block(tc->td, &highest);
Mikulas Patockafd7c0922013-03-01 22:45:44 +00003766 if (r < 0) {
3767 DMERR("dm_thin_get_highest_mapped_block returned %d", r);
3768 goto err;
3769 }
Joe Thornber991d9fa2011-10-31 20:21:18 +00003770
3771 DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
3772 if (r)
3773 DMEMIT("%llu", ((highest + 1) *
3774 tc->pool->sectors_per_block) - 1);
3775 else
3776 DMEMIT("-");
3777 break;
3778
3779 case STATUSTYPE_TABLE:
3780 DMEMIT("%s %lu",
3781 format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
3782 (unsigned long) tc->dev_id);
Joe Thornber2dd9c252012-03-28 18:41:28 +01003783 if (tc->origin_dev)
3784 DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
Joe Thornber991d9fa2011-10-31 20:21:18 +00003785 break;
3786 }
3787 }
3788
Mikulas Patockafd7c0922013-03-01 22:45:44 +00003789 return;
3790
3791err:
3792 DMEMIT("Error");
Joe Thornber991d9fa2011-10-31 20:21:18 +00003793}
3794
Mike Snitzer36f12ae2014-10-09 15:24:12 -04003795static int thin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
3796 struct bio_vec *biovec, int max_size)
3797{
3798 struct thin_c *tc = ti->private;
3799 struct request_queue *q = bdev_get_queue(tc->pool_dev->bdev);
3800
3801 if (!q->merge_bvec_fn)
3802 return max_size;
3803
3804 bvm->bi_bdev = tc->pool_dev->bdev;
3805 bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector);
3806
3807 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
3808}
3809
Joe Thornber991d9fa2011-10-31 20:21:18 +00003810static int thin_iterate_devices(struct dm_target *ti,
3811 iterate_devices_callout_fn fn, void *data)
3812{
Mike Snitzer55f2b8b2012-07-27 15:08:02 +01003813 sector_t blocks;
Joe Thornber991d9fa2011-10-31 20:21:18 +00003814 struct thin_c *tc = ti->private;
Mike Snitzer55f2b8b2012-07-27 15:08:02 +01003815 struct pool *pool = tc->pool;
Joe Thornber991d9fa2011-10-31 20:21:18 +00003816
3817 /*
3818 * We can't call dm_pool_get_data_dev_size() since that blocks. So
3819 * we follow a more convoluted path through to the pool's target.
3820 */
Mike Snitzer55f2b8b2012-07-27 15:08:02 +01003821 if (!pool->ti)
Joe Thornber991d9fa2011-10-31 20:21:18 +00003822 return 0; /* nothing is bound */
3823
Mike Snitzer55f2b8b2012-07-27 15:08:02 +01003824 blocks = pool->ti->len;
3825 (void) sector_div(blocks, pool->sectors_per_block);
Joe Thornber991d9fa2011-10-31 20:21:18 +00003826 if (blocks)
Mike Snitzer55f2b8b2012-07-27 15:08:02 +01003827 return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);
Joe Thornber991d9fa2011-10-31 20:21:18 +00003828
3829 return 0;
3830}
3831
Joe Thornber991d9fa2011-10-31 20:21:18 +00003832static struct target_type thin_target = {
3833 .name = "thin",
Mike Snitzer36f12ae2014-10-09 15:24:12 -04003834 .version = {1, 14, 0},
Joe Thornber991d9fa2011-10-31 20:21:18 +00003835 .module = THIS_MODULE,
3836 .ctr = thin_ctr,
3837 .dtr = thin_dtr,
3838 .map = thin_map,
Joe Thornbereb2aa482012-03-28 18:41:28 +01003839 .end_io = thin_endio,
Joe Thornbere5aea7b2014-06-13 14:47:24 +01003840 .preresume = thin_preresume,
Joe Thornber738211f2014-03-03 15:52:28 +00003841 .presuspend = thin_presuspend,
Joe Thornber991d9fa2011-10-31 20:21:18 +00003842 .postsuspend = thin_postsuspend,
3843 .status = thin_status,
Mike Snitzer36f12ae2014-10-09 15:24:12 -04003844 .merge = thin_merge,
Joe Thornber991d9fa2011-10-31 20:21:18 +00003845 .iterate_devices = thin_iterate_devices,
Joe Thornber991d9fa2011-10-31 20:21:18 +00003846};
3847
3848/*----------------------------------------------------------------*/
3849
3850static int __init dm_thin_init(void)
3851{
3852 int r;
3853
3854 pool_table_init();
3855
3856 r = dm_register_target(&thin_target);
3857 if (r)
3858 return r;
3859
3860 r = dm_register_target(&pool_target);
3861 if (r)
Mike Snitzera24c2562012-06-03 00:30:00 +01003862 goto bad_pool_target;
3863
3864 r = -ENOMEM;
3865
Mike Snitzera24c2562012-06-03 00:30:00 +01003866 _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
3867 if (!_new_mapping_cache)
3868 goto bad_new_mapping_cache;
3869
Mike Snitzera24c2562012-06-03 00:30:00 +01003870 return 0;
3871
Mike Snitzera24c2562012-06-03 00:30:00 +01003872bad_new_mapping_cache:
Mike Snitzera24c2562012-06-03 00:30:00 +01003873 dm_unregister_target(&pool_target);
3874bad_pool_target:
3875 dm_unregister_target(&thin_target);
Joe Thornber991d9fa2011-10-31 20:21:18 +00003876
3877 return r;
3878}
3879
3880static void dm_thin_exit(void)
3881{
3882 dm_unregister_target(&thin_target);
3883 dm_unregister_target(&pool_target);
Mike Snitzera24c2562012-06-03 00:30:00 +01003884
Mike Snitzera24c2562012-06-03 00:30:00 +01003885 kmem_cache_destroy(_new_mapping_cache);
Joe Thornber991d9fa2011-10-31 20:21:18 +00003886}
3887
3888module_init(dm_thin_init);
3889module_exit(dm_thin_exit);
3890
Mike Snitzer80c57892014-05-20 13:38:33 -04003891module_param_named(no_space_timeout, no_space_timeout_secs, uint, S_IRUGO | S_IWUSR);
3892MODULE_PARM_DESC(no_space_timeout, "Out of data space queue IO timeout in seconds");
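/*
 * The timeout can be changed at runtime, e.g. (path assumes the module is
 * built as dm_thin_pool):
 *
 *   echo 120 > /sys/module/dm_thin_pool/parameters/no_space_timeout
 *
 * With this version a value of 0 leaves bios queued indefinitely while the
 * pool is out of data space.
 */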
3893
Alasdair G Kergon7cab8bf2012-05-12 01:43:19 +01003894MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
Joe Thornber991d9fa2011-10-31 20:21:18 +00003895MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
3896MODULE_LICENSE("GPL");