/*
 * Copyright (C) 2018 Google Limited.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-bufio.h"
#include "dm-core.h"

#include <linux/crc32.h>
#include <linux/module.h>

#define DM_MSG_PREFIX "bow"
#define SECTOR_SIZE 512

struct log_entry {
	u64 source;
	u64 dest;
	u32 size;
	u32 checksum;
} __packed;

struct log_sector {
	u32 magic;
	u16 header_version;
	u16 header_size;
	u32 block_size;
	u32 count;
	u32 sequence;
	sector_t sector0;
	struct log_entry entries[];
} __packed;

/*
 * MAGIC is BOW in ascii
 */
#define MAGIC 0x00574f42
#define HEADER_VERSION 0x0100

/*
 * A sorted set of ranges representing the state of the data on the device.
 * Use an rb_tree for fast lookup of a given sector
 * Consecutive ranges are always of different type - operations on this
 * set must merge matching consecutive ranges.
 *
 * Top range is always of type TOP
 */
struct bow_range {
	struct rb_node node;
	sector_t sector;
	enum {
		INVALID,	/* Type not set */
		SECTOR0,	/* First sector - holds log record */
		SECTOR0_CURRENT,/* Live contents of sector0 */
		UNCHANGED,	/* Original contents */
		TRIMMED,	/* Range has been trimmed */
		CHANGED,	/* Range has been changed */
		BACKUP,		/* Range is being used as a backup */
		TOP,		/* Final range - sector is size of device */
	} type;
	struct list_head trimmed_list; /* list of TRIMMED ranges */
};

static const char * const readable_type[] = {
	"Invalid",
	"Sector0",
	"Sector0_current",
	"Unchanged",
	"Free",
	"Changed",
	"Backup",
	"Top",
};

enum state {
	TRIM,
	CHECKPOINT,
	COMMITTED,
};

struct bow_context {
	struct dm_dev *dev;
	u32 block_size;
	u32 block_shift;
	struct workqueue_struct *workqueue;
	struct dm_bufio_client *bufio;
	struct mutex ranges_lock; /* Hold to access this struct and/or ranges */
	struct rb_root ranges;
	struct dm_kobject_holder kobj_holder; /* for sysfs attributes */
	atomic_t state; /* One of the enum state values above */
	u64 trims_total;
	struct log_sector *log_sector;
	struct list_head trimmed_list;
	bool forward_trims;
};

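/*
 * range_top() returns the first sector past the end of br, i.e. the start
 * sector of the next range in the tree; range_size() is the length of the
 * range in bytes. Both rely on the TOP sentinel terminating the tree.
 */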
sector_t range_top(struct bow_range *br)
{
	return container_of(rb_next(&br->node), struct bow_range, node)
		->sector;
}

u64 range_size(struct bow_range *br)
{
	return (range_top(br) - br->sector) * SECTOR_SIZE;
}

static sector_t bvec_top(struct bvec_iter *bi_iter)
{
	return bi_iter->bi_sector + bi_iter->bi_size / SECTOR_SIZE;
}

/*
 * Find the first range that overlaps with bi_iter
 * bi_iter is set to the size of the overlapping sub-range
 */
static struct bow_range *find_first_overlapping_range(struct rb_root *ranges,
						      struct bvec_iter *bi_iter)
{
	struct rb_node *node = ranges->rb_node;
	struct bow_range *br;

	while (node) {
		br = container_of(node, struct bow_range, node);

		if (br->sector <= bi_iter->bi_sector
		    && bi_iter->bi_sector < range_top(br))
			break;

		if (bi_iter->bi_sector < br->sector)
			node = node->rb_left;
		else
			node = node->rb_right;
	}

	WARN_ON(!node);
	if (!node)
		return NULL;

	if (range_top(br) - bi_iter->bi_sector
	    < bi_iter->bi_size >> SECTOR_SHIFT)
		bi_iter->bi_size = (range_top(br) - bi_iter->bi_sector)
			<< SECTOR_SHIFT;

	return br;
}

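/*
 * Insert new_br into the tree as the immediate in-order predecessor of
 * existing (the rightmost node of existing's left subtree).
 */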
void add_before(struct rb_root *ranges, struct bow_range *new_br,
		struct bow_range *existing)
{
	struct rb_node *parent = &(existing->node);
	struct rb_node **link = &(parent->rb_left);

	while (*link) {
		parent = *link;
		link = &((*link)->rb_right);
	}

	rb_link_node(&new_br->node, parent, link);
	rb_insert_color(&new_br->node, ranges);
}

/*
 * Given a range br returned by find_first_overlapping_range, split br into a
 * leading range, a range matching the bi_iter and a trailing range.
 * Leading and trailing may end up size 0 and will then be deleted. The
 * new range matching the bi_iter is then returned and should have its type
 * and type specific fields populated.
 * If bi_iter runs off the end of the range, bi_iter is truncated accordingly
 */
static int split_range(struct bow_context *bc, struct bow_range **br,
		       struct bvec_iter *bi_iter)
{
	struct bow_range *new_br;

	if (bi_iter->bi_sector < (*br)->sector) {
		WARN_ON(true);
		return -EIO;
	}

	if (bi_iter->bi_sector > (*br)->sector) {
		struct bow_range *leading_br =
			kzalloc(sizeof(*leading_br), GFP_KERNEL);

		if (!leading_br)
			return -ENOMEM;

		*leading_br = **br;
		if (leading_br->type == TRIMMED)
			list_add(&leading_br->trimmed_list, &bc->trimmed_list);

		add_before(&bc->ranges, leading_br, *br);
		(*br)->sector = bi_iter->bi_sector;
	}

	if (bvec_top(bi_iter) >= range_top(*br)) {
		bi_iter->bi_size = (range_top(*br) - (*br)->sector)
			* SECTOR_SIZE;
		return 0;
	}

	/* new_br will be the beginning, existing br will be the tail */
	new_br = kzalloc(sizeof(*new_br), GFP_KERNEL);
	if (!new_br)
		return -ENOMEM;

	new_br->sector = (*br)->sector;
	(*br)->sector = bvec_top(bi_iter);
	add_before(&bc->ranges, new_br, *br);
	*br = new_br;

	return 0;
}

/*
 * Sets type of a range. May merge range into surrounding ranges
 * Since br may be invalidated, always sets br to NULL to prevent
 * usage after this is called
 */
static void set_type(struct bow_context *bc, struct bow_range **br, int type)
{
	struct bow_range *prev = container_of(rb_prev(&(*br)->node),
					      struct bow_range, node);
	struct bow_range *next = container_of(rb_next(&(*br)->node),
					      struct bow_range, node);

	if ((*br)->type == TRIMMED) {
		bc->trims_total -= range_size(*br);
		list_del(&(*br)->trimmed_list);
	}

	if (type == TRIMMED) {
		bc->trims_total += range_size(*br);
		list_add(&(*br)->trimmed_list, &bc->trimmed_list);
	}

	(*br)->type = type;

	if (next->type == type) {
		if (type == TRIMMED)
			list_del(&next->trimmed_list);
		rb_erase(&next->node, &bc->ranges);
		kfree(next);
	}

	if (prev->type == type) {
		if (type == TRIMMED)
			list_del(&(*br)->trimmed_list);
		rb_erase(&(*br)->node, &bc->ranges);
		kfree(*br);
	}

	*br = NULL;
}

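/*
 * Return the first TRIMMED range on the trimmed_list (the free space used
 * to hold backup copies), or NULL if no free space remains.
 */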
static struct bow_range *find_free_range(struct bow_context *bc)
{
	if (list_empty(&bc->trimmed_list)) {
		DMERR("Unable to find free space to back up to");
		return NULL;
	}

	return list_first_entry(&bc->trimmed_list, struct bow_range,
				trimmed_list);
}

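/* Convert a block-aligned sector offset into a dm-bufio block index */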
static sector_t sector_to_page(struct bow_context const *bc, sector_t sector)
{
	WARN_ON((sector & (((sector_t)1 << (bc->block_shift - SECTOR_SHIFT)) - 1))
		!= 0);
	return sector >> (bc->block_shift - SECTOR_SHIFT);
}

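/*
 * Copy source to dest block by block through dm-bufio. If checksum is
 * non-NULL, a crc32 of the copied data (seeded with the source block index)
 * is returned through it. Dirty buffers are flushed before returning.
 */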
static int copy_data(struct bow_context const *bc,
		     struct bow_range *source, struct bow_range *dest,
		     u32 *checksum)
{
	int i;

	if (range_size(source) != range_size(dest)) {
		WARN_ON(1);
		return -EIO;
	}

	if (checksum)
		*checksum = sector_to_page(bc, source->sector);

	for (i = 0; i < range_size(source) >> bc->block_shift; ++i) {
		struct dm_buffer *read_buffer, *write_buffer;
		u8 *read, *write;
		sector_t page = sector_to_page(bc, source->sector) + i;

		read = dm_bufio_read(bc->bufio, page, &read_buffer);
		if (IS_ERR(read)) {
			DMERR("Cannot read page %llu",
			      (unsigned long long)page);
			return PTR_ERR(read);
		}

		if (checksum)
			*checksum = crc32(*checksum, read, bc->block_size);

		write = dm_bufio_new(bc->bufio,
				     sector_to_page(bc, dest->sector) + i,
				     &write_buffer);
		if (IS_ERR(write)) {
			DMERR("Cannot write sector");
			dm_bufio_release(read_buffer);
			return PTR_ERR(write);
		}

		memcpy(write, read, bc->block_size);

		dm_bufio_mark_buffer_dirty(write_buffer);
		dm_bufio_release(write_buffer);
		dm_bufio_release(read_buffer);
	}

	dm_bufio_write_dirty_buffers(bc->bufio);
	return 0;
}

/****** logging functions ******/

static int add_log_entry(struct bow_context *bc, sector_t source, sector_t dest,
			 unsigned int size, u32 checksum);

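/*
 * Called when the log sector is full: copy the current log block to a free
 * range, bump the sequence number, and start a fresh log whose first entry
 * records where the old log was backed up to.
 */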
static int backup_log_sector(struct bow_context *bc)
{
	struct bow_range *first_br, *free_br;
	struct bvec_iter bi_iter;
	u32 checksum = 0;
	int ret;

	first_br = container_of(rb_first(&bc->ranges), struct bow_range, node);

	if (first_br->type != SECTOR0) {
		WARN_ON(1);
		return -EIO;
	}

	if (range_size(first_br) != bc->block_size) {
		WARN_ON(1);
		return -EIO;
	}

	free_br = find_free_range(bc);
	/* No space left - return this error to userspace */
	if (!free_br)
		return -ENOSPC;
	bi_iter.bi_sector = free_br->sector;
	bi_iter.bi_size = bc->block_size;
	ret = split_range(bc, &free_br, &bi_iter);
	if (ret)
		return ret;
	if (bi_iter.bi_size != bc->block_size) {
		WARN_ON(1);
		return -EIO;
	}

	ret = copy_data(bc, first_br, free_br, &checksum);
	if (ret)
		return ret;

	bc->log_sector->count = 0;
	bc->log_sector->sequence++;
	ret = add_log_entry(bc, first_br->sector, free_br->sector,
			    range_size(first_br), checksum);
	if (ret)
		return ret;

	set_type(bc, &free_br, BACKUP);
	return 0;
}

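/*
 * Append a source/dest/size/checksum record to the in-memory log and
 * rewrite block 0 through dm-bufio, backing the log block up first if it
 * has no room for another entry.
 */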
static int add_log_entry(struct bow_context *bc, sector_t source, sector_t dest,
			 unsigned int size, u32 checksum)
{
	struct dm_buffer *sector_buffer;
	u8 *sector;

	if (sizeof(struct log_sector)
	    + sizeof(struct log_entry) * (bc->log_sector->count + 1)
	    > bc->block_size) {
		int ret = backup_log_sector(bc);

		if (ret)
			return ret;
	}

	sector = dm_bufio_new(bc->bufio, 0, &sector_buffer);
	if (IS_ERR(sector)) {
		DMERR("Cannot write boot sector");
		dm_bufio_release(sector_buffer);
		return -ENOSPC;
	}

	bc->log_sector->entries[bc->log_sector->count].source = source;
	bc->log_sector->entries[bc->log_sector->count].dest = dest;
	bc->log_sector->entries[bc->log_sector->count].size = size;
	bc->log_sector->entries[bc->log_sector->count].checksum = checksum;
	bc->log_sector->count++;

	memcpy(sector, bc->log_sector, bc->block_size);
	dm_bufio_mark_buffer_dirty(sector_buffer);
	dm_bufio_release(sector_buffer);
	dm_bufio_write_dirty_buffers(bc->bufio);
	return 0;
}

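/*
 * Set up the on-disk log when entering CHECKPOINT state: carve out block 0
 * as the log sector, redirect live sector-0 I/O to a free range
 * (SECTOR0_CURRENT), back up the original block 0 contents, and write the
 * initial log header with its first entry.
 */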
static int prepare_log(struct bow_context *bc)
{
	struct bow_range *free_br, *first_br;
	struct bvec_iter bi_iter;
	u32 checksum = 0;
	int ret;

	/* Carve out first sector as log sector */
	first_br = container_of(rb_first(&bc->ranges), struct bow_range, node);
	if (first_br->type != UNCHANGED) {
		WARN_ON(1);
		return -EIO;
	}

	if (range_size(first_br) < bc->block_size) {
		WARN_ON(1);
		return -EIO;
	}
	bi_iter.bi_sector = 0;
	bi_iter.bi_size = bc->block_size;
	ret = split_range(bc, &first_br, &bi_iter);
	if (ret)
		return ret;
	first_br->type = SECTOR0;
	if (range_size(first_br) != bc->block_size) {
		WARN_ON(1);
		return -EIO;
	}

	/* Find free sector for active sector0 reads/writes */
	free_br = find_free_range(bc);
	if (!free_br)
		return -ENOSPC;
	bi_iter.bi_sector = free_br->sector;
	bi_iter.bi_size = bc->block_size;
	ret = split_range(bc, &free_br, &bi_iter);
	if (ret)
		return ret;
	free_br->type = SECTOR0_CURRENT;

	/* Copy data */
	ret = copy_data(bc, first_br, free_br, NULL);
	if (ret)
		return ret;

	bc->log_sector->sector0 = free_br->sector;

	/* Find free sector to back up original sector zero */
	free_br = find_free_range(bc);
	if (!free_br)
		return -ENOSPC;
	bi_iter.bi_sector = free_br->sector;
	bi_iter.bi_size = bc->block_size;
	ret = split_range(bc, &free_br, &bi_iter);
	if (ret)
		return ret;

	/* Back up */
	ret = copy_data(bc, first_br, free_br, &checksum);
	if (ret)
		return ret;

	/*
	 * Set up our replacement boot sector - it will get written when we
	 * add the first log entry, which we do immediately
	 */
	bc->log_sector->magic = MAGIC;
	bc->log_sector->header_version = HEADER_VERSION;
	bc->log_sector->header_size = sizeof(*bc->log_sector);
	bc->log_sector->block_size = bc->block_size;
	bc->log_sector->count = 0;
	bc->log_sector->sequence = 0;

	/* Add log entry */
	ret = add_log_entry(bc, first_br->sector, free_br->sector,
			    range_size(first_br), checksum);
	if (ret)
		return ret;

	set_type(bc, &free_br, BACKUP);
	return 0;
}

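/* Look up the range currently holding the live copy of sector 0 */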
static struct bow_range *find_sector0_current(struct bow_context *bc)
{
	struct bvec_iter bi_iter;

	bi_iter.bi_sector = bc->log_sector->sector0;
	bi_iter.bi_size = bc->block_size;
	return find_first_overlapping_range(&bc->ranges, &bi_iter);
}

/****** sysfs interface functions ******/

static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
			  char *buf)
{
	struct bow_context *bc = container_of(kobj, struct bow_context,
					      kobj_holder.kobj);

	return scnprintf(buf, PAGE_SIZE, "%d\n", atomic_read(&bc->state));
}

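/*
 * Writing to the sysfs "state" node drives the TRIM -> CHECKPOINT ->
 * COMMITTED state machine; only a single forward step is accepted.
 * Entering CHECKPOINT sets up the log; entering COMMITTED copies the live
 * sector 0 back over the log block.
 */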
static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
			   const char *buf, size_t count)
{
	struct bow_context *bc = container_of(kobj, struct bow_context,
					      kobj_holder.kobj);
	enum state state, original_state;
	int ret;

	state = buf[0] - '0';
	if (state < TRIM || state > COMMITTED) {
		DMERR("State value %d out of range", state);
		return -EINVAL;
	}

	mutex_lock(&bc->ranges_lock);
	original_state = atomic_read(&bc->state);
	if (state != original_state + 1) {
		DMERR("Invalid state change from %d to %d",
		      original_state, state);
		ret = -EINVAL;
		goto bad;
	}

	DMINFO("Switching to state %s", state == CHECKPOINT ? "Checkpoint"
	       : state == COMMITTED ? "Committed" : "Unknown");

	if (state == CHECKPOINT) {
		ret = prepare_log(bc);
		if (ret) {
			DMERR("Failed to switch to checkpoint state");
			goto bad;
		}
	} else if (state == COMMITTED) {
		struct bow_range *br = find_sector0_current(bc);
		struct bow_range *sector0_br =
			container_of(rb_first(&bc->ranges), struct bow_range,
				     node);

		ret = copy_data(bc, br, sector0_br, 0);
		if (ret) {
			DMERR("Failed to switch to committed state");
			goto bad;
		}
	}
	atomic_inc(&bc->state);
	ret = count;

bad:
	mutex_unlock(&bc->ranges_lock);
	return ret;
}

static ssize_t free_show(struct kobject *kobj, struct kobj_attribute *attr,
			 char *buf)
{
	struct bow_context *bc = container_of(kobj, struct bow_context,
					      kobj_holder.kobj);
	u64 trims_total;

	mutex_lock(&bc->ranges_lock);
	trims_total = bc->trims_total;
	mutex_unlock(&bc->ranges_lock);

	return scnprintf(buf, PAGE_SIZE, "%llu\n", trims_total);
}

static struct kobj_attribute attr_state = __ATTR_RW(state);
static struct kobj_attribute attr_free = __ATTR_RO(free);

static struct attribute *bow_attrs[] = {
	&attr_state.attr,
	&attr_free.attr,
	NULL
};

static struct kobj_type bow_ktype = {
	.sysfs_ops = &kobj_sysfs_ops,
	.default_attrs = bow_attrs,
	.release = dm_kobject_release
};

/****** constructor/destructor ******/

static void dm_bow_dtr(struct dm_target *ti)
{
	struct bow_context *bc = (struct bow_context *) ti->private;
	struct kobject *kobj;

	while (rb_first(&bc->ranges)) {
		struct bow_range *br = container_of(rb_first(&bc->ranges),
						    struct bow_range, node);

		rb_erase(&br->node, &bc->ranges);
		kfree(br);
	}
	if (bc->workqueue)
		destroy_workqueue(bc->workqueue);
	if (bc->bufio)
		dm_bufio_client_destroy(bc->bufio);

	kobj = &bc->kobj_holder.kobj;
	if (kobj->state_initialized) {
		kobject_put(kobj);
		wait_for_completion(dm_get_completion_from_kobject(kobj));
	}

	kfree(bc->log_sector);
	kfree(bc);
}

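/*
 * Constructor: dm-bow takes a single argument, the backing block device.
 * Sets up the dm-bufio client, workqueue and sysfs node, and seeds the
 * range tree with one UNCHANGED range covering the device plus the TOP
 * sentinel. If the device does not support discard, discard limits are
 * faked so trims can still be used for bookkeeping (but not forwarded).
 */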
static int dm_bow_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct bow_context *bc;
	struct bow_range *br;
	int ret;
	struct mapped_device *md = dm_table_get_md(ti->table);

	if (argc != 1) {
		ti->error = "Invalid argument count";
		return -EINVAL;
	}

	bc = kzalloc(sizeof(*bc), GFP_KERNEL);
	if (!bc) {
		ti->error = "Cannot allocate bow context";
		return -ENOMEM;
	}

	ti->num_flush_bios = 1;
	ti->num_discard_bios = 1;
	ti->num_write_same_bios = 1;
	ti->private = bc;

	ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
			    &bc->dev);
	if (ret) {
		ti->error = "Device lookup failed";
		goto bad;
	}

	if (bc->dev->bdev->bd_queue->limits.max_discard_sectors == 0) {
		bc->dev->bdev->bd_queue->limits.discard_granularity = 1 << 12;
		bc->dev->bdev->bd_queue->limits.max_hw_discard_sectors = 1 << 15;
		bc->dev->bdev->bd_queue->limits.max_discard_sectors = 1 << 15;
		bc->forward_trims = false;
	} else {
		bc->forward_trims = true;
	}

	bc->block_size = bc->dev->bdev->bd_queue->limits.logical_block_size;
	bc->block_shift = ilog2(bc->block_size);
	bc->log_sector = kzalloc(bc->block_size, GFP_KERNEL);
	if (!bc->log_sector) {
		ti->error = "Cannot allocate log sector";
		goto bad;
	}

	init_completion(&bc->kobj_holder.completion);
	ret = kobject_init_and_add(&bc->kobj_holder.kobj, &bow_ktype,
				   &disk_to_dev(dm_disk(md))->kobj, "%s",
				   "bow");
	if (ret) {
		ti->error = "Cannot create sysfs node";
		goto bad;
	}

	mutex_init(&bc->ranges_lock);
	bc->ranges = RB_ROOT;
	bc->bufio = dm_bufio_client_create(bc->dev->bdev, bc->block_size, 1, 0,
					   NULL, NULL);
	if (IS_ERR(bc->bufio)) {
		ti->error = "Cannot initialize dm-bufio";
		ret = PTR_ERR(bc->bufio);
		bc->bufio = NULL;
		goto bad;
	}

	bc->workqueue = alloc_workqueue("dm-bow",
					WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM
					| WQ_UNBOUND, num_online_cpus());
	if (!bc->workqueue) {
		ti->error = "Cannot allocate workqueue";
		ret = -ENOMEM;
		goto bad;
	}

	INIT_LIST_HEAD(&bc->trimmed_list);

	br = kzalloc(sizeof(*br), GFP_KERNEL);
	if (!br) {
		ti->error = "Cannot allocate ranges";
		ret = -ENOMEM;
		goto bad;
	}

	br->sector = ti->len;
	br->type = TOP;
	rb_link_node(&br->node, NULL, &bc->ranges.rb_node);
	rb_insert_color(&br->node, &bc->ranges);

	br = kzalloc(sizeof(*br), GFP_KERNEL);
	if (!br) {
		ti->error = "Cannot allocate ranges";
		ret = -ENOMEM;
		goto bad;
	}

	br->sector = 0;
	br->type = UNCHANGED;
	rb_link_node(&br->node, bc->ranges.rb_node,
		     &bc->ranges.rb_node->rb_left);
	rb_insert_color(&br->node, &bc->ranges);

	ti->discards_supported = true;

	return 0;

bad:
	dm_bow_dtr(ti);
	return ret;
}

/****** Handle writes ******/

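/*
 * A write is about to land on an UNCHANGED, BACKUP or SECTOR0_CURRENT
 * range: copy the data that is about to be overwritten into free (TRIMMED)
 * space, log the copy, and retype the affected ranges.
 */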
static int prepare_unchanged_range(struct bow_context *bc, struct bow_range *br,
				   struct bvec_iter *bi_iter,
				   bool record_checksum)
{
	struct bow_range *backup_br;
	struct bvec_iter backup_bi;
	sector_t log_source, log_dest;
	unsigned int log_size;
	u32 checksum = 0;
	int ret;
	int original_type;
	sector_t sector0;

	/* Find a free range */
	backup_br = find_free_range(bc);
	if (!backup_br)
		return -ENOSPC;

	/* Carve out a backup range. This may be smaller than the br given */
	backup_bi.bi_sector = backup_br->sector;
	backup_bi.bi_size = min(range_size(backup_br), (u64) bi_iter->bi_size);
	ret = split_range(bc, &backup_br, &backup_bi);
	if (ret)
		return ret;

	/*
	 * Carve out a changed range. This will not be smaller than the backup
	 * br since the backup br is smaller than the source range and iterator
	 */
	bi_iter->bi_size = backup_bi.bi_size;
	ret = split_range(bc, &br, bi_iter);
	if (ret)
		return ret;
	if (range_size(br) != range_size(backup_br)) {
		WARN_ON(1);
		return -EIO;
	}

	/* Copy data over */
	ret = copy_data(bc, br, backup_br, record_checksum ? &checksum : NULL);
	if (ret)
		return ret;

	/* Add an entry to the log */
	log_source = br->sector;
	log_dest = backup_br->sector;
	log_size = range_size(br);

	/*
	 * Set the types. Note that since set_type also amalgamates ranges
	 * we have to set both sectors to their final type before calling
	 * set_type on either
	 */
	original_type = br->type;
	sector0 = backup_br->sector;
	if (backup_br->type == TRIMMED)
		list_del(&backup_br->trimmed_list);
	backup_br->type = br->type == SECTOR0_CURRENT ? SECTOR0_CURRENT
						      : BACKUP;
	br->type = CHANGED;
	set_type(bc, &backup_br, backup_br->type);

	/*
	 * Add the log entry after marking the backup sector, since adding a log
	 * can cause another backup
	 */
	ret = add_log_entry(bc, log_source, log_dest, log_size, checksum);
	if (ret) {
		br->type = original_type;
		return ret;
	}

	/* Now it is safe to mark this backup successful */
	if (original_type == SECTOR0_CURRENT)
		bc->log_sector->sector0 = sector0;

	set_type(bc, &br, br->type);
	return ret;
}

static int prepare_free_range(struct bow_context *bc, struct bow_range *br,
			      struct bvec_iter *bi_iter)
{
	int ret;

	ret = split_range(bc, &br, bi_iter);
	if (ret)
		return ret;
	set_type(bc, &br, CHANGED);
	return 0;
}

static int prepare_changed_range(struct bow_context *bc, struct bow_range *br,
				 struct bvec_iter *bi_iter)
{
	/* Nothing to do ... */
	return 0;
}

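/*
 * Prepare one range overlapped by bi_iter for an incoming write,
 * dispatching on the range type; bi_iter is clipped to the portion handled.
 */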
static int prepare_one_range(struct bow_context *bc,
			     struct bvec_iter *bi_iter)
{
	struct bow_range *br = find_first_overlapping_range(&bc->ranges,
							    bi_iter);
	switch (br->type) {
	case CHANGED:
		return prepare_changed_range(bc, br, bi_iter);

	case TRIMMED:
		return prepare_free_range(bc, br, bi_iter);

	case UNCHANGED:
	case BACKUP:
		return prepare_unchanged_range(bc, br, bi_iter, true);

	/*
	 * We cannot track the checksum for the active sector0, since it
	 * may change at any point.
	 */
	case SECTOR0_CURRENT:
		return prepare_unchanged_range(bc, br, bi_iter, false);

	case SECTOR0:	/* Handled in the dm_bow_map */
	case TOP:	/* Illegal - top is off the end of the device */
	default:
		WARN_ON(1);
		return -EIO;
	}
}

struct write_work {
	struct work_struct work;
	struct bow_context *bc;
	struct bio *bio;
};

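/*
 * Workqueue handler for deferred writes: under ranges_lock, back up every
 * range the bio touches, then resubmit the bio to the backing device, or
 * fail it if a backup could not be made.
 */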
static void bow_write(struct work_struct *work)
{
	struct write_work *ww = container_of(work, struct write_work, work);
	struct bow_context *bc = ww->bc;
	struct bio *bio = ww->bio;
	struct bvec_iter bi_iter = bio->bi_iter;
	int ret = 0;

	kfree(ww);

	mutex_lock(&bc->ranges_lock);
	do {
		ret = prepare_one_range(bc, &bi_iter);
		bi_iter.bi_sector += bi_iter.bi_size / SECTOR_SIZE;
		bi_iter.bi_size = bio->bi_iter.bi_size
			- (bi_iter.bi_sector - bio->bi_iter.bi_sector)
			* SECTOR_SIZE;
	} while (!ret && bi_iter.bi_size);

	mutex_unlock(&bc->ranges_lock);

	if (!ret) {
		bio->bi_bdev = bc->dev->bdev;
		submit_bio(bio);
	} else {
		DMERR("Write failure with error %d", -ret);
		bio->bi_error = ret;
		bio_endio(bio);
	}
}

static int queue_write(struct bow_context *bc, struct bio *bio)
{
	struct write_work *ww = kmalloc(sizeof(*ww), GFP_NOIO | __GFP_NORETRY
					| __GFP_NOMEMALLOC | __GFP_NOWARN);
	if (!ww) {
		DMERR("Failed to allocate write_work");
		return -ENOMEM;
	}

	INIT_WORK(&ww->work, bow_write);
	ww->bc = bc;
	ww->bio = bio;
	queue_work(bc->workqueue, &ww->work);
	return DM_MAPIO_SUBMITTED;
}

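/*
 * Redirect I/O aimed at sector 0 to the SECTOR0_CURRENT range. If the bio
 * covers more than one block, the leading block is split off and submitted
 * separately so the remainder can be handled normally.
 */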
static int handle_sector0(struct bow_context *bc, struct bio *bio)
{
	int ret = DM_MAPIO_REMAPPED;

	if (bio->bi_iter.bi_size > bc->block_size) {
		struct bio *split = bio_split(bio,
					      bc->block_size >> SECTOR_SHIFT,
					      GFP_NOIO,
					      fs_bio_set);
		if (!split) {
			DMERR("Failed to split bio");
			bio->bi_error = -ENOMEM;
			bio_endio(bio);
			return DM_MAPIO_SUBMITTED;
		}

		bio_chain(split, bio);
		split->bi_iter.bi_sector = bc->log_sector->sector0;
		split->bi_bdev = bc->dev->bdev;
		submit_bio(split);

		if (bio_data_dir(bio) == WRITE)
			ret = queue_write(bc, bio);
	} else {
		bio->bi_iter.bi_sector = bc->log_sector->sector0;
	}

	return ret;
}

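/*
 * TRIM state: record a discard by marking the affected UNCHANGED ranges as
 * TRIMMED so they can later be used as backup space; the bio is completed
 * here rather than forwarded.
 */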
static int add_trim(struct bow_context *bc, struct bio *bio)
{
	struct bow_range *br;
	struct bvec_iter bi_iter = bio->bi_iter;

	DMDEBUG("add_trim: %llu, %u",
		(unsigned long long)bio->bi_iter.bi_sector,
		bio->bi_iter.bi_size);

	do {
		br = find_first_overlapping_range(&bc->ranges, &bi_iter);

		switch (br->type) {
		case UNCHANGED:
			if (!split_range(bc, &br, &bi_iter))
				set_type(bc, &br, TRIMMED);
			break;

		case TRIMMED:
			/* Nothing to do */
			break;

		default:
			/* No other case is legal in TRIM state */
			WARN_ON(true);
			break;
		}

		bi_iter.bi_sector += bi_iter.bi_size / SECTOR_SIZE;
		bi_iter.bi_size = bio->bi_iter.bi_size
			- (bi_iter.bi_sector - bio->bi_iter.bi_sector)
			* SECTOR_SIZE;

	} while (bi_iter.bi_size);

	bio_endio(bio);
	return DM_MAPIO_SUBMITTED;
}

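/*
 * TRIM state: a write to a TRIMMED range means it can no longer be used as
 * backup space, so mark it UNCHANGED again before remapping the bio.
 */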
static int remove_trim(struct bow_context *bc, struct bio *bio)
{
	struct bow_range *br;
	struct bvec_iter bi_iter = bio->bi_iter;

	DMDEBUG("remove_trim: %llu, %u",
		(unsigned long long)bio->bi_iter.bi_sector,
		bio->bi_iter.bi_size);

	do {
		br = find_first_overlapping_range(&bc->ranges, &bi_iter);

		switch (br->type) {
		case UNCHANGED:
			/* Nothing to do */
			break;

		case TRIMMED:
			if (!split_range(bc, &br, &bi_iter))
				set_type(bc, &br, UNCHANGED);
			break;

		default:
			/* No other case is legal in TRIM state */
			WARN_ON(true);
			break;
		}

		bi_iter.bi_sector += bi_iter.bi_size / SECTOR_SIZE;
		bi_iter.bi_size = bio->bi_iter.bi_size
			- (bi_iter.bi_sector - bio->bi_iter.bi_sector)
			* SECTOR_SIZE;

	} while (bi_iter.bi_size);

	return DM_MAPIO_REMAPPED;
}

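/*
 * Pass the bio through to the backing device, except that discards are
 * failed if the device does not support them natively.
 */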
int remap_unless_illegal_trim(struct bow_context *bc, struct bio *bio)
{
	if (!bc->forward_trims && bio_op(bio) == REQ_OP_DISCARD) {
		bio->bi_error = -EINVAL;
		bio_endio(bio);
		return DM_MAPIO_SUBMITTED;
	} else {
		bio->bi_bdev = bc->dev->bdev;
		return DM_MAPIO_REMAPPED;
	}
}

/****** dm interface ******/

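/*
 * Map function: once COMMITTED, and for reads away from sector 0, bios are
 * passed straight through. In TRIM state discards and writes update the
 * trimmed ranges; in CHECKPOINT state writes are deferred to the workqueue
 * so the original data can be backed up first.
 */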
static int dm_bow_map(struct dm_target *ti, struct bio *bio)
{
	int ret = DM_MAPIO_REMAPPED;
	struct bow_context *bc = ti->private;

	if (likely(bc->state.counter == COMMITTED))
		return remap_unless_illegal_trim(bc, bio);

	if (bio_data_dir(bio) == READ && bio->bi_iter.bi_sector != 0)
		return remap_unless_illegal_trim(bc, bio);

	if (atomic_read(&bc->state) != COMMITTED) {
		enum state state;

		mutex_lock(&bc->ranges_lock);
		state = atomic_read(&bc->state);
		if (state == TRIM) {
			if (bio_op(bio) == REQ_OP_DISCARD)
				ret = add_trim(bc, bio);
			else if (bio_data_dir(bio) == WRITE)
				ret = remove_trim(bc, bio);
			else
				/* pass-through */;
		} else if (state == CHECKPOINT) {
			if (bio->bi_iter.bi_sector == 0)
				ret = handle_sector0(bc, bio);
			else if (bio_data_dir(bio) == WRITE)
				ret = queue_write(bc, bio);
			else
				/* pass-through */;
		} else {
			/* pass-through */
		}
		mutex_unlock(&bc->ranges_lock);
	}

	if (ret == DM_MAPIO_REMAPPED)
		return remap_unless_illegal_trim(bc, bio);

	return ret;
}

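/*
 * Dump the range tree for the table status, with consistency checks on the
 * tree and the trimmed list.
 */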
static void dm_bow_tablestatus(struct dm_target *ti, char *result,
			       unsigned int maxlen)
{
	char *end = result + maxlen;
	struct bow_context *bc = ti->private;
	struct rb_node *i;
	int trimmed_list_length = 0;
	int trimmed_range_count = 0;
	struct bow_range *br;

	if (maxlen == 0)
		return;
	result[0] = 0;

	list_for_each_entry(br, &bc->trimmed_list, trimmed_list)
		if (br->type == TRIMMED) {
			++trimmed_list_length;
		} else {
			scnprintf(result, end - result,
				  "ERROR: non-trimmed entry in trimmed_list");
			return;
		}

	if (!rb_first(&bc->ranges)) {
		scnprintf(result, end - result, "ERROR: Empty ranges");
		return;
	}

	if (container_of(rb_first(&bc->ranges), struct bow_range, node)
	    ->sector) {
		scnprintf(result, end - result,
			  "ERROR: First range does not start at sector 0");
		return;
	}

	for (i = rb_first(&bc->ranges); i; i = rb_next(i)) {
		struct bow_range *br = container_of(i, struct bow_range, node);

		result += scnprintf(result, end - result, "%s: %llu",
				    readable_type[br->type],
				    (unsigned long long)br->sector);
		if (result >= end)
			return;

		result += scnprintf(result, end - result, "\n");
		if (result >= end)
			return;

		if (br->type == TRIMMED)
			++trimmed_range_count;

		if (br->type == TOP) {
			if (br->sector != ti->len) {
				scnprintf(result, end - result,
					  "\nERROR: Top sector is incorrect");
			}

			if (&br->node != rb_last(&bc->ranges)) {
				scnprintf(result, end - result,
					  "\nERROR: Top sector is not last");
			}

			break;
		}

		if (!rb_next(i)) {
			scnprintf(result, end - result,
				  "\nERROR: Last range not of type TOP");
			return;
		}

		if (br->sector > range_top(br)) {
			scnprintf(result, end - result,
				  "\nERROR: sectors out of order");
			return;
		}
	}

	if (trimmed_range_count != trimmed_list_length)
		scnprintf(result, end - result,
			  "\nERROR: not all trimmed ranges in trimmed list");
}

static void dm_bow_status(struct dm_target *ti, status_type_t type,
			  unsigned int status_flags, char *result,
			  unsigned int maxlen)
{
	switch (type) {
	case STATUSTYPE_INFO:
		if (maxlen)
			result[0] = 0;
		break;

	case STATUSTYPE_TABLE:
		dm_bow_tablestatus(ti, result, maxlen);
		break;
	}
}

int dm_bow_prepare_ioctl(struct dm_target *ti, struct block_device **bdev,
			 fmode_t *mode)
{
	struct bow_context *bc = ti->private;
	struct dm_dev *dev = bc->dev;

	*bdev = dev->bdev;
	/* Only pass ioctls through if the device sizes match exactly. */
	return ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
}

static int dm_bow_iterate_devices(struct dm_target *ti,
				  iterate_devices_callout_fn fn, void *data)
{
	struct bow_context *bc = ti->private;

	return fn(ti, bc->dev, 0, ti->len, data);
}

static struct target_type bow_target = {
	.name = "bow",
	.version = {1, 1, 1},
	.module = THIS_MODULE,
	.ctr = dm_bow_ctr,
	.dtr = dm_bow_dtr,
	.map = dm_bow_map,
	.status = dm_bow_status,
	.prepare_ioctl = dm_bow_prepare_ioctl,
	.iterate_devices = dm_bow_iterate_devices,
};

int __init dm_bow_init(void)
{
	int r = dm_register_target(&bow_target);

	if (r < 0)
		DMERR("registering bow failed %d", r);
	return r;
}

void dm_bow_exit(void)
{
	dm_unregister_target(&bow_target);
}

MODULE_LICENSE("GPL");

module_init(dm_bow_init);
module_exit(dm_bow_exit);