/*
 * Copyright (C) 2018 Google Limited.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-bufio.h"
#include "dm-core.h"

#include <linux/crc32.h>
#include <linux/module.h>

#define DM_MSG_PREFIX "bow"
#define SECTOR_SIZE 512

struct log_entry {
	u64 source;
	u64 dest;
	u32 size;
	u32 checksum;
} __packed;

struct log_sector {
	u32 magic;
	u16 header_version;
	u16 header_size;
	u32 block_size;
	u32 count;
	u32 sequence;
	sector_t sector0;
	struct log_entry entries[];
} __packed;

/*
 * MAGIC is BOW in ASCII
 */
#define MAGIC 0x00574f42
#define HEADER_VERSION 0x0100

/*
 * A sorted set of ranges representing the state of the data on the device.
 * Use an rb_tree for fast lookup of a given sector.
 * Consecutive ranges are always of different type - operations on this
 * set must merge matching consecutive ranges.
 *
 * Top range is always of type TOP.
 */
struct bow_range {
	struct rb_node node;
	sector_t sector;
	enum {
		INVALID,	/* Type not set */
		SECTOR0,	/* First sector - holds log record */
		SECTOR0_CURRENT,/* Live contents of sector0 */
		UNCHANGED,	/* Original contents */
		TRIMMED,	/* Range has been trimmed */
		CHANGED,	/* Range has been changed */
		BACKUP,		/* Range is being used as a backup */
		TOP,		/* Final range - sector is size of device */
	} type;
	struct list_head trimmed_list; /* list of TRIMMED ranges */
};

static const char * const readable_type[] = {
	"Invalid",
	"Sector0",
	"Sector0_current",
	"Unchanged",
	"Free",
	"Changed",
	"Backup",
	"Top",
};

enum state {
	TRIM,
	CHECKPOINT,
	COMMITTED,
};

struct bow_context {
	struct dm_dev *dev;
	u32 block_size;
	u32 block_shift;
	struct workqueue_struct *workqueue;
	struct dm_bufio_client *bufio;
	struct mutex ranges_lock; /* Hold to access this struct and/or ranges */
	struct rb_root ranges;
	struct dm_kobject_holder kobj_holder; /* for sysfs attributes */
	atomic_t state; /* One of the enum state values above */
	u64 trims_total;
	struct log_sector *log_sector;
	struct list_head trimmed_list;
	bool forward_trims;
};

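/*
 * Returns the first sector after br, i.e. the start sector of the next
 * range in the tree. Relies on the TOP sentinel range, so every real
 * range has a successor.
 */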
sector_t range_top(struct bow_range *br)
{
	return container_of(rb_next(&br->node), struct bow_range, node)
		->sector;
}

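/* Size of the range in bytes */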
u64 range_size(struct bow_range *br)
{
	return (range_top(br) - br->sector) * SECTOR_SIZE;
}

static sector_t bvec_top(struct bvec_iter *bi_iter)
{
	return bi_iter->bi_sector + bi_iter->bi_size / SECTOR_SIZE;
}

/*
 * Find the first range that overlaps with bi_iter.
 * bi_iter is set to the size of the overlapping sub-range.
 */
static struct bow_range *find_first_overlapping_range(struct rb_root *ranges,
						      struct bvec_iter *bi_iter)
{
	struct rb_node *node = ranges->rb_node;
	struct bow_range *br;

	while (node) {
		br = container_of(node, struct bow_range, node);

		if (br->sector <= bi_iter->bi_sector
		    && bi_iter->bi_sector < range_top(br))
			break;

		if (bi_iter->bi_sector < br->sector)
			node = node->rb_left;
		else
			node = node->rb_right;
	}

	WARN_ON(!node);
	if (!node)
		return NULL;

	if (range_top(br) - bi_iter->bi_sector
	    < bi_iter->bi_size >> SECTOR_SHIFT)
		bi_iter->bi_size = (range_top(br) - bi_iter->bi_sector)
			<< SECTOR_SHIFT;

	return br;
}

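/*
 * Insert new_br into the tree immediately before existing, i.e. as the
 * rightmost node of existing's left subtree.
 */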
void add_before(struct rb_root *ranges, struct bow_range *new_br,
		struct bow_range *existing)
{
	struct rb_node *parent = &(existing->node);
	struct rb_node **link = &(parent->rb_left);

	while (*link) {
		parent = *link;
		link = &((*link)->rb_right);
	}

	rb_link_node(&new_br->node, parent, link);
	rb_insert_color(&new_br->node, ranges);
}

/*
 * Given a range br returned by find_first_overlapping_range, split br into a
 * leading range, a range matching the bi_iter and a trailing range.
 * Leading and trailing may end up size 0 and will then be deleted. The
 * new range matching the bi_iter is then returned and should have its type
 * and type specific fields populated.
 * If bi_iter runs off the end of the range, bi_iter is truncated accordingly.
 */
static int split_range(struct bow_context *bc, struct bow_range **br,
		       struct bvec_iter *bi_iter)
{
	struct bow_range *new_br;

	if (bi_iter->bi_sector < (*br)->sector) {
		WARN_ON(true);
		return -EIO;
	}

	if (bi_iter->bi_sector > (*br)->sector) {
		struct bow_range *leading_br =
			kzalloc(sizeof(*leading_br), GFP_KERNEL);

		if (!leading_br)
			return -ENOMEM;

		*leading_br = **br;
		if (leading_br->type == TRIMMED)
			list_add(&leading_br->trimmed_list, &bc->trimmed_list);

		add_before(&bc->ranges, leading_br, *br);
		(*br)->sector = bi_iter->bi_sector;
	}

	if (bvec_top(bi_iter) >= range_top(*br)) {
		bi_iter->bi_size = (range_top(*br) - (*br)->sector)
			* SECTOR_SIZE;
		return 0;
	}

	/* new_br will be the beginning, existing br will be the tail */
	new_br = kzalloc(sizeof(*new_br), GFP_KERNEL);
	if (!new_br)
		return -ENOMEM;

	new_br->sector = (*br)->sector;
	(*br)->sector = bvec_top(bi_iter);
	add_before(&bc->ranges, new_br, *br);
	*br = new_br;

	return 0;
}

/*
 * Sets type of a range. May merge range into surrounding ranges.
 * Since br may be invalidated, always sets br to NULL to prevent
 * usage after this is called.
 */
static void set_type(struct bow_context *bc, struct bow_range **br, int type)
{
	struct bow_range *prev = container_of(rb_prev(&(*br)->node),
					      struct bow_range, node);
	struct bow_range *next = container_of(rb_next(&(*br)->node),
					      struct bow_range, node);

	if ((*br)->type == TRIMMED) {
		bc->trims_total -= range_size(*br);
		list_del(&(*br)->trimmed_list);
	}

	if (type == TRIMMED) {
		bc->trims_total += range_size(*br);
		list_add(&(*br)->trimmed_list, &bc->trimmed_list);
	}

	(*br)->type = type;

	if (next->type == type) {
		if (type == TRIMMED)
			list_del(&next->trimmed_list);
		rb_erase(&next->node, &bc->ranges);
		kfree(next);
	}

	if (prev->type == type) {
		if (type == TRIMMED)
			list_del(&(*br)->trimmed_list);
		rb_erase(&(*br)->node, &bc->ranges);
		kfree(*br);
	}

	*br = NULL;
}

static struct bow_range *find_free_range(struct bow_context *bc)
{
	if (list_empty(&bc->trimmed_list)) {
		DMERR("Unable to find free space to back up to");
		return NULL;
	}

	return list_first_entry(&bc->trimmed_list, struct bow_range,
				trimmed_list);
}

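/*
 * Convert a sector number to a dm-bufio block index. The sector is
 * expected to be aligned to the block size.
 */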
static sector_t sector_to_page(struct bow_context const *bc, sector_t sector)
{
	WARN_ON((sector & (((sector_t)1 << (bc->block_shift - SECTOR_SHIFT)) - 1))
		!= 0);
	return sector >> (bc->block_shift - SECTOR_SHIFT);
}

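/*
 * Copy the contents of source to dest block by block through dm-bufio.
 * If checksum is non-NULL, a CRC32 of the copied data (seeded with the
 * source page index) is returned through it. Dirty buffers are flushed
 * before returning.
 */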
static int copy_data(struct bow_context const *bc,
		     struct bow_range *source, struct bow_range *dest,
		     u32 *checksum)
{
	int i;

	if (range_size(source) != range_size(dest)) {
		WARN_ON(1);
		return -EIO;
	}

	if (checksum)
		*checksum = sector_to_page(bc, source->sector);

	for (i = 0; i < range_size(source) >> bc->block_shift; ++i) {
		struct dm_buffer *read_buffer, *write_buffer;
		u8 *read, *write;
		sector_t page = sector_to_page(bc, source->sector) + i;

		read = dm_bufio_read(bc->bufio, page, &read_buffer);
		if (IS_ERR(read)) {
			DMERR("Cannot read page %llu",
			      (unsigned long long)page);
			return PTR_ERR(read);
		}

		if (checksum)
			*checksum = crc32(*checksum, read, bc->block_size);

		write = dm_bufio_new(bc->bufio,
				     sector_to_page(bc, dest->sector) + i,
				     &write_buffer);
		if (IS_ERR(write)) {
			DMERR("Cannot write sector");
			dm_bufio_release(read_buffer);
			return PTR_ERR(write);
		}

		memcpy(write, read, bc->block_size);

		dm_bufio_mark_buffer_dirty(write_buffer);
		dm_bufio_release(write_buffer);
		dm_bufio_release(read_buffer);
	}

	dm_bufio_write_dirty_buffers(bc->bufio);
	return 0;
}

/****** logging functions ******/

static int add_log_entry(struct bow_context *bc, sector_t source, sector_t dest,
			 unsigned int size, u32 checksum);

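/*
 * Called when the log sector is full: copy the current contents of block 0
 * to a free range, reset the in-memory log, and record the copy as the
 * first entry of the next log sector.
 */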
static int backup_log_sector(struct bow_context *bc)
{
	struct bow_range *first_br, *free_br;
	struct bvec_iter bi_iter;
	u32 checksum = 0;
	int ret;

	first_br = container_of(rb_first(&bc->ranges), struct bow_range, node);

	if (first_br->type != SECTOR0) {
		WARN_ON(1);
		return -EIO;
	}

	if (range_size(first_br) != bc->block_size) {
		WARN_ON(1);
		return -EIO;
	}

	free_br = find_free_range(bc);
	/* No space left - return this error to userspace */
	if (!free_br)
		return -ENOSPC;
	bi_iter.bi_sector = free_br->sector;
	bi_iter.bi_size = bc->block_size;
	ret = split_range(bc, &free_br, &bi_iter);
	if (ret)
		return ret;
	if (bi_iter.bi_size != bc->block_size) {
		WARN_ON(1);
		return -EIO;
	}

	ret = copy_data(bc, first_br, free_br, &checksum);
	if (ret)
		return ret;

	bc->log_sector->count = 0;
	bc->log_sector->sequence++;
	ret = add_log_entry(bc, first_br->sector, free_br->sector,
			    range_size(first_br), checksum);
	if (ret)
		return ret;

	set_type(bc, &free_br, BACKUP);
	return 0;
}

static int add_log_entry(struct bow_context *bc, sector_t source, sector_t dest,
			 unsigned int size, u32 checksum)
{
	struct dm_buffer *sector_buffer;
	u8 *sector;

	if (sizeof(struct log_sector)
	    + sizeof(struct log_entry) * (bc->log_sector->count + 1)
	    > bc->block_size) {
		int ret = backup_log_sector(bc);

		if (ret)
			return ret;
	}

	sector = dm_bufio_new(bc->bufio, 0, &sector_buffer);
	if (IS_ERR(sector)) {
		DMERR("Cannot write boot sector");
		dm_bufio_release(sector_buffer);
		return -ENOSPC;
	}

	bc->log_sector->entries[bc->log_sector->count].source = source;
	bc->log_sector->entries[bc->log_sector->count].dest = dest;
	bc->log_sector->entries[bc->log_sector->count].size = size;
	bc->log_sector->entries[bc->log_sector->count].checksum = checksum;
	bc->log_sector->count++;

	memcpy(sector, bc->log_sector, bc->block_size);
	dm_bufio_mark_buffer_dirty(sector_buffer);
	dm_bufio_release(sector_buffer);
	dm_bufio_write_dirty_buffers(bc->bufio);
	return 0;
}

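/*
 * Set up the on-disk log when entering the checkpoint state: carve out
 * block 0 as the log sector, redirect live sector-0 I/O to a free block
 * (SECTOR0_CURRENT), back up the original block 0, and write the first
 * log entry.
 */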
static int prepare_log(struct bow_context *bc)
{
	struct bow_range *free_br, *first_br;
	struct bvec_iter bi_iter;
	u32 checksum = 0;
	int ret;

	/* Carve out first sector as log sector */
	first_br = container_of(rb_first(&bc->ranges), struct bow_range, node);
	if (first_br->type != UNCHANGED) {
		WARN_ON(1);
		return -EIO;
	}

	if (range_size(first_br) < bc->block_size) {
		WARN_ON(1);
		return -EIO;
	}
	bi_iter.bi_sector = 0;
	bi_iter.bi_size = bc->block_size;
	ret = split_range(bc, &first_br, &bi_iter);
	if (ret)
		return ret;
	first_br->type = SECTOR0;
	if (range_size(first_br) != bc->block_size) {
		WARN_ON(1);
		return -EIO;
	}

	/* Find free sector for active sector0 reads/writes */
	free_br = find_free_range(bc);
	if (!free_br)
		return -ENOSPC;
	bi_iter.bi_sector = free_br->sector;
	bi_iter.bi_size = bc->block_size;
	ret = split_range(bc, &free_br, &bi_iter);
	if (ret)
		return ret;
	free_br->type = SECTOR0_CURRENT;

	/* Copy data */
	ret = copy_data(bc, first_br, free_br, NULL);
	if (ret)
		return ret;

	bc->log_sector->sector0 = free_br->sector;

	/* Find free sector to back up original sector zero */
	free_br = find_free_range(bc);
	if (!free_br)
		return -ENOSPC;
	bi_iter.bi_sector = free_br->sector;
	bi_iter.bi_size = bc->block_size;
	ret = split_range(bc, &free_br, &bi_iter);
	if (ret)
		return ret;

	/* Back up */
	ret = copy_data(bc, first_br, free_br, &checksum);
	if (ret)
		return ret;

	/*
	 * Set up our replacement boot sector - it will get written when we
	 * add the first log entry, which we do immediately
	 */
	bc->log_sector->magic = MAGIC;
	bc->log_sector->header_version = HEADER_VERSION;
	bc->log_sector->header_size = sizeof(*bc->log_sector);
	bc->log_sector->block_size = bc->block_size;
	bc->log_sector->count = 0;
	bc->log_sector->sequence = 0;

	/* Add log entry */
	ret = add_log_entry(bc, first_br->sector, free_br->sector,
			    range_size(first_br), checksum);
	if (ret)
		return ret;

	set_type(bc, &free_br, BACKUP);
	return 0;
}

static struct bow_range *find_sector0_current(struct bow_context *bc)
{
	struct bvec_iter bi_iter;

	bi_iter.bi_sector = bc->log_sector->sector0;
	bi_iter.bi_size = bc->block_size;
	return find_first_overlapping_range(&bc->ranges, &bi_iter);
}

/****** sysfs interface functions ******/

static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
			  char *buf)
{
	struct bow_context *bc = container_of(kobj, struct bow_context,
					      kobj_holder.kobj);

	return scnprintf(buf, PAGE_SIZE, "%d\n", atomic_read(&bc->state));
}

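/*
 * Writing "1" moves TRIM -> CHECKPOINT (sets up the log); writing "2" moves
 * CHECKPOINT -> COMMITTED (copies the live sector 0 back over the log
 * sector). Only single forward transitions are allowed.
 */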
static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
			   const char *buf, size_t count)
{
	struct bow_context *bc = container_of(kobj, struct bow_context,
					      kobj_holder.kobj);
	enum state state, original_state;
	int ret;

	state = buf[0] - '0';
	if (state < TRIM || state > COMMITTED) {
		DMERR("State value %d out of range", state);
		return -EINVAL;
	}

	mutex_lock(&bc->ranges_lock);
	original_state = atomic_read(&bc->state);
	if (state != original_state + 1) {
		DMERR("Invalid state change from %d to %d",
		      original_state, state);
		ret = -EINVAL;
		goto bad;
	}

	DMINFO("Switching to state %s", state == CHECKPOINT ? "Checkpoint"
	       : state == COMMITTED ? "Committed" : "Unknown");

	if (state == CHECKPOINT) {
		ret = prepare_log(bc);
		if (ret) {
			DMERR("Failed to switch to checkpoint state");
			goto bad;
		}
	} else if (state == COMMITTED) {
		struct bow_range *br = find_sector0_current(bc);
		struct bow_range *sector0_br =
			container_of(rb_first(&bc->ranges), struct bow_range,
				     node);

		ret = copy_data(bc, br, sector0_br, 0);
		if (ret) {
			DMERR("Failed to switch to committed state");
			goto bad;
		}
	}
	atomic_inc(&bc->state);
	ret = count;

bad:
	mutex_unlock(&bc->ranges_lock);
	return ret;
}

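/*
 * Reports the number of bytes currently available for backups, i.e. the
 * total size of all TRIMMED ranges.
 */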
static ssize_t free_show(struct kobject *kobj, struct kobj_attribute *attr,
			 char *buf)
{
	struct bow_context *bc = container_of(kobj, struct bow_context,
					      kobj_holder.kobj);
	u64 trims_total;

	mutex_lock(&bc->ranges_lock);
	trims_total = bc->trims_total;
	mutex_unlock(&bc->ranges_lock);

	return scnprintf(buf, PAGE_SIZE, "%llu\n", trims_total);
}

static struct kobj_attribute attr_state = __ATTR_RW(state);
static struct kobj_attribute attr_free = __ATTR_RO(free);

static struct attribute *bow_attrs[] = {
	&attr_state.attr,
	&attr_free.attr,
	NULL
};

static struct kobj_type bow_ktype = {
	.sysfs_ops = &kobj_sysfs_ops,
	.default_attrs = bow_attrs,
	.release = dm_kobject_release
};

/****** constructor/destructor ******/

static void dm_bow_dtr(struct dm_target *ti)
{
	struct bow_context *bc = (struct bow_context *) ti->private;
	struct kobject *kobj;

	while (rb_first(&bc->ranges)) {
		struct bow_range *br = container_of(rb_first(&bc->ranges),
						    struct bow_range, node);

		rb_erase(&br->node, &bc->ranges);
		kfree(br);
	}
	if (bc->workqueue)
		destroy_workqueue(bc->workqueue);
	if (bc->bufio)
		dm_bufio_client_destroy(bc->bufio);

	kobj = &bc->kobj_holder.kobj;
	if (kobj->state_initialized) {
		kobject_put(kobj);
		wait_for_completion(dm_get_completion_from_kobject(kobj));
	}

	kfree(bc->log_sector);
	kfree(bc);
}

static int dm_bow_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct bow_context *bc;
	struct bow_range *br;
	int ret;
	struct mapped_device *md = dm_table_get_md(ti->table);

	if (argc != 1) {
		ti->error = "Invalid argument count";
		return -EINVAL;
	}

	bc = kzalloc(sizeof(*bc), GFP_KERNEL);
	if (!bc) {
		ti->error = "Cannot allocate bow context";
		return -ENOMEM;
	}

	ti->num_flush_bios = 1;
	ti->num_discard_bios = 1;
	ti->num_write_same_bios = 1;
	ti->private = bc;

	ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
			    &bc->dev);
	if (ret) {
		ti->error = "Device lookup failed";
		goto bad;
	}

	if (bc->dev->bdev->bd_queue->limits.max_discard_sectors == 0) {
		bc->dev->bdev->bd_queue->limits.discard_granularity = 1 << 12;
		bc->dev->bdev->bd_queue->limits.max_hw_discard_sectors = 1 << 15;
		bc->dev->bdev->bd_queue->limits.max_discard_sectors = 1 << 15;
		bc->forward_trims = false;
	} else {
		bc->forward_trims = true;
	}

	bc->block_size = bc->dev->bdev->bd_queue->limits.logical_block_size;
	bc->block_shift = ilog2(bc->block_size);
	bc->log_sector = kzalloc(bc->block_size, GFP_KERNEL);
	if (!bc->log_sector) {
		ti->error = "Cannot allocate log sector";
		ret = -ENOMEM;
		goto bad;
	}

	init_completion(&bc->kobj_holder.completion);
	ret = kobject_init_and_add(&bc->kobj_holder.kobj, &bow_ktype,
				   &disk_to_dev(dm_disk(md))->kobj, "%s",
				   "bow");
	if (ret) {
		ti->error = "Cannot create sysfs node";
		goto bad;
	}

	mutex_init(&bc->ranges_lock);
	bc->ranges = RB_ROOT;
	bc->bufio = dm_bufio_client_create(bc->dev->bdev, bc->block_size, 1, 0,
					   NULL, NULL);
	if (IS_ERR(bc->bufio)) {
		ti->error = "Cannot initialize dm-bufio";
		ret = PTR_ERR(bc->bufio);
		bc->bufio = NULL;
		goto bad;
	}

	bc->workqueue = alloc_workqueue("dm-bow",
					WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM
					| WQ_UNBOUND, num_online_cpus());
	if (!bc->workqueue) {
		ti->error = "Cannot allocate workqueue";
		ret = -ENOMEM;
		goto bad;
	}

	INIT_LIST_HEAD(&bc->trimmed_list);

	br = kzalloc(sizeof(*br), GFP_KERNEL);
	if (!br) {
		ti->error = "Cannot allocate ranges";
		ret = -ENOMEM;
		goto bad;
	}

	br->sector = ti->len;
	br->type = TOP;
	rb_link_node(&br->node, NULL, &bc->ranges.rb_node);
	rb_insert_color(&br->node, &bc->ranges);

	br = kzalloc(sizeof(*br), GFP_KERNEL);
	if (!br) {
		ti->error = "Cannot allocate ranges";
		ret = -ENOMEM;
		goto bad;
	}

	br->sector = 0;
	br->type = UNCHANGED;
	rb_link_node(&br->node, bc->ranges.rb_node,
		     &bc->ranges.rb_node->rb_left);
	rb_insert_color(&br->node, &bc->ranges);

	ti->discards_supported = true;

	return 0;

bad:
	dm_bow_dtr(ti);
	return ret;
}

/****** Handle writes ******/

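/*
 * Back up a range that still holds original data before it is overwritten:
 * copy (part of) it to a free TRIMMED range, log the copy, then mark the
 * source CHANGED and the copy BACKUP (or SECTOR0_CURRENT when relocating
 * the live sector 0).
 */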
static int prepare_unchanged_range(struct bow_context *bc, struct bow_range *br,
				   struct bvec_iter *bi_iter,
				   bool record_checksum)
{
	struct bow_range *backup_br;
	struct bvec_iter backup_bi;
	sector_t log_source, log_dest;
	unsigned int log_size;
	u32 checksum = 0;
	int ret;
	int original_type;
	sector_t sector0;

	/* Find a free range */
	backup_br = find_free_range(bc);
	if (!backup_br)
		return -ENOSPC;

	/* Carve out a backup range. This may be smaller than the br given */
	backup_bi.bi_sector = backup_br->sector;
	backup_bi.bi_size = min(range_size(backup_br), (u64) bi_iter->bi_size);
	ret = split_range(bc, &backup_br, &backup_bi);
	if (ret)
		return ret;

	/*
	 * Carve out a changed range. This will not be smaller than the backup
	 * br since the backup br is smaller than the source range and iterator
	 */
	bi_iter->bi_size = backup_bi.bi_size;
	ret = split_range(bc, &br, bi_iter);
	if (ret)
		return ret;
	if (range_size(br) != range_size(backup_br)) {
		WARN_ON(1);
		return -EIO;
	}

	/* Copy data over */
	ret = copy_data(bc, br, backup_br, record_checksum ? &checksum : NULL);
	if (ret)
		return ret;

	/* Add an entry to the log */
	log_source = br->sector;
	log_dest = backup_br->sector;
	log_size = range_size(br);

	/*
	 * Set the types. Note that since set_type also amalgamates ranges
	 * we have to set both sectors to their final type before calling
	 * set_type on either
	 */
	original_type = br->type;
	sector0 = backup_br->sector;
	bc->trims_total -= range_size(backup_br);
	if (backup_br->type == TRIMMED)
		list_del(&backup_br->trimmed_list);
	backup_br->type = br->type == SECTOR0_CURRENT ? SECTOR0_CURRENT
						      : BACKUP;
	br->type = CHANGED;
	set_type(bc, &backup_br, backup_br->type);

	/*
	 * Add the log entry after marking the backup sector, since adding a log
	 * can cause another backup
	 */
	ret = add_log_entry(bc, log_source, log_dest, log_size, checksum);
	if (ret) {
		br->type = original_type;
		return ret;
	}

	/* Now it is safe to mark this backup successful */
	if (original_type == SECTOR0_CURRENT)
		bc->log_sector->sector0 = sector0;

	set_type(bc, &br, br->type);
	return ret;
}

static int prepare_free_range(struct bow_context *bc, struct bow_range *br,
			      struct bvec_iter *bi_iter)
{
	int ret;

	ret = split_range(bc, &br, bi_iter);
	if (ret)
		return ret;
	set_type(bc, &br, CHANGED);
	return 0;
}

static int prepare_changed_range(struct bow_context *bc, struct bow_range *br,
				 struct bvec_iter *bi_iter)
{
	/* Nothing to do ... */
	return 0;
}

static int prepare_one_range(struct bow_context *bc,
			     struct bvec_iter *bi_iter)
{
	struct bow_range *br = find_first_overlapping_range(&bc->ranges,
							    bi_iter);
	switch (br->type) {
	case CHANGED:
		return prepare_changed_range(bc, br, bi_iter);

	case TRIMMED:
		return prepare_free_range(bc, br, bi_iter);

	case UNCHANGED:
	case BACKUP:
		return prepare_unchanged_range(bc, br, bi_iter, true);

	/*
	 * We cannot track the checksum for the active sector0, since it
	 * may change at any point.
	 */
	case SECTOR0_CURRENT:
		return prepare_unchanged_range(bc, br, bi_iter, false);

	case SECTOR0:	/* Handled in the dm_bow_map */
	case TOP:	/* Illegal - top is off the end of the device */
	default:
		WARN_ON(1);
		return -EIO;
	}
}

struct write_work {
	struct work_struct work;
	struct bow_context *bc;
	struct bio *bio;
};

static void bow_write(struct work_struct *work)
{
	struct write_work *ww = container_of(work, struct write_work, work);
	struct bow_context *bc = ww->bc;
	struct bio *bio = ww->bio;
	struct bvec_iter bi_iter = bio->bi_iter;
	int ret = 0;

	kfree(ww);

	mutex_lock(&bc->ranges_lock);
	do {
		ret = prepare_one_range(bc, &bi_iter);
		bi_iter.bi_sector += bi_iter.bi_size / SECTOR_SIZE;
		bi_iter.bi_size = bio->bi_iter.bi_size
			- (bi_iter.bi_sector - bio->bi_iter.bi_sector)
			* SECTOR_SIZE;
	} while (!ret && bi_iter.bi_size);

	mutex_unlock(&bc->ranges_lock);

	if (!ret) {
		bio->bi_bdev = bc->dev->bdev;
		submit_bio(bio);
	} else {
		DMERR("Write failure with error %d", -ret);
		bio->bi_error = ret;
		bio_endio(bio);
	}
}

static int queue_write(struct bow_context *bc, struct bio *bio)
{
	struct write_work *ww = kmalloc(sizeof(*ww), GFP_NOIO | __GFP_NORETRY
					| __GFP_NOMEMALLOC | __GFP_NOWARN);
	if (!ww) {
		DMERR("Failed to allocate write_work");
		return -ENOMEM;
	}

	INIT_WORK(&ww->work, bow_write);
	ww->bc = bc;
	ww->bio = bio;
	queue_work(bc->workqueue, &ww->work);
	return DM_MAPIO_SUBMITTED;
}

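/*
 * I/O to sector 0 is redirected to the SECTOR0_CURRENT block. Bios larger
 * than one block are split so that only the first block is remapped; a
 * write's remainder is then queued like any other write.
 */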
static int handle_sector0(struct bow_context *bc, struct bio *bio)
{
	int ret = DM_MAPIO_REMAPPED;

	if (bio->bi_iter.bi_size > bc->block_size) {
		struct bio *split = bio_split(bio,
					      bc->block_size >> SECTOR_SHIFT,
					      GFP_NOIO,
					      fs_bio_set);
		if (!split) {
			DMERR("Failed to split bio");
			bio->bi_error = -ENOMEM;
			bio_endio(bio);
			return DM_MAPIO_SUBMITTED;
		}

		bio_chain(split, bio);
		split->bi_iter.bi_sector = bc->log_sector->sector0;
		split->bi_bdev = bc->dev->bdev;
		submit_bio(split);

		if (bio_data_dir(bio) == WRITE)
			ret = queue_write(bc, bio);
	} else {
		bio->bi_iter.bi_sector = bc->log_sector->sector0;
	}

	return ret;
}

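/*
 * In TRIM state a discard marks the covered ranges TRIMMED so they can
 * later be used as backup space. The bio is completed here rather than
 * being forwarded to the underlying device.
 */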
static int add_trim(struct bow_context *bc, struct bio *bio)
{
	struct bow_range *br;
	struct bvec_iter bi_iter = bio->bi_iter;

	DMDEBUG("add_trim: %llu, %u",
		(unsigned long long)bio->bi_iter.bi_sector,
		bio->bi_iter.bi_size);

	do {
		br = find_first_overlapping_range(&bc->ranges, &bi_iter);

		switch (br->type) {
		case UNCHANGED:
			if (!split_range(bc, &br, &bi_iter))
				set_type(bc, &br, TRIMMED);
			break;

		case TRIMMED:
			/* Nothing to do */
			break;

		default:
			/* No other case is legal in TRIM state */
			WARN_ON(true);
			break;
		}

		bi_iter.bi_sector += bi_iter.bi_size / SECTOR_SIZE;
		bi_iter.bi_size = bio->bi_iter.bi_size
			- (bi_iter.bi_sector - bio->bi_iter.bi_sector)
			* SECTOR_SIZE;

	} while (bi_iter.bi_size);

	bio_endio(bio);
	return DM_MAPIO_SUBMITTED;
}

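/*
 * In TRIM state a write over a TRIMMED range returns it to UNCHANGED, so
 * that it will be backed up before being overwritten once checkpointing
 * starts. The bio itself is then remapped to the underlying device.
 */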
static int remove_trim(struct bow_context *bc, struct bio *bio)
{
	struct bow_range *br;
	struct bvec_iter bi_iter = bio->bi_iter;

	DMDEBUG("remove_trim: %llu, %u",
		(unsigned long long)bio->bi_iter.bi_sector,
		bio->bi_iter.bi_size);

	do {
		br = find_first_overlapping_range(&bc->ranges, &bi_iter);

		switch (br->type) {
		case UNCHANGED:
			/* Nothing to do */
			break;

		case TRIMMED:
			if (!split_range(bc, &br, &bi_iter))
				set_type(bc, &br, UNCHANGED);
			break;

		default:
			/* No other case is legal in TRIM state */
			WARN_ON(true);
			break;
		}

		bi_iter.bi_sector += bi_iter.bi_size / SECTOR_SIZE;
		bi_iter.bi_size = bio->bi_iter.bi_size
			- (bi_iter.bi_sector - bio->bi_iter.bi_sector)
			* SECTOR_SIZE;

	} while (bi_iter.bi_size);

	return DM_MAPIO_REMAPPED;
}

int remap_unless_illegal_trim(struct bow_context *bc, struct bio *bio)
{
	if (!bc->forward_trims && bio_op(bio) == REQ_OP_DISCARD) {
		bio->bi_error = -EINVAL;
		bio_endio(bio);
		return DM_MAPIO_SUBMITTED;
	} else {
		bio->bi_bdev = bc->dev->bdev;
		return DM_MAPIO_REMAPPED;
	}
}

/****** dm interface ******/

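/*
 * Map function. In COMMITTED state, and for reads away from sector 0, bios
 * pass straight through. In TRIM state discards and writes adjust the
 * trimmed ranges. In CHECKPOINT state sector 0 is redirected and writes are
 * queued to the workqueue so the original data can be backed up first.
 */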
static int dm_bow_map(struct dm_target *ti, struct bio *bio)
{
	int ret = DM_MAPIO_REMAPPED;
	struct bow_context *bc = ti->private;

	if (likely(bc->state.counter == COMMITTED))
		return remap_unless_illegal_trim(bc, bio);

	if (bio_data_dir(bio) == READ && bio->bi_iter.bi_sector != 0)
		return remap_unless_illegal_trim(bc, bio);

	if (atomic_read(&bc->state) != COMMITTED) {
		enum state state;

		mutex_lock(&bc->ranges_lock);
		state = atomic_read(&bc->state);
		if (state == TRIM) {
			if (bio_op(bio) == REQ_OP_DISCARD)
				ret = add_trim(bc, bio);
			else if (bio_data_dir(bio) == WRITE)
				ret = remove_trim(bc, bio);
			else
				/* pass-through */;
		} else if (state == CHECKPOINT) {
			if (bio->bi_iter.bi_sector == 0)
				ret = handle_sector0(bc, bio);
			else if (bio_data_dir(bio) == WRITE)
				ret = queue_write(bc, bio);
			else
				/* pass-through */;
		} else {
			/* pass-through */
		}
		mutex_unlock(&bc->ranges_lock);
	}

	if (ret == DM_MAPIO_REMAPPED)
		return remap_unless_illegal_trim(bc, bio);

	return ret;
}

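/*
 * Table status doubles as a consistency check: it walks the range tree and
 * the trimmed list and reports any structural errors it finds.
 */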
static void dm_bow_tablestatus(struct dm_target *ti, char *result,
			       unsigned int maxlen)
{
	char *end = result + maxlen;
	struct bow_context *bc = ti->private;
	struct rb_node *i;
	int trimmed_list_length = 0;
	int trimmed_range_count = 0;
	struct bow_range *br;

	if (maxlen == 0)
		return;
	result[0] = 0;

	list_for_each_entry(br, &bc->trimmed_list, trimmed_list)
		if (br->type == TRIMMED) {
			++trimmed_list_length;
		} else {
			scnprintf(result, end - result,
				  "ERROR: non-trimmed entry in trimmed_list");
			return;
		}

	if (!rb_first(&bc->ranges)) {
		scnprintf(result, end - result, "ERROR: Empty ranges");
		return;
	}

	if (container_of(rb_first(&bc->ranges), struct bow_range, node)
	    ->sector) {
		scnprintf(result, end - result,
			  "ERROR: First range does not start at sector 0");
		return;
	}

	for (i = rb_first(&bc->ranges); i; i = rb_next(i)) {
		struct bow_range *br = container_of(i, struct bow_range, node);

		result += scnprintf(result, end - result, "%s: %llu",
				    readable_type[br->type],
				    (unsigned long long)br->sector);
		if (result >= end)
			return;

		result += scnprintf(result, end - result, "\n");
		if (result >= end)
			return;

		if (br->type == TRIMMED)
			++trimmed_range_count;

		if (br->type == TOP) {
			if (br->sector != ti->len) {
				scnprintf(result, end - result,
					  "\nERROR: Top sector is incorrect");
			}

			if (&br->node != rb_last(&bc->ranges)) {
				scnprintf(result, end - result,
					  "\nERROR: Top sector is not last");
			}

			break;
		}

		if (!rb_next(i)) {
			scnprintf(result, end - result,
				  "\nERROR: Last range not of type TOP");
			return;
		}

		if (br->sector > range_top(br)) {
			scnprintf(result, end - result,
				  "\nERROR: sectors out of order");
			return;
		}
	}

	if (trimmed_range_count != trimmed_list_length)
		scnprintf(result, end - result,
			  "\nERROR: not all trimmed ranges in trimmed list");
}

static void dm_bow_status(struct dm_target *ti, status_type_t type,
			  unsigned int status_flags, char *result,
			  unsigned int maxlen)
{
	switch (type) {
	case STATUSTYPE_INFO:
		if (maxlen)
			result[0] = 0;
		break;

	case STATUSTYPE_TABLE:
		dm_bow_tablestatus(ti, result, maxlen);
		break;
	}
}

int dm_bow_prepare_ioctl(struct dm_target *ti, struct block_device **bdev,
			 fmode_t *mode)
{
	struct bow_context *bc = ti->private;
	struct dm_dev *dev = bc->dev;

	*bdev = dev->bdev;
	/* Only pass ioctls through if the device sizes match exactly. */
	return ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
}

static int dm_bow_iterate_devices(struct dm_target *ti,
				  iterate_devices_callout_fn fn, void *data)
{
	struct bow_context *bc = ti->private;

	return fn(ti, bc->dev, 0, ti->len, data);
}

static struct target_type bow_target = {
	.name = "bow",
	.version = {1, 1, 1},
	.module = THIS_MODULE,
	.ctr = dm_bow_ctr,
	.dtr = dm_bow_dtr,
	.map = dm_bow_map,
	.status = dm_bow_status,
	.prepare_ioctl = dm_bow_prepare_ioctl,
	.iterate_devices = dm_bow_iterate_devices,
};

int __init dm_bow_init(void)
{
	int r = dm_register_target(&bow_target);

	if (r < 0)
		DMERR("registering bow failed %d", r);
	return r;
}

void dm_bow_exit(void)
{
	dm_unregister_target(&bow_target);
}

MODULE_LICENSE("GPL");

module_init(dm_bow_init);
module_exit(dm_bow_exit);