/*
 * Copyright (C) 2015 Shaohua Li <shli@fb.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 */
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/raid/md_p.h>
#include <linux/crc32.h>
#include <linux/random.h>
#include "md.h"
#include "raid5.h"

/*
 * Metadata and data are stored on disk in 4k units (blocks), regardless of
 * the underlying hardware sector size. This only works with PAGE_SIZE == 4096.
 */
#define BLOCK_SECTORS (8)

/*
 * Reclaim runs once reclaimable space reaches 1/4 of the device size or
 * 10GB, whichever is smaller. This prevents recovery from having to scan
 * a very long log.
 */
#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)

struct r5l_log {
        struct md_rdev *rdev;

        u32 uuid_checksum;

        sector_t device_size;           /* log device size, rounded to
                                         * BLOCK_SECTORS */
        sector_t max_free_space;        /* reclaim runs once free space
                                         * reaches this size */

        sector_t last_checkpoint;       /* log tail; where recovery scanning
                                         * starts */
        u64 last_cp_seq;                /* log tail sequence */

        sector_t log_start;             /* log head; where new data is appended */
        u64 seq;                        /* log head sequence */

        struct mutex io_mutex;
        struct r5l_io_unit *current_io; /* current io_unit accepting new data */

        spinlock_t io_list_lock;
        struct list_head running_ios;   /* io_units which are still running,
                                         * and have not yet been completely
                                         * written to the log */
        struct list_head io_end_ios;    /* io_units which have been completely
                                         * written to the log but not yet
                                         * written to the RAID */
        struct list_head stripe_end_ios;/* io_units which have been completely
                                         * written to the RAID but have not yet
                                         * been considered for updating the
                                         * super */

        struct kmem_cache *io_kc;

        struct md_thread *reclaim_thread;
        unsigned long reclaim_target;   /* amount of space that needs to be
                                         * reclaimed. If it's 0, reclaim the
                                         * space used by io_units which are in
                                         * IO_UNIT_STRIPE_END state (i.e.
                                         * reclaim doesn't wait for a specific
                                         * io_unit to switch to
                                         * IO_UNIT_STRIPE_END) */

        struct list_head no_space_stripes; /* pending stripes, log has no space */
        spinlock_t no_space_stripes_lock;
};

/*
 * An IO range starts at a metadata block and ends at the next metadata
 * block. The io_unit's metadata block tracks the data/parity blocks that
 * follow it. An io_unit is written to the log disk with a plain write;
 * because we always flush the log disk before moving data to the raid
 * disks, there is no need to write the io_unit with FLUSH/FUA.
 */
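/*
 * On-disk layout of one io_unit (each box is a 4k block):
 *
 *   | meta | stripe0 data ... | stripe0 parity | stripe1 data ... | ...
 *
 * The meta block carries an r5l_meta_block header followed by one
 * r5l_payload_data_parity descriptor (location + checksum) per data page
 * and one descriptor covering the parity page(s) of each stripe.
 */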
struct r5l_io_unit {
        struct r5l_log *log;

        struct page *meta_page; /* stores the meta block */
        int meta_offset;        /* current offset in meta_page */

        struct bio_list bios;
        atomic_t pending_io;    /* pending bios not written to log yet */
        struct bio *current_bio;/* current_bio accepting new data */

        atomic_t pending_stripe;/* how many stripes not flushed to raid */
        u64 seq;                /* seq number of the meta block */
        sector_t log_start;     /* where the io_unit starts */
        sector_t log_end;       /* where the io_unit ends */
        struct list_head log_sibling;   /* log->running_ios */
        struct list_head stripe_list;   /* stripes added to the io_unit */

        int state;
        wait_queue_head_t wait_state;
};

/* r5l_io_unit state */
enum r5l_io_unit_state {
        IO_UNIT_RUNNING = 0,      /* accepting new IO */
        IO_UNIT_IO_START = 1,     /* io_unit's bios have started writing to
                                   * the log; no new bios are accepted */
        IO_UNIT_IO_END = 2,       /* io_unit's bios have finished writing to
                                   * the log */
        IO_UNIT_STRIPE_START = 3, /* stripes of the io_unit are being flushed
                                   * to the raid disks */
        IO_UNIT_STRIPE_END = 4,   /* stripe data has finished writing to the
                                   * raid disks */
};

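/*
 * The log is used as a ring buffer of BLOCK_SECTORS-aligned positions in
 * the sector range [0, device_size). The helpers below wrap positions and
 * measure distances modulo device_size; e.g. with device_size == 1000,
 * r5l_ring_distance(log, 990, 10) == 20.
 */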
static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
{
        start += inc;
        if (start >= log->device_size)
                start = start - log->device_size;
        return start;
}

static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
                                  sector_t end)
{
        if (end >= start)
                return end - start;
        else
                return end + log->device_size - start;
}

static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
{
        sector_t used_size;

        used_size = r5l_ring_distance(log, log->last_checkpoint,
                                      log->log_start);

        return log->device_size > used_size + size;
}

static struct r5l_io_unit *r5l_alloc_io_unit(struct r5l_log *log)
{
        struct r5l_io_unit *io;
        /* We can't handle memory allocation failure here yet */
        gfp_t gfp = GFP_NOIO | __GFP_NOFAIL;

        io = kmem_cache_zalloc(log->io_kc, gfp);
        io->log = log;
        io->meta_page = alloc_page(gfp | __GFP_ZERO);

        bio_list_init(&io->bios);
        INIT_LIST_HEAD(&io->log_sibling);
        INIT_LIST_HEAD(&io->stripe_list);
        io->state = IO_UNIT_RUNNING;
        init_waitqueue_head(&io->wait_state);
        return io;
}

static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
{
        __free_page(io->meta_page);
        kmem_cache_free(log->io_kc, io);
}

static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to,
                                  enum r5l_io_unit_state state)
{
        struct r5l_io_unit *io;

        while (!list_empty(from)) {
                io = list_first_entry(from, struct r5l_io_unit, log_sibling);
                /* don't change the list order */
                if (io->state >= state)
                        list_move_tail(&io->log_sibling, to);
                else
                        break;
        }
}

/*
 * We don't want too many io_units to sit on the stripe_end_ios list and
 * waste memory, so we try to remove some. But we must keep at least two
 * io_units: the superblock must point to a valid meta block, and keeping
 * the last one means recovery has less to scan.
 */
static void r5l_compress_stripe_end_list(struct r5l_log *log)
{
        struct r5l_io_unit *first, *last, *io;

        first = list_first_entry(&log->stripe_end_ios,
                                 struct r5l_io_unit, log_sibling);
        last = list_last_entry(&log->stripe_end_ios,
                               struct r5l_io_unit, log_sibling);
        if (first == last)
                return;
        list_del(&first->log_sibling);
        list_del(&last->log_sibling);
        while (!list_empty(&log->stripe_end_ios)) {
                io = list_first_entry(&log->stripe_end_ios,
                                      struct r5l_io_unit, log_sibling);
                list_del(&io->log_sibling);
                first->log_end = io->log_end;
                r5l_free_io_unit(log, io);
        }
        list_add_tail(&first->log_sibling, &log->stripe_end_ios);
        list_add_tail(&last->log_sibling, &log->stripe_end_ios);
}

static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
                                    enum r5l_io_unit_state state)
{
        struct r5l_log *log = io->log;

        if (WARN_ON(io->state >= state))
                return;
        io->state = state;
        if (state == IO_UNIT_IO_END)
                r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios,
                                      IO_UNIT_IO_END);
        if (state == IO_UNIT_STRIPE_END) {
                struct r5l_io_unit *last;
                sector_t reclaimable_space;

                r5l_move_io_unit_list(&log->io_end_ios, &log->stripe_end_ios,
                                      IO_UNIT_STRIPE_END);

                last = list_last_entry(&log->stripe_end_ios,
                                       struct r5l_io_unit, log_sibling);
                reclaimable_space = r5l_ring_distance(log, log->last_checkpoint,
                                                      last->log_end);
                if (reclaimable_space >= log->max_free_space)
                        r5l_wake_reclaim(log, 0);

                r5l_compress_stripe_end_list(log);
        }
        wake_up(&io->wait_state);
}

static void r5l_set_io_unit_state(struct r5l_io_unit *io,
                                  enum r5l_io_unit_state state)
{
        struct r5l_log *log = io->log;
        unsigned long flags;

        spin_lock_irqsave(&log->io_list_lock, flags);
        __r5l_set_io_unit_state(io, state);
        spin_unlock_irqrestore(&log->io_list_lock, flags);
}

/* XXX: totally ignores I/O errors */
static void r5l_log_endio(struct bio *bio)
{
        struct r5l_io_unit *io = bio->bi_private;
        struct r5l_log *log = io->log;

        bio_put(bio);

        if (!atomic_dec_and_test(&io->pending_io))
                return;

        r5l_set_io_unit_state(io, IO_UNIT_IO_END);
        md_wakeup_thread(log->rdev->mddev->thread);
}

static void r5l_submit_current_io(struct r5l_log *log)
{
        struct r5l_io_unit *io = log->current_io;
        struct r5l_meta_block *block;
        struct bio *bio;
        u32 crc;

        if (!io)
                return;

        block = page_address(io->meta_page);
        block->meta_size = cpu_to_le32(io->meta_offset);
        crc = crc32_le(log->uuid_checksum, (void *)block, PAGE_SIZE);
        block->checksum = cpu_to_le32(crc);

        log->current_io = NULL;
        r5l_set_io_unit_state(io, IO_UNIT_IO_START);

        while ((bio = bio_list_pop(&io->bios))) {
                /* all IO must start from rdev->data_offset */
                bio->bi_iter.bi_sector += log->rdev->data_offset;
                submit_bio(WRITE, bio);
        }
}

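/*
 * Start a new io_unit: allocate it and its meta block, fill in the meta
 * header (magic, version, sequence, position) and queue the first log bio,
 * which carries the meta page itself.
 */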
static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
{
        struct r5l_io_unit *io;
        struct r5l_meta_block *block;
        struct bio *bio;

        io = r5l_alloc_io_unit(log);

        block = page_address(io->meta_page);
        block->magic = cpu_to_le32(R5LOG_MAGIC);
        block->version = R5LOG_VERSION;
        block->seq = cpu_to_le64(log->seq);
        block->position = cpu_to_le64(log->log_start);

        io->log_start = log->log_start;
        io->meta_offset = sizeof(struct r5l_meta_block);
        io->seq = log->seq;

        bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
        io->current_bio = bio;
        bio->bi_rw = WRITE;
        bio->bi_bdev = log->rdev->bdev;
        bio->bi_iter.bi_sector = log->log_start;
        bio_add_page(bio, io->meta_page, PAGE_SIZE, 0);
        bio->bi_end_io = r5l_log_endio;
        bio->bi_private = io;

        bio_list_add(&io->bios, bio);
        atomic_inc(&io->pending_io);

        log->seq++;
        log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
        io->log_end = log->log_start;
        /* current bio hit disk end */
        if (log->log_start == 0)
                io->current_bio = NULL;

        spin_lock_irq(&log->io_list_lock);
        list_add_tail(&io->log_sibling, &log->running_ios);
        spin_unlock_irq(&log->io_list_lock);

        return io;
}

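/*
 * Make sure the current io_unit's meta page has room for another
 * payload_size bytes of descriptors; if not, submit it and start a new
 * io_unit.
 */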
static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
{
        struct r5l_io_unit *io;

        io = log->current_io;
        if (io && io->meta_offset + payload_size > PAGE_SIZE)
                r5l_submit_current_io(log);
        io = log->current_io;
        if (io)
                return 0;

        log->current_io = r5l_new_meta(log);
        return 0;
}

static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
                                    sector_t location,
                                    u32 checksum1, u32 checksum2,
                                    bool checksum2_valid)
{
        struct r5l_io_unit *io = log->current_io;
        struct r5l_payload_data_parity *payload;

        payload = page_address(io->meta_page) + io->meta_offset;
        payload->header.type = cpu_to_le16(type);
        payload->header.flags = cpu_to_le16(0);
        payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
                                    (PAGE_SHIFT - 9));
        payload->location = cpu_to_le64(location);
        payload->checksum[0] = cpu_to_le32(checksum1);
        if (checksum2_valid)
                payload->checksum[1] = cpu_to_le32(checksum2);

        io->meta_offset += sizeof(struct r5l_payload_data_parity) +
                sizeof(__le32) * (1 + !!checksum2_valid);
}

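/*
 * Queue one 4k page of data or parity behind the meta block: add it to the
 * io_unit's current bio, opening a new bio when the current one is full or
 * when the log wraps around to sector 0.
 */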
static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
{
        struct r5l_io_unit *io = log->current_io;

alloc_bio:
        if (!io->current_bio) {
                struct bio *bio;

                bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
                bio->bi_rw = WRITE;
                bio->bi_bdev = log->rdev->bdev;
                bio->bi_iter.bi_sector = log->log_start;
                bio->bi_end_io = r5l_log_endio;
                bio->bi_private = io;
                bio_list_add(&io->bios, bio);
                atomic_inc(&io->pending_io);
                io->current_bio = bio;
        }
        if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) {
                io->current_bio = NULL;
                goto alloc_bio;
        }
        log->log_start = r5l_ring_add(log, log->log_start,
                                      BLOCK_SECTORS);
        /* current bio hit disk end */
        if (log->log_start == 0)
                io->current_bio = NULL;

        io->log_end = log->log_start;
}

static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
                           int data_pages, int parity_pages)
{
        int i;
        int meta_size;
        struct r5l_io_unit *io;

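        /*
         * Each data page needs its own payload descriptor plus one checksum;
         * the parity pages of a stripe share a single descriptor but carry
         * one checksum each.
         */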
        meta_size =
                ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
                 * data_pages) +
                sizeof(struct r5l_payload_data_parity) +
                sizeof(__le32) * parity_pages;

        r5l_get_meta(log, meta_size);
        io = log->current_io;

        for (i = 0; i < sh->disks; i++) {
                if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
                        continue;
                if (i == sh->pd_idx || i == sh->qd_idx)
                        continue;
                r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
                                        raid5_compute_blocknr(sh, i, 0),
                                        sh->dev[i].log_checksum, 0, false);
                r5l_append_payload_page(log, sh->dev[i].page);
        }

        if (sh->qd_idx >= 0) {
                r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
                                        sh->sector, sh->dev[sh->pd_idx].log_checksum,
                                        sh->dev[sh->qd_idx].log_checksum, true);
                r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
                r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
        } else {
                r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
                                        sh->sector, sh->dev[sh->pd_idx].log_checksum,
                                        0, false);
                r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
        }

        list_add_tail(&sh->log_list, &io->stripe_list);
        atomic_inc(&io->pending_stripe);
        sh->log_io = io;
}

/*
 * This runs in raid5d, and reclaim may in turn wait on raid5d (when it
 * flushes data from the log to the raid disks), so we must not wait for
 * reclaim here.
 */
int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
{
        int write_disks = 0;
        int data_pages, parity_pages;
        int meta_size;
        int reserve;
        int i;

        if (!log)
                return -EAGAIN;
        /* Don't support stripe batch */
        if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
            test_bit(STRIPE_SYNCING, &sh->state)) {
                /* the stripe is written to log, we start writing it to raid */
                clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
                return -EAGAIN;
        }

        for (i = 0; i < sh->disks; i++) {
                void *addr;

                if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
                        continue;
                write_disks++;
                /* checksum was already calculated in the last run */
                if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
                        continue;
                addr = kmap_atomic(sh->dev[i].page);
                sh->dev[i].log_checksum = crc32_le(log->uuid_checksum,
                                                   addr, PAGE_SIZE);
                kunmap_atomic(addr);
        }
        parity_pages = 1 + !!(sh->qd_idx >= 0);
        data_pages = write_disks - parity_pages;

        meta_size =
                ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
                 * data_pages) +
                sizeof(struct r5l_payload_data_parity) +
                sizeof(__le32) * parity_pages;
        /* Doesn't work with very big raid arrays */
        if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
                return -EINVAL;

        set_bit(STRIPE_LOG_TRAPPED, &sh->state);
        atomic_inc(&sh->count);

        mutex_lock(&log->io_mutex);
        /* meta + data */
        reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
        if (r5l_has_free_space(log, reserve))
                r5l_log_stripe(log, sh, data_pages, parity_pages);
        else {
                spin_lock(&log->no_space_stripes_lock);
                list_add_tail(&sh->log_list, &log->no_space_stripes);
                spin_unlock(&log->no_space_stripes_lock);

                r5l_wake_reclaim(log, reserve);
        }
        mutex_unlock(&log->io_mutex);

        return 0;
}

void r5l_write_stripe_run(struct r5l_log *log)
{
        if (!log)
                return;
        mutex_lock(&log->io_mutex);
        r5l_submit_current_io(log);
        mutex_unlock(&log->io_mutex);
}

/* This will run after log space is reclaimed */
static void r5l_run_no_space_stripes(struct r5l_log *log)
{
        struct stripe_head *sh;

        spin_lock(&log->no_space_stripes_lock);
        while (!list_empty(&log->no_space_stripes)) {
                sh = list_first_entry(&log->no_space_stripes,
                                      struct stripe_head, log_list);
                list_del_init(&sh->log_list);
                set_bit(STRIPE_HANDLE, &sh->state);
                raid5_release_stripe(sh);
        }
        spin_unlock(&log->no_space_stripes_lock);
}

void r5l_stripe_write_finished(struct stripe_head *sh)
{
        struct r5l_io_unit *io;

        /* Don't support stripe batch */
        io = sh->log_io;
        if (!io)
                return;
        sh->log_io = NULL;

        if (atomic_dec_and_test(&io->pending_stripe))
                r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
}

/*
 * Start dispatching IO to the raid disks.
 *
 * The log is a sequence of io_units, each headed by a meta block. One
 * situation we must avoid: a broken meta block in the middle of the log
 * prevents recovery from finding the meta blocks at the head of the log.
 * So if an operation requires the meta block at the head to be persistent
 * in the log, every meta block before it must be persistent too. For
 * example: a stripe's data/parity is in the log and we start writing the
 * stripe to the raid disks; that data/parity must be persistent in the log
 * before the raid write happens.
 *
 * The solution is to strictly maintain the io_unit list order: we only
 * write the stripes of an io_unit to the raid disks once that io_unit and
 * every io_unit before it have their data/parity in the log.
 */
void r5l_flush_stripe_to_raid(struct r5l_log *log)
{
        struct r5l_io_unit *io;
        struct stripe_head *sh;
        bool run_stripe;

        if (!log)
                return;
        spin_lock_irq(&log->io_list_lock);
        run_stripe = !list_empty(&log->io_end_ios);
        spin_unlock_irq(&log->io_list_lock);

        if (!run_stripe)
                return;

        blkdev_issue_flush(log->rdev->bdev, GFP_NOIO, NULL);

        spin_lock_irq(&log->io_list_lock);
        list_for_each_entry(io, &log->io_end_ios, log_sibling) {
                if (io->state >= IO_UNIT_STRIPE_START)
                        continue;
                __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_START);

                while (!list_empty(&io->stripe_list)) {
                        sh = list_first_entry(&io->stripe_list,
                                              struct stripe_head, log_list);
                        list_del_init(&sh->log_list);
                        set_bit(STRIPE_HANDLE, &sh->state);
                        raid5_release_stripe(sh);
                }
        }
        spin_unlock_irq(&log->io_list_lock);
}

static void r5l_kick_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
{
        /* the log thread will write the io unit */
        wait_event(io->wait_state, io->state >= IO_UNIT_IO_END);
        if (io->state < IO_UNIT_STRIPE_START)
                r5l_flush_stripe_to_raid(log);
        wait_event(io->wait_state, io->state >= IO_UNIT_STRIPE_END);
}

static void r5l_write_super(struct r5l_log *log, sector_t cp);
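/*
 * Free log space: collect io_units that have reached IO_UNIT_STRIPE_END
 * (waiting for more if a specific amount was requested), point the
 * superblock and last_checkpoint past them, release them, and then retry
 * any stripes that stalled because the log was full.
 */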
static void r5l_do_reclaim(struct r5l_log *log)
{
        struct r5l_io_unit *io, *last;
        LIST_HEAD(list);
        sector_t free = 0;
        sector_t reclaim_target = xchg(&log->reclaim_target, 0);

        spin_lock_irq(&log->io_list_lock);
        /*
         * Move reclaimable io_units to the local list. We must not change
         * the order: reclaimable and unreclaimable io_units can be mixed in
         * the list, and we must not reuse the space of an unreclaimable
         * io_unit.
         */
        while (1) {
                while (!list_empty(&log->stripe_end_ios)) {
                        io = list_first_entry(&log->stripe_end_ios,
                                              struct r5l_io_unit, log_sibling);
                        list_move_tail(&io->log_sibling, &list);
                        free += r5l_ring_distance(log, io->log_start,
                                                  io->log_end);
                }

                if (free >= reclaim_target ||
                    (list_empty(&log->running_ios) &&
                     list_empty(&log->io_end_ios) &&
                     list_empty(&log->stripe_end_ios)))
                        break;

                /* The waiting below mostly happens when we shut down the raid */
                if (!list_empty(&log->io_end_ios)) {
                        io = list_first_entry(&log->io_end_ios,
                                              struct r5l_io_unit, log_sibling);
                        spin_unlock_irq(&log->io_list_lock);
                        /* nobody else can delete the io, we are safe */
                        r5l_kick_io_unit(log, io);
                        spin_lock_irq(&log->io_list_lock);
                        continue;
                }

                if (!list_empty(&log->running_ios)) {
                        io = list_first_entry(&log->running_ios,
                                              struct r5l_io_unit, log_sibling);
                        spin_unlock_irq(&log->io_list_lock);
                        /* nobody else can delete the io, we are safe */
                        r5l_kick_io_unit(log, io);
                        spin_lock_irq(&log->io_list_lock);
                        continue;
                }
        }
        spin_unlock_irq(&log->io_list_lock);

        if (list_empty(&list))
                return;

        /* the super always points to the last valid meta */
        last = list_last_entry(&list, struct r5l_io_unit, log_sibling);
        /*
         * write_super will flush the cache of each raid disk. We must write
         * the super here, because the log area might be reused soon and we
         * don't want to confuse recovery.
         */
        r5l_write_super(log, last->log_start);

        mutex_lock(&log->io_mutex);
        log->last_checkpoint = last->log_start;
        log->last_cp_seq = last->seq;
        mutex_unlock(&log->io_mutex);
        r5l_run_no_space_stripes(log);

        while (!list_empty(&list)) {
                io = list_first_entry(&list, struct r5l_io_unit, log_sibling);
                list_del(&io->log_sibling);
                r5l_free_io_unit(log, io);
        }
}

static void r5l_reclaim_thread(struct md_thread *thread)
{
        struct mddev *mddev = thread->mddev;
        struct r5conf *conf = mddev->private;
        struct r5l_log *log = conf->log;

        if (!log)
                return;
        r5l_do_reclaim(log);
}

static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
{
        unsigned long target;
        unsigned long new = (unsigned long)space; /* overflow in theory */

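        /* only ever raise the pending reclaim target, never lower it */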
        do {
                target = log->reclaim_target;
                if (new < target)
                        return;
        } while (cmpxchg(&log->reclaim_target, target, new) != target);
        md_wakeup_thread(log->reclaim_thread);
}

struct r5l_recovery_ctx {
        struct page *meta_page;         /* current meta */
        sector_t meta_total_blocks;     /* total size of current meta and data */
        sector_t pos;                   /* recovery position */
        u64 seq;                        /* recovery position seq */
};

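/*
 * Recovery walks the log from last_checkpoint, validating each meta block
 * (magic, version, sequence, position and crc) and replaying the data and
 * parity pages it describes onto the raid disks.
 */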
static int r5l_read_meta_block(struct r5l_log *log,
                               struct r5l_recovery_ctx *ctx)
{
        struct page *page = ctx->meta_page;
        struct r5l_meta_block *mb;
        u32 crc, stored_crc;

        if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, READ, false))
                return -EIO;

        mb = page_address(page);
        stored_crc = le32_to_cpu(mb->checksum);
        mb->checksum = 0;

        if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
            le64_to_cpu(mb->seq) != ctx->seq ||
            mb->version != R5LOG_VERSION ||
            le64_to_cpu(mb->position) != ctx->pos)
                return -EINVAL;

        crc = crc32_le(log->uuid_checksum, (void *)mb, PAGE_SIZE);
        if (stored_crc != crc)
                return -EINVAL;

        if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
                return -EINVAL;

        ctx->meta_total_blocks = BLOCK_SECTORS;

        return 0;
}

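/*
 * Replay one stripe: read its data/parity pages back from the log, verify
 * the recorded checksums and, only if they all match, write the pages out
 * to the raid disks (and any replacement devices).
 */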
static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
                                         struct r5l_recovery_ctx *ctx,
                                         sector_t stripe_sect,
                                         int *offset, sector_t *log_offset)
{
        struct r5conf *conf = log->rdev->mddev->private;
        struct stripe_head *sh;
        struct r5l_payload_data_parity *payload;
        int disk_index;

        sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
        while (1) {
                payload = page_address(ctx->meta_page) + *offset;

                if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
                        raid5_compute_sector(conf,
                                             le64_to_cpu(payload->location), 0,
                                             &disk_index, sh);

                        sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
                                     sh->dev[disk_index].page, READ, false);
                        sh->dev[disk_index].log_checksum =
                                le32_to_cpu(payload->checksum[0]);
                        set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
                        ctx->meta_total_blocks += BLOCK_SECTORS;
                } else {
                        disk_index = sh->pd_idx;
                        sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
                                     sh->dev[disk_index].page, READ, false);
                        sh->dev[disk_index].log_checksum =
                                le32_to_cpu(payload->checksum[0]);
                        set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);

                        if (sh->qd_idx >= 0) {
                                disk_index = sh->qd_idx;
                                sync_page_io(log->rdev,
                                             r5l_ring_add(log, *log_offset, BLOCK_SECTORS),
                                             PAGE_SIZE, sh->dev[disk_index].page,
                                             READ, false);
                                sh->dev[disk_index].log_checksum =
                                        le32_to_cpu(payload->checksum[1]);
                                set_bit(R5_Wantwrite,
                                        &sh->dev[disk_index].flags);
                        }
                        ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
                }

                *log_offset = r5l_ring_add(log, *log_offset,
                                           le32_to_cpu(payload->size));
                *offset += sizeof(struct r5l_payload_data_parity) +
                        sizeof(__le32) *
                        (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
                if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
                        break;
        }

        for (disk_index = 0; disk_index < sh->disks; disk_index++) {
                void *addr;
                u32 checksum;

                if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
                        continue;
                addr = kmap_atomic(sh->dev[disk_index].page);
                checksum = crc32_le(log->uuid_checksum, addr, PAGE_SIZE);
                kunmap_atomic(addr);
                if (checksum != sh->dev[disk_index].log_checksum)
                        goto error;
        }

        for (disk_index = 0; disk_index < sh->disks; disk_index++) {
                struct md_rdev *rdev, *rrdev;

                if (!test_and_clear_bit(R5_Wantwrite,
                                        &sh->dev[disk_index].flags))
                        continue;

                /* in case the device is broken */
                rdev = rcu_dereference(conf->disks[disk_index].rdev);
                if (rdev)
                        sync_page_io(rdev, stripe_sect, PAGE_SIZE,
                                     sh->dev[disk_index].page, WRITE, false);
                rrdev = rcu_dereference(conf->disks[disk_index].replacement);
                if (rrdev)
                        sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
                                     sh->dev[disk_index].page, WRITE, false);
        }
        raid5_release_stripe(sh);
        return 0;

error:
        for (disk_index = 0; disk_index < sh->disks; disk_index++)
                sh->dev[disk_index].flags = 0;
        raid5_release_stripe(sh);
        return -EINVAL;
}

static int r5l_recovery_flush_one_meta(struct r5l_log *log,
                                       struct r5l_recovery_ctx *ctx)
{
        struct r5conf *conf = log->rdev->mddev->private;
        struct r5l_payload_data_parity *payload;
        struct r5l_meta_block *mb;
        int offset;
        sector_t log_offset;
        sector_t stripe_sector;

        mb = page_address(ctx->meta_page);
        offset = sizeof(struct r5l_meta_block);
        log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);

        while (offset < le32_to_cpu(mb->meta_size)) {
                int dd;

                payload = (void *)mb + offset;
                stripe_sector = raid5_compute_sector(conf,
                                le64_to_cpu(payload->location), 0, &dd, NULL);
                if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
                                                  &offset, &log_offset))
                        return -EINVAL;
        }
        return 0;
}

/* copy data/parity from the log to the raid disks */
static void r5l_recovery_flush_log(struct r5l_log *log,
                                   struct r5l_recovery_ctx *ctx)
{
        while (1) {
                if (r5l_read_meta_block(log, ctx))
                        return;
                if (r5l_recovery_flush_one_meta(log, ctx))
                        return;
                ctx->seq++;
                ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
        }
}

static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
                                          u64 seq)
{
        struct page *page;
        struct r5l_meta_block *mb;
        u32 crc;

        page = alloc_page(GFP_KERNEL | __GFP_ZERO);
        if (!page)
                return -ENOMEM;
        mb = page_address(page);
        mb->magic = cpu_to_le32(R5LOG_MAGIC);
        mb->version = R5LOG_VERSION;
        mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
        mb->seq = cpu_to_le64(seq);
        mb->position = cpu_to_le64(pos);
        crc = crc32_le(log->uuid_checksum, (void *)mb, PAGE_SIZE);
        mb->checksum = cpu_to_le32(crc);

        if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, WRITE_FUA, false)) {
                __free_page(page);
                return -EIO;
        }
        __free_page(page);
        return 0;
}

static int r5l_recovery_log(struct r5l_log *log)
{
        struct r5l_recovery_ctx ctx;

        ctx.pos = log->last_checkpoint;
        ctx.seq = log->last_cp_seq;
        ctx.meta_page = alloc_page(GFP_KERNEL);
        if (!ctx.meta_page)
                return -ENOMEM;

        r5l_recovery_flush_log(log, &ctx);
        __free_page(ctx.meta_page);

        /*
         * We just did a recovery and ctx.pos now points to an invalid meta
         * block, where the new log will start. But we can't simply let the
         * superblock keep pointing to the last valid meta block. The log
         * might look like:
         * | meta 1| meta 2| meta 3|
         * meta 1 is valid, meta 2 is invalid and meta 3 could still look
         * valid. If the superblock points to meta 1 and we later write a new
         * valid meta 2n there, then after another crash recovery starts from
         * meta 1 again, accepts meta 2n as valid, and then wrongly accepts
         * meta 3 too. The solution is to write a new meta block at meta 2's
         * position with seq == meta 1's seq + 10 and point the superblock at
         * it; recovery then rejects meta 3 because its seq doesn't match.
         */
        if (ctx.seq > log->last_cp_seq + 1) {
                int ret;

                ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
                if (ret)
                        return ret;
                log->seq = ctx.seq + 11;
                log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
                r5l_write_super(log, ctx.pos);
        } else {
                log->log_start = ctx.pos;
                log->seq = ctx.seq;
        }
        return 0;
}

static void r5l_write_super(struct r5l_log *log, sector_t cp)
{
        struct mddev *mddev = log->rdev->mddev;

        log->rdev->journal_tail = cp;
        set_bit(MD_CHANGE_DEVS, &mddev->flags);
}

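/*
 * Read the meta block that the superblock's journal_tail points to. If it
 * doesn't look like a valid log (bad magic, version, crc or position),
 * start a fresh log at sector 0 with a freshly randomized sequence number;
 * otherwise pick up the recorded sequence and run recovery from there.
 */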
static int r5l_load_log(struct r5l_log *log)
{
        struct md_rdev *rdev = log->rdev;
        struct page *page;
        struct r5l_meta_block *mb;
        sector_t cp = log->rdev->journal_tail;
        u32 stored_crc, expected_crc;
        bool create_super = false;
        int ret;

        /* Make sure it's valid */
        if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
                cp = 0;
        page = alloc_page(GFP_KERNEL);
        if (!page)
                return -ENOMEM;

        if (!sync_page_io(rdev, cp, PAGE_SIZE, page, READ, false)) {
                ret = -EIO;
                goto ioerr;
        }
        mb = page_address(page);

        if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
            mb->version != R5LOG_VERSION) {
                create_super = true;
                goto create;
        }
        stored_crc = le32_to_cpu(mb->checksum);
        mb->checksum = 0;
        expected_crc = crc32_le(log->uuid_checksum, (void *)mb, PAGE_SIZE);
        if (stored_crc != expected_crc) {
                create_super = true;
                goto create;
        }
        if (le64_to_cpu(mb->position) != cp) {
                create_super = true;
                goto create;
        }
create:
        if (create_super) {
                log->last_cp_seq = prandom_u32();
                cp = 0;
                /*
                 * Make sure the super points to the correct address. The log
                 * might receive data very soon, and if the super doesn't hold
                 * the correct log tail address, recovery can't find the log.
                 */
                r5l_write_super(log, cp);
        } else
                log->last_cp_seq = le64_to_cpu(mb->seq);

        log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
        log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
        if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
                log->max_free_space = RECLAIM_MAX_FREE_SPACE;
        log->last_checkpoint = cp;

        __free_page(page);

        return r5l_recovery_log(log);
ioerr:
        __free_page(page);
        return ret;
}

int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
        struct r5l_log *log;

        if (PAGE_SIZE != 4096)
                return -EINVAL;
        log = kzalloc(sizeof(*log), GFP_KERNEL);
        if (!log)
                return -ENOMEM;
        log->rdev = rdev;

        log->uuid_checksum = crc32_le(~0, (void *)rdev->mddev->uuid,
                                      sizeof(rdev->mddev->uuid));

        mutex_init(&log->io_mutex);

        spin_lock_init(&log->io_list_lock);
        INIT_LIST_HEAD(&log->running_ios);
        INIT_LIST_HEAD(&log->io_end_ios);
        INIT_LIST_HEAD(&log->stripe_end_ios);

        log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
        if (!log->io_kc)
                goto io_kc;

        log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
                                                 log->rdev->mddev, "reclaim");
        if (!log->reclaim_thread)
                goto reclaim_thread;

        INIT_LIST_HEAD(&log->no_space_stripes);
        spin_lock_init(&log->no_space_stripes_lock);

        if (r5l_load_log(log))
                goto error;

        conf->log = log;
        return 0;
error:
        md_unregister_thread(&log->reclaim_thread);
reclaim_thread:
        kmem_cache_destroy(log->io_kc);
io_kc:
        kfree(log);
        return -EINVAL;
}

void r5l_exit_log(struct r5l_log *log)
{
        /*
         * At this point all stripes are finished, so every io_unit is at
         * least in IO_UNIT_STRIPE_END state.
         */
        r5l_wake_reclaim(log, -1L);
        md_unregister_thread(&log->reclaim_thread);
        r5l_do_reclaim(log);
        /*
         * Force a super update; r5l_do_reclaim might have updated the super.
         * mddev->thread is already stopped.
         */
        md_update_sb(log->rdev->mddev, 1);

        kmem_cache_destroy(log->io_kc);
        kfree(log);
}