Shaohua Lif6bed0e2015-08-13 14:31:59 -07001/*
2 * Copyright (C) 2015 Shaohua Li <shli@fb.com>
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 */
14#include <linux/kernel.h>
15#include <linux/wait.h>
16#include <linux/blkdev.h>
17#include <linux/slab.h>
18#include <linux/raid/md_p.h>
Shaohua Li5cb2fbd2015-10-28 08:41:25 -070019#include <linux/crc32c.h>
Shaohua Lif6bed0e2015-08-13 14:31:59 -070020#include <linux/random.h>
21#include "md.h"
22#include "raid5.h"
Song Liu1e6d6902016-11-17 15:24:39 -080023#include "bitmap.h"
Shaohua Lif6bed0e2015-08-13 14:31:59 -070024
25/*
 26 * metadata/data stored in disk with 4k size unit (a block) regardless of
 27 * the underlying hardware sector size. Only works with PAGE_SIZE == 4096
28 */
29#define BLOCK_SECTORS (8)
30
Shaohua Li0576b1c2015-08-13 14:32:00 -070031/*
Song Liua39f7af2016-11-17 15:24:40 -080032 * log->max_free_space is min(1/4 disk size, 10G reclaimable space).
33 *
 34 * In write-through mode, reclaim runs whenever the reclaimable space exceeds
 35 * log->max_free_space. This keeps recovery from having to scan too much of the log.
Shaohua Li0576b1c2015-08-13 14:32:00 -070036 */
37#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
38#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
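/*
 * Worked numbers for the bounds above: 10 * 1024 * 1024 * 2 sectors
 * * 512 bytes/sector = 10GiB, and the shift of 2 is the "1/4 disk size"
 * bound on max_free_space mentioned in the comment.
 */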
39
Song Liua39f7af2016-11-17 15:24:40 -080040/* wake up reclaim thread periodically */
41#define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
 42/* start flushing when this many full stripes are cached */
43#define R5C_FULL_STRIPE_FLUSH_BATCH 256
44/* reclaim stripes in groups */
45#define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)
46
Christoph Hellwigc38d29b2015-12-21 10:51:02 +110047/*
48 * We only need 2 bios per I/O unit to make progress, but ensure we
49 * have a few more available to not get too tight.
50 */
51#define R5L_POOL_SIZE 4
52
Song Liu2ded3702016-11-17 15:24:38 -080053/*
54 * r5c journal modes of the array: write-back or write-through.
 55 * write-through mode has behavior identical to the existing log-only
 56 * implementation.
57 */
58enum r5c_journal_mode {
59 R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
60 R5C_JOURNAL_MODE_WRITE_BACK = 1,
61};
62
Song Liu2c7da142016-11-17 15:24:41 -080063static char *r5c_journal_mode_str[] = {"write-through",
64 "write-back"};
Song Liu2ded3702016-11-17 15:24:38 -080065/*
66 * raid5 cache state machine
67 *
 68 * With the RAID cache, each stripe works in two phases:
69 * - caching phase
70 * - writing-out phase
71 *
72 * These two phases are controlled by bit STRIPE_R5C_CACHING:
73 * if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
74 * if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
75 *
76 * When there is no journal, or the journal is in write-through mode,
77 * the stripe is always in writing-out phase.
78 *
79 * For write-back journal, the stripe is sent to caching phase on write
80 * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off
81 * the write-out phase by clearing STRIPE_R5C_CACHING.
82 *
83 * Stripes in caching phase do not write the raid disks. Instead, all
 84 * writes are committed to the log device. Therefore, a stripe in
85 * caching phase handles writes as:
86 * - write to log device
87 * - return IO
88 *
89 * Stripes in writing-out phase handle writes as:
90 * - calculate parity
91 * - write pending data and parity to journal
92 * - write data and parity to raid disks
93 * - return IO for pending writes
94 */
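/*
 * In code terms, the split described above is simply (a rough sketch):
 *
 *	if (test_bit(STRIPE_R5C_CACHING, &sh->state))
 *		handle the write in caching phase (journal only, return IO);
 *	else
 *		handle the write in writing-out phase (parity, journal, raid disks);
 */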
95
Shaohua Lif6bed0e2015-08-13 14:31:59 -070096struct r5l_log {
97 struct md_rdev *rdev;
98
99 u32 uuid_checksum;
100
 101 sector_t device_size; /* log device size, rounded to
 102 * BLOCK_SECTORS */
Shaohua Li0576b1c2015-08-13 14:32:00 -0700103 sector_t max_free_space; /* reclaim runs once reclaimable
 104 * space exceeds this size */
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700105
106 sector_t last_checkpoint; /* log tail. where recovery scan
107 * starts from */
108 u64 last_cp_seq; /* log tail sequence */
109
110 sector_t log_start; /* log head. where new data appends */
111 u64 seq; /* log head sequence */
112
Christoph Hellwig17036462015-10-05 09:31:06 +0200113 sector_t next_checkpoint;
114 u64 next_cp_seq;
115
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700116 struct mutex io_mutex;
117 struct r5l_io_unit *current_io; /* current io_unit accepting new data */
118
119 spinlock_t io_list_lock;
120 struct list_head running_ios; /* io_units which are still running,
121 * and have not yet been completely
122 * written to the log */
123 struct list_head io_end_ios; /* io_units which have been completely
124 * written to the log but not yet written
125 * to the RAID */
Shaohua Lia8c34f92015-09-02 13:49:46 -0700126 struct list_head flushing_ios; /* io_units which are waiting for log
127 * cache flush */
Christoph Hellwig04732f72015-10-05 09:31:07 +0200128 struct list_head finished_ios; /* io_units which settle down in log disk */
Shaohua Lia8c34f92015-09-02 13:49:46 -0700129 struct bio flush_bio;
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700130
Christoph Hellwig5036c3902015-12-21 10:51:02 +1100131 struct list_head no_mem_stripes; /* pending stripes, -ENOMEM */
132
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700133 struct kmem_cache *io_kc;
Christoph Hellwig5036c3902015-12-21 10:51:02 +1100134 mempool_t *io_pool;
Christoph Hellwigc38d29b2015-12-21 10:51:02 +1100135 struct bio_set *bs;
Christoph Hellwige8deb632015-12-21 10:51:02 +1100136 mempool_t *meta_pool;
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700137
Shaohua Li0576b1c2015-08-13 14:32:00 -0700138 struct md_thread *reclaim_thread;
 139 unsigned long reclaim_target; /* amount of space that needs to be
 140 * reclaimed. if it's 0, reclaim spaces
 141 * used by io_units which are in
 142 * IO_UNIT_STRIPE_END state (eg, reclaim
 143 * doesn't wait for a specific io_unit
 144 * switching to IO_UNIT_STRIPE_END
 145 * state) */
Shaohua Li0fd22b42015-09-02 13:49:47 -0700146 wait_queue_head_t iounit_wait;
Shaohua Li0576b1c2015-08-13 14:32:00 -0700147
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700148 struct list_head no_space_stripes; /* pending stripes, log has no space */
149 spinlock_t no_space_stripes_lock;
Christoph Hellwig56fef7c2015-10-05 09:31:09 +0200150
151 bool need_cache_flush;
Song Liu2ded3702016-11-17 15:24:38 -0800152
153 /* for r5c_cache */
154 enum r5c_journal_mode r5c_journal_mode;
Song Liua39f7af2016-11-17 15:24:40 -0800155
156 /* all stripes in r5cache, in the order of seq at sh->log_start */
157 struct list_head stripe_in_journal_list;
158
159 spinlock_t stripe_in_journal_lock;
160 atomic_t stripe_in_journal_count;
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700161};
162
163/*
 164 * an IO range starts at a meta data block and ends at the next meta data
 165 * block. The io unit's meta data block tracks the data/parity that follows it.
 166 * The io unit is written to the log disk with a normal write; since we always
 167 * flush the log disk first and only then start moving data to the raid disks,
 168 * there is no requirement to write the io unit with FLUSH/FUA.
169 */
170struct r5l_io_unit {
171 struct r5l_log *log;
172
173 struct page *meta_page; /* store meta block */
174 int meta_offset; /* current offset in meta_page */
175
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700176 struct bio *current_bio;/* current_bio accepting new data */
177
178 atomic_t pending_stripe;/* how many stripes not flushed to raid */
179 u64 seq; /* seq number of the metablock */
180 sector_t log_start; /* where the io_unit starts */
181 sector_t log_end; /* where the io_unit ends */
182 struct list_head log_sibling; /* log->running_ios */
183 struct list_head stripe_list; /* stripes added to the io_unit */
184
185 int state;
Christoph Hellwig6143e2c2015-10-05 09:31:16 +0200186 bool need_split_bio;
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700187};
188
189/* r5l_io_unit state */
190enum r5l_io_unit_state {
191 IO_UNIT_RUNNING = 0, /* accepting new IO */
 192 IO_UNIT_IO_START = 1, /* io_unit bio has started writing to the log,
 193 * not accepting new bio */
 194 IO_UNIT_IO_END = 2, /* io_unit bio finished writing to the log */
Shaohua Lia8c34f92015-09-02 13:49:46 -0700195 IO_UNIT_STRIPE_END = 3, /* stripe data finished writing to raid */
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700196};
197
Song Liu2ded3702016-11-17 15:24:38 -0800198bool r5c_is_writeback(struct r5l_log *log)
199{
200 return (log != NULL &&
201 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK);
202}
203
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700204static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
205{
206 start += inc;
207 if (start >= log->device_size)
208 start = start - log->device_size;
209 return start;
210}
211
212static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
213 sector_t end)
214{
215 if (end >= start)
216 return end - start;
217 else
218 return end + log->device_size - start;
219}
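/*
 * Example of the ring arithmetic above with device_size == 1024 sectors:
 * r5l_ring_add(log, 1016, 16) wraps to 8, and
 * r5l_ring_distance(log, 1016, 8) == 8 + 1024 - 1016 == 16.
 */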
220
221static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
222{
223 sector_t used_size;
224
225 used_size = r5l_ring_distance(log, log->last_checkpoint,
226 log->log_start);
227
228 return log->device_size > used_size + size;
229}
230
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700231static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
232 enum r5l_io_unit_state state)
233{
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700234 if (WARN_ON(io->state >= state))
235 return;
236 io->state = state;
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700237}
238
Song Liu1e6d6902016-11-17 15:24:39 -0800239static void
240r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
241 struct bio_list *return_bi)
242{
243 struct bio *wbi, *wbi2;
244
245 wbi = dev->written;
246 dev->written = NULL;
247 while (wbi && wbi->bi_iter.bi_sector <
248 dev->sector + STRIPE_SECTORS) {
249 wbi2 = r5_next_bio(wbi, dev->sector);
250 if (!raid5_dec_bi_active_stripes(wbi)) {
251 md_write_end(conf->mddev);
252 bio_list_add(return_bi, wbi);
253 }
254 wbi = wbi2;
255 }
256}
257
258void r5c_handle_cached_data_endio(struct r5conf *conf,
259 struct stripe_head *sh, int disks, struct bio_list *return_bi)
260{
261 int i;
262
263 for (i = sh->disks; i--; ) {
264 if (sh->dev[i].written) {
265 set_bit(R5_UPTODATE, &sh->dev[i].flags);
266 r5c_return_dev_pending_writes(conf, &sh->dev[i],
267 return_bi);
268 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
269 STRIPE_SECTORS,
270 !test_bit(STRIPE_DEGRADED, &sh->state),
271 0);
272 }
273 }
274}
275
Song Liua39f7af2016-11-17 15:24:40 -0800276/* Check whether we should flush some stripes to free up stripe cache */
277void r5c_check_stripe_cache_usage(struct r5conf *conf)
278{
279 int total_cached;
280
281 if (!r5c_is_writeback(conf->log))
282 return;
283
284 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
285 atomic_read(&conf->r5c_cached_full_stripes);
286
287 /*
288 * The following condition is true for either of the following:
289 * - stripe cache pressure high:
290 * total_cached > 3/4 min_nr_stripes ||
291 * empty_inactive_list_nr > 0
292 * - stripe cache pressure moderate:
293 * total_cached > 1/2 min_nr_stripes
294 */
295 if (total_cached > conf->min_nr_stripes * 1 / 2 ||
296 atomic_read(&conf->empty_inactive_list_nr) > 0)
297 r5l_wake_reclaim(conf->log, 0);
298}
299
300/*
301 * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full
302 * stripes in the cache
303 */
304void r5c_check_cached_full_stripe(struct r5conf *conf)
305{
306 if (!r5c_is_writeback(conf->log))
307 return;
308
309 /*
310 * wake up reclaim for R5C_FULL_STRIPE_FLUSH_BATCH cached stripes
 311 * or a full chunk worth of stripes (chunk size / 4k stripes).
312 */
313 if (atomic_read(&conf->r5c_cached_full_stripes) >=
314 min(R5C_FULL_STRIPE_FLUSH_BATCH,
315 conf->chunk_sectors >> STRIPE_SHIFT))
316 r5l_wake_reclaim(conf->log, 0);
317}
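/*
 * For example, assuming 4k pages (so STRIPE_SHIFT == PAGE_SHIFT - 9 == 3)
 * and a 512KiB chunk (chunk_sectors == 1024): 1024 >> 3 == 128 stripes per
 * chunk, so reclaim is woken at min(256, 128) == 128 cached full stripes.
 */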
318
319/*
320 * Total log space (in sectors) needed to flush all data in cache
321 *
322 * Currently, writing-out phase automatically includes all pending writes
323 * to the same sector. So the reclaim of each stripe takes up to
324 * (conf->raid_disks + 1) pages of log space.
325 *
326 * To totally avoid deadlock due to log space, the code reserves
327 * (conf->raid_disks + 1) pages for each stripe in cache, which is not
328 * necessary in most cases.
329 *
330 * To improve this, we will need writing-out phase to be able to NOT include
331 * pending writes, which will reduce the requirement to
332 * (conf->max_degraded + 1) pages per stripe in cache.
333 */
334static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
335{
336 struct r5l_log *log = conf->log;
337
338 if (!r5c_is_writeback(log))
339 return 0;
340
341 return BLOCK_SECTORS * (conf->raid_disks + 1) *
342 atomic_read(&log->stripe_in_journal_count);
343}
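/*
 * Example of the reservation above: with conf->raid_disks == 5 and 100
 * stripes in the journal, 8 * (5 + 1) * 100 == 4800 sectors (~2.3MiB) of
 * log space is treated as required to flush the cache.
 */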
344
345/*
346 * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL
347 *
348 * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of
349 * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log
350 * device is less than 2x of reclaim_required_space.
351 */
352static inline void r5c_update_log_state(struct r5l_log *log)
353{
354 struct r5conf *conf = log->rdev->mddev->private;
355 sector_t free_space;
356 sector_t reclaim_space;
357
358 if (!r5c_is_writeback(log))
359 return;
360
361 free_space = r5l_ring_distance(log, log->log_start,
362 log->last_checkpoint);
363 reclaim_space = r5c_log_required_to_flush_cache(conf);
364 if (free_space < 2 * reclaim_space)
365 set_bit(R5C_LOG_CRITICAL, &conf->cache_state);
366 else
367 clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);
368 if (free_space < 3 * reclaim_space)
369 set_bit(R5C_LOG_TIGHT, &conf->cache_state);
370 else
371 clear_bit(R5C_LOG_TIGHT, &conf->cache_state);
372}
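/*
 * E.g. if r5c_log_required_to_flush_cache() returns 1000 sectors, the log
 * is R5C_LOG_CRITICAL below 2000 free sectors and R5C_LOG_TIGHT below
 * 3000 free sectors.
 */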
373
Song Liu2ded3702016-11-17 15:24:38 -0800374/*
375 * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING.
376 * This function should only be called in write-back mode.
377 */
Song Liua39f7af2016-11-17 15:24:40 -0800378void r5c_make_stripe_write_out(struct stripe_head *sh)
Song Liu2ded3702016-11-17 15:24:38 -0800379{
380 struct r5conf *conf = sh->raid_conf;
381 struct r5l_log *log = conf->log;
382
383 BUG_ON(!r5c_is_writeback(log));
384
385 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
386 clear_bit(STRIPE_R5C_CACHING, &sh->state);
Song Liu1e6d6902016-11-17 15:24:39 -0800387
388 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
389 atomic_inc(&conf->preread_active_stripes);
390
391 if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
392 BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
393 atomic_dec(&conf->r5c_cached_partial_stripes);
394 }
395
396 if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
397 BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
398 atomic_dec(&conf->r5c_cached_full_stripes);
399 }
400}
401
402static void r5c_handle_data_cached(struct stripe_head *sh)
403{
404 int i;
405
406 for (i = sh->disks; i--; )
407 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
408 set_bit(R5_InJournal, &sh->dev[i].flags);
409 clear_bit(R5_LOCKED, &sh->dev[i].flags);
410 }
411 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
412}
413
414/*
 415 * this journal write must contain full parity; it may also contain
 416 * some data pages
417 */
418static void r5c_handle_parity_cached(struct stripe_head *sh)
419{
420 int i;
421
422 for (i = sh->disks; i--; )
423 if (test_bit(R5_InJournal, &sh->dev[i].flags))
424 set_bit(R5_Wantwrite, &sh->dev[i].flags);
Song Liu2ded3702016-11-17 15:24:38 -0800425}
426
427/*
428 * Setting proper flags after writing (or flushing) data and/or parity to the
429 * log device. This is called from r5l_log_endio() or r5l_log_flush_endio().
430 */
431static void r5c_finish_cache_stripe(struct stripe_head *sh)
432{
433 struct r5l_log *log = sh->raid_conf->log;
434
435 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
436 BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
437 /*
438 * Set R5_InJournal for parity dev[pd_idx]. This means
 439 * all data AND parity are in the journal. For RAID 6, it is
440 * NOT necessary to set the flag for dev[qd_idx], as the
441 * two parities are written out together.
442 */
443 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
Song Liu1e6d6902016-11-17 15:24:39 -0800444 } else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) {
445 r5c_handle_data_cached(sh);
446 } else {
447 r5c_handle_parity_cached(sh);
448 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
449 }
Song Liu2ded3702016-11-17 15:24:38 -0800450}
451
Christoph Hellwigd8858f42015-10-05 09:31:08 +0200452static void r5l_io_run_stripes(struct r5l_io_unit *io)
453{
454 struct stripe_head *sh, *next;
455
456 list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
457 list_del_init(&sh->log_list);
Song Liu2ded3702016-11-17 15:24:38 -0800458
459 r5c_finish_cache_stripe(sh);
460
Christoph Hellwigd8858f42015-10-05 09:31:08 +0200461 set_bit(STRIPE_HANDLE, &sh->state);
462 raid5_release_stripe(sh);
463 }
464}
465
Christoph Hellwig56fef7c2015-10-05 09:31:09 +0200466static void r5l_log_run_stripes(struct r5l_log *log)
467{
468 struct r5l_io_unit *io, *next;
469
470 assert_spin_locked(&log->io_list_lock);
471
472 list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
473 /* don't change list order */
474 if (io->state < IO_UNIT_IO_END)
475 break;
476
477 list_move_tail(&io->log_sibling, &log->finished_ios);
478 r5l_io_run_stripes(io);
479 }
480}
481
Christoph Hellwig3848c0b2015-12-21 10:51:01 +1100482static void r5l_move_to_end_ios(struct r5l_log *log)
483{
484 struct r5l_io_unit *io, *next;
485
486 assert_spin_locked(&log->io_list_lock);
487
488 list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
489 /* don't change list order */
490 if (io->state < IO_UNIT_IO_END)
491 break;
492 list_move_tail(&io->log_sibling, &log->io_end_ios);
493 }
494}
495
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700496static void r5l_log_endio(struct bio *bio)
497{
498 struct r5l_io_unit *io = bio->bi_private;
499 struct r5l_log *log = io->log;
Christoph Hellwig509ffec2015-09-02 13:49:48 -0700500 unsigned long flags;
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700501
Shaohua Li6e74a9c2015-10-08 21:54:08 -0700502 if (bio->bi_error)
503 md_error(log->rdev->mddev, log->rdev);
504
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700505 bio_put(bio);
Christoph Hellwige8deb632015-12-21 10:51:02 +1100506 mempool_free(io->meta_page, log->meta_pool);
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700507
Christoph Hellwig509ffec2015-09-02 13:49:48 -0700508 spin_lock_irqsave(&log->io_list_lock, flags);
509 __r5l_set_io_unit_state(io, IO_UNIT_IO_END);
Christoph Hellwig56fef7c2015-10-05 09:31:09 +0200510 if (log->need_cache_flush)
Christoph Hellwig3848c0b2015-12-21 10:51:01 +1100511 r5l_move_to_end_ios(log);
Christoph Hellwig56fef7c2015-10-05 09:31:09 +0200512 else
513 r5l_log_run_stripes(log);
Christoph Hellwig509ffec2015-09-02 13:49:48 -0700514 spin_unlock_irqrestore(&log->io_list_lock, flags);
515
Christoph Hellwig56fef7c2015-10-05 09:31:09 +0200516 if (log->need_cache_flush)
517 md_wakeup_thread(log->rdev->mddev->thread);
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700518}
519
520static void r5l_submit_current_io(struct r5l_log *log)
521{
522 struct r5l_io_unit *io = log->current_io;
523 struct r5l_meta_block *block;
Christoph Hellwig509ffec2015-09-02 13:49:48 -0700524 unsigned long flags;
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700525 u32 crc;
526
527 if (!io)
528 return;
529
530 block = page_address(io->meta_page);
531 block->meta_size = cpu_to_le32(io->meta_offset);
Shaohua Li5cb2fbd2015-10-28 08:41:25 -0700532 crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700533 block->checksum = cpu_to_le32(crc);
534
535 log->current_io = NULL;
Christoph Hellwig509ffec2015-09-02 13:49:48 -0700536 spin_lock_irqsave(&log->io_list_lock, flags);
537 __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
538 spin_unlock_irqrestore(&log->io_list_lock, flags);
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700539
Mike Christie4e49ea42016-06-05 14:31:41 -0500540 submit_bio(io->current_bio);
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700541}
542
Christoph Hellwig6143e2c2015-10-05 09:31:16 +0200543static struct bio *r5l_bio_alloc(struct r5l_log *log)
Christoph Hellwigb349feb2015-10-05 09:31:11 +0200544{
Christoph Hellwigc38d29b2015-12-21 10:51:02 +1100545 struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);
Christoph Hellwigb349feb2015-10-05 09:31:11 +0200546
Mike Christie796a5cf2016-06-05 14:32:07 -0500547 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
Christoph Hellwigb349feb2015-10-05 09:31:11 +0200548 bio->bi_bdev = log->rdev->bdev;
Christoph Hellwig1e932a32015-10-05 09:31:12 +0200549 bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;
Christoph Hellwigb349feb2015-10-05 09:31:11 +0200550
Christoph Hellwigb349feb2015-10-05 09:31:11 +0200551 return bio;
552}
553
Christoph Hellwigc1b99192015-10-05 09:31:14 +0200554static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
555{
556 log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
557
Song Liua39f7af2016-11-17 15:24:40 -0800558 r5c_update_log_state(log);
Christoph Hellwigc1b99192015-10-05 09:31:14 +0200559 /*
 560 * If we filled up the log device, start from the beginning again,
 561 * which will require a new bio.
 562 *
 563 * Note: for this to work properly the log size needs to be a multiple
564 * of BLOCK_SECTORS.
565 */
566 if (log->log_start == 0)
Christoph Hellwig6143e2c2015-10-05 09:31:16 +0200567 io->need_split_bio = true;
Christoph Hellwigc1b99192015-10-05 09:31:14 +0200568
569 io->log_end = log->log_start;
570}
571
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700572static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
573{
574 struct r5l_io_unit *io;
575 struct r5l_meta_block *block;
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700576
Christoph Hellwig5036c3902015-12-21 10:51:02 +1100577 io = mempool_alloc(log->io_pool, GFP_ATOMIC);
578 if (!io)
579 return NULL;
580 memset(io, 0, sizeof(*io));
581
Christoph Hellwig51039cd2015-10-05 09:31:13 +0200582 io->log = log;
Christoph Hellwig51039cd2015-10-05 09:31:13 +0200583 INIT_LIST_HEAD(&io->log_sibling);
584 INIT_LIST_HEAD(&io->stripe_list);
585 io->state = IO_UNIT_RUNNING;
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700586
Christoph Hellwige8deb632015-12-21 10:51:02 +1100587 io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700588 block = page_address(io->meta_page);
Christoph Hellwige8deb632015-12-21 10:51:02 +1100589 clear_page(block);
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700590 block->magic = cpu_to_le32(R5LOG_MAGIC);
591 block->version = R5LOG_VERSION;
592 block->seq = cpu_to_le64(log->seq);
593 block->position = cpu_to_le64(log->log_start);
594
595 io->log_start = log->log_start;
596 io->meta_offset = sizeof(struct r5l_meta_block);
Christoph Hellwig2b8ef162015-10-05 09:31:15 +0200597 io->seq = log->seq++;
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700598
Christoph Hellwig6143e2c2015-10-05 09:31:16 +0200599 io->current_bio = r5l_bio_alloc(log);
600 io->current_bio->bi_end_io = r5l_log_endio;
601 io->current_bio->bi_private = io;
Christoph Hellwigb349feb2015-10-05 09:31:11 +0200602 bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700603
Christoph Hellwigc1b99192015-10-05 09:31:14 +0200604 r5_reserve_log_entry(log, io);
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700605
606 spin_lock_irq(&log->io_list_lock);
607 list_add_tail(&io->log_sibling, &log->running_ios);
608 spin_unlock_irq(&log->io_list_lock);
609
610 return io;
611}
612
613static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
614{
Christoph Hellwig22581f52015-10-05 09:31:10 +0200615 if (log->current_io &&
616 log->current_io->meta_offset + payload_size > PAGE_SIZE)
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700617 r5l_submit_current_io(log);
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700618
Christoph Hellwig5036c3902015-12-21 10:51:02 +1100619 if (!log->current_io) {
Christoph Hellwig22581f52015-10-05 09:31:10 +0200620 log->current_io = r5l_new_meta(log);
Christoph Hellwig5036c3902015-12-21 10:51:02 +1100621 if (!log->current_io)
622 return -ENOMEM;
623 }
624
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700625 return 0;
626}
627
628static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
629 sector_t location,
630 u32 checksum1, u32 checksum2,
631 bool checksum2_valid)
632{
633 struct r5l_io_unit *io = log->current_io;
634 struct r5l_payload_data_parity *payload;
635
636 payload = page_address(io->meta_page) + io->meta_offset;
637 payload->header.type = cpu_to_le16(type);
638 payload->header.flags = cpu_to_le16(0);
639 payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
640 (PAGE_SHIFT - 9));
641 payload->location = cpu_to_le64(location);
642 payload->checksum[0] = cpu_to_le32(checksum1);
643 if (checksum2_valid)
644 payload->checksum[1] = cpu_to_le32(checksum2);
645
646 io->meta_offset += sizeof(struct r5l_payload_data_parity) +
647 sizeof(__le32) * (1 + !!checksum2_valid);
648}
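/*
 * So a data page costs sizeof(struct r5l_payload_data_parity) + 4 bytes of
 * meta_page space and a RAID6 parity pair costs the header + 8 bytes; this
 * matches the meta_size calculation in r5l_log_stripe() below.
 */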
649
650static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
651{
652 struct r5l_io_unit *io = log->current_io;
653
Christoph Hellwig6143e2c2015-10-05 09:31:16 +0200654 if (io->need_split_bio) {
655 struct bio *prev = io->current_bio;
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700656
Christoph Hellwig6143e2c2015-10-05 09:31:16 +0200657 io->current_bio = r5l_bio_alloc(log);
658 bio_chain(io->current_bio, prev);
659
Mike Christie4e49ea42016-06-05 14:31:41 -0500660 submit_bio(prev);
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700661 }
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700662
Christoph Hellwig6143e2c2015-10-05 09:31:16 +0200663 if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
664 BUG();
665
Christoph Hellwigc1b99192015-10-05 09:31:14 +0200666 r5_reserve_log_entry(log, io);
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700667}
668
Christoph Hellwig5036c3902015-12-21 10:51:02 +1100669static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700670 int data_pages, int parity_pages)
671{
672 int i;
673 int meta_size;
Christoph Hellwig5036c3902015-12-21 10:51:02 +1100674 int ret;
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700675 struct r5l_io_unit *io;
676
677 meta_size =
678 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
679 * data_pages) +
680 sizeof(struct r5l_payload_data_parity) +
681 sizeof(__le32) * parity_pages;
682
Christoph Hellwig5036c3902015-12-21 10:51:02 +1100683 ret = r5l_get_meta(log, meta_size);
684 if (ret)
685 return ret;
686
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700687 io = log->current_io;
688
689 for (i = 0; i < sh->disks; i++) {
Song Liu1e6d6902016-11-17 15:24:39 -0800690 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
691 test_bit(R5_InJournal, &sh->dev[i].flags))
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700692 continue;
693 if (i == sh->pd_idx || i == sh->qd_idx)
694 continue;
695 r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
696 raid5_compute_blocknr(sh, i, 0),
697 sh->dev[i].log_checksum, 0, false);
698 r5l_append_payload_page(log, sh->dev[i].page);
699 }
700
Song Liu2ded3702016-11-17 15:24:38 -0800701 if (parity_pages == 2) {
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700702 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
703 sh->sector, sh->dev[sh->pd_idx].log_checksum,
704 sh->dev[sh->qd_idx].log_checksum, true);
705 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
706 r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
Song Liu2ded3702016-11-17 15:24:38 -0800707 } else if (parity_pages == 1) {
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700708 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
709 sh->sector, sh->dev[sh->pd_idx].log_checksum,
710 0, false);
711 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
Song Liu2ded3702016-11-17 15:24:38 -0800712 } else /* Just writing data, not parity, in caching phase */
713 BUG_ON(parity_pages != 0);
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700714
715 list_add_tail(&sh->log_list, &io->stripe_list);
716 atomic_inc(&io->pending_stripe);
717 sh->log_io = io;
Christoph Hellwig5036c3902015-12-21 10:51:02 +1100718
Song Liua39f7af2016-11-17 15:24:40 -0800719 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
720 return 0;
721
722 if (sh->log_start == MaxSector) {
723 BUG_ON(!list_empty(&sh->r5c));
724 sh->log_start = io->log_start;
725 spin_lock_irq(&log->stripe_in_journal_lock);
726 list_add_tail(&sh->r5c,
727 &log->stripe_in_journal_list);
728 spin_unlock_irq(&log->stripe_in_journal_lock);
729 atomic_inc(&log->stripe_in_journal_count);
730 }
Christoph Hellwig5036c3902015-12-21 10:51:02 +1100731 return 0;
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700732}
733
Song Liua39f7af2016-11-17 15:24:40 -0800734/* add stripe to no_space_stripes, and then wake up reclaim */
735static inline void r5l_add_no_space_stripe(struct r5l_log *log,
736 struct stripe_head *sh)
737{
738 spin_lock(&log->no_space_stripes_lock);
739 list_add_tail(&sh->log_list, &log->no_space_stripes);
740 spin_unlock(&log->no_space_stripes_lock);
741}
742
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700743/*
744 * running in raid5d, where reclaim could wait for raid5d too (when it flushes
745 * data from log to raid disks), so we shouldn't wait for reclaim here
746 */
747int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
748{
Song Liua39f7af2016-11-17 15:24:40 -0800749 struct r5conf *conf = sh->raid_conf;
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700750 int write_disks = 0;
751 int data_pages, parity_pages;
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700752 int reserve;
753 int i;
Christoph Hellwig5036c3902015-12-21 10:51:02 +1100754 int ret = 0;
Song Liua39f7af2016-11-17 15:24:40 -0800755 bool wake_reclaim = false;
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700756
757 if (!log)
758 return -EAGAIN;
759 /* Don't support stripe batch */
760 if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
761 test_bit(STRIPE_SYNCING, &sh->state)) {
762 /* the stripe is written to log, we start writing it to raid */
763 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
764 return -EAGAIN;
765 }
766
Song Liu2ded3702016-11-17 15:24:38 -0800767 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
768
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700769 for (i = 0; i < sh->disks; i++) {
770 void *addr;
771
Song Liu1e6d6902016-11-17 15:24:39 -0800772 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
773 test_bit(R5_InJournal, &sh->dev[i].flags))
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700774 continue;
Song Liu1e6d6902016-11-17 15:24:39 -0800775
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700776 write_disks++;
777 /* checksum is already calculated in last run */
778 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
779 continue;
780 addr = kmap_atomic(sh->dev[i].page);
Shaohua Li5cb2fbd2015-10-28 08:41:25 -0700781 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
782 addr, PAGE_SIZE);
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700783 kunmap_atomic(addr);
784 }
785 parity_pages = 1 + !!(sh->qd_idx >= 0);
786 data_pages = write_disks - parity_pages;
787
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700788 set_bit(STRIPE_LOG_TRAPPED, &sh->state);
Shaohua Li253f9fd42015-09-04 14:14:16 -0700789 /*
790 * The stripe must enter state machine again to finish the write, so
791 * don't delay.
792 */
793 clear_bit(STRIPE_DELAYED, &sh->state);
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700794 atomic_inc(&sh->count);
795
796 mutex_lock(&log->io_mutex);
797 /* meta + data */
798 reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
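	/*
	 * e.g. 4 data pages + 1 parity page: write_disks == 5, so with the
	 * 4k pages this code requires, reserve == (1 + 5) << 3 == 48 sectors
	 * (1 meta block plus 5 data/parity blocks).
	 */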
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700799
Song Liua39f7af2016-11-17 15:24:40 -0800800 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
801 if (!r5l_has_free_space(log, reserve)) {
802 r5l_add_no_space_stripe(log, sh);
803 wake_reclaim = true;
804 } else {
805 ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
806 if (ret) {
807 spin_lock_irq(&log->io_list_lock);
808 list_add_tail(&sh->log_list,
809 &log->no_mem_stripes);
810 spin_unlock_irq(&log->io_list_lock);
811 }
812 }
813 } else { /* R5C_JOURNAL_MODE_WRITE_BACK */
814 /*
815 * log space critical, do not process stripes that are
816 * not in cache yet (sh->log_start == MaxSector).
817 */
818 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
819 sh->log_start == MaxSector) {
820 r5l_add_no_space_stripe(log, sh);
821 wake_reclaim = true;
822 reserve = 0;
823 } else if (!r5l_has_free_space(log, reserve)) {
824 if (sh->log_start == log->last_checkpoint)
825 BUG();
826 else
827 r5l_add_no_space_stripe(log, sh);
828 } else {
829 ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
830 if (ret) {
831 spin_lock_irq(&log->io_list_lock);
832 list_add_tail(&sh->log_list,
833 &log->no_mem_stripes);
834 spin_unlock_irq(&log->io_list_lock);
835 }
Christoph Hellwig5036c3902015-12-21 10:51:02 +1100836 }
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700837 }
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700838
Christoph Hellwig5036c3902015-12-21 10:51:02 +1100839 mutex_unlock(&log->io_mutex);
Song Liua39f7af2016-11-17 15:24:40 -0800840 if (wake_reclaim)
841 r5l_wake_reclaim(log, reserve);
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700842 return 0;
843}
844
845void r5l_write_stripe_run(struct r5l_log *log)
846{
847 if (!log)
848 return;
849 mutex_lock(&log->io_mutex);
850 r5l_submit_current_io(log);
851 mutex_unlock(&log->io_mutex);
852}
853
Shaohua Li828cbe92015-09-02 13:49:49 -0700854int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
855{
856 if (!log)
857 return -ENODEV;
858 /*
859 * we flush log disk cache first, then write stripe data to raid disks.
860 * So if bio is finished, the log disk cache is flushed already. The
 861 * recovery guarantees we can recover the bio from the log disk, so we
862 * don't need to flush again
863 */
864 if (bio->bi_iter.bi_size == 0) {
865 bio_endio(bio);
866 return 0;
867 }
Jens Axboe1eff9d32016-08-05 15:35:16 -0600868 bio->bi_opf &= ~REQ_PREFLUSH;
Shaohua Li828cbe92015-09-02 13:49:49 -0700869 return -EAGAIN;
870}
871
Shaohua Lif6bed0e2015-08-13 14:31:59 -0700872/* This will run after log space is reclaimed */
873static void r5l_run_no_space_stripes(struct r5l_log *log)
874{
875 struct stripe_head *sh;
876
877 spin_lock(&log->no_space_stripes_lock);
878 while (!list_empty(&log->no_space_stripes)) {
879 sh = list_first_entry(&log->no_space_stripes,
880 struct stripe_head, log_list);
881 list_del_init(&sh->log_list);
882 set_bit(STRIPE_HANDLE, &sh->state);
883 raid5_release_stripe(sh);
884 }
885 spin_unlock(&log->no_space_stripes_lock);
886}
887
Song Liua39f7af2016-11-17 15:24:40 -0800888/*
889 * calculate new last_checkpoint
890 * for write through mode, returns log->next_checkpoint
891 * for write back, returns log_start of first sh in stripe_in_journal_list
892 */
893static sector_t r5c_calculate_new_cp(struct r5conf *conf)
894{
895 struct stripe_head *sh;
896 struct r5l_log *log = conf->log;
897 sector_t new_cp;
898 unsigned long flags;
899
900 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
901 return log->next_checkpoint;
902
903 spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
904 if (list_empty(&conf->log->stripe_in_journal_list)) {
905 /* all stripes flushed */
 906 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
907 return log->next_checkpoint;
908 }
909 sh = list_first_entry(&conf->log->stripe_in_journal_list,
910 struct stripe_head, r5c);
911 new_cp = sh->log_start;
912 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
913 return new_cp;
914}
915
Christoph Hellwig17036462015-10-05 09:31:06 +0200916static sector_t r5l_reclaimable_space(struct r5l_log *log)
917{
Song Liua39f7af2016-11-17 15:24:40 -0800918 struct r5conf *conf = log->rdev->mddev->private;
919
Christoph Hellwig17036462015-10-05 09:31:06 +0200920 return r5l_ring_distance(log, log->last_checkpoint,
Song Liua39f7af2016-11-17 15:24:40 -0800921 r5c_calculate_new_cp(conf));
Christoph Hellwig17036462015-10-05 09:31:06 +0200922}
923
Christoph Hellwig5036c3902015-12-21 10:51:02 +1100924static void r5l_run_no_mem_stripe(struct r5l_log *log)
925{
926 struct stripe_head *sh;
927
928 assert_spin_locked(&log->io_list_lock);
929
930 if (!list_empty(&log->no_mem_stripes)) {
931 sh = list_first_entry(&log->no_mem_stripes,
932 struct stripe_head, log_list);
933 list_del_init(&sh->log_list);
934 set_bit(STRIPE_HANDLE, &sh->state);
935 raid5_release_stripe(sh);
936 }
937}
938
Christoph Hellwig04732f72015-10-05 09:31:07 +0200939static bool r5l_complete_finished_ios(struct r5l_log *log)
Christoph Hellwig17036462015-10-05 09:31:06 +0200940{
941 struct r5l_io_unit *io, *next;
942 bool found = false;
943
944 assert_spin_locked(&log->io_list_lock);
945
Christoph Hellwig04732f72015-10-05 09:31:07 +0200946 list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
Christoph Hellwig17036462015-10-05 09:31:06 +0200947 /* don't change list order */
948 if (io->state < IO_UNIT_STRIPE_END)
949 break;
950
951 log->next_checkpoint = io->log_start;
952 log->next_cp_seq = io->seq;
953
954 list_del(&io->log_sibling);
Christoph Hellwig5036c3902015-12-21 10:51:02 +1100955 mempool_free(io, log->io_pool);
956 r5l_run_no_mem_stripe(log);
Christoph Hellwig17036462015-10-05 09:31:06 +0200957
958 found = true;
959 }
960
961 return found;
962}
963
Christoph Hellwig509ffec2015-09-02 13:49:48 -0700964static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
965{
966 struct r5l_log *log = io->log;
Song Liua39f7af2016-11-17 15:24:40 -0800967 struct r5conf *conf = log->rdev->mddev->private;
Christoph Hellwig509ffec2015-09-02 13:49:48 -0700968 unsigned long flags;
969
970 spin_lock_irqsave(&log->io_list_lock, flags);
971 __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
Christoph Hellwig17036462015-10-05 09:31:06 +0200972
Christoph Hellwig04732f72015-10-05 09:31:07 +0200973 if (!r5l_complete_finished_ios(log)) {
Shaohua Li85f2f9a2015-09-04 14:14:05 -0700974 spin_unlock_irqrestore(&log->io_list_lock, flags);
975 return;
976 }
Christoph Hellwig509ffec2015-09-02 13:49:48 -0700977
Song Liua39f7af2016-11-17 15:24:40 -0800978 if (r5l_reclaimable_space(log) > log->max_free_space ||
979 test_bit(R5C_LOG_TIGHT, &conf->cache_state))
Christoph Hellwig509ffec2015-09-02 13:49:48 -0700980 r5l_wake_reclaim(log, 0);
981
Christoph Hellwig509ffec2015-09-02 13:49:48 -0700982 spin_unlock_irqrestore(&log->io_list_lock, flags);
983 wake_up(&log->iounit_wait);
984}
985
Shaohua Li0576b1c2015-08-13 14:32:00 -0700986void r5l_stripe_write_finished(struct stripe_head *sh)
987{
988 struct r5l_io_unit *io;
989
Shaohua Li0576b1c2015-08-13 14:32:00 -0700990 io = sh->log_io;
Shaohua Li0576b1c2015-08-13 14:32:00 -0700991 sh->log_io = NULL;
992
Christoph Hellwig509ffec2015-09-02 13:49:48 -0700993 if (io && atomic_dec_and_test(&io->pending_stripe))
994 __r5l_stripe_write_finished(io);
Shaohua Li0576b1c2015-08-13 14:32:00 -0700995}
996
Shaohua Lia8c34f92015-09-02 13:49:46 -0700997static void r5l_log_flush_endio(struct bio *bio)
998{
999 struct r5l_log *log = container_of(bio, struct r5l_log,
1000 flush_bio);
1001 unsigned long flags;
1002 struct r5l_io_unit *io;
Shaohua Lia8c34f92015-09-02 13:49:46 -07001003
Shaohua Li6e74a9c2015-10-08 21:54:08 -07001004 if (bio->bi_error)
1005 md_error(log->rdev->mddev, log->rdev);
1006
Shaohua Lia8c34f92015-09-02 13:49:46 -07001007 spin_lock_irqsave(&log->io_list_lock, flags);
Christoph Hellwigd8858f42015-10-05 09:31:08 +02001008 list_for_each_entry(io, &log->flushing_ios, log_sibling)
1009 r5l_io_run_stripes(io);
Christoph Hellwig04732f72015-10-05 09:31:07 +02001010 list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
Shaohua Lia8c34f92015-09-02 13:49:46 -07001011 spin_unlock_irqrestore(&log->io_list_lock, flags);
1012}
1013
Shaohua Li0576b1c2015-08-13 14:32:00 -07001014/*
 1015 * Starting dispatch IO to raid.
 1016 * The log consists of a sequence of io_units (each a meta block plus data).
 1017 * One situation we want to avoid: a broken meta in the middle of the log
 1018 * prevents recovery from finding any meta at the head of the log. So if an
 1019 * operation requires the meta at the head to be persistent in the log, every
 1020 * meta before it must be persistent in the log too. A case is: stripe
 1021 * data/parity is in the log and we start writing the stripe to the raid
 1022 * disks; the data/parity must be persistent in the log before the raid write.
 1023 *
 1024 * The solution is to strictly maintain io_unit list order: the stripes of an
 1025 * io_unit are only written to the raid disks once that io_unit, and every
 1026 * io_unit before it, is fully persistent in the log.
1027 */
1028void r5l_flush_stripe_to_raid(struct r5l_log *log)
1029{
Shaohua Lia8c34f92015-09-02 13:49:46 -07001030 bool do_flush;
Christoph Hellwig56fef7c2015-10-05 09:31:09 +02001031
1032 if (!log || !log->need_cache_flush)
Shaohua Li0576b1c2015-08-13 14:32:00 -07001033 return;
Shaohua Li0576b1c2015-08-13 14:32:00 -07001034
Shaohua Lia8c34f92015-09-02 13:49:46 -07001035 spin_lock_irq(&log->io_list_lock);
1036 /* flush bio is running */
1037 if (!list_empty(&log->flushing_ios)) {
1038 spin_unlock_irq(&log->io_list_lock);
Shaohua Li0576b1c2015-08-13 14:32:00 -07001039 return;
Shaohua Li0576b1c2015-08-13 14:32:00 -07001040 }
Shaohua Lia8c34f92015-09-02 13:49:46 -07001041 list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
1042 do_flush = !list_empty(&log->flushing_ios);
Shaohua Li0576b1c2015-08-13 14:32:00 -07001043 spin_unlock_irq(&log->io_list_lock);
Shaohua Lia8c34f92015-09-02 13:49:46 -07001044
1045 if (!do_flush)
1046 return;
1047 bio_reset(&log->flush_bio);
1048 log->flush_bio.bi_bdev = log->rdev->bdev;
1049 log->flush_bio.bi_end_io = r5l_log_flush_endio;
Mike Christie796a5cf2016-06-05 14:32:07 -05001050 bio_set_op_attrs(&log->flush_bio, REQ_OP_WRITE, WRITE_FLUSH);
Mike Christie4e49ea42016-06-05 14:31:41 -05001051 submit_bio(&log->flush_bio);
Shaohua Li0576b1c2015-08-13 14:32:00 -07001052}
1053
Shaohua Li0576b1c2015-08-13 14:32:00 -07001054static void r5l_write_super(struct r5l_log *log, sector_t cp);
Shaohua Li4b482042015-10-08 21:54:06 -07001055static void r5l_write_super_and_discard_space(struct r5l_log *log,
1056 sector_t end)
1057{
1058 struct block_device *bdev = log->rdev->bdev;
1059 struct mddev *mddev;
1060
1061 r5l_write_super(log, end);
1062
1063 if (!blk_queue_discard(bdev_get_queue(bdev)))
1064 return;
1065
1066 mddev = log->rdev->mddev;
1067 /*
Shaohua Li8e018c22016-08-25 10:09:39 -07001068 * Discard could zero data, so before discard we must make sure the
 1069 * superblock is updated to the new log tail. Updating the superblock (either
 1070 * directly calling md_update_sb() or depending on the md thread) must hold
 1071 * the reconfig mutex. On the other hand, raid5_quiesce is called with the
 1072 * reconfig_mutex held. The first step of raid5_quiesce() is waiting for all
 1073 * IO to finish, hence waiting for the reclaim thread, while the reclaim
 1074 * thread is calling this function and waiting for the reconfig mutex. So
 1075 * there is a deadlock. We work around this issue with a trylock.
 1076 * FIXME: we could miss a discard if we can't take the reconfig mutex
Shaohua Li4b482042015-10-08 21:54:06 -07001077 */
Shaohua Li8e018c22016-08-25 10:09:39 -07001078 set_mask_bits(&mddev->flags, 0,
1079 BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
1080 if (!mddev_trylock(mddev))
1081 return;
1082 md_update_sb(mddev, 1);
1083 mddev_unlock(mddev);
Shaohua Li4b482042015-10-08 21:54:06 -07001084
Shaohua Li6e74a9c2015-10-08 21:54:08 -07001085 /* discard IO error really doesn't matter, ignore it */
Shaohua Li4b482042015-10-08 21:54:06 -07001086 if (log->last_checkpoint < end) {
1087 blkdev_issue_discard(bdev,
1088 log->last_checkpoint + log->rdev->data_offset,
1089 end - log->last_checkpoint, GFP_NOIO, 0);
1090 } else {
1091 blkdev_issue_discard(bdev,
1092 log->last_checkpoint + log->rdev->data_offset,
1093 log->device_size - log->last_checkpoint,
1094 GFP_NOIO, 0);
1095 blkdev_issue_discard(bdev, log->rdev->data_offset, end,
1096 GFP_NOIO, 0);
1097 }
1098}
1099
Song Liua39f7af2016-11-17 15:24:40 -08001100/*
1101 * r5c_flush_stripe moves stripe from cached list to handle_list. When called,
1102 * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes.
1103 *
1104 * must hold conf->device_lock
1105 */
1106static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
1107{
1108 BUG_ON(list_empty(&sh->lru));
1109 BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
1110 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
1111
1112 /*
1113 * The stripe is not ON_RELEASE_LIST, so it is safe to call
1114 * raid5_release_stripe() while holding conf->device_lock
1115 */
1116 BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
1117 assert_spin_locked(&conf->device_lock);
1118
1119 list_del_init(&sh->lru);
1120 atomic_inc(&sh->count);
1121
1122 set_bit(STRIPE_HANDLE, &sh->state);
1123 atomic_inc(&conf->active_stripes);
1124 r5c_make_stripe_write_out(sh);
1125
1126 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1127 atomic_inc(&conf->preread_active_stripes);
1128 raid5_release_stripe(sh);
1129}
1130
1131/*
1132 * if num == 0, flush all full stripes
 1133 * if num > 0, flush all full stripes. If fewer than num full stripes are
 1134 * flushed, flush some partial stripes until a total of num stripes are
 1135 * flushed or there are no more cached stripes.
1136 */
1137void r5c_flush_cache(struct r5conf *conf, int num)
1138{
1139 int count;
1140 struct stripe_head *sh, *next;
1141
1142 assert_spin_locked(&conf->device_lock);
1143 if (!conf->log)
1144 return;
1145
1146 count = 0;
1147 list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) {
1148 r5c_flush_stripe(conf, sh);
1149 count++;
1150 }
1151
1152 if (count >= num)
1153 return;
1154 list_for_each_entry_safe(sh, next,
1155 &conf->r5c_partial_stripe_list, lru) {
1156 r5c_flush_stripe(conf, sh);
1157 if (++count >= num)
1158 break;
1159 }
1160}
1161
1162static void r5c_do_reclaim(struct r5conf *conf)
1163{
1164 struct r5l_log *log = conf->log;
1165 struct stripe_head *sh;
1166 int count = 0;
1167 unsigned long flags;
1168 int total_cached;
1169 int stripes_to_flush;
1170
1171 if (!r5c_is_writeback(log))
1172 return;
1173
1174 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
1175 atomic_read(&conf->r5c_cached_full_stripes);
1176
1177 if (total_cached > conf->min_nr_stripes * 3 / 4 ||
1178 atomic_read(&conf->empty_inactive_list_nr) > 0)
1179 /*
 1180 * if stripe cache pressure is high, flush all full stripes and
1181 * some partial stripes
1182 */
1183 stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
1184 else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
1185 atomic_read(&conf->r5c_cached_full_stripes) >
1186 R5C_FULL_STRIPE_FLUSH_BATCH)
1187 /*
 1188 * if stripe cache pressure is moderate, or if there are many full
 1189 * stripes, flush all full stripes
1190 */
1191 stripes_to_flush = 0;
1192 else
1193 /* no need to flush */
1194 stripes_to_flush = -1;
1195
1196 if (stripes_to_flush >= 0) {
1197 spin_lock_irqsave(&conf->device_lock, flags);
1198 r5c_flush_cache(conf, stripes_to_flush);
1199 spin_unlock_irqrestore(&conf->device_lock, flags);
1200 }
1201
1202 /* if log space is tight, flush stripes on stripe_in_journal_list */
1203 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) {
1204 spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
1205 spin_lock(&conf->device_lock);
1206 list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) {
1207 /*
1208 * stripes on stripe_in_journal_list could be in any
1209 * state of the stripe_cache state machine. In this
1210 * case, we only want to flush stripe on
1211 * r5c_cached_full/partial_stripes. The following
1212 * condition makes sure the stripe is on one of the
1213 * two lists.
1214 */
1215 if (!list_empty(&sh->lru) &&
1216 !test_bit(STRIPE_HANDLE, &sh->state) &&
1217 atomic_read(&sh->count) == 0) {
1218 r5c_flush_stripe(conf, sh);
1219 }
1220 if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
1221 break;
1222 }
1223 spin_unlock(&conf->device_lock);
1224 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1225 }
1226 md_wakeup_thread(conf->mddev->thread);
1227}
1228
Shaohua Li0576b1c2015-08-13 14:32:00 -07001229static void r5l_do_reclaim(struct r5l_log *log)
1230{
Song Liua39f7af2016-11-17 15:24:40 -08001231 struct r5conf *conf = log->rdev->mddev->private;
Shaohua Li0576b1c2015-08-13 14:32:00 -07001232 sector_t reclaim_target = xchg(&log->reclaim_target, 0);
Christoph Hellwig17036462015-10-05 09:31:06 +02001233 sector_t reclaimable;
1234 sector_t next_checkpoint;
Song Liua39f7af2016-11-17 15:24:40 -08001235 bool write_super;
Shaohua Li0576b1c2015-08-13 14:32:00 -07001236
1237 spin_lock_irq(&log->io_list_lock);
Song Liua39f7af2016-11-17 15:24:40 -08001238 write_super = r5l_reclaimable_space(log) > log->max_free_space ||
1239 reclaim_target != 0 || !list_empty(&log->no_space_stripes);
Shaohua Li0576b1c2015-08-13 14:32:00 -07001240 /*
 1241 * move the proper io_units to the reclaim list. We should not change the order:
 1242 * reclaimable and unreclaimable io_units can be mixed in the list, and we
 1243 * shouldn't reuse the space of an unreclaimable io_unit
1244 */
1245 while (1) {
Christoph Hellwig17036462015-10-05 09:31:06 +02001246 reclaimable = r5l_reclaimable_space(log);
1247 if (reclaimable >= reclaim_target ||
Shaohua Li0576b1c2015-08-13 14:32:00 -07001248 (list_empty(&log->running_ios) &&
1249 list_empty(&log->io_end_ios) &&
Shaohua Lia8c34f92015-09-02 13:49:46 -07001250 list_empty(&log->flushing_ios) &&
Christoph Hellwig04732f72015-10-05 09:31:07 +02001251 list_empty(&log->finished_ios)))
Shaohua Li0576b1c2015-08-13 14:32:00 -07001252 break;
1253
Christoph Hellwig17036462015-10-05 09:31:06 +02001254 md_wakeup_thread(log->rdev->mddev->thread);
1255 wait_event_lock_irq(log->iounit_wait,
1256 r5l_reclaimable_space(log) > reclaimable,
1257 log->io_list_lock);
Shaohua Li0576b1c2015-08-13 14:32:00 -07001258 }
Christoph Hellwig17036462015-10-05 09:31:06 +02001259
Song Liua39f7af2016-11-17 15:24:40 -08001260 next_checkpoint = r5c_calculate_new_cp(conf);
Shaohua Li0576b1c2015-08-13 14:32:00 -07001261 spin_unlock_irq(&log->io_list_lock);
1262
Christoph Hellwig17036462015-10-05 09:31:06 +02001263 BUG_ON(reclaimable < 0);
Song Liua39f7af2016-11-17 15:24:40 -08001264
1265 if (reclaimable == 0 || !write_super)
Shaohua Li0576b1c2015-08-13 14:32:00 -07001266 return;
1267
Shaohua Li0576b1c2015-08-13 14:32:00 -07001268 /*
1269 * write_super will flush cache of each raid disk. We must write super
1270 * here, because the log area might be reused soon and we don't want to
1271 * confuse recovery
1272 */
Shaohua Li4b482042015-10-08 21:54:06 -07001273 r5l_write_super_and_discard_space(log, next_checkpoint);
Shaohua Li0576b1c2015-08-13 14:32:00 -07001274
1275 mutex_lock(&log->io_mutex);
Christoph Hellwig17036462015-10-05 09:31:06 +02001276 log->last_checkpoint = next_checkpoint;
Song Liua39f7af2016-11-17 15:24:40 -08001277 r5c_update_log_state(log);
Shaohua Li0576b1c2015-08-13 14:32:00 -07001278 mutex_unlock(&log->io_mutex);
Shaohua Li0576b1c2015-08-13 14:32:00 -07001279
Christoph Hellwig17036462015-10-05 09:31:06 +02001280 r5l_run_no_space_stripes(log);
Shaohua Li0576b1c2015-08-13 14:32:00 -07001281}
1282
1283static void r5l_reclaim_thread(struct md_thread *thread)
1284{
1285 struct mddev *mddev = thread->mddev;
1286 struct r5conf *conf = mddev->private;
1287 struct r5l_log *log = conf->log;
1288
1289 if (!log)
1290 return;
Song Liua39f7af2016-11-17 15:24:40 -08001291 r5c_do_reclaim(conf);
Shaohua Li0576b1c2015-08-13 14:32:00 -07001292 r5l_do_reclaim(log);
1293}
1294
Song Liua39f7af2016-11-17 15:24:40 -08001295void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
Shaohua Lif6bed0e2015-08-13 14:31:59 -07001296{
Shaohua Li0576b1c2015-08-13 14:32:00 -07001297 unsigned long target;
1298 unsigned long new = (unsigned long)space; /* overflow in theory */
1299
Song Liua39f7af2016-11-17 15:24:40 -08001300 if (!log)
1301 return;
Shaohua Li0576b1c2015-08-13 14:32:00 -07001302 do {
1303 target = log->reclaim_target;
1304 if (new < target)
1305 return;
1306 } while (cmpxchg(&log->reclaim_target, target, new) != target);
1307 md_wakeup_thread(log->reclaim_thread);
Shaohua Lif6bed0e2015-08-13 14:31:59 -07001308}
1309
Shaohua Lie6c033f2015-10-04 09:20:12 -07001310void r5l_quiesce(struct r5l_log *log, int state)
1311{
Shaohua Li4b482042015-10-08 21:54:06 -07001312 struct mddev *mddev;
Shaohua Lie6c033f2015-10-04 09:20:12 -07001313 if (!log || state == 2)
1314 return;
1315 if (state == 0) {
Shaohua Li16a43f62016-01-06 14:37:15 -08001316 /*
1317 * This is a special case for hotadd. In suspend, the array has
1318 * no journal. In resume, journal is initialized as well as the
1319 * reclaim thread.
1320 */
1321 if (log->reclaim_thread)
1322 return;
Shaohua Lie6c033f2015-10-04 09:20:12 -07001323 log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
1324 log->rdev->mddev, "reclaim");
Song Liua39f7af2016-11-17 15:24:40 -08001325 log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
Shaohua Lie6c033f2015-10-04 09:20:12 -07001326 } else if (state == 1) {
Shaohua Li4b482042015-10-08 21:54:06 -07001327 /* make sure r5l_write_super_and_discard_space exits */
1328 mddev = log->rdev->mddev;
1329 wake_up(&mddev->sb_wait);
Song Liua39f7af2016-11-17 15:24:40 -08001330 r5l_wake_reclaim(log, MaxSector);
Shaohua Lie6c033f2015-10-04 09:20:12 -07001331 md_unregister_thread(&log->reclaim_thread);
1332 r5l_do_reclaim(log);
1333 }
1334}
1335
Shaohua Li6e74a9c2015-10-08 21:54:08 -07001336bool r5l_log_disk_error(struct r5conf *conf)
1337{
Shaohua Lif6b6ec52015-12-21 10:51:02 +11001338 struct r5l_log *log;
1339 bool ret;
Shaohua Li7dde2ad2015-10-08 21:54:10 -07001340 /* don't allow write if journal disk is missing */
Shaohua Lif6b6ec52015-12-21 10:51:02 +11001341 rcu_read_lock();
1342 log = rcu_dereference(conf->log);
1343
1344 if (!log)
1345 ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
1346 else
1347 ret = test_bit(Faulty, &log->rdev->flags);
1348 rcu_read_unlock();
1349 return ret;
Shaohua Li6e74a9c2015-10-08 21:54:08 -07001350}
1351
Shaohua Li355810d2015-08-13 14:32:01 -07001352struct r5l_recovery_ctx {
1353 struct page *meta_page; /* current meta */
1354 sector_t meta_total_blocks; /* total size of current meta and data */
1355 sector_t pos; /* recovery position */
1356 u64 seq; /* recovery position seq */
1357};
1358
Song Liu9ed988f52016-11-17 15:24:42 -08001359static int r5l_recovery_read_meta_block(struct r5l_log *log,
1360 struct r5l_recovery_ctx *ctx)
Shaohua Li355810d2015-08-13 14:32:01 -07001361{
1362 struct page *page = ctx->meta_page;
1363 struct r5l_meta_block *mb;
1364 u32 crc, stored_crc;
1365
Mike Christie796a5cf2016-06-05 14:32:07 -05001366 if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0,
1367 false))
Shaohua Li355810d2015-08-13 14:32:01 -07001368 return -EIO;
1369
1370 mb = page_address(page);
1371 stored_crc = le32_to_cpu(mb->checksum);
1372 mb->checksum = 0;
1373
1374 if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
1375 le64_to_cpu(mb->seq) != ctx->seq ||
1376 mb->version != R5LOG_VERSION ||
1377 le64_to_cpu(mb->position) != ctx->pos)
1378 return -EINVAL;
1379
Shaohua Li5cb2fbd2015-10-28 08:41:25 -07001380 crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
Shaohua Li355810d2015-08-13 14:32:01 -07001381 if (stored_crc != crc)
1382 return -EINVAL;
1383
1384 if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
1385 return -EINVAL;
1386
1387 ctx->meta_total_blocks = BLOCK_SECTORS;
1388
1389 return 0;
1390}
1391
1392static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
1393 struct r5l_recovery_ctx *ctx,
1394 sector_t stripe_sect,
JackieLiu3fd880a2016-11-02 17:02:39 +08001395 int *offset)
Shaohua Li355810d2015-08-13 14:32:01 -07001396{
1397 struct r5conf *conf = log->rdev->mddev->private;
1398 struct stripe_head *sh;
1399 struct r5l_payload_data_parity *payload;
1400 int disk_index;
1401
1402 sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
1403 while (1) {
JackieLiu3fd880a2016-11-02 17:02:39 +08001404 sector_t log_offset = r5l_ring_add(log, ctx->pos,
1405 ctx->meta_total_blocks);
Shaohua Li355810d2015-08-13 14:32:01 -07001406 payload = page_address(ctx->meta_page) + *offset;
1407
1408 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
1409 raid5_compute_sector(conf,
1410 le64_to_cpu(payload->location), 0,
1411 &disk_index, sh);
1412
JackieLiu3fd880a2016-11-02 17:02:39 +08001413 sync_page_io(log->rdev, log_offset, PAGE_SIZE,
Mike Christie796a5cf2016-06-05 14:32:07 -05001414 sh->dev[disk_index].page, REQ_OP_READ, 0,
1415 false);
Shaohua Li355810d2015-08-13 14:32:01 -07001416 sh->dev[disk_index].log_checksum =
1417 le32_to_cpu(payload->checksum[0]);
1418 set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
Shaohua Li355810d2015-08-13 14:32:01 -07001419 } else {
1420 disk_index = sh->pd_idx;
JackieLiu3fd880a2016-11-02 17:02:39 +08001421 sync_page_io(log->rdev, log_offset, PAGE_SIZE,
Mike Christie796a5cf2016-06-05 14:32:07 -05001422 sh->dev[disk_index].page, REQ_OP_READ, 0,
1423 false);
Shaohua Li355810d2015-08-13 14:32:01 -07001424 sh->dev[disk_index].log_checksum =
1425 le32_to_cpu(payload->checksum[0]);
1426 set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
1427
1428 if (sh->qd_idx >= 0) {
1429 disk_index = sh->qd_idx;
1430 sync_page_io(log->rdev,
JackieLiu3fd880a2016-11-02 17:02:39 +08001431 r5l_ring_add(log, log_offset, BLOCK_SECTORS),
Shaohua Li355810d2015-08-13 14:32:01 -07001432 PAGE_SIZE, sh->dev[disk_index].page,
Mike Christie796a5cf2016-06-05 14:32:07 -05001433 REQ_OP_READ, 0, false);
Shaohua Li355810d2015-08-13 14:32:01 -07001434 sh->dev[disk_index].log_checksum =
1435 le32_to_cpu(payload->checksum[1]);
1436 set_bit(R5_Wantwrite,
1437 &sh->dev[disk_index].flags);
1438 }
Shaohua Li355810d2015-08-13 14:32:01 -07001439 }
1440
JackieLiu3fd880a2016-11-02 17:02:39 +08001441 ctx->meta_total_blocks += le32_to_cpu(payload->size);
Shaohua Li355810d2015-08-13 14:32:01 -07001442 *offset += sizeof(struct r5l_payload_data_parity) +
1443 sizeof(__le32) *
1444 (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
1445 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
1446 break;
1447 }
1448
1449 for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1450 void *addr;
1451 u32 checksum;
1452
1453 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
1454 continue;
1455 addr = kmap_atomic(sh->dev[disk_index].page);
Shaohua Li5cb2fbd2015-10-28 08:41:25 -07001456 checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
Shaohua Li355810d2015-08-13 14:32:01 -07001457 kunmap_atomic(addr);
1458 if (checksum != sh->dev[disk_index].log_checksum)
1459 goto error;
1460 }
1461
1462 for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1463 struct md_rdev *rdev, *rrdev;
1464
1465 if (!test_and_clear_bit(R5_Wantwrite,
1466 &sh->dev[disk_index].flags))
1467 continue;
1468
1469 /* in case device is broken */
Shaohua Li354b4452016-11-16 17:20:19 -08001470 rcu_read_lock();
Shaohua Li355810d2015-08-13 14:32:01 -07001471 rdev = rcu_dereference(conf->disks[disk_index].rdev);
Shaohua Li354b4452016-11-16 17:20:19 -08001472 if (rdev) {
1473 atomic_inc(&rdev->nr_pending);
1474 rcu_read_unlock();
Shaohua Li355810d2015-08-13 14:32:01 -07001475 sync_page_io(rdev, stripe_sect, PAGE_SIZE,
Mike Christie796a5cf2016-06-05 14:32:07 -05001476 sh->dev[disk_index].page, REQ_OP_WRITE, 0,
1477 false);
Shaohua Li354b4452016-11-16 17:20:19 -08001478 rdev_dec_pending(rdev, rdev->mddev);
1479 rcu_read_lock();
1480 }
Shaohua Li355810d2015-08-13 14:32:01 -07001481 rrdev = rcu_dereference(conf->disks[disk_index].replacement);
Shaohua Li354b4452016-11-16 17:20:19 -08001482 if (rrdev) {
1483 atomic_inc(&rrdev->nr_pending);
1484 rcu_read_unlock();
Shaohua Li355810d2015-08-13 14:32:01 -07001485 sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
Mike Christie796a5cf2016-06-05 14:32:07 -05001486 sh->dev[disk_index].page, REQ_OP_WRITE, 0,
1487 false);
Shaohua Li354b4452016-11-16 17:20:19 -08001488 rdev_dec_pending(rrdev, rrdev->mddev);
1489 rcu_read_lock();
1490 }
1491 rcu_read_unlock();
Shaohua Li355810d2015-08-13 14:32:01 -07001492 }
1493 raid5_release_stripe(sh);
1494 return 0;
1495
1496error:
1497 for (disk_index = 0; disk_index < sh->disks; disk_index++)
1498 sh->dev[disk_index].flags = 0;
1499 raid5_release_stripe(sh);
1500 return -EINVAL;
1501}
1502
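/*
 * Walk all payloads described by the current meta block and replay
 * each referenced stripe in turn; the offset advances past a stripe's
 * payloads as they are consumed.
 */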
1503static int r5l_recovery_flush_one_meta(struct r5l_log *log,
1504 struct r5l_recovery_ctx *ctx)
1505{
1506 struct r5conf *conf = log->rdev->mddev->private;
1507 struct r5l_payload_data_parity *payload;
1508 struct r5l_meta_block *mb;
1509 int offset;
Shaohua Li355810d2015-08-13 14:32:01 -07001510 sector_t stripe_sector;
1511
1512 mb = page_address(ctx->meta_page);
1513 offset = sizeof(struct r5l_meta_block);
Shaohua Li355810d2015-08-13 14:32:01 -07001514
1515 while (offset < le32_to_cpu(mb->meta_size)) {
1516 int dd;
1517
1518 payload = (void *)mb + offset;
1519 stripe_sector = raid5_compute_sector(conf,
1520 le64_to_cpu(payload->location), 0, &dd, NULL);
1521 if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
JackieLiu3fd880a2016-11-02 17:02:39 +08001522 &offset))
Shaohua Li355810d2015-08-13 14:32:01 -07001523 return -EINVAL;
1524 }
1525 return 0;
1526}
1527
1528/* copy data/parity from log to raid disks */
1529static void r5l_recovery_flush_log(struct r5l_log *log,
1530 struct r5l_recovery_ctx *ctx)
1531{
1532 while (1) {
Song Liu9ed988f52016-11-17 15:24:42 -08001533 if (r5l_recovery_read_meta_block(log, ctx))
Shaohua Li355810d2015-08-13 14:32:01 -07001534 return;
1535 if (r5l_recovery_flush_one_meta(log, ctx))
1536 return;
1537 ctx->seq++;
1538 ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
1539 }
1540}
1541
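/*
 * Initialize 'page' as an empty meta block (header only, no payloads)
 * for log position 'pos' with sequence number 'seq', including its
 * crc32c checksum.
 */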
Song Liu9ed988f52016-11-17 15:24:42 -08001542static void
1543r5l_recovery_create_empty_meta_block(struct r5l_log *log,
1544 struct page *page,
1545 sector_t pos, u64 seq)
Shaohua Li355810d2015-08-13 14:32:01 -07001546{
Shaohua Li355810d2015-08-13 14:32:01 -07001547 struct r5l_meta_block *mb;
1548 u32 crc;
1549
Shaohua Li355810d2015-08-13 14:32:01 -07001550 mb = page_address(page);
Song Liu9ed988f52016-11-17 15:24:42 -08001551 clear_page(mb);
Shaohua Li355810d2015-08-13 14:32:01 -07001552 mb->magic = cpu_to_le32(R5LOG_MAGIC);
1553 mb->version = R5LOG_VERSION;
1554 mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
1555 mb->seq = cpu_to_le64(seq);
1556 mb->position = cpu_to_le64(pos);
Shaohua Li5cb2fbd2015-10-28 08:41:25 -07001557 crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
Shaohua Li355810d2015-08-13 14:32:01 -07001558 mb->checksum = cpu_to_le32(crc);
Song Liu9ed988f52016-11-17 15:24:42 -08001559}
Shaohua Li355810d2015-08-13 14:32:01 -07001560
Song Liu9ed988f52016-11-17 15:24:42 -08001561static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
1562 u64 seq)
1563{
1564 struct page *page;
1565
1566 page = alloc_page(GFP_KERNEL);
1567 if (!page)
1568 return -ENOMEM;
1569 r5l_recovery_create_empty_meta_block(log, page, pos, seq);
Mike Christie796a5cf2016-06-05 14:32:07 -05001570 if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
1571 WRITE_FUA, false)) {
Shaohua Li355810d2015-08-13 14:32:01 -07001572 __free_page(page);
1573 return -EIO;
1574 }
1575 __free_page(page);
1576 return 0;
1577}
1578
Shaohua Lif6bed0e2015-08-13 14:31:59 -07001579static int r5l_recovery_log(struct r5l_log *log)
1580{
Shaohua Li355810d2015-08-13 14:32:01 -07001581 struct r5l_recovery_ctx ctx;
1582
1583 ctx.pos = log->last_checkpoint;
1584 ctx.seq = log->last_cp_seq;
1585 ctx.meta_page = alloc_page(GFP_KERNEL);
1586 if (!ctx.meta_page)
1587 return -ENOMEM;
1588
1589 r5l_recovery_flush_log(log, &ctx);
1590 __free_page(ctx.meta_page);
1591
1592 /*
 1593	 * We did a recovery. Now ctx.pos points to an invalid meta block. The
 1594	 * new log will start here, but we can't let the superblock point to the
 1595	 * last valid meta block. The log might look like:
 1596	 * | meta 1| meta 2| meta 3|
 1597	 * meta 1 is valid, meta 2 is invalid and meta 3 could be valid. If the
 1598	 * superblock points to meta 1 and we write a new valid meta 2n, then on
 1599	 * the next crash recovery starts from meta 1 again; since meta 2n is
 1600	 * valid now, recovery would also treat meta 3 as valid, which is wrong.
 1601	 * The solution is to create a new meta block at meta 2's position with
 1602	 * seq == meta 1's seq + 10 and let the superblock point to it. A later
 1603	 * recovery will then not treat meta 3 as valid, because its seq doesn't match.
1604 */
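/*
 * Worked example (numbers assumed for illustration): if the last valid
 * block (meta 1) carried seq 100, recovery stops at meta 2's position
 * with ctx.seq == 101. The empty block written there gets seq 111
 * (ctx.seq + 10) and new log writes continue from seq 112
 * (ctx.seq + 11), so a stale meta 3 carrying seq 102 can never match
 * the expected sequence on a later recovery.
 */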
Shaohua Li9a8b27f2016-10-27 15:22:13 -07001605 if (ctx.seq > log->last_cp_seq) {
Shaohua Li355810d2015-08-13 14:32:01 -07001606 int ret;
1607
1608 ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
1609 if (ret)
1610 return ret;
1611 log->seq = ctx.seq + 11;
1612 log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
1613 r5l_write_super(log, ctx.pos);
Zhengyuan Liu28cd88e2016-10-24 09:55:20 +08001614 log->last_checkpoint = ctx.pos;
1615 log->next_checkpoint = ctx.pos;
Shaohua Li355810d2015-08-13 14:32:01 -07001616 } else {
1617 log->log_start = ctx.pos;
1618 log->seq = ctx.seq;
1619 }
Shaohua Lif6bed0e2015-08-13 14:31:59 -07001620 return 0;
1621}
1622
1623static void r5l_write_super(struct r5l_log *log, sector_t cp)
1624{
1625 struct mddev *mddev = log->rdev->mddev;
1626
1627 log->rdev->journal_tail = cp;
1628 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1629}
1630
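/*
 * sysfs show method for the journal_mode attribute: prints both
 * supported modes and brackets the active one, e.g.
 * "[write-through] write-back".
 */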
Song Liu2c7da142016-11-17 15:24:41 -08001631static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
1632{
1633 struct r5conf *conf = mddev->private;
1634 int ret;
1635
1636 if (!conf->log)
1637 return 0;
1638
1639 switch (conf->log->r5c_journal_mode) {
1640 case R5C_JOURNAL_MODE_WRITE_THROUGH:
1641 ret = snprintf(
1642 page, PAGE_SIZE, "[%s] %s\n",
1643 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
1644 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
1645 break;
1646 case R5C_JOURNAL_MODE_WRITE_BACK:
1647 ret = snprintf(
1648 page, PAGE_SIZE, "%s [%s]\n",
1649 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
1650 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
1651 break;
1652 default:
1653 ret = 0;
1654 }
1655 return ret;
1656}
1657
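/*
 * sysfs store method for the journal_mode attribute: accepts
 * "write-through" or "write-back" (with or without a trailing
 * newline) and switches the mode with the array suspended.
 */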
1658static ssize_t r5c_journal_mode_store(struct mddev *mddev,
1659 const char *page, size_t length)
1660{
1661 struct r5conf *conf = mddev->private;
1662 struct r5l_log *log = conf->log;
1663 int val = -1, i;
1664 int len = length;
1665
1666 if (!log)
1667 return -ENODEV;
1668
1669 if (len && page[len - 1] == '\n')
1670 len -= 1;
1671 for (i = 0; i < ARRAY_SIZE(r5c_journal_mode_str); i++)
1672 if (strlen(r5c_journal_mode_str[i]) == len &&
1673 strncmp(page, r5c_journal_mode_str[i], len) == 0) {
1674 val = i;
1675 break;
1676 }
1677 if (val < R5C_JOURNAL_MODE_WRITE_THROUGH ||
1678 val > R5C_JOURNAL_MODE_WRITE_BACK)
1679 return -EINVAL;
1680
1681 mddev_suspend(mddev);
1682 conf->log->r5c_journal_mode = val;
1683 mddev_resume(mddev);
1684
1685 pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
1686 mdname(mddev), val, r5c_journal_mode_str[val]);
1687 return length;
1688}
1689
1690struct md_sysfs_entry
1691r5c_journal_mode = __ATTR(journal_mode, 0644,
1692 r5c_journal_mode_show, r5c_journal_mode_store);
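/*
 * Usage sketch (sysfs path assumed; it depends on the array name):
 *
 *   cat /sys/block/md0/md/journal_mode    -> "[write-through] write-back"
 *   echo write-back > /sys/block/md0/md/journal_mode
 */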
1693
Song Liu2ded3702016-11-17 15:24:38 -08001694/*
 1695 * Try to handle the write operation in the caching phase. This function
 1696 * should only be called in write-back mode.
 1697 *
 1698 * If all outstanding writes can be handled in the caching phase, return 0.
 1699 * If the writes require the write-out phase, call r5c_make_stripe_write_out()
 1700 * and return -EAGAIN.
1701 */
1702int r5c_try_caching_write(struct r5conf *conf,
1703 struct stripe_head *sh,
1704 struct stripe_head_state *s,
1705 int disks)
1706{
1707 struct r5l_log *log = conf->log;
Song Liu1e6d6902016-11-17 15:24:39 -08001708 int i;
1709 struct r5dev *dev;
1710 int to_cache = 0;
Song Liu2ded3702016-11-17 15:24:38 -08001711
1712 BUG_ON(!r5c_is_writeback(log));
1713
Song Liu1e6d6902016-11-17 15:24:39 -08001714 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
1715 /*
1716 * There are two different scenarios here:
1717 * 1. The stripe has some data cached, and it is sent to
1718 * write-out phase for reclaim
1719 * 2. The stripe is clean, and this is the first write
1720 *
1721 * For 1, return -EAGAIN, so we continue with
1722 * handle_stripe_dirtying().
1723 *
1724 * For 2, set STRIPE_R5C_CACHING and continue with caching
1725 * write.
1726 */
1727
 1728		/* case 1: anything in the journal (s->injournal) or written (s->written) */
1729 if (s->injournal > 0 || s->written > 0)
1730 return -EAGAIN;
1731 /* case 2 */
1732 set_bit(STRIPE_R5C_CACHING, &sh->state);
1733 }
1734
1735 for (i = disks; i--; ) {
1736 dev = &sh->dev[i];
1737 /* if non-overwrite, use writing-out phase */
1738 if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
1739 !test_bit(R5_InJournal, &dev->flags)) {
1740 r5c_make_stripe_write_out(sh);
1741 return -EAGAIN;
1742 }
1743 }
1744
1745 for (i = disks; i--; ) {
1746 dev = &sh->dev[i];
1747 if (dev->towrite) {
1748 set_bit(R5_Wantwrite, &dev->flags);
1749 set_bit(R5_Wantdrain, &dev->flags);
1750 set_bit(R5_LOCKED, &dev->flags);
1751 to_cache++;
1752 }
1753 }
1754
1755 if (to_cache) {
1756 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1757 /*
1758 * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data()
1759 * in ops_run_io(). STRIPE_LOG_TRAPPED will be cleared in
1760 * r5c_handle_data_cached()
1761 */
1762 set_bit(STRIPE_LOG_TRAPPED, &sh->state);
1763 }
1764
1765 return 0;
1766}
1767
1768/*
1769 * free extra pages (orig_page) we allocated for prexor
1770 */
1771void r5c_release_extra_page(struct stripe_head *sh)
1772{
1773 int i;
1774
1775 for (i = sh->disks; i--; )
1776 if (sh->dev[i].page != sh->dev[i].orig_page) {
1777 struct page *p = sh->dev[i].orig_page;
1778
1779 sh->dev[i].orig_page = sh->dev[i].page;
1780 put_page(p);
1781 }
Song Liu2ded3702016-11-17 15:24:38 -08001782}
1783
1784/*
1785 * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the
1786 * stripe is committed to RAID disks.
1787 */
1788void r5c_finish_stripe_write_out(struct r5conf *conf,
1789 struct stripe_head *sh,
1790 struct stripe_head_state *s)
1791{
Song Liu1e6d6902016-11-17 15:24:39 -08001792 int i;
1793 int do_wakeup = 0;
1794
Song Liu2ded3702016-11-17 15:24:38 -08001795 if (!conf->log ||
1796 !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
1797 return;
1798
1799 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
1800 clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
1801
1802 if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
1803 return;
Song Liu1e6d6902016-11-17 15:24:39 -08001804
1805 for (i = sh->disks; i--; ) {
1806 clear_bit(R5_InJournal, &sh->dev[i].flags);
1807 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1808 do_wakeup = 1;
1809 }
1810
1811 /*
 1812	 * analyse_stripe() runs before r5c_finish_stripe_write_out().
 1813	 * We have just cleared R5_InJournal, so update s->injournal as well.
1814 */
1815 s->injournal = 0;
1816
1817 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
1818 if (atomic_dec_and_test(&conf->pending_full_writes))
1819 md_wakeup_thread(conf->mddev->thread);
1820
1821 if (do_wakeup)
1822 wake_up(&conf->wait_for_overlap);
Song Liua39f7af2016-11-17 15:24:40 -08001823
1824 if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
1825 return;
1826
1827 spin_lock_irq(&conf->log->stripe_in_journal_lock);
1828 list_del_init(&sh->r5c);
1829 spin_unlock_irq(&conf->log->stripe_in_journal_lock);
1830 sh->log_start = MaxSector;
1831 atomic_dec(&conf->log->stripe_in_journal_count);
Song Liu1e6d6902016-11-17 15:24:39 -08001832}
1833
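/*
 * Write the dirty data pages of a caching-phase stripe to the journal:
 * checksum each page marked R5_Wantwrite, reserve space for one meta
 * block plus the data pages, then either start the log I/O or queue
 * the stripe on no_space_stripes/no_mem_stripes when the log is tight.
 */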
1834int
1835r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
1836 struct stripe_head_state *s)
1837{
Song Liua39f7af2016-11-17 15:24:40 -08001838 struct r5conf *conf = sh->raid_conf;
Song Liu1e6d6902016-11-17 15:24:39 -08001839 int pages = 0;
1840 int reserve;
1841 int i;
1842 int ret = 0;
1843
1844 BUG_ON(!log);
1845
1846 for (i = 0; i < sh->disks; i++) {
1847 void *addr;
1848
1849 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
1850 continue;
1851 addr = kmap_atomic(sh->dev[i].page);
1852 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
1853 addr, PAGE_SIZE);
1854 kunmap_atomic(addr);
1855 pages++;
1856 }
1857 WARN_ON(pages == 0);
1858
1859 /*
 1860	 * The stripe must enter the state machine again to call endio,
 1861	 * so don't delay it.
1862 */
1863 clear_bit(STRIPE_DELAYED, &sh->state);
1864 atomic_inc(&sh->count);
1865
1866 mutex_lock(&log->io_mutex);
1867 /* meta + data */
1868 reserve = (1 + pages) << (PAGE_SHIFT - 9);
Song Liu1e6d6902016-11-17 15:24:39 -08001869
Song Liua39f7af2016-11-17 15:24:40 -08001870 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
1871 sh->log_start == MaxSector)
1872 r5l_add_no_space_stripe(log, sh);
1873 else if (!r5l_has_free_space(log, reserve)) {
1874 if (sh->log_start == log->last_checkpoint)
1875 BUG();
1876 else
1877 r5l_add_no_space_stripe(log, sh);
Song Liu1e6d6902016-11-17 15:24:39 -08001878 } else {
1879 ret = r5l_log_stripe(log, sh, pages, 0);
1880 if (ret) {
1881 spin_lock_irq(&log->io_list_lock);
1882 list_add_tail(&sh->log_list, &log->no_mem_stripes);
1883 spin_unlock_irq(&log->io_list_lock);
1884 }
1885 }
1886
1887 mutex_unlock(&log->io_mutex);
1888 return 0;
Song Liu2ded3702016-11-17 15:24:38 -08001889}
1890
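/*
 * Read the meta block that the superblock's journal_tail points to.
 * If it is missing or corrupt, start a fresh log by writing an empty
 * meta block at sector 0; otherwise pick up the stored sequence
 * number. Then compute the reclaim watermark and run log recovery.
 */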
Shaohua Lif6bed0e2015-08-13 14:31:59 -07001891static int r5l_load_log(struct r5l_log *log)
1892{
1893 struct md_rdev *rdev = log->rdev;
1894 struct page *page;
1895 struct r5l_meta_block *mb;
1896 sector_t cp = log->rdev->journal_tail;
1897 u32 stored_crc, expected_crc;
1898 bool create_super = false;
1899 int ret;
1900
1901 /* Make sure it's valid */
1902 if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
1903 cp = 0;
1904 page = alloc_page(GFP_KERNEL);
1905 if (!page)
1906 return -ENOMEM;
1907
Mike Christie796a5cf2016-06-05 14:32:07 -05001908 if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
Shaohua Lif6bed0e2015-08-13 14:31:59 -07001909 ret = -EIO;
1910 goto ioerr;
1911 }
1912 mb = page_address(page);
1913
1914 if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
1915 mb->version != R5LOG_VERSION) {
1916 create_super = true;
1917 goto create;
1918 }
1919 stored_crc = le32_to_cpu(mb->checksum);
1920 mb->checksum = 0;
Shaohua Li5cb2fbd2015-10-28 08:41:25 -07001921 expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
Shaohua Lif6bed0e2015-08-13 14:31:59 -07001922 if (stored_crc != expected_crc) {
1923 create_super = true;
1924 goto create;
1925 }
1926 if (le64_to_cpu(mb->position) != cp) {
1927 create_super = true;
1928 goto create;
1929 }
1930create:
1931 if (create_super) {
1932 log->last_cp_seq = prandom_u32();
1933 cp = 0;
Zhengyuan Liu56056c22016-10-24 16:15:59 +08001934 r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq);
Shaohua Lif6bed0e2015-08-13 14:31:59 -07001935 /*
 1936		 * Make sure the superblock points to the correct address. The log
 1937		 * might get data very soon. If the superblock doesn't have the
 1938		 * correct log tail address, recovery can't find the log.
1939 */
1940 r5l_write_super(log, cp);
1941 } else
1942 log->last_cp_seq = le64_to_cpu(mb->seq);
1943
1944 log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
Shaohua Li0576b1c2015-08-13 14:32:00 -07001945 log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
1946 if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
1947 log->max_free_space = RECLAIM_MAX_FREE_SPACE;
Shaohua Lif6bed0e2015-08-13 14:31:59 -07001948 log->last_checkpoint = cp;
Zhengyuan Liu28cd88e2016-10-24 09:55:20 +08001949 log->next_checkpoint = cp;
Song Liua39f7af2016-11-17 15:24:40 -08001950 mutex_lock(&log->io_mutex);
1951 r5c_update_log_state(log);
1952 mutex_unlock(&log->io_mutex);
Shaohua Lif6bed0e2015-08-13 14:31:59 -07001953
1954 __free_page(page);
1955
1956 return r5l_recovery_log(log);
1957ioerr:
1958 __free_page(page);
1959 return ret;
1960}
1961
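/*
 * Set up the journal for 'rdev': allocate the io unit cache, mempools
 * and bioset, start the reclaim thread, default to write-through mode,
 * then load/recover the existing log before publishing it in
 * conf->log.
 */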
1962int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
1963{
Jens Axboec888a8f2016-04-13 13:33:19 -06001964 struct request_queue *q = bdev_get_queue(rdev->bdev);
Shaohua Lif6bed0e2015-08-13 14:31:59 -07001965 struct r5l_log *log;
1966
1967 if (PAGE_SIZE != 4096)
1968 return -EINVAL;
Song Liuc757ec92016-11-17 15:24:36 -08001969
1970 /*
1971 * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block and
1972 * raid_disks r5l_payload_data_parity.
1973 *
 1974	 * The write journal and cache do not work for very big arrays
 1975	 * (raid_disks > 203).
1976 */
1977 if (sizeof(struct r5l_meta_block) +
1978 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
1979 conf->raid_disks) > PAGE_SIZE) {
1980 pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n",
1981 mdname(conf->mddev), conf->raid_disks);
1982 return -EINVAL;
1983 }
1984
Shaohua Lif6bed0e2015-08-13 14:31:59 -07001985 log = kzalloc(sizeof(*log), GFP_KERNEL);
1986 if (!log)
1987 return -ENOMEM;
1988 log->rdev = rdev;
1989
Jens Axboec888a8f2016-04-13 13:33:19 -06001990 log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;
Christoph Hellwig56fef7c2015-10-05 09:31:09 +02001991
Shaohua Li5cb2fbd2015-10-28 08:41:25 -07001992 log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
1993 sizeof(rdev->mddev->uuid));
Shaohua Lif6bed0e2015-08-13 14:31:59 -07001994
1995 mutex_init(&log->io_mutex);
1996
1997 spin_lock_init(&log->io_list_lock);
1998 INIT_LIST_HEAD(&log->running_ios);
Shaohua Li0576b1c2015-08-13 14:32:00 -07001999 INIT_LIST_HEAD(&log->io_end_ios);
Shaohua Lia8c34f92015-09-02 13:49:46 -07002000 INIT_LIST_HEAD(&log->flushing_ios);
Christoph Hellwig04732f72015-10-05 09:31:07 +02002001 INIT_LIST_HEAD(&log->finished_ios);
Shaohua Lia8c34f92015-09-02 13:49:46 -07002002 bio_init(&log->flush_bio);
Shaohua Lif6bed0e2015-08-13 14:31:59 -07002003
2004 log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
2005 if (!log->io_kc)
2006 goto io_kc;
2007
Christoph Hellwig5036c3902015-12-21 10:51:02 +11002008 log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc);
2009 if (!log->io_pool)
2010 goto io_pool;
2011
Christoph Hellwigc38d29b2015-12-21 10:51:02 +11002012 log->bs = bioset_create(R5L_POOL_SIZE, 0);
2013 if (!log->bs)
2014 goto io_bs;
2015
Christoph Hellwige8deb632015-12-21 10:51:02 +11002016 log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0);
2017 if (!log->meta_pool)
2018 goto out_mempool;
2019
Shaohua Li0576b1c2015-08-13 14:32:00 -07002020 log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
2021 log->rdev->mddev, "reclaim");
2022 if (!log->reclaim_thread)
2023 goto reclaim_thread;
Song Liua39f7af2016-11-17 15:24:40 -08002024 log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
2025
Shaohua Li0fd22b42015-09-02 13:49:47 -07002026 init_waitqueue_head(&log->iounit_wait);
Shaohua Li0576b1c2015-08-13 14:32:00 -07002027
Christoph Hellwig5036c3902015-12-21 10:51:02 +11002028 INIT_LIST_HEAD(&log->no_mem_stripes);
2029
Shaohua Lif6bed0e2015-08-13 14:31:59 -07002030 INIT_LIST_HEAD(&log->no_space_stripes);
2031 spin_lock_init(&log->no_space_stripes_lock);
2032
Song Liu2ded3702016-11-17 15:24:38 -08002033 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
Song Liua39f7af2016-11-17 15:24:40 -08002034 INIT_LIST_HEAD(&log->stripe_in_journal_list);
2035 spin_lock_init(&log->stripe_in_journal_lock);
2036 atomic_set(&log->stripe_in_journal_count, 0);
Song Liu2ded3702016-11-17 15:24:38 -08002037
Shaohua Lif6bed0e2015-08-13 14:31:59 -07002038 if (r5l_load_log(log))
2039 goto error;
2040
Shaohua Lif6b6ec52015-12-21 10:51:02 +11002041 rcu_assign_pointer(conf->log, log);
Shaohua Lia62ab492016-01-06 14:37:13 -08002042 set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
Shaohua Lif6bed0e2015-08-13 14:31:59 -07002043 return 0;
Christoph Hellwige8deb632015-12-21 10:51:02 +11002044
Shaohua Lif6bed0e2015-08-13 14:31:59 -07002045error:
Shaohua Li0576b1c2015-08-13 14:32:00 -07002046 md_unregister_thread(&log->reclaim_thread);
2047reclaim_thread:
Christoph Hellwige8deb632015-12-21 10:51:02 +11002048 mempool_destroy(log->meta_pool);
2049out_mempool:
Christoph Hellwigc38d29b2015-12-21 10:51:02 +11002050 bioset_free(log->bs);
2051io_bs:
Christoph Hellwig5036c3902015-12-21 10:51:02 +11002052 mempool_destroy(log->io_pool);
2053io_pool:
Shaohua Lif6bed0e2015-08-13 14:31:59 -07002054 kmem_cache_destroy(log->io_kc);
2055io_kc:
2056 kfree(log);
2057 return -EINVAL;
2058}
2059
2060void r5l_exit_log(struct r5l_log *log)
2061{
Shaohua Li0576b1c2015-08-13 14:32:00 -07002062 md_unregister_thread(&log->reclaim_thread);
Christoph Hellwige8deb632015-12-21 10:51:02 +11002063 mempool_destroy(log->meta_pool);
Christoph Hellwigc38d29b2015-12-21 10:51:02 +11002064 bioset_free(log->bs);
Christoph Hellwig5036c3902015-12-21 10:51:02 +11002065 mempool_destroy(log->io_pool);
Shaohua Lif6bed0e2015-08-13 14:31:59 -07002066 kmem_cache_destroy(log->io_kc);
2067 kfree(log);
2068}