blob: db5b72b11594fd84cf54424b0cbe9f29fbf0331a [file] [log] [blame]
/*
2 * Partial Parity Log for closing the RAID5 write hole
3 * Copyright (c) 2017, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/kernel.h>
16#include <linux/blkdev.h>
17#include <linux/slab.h>
18#include <linux/crc32c.h>
19#include <linux/flex_array.h>
20#include <linux/async_tx.h>
21#include <linux/raid/md_p.h>
22#include "md.h"
23#include "raid5.h"
24
25/*
26 * PPL consists of a 4KB header (struct ppl_header) and at least 128KB for
27 * partial parity data. The header contains an array of entries
28 * (struct ppl_header_entry) which describe the logged write requests.
29 * Partial parity for the entries comes after the header, written in the same
30 * sequence as the entries:
31 *
 *   Header
 *     entry0
 *     ...
 *     entryN
 *   PP data
 *     PP for entry0
 *     ...
 *     PP for entryN
40 *
 * An entry describes one or more consecutive stripe_heads, up to a full
 * stripe. The modified raid data chunks form an m-by-n matrix, where m is the
 * number of stripe_heads in the entry and n is the number of modified data
 * disks. Every stripe_head in the entry must write to the same data disks.
 * An example of a valid case described by a single entry (writes to the first
 * stripe of a 4 disk array, 16k chunk size):
47 *
 *     sh->sector   dd0   dd1   dd2     ppl
 *                +-----+-----+-----+
 *     0          | --- | --- | --- | +----+
 *     8          | -W- | -W- | --- | | pp |   data_sector = 8
 *     16         | -W- | -W- | --- | | pp |   data_size = 3 * 2 * 4k
 *     24         | -W- | -W- | --- | | pp |   pp_size = 3 * 4k
 *                +-----+-----+-----+ +----+
55 *
56 * data_sector is the first raid sector of the modified data, data_size is the
57 * total size of modified data and pp_size is the size of partial parity for
58 * this entry. Entries for full stripe writes contain no partial parity
59 * (pp_size = 0), they only mark the stripes for which parity should be
60 * recalculated after an unclean shutdown. Every entry holds a checksum of its
61 * partial parity, the header also has a checksum of the header itself.
62 *
63 * A write request is always logged to the PPL instance stored on the parity
64 * disk of the corresponding stripe. For each member disk there is one ppl_log
65 * used to handle logging for this disk, independently from others. They are
66 * grouped in child_logs array in struct ppl_conf, which is assigned to
67 * r5conf->log_private.
68 *
69 * ppl_io_unit represents a full PPL write, header_page contains the ppl_header.
70 * PPL entries for logged stripes are added in ppl_log_stripe(). A stripe_head
71 * can be appended to the last entry if it meets the conditions for a valid
72 * entry described above, otherwise a new entry is added. Checksums of entries
73 * are calculated incrementally as stripes containing partial parity are being
74 * added. ppl_submit_iounit() calculates the checksum of the header and submits
75 * a bio containing the header page and partial parity pages (sh->ppl_page) for
76 * all stripes of the io_unit. When the PPL write completes, the stripes
77 * associated with the io_unit are released and raid5d starts writing their data
78 * and parity. When all stripes are written, the io_unit is freed and the next
79 * can be submitted.
80 *
81 * An io_unit is used to gather stripes until it is submitted or becomes full
82 * (if the maximum number of entries or size of PPL is reached). Another io_unit
83 * can't be submitted until the previous has completed (PPL and stripe
84 * data+parity is written). The log->io_list tracks all io_units of a log
85 * (for a single member disk). New io_units are added to the end of the list
86 * and the first io_unit is submitted, if it is not submitted already.
87 * The current io_unit accepting new stripes is always at the end of the list.
88 */
89
90struct ppl_conf {
91 struct mddev *mddev;
92
93 /* array of child logs, one for each raid disk */
94 struct ppl_log *child_logs;
95 int count;
96
97 int block_size; /* the logical block size used for data_sector
98 * in ppl_header_entry */
99 u32 signature; /* raid array identifier */
100 atomic64_t seq; /* current log write sequence number */
101
102 struct kmem_cache *io_kc;
103 mempool_t *io_pool;
104 struct bio_set *bs;
105 mempool_t *meta_pool;
106};
107
108struct ppl_log {
109 struct ppl_conf *ppl_conf; /* shared between all log instances */
110
111 struct md_rdev *rdev; /* array member disk associated with
112 * this log instance */
113 struct mutex io_mutex;
114 struct ppl_io_unit *current_io; /* current io_unit accepting new data
115 * always at the end of io_list */
116 spinlock_t io_list_lock;
117 struct list_head io_list; /* all io_units of this log */
118 struct list_head no_mem_stripes;/* stripes to retry if failed to
119 * allocate io_unit */
120};
121
122#define PPL_IO_INLINE_BVECS 32
123
124struct ppl_io_unit {
125 struct ppl_log *log;
126
127 struct page *header_page; /* for ppl_header */
128
129 unsigned int entries_count; /* number of entries in ppl_header */
130 unsigned int pp_size; /* total size current of partial parity */
131
132 u64 seq; /* sequence number of this log write */
133 struct list_head log_sibling; /* log->io_list */
134
135 struct list_head stripe_list; /* stripes added to the io_unit */
136 atomic_t pending_stripes; /* how many stripes not written to raid */
137
138 bool submitted; /* true if write to log started */
139
140 /* inline bio and its biovec for submitting the iounit */
141 struct bio bio;
142 struct bio_vec biovec[PPL_IO_INLINE_BVECS];
143};
144
145struct dma_async_tx_descriptor *
146ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
147 struct dma_async_tx_descriptor *tx)
148{
149 int disks = sh->disks;
150 struct page **xor_srcs = flex_array_get(percpu->scribble, 0);
151 int count = 0, pd_idx = sh->pd_idx, i;
152 struct async_submit_ctl submit;
153
154 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
155
156 /*
157 * Partial parity is the XOR of stripe data chunks that are not changed
158 * during the write request. Depending on available data
159 * (read-modify-write vs. reconstruct-write case) we calculate it
160 * differently.
161 */
162 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
163 /* rmw: xor old data and parity from updated disks */
164 for (i = disks; i--;) {
165 struct r5dev *dev = &sh->dev[i];
166 if (test_bit(R5_Wantdrain, &dev->flags) || i == pd_idx)
167 xor_srcs[count++] = dev->page;
168 }
169 } else if (sh->reconstruct_state == reconstruct_state_drain_run) {
170 /* rcw: xor data from all not updated disks */
171 for (i = disks; i--;) {
172 struct r5dev *dev = &sh->dev[i];
173 if (test_bit(R5_UPTODATE, &dev->flags))
174 xor_srcs[count++] = dev->page;
175 }
176 } else {
177 return tx;
178 }
179
180 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx,
181 NULL, sh, flex_array_get(percpu->scribble, 0)
182 + sizeof(struct page *) * (sh->disks + 2));
183
184 if (count == 1)
185 tx = async_memcpy(sh->ppl_page, xor_srcs[0], 0, 0, PAGE_SIZE,
186 &submit);
187 else
188 tx = async_xor(sh->ppl_page, xor_srcs, 0, count, PAGE_SIZE,
189 &submit);
190
191 return tx;
192}
193
194static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log,
195 struct stripe_head *sh)
196{
197 struct ppl_conf *ppl_conf = log->ppl_conf;
198 struct ppl_io_unit *io;
199 struct ppl_header *pplhdr;
200
201 io = mempool_alloc(ppl_conf->io_pool, GFP_ATOMIC);
202 if (!io)
203 return NULL;
204
205 memset(io, 0, sizeof(*io));
206 io->log = log;
207 INIT_LIST_HEAD(&io->log_sibling);
208 INIT_LIST_HEAD(&io->stripe_list);
209 atomic_set(&io->pending_stripes, 0);
210 bio_init(&io->bio, io->biovec, PPL_IO_INLINE_BVECS);
211
212 io->header_page = mempool_alloc(ppl_conf->meta_pool, GFP_NOIO);
213 pplhdr = page_address(io->header_page);
214 clear_page(pplhdr);
215 memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
216 pplhdr->signature = cpu_to_le32(ppl_conf->signature);
217
218 io->seq = atomic64_add_return(1, &ppl_conf->seq);
219 pplhdr->generation = cpu_to_le64(io->seq);
220
221 return io;
222}
223
224static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh)
225{
226 struct ppl_io_unit *io = log->current_io;
227 struct ppl_header_entry *e = NULL;
228 struct ppl_header *pplhdr;
229 int i;
230 sector_t data_sector = 0;
231 int data_disks = 0;
232 unsigned int entry_space = (log->rdev->ppl.size << 9) - PPL_HEADER_SIZE;
233 struct r5conf *conf = sh->raid_conf;
234
235 pr_debug("%s: stripe: %llu\n", __func__, (unsigned long long)sh->sector);
236
237 /* check if current io_unit is full */
238 if (io && (io->pp_size == entry_space ||
239 io->entries_count == PPL_HDR_MAX_ENTRIES)) {
240 pr_debug("%s: add io_unit blocked by seq: %llu\n",
241 __func__, io->seq);
242 io = NULL;
243 }
244
245 /* add a new unit if there is none or the current is full */
246 if (!io) {
247 io = ppl_new_iounit(log, sh);
248 if (!io)
249 return -ENOMEM;
250 spin_lock_irq(&log->io_list_lock);
251 list_add_tail(&io->log_sibling, &log->io_list);
252 spin_unlock_irq(&log->io_list_lock);
253
254 log->current_io = io;
255 }
256
257 for (i = 0; i < sh->disks; i++) {
258 struct r5dev *dev = &sh->dev[i];
259
260 if (i != sh->pd_idx && test_bit(R5_Wantwrite, &dev->flags)) {
261 if (!data_disks || dev->sector < data_sector)
262 data_sector = dev->sector;
263 data_disks++;
264 }
265 }
266 BUG_ON(!data_disks);
267
268 pr_debug("%s: seq: %llu data_sector: %llu data_disks: %d\n", __func__,
269 io->seq, (unsigned long long)data_sector, data_disks);
270
271 pplhdr = page_address(io->header_page);
272
273 if (io->entries_count > 0) {
274 struct ppl_header_entry *last =
275 &pplhdr->entries[io->entries_count - 1];
276 struct stripe_head *sh_last = list_last_entry(
277 &io->stripe_list, struct stripe_head, log_list);
278 u64 data_sector_last = le64_to_cpu(last->data_sector);
279 u32 data_size_last = le32_to_cpu(last->data_size);
280
281 /*
282 * Check if we can append the stripe to the last entry. It must
283 * be just after the last logged stripe and write to the same
284 * disks. Use bit shift and logarithm to avoid 64-bit division.
285 */
286 if ((sh->sector == sh_last->sector + STRIPE_SECTORS) &&
287 (data_sector >> ilog2(conf->chunk_sectors) ==
288 data_sector_last >> ilog2(conf->chunk_sectors)) &&
289 ((data_sector - data_sector_last) * data_disks ==
290 data_size_last >> 9))
291 e = last;
292 }
293
294 if (!e) {
295 e = &pplhdr->entries[io->entries_count++];
296 e->data_sector = cpu_to_le64(data_sector);
297 e->parity_disk = cpu_to_le32(sh->pd_idx);
298 e->checksum = cpu_to_le32(~0);
299 }
300
301 le32_add_cpu(&e->data_size, data_disks << PAGE_SHIFT);
302
303 /* don't write any PP if full stripe write */
304 if (!test_bit(STRIPE_FULL_WRITE, &sh->state)) {
305 le32_add_cpu(&e->pp_size, PAGE_SIZE);
306 io->pp_size += PAGE_SIZE;
307 e->checksum = cpu_to_le32(crc32c_le(le32_to_cpu(e->checksum),
308 page_address(sh->ppl_page),
309 PAGE_SIZE));
310 }
311
312 list_add_tail(&sh->log_list, &io->stripe_list);
313 atomic_inc(&io->pending_stripes);
314 sh->ppl_io = io;
315
316 return 0;
317}
318
319int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh)
320{
321 struct ppl_conf *ppl_conf = conf->log_private;
322 struct ppl_io_unit *io = sh->ppl_io;
323 struct ppl_log *log;
324
325 if (io || test_bit(STRIPE_SYNCING, &sh->state) ||
326 !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
327 !test_bit(R5_Insync, &sh->dev[sh->pd_idx].flags)) {
328 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
329 return -EAGAIN;
330 }
331
332 log = &ppl_conf->child_logs[sh->pd_idx];
333
334 mutex_lock(&log->io_mutex);
335
336 if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) {
337 mutex_unlock(&log->io_mutex);
338 return -EAGAIN;
339 }
340
341 set_bit(STRIPE_LOG_TRAPPED, &sh->state);
342 clear_bit(STRIPE_DELAYED, &sh->state);
343 atomic_inc(&sh->count);
344
345 if (ppl_log_stripe(log, sh)) {
346 spin_lock_irq(&log->io_list_lock);
347 list_add_tail(&sh->log_list, &log->no_mem_stripes);
348 spin_unlock_irq(&log->io_list_lock);
349 }
350
351 mutex_unlock(&log->io_mutex);
352
353 return 0;
354}
355
356static void ppl_log_endio(struct bio *bio)
357{
358 struct ppl_io_unit *io = bio->bi_private;
359 struct ppl_log *log = io->log;
360 struct ppl_conf *ppl_conf = log->ppl_conf;
361 struct stripe_head *sh, *next;
362
363 pr_debug("%s: seq: %llu\n", __func__, io->seq);
364
365 if (bio->bi_error)
366 md_error(ppl_conf->mddev, log->rdev);
367
368 mempool_free(io->header_page, ppl_conf->meta_pool);
369
370 list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
371 list_del_init(&sh->log_list);
372
373 set_bit(STRIPE_HANDLE, &sh->state);
374 raid5_release_stripe(sh);
375 }
376}
377
378static void ppl_submit_iounit_bio(struct ppl_io_unit *io, struct bio *bio)
379{
380 char b[BDEVNAME_SIZE];
381
382 pr_debug("%s: seq: %llu size: %u sector: %llu dev: %s\n",
383 __func__, io->seq, bio->bi_iter.bi_size,
384 (unsigned long long)bio->bi_iter.bi_sector,
385 bdevname(bio->bi_bdev, b));
386
387 submit_bio(bio);
388}
389
390static void ppl_submit_iounit(struct ppl_io_unit *io)
391{
392 struct ppl_log *log = io->log;
393 struct ppl_conf *ppl_conf = log->ppl_conf;
394 struct ppl_header *pplhdr = page_address(io->header_page);
395 struct bio *bio = &io->bio;
396 struct stripe_head *sh;
397 int i;
398
399 for (i = 0; i < io->entries_count; i++) {
400 struct ppl_header_entry *e = &pplhdr->entries[i];
401
402 pr_debug("%s: seq: %llu entry: %d data_sector: %llu pp_size: %u data_size: %u\n",
403 __func__, io->seq, i, le64_to_cpu(e->data_sector),
404 le32_to_cpu(e->pp_size), le32_to_cpu(e->data_size));
405
406 e->data_sector = cpu_to_le64(le64_to_cpu(e->data_sector) >>
407 ilog2(ppl_conf->block_size >> 9));
408 e->checksum = cpu_to_le32(~le32_to_cpu(e->checksum));
409 }
410
411 pplhdr->entries_count = cpu_to_le32(io->entries_count);
412 pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE));
413
414 bio->bi_private = io;
415 bio->bi_end_io = ppl_log_endio;
416 bio->bi_opf = REQ_OP_WRITE | REQ_FUA;
417 bio->bi_bdev = log->rdev->bdev;
418 bio->bi_iter.bi_sector = log->rdev->ppl.sector;
419 bio_add_page(bio, io->header_page, PAGE_SIZE, 0);
420
421 list_for_each_entry(sh, &io->stripe_list, log_list) {
422 /* entries for full stripe writes have no partial parity */
423 if (test_bit(STRIPE_FULL_WRITE, &sh->state))
424 continue;
425
426 if (!bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0)) {
427 struct bio *prev = bio;
428
429 bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES,
430 ppl_conf->bs);
431 bio->bi_opf = prev->bi_opf;
432 bio->bi_bdev = prev->bi_bdev;
433 bio->bi_iter.bi_sector = bio_end_sector(prev);
434 bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0);
435
436 bio_chain(bio, prev);
437 ppl_submit_iounit_bio(io, prev);
438 }
439 }
440
441 ppl_submit_iounit_bio(io, bio);
442}
443
444static void ppl_submit_current_io(struct ppl_log *log)
445{
446 struct ppl_io_unit *io;
447
448 spin_lock_irq(&log->io_list_lock);
449
450 io = list_first_entry_or_null(&log->io_list, struct ppl_io_unit,
451 log_sibling);
452 if (io && io->submitted)
453 io = NULL;
454
455 spin_unlock_irq(&log->io_list_lock);
456
457 if (io) {
458 io->submitted = true;
459
460 if (io == log->current_io)
461 log->current_io = NULL;
462
463 ppl_submit_iounit(io);
464 }
465}
466
467void ppl_write_stripe_run(struct r5conf *conf)
468{
469 struct ppl_conf *ppl_conf = conf->log_private;
470 struct ppl_log *log;
471 int i;
472
473 for (i = 0; i < ppl_conf->count; i++) {
474 log = &ppl_conf->child_logs[i];
475
476 mutex_lock(&log->io_mutex);
477 ppl_submit_current_io(log);
478 mutex_unlock(&log->io_mutex);
479 }
480}
481
482static void ppl_io_unit_finished(struct ppl_io_unit *io)
483{
484 struct ppl_log *log = io->log;
485 unsigned long flags;
486
487 pr_debug("%s: seq: %llu\n", __func__, io->seq);
488
489 spin_lock_irqsave(&log->io_list_lock, flags);
490
491 list_del(&io->log_sibling);
492 mempool_free(io, log->ppl_conf->io_pool);
493
494 if (!list_empty(&log->no_mem_stripes)) {
495 struct stripe_head *sh = list_first_entry(&log->no_mem_stripes,
496 struct stripe_head,
497 log_list);
498 list_del_init(&sh->log_list);
499 set_bit(STRIPE_HANDLE, &sh->state);
500 raid5_release_stripe(sh);
501 }
502
503 spin_unlock_irqrestore(&log->io_list_lock, flags);
504}
505
506void ppl_stripe_write_finished(struct stripe_head *sh)
507{
508 struct ppl_io_unit *io;
509
510 io = sh->ppl_io;
511 sh->ppl_io = NULL;
512
513 if (io && atomic_dec_and_test(&io->pending_stripes))
514 ppl_io_unit_finished(io);
515}
516
517static void __ppl_exit_log(struct ppl_conf *ppl_conf)
518{
519 clear_bit(MD_HAS_PPL, &ppl_conf->mddev->flags);
520
521 kfree(ppl_conf->child_logs);
522
523 mempool_destroy(ppl_conf->meta_pool);
524 if (ppl_conf->bs)
525 bioset_free(ppl_conf->bs);
526 mempool_destroy(ppl_conf->io_pool);
527 kmem_cache_destroy(ppl_conf->io_kc);
528
529 kfree(ppl_conf);
530}
531
532void ppl_exit_log(struct r5conf *conf)
533{
534 struct ppl_conf *ppl_conf = conf->log_private;
535
536 if (ppl_conf) {
537 __ppl_exit_log(ppl_conf);
538 conf->log_private = NULL;
539 }
540}
541
542static int ppl_validate_rdev(struct md_rdev *rdev)
543{
544 char b[BDEVNAME_SIZE];
545 int ppl_data_sectors;
546 int ppl_size_new;
547
548 /*
549 * The configured PPL size must be enough to store
550 * the header and (at the very least) partial parity
551 * for one stripe. Round it down to ensure the data
552 * space is cleanly divisible by stripe size.
553 */
554 ppl_data_sectors = rdev->ppl.size - (PPL_HEADER_SIZE >> 9);
555
556 if (ppl_data_sectors > 0)
557 ppl_data_sectors = rounddown(ppl_data_sectors, STRIPE_SECTORS);
558
559 if (ppl_data_sectors <= 0) {
560 pr_warn("md/raid:%s: PPL space too small on %s\n",
561 mdname(rdev->mddev), bdevname(rdev->bdev, b));
562 return -ENOSPC;
563 }
564
565 ppl_size_new = ppl_data_sectors + (PPL_HEADER_SIZE >> 9);
566
567 if ((rdev->ppl.sector < rdev->data_offset &&
568 rdev->ppl.sector + ppl_size_new > rdev->data_offset) ||
569 (rdev->ppl.sector >= rdev->data_offset &&
570 rdev->data_offset + rdev->sectors > rdev->ppl.sector)) {
571 pr_warn("md/raid:%s: PPL space overlaps with data on %s\n",
572 mdname(rdev->mddev), bdevname(rdev->bdev, b));
573 return -EINVAL;
574 }
575
576 if (!rdev->mddev->external &&
577 ((rdev->ppl.offset > 0 && rdev->ppl.offset < (rdev->sb_size >> 9)) ||
578 (rdev->ppl.offset <= 0 && rdev->ppl.offset + ppl_size_new > 0))) {
579 pr_warn("md/raid:%s: PPL space overlaps with superblock on %s\n",
580 mdname(rdev->mddev), bdevname(rdev->bdev, b));
581 return -EINVAL;
582 }
583
584 rdev->ppl.size = ppl_size_new;
585
586 return 0;
587}
588
589int ppl_init_log(struct r5conf *conf)
590{
591 struct ppl_conf *ppl_conf;
592 struct mddev *mddev = conf->mddev;
593 int ret = 0;
594 int i;
595 bool need_cache_flush;
596
597 pr_debug("md/raid:%s: enabling distributed Partial Parity Log\n",
598 mdname(conf->mddev));
599
600 if (PAGE_SIZE != 4096)
601 return -EINVAL;
602
603 if (mddev->level != 5) {
604 pr_warn("md/raid:%s PPL is not compatible with raid level %d\n",
605 mdname(mddev), mddev->level);
606 return -EINVAL;
607 }
608
609 if (mddev->bitmap_info.file || mddev->bitmap_info.offset) {
610 pr_warn("md/raid:%s PPL is not compatible with bitmap\n",
611 mdname(mddev));
612 return -EINVAL;
613 }
614
615 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
616 pr_warn("md/raid:%s PPL is not compatible with journal\n",
617 mdname(mddev));
618 return -EINVAL;
619 }
620
621 ppl_conf = kzalloc(sizeof(struct ppl_conf), GFP_KERNEL);
622 if (!ppl_conf)
623 return -ENOMEM;
624
625 ppl_conf->mddev = mddev;
626
627 ppl_conf->io_kc = KMEM_CACHE(ppl_io_unit, 0);
628 if (!ppl_conf->io_kc) {
629 ret = -EINVAL;
630 goto err;
631 }
632
633 ppl_conf->io_pool = mempool_create_slab_pool(conf->raid_disks, ppl_conf->io_kc);
634 if (!ppl_conf->io_pool) {
635 ret = -EINVAL;
636 goto err;
637 }
638
639 ppl_conf->bs = bioset_create(conf->raid_disks, 0);
640 if (!ppl_conf->bs) {
641 ret = -EINVAL;
642 goto err;
643 }
644
645 ppl_conf->meta_pool = mempool_create_page_pool(conf->raid_disks, 0);
646 if (!ppl_conf->meta_pool) {
647 ret = -EINVAL;
648 goto err;
649 }
650
651 ppl_conf->count = conf->raid_disks;
652 ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log),
653 GFP_KERNEL);
654 if (!ppl_conf->child_logs) {
655 ret = -ENOMEM;
656 goto err;
657 }
658
659 atomic64_set(&ppl_conf->seq, 0);
660
661 if (!mddev->external) {
662 ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid));
663 ppl_conf->block_size = 512;
664 } else {
665 ppl_conf->block_size = queue_logical_block_size(mddev->queue);
666 }
667
668 for (i = 0; i < ppl_conf->count; i++) {
669 struct ppl_log *log = &ppl_conf->child_logs[i];
670 struct md_rdev *rdev = conf->disks[i].rdev;
671
672 mutex_init(&log->io_mutex);
673 spin_lock_init(&log->io_list_lock);
674 INIT_LIST_HEAD(&log->io_list);
675 INIT_LIST_HEAD(&log->no_mem_stripes);
676
677 log->ppl_conf = ppl_conf;
678 log->rdev = rdev;
679
680 if (rdev) {
681 struct request_queue *q;
682
683 ret = ppl_validate_rdev(rdev);
684 if (ret)
685 goto err;
686
687 q = bdev_get_queue(rdev->bdev);
688 if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
689 need_cache_flush = true;
690 }
691 }
692
693 if (need_cache_flush)
694 pr_warn("md/raid:%s: Volatile write-back cache should be disabled on all member drives when using PPL!\n",
695 mdname(mddev));
696
697 conf->log_private = ppl_conf;
698
699 return 0;
700err:
701 __ppl_exit_log(ppl_conf);
702 return ret;
703}