/*
 * Partial Parity Log for closing the RAID5 write hole
 * Copyright (c) 2017, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 */

#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/crc32c.h>
#include <linux/flex_array.h>
#include <linux/async_tx.h>
#include <linux/raid/md_p.h>
#include "md.h"
#include "raid5.h"

/*
 * PPL consists of a 4KB header (struct ppl_header) and at least 128KB for
 * partial parity data. The header contains an array of entries
 * (struct ppl_header_entry) which describe the logged write requests.
 * Partial parity for the entries comes after the header, written in the same
 * sequence as the entries:
 *
 * Header
 *   entry0
 *   ...
 *   entryN
 * PP data
 *   PP for entry0
 *   ...
 *   PP for entryN
 *
 * An entry describes one or more consecutive stripe_heads, up to a full
 * stripe. The modified raid data chunks form an m-by-n matrix, where m is the
 * number of stripe_heads in the entry and n is the number of modified data
 * disks. Every stripe_head in the entry must write to the same data disks.
 * An example of a valid case described by a single entry (writes to the first
 * stripe of a 4 disk array, 16k chunk size):
 *
 * sh->sector   dd0   dd1   dd2    ppl
 *            +-----+-----+-----+
 *          0 | --- | --- | --- | +----+
 *          8 | -W- | -W- | --- | | pp |   data_sector = 8
 *         16 | -W- | -W- | --- | | pp |   data_size = 3 * 2 * 4k
 *         24 | -W- | -W- | --- | | pp |   pp_size = 3 * 4k
 *            +-----+-----+-----+ +----+
 *
 * data_sector is the first raid sector of the modified data, data_size is the
 * total size of modified data and pp_size is the size of partial parity for
 * this entry. Entries for full stripe writes contain no partial parity
 * (pp_size = 0); they only mark the stripes for which parity should be
 * recalculated after an unclean shutdown. Every entry holds a checksum of its
 * partial parity; the header also has a checksum of the header itself.
 *
 * A write request is always logged to the PPL instance stored on the parity
 * disk of the corresponding stripe. For each member disk there is one ppl_log
 * used to handle logging for this disk, independently of the others. The logs
 * are grouped in the child_logs array in struct ppl_conf, which is assigned to
 * r5conf->log_private.
 *
 * ppl_io_unit represents a full PPL write; header_page contains the
 * ppl_header. PPL entries for logged stripes are added in ppl_log_stripe(). A
 * stripe_head can be appended to the last entry if it meets the conditions for
 * a valid entry described above, otherwise a new entry is added. Checksums of
 * entries are calculated incrementally as stripes containing partial parity
 * are being added. ppl_submit_iounit() calculates the checksum of the header
 * and submits a bio containing the header page and partial parity pages
 * (sh->ppl_page) for all stripes of the io_unit. When the PPL write completes,
 * the stripes associated with the io_unit are released and raid5d starts
 * writing their data and parity. When all stripes are written, the io_unit is
 * freed and the next one can be submitted.
 *
 * An io_unit is used to gather stripes until it is submitted or becomes full
 * (if the maximum number of entries or size of the PPL is reached). Another
 * io_unit can't be submitted until the previous one has completed (PPL and
 * stripe data+parity are written). The log->io_list tracks all io_units of a
 * log (for a single member disk). New io_units are added to the end of the
 * list and the first io_unit is submitted if it is not submitted already.
 * The current io_unit accepting new stripes is always at the end of the list.
 */
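
/*
 * For reference, an illustrative sketch of the on-disk format handled here,
 * paraphrased from the definitions in linux/raid/md_p.h (see that header for
 * the authoritative layout and the PPL_HDR_* constants):
 *
 *	struct ppl_header_entry {
 *		__le64 data_sector;	// raid sector of the new data
 *		__le32 pp_size;		// length of partial parity
 *		__le32 data_size;	// length of data
 *		__le32 parity_disk;	// member disk containing parity
 *		__le32 checksum;	// ~crc32c of partial parity data
 *	} __packed;
 *
 *	struct ppl_header {
 *		__u8 reserved[PPL_HDR_RESERVED];	// filled with 0xff
 *		__le32 signature;	// raid array identifier
 *		__le32 padding;
 *		__le64 generation;	// generation number of the header
 *		__le32 entries_count;	// number of entries in entry array
 *		__le32 checksum;	// ~crc32c of the header
 *		struct ppl_header_entry entries[PPL_HDR_MAX_ENTRIES];
 *	} __packed;
 */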

struct ppl_conf {
	struct mddev *mddev;

	/* array of child logs, one for each raid disk */
	struct ppl_log *child_logs;
	int count;

	int block_size;		/* the logical block size used for data_sector
				 * in ppl_header_entry */
	u32 signature;		/* raid array identifier */
	atomic64_t seq;		/* current log write sequence number */

	struct kmem_cache *io_kc;
	mempool_t *io_pool;
	struct bio_set *bs;
	mempool_t *meta_pool;

	/* used only for recovery */
	int recovered_entries;
	int mismatch_count;
};

struct ppl_log {
	struct ppl_conf *ppl_conf;	/* shared between all log instances */

	struct md_rdev *rdev;		/* array member disk associated with
					 * this log instance */
	struct mutex io_mutex;
	struct ppl_io_unit *current_io;	/* current io_unit accepting new data
					 * always at the end of io_list */
	spinlock_t io_list_lock;
	struct list_head io_list;	/* all io_units of this log */
	struct list_head no_mem_stripes;/* stripes to retry if failed to
					 * allocate io_unit */
};

#define PPL_IO_INLINE_BVECS 32

struct ppl_io_unit {
	struct ppl_log *log;

	struct page *header_page;	/* for ppl_header */

	unsigned int entries_count;	/* number of entries in ppl_header */
	unsigned int pp_size;		/* current total size of partial parity */

	u64 seq;			/* sequence number of this log write */
	struct list_head log_sibling;	/* log->io_list */

	struct list_head stripe_list;	/* stripes added to the io_unit */
	atomic_t pending_stripes;	/* how many stripes not written to raid */

	bool submitted;			/* true if write to log started */

	/* inline bio and its biovec for submitting the iounit */
	struct bio bio;
	struct bio_vec biovec[PPL_IO_INLINE_BVECS];
};

struct dma_async_tx_descriptor *
ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
		       struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	struct page **xor_srcs = flex_array_get(percpu->scribble, 0);
	int count = 0, pd_idx = sh->pd_idx, i;
	struct async_submit_ctl submit;

	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);

	/*
	 * Partial parity is the XOR of stripe data chunks that are not changed
	 * during the write request. Depending on available data
	 * (read-modify-write vs. reconstruct-write case) we calculate it
	 * differently.
	 */
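	/*
	 * Illustrative example (not from the original source): for a stripe
	 * with parity = d0 ^ d1 ^ d2, a write updating d0 and d1 has partial
	 * parity pp = d2. In the rmw case the unchanged chunks are not in
	 * memory, but parity ^ d0_old ^ d1_old yields the same value; in the
	 * rcw case d2 is already up to date and is xored directly.
	 */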
	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
		/* rmw: xor old data and parity from updated disks */
		for (i = disks; i--;) {
			struct r5dev *dev = &sh->dev[i];
			if (test_bit(R5_Wantdrain, &dev->flags) || i == pd_idx)
				xor_srcs[count++] = dev->page;
		}
	} else if (sh->reconstruct_state == reconstruct_state_drain_run) {
		/* rcw: xor data from all not updated disks */
		for (i = disks; i--;) {
			struct r5dev *dev = &sh->dev[i];
			if (test_bit(R5_UPTODATE, &dev->flags))
				xor_srcs[count++] = dev->page;
		}
	} else {
		return tx;
	}

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx,
			  NULL, sh, flex_array_get(percpu->scribble, 0)
			  + sizeof(struct page *) * (sh->disks + 2));

	if (count == 1)
		tx = async_memcpy(sh->ppl_page, xor_srcs[0], 0, 0, PAGE_SIZE,
				  &submit);
	else
		tx = async_xor(sh->ppl_page, xor_srcs, 0, count, PAGE_SIZE,
			       &submit);

	return tx;
}

static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log,
					  struct stripe_head *sh)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct ppl_io_unit *io;
	struct ppl_header *pplhdr;

	io = mempool_alloc(ppl_conf->io_pool, GFP_ATOMIC);
	if (!io)
		return NULL;

	memset(io, 0, sizeof(*io));
	io->log = log;
	INIT_LIST_HEAD(&io->log_sibling);
	INIT_LIST_HEAD(&io->stripe_list);
	atomic_set(&io->pending_stripes, 0);
	bio_init(&io->bio, io->biovec, PPL_IO_INLINE_BVECS);

	io->header_page = mempool_alloc(ppl_conf->meta_pool, GFP_NOIO);
	pplhdr = page_address(io->header_page);
	clear_page(pplhdr);
	memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
	pplhdr->signature = cpu_to_le32(ppl_conf->signature);

	io->seq = atomic64_add_return(1, &ppl_conf->seq);
	pplhdr->generation = cpu_to_le64(io->seq);

	return io;
}

static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh)
{
	struct ppl_io_unit *io = log->current_io;
	struct ppl_header_entry *e = NULL;
	struct ppl_header *pplhdr;
	int i;
	sector_t data_sector = 0;
	int data_disks = 0;
	unsigned int entry_space = (log->rdev->ppl.size << 9) - PPL_HEADER_SIZE;
	struct r5conf *conf = sh->raid_conf;

	pr_debug("%s: stripe: %llu\n", __func__, (unsigned long long)sh->sector);

	/* check if current io_unit is full */
	if (io && (io->pp_size == entry_space ||
		   io->entries_count == PPL_HDR_MAX_ENTRIES)) {
		pr_debug("%s: add io_unit blocked by seq: %llu\n",
			 __func__, io->seq);
		io = NULL;
	}

	/* add a new unit if there is none or the current is full */
	if (!io) {
		io = ppl_new_iounit(log, sh);
		if (!io)
			return -ENOMEM;
		spin_lock_irq(&log->io_list_lock);
		list_add_tail(&io->log_sibling, &log->io_list);
		spin_unlock_irq(&log->io_list_lock);

		log->current_io = io;
	}

	for (i = 0; i < sh->disks; i++) {
		struct r5dev *dev = &sh->dev[i];

		if (i != sh->pd_idx && test_bit(R5_Wantwrite, &dev->flags)) {
			if (!data_disks || dev->sector < data_sector)
				data_sector = dev->sector;
			data_disks++;
		}
	}
	BUG_ON(!data_disks);

	pr_debug("%s: seq: %llu data_sector: %llu data_disks: %d\n", __func__,
		 io->seq, (unsigned long long)data_sector, data_disks);

	pplhdr = page_address(io->header_page);

	if (io->entries_count > 0) {
		struct ppl_header_entry *last =
				&pplhdr->entries[io->entries_count - 1];
		struct stripe_head *sh_last = list_last_entry(
				&io->stripe_list, struct stripe_head, log_list);
		u64 data_sector_last = le64_to_cpu(last->data_sector);
		u32 data_size_last = le32_to_cpu(last->data_size);

		/*
		 * Check if we can append the stripe to the last entry. It must
		 * be just after the last logged stripe and write to the same
		 * disks. Use bit shift and logarithm to avoid 64-bit division.
		 */
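		/*
		 * Worked example (assumed geometry, not from the original
		 * source): with chunk_sectors = 128, STRIPE_SECTORS = 8 and a
		 * last entry holding one stripe that wrote 2 data disks
		 * (data_size = 2 * 4k), a stripe at sh_last->sector + 8
		 * writing the same 2 disks passes all three checks: it is
		 * consecutive, both data_sectors fall in the same chunk after
		 * >> ilog2(128), and the sector delta times data_disks
		 * (8 * 2) equals data_size_last >> 9 (16).
		 */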
		if ((sh->sector == sh_last->sector + STRIPE_SECTORS) &&
		    (data_sector >> ilog2(conf->chunk_sectors) ==
		     data_sector_last >> ilog2(conf->chunk_sectors)) &&
		    ((data_sector - data_sector_last) * data_disks ==
		     data_size_last >> 9))
			e = last;
	}

	if (!e) {
		e = &pplhdr->entries[io->entries_count++];
		e->data_sector = cpu_to_le64(data_sector);
		e->parity_disk = cpu_to_le32(sh->pd_idx);
		e->checksum = cpu_to_le32(~0);
	}

	le32_add_cpu(&e->data_size, data_disks << PAGE_SHIFT);

	/* don't write any PP if full stripe write */
	if (!test_bit(STRIPE_FULL_WRITE, &sh->state)) {
		le32_add_cpu(&e->pp_size, PAGE_SIZE);
		io->pp_size += PAGE_SIZE;
		e->checksum = cpu_to_le32(crc32c_le(le32_to_cpu(e->checksum),
						    page_address(sh->ppl_page),
						    PAGE_SIZE));
	}

	list_add_tail(&sh->log_list, &io->stripe_list);
	atomic_inc(&io->pending_stripes);
	sh->ppl_io = io;

	return 0;
}

int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh)
{
	struct ppl_conf *ppl_conf = conf->log_private;
	struct ppl_io_unit *io = sh->ppl_io;
	struct ppl_log *log;

	if (io || test_bit(STRIPE_SYNCING, &sh->state) ||
	    !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
	    !test_bit(R5_Insync, &sh->dev[sh->pd_idx].flags)) {
		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
		return -EAGAIN;
	}

	log = &ppl_conf->child_logs[sh->pd_idx];

	mutex_lock(&log->io_mutex);

	if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) {
		mutex_unlock(&log->io_mutex);
		return -EAGAIN;
	}

	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
	clear_bit(STRIPE_DELAYED, &sh->state);
	atomic_inc(&sh->count);

	if (ppl_log_stripe(log, sh)) {
		spin_lock_irq(&log->io_list_lock);
		list_add_tail(&sh->log_list, &log->no_mem_stripes);
		spin_unlock_irq(&log->io_list_lock);
	}

	mutex_unlock(&log->io_mutex);

	return 0;
}

static void ppl_log_endio(struct bio *bio)
{
	struct ppl_io_unit *io = bio->bi_private;
	struct ppl_log *log = io->log;
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct stripe_head *sh, *next;

	pr_debug("%s: seq: %llu\n", __func__, io->seq);

	if (bio->bi_error)
		md_error(ppl_conf->mddev, log->rdev);

	mempool_free(io->header_page, ppl_conf->meta_pool);

	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
		list_del_init(&sh->log_list);

		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
}

static void ppl_submit_iounit_bio(struct ppl_io_unit *io, struct bio *bio)
{
	char b[BDEVNAME_SIZE];

	pr_debug("%s: seq: %llu size: %u sector: %llu dev: %s\n",
		 __func__, io->seq, bio->bi_iter.bi_size,
		 (unsigned long long)bio->bi_iter.bi_sector,
		 bdevname(bio->bi_bdev, b));

	submit_bio(bio);
}

static void ppl_submit_iounit(struct ppl_io_unit *io)
{
	struct ppl_log *log = io->log;
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct ppl_header *pplhdr = page_address(io->header_page);
	struct bio *bio = &io->bio;
	struct stripe_head *sh;
	int i;

	bio->bi_private = io;

	if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) {
		ppl_log_endio(bio);
		return;
	}

	for (i = 0; i < io->entries_count; i++) {
		struct ppl_header_entry *e = &pplhdr->entries[i];

		pr_debug("%s: seq: %llu entry: %d data_sector: %llu pp_size: %u data_size: %u\n",
			 __func__, io->seq, i, le64_to_cpu(e->data_sector),
			 le32_to_cpu(e->pp_size), le32_to_cpu(e->data_size));

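		/*
		 * Note: data_sector is stored on disk in units of
		 * ppl_conf->block_size rather than 512-byte sectors; with a
		 * 4096-byte logical block size, for example, the shift below
		 * divides by 8 and raid sector 16 becomes logical block 2.
		 */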
		e->data_sector = cpu_to_le64(le64_to_cpu(e->data_sector) >>
					     ilog2(ppl_conf->block_size >> 9));
		e->checksum = cpu_to_le32(~le32_to_cpu(e->checksum));
	}

	pplhdr->entries_count = cpu_to_le32(io->entries_count);
	pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE));

	bio->bi_end_io = ppl_log_endio;
	bio->bi_opf = REQ_OP_WRITE | REQ_FUA;
	bio->bi_bdev = log->rdev->bdev;
	bio->bi_iter.bi_sector = log->rdev->ppl.sector;
	bio_add_page(bio, io->header_page, PAGE_SIZE, 0);

	list_for_each_entry(sh, &io->stripe_list, log_list) {
		/* entries for full stripe writes have no partial parity */
		if (test_bit(STRIPE_FULL_WRITE, &sh->state))
			continue;

		if (!bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0)) {
			struct bio *prev = bio;

			bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES,
					       ppl_conf->bs);
			bio->bi_opf = prev->bi_opf;
			bio->bi_bdev = prev->bi_bdev;
			bio->bi_iter.bi_sector = bio_end_sector(prev);
			bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0);

			bio_chain(bio, prev);
			ppl_submit_iounit_bio(io, prev);
		}
	}

	ppl_submit_iounit_bio(io, bio);
}

static void ppl_submit_current_io(struct ppl_log *log)
{
	struct ppl_io_unit *io;

	spin_lock_irq(&log->io_list_lock);

	io = list_first_entry_or_null(&log->io_list, struct ppl_io_unit,
				      log_sibling);
	if (io && io->submitted)
		io = NULL;

	spin_unlock_irq(&log->io_list_lock);

	if (io) {
		io->submitted = true;

		if (io == log->current_io)
			log->current_io = NULL;

		ppl_submit_iounit(io);
	}
}

void ppl_write_stripe_run(struct r5conf *conf)
{
	struct ppl_conf *ppl_conf = conf->log_private;
	struct ppl_log *log;
	int i;

	for (i = 0; i < ppl_conf->count; i++) {
		log = &ppl_conf->child_logs[i];

		mutex_lock(&log->io_mutex);
		ppl_submit_current_io(log);
		mutex_unlock(&log->io_mutex);
	}
}

static void ppl_io_unit_finished(struct ppl_io_unit *io)
{
	struct ppl_log *log = io->log;
	unsigned long flags;

	pr_debug("%s: seq: %llu\n", __func__, io->seq);

	spin_lock_irqsave(&log->io_list_lock, flags);

	list_del(&io->log_sibling);
	mempool_free(io, log->ppl_conf->io_pool);

	if (!list_empty(&log->no_mem_stripes)) {
		struct stripe_head *sh = list_first_entry(&log->no_mem_stripes,
							  struct stripe_head,
							  log_list);
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}

	spin_unlock_irqrestore(&log->io_list_lock, flags);
}

void ppl_stripe_write_finished(struct stripe_head *sh)
{
	struct ppl_io_unit *io;

	io = sh->ppl_io;
	sh->ppl_io = NULL;

	if (io && atomic_dec_and_test(&io->pending_stripes))
		ppl_io_unit_finished(io);
}

static void ppl_xor(int size, struct page *page1, struct page *page2)
{
	struct async_submit_ctl submit;
	struct dma_async_tx_descriptor *tx;
	struct page *xor_srcs[] = { page1, page2 };

	init_async_submit(&submit, ASYNC_TX_ACK|ASYNC_TX_XOR_DROP_DST,
			  NULL, NULL, NULL, NULL);
	tx = async_xor(page1, xor_srcs, 0, 2, size, &submit);

	async_tx_quiesce(&tx);
}

/*
 * PPL recovery strategy: xor partial parity and data from all modified data
 * disks within a stripe and write the result as the new stripe parity. If all
 * stripe data disks are modified (full stripe write), no partial parity is
 * available, so just xor the data disks.
 *
 * Recovery of a PPL entry shall occur only if all modified data disks are
 * available and read from all of them succeeds.
 *
 * A PPL entry applies to a stripe; partial parity size for an entry is at most
 * the size of the chunk. Examples of possible cases for a single entry:
 *
 * case 0: single data disk write:
 *   data0    data1    data2     ppl        parity
 * +--------+--------+--------+           +--------------------+
 * | ------ | ------ | ------ | +----+    | (no change)        |
 * | ------ | -data- | ------ | | pp | -> | data1 ^ pp         |
 * | ------ | -data- | ------ | | pp | -> | data1 ^ pp         |
 * | ------ | ------ | ------ | +----+    | (no change)        |
 * +--------+--------+--------+           +--------------------+
 * pp_size = data_size
 *
 * case 1: more than one data disk write:
 *   data0    data1    data2     ppl        parity
 * +--------+--------+--------+           +--------------------+
 * | ------ | ------ | ------ | +----+    | (no change)        |
 * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp |
 * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp |
 * | ------ | ------ | ------ | +----+    | (no change)        |
 * +--------+--------+--------+           +--------------------+
 * pp_size = data_size / modified_data_disks
 *
 * case 2: write to all data disks (also full stripe write):
 *   data0    data1    data2                parity
 * +--------+--------+--------+           +--------------------+
 * | ------ | ------ | ------ |           | (no change)        |
 * | -data- | -data- | -data- | --------> | xor all data       |
 * | ------ | ------ | ------ | --------> | (no change)        |
 * | ------ | ------ | ------ |           | (no change)        |
 * +--------+--------+--------+           +--------------------+
 * pp_size = 0
 *
 * The following cases are possible only in other implementations. The recovery
 * code can handle them, but they are not generated at runtime because they can
 * be reduced to cases 0, 1 and 2:
 *
 * case 3:
 *   data0    data1    data2     ppl        parity
 * +--------+--------+--------+ +----+    +--------------------+
 * | ------ | -data- | -data- | | pp |    | data1 ^ data2 ^ pp |
 * | ------ | -data- | -data- | | pp | -> | data1 ^ data2 ^ pp |
 * | -data- | -data- | -data- | | -- | -> | xor all data       |
 * | -data- | -data- | ------ | | pp |    | data0 ^ data1 ^ pp |
 * +--------+--------+--------+ +----+    +--------------------+
 * pp_size = chunk_size
 *
 * case 4:
 *   data0    data1    data2     ppl        parity
 * +--------+--------+--------+ +----+    +--------------------+
 * | ------ | -data- | ------ | | pp |    | data1 ^ pp         |
 * | ------ | ------ | ------ | | -- | -> | (no change)        |
 * | ------ | ------ | ------ | | -- | -> | (no change)        |
 * | -data- | ------ | ------ | | pp |    | data0 ^ pp         |
 * +--------+--------+--------+ +----+    +--------------------+
 * pp_size = chunk_size
 */
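/*
 * A worked instance of the geometry derivation in ppl_recover_entry() below
 * (assumed numbers, not from the original source): for case 1 with
 * chunk_sectors = 128, an entry with pp_size = 8k and data_size = 16k gives
 * data_disks = 16k / 8k = 2 and strip_sectors = 16, so the entry covers
 * array sectors [r_sector_first, r_sector_first + 1 * 128 + 16).
 */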
static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
			     sector_t ppl_sector)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct mddev *mddev = ppl_conf->mddev;
	struct r5conf *conf = mddev->private;
	int block_size = ppl_conf->block_size;
	struct page *page1;
	struct page *page2;
	sector_t r_sector_first;
	sector_t r_sector_last;
	int strip_sectors;
	int data_disks;
	int i;
	int ret = 0;
	char b[BDEVNAME_SIZE];
	unsigned int pp_size = le32_to_cpu(e->pp_size);
	unsigned int data_size = le32_to_cpu(e->data_size);

	page1 = alloc_page(GFP_KERNEL);
	page2 = alloc_page(GFP_KERNEL);

	if (!page1 || !page2) {
		ret = -ENOMEM;
		goto out;
	}

	r_sector_first = le64_to_cpu(e->data_sector) * (block_size >> 9);

	if ((pp_size >> 9) < conf->chunk_sectors) {
		if (pp_size > 0) {
			data_disks = data_size / pp_size;
			strip_sectors = pp_size >> 9;
		} else {
			data_disks = conf->raid_disks - conf->max_degraded;
			strip_sectors = (data_size >> 9) / data_disks;
		}
		r_sector_last = r_sector_first +
				(data_disks - 1) * conf->chunk_sectors +
				strip_sectors;
	} else {
		data_disks = conf->raid_disks - conf->max_degraded;
		strip_sectors = conf->chunk_sectors;
		r_sector_last = r_sector_first + (data_size >> 9);
	}

	pr_debug("%s: array sector first: %llu last: %llu\n", __func__,
		 (unsigned long long)r_sector_first,
		 (unsigned long long)r_sector_last);

	/* if start and end are 4k aligned, use a 4k block */
	if (block_size == 512 &&
	    (r_sector_first & (STRIPE_SECTORS - 1)) == 0 &&
	    (r_sector_last & (STRIPE_SECTORS - 1)) == 0)
		block_size = STRIPE_SIZE;

	/* iterate through blocks in strip */
	for (i = 0; i < strip_sectors; i += (block_size >> 9)) {
		bool update_parity = false;
		sector_t parity_sector;
		struct md_rdev *parity_rdev;
		struct stripe_head sh;
		int disk;
		int indent = 0;

		pr_debug("%s:%*s iter %d start\n", __func__, indent, "", i);
		indent += 2;

		memset(page_address(page1), 0, PAGE_SIZE);

		/* iterate through data member disks */
		for (disk = 0; disk < data_disks; disk++) {
			int dd_idx;
			struct md_rdev *rdev;
			sector_t sector;
			sector_t r_sector = r_sector_first + i +
					    (disk * conf->chunk_sectors);

			pr_debug("%s:%*s data member disk %d start\n",
				 __func__, indent, "", disk);
			indent += 2;

			if (r_sector >= r_sector_last) {
				pr_debug("%s:%*s array sector %llu doesn't need parity update\n",
					 __func__, indent, "",
					 (unsigned long long)r_sector);
				indent -= 2;
				continue;
			}

			update_parity = true;

			/* map raid sector to member disk */
			sector = raid5_compute_sector(conf, r_sector, 0,
						      &dd_idx, NULL);
			pr_debug("%s:%*s processing array sector %llu => data member disk %d, sector %llu\n",
				 __func__, indent, "",
				 (unsigned long long)r_sector, dd_idx,
				 (unsigned long long)sector);

			rdev = conf->disks[dd_idx].rdev;
			if (!rdev) {
				pr_debug("%s:%*s data member disk %d missing\n",
					 __func__, indent, "", dd_idx);
				update_parity = false;
				break;
			}

			pr_debug("%s:%*s reading data member disk %s sector %llu\n",
				 __func__, indent, "", bdevname(rdev->bdev, b),
				 (unsigned long long)sector);
			if (!sync_page_io(rdev, sector, block_size, page2,
					REQ_OP_READ, 0, false)) {
				md_error(mddev, rdev);
				pr_debug("%s:%*s read failed!\n", __func__,
					 indent, "");
				ret = -EIO;
				goto out;
			}

			ppl_xor(block_size, page1, page2);

			indent -= 2;
		}

		if (!update_parity)
			continue;

		if (pp_size > 0) {
			pr_debug("%s:%*s reading pp disk sector %llu\n",
				 __func__, indent, "",
				 (unsigned long long)(ppl_sector + i));
			if (!sync_page_io(log->rdev,
					ppl_sector - log->rdev->data_offset + i,
					block_size, page2, REQ_OP_READ, 0,
					false)) {
				pr_debug("%s:%*s read failed!\n", __func__,
					 indent, "");
				md_error(mddev, log->rdev);
				ret = -EIO;
				goto out;
			}

			ppl_xor(block_size, page1, page2);
		}

		/* map raid sector to parity disk */
		parity_sector = raid5_compute_sector(conf, r_sector_first + i,
				0, &disk, &sh);
		BUG_ON(sh.pd_idx != le32_to_cpu(e->parity_disk));
		parity_rdev = conf->disks[sh.pd_idx].rdev;

		BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev);
		pr_debug("%s:%*s write parity at sector %llu, disk %s\n",
			 __func__, indent, "",
			 (unsigned long long)parity_sector,
			 bdevname(parity_rdev->bdev, b));
		if (!sync_page_io(parity_rdev, parity_sector, block_size,
				page1, REQ_OP_WRITE, 0, false)) {
			pr_debug("%s:%*s parity write error!\n", __func__,
				 indent, "");
			md_error(mddev, parity_rdev);
			ret = -EIO;
			goto out;
		}
	}
out:
	if (page1)
		__free_page(page1);
	if (page2)
		__free_page(page2);
	return ret;
}

static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct md_rdev *rdev = log->rdev;
	struct mddev *mddev = rdev->mddev;
	sector_t ppl_sector = rdev->ppl.sector + (PPL_HEADER_SIZE >> 9);
	struct page *page;
	int i;
	int ret = 0;

	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	/* iterate through all PPL entries saved */
	for (i = 0; i < le32_to_cpu(pplhdr->entries_count); i++) {
		struct ppl_header_entry *e = &pplhdr->entries[i];
		u32 pp_size = le32_to_cpu(e->pp_size);
		sector_t sector = ppl_sector;
		int ppl_entry_sectors = pp_size >> 9;
		u32 crc, crc_stored;

		pr_debug("%s: disk: %d entry: %d ppl_sector: %llu pp_size: %u\n",
			 __func__, rdev->raid_disk, i,
			 (unsigned long long)ppl_sector, pp_size);

		crc = ~0;
		crc_stored = le32_to_cpu(e->checksum);

		/* read partial parity for this entry and calculate its checksum */
		while (pp_size) {
			int s = pp_size > PAGE_SIZE ? PAGE_SIZE : pp_size;

			if (!sync_page_io(rdev, sector - rdev->data_offset,
					s, page, REQ_OP_READ, 0, false)) {
				md_error(mddev, rdev);
				ret = -EIO;
				goto out;
			}

			crc = crc32c_le(crc, page_address(page), s);

			pp_size -= s;
			sector += s >> 9;
		}

		crc = ~crc;

		if (crc != crc_stored) {
			/*
			 * Don't recover this entry if the checksum does not
			 * match, but keep going and try to recover other
			 * entries.
			 */
			pr_debug("%s: ppl entry crc does not match: stored: 0x%x calculated: 0x%x\n",
				 __func__, crc_stored, crc);
			ppl_conf->mismatch_count++;
		} else {
			ret = ppl_recover_entry(log, e, ppl_sector);
			if (ret)
				goto out;
			ppl_conf->recovered_entries++;
		}

		ppl_sector += ppl_entry_sectors;
	}

	/* flush the disk cache after recovery if necessary */
	ret = blkdev_issue_flush(rdev->bdev, GFP_KERNEL, NULL);
out:
	__free_page(page);
	return ret;
}

static int ppl_write_empty_header(struct ppl_log *log)
{
	struct page *page;
	struct ppl_header *pplhdr;
	struct md_rdev *rdev = log->rdev;
	int ret = 0;

	pr_debug("%s: disk: %d ppl_sector: %llu\n", __func__,
		 rdev->raid_disk, (unsigned long long)rdev->ppl.sector);

	page = alloc_page(GFP_NOIO | __GFP_ZERO);
	if (!page)
		return -ENOMEM;

	pplhdr = page_address(page);
	memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
	pplhdr->signature = cpu_to_le32(log->ppl_conf->signature);
	pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PAGE_SIZE));

	if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset,
			  PPL_HEADER_SIZE, page, REQ_OP_WRITE | REQ_FUA, 0,
			  false)) {
		md_error(rdev->mddev, rdev);
		ret = -EIO;
	}

	__free_page(page);
	return ret;
}

static int ppl_load_distributed(struct ppl_log *log)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct md_rdev *rdev = log->rdev;
	struct mddev *mddev = rdev->mddev;
	struct page *page;
	struct ppl_header *pplhdr;
	u32 crc, crc_stored;
	u32 signature;
	int ret = 0;

	pr_debug("%s: disk: %d\n", __func__, rdev->raid_disk);

	/* read PPL header */
	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset,
			  PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
		md_error(mddev, rdev);
		ret = -EIO;
		goto out;
	}
	pplhdr = page_address(page);

	/* check header validity */
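	/*
	 * The stored value is ~crc32c(~0, header) computed with the checksum
	 * field zeroed, matching how ppl_submit_iounit() and
	 * ppl_write_empty_header() generate it.
	 */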
	crc_stored = le32_to_cpu(pplhdr->checksum);
	pplhdr->checksum = 0;
	crc = ~crc32c_le(~0, pplhdr, PAGE_SIZE);

	if (crc_stored != crc) {
		pr_debug("%s: ppl header crc does not match: stored: 0x%x calculated: 0x%x\n",
			 __func__, crc_stored, crc);
		ppl_conf->mismatch_count++;
		goto out;
	}

	signature = le32_to_cpu(pplhdr->signature);

	if (mddev->external) {
		/*
		 * For external metadata the header signature is set and
		 * validated in userspace.
		 */
		ppl_conf->signature = signature;
	} else if (ppl_conf->signature != signature) {
		pr_debug("%s: ppl header signature does not match: stored: 0x%x configured: 0x%x\n",
			 __func__, signature, ppl_conf->signature);
		ppl_conf->mismatch_count++;
		goto out;
	}

	/* attempt to recover from log if we are starting a dirty array */
	if (!mddev->pers && mddev->recovery_cp != MaxSector)
		ret = ppl_recover(log, pplhdr);
out:
	/* write empty header if we are starting the array */
	if (!ret && !mddev->pers)
		ret = ppl_write_empty_header(log);

	__free_page(page);

	pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n",
		 __func__, ret, ppl_conf->mismatch_count,
		 ppl_conf->recovered_entries);
	return ret;
}

static int ppl_load(struct ppl_conf *ppl_conf)
{
	int ret = 0;
	u32 signature = 0;
	bool signature_set = false;
	int i;

	for (i = 0; i < ppl_conf->count; i++) {
		struct ppl_log *log = &ppl_conf->child_logs[i];

		/* skip missing drive */
		if (!log->rdev)
			continue;

		ret = ppl_load_distributed(log);
		if (ret)
			break;

		/*
		 * For external metadata we can't check if the signature is
		 * correct on a single drive, but we can check if it is the same
		 * on all drives.
		 */
		if (ppl_conf->mddev->external) {
			if (!signature_set) {
				signature = ppl_conf->signature;
				signature_set = true;
			} else if (signature != ppl_conf->signature) {
				pr_warn("md/raid:%s: PPL header signature does not match on all member drives\n",
					mdname(ppl_conf->mddev));
				ret = -EINVAL;
				break;
			}
		}
	}

	pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n",
		 __func__, ret, ppl_conf->mismatch_count,
		 ppl_conf->recovered_entries);
	return ret;
}

static void __ppl_exit_log(struct ppl_conf *ppl_conf)
{
	clear_bit(MD_HAS_PPL, &ppl_conf->mddev->flags);

	kfree(ppl_conf->child_logs);

	mempool_destroy(ppl_conf->meta_pool);
	if (ppl_conf->bs)
		bioset_free(ppl_conf->bs);
	mempool_destroy(ppl_conf->io_pool);
	kmem_cache_destroy(ppl_conf->io_kc);

	kfree(ppl_conf);
}

void ppl_exit_log(struct r5conf *conf)
{
	struct ppl_conf *ppl_conf = conf->log_private;

	if (ppl_conf) {
		__ppl_exit_log(ppl_conf);
		conf->log_private = NULL;
	}
}

static int ppl_validate_rdev(struct md_rdev *rdev)
{
	char b[BDEVNAME_SIZE];
	int ppl_data_sectors;
	int ppl_size_new;

	/*
	 * The configured PPL size must be enough to store
	 * the header and (at the very least) partial parity
	 * for one stripe. Round it down to ensure the data
	 * space is cleanly divisible by stripe size.
	 */
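	/*
	 * For example (assumed numbers): with STRIPE_SECTORS = 8, a
	 * configured ppl.size of 2050 sectors leaves 2050 - 8 = 2042 data
	 * sectors, which rounddown() trims to 2040; the validated size
	 * becomes 2040 + 8 = 2048 sectors (1 MiB).
	 */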
	ppl_data_sectors = rdev->ppl.size - (PPL_HEADER_SIZE >> 9);

	if (ppl_data_sectors > 0)
		ppl_data_sectors = rounddown(ppl_data_sectors, STRIPE_SECTORS);

	if (ppl_data_sectors <= 0) {
		pr_warn("md/raid:%s: PPL space too small on %s\n",
			mdname(rdev->mddev), bdevname(rdev->bdev, b));
		return -ENOSPC;
	}

	ppl_size_new = ppl_data_sectors + (PPL_HEADER_SIZE >> 9);

	if ((rdev->ppl.sector < rdev->data_offset &&
	     rdev->ppl.sector + ppl_size_new > rdev->data_offset) ||
	    (rdev->ppl.sector >= rdev->data_offset &&
	     rdev->data_offset + rdev->sectors > rdev->ppl.sector)) {
		pr_warn("md/raid:%s: PPL space overlaps with data on %s\n",
			mdname(rdev->mddev), bdevname(rdev->bdev, b));
		return -EINVAL;
	}

	if (!rdev->mddev->external &&
	    ((rdev->ppl.offset > 0 && rdev->ppl.offset < (rdev->sb_size >> 9)) ||
	     (rdev->ppl.offset <= 0 && rdev->ppl.offset + ppl_size_new > 0))) {
		pr_warn("md/raid:%s: PPL space overlaps with superblock on %s\n",
			mdname(rdev->mddev), bdevname(rdev->bdev, b));
		return -EINVAL;
	}

	rdev->ppl.size = ppl_size_new;

	return 0;
}

int ppl_init_log(struct r5conf *conf)
{
	struct ppl_conf *ppl_conf;
	struct mddev *mddev = conf->mddev;
	int ret = 0;
	int i;
	bool need_cache_flush = false;

	pr_debug("md/raid:%s: enabling distributed Partial Parity Log\n",
		 mdname(conf->mddev));

	if (PAGE_SIZE != 4096)
		return -EINVAL;

	if (mddev->level != 5) {
		pr_warn("md/raid:%s PPL is not compatible with raid level %d\n",
			mdname(mddev), mddev->level);
		return -EINVAL;
	}

	if (mddev->bitmap_info.file || mddev->bitmap_info.offset) {
		pr_warn("md/raid:%s PPL is not compatible with bitmap\n",
			mdname(mddev));
		return -EINVAL;
	}

	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
		pr_warn("md/raid:%s PPL is not compatible with journal\n",
			mdname(mddev));
		return -EINVAL;
	}

	ppl_conf = kzalloc(sizeof(struct ppl_conf), GFP_KERNEL);
	if (!ppl_conf)
		return -ENOMEM;

	ppl_conf->mddev = mddev;

	ppl_conf->io_kc = KMEM_CACHE(ppl_io_unit, 0);
	if (!ppl_conf->io_kc) {
		ret = -EINVAL;
		goto err;
	}

	ppl_conf->io_pool = mempool_create_slab_pool(conf->raid_disks, ppl_conf->io_kc);
	if (!ppl_conf->io_pool) {
		ret = -EINVAL;
		goto err;
	}

	ppl_conf->bs = bioset_create(conf->raid_disks, 0);
	if (!ppl_conf->bs) {
		ret = -EINVAL;
		goto err;
	}

	ppl_conf->meta_pool = mempool_create_page_pool(conf->raid_disks, 0);
	if (!ppl_conf->meta_pool) {
		ret = -EINVAL;
		goto err;
	}

	ppl_conf->count = conf->raid_disks;
	ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log),
				       GFP_KERNEL);
	if (!ppl_conf->child_logs) {
		ret = -ENOMEM;
		goto err;
	}

	atomic64_set(&ppl_conf->seq, 0);

	if (!mddev->external) {
		ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid));
		ppl_conf->block_size = 512;
	} else {
		ppl_conf->block_size = queue_logical_block_size(mddev->queue);
	}

	for (i = 0; i < ppl_conf->count; i++) {
		struct ppl_log *log = &ppl_conf->child_logs[i];
		struct md_rdev *rdev = conf->disks[i].rdev;

		mutex_init(&log->io_mutex);
		spin_lock_init(&log->io_list_lock);
		INIT_LIST_HEAD(&log->io_list);
		INIT_LIST_HEAD(&log->no_mem_stripes);

		log->ppl_conf = ppl_conf;
		log->rdev = rdev;

		if (rdev) {
			struct request_queue *q;

			ret = ppl_validate_rdev(rdev);
			if (ret)
				goto err;

			q = bdev_get_queue(rdev->bdev);
			if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
				need_cache_flush = true;
		}
	}

	if (need_cache_flush)
		pr_warn("md/raid:%s: Volatile write-back cache should be disabled on all member drives when using PPL!\n",
			mdname(mddev));

	/* load and possibly recover the logs from the member disks */
	ret = ppl_load(ppl_conf);

	if (ret) {
		goto err;
	} else if (!mddev->pers &&
		   mddev->recovery_cp == 0 && !mddev->degraded &&
		   ppl_conf->recovered_entries > 0 &&
		   ppl_conf->mismatch_count == 0) {
		/*
		 * If we are starting a dirty array and the recovery succeeds
		 * without any issues, set the array as clean.
		 */
		mddev->recovery_cp = MaxSector;
		set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
	} else if (mddev->pers && ppl_conf->mismatch_count > 0) {
		/* no mismatch allowed when enabling PPL for a running array */
		ret = -EINVAL;
		goto err;
	}

	conf->log_private = ppl_conf;

	return 0;
err:
	__ppl_exit_log(ppl_conf);
	return ret;
}

int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add)
{
	struct ppl_conf *ppl_conf = conf->log_private;
	struct ppl_log *log;
	int ret = 0;
	char b[BDEVNAME_SIZE];

	if (!rdev)
		return -EINVAL;

	pr_debug("%s: disk: %d operation: %s dev: %s\n",
		 __func__, rdev->raid_disk, add ? "add" : "remove",
		 bdevname(rdev->bdev, b));

	if (rdev->raid_disk < 0)
		return 0;

	if (rdev->raid_disk >= ppl_conf->count)
		return -ENODEV;

	log = &ppl_conf->child_logs[rdev->raid_disk];

	mutex_lock(&log->io_mutex);
	if (add) {
		ret = ppl_validate_rdev(rdev);
		if (!ret) {
			log->rdev = rdev;
			ret = ppl_write_empty_header(log);
		}
	} else {
		log->rdev = NULL;
	}
	mutex_unlock(&log->io_mutex);

	return ret;
}