Blame - fs/btrfs/raid56.c - kernel/msm-4.19

blob: 5e4ad134b9ad12881edb33b7962e13155edc9d45 [file] [log] [blame]

David Sterba	c1d7c51	2018-04-03 19:23:33 +0200	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2	/*
				3	* Copyright (C) 2012 Fusion-io All rights reserved.
				4	* Copyright (C) 2012 Intel Corp. All rights reserved.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	5	*/
David Sterba	c1d7c51	2018-04-03 19:23:33 +0200	[diff] [blame]	6
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	7	#include <linux/sched.h>
				8	#include <linux/wait.h>
				9	#include <linux/bio.h>
				10	#include <linux/slab.h>
				11	#include <linux/buffer_head.h>
				12	#include <linux/blkdev.h>
				13	#include <linux/random.h>
				14	#include <linux/iocontext.h>
				15	#include <linux/capability.h>
				16	#include <linux/ratelimit.h>
				17	#include <linux/kthread.h>
				18	#include <linux/raid/pq.h>
				19	#include <linux/hash.h>
				20	#include <linux/list_sort.h>
				21	#include <linux/raid/xor.h>
David Sterba	818e010	2017-05-31 18:40:02 +0200	[diff] [blame]	22	#include <linux/mm.h>
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	23	#include <asm/div64.h>
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	24	#include "ctree.h"
				25	#include "extent_map.h"
				26	#include "disk-io.h"
				27	#include "transaction.h"
				28	#include "print-tree.h"
				29	#include "volumes.h"
				30	#include "raid56.h"
				31	#include "async-thread.h"
				32	#include "check-integrity.h"
				33	#include "rcu-string.h"
				34
				35	/* set when additional merges to this rbio are not allowed */
				36	#define RBIO_RMW_LOCKED_BIT 1
				37
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	38	/*
				39	* set when this rbio is sitting in the hash, but it is just a cache
				40	* of past RMW
				41	*/
				42	#define RBIO_CACHE_BIT 2
				43
				44	/*
				45	* set when it is safe to trust the stripe_pages for caching
				46	*/
				47	#define RBIO_CACHE_READY_BIT 3
				48
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	49	#define RBIO_CACHE_SIZE 1024
				50
/*
 * The kind of work an rbio carries; stored in btrfs_raid_bio::operation.
 * Two rbios are only merge candidates when their operations match.
 */
enum btrfs_rbio_ops {
	BTRFS_RBIO_WRITE,
	BTRFS_RBIO_READ_REBUILD,
	BTRFS_RBIO_PARITY_SCRUB,
	BTRFS_RBIO_REBUILD_MISSING,
};
				57
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	58	struct btrfs_raid_bio {
				59	struct btrfs_fs_info *fs_info;
				60	struct btrfs_bio *bbio;
				61
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	62	/* while we're doing rmw on a stripe
				63	* we put it into a hash table so we can
				64	* lock the stripe and merge more rbios
				65	* into it.
				66	*/
				67	struct list_head hash_list;
				68
				69	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	70	* LRU list for the stripe cache
				71	*/
				72	struct list_head stripe_cache;
				73
				74	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	75	* for scheduling work in the helper threads
				76	*/
				77	struct btrfs_work work;
				78
				79	/*
				80	* bio list and bio_list_lock are used
				81	* to add more bios into the stripe
				82	* in hopes of avoiding the full rmw
				83	*/
				84	struct bio_list bio_list;
				85	spinlock_t bio_list_lock;
				86
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	87	/* also protected by the bio_list_lock, the
				88	* plug list is used by the plugging code
				89	* to collect partial bios while plugged. The
				90	* stripe locking code also uses it to hand off
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	91	* the stripe lock to the next pending IO
				92	*/
				93	struct list_head plug_list;
				94
				95	/*
				96	* flags that tell us if it is safe to
				97	* merge with this bio
				98	*/
				99	unsigned long flags;
				100
				101	/* size of each individual stripe on disk */
				102	int stripe_len;
				103
				104	/* number of data stripes (no p/q) */
				105	int nr_data;
				106
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	107	int real_stripes;
				108
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	109	int stripe_npages;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	110	/*
				111	* set if we're doing a parity rebuild
				112	* for a read from higher up, which is handled
				113	* differently from a parity rebuild as part of
				114	* rmw
				115	*/
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	116	enum btrfs_rbio_ops operation;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	117
				118	/* first bad stripe */
				119	int faila;
				120
				121	/* second bad stripe (for raid6 use) */
				122	int failb;
				123
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	124	int scrubp;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	125	/*
				126	* number of pages needed to represent the full
				127	* stripe
				128	*/
				129	int nr_pages;
				130
				131	/*
				132	* size of all the bios in the bio_list. This
				133	* helps us decide if the rbio maps to a full
				134	* stripe or not
				135	*/
				136	int bio_list_bytes;
				137
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	138	int generic_bio_cnt;
				139
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	140	refcount_t refs;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	141
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	142	atomic_t stripes_pending;
				143
				144	atomic_t error;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	145	/*
				146	* these are two arrays of pointers. We allocate the
				147	* rbio big enough to hold them both and setup their
				148	* locations when the rbio is allocated
				149	*/
				150
				151	/* pointers to pages that we allocated for
				152	* reading/writing stripes directly from the disk (including P/Q)
				153	*/
				154	struct page **stripe_pages;
				155
				156	/*
				157	* pointers to the pages in the bio_list. Stored
				158	* here for faster lookup
				159	*/
				160	struct page **bio_pages;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	161
				162	/*
				163	* bitmap to record which horizontal stripe has data
				164	*/
				165	unsigned long *dbitmap;
Kees Cook	1389053	2018-05-29 16:44:59 -0700	[diff] [blame^]	166
				167	/* allocated with real_stripes-many pointers for finish_() calls /
				168	void **finish_pointers;
				169
				170	/* allocated with stripe_npages-many bits for finish_() calls /
				171	unsigned long *finish_pbitmap;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	172	};
				173
				174	static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
				175	static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
				176	static void rmw_work(struct btrfs_work *work);
				177	static void read_rebuild_work(struct btrfs_work *work);
				178	static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
				179	static void async_read_rebuild(struct btrfs_raid_bio *rbio);
				180	static int fail_bio_stripe(struct btrfs_raid_bio rbio, struct bio bio);
				181	static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
				182	static void __free_raid_bio(struct btrfs_raid_bio *rbio);
				183	static void index_rbio_pages(struct btrfs_raid_bio *rbio);
				184	static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
				185
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	186	static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
				187	int need_check);
				188	static void async_scrub_parity(struct btrfs_raid_bio *rbio);
				189
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	190	/*
				191	* the stripe hash table is used for locking, and to collect
				192	* bios in hopes of making a full stripe
				193	*/
				194	int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
				195	{
				196	struct btrfs_stripe_hash_table *table;
				197	struct btrfs_stripe_hash_table *x;
				198	struct btrfs_stripe_hash *cur;
				199	struct btrfs_stripe_hash *h;
				200	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
				201	int i;
David Sterba	83c8266	2013-03-01 15:03:00 +0000	[diff] [blame]	202	int table_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	203
				204	if (info->stripe_hash_table)
				205	return 0;
				206
David Sterba	83c8266	2013-03-01 15:03:00 +0000	[diff] [blame]	207	/*
				208	* The table is large, starting with order 4 and can go as high as
				209	* order 7 in case lock debugging is turned on.
				210	*
				211	* Try harder to allocate and fallback to vmalloc to lower the chance
				212	* of a failing mount.
				213	*/
				214	table_size = sizeof(table) + sizeof(h) * num_entries;
David Sterba	818e010	2017-05-31 18:40:02 +0200	[diff] [blame]	215	table = kvzalloc(table_size, GFP_KERNEL);
				216	if (!table)
				217	return -ENOMEM;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	218
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	219	spin_lock_init(&table->cache_lock);
				220	INIT_LIST_HEAD(&table->stripe_cache);
				221
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	222	h = table->table;
				223
				224	for (i = 0; i < num_entries; i++) {
				225	cur = h + i;
				226	INIT_LIST_HEAD(&cur->hash_list);
				227	spin_lock_init(&cur->lock);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	228	}
				229
				230	x = cmpxchg(&info->stripe_hash_table, NULL, table);
Wang Shilong	f749303	2014-11-22 21:13:10 +0800	[diff] [blame]	231	if (x)
				232	kvfree(x);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	233	return 0;
				234	}
				235
				236	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	237	* caching an rbio means to copy anything from the
				238	* bio_pages array into the stripe_pages array. We
				239	* use the page uptodate bit in the stripe cache array
				240	* to indicate if it has valid data
				241	*
				242	* once the caching is done, we set the cache ready
				243	* bit.
				244	*/
				245	static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
				246	{
				247	int i;
				248	char *s;
				249	char *d;
				250	int ret;
				251
				252	ret = alloc_rbio_pages(rbio);
				253	if (ret)
				254	return;
				255
				256	for (i = 0; i < rbio->nr_pages; i++) {
				257	if (!rbio->bio_pages[i])
				258	continue;
				259
				260	s = kmap(rbio->bio_pages[i]);
				261	d = kmap(rbio->stripe_pages[i]);
				262
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	263	memcpy(d, s, PAGE_SIZE);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	264
				265	kunmap(rbio->bio_pages[i]);
				266	kunmap(rbio->stripe_pages[i]);
				267	SetPageUptodate(rbio->stripe_pages[i]);
				268	}
				269	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				270	}
				271
				272	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	273	* we hash on the first logical address of the stripe
				274	*/
				275	static int rbio_bucket(struct btrfs_raid_bio *rbio)
				276	{
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	277	u64 num = rbio->bbio->raid_map[0];
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	278
				279	/*
				280	* we shift down quite a bit. We're using byte
				281	* addressing, and most of the lower bits are zeros.
				282	* This tends to upset hash_64, and it consistently
				283	* returns just one or two different values.
				284	*
				285	* shifting off the lower bits fixes things.
				286	*/
				287	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
				288	}
				289
				290	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	291	* stealing an rbio means taking all the uptodate pages from the stripe
				292	* array in the source rbio and putting them into the destination rbio
				293	*/
				294	static void steal_rbio(struct btrfs_raid_bio src, struct btrfs_raid_bio dest)
				295	{
				296	int i;
				297	struct page *s;
				298	struct page *d;
				299
				300	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
				301	return;
				302
				303	for (i = 0; i < dest->nr_pages; i++) {
				304	s = src->stripe_pages[i];
				305	if (!s \|\| !PageUptodate(s)) {
				306	continue;
				307	}
				308
				309	d = dest->stripe_pages[i];
				310	if (d)
				311	__free_page(d);
				312
				313	dest->stripe_pages[i] = s;
				314	src->stripe_pages[i] = NULL;
				315	}
				316	}
				317
				318	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	319	* merging means we take the bio_list from the victim and
				320	* splice it into the destination. The victim should
				321	* be discarded afterwards.
				322	*
				323	* must be called with dest->rbio_list_lock held
				324	*/
				325	static void merge_rbio(struct btrfs_raid_bio *dest,
				326	struct btrfs_raid_bio *victim)
				327	{
				328	bio_list_merge(&dest->bio_list, &victim->bio_list);
				329	dest->bio_list_bytes += victim->bio_list_bytes;
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	330	dest->generic_bio_cnt += victim->generic_bio_cnt;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	331	bio_list_init(&victim->bio_list);
				332	}
				333
				334	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	335	* used to prune items that are in the cache. The caller
				336	* must hold the hash table lock.
				337	*/
				338	static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
				339	{
				340	int bucket = rbio_bucket(rbio);
				341	struct btrfs_stripe_hash_table *table;
				342	struct btrfs_stripe_hash *h;
				343	int freeit = 0;
				344
				345	/*
				346	* check the bit again under the hash table lock.
				347	*/
				348	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
				349	return;
				350
				351	table = rbio->fs_info->stripe_hash_table;
				352	h = table->table + bucket;
				353
				354	/* hold the lock for the bucket because we may be
				355	* removing it from the hash table
				356	*/
				357	spin_lock(&h->lock);
				358
				359	/*
				360	* hold the lock for the bio list because we need
				361	* to make sure the bio list is empty
				362	*/
				363	spin_lock(&rbio->bio_list_lock);
				364
				365	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
				366	list_del_init(&rbio->stripe_cache);
				367	table->cache_size -= 1;
				368	freeit = 1;
				369
				370	/* if the bio list isn't empty, this rbio is
				371	* still involved in an IO. We take it out
				372	* of the cache list, and drop the ref that
				373	* was held for the list.
				374	*
				375	* If the bio_list was empty, we also remove
				376	* the rbio from the hash_table, and drop
				377	* the corresponding ref
				378	*/
				379	if (bio_list_empty(&rbio->bio_list)) {
				380	if (!list_empty(&rbio->hash_list)) {
				381	list_del_init(&rbio->hash_list);
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	382	refcount_dec(&rbio->refs);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	383	BUG_ON(!list_empty(&rbio->plug_list));
				384	}
				385	}
				386	}
				387
				388	spin_unlock(&rbio->bio_list_lock);
				389	spin_unlock(&h->lock);
				390
				391	if (freeit)
				392	__free_raid_bio(rbio);
				393	}
				394
				395	/*
				396	* prune a given rbio from the cache
				397	*/
				398	static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
				399	{
				400	struct btrfs_stripe_hash_table *table;
				401	unsigned long flags;
				402
				403	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
				404	return;
				405
				406	table = rbio->fs_info->stripe_hash_table;
				407
				408	spin_lock_irqsave(&table->cache_lock, flags);
				409	__remove_rbio_from_cache(rbio);
				410	spin_unlock_irqrestore(&table->cache_lock, flags);
				411	}
				412
				413	/*
				414	* remove everything in the cache
				415	*/
Eric Sandeen	48a3b63	2013-04-25 20:41:01 +0000	[diff] [blame]	416	static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	417	{
				418	struct btrfs_stripe_hash_table *table;
				419	unsigned long flags;
				420	struct btrfs_raid_bio *rbio;
				421
				422	table = info->stripe_hash_table;
				423
				424	spin_lock_irqsave(&table->cache_lock, flags);
				425	while (!list_empty(&table->stripe_cache)) {
				426	rbio = list_entry(table->stripe_cache.next,
				427	struct btrfs_raid_bio,
				428	stripe_cache);
				429	__remove_rbio_from_cache(rbio);
				430	}
				431	spin_unlock_irqrestore(&table->cache_lock, flags);
				432	}
				433
				434	/*
				435	* remove all cached entries and free the hash table
				436	* used by unmount
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	437	*/
				438	void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
				439	{
				440	if (!info->stripe_hash_table)
				441	return;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	442	btrfs_clear_rbio_cache(info);
Wang Shilong	f749303	2014-11-22 21:13:10 +0800	[diff] [blame]	443	kvfree(info->stripe_hash_table);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	444	info->stripe_hash_table = NULL;
				445	}
				446
				447	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	448	* insert an rbio into the stripe cache. It
				449	* must have already been prepared by calling
				450	* cache_rbio_pages
				451	*
				452	* If this rbio was already cached, it gets
				453	* moved to the front of the lru.
				454	*
				455	* If the size of the rbio cache is too big, we
				456	* prune an item.
				457	*/
				458	static void cache_rbio(struct btrfs_raid_bio *rbio)
				459	{
				460	struct btrfs_stripe_hash_table *table;
				461	unsigned long flags;
				462
				463	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
				464	return;
				465
				466	table = rbio->fs_info->stripe_hash_table;
				467
				468	spin_lock_irqsave(&table->cache_lock, flags);
				469	spin_lock(&rbio->bio_list_lock);
				470
				471	/* bump our ref if we were not in the list before */
				472	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	473	refcount_inc(&rbio->refs);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	474
				475	if (!list_empty(&rbio->stripe_cache)){
				476	list_move(&rbio->stripe_cache, &table->stripe_cache);
				477	} else {
				478	list_add(&rbio->stripe_cache, &table->stripe_cache);
				479	table->cache_size += 1;
				480	}
				481
				482	spin_unlock(&rbio->bio_list_lock);
				483
				484	if (table->cache_size > RBIO_CACHE_SIZE) {
				485	struct btrfs_raid_bio *found;
				486
				487	found = list_entry(table->stripe_cache.prev,
				488	struct btrfs_raid_bio,
				489	stripe_cache);
				490
				491	if (found != rbio)
				492	__remove_rbio_from_cache(found);
				493	}
				494
				495	spin_unlock_irqrestore(&table->cache_lock, flags);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	496	}
				497
				498	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	499	* helper function to run the xor_blocks api. It is only
				500	* able to do MAX_XOR_BLOCKS at a time, so we need to
				501	* loop through.
				502	*/
				503	static void run_xor(void **pages, int src_cnt, ssize_t len)
				504	{
				505	int src_off = 0;
				506	int xor_src_cnt = 0;
				507	void *dest = pages[src_cnt];
				508
				509	while(src_cnt > 0) {
				510	xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
				511	xor_blocks(xor_src_cnt, len, dest, pages + src_off);
				512
				513	src_cnt -= xor_src_cnt;
				514	src_off += xor_src_cnt;
				515	}
				516	}
				517
				518	/*
				519	* returns true if the bio list inside this rbio
				520	* covers an entire stripe (no rmw required).
				521	* Must be called with the bio list lock held, or
				522	* at a time when you know it is impossible to add
				523	* new bios into the list
				524	*/
				525	static int __rbio_is_full(struct btrfs_raid_bio *rbio)
				526	{
				527	unsigned long size = rbio->bio_list_bytes;
				528	int ret = 1;
				529
				530	if (size != rbio->nr_data * rbio->stripe_len)
				531	ret = 0;
				532
				533	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
				534	return ret;
				535	}
				536
				537	static int rbio_is_full(struct btrfs_raid_bio *rbio)
				538	{
				539	unsigned long flags;
				540	int ret;
				541
				542	spin_lock_irqsave(&rbio->bio_list_lock, flags);
				543	ret = __rbio_is_full(rbio);
				544	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
				545	return ret;
				546	}
				547
				548	/*
				549	* returns 1 if it is safe to merge two rbios together.
				550	* The merging is safe if the two rbios correspond to
				551	* the same stripe and if they are both going in the same
				552	* direction (read vs write), and if neither one is
				553	* locked for final IO
				554	*
				555	* The caller is responsible for locking such that
				556	* rmw_locked is safe to test
				557	*/
				558	static int rbio_can_merge(struct btrfs_raid_bio *last,
				559	struct btrfs_raid_bio *cur)
				560	{
				561	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) \|\|
				562	test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
				563	return 0;
				564
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	565	/*
				566	* we can't merge with cached rbios, since the
				567	* idea is that when we merge the destination
				568	* rbio is going to run our IO for us. We can
Nicholas D Steeves	0132761	2016-05-19 21:18:45 -0400	[diff] [blame]	569	* steal from cached rbios though, other functions
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	570	* handle that.
				571	*/
				572	if (test_bit(RBIO_CACHE_BIT, &last->flags) \|\|
				573	test_bit(RBIO_CACHE_BIT, &cur->flags))
				574	return 0;
				575
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	576	if (last->bbio->raid_map[0] !=
				577	cur->bbio->raid_map[0])
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	578	return 0;
				579
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	580	/* we can't merge with different operations */
				581	if (last->operation != cur->operation)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	582	return 0;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	583	/*
				584	* We've need read the full stripe from the drive.
				585	* check and repair the parity and write the new results.
				586	*
				587	* We're not allowed to add any new bios to the
				588	* bio list here, anyone else that wants to
				589	* change this stripe needs to do their own rmw.
				590	*/
Liu Bo	db34be1	2017-12-04 15:40:35 -0700	[diff] [blame]	591	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	592	return 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	593
Liu Bo	db34be1	2017-12-04 15:40:35 -0700	[diff] [blame]	594	if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	595	return 0;
				596
Liu Bo	cc54ff6	2017-12-11 14:56:31 -0700	[diff] [blame]	597	if (last->operation == BTRFS_RBIO_READ_REBUILD) {
				598	int fa = last->faila;
				599	int fb = last->failb;
				600	int cur_fa = cur->faila;
				601	int cur_fb = cur->failb;
				602
				603	if (last->faila >= last->failb) {
				604	fa = last->failb;
				605	fb = last->faila;
				606	}
				607
				608	if (cur->faila >= cur->failb) {
				609	cur_fa = cur->failb;
				610	cur_fb = cur->faila;
				611	}
				612
				613	if (fa != cur_fa \|\| fb != cur_fb)
				614	return 0;
				615	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	616	return 1;
				617	}
				618
Zhao Lei	b7178a5	2015-03-03 20:38:46 +0800	[diff] [blame]	619	static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
				620	int index)
				621	{
				622	return stripe * rbio->stripe_npages + index;
				623	}
				624
				625	/*
				626	* these are just the pages from the rbio array, not from anything
				627	* the FS sent down to us
				628	*/
				629	static struct page rbio_stripe_page(struct btrfs_raid_bio rbio, int stripe,
				630	int index)
				631	{
				632	return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
				633	}
				634
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	635	/*
				636	* helper to index into the pstripe
				637	*/
				638	static struct page rbio_pstripe_page(struct btrfs_raid_bio rbio, int index)
				639	{
Zhao Lei	b7178a5	2015-03-03 20:38:46 +0800	[diff] [blame]	640	return rbio_stripe_page(rbio, rbio->nr_data, index);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	641	}
				642
				643	/*
				644	* helper to index into the qstripe, returns null
				645	* if there is no qstripe
				646	*/
				647	static struct page rbio_qstripe_page(struct btrfs_raid_bio rbio, int index)
				648	{
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	649	if (rbio->nr_data + 1 == rbio->real_stripes)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	650	return NULL;
Zhao Lei	b7178a5	2015-03-03 20:38:46 +0800	[diff] [blame]	651	return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	652	}
				653
				654	/*
				655	* The first stripe in the table for a logical address
				656	* has the lock. rbios are added in one of three ways:
				657	*
				658	* 1) Nobody has the stripe locked yet. The rbio is given
				659	* the lock and 0 is returned. The caller must start the IO
				660	* themselves.
				661	*
				662	* 2) Someone has the stripe locked, but we're able to merge
				663	* with the lock owner. The rbio is freed and the IO will
				664	* start automatically along with the existing rbio. 1 is returned.
				665	*
				666	* 3) Someone has the stripe locked, but we're not able to merge.
				667	* The rbio is added to the lock owner's plug list, or merged into
				668	* an rbio already on the plug list. When the lock owner unlocks,
				669	* the next rbio on the list is run and the IO is started automatically.
				670	* 1 is returned
				671	*
				672	* If we return 0, the caller still owns the rbio and must continue with
				673	* IO submission. If we return 1, the caller must assume the rbio has
				674	* already been freed.
				675	*/
				676	static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
				677	{
				678	int bucket = rbio_bucket(rbio);
				679	struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
				680	struct btrfs_raid_bio *cur;
				681	struct btrfs_raid_bio *pending;
				682	unsigned long flags;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	683	struct btrfs_raid_bio *freeit = NULL;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	684	struct btrfs_raid_bio *cache_drop = NULL;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	685	int ret = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	686
				687	spin_lock_irqsave(&h->lock, flags);
				688	list_for_each_entry(cur, &h->hash_list, hash_list) {
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	689	if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	690	spin_lock(&cur->bio_list_lock);
				691
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	692	/* can we steal this cached rbio's pages? */
				693	if (bio_list_empty(&cur->bio_list) &&
				694	list_empty(&cur->plug_list) &&
				695	test_bit(RBIO_CACHE_BIT, &cur->flags) &&
				696	!test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
				697	list_del_init(&cur->hash_list);
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	698	refcount_dec(&cur->refs);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	699
				700	steal_rbio(cur, rbio);
				701	cache_drop = cur;
				702	spin_unlock(&cur->bio_list_lock);
				703
				704	goto lockit;
				705	}
				706
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	707	/* can we merge into the lock owner? */
				708	if (rbio_can_merge(cur, rbio)) {
				709	merge_rbio(cur, rbio);
				710	spin_unlock(&cur->bio_list_lock);
				711	freeit = rbio;
				712	ret = 1;
				713	goto out;
				714	}
				715
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	716
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	717	/*
				718	* we couldn't merge with the running
				719	* rbio, see if we can merge with the
				720	* pending ones. We don't have to
				721	* check for rmw_locked because there
				722	* is no way they are inside finish_rmw
				723	* right now
				724	*/
				725	list_for_each_entry(pending, &cur->plug_list,
				726	plug_list) {
				727	if (rbio_can_merge(pending, rbio)) {
				728	merge_rbio(pending, rbio);
				729	spin_unlock(&cur->bio_list_lock);
				730	freeit = rbio;
				731	ret = 1;
				732	goto out;
				733	}
				734	}
				735
				736	/* no merging, put us on the tail of the plug list,
				737	* our rbio will be started with the currently
				738	* running rbio unlocks
				739	*/
				740	list_add_tail(&rbio->plug_list, &cur->plug_list);
				741	spin_unlock(&cur->bio_list_lock);
				742	ret = 1;
				743	goto out;
				744	}
				745	}
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	746	lockit:
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	747	refcount_inc(&rbio->refs);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	748	list_add(&rbio->hash_list, &h->hash_list);
				749	out:
				750	spin_unlock_irqrestore(&h->lock, flags);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	751	if (cache_drop)
				752	remove_rbio_from_cache(cache_drop);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	753	if (freeit)
				754	__free_raid_bio(freeit);
				755	return ret;
				756	}
				757
				758	/*
				759	* called as rmw or parity rebuild is completed. If the plug list has more
				760	* rbios waiting for this stripe, the next one on the list will be started
				761	*/
				762	static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
				763	{
				764	int bucket;
				765	struct btrfs_stripe_hash *h;
				766	unsigned long flags;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	767	int keep_cache = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	768
				769	bucket = rbio_bucket(rbio);
				770	h = rbio->fs_info->stripe_hash_table->table + bucket;
				771
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	772	if (list_empty(&rbio->plug_list))
				773	cache_rbio(rbio);
				774
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	775	spin_lock_irqsave(&h->lock, flags);
				776	spin_lock(&rbio->bio_list_lock);
				777
				778	if (!list_empty(&rbio->hash_list)) {
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	779	/*
				780	* if we're still cached and there is no other IO
				781	* to perform, just leave this rbio here for others
				782	* to steal from later
				783	*/
				784	if (list_empty(&rbio->plug_list) &&
				785	test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
				786	keep_cache = 1;
				787	clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				788	BUG_ON(!bio_list_empty(&rbio->bio_list));
				789	goto done;
				790	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	791
				792	list_del_init(&rbio->hash_list);
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	793	refcount_dec(&rbio->refs);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	794
				795	/*
				796	* we use the plug list to hold all the rbios
				797	* waiting for the chance to lock this stripe.
				798	* hand the lock over to one of them.
				799	*/
				800	if (!list_empty(&rbio->plug_list)) {
				801	struct btrfs_raid_bio *next;
				802	struct list_head *head = rbio->plug_list.next;
				803
				804	next = list_entry(head, struct btrfs_raid_bio,
				805	plug_list);
				806
				807	list_del_init(&rbio->plug_list);
				808
				809	list_add(&next->hash_list, &h->hash_list);
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	810	refcount_inc(&next->refs);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	811	spin_unlock(&rbio->bio_list_lock);
				812	spin_unlock_irqrestore(&h->lock, flags);
				813
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	814	if (next->operation == BTRFS_RBIO_READ_REBUILD)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	815	async_read_rebuild(next);
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	816	else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
				817	steal_rbio(rbio, next);
				818	async_read_rebuild(next);
				819	} else if (next->operation == BTRFS_RBIO_WRITE) {
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	820	steal_rbio(rbio, next);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	821	async_rmw_stripe(next);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	822	} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				823	steal_rbio(rbio, next);
				824	async_scrub_parity(next);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	825	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	826
				827	goto done_nolock;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	828	}
				829	}
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	830	done:
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	831	spin_unlock(&rbio->bio_list_lock);
				832	spin_unlock_irqrestore(&h->lock, flags);
				833
				834	done_nolock:
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	835	if (!keep_cache)
				836	remove_rbio_from_cache(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	837	}
				838
				839	static void __free_raid_bio(struct btrfs_raid_bio *rbio)
				840	{
				841	int i;
				842
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	843	if (!refcount_dec_and_test(&rbio->refs))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	844	return;
				845
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	846	WARN_ON(!list_empty(&rbio->stripe_cache));
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	847	WARN_ON(!list_empty(&rbio->hash_list));
				848	WARN_ON(!bio_list_empty(&rbio->bio_list));
				849
				850	for (i = 0; i < rbio->nr_pages; i++) {
				851	if (rbio->stripe_pages[i]) {
				852	__free_page(rbio->stripe_pages[i]);
				853	rbio->stripe_pages[i] = NULL;
				854	}
				855	}
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	856
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	857	btrfs_put_bbio(rbio->bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	858	kfree(rbio);
				859	}
				860
Liu Bo	7583d8d	2018-01-09 18:36:25 -0700	[diff] [blame]	861	static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	862	{
Liu Bo	7583d8d	2018-01-09 18:36:25 -0700	[diff] [blame]	863	struct bio *next;
				864
				865	while (cur) {
				866	next = cur->bi_next;
				867	cur->bi_next = NULL;
				868	cur->bi_status = err;
				869	bio_endio(cur);
				870	cur = next;
				871	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	872	}
				873
				874	/*
				875	* this frees the rbio and runs through all the bios in the
				876	* bio_list and calls end_io on them
				877	*/
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	878	static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	879	{
				880	struct bio *cur = bio_list_get(&rbio->bio_list);
Liu Bo	7583d8d	2018-01-09 18:36:25 -0700	[diff] [blame]	881	struct bio *extra;
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	882
				883	if (rbio->generic_bio_cnt)
				884	btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
				885
Liu Bo	7583d8d	2018-01-09 18:36:25 -0700	[diff] [blame]	886	/*
				887	* At this moment, rbio->bio_list is empty, however since rbio does not
				888	* always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
				889	* hash list, rbio may be merged with others so that rbio->bio_list
				890	* becomes non-empty.
				891	* Once unlock_stripe() is done, rbio->bio_list will not be updated any
				892	* more and we can call bio_endio() on all queued bios.
				893	*/
				894	unlock_stripe(rbio);
				895	extra = bio_list_get(&rbio->bio_list);
				896	__free_raid_bio(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	897
Liu Bo	7583d8d	2018-01-09 18:36:25 -0700	[diff] [blame]	898	rbio_endio_bio_list(cur, err);
				899	if (extra)
				900	rbio_endio_bio_list(extra, err);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	901	}
				902
				903	/*
				904	* end io function used by finish_rmw. When we finally
				905	* get here, we've written a full stripe
				906	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	907	static void raid_write_end_io(struct bio *bio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	908	{
				909	struct btrfs_raid_bio *rbio = bio->bi_private;
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	910	blk_status_t err = bio->bi_status;
Zhao Lei	a6111d1	2016-01-12 17:52:13 +0800	[diff] [blame]	911	int max_errors;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	912
				913	if (err)
				914	fail_bio_stripe(rbio, bio);
				915
				916	bio_put(bio);
				917
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	918	if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	919	return;
				920
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	921	err = BLK_STS_OK;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	922
				923	/* OK, we have read all the stripes we need to. */
Zhao Lei	a6111d1	2016-01-12 17:52:13 +0800	[diff] [blame]	924	max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
				925	0 : rbio->bbio->max_errors;
				926	if (atomic_read(&rbio->error) > max_errors)
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	927	err = BLK_STS_IOERR;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	928
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	929	rbio_orig_end_io(rbio, err);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	930	}
				931
				932	/*
				933	* the read/modify/write code wants to use the original bio for
				934	* any pages it included, and then use the rbio for everything
				935	* else. This function decides if a given index (stripe number)
				936	* and page number in that stripe fall inside the original bio
				937	* or the rbio.
				938	*
				939	* if you set bio_list_only, you'll get a NULL back for any ranges
				940	* that are outside the bio_list
				941	*
				942	* This doesn't take any refs on anything, you get a bare page pointer
				943	* and the caller must bump refs as required.
				944	*
				945	* You must call index_rbio_pages once before you can trust
				946	* the answers from this function.
				947	*/
				948	static struct page page_in_rbio(struct btrfs_raid_bio rbio,
				949	int index, int pagenr, int bio_list_only)
				950	{
				951	int chunk_page;
				952	struct page *p = NULL;
				953
				954	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
				955
				956	spin_lock_irq(&rbio->bio_list_lock);
				957	p = rbio->bio_pages[chunk_page];
				958	spin_unlock_irq(&rbio->bio_list_lock);
				959
				960	if (p \|\| bio_list_only)
				961	return p;
				962
				963	return rbio->stripe_pages[chunk_page];
				964	}
				965
				966	/*
				967	* number of pages we need for the entire stripe across all the
				968	* drives
				969	*/
				970	static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
				971	{
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	972	return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	973	}
				974
				975	/*
				976	* allocation and initial setup for the btrfs_raid_bio. Not
				977	* this does not allocate any pages for rbio->pages.
				978	*/
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	979	static struct btrfs_raid_bio alloc_rbio(struct btrfs_fs_info fs_info,
				980	struct btrfs_bio *bbio,
				981	u64 stripe_len)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	982	{
				983	struct btrfs_raid_bio *rbio;
				984	int nr_data = 0;
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	985	int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
				986	int num_pages = rbio_nr_pages(stripe_len, real_stripes);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	987	int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	988	void *p;
				989
Kees Cook	1389053	2018-05-29 16:44:59 -0700	[diff] [blame^]	990	rbio = kzalloc(sizeof(*rbio) +
				991	sizeof(rbio->stripe_pages) num_pages +
				992	sizeof(rbio->bio_pages) num_pages +
				993	sizeof(rbio->finish_pointers) real_stripes +
				994	sizeof(rbio->dbitmap) BITS_TO_LONGS(stripe_npages) +
				995	sizeof(rbio->finish_pbitmap)
				996	BITS_TO_LONGS(stripe_npages),
				997	GFP_NOFS);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	998	if (!rbio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	999	return ERR_PTR(-ENOMEM);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1000
				1001	bio_list_init(&rbio->bio_list);
				1002	INIT_LIST_HEAD(&rbio->plug_list);
				1003	spin_lock_init(&rbio->bio_list_lock);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1004	INIT_LIST_HEAD(&rbio->stripe_cache);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1005	INIT_LIST_HEAD(&rbio->hash_list);
				1006	rbio->bbio = bbio;
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	1007	rbio->fs_info = fs_info;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1008	rbio->stripe_len = stripe_len;
				1009	rbio->nr_pages = num_pages;
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1010	rbio->real_stripes = real_stripes;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	1011	rbio->stripe_npages = stripe_npages;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1012	rbio->faila = -1;
				1013	rbio->failb = -1;
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	1014	refcount_set(&rbio->refs, 1);
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1015	atomic_set(&rbio->error, 0);
				1016	atomic_set(&rbio->stripes_pending, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1017
				1018	/*
Kees Cook	1389053	2018-05-29 16:44:59 -0700	[diff] [blame^]	1019	* the stripe_pages, bio_pages, etc arrays point to the extra
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1020	* memory we allocated past the end of the rbio
				1021	*/
				1022	p = rbio + 1;
Kees Cook	1389053	2018-05-29 16:44:59 -0700	[diff] [blame^]	1023	#define CONSUME_ALLOC(ptr, count) do { \
				1024	ptr = p; \
				1025	p = (unsigned char )p + sizeof((ptr)) * (count); \
				1026	} while (0)
				1027	CONSUME_ALLOC(rbio->stripe_pages, num_pages);
				1028	CONSUME_ALLOC(rbio->bio_pages, num_pages);
				1029	CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
				1030	CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages));
				1031	CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
				1032	#undef CONSUME_ALLOC
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1033
Zhao Lei	10f1190	2015-01-20 15:11:43 +0800	[diff] [blame]	1034	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
				1035	nr_data = real_stripes - 1;
				1036	else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1037	nr_data = real_stripes - 2;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1038	else
Zhao Lei	10f1190	2015-01-20 15:11:43 +0800	[diff] [blame]	1039	BUG();
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1040
				1041	rbio->nr_data = nr_data;
				1042	return rbio;
				1043	}
				1044
				1045	/* allocate pages for all the stripes in the bio, including parity */
				1046	static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
				1047	{
				1048	int i;
				1049	struct page *page;
				1050
				1051	for (i = 0; i < rbio->nr_pages; i++) {
				1052	if (rbio->stripe_pages[i])
				1053	continue;
				1054	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				1055	if (!page)
				1056	return -ENOMEM;
				1057	rbio->stripe_pages[i] = page;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1058	}
				1059	return 0;
				1060	}
				1061
Zhao Lei	b7178a5	2015-03-03 20:38:46 +0800	[diff] [blame]	1062	/* only allocate pages for p/q stripes */
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1063	static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
				1064	{
				1065	int i;
				1066	struct page *page;
				1067
Zhao Lei	b7178a5	2015-03-03 20:38:46 +0800	[diff] [blame]	1068	i = rbio_stripe_page_index(rbio, rbio->nr_data, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1069
				1070	for (; i < rbio->nr_pages; i++) {
				1071	if (rbio->stripe_pages[i])
				1072	continue;
				1073	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				1074	if (!page)
				1075	return -ENOMEM;
				1076	rbio->stripe_pages[i] = page;
				1077	}
				1078	return 0;
				1079	}
				1080
				1081	/*
				1082	* add a single page from a specific stripe into our list of bios for IO
				1083	* this will try to merge into existing bios if possible, and returns
				1084	* zero if all went well.
				1085	*/
Eric Sandeen	48a3b63	2013-04-25 20:41:01 +0000	[diff] [blame]	1086	static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
				1087	struct bio_list *bio_list,
				1088	struct page *page,
				1089	int stripe_nr,
				1090	unsigned long page_index,
				1091	unsigned long bio_max_len)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1092	{
				1093	struct bio *last = bio_list->tail;
				1094	u64 last_end = 0;
				1095	int ret;
				1096	struct bio *bio;
				1097	struct btrfs_bio_stripe *stripe;
				1098	u64 disk_start;
				1099
				1100	stripe = &rbio->bbio->stripes[stripe_nr];
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1101	disk_start = stripe->physical + (page_index << PAGE_SHIFT);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1102
				1103	/* if the device is missing, just fail this stripe */
				1104	if (!stripe->dev->bdev)
				1105	return fail_rbio_index(rbio, stripe_nr);
				1106
				1107	/* see if we can add this page onto our existing bio */
				1108	if (last) {
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1109	last_end = (u64)last->bi_iter.bi_sector << 9;
				1110	last_end += last->bi_iter.bi_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1111
				1112	/*
				1113	* we can't merge these if they are from different
				1114	* devices or if they are not contiguous
				1115	*/
				1116	if (last_end == disk_start && stripe->dev->bdev &&
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	1117	!last->bi_status &&
Christoph Hellwig	74d4699	2017-08-23 19:10:32 +0200	[diff] [blame]	1118	last->bi_disk == stripe->dev->bdev->bd_disk &&
				1119	last->bi_partno == stripe->dev->bdev->bd_partno) {
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1120	ret = bio_add_page(last, page, PAGE_SIZE, 0);
				1121	if (ret == PAGE_SIZE)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1122	return 0;
				1123	}
				1124	}
				1125
				1126	/* put a new bio on the list */
David Sterba	c5e4c3d	2017-06-12 17:29:41 +0200	[diff] [blame]	1127	bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1128	bio->bi_iter.bi_size = 0;
Christoph Hellwig	74d4699	2017-08-23 19:10:32 +0200	[diff] [blame]	1129	bio_set_dev(bio, stripe->dev->bdev);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1130	bio->bi_iter.bi_sector = disk_start >> 9;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1131
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1132	bio_add_page(bio, page, PAGE_SIZE, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1133	bio_list_add(bio_list, bio);
				1134	return 0;
				1135	}
				1136
				1137	/*
				1138	* while we're doing the read/modify/write cycle, we could
				1139	* have errors in reading pages off the disk. This checks
				1140	* for errors and if we're not able to read the page it'll
				1141	* trigger parity reconstruction. The rmw will be finished
				1142	* after we've reconstructed the failed stripes
				1143	*/
				1144	static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
				1145	{
				1146	if (rbio->faila >= 0 \|\| rbio->failb >= 0) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1147	BUG_ON(rbio->faila == rbio->real_stripes - 1);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1148	__raid56_parity_recover(rbio);
				1149	} else {
				1150	finish_rmw(rbio);
				1151	}
				1152	}
				1153
				1154	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1155	* helper function to walk our bio list and populate the bio_pages array with
				1156	* the result. This seems expensive, but it is faster than constantly
				1157	* searching through the bio list as we setup the IO in finish_rmw or stripe
				1158	* reconstruction.
				1159	*
				1160	* This must be called before you trust the answers from page_in_rbio
				1161	*/
				1162	static void index_rbio_pages(struct btrfs_raid_bio *rbio)
				1163	{
				1164	struct bio *bio;
				1165	u64 start;
				1166	unsigned long stripe_offset;
				1167	unsigned long page_index;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1168
				1169	spin_lock_irq(&rbio->bio_list_lock);
				1170	bio_list_for_each(bio, &rbio->bio_list) {
Filipe Manana	6592e58	2017-07-12 23:36:02 +0100	[diff] [blame]	1171	struct bio_vec bvec;
				1172	struct bvec_iter iter;
				1173	int i = 0;
				1174
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1175	start = (u64)bio->bi_iter.bi_sector << 9;
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1176	stripe_offset = start - rbio->bbio->raid_map[0];
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1177	page_index = stripe_offset >> PAGE_SHIFT;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1178
Filipe Manana	6592e58	2017-07-12 23:36:02 +0100	[diff] [blame]	1179	if (bio_flagged(bio, BIO_CLONED))
				1180	bio->bi_iter = btrfs_io_bio(bio)->iter;
				1181
				1182	bio_for_each_segment(bvec, bio, iter) {
				1183	rbio->bio_pages[page_index + i] = bvec.bv_page;
				1184	i++;
				1185	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1186	}
				1187	spin_unlock_irq(&rbio->bio_list_lock);
				1188	}
				1189
				1190	/*
				1191	* this is called from one of two situations. We either
				1192	* have a full stripe from the higher layers, or we've read all
				1193	* the missing bits off disk.
				1194	*
				1195	* This will calculate the parity and then send down any
				1196	* changed blocks.
				1197	*/
				1198	static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
				1199	{
				1200	struct btrfs_bio *bbio = rbio->bbio;
Kees Cook	1389053	2018-05-29 16:44:59 -0700	[diff] [blame^]	1201	void **pointers = rbio->finish_pointers;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1202	int nr_data = rbio->nr_data;
				1203	int stripe;
				1204	int pagenr;
				1205	int p_stripe = -1;
				1206	int q_stripe = -1;
				1207	struct bio_list bio_list;
				1208	struct bio *bio;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1209	int ret;
				1210
				1211	bio_list_init(&bio_list);
				1212
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1213	if (rbio->real_stripes - rbio->nr_data == 1) {
				1214	p_stripe = rbio->real_stripes - 1;
				1215	} else if (rbio->real_stripes - rbio->nr_data == 2) {
				1216	p_stripe = rbio->real_stripes - 2;
				1217	q_stripe = rbio->real_stripes - 1;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1218	} else {
				1219	BUG();
				1220	}
				1221
				1222	/* at this point we either have a full stripe,
				1223	* or we've read the full stripe from the drive.
				1224	* recalculate the parity and write the new results.
				1225	*
				1226	* We're not allowed to add any new bios to the
				1227	* bio list here, anyone else that wants to
				1228	* change this stripe needs to do their own rmw.
				1229	*/
				1230	spin_lock_irq(&rbio->bio_list_lock);
				1231	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				1232	spin_unlock_irq(&rbio->bio_list_lock);
				1233
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1234	atomic_set(&rbio->error, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1235
				1236	/*
				1237	* now that we've set rmw_locked, run through the
				1238	* bio list one last time and map the page pointers
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1239	*
				1240	* We don't cache full rbios because we're assuming
				1241	* the higher layers are unlikely to use this area of
				1242	* the disk again soon. If they do use it again,
				1243	* hopefully they will send another full bio.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1244	*/
				1245	index_rbio_pages(rbio);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1246	if (!rbio_is_full(rbio))
				1247	cache_rbio_pages(rbio);
				1248	else
				1249	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1250
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1251	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1252	struct page *p;
				1253	/* first collect one page from each data stripe */
				1254	for (stripe = 0; stripe < nr_data; stripe++) {
				1255	p = page_in_rbio(rbio, stripe, pagenr, 0);
				1256	pointers[stripe] = kmap(p);
				1257	}
				1258
				1259	/* then add the parity stripe */
				1260	p = rbio_pstripe_page(rbio, pagenr);
				1261	SetPageUptodate(p);
				1262	pointers[stripe++] = kmap(p);
				1263
				1264	if (q_stripe != -1) {
				1265
				1266	/*
				1267	* raid6, add the qstripe and call the
				1268	* library function to fill in our p/q
				1269	*/
				1270	p = rbio_qstripe_page(rbio, pagenr);
				1271	SetPageUptodate(p);
				1272	pointers[stripe++] = kmap(p);
				1273
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1274	raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1275	pointers);
				1276	} else {
				1277	/* raid5 */
				1278	memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1279	run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1280	}
				1281
				1282
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1283	for (stripe = 0; stripe < rbio->real_stripes; stripe++)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1284	kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
				1285	}
				1286
				1287	/*
				1288	* time to start writing. Make bios for everything from the
				1289	* higher layers (the bio_list in our rbio) and our p/q. Ignore
				1290	* everything else.
				1291	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1292	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1293	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1294	struct page *page;
				1295	if (stripe < rbio->nr_data) {
				1296	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1297	if (!page)
				1298	continue;
				1299	} else {
				1300	page = rbio_stripe_page(rbio, stripe, pagenr);
				1301	}
				1302
				1303	ret = rbio_add_io_page(rbio, &bio_list,
				1304	page, stripe, pagenr, rbio->stripe_len);
				1305	if (ret)
				1306	goto cleanup;
				1307	}
				1308	}
				1309
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1310	if (likely(!bbio->num_tgtdevs))
				1311	goto write_data;
				1312
				1313	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
				1314	if (!bbio->tgtdev_map[stripe])
				1315	continue;
				1316
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1317	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1318	struct page *page;
				1319	if (stripe < rbio->nr_data) {
				1320	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1321	if (!page)
				1322	continue;
				1323	} else {
				1324	page = rbio_stripe_page(rbio, stripe, pagenr);
				1325	}
				1326
				1327	ret = rbio_add_io_page(rbio, &bio_list, page,
				1328	rbio->bbio->tgtdev_map[stripe],
				1329	pagenr, rbio->stripe_len);
				1330	if (ret)
				1331	goto cleanup;
				1332	}
				1333	}
				1334
				1335	write_data:
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1336	atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
				1337	BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1338
				1339	while (1) {
				1340	bio = bio_list_pop(&bio_list);
				1341	if (!bio)
				1342	break;
				1343
				1344	bio->bi_private = rbio;
				1345	bio->bi_end_io = raid_write_end_io;
Mike Christie	37226b2	2016-06-05 14:31:52 -0500	[diff] [blame]	1346	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
Mike Christie	4e49ea4	2016-06-05 14:31:41 -0500	[diff] [blame]	1347
				1348	submit_bio(bio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1349	}
				1350	return;
				1351
				1352	cleanup:
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1353	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	1354
				1355	while ((bio = bio_list_pop(&bio_list)))
				1356	bio_put(bio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1357	}
				1358
				1359	/*
				1360	* helper to find the stripe number for a given bio. Used to figure out which
				1361	* stripe has failed. This expects the bio to correspond to a physical disk,
				1362	* so it looks up based on physical sector numbers.
				1363	*/
				1364	static int find_bio_stripe(struct btrfs_raid_bio *rbio,
				1365	struct bio *bio)
				1366	{
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1367	u64 physical = bio->bi_iter.bi_sector;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1368	u64 stripe_start;
				1369	int i;
				1370	struct btrfs_bio_stripe *stripe;
				1371
				1372	physical <<= 9;
				1373
				1374	for (i = 0; i < rbio->bbio->num_stripes; i++) {
				1375	stripe = &rbio->bbio->stripes[i];
				1376	stripe_start = stripe->physical;
				1377	if (physical >= stripe_start &&
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1378	physical < stripe_start + rbio->stripe_len &&
Dmitriy Gorokh	047fdea	2018-02-16 19:51:38 +0000	[diff] [blame]	1379	stripe->dev->bdev &&
Christoph Hellwig	74d4699	2017-08-23 19:10:32 +0200	[diff] [blame]	1380	bio->bi_disk == stripe->dev->bdev->bd_disk &&
				1381	bio->bi_partno == stripe->dev->bdev->bd_partno) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1382	return i;
				1383	}
				1384	}
				1385	return -1;
				1386	}
				1387
				1388	/*
				1389	* helper to find the stripe number for a given
				1390	* bio (before mapping). Used to figure out which stripe has
				1391	* failed. This looks up based on logical block numbers.
				1392	*/
				1393	static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
				1394	struct bio *bio)
				1395	{
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1396	u64 logical = bio->bi_iter.bi_sector;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1397	u64 stripe_start;
				1398	int i;
				1399
				1400	logical <<= 9;
				1401
				1402	for (i = 0; i < rbio->nr_data; i++) {
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1403	stripe_start = rbio->bbio->raid_map[i];
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1404	if (logical >= stripe_start &&
				1405	logical < stripe_start + rbio->stripe_len) {
				1406	return i;
				1407	}
				1408	}
				1409	return -1;
				1410	}
				1411
				1412	/*
				1413	* returns -EIO if we had too many failures
				1414	*/
				1415	static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
				1416	{
				1417	unsigned long flags;
				1418	int ret = 0;
				1419
				1420	spin_lock_irqsave(&rbio->bio_list_lock, flags);
				1421
				1422	/* we already know this stripe is bad, move on */
				1423	if (rbio->faila == failed \|\| rbio->failb == failed)
				1424	goto out;
				1425
				1426	if (rbio->faila == -1) {
				1427	/* first failure on this rbio */
				1428	rbio->faila = failed;
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1429	atomic_inc(&rbio->error);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1430	} else if (rbio->failb == -1) {
				1431	/* second failure on this rbio */
				1432	rbio->failb = failed;
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1433	atomic_inc(&rbio->error);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1434	} else {
				1435	ret = -EIO;
				1436	}
				1437	out:
				1438	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
				1439
				1440	return ret;
				1441	}
				1442
				1443	/*
				1444	* helper to fail a stripe based on a physical disk
				1445	* bio.
				1446	*/
				1447	static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
				1448	struct bio *bio)
				1449	{
				1450	int failed = find_bio_stripe(rbio, bio);
				1451
				1452	if (failed < 0)
				1453	return -EIO;
				1454
				1455	return fail_rbio_index(rbio, failed);
				1456	}
				1457
				1458	/*
				1459	* this sets each page in the bio uptodate. It should only be used on private
				1460	* rbio pages, nothing that comes in from the higher layers
				1461	*/
				1462	static void set_bio_pages_uptodate(struct bio *bio)
				1463	{
Liu Bo	0198e5b	2018-01-12 18:07:01 -0700	[diff] [blame]	1464	struct bio_vec *bvec;
				1465	int i;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1466
Liu Bo	0198e5b	2018-01-12 18:07:01 -0700	[diff] [blame]	1467	ASSERT(!bio_flagged(bio, BIO_CLONED));
Filipe Manana	6592e58	2017-07-12 23:36:02 +0100	[diff] [blame]	1468
Liu Bo	0198e5b	2018-01-12 18:07:01 -0700	[diff] [blame]	1469	bio_for_each_segment_all(bvec, bio, i)
				1470	SetPageUptodate(bvec->bv_page);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1471	}
				1472
				1473	/*
				1474	* end io for the read phase of the rmw cycle. All the bios here are physical
				1475	* stripe bios we've read from the disk so we can recalculate the parity of the
				1476	* stripe.
				1477	*
				1478	* This will usually kick off finish_rmw once all the bios are read in, but it
				1479	* may trigger parity reconstruction if we had any errors along the way
				1480	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	1481	static void raid_rmw_end_io(struct bio *bio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1482	{
				1483	struct btrfs_raid_bio *rbio = bio->bi_private;
				1484
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	1485	if (bio->bi_status)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1486	fail_bio_stripe(rbio, bio);
				1487	else
				1488	set_bio_pages_uptodate(bio);
				1489
				1490	bio_put(bio);
				1491
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1492	if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1493	return;
				1494
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1495	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1496	goto cleanup;
				1497
				1498	/*
				1499	* this will normally call finish_rmw to start our write
				1500	* but if there are any failed stripes we'll reconstruct
				1501	* from parity first
				1502	*/
				1503	validate_rbio_for_rmw(rbio);
				1504	return;
				1505
				1506	cleanup:
				1507
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1508	rbio_orig_end_io(rbio, BLK_STS_IOERR);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1509	}
				1510
				1511	static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
				1512	{
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1513	btrfs_init_work(&rbio->work, btrfs_rmw_helper, rmw_work, NULL, NULL);
				1514	btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1515	}
				1516
				1517	static void async_read_rebuild(struct btrfs_raid_bio *rbio)
				1518	{
Liu Bo	9e0af23	2014-08-15 23:36:53 +0800	[diff] [blame]	1519	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
				1520	read_rebuild_work, NULL, NULL);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1521
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1522	btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1523	}
				1524
				1525	/*
				1526	* the stripe must be locked by the caller. It will
				1527	* unlock after all the writes are done
				1528	*/
				1529	static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
				1530	{
				1531	int bios_to_read = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1532	struct bio_list bio_list;
				1533	int ret;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1534	int pagenr;
				1535	int stripe;
				1536	struct bio *bio;
				1537
				1538	bio_list_init(&bio_list);
				1539
				1540	ret = alloc_rbio_pages(rbio);
				1541	if (ret)
				1542	goto cleanup;
				1543
				1544	index_rbio_pages(rbio);
				1545
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1546	atomic_set(&rbio->error, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1547	/*
				1548	* build a list of bios to read all the missing parts of this
				1549	* stripe
				1550	*/
				1551	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1552	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1553	struct page *page;
				1554	/*
				1555	* we want to find all the pages missing from
				1556	* the rbio and read them from the disk. If
				1557	* page_in_rbio finds a page in the bio list
				1558	* we don't need to read it off the stripe.
				1559	*/
				1560	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1561	if (page)
				1562	continue;
				1563
				1564	page = rbio_stripe_page(rbio, stripe, pagenr);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1565	/*
				1566	* the bio cache may have handed us an uptodate
				1567	* page. If so, be happy and use it
				1568	*/
				1569	if (PageUptodate(page))
				1570	continue;
				1571
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1572	ret = rbio_add_io_page(rbio, &bio_list, page,
				1573	stripe, pagenr, rbio->stripe_len);
				1574	if (ret)
				1575	goto cleanup;
				1576	}
				1577	}
				1578
				1579	bios_to_read = bio_list_size(&bio_list);
				1580	if (!bios_to_read) {
				1581	/*
				1582	* this can happen if others have merged with
				1583	* us, it means there is nothing left to read.
				1584	* But if there are missing devices it may not be
				1585	* safe to do the full stripe write yet.
				1586	*/
				1587	goto finish;
				1588	}
				1589
				1590	/*
				1591	* the bbio may be freed once we submit the last bio. Make sure
				1592	* not to touch it after that
				1593	*/
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1594	atomic_set(&rbio->stripes_pending, bios_to_read);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1595	while (1) {
				1596	bio = bio_list_pop(&bio_list);
				1597	if (!bio)
				1598	break;
				1599
				1600	bio->bi_private = rbio;
				1601	bio->bi_end_io = raid_rmw_end_io;
Mike Christie	37226b2	2016-06-05 14:31:52 -0500	[diff] [blame]	1602	bio_set_op_attrs(bio, REQ_OP_READ, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1603
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1604	btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1605
Mike Christie	4e49ea4	2016-06-05 14:31:41 -0500	[diff] [blame]	1606	submit_bio(bio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1607	}
				1608	/* the actual write will happen once the reads are done */
				1609	return 0;
				1610
				1611	cleanup:
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1612	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	1613
				1614	while ((bio = bio_list_pop(&bio_list)))
				1615	bio_put(bio);
				1616
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1617	return -EIO;
				1618
				1619	finish:
				1620	validate_rbio_for_rmw(rbio);
				1621	return 0;
				1622	}
				1623
				1624	/*
				1625	* if the upper layers pass in a full stripe, we thank them by only allocating
				1626	* enough pages to hold the parity, and sending it all down quickly.
				1627	*/
				1628	static int full_stripe_write(struct btrfs_raid_bio *rbio)
				1629	{
				1630	int ret;
				1631
				1632	ret = alloc_rbio_parity_pages(rbio);
Miao Xie	3cd846d	2013-07-22 16:36:57 +0800	[diff] [blame]	1633	if (ret) {
				1634	__free_raid_bio(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1635	return ret;
Miao Xie	3cd846d	2013-07-22 16:36:57 +0800	[diff] [blame]	1636	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1637
				1638	ret = lock_stripe_add(rbio);
				1639	if (ret == 0)
				1640	finish_rmw(rbio);
				1641	return 0;
				1642	}
				1643
				1644	/*
				1645	* partial stripe writes get handed over to async helpers.
				1646	* We're really hoping to merge a few more writes into this
				1647	* rbio before calculating new parity
				1648	*/
				1649	static int partial_stripe_write(struct btrfs_raid_bio *rbio)
				1650	{
				1651	int ret;
				1652
				1653	ret = lock_stripe_add(rbio);
				1654	if (ret == 0)
				1655	async_rmw_stripe(rbio);
				1656	return 0;
				1657	}
				1658
				1659	/*
				1660	* sometimes while we were reading from the drive to
				1661	* recalculate parity, enough new bios come into create
				1662	* a full stripe. So we do a check here to see if we can
				1663	* go directly to finish_rmw
				1664	*/
				1665	static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
				1666	{
				1667	/* head off into rmw land if we don't have a full stripe */
				1668	if (!rbio_is_full(rbio))
				1669	return partial_stripe_write(rbio);
				1670	return full_stripe_write(rbio);
				1671	}
				1672
				1673	/*
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1674	* We use plugging call backs to collect full stripes.
				1675	* Any time we get a partial stripe write while plugged
				1676	* we collect it into a list. When the unplug comes down,
				1677	* we sort the list by logical block number and merge
				1678	* everything we can into the same rbios
				1679	*/
				1680	struct btrfs_plug_cb {
				1681	struct blk_plug_cb cb;
				1682	struct btrfs_fs_info *info;
				1683	struct list_head rbio_list;
				1684	struct btrfs_work work;
				1685	};
				1686
				1687	/*
				1688	* rbios on the plug list are sorted for easier merging.
				1689	*/
				1690	static int plug_cmp(void priv, struct list_head a, struct list_head *b)
				1691	{
				1692	struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
				1693	plug_list);
				1694	struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
				1695	plug_list);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1696	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
				1697	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1698
				1699	if (a_sector < b_sector)
				1700	return -1;
				1701	if (a_sector > b_sector)
				1702	return 1;
				1703	return 0;
				1704	}
				1705
				1706	static void run_plug(struct btrfs_plug_cb *plug)
				1707	{
				1708	struct btrfs_raid_bio *cur;
				1709	struct btrfs_raid_bio *last = NULL;
				1710
				1711	/*
				1712	* sort our plug list then try to merge
				1713	* everything we can in hopes of creating full
				1714	* stripes.
				1715	*/
				1716	list_sort(NULL, &plug->rbio_list, plug_cmp);
				1717	while (!list_empty(&plug->rbio_list)) {
				1718	cur = list_entry(plug->rbio_list.next,
				1719	struct btrfs_raid_bio, plug_list);
				1720	list_del_init(&cur->plug_list);
				1721
				1722	if (rbio_is_full(cur)) {
				1723	/* we have a full stripe, send it down */
				1724	full_stripe_write(cur);
				1725	continue;
				1726	}
				1727	if (last) {
				1728	if (rbio_can_merge(last, cur)) {
				1729	merge_rbio(last, cur);
				1730	__free_raid_bio(cur);
				1731	continue;
				1732
				1733	}
				1734	__raid56_parity_write(last);
				1735	}
				1736	last = cur;
				1737	}
				1738	if (last) {
				1739	__raid56_parity_write(last);
				1740	}
				1741	kfree(plug);
				1742	}
				1743
				1744	/*
				1745	* if the unplug comes from schedule, we have to push the
				1746	* work off to a helper thread
				1747	*/
				1748	static void unplug_work(struct btrfs_work *work)
				1749	{
				1750	struct btrfs_plug_cb *plug;
				1751	plug = container_of(work, struct btrfs_plug_cb, work);
				1752	run_plug(plug);
				1753	}
				1754
				1755	static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
				1756	{
				1757	struct btrfs_plug_cb *plug;
				1758	plug = container_of(cb, struct btrfs_plug_cb, cb);
				1759
				1760	if (from_schedule) {
Liu Bo	9e0af23	2014-08-15 23:36:53 +0800	[diff] [blame]	1761	btrfs_init_work(&plug->work, btrfs_rmw_helper,
				1762	unplug_work, NULL, NULL);
Qu Wenruo	d05a33a	2014-02-28 10:46:11 +0800	[diff] [blame]	1763	btrfs_queue_work(plug->info->rmw_workers,
				1764	&plug->work);
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1765	return;
				1766	}
				1767	run_plug(plug);
				1768	}
				1769
				1770	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1771	* our main entry point for writes from the rest of the FS.
				1772	*/
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	1773	int raid56_parity_write(struct btrfs_fs_info fs_info, struct bio bio,
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1774	struct btrfs_bio *bbio, u64 stripe_len)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1775	{
				1776	struct btrfs_raid_bio *rbio;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1777	struct btrfs_plug_cb *plug = NULL;
				1778	struct blk_plug_cb *cb;
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1779	int ret;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1780
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	1781	rbio = alloc_rbio(fs_info, bbio, stripe_len);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	1782	if (IS_ERR(rbio)) {
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	1783	btrfs_put_bbio(bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1784	return PTR_ERR(rbio);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	1785	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1786	bio_list_add(&rbio->bio_list, bio);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1787	rbio->bio_list_bytes = bio->bi_iter.bi_size;
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1788	rbio->operation = BTRFS_RBIO_WRITE;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1789
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1790	btrfs_bio_counter_inc_noblocked(fs_info);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1791	rbio->generic_bio_cnt = 1;
				1792
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1793	/*
				1794	* don't plug on full rbios, just get them out the door
				1795	* as quickly as we can
				1796	*/
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1797	if (rbio_is_full(rbio)) {
				1798	ret = full_stripe_write(rbio);
				1799	if (ret)
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1800	btrfs_bio_counter_dec(fs_info);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1801	return ret;
				1802	}
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1803
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1804	cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1805	if (cb) {
				1806	plug = container_of(cb, struct btrfs_plug_cb, cb);
				1807	if (!plug->info) {
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1808	plug->info = fs_info;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1809	INIT_LIST_HEAD(&plug->rbio_list);
				1810	}
				1811	list_add_tail(&rbio->plug_list, &plug->rbio_list);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1812	ret = 0;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1813	} else {
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1814	ret = __raid56_parity_write(rbio);
				1815	if (ret)
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1816	btrfs_bio_counter_dec(fs_info);
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1817	}
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1818	return ret;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1819	}
				1820
				1821	/*
				1822	* all parity reconstruction happens here. We've read in everything
				1823	* we can find from the drives and this does the heavy lifting of
				1824	* sorting the good from the bad.
				1825	*/
				1826	static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
				1827	{
				1828	int pagenr, stripe;
				1829	void **pointers;
				1830	int faila = -1, failb = -1;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1831	struct page *page;
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1832	blk_status_t err;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1833	int i;
				1834
David Sterba	31e818f	2015-02-20 18:00:26 +0100	[diff] [blame]	1835	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1836	if (!pointers) {
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1837	err = BLK_STS_RESOURCE;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1838	goto cleanup_io;
				1839	}
				1840
				1841	faila = rbio->faila;
				1842	failb = rbio->failb;
				1843
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	1844	if (rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				1845	rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1846	spin_lock_irq(&rbio->bio_list_lock);
				1847	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				1848	spin_unlock_irq(&rbio->bio_list_lock);
				1849	}
				1850
				1851	index_rbio_pages(rbio);
				1852
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1853	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	1854	/*
				1855	* Now we just use bitmap to mark the horizontal stripes in
				1856	* which we have data when doing parity scrub.
				1857	*/
				1858	if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
				1859	!test_bit(pagenr, rbio->dbitmap))
				1860	continue;
				1861
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1862	/* setup our array of pointers with pages
				1863	* from each stripe
				1864	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1865	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1866	/*
				1867	* if we're rebuilding a read, we have to use
				1868	* pages from the bio list
				1869	*/
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	1870	if ((rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				1871	rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1872	(stripe == faila \|\| stripe == failb)) {
				1873	page = page_in_rbio(rbio, stripe, pagenr, 0);
				1874	} else {
				1875	page = rbio_stripe_page(rbio, stripe, pagenr);
				1876	}
				1877	pointers[stripe] = kmap(page);
				1878	}
				1879
				1880	/* all raid6 handling here */
Zhao Lei	10f1190	2015-01-20 15:11:43 +0800	[diff] [blame]	1881	if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1882	/*
				1883	* single failure, rebuild from parity raid5
				1884	* style
				1885	*/
				1886	if (failb < 0) {
				1887	if (faila == rbio->nr_data) {
				1888	/*
				1889	* Just the P stripe has failed, without
				1890	* a bad data or Q stripe.
				1891	* TODO, we should redo the xor here.
				1892	*/
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1893	err = BLK_STS_IOERR;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1894	goto cleanup;
				1895	}
				1896	/*
				1897	* a single failure in raid6 is rebuilt
				1898	* in the pstripe code below
				1899	*/
				1900	goto pstripe;
				1901	}
				1902
				1903	/* make sure our ps and qs are in order */
				1904	if (faila > failb) {
				1905	int tmp = failb;
				1906	failb = faila;
				1907	faila = tmp;
				1908	}
				1909
				1910	/* if the q stripe is failed, do a pstripe reconstruction
				1911	* from the xors.
				1912	* If both the q stripe and the P stripe are failed, we're
				1913	* here due to a crc mismatch and we can't give them the
				1914	* data they want
				1915	*/
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1916	if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
				1917	if (rbio->bbio->raid_map[faila] ==
				1918	RAID5_P_STRIPE) {
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1919	err = BLK_STS_IOERR;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1920	goto cleanup;
				1921	}
				1922	/*
				1923	* otherwise we have one bad data stripe and
				1924	* a good P stripe. raid5!
				1925	*/
				1926	goto pstripe;
				1927	}
				1928
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1929	if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1930	raid6_datap_recov(rbio->real_stripes,
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1931	PAGE_SIZE, faila, pointers);
				1932	} else {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1933	raid6_2data_recov(rbio->real_stripes,
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1934	PAGE_SIZE, faila, failb,
				1935	pointers);
				1936	}
				1937	} else {
				1938	void *p;
				1939
				1940	/* rebuild from P stripe here (raid5 or raid6) */
				1941	BUG_ON(failb != -1);
				1942	pstripe:
				1943	/* Copy parity block into failed block to start with */
				1944	memcpy(pointers[faila],
				1945	pointers[rbio->nr_data],
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1946	PAGE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1947
				1948	/* rearrange the pointer array */
				1949	p = pointers[faila];
				1950	for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
				1951	pointers[stripe] = pointers[stripe + 1];
				1952	pointers[rbio->nr_data - 1] = p;
				1953
				1954	/* xor in the rest */
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1955	run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1956	}
				1957	/* if we're doing this rebuild as part of an rmw, go through
				1958	* and set all of our private rbio pages in the
				1959	* failed stripes as uptodate. This way finish_rmw will
				1960	* know they can be trusted. If this was a read reconstruction,
				1961	* other endio functions will fiddle the uptodate bits
				1962	*/
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1963	if (rbio->operation == BTRFS_RBIO_WRITE) {
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1964	for (i = 0; i < rbio->stripe_npages; i++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1965	if (faila != -1) {
				1966	page = rbio_stripe_page(rbio, faila, i);
				1967	SetPageUptodate(page);
				1968	}
				1969	if (failb != -1) {
				1970	page = rbio_stripe_page(rbio, failb, i);
				1971	SetPageUptodate(page);
				1972	}
				1973	}
				1974	}
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1975	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1976	/*
				1977	* if we're rebuilding a read, we have to use
				1978	* pages from the bio list
				1979	*/
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	1980	if ((rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				1981	rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1982	(stripe == faila \|\| stripe == failb)) {
				1983	page = page_in_rbio(rbio, stripe, pagenr, 0);
				1984	} else {
				1985	page = rbio_stripe_page(rbio, stripe, pagenr);
				1986	}
				1987	kunmap(page);
				1988	}
				1989	}
				1990
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1991	err = BLK_STS_OK;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1992	cleanup:
				1993	kfree(pointers);
				1994
				1995	cleanup_io:
Liu Bo	580c6ef	2018-03-22 09:20:11 +0800	[diff] [blame]	1996	/*
				1997	* Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
				1998	* valid rbio which is consistent with ondisk content, thus such a
				1999	* valid rbio can be cached to avoid further disk reads.
				2000	*/
				2001	if (rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				2002	rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
Liu Bo	44ac474	2018-01-12 18:07:02 -0700	[diff] [blame]	2003	/*
				2004	* - In case of two failures, where rbio->failb != -1:
				2005	*
				2006	* Do not cache this rbio since the above read reconstruction
				2007	* (raid6_datap_recov() or raid6_2data_recov()) may have
				2008	* changed some content of stripes which are not identical to
				2009	* on-disk content any more, otherwise, a later write/recover
				2010	* may steal stripe_pages from this rbio and end up with
				2011	* corruptions or rebuild failures.
				2012	*
				2013	* - In case of single failure, where rbio->failb == -1:
				2014	*
				2015	* Cache this rbio iff the above read reconstruction is
				2016	* excuted without problems.
				2017	*/
				2018	if (err == BLK_STS_OK && rbio->failb < 0)
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	2019	cache_rbio_pages(rbio);
				2020	else
				2021	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				2022
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	2023	rbio_orig_end_io(rbio, err);
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2024	} else if (err == BLK_STS_OK) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2025	rbio->faila = -1;
				2026	rbio->failb = -1;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2027
				2028	if (rbio->operation == BTRFS_RBIO_WRITE)
				2029	finish_rmw(rbio);
				2030	else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
				2031	finish_parity_scrub(rbio, 0);
				2032	else
				2033	BUG();
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2034	} else {
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	2035	rbio_orig_end_io(rbio, err);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2036	}
				2037	}
				2038
				2039	/*
				2040	* This is called only for stripes we've read from disk to
				2041	* reconstruct the parity.
				2042	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	2043	static void raid_recover_end_io(struct bio *bio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2044	{
				2045	struct btrfs_raid_bio *rbio = bio->bi_private;
				2046
				2047	/*
				2048	* we only read stripe pages off the disk, set them
				2049	* up to date if there were no errors
				2050	*/
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	2051	if (bio->bi_status)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2052	fail_bio_stripe(rbio, bio);
				2053	else
				2054	set_bio_pages_uptodate(bio);
				2055	bio_put(bio);
				2056
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2057	if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2058	return;
				2059
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2060	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2061	rbio_orig_end_io(rbio, BLK_STS_IOERR);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2062	else
				2063	__raid_recover_end_io(rbio);
				2064	}
				2065
				2066	/*
				2067	* reads everything we need off the disk to reconstruct
				2068	* the parity. endio handlers trigger final reconstruction
				2069	* when the IO is done.
				2070	*
				2071	* This is used both for reads from the higher layers and for
				2072	* parity construction required to finish a rmw cycle.
				2073	*/
				2074	static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
				2075	{
				2076	int bios_to_read = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2077	struct bio_list bio_list;
				2078	int ret;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2079	int pagenr;
				2080	int stripe;
				2081	struct bio *bio;
				2082
				2083	bio_list_init(&bio_list);
				2084
				2085	ret = alloc_rbio_pages(rbio);
				2086	if (ret)
				2087	goto cleanup;
				2088
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2089	atomic_set(&rbio->error, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2090
				2091	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	2092	* read everything that hasn't failed. Thanks to the
				2093	* stripe cache, it is possible that some or all of these
				2094	* pages are going to be uptodate.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2095	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2096	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
Liu Bo	5588383	2014-06-24 15:39:16 +0800	[diff] [blame]	2097	if (rbio->faila == stripe \|\| rbio->failb == stripe) {
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2098	atomic_inc(&rbio->error);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2099	continue;
Liu Bo	5588383	2014-06-24 15:39:16 +0800	[diff] [blame]	2100	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2101
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	2102	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2103	struct page *p;
				2104
				2105	/*
				2106	* the rmw code may have already read this
				2107	* page in
				2108	*/
				2109	p = rbio_stripe_page(rbio, stripe, pagenr);
				2110	if (PageUptodate(p))
				2111	continue;
				2112
				2113	ret = rbio_add_io_page(rbio, &bio_list,
				2114	rbio_stripe_page(rbio, stripe, pagenr),
				2115	stripe, pagenr, rbio->stripe_len);
				2116	if (ret < 0)
				2117	goto cleanup;
				2118	}
				2119	}
				2120
				2121	bios_to_read = bio_list_size(&bio_list);
				2122	if (!bios_to_read) {
				2123	/*
				2124	* we might have no bios to read just because the pages
				2125	* were up to date, or we might have no bios to read because
				2126	* the devices were gone.
				2127	*/
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2128	if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2129	__raid_recover_end_io(rbio);
				2130	goto out;
				2131	} else {
				2132	goto cleanup;
				2133	}
				2134	}
				2135
				2136	/*
				2137	* the bbio may be freed once we submit the last bio. Make sure
				2138	* not to touch it after that
				2139	*/
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2140	atomic_set(&rbio->stripes_pending, bios_to_read);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2141	while (1) {
				2142	bio = bio_list_pop(&bio_list);
				2143	if (!bio)
				2144	break;
				2145
				2146	bio->bi_private = rbio;
				2147	bio->bi_end_io = raid_recover_end_io;
Mike Christie	37226b2	2016-06-05 14:31:52 -0500	[diff] [blame]	2148	bio_set_op_attrs(bio, REQ_OP_READ, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2149
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2150	btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2151
Mike Christie	4e49ea4	2016-06-05 14:31:41 -0500	[diff] [blame]	2152	submit_bio(bio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2153	}
				2154	out:
				2155	return 0;
				2156
				2157	cleanup:
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2158	if (rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				2159	rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2160	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	2161
				2162	while ((bio = bio_list_pop(&bio_list)))
				2163	bio_put(bio);
				2164
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2165	return -EIO;
				2166	}
				2167
				2168	/*
				2169	* the main entry point for reads from the higher layers. This
				2170	* is really only called when the normal read path had a failure,
				2171	* so we assume the bio they send down corresponds to a failed part
				2172	* of the drive.
				2173	*/
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2174	int raid56_parity_recover(struct btrfs_fs_info fs_info, struct bio bio,
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2175	struct btrfs_bio *bbio, u64 stripe_len,
				2176	int mirror_num, int generic_io)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2177	{
				2178	struct btrfs_raid_bio *rbio;
				2179	int ret;
				2180
Liu Bo	abad60c	2017-03-29 10:54:26 -0700	[diff] [blame]	2181	if (generic_io) {
				2182	ASSERT(bbio->mirror_num == mirror_num);
				2183	btrfs_io_bio(bio)->mirror_num = mirror_num;
				2184	}
				2185
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2186	rbio = alloc_rbio(fs_info, bbio, stripe_len);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	2187	if (IS_ERR(rbio)) {
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	2188	if (generic_io)
				2189	btrfs_put_bbio(bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2190	return PTR_ERR(rbio);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	2191	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2192
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	2193	rbio->operation = BTRFS_RBIO_READ_REBUILD;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2194	bio_list_add(&rbio->bio_list, bio);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	2195	rbio->bio_list_bytes = bio->bi_iter.bi_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2196
				2197	rbio->faila = find_logical_bio_stripe(rbio, bio);
				2198	if (rbio->faila == -1) {
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2199	btrfs_warn(fs_info,
Liu Bo	e46a28c	2016-07-29 10:57:55 -0700	[diff] [blame]	2200	"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)",
				2201	__func__, (u64)bio->bi_iter.bi_sector << 9,
				2202	(u64)bio->bi_iter.bi_size, bbio->map_type);
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	2203	if (generic_io)
				2204	btrfs_put_bbio(bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2205	kfree(rbio);
				2206	return -EIO;
				2207	}
				2208
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	2209	if (generic_io) {
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2210	btrfs_bio_counter_inc_noblocked(fs_info);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	2211	rbio->generic_bio_cnt = 1;
				2212	} else {
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	2213	btrfs_get_bbio(bbio);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	2214	}
				2215
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2216	/*
Liu Bo	8810f75	2018-01-02 13:36:41 -0700	[diff] [blame]	2217	* Loop retry:
				2218	* for 'mirror == 2', reconstruct from all other stripes.
				2219	* for 'mirror_num > 2', select a stripe to fail on every retry.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2220	*/
Liu Bo	8810f75	2018-01-02 13:36:41 -0700	[diff] [blame]	2221	if (mirror_num > 2) {
				2222	/*
				2223	* 'mirror == 3' is to fail the p stripe and
				2224	* reconstruct from the q stripe. 'mirror > 3' is to
				2225	* fail a data stripe and reconstruct from p+q stripe.
				2226	*/
				2227	rbio->failb = rbio->real_stripes - (mirror_num - 1);
				2228	ASSERT(rbio->failb > 0);
				2229	if (rbio->failb <= rbio->faila)
				2230	rbio->failb--;
				2231	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2232
				2233	ret = lock_stripe_add(rbio);
				2234
				2235	/*
				2236	* __raid56_parity_recover will end the bio with
				2237	* any errors it hits. We don't want to return
				2238	* its error value up the stack because our caller
				2239	* will end up calling bio_endio with any nonzero
				2240	* return
				2241	*/
				2242	if (ret == 0)
				2243	__raid56_parity_recover(rbio);
				2244	/*
				2245	* our rbio has been added to the list of
				2246	* rbios that will be handled after the
				2247	* currently lock owner is done
				2248	*/
				2249	return 0;
				2250
				2251	}
				2252
				2253	static void rmw_work(struct btrfs_work *work)
				2254	{
				2255	struct btrfs_raid_bio *rbio;
				2256
				2257	rbio = container_of(work, struct btrfs_raid_bio, work);
				2258	raid56_rmw_stripe(rbio);
				2259	}
				2260
				2261	static void read_rebuild_work(struct btrfs_work *work)
				2262	{
				2263	struct btrfs_raid_bio *rbio;
				2264
				2265	rbio = container_of(work, struct btrfs_raid_bio, work);
				2266	__raid56_parity_recover(rbio);
				2267	}
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2268
				2269	/*
				2270	* The following code is used to scrub/replace the parity stripe
				2271	*
Qu Wenruo	ae6529c	2017-03-29 09:33:21 +0800	[diff] [blame]	2272	* Caller must have already increased bio_counter for getting @bbio.
				2273	*
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2274	* Note: We need make sure all the pages that add into the scrub/replace
				2275	* raid bio are correct and not be changed during the scrub/replace. That
				2276	* is those pages just hold metadata or file data with checksum.
				2277	*/
				2278
				2279	struct btrfs_raid_bio *
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2280	raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info fs_info, struct bio bio,
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2281	struct btrfs_bio *bbio, u64 stripe_len,
				2282	struct btrfs_device *scrub_dev,
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2283	unsigned long *dbitmap, int stripe_nsectors)
				2284	{
				2285	struct btrfs_raid_bio *rbio;
				2286	int i;
				2287
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2288	rbio = alloc_rbio(fs_info, bbio, stripe_len);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2289	if (IS_ERR(rbio))
				2290	return NULL;
				2291	bio_list_add(&rbio->bio_list, bio);
				2292	/*
				2293	* This is a special bio which is used to hold the completion handler
				2294	* and make the scrub rbio is similar to the other types
				2295	*/
				2296	ASSERT(!bio->bi_iter.bi_size);
				2297	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
				2298
Liu Bo	9cd3a7e	2017-08-03 13:53:31 -0600	[diff] [blame]	2299	/*
				2300	* After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted
				2301	* to the end position, so this search can start from the first parity
				2302	* stripe.
				2303	*/
				2304	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2305	if (bbio->stripes[i].dev == scrub_dev) {
				2306	rbio->scrubp = i;
				2307	break;
				2308	}
				2309	}
Liu Bo	9cd3a7e	2017-08-03 13:53:31 -0600	[diff] [blame]	2310	ASSERT(i < rbio->real_stripes);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2311
				2312	/* Now we just support the sectorsize equals to page size */
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2313	ASSERT(fs_info->sectorsize == PAGE_SIZE);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2314	ASSERT(rbio->stripe_npages == stripe_nsectors);
				2315	bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
				2316
Qu Wenruo	ae6529c	2017-03-29 09:33:21 +0800	[diff] [blame]	2317	/*
				2318	* We have already increased bio_counter when getting bbio, record it
				2319	* so we can free it at rbio_orig_end_io().
				2320	*/
				2321	rbio->generic_bio_cnt = 1;
				2322
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2323	return rbio;
				2324	}
				2325
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2326	/* Used for both parity scrub and missing. */
				2327	void raid56_add_scrub_pages(struct btrfs_raid_bio rbio, struct page page,
				2328	u64 logical)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2329	{
				2330	int stripe_offset;
				2331	int index;
				2332
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2333	ASSERT(logical >= rbio->bbio->raid_map[0]);
				2334	ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2335	rbio->stripe_len * rbio->nr_data);
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2336	stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	2337	index = stripe_offset >> PAGE_SHIFT;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2338	rbio->bio_pages[index] = page;
				2339	}
				2340
				2341	/*
				2342	* We just scrub the parity that we have correct data on the same horizontal,
				2343	* so we needn't allocate all pages for all the stripes.
				2344	*/
				2345	static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
				2346	{
				2347	int i;
				2348	int bit;
				2349	int index;
				2350	struct page *page;
				2351
				2352	for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2353	for (i = 0; i < rbio->real_stripes; i++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2354	index = i * rbio->stripe_npages + bit;
				2355	if (rbio->stripe_pages[index])
				2356	continue;
				2357
				2358	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				2359	if (!page)
				2360	return -ENOMEM;
				2361	rbio->stripe_pages[index] = page;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2362	}
				2363	}
				2364	return 0;
				2365	}
				2366
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2367	static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
				2368	int need_check)
				2369	{
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2370	struct btrfs_bio *bbio = rbio->bbio;
Kees Cook	1389053	2018-05-29 16:44:59 -0700	[diff] [blame^]	2371	void **pointers = rbio->finish_pointers;
				2372	unsigned long *pbitmap = rbio->finish_pbitmap;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2373	int nr_data = rbio->nr_data;
				2374	int stripe;
				2375	int pagenr;
				2376	int p_stripe = -1;
				2377	int q_stripe = -1;
				2378	struct page *p_page = NULL;
				2379	struct page *q_page = NULL;
				2380	struct bio_list bio_list;
				2381	struct bio *bio;
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2382	int is_replace = 0;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2383	int ret;
				2384
				2385	bio_list_init(&bio_list);
				2386
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2387	if (rbio->real_stripes - rbio->nr_data == 1) {
				2388	p_stripe = rbio->real_stripes - 1;
				2389	} else if (rbio->real_stripes - rbio->nr_data == 2) {
				2390	p_stripe = rbio->real_stripes - 2;
				2391	q_stripe = rbio->real_stripes - 1;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2392	} else {
				2393	BUG();
				2394	}
				2395
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2396	if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
				2397	is_replace = 1;
				2398	bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
				2399	}
				2400
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2401	/*
				2402	* Because the higher layers(scrubber) are unlikely to
				2403	* use this area of the disk again soon, so don't cache
				2404	* it.
				2405	*/
				2406	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				2407
				2408	if (!need_check)
				2409	goto writeback;
				2410
				2411	p_page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				2412	if (!p_page)
				2413	goto cleanup;
				2414	SetPageUptodate(p_page);
				2415
				2416	if (q_stripe != -1) {
				2417	q_page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				2418	if (!q_page) {
				2419	__free_page(p_page);
				2420	goto cleanup;
				2421	}
				2422	SetPageUptodate(q_page);
				2423	}
				2424
				2425	atomic_set(&rbio->error, 0);
				2426
				2427	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
				2428	struct page *p;
				2429	void *parity;
				2430	/* first collect one page from each data stripe */
				2431	for (stripe = 0; stripe < nr_data; stripe++) {
				2432	p = page_in_rbio(rbio, stripe, pagenr, 0);
				2433	pointers[stripe] = kmap(p);
				2434	}
				2435
				2436	/* then add the parity stripe */
				2437	pointers[stripe++] = kmap(p_page);
				2438
				2439	if (q_stripe != -1) {
				2440
				2441	/*
				2442	* raid6, add the qstripe and call the
				2443	* library function to fill in our p/q
				2444	*/
				2445	pointers[stripe++] = kmap(q_page);
				2446
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2447	raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2448	pointers);
				2449	} else {
				2450	/* raid5 */
				2451	memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	2452	run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2453	}
				2454
Nicholas D Steeves	0132761	2016-05-19 21:18:45 -0400	[diff] [blame]	2455	/* Check scrubbing parity and repair it */
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2456	p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
				2457	parity = kmap(p);
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	2458	if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
				2459	memcpy(parity, pointers[rbio->scrubp], PAGE_SIZE);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2460	else
				2461	/* Parity is right, needn't writeback */
				2462	bitmap_clear(rbio->dbitmap, pagenr, 1);
				2463	kunmap(p);
				2464
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2465	for (stripe = 0; stripe < rbio->real_stripes; stripe++)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2466	kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
				2467	}
				2468
				2469	__free_page(p_page);
				2470	if (q_page)
				2471	__free_page(q_page);
				2472
				2473	writeback:
				2474	/*
				2475	* time to start writing. Make bios for everything from the
				2476	* higher layers (the bio_list in our rbio) and our p/q. Ignore
				2477	* everything else.
				2478	*/
				2479	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
				2480	struct page *page;
				2481
				2482	page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
				2483	ret = rbio_add_io_page(rbio, &bio_list,
				2484	page, rbio->scrubp, pagenr, rbio->stripe_len);
				2485	if (ret)
				2486	goto cleanup;
				2487	}
				2488
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2489	if (!is_replace)
				2490	goto submit_write;
				2491
				2492	for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
				2493	struct page *page;
				2494
				2495	page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
				2496	ret = rbio_add_io_page(rbio, &bio_list, page,
				2497	bbio->tgtdev_map[rbio->scrubp],
				2498	pagenr, rbio->stripe_len);
				2499	if (ret)
				2500	goto cleanup;
				2501	}
				2502
				2503	submit_write:
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2504	nr_data = bio_list_size(&bio_list);
				2505	if (!nr_data) {
				2506	/* Every parity is right */
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2507	rbio_orig_end_io(rbio, BLK_STS_OK);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2508	return;
				2509	}
				2510
				2511	atomic_set(&rbio->stripes_pending, nr_data);
				2512
				2513	while (1) {
				2514	bio = bio_list_pop(&bio_list);
				2515	if (!bio)
				2516	break;
				2517
				2518	bio->bi_private = rbio;
Zhao Lei	a6111d1	2016-01-12 17:52:13 +0800	[diff] [blame]	2519	bio->bi_end_io = raid_write_end_io;
Mike Christie	37226b2	2016-06-05 14:31:52 -0500	[diff] [blame]	2520	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
Mike Christie	4e49ea4	2016-06-05 14:31:41 -0500	[diff] [blame]	2521
				2522	submit_bio(bio);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2523	}
				2524	return;
				2525
				2526	cleanup:
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2527	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	2528
				2529	while ((bio = bio_list_pop(&bio_list)))
				2530	bio_put(bio);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2531	}
				2532
				2533	static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
				2534	{
				2535	if (stripe >= 0 && stripe < rbio->nr_data)
				2536	return 1;
				2537	return 0;
				2538	}
				2539
				2540	/*
				2541	* While we're doing the parity check and repair, we could have errors
				2542	* in reading pages off the disk. This checks for errors and if we're
				2543	* not able to read the page it'll trigger parity reconstruction. The
				2544	* parity scrub will be finished after we've reconstructed the failed
				2545	* stripes
				2546	*/
				2547	static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
				2548	{
				2549	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
				2550	goto cleanup;
				2551
				2552	if (rbio->faila >= 0 \|\| rbio->failb >= 0) {
				2553	int dfail = 0, failp = -1;
				2554
				2555	if (is_data_stripe(rbio, rbio->faila))
				2556	dfail++;
				2557	else if (is_parity_stripe(rbio->faila))
				2558	failp = rbio->faila;
				2559
				2560	if (is_data_stripe(rbio, rbio->failb))
				2561	dfail++;
				2562	else if (is_parity_stripe(rbio->failb))
				2563	failp = rbio->failb;
				2564
				2565	/*
				2566	* Because we can not use a scrubbing parity to repair
				2567	* the data, so the capability of the repair is declined.
				2568	* (In the case of RAID5, we can not repair anything)
				2569	*/
				2570	if (dfail > rbio->bbio->max_errors - 1)
				2571	goto cleanup;
				2572
				2573	/*
				2574	* If all data is good, only parity is correctly, just
				2575	* repair the parity.
				2576	*/
				2577	if (dfail == 0) {
				2578	finish_parity_scrub(rbio, 0);
				2579	return;
				2580	}
				2581
				2582	/*
				2583	* Here means we got one corrupted data stripe and one
				2584	* corrupted parity on RAID6, if the corrupted parity
Nicholas D Steeves	0132761	2016-05-19 21:18:45 -0400	[diff] [blame]	2585	* is scrubbing parity, luckily, use the other one to repair
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2586	* the data, or we can not repair the data stripe.
				2587	*/
				2588	if (failp != rbio->scrubp)
				2589	goto cleanup;
				2590
				2591	__raid_recover_end_io(rbio);
				2592	} else {
				2593	finish_parity_scrub(rbio, 1);
				2594	}
				2595	return;
				2596
				2597	cleanup:
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2598	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2599	}
				2600
				2601	/*
				2602	* end io for the read phase of the rmw cycle. All the bios here are physical
				2603	* stripe bios we've read from the disk so we can recalculate the parity of the
				2604	* stripe.
				2605	*
				2606	* This will usually kick off finish_rmw once all the bios are read in, but it
				2607	* may trigger parity reconstruction if we had any errors along the way
				2608	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	2609	static void raid56_parity_scrub_end_io(struct bio *bio)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2610	{
				2611	struct btrfs_raid_bio *rbio = bio->bi_private;
				2612
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	2613	if (bio->bi_status)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2614	fail_bio_stripe(rbio, bio);
				2615	else
				2616	set_bio_pages_uptodate(bio);
				2617
				2618	bio_put(bio);
				2619
				2620	if (!atomic_dec_and_test(&rbio->stripes_pending))
				2621	return;
				2622
				2623	/*
				2624	* this will normally call finish_rmw to start our write
				2625	* but if there are any failed stripes we'll reconstruct
				2626	* from parity first
				2627	*/
				2628	validate_rbio_for_parity_scrub(rbio);
				2629	}
				2630
				2631	static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
				2632	{
				2633	int bios_to_read = 0;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2634	struct bio_list bio_list;
				2635	int ret;
				2636	int pagenr;
				2637	int stripe;
				2638	struct bio *bio;
				2639
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	2640	bio_list_init(&bio_list);
				2641
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2642	ret = alloc_rbio_essential_pages(rbio);
				2643	if (ret)
				2644	goto cleanup;
				2645
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2646	atomic_set(&rbio->error, 0);
				2647	/*
				2648	* build a list of bios to read all the missing parts of this
				2649	* stripe
				2650	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2651	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2652	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
				2653	struct page *page;
				2654	/*
				2655	* we want to find all the pages missing from
				2656	* the rbio and read them from the disk. If
				2657	* page_in_rbio finds a page in the bio list
				2658	* we don't need to read it off the stripe.
				2659	*/
				2660	page = page_in_rbio(rbio, stripe, pagenr, 1);
				2661	if (page)
				2662	continue;
				2663
				2664	page = rbio_stripe_page(rbio, stripe, pagenr);
				2665	/*
				2666	* the bio cache may have handed us an uptodate
				2667	* page. If so, be happy and use it
				2668	*/
				2669	if (PageUptodate(page))
				2670	continue;
				2671
				2672	ret = rbio_add_io_page(rbio, &bio_list, page,
				2673	stripe, pagenr, rbio->stripe_len);
				2674	if (ret)
				2675	goto cleanup;
				2676	}
				2677	}
				2678
				2679	bios_to_read = bio_list_size(&bio_list);
				2680	if (!bios_to_read) {
				2681	/*
				2682	* this can happen if others have merged with
				2683	* us, it means there is nothing left to read.
				2684	* But if there are missing devices it may not be
				2685	* safe to do the full stripe write yet.
				2686	*/
				2687	goto finish;
				2688	}
				2689
				2690	/*
				2691	* the bbio may be freed once we submit the last bio. Make sure
				2692	* not to touch it after that
				2693	*/
				2694	atomic_set(&rbio->stripes_pending, bios_to_read);
				2695	while (1) {
				2696	bio = bio_list_pop(&bio_list);
				2697	if (!bio)
				2698	break;
				2699
				2700	bio->bi_private = rbio;
				2701	bio->bi_end_io = raid56_parity_scrub_end_io;
Mike Christie	37226b2	2016-06-05 14:31:52 -0500	[diff] [blame]	2702	bio_set_op_attrs(bio, REQ_OP_READ, 0);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2703
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2704	btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2705
Mike Christie	4e49ea4	2016-06-05 14:31:41 -0500	[diff] [blame]	2706	submit_bio(bio);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2707	}
				2708	/* the actual write will happen once the reads are done */
				2709	return;
				2710
				2711	cleanup:
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2712	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	2713
				2714	while ((bio = bio_list_pop(&bio_list)))
				2715	bio_put(bio);
				2716
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2717	return;
				2718
				2719	finish:
				2720	validate_rbio_for_parity_scrub(rbio);
				2721	}
				2722
				2723	static void scrub_parity_work(struct btrfs_work *work)
				2724	{
				2725	struct btrfs_raid_bio *rbio;
				2726
				2727	rbio = container_of(work, struct btrfs_raid_bio, work);
				2728	raid56_parity_scrub_stripe(rbio);
				2729	}
				2730
				2731	static void async_scrub_parity(struct btrfs_raid_bio *rbio)
				2732	{
				2733	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
				2734	scrub_parity_work, NULL, NULL);
				2735
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2736	btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2737	}
				2738
				2739	void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
				2740	{
				2741	if (!lock_stripe_add(rbio))
				2742	async_scrub_parity(rbio);
				2743	}
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2744
				2745	/* The following code is used for dev replace of a missing RAID 5/6 device. */
				2746
				2747	struct btrfs_raid_bio *
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2748	raid56_alloc_missing_rbio(struct btrfs_fs_info fs_info, struct bio bio,
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2749	struct btrfs_bio *bbio, u64 length)
				2750	{
				2751	struct btrfs_raid_bio *rbio;
				2752
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2753	rbio = alloc_rbio(fs_info, bbio, length);
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2754	if (IS_ERR(rbio))
				2755	return NULL;
				2756
				2757	rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
				2758	bio_list_add(&rbio->bio_list, bio);
				2759	/*
				2760	* This is a special bio which is used to hold the completion handler
				2761	* and make the scrub rbio is similar to the other types
				2762	*/
				2763	ASSERT(!bio->bi_iter.bi_size);
				2764
				2765	rbio->faila = find_logical_bio_stripe(rbio, bio);
				2766	if (rbio->faila == -1) {
				2767	BUG();
				2768	kfree(rbio);
				2769	return NULL;
				2770	}
				2771
Qu Wenruo	ae6529c	2017-03-29 09:33:21 +0800	[diff] [blame]	2772	/*
				2773	* When we get bbio, we have already increased bio_counter, record it
				2774	* so we can free it at rbio_orig_end_io()
				2775	*/
				2776	rbio->generic_bio_cnt = 1;
				2777
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2778	return rbio;
				2779	}
				2780
/*
 * Submit a missing-device rebuild rbio.  The rebuild is queued only when
 * lock_stripe_add() returns 0; on a non-zero return nothing more is done
 * here.
 */
void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
{
	if (lock_stripe_add(rbio) == 0)
		async_read_rebuild(rbio);
}