// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2012 Fusion-io All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"

/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT	1

/*
 * set when this rbio is sitting in the hash, but it is just a cache
 * of past RMW
 */
#define RBIO_CACHE_BIT		2

/*
 * set when it is safe to trust the stripe_pages for caching
 */
#define RBIO_CACHE_READY_BIT	3

#define RBIO_CACHE_SIZE 1024

enum btrfs_rbio_ops {
	BTRFS_RBIO_WRITE,
	BTRFS_RBIO_READ_REBUILD,
	BTRFS_RBIO_PARITY_SCRUB,
	BTRFS_RBIO_REBUILD_MISSING,
};

struct btrfs_raid_bio {
	struct btrfs_fs_info *fs_info;
	struct btrfs_bio *bbio;

	/* while we're doing rmw on a stripe
	 * we put it into a hash table so we can
	 * lock the stripe and merge more rbios
	 * into it.
	 */
	struct list_head hash_list;

	/*
	 * LRU list for the stripe cache
	 */
	struct list_head stripe_cache;

	/*
	 * for scheduling work in the helper threads
	 */
	struct btrfs_work work;

	/*
	 * bio list and bio_list_lock are used
	 * to add more bios into the stripe
	 * in hopes of avoiding the full rmw
	 */
	struct bio_list bio_list;
	spinlock_t bio_list_lock;

	/* also protected by the bio_list_lock, the
	 * plug list is used by the plugging code
	 * to collect partial bios while plugged. The
	 * stripe locking code also uses it to hand off
	 * the stripe lock to the next pending IO
	 */
	struct list_head plug_list;

	/*
	 * flags that tell us if it is safe to
	 * merge with this bio
	 */
	unsigned long flags;

	/* size of each individual stripe on disk */
	int stripe_len;

	/* number of data stripes (no p/q) */
	int nr_data;

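	/* number of stripes on disk, including P/Q but excluding any
	 * dev-replace target copies
	 */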
	int real_stripes;

	int stripe_npages;
	/*
	 * set if we're doing a parity rebuild
	 * for a read from higher up, which is handled
	 * differently from a parity rebuild as part of
	 * rmw
	 */
	enum btrfs_rbio_ops operation;

	/* first bad stripe */
	int faila;

	/* second bad stripe (for raid6 use) */
	int failb;

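	/* stripe number being verified by a parity scrub */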
	int scrubp;
	/*
	 * number of pages needed to represent the full
	 * stripe
	 */
	int nr_pages;

	/*
	 * size of all the bios in the bio_list. This
	 * helps us decide if the rbio maps to a full
	 * stripe or not
	 */
	int bio_list_bytes;

	int generic_bio_cnt;

	refcount_t refs;

	atomic_t stripes_pending;

	atomic_t error;
	/*
	 * these are two arrays of pointers. We allocate the
	 * rbio big enough to hold them both and setup their
	 * locations when the rbio is allocated
	 */

	/* pointers to pages that we allocated for
	 * reading/writing stripes directly from the disk (including P/Q)
	 */
	struct page **stripe_pages;

	/*
	 * pointers to the pages in the bio_list. Stored
	 * here for faster lookup
	 */
	struct page **bio_pages;

	/*
	 * bitmap to record which horizontal stripe has data
	 */
	unsigned long *dbitmap;

	/* allocated with real_stripes-many pointers for finish_*() calls */
	void **finish_pointers;

	/* allocated with stripe_npages-many bits for finish_*() calls */
	unsigned long *finish_pbitmap;
};

static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
static void rmw_work(struct btrfs_work *work);
static void read_rebuild_work(struct btrfs_work *work);
static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
static void __free_raid_bio(struct btrfs_raid_bio *rbio);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
					 int need_check);
static void scrub_parity_work(struct btrfs_work *work);

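/*
 * queue an rbio for deferred processing on the dedicated rmw worker threads
 */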
static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
{
	btrfs_init_work(&rbio->work, btrfs_rmw_helper, work_func, NULL, NULL);
	btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
}

/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash_table *x;
	struct btrfs_stripe_hash *cur;
	struct btrfs_stripe_hash *h;
	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
	int i;
	int table_size;

	if (info->stripe_hash_table)
		return 0;

	/*
	 * The table is large, starting with order 4 and can go as high as
	 * order 7 in case lock debugging is turned on.
	 *
	 * Try harder to allocate and fallback to vmalloc to lower the chance
	 * of a failing mount.
	 */
	table_size = sizeof(*table) + sizeof(*h) * num_entries;
	table = kvzalloc(table_size, GFP_KERNEL);
	if (!table)
		return -ENOMEM;

	spin_lock_init(&table->cache_lock);
	INIT_LIST_HEAD(&table->stripe_cache);

	h = table->table;

	for (i = 0; i < num_entries; i++) {
		cur = h + i;
		INIT_LIST_HEAD(&cur->hash_list);
		spin_lock_init(&cur->lock);
	}

	x = cmpxchg(&info->stripe_hash_table, NULL, table);
	if (x)
		kvfree(x);
	return 0;
}

/*
 * caching an rbio means to copy anything from the
 * bio_pages array into the stripe_pages array. We
 * use the page uptodate bit in the stripe cache array
 * to indicate if it has valid data
 *
 * once the caching is done, we set the cache ready
 * bit.
 */
static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	char *s;
	char *d;
	int ret;

	ret = alloc_rbio_pages(rbio);
	if (ret)
		return;

	for (i = 0; i < rbio->nr_pages; i++) {
		if (!rbio->bio_pages[i])
			continue;

		s = kmap(rbio->bio_pages[i]);
		d = kmap(rbio->stripe_pages[i]);

		copy_page(d, s);

		kunmap(rbio->bio_pages[i]);
		kunmap(rbio->stripe_pages[i]);
		SetPageUptodate(rbio->stripe_pages[i]);
	}
	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}

/*
 * we hash on the first logical address of the stripe
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
	u64 num = rbio->bbio->raid_map[0];

	/*
	 * we shift down quite a bit. We're using byte
	 * addressing, and most of the lower bits are zeros.
	 * This tends to upset hash_64, and it consistently
	 * returns just one or two different values.
	 *
	 * shifting off the lower bits fixes things.
	 */
	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}

/*
 * stealing an rbio means taking all the uptodate pages from the stripe
 * array in the source rbio and putting them into the destination rbio
 */
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
	int i;
	struct page *s;
	struct page *d;

	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
		return;

	for (i = 0; i < dest->nr_pages; i++) {
		s = src->stripe_pages[i];
		if (!s || !PageUptodate(s)) {
			continue;
		}

		d = dest->stripe_pages[i];
		if (d)
			__free_page(d);

		dest->stripe_pages[i] = s;
		src->stripe_pages[i] = NULL;
	}
}

/*
 * merging means we take the bio_list from the victim and
 * splice it into the destination. The victim should
 * be discarded afterwards.
 *
 * must be called with dest->rbio_list_lock held
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
{
	bio_list_merge(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;
	dest->generic_bio_cnt += victim->generic_bio_cnt;
	bio_list_init(&victim->bio_list);
}

/*
 * used to prune items that are in the cache. The caller
 * must hold the hash table lock.
 */
static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash *h;
	int freeit = 0;

	/*
	 * check the bit again under the hash table lock.
	 */
	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->fs_info->stripe_hash_table;
	h = table->table + bucket;

	/* hold the lock for the bucket because we may be
	 * removing it from the hash table
	 */
	spin_lock(&h->lock);

	/*
	 * hold the lock for the bio list because we need
	 * to make sure the bio list is empty
	 */
	spin_lock(&rbio->bio_list_lock);

	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
		list_del_init(&rbio->stripe_cache);
		table->cache_size -= 1;
		freeit = 1;

		/* if the bio list isn't empty, this rbio is
		 * still involved in an IO. We take it out
		 * of the cache list, and drop the ref that
		 * was held for the list.
		 *
		 * If the bio_list was empty, we also remove
		 * the rbio from the hash_table, and drop
		 * the corresponding ref
		 */
		if (bio_list_empty(&rbio->bio_list)) {
			if (!list_empty(&rbio->hash_list)) {
				list_del_init(&rbio->hash_list);
				refcount_dec(&rbio->refs);
				BUG_ON(!list_empty(&rbio->plug_list));
			}
		}
	}

	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

	if (freeit)
		__free_raid_bio(rbio);
}

/*
 * prune a given rbio from the cache
 */
static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;

	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->fs_info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	__remove_rbio_from_cache(rbio);
	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * remove everything in the cache
 */
static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;
	struct btrfs_raid_bio *rbio;

	table = info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	while (!list_empty(&table->stripe_cache)) {
		rbio = list_entry(table->stripe_cache.next,
				  struct btrfs_raid_bio,
				  stripe_cache);
		__remove_rbio_from_cache(rbio);
	}
	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * remove all cached entries and free the hash table
 * used by unmount
 */
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
{
	if (!info->stripe_hash_table)
		return;
	btrfs_clear_rbio_cache(info);
	kvfree(info->stripe_hash_table);
	info->stripe_hash_table = NULL;
}

/*
 * insert an rbio into the stripe cache. It
 * must have already been prepared by calling
 * cache_rbio_pages
 *
 * If this rbio was already cached, it gets
 * moved to the front of the lru.
 *
 * If the size of the rbio cache is too big, we
 * prune an item.
 */
static void cache_rbio(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;

	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
		return;

	table = rbio->fs_info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	spin_lock(&rbio->bio_list_lock);

	/* bump our ref if we were not in the list before */
	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
		refcount_inc(&rbio->refs);

	if (!list_empty(&rbio->stripe_cache)) {
		list_move(&rbio->stripe_cache, &table->stripe_cache);
	} else {
		list_add(&rbio->stripe_cache, &table->stripe_cache);
		table->cache_size += 1;
	}

	spin_unlock(&rbio->bio_list_lock);

	if (table->cache_size > RBIO_CACHE_SIZE) {
		struct btrfs_raid_bio *found;

		found = list_entry(table->stripe_cache.prev,
				   struct btrfs_raid_bio,
				   stripe_cache);

		if (found != rbio)
			__remove_rbio_from_cache(found);
	}

	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * helper function to run the xor_blocks api. It is only
 * able to do MAX_XOR_BLOCKS at a time, so we need to
 * loop through.
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	int xor_src_cnt = 0;
	void *dest = pages[src_cnt];

	while (src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}

/*
 * Returns true if the bio list inside this rbio covers an entire stripe (no
 * rmw required).
 */
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long flags;
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);
	if (size != rbio->nr_data * rbio->stripe_len)
		ret = 0;
	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);

	return ret;
}

/*
 * returns 1 if it is safe to merge two rbios together.
 * The merging is safe if the two rbios correspond to
 * the same stripe and if they are both going in the same
 * direction (read vs write), and if neither one is
 * locked for final IO
 *
 * The caller is responsible for locking such that
 * rmw_locked is safe to test
 */
static int rbio_can_merge(struct btrfs_raid_bio *last,
			  struct btrfs_raid_bio *cur)
{
	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
		return 0;

	/*
	 * we can't merge with cached rbios, since the
	 * idea is that when we merge the destination
	 * rbio is going to run our IO for us. We can
	 * steal from cached rbios though, other functions
	 * handle that.
	 */
	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
	    test_bit(RBIO_CACHE_BIT, &cur->flags))
		return 0;

	if (last->bbio->raid_map[0] !=
	    cur->bbio->raid_map[0])
		return 0;

	/* we can't merge with different operations */
	if (last->operation != cur->operation)
		return 0;
	/*
	 * We need to read the full stripe from the drive, check and
	 * repair the parity and write the new results.
	 *
	 * We're not allowed to add any new bios to the bio list here,
	 * anyone else that wants to change this stripe needs to do
	 * their own rmw.
	 */
	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
		return 0;

	if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
		return 0;

	if (last->operation == BTRFS_RBIO_READ_REBUILD) {
		int fa = last->faila;
		int fb = last->failb;
		int cur_fa = cur->faila;
		int cur_fb = cur->failb;

		if (last->faila >= last->failb) {
			fa = last->failb;
			fb = last->faila;
		}

		if (cur->faila >= cur->failb) {
			cur_fa = cur->failb;
			cur_fb = cur->faila;
		}

		if (fa != cur_fa || fb != cur_fb)
			return 0;
	}
	return 1;
}

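/*
 * index into the flat stripe_pages array for a given stripe number and
 * page offset within that stripe
 */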
static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
				  int index)
{
	return stripe * rbio->stripe_npages + index;
}

/*
 * these are just the pages from the rbio array, not from anything
 * the FS sent down to us
 */
static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe,
				     int index)
{
	return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
}

/*
 * helper to index into the pstripe
 */
static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
{
	return rbio_stripe_page(rbio, rbio->nr_data, index);
}

/*
 * helper to index into the qstripe, returns null
 * if there is no qstripe
 */
static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
{
	if (rbio->nr_data + 1 == rbio->real_stripes)
		return NULL;
	return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
}

/*
 * The first stripe in the table for a logical address
 * has the lock. rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet. The rbio is given
 * the lock and 0 is returned. The caller must start the IO
 * themselves.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * with the lock owner. The rbio is freed and the IO will
 * start automatically along with the existing rbio. 1 is returned.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list. When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started automatically.
 * 1 is returned
 *
 * If we return 0, the caller still owns the rbio and must continue with
 * IO submission. If we return 1, the caller must assume the rbio has
 * already been freed.
 */
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *pending;
	unsigned long flags;
	struct btrfs_raid_bio *freeit = NULL;
	struct btrfs_raid_bio *cache_drop = NULL;
	int ret = 0;

	spin_lock_irqsave(&h->lock, flags);
	list_for_each_entry(cur, &h->hash_list, hash_list) {
		if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) {
			spin_lock(&cur->bio_list_lock);

			/* can we steal this cached rbio's pages? */
			if (bio_list_empty(&cur->bio_list) &&
			    list_empty(&cur->plug_list) &&
			    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
			    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
				list_del_init(&cur->hash_list);
				refcount_dec(&cur->refs);

				steal_rbio(cur, rbio);
				cache_drop = cur;
				spin_unlock(&cur->bio_list_lock);

				goto lockit;
			}

			/* can we merge into the lock owner? */
			if (rbio_can_merge(cur, rbio)) {
				merge_rbio(cur, rbio);
				spin_unlock(&cur->bio_list_lock);
				freeit = rbio;
				ret = 1;
				goto out;
			}

			/*
			 * we couldn't merge with the running
			 * rbio, see if we can merge with the
			 * pending ones. We don't have to
			 * check for rmw_locked because there
			 * is no way they are inside finish_rmw
			 * right now
			 */
			list_for_each_entry(pending, &cur->plug_list,
					    plug_list) {
				if (rbio_can_merge(pending, rbio)) {
					merge_rbio(pending, rbio);
					spin_unlock(&cur->bio_list_lock);
					freeit = rbio;
					ret = 1;
					goto out;
				}
			}

			/* no merging, put us on the tail of the plug list,
			 * our rbio will be started with the currently
			 * running rbio unlocks
			 */
			list_add_tail(&rbio->plug_list, &cur->plug_list);
			spin_unlock(&cur->bio_list_lock);
			ret = 1;
			goto out;
		}
	}
lockit:
	refcount_inc(&rbio->refs);
	list_add(&rbio->hash_list, &h->hash_list);
out:
	spin_unlock_irqrestore(&h->lock, flags);
	if (cache_drop)
		remove_rbio_from_cache(cache_drop);
	if (freeit)
		__free_raid_bio(freeit);
	return ret;
}

/*
 * called as rmw or parity rebuild is completed. If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started
 */
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
	int bucket;
	struct btrfs_stripe_hash *h;
	unsigned long flags;
	int keep_cache = 0;

	bucket = rbio_bucket(rbio);
	h = rbio->fs_info->stripe_hash_table->table + bucket;

	if (list_empty(&rbio->plug_list))
		cache_rbio(rbio);

	spin_lock_irqsave(&h->lock, flags);
	spin_lock(&rbio->bio_list_lock);

	if (!list_empty(&rbio->hash_list)) {
		/*
		 * if we're still cached and there is no other IO
		 * to perform, just leave this rbio here for others
		 * to steal from later
		 */
		if (list_empty(&rbio->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
			keep_cache = 1;
			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
			BUG_ON(!bio_list_empty(&rbio->bio_list));
			goto done;
		}

		list_del_init(&rbio->hash_list);
		refcount_dec(&rbio->refs);

		/*
		 * we use the plug list to hold all the rbios
		 * waiting for the chance to lock this stripe.
		 * hand the lock over to one of them.
		 */
		if (!list_empty(&rbio->plug_list)) {
			struct btrfs_raid_bio *next;
			struct list_head *head = rbio->plug_list.next;

			next = list_entry(head, struct btrfs_raid_bio,
					  plug_list);

			list_del_init(&rbio->plug_list);

			list_add(&next->hash_list, &h->hash_list);
			refcount_inc(&next->refs);
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock_irqrestore(&h->lock, flags);

			if (next->operation == BTRFS_RBIO_READ_REBUILD)
				start_async_work(next, read_rebuild_work);
			else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
				steal_rbio(rbio, next);
				start_async_work(next, read_rebuild_work);
			} else if (next->operation == BTRFS_RBIO_WRITE) {
				steal_rbio(rbio, next);
				start_async_work(next, rmw_work);
			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				steal_rbio(rbio, next);
				start_async_work(next, scrub_parity_work);
			}

			goto done_nolock;
		}
	}
done:
	spin_unlock(&rbio->bio_list_lock);
	spin_unlock_irqrestore(&h->lock, flags);

done_nolock:
	if (!keep_cache)
		remove_rbio_from_cache(rbio);
}

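/*
 * drop a reference on the rbio. Once the last reference is gone, free the
 * stripe pages, release the bbio and free the rbio itself
 */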
static void __free_raid_bio(struct btrfs_raid_bio *rbio)
{
	int i;

	if (!refcount_dec_and_test(&rbio->refs))
		return;

	WARN_ON(!list_empty(&rbio->stripe_cache));
	WARN_ON(!list_empty(&rbio->hash_list));
	WARN_ON(!bio_list_empty(&rbio->bio_list));

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i]) {
			__free_page(rbio->stripe_pages[i]);
			rbio->stripe_pages[i] = NULL;
		}
	}

	btrfs_put_bbio(rbio->bbio);
	kfree(rbio);
}

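/*
 * walk a chain of bios linked through bi_next and complete each one with
 * the given status
 */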
static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
{
	struct bio *next;

	while (cur) {
		next = cur->bi_next;
		cur->bi_next = NULL;
		cur->bi_status = err;
		bio_endio(cur);
		cur = next;
	}
}

/*
 * this frees the rbio and runs through all the bios in the
 * bio_list and calls end_io on them
 */
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
{
	struct bio *cur = bio_list_get(&rbio->bio_list);
	struct bio *extra;

	if (rbio->generic_bio_cnt)
		btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);

	/*
	 * At this moment, rbio->bio_list is empty, however since rbio does not
	 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
	 * hash list, rbio may be merged with others so that rbio->bio_list
	 * becomes non-empty.
	 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
	 * more and we can call bio_endio() on all queued bios.
	 */
	unlock_stripe(rbio);
	extra = bio_list_get(&rbio->bio_list);
	__free_raid_bio(rbio);

	rbio_endio_bio_list(cur, err);
	if (extra)
		rbio_endio_bio_list(extra, err);
}

/*
 * end io function used by finish_rmw. When we finally
 * get here, we've written a full stripe
 */
static void raid_write_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;
	blk_status_t err = bio->bi_status;
	int max_errors;

	if (err)
		fail_bio_stripe(rbio, bio);

	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	err = BLK_STS_OK;

	/* OK, we have read all the stripes we need to. */
	max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
		     0 : rbio->bbio->max_errors;
	if (atomic_read(&rbio->error) > max_errors)
		err = BLK_STS_IOERR;

	rbio_orig_end_io(rbio, err);
}

/*
 * the read/modify/write code wants to use the original bio for
 * any pages it included, and then use the rbio for everything
 * else. This function decides if a given index (stripe number)
 * and page number in that stripe fall inside the original bio
 * or the rbio.
 *
 * if you set bio_list_only, you'll get a NULL back for any ranges
 * that are outside the bio_list
 *
 * This doesn't take any refs on anything, you get a bare page pointer
 * and the caller must bump refs as required.
 *
 * You must call index_rbio_pages once before you can trust
 * the answers from this function.
 */
static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
				 int index, int pagenr, int bio_list_only)
{
	int chunk_page;
	struct page *p = NULL;

	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;

	spin_lock_irq(&rbio->bio_list_lock);
	p = rbio->bio_pages[chunk_page];
	spin_unlock_irq(&rbio->bio_list_lock);

	if (p || bio_list_only)
		return p;

	return rbio->stripe_pages[chunk_page];
}

/*
 * number of pages we need for the entire stripe across all the
 * drives
 */
static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
{
	return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes;
}

955/*
956 * allocation and initial setup for the btrfs_raid_bio. Not
957 * this does not allocate any pages for rbio->pages.
958 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
					 struct btrfs_bio *bbio,
					 u64 stripe_len)
{
	struct btrfs_raid_bio *rbio;
	int nr_data = 0;
	int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
	int num_pages = rbio_nr_pages(stripe_len, real_stripes);
	int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
	void *p;

	rbio = kzalloc(sizeof(*rbio) +
		       sizeof(*rbio->stripe_pages) * num_pages +
		       sizeof(*rbio->bio_pages) * num_pages +
		       sizeof(*rbio->finish_pointers) * real_stripes +
		       sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) +
		       sizeof(*rbio->finish_pbitmap) *
		       BITS_TO_LONGS(stripe_npages),
		       GFP_NOFS);
	if (!rbio)
		return ERR_PTR(-ENOMEM);

	bio_list_init(&rbio->bio_list);
	INIT_LIST_HEAD(&rbio->plug_list);
	spin_lock_init(&rbio->bio_list_lock);
	INIT_LIST_HEAD(&rbio->stripe_cache);
	INIT_LIST_HEAD(&rbio->hash_list);
	rbio->bbio = bbio;
	rbio->fs_info = fs_info;
	rbio->stripe_len = stripe_len;
	rbio->nr_pages = num_pages;
	rbio->real_stripes = real_stripes;
	rbio->stripe_npages = stripe_npages;
	rbio->faila = -1;
	rbio->failb = -1;
	refcount_set(&rbio->refs, 1);
	atomic_set(&rbio->error, 0);
	atomic_set(&rbio->stripes_pending, 0);

	/*
	 * the stripe_pages, bio_pages, etc arrays point to the extra
	 * memory we allocated past the end of the rbio
	 */
	p = rbio + 1;
#define CONSUME_ALLOC(ptr, count)	do {				\
		ptr = p;						\
		p = (unsigned char *)p + sizeof(*(ptr)) * (count);	\
	} while (0)
	CONSUME_ALLOC(rbio->stripe_pages, num_pages);
	CONSUME_ALLOC(rbio->bio_pages, num_pages);
	CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
	CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages));
	CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
#undef CONSUME_ALLOC

	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
		nr_data = real_stripes - 1;
	else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
		nr_data = real_stripes - 2;
	else
		BUG();

	rbio->nr_data = nr_data;
	return rbio;
}

/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	struct page *page;

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i])
			continue;
		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
		if (!page)
			return -ENOMEM;
		rbio->stripe_pages[i] = page;
	}
	return 0;
}

/* only allocate pages for p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	struct page *page;

	i = rbio_stripe_page_index(rbio, rbio->nr_data, 0);

	for (; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i])
			continue;
		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
		if (!page)
			return -ENOMEM;
		rbio->stripe_pages[i] = page;
	}
	return 0;
}

/*
 * add a single page from a specific stripe into our list of bios for IO
 * this will try to merge into existing bios if possible, and returns
 * zero if all went well.
 */
static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
			    struct bio_list *bio_list,
			    struct page *page,
			    int stripe_nr,
			    unsigned long page_index,
			    unsigned long bio_max_len)
{
	struct bio *last = bio_list->tail;
	u64 last_end = 0;
	int ret;
	struct bio *bio;
	struct btrfs_bio_stripe *stripe;
	u64 disk_start;

	stripe = &rbio->bbio->stripes[stripe_nr];
	disk_start = stripe->physical + (page_index << PAGE_SHIFT);

	/* if the device is missing, just fail this stripe */
	if (!stripe->dev->bdev)
		return fail_rbio_index(rbio, stripe_nr);

	/* see if we can add this page onto our existing bio */
	if (last) {
		last_end = (u64)last->bi_iter.bi_sector << 9;
		last_end += last->bi_iter.bi_size;

		/*
		 * we can't merge these if they are from different
		 * devices or if they are not contiguous
		 */
		if (last_end == disk_start && stripe->dev->bdev &&
		    !last->bi_status &&
		    last->bi_disk == stripe->dev->bdev->bd_disk &&
		    last->bi_partno == stripe->dev->bdev->bd_partno) {
			ret = bio_add_page(last, page, PAGE_SIZE, 0);
			if (ret == PAGE_SIZE)
				return 0;
		}
	}

	/* put a new bio on the list */
	bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
	bio->bi_iter.bi_size = 0;
	bio_set_dev(bio, stripe->dev->bdev);
	bio->bi_iter.bi_sector = disk_start >> 9;

	bio_add_page(bio, page, PAGE_SIZE, 0);
	bio_list_add(bio_list, bio);
	return 0;
}

/*
 * while we're doing the read/modify/write cycle, we could
 * have errors in reading pages off the disk. This checks
 * for errors and if we're not able to read the page it'll
 * trigger parity reconstruction. The rmw will be finished
 * after we've reconstructed the failed stripes
 */
static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
{
	if (rbio->faila >= 0 || rbio->failb >= 0) {
		BUG_ON(rbio->faila == rbio->real_stripes - 1);
		__raid56_parity_recover(rbio);
	} else {
		finish_rmw(rbio);
	}
}

/*
 * helper function to walk our bio list and populate the bio_pages array with
 * the result. This seems expensive, but it is faster than constantly
 * searching through the bio list as we setup the IO in finish_rmw or stripe
 * reconstruction.
 *
 * This must be called before you trust the answers from page_in_rbio
 */
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{
	struct bio *bio;
	u64 start;
	unsigned long stripe_offset;
	unsigned long page_index;

	spin_lock_irq(&rbio->bio_list_lock);
	bio_list_for_each(bio, &rbio->bio_list) {
		struct bio_vec bvec;
		struct bvec_iter iter;
		int i = 0;

		start = (u64)bio->bi_iter.bi_sector << 9;
		stripe_offset = start - rbio->bbio->raid_map[0];
		page_index = stripe_offset >> PAGE_SHIFT;

		if (bio_flagged(bio, BIO_CLONED))
			bio->bi_iter = btrfs_io_bio(bio)->iter;

		bio_for_each_segment(bvec, bio, iter) {
			rbio->bio_pages[page_index + i] = bvec.bv_page;
			i++;
		}
	}
	spin_unlock_irq(&rbio->bio_list_lock);
}

/*
 * this is called from one of two situations. We either
 * have a full stripe from the higher layers, or we've read all
 * the missing bits off disk.
 *
 * This will calculate the parity and then send down any
 * changed blocks.
 */
static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
	struct btrfs_bio *bbio = rbio->bbio;
	void **pointers = rbio->finish_pointers;
	int nr_data = rbio->nr_data;
	int stripe;
	int pagenr;
	int p_stripe = -1;
	int q_stripe = -1;
	struct bio_list bio_list;
	struct bio *bio;
	int ret;

	bio_list_init(&bio_list);

	if (rbio->real_stripes - rbio->nr_data == 1) {
		p_stripe = rbio->real_stripes - 1;
	} else if (rbio->real_stripes - rbio->nr_data == 2) {
		p_stripe = rbio->real_stripes - 2;
		q_stripe = rbio->real_stripes - 1;
	} else {
		BUG();
	}

	/* at this point we either have a full stripe,
	 * or we've read the full stripe from the drive.
	 * recalculate the parity and write the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	spin_lock_irq(&rbio->bio_list_lock);
	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
	spin_unlock_irq(&rbio->bio_list_lock);

	atomic_set(&rbio->error, 0);

	/*
	 * now that we've set rmw_locked, run through the
	 * bio list one last time and map the page pointers
	 *
	 * We don't cache full rbios because we're assuming
	 * the higher layers are unlikely to use this area of
	 * the disk again soon. If they do use it again,
	 * hopefully they will send another full bio.
	 */
	index_rbio_pages(rbio);
	if (!rbio_is_full(rbio))
		cache_rbio_pages(rbio);
	else
		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
		struct page *p;
		/* first collect one page from each data stripe */
		for (stripe = 0; stripe < nr_data; stripe++) {
			p = page_in_rbio(rbio, stripe, pagenr, 0);
			pointers[stripe] = kmap(p);
		}

		/* then add the parity stripe */
		p = rbio_pstripe_page(rbio, pagenr);
		SetPageUptodate(p);
		pointers[stripe++] = kmap(p);

		if (q_stripe != -1) {

			/*
			 * raid6, add the qstripe and call the
			 * library function to fill in our p/q
			 */
			p = rbio_qstripe_page(rbio, pagenr);
			SetPageUptodate(p);
			pointers[stripe++] = kmap(p);

			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
						pointers);
		} else {
			/* raid5 */
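			/*
			 * P parity is the xor of all the data blocks: seed
			 * it with the first data block, then xor in the
			 * remaining nr_data - 1 blocks
			 */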
David Sterba69d24802018-06-29 10:56:44 +02001258 copy_page(pointers[nr_data], pointers[0]);
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03001259 run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
David Woodhouse53b381b2013-01-29 18:40:14 -05001260 }
1261
1262
Miao Xie2c8cdd62014-11-14 16:06:25 +08001263 for (stripe = 0; stripe < rbio->real_stripes; stripe++)
David Woodhouse53b381b2013-01-29 18:40:14 -05001264 kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
1265 }
1266
1267 /*
1268 * time to start writing. Make bios for everything from the
1269 * higher layers (the bio_list in our rbio) and our p/q. Ignore
1270 * everything else.
1271 */
Miao Xie2c8cdd62014-11-14 16:06:25 +08001272 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
Zhao Lei915e2292015-03-03 20:42:48 +08001273 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
David Woodhouse53b381b2013-01-29 18:40:14 -05001274 struct page *page;
1275 if (stripe < rbio->nr_data) {
1276 page = page_in_rbio(rbio, stripe, pagenr, 1);
1277 if (!page)
1278 continue;
1279 } else {
1280 page = rbio_stripe_page(rbio, stripe, pagenr);
1281 }
1282
1283 ret = rbio_add_io_page(rbio, &bio_list,
1284 page, stripe, pagenr, rbio->stripe_len);
1285 if (ret)
1286 goto cleanup;
1287 }
1288 }
1289
Miao Xie2c8cdd62014-11-14 16:06:25 +08001290 if (likely(!bbio->num_tgtdevs))
1291 goto write_data;
1292
1293 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1294 if (!bbio->tgtdev_map[stripe])
1295 continue;
1296
Zhao Lei915e2292015-03-03 20:42:48 +08001297 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
Miao Xie2c8cdd62014-11-14 16:06:25 +08001298 struct page *page;
1299 if (stripe < rbio->nr_data) {
1300 page = page_in_rbio(rbio, stripe, pagenr, 1);
1301 if (!page)
1302 continue;
1303 } else {
1304 page = rbio_stripe_page(rbio, stripe, pagenr);
1305 }
1306
1307 ret = rbio_add_io_page(rbio, &bio_list, page,
1308 rbio->bbio->tgtdev_map[stripe],
1309 pagenr, rbio->stripe_len);
1310 if (ret)
1311 goto cleanup;
1312 }
1313 }
1314
1315write_data:
Miao Xieb89e1b02014-10-15 11:18:44 +08001316 atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
1317 BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
David Woodhouse53b381b2013-01-29 18:40:14 -05001318
1319 while (1) {
1320 bio = bio_list_pop(&bio_list);
1321 if (!bio)
1322 break;
1323
1324 bio->bi_private = rbio;
1325 bio->bi_end_io = raid_write_end_io;
David Sterbaebcc3262018-06-29 10:56:53 +02001326 bio->bi_opf = REQ_OP_WRITE;
Mike Christie4e49ea42016-06-05 14:31:41 -05001327
1328 submit_bio(bio);
David Woodhouse53b381b2013-01-29 18:40:14 -05001329 }
1330 return;
1331
1332cleanup:
Omar Sandoval58efbc92017-08-22 23:45:59 -07001333 rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo785884f2017-09-22 12:11:18 -06001334
1335 while ((bio = bio_list_pop(&bio_list)))
1336 bio_put(bio);
David Woodhouse53b381b2013-01-29 18:40:14 -05001337}
1338
1339/*
1340 * helper to find the stripe number for a given bio. Used to figure out which
1341 * stripe has failed. This expects the bio to correspond to a physical disk,
1342 * so it looks up based on physical sector numbers.
1343 */
1344static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1345 struct bio *bio)
1346{
Kent Overstreet4f024f32013-10-11 15:44:27 -07001347 u64 physical = bio->bi_iter.bi_sector;
David Woodhouse53b381b2013-01-29 18:40:14 -05001348 u64 stripe_start;
1349 int i;
1350 struct btrfs_bio_stripe *stripe;
1351
1352 physical <<= 9;
1353
1354 for (i = 0; i < rbio->bbio->num_stripes; i++) {
1355 stripe = &rbio->bbio->stripes[i];
1356 stripe_start = stripe->physical;
1357 if (physical >= stripe_start &&
Miao Xie2c8cdd62014-11-14 16:06:25 +08001358 physical < stripe_start + rbio->stripe_len &&
Dmitriy Gorokh047fdea2018-02-16 19:51:38 +00001359 stripe->dev->bdev &&
Christoph Hellwig74d46992017-08-23 19:10:32 +02001360 bio->bi_disk == stripe->dev->bdev->bd_disk &&
1361 bio->bi_partno == stripe->dev->bdev->bd_partno) {
David Woodhouse53b381b2013-01-29 18:40:14 -05001362 return i;
1363 }
1364 }
1365 return -1;
1366}
1367
1368/*
1369 * helper to find the stripe number for a given
1370 * bio (before mapping). Used to figure out which stripe has
1371 * failed. This looks up based on logical block numbers.
1372 */
1373static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1374 struct bio *bio)
1375{
Kent Overstreet4f024f32013-10-11 15:44:27 -07001376 u64 logical = bio->bi_iter.bi_sector;
David Woodhouse53b381b2013-01-29 18:40:14 -05001377 u64 stripe_start;
1378 int i;
1379
1380 logical <<= 9;
1381
1382 for (i = 0; i < rbio->nr_data; i++) {
Zhao Lei8e5cfb52015-01-20 15:11:33 +08001383 stripe_start = rbio->bbio->raid_map[i];
David Woodhouse53b381b2013-01-29 18:40:14 -05001384 if (logical >= stripe_start &&
1385 logical < stripe_start + rbio->stripe_len) {
1386 return i;
1387 }
1388 }
1389 return -1;
1390}
1391
1392/*
1393 * returns -EIO if we had too many failures
1394 */
1395static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1396{
1397 unsigned long flags;
1398 int ret = 0;
1399
1400 spin_lock_irqsave(&rbio->bio_list_lock, flags);
1401
1402 /* we already know this stripe is bad, move on */
1403 if (rbio->faila == failed || rbio->failb == failed)
1404 goto out;
1405
1406 if (rbio->faila == -1) {
1407 /* first failure on this rbio */
1408 rbio->faila = failed;
Miao Xieb89e1b02014-10-15 11:18:44 +08001409 atomic_inc(&rbio->error);
David Woodhouse53b381b2013-01-29 18:40:14 -05001410 } else if (rbio->failb == -1) {
1411 /* second failure on this rbio */
1412 rbio->failb = failed;
Miao Xieb89e1b02014-10-15 11:18:44 +08001413 atomic_inc(&rbio->error);
David Woodhouse53b381b2013-01-29 18:40:14 -05001414 } else {
1415 ret = -EIO;
1416 }
1417out:
1418 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1419
1420 return ret;
1421}
1422
1423/*
1424 * helper to fail a stripe based on a physical disk
1425 * bio.
1426 */
1427static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1428 struct bio *bio)
1429{
1430 int failed = find_bio_stripe(rbio, bio);
1431
1432 if (failed < 0)
1433 return -EIO;
1434
1435 return fail_rbio_index(rbio, failed);
1436}
1437
1438/*
1439 * this sets each page in the bio uptodate. It should only be used on private
1440 * rbio pages, nothing that comes in from the higher layers
1441 */
1442static void set_bio_pages_uptodate(struct bio *bio)
1443{
Liu Bo0198e5b2018-01-12 18:07:01 -07001444 struct bio_vec *bvec;
1445 int i;
Ming Lei6dc4f102019-02-15 19:13:19 +08001446 struct bvec_iter_all iter_all;
David Woodhouse53b381b2013-01-29 18:40:14 -05001447
Liu Bo0198e5b2018-01-12 18:07:01 -07001448 ASSERT(!bio_flagged(bio, BIO_CLONED));
Filipe Manana6592e582017-07-12 23:36:02 +01001449
Ming Lei6dc4f102019-02-15 19:13:19 +08001450 bio_for_each_segment_all(bvec, bio, i, iter_all)
Liu Bo0198e5b2018-01-12 18:07:01 -07001451 SetPageUptodate(bvec->bv_page);
David Woodhouse53b381b2013-01-29 18:40:14 -05001452}
1453
1454/*
1455 * end io for the read phase of the rmw cycle. All the bios here are physical
1456 * stripe bios we've read from the disk so we can recalculate the parity of the
1457 * stripe.
1458 *
1459 * This will usually kick off finish_rmw once all the bios are read in, but it
1460 * may trigger parity reconstruction if we had any errors along the way
1461 */
Christoph Hellwig4246a0b2015-07-20 15:29:37 +02001462static void raid_rmw_end_io(struct bio *bio)
David Woodhouse53b381b2013-01-29 18:40:14 -05001463{
1464 struct btrfs_raid_bio *rbio = bio->bi_private;
1465
Christoph Hellwig4e4cbee2017-06-03 09:38:06 +02001466 if (bio->bi_status)
David Woodhouse53b381b2013-01-29 18:40:14 -05001467 fail_bio_stripe(rbio, bio);
1468 else
1469 set_bio_pages_uptodate(bio);
1470
1471 bio_put(bio);
1472
Miao Xieb89e1b02014-10-15 11:18:44 +08001473 if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse53b381b2013-01-29 18:40:14 -05001474 return;
1475
Miao Xieb89e1b02014-10-15 11:18:44 +08001476 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
David Woodhouse53b381b2013-01-29 18:40:14 -05001477 goto cleanup;
1478
1479 /*
1480 * this will normally call finish_rmw to start our write
1481 * but if there are any failed stripes we'll reconstruct
1482 * from parity first
1483 */
1484 validate_rbio_for_rmw(rbio);
1485 return;
1486
1487cleanup:
1488
Omar Sandoval58efbc92017-08-22 23:45:59 -07001489 rbio_orig_end_io(rbio, BLK_STS_IOERR);
David Woodhouse53b381b2013-01-29 18:40:14 -05001490}
1491
David Woodhouse53b381b2013-01-29 18:40:14 -05001492/*
1493 * the stripe must be locked by the caller. It will
1494 * unlock after all the writes are done
1495 */
1496static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1497{
1498 int bios_to_read = 0;
David Woodhouse53b381b2013-01-29 18:40:14 -05001499 struct bio_list bio_list;
1500 int ret;
David Woodhouse53b381b2013-01-29 18:40:14 -05001501 int pagenr;
1502 int stripe;
1503 struct bio *bio;
1504
1505 bio_list_init(&bio_list);
1506
1507 ret = alloc_rbio_pages(rbio);
1508 if (ret)
1509 goto cleanup;
1510
1511 index_rbio_pages(rbio);
1512
Miao Xieb89e1b02014-10-15 11:18:44 +08001513 atomic_set(&rbio->error, 0);
David Woodhouse53b381b2013-01-29 18:40:14 -05001514 /*
1515 * build a list of bios to read all the missing parts of this
1516 * stripe
1517 */
1518 for (stripe = 0; stripe < rbio->nr_data; stripe++) {
Zhao Lei915e2292015-03-03 20:42:48 +08001519 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
David Woodhouse53b381b2013-01-29 18:40:14 -05001520 struct page *page;
1521 /*
1522 * we want to find all the pages missing from
1523 * the rbio and read them from the disk. If
1524 * page_in_rbio finds a page in the bio list
1525 * we don't need to read it off the stripe.
1526 */
1527 page = page_in_rbio(rbio, stripe, pagenr, 1);
1528 if (page)
1529 continue;
1530
1531 page = rbio_stripe_page(rbio, stripe, pagenr);
Chris Mason4ae10b32013-01-31 14:42:09 -05001532 /*
1533 * the bio cache may have handed us an uptodate
1534 * page. If so, be happy and use it
1535 */
1536 if (PageUptodate(page))
1537 continue;
1538
David Woodhouse53b381b2013-01-29 18:40:14 -05001539 ret = rbio_add_io_page(rbio, &bio_list, page,
1540 stripe, pagenr, rbio->stripe_len);
1541 if (ret)
1542 goto cleanup;
1543 }
1544 }
1545
1546 bios_to_read = bio_list_size(&bio_list);
1547 if (!bios_to_read) {
1548 /*
1549 * this can happen if others have merged with
1550			 * us; it means there is nothing left to read.
1551 * But if there are missing devices it may not be
1552 * safe to do the full stripe write yet.
1553 */
1554 goto finish;
1555 }
1556
1557 /*
1558 * the bbio may be freed once we submit the last bio. Make sure
1559 * not to touch it after that
1560 */
Miao Xieb89e1b02014-10-15 11:18:44 +08001561 atomic_set(&rbio->stripes_pending, bios_to_read);
David Woodhouse53b381b2013-01-29 18:40:14 -05001562 while (1) {
1563 bio = bio_list_pop(&bio_list);
1564 if (!bio)
1565 break;
1566
1567 bio->bi_private = rbio;
1568 bio->bi_end_io = raid_rmw_end_io;
David Sterbaebcc3262018-06-29 10:56:53 +02001569 bio->bi_opf = REQ_OP_READ;
David Woodhouse53b381b2013-01-29 18:40:14 -05001570
Jeff Mahoney0b246af2016-06-22 18:54:23 -04001571 btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
David Woodhouse53b381b2013-01-29 18:40:14 -05001572
Mike Christie4e49ea42016-06-05 14:31:41 -05001573 submit_bio(bio);
David Woodhouse53b381b2013-01-29 18:40:14 -05001574 }
1575 /* the actual write will happen once the reads are done */
1576 return 0;
1577
1578cleanup:
Omar Sandoval58efbc92017-08-22 23:45:59 -07001579 rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo785884f2017-09-22 12:11:18 -06001580
1581 while ((bio = bio_list_pop(&bio_list)))
1582 bio_put(bio);
1583
David Woodhouse53b381b2013-01-29 18:40:14 -05001584 return -EIO;
1585
1586finish:
1587 validate_rbio_for_rmw(rbio);
1588 return 0;
1589}
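/*
 * A rough sketch of the full RMW cycle driven by raid56_rmw_stripe()
 * (call-flow notes only, no new behaviour implied):
 *
 *	raid56_rmw_stripe()        read every data page the incoming bios
 *	                           did not already provide
 *	  raid_rmw_end_io()        per-bio completion accounting
 *	    validate_rbio_for_rmw()
 *	      finish_rmw()         recompute P (and Q) and write the full
 *	                           stripe back out
 */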
1590
1591/*
1592 * if the upper layers pass in a full stripe, we thank them by only allocating
1593 * enough pages to hold the parity, and sending it all down quickly.
1594 */
1595static int full_stripe_write(struct btrfs_raid_bio *rbio)
1596{
1597 int ret;
1598
1599 ret = alloc_rbio_parity_pages(rbio);
Miao Xie3cd846d2013-07-22 16:36:57 +08001600 if (ret) {
1601 __free_raid_bio(rbio);
David Woodhouse53b381b2013-01-29 18:40:14 -05001602 return ret;
Miao Xie3cd846d2013-07-22 16:36:57 +08001603 }
David Woodhouse53b381b2013-01-29 18:40:14 -05001604
1605 ret = lock_stripe_add(rbio);
1606 if (ret == 0)
1607 finish_rmw(rbio);
1608 return 0;
1609}
1610
1611/*
1612 * partial stripe writes get handed over to async helpers.
1613 * We're really hoping to merge a few more writes into this
1614 * rbio before calculating new parity
1615 */
1616static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1617{
1618 int ret;
1619
1620 ret = lock_stripe_add(rbio);
1621 if (ret == 0)
David Sterbacf6a4a72018-06-29 10:56:58 +02001622 start_async_work(rbio, rmw_work);
David Woodhouse53b381b2013-01-29 18:40:14 -05001623 return 0;
1624}
1625
1626/*
1627 * sometimes while we were reading from the drive to
1628 * recalculate parity, enough new bios come in to create
1629 * a full stripe. So we do a check here to see if we can
1630 * go directly to finish_rmw
1631 */
1632static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1633{
1634 /* head off into rmw land if we don't have a full stripe */
1635 if (!rbio_is_full(rbio))
1636 return partial_stripe_write(rbio);
1637 return full_stripe_write(rbio);
1638}
1639
1640/*
Chris Mason6ac0f482013-01-31 14:42:28 -05001641 * We use plugging call backs to collect full stripes.
1642 * Any time we get a partial stripe write while plugged
1643 * we collect it into a list. When the unplug comes down,
1644 * we sort the list by logical block number and merge
1645 * everything we can into the same rbios
1646 */
1647struct btrfs_plug_cb {
1648 struct blk_plug_cb cb;
1649 struct btrfs_fs_info *info;
1650 struct list_head rbio_list;
1651 struct btrfs_work work;
1652};
1653
1654/*
1655 * rbios on the plug list are sorted for easier merging.
1656 */
1657static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
1658{
1659 struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1660 plug_list);
1661 struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1662 plug_list);
Kent Overstreet4f024f32013-10-11 15:44:27 -07001663 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1664 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
Chris Mason6ac0f482013-01-31 14:42:28 -05001665
1666 if (a_sector < b_sector)
1667 return -1;
1668 if (a_sector > b_sector)
1669 return 1;
1670 return 0;
1671}
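/*
 * Example with hypothetical sector numbers: three partial writes arrive
 * while plugged at sectors 2048, 0 and 1024 of the same full stripe.
 * After list_sort() they are visited as 0, 1024, 2048, so rbio_can_merge()
 * below can fold them into a single rbio and, ideally, a single full
 * stripe write.
 */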
1672
1673static void run_plug(struct btrfs_plug_cb *plug)
1674{
1675 struct btrfs_raid_bio *cur;
1676 struct btrfs_raid_bio *last = NULL;
1677
1678 /*
1679 * sort our plug list then try to merge
1680 * everything we can in hopes of creating full
1681 * stripes.
1682 */
1683 list_sort(NULL, &plug->rbio_list, plug_cmp);
1684 while (!list_empty(&plug->rbio_list)) {
1685 cur = list_entry(plug->rbio_list.next,
1686 struct btrfs_raid_bio, plug_list);
1687 list_del_init(&cur->plug_list);
1688
1689 if (rbio_is_full(cur)) {
David Sterbac7b562c2018-06-29 10:57:10 +02001690 int ret;
1691
Chris Mason6ac0f482013-01-31 14:42:28 -05001692 /* we have a full stripe, send it down */
David Sterbac7b562c2018-06-29 10:57:10 +02001693 ret = full_stripe_write(cur);
1694 BUG_ON(ret);
Chris Mason6ac0f482013-01-31 14:42:28 -05001695 continue;
1696 }
1697 if (last) {
1698 if (rbio_can_merge(last, cur)) {
1699 merge_rbio(last, cur);
1700 __free_raid_bio(cur);
1701 continue;
1702
1703 }
1704 __raid56_parity_write(last);
1705 }
1706 last = cur;
1707 }
1708 if (last) {
1709 __raid56_parity_write(last);
1710 }
1711 kfree(plug);
1712}
1713
1714/*
1715 * if the unplug comes from schedule, we have to push the
1716 * work off to a helper thread
1717 */
1718static void unplug_work(struct btrfs_work *work)
1719{
1720 struct btrfs_plug_cb *plug;
1721 plug = container_of(work, struct btrfs_plug_cb, work);
1722 run_plug(plug);
1723}
1724
1725static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1726{
1727 struct btrfs_plug_cb *plug;
1728 plug = container_of(cb, struct btrfs_plug_cb, cb);
1729
1730 if (from_schedule) {
Liu Bo9e0af232014-08-15 23:36:53 +08001731 btrfs_init_work(&plug->work, btrfs_rmw_helper,
1732 unplug_work, NULL, NULL);
Qu Wenruod05a33a2014-02-28 10:46:11 +08001733 btrfs_queue_work(plug->info->rmw_workers,
1734 &plug->work);
Chris Mason6ac0f482013-01-31 14:42:28 -05001735 return;
1736 }
1737 run_plug(plug);
1738}
1739
1740/*
David Woodhouse53b381b2013-01-29 18:40:14 -05001741 * our main entry point for writes from the rest of the FS.
1742 */
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04001743int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
Zhao Lei8e5cfb52015-01-20 15:11:33 +08001744 struct btrfs_bio *bbio, u64 stripe_len)
David Woodhouse53b381b2013-01-29 18:40:14 -05001745{
1746 struct btrfs_raid_bio *rbio;
Chris Mason6ac0f482013-01-31 14:42:28 -05001747 struct btrfs_plug_cb *plug = NULL;
1748 struct blk_plug_cb *cb;
Miao Xie42452152014-11-25 16:39:28 +08001749 int ret;
David Woodhouse53b381b2013-01-29 18:40:14 -05001750
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04001751 rbio = alloc_rbio(fs_info, bbio, stripe_len);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001752 if (IS_ERR(rbio)) {
Zhao Lei6e9606d2015-01-20 15:11:34 +08001753 btrfs_put_bbio(bbio);
David Woodhouse53b381b2013-01-29 18:40:14 -05001754 return PTR_ERR(rbio);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001755 }
David Woodhouse53b381b2013-01-29 18:40:14 -05001756 bio_list_add(&rbio->bio_list, bio);
Kent Overstreet4f024f32013-10-11 15:44:27 -07001757 rbio->bio_list_bytes = bio->bi_iter.bi_size;
Miao Xie1b94b552014-11-06 16:14:21 +08001758 rbio->operation = BTRFS_RBIO_WRITE;
Chris Mason6ac0f482013-01-31 14:42:28 -05001759
Jeff Mahoney0b246af2016-06-22 18:54:23 -04001760 btrfs_bio_counter_inc_noblocked(fs_info);
Miao Xie42452152014-11-25 16:39:28 +08001761 rbio->generic_bio_cnt = 1;
1762
Chris Mason6ac0f482013-01-31 14:42:28 -05001763 /*
1764 * don't plug on full rbios, just get them out the door
1765 * as quickly as we can
1766 */
Miao Xie42452152014-11-25 16:39:28 +08001767 if (rbio_is_full(rbio)) {
1768 ret = full_stripe_write(rbio);
1769 if (ret)
Jeff Mahoney0b246af2016-06-22 18:54:23 -04001770 btrfs_bio_counter_dec(fs_info);
Miao Xie42452152014-11-25 16:39:28 +08001771 return ret;
1772 }
Chris Mason6ac0f482013-01-31 14:42:28 -05001773
Jeff Mahoney0b246af2016-06-22 18:54:23 -04001774 cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
Chris Mason6ac0f482013-01-31 14:42:28 -05001775 if (cb) {
1776 plug = container_of(cb, struct btrfs_plug_cb, cb);
1777 if (!plug->info) {
Jeff Mahoney0b246af2016-06-22 18:54:23 -04001778 plug->info = fs_info;
Chris Mason6ac0f482013-01-31 14:42:28 -05001779 INIT_LIST_HEAD(&plug->rbio_list);
1780 }
1781 list_add_tail(&rbio->plug_list, &plug->rbio_list);
Miao Xie42452152014-11-25 16:39:28 +08001782 ret = 0;
Chris Mason6ac0f482013-01-31 14:42:28 -05001783 } else {
Miao Xie42452152014-11-25 16:39:28 +08001784 ret = __raid56_parity_write(rbio);
1785 if (ret)
Jeff Mahoney0b246af2016-06-22 18:54:23 -04001786 btrfs_bio_counter_dec(fs_info);
Chris Mason6ac0f482013-01-31 14:42:28 -05001787 }
Miao Xie42452152014-11-25 16:39:28 +08001788 return ret;
David Woodhouse53b381b2013-01-29 18:40:14 -05001789}
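/*
 * Decision tree of the write entry point above, summarized (descriptive
 * only, no new behaviour implied):
 *
 *	full stripe rbio   -> full_stripe_write(), no read phase needed
 *	plug in progress   -> queue on plug->rbio_list, merged later by
 *	                      run_plug()
 *	otherwise          -> __raid56_parity_write(), which may still fall
 *	                      back to the RMW read phase for partial stripes
 */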
1790
1791/*
1792 * all parity reconstruction happens here. We've read in everything
1793 * we can find from the drives and this does the heavy lifting of
1794 * sorting the good from the bad.
1795 */
1796static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1797{
1798 int pagenr, stripe;
1799 void **pointers;
1800 int faila = -1, failb = -1;
David Woodhouse53b381b2013-01-29 18:40:14 -05001801 struct page *page;
Omar Sandoval58efbc92017-08-22 23:45:59 -07001802 blk_status_t err;
David Woodhouse53b381b2013-01-29 18:40:14 -05001803 int i;
1804
David Sterba31e818f2015-02-20 18:00:26 +01001805 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
David Woodhouse53b381b2013-01-29 18:40:14 -05001806 if (!pointers) {
Omar Sandoval58efbc92017-08-22 23:45:59 -07001807 err = BLK_STS_RESOURCE;
David Woodhouse53b381b2013-01-29 18:40:14 -05001808 goto cleanup_io;
1809 }
1810
1811 faila = rbio->faila;
1812 failb = rbio->failb;
1813
Omar Sandovalb4ee1782015-06-19 11:52:50 -07001814 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1815 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
David Woodhouse53b381b2013-01-29 18:40:14 -05001816 spin_lock_irq(&rbio->bio_list_lock);
1817 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1818 spin_unlock_irq(&rbio->bio_list_lock);
1819 }
1820
1821 index_rbio_pages(rbio);
1822
Zhao Lei915e2292015-03-03 20:42:48 +08001823 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
Miao Xie5a6ac9e2014-11-06 17:20:58 +08001824 /*
1825		 * For a parity scrub we just use a bitmap to mark the
1826		 * horizontal stripes in which we have data; the rest are skipped.
1827 */
1828 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1829 !test_bit(pagenr, rbio->dbitmap))
1830 continue;
1831
David Woodhouse53b381b2013-01-29 18:40:14 -05001832 /* setup our array of pointers with pages
1833 * from each stripe
1834 */
Miao Xie2c8cdd62014-11-14 16:06:25 +08001835 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
David Woodhouse53b381b2013-01-29 18:40:14 -05001836 /*
1837 * if we're rebuilding a read, we have to use
1838 * pages from the bio list
1839 */
Omar Sandovalb4ee1782015-06-19 11:52:50 -07001840 if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1841 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
David Woodhouse53b381b2013-01-29 18:40:14 -05001842 (stripe == faila || stripe == failb)) {
1843 page = page_in_rbio(rbio, stripe, pagenr, 0);
1844 } else {
1845 page = rbio_stripe_page(rbio, stripe, pagenr);
1846 }
1847 pointers[stripe] = kmap(page);
1848 }
1849
1850 /* all raid6 handling here */
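		/*
		 * Summary of the failure cases handled below (sketch only):
		 *
		 *	failb < 0, faila == P stripe	lone P failure, nothing
		 *					to hand back (-EIO, TODO)
		 *	failb < 0, faila == data	raid5-style xor rebuild
		 *	failb == Q, faila == P		both parities bad, -EIO
		 *	failb == Q, faila == data	rebuild data from P via
		 *					the pstripe xor path
		 *	failb == P, faila == data	raid6_datap_recov()
		 *	two data stripes failed		raid6_2data_recov()
		 */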
Zhao Lei10f11902015-01-20 15:11:43 +08001851 if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
David Woodhouse53b381b2013-01-29 18:40:14 -05001852 /*
1853 * single failure, rebuild from parity raid5
1854 * style
1855 */
1856 if (failb < 0) {
1857 if (faila == rbio->nr_data) {
1858 /*
1859 * Just the P stripe has failed, without
1860 * a bad data or Q stripe.
1861 * TODO, we should redo the xor here.
1862 */
Omar Sandoval58efbc92017-08-22 23:45:59 -07001863 err = BLK_STS_IOERR;
David Woodhouse53b381b2013-01-29 18:40:14 -05001864 goto cleanup;
1865 }
1866 /*
1867 * a single failure in raid6 is rebuilt
1868 * in the pstripe code below
1869 */
1870 goto pstripe;
1871 }
1872
1873 /* make sure our ps and qs are in order */
1874 if (faila > failb) {
1875 int tmp = failb;
1876 failb = faila;
1877 faila = tmp;
1878 }
1879
1880			/* if the Q stripe has failed, do a pstripe reconstruction
1881			 * from the xors.
1882			 * If both the Q stripe and the P stripe have failed, we're
1883 * here due to a crc mismatch and we can't give them the
1884 * data they want
1885 */
Zhao Lei8e5cfb52015-01-20 15:11:33 +08001886 if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
1887 if (rbio->bbio->raid_map[faila] ==
1888 RAID5_P_STRIPE) {
Omar Sandoval58efbc92017-08-22 23:45:59 -07001889 err = BLK_STS_IOERR;
David Woodhouse53b381b2013-01-29 18:40:14 -05001890 goto cleanup;
1891 }
1892 /*
1893 * otherwise we have one bad data stripe and
1894 * a good P stripe. raid5!
1895 */
1896 goto pstripe;
1897 }
1898
Zhao Lei8e5cfb52015-01-20 15:11:33 +08001899 if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
Miao Xie2c8cdd62014-11-14 16:06:25 +08001900 raid6_datap_recov(rbio->real_stripes,
David Woodhouse53b381b2013-01-29 18:40:14 -05001901 PAGE_SIZE, faila, pointers);
1902 } else {
Miao Xie2c8cdd62014-11-14 16:06:25 +08001903 raid6_2data_recov(rbio->real_stripes,
David Woodhouse53b381b2013-01-29 18:40:14 -05001904 PAGE_SIZE, faila, failb,
1905 pointers);
1906 }
1907 } else {
1908 void *p;
1909
1910 /* rebuild from P stripe here (raid5 or raid6) */
1911 BUG_ON(failb != -1);
1912pstripe:
1913 /* Copy parity block into failed block to start with */
David Sterba69d24802018-06-29 10:56:44 +02001914 copy_page(pointers[faila], pointers[rbio->nr_data]);
David Woodhouse53b381b2013-01-29 18:40:14 -05001915
1916 /* rearrange the pointer array */
1917 p = pointers[faila];
1918 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1919 pointers[stripe] = pointers[stripe + 1];
1920 pointers[rbio->nr_data - 1] = p;
1921
1922 /* xor in the rest */
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03001923 run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE);
David Woodhouse53b381b2013-01-29 18:40:14 -05001924 }
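		/*
		 * A tiny worked example of the xor rebuild above, assuming a
		 * 3-device RAID5 (two data stripes D0, D1 plus parity P):
		 *
		 *	P  = D0 ^ D1
		 *	D1 = D0 ^ P		when D1 is the failed stripe
		 *
		 * The failed slot is preloaded with the parity page, rotated
		 * into the destination position, and run_xor() folds the
		 * surviving data pages into it.
		 */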
1925 /* if we're doing this rebuild as part of an rmw, go through
1926 * and set all of our private rbio pages in the
1927 * failed stripes as uptodate. This way finish_rmw will
1928 * know they can be trusted. If this was a read reconstruction,
1929 * other endio functions will fiddle the uptodate bits
1930 */
Miao Xie1b94b552014-11-06 16:14:21 +08001931 if (rbio->operation == BTRFS_RBIO_WRITE) {
Zhao Lei915e2292015-03-03 20:42:48 +08001932 for (i = 0; i < rbio->stripe_npages; i++) {
David Woodhouse53b381b2013-01-29 18:40:14 -05001933 if (faila != -1) {
1934 page = rbio_stripe_page(rbio, faila, i);
1935 SetPageUptodate(page);
1936 }
1937 if (failb != -1) {
1938 page = rbio_stripe_page(rbio, failb, i);
1939 SetPageUptodate(page);
1940 }
1941 }
1942 }
Miao Xie2c8cdd62014-11-14 16:06:25 +08001943 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
David Woodhouse53b381b2013-01-29 18:40:14 -05001944 /*
1945 * if we're rebuilding a read, we have to use
1946 * pages from the bio list
1947 */
Omar Sandovalb4ee1782015-06-19 11:52:50 -07001948 if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1949 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
David Woodhouse53b381b2013-01-29 18:40:14 -05001950 (stripe == faila || stripe == failb)) {
1951 page = page_in_rbio(rbio, stripe, pagenr, 0);
1952 } else {
1953 page = rbio_stripe_page(rbio, stripe, pagenr);
1954 }
1955 kunmap(page);
1956 }
1957 }
1958
Omar Sandoval58efbc92017-08-22 23:45:59 -07001959 err = BLK_STS_OK;
David Woodhouse53b381b2013-01-29 18:40:14 -05001960cleanup:
1961 kfree(pointers);
1962
1963cleanup_io:
Liu Bo580c6ef2018-03-22 09:20:11 +08001964 /*
1965 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
1966 * valid rbio which is consistent with ondisk content, thus such a
1967 * valid rbio can be cached to avoid further disk reads.
1968 */
1969 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1970 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
Liu Bo44ac4742018-01-12 18:07:02 -07001971 /*
1972 * - In case of two failures, where rbio->failb != -1:
1973 *
1974 * Do not cache this rbio since the above read reconstruction
1975 * (raid6_datap_recov() or raid6_2data_recov()) may have
1976 * changed some content of stripes which are not identical to
1977 * on-disk content any more, otherwise, a later write/recover
1978 * may steal stripe_pages from this rbio and end up with
1979 * corruptions or rebuild failures.
1980 *
1981 * - In case of single failure, where rbio->failb == -1:
1982 *
1983 * Cache this rbio iff the above read reconstruction is
Andrea Gelmini52042d82018-11-28 12:05:13 +01001984 * executed without problems.
Liu Bo44ac4742018-01-12 18:07:02 -07001985 */
1986 if (err == BLK_STS_OK && rbio->failb < 0)
Chris Mason4ae10b32013-01-31 14:42:09 -05001987 cache_rbio_pages(rbio);
1988 else
1989 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1990
Christoph Hellwig4246a0b2015-07-20 15:29:37 +02001991 rbio_orig_end_io(rbio, err);
Omar Sandoval58efbc92017-08-22 23:45:59 -07001992 } else if (err == BLK_STS_OK) {
David Woodhouse53b381b2013-01-29 18:40:14 -05001993 rbio->faila = -1;
1994 rbio->failb = -1;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08001995
1996 if (rbio->operation == BTRFS_RBIO_WRITE)
1997 finish_rmw(rbio);
1998 else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
1999 finish_parity_scrub(rbio, 0);
2000 else
2001 BUG();
David Woodhouse53b381b2013-01-29 18:40:14 -05002002 } else {
Christoph Hellwig4246a0b2015-07-20 15:29:37 +02002003 rbio_orig_end_io(rbio, err);
David Woodhouse53b381b2013-01-29 18:40:14 -05002004 }
2005}
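/*
 * Illustrative sketch only, not part of the driver and not called from
 * anywhere: the byte-level identity the recovery path above relies on.
 * Xor-ing the surviving data blocks with P regenerates the missing data
 * block; xor-ing only the data blocks regenerates P itself.
 */
static void __maybe_unused raid5_xor_rebuild_sketch(u8 *dest, u8 * const srcs[],
						    int nr_srcs, size_t len)
{
	size_t i;
	int s;

	memset(dest, 0, len);
	for (s = 0; s < nr_srcs; s++)
		for (i = 0; i < len; i++)
			dest[i] ^= srcs[s][i];
}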
2006
2007/*
2008 * This is called only for stripes we've read from disk to
2009 * reconstruct the parity.
2010 */
Christoph Hellwig4246a0b2015-07-20 15:29:37 +02002011static void raid_recover_end_io(struct bio *bio)
David Woodhouse53b381b2013-01-29 18:40:14 -05002012{
2013 struct btrfs_raid_bio *rbio = bio->bi_private;
2014
2015 /*
2016 * we only read stripe pages off the disk, set them
2017 * up to date if there were no errors
2018 */
Christoph Hellwig4e4cbee2017-06-03 09:38:06 +02002019 if (bio->bi_status)
David Woodhouse53b381b2013-01-29 18:40:14 -05002020 fail_bio_stripe(rbio, bio);
2021 else
2022 set_bio_pages_uptodate(bio);
2023 bio_put(bio);
2024
Miao Xieb89e1b02014-10-15 11:18:44 +08002025 if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse53b381b2013-01-29 18:40:14 -05002026 return;
2027
Miao Xieb89e1b02014-10-15 11:18:44 +08002028 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
Omar Sandoval58efbc92017-08-22 23:45:59 -07002029 rbio_orig_end_io(rbio, BLK_STS_IOERR);
David Woodhouse53b381b2013-01-29 18:40:14 -05002030 else
2031 __raid_recover_end_io(rbio);
2032}
2033
2034/*
2035 * reads everything we need off the disk to reconstruct
2036 * the parity. endio handlers trigger final reconstruction
2037 * when the IO is done.
2038 *
2039 * This is used both for reads from the higher layers and for
2040 * parity construction required to finish a rmw cycle.
2041 */
2042static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
2043{
2044 int bios_to_read = 0;
David Woodhouse53b381b2013-01-29 18:40:14 -05002045 struct bio_list bio_list;
2046 int ret;
David Woodhouse53b381b2013-01-29 18:40:14 -05002047 int pagenr;
2048 int stripe;
2049 struct bio *bio;
2050
2051 bio_list_init(&bio_list);
2052
2053 ret = alloc_rbio_pages(rbio);
2054 if (ret)
2055 goto cleanup;
2056
Miao Xieb89e1b02014-10-15 11:18:44 +08002057 atomic_set(&rbio->error, 0);
David Woodhouse53b381b2013-01-29 18:40:14 -05002058
2059 /*
Chris Mason4ae10b32013-01-31 14:42:09 -05002060 * read everything that hasn't failed. Thanks to the
2061 * stripe cache, it is possible that some or all of these
2062 * pages are going to be uptodate.
David Woodhouse53b381b2013-01-29 18:40:14 -05002063 */
Miao Xie2c8cdd62014-11-14 16:06:25 +08002064 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
Liu Bo55883832014-06-24 15:39:16 +08002065 if (rbio->faila == stripe || rbio->failb == stripe) {
Miao Xieb89e1b02014-10-15 11:18:44 +08002066 atomic_inc(&rbio->error);
David Woodhouse53b381b2013-01-29 18:40:14 -05002067 continue;
Liu Bo55883832014-06-24 15:39:16 +08002068 }
David Woodhouse53b381b2013-01-29 18:40:14 -05002069
Zhao Lei915e2292015-03-03 20:42:48 +08002070 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
David Woodhouse53b381b2013-01-29 18:40:14 -05002071 struct page *p;
2072
2073 /*
2074 * the rmw code may have already read this
2075 * page in
2076 */
2077 p = rbio_stripe_page(rbio, stripe, pagenr);
2078 if (PageUptodate(p))
2079 continue;
2080
2081 ret = rbio_add_io_page(rbio, &bio_list,
2082 rbio_stripe_page(rbio, stripe, pagenr),
2083 stripe, pagenr, rbio->stripe_len);
2084 if (ret < 0)
2085 goto cleanup;
2086 }
2087 }
2088
2089 bios_to_read = bio_list_size(&bio_list);
2090 if (!bios_to_read) {
2091 /*
2092 * we might have no bios to read just because the pages
2093 * were up to date, or we might have no bios to read because
2094 * the devices were gone.
2095 */
Miao Xieb89e1b02014-10-15 11:18:44 +08002096 if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
David Woodhouse53b381b2013-01-29 18:40:14 -05002097 __raid_recover_end_io(rbio);
2098 goto out;
2099 } else {
2100 goto cleanup;
2101 }
2102 }
2103
2104 /*
2105 * the bbio may be freed once we submit the last bio. Make sure
2106 * not to touch it after that
2107 */
Miao Xieb89e1b02014-10-15 11:18:44 +08002108 atomic_set(&rbio->stripes_pending, bios_to_read);
David Woodhouse53b381b2013-01-29 18:40:14 -05002109 while (1) {
2110 bio = bio_list_pop(&bio_list);
2111 if (!bio)
2112 break;
2113
2114 bio->bi_private = rbio;
2115 bio->bi_end_io = raid_recover_end_io;
David Sterbaebcc3262018-06-29 10:56:53 +02002116 bio->bi_opf = REQ_OP_READ;
David Woodhouse53b381b2013-01-29 18:40:14 -05002117
Jeff Mahoney0b246af2016-06-22 18:54:23 -04002118 btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
David Woodhouse53b381b2013-01-29 18:40:14 -05002119
Mike Christie4e49ea42016-06-05 14:31:41 -05002120 submit_bio(bio);
David Woodhouse53b381b2013-01-29 18:40:14 -05002121 }
2122out:
2123 return 0;
2124
2125cleanup:
Omar Sandovalb4ee1782015-06-19 11:52:50 -07002126 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
2127 rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
Omar Sandoval58efbc92017-08-22 23:45:59 -07002128 rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo785884f2017-09-22 12:11:18 -06002129
2130 while ((bio = bio_list_pop(&bio_list)))
2131 bio_put(bio);
2132
David Woodhouse53b381b2013-01-29 18:40:14 -05002133 return -EIO;
2134}
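/*
 * Call-flow sketch of the recover path driven above (descriptive only):
 *
 *	__raid56_parity_recover()   read every surviving stripe page that
 *	                            is not already uptodate
 *	  raid_recover_end_io()     per-bio completion accounting
 *	    __raid_recover_end_io() xor / raid6 math, then either end the
 *	                            original bio or resume the rmw or scrub
 *	                            operation that needed the rebuild
 */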
2135
2136/*
2137 * the main entry point for reads from the higher layers. This
2138 * is really only called when the normal read path had a failure,
2139 * so we assume the bio they send down corresponds to a failed part
2140 * of the drive.
2141 */
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04002142int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
Zhao Lei8e5cfb52015-01-20 15:11:33 +08002143 struct btrfs_bio *bbio, u64 stripe_len,
2144 int mirror_num, int generic_io)
David Woodhouse53b381b2013-01-29 18:40:14 -05002145{
2146 struct btrfs_raid_bio *rbio;
2147 int ret;
2148
Liu Boabad60c2017-03-29 10:54:26 -07002149 if (generic_io) {
2150 ASSERT(bbio->mirror_num == mirror_num);
2151 btrfs_io_bio(bio)->mirror_num = mirror_num;
2152 }
2153
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04002154 rbio = alloc_rbio(fs_info, bbio, stripe_len);
Miao Xieaf8e2d12014-10-23 14:42:50 +08002155 if (IS_ERR(rbio)) {
Zhao Lei6e9606d2015-01-20 15:11:34 +08002156 if (generic_io)
2157 btrfs_put_bbio(bbio);
David Woodhouse53b381b2013-01-29 18:40:14 -05002158 return PTR_ERR(rbio);
Miao Xieaf8e2d12014-10-23 14:42:50 +08002159 }
David Woodhouse53b381b2013-01-29 18:40:14 -05002160
Miao Xie1b94b552014-11-06 16:14:21 +08002161 rbio->operation = BTRFS_RBIO_READ_REBUILD;
David Woodhouse53b381b2013-01-29 18:40:14 -05002162 bio_list_add(&rbio->bio_list, bio);
Kent Overstreet4f024f32013-10-11 15:44:27 -07002163 rbio->bio_list_bytes = bio->bi_iter.bi_size;
David Woodhouse53b381b2013-01-29 18:40:14 -05002164
2165 rbio->faila = find_logical_bio_stripe(rbio, bio);
2166 if (rbio->faila == -1) {
Jeff Mahoney0b246af2016-06-22 18:54:23 -04002167 btrfs_warn(fs_info,
Liu Boe46a28c2016-07-29 10:57:55 -07002168 "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)",
2169 __func__, (u64)bio->bi_iter.bi_sector << 9,
2170 (u64)bio->bi_iter.bi_size, bbio->map_type);
Zhao Lei6e9606d2015-01-20 15:11:34 +08002171 if (generic_io)
2172 btrfs_put_bbio(bbio);
David Woodhouse53b381b2013-01-29 18:40:14 -05002173 kfree(rbio);
2174 return -EIO;
2175 }
2176
Miao Xie42452152014-11-25 16:39:28 +08002177 if (generic_io) {
Jeff Mahoney0b246af2016-06-22 18:54:23 -04002178 btrfs_bio_counter_inc_noblocked(fs_info);
Miao Xie42452152014-11-25 16:39:28 +08002179 rbio->generic_bio_cnt = 1;
2180 } else {
Zhao Lei6e9606d2015-01-20 15:11:34 +08002181 btrfs_get_bbio(bbio);
Miao Xie42452152014-11-25 16:39:28 +08002182 }
2183
David Woodhouse53b381b2013-01-29 18:40:14 -05002184 /*
Liu Bo8810f752018-01-02 13:36:41 -07002185 * Loop retry:
2186	 * for 'mirror_num == 2', reconstruct from all other stripes.
2187 * for 'mirror_num > 2', select a stripe to fail on every retry.
David Woodhouse53b381b2013-01-29 18:40:14 -05002188 */
Liu Bo8810f752018-01-02 13:36:41 -07002189 if (mirror_num > 2) {
2190 /*
2191		 * 'mirror_num == 3' fails the P stripe and
2192		 * reconstructs from the Q stripe. 'mirror_num > 3' fails
2193		 * a data stripe and reconstructs from the P+Q stripes.
2194 */
2195 rbio->failb = rbio->real_stripes - (mirror_num - 1);
2196 ASSERT(rbio->failb > 0);
2197 if (rbio->failb <= rbio->faila)
2198 rbio->failb--;
2199 }
David Woodhouse53b381b2013-01-29 18:40:14 -05002200
2201 ret = lock_stripe_add(rbio);
2202
2203 /*
2204 * __raid56_parity_recover will end the bio with
2205 * any errors it hits. We don't want to return
2206 * its error value up the stack because our caller
2207 * will end up calling bio_endio with any nonzero
2208 * return
2209 */
2210 if (ret == 0)
2211 __raid56_parity_recover(rbio);
2212 /*
2213 * our rbio has been added to the list of
2214 * rbios that will be handled after the
2215 * currently lock owner is done
2216 */
2217 return 0;
2218
2219}
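/*
 * Worked example for the mirror_num handling above, assuming a
 * hypothetical 4-device RAID6 layout (2 data + P + Q, real_stripes == 4):
 *
 *	mirror_num == 2: only the stripe holding the bio is failed (faila),
 *	                 the rebuild uses every other stripe
 *	mirror_num == 3: failb = 4 - (3 - 1) = 2, the P stripe, forcing the
 *	                 rebuild to use Q
 *	mirror_num == 4: failb = 1, an extra data stripe is failed and the
 *	                 rebuild must use both P and Q
 */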
2220
2221static void rmw_work(struct btrfs_work *work)
2222{
2223 struct btrfs_raid_bio *rbio;
2224
2225 rbio = container_of(work, struct btrfs_raid_bio, work);
2226 raid56_rmw_stripe(rbio);
2227}
2228
2229static void read_rebuild_work(struct btrfs_work *work)
2230{
2231 struct btrfs_raid_bio *rbio;
2232
2233 rbio = container_of(work, struct btrfs_raid_bio, work);
2234 __raid56_parity_recover(rbio);
2235}
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002236
2237/*
2238 * The following code is used to scrub/replace the parity stripe
2239 *
Qu Wenruoae6529c2017-03-29 09:33:21 +08002240 * Caller must have already increased bio_counter for getting @bbio.
2241 *
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002242 * Note: We must make sure all the pages that are added to the scrub/replace
2243 * raid bio are correct and will not change during the scrub/replace, i.e.
2244 * those pages only hold metadata or file data protected by a checksum.
2245 */
2246
2247struct btrfs_raid_bio *
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04002248raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
Zhao Lei8e5cfb52015-01-20 15:11:33 +08002249 struct btrfs_bio *bbio, u64 stripe_len,
2250 struct btrfs_device *scrub_dev,
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002251 unsigned long *dbitmap, int stripe_nsectors)
2252{
2253 struct btrfs_raid_bio *rbio;
2254 int i;
2255
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04002256 rbio = alloc_rbio(fs_info, bbio, stripe_len);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002257 if (IS_ERR(rbio))
2258 return NULL;
2259 bio_list_add(&rbio->bio_list, bio);
2260 /*
2261 * This is a special bio which is used to hold the completion handler
2262	 * and to make the scrub rbio similar to the other rbio types
2263 */
2264 ASSERT(!bio->bi_iter.bi_size);
2265 rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2266
Liu Bo9cd3a7e2017-08-03 13:53:31 -06002267 /*
2268 * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted
2269 * to the end position, so this search can start from the first parity
2270 * stripe.
2271 */
2272 for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002273 if (bbio->stripes[i].dev == scrub_dev) {
2274 rbio->scrubp = i;
2275 break;
2276 }
2277 }
Liu Bo9cd3a7e2017-08-03 13:53:31 -06002278 ASSERT(i < rbio->real_stripes);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002279
2280 /* Now we just support the sectorsize equals to page size */
Jeff Mahoney0b246af2016-06-22 18:54:23 -04002281 ASSERT(fs_info->sectorsize == PAGE_SIZE);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002282 ASSERT(rbio->stripe_npages == stripe_nsectors);
2283 bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
2284
Qu Wenruoae6529c2017-03-29 09:33:21 +08002285 /*
2286 * We have already increased bio_counter when getting bbio, record it
2287 * so we can free it at rbio_orig_end_io().
2288 */
2289 rbio->generic_bio_cnt = 1;
2290
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002291 return rbio;
2292}
2293
Omar Sandovalb4ee1782015-06-19 11:52:50 -07002294/* Used for both parity scrub and missing. */
2295void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
2296 u64 logical)
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002297{
2298 int stripe_offset;
2299 int index;
2300
Zhao Lei8e5cfb52015-01-20 15:11:33 +08002301 ASSERT(logical >= rbio->bbio->raid_map[0]);
2302 ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002303 rbio->stripe_len * rbio->nr_data);
Zhao Lei8e5cfb52015-01-20 15:11:33 +08002304 stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03002305 index = stripe_offset >> PAGE_SHIFT;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002306 rbio->bio_pages[index] = page;
2307}
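/*
 * The index math above, with hypothetical numbers: stripe_len == 64K,
 * PAGE_SIZE == 4K, raid_map[0] == 1M.  A page at logical 1M + 80K gives
 * stripe_offset == 80K and index == 20, i.e. the 5th page of the second
 * data stripe in rbio->bio_pages[].
 */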
2308
2309/*
2310 * We only scrub the parity of the horizontal stripes for which we have
2311 * correct data, so we don't need to allocate pages for every stripe.
2312 */
2313static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2314{
2315 int i;
2316 int bit;
2317 int index;
2318 struct page *page;
2319
2320 for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
Miao Xie2c8cdd62014-11-14 16:06:25 +08002321 for (i = 0; i < rbio->real_stripes; i++) {
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002322 index = i * rbio->stripe_npages + bit;
2323 if (rbio->stripe_pages[index])
2324 continue;
2325
2326 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2327 if (!page)
2328 return -ENOMEM;
2329 rbio->stripe_pages[index] = page;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002330 }
2331 }
2332 return 0;
2333}
2334
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002335static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
2336 int need_check)
2337{
Miao Xie76035972014-11-14 17:45:42 +08002338 struct btrfs_bio *bbio = rbio->bbio;
Kees Cook13890532018-05-29 16:44:59 -07002339 void **pointers = rbio->finish_pointers;
2340 unsigned long *pbitmap = rbio->finish_pbitmap;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002341 int nr_data = rbio->nr_data;
2342 int stripe;
2343 int pagenr;
2344 int p_stripe = -1;
2345 int q_stripe = -1;
2346 struct page *p_page = NULL;
2347 struct page *q_page = NULL;
2348 struct bio_list bio_list;
2349 struct bio *bio;
Miao Xie76035972014-11-14 17:45:42 +08002350 int is_replace = 0;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002351 int ret;
2352
2353 bio_list_init(&bio_list);
2354
Miao Xie2c8cdd62014-11-14 16:06:25 +08002355 if (rbio->real_stripes - rbio->nr_data == 1) {
2356 p_stripe = rbio->real_stripes - 1;
2357 } else if (rbio->real_stripes - rbio->nr_data == 2) {
2358 p_stripe = rbio->real_stripes - 2;
2359 q_stripe = rbio->real_stripes - 1;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002360 } else {
2361 BUG();
2362 }
2363
Miao Xie76035972014-11-14 17:45:42 +08002364 if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
2365 is_replace = 1;
2366 bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
2367 }
2368
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002369 /*
2370	 * The higher layers (the scrubber) are unlikely to use
2371	 * this area of the disk again soon, so don't cache
2372	 * it.
2373 */
2374 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2375
2376 if (!need_check)
2377 goto writeback;
2378
2379 p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2380 if (!p_page)
2381 goto cleanup;
2382 SetPageUptodate(p_page);
2383
2384 if (q_stripe != -1) {
2385 q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2386 if (!q_page) {
2387 __free_page(p_page);
2388 goto cleanup;
2389 }
2390 SetPageUptodate(q_page);
2391 }
2392
2393 atomic_set(&rbio->error, 0);
2394
2395 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2396 struct page *p;
2397 void *parity;
2398 /* first collect one page from each data stripe */
2399 for (stripe = 0; stripe < nr_data; stripe++) {
2400 p = page_in_rbio(rbio, stripe, pagenr, 0);
2401 pointers[stripe] = kmap(p);
2402 }
2403
2404 /* then add the parity stripe */
2405 pointers[stripe++] = kmap(p_page);
2406
2407 if (q_stripe != -1) {
2408
2409 /*
2410 * raid6, add the qstripe and call the
2411 * library function to fill in our p/q
2412 */
2413 pointers[stripe++] = kmap(q_page);
2414
Miao Xie2c8cdd62014-11-14 16:06:25 +08002415 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002416 pointers);
2417 } else {
2418 /* raid5 */
David Sterba69d24802018-06-29 10:56:44 +02002419 copy_page(pointers[nr_data], pointers[0]);
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03002420 run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002421 }
2422
Nicholas D Steeves01327612016-05-19 21:18:45 -04002423 /* Check scrubbing parity and repair it */
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002424 p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2425 parity = kmap(p);
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03002426 if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
David Sterba69d24802018-06-29 10:56:44 +02002427 copy_page(parity, pointers[rbio->scrubp]);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002428 else
2429 /* Parity is right, needn't writeback */
2430 bitmap_clear(rbio->dbitmap, pagenr, 1);
2431 kunmap(p);
2432
Andrea Righi3897b6f2019-03-14 08:56:28 +01002433 for (stripe = 0; stripe < nr_data; stripe++)
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002434 kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
Andrea Righi3897b6f2019-03-14 08:56:28 +01002435 kunmap(p_page);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002436 }
2437
2438 __free_page(p_page);
2439 if (q_page)
2440 __free_page(q_page);
2441
2442writeback:
2443 /*
2444 * time to start writing. Make bios for everything from the
2445 * higher layers (the bio_list in our rbio) and our p/q. Ignore
2446 * everything else.
2447 */
2448 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2449 struct page *page;
2450
2451 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2452 ret = rbio_add_io_page(rbio, &bio_list,
2453 page, rbio->scrubp, pagenr, rbio->stripe_len);
2454 if (ret)
2455 goto cleanup;
2456 }
2457
Miao Xie76035972014-11-14 17:45:42 +08002458 if (!is_replace)
2459 goto submit_write;
2460
2461 for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
2462 struct page *page;
2463
2464 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2465 ret = rbio_add_io_page(rbio, &bio_list, page,
2466 bbio->tgtdev_map[rbio->scrubp],
2467 pagenr, rbio->stripe_len);
2468 if (ret)
2469 goto cleanup;
2470 }
2471
2472submit_write:
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002473 nr_data = bio_list_size(&bio_list);
2474 if (!nr_data) {
2475 /* Every parity is right */
Omar Sandoval58efbc92017-08-22 23:45:59 -07002476 rbio_orig_end_io(rbio, BLK_STS_OK);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002477 return;
2478 }
2479
2480 atomic_set(&rbio->stripes_pending, nr_data);
2481
2482 while (1) {
2483 bio = bio_list_pop(&bio_list);
2484 if (!bio)
2485 break;
2486
2487 bio->bi_private = rbio;
Zhao Leia6111d11b2016-01-12 17:52:13 +08002488 bio->bi_end_io = raid_write_end_io;
David Sterbaebcc3262018-06-29 10:56:53 +02002489 bio->bi_opf = REQ_OP_WRITE;
Mike Christie4e49ea42016-06-05 14:31:41 -05002490
2491 submit_bio(bio);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002492 }
2493 return;
2494
2495cleanup:
Omar Sandoval58efbc92017-08-22 23:45:59 -07002496 rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo785884f2017-09-22 12:11:18 -06002497
2498 while ((bio = bio_list_pop(&bio_list)))
2499 bio_put(bio);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002500}
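/*
 * Condensed sketch of the scrub check above (descriptive pseudo-code):
 *
 *	for each page set in dbitmap:
 *		recompute parity from the data pages (xor or gen_syndrome)
 *		if it differs from the on-disk parity page:
 *			copy the recomputed parity over it, keep the bit
 *		else:
 *			clear the bit, nothing to write back
 *
 * Only pages whose bits survive, plus the replace target when
 * bbio->num_tgtdevs is set, are submitted for write-back.
 */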
2501
2502static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2503{
2504 if (stripe >= 0 && stripe < rbio->nr_data)
2505 return 1;
2506 return 0;
2507}
2508
2509/*
2510 * While we're doing the parity check and repair, we could have errors
2511 * in reading pages off the disk. This checks for errors and if we're
2512 * not able to read the page it'll trigger parity reconstruction. The
2513 * parity scrub will be finished after we've reconstructed the failed
2514 * stripes
2515 */
2516static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
2517{
2518 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
2519 goto cleanup;
2520
2521 if (rbio->faila >= 0 || rbio->failb >= 0) {
2522 int dfail = 0, failp = -1;
2523
2524 if (is_data_stripe(rbio, rbio->faila))
2525 dfail++;
2526 else if (is_parity_stripe(rbio->faila))
2527 failp = rbio->faila;
2528
2529 if (is_data_stripe(rbio, rbio->failb))
2530 dfail++;
2531 else if (is_parity_stripe(rbio->failb))
2532 failp = rbio->failb;
2533
2534 /*
2535		 * Because we cannot use the parity being scrubbed to repair
2536		 * the data, our repair capability is reduced.
2537		 * (In the case of RAID5, we cannot repair anything.)
2538 */
2539 if (dfail > rbio->bbio->max_errors - 1)
2540 goto cleanup;
2541
2542 /*
2543		 * If all the data is good and only the parity is bad, just
2544		 * repair the parity.
2545 */
2546 if (dfail == 0) {
2547 finish_parity_scrub(rbio, 0);
2548 return;
2549 }
2550
2551 /*
2552		 * At this point we have one corrupted data stripe and one
2553		 * corrupted parity on RAID6. If the corrupted parity is the
Nicholas D Steeves01327612016-05-19 21:18:45 -04002554		 * one being scrubbed, we can luckily use the other parity to
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002555		 * repair the data; otherwise we cannot repair the data stripe.
2556 */
2557 if (failp != rbio->scrubp)
2558 goto cleanup;
2559
2560 __raid_recover_end_io(rbio);
2561 } else {
2562 finish_parity_scrub(rbio, 1);
2563 }
2564 return;
2565
2566cleanup:
Omar Sandoval58efbc92017-08-22 23:45:59 -07002567 rbio_orig_end_io(rbio, BLK_STS_IOERR);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002568}
2569
2570/*
2571 * end io for the read phase of the scrub/replace cycle. All the bios here
2572 * are physical stripe bios we've read from the disk so we can recalculate
2573 * the parity of the stripe.
2574 *
2575 * This will usually kick off finish_parity_scrub once all the bios are read
2576 * in, but it may trigger parity reconstruction if we had any errors along the way
2577 */
Christoph Hellwig4246a0b2015-07-20 15:29:37 +02002578static void raid56_parity_scrub_end_io(struct bio *bio)
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002579{
2580 struct btrfs_raid_bio *rbio = bio->bi_private;
2581
Christoph Hellwig4e4cbee2017-06-03 09:38:06 +02002582 if (bio->bi_status)
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002583 fail_bio_stripe(rbio, bio);
2584 else
2585 set_bio_pages_uptodate(bio);
2586
2587 bio_put(bio);
2588
2589 if (!atomic_dec_and_test(&rbio->stripes_pending))
2590 return;
2591
2592 /*
2593 * this will normally call finish_rmw to start our write
2594 * but if there are any failed stripes we'll reconstruct
2595 * from parity first
2596 */
2597 validate_rbio_for_parity_scrub(rbio);
2598}
2599
2600static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
2601{
2602 int bios_to_read = 0;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002603 struct bio_list bio_list;
2604 int ret;
2605 int pagenr;
2606 int stripe;
2607 struct bio *bio;
2608
Liu Bo785884f2017-09-22 12:11:18 -06002609 bio_list_init(&bio_list);
2610
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002611 ret = alloc_rbio_essential_pages(rbio);
2612 if (ret)
2613 goto cleanup;
2614
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002615 atomic_set(&rbio->error, 0);
2616 /*
2617 * build a list of bios to read all the missing parts of this
2618 * stripe
2619 */
Miao Xie2c8cdd62014-11-14 16:06:25 +08002620 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002621 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2622 struct page *page;
2623 /*
2624 * we want to find all the pages missing from
2625 * the rbio and read them from the disk. If
2626 * page_in_rbio finds a page in the bio list
2627 * we don't need to read it off the stripe.
2628 */
2629 page = page_in_rbio(rbio, stripe, pagenr, 1);
2630 if (page)
2631 continue;
2632
2633 page = rbio_stripe_page(rbio, stripe, pagenr);
2634 /*
2635 * the bio cache may have handed us an uptodate
2636 * page. If so, be happy and use it
2637 */
2638 if (PageUptodate(page))
2639 continue;
2640
2641 ret = rbio_add_io_page(rbio, &bio_list, page,
2642 stripe, pagenr, rbio->stripe_len);
2643 if (ret)
2644 goto cleanup;
2645 }
2646 }
2647
2648 bios_to_read = bio_list_size(&bio_list);
2649 if (!bios_to_read) {
2650 /*
2651 * this can happen if others have merged with
2652		 * us; it means there is nothing left to read.
2653 * But if there are missing devices it may not be
2654 * safe to do the full stripe write yet.
2655 */
2656 goto finish;
2657 }
2658
2659 /*
2660 * the bbio may be freed once we submit the last bio. Make sure
2661 * not to touch it after that
2662 */
2663 atomic_set(&rbio->stripes_pending, bios_to_read);
2664 while (1) {
2665 bio = bio_list_pop(&bio_list);
2666 if (!bio)
2667 break;
2668
2669 bio->bi_private = rbio;
2670 bio->bi_end_io = raid56_parity_scrub_end_io;
David Sterbaebcc3262018-06-29 10:56:53 +02002671 bio->bi_opf = REQ_OP_READ;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002672
Jeff Mahoney0b246af2016-06-22 18:54:23 -04002673 btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002674
Mike Christie4e49ea42016-06-05 14:31:41 -05002675 submit_bio(bio);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002676 }
2677 /* the actual write will happen once the reads are done */
2678 return;
2679
2680cleanup:
Omar Sandoval58efbc92017-08-22 23:45:59 -07002681 rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo785884f2017-09-22 12:11:18 -06002682
2683 while ((bio = bio_list_pop(&bio_list)))
2684 bio_put(bio);
2685
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002686 return;
2687
2688finish:
2689 validate_rbio_for_parity_scrub(rbio);
2690}
2691
2692static void scrub_parity_work(struct btrfs_work *work)
2693{
2694 struct btrfs_raid_bio *rbio;
2695
2696 rbio = container_of(work, struct btrfs_raid_bio, work);
2697 raid56_parity_scrub_stripe(rbio);
2698}
2699
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002700void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2701{
2702 if (!lock_stripe_add(rbio))
David Sterbaa81b7472018-06-29 10:57:03 +02002703 start_async_work(rbio, scrub_parity_work);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002704}
Omar Sandovalb4ee1782015-06-19 11:52:50 -07002705
2706/* The following code is used for dev replace of a missing RAID 5/6 device. */
2707
2708struct btrfs_raid_bio *
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04002709raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
Omar Sandovalb4ee1782015-06-19 11:52:50 -07002710 struct btrfs_bio *bbio, u64 length)
2711{
2712 struct btrfs_raid_bio *rbio;
2713
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04002714 rbio = alloc_rbio(fs_info, bbio, length);
Omar Sandovalb4ee1782015-06-19 11:52:50 -07002715 if (IS_ERR(rbio))
2716 return NULL;
2717
2718 rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
2719 bio_list_add(&rbio->bio_list, bio);
2720 /*
2721 * This is a special bio which is used to hold the completion handler
2722	 * and to make this rbio similar to the other rbio types
2723 */
2724 ASSERT(!bio->bi_iter.bi_size);
2725
2726 rbio->faila = find_logical_bio_stripe(rbio, bio);
2727 if (rbio->faila == -1) {
2728 BUG();
2729 kfree(rbio);
2730 return NULL;
2731 }
2732
Qu Wenruoae6529c2017-03-29 09:33:21 +08002733 /*
2734 * When we get bbio, we have already increased bio_counter, record it
2735 * so we can free it at rbio_orig_end_io()
2736 */
2737 rbio->generic_bio_cnt = 1;
2738
Omar Sandovalb4ee1782015-06-19 11:52:50 -07002739 return rbio;
2740}
2741
Omar Sandovalb4ee1782015-06-19 11:52:50 -07002742void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
2743{
2744 if (!lock_stripe_add(rbio))
David Sterbae66d8d52018-06-29 10:57:00 +02002745 start_async_work(rbio, read_rebuild_work);
Omar Sandovalb4ee1782015-06-19 11:52:50 -07002746}