Blame - fs/btrfs/raid56.c - kernel/msm-4.9

blob: fa72068bd256018e27a13fbc8326b82ce49b207b [file] [log] [blame]

David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1	/*
				2	* Copyright (C) 2012 Fusion-io All rights reserved.
				3	* Copyright (C) 2012 Intel Corp. All rights reserved.
				4	*
				5	* This program is free software; you can redistribute it and/or
				6	* modify it under the terms of the GNU General Public
				7	* License v2 as published by the Free Software Foundation.
				8	*
				9	* This program is distributed in the hope that it will be useful,
				10	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				11	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				12	* General Public License for more details.
				13	*
				14	* You should have received a copy of the GNU General Public
				15	* License along with this program; if not, write to the
				16	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				17	* Boston, MA 02111-1307, USA.
				18	*/
				19	#include <linux/sched.h>
				20	#include <linux/wait.h>
				21	#include <linux/bio.h>
				22	#include <linux/slab.h>
				23	#include <linux/buffer_head.h>
				24	#include <linux/blkdev.h>
				25	#include <linux/random.h>
				26	#include <linux/iocontext.h>
				27	#include <linux/capability.h>
				28	#include <linux/ratelimit.h>
				29	#include <linux/kthread.h>
				30	#include <linux/raid/pq.h>
				31	#include <linux/hash.h>
				32	#include <linux/list_sort.h>
				33	#include <linux/raid/xor.h>
Geert Uytterhoeven	d7011f5	2013-03-03 04:44:41 -0700	[diff] [blame]	34	#include <linux/vmalloc.h>
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	35	#include <asm/div64.h>
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	36	#include "ctree.h"
				37	#include "extent_map.h"
				38	#include "disk-io.h"
				39	#include "transaction.h"
				40	#include "print-tree.h"
				41	#include "volumes.h"
				42	#include "raid56.h"
				43	#include "async-thread.h"
				44	#include "check-integrity.h"
				45	#include "rcu-string.h"
				46
				47	/* set when additional merges to this rbio are not allowed */
				48	#define RBIO_RMW_LOCKED_BIT 1
				49
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	50	/*
				51	* set when this rbio is sitting in the hash, but it is just a cache
				52	* of past RMW
				53	*/
				54	#define RBIO_CACHE_BIT 2
				55
				56	/*
				57	* set when it is safe to trust the stripe_pages for caching
				58	*/
				59	#define RBIO_CACHE_READY_BIT 3
				60
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	61	#define RBIO_CACHE_SIZE 1024
				62
/* the kind of work an rbio performs; stored in btrfs_raid_bio::operation */
enum btrfs_rbio_ops {
	BTRFS_RBIO_WRITE	= 0,
	BTRFS_RBIO_READ_REBUILD	= 1,
	BTRFS_RBIO_PARITY_SCRUB	= 2,
};
				68
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	69	struct btrfs_raid_bio {
				70	struct btrfs_fs_info *fs_info;
				71	struct btrfs_bio *bbio;
				72
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	73	/* while we're doing rmw on a stripe
				74	* we put it into a hash table so we can
				75	* lock the stripe and merge more rbios
				76	* into it.
				77	*/
				78	struct list_head hash_list;
				79
				80	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	81	* LRU list for the stripe cache
				82	*/
				83	struct list_head stripe_cache;
				84
				85	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	86	* for scheduling work in the helper threads
				87	*/
				88	struct btrfs_work work;
				89
				90	/*
				91	* bio list and bio_list_lock are used
				92	* to add more bios into the stripe
				93	* in hopes of avoiding the full rmw
				94	*/
				95	struct bio_list bio_list;
				96	spinlock_t bio_list_lock;
				97
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	98	/* also protected by the bio_list_lock, the
				99	* plug list is used by the plugging code
				100	* to collect partial bios while plugged. The
				101	* stripe locking code also uses it to hand off
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	102	* the stripe lock to the next pending IO
				103	*/
				104	struct list_head plug_list;
				105
				106	/*
				107	* flags that tell us if it is safe to
				108	* merge with this bio
				109	*/
				110	unsigned long flags;
				111
				112	/* size of each individual stripe on disk */
				113	int stripe_len;
				114
				115	/* number of data stripes (no p/q) */
				116	int nr_data;
				117
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	118	int real_stripes;
				119
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	120	int stripe_npages;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	121	/*
				122	* set if we're doing a parity rebuild
				123	* for a read from higher up, which is handled
				124	* differently from a parity rebuild as part of
				125	* rmw
				126	*/
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	127	enum btrfs_rbio_ops operation;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	128
				129	/* first bad stripe */
				130	int faila;
				131
				132	/* second bad stripe (for raid6 use) */
				133	int failb;
				134
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	135	int scrubp;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	136	/*
				137	* number of pages needed to represent the full
				138	* stripe
				139	*/
				140	int nr_pages;
				141
				142	/*
				143	* size of all the bios in the bio_list. This
				144	* helps us decide if the rbio maps to a full
				145	* stripe or not
				146	*/
				147	int bio_list_bytes;
				148
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	149	int generic_bio_cnt;
				150
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	151	atomic_t refs;
				152
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	153	atomic_t stripes_pending;
				154
				155	atomic_t error;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	156	/*
				157	* these are two arrays of pointers. We allocate the
				158	* rbio big enough to hold them both and setup their
				159	* locations when the rbio is allocated
				160	*/
				161
				162	/* pointers to pages that we allocated for
				163	* reading/writing stripes directly from the disk (including P/Q)
				164	*/
				165	struct page **stripe_pages;
				166
				167	/*
				168	* pointers to the pages in the bio_list. Stored
				169	* here for faster lookup
				170	*/
				171	struct page **bio_pages;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	172
				173	/*
				174	* bitmap to record which horizontal stripe has data
				175	*/
				176	unsigned long *dbitmap;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	177	};
				178
				179	static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
				180	static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
				181	static void rmw_work(struct btrfs_work *work);
				182	static void read_rebuild_work(struct btrfs_work *work);
				183	static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
				184	static void async_read_rebuild(struct btrfs_raid_bio *rbio);
				185	static int fail_bio_stripe(struct btrfs_raid_bio rbio, struct bio bio);
				186	static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
				187	static void __free_raid_bio(struct btrfs_raid_bio *rbio);
				188	static void index_rbio_pages(struct btrfs_raid_bio *rbio);
				189	static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
				190
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	191	static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
				192	int need_check);
				193	static void async_scrub_parity(struct btrfs_raid_bio *rbio);
				194
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	195	/*
				196	* the stripe hash table is used for locking, and to collect
				197	* bios in hopes of making a full stripe
				198	*/
				199	int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
				200	{
				201	struct btrfs_stripe_hash_table *table;
				202	struct btrfs_stripe_hash_table *x;
				203	struct btrfs_stripe_hash *cur;
				204	struct btrfs_stripe_hash *h;
				205	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
				206	int i;
David Sterba	83c8266	2013-03-01 15:03:00 +0000	[diff] [blame]	207	int table_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	208
				209	if (info->stripe_hash_table)
				210	return 0;
				211
David Sterba	83c8266	2013-03-01 15:03:00 +0000	[diff] [blame]	212	/*
				213	* The table is large, starting with order 4 and can go as high as
				214	* order 7 in case lock debugging is turned on.
				215	*
				216	* Try harder to allocate and fallback to vmalloc to lower the chance
				217	* of a failing mount.
				218	*/
				219	table_size = sizeof(table) + sizeof(h) * num_entries;
				220	table = kzalloc(table_size, GFP_KERNEL \| __GFP_NOWARN \| __GFP_REPEAT);
				221	if (!table) {
				222	table = vzalloc(table_size);
				223	if (!table)
				224	return -ENOMEM;
				225	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	226
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	227	spin_lock_init(&table->cache_lock);
				228	INIT_LIST_HEAD(&table->stripe_cache);
				229
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	230	h = table->table;
				231
				232	for (i = 0; i < num_entries; i++) {
				233	cur = h + i;
				234	INIT_LIST_HEAD(&cur->hash_list);
				235	spin_lock_init(&cur->lock);
				236	init_waitqueue_head(&cur->wait);
				237	}
				238
				239	x = cmpxchg(&info->stripe_hash_table, NULL, table);
Wang Shilong	f749303	2014-11-22 21:13:10 +0800	[diff] [blame]	240	if (x)
				241	kvfree(x);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	242	return 0;
				243	}
				244
				245	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	246	* caching an rbio means to copy anything from the
				247	* bio_pages array into the stripe_pages array. We
				248	* use the page uptodate bit in the stripe cache array
				249	* to indicate if it has valid data
				250	*
				251	* once the caching is done, we set the cache ready
				252	* bit.
				253	*/
				254	static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
				255	{
				256	int i;
				257	char *s;
				258	char *d;
				259	int ret;
				260
				261	ret = alloc_rbio_pages(rbio);
				262	if (ret)
				263	return;
				264
				265	for (i = 0; i < rbio->nr_pages; i++) {
				266	if (!rbio->bio_pages[i])
				267	continue;
				268
				269	s = kmap(rbio->bio_pages[i]);
				270	d = kmap(rbio->stripe_pages[i]);
				271
				272	memcpy(d, s, PAGE_CACHE_SIZE);
				273
				274	kunmap(rbio->bio_pages[i]);
				275	kunmap(rbio->stripe_pages[i]);
				276	SetPageUptodate(rbio->stripe_pages[i]);
				277	}
				278	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				279	}
				280
				281	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	282	* we hash on the first logical address of the stripe
				283	*/
				284	static int rbio_bucket(struct btrfs_raid_bio *rbio)
				285	{
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	286	u64 num = rbio->bbio->raid_map[0];
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	287
				288	/*
				289	* we shift down quite a bit. We're using byte
				290	* addressing, and most of the lower bits are zeros.
				291	* This tends to upset hash_64, and it consistently
				292	* returns just one or two different values.
				293	*
				294	* shifting off the lower bits fixes things.
				295	*/
				296	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
				297	}
				298
				299	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	300	* stealing an rbio means taking all the uptodate pages from the stripe
				301	* array in the source rbio and putting them into the destination rbio
				302	*/
				303	static void steal_rbio(struct btrfs_raid_bio src, struct btrfs_raid_bio dest)
				304	{
				305	int i;
				306	struct page *s;
				307	struct page *d;
				308
				309	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
				310	return;
				311
				312	for (i = 0; i < dest->nr_pages; i++) {
				313	s = src->stripe_pages[i];
				314	if (!s \|\| !PageUptodate(s)) {
				315	continue;
				316	}
				317
				318	d = dest->stripe_pages[i];
				319	if (d)
				320	__free_page(d);
				321
				322	dest->stripe_pages[i] = s;
				323	src->stripe_pages[i] = NULL;
				324	}
				325	}
				326
				327	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	328	* merging means we take the bio_list from the victim and
				329	* splice it into the destination. The victim should
				330	* be discarded afterwards.
				331	*
				332	* must be called with dest->rbio_list_lock held
				333	*/
				334	static void merge_rbio(struct btrfs_raid_bio *dest,
				335	struct btrfs_raid_bio *victim)
				336	{
				337	bio_list_merge(&dest->bio_list, &victim->bio_list);
				338	dest->bio_list_bytes += victim->bio_list_bytes;
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	339	dest->generic_bio_cnt += victim->generic_bio_cnt;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	340	bio_list_init(&victim->bio_list);
				341	}
				342
				343	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	344	* used to prune items that are in the cache. The caller
				345	* must hold the hash table lock.
				346	*/
				347	static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
				348	{
				349	int bucket = rbio_bucket(rbio);
				350	struct btrfs_stripe_hash_table *table;
				351	struct btrfs_stripe_hash *h;
				352	int freeit = 0;
				353
				354	/*
				355	* check the bit again under the hash table lock.
				356	*/
				357	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
				358	return;
				359
				360	table = rbio->fs_info->stripe_hash_table;
				361	h = table->table + bucket;
				362
				363	/* hold the lock for the bucket because we may be
				364	* removing it from the hash table
				365	*/
				366	spin_lock(&h->lock);
				367
				368	/*
				369	* hold the lock for the bio list because we need
				370	* to make sure the bio list is empty
				371	*/
				372	spin_lock(&rbio->bio_list_lock);
				373
				374	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
				375	list_del_init(&rbio->stripe_cache);
				376	table->cache_size -= 1;
				377	freeit = 1;
				378
				379	/* if the bio list isn't empty, this rbio is
				380	* still involved in an IO. We take it out
				381	* of the cache list, and drop the ref that
				382	* was held for the list.
				383	*
				384	* If the bio_list was empty, we also remove
				385	* the rbio from the hash_table, and drop
				386	* the corresponding ref
				387	*/
				388	if (bio_list_empty(&rbio->bio_list)) {
				389	if (!list_empty(&rbio->hash_list)) {
				390	list_del_init(&rbio->hash_list);
				391	atomic_dec(&rbio->refs);
				392	BUG_ON(!list_empty(&rbio->plug_list));
				393	}
				394	}
				395	}
				396
				397	spin_unlock(&rbio->bio_list_lock);
				398	spin_unlock(&h->lock);
				399
				400	if (freeit)
				401	__free_raid_bio(rbio);
				402	}
				403
				404	/*
				405	* prune a given rbio from the cache
				406	*/
				407	static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
				408	{
				409	struct btrfs_stripe_hash_table *table;
				410	unsigned long flags;
				411
				412	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
				413	return;
				414
				415	table = rbio->fs_info->stripe_hash_table;
				416
				417	spin_lock_irqsave(&table->cache_lock, flags);
				418	__remove_rbio_from_cache(rbio);
				419	spin_unlock_irqrestore(&table->cache_lock, flags);
				420	}
				421
				422	/*
				423	* remove everything in the cache
				424	*/
Eric Sandeen	48a3b63	2013-04-25 20:41:01 +0000	[diff] [blame]	425	static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	426	{
				427	struct btrfs_stripe_hash_table *table;
				428	unsigned long flags;
				429	struct btrfs_raid_bio *rbio;
				430
				431	table = info->stripe_hash_table;
				432
				433	spin_lock_irqsave(&table->cache_lock, flags);
				434	while (!list_empty(&table->stripe_cache)) {
				435	rbio = list_entry(table->stripe_cache.next,
				436	struct btrfs_raid_bio,
				437	stripe_cache);
				438	__remove_rbio_from_cache(rbio);
				439	}
				440	spin_unlock_irqrestore(&table->cache_lock, flags);
				441	}
				442
				443	/*
				444	* remove all cached entries and free the hash table
				445	* used by unmount
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	446	*/
				447	void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
				448	{
				449	if (!info->stripe_hash_table)
				450	return;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	451	btrfs_clear_rbio_cache(info);
Wang Shilong	f749303	2014-11-22 21:13:10 +0800	[diff] [blame]	452	kvfree(info->stripe_hash_table);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	453	info->stripe_hash_table = NULL;
				454	}
				455
				456	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	457	* insert an rbio into the stripe cache. It
				458	* must have already been prepared by calling
				459	* cache_rbio_pages
				460	*
				461	* If this rbio was already cached, it gets
				462	* moved to the front of the lru.
				463	*
				464	* If the size of the rbio cache is too big, we
				465	* prune an item.
				466	*/
				467	static void cache_rbio(struct btrfs_raid_bio *rbio)
				468	{
				469	struct btrfs_stripe_hash_table *table;
				470	unsigned long flags;
				471
				472	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
				473	return;
				474
				475	table = rbio->fs_info->stripe_hash_table;
				476
				477	spin_lock_irqsave(&table->cache_lock, flags);
				478	spin_lock(&rbio->bio_list_lock);
				479
				480	/* bump our ref if we were not in the list before */
				481	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
				482	atomic_inc(&rbio->refs);
				483
				484	if (!list_empty(&rbio->stripe_cache)){
				485	list_move(&rbio->stripe_cache, &table->stripe_cache);
				486	} else {
				487	list_add(&rbio->stripe_cache, &table->stripe_cache);
				488	table->cache_size += 1;
				489	}
				490
				491	spin_unlock(&rbio->bio_list_lock);
				492
				493	if (table->cache_size > RBIO_CACHE_SIZE) {
				494	struct btrfs_raid_bio *found;
				495
				496	found = list_entry(table->stripe_cache.prev,
				497	struct btrfs_raid_bio,
				498	stripe_cache);
				499
				500	if (found != rbio)
				501	__remove_rbio_from_cache(found);
				502	}
				503
				504	spin_unlock_irqrestore(&table->cache_lock, flags);
				505	return;
				506	}
				507
				508	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	509	* helper function to run the xor_blocks api. It is only
				510	* able to do MAX_XOR_BLOCKS at a time, so we need to
				511	* loop through.
				512	*/
				513	static void run_xor(void **pages, int src_cnt, ssize_t len)
				514	{
				515	int src_off = 0;
				516	int xor_src_cnt = 0;
				517	void *dest = pages[src_cnt];
				518
				519	while(src_cnt > 0) {
				520	xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
				521	xor_blocks(xor_src_cnt, len, dest, pages + src_off);
				522
				523	src_cnt -= xor_src_cnt;
				524	src_off += xor_src_cnt;
				525	}
				526	}
				527
				528	/*
				529	* returns true if the bio list inside this rbio
				530	* covers an entire stripe (no rmw required).
				531	* Must be called with the bio list lock held, or
				532	* at a time when you know it is impossible to add
				533	* new bios into the list
				534	*/
				535	static int __rbio_is_full(struct btrfs_raid_bio *rbio)
				536	{
				537	unsigned long size = rbio->bio_list_bytes;
				538	int ret = 1;
				539
				540	if (size != rbio->nr_data * rbio->stripe_len)
				541	ret = 0;
				542
				543	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
				544	return ret;
				545	}
				546
				547	static int rbio_is_full(struct btrfs_raid_bio *rbio)
				548	{
				549	unsigned long flags;
				550	int ret;
				551
				552	spin_lock_irqsave(&rbio->bio_list_lock, flags);
				553	ret = __rbio_is_full(rbio);
				554	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
				555	return ret;
				556	}
				557
				558	/*
				559	* returns 1 if it is safe to merge two rbios together.
				560	* The merging is safe if the two rbios correspond to
				561	* the same stripe and if they are both going in the same
				562	* direction (read vs write), and if neither one is
				563	* locked for final IO
				564	*
				565	* The caller is responsible for locking such that
				566	* rmw_locked is safe to test
				567	*/
				568	static int rbio_can_merge(struct btrfs_raid_bio *last,
				569	struct btrfs_raid_bio *cur)
				570	{
				571	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) \|\|
				572	test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
				573	return 0;
				574
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	575	/*
				576	* we can't merge with cached rbios, since the
				577	* idea is that when we merge the destination
				578	* rbio is going to run our IO for us. We can
				579	* steal from cached rbio's though, other functions
				580	* handle that.
				581	*/
				582	if (test_bit(RBIO_CACHE_BIT, &last->flags) \|\|
				583	test_bit(RBIO_CACHE_BIT, &cur->flags))
				584	return 0;
				585
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	586	if (last->bbio->raid_map[0] !=
				587	cur->bbio->raid_map[0])
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	588	return 0;
				589
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	590	/* we can't merge with different operations */
				591	if (last->operation != cur->operation)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	592	return 0;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	593	/*
				594	* We've need read the full stripe from the drive.
				595	* check and repair the parity and write the new results.
				596	*
				597	* We're not allowed to add any new bios to the
				598	* bio list here, anyone else that wants to
				599	* change this stripe needs to do their own rmw.
				600	*/
				601	if (last->operation == BTRFS_RBIO_PARITY_SCRUB \|\|
				602	cur->operation == BTRFS_RBIO_PARITY_SCRUB)
				603	return 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	604
				605	return 1;
				606	}
				607
				608	/*
				609	* helper to index into the pstripe
				610	*/
				611	static struct page rbio_pstripe_page(struct btrfs_raid_bio rbio, int index)
				612	{
				613	index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
				614	return rbio->stripe_pages[index];
				615	}
				616
				617	/*
				618	* helper to index into the qstripe, returns null
				619	* if there is no qstripe
				620	*/
				621	static struct page rbio_qstripe_page(struct btrfs_raid_bio rbio, int index)
				622	{
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	623	if (rbio->nr_data + 1 == rbio->real_stripes)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	624	return NULL;
				625
				626	index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
				627	PAGE_CACHE_SHIFT;
				628	return rbio->stripe_pages[index];
				629	}
				630
				631	/*
				632	* The first stripe in the table for a logical address
				633	* has the lock. rbios are added in one of three ways:
				634	*
				635	* 1) Nobody has the stripe locked yet. The rbio is given
				636	* the lock and 0 is returned. The caller must start the IO
				637	* themselves.
				638	*
				639	* 2) Someone has the stripe locked, but we're able to merge
				640	* with the lock owner. The rbio is freed and the IO will
				641	* start automatically along with the existing rbio. 1 is returned.
				642	*
				643	* 3) Someone has the stripe locked, but we're not able to merge.
				644	* The rbio is added to the lock owner's plug list, or merged into
				645	* an rbio already on the plug list. When the lock owner unlocks,
				646	* the next rbio on the list is run and the IO is started automatically.
				647	* 1 is returned
				648	*
				649	* If we return 0, the caller still owns the rbio and must continue with
				650	* IO submission. If we return 1, the caller must assume the rbio has
				651	* already been freed.
				652	*/
				653	static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
				654	{
				655	int bucket = rbio_bucket(rbio);
				656	struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
				657	struct btrfs_raid_bio *cur;
				658	struct btrfs_raid_bio *pending;
				659	unsigned long flags;
				660	DEFINE_WAIT(wait);
				661	struct btrfs_raid_bio *freeit = NULL;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	662	struct btrfs_raid_bio *cache_drop = NULL;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	663	int ret = 0;
				664	int walk = 0;
				665
				666	spin_lock_irqsave(&h->lock, flags);
				667	list_for_each_entry(cur, &h->hash_list, hash_list) {
				668	walk++;
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	669	if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	670	spin_lock(&cur->bio_list_lock);
				671
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	672	/* can we steal this cached rbio's pages? */
				673	if (bio_list_empty(&cur->bio_list) &&
				674	list_empty(&cur->plug_list) &&
				675	test_bit(RBIO_CACHE_BIT, &cur->flags) &&
				676	!test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
				677	list_del_init(&cur->hash_list);
				678	atomic_dec(&cur->refs);
				679
				680	steal_rbio(cur, rbio);
				681	cache_drop = cur;
				682	spin_unlock(&cur->bio_list_lock);
				683
				684	goto lockit;
				685	}
				686
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	687	/* can we merge into the lock owner? */
				688	if (rbio_can_merge(cur, rbio)) {
				689	merge_rbio(cur, rbio);
				690	spin_unlock(&cur->bio_list_lock);
				691	freeit = rbio;
				692	ret = 1;
				693	goto out;
				694	}
				695
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	696
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	697	/*
				698	* we couldn't merge with the running
				699	* rbio, see if we can merge with the
				700	* pending ones. We don't have to
				701	* check for rmw_locked because there
				702	* is no way they are inside finish_rmw
				703	* right now
				704	*/
				705	list_for_each_entry(pending, &cur->plug_list,
				706	plug_list) {
				707	if (rbio_can_merge(pending, rbio)) {
				708	merge_rbio(pending, rbio);
				709	spin_unlock(&cur->bio_list_lock);
				710	freeit = rbio;
				711	ret = 1;
				712	goto out;
				713	}
				714	}
				715
				716	/* no merging, put us on the tail of the plug list,
				717	* our rbio will be started with the currently
				718	* running rbio unlocks
				719	*/
				720	list_add_tail(&rbio->plug_list, &cur->plug_list);
				721	spin_unlock(&cur->bio_list_lock);
				722	ret = 1;
				723	goto out;
				724	}
				725	}
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	726	lockit:
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	727	atomic_inc(&rbio->refs);
				728	list_add(&rbio->hash_list, &h->hash_list);
				729	out:
				730	spin_unlock_irqrestore(&h->lock, flags);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	731	if (cache_drop)
				732	remove_rbio_from_cache(cache_drop);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	733	if (freeit)
				734	__free_raid_bio(freeit);
				735	return ret;
				736	}
				737
				738	/*
				739	* called as rmw or parity rebuild is completed. If the plug list has more
				740	* rbios waiting for this stripe, the next one on the list will be started
				741	*/
				742	static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
				743	{
				744	int bucket;
				745	struct btrfs_stripe_hash *h;
				746	unsigned long flags;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	747	int keep_cache = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	748
				749	bucket = rbio_bucket(rbio);
				750	h = rbio->fs_info->stripe_hash_table->table + bucket;
				751
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	752	if (list_empty(&rbio->plug_list))
				753	cache_rbio(rbio);
				754
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	755	spin_lock_irqsave(&h->lock, flags);
				756	spin_lock(&rbio->bio_list_lock);
				757
				758	if (!list_empty(&rbio->hash_list)) {
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	759	/*
				760	* if we're still cached and there is no other IO
				761	* to perform, just leave this rbio here for others
				762	* to steal from later
				763	*/
				764	if (list_empty(&rbio->plug_list) &&
				765	test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
				766	keep_cache = 1;
				767	clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				768	BUG_ON(!bio_list_empty(&rbio->bio_list));
				769	goto done;
				770	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	771
				772	list_del_init(&rbio->hash_list);
				773	atomic_dec(&rbio->refs);
				774
				775	/*
				776	* we use the plug list to hold all the rbios
				777	* waiting for the chance to lock this stripe.
				778	* hand the lock over to one of them.
				779	*/
				780	if (!list_empty(&rbio->plug_list)) {
				781	struct btrfs_raid_bio *next;
				782	struct list_head *head = rbio->plug_list.next;
				783
				784	next = list_entry(head, struct btrfs_raid_bio,
				785	plug_list);
				786
				787	list_del_init(&rbio->plug_list);
				788
				789	list_add(&next->hash_list, &h->hash_list);
				790	atomic_inc(&next->refs);
				791	spin_unlock(&rbio->bio_list_lock);
				792	spin_unlock_irqrestore(&h->lock, flags);
				793
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	794	if (next->operation == BTRFS_RBIO_READ_REBUILD)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	795	async_read_rebuild(next);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	796	else if (next->operation == BTRFS_RBIO_WRITE) {
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	797	steal_rbio(rbio, next);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	798	async_rmw_stripe(next);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	799	} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				800	steal_rbio(rbio, next);
				801	async_scrub_parity(next);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	802	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	803
				804	goto done_nolock;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	805	} else if (waitqueue_active(&h->wait)) {
				806	spin_unlock(&rbio->bio_list_lock);
				807	spin_unlock_irqrestore(&h->lock, flags);
				808	wake_up(&h->wait);
				809	goto done_nolock;
				810	}
				811	}
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	812	done:
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	813	spin_unlock(&rbio->bio_list_lock);
				814	spin_unlock_irqrestore(&h->lock, flags);
				815
				816	done_nolock:
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	817	if (!keep_cache)
				818	remove_rbio_from_cache(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	819	}
				820
				821	static void __free_raid_bio(struct btrfs_raid_bio *rbio)
				822	{
				823	int i;
				824
				825	WARN_ON(atomic_read(&rbio->refs) < 0);
				826	if (!atomic_dec_and_test(&rbio->refs))
				827	return;
				828
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	829	WARN_ON(!list_empty(&rbio->stripe_cache));
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	830	WARN_ON(!list_empty(&rbio->hash_list));
				831	WARN_ON(!bio_list_empty(&rbio->bio_list));
				832
				833	for (i = 0; i < rbio->nr_pages; i++) {
				834	if (rbio->stripe_pages[i]) {
				835	__free_page(rbio->stripe_pages[i]);
				836	rbio->stripe_pages[i] = NULL;
				837	}
				838	}
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	839
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	840	btrfs_put_bbio(rbio->bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	841	kfree(rbio);
				842	}
				843
				844	static void free_raid_bio(struct btrfs_raid_bio *rbio)
				845	{
				846	unlock_stripe(rbio);
				847	__free_raid_bio(rbio);
				848	}
				849
				850	/*
				851	* this frees the rbio and runs through all the bios in the
				852	* bio_list and calls end_io on them
				853	*/
				854	static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
				855	{
				856	struct bio *cur = bio_list_get(&rbio->bio_list);
				857	struct bio *next;
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	858
				859	if (rbio->generic_bio_cnt)
				860	btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
				861
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	862	free_raid_bio(rbio);
				863
				864	while (cur) {
				865	next = cur->bi_next;
				866	cur->bi_next = NULL;
				867	if (uptodate)
				868	set_bit(BIO_UPTODATE, &cur->bi_flags);
				869	bio_endio(cur, err);
				870	cur = next;
				871	}
				872	}
				873
				874	/*
				875	* end io function used by finish_rmw. When we finally
				876	* get here, we've written a full stripe
				877	*/
				878	static void raid_write_end_io(struct bio *bio, int err)
				879	{
				880	struct btrfs_raid_bio *rbio = bio->bi_private;
				881
				882	if (err)
				883	fail_bio_stripe(rbio, bio);
				884
				885	bio_put(bio);
				886
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	887	if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	888	return;
				889
				890	err = 0;
				891
				892	/* OK, we have read all the stripes we need to. */
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	893	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	894	err = -EIO;
				895
				896	rbio_orig_end_io(rbio, err, 0);
				897	return;
				898	}
				899
				900	/*
				901	* the read/modify/write code wants to use the original bio for
				902	* any pages it included, and then use the rbio for everything
				903	* else. This function decides if a given index (stripe number)
				904	* and page number in that stripe fall inside the original bio
				905	* or the rbio.
				906	*
				907	* if you set bio_list_only, you'll get a NULL back for any ranges
				908	* that are outside the bio_list
				909	*
				910	* This doesn't take any refs on anything, you get a bare page pointer
				911	* and the caller must bump refs as required.
				912	*
				913	* You must call index_rbio_pages once before you can trust
				914	* the answers from this function.
				915	*/
				916	static struct page page_in_rbio(struct btrfs_raid_bio rbio,
				917	int index, int pagenr, int bio_list_only)
				918	{
				919	int chunk_page;
				920	struct page *p = NULL;
				921
				922	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
				923
				924	spin_lock_irq(&rbio->bio_list_lock);
				925	p = rbio->bio_pages[chunk_page];
				926	spin_unlock_irq(&rbio->bio_list_lock);
				927
				928	if (p \|\| bio_list_only)
				929	return p;
				930
				931	return rbio->stripe_pages[chunk_page];
				932	}
				933
				934	/*
				935	* number of pages we need for the entire stripe across all the
				936	* drives
				937	*/
				938	static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
				939	{
				940	unsigned long nr = stripe_len * nr_stripes;
David Sterba	ed6078f	2014-06-05 01:59:57 +0200	[diff] [blame]	941	return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	942	}
				943
				944	/*
				945	* allocation and initial setup for the btrfs_raid_bio. Not
				946	* this does not allocate any pages for rbio->pages.
				947	*/
				948	static struct btrfs_raid_bio alloc_rbio(struct btrfs_root root,
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	949	struct btrfs_bio *bbio, u64 stripe_len)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	950	{
				951	struct btrfs_raid_bio *rbio;
				952	int nr_data = 0;
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	953	int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
				954	int num_pages = rbio_nr_pages(stripe_len, real_stripes);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	955	int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	956	void *p;
				957
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	958	rbio = kzalloc(sizeof(rbio) + num_pages sizeof(struct page ) 2 +
				959	DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8),
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	960	GFP_NOFS);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	961	if (!rbio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	962	return ERR_PTR(-ENOMEM);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	963
				964	bio_list_init(&rbio->bio_list);
				965	INIT_LIST_HEAD(&rbio->plug_list);
				966	spin_lock_init(&rbio->bio_list_lock);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	967	INIT_LIST_HEAD(&rbio->stripe_cache);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	968	INIT_LIST_HEAD(&rbio->hash_list);
				969	rbio->bbio = bbio;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	970	rbio->fs_info = root->fs_info;
				971	rbio->stripe_len = stripe_len;
				972	rbio->nr_pages = num_pages;
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	973	rbio->real_stripes = real_stripes;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	974	rbio->stripe_npages = stripe_npages;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	975	rbio->faila = -1;
				976	rbio->failb = -1;
				977	atomic_set(&rbio->refs, 1);
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	978	atomic_set(&rbio->error, 0);
				979	atomic_set(&rbio->stripes_pending, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	980
				981	/*
				982	* the stripe_pages and bio_pages array point to the extra
				983	* memory we allocated past the end of the rbio
				984	*/
				985	p = rbio + 1;
				986	rbio->stripe_pages = p;
				987	rbio->bio_pages = p + sizeof(struct page ) num_pages;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	988	rbio->dbitmap = p + sizeof(struct page ) num_pages * 2;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	989
Zhao Lei	10f1190	2015-01-20 15:11:43 +0800	[diff] [blame]	990	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
				991	nr_data = real_stripes - 1;
				992	else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	993	nr_data = real_stripes - 2;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	994	else
Zhao Lei	10f1190	2015-01-20 15:11:43 +0800	[diff] [blame]	995	BUG();
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	996
				997	rbio->nr_data = nr_data;
				998	return rbio;
				999	}
				1000
				1001	/* allocate pages for all the stripes in the bio, including parity */
				1002	static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
				1003	{
				1004	int i;
				1005	struct page *page;
				1006
				1007	for (i = 0; i < rbio->nr_pages; i++) {
				1008	if (rbio->stripe_pages[i])
				1009	continue;
				1010	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				1011	if (!page)
				1012	return -ENOMEM;
				1013	rbio->stripe_pages[i] = page;
				1014	ClearPageUptodate(page);
				1015	}
				1016	return 0;
				1017	}
				1018
				1019	/* allocate pages for just the p/q stripes */
				1020	static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
				1021	{
				1022	int i;
				1023	struct page *page;
				1024
				1025	i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
				1026
				1027	for (; i < rbio->nr_pages; i++) {
				1028	if (rbio->stripe_pages[i])
				1029	continue;
				1030	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				1031	if (!page)
				1032	return -ENOMEM;
				1033	rbio->stripe_pages[i] = page;
				1034	}
				1035	return 0;
				1036	}
				1037
				1038	/*
				1039	* add a single page from a specific stripe into our list of bios for IO
				1040	* this will try to merge into existing bios if possible, and returns
				1041	* zero if all went well.
				1042	*/
Eric Sandeen	48a3b63	2013-04-25 20:41:01 +0000	[diff] [blame]	1043	static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
				1044	struct bio_list *bio_list,
				1045	struct page *page,
				1046	int stripe_nr,
				1047	unsigned long page_index,
				1048	unsigned long bio_max_len)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1049	{
				1050	struct bio *last = bio_list->tail;
				1051	u64 last_end = 0;
				1052	int ret;
				1053	struct bio *bio;
				1054	struct btrfs_bio_stripe *stripe;
				1055	u64 disk_start;
				1056
				1057	stripe = &rbio->bbio->stripes[stripe_nr];
				1058	disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
				1059
				1060	/* if the device is missing, just fail this stripe */
				1061	if (!stripe->dev->bdev)
				1062	return fail_rbio_index(rbio, stripe_nr);
				1063
				1064	/* see if we can add this page onto our existing bio */
				1065	if (last) {
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1066	last_end = (u64)last->bi_iter.bi_sector << 9;
				1067	last_end += last->bi_iter.bi_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1068
				1069	/*
				1070	* we can't merge these if they are from different
				1071	* devices or if they are not contiguous
				1072	*/
				1073	if (last_end == disk_start && stripe->dev->bdev &&
				1074	test_bit(BIO_UPTODATE, &last->bi_flags) &&
				1075	last->bi_bdev == stripe->dev->bdev) {
				1076	ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
				1077	if (ret == PAGE_CACHE_SIZE)
				1078	return 0;
				1079	}
				1080	}
				1081
				1082	/* put a new bio on the list */
Chris Mason	9be3395	2013-05-17 18:30:14 -0400	[diff] [blame]	1083	bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1084	if (!bio)
				1085	return -ENOMEM;
				1086
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1087	bio->bi_iter.bi_size = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1088	bio->bi_bdev = stripe->dev->bdev;
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1089	bio->bi_iter.bi_sector = disk_start >> 9;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1090	set_bit(BIO_UPTODATE, &bio->bi_flags);
				1091
				1092	bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
				1093	bio_list_add(bio_list, bio);
				1094	return 0;
				1095	}
				1096
				1097	/*
				1098	* while we're doing the read/modify/write cycle, we could
				1099	* have errors in reading pages off the disk. This checks
				1100	* for errors and if we're not able to read the page it'll
				1101	* trigger parity reconstruction. The rmw will be finished
				1102	* after we've reconstructed the failed stripes
				1103	*/
				1104	static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
				1105	{
				1106	if (rbio->faila >= 0 \|\| rbio->failb >= 0) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1107	BUG_ON(rbio->faila == rbio->real_stripes - 1);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1108	__raid56_parity_recover(rbio);
				1109	} else {
				1110	finish_rmw(rbio);
				1111	}
				1112	}
				1113
				1114	/*
				1115	* these are just the pages from the rbio array, not from anything
				1116	* the FS sent down to us
				1117	*/
				1118	static struct page rbio_stripe_page(struct btrfs_raid_bio rbio, int stripe, int page)
				1119	{
				1120	int index;
				1121	index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
				1122	index += page;
				1123	return rbio->stripe_pages[index];
				1124	}
				1125
				1126	/*
				1127	* helper function to walk our bio list and populate the bio_pages array with
				1128	* the result. This seems expensive, but it is faster than constantly
				1129	* searching through the bio list as we setup the IO in finish_rmw or stripe
				1130	* reconstruction.
				1131	*
				1132	* This must be called before you trust the answers from page_in_rbio
				1133	*/
				1134	static void index_rbio_pages(struct btrfs_raid_bio *rbio)
				1135	{
				1136	struct bio *bio;
				1137	u64 start;
				1138	unsigned long stripe_offset;
				1139	unsigned long page_index;
				1140	struct page *p;
				1141	int i;
				1142
				1143	spin_lock_irq(&rbio->bio_list_lock);
				1144	bio_list_for_each(bio, &rbio->bio_list) {
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1145	start = (u64)bio->bi_iter.bi_sector << 9;
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1146	stripe_offset = start - rbio->bbio->raid_map[0];
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1147	page_index = stripe_offset >> PAGE_CACHE_SHIFT;
				1148
				1149	for (i = 0; i < bio->bi_vcnt; i++) {
				1150	p = bio->bi_io_vec[i].bv_page;
				1151	rbio->bio_pages[page_index + i] = p;
				1152	}
				1153	}
				1154	spin_unlock_irq(&rbio->bio_list_lock);
				1155	}
				1156
				1157	/*
				1158	* this is called from one of two situations. We either
				1159	* have a full stripe from the higher layers, or we've read all
				1160	* the missing bits off disk.
				1161	*
				1162	* This will calculate the parity and then send down any
				1163	* changed blocks.
				1164	*/
				1165	static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
				1166	{
				1167	struct btrfs_bio *bbio = rbio->bbio;
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1168	void *pointers[rbio->real_stripes];
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1169	int stripe_len = rbio->stripe_len;
				1170	int nr_data = rbio->nr_data;
				1171	int stripe;
				1172	int pagenr;
				1173	int p_stripe = -1;
				1174	int q_stripe = -1;
				1175	struct bio_list bio_list;
				1176	struct bio *bio;
				1177	int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
				1178	int ret;
				1179
				1180	bio_list_init(&bio_list);
				1181
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1182	if (rbio->real_stripes - rbio->nr_data == 1) {
				1183	p_stripe = rbio->real_stripes - 1;
				1184	} else if (rbio->real_stripes - rbio->nr_data == 2) {
				1185	p_stripe = rbio->real_stripes - 2;
				1186	q_stripe = rbio->real_stripes - 1;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1187	} else {
				1188	BUG();
				1189	}
				1190
				1191	/* at this point we either have a full stripe,
				1192	* or we've read the full stripe from the drive.
				1193	* recalculate the parity and write the new results.
				1194	*
				1195	* We're not allowed to add any new bios to the
				1196	* bio list here, anyone else that wants to
				1197	* change this stripe needs to do their own rmw.
				1198	*/
				1199	spin_lock_irq(&rbio->bio_list_lock);
				1200	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				1201	spin_unlock_irq(&rbio->bio_list_lock);
				1202
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1203	atomic_set(&rbio->error, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1204
				1205	/*
				1206	* now that we've set rmw_locked, run through the
				1207	* bio list one last time and map the page pointers
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1208	*
				1209	* We don't cache full rbios because we're assuming
				1210	* the higher layers are unlikely to use this area of
				1211	* the disk again soon. If they do use it again,
				1212	* hopefully they will send another full bio.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1213	*/
				1214	index_rbio_pages(rbio);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1215	if (!rbio_is_full(rbio))
				1216	cache_rbio_pages(rbio);
				1217	else
				1218	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1219
				1220	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
				1221	struct page *p;
				1222	/* first collect one page from each data stripe */
				1223	for (stripe = 0; stripe < nr_data; stripe++) {
				1224	p = page_in_rbio(rbio, stripe, pagenr, 0);
				1225	pointers[stripe] = kmap(p);
				1226	}
				1227
				1228	/* then add the parity stripe */
				1229	p = rbio_pstripe_page(rbio, pagenr);
				1230	SetPageUptodate(p);
				1231	pointers[stripe++] = kmap(p);
				1232
				1233	if (q_stripe != -1) {
				1234
				1235	/*
				1236	* raid6, add the qstripe and call the
				1237	* library function to fill in our p/q
				1238	*/
				1239	p = rbio_qstripe_page(rbio, pagenr);
				1240	SetPageUptodate(p);
				1241	pointers[stripe++] = kmap(p);
				1242
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1243	raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1244	pointers);
				1245	} else {
				1246	/* raid5 */
				1247	memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
				1248	run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
				1249	}
				1250
				1251
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1252	for (stripe = 0; stripe < rbio->real_stripes; stripe++)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1253	kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
				1254	}
				1255
				1256	/*
				1257	* time to start writing. Make bios for everything from the
				1258	* higher layers (the bio_list in our rbio) and our p/q. Ignore
				1259	* everything else.
				1260	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1261	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1262	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
				1263	struct page *page;
				1264	if (stripe < rbio->nr_data) {
				1265	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1266	if (!page)
				1267	continue;
				1268	} else {
				1269	page = rbio_stripe_page(rbio, stripe, pagenr);
				1270	}
				1271
				1272	ret = rbio_add_io_page(rbio, &bio_list,
				1273	page, stripe, pagenr, rbio->stripe_len);
				1274	if (ret)
				1275	goto cleanup;
				1276	}
				1277	}
				1278
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1279	if (likely(!bbio->num_tgtdevs))
				1280	goto write_data;
				1281
				1282	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
				1283	if (!bbio->tgtdev_map[stripe])
				1284	continue;
				1285
				1286	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
				1287	struct page *page;
				1288	if (stripe < rbio->nr_data) {
				1289	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1290	if (!page)
				1291	continue;
				1292	} else {
				1293	page = rbio_stripe_page(rbio, stripe, pagenr);
				1294	}
				1295
				1296	ret = rbio_add_io_page(rbio, &bio_list, page,
				1297	rbio->bbio->tgtdev_map[stripe],
				1298	pagenr, rbio->stripe_len);
				1299	if (ret)
				1300	goto cleanup;
				1301	}
				1302	}
				1303
				1304	write_data:
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1305	atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
				1306	BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1307
				1308	while (1) {
				1309	bio = bio_list_pop(&bio_list);
				1310	if (!bio)
				1311	break;
				1312
				1313	bio->bi_private = rbio;
				1314	bio->bi_end_io = raid_write_end_io;
				1315	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
				1316	submit_bio(WRITE, bio);
				1317	}
				1318	return;
				1319
				1320	cleanup:
				1321	rbio_orig_end_io(rbio, -EIO, 0);
				1322	}
				1323
				1324	/*
				1325	* helper to find the stripe number for a given bio. Used to figure out which
				1326	* stripe has failed. This expects the bio to correspond to a physical disk,
				1327	* so it looks up based on physical sector numbers.
				1328	*/
				1329	static int find_bio_stripe(struct btrfs_raid_bio *rbio,
				1330	struct bio *bio)
				1331	{
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1332	u64 physical = bio->bi_iter.bi_sector;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1333	u64 stripe_start;
				1334	int i;
				1335	struct btrfs_bio_stripe *stripe;
				1336
				1337	physical <<= 9;
				1338
				1339	for (i = 0; i < rbio->bbio->num_stripes; i++) {
				1340	stripe = &rbio->bbio->stripes[i];
				1341	stripe_start = stripe->physical;
				1342	if (physical >= stripe_start &&
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1343	physical < stripe_start + rbio->stripe_len &&
				1344	bio->bi_bdev == stripe->dev->bdev) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1345	return i;
				1346	}
				1347	}
				1348	return -1;
				1349	}
				1350
				1351	/*
				1352	* helper to find the stripe number for a given
				1353	* bio (before mapping). Used to figure out which stripe has
				1354	* failed. This looks up based on logical block numbers.
				1355	*/
				1356	static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
				1357	struct bio *bio)
				1358	{
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1359	u64 logical = bio->bi_iter.bi_sector;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1360	u64 stripe_start;
				1361	int i;
				1362
				1363	logical <<= 9;
				1364
				1365	for (i = 0; i < rbio->nr_data; i++) {
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1366	stripe_start = rbio->bbio->raid_map[i];
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1367	if (logical >= stripe_start &&
				1368	logical < stripe_start + rbio->stripe_len) {
				1369	return i;
				1370	}
				1371	}
				1372	return -1;
				1373	}
				1374
				1375	/*
				1376	* returns -EIO if we had too many failures
				1377	*/
				1378	static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
				1379	{
				1380	unsigned long flags;
				1381	int ret = 0;
				1382
				1383	spin_lock_irqsave(&rbio->bio_list_lock, flags);
				1384
				1385	/* we already know this stripe is bad, move on */
				1386	if (rbio->faila == failed \|\| rbio->failb == failed)
				1387	goto out;
				1388
				1389	if (rbio->faila == -1) {
				1390	/* first failure on this rbio */
				1391	rbio->faila = failed;
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1392	atomic_inc(&rbio->error);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1393	} else if (rbio->failb == -1) {
				1394	/* second failure on this rbio */
				1395	rbio->failb = failed;
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1396	atomic_inc(&rbio->error);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1397	} else {
				1398	ret = -EIO;
				1399	}
				1400	out:
				1401	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
				1402
				1403	return ret;
				1404	}
				1405
				1406	/*
				1407	* helper to fail a stripe based on a physical disk
				1408	* bio.
				1409	*/
				1410	static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
				1411	struct bio *bio)
				1412	{
				1413	int failed = find_bio_stripe(rbio, bio);
				1414
				1415	if (failed < 0)
				1416	return -EIO;
				1417
				1418	return fail_rbio_index(rbio, failed);
				1419	}
				1420
				1421	/*
				1422	* this sets each page in the bio uptodate. It should only be used on private
				1423	* rbio pages, nothing that comes in from the higher layers
				1424	*/
				1425	static void set_bio_pages_uptodate(struct bio *bio)
				1426	{
				1427	int i;
				1428	struct page *p;
				1429
				1430	for (i = 0; i < bio->bi_vcnt; i++) {
				1431	p = bio->bi_io_vec[i].bv_page;
				1432	SetPageUptodate(p);
				1433	}
				1434	}
				1435
				1436	/*
				1437	* end io for the read phase of the rmw cycle. All the bios here are physical
				1438	* stripe bios we've read from the disk so we can recalculate the parity of the
				1439	* stripe.
				1440	*
				1441	* This will usually kick off finish_rmw once all the bios are read in, but it
				1442	* may trigger parity reconstruction if we had any errors along the way
				1443	*/
				1444	static void raid_rmw_end_io(struct bio *bio, int err)
				1445	{
				1446	struct btrfs_raid_bio *rbio = bio->bi_private;
				1447
				1448	if (err)
				1449	fail_bio_stripe(rbio, bio);
				1450	else
				1451	set_bio_pages_uptodate(bio);
				1452
				1453	bio_put(bio);
				1454
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1455	if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1456	return;
				1457
				1458	err = 0;
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1459	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1460	goto cleanup;
				1461
				1462	/*
				1463	* this will normally call finish_rmw to start our write
				1464	* but if there are any failed stripes we'll reconstruct
				1465	* from parity first
				1466	*/
				1467	validate_rbio_for_rmw(rbio);
				1468	return;
				1469
				1470	cleanup:
				1471
				1472	rbio_orig_end_io(rbio, -EIO, 0);
				1473	}
				1474
				1475	static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
				1476	{
Liu Bo	9e0af23	2014-08-15 23:36:53 +0800	[diff] [blame]	1477	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
				1478	rmw_work, NULL, NULL);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1479
Qu Wenruo	d05a33a	2014-02-28 10:46:11 +0800	[diff] [blame]	1480	btrfs_queue_work(rbio->fs_info->rmw_workers,
				1481	&rbio->work);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1482	}
				1483
				1484	static void async_read_rebuild(struct btrfs_raid_bio *rbio)
				1485	{
Liu Bo	9e0af23	2014-08-15 23:36:53 +0800	[diff] [blame]	1486	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
				1487	read_rebuild_work, NULL, NULL);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1488
Qu Wenruo	d05a33a	2014-02-28 10:46:11 +0800	[diff] [blame]	1489	btrfs_queue_work(rbio->fs_info->rmw_workers,
				1490	&rbio->work);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1491	}
				1492
				1493	/*
				1494	* the stripe must be locked by the caller. It will
				1495	* unlock after all the writes are done
				1496	*/
				1497	static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
				1498	{
				1499	int bios_to_read = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1500	struct bio_list bio_list;
				1501	int ret;
David Sterba	ed6078f	2014-06-05 01:59:57 +0200	[diff] [blame]	1502	int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1503	int pagenr;
				1504	int stripe;
				1505	struct bio *bio;
				1506
				1507	bio_list_init(&bio_list);
				1508
				1509	ret = alloc_rbio_pages(rbio);
				1510	if (ret)
				1511	goto cleanup;
				1512
				1513	index_rbio_pages(rbio);
				1514
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1515	atomic_set(&rbio->error, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1516	/*
				1517	* build a list of bios to read all the missing parts of this
				1518	* stripe
				1519	*/
				1520	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
				1521	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
				1522	struct page *page;
				1523	/*
				1524	* we want to find all the pages missing from
				1525	* the rbio and read them from the disk. If
				1526	* page_in_rbio finds a page in the bio list
				1527	* we don't need to read it off the stripe.
				1528	*/
				1529	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1530	if (page)
				1531	continue;
				1532
				1533	page = rbio_stripe_page(rbio, stripe, pagenr);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1534	/*
				1535	* the bio cache may have handed us an uptodate
				1536	* page. If so, be happy and use it
				1537	*/
				1538	if (PageUptodate(page))
				1539	continue;
				1540
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1541	ret = rbio_add_io_page(rbio, &bio_list, page,
				1542	stripe, pagenr, rbio->stripe_len);
				1543	if (ret)
				1544	goto cleanup;
				1545	}
				1546	}
				1547
				1548	bios_to_read = bio_list_size(&bio_list);
				1549	if (!bios_to_read) {
				1550	/*
				1551	* this can happen if others have merged with
				1552	* us, it means there is nothing left to read.
				1553	* But if there are missing devices it may not be
				1554	* safe to do the full stripe write yet.
				1555	*/
				1556	goto finish;
				1557	}
				1558
				1559	/*
				1560	* the bbio may be freed once we submit the last bio. Make sure
				1561	* not to touch it after that
				1562	*/
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1563	atomic_set(&rbio->stripes_pending, bios_to_read);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1564	while (1) {
				1565	bio = bio_list_pop(&bio_list);
				1566	if (!bio)
				1567	break;
				1568
				1569	bio->bi_private = rbio;
				1570	bio->bi_end_io = raid_rmw_end_io;
				1571
				1572	btrfs_bio_wq_end_io(rbio->fs_info, bio,
				1573	BTRFS_WQ_ENDIO_RAID56);
				1574
				1575	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
				1576	submit_bio(READ, bio);
				1577	}
				1578	/* the actual write will happen once the reads are done */
				1579	return 0;
				1580
				1581	cleanup:
				1582	rbio_orig_end_io(rbio, -EIO, 0);
				1583	return -EIO;
				1584
				1585	finish:
				1586	validate_rbio_for_rmw(rbio);
				1587	return 0;
				1588	}
				1589
				1590	/*
				1591	* if the upper layers pass in a full stripe, we thank them by only allocating
				1592	* enough pages to hold the parity, and sending it all down quickly.
				1593	*/
				1594	static int full_stripe_write(struct btrfs_raid_bio *rbio)
				1595	{
				1596	int ret;
				1597
				1598	ret = alloc_rbio_parity_pages(rbio);
Miao Xie	3cd846d	2013-07-22 16:36:57 +0800	[diff] [blame]	1599	if (ret) {
				1600	__free_raid_bio(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1601	return ret;
Miao Xie	3cd846d	2013-07-22 16:36:57 +0800	[diff] [blame]	1602	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1603
				1604	ret = lock_stripe_add(rbio);
				1605	if (ret == 0)
				1606	finish_rmw(rbio);
				1607	return 0;
				1608	}
				1609
				1610	/*
				1611	* partial stripe writes get handed over to async helpers.
				1612	* We're really hoping to merge a few more writes into this
				1613	* rbio before calculating new parity
				1614	*/
				1615	static int partial_stripe_write(struct btrfs_raid_bio *rbio)
				1616	{
				1617	int ret;
				1618
				1619	ret = lock_stripe_add(rbio);
				1620	if (ret == 0)
				1621	async_rmw_stripe(rbio);
				1622	return 0;
				1623	}
				1624
				1625	/*
				1626	* sometimes while we were reading from the drive to
				1627	* recalculate parity, enough new bios come into create
				1628	* a full stripe. So we do a check here to see if we can
				1629	* go directly to finish_rmw
				1630	*/
				1631	static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
				1632	{
				1633	/* head off into rmw land if we don't have a full stripe */
				1634	if (!rbio_is_full(rbio))
				1635	return partial_stripe_write(rbio);
				1636	return full_stripe_write(rbio);
				1637	}
				1638
				1639	/*
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1640	* We use plugging call backs to collect full stripes.
				1641	* Any time we get a partial stripe write while plugged
				1642	* we collect it into a list. When the unplug comes down,
				1643	* we sort the list by logical block number and merge
				1644	* everything we can into the same rbios
				1645	*/
				1646	struct btrfs_plug_cb {
				1647	struct blk_plug_cb cb;
				1648	struct btrfs_fs_info *info;
				1649	struct list_head rbio_list;
				1650	struct btrfs_work work;
				1651	};
				1652
				1653	/*
				1654	* rbios on the plug list are sorted for easier merging.
				1655	*/
				1656	static int plug_cmp(void priv, struct list_head a, struct list_head *b)
				1657	{
				1658	struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
				1659	plug_list);
				1660	struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
				1661	plug_list);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1662	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
				1663	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1664
				1665	if (a_sector < b_sector)
				1666	return -1;
				1667	if (a_sector > b_sector)
				1668	return 1;
				1669	return 0;
				1670	}
				1671
				1672	static void run_plug(struct btrfs_plug_cb *plug)
				1673	{
				1674	struct btrfs_raid_bio *cur;
				1675	struct btrfs_raid_bio *last = NULL;
				1676
				1677	/*
				1678	* sort our plug list then try to merge
				1679	* everything we can in hopes of creating full
				1680	* stripes.
				1681	*/
				1682	list_sort(NULL, &plug->rbio_list, plug_cmp);
				1683	while (!list_empty(&plug->rbio_list)) {
				1684	cur = list_entry(plug->rbio_list.next,
				1685	struct btrfs_raid_bio, plug_list);
				1686	list_del_init(&cur->plug_list);
				1687
				1688	if (rbio_is_full(cur)) {
				1689	/* we have a full stripe, send it down */
				1690	full_stripe_write(cur);
				1691	continue;
				1692	}
				1693	if (last) {
				1694	if (rbio_can_merge(last, cur)) {
				1695	merge_rbio(last, cur);
				1696	__free_raid_bio(cur);
				1697	continue;
				1698
				1699	}
				1700	__raid56_parity_write(last);
				1701	}
				1702	last = cur;
				1703	}
				1704	if (last) {
				1705	__raid56_parity_write(last);
				1706	}
				1707	kfree(plug);
				1708	}
				1709
				1710	/*
				1711	* if the unplug comes from schedule, we have to push the
				1712	* work off to a helper thread
				1713	*/
				1714	static void unplug_work(struct btrfs_work *work)
				1715	{
				1716	struct btrfs_plug_cb *plug;
				1717	plug = container_of(work, struct btrfs_plug_cb, work);
				1718	run_plug(plug);
				1719	}
				1720
				1721	static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
				1722	{
				1723	struct btrfs_plug_cb *plug;
				1724	plug = container_of(cb, struct btrfs_plug_cb, cb);
				1725
				1726	if (from_schedule) {
Liu Bo	9e0af23	2014-08-15 23:36:53 +0800	[diff] [blame]	1727	btrfs_init_work(&plug->work, btrfs_rmw_helper,
				1728	unplug_work, NULL, NULL);
Qu Wenruo	d05a33a	2014-02-28 10:46:11 +0800	[diff] [blame]	1729	btrfs_queue_work(plug->info->rmw_workers,
				1730	&plug->work);
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1731	return;
				1732	}
				1733	run_plug(plug);
				1734	}
				1735
				1736	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1737	* our main entry point for writes from the rest of the FS.
				1738	*/
				1739	int raid56_parity_write(struct btrfs_root root, struct bio bio,
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1740	struct btrfs_bio *bbio, u64 stripe_len)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1741	{
				1742	struct btrfs_raid_bio *rbio;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1743	struct btrfs_plug_cb *plug = NULL;
				1744	struct blk_plug_cb *cb;
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1745	int ret;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1746
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1747	rbio = alloc_rbio(root, bbio, stripe_len);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	1748	if (IS_ERR(rbio)) {
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	1749	btrfs_put_bbio(bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1750	return PTR_ERR(rbio);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	1751	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1752	bio_list_add(&rbio->bio_list, bio);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1753	rbio->bio_list_bytes = bio->bi_iter.bi_size;
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1754	rbio->operation = BTRFS_RBIO_WRITE;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1755
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1756	btrfs_bio_counter_inc_noblocked(root->fs_info);
				1757	rbio->generic_bio_cnt = 1;
				1758
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1759	/*
				1760	* don't plug on full rbios, just get them out the door
				1761	* as quickly as we can
				1762	*/
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1763	if (rbio_is_full(rbio)) {
				1764	ret = full_stripe_write(rbio);
				1765	if (ret)
				1766	btrfs_bio_counter_dec(root->fs_info);
				1767	return ret;
				1768	}
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1769
				1770	cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
				1771	sizeof(*plug));
				1772	if (cb) {
				1773	plug = container_of(cb, struct btrfs_plug_cb, cb);
				1774	if (!plug->info) {
				1775	plug->info = root->fs_info;
				1776	INIT_LIST_HEAD(&plug->rbio_list);
				1777	}
				1778	list_add_tail(&rbio->plug_list, &plug->rbio_list);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1779	ret = 0;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1780	} else {
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1781	ret = __raid56_parity_write(rbio);
				1782	if (ret)
				1783	btrfs_bio_counter_dec(root->fs_info);
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1784	}
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1785	return ret;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1786	}
				1787
				1788	/*
				1789	* all parity reconstruction happens here. We've read in everything
				1790	* we can find from the drives and this does the heavy lifting of
				1791	* sorting the good from the bad.
				1792	*/
				1793	static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
				1794	{
				1795	int pagenr, stripe;
				1796	void **pointers;
				1797	int faila = -1, failb = -1;
David Sterba	ed6078f	2014-06-05 01:59:57 +0200	[diff] [blame]	1798	int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1799	struct page *page;
				1800	int err;
				1801	int i;
				1802
David Sterba	31e818f	2015-02-20 18:00:26 +0100	[diff] [blame]	1803	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1804	if (!pointers) {
				1805	err = -ENOMEM;
				1806	goto cleanup_io;
				1807	}
				1808
				1809	faila = rbio->faila;
				1810	failb = rbio->failb;
				1811
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1812	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1813	spin_lock_irq(&rbio->bio_list_lock);
				1814	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				1815	spin_unlock_irq(&rbio->bio_list_lock);
				1816	}
				1817
				1818	index_rbio_pages(rbio);
				1819
				1820	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	1821	/*
				1822	* Now we just use bitmap to mark the horizontal stripes in
				1823	* which we have data when doing parity scrub.
				1824	*/
				1825	if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
				1826	!test_bit(pagenr, rbio->dbitmap))
				1827	continue;
				1828
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1829	/* setup our array of pointers with pages
				1830	* from each stripe
				1831	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1832	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1833	/*
				1834	* if we're rebuilding a read, we have to use
				1835	* pages from the bio list
				1836	*/
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1837	if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1838	(stripe == faila \|\| stripe == failb)) {
				1839	page = page_in_rbio(rbio, stripe, pagenr, 0);
				1840	} else {
				1841	page = rbio_stripe_page(rbio, stripe, pagenr);
				1842	}
				1843	pointers[stripe] = kmap(page);
				1844	}
				1845
				1846	/* all raid6 handling here */
Zhao Lei	10f1190	2015-01-20 15:11:43 +0800	[diff] [blame]	1847	if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1848	/*
				1849	* single failure, rebuild from parity raid5
				1850	* style
				1851	*/
				1852	if (failb < 0) {
				1853	if (faila == rbio->nr_data) {
				1854	/*
				1855	* Just the P stripe has failed, without
				1856	* a bad data or Q stripe.
				1857	* TODO, we should redo the xor here.
				1858	*/
				1859	err = -EIO;
				1860	goto cleanup;
				1861	}
				1862	/*
				1863	* a single failure in raid6 is rebuilt
				1864	* in the pstripe code below
				1865	*/
				1866	goto pstripe;
				1867	}
				1868
				1869	/* make sure our ps and qs are in order */
				1870	if (faila > failb) {
				1871	int tmp = failb;
				1872	failb = faila;
				1873	faila = tmp;
				1874	}
				1875
				1876	/* if the q stripe is failed, do a pstripe reconstruction
				1877	* from the xors.
				1878	* If both the q stripe and the P stripe are failed, we're
				1879	* here due to a crc mismatch and we can't give them the
				1880	* data they want
				1881	*/
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1882	if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
				1883	if (rbio->bbio->raid_map[faila] ==
				1884	RAID5_P_STRIPE) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1885	err = -EIO;
				1886	goto cleanup;
				1887	}
				1888	/*
				1889	* otherwise we have one bad data stripe and
				1890	* a good P stripe. raid5!
				1891	*/
				1892	goto pstripe;
				1893	}
				1894
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1895	if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1896	raid6_datap_recov(rbio->real_stripes,
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1897	PAGE_SIZE, faila, pointers);
				1898	} else {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1899	raid6_2data_recov(rbio->real_stripes,
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1900	PAGE_SIZE, faila, failb,
				1901	pointers);
				1902	}
				1903	} else {
				1904	void *p;
				1905
				1906	/* rebuild from P stripe here (raid5 or raid6) */
				1907	BUG_ON(failb != -1);
				1908	pstripe:
				1909	/* Copy parity block into failed block to start with */
				1910	memcpy(pointers[faila],
				1911	pointers[rbio->nr_data],
				1912	PAGE_CACHE_SIZE);
				1913
				1914	/* rearrange the pointer array */
				1915	p = pointers[faila];
				1916	for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
				1917	pointers[stripe] = pointers[stripe + 1];
				1918	pointers[rbio->nr_data - 1] = p;
				1919
				1920	/* xor in the rest */
				1921	run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
				1922	}
				1923	/* if we're doing this rebuild as part of an rmw, go through
				1924	* and set all of our private rbio pages in the
				1925	* failed stripes as uptodate. This way finish_rmw will
				1926	* know they can be trusted. If this was a read reconstruction,
				1927	* other endio functions will fiddle the uptodate bits
				1928	*/
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1929	if (rbio->operation == BTRFS_RBIO_WRITE) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1930	for (i = 0; i < nr_pages; i++) {
				1931	if (faila != -1) {
				1932	page = rbio_stripe_page(rbio, faila, i);
				1933	SetPageUptodate(page);
				1934	}
				1935	if (failb != -1) {
				1936	page = rbio_stripe_page(rbio, failb, i);
				1937	SetPageUptodate(page);
				1938	}
				1939	}
				1940	}
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1941	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1942	/*
				1943	* if we're rebuilding a read, we have to use
				1944	* pages from the bio list
				1945	*/
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1946	if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1947	(stripe == faila \|\| stripe == failb)) {
				1948	page = page_in_rbio(rbio, stripe, pagenr, 0);
				1949	} else {
				1950	page = rbio_stripe_page(rbio, stripe, pagenr);
				1951	}
				1952	kunmap(page);
				1953	}
				1954	}
				1955
				1956	err = 0;
				1957	cleanup:
				1958	kfree(pointers);
				1959
				1960	cleanup_io:
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1961	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	1962	if (err == 0)
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1963	cache_rbio_pages(rbio);
				1964	else
				1965	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				1966
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1967	rbio_orig_end_io(rbio, err, err == 0);
				1968	} else if (err == 0) {
				1969	rbio->faila = -1;
				1970	rbio->failb = -1;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	1971
				1972	if (rbio->operation == BTRFS_RBIO_WRITE)
				1973	finish_rmw(rbio);
				1974	else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
				1975	finish_parity_scrub(rbio, 0);
				1976	else
				1977	BUG();
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1978	} else {
				1979	rbio_orig_end_io(rbio, err, 0);
				1980	}
				1981	}
				1982
				1983	/*
				1984	* This is called only for stripes we've read from disk to
				1985	* reconstruct the parity.
				1986	*/
				1987	static void raid_recover_end_io(struct bio *bio, int err)
				1988	{
				1989	struct btrfs_raid_bio *rbio = bio->bi_private;
				1990
				1991	/*
				1992	* we only read stripe pages off the disk, set them
				1993	* up to date if there were no errors
				1994	*/
				1995	if (err)
				1996	fail_bio_stripe(rbio, bio);
				1997	else
				1998	set_bio_pages_uptodate(bio);
				1999	bio_put(bio);
				2000
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2001	if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2002	return;
				2003
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2004	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2005	rbio_orig_end_io(rbio, -EIO, 0);
				2006	else
				2007	__raid_recover_end_io(rbio);
				2008	}
				2009
				2010	/*
				2011	* reads everything we need off the disk to reconstruct
				2012	* the parity. endio handlers trigger final reconstruction
				2013	* when the IO is done.
				2014	*
				2015	* This is used both for reads from the higher layers and for
				2016	* parity construction required to finish a rmw cycle.
				2017	*/
				2018	static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
				2019	{
				2020	int bios_to_read = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2021	struct bio_list bio_list;
				2022	int ret;
David Sterba	ed6078f	2014-06-05 01:59:57 +0200	[diff] [blame]	2023	int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2024	int pagenr;
				2025	int stripe;
				2026	struct bio *bio;
				2027
				2028	bio_list_init(&bio_list);
				2029
				2030	ret = alloc_rbio_pages(rbio);
				2031	if (ret)
				2032	goto cleanup;
				2033
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2034	atomic_set(&rbio->error, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2035
				2036	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	2037	* read everything that hasn't failed. Thanks to the
				2038	* stripe cache, it is possible that some or all of these
				2039	* pages are going to be uptodate.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2040	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2041	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
Liu Bo	5588383	2014-06-24 15:39:16 +0800	[diff] [blame]	2042	if (rbio->faila == stripe \|\| rbio->failb == stripe) {
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2043	atomic_inc(&rbio->error);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2044	continue;
Liu Bo	5588383	2014-06-24 15:39:16 +0800	[diff] [blame]	2045	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2046
				2047	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
				2048	struct page *p;
				2049
				2050	/*
				2051	* the rmw code may have already read this
				2052	* page in
				2053	*/
				2054	p = rbio_stripe_page(rbio, stripe, pagenr);
				2055	if (PageUptodate(p))
				2056	continue;
				2057
				2058	ret = rbio_add_io_page(rbio, &bio_list,
				2059	rbio_stripe_page(rbio, stripe, pagenr),
				2060	stripe, pagenr, rbio->stripe_len);
				2061	if (ret < 0)
				2062	goto cleanup;
				2063	}
				2064	}
				2065
				2066	bios_to_read = bio_list_size(&bio_list);
				2067	if (!bios_to_read) {
				2068	/*
				2069	* we might have no bios to read just because the pages
				2070	* were up to date, or we might have no bios to read because
				2071	* the devices were gone.
				2072	*/
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2073	if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2074	__raid_recover_end_io(rbio);
				2075	goto out;
				2076	} else {
				2077	goto cleanup;
				2078	}
				2079	}
				2080
				2081	/*
				2082	* the bbio may be freed once we submit the last bio. Make sure
				2083	* not to touch it after that
				2084	*/
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2085	atomic_set(&rbio->stripes_pending, bios_to_read);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2086	while (1) {
				2087	bio = bio_list_pop(&bio_list);
				2088	if (!bio)
				2089	break;
				2090
				2091	bio->bi_private = rbio;
				2092	bio->bi_end_io = raid_recover_end_io;
				2093
				2094	btrfs_bio_wq_end_io(rbio->fs_info, bio,
				2095	BTRFS_WQ_ENDIO_RAID56);
				2096
				2097	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
				2098	submit_bio(READ, bio);
				2099	}
				2100	out:
				2101	return 0;
				2102
				2103	cleanup:
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	2104	if (rbio->operation == BTRFS_RBIO_READ_REBUILD)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2105	rbio_orig_end_io(rbio, -EIO, 0);
				2106	return -EIO;
				2107	}
				2108
				2109	/*
				2110	* the main entry point for reads from the higher layers. This
				2111	* is really only called when the normal read path had a failure,
				2112	* so we assume the bio they send down corresponds to a failed part
				2113	* of the drive.
				2114	*/
				2115	int raid56_parity_recover(struct btrfs_root root, struct bio bio,
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2116	struct btrfs_bio *bbio, u64 stripe_len,
				2117	int mirror_num, int generic_io)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2118	{
				2119	struct btrfs_raid_bio *rbio;
				2120	int ret;
				2121
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2122	rbio = alloc_rbio(root, bbio, stripe_len);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	2123	if (IS_ERR(rbio)) {
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	2124	if (generic_io)
				2125	btrfs_put_bbio(bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2126	return PTR_ERR(rbio);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	2127	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2128
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	2129	rbio->operation = BTRFS_RBIO_READ_REBUILD;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2130	bio_list_add(&rbio->bio_list, bio);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	2131	rbio->bio_list_bytes = bio->bi_iter.bi_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2132
				2133	rbio->faila = find_logical_bio_stripe(rbio, bio);
				2134	if (rbio->faila == -1) {
				2135	BUG();
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	2136	if (generic_io)
				2137	btrfs_put_bbio(bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2138	kfree(rbio);
				2139	return -EIO;
				2140	}
				2141
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	2142	if (generic_io) {
				2143	btrfs_bio_counter_inc_noblocked(root->fs_info);
				2144	rbio->generic_bio_cnt = 1;
				2145	} else {
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	2146	btrfs_get_bbio(bbio);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	2147	}
				2148
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2149	/*
				2150	* reconstruct from the q stripe if they are
				2151	* asking for mirror 3
				2152	*/
				2153	if (mirror_num == 3)
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2154	rbio->failb = rbio->real_stripes - 2;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2155
				2156	ret = lock_stripe_add(rbio);
				2157
				2158	/*
				2159	* __raid56_parity_recover will end the bio with
				2160	* any errors it hits. We don't want to return
				2161	* its error value up the stack because our caller
				2162	* will end up calling bio_endio with any nonzero
				2163	* return
				2164	*/
				2165	if (ret == 0)
				2166	__raid56_parity_recover(rbio);
				2167	/*
				2168	* our rbio has been added to the list of
				2169	* rbios that will be handled after the
				2170	* currently lock owner is done
				2171	*/
				2172	return 0;
				2173
				2174	}
				2175
				2176	static void rmw_work(struct btrfs_work *work)
				2177	{
				2178	struct btrfs_raid_bio *rbio;
				2179
				2180	rbio = container_of(work, struct btrfs_raid_bio, work);
				2181	raid56_rmw_stripe(rbio);
				2182	}
				2183
				2184	static void read_rebuild_work(struct btrfs_work *work)
				2185	{
				2186	struct btrfs_raid_bio *rbio;
				2187
				2188	rbio = container_of(work, struct btrfs_raid_bio, work);
				2189	__raid56_parity_recover(rbio);
				2190	}
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2191
/*
 * The following code is used to scrub/replace the parity stripe.
 *
 * Note: we need to make sure that all the pages added to the scrub/replace
 * raid bio are correct and do not change during the scrub/replace.  That
 * is, those pages hold only metadata or file data with checksums.
 */
				2199
				2200	struct btrfs_raid_bio *
				2201	raid56_parity_alloc_scrub_rbio(struct btrfs_root root, struct bio bio,
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2202	struct btrfs_bio *bbio, u64 stripe_len,
				2203	struct btrfs_device *scrub_dev,
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2204	unsigned long *dbitmap, int stripe_nsectors)
				2205	{
				2206	struct btrfs_raid_bio *rbio;
				2207	int i;
				2208
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2209	rbio = alloc_rbio(root, bbio, stripe_len);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2210	if (IS_ERR(rbio))
				2211	return NULL;
				2212	bio_list_add(&rbio->bio_list, bio);
				2213	/*
				2214	* This is a special bio which is used to hold the completion handler
				2215	* and make the scrub rbio is similar to the other types
				2216	*/
				2217	ASSERT(!bio->bi_iter.bi_size);
				2218	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
				2219
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2220	for (i = 0; i < rbio->real_stripes; i++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2221	if (bbio->stripes[i].dev == scrub_dev) {
				2222	rbio->scrubp = i;
				2223	break;
				2224	}
				2225	}
				2226
				2227	/* Now we just support the sectorsize equals to page size */
				2228	ASSERT(root->sectorsize == PAGE_SIZE);
				2229	ASSERT(rbio->stripe_npages == stripe_nsectors);
				2230	bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
				2231
				2232	return rbio;
				2233	}
				2234
				2235	void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
				2236	struct page *page, u64 logical)
				2237	{
				2238	int stripe_offset;
				2239	int index;
				2240
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2241	ASSERT(logical >= rbio->bbio->raid_map[0]);
				2242	ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2243	rbio->stripe_len * rbio->nr_data);
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2244	stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2245	index = stripe_offset >> PAGE_CACHE_SHIFT;
				2246	rbio->bio_pages[index] = page;
				2247	}
				2248
				2249	/*
				2250	* We just scrub the parity that we have correct data on the same horizontal,
				2251	* so we needn't allocate all pages for all the stripes.
				2252	*/
				2253	static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
				2254	{
				2255	int i;
				2256	int bit;
				2257	int index;
				2258	struct page *page;
				2259
				2260	for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2261	for (i = 0; i < rbio->real_stripes; i++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2262	index = i * rbio->stripe_npages + bit;
				2263	if (rbio->stripe_pages[index])
				2264	continue;
				2265
				2266	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				2267	if (!page)
				2268	return -ENOMEM;
				2269	rbio->stripe_pages[index] = page;
				2270	ClearPageUptodate(page);
				2271	}
				2272	}
				2273	return 0;
				2274	}
				2275
				2276	/*
				2277	* end io function used by finish_rmw. When we finally
				2278	* get here, we've written a full stripe
				2279	*/
				2280	static void raid_write_parity_end_io(struct bio *bio, int err)
				2281	{
				2282	struct btrfs_raid_bio *rbio = bio->bi_private;
				2283
				2284	if (err)
				2285	fail_bio_stripe(rbio, bio);
				2286
				2287	bio_put(bio);
				2288
				2289	if (!atomic_dec_and_test(&rbio->stripes_pending))
				2290	return;
				2291
				2292	err = 0;
				2293
				2294	if (atomic_read(&rbio->error))
				2295	err = -EIO;
				2296
				2297	rbio_orig_end_io(rbio, err, 0);
				2298	}
				2299
				2300	static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
				2301	int need_check)
				2302	{
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2303	struct btrfs_bio *bbio = rbio->bbio;
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2304	void *pointers[rbio->real_stripes];
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2305	DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2306	int nr_data = rbio->nr_data;
				2307	int stripe;
				2308	int pagenr;
				2309	int p_stripe = -1;
				2310	int q_stripe = -1;
				2311	struct page *p_page = NULL;
				2312	struct page *q_page = NULL;
				2313	struct bio_list bio_list;
				2314	struct bio *bio;
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2315	int is_replace = 0;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2316	int ret;
				2317
				2318	bio_list_init(&bio_list);
				2319
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2320	if (rbio->real_stripes - rbio->nr_data == 1) {
				2321	p_stripe = rbio->real_stripes - 1;
				2322	} else if (rbio->real_stripes - rbio->nr_data == 2) {
				2323	p_stripe = rbio->real_stripes - 2;
				2324	q_stripe = rbio->real_stripes - 1;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2325	} else {
				2326	BUG();
				2327	}
				2328
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2329	if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
				2330	is_replace = 1;
				2331	bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
				2332	}
				2333
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2334	/*
				2335	* Because the higher layers(scrubber) are unlikely to
				2336	* use this area of the disk again soon, so don't cache
				2337	* it.
				2338	*/
				2339	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				2340
				2341	if (!need_check)
				2342	goto writeback;
				2343
				2344	p_page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				2345	if (!p_page)
				2346	goto cleanup;
				2347	SetPageUptodate(p_page);
				2348
				2349	if (q_stripe != -1) {
				2350	q_page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				2351	if (!q_page) {
				2352	__free_page(p_page);
				2353	goto cleanup;
				2354	}
				2355	SetPageUptodate(q_page);
				2356	}
				2357
				2358	atomic_set(&rbio->error, 0);
				2359
				2360	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
				2361	struct page *p;
				2362	void *parity;
				2363	/* first collect one page from each data stripe */
				2364	for (stripe = 0; stripe < nr_data; stripe++) {
				2365	p = page_in_rbio(rbio, stripe, pagenr, 0);
				2366	pointers[stripe] = kmap(p);
				2367	}
				2368
				2369	/* then add the parity stripe */
				2370	pointers[stripe++] = kmap(p_page);
				2371
				2372	if (q_stripe != -1) {
				2373
				2374	/*
				2375	* raid6, add the qstripe and call the
				2376	* library function to fill in our p/q
				2377	*/
				2378	pointers[stripe++] = kmap(q_page);
				2379
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2380	raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2381	pointers);
				2382	} else {
				2383	/* raid5 */
				2384	memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
				2385	run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
				2386	}
				2387
				2388	/* Check scrubbing pairty and repair it */
				2389	p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
				2390	parity = kmap(p);
				2391	if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE))
				2392	memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE);
				2393	else
				2394	/* Parity is right, needn't writeback */
				2395	bitmap_clear(rbio->dbitmap, pagenr, 1);
				2396	kunmap(p);
				2397
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2398	for (stripe = 0; stripe < rbio->real_stripes; stripe++)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2399	kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
				2400	}
				2401
				2402	__free_page(p_page);
				2403	if (q_page)
				2404	__free_page(q_page);
				2405
				2406	writeback:
				2407	/*
				2408	* time to start writing. Make bios for everything from the
				2409	* higher layers (the bio_list in our rbio) and our p/q. Ignore
				2410	* everything else.
				2411	*/
				2412	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
				2413	struct page *page;
				2414
				2415	page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
				2416	ret = rbio_add_io_page(rbio, &bio_list,
				2417	page, rbio->scrubp, pagenr, rbio->stripe_len);
				2418	if (ret)
				2419	goto cleanup;
				2420	}
				2421
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2422	if (!is_replace)
				2423	goto submit_write;
				2424
				2425	for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
				2426	struct page *page;
				2427
				2428	page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
				2429	ret = rbio_add_io_page(rbio, &bio_list, page,
				2430	bbio->tgtdev_map[rbio->scrubp],
				2431	pagenr, rbio->stripe_len);
				2432	if (ret)
				2433	goto cleanup;
				2434	}
				2435
				2436	submit_write:
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2437	nr_data = bio_list_size(&bio_list);
				2438	if (!nr_data) {
				2439	/* Every parity is right */
				2440	rbio_orig_end_io(rbio, 0, 0);
				2441	return;
				2442	}
				2443
				2444	atomic_set(&rbio->stripes_pending, nr_data);
				2445
				2446	while (1) {
				2447	bio = bio_list_pop(&bio_list);
				2448	if (!bio)
				2449	break;
				2450
				2451	bio->bi_private = rbio;
				2452	bio->bi_end_io = raid_write_parity_end_io;
				2453	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
				2454	submit_bio(WRITE, bio);
				2455	}
				2456	return;
				2457
				2458	cleanup:
				2459	rbio_orig_end_io(rbio, -EIO, 0);
				2460	}
				2461
				2462	static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
				2463	{
				2464	if (stripe >= 0 && stripe < rbio->nr_data)
				2465	return 1;
				2466	return 0;
				2467	}
				2468
				2469	/*
				2470	* While we're doing the parity check and repair, we could have errors
				2471	* in reading pages off the disk. This checks for errors and if we're
				2472	* not able to read the page it'll trigger parity reconstruction. The
				2473	* parity scrub will be finished after we've reconstructed the failed
				2474	* stripes
				2475	*/
				2476	static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
				2477	{
				2478	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
				2479	goto cleanup;
				2480
				2481	if (rbio->faila >= 0 \|\| rbio->failb >= 0) {
				2482	int dfail = 0, failp = -1;
				2483
				2484	if (is_data_stripe(rbio, rbio->faila))
				2485	dfail++;
				2486	else if (is_parity_stripe(rbio->faila))
				2487	failp = rbio->faila;
				2488
				2489	if (is_data_stripe(rbio, rbio->failb))
				2490	dfail++;
				2491	else if (is_parity_stripe(rbio->failb))
				2492	failp = rbio->failb;
				2493
				2494	/*
				2495	* Because we can not use a scrubbing parity to repair
				2496	* the data, so the capability of the repair is declined.
				2497	* (In the case of RAID5, we can not repair anything)
				2498	*/
				2499	if (dfail > rbio->bbio->max_errors - 1)
				2500	goto cleanup;
				2501
				2502	/*
				2503	* If all data is good, only parity is correctly, just
				2504	* repair the parity.
				2505	*/
				2506	if (dfail == 0) {
				2507	finish_parity_scrub(rbio, 0);
				2508	return;
				2509	}
				2510
				2511	/*
				2512	* Here means we got one corrupted data stripe and one
				2513	* corrupted parity on RAID6, if the corrupted parity
				2514	* is scrubbing parity, luckly, use the other one to repair
				2515	* the data, or we can not repair the data stripe.
				2516	*/
				2517	if (failp != rbio->scrubp)
				2518	goto cleanup;
				2519
				2520	__raid_recover_end_io(rbio);
				2521	} else {
				2522	finish_parity_scrub(rbio, 1);
				2523	}
				2524	return;
				2525
				2526	cleanup:
				2527	rbio_orig_end_io(rbio, -EIO, 0);
				2528	}
				2529
				2530	/*
				2531	* end io for the read phase of the rmw cycle. All the bios here are physical
				2532	* stripe bios we've read from the disk so we can recalculate the parity of the
				2533	* stripe.
				2534	*
				2535	* This will usually kick off finish_rmw once all the bios are read in, but it
				2536	* may trigger parity reconstruction if we had any errors along the way
				2537	*/
				2538	static void raid56_parity_scrub_end_io(struct bio *bio, int err)
				2539	{
				2540	struct btrfs_raid_bio *rbio = bio->bi_private;
				2541
				2542	if (err)
				2543	fail_bio_stripe(rbio, bio);
				2544	else
				2545	set_bio_pages_uptodate(bio);
				2546
				2547	bio_put(bio);
				2548
				2549	if (!atomic_dec_and_test(&rbio->stripes_pending))
				2550	return;
				2551
				2552	/*
				2553	* this will normally call finish_rmw to start our write
				2554	* but if there are any failed stripes we'll reconstruct
				2555	* from parity first
				2556	*/
				2557	validate_rbio_for_parity_scrub(rbio);
				2558	}
				2559
				2560	static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
				2561	{
				2562	int bios_to_read = 0;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2563	struct bio_list bio_list;
				2564	int ret;
				2565	int pagenr;
				2566	int stripe;
				2567	struct bio *bio;
				2568
				2569	ret = alloc_rbio_essential_pages(rbio);
				2570	if (ret)
				2571	goto cleanup;
				2572
				2573	bio_list_init(&bio_list);
				2574
				2575	atomic_set(&rbio->error, 0);
				2576	/*
				2577	* build a list of bios to read all the missing parts of this
				2578	* stripe
				2579	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2580	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2581	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
				2582	struct page *page;
				2583	/*
				2584	* we want to find all the pages missing from
				2585	* the rbio and read them from the disk. If
				2586	* page_in_rbio finds a page in the bio list
				2587	* we don't need to read it off the stripe.
				2588	*/
				2589	page = page_in_rbio(rbio, stripe, pagenr, 1);
				2590	if (page)
				2591	continue;
				2592
				2593	page = rbio_stripe_page(rbio, stripe, pagenr);
				2594	/*
				2595	* the bio cache may have handed us an uptodate
				2596	* page. If so, be happy and use it
				2597	*/
				2598	if (PageUptodate(page))
				2599	continue;
				2600
				2601	ret = rbio_add_io_page(rbio, &bio_list, page,
				2602	stripe, pagenr, rbio->stripe_len);
				2603	if (ret)
				2604	goto cleanup;
				2605	}
				2606	}
				2607
				2608	bios_to_read = bio_list_size(&bio_list);
				2609	if (!bios_to_read) {
				2610	/*
				2611	* this can happen if others have merged with
				2612	* us, it means there is nothing left to read.
				2613	* But if there are missing devices it may not be
				2614	* safe to do the full stripe write yet.
				2615	*/
				2616	goto finish;
				2617	}
				2618
				2619	/*
				2620	* the bbio may be freed once we submit the last bio. Make sure
				2621	* not to touch it after that
				2622	*/
				2623	atomic_set(&rbio->stripes_pending, bios_to_read);
				2624	while (1) {
				2625	bio = bio_list_pop(&bio_list);
				2626	if (!bio)
				2627	break;
				2628
				2629	bio->bi_private = rbio;
				2630	bio->bi_end_io = raid56_parity_scrub_end_io;
				2631
				2632	btrfs_bio_wq_end_io(rbio->fs_info, bio,
				2633	BTRFS_WQ_ENDIO_RAID56);
				2634
				2635	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
				2636	submit_bio(READ, bio);
				2637	}
				2638	/* the actual write will happen once the reads are done */
				2639	return;
				2640
				2641	cleanup:
				2642	rbio_orig_end_io(rbio, -EIO, 0);
				2643	return;
				2644
				2645	finish:
				2646	validate_rbio_for_parity_scrub(rbio);
				2647	}
				2648
				2649	static void scrub_parity_work(struct btrfs_work *work)
				2650	{
				2651	struct btrfs_raid_bio *rbio;
				2652
				2653	rbio = container_of(work, struct btrfs_raid_bio, work);
				2654	raid56_parity_scrub_stripe(rbio);
				2655	}
				2656
				2657	static void async_scrub_parity(struct btrfs_raid_bio *rbio)
				2658	{
				2659	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
				2660	scrub_parity_work, NULL, NULL);
				2661
				2662	btrfs_queue_work(rbio->fs_info->rmw_workers,
				2663	&rbio->work);
				2664	}
				2665
				/*
				 * Submit a parity-scrub rbio.
				 *
				 * The scrub is queued for asynchronous processing only when
				 * lock_stripe_add() returns 0.
				 * NOTE(review): a non-zero return presumably means the rbio was left
				 * with the current holder of the stripe lock to be run later -- confirm
				 * against lock_stripe_add().
				 */
				void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
				{
					if (lock_stripe_add(rbio))
						return;

					async_scrub_parity(rbio);
				}