Blame - fs/btrfs/raid56.c - kernel/msm-4.9

blob: 8ab2a17bbba8b754bdcf90721d3ca40fc0e2f4b6 [file] [log] [blame]

David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1	/*
				2	* Copyright (C) 2012 Fusion-io All rights reserved.
				3	* Copyright (C) 2012 Intel Corp. All rights reserved.
				4	*
				5	* This program is free software; you can redistribute it and/or
				6	* modify it under the terms of the GNU General Public
				7	* License v2 as published by the Free Software Foundation.
				8	*
				9	* This program is distributed in the hope that it will be useful,
				10	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				11	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				12	* General Public License for more details.
				13	*
				14	* You should have received a copy of the GNU General Public
				15	* License along with this program; if not, write to the
				16	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				17	* Boston, MA 02111-1307, USA.
				18	*/
				19	#include <linux/sched.h>
				20	#include <linux/wait.h>
				21	#include <linux/bio.h>
				22	#include <linux/slab.h>
				23	#include <linux/buffer_head.h>
				24	#include <linux/blkdev.h>
				25	#include <linux/random.h>
				26	#include <linux/iocontext.h>
				27	#include <linux/capability.h>
				28	#include <linux/ratelimit.h>
				29	#include <linux/kthread.h>
				30	#include <linux/raid/pq.h>
				31	#include <linux/hash.h>
				32	#include <linux/list_sort.h>
				33	#include <linux/raid/xor.h>
Geert Uytterhoeven	d7011f5	2013-03-03 04:44:41 -0700	[diff] [blame]	34	#include <linux/vmalloc.h>
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	35	#include <asm/div64.h>
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	36	#include "ctree.h"
				37	#include "extent_map.h"
				38	#include "disk-io.h"
				39	#include "transaction.h"
				40	#include "print-tree.h"
				41	#include "volumes.h"
				42	#include "raid56.h"
				43	#include "async-thread.h"
				44	#include "check-integrity.h"
				45	#include "rcu-string.h"
				46
/*
 * Bit numbers used with set_bit()/test_bit() on btrfs_raid_bio::flags.
 */

/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT 1

/*
 * set when this rbio is sitting in the hash, but it is just a cache
 * of past RMW
 */
#define RBIO_CACHE_BIT 2

/*
 * set when it is safe to trust the stripe_pages for caching
 */
#define RBIO_CACHE_READY_BIT 3

/*
 * bbio and raid_map are managed by the caller, so we shouldn't free
 * them here.  Besides that, rbios with this flag should not be cached:
 * we need raid_map to check whether two rbios cover the same stripe,
 * but it is very likely that the caller has already freed raid_map,
 * so don't cache those rbios.
 */
#define RBIO_HOLD_BBIO_MAP_BIT 4

/*
 * Limit on the number of cached rbios: cache_rbio() prunes the entry at
 * the tail of the LRU once table->cache_size exceeds this value.
 */
#define RBIO_CACHE_SIZE 1024
				71
/*
 * The kind of work an rbio performs; stored in btrfs_raid_bio::operation.
 * unlock_stripe() uses it to pick how the next waiting rbio is started.
 */
enum btrfs_rbio_ops {
	/* started via async_rmw_stripe() */
	BTRFS_RBIO_WRITE = 0,
	/* started via async_read_rebuild() */
	BTRFS_RBIO_READ_REBUILD = 1,
	/* started via async_scrub_parity(); never merged with other rbios */
	BTRFS_RBIO_PARITY_SCRUB = 2,
};
				77
/*
 * State for one raid5/6 operation (see enum btrfs_rbio_ops) on a single
 * full stripe.  An rbio may sit in the stripe hash table (hash_list), in
 * the stripe cache LRU (stripe_cache) and/or on another rbio's plug list.
 */
struct btrfs_raid_bio {
	struct btrfs_fs_info *fs_info;
	/* freed together with raid_map unless RBIO_HOLD_BBIO_MAP_BIT is set */
	struct btrfs_bio *bbio;

	/*
	 * logical block numbers for the start of each stripe
	 * The last one or two are p/q.  These are sorted,
	 * so raid_map[0] is the start of our full stripe
	 */
	u64 *raid_map;

	/* while we're doing rmw on a stripe
	 * we put it into a hash table so we can
	 * lock the stripe and merge more rbios
	 * into it.
	 */
	struct list_head hash_list;

	/*
	 * LRU list for the stripe cache
	 */
	struct list_head stripe_cache;

	/*
	 * for scheduling work in the helper threads
	 */
	struct btrfs_work work;

	/*
	 * bio list and bio_list_lock are used
	 * to add more bios into the stripe
	 * in hopes of avoiding the full rmw
	 */
	struct bio_list bio_list;
	spinlock_t bio_list_lock;

	/* also protected by the bio_list_lock, the
	 * plug list is used by the plugging code
	 * to collect partial bios while plugged.  The
	 * stripe locking code also uses it to hand off
	 * the stripe lock to the next pending IO
	 */
	struct list_head plug_list;

	/*
	 * flags that tell us if it is safe to
	 * merge with this bio (RBIO_*_BIT values)
	 */
	unsigned long flags;

	/* size of each individual stripe on disk */
	int stripe_len;

	/* number of data stripes (no p/q) */
	int nr_data;

	/*
	 * NOTE(review): presumably the stripe count including p/q;
	 * rbio_qstripe_page() treats nr_data + 1 == real_stripes as
	 * "there is no q stripe" -- confirm against the allocator.
	 */
	int real_stripes;

	/* NOTE(review): presumably the number of pages per stripe -- confirm */
	int stripe_npages;
	/*
	 * which operation this rbio performs.  A rebuild done for a
	 * read from higher up (BTRFS_RBIO_READ_REBUILD) is handled
	 * differently from a parity rebuild as part of rmw
	 */
	enum btrfs_rbio_ops operation;

	/* first bad stripe */
	int faila;

	/* second bad stripe (for raid6 use) */
	int failb;

	/* NOTE(review): presumably the stripe being scrubbed -- confirm */
	int scrubp;
	/*
	 * number of pages needed to represent the full
	 * stripe
	 */
	int nr_pages;

	/*
	 * size of all the bios in the bio_list.  This
	 * helps us decide if the rbio maps to a full
	 * stripe or not
	 */
	int bio_list_bytes;

	/* accumulated into the destination when rbios are merged */
	int generic_bio_cnt;

	/* reference count; __free_raid_bio() frees the rbio at zero */
	atomic_t refs;

	atomic_t stripes_pending;

	atomic_t error;
	/*
	 * these are two arrays of pointers.  We allocate the
	 * rbio big enough to hold them both and setup their
	 * locations when the rbio is allocated
	 */

	/* pointers to pages that we allocated for
	 * reading/writing stripes directly from the disk (including P/Q)
	 */
	struct page **stripe_pages;

	/*
	 * pointers to the pages in the bio_list.  Stored
	 * here for faster lookup
	 */
	struct page **bio_pages;

	/*
	 * bitmap to record which horizontal stripe has data
	 */
	unsigned long *dbitmap;
};
				194
				195	static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
				196	static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
				197	static void rmw_work(struct btrfs_work *work);
				198	static void read_rebuild_work(struct btrfs_work *work);
				199	static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
				200	static void async_read_rebuild(struct btrfs_raid_bio *rbio);
				201	static int fail_bio_stripe(struct btrfs_raid_bio rbio, struct bio bio);
				202	static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
				203	static void __free_raid_bio(struct btrfs_raid_bio *rbio);
				204	static void index_rbio_pages(struct btrfs_raid_bio *rbio);
				205	static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
				206
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	207	static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
				208	int need_check);
				209	static void async_scrub_parity(struct btrfs_raid_bio *rbio);
				210
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	211	/*
				212	* the stripe hash table is used for locking, and to collect
				213	* bios in hopes of making a full stripe
				214	*/
				215	int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
				216	{
				217	struct btrfs_stripe_hash_table *table;
				218	struct btrfs_stripe_hash_table *x;
				219	struct btrfs_stripe_hash *cur;
				220	struct btrfs_stripe_hash *h;
				221	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
				222	int i;
David Sterba	83c8266	2013-03-01 15:03:00 +0000	[diff] [blame]	223	int table_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	224
				225	if (info->stripe_hash_table)
				226	return 0;
				227
David Sterba	83c8266	2013-03-01 15:03:00 +0000	[diff] [blame]	228	/*
				229	* The table is large, starting with order 4 and can go as high as
				230	* order 7 in case lock debugging is turned on.
				231	*
				232	* Try harder to allocate and fallback to vmalloc to lower the chance
				233	* of a failing mount.
				234	*/
				235	table_size = sizeof(table) + sizeof(h) * num_entries;
				236	table = kzalloc(table_size, GFP_KERNEL \| __GFP_NOWARN \| __GFP_REPEAT);
				237	if (!table) {
				238	table = vzalloc(table_size);
				239	if (!table)
				240	return -ENOMEM;
				241	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	242
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	243	spin_lock_init(&table->cache_lock);
				244	INIT_LIST_HEAD(&table->stripe_cache);
				245
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	246	h = table->table;
				247
				248	for (i = 0; i < num_entries; i++) {
				249	cur = h + i;
				250	INIT_LIST_HEAD(&cur->hash_list);
				251	spin_lock_init(&cur->lock);
				252	init_waitqueue_head(&cur->wait);
				253	}
				254
				255	x = cmpxchg(&info->stripe_hash_table, NULL, table);
David Sterba	83c8266	2013-03-01 15:03:00 +0000	[diff] [blame]	256	if (x) {
				257	if (is_vmalloc_addr(x))
				258	vfree(x);
				259	else
				260	kfree(x);
				261	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	262	return 0;
				263	}
				264
				265	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	266	* caching an rbio means to copy anything from the
				267	* bio_pages array into the stripe_pages array. We
				268	* use the page uptodate bit in the stripe cache array
				269	* to indicate if it has valid data
				270	*
				271	* once the caching is done, we set the cache ready
				272	* bit.
				273	*/
				274	static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
				275	{
				276	int i;
				277	char *s;
				278	char *d;
				279	int ret;
				280
				281	ret = alloc_rbio_pages(rbio);
				282	if (ret)
				283	return;
				284
				285	for (i = 0; i < rbio->nr_pages; i++) {
				286	if (!rbio->bio_pages[i])
				287	continue;
				288
				289	s = kmap(rbio->bio_pages[i]);
				290	d = kmap(rbio->stripe_pages[i]);
				291
				292	memcpy(d, s, PAGE_CACHE_SIZE);
				293
				294	kunmap(rbio->bio_pages[i]);
				295	kunmap(rbio->stripe_pages[i]);
				296	SetPageUptodate(rbio->stripe_pages[i]);
				297	}
				298	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				299	}
				300
				301	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	302	* we hash on the first logical address of the stripe
				303	*/
				304	static int rbio_bucket(struct btrfs_raid_bio *rbio)
				305	{
				306	u64 num = rbio->raid_map[0];
				307
				308	/*
				309	* we shift down quite a bit. We're using byte
				310	* addressing, and most of the lower bits are zeros.
				311	* This tends to upset hash_64, and it consistently
				312	* returns just one or two different values.
				313	*
				314	* shifting off the lower bits fixes things.
				315	*/
				316	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
				317	}
				318
				319	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	320	* stealing an rbio means taking all the uptodate pages from the stripe
				321	* array in the source rbio and putting them into the destination rbio
				322	*/
				323	static void steal_rbio(struct btrfs_raid_bio src, struct btrfs_raid_bio dest)
				324	{
				325	int i;
				326	struct page *s;
				327	struct page *d;
				328
				329	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
				330	return;
				331
				332	for (i = 0; i < dest->nr_pages; i++) {
				333	s = src->stripe_pages[i];
				334	if (!s \|\| !PageUptodate(s)) {
				335	continue;
				336	}
				337
				338	d = dest->stripe_pages[i];
				339	if (d)
				340	__free_page(d);
				341
				342	dest->stripe_pages[i] = s;
				343	src->stripe_pages[i] = NULL;
				344	}
				345	}
				346
				347	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	348	* merging means we take the bio_list from the victim and
				349	* splice it into the destination. The victim should
				350	* be discarded afterwards.
				351	*
				352	* must be called with dest->rbio_list_lock held
				353	*/
				354	static void merge_rbio(struct btrfs_raid_bio *dest,
				355	struct btrfs_raid_bio *victim)
				356	{
				357	bio_list_merge(&dest->bio_list, &victim->bio_list);
				358	dest->bio_list_bytes += victim->bio_list_bytes;
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	359	dest->generic_bio_cnt += victim->generic_bio_cnt;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	360	bio_list_init(&victim->bio_list);
				361	}
				362
				363	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	364	* used to prune items that are in the cache. The caller
				365	* must hold the hash table lock.
				366	*/
				367	static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
				368	{
				369	int bucket = rbio_bucket(rbio);
				370	struct btrfs_stripe_hash_table *table;
				371	struct btrfs_stripe_hash *h;
				372	int freeit = 0;
				373
				374	/*
				375	* check the bit again under the hash table lock.
				376	*/
				377	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
				378	return;
				379
				380	table = rbio->fs_info->stripe_hash_table;
				381	h = table->table + bucket;
				382
				383	/* hold the lock for the bucket because we may be
				384	* removing it from the hash table
				385	*/
				386	spin_lock(&h->lock);
				387
				388	/*
				389	* hold the lock for the bio list because we need
				390	* to make sure the bio list is empty
				391	*/
				392	spin_lock(&rbio->bio_list_lock);
				393
				394	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
				395	list_del_init(&rbio->stripe_cache);
				396	table->cache_size -= 1;
				397	freeit = 1;
				398
				399	/* if the bio list isn't empty, this rbio is
				400	* still involved in an IO. We take it out
				401	* of the cache list, and drop the ref that
				402	* was held for the list.
				403	*
				404	* If the bio_list was empty, we also remove
				405	* the rbio from the hash_table, and drop
				406	* the corresponding ref
				407	*/
				408	if (bio_list_empty(&rbio->bio_list)) {
				409	if (!list_empty(&rbio->hash_list)) {
				410	list_del_init(&rbio->hash_list);
				411	atomic_dec(&rbio->refs);
				412	BUG_ON(!list_empty(&rbio->plug_list));
				413	}
				414	}
				415	}
				416
				417	spin_unlock(&rbio->bio_list_lock);
				418	spin_unlock(&h->lock);
				419
				420	if (freeit)
				421	__free_raid_bio(rbio);
				422	}
				423
				424	/*
				425	* prune a given rbio from the cache
				426	*/
				427	static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
				428	{
				429	struct btrfs_stripe_hash_table *table;
				430	unsigned long flags;
				431
				432	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
				433	return;
				434
				435	table = rbio->fs_info->stripe_hash_table;
				436
				437	spin_lock_irqsave(&table->cache_lock, flags);
				438	__remove_rbio_from_cache(rbio);
				439	spin_unlock_irqrestore(&table->cache_lock, flags);
				440	}
				441
				442	/*
				443	* remove everything in the cache
				444	*/
Eric Sandeen	48a3b63	2013-04-25 20:41:01 +0000	[diff] [blame]	445	static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	446	{
				447	struct btrfs_stripe_hash_table *table;
				448	unsigned long flags;
				449	struct btrfs_raid_bio *rbio;
				450
				451	table = info->stripe_hash_table;
				452
				453	spin_lock_irqsave(&table->cache_lock, flags);
				454	while (!list_empty(&table->stripe_cache)) {
				455	rbio = list_entry(table->stripe_cache.next,
				456	struct btrfs_raid_bio,
				457	stripe_cache);
				458	__remove_rbio_from_cache(rbio);
				459	}
				460	spin_unlock_irqrestore(&table->cache_lock, flags);
				461	}
				462
				463	/*
				464	* remove all cached entries and free the hash table
				465	* used by unmount
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	466	*/
				467	void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
				468	{
				469	if (!info->stripe_hash_table)
				470	return;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	471	btrfs_clear_rbio_cache(info);
David Sterba	83c8266	2013-03-01 15:03:00 +0000	[diff] [blame]	472	if (is_vmalloc_addr(info->stripe_hash_table))
				473	vfree(info->stripe_hash_table);
				474	else
				475	kfree(info->stripe_hash_table);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	476	info->stripe_hash_table = NULL;
				477	}
				478
				479	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	480	* insert an rbio into the stripe cache. It
				481	* must have already been prepared by calling
				482	* cache_rbio_pages
				483	*
				484	* If this rbio was already cached, it gets
				485	* moved to the front of the lru.
				486	*
				487	* If the size of the rbio cache is too big, we
				488	* prune an item.
				489	*/
				490	static void cache_rbio(struct btrfs_raid_bio *rbio)
				491	{
				492	struct btrfs_stripe_hash_table *table;
				493	unsigned long flags;
				494
				495	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
				496	return;
				497
				498	table = rbio->fs_info->stripe_hash_table;
				499
				500	spin_lock_irqsave(&table->cache_lock, flags);
				501	spin_lock(&rbio->bio_list_lock);
				502
				503	/* bump our ref if we were not in the list before */
				504	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
				505	atomic_inc(&rbio->refs);
				506
				507	if (!list_empty(&rbio->stripe_cache)){
				508	list_move(&rbio->stripe_cache, &table->stripe_cache);
				509	} else {
				510	list_add(&rbio->stripe_cache, &table->stripe_cache);
				511	table->cache_size += 1;
				512	}
				513
				514	spin_unlock(&rbio->bio_list_lock);
				515
				516	if (table->cache_size > RBIO_CACHE_SIZE) {
				517	struct btrfs_raid_bio *found;
				518
				519	found = list_entry(table->stripe_cache.prev,
				520	struct btrfs_raid_bio,
				521	stripe_cache);
				522
				523	if (found != rbio)
				524	__remove_rbio_from_cache(found);
				525	}
				526
				527	spin_unlock_irqrestore(&table->cache_lock, flags);
				528	return;
				529	}
				530
				531	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	532	* helper function to run the xor_blocks api. It is only
				533	* able to do MAX_XOR_BLOCKS at a time, so we need to
				534	* loop through.
				535	*/
				536	static void run_xor(void **pages, int src_cnt, ssize_t len)
				537	{
				538	int src_off = 0;
				539	int xor_src_cnt = 0;
				540	void *dest = pages[src_cnt];
				541
				542	while(src_cnt > 0) {
				543	xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
				544	xor_blocks(xor_src_cnt, len, dest, pages + src_off);
				545
				546	src_cnt -= xor_src_cnt;
				547	src_off += xor_src_cnt;
				548	}
				549	}
				550
				551	/*
				552	* returns true if the bio list inside this rbio
				553	* covers an entire stripe (no rmw required).
				554	* Must be called with the bio list lock held, or
				555	* at a time when you know it is impossible to add
				556	* new bios into the list
				557	*/
				558	static int __rbio_is_full(struct btrfs_raid_bio *rbio)
				559	{
				560	unsigned long size = rbio->bio_list_bytes;
				561	int ret = 1;
				562
				563	if (size != rbio->nr_data * rbio->stripe_len)
				564	ret = 0;
				565
				566	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
				567	return ret;
				568	}
				569
				570	static int rbio_is_full(struct btrfs_raid_bio *rbio)
				571	{
				572	unsigned long flags;
				573	int ret;
				574
				575	spin_lock_irqsave(&rbio->bio_list_lock, flags);
				576	ret = __rbio_is_full(rbio);
				577	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
				578	return ret;
				579	}
				580
				581	/*
				582	* returns 1 if it is safe to merge two rbios together.
				583	* The merging is safe if the two rbios correspond to
				584	* the same stripe and if they are both going in the same
				585	* direction (read vs write), and if neither one is
				586	* locked for final IO
				587	*
				588	* The caller is responsible for locking such that
				589	* rmw_locked is safe to test
				590	*/
				591	static int rbio_can_merge(struct btrfs_raid_bio *last,
				592	struct btrfs_raid_bio *cur)
				593	{
				594	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) \|\|
				595	test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
				596	return 0;
				597
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	598	/*
				599	* we can't merge with cached rbios, since the
				600	* idea is that when we merge the destination
				601	* rbio is going to run our IO for us. We can
				602	* steal from cached rbio's though, other functions
				603	* handle that.
				604	*/
				605	if (test_bit(RBIO_CACHE_BIT, &last->flags) \|\|
				606	test_bit(RBIO_CACHE_BIT, &cur->flags))
				607	return 0;
				608
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	609	if (last->raid_map[0] !=
				610	cur->raid_map[0])
				611	return 0;
				612
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	613	/* we can't merge with different operations */
				614	if (last->operation != cur->operation)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	615	return 0;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	616	/*
				617	* We've need read the full stripe from the drive.
				618	* check and repair the parity and write the new results.
				619	*
				620	* We're not allowed to add any new bios to the
				621	* bio list here, anyone else that wants to
				622	* change this stripe needs to do their own rmw.
				623	*/
				624	if (last->operation == BTRFS_RBIO_PARITY_SCRUB \|\|
				625	cur->operation == BTRFS_RBIO_PARITY_SCRUB)
				626	return 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	627
				628	return 1;
				629	}
				630
				631	/*
				632	* helper to index into the pstripe
				633	*/
				634	static struct page rbio_pstripe_page(struct btrfs_raid_bio rbio, int index)
				635	{
				636	index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
				637	return rbio->stripe_pages[index];
				638	}
				639
				640	/*
				641	* helper to index into the qstripe, returns null
				642	* if there is no qstripe
				643	*/
				644	static struct page rbio_qstripe_page(struct btrfs_raid_bio rbio, int index)
				645	{
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	646	if (rbio->nr_data + 1 == rbio->real_stripes)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	647	return NULL;
				648
				649	index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
				650	PAGE_CACHE_SHIFT;
				651	return rbio->stripe_pages[index];
				652	}
				653
				654	/*
				655	* The first stripe in the table for a logical address
				656	* has the lock. rbios are added in one of three ways:
				657	*
				658	* 1) Nobody has the stripe locked yet. The rbio is given
				659	* the lock and 0 is returned. The caller must start the IO
				660	* themselves.
				661	*
				662	* 2) Someone has the stripe locked, but we're able to merge
				663	* with the lock owner. The rbio is freed and the IO will
				664	* start automatically along with the existing rbio. 1 is returned.
				665	*
				666	* 3) Someone has the stripe locked, but we're not able to merge.
				667	* The rbio is added to the lock owner's plug list, or merged into
				668	* an rbio already on the plug list. When the lock owner unlocks,
				669	* the next rbio on the list is run and the IO is started automatically.
				670	* 1 is returned
				671	*
				672	* If we return 0, the caller still owns the rbio and must continue with
				673	* IO submission. If we return 1, the caller must assume the rbio has
				674	* already been freed.
				675	*/
				676	static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
				677	{
				678	int bucket = rbio_bucket(rbio);
				679	struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
				680	struct btrfs_raid_bio *cur;
				681	struct btrfs_raid_bio *pending;
				682	unsigned long flags;
				683	DEFINE_WAIT(wait);
				684	struct btrfs_raid_bio *freeit = NULL;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	685	struct btrfs_raid_bio *cache_drop = NULL;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	686	int ret = 0;
				687	int walk = 0;
				688
				689	spin_lock_irqsave(&h->lock, flags);
				690	list_for_each_entry(cur, &h->hash_list, hash_list) {
				691	walk++;
				692	if (cur->raid_map[0] == rbio->raid_map[0]) {
				693	spin_lock(&cur->bio_list_lock);
				694
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	695	/* can we steal this cached rbio's pages? */
				696	if (bio_list_empty(&cur->bio_list) &&
				697	list_empty(&cur->plug_list) &&
				698	test_bit(RBIO_CACHE_BIT, &cur->flags) &&
				699	!test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
				700	list_del_init(&cur->hash_list);
				701	atomic_dec(&cur->refs);
				702
				703	steal_rbio(cur, rbio);
				704	cache_drop = cur;
				705	spin_unlock(&cur->bio_list_lock);
				706
				707	goto lockit;
				708	}
				709
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	710	/* can we merge into the lock owner? */
				711	if (rbio_can_merge(cur, rbio)) {
				712	merge_rbio(cur, rbio);
				713	spin_unlock(&cur->bio_list_lock);
				714	freeit = rbio;
				715	ret = 1;
				716	goto out;
				717	}
				718
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	719
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	720	/*
				721	* we couldn't merge with the running
				722	* rbio, see if we can merge with the
				723	* pending ones. We don't have to
				724	* check for rmw_locked because there
				725	* is no way they are inside finish_rmw
				726	* right now
				727	*/
				728	list_for_each_entry(pending, &cur->plug_list,
				729	plug_list) {
				730	if (rbio_can_merge(pending, rbio)) {
				731	merge_rbio(pending, rbio);
				732	spin_unlock(&cur->bio_list_lock);
				733	freeit = rbio;
				734	ret = 1;
				735	goto out;
				736	}
				737	}
				738
				739	/* no merging, put us on the tail of the plug list,
				740	* our rbio will be started with the currently
				741	* running rbio unlocks
				742	*/
				743	list_add_tail(&rbio->plug_list, &cur->plug_list);
				744	spin_unlock(&cur->bio_list_lock);
				745	ret = 1;
				746	goto out;
				747	}
				748	}
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	749	lockit:
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	750	atomic_inc(&rbio->refs);
				751	list_add(&rbio->hash_list, &h->hash_list);
				752	out:
				753	spin_unlock_irqrestore(&h->lock, flags);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	754	if (cache_drop)
				755	remove_rbio_from_cache(cache_drop);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	756	if (freeit)
				757	__free_raid_bio(freeit);
				758	return ret;
				759	}
				760
/*
 * called as rmw or parity rebuild is completed.  If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started
 *
 * With nobody waiting, the rbio is first offered to the stripe cache
 * (cache_rbio); if it ends up cached it stays in the hash table so later
 * rbios can steal its pages.  Otherwise it is unhashed, the hash-list
 * reference is dropped, and the rbio is pruned from the cache on exit.
 */
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
	int bucket;
	struct btrfs_stripe_hash *h;
	unsigned long flags;
	int keep_cache = 0;

	bucket = rbio_bucket(rbio);
	h = rbio->fs_info->stripe_hash_table->table + bucket;

	/* only try to cache when no other rbio is waiting for this stripe */
	if (list_empty(&rbio->plug_list))
		cache_rbio(rbio);

	spin_lock_irqsave(&h->lock, flags);
	spin_lock(&rbio->bio_list_lock);

	if (!list_empty(&rbio->hash_list)) {
		/*
		 * if we're still cached and there is no other IO
		 * to perform, just leave this rbio here for others
		 * to steal from later
		 */
		if (list_empty(&rbio->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
			keep_cache = 1;
			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
			BUG_ON(!bio_list_empty(&rbio->bio_list));
			goto done;
		}

		/* give up the hash slot and the reference it held */
		list_del_init(&rbio->hash_list);
		atomic_dec(&rbio->refs);

		/*
		 * we use the plug list to hold all the rbios
		 * waiting for the chance to lock this stripe.
		 * hand the lock over to one of them.
		 */
		if (!list_empty(&rbio->plug_list)) {
			struct btrfs_raid_bio *next;
			struct list_head *head = rbio->plug_list.next;

			next = list_entry(head, struct btrfs_raid_bio,
					  plug_list);

			/* unlink ourselves from the ring of waiters */
			list_del_init(&rbio->plug_list);

			/* next becomes the lock owner for this stripe */
			list_add(&next->hash_list, &h->hash_list);
			atomic_inc(&next->refs);
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock_irqrestore(&h->lock, flags);

			/* start next; write and scrub first take our uptodate pages */
			if (next->operation == BTRFS_RBIO_READ_REBUILD)
				async_read_rebuild(next);
			else if (next->operation == BTRFS_RBIO_WRITE) {
				steal_rbio(rbio, next);
				async_rmw_stripe(next);
			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				steal_rbio(rbio, next);
				async_scrub_parity(next);
			}

			/* both locks were already released above */
			goto done_nolock;
		} else if (waitqueue_active(&h->wait)) {
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock_irqrestore(&h->lock, flags);
			wake_up(&h->wait);
			goto done_nolock;
		}
	}
done:
	spin_unlock(&rbio->bio_list_lock);
	spin_unlock_irqrestore(&h->lock, flags);

done_nolock:
	if (!keep_cache)
		remove_rbio_from_cache(rbio);
}
				843
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	844	static inline void
				845	__free_bbio_and_raid_map(struct btrfs_bio bbio, u64 raid_map, int need)
				846	{
				847	if (need) {
				848	kfree(raid_map);
				849	kfree(bbio);
				850	}
				851	}
				852
				853	static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio)
				854	{
				855	__free_bbio_and_raid_map(rbio->bbio, rbio->raid_map,
				856	!test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags));
				857	}
				858
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	859	static void __free_raid_bio(struct btrfs_raid_bio *rbio)
				860	{
				861	int i;
				862
				863	WARN_ON(atomic_read(&rbio->refs) < 0);
				864	if (!atomic_dec_and_test(&rbio->refs))
				865	return;
				866
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	867	WARN_ON(!list_empty(&rbio->stripe_cache));
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	868	WARN_ON(!list_empty(&rbio->hash_list));
				869	WARN_ON(!bio_list_empty(&rbio->bio_list));
				870
				871	for (i = 0; i < rbio->nr_pages; i++) {
				872	if (rbio->stripe_pages[i]) {
				873	__free_page(rbio->stripe_pages[i]);
				874	rbio->stripe_pages[i] = NULL;
				875	}
				876	}
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	877
				878	free_bbio_and_raid_map(rbio);
				879
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	880	kfree(rbio);
				881	}
				882
				883	static void free_raid_bio(struct btrfs_raid_bio *rbio)
				884	{
				885	unlock_stripe(rbio);
				886	__free_raid_bio(rbio);
				887	}
				888
				889	/*
				890	* this frees the rbio and runs through all the bios in the
				891	* bio_list and calls end_io on them
				892	*/
				893	static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
				894	{
				895	struct bio *cur = bio_list_get(&rbio->bio_list);
				896	struct bio *next;
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	897
				898	if (rbio->generic_bio_cnt)
				899	btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
				900
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	901	free_raid_bio(rbio);
				902
				903	while (cur) {
				904	next = cur->bi_next;
				905	cur->bi_next = NULL;
				906	if (uptodate)
				907	set_bit(BIO_UPTODATE, &cur->bi_flags);
				908	bio_endio(cur, err);
				909	cur = next;
				910	}
				911	}
				912
				913	/*
				914	* end io function used by finish_rmw. When we finally
				915	* get here, we've written a full stripe
				916	*/
				917	static void raid_write_end_io(struct bio *bio, int err)
				918	{
				919	struct btrfs_raid_bio *rbio = bio->bi_private;
				920
				921	if (err)
				922	fail_bio_stripe(rbio, bio);
				923
				924	bio_put(bio);
				925
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	926	if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	927	return;
				928
				929	err = 0;
				930
				931	/* OK, we have read all the stripes we need to. */
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	932	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	933	err = -EIO;
				934
				935	rbio_orig_end_io(rbio, err, 0);
				936	return;
				937	}
				938
				939	/*
				940	* the read/modify/write code wants to use the original bio for
				941	* any pages it included, and then use the rbio for everything
				942	* else. This function decides if a given index (stripe number)
				943	* and page number in that stripe fall inside the original bio
				944	* or the rbio.
				945	*
				946	* if you set bio_list_only, you'll get a NULL back for any ranges
				947	* that are outside the bio_list
				948	*
				949	* This doesn't take any refs on anything, you get a bare page pointer
				950	* and the caller must bump refs as required.
				951	*
				952	* You must call index_rbio_pages once before you can trust
				953	* the answers from this function.
				954	*/
				955	static struct page page_in_rbio(struct btrfs_raid_bio rbio,
				956	int index, int pagenr, int bio_list_only)
				957	{
				958	int chunk_page;
				959	struct page *p = NULL;
				960
				961	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
				962
				963	spin_lock_irq(&rbio->bio_list_lock);
				964	p = rbio->bio_pages[chunk_page];
				965	spin_unlock_irq(&rbio->bio_list_lock);
				966
				967	if (p \|\| bio_list_only)
				968	return p;
				969
				970	return rbio->stripe_pages[chunk_page];
				971	}
				972
				973	/*
				974	* number of pages we need for the entire stripe across all the
				975	* drives
				976	*/
				977	static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
				978	{
				979	unsigned long nr = stripe_len * nr_stripes;
David Sterba	ed6078f	2014-06-05 01:59:57 +0200	[diff] [blame]	980	return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	981	}
				982
				983	/*
				984	* allocation and initial setup for the btrfs_raid_bio. Not
				985	* this does not allocate any pages for rbio->pages.
				986	*/
				987	static struct btrfs_raid_bio alloc_rbio(struct btrfs_root root,
				988	struct btrfs_bio bbio, u64 raid_map,
				989	u64 stripe_len)
				990	{
				991	struct btrfs_raid_bio *rbio;
				992	int nr_data = 0;
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	993	int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
				994	int num_pages = rbio_nr_pages(stripe_len, real_stripes);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	995	int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	996	void *p;
				997
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	998	rbio = kzalloc(sizeof(rbio) + num_pages sizeof(struct page ) 2 +
				999	DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8),
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1000	GFP_NOFS);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	1001	if (!rbio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1002	return ERR_PTR(-ENOMEM);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1003
				1004	bio_list_init(&rbio->bio_list);
				1005	INIT_LIST_HEAD(&rbio->plug_list);
				1006	spin_lock_init(&rbio->bio_list_lock);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1007	INIT_LIST_HEAD(&rbio->stripe_cache);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1008	INIT_LIST_HEAD(&rbio->hash_list);
				1009	rbio->bbio = bbio;
				1010	rbio->raid_map = raid_map;
				1011	rbio->fs_info = root->fs_info;
				1012	rbio->stripe_len = stripe_len;
				1013	rbio->nr_pages = num_pages;
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1014	rbio->real_stripes = real_stripes;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	1015	rbio->stripe_npages = stripe_npages;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1016	rbio->faila = -1;
				1017	rbio->failb = -1;
				1018	atomic_set(&rbio->refs, 1);
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1019	atomic_set(&rbio->error, 0);
				1020	atomic_set(&rbio->stripes_pending, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1021
				1022	/*
				1023	* the stripe_pages and bio_pages array point to the extra
				1024	* memory we allocated past the end of the rbio
				1025	*/
				1026	p = rbio + 1;
				1027	rbio->stripe_pages = p;
				1028	rbio->bio_pages = p + sizeof(struct page ) num_pages;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	1029	rbio->dbitmap = p + sizeof(struct page ) num_pages * 2;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1030
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1031	if (raid_map[real_stripes - 1] == RAID6_Q_STRIPE)
				1032	nr_data = real_stripes - 2;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1033	else
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1034	nr_data = real_stripes - 1;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1035
				1036	rbio->nr_data = nr_data;
				1037	return rbio;
				1038	}
				1039
				1040	/* allocate pages for all the stripes in the bio, including parity */
				1041	static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
				1042	{
				1043	int i;
				1044	struct page *page;
				1045
				1046	for (i = 0; i < rbio->nr_pages; i++) {
				1047	if (rbio->stripe_pages[i])
				1048	continue;
				1049	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				1050	if (!page)
				1051	return -ENOMEM;
				1052	rbio->stripe_pages[i] = page;
				1053	ClearPageUptodate(page);
				1054	}
				1055	return 0;
				1056	}
				1057
				1058	/* allocate pages for just the p/q stripes */
				1059	static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
				1060	{
				1061	int i;
				1062	struct page *page;
				1063
				1064	i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
				1065
				1066	for (; i < rbio->nr_pages; i++) {
				1067	if (rbio->stripe_pages[i])
				1068	continue;
				1069	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				1070	if (!page)
				1071	return -ENOMEM;
				1072	rbio->stripe_pages[i] = page;
				1073	}
				1074	return 0;
				1075	}
				1076
				1077	/*
				1078	* add a single page from a specific stripe into our list of bios for IO
				1079	* this will try to merge into existing bios if possible, and returns
				1080	* zero if all went well.
				1081	*/
Eric Sandeen	48a3b63	2013-04-25 20:41:01 +0000	[diff] [blame]	1082	static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
				1083	struct bio_list *bio_list,
				1084	struct page *page,
				1085	int stripe_nr,
				1086	unsigned long page_index,
				1087	unsigned long bio_max_len)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1088	{
				1089	struct bio *last = bio_list->tail;
				1090	u64 last_end = 0;
				1091	int ret;
				1092	struct bio *bio;
				1093	struct btrfs_bio_stripe *stripe;
				1094	u64 disk_start;
				1095
				1096	stripe = &rbio->bbio->stripes[stripe_nr];
				1097	disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
				1098
				1099	/* if the device is missing, just fail this stripe */
				1100	if (!stripe->dev->bdev)
				1101	return fail_rbio_index(rbio, stripe_nr);
				1102
				1103	/* see if we can add this page onto our existing bio */
				1104	if (last) {
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1105	last_end = (u64)last->bi_iter.bi_sector << 9;
				1106	last_end += last->bi_iter.bi_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1107
				1108	/*
				1109	* we can't merge these if they are from different
				1110	* devices or if they are not contiguous
				1111	*/
				1112	if (last_end == disk_start && stripe->dev->bdev &&
				1113	test_bit(BIO_UPTODATE, &last->bi_flags) &&
				1114	last->bi_bdev == stripe->dev->bdev) {
				1115	ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
				1116	if (ret == PAGE_CACHE_SIZE)
				1117	return 0;
				1118	}
				1119	}
				1120
				1121	/* put a new bio on the list */
Chris Mason	9be3395	2013-05-17 18:30:14 -0400	[diff] [blame]	1122	bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1123	if (!bio)
				1124	return -ENOMEM;
				1125
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1126	bio->bi_iter.bi_size = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1127	bio->bi_bdev = stripe->dev->bdev;
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1128	bio->bi_iter.bi_sector = disk_start >> 9;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1129	set_bit(BIO_UPTODATE, &bio->bi_flags);
				1130
				1131	bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
				1132	bio_list_add(bio_list, bio);
				1133	return 0;
				1134	}
				1135
				1136	/*
				1137	* while we're doing the read/modify/write cycle, we could
				1138	* have errors in reading pages off the disk. This checks
				1139	* for errors and if we're not able to read the page it'll
				1140	* trigger parity reconstruction. The rmw will be finished
				1141	* after we've reconstructed the failed stripes
				1142	*/
				1143	static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
				1144	{
				1145	if (rbio->faila >= 0 \|\| rbio->failb >= 0) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1146	BUG_ON(rbio->faila == rbio->real_stripes - 1);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1147	__raid56_parity_recover(rbio);
				1148	} else {
				1149	finish_rmw(rbio);
				1150	}
				1151	}
				1152
				1153	/*
				1154	* these are just the pages from the rbio array, not from anything
				1155	* the FS sent down to us
				1156	*/
				1157	static struct page rbio_stripe_page(struct btrfs_raid_bio rbio, int stripe, int page)
				1158	{
				1159	int index;
				1160	index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
				1161	index += page;
				1162	return rbio->stripe_pages[index];
				1163	}
				1164
				1165	/*
				1166	* helper function to walk our bio list and populate the bio_pages array with
				1167	* the result. This seems expensive, but it is faster than constantly
				1168	* searching through the bio list as we setup the IO in finish_rmw or stripe
				1169	* reconstruction.
				1170	*
				1171	* This must be called before you trust the answers from page_in_rbio
				1172	*/
				1173	static void index_rbio_pages(struct btrfs_raid_bio *rbio)
				1174	{
				1175	struct bio *bio;
				1176	u64 start;
				1177	unsigned long stripe_offset;
				1178	unsigned long page_index;
				1179	struct page *p;
				1180	int i;
				1181
				1182	spin_lock_irq(&rbio->bio_list_lock);
				1183	bio_list_for_each(bio, &rbio->bio_list) {
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1184	start = (u64)bio->bi_iter.bi_sector << 9;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1185	stripe_offset = start - rbio->raid_map[0];
				1186	page_index = stripe_offset >> PAGE_CACHE_SHIFT;
				1187
				1188	for (i = 0; i < bio->bi_vcnt; i++) {
				1189	p = bio->bi_io_vec[i].bv_page;
				1190	rbio->bio_pages[page_index + i] = p;
				1191	}
				1192	}
				1193	spin_unlock_irq(&rbio->bio_list_lock);
				1194	}
				1195
				1196	/*
				1197	* this is called from one of two situations. We either
				1198	* have a full stripe from the higher layers, or we've read all
				1199	* the missing bits off disk.
				1200	*
				1201	* This will calculate the parity and then send down any
				1202	* changed blocks.
				1203	*/
				1204	static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
				1205	{
				1206	struct btrfs_bio *bbio = rbio->bbio;
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1207	void *pointers[rbio->real_stripes];
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1208	int stripe_len = rbio->stripe_len;
				1209	int nr_data = rbio->nr_data;
				1210	int stripe;
				1211	int pagenr;
				1212	int p_stripe = -1;
				1213	int q_stripe = -1;
				1214	struct bio_list bio_list;
				1215	struct bio *bio;
				1216	int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
				1217	int ret;
				1218
				1219	bio_list_init(&bio_list);
				1220
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1221	if (rbio->real_stripes - rbio->nr_data == 1) {
				1222	p_stripe = rbio->real_stripes - 1;
				1223	} else if (rbio->real_stripes - rbio->nr_data == 2) {
				1224	p_stripe = rbio->real_stripes - 2;
				1225	q_stripe = rbio->real_stripes - 1;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1226	} else {
				1227	BUG();
				1228	}
				1229
				1230	/* at this point we either have a full stripe,
				1231	* or we've read the full stripe from the drive.
				1232	* recalculate the parity and write the new results.
				1233	*
				1234	* We're not allowed to add any new bios to the
				1235	* bio list here, anyone else that wants to
				1236	* change this stripe needs to do their own rmw.
				1237	*/
				1238	spin_lock_irq(&rbio->bio_list_lock);
				1239	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				1240	spin_unlock_irq(&rbio->bio_list_lock);
				1241
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1242	atomic_set(&rbio->error, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1243
				1244	/*
				1245	* now that we've set rmw_locked, run through the
				1246	* bio list one last time and map the page pointers
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1247	*
				1248	* We don't cache full rbios because we're assuming
				1249	* the higher layers are unlikely to use this area of
				1250	* the disk again soon. If they do use it again,
				1251	* hopefully they will send another full bio.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1252	*/
				1253	index_rbio_pages(rbio);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1254	if (!rbio_is_full(rbio))
				1255	cache_rbio_pages(rbio);
				1256	else
				1257	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1258
				1259	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
				1260	struct page *p;
				1261	/* first collect one page from each data stripe */
				1262	for (stripe = 0; stripe < nr_data; stripe++) {
				1263	p = page_in_rbio(rbio, stripe, pagenr, 0);
				1264	pointers[stripe] = kmap(p);
				1265	}
				1266
				1267	/* then add the parity stripe */
				1268	p = rbio_pstripe_page(rbio, pagenr);
				1269	SetPageUptodate(p);
				1270	pointers[stripe++] = kmap(p);
				1271
				1272	if (q_stripe != -1) {
				1273
				1274	/*
				1275	* raid6, add the qstripe and call the
				1276	* library function to fill in our p/q
				1277	*/
				1278	p = rbio_qstripe_page(rbio, pagenr);
				1279	SetPageUptodate(p);
				1280	pointers[stripe++] = kmap(p);
				1281
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1282	raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1283	pointers);
				1284	} else {
				1285	/* raid5 */
				1286	memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
				1287	run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
				1288	}
				1289
				1290
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1291	for (stripe = 0; stripe < rbio->real_stripes; stripe++)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1292	kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
				1293	}
				1294
				1295	/*
				1296	* time to start writing. Make bios for everything from the
				1297	* higher layers (the bio_list in our rbio) and our p/q. Ignore
				1298	* everything else.
				1299	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1300	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1301	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
				1302	struct page *page;
				1303	if (stripe < rbio->nr_data) {
				1304	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1305	if (!page)
				1306	continue;
				1307	} else {
				1308	page = rbio_stripe_page(rbio, stripe, pagenr);
				1309	}
				1310
				1311	ret = rbio_add_io_page(rbio, &bio_list,
				1312	page, stripe, pagenr, rbio->stripe_len);
				1313	if (ret)
				1314	goto cleanup;
				1315	}
				1316	}
				1317
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1318	if (likely(!bbio->num_tgtdevs))
				1319	goto write_data;
				1320
				1321	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
				1322	if (!bbio->tgtdev_map[stripe])
				1323	continue;
				1324
				1325	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
				1326	struct page *page;
				1327	if (stripe < rbio->nr_data) {
				1328	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1329	if (!page)
				1330	continue;
				1331	} else {
				1332	page = rbio_stripe_page(rbio, stripe, pagenr);
				1333	}
				1334
				1335	ret = rbio_add_io_page(rbio, &bio_list, page,
				1336	rbio->bbio->tgtdev_map[stripe],
				1337	pagenr, rbio->stripe_len);
				1338	if (ret)
				1339	goto cleanup;
				1340	}
				1341	}
				1342
				1343	write_data:
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1344	atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
				1345	BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1346
				1347	while (1) {
				1348	bio = bio_list_pop(&bio_list);
				1349	if (!bio)
				1350	break;
				1351
				1352	bio->bi_private = rbio;
				1353	bio->bi_end_io = raid_write_end_io;
				1354	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
				1355	submit_bio(WRITE, bio);
				1356	}
				1357	return;
				1358
				1359	cleanup:
				1360	rbio_orig_end_io(rbio, -EIO, 0);
				1361	}
				1362
				1363	/*
				1364	* helper to find the stripe number for a given bio. Used to figure out which
				1365	* stripe has failed. This expects the bio to correspond to a physical disk,
				1366	* so it looks up based on physical sector numbers.
				1367	*/
				1368	static int find_bio_stripe(struct btrfs_raid_bio *rbio,
				1369	struct bio *bio)
				1370	{
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1371	u64 physical = bio->bi_iter.bi_sector;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1372	u64 stripe_start;
				1373	int i;
				1374	struct btrfs_bio_stripe *stripe;
				1375
				1376	physical <<= 9;
				1377
				1378	for (i = 0; i < rbio->bbio->num_stripes; i++) {
				1379	stripe = &rbio->bbio->stripes[i];
				1380	stripe_start = stripe->physical;
				1381	if (physical >= stripe_start &&
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1382	physical < stripe_start + rbio->stripe_len &&
				1383	bio->bi_bdev == stripe->dev->bdev) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1384	return i;
				1385	}
				1386	}
				1387	return -1;
				1388	}
				1389
				1390	/*
				1391	* helper to find the stripe number for a given
				1392	* bio (before mapping). Used to figure out which stripe has
				1393	* failed. This looks up based on logical block numbers.
				1394	*/
				1395	static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
				1396	struct bio *bio)
				1397	{
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1398	u64 logical = bio->bi_iter.bi_sector;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1399	u64 stripe_start;
				1400	int i;
				1401
				1402	logical <<= 9;
				1403
				1404	for (i = 0; i < rbio->nr_data; i++) {
				1405	stripe_start = rbio->raid_map[i];
				1406	if (logical >= stripe_start &&
				1407	logical < stripe_start + rbio->stripe_len) {
				1408	return i;
				1409	}
				1410	}
				1411	return -1;
				1412	}
				1413
				1414	/*
				1415	* returns -EIO if we had too many failures
				1416	*/
				1417	static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
				1418	{
				1419	unsigned long flags;
				1420	int ret = 0;
				1421
				1422	spin_lock_irqsave(&rbio->bio_list_lock, flags);
				1423
				1424	/* we already know this stripe is bad, move on */
				1425	if (rbio->faila == failed \|\| rbio->failb == failed)
				1426	goto out;
				1427
				1428	if (rbio->faila == -1) {
				1429	/* first failure on this rbio */
				1430	rbio->faila = failed;
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1431	atomic_inc(&rbio->error);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1432	} else if (rbio->failb == -1) {
				1433	/* second failure on this rbio */
				1434	rbio->failb = failed;
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1435	atomic_inc(&rbio->error);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1436	} else {
				1437	ret = -EIO;
				1438	}
				1439	out:
				1440	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
				1441
				1442	return ret;
				1443	}
				1444
				1445	/*
				1446	* helper to fail a stripe based on a physical disk
				1447	* bio.
				1448	*/
				1449	static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
				1450	struct bio *bio)
				1451	{
				1452	int failed = find_bio_stripe(rbio, bio);
				1453
				1454	if (failed < 0)
				1455	return -EIO;
				1456
				1457	return fail_rbio_index(rbio, failed);
				1458	}
				1459
				1460	/*
				1461	* this sets each page in the bio uptodate. It should only be used on private
				1462	* rbio pages, nothing that comes in from the higher layers
				1463	*/
				1464	static void set_bio_pages_uptodate(struct bio *bio)
				1465	{
				1466	int i;
				1467	struct page *p;
				1468
				1469	for (i = 0; i < bio->bi_vcnt; i++) {
				1470	p = bio->bi_io_vec[i].bv_page;
				1471	SetPageUptodate(p);
				1472	}
				1473	}
				1474
				1475	/*
				1476	* end io for the read phase of the rmw cycle. All the bios here are physical
				1477	* stripe bios we've read from the disk so we can recalculate the parity of the
				1478	* stripe.
				1479	*
				1480	* This will usually kick off finish_rmw once all the bios are read in, but it
				1481	* may trigger parity reconstruction if we had any errors along the way
				1482	*/
				1483	static void raid_rmw_end_io(struct bio *bio, int err)
				1484	{
				1485	struct btrfs_raid_bio *rbio = bio->bi_private;
				1486
				1487	if (err)
				1488	fail_bio_stripe(rbio, bio);
				1489	else
				1490	set_bio_pages_uptodate(bio);
				1491
				1492	bio_put(bio);
				1493
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1494	if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1495	return;
				1496
				1497	err = 0;
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1498	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1499	goto cleanup;
				1500
				1501	/*
				1502	* this will normally call finish_rmw to start our write
				1503	* but if there are any failed stripes we'll reconstruct
				1504	* from parity first
				1505	*/
				1506	validate_rbio_for_rmw(rbio);
				1507	return;
				1508
				1509	cleanup:
				1510
				1511	rbio_orig_end_io(rbio, -EIO, 0);
				1512	}
				1513
				1514	static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
				1515	{
Liu Bo	9e0af23	2014-08-15 23:36:53 +0800	[diff] [blame]	1516	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
				1517	rmw_work, NULL, NULL);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1518
Qu Wenruo	d05a33a	2014-02-28 10:46:11 +0800	[diff] [blame]	1519	btrfs_queue_work(rbio->fs_info->rmw_workers,
				1520	&rbio->work);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1521	}
				1522
				1523	static void async_read_rebuild(struct btrfs_raid_bio *rbio)
				1524	{
Liu Bo	9e0af23	2014-08-15 23:36:53 +0800	[diff] [blame]	1525	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
				1526	read_rebuild_work, NULL, NULL);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1527
Qu Wenruo	d05a33a	2014-02-28 10:46:11 +0800	[diff] [blame]	1528	btrfs_queue_work(rbio->fs_info->rmw_workers,
				1529	&rbio->work);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1530	}
				1531
				1532	/*
				1533	* the stripe must be locked by the caller. It will
				1534	* unlock after all the writes are done
				1535	*/
				1536	static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
				1537	{
				1538	int bios_to_read = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1539	struct bio_list bio_list;
				1540	int ret;
David Sterba	ed6078f	2014-06-05 01:59:57 +0200	[diff] [blame]	1541	int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1542	int pagenr;
				1543	int stripe;
				1544	struct bio *bio;
				1545
				1546	bio_list_init(&bio_list);
				1547
				1548	ret = alloc_rbio_pages(rbio);
				1549	if (ret)
				1550	goto cleanup;
				1551
				1552	index_rbio_pages(rbio);
				1553
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1554	atomic_set(&rbio->error, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1555	/*
				1556	* build a list of bios to read all the missing parts of this
				1557	* stripe
				1558	*/
				1559	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
				1560	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
				1561	struct page *page;
				1562	/*
				1563	* we want to find all the pages missing from
				1564	* the rbio and read them from the disk. If
				1565	* page_in_rbio finds a page in the bio list
				1566	* we don't need to read it off the stripe.
				1567	*/
				1568	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1569	if (page)
				1570	continue;
				1571
				1572	page = rbio_stripe_page(rbio, stripe, pagenr);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1573	/*
				1574	* the bio cache may have handed us an uptodate
				1575	* page. If so, be happy and use it
				1576	*/
				1577	if (PageUptodate(page))
				1578	continue;
				1579
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1580	ret = rbio_add_io_page(rbio, &bio_list, page,
				1581	stripe, pagenr, rbio->stripe_len);
				1582	if (ret)
				1583	goto cleanup;
				1584	}
				1585	}
				1586
				1587	bios_to_read = bio_list_size(&bio_list);
				1588	if (!bios_to_read) {
				1589	/*
				1590	* this can happen if others have merged with
				1591	* us, it means there is nothing left to read.
				1592	* But if there are missing devices it may not be
				1593	* safe to do the full stripe write yet.
				1594	*/
				1595	goto finish;
				1596	}
				1597
				1598	/*
				1599	* the bbio may be freed once we submit the last bio. Make sure
				1600	* not to touch it after that
				1601	*/
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1602	atomic_set(&rbio->stripes_pending, bios_to_read);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1603	while (1) {
				1604	bio = bio_list_pop(&bio_list);
				1605	if (!bio)
				1606	break;
				1607
				1608	bio->bi_private = rbio;
				1609	bio->bi_end_io = raid_rmw_end_io;
				1610
				1611	btrfs_bio_wq_end_io(rbio->fs_info, bio,
				1612	BTRFS_WQ_ENDIO_RAID56);
				1613
				1614	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
				1615	submit_bio(READ, bio);
				1616	}
				1617	/* the actual write will happen once the reads are done */
				1618	return 0;
				1619
				1620	cleanup:
				1621	rbio_orig_end_io(rbio, -EIO, 0);
				1622	return -EIO;
				1623
				1624	finish:
				1625	validate_rbio_for_rmw(rbio);
				1626	return 0;
				1627	}
				1628
				1629	/*
				1630	* if the upper layers pass in a full stripe, we thank them by only allocating
				1631	* enough pages to hold the parity, and sending it all down quickly.
				1632	*/
				1633	static int full_stripe_write(struct btrfs_raid_bio *rbio)
				1634	{
				1635	int ret;
				1636
				1637	ret = alloc_rbio_parity_pages(rbio);
Miao Xie	3cd846d	2013-07-22 16:36:57 +0800	[diff] [blame]	1638	if (ret) {
				1639	__free_raid_bio(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1640	return ret;
Miao Xie	3cd846d	2013-07-22 16:36:57 +0800	[diff] [blame]	1641	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1642
				1643	ret = lock_stripe_add(rbio);
				1644	if (ret == 0)
				1645	finish_rmw(rbio);
				1646	return 0;
				1647	}
				1648
				1649	/*
				1650	* partial stripe writes get handed over to async helpers.
				1651	* We're really hoping to merge a few more writes into this
				1652	* rbio before calculating new parity
				1653	*/
				1654	static int partial_stripe_write(struct btrfs_raid_bio *rbio)
				1655	{
				1656	int ret;
				1657
				1658	ret = lock_stripe_add(rbio);
				1659	if (ret == 0)
				1660	async_rmw_stripe(rbio);
				1661	return 0;
				1662	}
				1663
				1664	/*
				1665	* sometimes while we were reading from the drive to
				1666	* recalculate parity, enough new bios come into create
				1667	* a full stripe. So we do a check here to see if we can
				1668	* go directly to finish_rmw
				1669	*/
				1670	static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
				1671	{
				1672	/* head off into rmw land if we don't have a full stripe */
				1673	if (!rbio_is_full(rbio))
				1674	return partial_stripe_write(rbio);
				1675	return full_stripe_write(rbio);
				1676	}
				1677
				1678	/*
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1679	* We use plugging call backs to collect full stripes.
				1680	* Any time we get a partial stripe write while plugged
				1681	* we collect it into a list. When the unplug comes down,
				1682	* we sort the list by logical block number and merge
				1683	* everything we can into the same rbios
				1684	*/
				1685	struct btrfs_plug_cb {
				1686	struct blk_plug_cb cb;
				1687	struct btrfs_fs_info *info;
				1688	struct list_head rbio_list;
				1689	struct btrfs_work work;
				1690	};
				1691
				1692	/*
				1693	* rbios on the plug list are sorted for easier merging.
				1694	*/
				1695	static int plug_cmp(void priv, struct list_head a, struct list_head *b)
				1696	{
				1697	struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
				1698	plug_list);
				1699	struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
				1700	plug_list);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1701	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
				1702	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1703
				1704	if (a_sector < b_sector)
				1705	return -1;
				1706	if (a_sector > b_sector)
				1707	return 1;
				1708	return 0;
				1709	}
				1710
				1711	static void run_plug(struct btrfs_plug_cb *plug)
				1712	{
				1713	struct btrfs_raid_bio *cur;
				1714	struct btrfs_raid_bio *last = NULL;
				1715
				1716	/*
				1717	* sort our plug list then try to merge
				1718	* everything we can in hopes of creating full
				1719	* stripes.
				1720	*/
				1721	list_sort(NULL, &plug->rbio_list, plug_cmp);
				1722	while (!list_empty(&plug->rbio_list)) {
				1723	cur = list_entry(plug->rbio_list.next,
				1724	struct btrfs_raid_bio, plug_list);
				1725	list_del_init(&cur->plug_list);
				1726
				1727	if (rbio_is_full(cur)) {
				1728	/* we have a full stripe, send it down */
				1729	full_stripe_write(cur);
				1730	continue;
				1731	}
				1732	if (last) {
				1733	if (rbio_can_merge(last, cur)) {
				1734	merge_rbio(last, cur);
				1735	__free_raid_bio(cur);
				1736	continue;
				1737
				1738	}
				1739	__raid56_parity_write(last);
				1740	}
				1741	last = cur;
				1742	}
				1743	if (last) {
				1744	__raid56_parity_write(last);
				1745	}
				1746	kfree(plug);
				1747	}
				1748
				1749	/*
				1750	* if the unplug comes from schedule, we have to push the
				1751	* work off to a helper thread
				1752	*/
				1753	static void unplug_work(struct btrfs_work *work)
				1754	{
				1755	struct btrfs_plug_cb *plug;
				1756	plug = container_of(work, struct btrfs_plug_cb, work);
				1757	run_plug(plug);
				1758	}
				1759
				1760	static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
				1761	{
				1762	struct btrfs_plug_cb *plug;
				1763	plug = container_of(cb, struct btrfs_plug_cb, cb);
				1764
				1765	if (from_schedule) {
Liu Bo	9e0af23	2014-08-15 23:36:53 +0800	[diff] [blame]	1766	btrfs_init_work(&plug->work, btrfs_rmw_helper,
				1767	unplug_work, NULL, NULL);
Qu Wenruo	d05a33a	2014-02-28 10:46:11 +0800	[diff] [blame]	1768	btrfs_queue_work(plug->info->rmw_workers,
				1769	&plug->work);
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1770	return;
				1771	}
				1772	run_plug(plug);
				1773	}
				1774
				1775	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1776	* our main entry point for writes from the rest of the FS.
				1777	*/
				1778	int raid56_parity_write(struct btrfs_root root, struct bio bio,
				1779	struct btrfs_bio bbio, u64 raid_map,
				1780	u64 stripe_len)
				1781	{
				1782	struct btrfs_raid_bio *rbio;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1783	struct btrfs_plug_cb *plug = NULL;
				1784	struct blk_plug_cb *cb;
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1785	int ret;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1786
				1787	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	1788	if (IS_ERR(rbio)) {
				1789	__free_bbio_and_raid_map(bbio, raid_map, 1);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1790	return PTR_ERR(rbio);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	1791	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1792	bio_list_add(&rbio->bio_list, bio);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1793	rbio->bio_list_bytes = bio->bi_iter.bi_size;
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1794	rbio->operation = BTRFS_RBIO_WRITE;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1795
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1796	btrfs_bio_counter_inc_noblocked(root->fs_info);
				1797	rbio->generic_bio_cnt = 1;
				1798
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1799	/*
				1800	* don't plug on full rbios, just get them out the door
				1801	* as quickly as we can
				1802	*/
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1803	if (rbio_is_full(rbio)) {
				1804	ret = full_stripe_write(rbio);
				1805	if (ret)
				1806	btrfs_bio_counter_dec(root->fs_info);
				1807	return ret;
				1808	}
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1809
				1810	cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
				1811	sizeof(*plug));
				1812	if (cb) {
				1813	plug = container_of(cb, struct btrfs_plug_cb, cb);
				1814	if (!plug->info) {
				1815	plug->info = root->fs_info;
				1816	INIT_LIST_HEAD(&plug->rbio_list);
				1817	}
				1818	list_add_tail(&rbio->plug_list, &plug->rbio_list);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1819	ret = 0;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1820	} else {
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1821	ret = __raid56_parity_write(rbio);
				1822	if (ret)
				1823	btrfs_bio_counter_dec(root->fs_info);
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1824	}
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1825	return ret;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1826	}
				1827
				1828	/*
				1829	* all parity reconstruction happens here. We've read in everything
				1830	* we can find from the drives and this does the heavy lifting of
				1831	* sorting the good from the bad.
				1832	*/
				1833	static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
				1834	{
				1835	int pagenr, stripe;
				1836	void **pointers;
				1837	int faila = -1, failb = -1;
David Sterba	ed6078f	2014-06-05 01:59:57 +0200	[diff] [blame]	1838	int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1839	struct page *page;
				1840	int err;
				1841	int i;
				1842
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1843	pointers = kzalloc(rbio->real_stripes * sizeof(void *),
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1844	GFP_NOFS);
				1845	if (!pointers) {
				1846	err = -ENOMEM;
				1847	goto cleanup_io;
				1848	}
				1849
				1850	faila = rbio->faila;
				1851	failb = rbio->failb;
				1852
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1853	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1854	spin_lock_irq(&rbio->bio_list_lock);
				1855	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				1856	spin_unlock_irq(&rbio->bio_list_lock);
				1857	}
				1858
				1859	index_rbio_pages(rbio);
				1860
				1861	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	1862	/*
				1863	* Now we just use bitmap to mark the horizontal stripes in
				1864	* which we have data when doing parity scrub.
				1865	*/
				1866	if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
				1867	!test_bit(pagenr, rbio->dbitmap))
				1868	continue;
				1869
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1870	/* setup our array of pointers with pages
				1871	* from each stripe
				1872	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1873	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1874	/*
				1875	* if we're rebuilding a read, we have to use
				1876	* pages from the bio list
				1877	*/
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1878	if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1879	(stripe == faila \|\| stripe == failb)) {
				1880	page = page_in_rbio(rbio, stripe, pagenr, 0);
				1881	} else {
				1882	page = rbio_stripe_page(rbio, stripe, pagenr);
				1883	}
				1884	pointers[stripe] = kmap(page);
				1885	}
				1886
				1887	/* all raid6 handling here */
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1888	if (rbio->raid_map[rbio->real_stripes - 1] ==
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1889	RAID6_Q_STRIPE) {
				1890
				1891	/*
				1892	* single failure, rebuild from parity raid5
				1893	* style
				1894	*/
				1895	if (failb < 0) {
				1896	if (faila == rbio->nr_data) {
				1897	/*
				1898	* Just the P stripe has failed, without
				1899	* a bad data or Q stripe.
				1900	* TODO, we should redo the xor here.
				1901	*/
				1902	err = -EIO;
				1903	goto cleanup;
				1904	}
				1905	/*
				1906	* a single failure in raid6 is rebuilt
				1907	* in the pstripe code below
				1908	*/
				1909	goto pstripe;
				1910	}
				1911
				1912	/* make sure our ps and qs are in order */
				1913	if (faila > failb) {
				1914	int tmp = failb;
				1915	failb = faila;
				1916	faila = tmp;
				1917	}
				1918
				1919	/* if the q stripe is failed, do a pstripe reconstruction
				1920	* from the xors.
				1921	* If both the q stripe and the P stripe are failed, we're
				1922	* here due to a crc mismatch and we can't give them the
				1923	* data they want
				1924	*/
				1925	if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
				1926	if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
				1927	err = -EIO;
				1928	goto cleanup;
				1929	}
				1930	/*
				1931	* otherwise we have one bad data stripe and
				1932	* a good P stripe. raid5!
				1933	*/
				1934	goto pstripe;
				1935	}
				1936
				1937	if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1938	raid6_datap_recov(rbio->real_stripes,
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1939	PAGE_SIZE, faila, pointers);
				1940	} else {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1941	raid6_2data_recov(rbio->real_stripes,
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1942	PAGE_SIZE, faila, failb,
				1943	pointers);
				1944	}
				1945	} else {
				1946	void *p;
				1947
				1948	/* rebuild from P stripe here (raid5 or raid6) */
				1949	BUG_ON(failb != -1);
				1950	pstripe:
				1951	/* Copy parity block into failed block to start with */
				1952	memcpy(pointers[faila],
				1953	pointers[rbio->nr_data],
				1954	PAGE_CACHE_SIZE);
				1955
				1956	/* rearrange the pointer array */
				1957	p = pointers[faila];
				1958	for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
				1959	pointers[stripe] = pointers[stripe + 1];
				1960	pointers[rbio->nr_data - 1] = p;
				1961
				1962	/* xor in the rest */
				1963	run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
				1964	}
				1965	/* if we're doing this rebuild as part of an rmw, go through
				1966	* and set all of our private rbio pages in the
				1967	* failed stripes as uptodate. This way finish_rmw will
				1968	* know they can be trusted. If this was a read reconstruction,
				1969	* other endio functions will fiddle the uptodate bits
				1970	*/
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1971	if (rbio->operation == BTRFS_RBIO_WRITE) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1972	for (i = 0; i < nr_pages; i++) {
				1973	if (faila != -1) {
				1974	page = rbio_stripe_page(rbio, faila, i);
				1975	SetPageUptodate(page);
				1976	}
				1977	if (failb != -1) {
				1978	page = rbio_stripe_page(rbio, failb, i);
				1979	SetPageUptodate(page);
				1980	}
				1981	}
				1982	}
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1983	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1984	/*
				1985	* if we're rebuilding a read, we have to use
				1986	* pages from the bio list
				1987	*/
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1988	if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1989	(stripe == faila \|\| stripe == failb)) {
				1990	page = page_in_rbio(rbio, stripe, pagenr, 0);
				1991	} else {
				1992	page = rbio_stripe_page(rbio, stripe, pagenr);
				1993	}
				1994	kunmap(page);
				1995	}
				1996	}
				1997
				1998	err = 0;
				1999	cleanup:
				2000	kfree(pointers);
				2001
				2002	cleanup_io:
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	2003	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	2004	if (err == 0 &&
				2005	!test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	2006	cache_rbio_pages(rbio);
				2007	else
				2008	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				2009
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2010	rbio_orig_end_io(rbio, err, err == 0);
				2011	} else if (err == 0) {
				2012	rbio->faila = -1;
				2013	rbio->failb = -1;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2014
				2015	if (rbio->operation == BTRFS_RBIO_WRITE)
				2016	finish_rmw(rbio);
				2017	else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
				2018	finish_parity_scrub(rbio, 0);
				2019	else
				2020	BUG();
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2021	} else {
				2022	rbio_orig_end_io(rbio, err, 0);
				2023	}
				2024	}
				2025
				2026	/*
				2027	* This is called only for stripes we've read from disk to
				2028	* reconstruct the parity.
				2029	*/
				2030	static void raid_recover_end_io(struct bio *bio, int err)
				2031	{
				2032	struct btrfs_raid_bio *rbio = bio->bi_private;
				2033
				2034	/*
				2035	* we only read stripe pages off the disk, set them
				2036	* up to date if there were no errors
				2037	*/
				2038	if (err)
				2039	fail_bio_stripe(rbio, bio);
				2040	else
				2041	set_bio_pages_uptodate(bio);
				2042	bio_put(bio);
				2043
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2044	if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2045	return;
				2046
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2047	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2048	rbio_orig_end_io(rbio, -EIO, 0);
				2049	else
				2050	__raid_recover_end_io(rbio);
				2051	}
				2052
				2053	/*
				2054	* reads everything we need off the disk to reconstruct
				2055	* the parity. endio handlers trigger final reconstruction
				2056	* when the IO is done.
				2057	*
				2058	* This is used both for reads from the higher layers and for
				2059	* parity construction required to finish a rmw cycle.
				2060	*/
				2061	static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
				2062	{
				2063	int bios_to_read = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2064	struct bio_list bio_list;
				2065	int ret;
David Sterba	ed6078f	2014-06-05 01:59:57 +0200	[diff] [blame]	2066	int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2067	int pagenr;
				2068	int stripe;
				2069	struct bio *bio;
				2070
				2071	bio_list_init(&bio_list);
				2072
				2073	ret = alloc_rbio_pages(rbio);
				2074	if (ret)
				2075	goto cleanup;
				2076
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2077	atomic_set(&rbio->error, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2078
				2079	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	2080	* read everything that hasn't failed. Thanks to the
				2081	* stripe cache, it is possible that some or all of these
				2082	* pages are going to be uptodate.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2083	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2084	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
Liu Bo	5588383	2014-06-24 15:39:16 +0800	[diff] [blame]	2085	if (rbio->faila == stripe \|\| rbio->failb == stripe) {
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2086	atomic_inc(&rbio->error);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2087	continue;
Liu Bo	5588383	2014-06-24 15:39:16 +0800	[diff] [blame]	2088	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2089
				2090	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
				2091	struct page *p;
				2092
				2093	/*
				2094	* the rmw code may have already read this
				2095	* page in
				2096	*/
				2097	p = rbio_stripe_page(rbio, stripe, pagenr);
				2098	if (PageUptodate(p))
				2099	continue;
				2100
				2101	ret = rbio_add_io_page(rbio, &bio_list,
				2102	rbio_stripe_page(rbio, stripe, pagenr),
				2103	stripe, pagenr, rbio->stripe_len);
				2104	if (ret < 0)
				2105	goto cleanup;
				2106	}
				2107	}
				2108
				2109	bios_to_read = bio_list_size(&bio_list);
				2110	if (!bios_to_read) {
				2111	/*
				2112	* we might have no bios to read just because the pages
				2113	* were up to date, or we might have no bios to read because
				2114	* the devices were gone.
				2115	*/
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2116	if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2117	__raid_recover_end_io(rbio);
				2118	goto out;
				2119	} else {
				2120	goto cleanup;
				2121	}
				2122	}
				2123
				2124	/*
				2125	* the bbio may be freed once we submit the last bio. Make sure
				2126	* not to touch it after that
				2127	*/
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2128	atomic_set(&rbio->stripes_pending, bios_to_read);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2129	while (1) {
				2130	bio = bio_list_pop(&bio_list);
				2131	if (!bio)
				2132	break;
				2133
				2134	bio->bi_private = rbio;
				2135	bio->bi_end_io = raid_recover_end_io;
				2136
				2137	btrfs_bio_wq_end_io(rbio->fs_info, bio,
				2138	BTRFS_WQ_ENDIO_RAID56);
				2139
				2140	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
				2141	submit_bio(READ, bio);
				2142	}
				2143	out:
				2144	return 0;
				2145
				2146	cleanup:
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	2147	if (rbio->operation == BTRFS_RBIO_READ_REBUILD)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2148	rbio_orig_end_io(rbio, -EIO, 0);
				2149	return -EIO;
				2150	}
				2151
				2152	/*
				2153	* the main entry point for reads from the higher layers. This
				2154	* is really only called when the normal read path had a failure,
				2155	* so we assume the bio they send down corresponds to a failed part
				2156	* of the drive.
				2157	*/
				2158	int raid56_parity_recover(struct btrfs_root root, struct bio bio,
				2159	struct btrfs_bio bbio, u64 raid_map,
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	2160	u64 stripe_len, int mirror_num, int generic_io)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2161	{
				2162	struct btrfs_raid_bio *rbio;
				2163	int ret;
				2164
				2165	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	2166	if (IS_ERR(rbio)) {
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	2167	__free_bbio_and_raid_map(bbio, raid_map, generic_io);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2168	return PTR_ERR(rbio);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	2169	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2170
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	2171	rbio->operation = BTRFS_RBIO_READ_REBUILD;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2172	bio_list_add(&rbio->bio_list, bio);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	2173	rbio->bio_list_bytes = bio->bi_iter.bi_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2174
				2175	rbio->faila = find_logical_bio_stripe(rbio, bio);
				2176	if (rbio->faila == -1) {
				2177	BUG();
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	2178	__free_bbio_and_raid_map(bbio, raid_map, generic_io);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2179	kfree(rbio);
				2180	return -EIO;
				2181	}
				2182
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	2183	if (generic_io) {
				2184	btrfs_bio_counter_inc_noblocked(root->fs_info);
				2185	rbio->generic_bio_cnt = 1;
				2186	} else {
				2187	set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags);
				2188	}
				2189
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2190	/*
				2191	* reconstruct from the q stripe if they are
				2192	* asking for mirror 3
				2193	*/
				2194	if (mirror_num == 3)
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2195	rbio->failb = rbio->real_stripes - 2;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2196
				2197	ret = lock_stripe_add(rbio);
				2198
				2199	/*
				2200	* __raid56_parity_recover will end the bio with
				2201	* any errors it hits. We don't want to return
				2202	* its error value up the stack because our caller
				2203	* will end up calling bio_endio with any nonzero
				2204	* return
				2205	*/
				2206	if (ret == 0)
				2207	__raid56_parity_recover(rbio);
				2208	/*
				2209	* our rbio has been added to the list of
				2210	* rbios that will be handled after the
				2211	* currently lock owner is done
				2212	*/
				2213	return 0;
				2214
				2215	}
				2216
				2217	static void rmw_work(struct btrfs_work *work)
				2218	{
				2219	struct btrfs_raid_bio *rbio;
				2220
				2221	rbio = container_of(work, struct btrfs_raid_bio, work);
				2222	raid56_rmw_stripe(rbio);
				2223	}
				2224
				2225	static void read_rebuild_work(struct btrfs_work *work)
				2226	{
				2227	struct btrfs_raid_bio *rbio;
				2228
				2229	rbio = container_of(work, struct btrfs_raid_bio, work);
				2230	__raid56_parity_recover(rbio);
				2231	}
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2232
				2233	/*
				2234	* The following code is used to scrub/replace the parity stripe
				2235	*
				2236	* Note: We need make sure all the pages that add into the scrub/replace
				2237	* raid bio are correct and not be changed during the scrub/replace. That
				2238	* is those pages just hold metadata or file data with checksum.
				2239	*/
				2240
				2241	struct btrfs_raid_bio *
				2242	raid56_parity_alloc_scrub_rbio(struct btrfs_root root, struct bio bio,
				2243	struct btrfs_bio bbio, u64 raid_map,
				2244	u64 stripe_len, struct btrfs_device *scrub_dev,
				2245	unsigned long *dbitmap, int stripe_nsectors)
				2246	{
				2247	struct btrfs_raid_bio *rbio;
				2248	int i;
				2249
				2250	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
				2251	if (IS_ERR(rbio))
				2252	return NULL;
				2253	bio_list_add(&rbio->bio_list, bio);
				2254	/*
				2255	* This is a special bio which is used to hold the completion handler
				2256	* and make the scrub rbio is similar to the other types
				2257	*/
				2258	ASSERT(!bio->bi_iter.bi_size);
				2259	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
				2260
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2261	for (i = 0; i < rbio->real_stripes; i++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2262	if (bbio->stripes[i].dev == scrub_dev) {
				2263	rbio->scrubp = i;
				2264	break;
				2265	}
				2266	}
				2267
				2268	/* Now we just support the sectorsize equals to page size */
				2269	ASSERT(root->sectorsize == PAGE_SIZE);
				2270	ASSERT(rbio->stripe_npages == stripe_nsectors);
				2271	bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
				2272
				2273	return rbio;
				2274	}
				2275
				2276	void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
				2277	struct page *page, u64 logical)
				2278	{
				2279	int stripe_offset;
				2280	int index;
				2281
				2282	ASSERT(logical >= rbio->raid_map[0]);
				2283	ASSERT(logical + PAGE_SIZE <= rbio->raid_map[0] +
				2284	rbio->stripe_len * rbio->nr_data);
				2285	stripe_offset = (int)(logical - rbio->raid_map[0]);
				2286	index = stripe_offset >> PAGE_CACHE_SHIFT;
				2287	rbio->bio_pages[index] = page;
				2288	}
				2289
				2290	/*
				2291	* We just scrub the parity that we have correct data on the same horizontal,
				2292	* so we needn't allocate all pages for all the stripes.
				2293	*/
				2294	static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
				2295	{
				2296	int i;
				2297	int bit;
				2298	int index;
				2299	struct page *page;
				2300
				2301	for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2302	for (i = 0; i < rbio->real_stripes; i++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2303	index = i * rbio->stripe_npages + bit;
				2304	if (rbio->stripe_pages[index])
				2305	continue;
				2306
				2307	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				2308	if (!page)
				2309	return -ENOMEM;
				2310	rbio->stripe_pages[index] = page;
				2311	ClearPageUptodate(page);
				2312	}
				2313	}
				2314	return 0;
				2315	}
				2316
				2317	/*
				2318	* end io function used by finish_rmw. When we finally
				2319	* get here, we've written a full stripe
				2320	*/
				2321	static void raid_write_parity_end_io(struct bio *bio, int err)
				2322	{
				2323	struct btrfs_raid_bio *rbio = bio->bi_private;
				2324
				2325	if (err)
				2326	fail_bio_stripe(rbio, bio);
				2327
				2328	bio_put(bio);
				2329
				2330	if (!atomic_dec_and_test(&rbio->stripes_pending))
				2331	return;
				2332
				2333	err = 0;
				2334
				2335	if (atomic_read(&rbio->error))
				2336	err = -EIO;
				2337
				2338	rbio_orig_end_io(rbio, err, 0);
				2339	}
				2340
				2341	static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
				2342	int need_check)
				2343	{
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2344	struct btrfs_bio *bbio = rbio->bbio;
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2345	void *pointers[rbio->real_stripes];
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2346	DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2347	int nr_data = rbio->nr_data;
				2348	int stripe;
				2349	int pagenr;
				2350	int p_stripe = -1;
				2351	int q_stripe = -1;
				2352	struct page *p_page = NULL;
				2353	struct page *q_page = NULL;
				2354	struct bio_list bio_list;
				2355	struct bio *bio;
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2356	int is_replace = 0;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2357	int ret;
				2358
				2359	bio_list_init(&bio_list);
				2360
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2361	if (rbio->real_stripes - rbio->nr_data == 1) {
				2362	p_stripe = rbio->real_stripes - 1;
				2363	} else if (rbio->real_stripes - rbio->nr_data == 2) {
				2364	p_stripe = rbio->real_stripes - 2;
				2365	q_stripe = rbio->real_stripes - 1;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2366	} else {
				2367	BUG();
				2368	}
				2369
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2370	if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
				2371	is_replace = 1;
				2372	bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
				2373	}
				2374
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2375	/*
				2376	* Because the higher layers(scrubber) are unlikely to
				2377	* use this area of the disk again soon, so don't cache
				2378	* it.
				2379	*/
				2380	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				2381
				2382	if (!need_check)
				2383	goto writeback;
				2384
				2385	p_page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				2386	if (!p_page)
				2387	goto cleanup;
				2388	SetPageUptodate(p_page);
				2389
				2390	if (q_stripe != -1) {
				2391	q_page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				2392	if (!q_page) {
				2393	__free_page(p_page);
				2394	goto cleanup;
				2395	}
				2396	SetPageUptodate(q_page);
				2397	}
				2398
				2399	atomic_set(&rbio->error, 0);
				2400
				2401	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
				2402	struct page *p;
				2403	void *parity;
				2404	/* first collect one page from each data stripe */
				2405	for (stripe = 0; stripe < nr_data; stripe++) {
				2406	p = page_in_rbio(rbio, stripe, pagenr, 0);
				2407	pointers[stripe] = kmap(p);
				2408	}
				2409
				2410	/* then add the parity stripe */
				2411	pointers[stripe++] = kmap(p_page);
				2412
				2413	if (q_stripe != -1) {
				2414
				2415	/*
				2416	* raid6, add the qstripe and call the
				2417	* library function to fill in our p/q
				2418	*/
				2419	pointers[stripe++] = kmap(q_page);
				2420
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2421	raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2422	pointers);
				2423	} else {
				2424	/* raid5 */
				2425	memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
				2426	run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
				2427	}
				2428
				2429	/* Check scrubbing pairty and repair it */
				2430	p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
				2431	parity = kmap(p);
				2432	if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE))
				2433	memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE);
				2434	else
				2435	/* Parity is right, needn't writeback */
				2436	bitmap_clear(rbio->dbitmap, pagenr, 1);
				2437	kunmap(p);
				2438
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2439	for (stripe = 0; stripe < rbio->real_stripes; stripe++)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2440	kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
				2441	}
				2442
				2443	__free_page(p_page);
				2444	if (q_page)
				2445	__free_page(q_page);
				2446
				2447	writeback:
				2448	/*
				2449	* time to start writing. Make bios for everything from the
				2450	* higher layers (the bio_list in our rbio) and our p/q. Ignore
				2451	* everything else.
				2452	*/
				2453	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
				2454	struct page *page;
				2455
				2456	page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
				2457	ret = rbio_add_io_page(rbio, &bio_list,
				2458	page, rbio->scrubp, pagenr, rbio->stripe_len);
				2459	if (ret)
				2460	goto cleanup;
				2461	}
				2462
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2463	if (!is_replace)
				2464	goto submit_write;
				2465
				2466	for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
				2467	struct page *page;
				2468
				2469	page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
				2470	ret = rbio_add_io_page(rbio, &bio_list, page,
				2471	bbio->tgtdev_map[rbio->scrubp],
				2472	pagenr, rbio->stripe_len);
				2473	if (ret)
				2474	goto cleanup;
				2475	}
				2476
				2477	submit_write:
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2478	nr_data = bio_list_size(&bio_list);
				2479	if (!nr_data) {
				2480	/* Every parity is right */
				2481	rbio_orig_end_io(rbio, 0, 0);
				2482	return;
				2483	}
				2484
				2485	atomic_set(&rbio->stripes_pending, nr_data);
				2486
				2487	while (1) {
				2488	bio = bio_list_pop(&bio_list);
				2489	if (!bio)
				2490	break;
				2491
				2492	bio->bi_private = rbio;
				2493	bio->bi_end_io = raid_write_parity_end_io;
				2494	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
				2495	submit_bio(WRITE, bio);
				2496	}
				2497	return;
				2498
				2499	cleanup:
				2500	rbio_orig_end_io(rbio, -EIO, 0);
				2501	}
				2502
				2503	static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
				2504	{
				2505	if (stripe >= 0 && stripe < rbio->nr_data)
				2506	return 1;
				2507	return 0;
				2508	}
				2509
				2510	/*
				2511	* While we're doing the parity check and repair, we could have errors
				2512	* in reading pages off the disk. This checks for errors and if we're
				2513	* not able to read the page it'll trigger parity reconstruction. The
				2514	* parity scrub will be finished after we've reconstructed the failed
				2515	* stripes
				2516	*/
				2517	static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
				2518	{
				2519	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
				2520	goto cleanup;
				2521
				2522	if (rbio->faila >= 0 \|\| rbio->failb >= 0) {
				2523	int dfail = 0, failp = -1;
				2524
				2525	if (is_data_stripe(rbio, rbio->faila))
				2526	dfail++;
				2527	else if (is_parity_stripe(rbio->faila))
				2528	failp = rbio->faila;
				2529
				2530	if (is_data_stripe(rbio, rbio->failb))
				2531	dfail++;
				2532	else if (is_parity_stripe(rbio->failb))
				2533	failp = rbio->failb;
				2534
				2535	/*
				2536	* Because we can not use a scrubbing parity to repair
				2537	* the data, so the capability of the repair is declined.
				2538	* (In the case of RAID5, we can not repair anything)
				2539	*/
				2540	if (dfail > rbio->bbio->max_errors - 1)
				2541	goto cleanup;
				2542
				2543	/*
				2544	* If all data is good, only parity is correctly, just
				2545	* repair the parity.
				2546	*/
				2547	if (dfail == 0) {
				2548	finish_parity_scrub(rbio, 0);
				2549	return;
				2550	}
				2551
				2552	/*
				2553	* Here means we got one corrupted data stripe and one
				2554	* corrupted parity on RAID6, if the corrupted parity
				2555	* is scrubbing parity, luckly, use the other one to repair
				2556	* the data, or we can not repair the data stripe.
				2557	*/
				2558	if (failp != rbio->scrubp)
				2559	goto cleanup;
				2560
				2561	__raid_recover_end_io(rbio);
				2562	} else {
				2563	finish_parity_scrub(rbio, 1);
				2564	}
				2565	return;
				2566
				2567	cleanup:
				2568	rbio_orig_end_io(rbio, -EIO, 0);
				2569	}
				2570
				2571	/*
				2572	* end io for the read phase of the rmw cycle. All the bios here are physical
				2573	* stripe bios we've read from the disk so we can recalculate the parity of the
				2574	* stripe.
				2575	*
				2576	* This will usually kick off finish_rmw once all the bios are read in, but it
				2577	* may trigger parity reconstruction if we had any errors along the way
				2578	*/
				2579	static void raid56_parity_scrub_end_io(struct bio *bio, int err)
				2580	{
				2581	struct btrfs_raid_bio *rbio = bio->bi_private;
				2582
				2583	if (err)
				2584	fail_bio_stripe(rbio, bio);
				2585	else
				2586	set_bio_pages_uptodate(bio);
				2587
				2588	bio_put(bio);
				2589
				2590	if (!atomic_dec_and_test(&rbio->stripes_pending))
				2591	return;
				2592
				2593	/*
				2594	* this will normally call finish_rmw to start our write
				2595	* but if there are any failed stripes we'll reconstruct
				2596	* from parity first
				2597	*/
				2598	validate_rbio_for_parity_scrub(rbio);
				2599	}
				2600
				2601	static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
				2602	{
				2603	int bios_to_read = 0;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2604	struct bio_list bio_list;
				2605	int ret;
				2606	int pagenr;
				2607	int stripe;
				2608	struct bio *bio;
				2609
				2610	ret = alloc_rbio_essential_pages(rbio);
				2611	if (ret)
				2612	goto cleanup;
				2613
				2614	bio_list_init(&bio_list);
				2615
				2616	atomic_set(&rbio->error, 0);
				2617	/*
				2618	* build a list of bios to read all the missing parts of this
				2619	* stripe
				2620	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2621	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2622	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
				2623	struct page *page;
				2624	/*
				2625	* we want to find all the pages missing from
				2626	* the rbio and read them from the disk. If
				2627	* page_in_rbio finds a page in the bio list
				2628	* we don't need to read it off the stripe.
				2629	*/
				2630	page = page_in_rbio(rbio, stripe, pagenr, 1);
				2631	if (page)
				2632	continue;
				2633
				2634	page = rbio_stripe_page(rbio, stripe, pagenr);
				2635	/*
				2636	* the bio cache may have handed us an uptodate
				2637	* page. If so, be happy and use it
				2638	*/
				2639	if (PageUptodate(page))
				2640	continue;
				2641
				2642	ret = rbio_add_io_page(rbio, &bio_list, page,
				2643	stripe, pagenr, rbio->stripe_len);
				2644	if (ret)
				2645	goto cleanup;
				2646	}
				2647	}
				2648
				2649	bios_to_read = bio_list_size(&bio_list);
				2650	if (!bios_to_read) {
				2651	/*
				2652	* this can happen if others have merged with
				2653	* us, it means there is nothing left to read.
				2654	* But if there are missing devices it may not be
				2655	* safe to do the full stripe write yet.
				2656	*/
				2657	goto finish;
				2658	}
				2659
				2660	/*
				2661	* the bbio may be freed once we submit the last bio. Make sure
				2662	* not to touch it after that
				2663	*/
				2664	atomic_set(&rbio->stripes_pending, bios_to_read);
				2665	while (1) {
				2666	bio = bio_list_pop(&bio_list);
				2667	if (!bio)
				2668	break;
				2669
				2670	bio->bi_private = rbio;
				2671	bio->bi_end_io = raid56_parity_scrub_end_io;
				2672
				2673	btrfs_bio_wq_end_io(rbio->fs_info, bio,
				2674	BTRFS_WQ_ENDIO_RAID56);
				2675
				2676	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
				2677	submit_bio(READ, bio);
				2678	}
				2679	/* the actual write will happen once the reads are done */
				2680	return;
				2681
				2682	cleanup:
				2683	rbio_orig_end_io(rbio, -EIO, 0);
				2684	return;
				2685
				2686	finish:
				2687	validate_rbio_for_parity_scrub(rbio);
				2688	}
				2689
				2690	static void scrub_parity_work(struct btrfs_work *work)
				2691	{
				2692	struct btrfs_raid_bio *rbio;
				2693
				2694	rbio = container_of(work, struct btrfs_raid_bio, work);
				2695	raid56_parity_scrub_stripe(rbio);
				2696	}
				2697
				2698	static void async_scrub_parity(struct btrfs_raid_bio *rbio)
				2699	{
				2700	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
				2701	scrub_parity_work, NULL, NULL);
				2702
				2703	btrfs_queue_work(rbio->fs_info->rmw_workers,
				2704	&rbio->work);
				2705	}
				2706
				2707	void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
				2708	{
				2709	if (!lock_stripe_add(rbio))
				2710	async_scrub_parity(rbio);
				2711	}