Blame - fs/btrfs/raid56.c - kernel/msm-4.9

blob: fcf7265ca46fd84a65b647619f4e07ba996d7a9f [file] [log] [blame]

David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1	/*
				2	* Copyright (C) 2012 Fusion-io All rights reserved.
				3	* Copyright (C) 2012 Intel Corp. All rights reserved.
				4	*
				5	* This program is free software; you can redistribute it and/or
				6	* modify it under the terms of the GNU General Public
				7	* License v2 as published by the Free Software Foundation.
				8	*
				9	* This program is distributed in the hope that it will be useful,
				10	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				11	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				12	* General Public License for more details.
				13	*
				14	* You should have received a copy of the GNU General Public
				15	* License along with this program; if not, write to the
				16	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				17	* Boston, MA 021110-1307, USA.
				18	*/
				19	#include <linux/sched.h>
				20	#include <linux/wait.h>
				21	#include <linux/bio.h>
				22	#include <linux/slab.h>
				23	#include <linux/buffer_head.h>
				24	#include <linux/blkdev.h>
				25	#include <linux/random.h>
				26	#include <linux/iocontext.h>
				27	#include <linux/capability.h>
				28	#include <linux/ratelimit.h>
				29	#include <linux/kthread.h>
				30	#include <linux/raid/pq.h>
				31	#include <linux/hash.h>
				32	#include <linux/list_sort.h>
				33	#include <linux/raid/xor.h>
Geert Uytterhoeven	d7011f5	2013-03-03 04:44:41 -0700	[diff] [blame]	34	#include <linux/vmalloc.h>
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	35	#include <asm/div64.h>
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	36	#include "ctree.h"
				37	#include "extent_map.h"
				38	#include "disk-io.h"
				39	#include "transaction.h"
				40	#include "print-tree.h"
				41	#include "volumes.h"
				42	#include "raid56.h"
				43	#include "async-thread.h"
				44	#include "check-integrity.h"
				45	#include "rcu-string.h"
				46
				47	/* set when additional merges to this rbio are not allowed */
				48	#define RBIO_RMW_LOCKED_BIT 1
				49
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	50	/*
				51	* set when this rbio is sitting in the hash, but it is just a cache
				52	* of past RMW
				53	*/
				54	#define RBIO_CACHE_BIT 2
				55
				56	/*
				57	* set when it is safe to trust the stripe_pages for caching
				58	*/
				59	#define RBIO_CACHE_READY_BIT 3
				60
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	61	#define RBIO_CACHE_SIZE 1024
				62
/* the kind of work an rbio carries out; stored in rbio->operation */
enum btrfs_rbio_ops {
	BTRFS_RBIO_WRITE,
	BTRFS_RBIO_READ_REBUILD,
	BTRFS_RBIO_PARITY_SCRUB,
	BTRFS_RBIO_REBUILD_MISSING,
};
				69
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	70	struct btrfs_raid_bio {
				71	struct btrfs_fs_info *fs_info;
				72	struct btrfs_bio *bbio;
				73
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	74	/* while we're doing rmw on a stripe
				75	* we put it into a hash table so we can
				76	* lock the stripe and merge more rbios
				77	* into it.
				78	*/
				79	struct list_head hash_list;
				80
				81	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	82	* LRU list for the stripe cache
				83	*/
				84	struct list_head stripe_cache;
				85
				86	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	87	* for scheduling work in the helper threads
				88	*/
				89	struct btrfs_work work;
				90
				91	/*
				92	* bio list and bio_list_lock are used
				93	* to add more bios into the stripe
				94	* in hopes of avoiding the full rmw
				95	*/
				96	struct bio_list bio_list;
				97	spinlock_t bio_list_lock;
				98
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	99	/* also protected by the bio_list_lock, the
				100	* plug list is used by the plugging code
				101	* to collect partial bios while plugged. The
				102	* stripe locking code also uses it to hand off
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	103	* the stripe lock to the next pending IO
				104	*/
				105	struct list_head plug_list;
				106
				107	/*
				108	* flags that tell us if it is safe to
				109	* merge with this bio
				110	*/
				111	unsigned long flags;
				112
				113	/* size of each individual stripe on disk */
				114	int stripe_len;
				115
				116	/* number of data stripes (no p/q) */
				117	int nr_data;
				118
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	119	int real_stripes;
				120
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	121	int stripe_npages;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	122	/*
				123	* set if we're doing a parity rebuild
				124	* for a read from higher up, which is handled
				125	* differently from a parity rebuild as part of
				126	* rmw
				127	*/
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	128	enum btrfs_rbio_ops operation;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	129
				130	/* first bad stripe */
				131	int faila;
				132
				133	/* second bad stripe (for raid6 use) */
				134	int failb;
				135
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	136	int scrubp;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	137	/*
				138	* number of pages needed to represent the full
				139	* stripe
				140	*/
				141	int nr_pages;
				142
				143	/*
				144	* size of all the bios in the bio_list. This
				145	* helps us decide if the rbio maps to a full
				146	* stripe or not
				147	*/
				148	int bio_list_bytes;
				149
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	150	int generic_bio_cnt;
				151
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	152	atomic_t refs;
				153
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	154	atomic_t stripes_pending;
				155
				156	atomic_t error;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	157	/*
				158	* these are two arrays of pointers. We allocate the
				159	* rbio big enough to hold them both and setup their
				160	* locations when the rbio is allocated
				161	*/
				162
				163	/* pointers to pages that we allocated for
				164	* reading/writing stripes directly from the disk (including P/Q)
				165	*/
				166	struct page **stripe_pages;
				167
				168	/*
				169	* pointers to the pages in the bio_list. Stored
				170	* here for faster lookup
				171	*/
				172	struct page **bio_pages;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	173
				174	/*
				175	* bitmap to record which horizontal stripe has data
				176	*/
				177	unsigned long *dbitmap;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	178	};
				179
				180	static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
				181	static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
				182	static void rmw_work(struct btrfs_work *work);
				183	static void read_rebuild_work(struct btrfs_work *work);
				184	static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
				185	static void async_read_rebuild(struct btrfs_raid_bio *rbio);
				186	static int fail_bio_stripe(struct btrfs_raid_bio rbio, struct bio bio);
				187	static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
				188	static void __free_raid_bio(struct btrfs_raid_bio *rbio);
				189	static void index_rbio_pages(struct btrfs_raid_bio *rbio);
				190	static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
				191
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	192	static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
				193	int need_check);
				194	static void async_scrub_parity(struct btrfs_raid_bio *rbio);
				195
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	196	/*
				197	* the stripe hash table is used for locking, and to collect
				198	* bios in hopes of making a full stripe
				199	*/
				200	int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
				201	{
				202	struct btrfs_stripe_hash_table *table;
				203	struct btrfs_stripe_hash_table *x;
				204	struct btrfs_stripe_hash *cur;
				205	struct btrfs_stripe_hash *h;
				206	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
				207	int i;
David Sterba	83c8266	2013-03-01 15:03:00 +0000	[diff] [blame]	208	int table_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	209
				210	if (info->stripe_hash_table)
				211	return 0;
				212
David Sterba	83c8266	2013-03-01 15:03:00 +0000	[diff] [blame]	213	/*
				214	* The table is large, starting with order 4 and can go as high as
				215	* order 7 in case lock debugging is turned on.
				216	*
				217	* Try harder to allocate and fallback to vmalloc to lower the chance
				218	* of a failing mount.
				219	*/
				220	table_size = sizeof(table) + sizeof(h) * num_entries;
				221	table = kzalloc(table_size, GFP_KERNEL \| __GFP_NOWARN \| __GFP_REPEAT);
				222	if (!table) {
				223	table = vzalloc(table_size);
				224	if (!table)
				225	return -ENOMEM;
				226	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	227
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	228	spin_lock_init(&table->cache_lock);
				229	INIT_LIST_HEAD(&table->stripe_cache);
				230
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	231	h = table->table;
				232
				233	for (i = 0; i < num_entries; i++) {
				234	cur = h + i;
				235	INIT_LIST_HEAD(&cur->hash_list);
				236	spin_lock_init(&cur->lock);
				237	init_waitqueue_head(&cur->wait);
				238	}
				239
				240	x = cmpxchg(&info->stripe_hash_table, NULL, table);
Wang Shilong	f749303	2014-11-22 21:13:10 +0800	[diff] [blame]	241	if (x)
				242	kvfree(x);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	243	return 0;
				244	}
				245
				246	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	247	* caching an rbio means to copy anything from the
				248	* bio_pages array into the stripe_pages array. We
				249	* use the page uptodate bit in the stripe cache array
				250	* to indicate if it has valid data
				251	*
				252	* once the caching is done, we set the cache ready
				253	* bit.
				254	*/
				255	static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
				256	{
				257	int i;
				258	char *s;
				259	char *d;
				260	int ret;
				261
				262	ret = alloc_rbio_pages(rbio);
				263	if (ret)
				264	return;
				265
				266	for (i = 0; i < rbio->nr_pages; i++) {
				267	if (!rbio->bio_pages[i])
				268	continue;
				269
				270	s = kmap(rbio->bio_pages[i]);
				271	d = kmap(rbio->stripe_pages[i]);
				272
				273	memcpy(d, s, PAGE_CACHE_SIZE);
				274
				275	kunmap(rbio->bio_pages[i]);
				276	kunmap(rbio->stripe_pages[i]);
				277	SetPageUptodate(rbio->stripe_pages[i]);
				278	}
				279	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				280	}
				281
				282	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	283	* we hash on the first logical address of the stripe
				284	*/
				285	static int rbio_bucket(struct btrfs_raid_bio *rbio)
				286	{
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	287	u64 num = rbio->bbio->raid_map[0];
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	288
				289	/*
				290	* we shift down quite a bit. We're using byte
				291	* addressing, and most of the lower bits are zeros.
				292	* This tends to upset hash_64, and it consistently
				293	* returns just one or two different values.
				294	*
				295	* shifting off the lower bits fixes things.
				296	*/
				297	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
				298	}
				299
				300	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	301	* stealing an rbio means taking all the uptodate pages from the stripe
				302	* array in the source rbio and putting them into the destination rbio
				303	*/
				304	static void steal_rbio(struct btrfs_raid_bio src, struct btrfs_raid_bio dest)
				305	{
				306	int i;
				307	struct page *s;
				308	struct page *d;
				309
				310	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
				311	return;
				312
				313	for (i = 0; i < dest->nr_pages; i++) {
				314	s = src->stripe_pages[i];
				315	if (!s \|\| !PageUptodate(s)) {
				316	continue;
				317	}
				318
				319	d = dest->stripe_pages[i];
				320	if (d)
				321	__free_page(d);
				322
				323	dest->stripe_pages[i] = s;
				324	src->stripe_pages[i] = NULL;
				325	}
				326	}
				327
				328	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	329	* merging means we take the bio_list from the victim and
				330	* splice it into the destination. The victim should
				331	* be discarded afterwards.
				332	*
				333	* must be called with dest->rbio_list_lock held
				334	*/
				335	static void merge_rbio(struct btrfs_raid_bio *dest,
				336	struct btrfs_raid_bio *victim)
				337	{
				338	bio_list_merge(&dest->bio_list, &victim->bio_list);
				339	dest->bio_list_bytes += victim->bio_list_bytes;
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	340	dest->generic_bio_cnt += victim->generic_bio_cnt;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	341	bio_list_init(&victim->bio_list);
				342	}
				343
				344	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	345	* used to prune items that are in the cache. The caller
				346	* must hold the hash table lock.
				347	*/
				348	static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
				349	{
				350	int bucket = rbio_bucket(rbio);
				351	struct btrfs_stripe_hash_table *table;
				352	struct btrfs_stripe_hash *h;
				353	int freeit = 0;
				354
				355	/*
				356	* check the bit again under the hash table lock.
				357	*/
				358	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
				359	return;
				360
				361	table = rbio->fs_info->stripe_hash_table;
				362	h = table->table + bucket;
				363
				364	/* hold the lock for the bucket because we may be
				365	* removing it from the hash table
				366	*/
				367	spin_lock(&h->lock);
				368
				369	/*
				370	* hold the lock for the bio list because we need
				371	* to make sure the bio list is empty
				372	*/
				373	spin_lock(&rbio->bio_list_lock);
				374
				375	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
				376	list_del_init(&rbio->stripe_cache);
				377	table->cache_size -= 1;
				378	freeit = 1;
				379
				380	/* if the bio list isn't empty, this rbio is
				381	* still involved in an IO. We take it out
				382	* of the cache list, and drop the ref that
				383	* was held for the list.
				384	*
				385	* If the bio_list was empty, we also remove
				386	* the rbio from the hash_table, and drop
				387	* the corresponding ref
				388	*/
				389	if (bio_list_empty(&rbio->bio_list)) {
				390	if (!list_empty(&rbio->hash_list)) {
				391	list_del_init(&rbio->hash_list);
				392	atomic_dec(&rbio->refs);
				393	BUG_ON(!list_empty(&rbio->plug_list));
				394	}
				395	}
				396	}
				397
				398	spin_unlock(&rbio->bio_list_lock);
				399	spin_unlock(&h->lock);
				400
				401	if (freeit)
				402	__free_raid_bio(rbio);
				403	}
				404
				405	/*
				406	* prune a given rbio from the cache
				407	*/
				408	static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
				409	{
				410	struct btrfs_stripe_hash_table *table;
				411	unsigned long flags;
				412
				413	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
				414	return;
				415
				416	table = rbio->fs_info->stripe_hash_table;
				417
				418	spin_lock_irqsave(&table->cache_lock, flags);
				419	__remove_rbio_from_cache(rbio);
				420	spin_unlock_irqrestore(&table->cache_lock, flags);
				421	}
				422
				423	/*
				424	* remove everything in the cache
				425	*/
Eric Sandeen	48a3b63	2013-04-25 20:41:01 +0000	[diff] [blame]	426	static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	427	{
				428	struct btrfs_stripe_hash_table *table;
				429	unsigned long flags;
				430	struct btrfs_raid_bio *rbio;
				431
				432	table = info->stripe_hash_table;
				433
				434	spin_lock_irqsave(&table->cache_lock, flags);
				435	while (!list_empty(&table->stripe_cache)) {
				436	rbio = list_entry(table->stripe_cache.next,
				437	struct btrfs_raid_bio,
				438	stripe_cache);
				439	__remove_rbio_from_cache(rbio);
				440	}
				441	spin_unlock_irqrestore(&table->cache_lock, flags);
				442	}
				443
				444	/*
				445	* remove all cached entries and free the hash table
				446	* used by unmount
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	447	*/
				448	void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
				449	{
				450	if (!info->stripe_hash_table)
				451	return;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	452	btrfs_clear_rbio_cache(info);
Wang Shilong	f749303	2014-11-22 21:13:10 +0800	[diff] [blame]	453	kvfree(info->stripe_hash_table);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	454	info->stripe_hash_table = NULL;
				455	}
				456
				457	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	458	* insert an rbio into the stripe cache. It
				459	* must have already been prepared by calling
				460	* cache_rbio_pages
				461	*
				462	* If this rbio was already cached, it gets
				463	* moved to the front of the lru.
				464	*
				465	* If the size of the rbio cache is too big, we
				466	* prune an item.
				467	*/
				468	static void cache_rbio(struct btrfs_raid_bio *rbio)
				469	{
				470	struct btrfs_stripe_hash_table *table;
				471	unsigned long flags;
				472
				473	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
				474	return;
				475
				476	table = rbio->fs_info->stripe_hash_table;
				477
				478	spin_lock_irqsave(&table->cache_lock, flags);
				479	spin_lock(&rbio->bio_list_lock);
				480
				481	/* bump our ref if we were not in the list before */
				482	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
				483	atomic_inc(&rbio->refs);
				484
				485	if (!list_empty(&rbio->stripe_cache)){
				486	list_move(&rbio->stripe_cache, &table->stripe_cache);
				487	} else {
				488	list_add(&rbio->stripe_cache, &table->stripe_cache);
				489	table->cache_size += 1;
				490	}
				491
				492	spin_unlock(&rbio->bio_list_lock);
				493
				494	if (table->cache_size > RBIO_CACHE_SIZE) {
				495	struct btrfs_raid_bio *found;
				496
				497	found = list_entry(table->stripe_cache.prev,
				498	struct btrfs_raid_bio,
				499	stripe_cache);
				500
				501	if (found != rbio)
				502	__remove_rbio_from_cache(found);
				503	}
				504
				505	spin_unlock_irqrestore(&table->cache_lock, flags);
				506	return;
				507	}
				508
				509	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	510	* helper function to run the xor_blocks api. It is only
				511	* able to do MAX_XOR_BLOCKS at a time, so we need to
				512	* loop through.
				513	*/
				514	static void run_xor(void **pages, int src_cnt, ssize_t len)
				515	{
				516	int src_off = 0;
				517	int xor_src_cnt = 0;
				518	void *dest = pages[src_cnt];
				519
				520	while(src_cnt > 0) {
				521	xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
				522	xor_blocks(xor_src_cnt, len, dest, pages + src_off);
				523
				524	src_cnt -= xor_src_cnt;
				525	src_off += xor_src_cnt;
				526	}
				527	}
				528
				529	/*
				530	* returns true if the bio list inside this rbio
				531	* covers an entire stripe (no rmw required).
				532	* Must be called with the bio list lock held, or
				533	* at a time when you know it is impossible to add
				534	* new bios into the list
				535	*/
				536	static int __rbio_is_full(struct btrfs_raid_bio *rbio)
				537	{
				538	unsigned long size = rbio->bio_list_bytes;
				539	int ret = 1;
				540
				541	if (size != rbio->nr_data * rbio->stripe_len)
				542	ret = 0;
				543
				544	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
				545	return ret;
				546	}
				547
				548	static int rbio_is_full(struct btrfs_raid_bio *rbio)
				549	{
				550	unsigned long flags;
				551	int ret;
				552
				553	spin_lock_irqsave(&rbio->bio_list_lock, flags);
				554	ret = __rbio_is_full(rbio);
				555	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
				556	return ret;
				557	}
				558
				559	/*
				560	* returns 1 if it is safe to merge two rbios together.
				561	* The merging is safe if the two rbios correspond to
				562	* the same stripe and if they are both going in the same
				563	* direction (read vs write), and if neither one is
				564	* locked for final IO
				565	*
				566	* The caller is responsible for locking such that
				567	* rmw_locked is safe to test
				568	*/
				569	static int rbio_can_merge(struct btrfs_raid_bio *last,
				570	struct btrfs_raid_bio *cur)
				571	{
				572	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) \|\|
				573	test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
				574	return 0;
				575
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	576	/*
				577	* we can't merge with cached rbios, since the
				578	* idea is that when we merge the destination
				579	* rbio is going to run our IO for us. We can
				580	* steal from cached rbio's though, other functions
				581	* handle that.
				582	*/
				583	if (test_bit(RBIO_CACHE_BIT, &last->flags) \|\|
				584	test_bit(RBIO_CACHE_BIT, &cur->flags))
				585	return 0;
				586
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	587	if (last->bbio->raid_map[0] !=
				588	cur->bbio->raid_map[0])
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	589	return 0;
				590
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	591	/* we can't merge with different operations */
				592	if (last->operation != cur->operation)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	593	return 0;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	594	/*
				595	* We've need read the full stripe from the drive.
				596	* check and repair the parity and write the new results.
				597	*
				598	* We're not allowed to add any new bios to the
				599	* bio list here, anyone else that wants to
				600	* change this stripe needs to do their own rmw.
				601	*/
				602	if (last->operation == BTRFS_RBIO_PARITY_SCRUB \|\|
				603	cur->operation == BTRFS_RBIO_PARITY_SCRUB)
				604	return 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	605
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	606	if (last->operation == BTRFS_RBIO_REBUILD_MISSING \|\|
				607	cur->operation == BTRFS_RBIO_REBUILD_MISSING)
				608	return 0;
				609
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	610	return 1;
				611	}
				612
				613	/*
				614	* helper to index into the pstripe
				615	*/
				616	static struct page rbio_pstripe_page(struct btrfs_raid_bio rbio, int index)
				617	{
				618	index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
				619	return rbio->stripe_pages[index];
				620	}
				621
				622	/*
				623	* helper to index into the qstripe, returns null
				624	* if there is no qstripe
				625	*/
				626	static struct page rbio_qstripe_page(struct btrfs_raid_bio rbio, int index)
				627	{
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	628	if (rbio->nr_data + 1 == rbio->real_stripes)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	629	return NULL;
				630
				631	index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
				632	PAGE_CACHE_SHIFT;
				633	return rbio->stripe_pages[index];
				634	}
				635
				636	/*
				637	* The first stripe in the table for a logical address
				638	* has the lock. rbios are added in one of three ways:
				639	*
				640	* 1) Nobody has the stripe locked yet. The rbio is given
				641	* the lock and 0 is returned. The caller must start the IO
				642	* themselves.
				643	*
				644	* 2) Someone has the stripe locked, but we're able to merge
				645	* with the lock owner. The rbio is freed and the IO will
				646	* start automatically along with the existing rbio. 1 is returned.
				647	*
				648	* 3) Someone has the stripe locked, but we're not able to merge.
				649	* The rbio is added to the lock owner's plug list, or merged into
				650	* an rbio already on the plug list. When the lock owner unlocks,
				651	* the next rbio on the list is run and the IO is started automatically.
				652	* 1 is returned
				653	*
				654	* If we return 0, the caller still owns the rbio and must continue with
				655	* IO submission. If we return 1, the caller must assume the rbio has
				656	* already been freed.
				657	*/
				658	static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
				659	{
				660	int bucket = rbio_bucket(rbio);
				661	struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
				662	struct btrfs_raid_bio *cur;
				663	struct btrfs_raid_bio *pending;
				664	unsigned long flags;
				665	DEFINE_WAIT(wait);
				666	struct btrfs_raid_bio *freeit = NULL;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	667	struct btrfs_raid_bio *cache_drop = NULL;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	668	int ret = 0;
				669	int walk = 0;
				670
				671	spin_lock_irqsave(&h->lock, flags);
				672	list_for_each_entry(cur, &h->hash_list, hash_list) {
				673	walk++;
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	674	if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	675	spin_lock(&cur->bio_list_lock);
				676
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	677	/* can we steal this cached rbio's pages? */
				678	if (bio_list_empty(&cur->bio_list) &&
				679	list_empty(&cur->plug_list) &&
				680	test_bit(RBIO_CACHE_BIT, &cur->flags) &&
				681	!test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
				682	list_del_init(&cur->hash_list);
				683	atomic_dec(&cur->refs);
				684
				685	steal_rbio(cur, rbio);
				686	cache_drop = cur;
				687	spin_unlock(&cur->bio_list_lock);
				688
				689	goto lockit;
				690	}
				691
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	692	/* can we merge into the lock owner? */
				693	if (rbio_can_merge(cur, rbio)) {
				694	merge_rbio(cur, rbio);
				695	spin_unlock(&cur->bio_list_lock);
				696	freeit = rbio;
				697	ret = 1;
				698	goto out;
				699	}
				700
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	701
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	702	/*
				703	* we couldn't merge with the running
				704	* rbio, see if we can merge with the
				705	* pending ones. We don't have to
				706	* check for rmw_locked because there
				707	* is no way they are inside finish_rmw
				708	* right now
				709	*/
				710	list_for_each_entry(pending, &cur->plug_list,
				711	plug_list) {
				712	if (rbio_can_merge(pending, rbio)) {
				713	merge_rbio(pending, rbio);
				714	spin_unlock(&cur->bio_list_lock);
				715	freeit = rbio;
				716	ret = 1;
				717	goto out;
				718	}
				719	}
				720
				721	/* no merging, put us on the tail of the plug list,
				722	* our rbio will be started with the currently
				723	* running rbio unlocks
				724	*/
				725	list_add_tail(&rbio->plug_list, &cur->plug_list);
				726	spin_unlock(&cur->bio_list_lock);
				727	ret = 1;
				728	goto out;
				729	}
				730	}
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	731	lockit:
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	732	atomic_inc(&rbio->refs);
				733	list_add(&rbio->hash_list, &h->hash_list);
				734	out:
				735	spin_unlock_irqrestore(&h->lock, flags);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	736	if (cache_drop)
				737	remove_rbio_from_cache(cache_drop);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	738	if (freeit)
				739	__free_raid_bio(freeit);
				740	return ret;
				741	}
				742
				743	/*
				744	* called as rmw or parity rebuild is completed. If the plug list has more
				745	* rbios waiting for this stripe, the next one on the list will be started
				746	*/
				747	static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
				748	{
				749	int bucket;
				750	struct btrfs_stripe_hash *h;
				751	unsigned long flags;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	752	int keep_cache = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	753
				754	bucket = rbio_bucket(rbio);
				755	h = rbio->fs_info->stripe_hash_table->table + bucket;
				756
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	757	if (list_empty(&rbio->plug_list))
				758	cache_rbio(rbio);
				759
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	760	spin_lock_irqsave(&h->lock, flags);
				761	spin_lock(&rbio->bio_list_lock);
				762
				763	if (!list_empty(&rbio->hash_list)) {
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	764	/*
				765	* if we're still cached and there is no other IO
				766	* to perform, just leave this rbio here for others
				767	* to steal from later
				768	*/
				769	if (list_empty(&rbio->plug_list) &&
				770	test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
				771	keep_cache = 1;
				772	clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				773	BUG_ON(!bio_list_empty(&rbio->bio_list));
				774	goto done;
				775	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	776
				777	list_del_init(&rbio->hash_list);
				778	atomic_dec(&rbio->refs);
				779
				780	/*
				781	* we use the plug list to hold all the rbios
				782	* waiting for the chance to lock this stripe.
				783	* hand the lock over to one of them.
				784	*/
				785	if (!list_empty(&rbio->plug_list)) {
				786	struct btrfs_raid_bio *next;
				787	struct list_head *head = rbio->plug_list.next;
				788
				789	next = list_entry(head, struct btrfs_raid_bio,
				790	plug_list);
				791
				792	list_del_init(&rbio->plug_list);
				793
				794	list_add(&next->hash_list, &h->hash_list);
				795	atomic_inc(&next->refs);
				796	spin_unlock(&rbio->bio_list_lock);
				797	spin_unlock_irqrestore(&h->lock, flags);
				798
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	799	if (next->operation == BTRFS_RBIO_READ_REBUILD)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	800	async_read_rebuild(next);
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	801	else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
				802	steal_rbio(rbio, next);
				803	async_read_rebuild(next);
				804	} else if (next->operation == BTRFS_RBIO_WRITE) {
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	805	steal_rbio(rbio, next);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	806	async_rmw_stripe(next);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	807	} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				808	steal_rbio(rbio, next);
				809	async_scrub_parity(next);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	810	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	811
				812	goto done_nolock;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	813	} else if (waitqueue_active(&h->wait)) {
				814	spin_unlock(&rbio->bio_list_lock);
				815	spin_unlock_irqrestore(&h->lock, flags);
				816	wake_up(&h->wait);
				817	goto done_nolock;
				818	}
				819	}
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	820	done:
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	821	spin_unlock(&rbio->bio_list_lock);
				822	spin_unlock_irqrestore(&h->lock, flags);
				823
				824	done_nolock:
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	825	if (!keep_cache)
				826	remove_rbio_from_cache(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	827	}
				828
				829	static void __free_raid_bio(struct btrfs_raid_bio *rbio)
				830	{
				831	int i;
				832
				833	WARN_ON(atomic_read(&rbio->refs) < 0);
				834	if (!atomic_dec_and_test(&rbio->refs))
				835	return;
				836
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	837	WARN_ON(!list_empty(&rbio->stripe_cache));
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	838	WARN_ON(!list_empty(&rbio->hash_list));
				839	WARN_ON(!bio_list_empty(&rbio->bio_list));
				840
				841	for (i = 0; i < rbio->nr_pages; i++) {
				842	if (rbio->stripe_pages[i]) {
				843	__free_page(rbio->stripe_pages[i]);
				844	rbio->stripe_pages[i] = NULL;
				845	}
				846	}
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	847
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	848	btrfs_put_bbio(rbio->bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	849	kfree(rbio);
				850	}
				851
				/* unlock the stripe held by this rbio, then drop our reference on it */
				static void free_raid_bio(struct btrfs_raid_bio *rbio)
				{
					unlock_stripe(rbio);
					__free_raid_bio(rbio);
				}
				857
				858	/*
				859	* this frees the rbio and runs through all the bios in the
				860	* bio_list and calls end_io on them
				861	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	862	static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	863	{
				864	struct bio *cur = bio_list_get(&rbio->bio_list);
				865	struct bio *next;
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	866
				867	if (rbio->generic_bio_cnt)
				868	btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
				869
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	870	free_raid_bio(rbio);
				871
				872	while (cur) {
				873	next = cur->bi_next;
				874	cur->bi_next = NULL;
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	875	cur->bi_error = err;
				876	bio_endio(cur);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	877	cur = next;
				878	}
				879	}
				880
				881	/*
				882	* end io function used by finish_rmw. When we finally
				883	* get here, we've written a full stripe
				884	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	885	static void raid_write_end_io(struct bio *bio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	886	{
				887	struct btrfs_raid_bio *rbio = bio->bi_private;
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	888	int err = bio->bi_error;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	889
				890	if (err)
				891	fail_bio_stripe(rbio, bio);
				892
				893	bio_put(bio);
				894
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	895	if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	896	return;
				897
				898	err = 0;
				899
				900	/* OK, we have read all the stripes we need to. */
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	901	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	902	err = -EIO;
				903
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	904	rbio_orig_end_io(rbio, err);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	905	return;
				906	}
				907
				908	/*
				909	* the read/modify/write code wants to use the original bio for
				910	* any pages it included, and then use the rbio for everything
				911	* else. This function decides if a given index (stripe number)
				912	* and page number in that stripe fall inside the original bio
				913	* or the rbio.
				914	*
				915	* if you set bio_list_only, you'll get a NULL back for any ranges
				916	* that are outside the bio_list
				917	*
				918	* This doesn't take any refs on anything, you get a bare page pointer
				919	* and the caller must bump refs as required.
				920	*
				921	* You must call index_rbio_pages once before you can trust
				922	* the answers from this function.
				923	*/
				924	static struct page page_in_rbio(struct btrfs_raid_bio rbio,
				925	int index, int pagenr, int bio_list_only)
				926	{
				927	int chunk_page;
				928	struct page *p = NULL;
				929
				930	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
				931
				932	spin_lock_irq(&rbio->bio_list_lock);
				933	p = rbio->bio_pages[chunk_page];
				934	spin_unlock_irq(&rbio->bio_list_lock);
				935
				936	if (p \|\| bio_list_only)
				937	return p;
				938
				939	return rbio->stripe_pages[chunk_page];
				940	}
				941
				942	/*
				943	* number of pages we need for the entire stripe across all the
				944	* drives
				945	*/
				946	static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
				947	{
				948	unsigned long nr = stripe_len * nr_stripes;
David Sterba	ed6078f	2014-06-05 01:59:57 +0200	[diff] [blame]	949	return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	950	}
				951
				952	/*
				953	* allocation and initial setup for the btrfs_raid_bio. Not
				954	* this does not allocate any pages for rbio->pages.
				955	*/
				956	static struct btrfs_raid_bio alloc_rbio(struct btrfs_root root,
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	957	struct btrfs_bio *bbio, u64 stripe_len)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	958	{
				959	struct btrfs_raid_bio *rbio;
				960	int nr_data = 0;
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	961	int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
				962	int num_pages = rbio_nr_pages(stripe_len, real_stripes);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	963	int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	964	void *p;
				965
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	966	rbio = kzalloc(sizeof(rbio) + num_pages sizeof(struct page ) 2 +
				967	DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8),
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	968	GFP_NOFS);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	969	if (!rbio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	970	return ERR_PTR(-ENOMEM);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	971
				972	bio_list_init(&rbio->bio_list);
				973	INIT_LIST_HEAD(&rbio->plug_list);
				974	spin_lock_init(&rbio->bio_list_lock);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	975	INIT_LIST_HEAD(&rbio->stripe_cache);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	976	INIT_LIST_HEAD(&rbio->hash_list);
				977	rbio->bbio = bbio;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	978	rbio->fs_info = root->fs_info;
				979	rbio->stripe_len = stripe_len;
				980	rbio->nr_pages = num_pages;
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	981	rbio->real_stripes = real_stripes;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	982	rbio->stripe_npages = stripe_npages;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	983	rbio->faila = -1;
				984	rbio->failb = -1;
				985	atomic_set(&rbio->refs, 1);
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	986	atomic_set(&rbio->error, 0);
				987	atomic_set(&rbio->stripes_pending, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	988
				989	/*
				990	* the stripe_pages and bio_pages array point to the extra
				991	* memory we allocated past the end of the rbio
				992	*/
				993	p = rbio + 1;
				994	rbio->stripe_pages = p;
				995	rbio->bio_pages = p + sizeof(struct page ) num_pages;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	996	rbio->dbitmap = p + sizeof(struct page ) num_pages * 2;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	997
Zhao Lei	10f1190	2015-01-20 15:11:43 +0800	[diff] [blame]	998	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
				999	nr_data = real_stripes - 1;
				1000	else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1001	nr_data = real_stripes - 2;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1002	else
Zhao Lei	10f1190	2015-01-20 15:11:43 +0800	[diff] [blame]	1003	BUG();
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1004
				1005	rbio->nr_data = nr_data;
				1006	return rbio;
				1007	}
				1008
				1009	/* allocate pages for all the stripes in the bio, including parity */
				1010	static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
				1011	{
				1012	int i;
				1013	struct page *page;
				1014
				1015	for (i = 0; i < rbio->nr_pages; i++) {
				1016	if (rbio->stripe_pages[i])
				1017	continue;
				1018	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				1019	if (!page)
				1020	return -ENOMEM;
				1021	rbio->stripe_pages[i] = page;
				1022	ClearPageUptodate(page);
				1023	}
				1024	return 0;
				1025	}
				1026
				1027	/* allocate pages for just the p/q stripes */
				1028	static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
				1029	{
				1030	int i;
				1031	struct page *page;
				1032
				1033	i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
				1034
				1035	for (; i < rbio->nr_pages; i++) {
				1036	if (rbio->stripe_pages[i])
				1037	continue;
				1038	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				1039	if (!page)
				1040	return -ENOMEM;
				1041	rbio->stripe_pages[i] = page;
				1042	}
				1043	return 0;
				1044	}
				1045
				1046	/*
				1047	* add a single page from a specific stripe into our list of bios for IO
				1048	* this will try to merge into existing bios if possible, and returns
				1049	* zero if all went well.
				1050	*/
Eric Sandeen	48a3b63	2013-04-25 20:41:01 +0000	[diff] [blame]	1051	static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
				1052	struct bio_list *bio_list,
				1053	struct page *page,
				1054	int stripe_nr,
				1055	unsigned long page_index,
				1056	unsigned long bio_max_len)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1057	{
				1058	struct bio *last = bio_list->tail;
				1059	u64 last_end = 0;
				1060	int ret;
				1061	struct bio *bio;
				1062	struct btrfs_bio_stripe *stripe;
				1063	u64 disk_start;
				1064
				1065	stripe = &rbio->bbio->stripes[stripe_nr];
				1066	disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
				1067
				1068	/* if the device is missing, just fail this stripe */
				1069	if (!stripe->dev->bdev)
				1070	return fail_rbio_index(rbio, stripe_nr);
				1071
				1072	/* see if we can add this page onto our existing bio */
				1073	if (last) {
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1074	last_end = (u64)last->bi_iter.bi_sector << 9;
				1075	last_end += last->bi_iter.bi_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1076
				1077	/*
				1078	* we can't merge these if they are from different
				1079	* devices or if they are not contiguous
				1080	*/
				1081	if (last_end == disk_start && stripe->dev->bdev &&
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	1082	!last->bi_error &&
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1083	last->bi_bdev == stripe->dev->bdev) {
				1084	ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
				1085	if (ret == PAGE_CACHE_SIZE)
				1086	return 0;
				1087	}
				1088	}
				1089
				1090	/* put a new bio on the list */
Chris Mason	9be3395	2013-05-17 18:30:14 -0400	[diff] [blame]	1091	bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1092	if (!bio)
				1093	return -ENOMEM;
				1094
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1095	bio->bi_iter.bi_size = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1096	bio->bi_bdev = stripe->dev->bdev;
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1097	bio->bi_iter.bi_sector = disk_start >> 9;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1098
				1099	bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
				1100	bio_list_add(bio_list, bio);
				1101	return 0;
				1102	}
				1103
				1104	/*
				1105	* while we're doing the read/modify/write cycle, we could
				1106	* have errors in reading pages off the disk. This checks
				1107	* for errors and if we're not able to read the page it'll
				1108	* trigger parity reconstruction. The rmw will be finished
				1109	* after we've reconstructed the failed stripes
				1110	*/
				1111	static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
				1112	{
				1113	if (rbio->faila >= 0 \|\| rbio->failb >= 0) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1114	BUG_ON(rbio->faila == rbio->real_stripes - 1);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1115	__raid56_parity_recover(rbio);
				1116	} else {
				1117	finish_rmw(rbio);
				1118	}
				1119	}
				1120
				1121	/*
				1122	* these are just the pages from the rbio array, not from anything
				1123	* the FS sent down to us
				1124	*/
				1125	static struct page rbio_stripe_page(struct btrfs_raid_bio rbio, int stripe, int page)
				1126	{
				1127	int index;
				1128	index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
				1129	index += page;
				1130	return rbio->stripe_pages[index];
				1131	}
				1132
				1133	/*
				1134	* helper function to walk our bio list and populate the bio_pages array with
				1135	* the result. This seems expensive, but it is faster than constantly
				1136	* searching through the bio list as we setup the IO in finish_rmw or stripe
				1137	* reconstruction.
				1138	*
				1139	* This must be called before you trust the answers from page_in_rbio
				1140	*/
				1141	static void index_rbio_pages(struct btrfs_raid_bio *rbio)
				1142	{
				1143	struct bio *bio;
				1144	u64 start;
				1145	unsigned long stripe_offset;
				1146	unsigned long page_index;
				1147	struct page *p;
				1148	int i;
				1149
				1150	spin_lock_irq(&rbio->bio_list_lock);
				1151	bio_list_for_each(bio, &rbio->bio_list) {
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1152	start = (u64)bio->bi_iter.bi_sector << 9;
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1153	stripe_offset = start - rbio->bbio->raid_map[0];
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1154	page_index = stripe_offset >> PAGE_CACHE_SHIFT;
				1155
				1156	for (i = 0; i < bio->bi_vcnt; i++) {
				1157	p = bio->bi_io_vec[i].bv_page;
				1158	rbio->bio_pages[page_index + i] = p;
				1159	}
				1160	}
				1161	spin_unlock_irq(&rbio->bio_list_lock);
				1162	}
				1163
				1164	/*
				1165	* this is called from one of two situations. We either
				1166	* have a full stripe from the higher layers, or we've read all
				1167	* the missing bits off disk.
				1168	*
				1169	* This will calculate the parity and then send down any
				1170	* changed blocks.
				1171	*/
				1172	static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
				1173	{
				1174	struct btrfs_bio *bbio = rbio->bbio;
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1175	void *pointers[rbio->real_stripes];
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1176	int stripe_len = rbio->stripe_len;
				1177	int nr_data = rbio->nr_data;
				1178	int stripe;
				1179	int pagenr;
				1180	int p_stripe = -1;
				1181	int q_stripe = -1;
				1182	struct bio_list bio_list;
				1183	struct bio *bio;
				1184	int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
				1185	int ret;
				1186
				1187	bio_list_init(&bio_list);
				1188
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1189	if (rbio->real_stripes - rbio->nr_data == 1) {
				1190	p_stripe = rbio->real_stripes - 1;
				1191	} else if (rbio->real_stripes - rbio->nr_data == 2) {
				1192	p_stripe = rbio->real_stripes - 2;
				1193	q_stripe = rbio->real_stripes - 1;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1194	} else {
				1195	BUG();
				1196	}
				1197
				1198	/* at this point we either have a full stripe,
				1199	* or we've read the full stripe from the drive.
				1200	* recalculate the parity and write the new results.
				1201	*
				1202	* We're not allowed to add any new bios to the
				1203	* bio list here, anyone else that wants to
				1204	* change this stripe needs to do their own rmw.
				1205	*/
				1206	spin_lock_irq(&rbio->bio_list_lock);
				1207	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				1208	spin_unlock_irq(&rbio->bio_list_lock);
				1209
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1210	atomic_set(&rbio->error, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1211
				1212	/*
				1213	* now that we've set rmw_locked, run through the
				1214	* bio list one last time and map the page pointers
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1215	*
				1216	* We don't cache full rbios because we're assuming
				1217	* the higher layers are unlikely to use this area of
				1218	* the disk again soon. If they do use it again,
				1219	* hopefully they will send another full bio.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1220	*/
				1221	index_rbio_pages(rbio);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1222	if (!rbio_is_full(rbio))
				1223	cache_rbio_pages(rbio);
				1224	else
				1225	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1226
				1227	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
				1228	struct page *p;
				1229	/* first collect one page from each data stripe */
				1230	for (stripe = 0; stripe < nr_data; stripe++) {
				1231	p = page_in_rbio(rbio, stripe, pagenr, 0);
				1232	pointers[stripe] = kmap(p);
				1233	}
				1234
				1235	/* then add the parity stripe */
				1236	p = rbio_pstripe_page(rbio, pagenr);
				1237	SetPageUptodate(p);
				1238	pointers[stripe++] = kmap(p);
				1239
				1240	if (q_stripe != -1) {
				1241
				1242	/*
				1243	* raid6, add the qstripe and call the
				1244	* library function to fill in our p/q
				1245	*/
				1246	p = rbio_qstripe_page(rbio, pagenr);
				1247	SetPageUptodate(p);
				1248	pointers[stripe++] = kmap(p);
				1249
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1250	raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1251	pointers);
				1252	} else {
				1253	/* raid5 */
				1254	memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
				1255	run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
				1256	}
				1257
				1258
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1259	for (stripe = 0; stripe < rbio->real_stripes; stripe++)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1260	kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
				1261	}
				1262
				1263	/*
				1264	* time to start writing. Make bios for everything from the
				1265	* higher layers (the bio_list in our rbio) and our p/q. Ignore
				1266	* everything else.
				1267	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1268	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1269	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
				1270	struct page *page;
				1271	if (stripe < rbio->nr_data) {
				1272	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1273	if (!page)
				1274	continue;
				1275	} else {
				1276	page = rbio_stripe_page(rbio, stripe, pagenr);
				1277	}
				1278
				1279	ret = rbio_add_io_page(rbio, &bio_list,
				1280	page, stripe, pagenr, rbio->stripe_len);
				1281	if (ret)
				1282	goto cleanup;
				1283	}
				1284	}
				1285
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1286	if (likely(!bbio->num_tgtdevs))
				1287	goto write_data;
				1288
				1289	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
				1290	if (!bbio->tgtdev_map[stripe])
				1291	continue;
				1292
				1293	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
				1294	struct page *page;
				1295	if (stripe < rbio->nr_data) {
				1296	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1297	if (!page)
				1298	continue;
				1299	} else {
				1300	page = rbio_stripe_page(rbio, stripe, pagenr);
				1301	}
				1302
				1303	ret = rbio_add_io_page(rbio, &bio_list, page,
				1304	rbio->bbio->tgtdev_map[stripe],
				1305	pagenr, rbio->stripe_len);
				1306	if (ret)
				1307	goto cleanup;
				1308	}
				1309	}
				1310
				1311	write_data:
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1312	atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
				1313	BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1314
				1315	while (1) {
				1316	bio = bio_list_pop(&bio_list);
				1317	if (!bio)
				1318	break;
				1319
				1320	bio->bi_private = rbio;
				1321	bio->bi_end_io = raid_write_end_io;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1322	submit_bio(WRITE, bio);
				1323	}
				1324	return;
				1325
				1326	cleanup:
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	1327	rbio_orig_end_io(rbio, -EIO);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1328	}
				1329
				1330	/*
				1331	* helper to find the stripe number for a given bio. Used to figure out which
				1332	* stripe has failed. This expects the bio to correspond to a physical disk,
				1333	* so it looks up based on physical sector numbers.
				1334	*/
				1335	static int find_bio_stripe(struct btrfs_raid_bio *rbio,
				1336	struct bio *bio)
				1337	{
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1338	u64 physical = bio->bi_iter.bi_sector;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1339	u64 stripe_start;
				1340	int i;
				1341	struct btrfs_bio_stripe *stripe;
				1342
				1343	physical <<= 9;
				1344
				1345	for (i = 0; i < rbio->bbio->num_stripes; i++) {
				1346	stripe = &rbio->bbio->stripes[i];
				1347	stripe_start = stripe->physical;
				1348	if (physical >= stripe_start &&
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1349	physical < stripe_start + rbio->stripe_len &&
				1350	bio->bi_bdev == stripe->dev->bdev) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1351	return i;
				1352	}
				1353	}
				1354	return -1;
				1355	}
				1356
				1357	/*
				1358	* helper to find the stripe number for a given
				1359	* bio (before mapping). Used to figure out which stripe has
				1360	* failed. This looks up based on logical block numbers.
				1361	*/
				1362	static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
				1363	struct bio *bio)
				1364	{
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1365	u64 logical = bio->bi_iter.bi_sector;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1366	u64 stripe_start;
				1367	int i;
				1368
				1369	logical <<= 9;
				1370
				1371	for (i = 0; i < rbio->nr_data; i++) {
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1372	stripe_start = rbio->bbio->raid_map[i];
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1373	if (logical >= stripe_start &&
				1374	logical < stripe_start + rbio->stripe_len) {
				1375	return i;
				1376	}
				1377	}
				1378	return -1;
				1379	}
				1380
				1381	/*
				1382	* returns -EIO if we had too many failures
				1383	*/
				1384	static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
				1385	{
				1386	unsigned long flags;
				1387	int ret = 0;
				1388
				1389	spin_lock_irqsave(&rbio->bio_list_lock, flags);
				1390
				1391	/* we already know this stripe is bad, move on */
				1392	if (rbio->faila == failed \|\| rbio->failb == failed)
				1393	goto out;
				1394
				1395	if (rbio->faila == -1) {
				1396	/* first failure on this rbio */
				1397	rbio->faila = failed;
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1398	atomic_inc(&rbio->error);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1399	} else if (rbio->failb == -1) {
				1400	/* second failure on this rbio */
				1401	rbio->failb = failed;
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1402	atomic_inc(&rbio->error);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1403	} else {
				1404	ret = -EIO;
				1405	}
				1406	out:
				1407	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
				1408
				1409	return ret;
				1410	}
				1411
				1412	/*
				1413	* helper to fail a stripe based on a physical disk
				1414	* bio.
				1415	*/
				1416	static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
				1417	struct bio *bio)
				1418	{
				1419	int failed = find_bio_stripe(rbio, bio);
				1420
				1421	if (failed < 0)
				1422	return -EIO;
				1423
				1424	return fail_rbio_index(rbio, failed);
				1425	}
				1426
				1427	/*
				1428	* this sets each page in the bio uptodate. It should only be used on private
				1429	* rbio pages, nothing that comes in from the higher layers
				1430	*/
				1431	static void set_bio_pages_uptodate(struct bio *bio)
				1432	{
				1433	int i;
				1434	struct page *p;
				1435
				1436	for (i = 0; i < bio->bi_vcnt; i++) {
				1437	p = bio->bi_io_vec[i].bv_page;
				1438	SetPageUptodate(p);
				1439	}
				1440	}
				1441
				1442	/*
				1443	* end io for the read phase of the rmw cycle. All the bios here are physical
				1444	* stripe bios we've read from the disk so we can recalculate the parity of the
				1445	* stripe.
				1446	*
				1447	* This will usually kick off finish_rmw once all the bios are read in, but it
				1448	* may trigger parity reconstruction if we had any errors along the way
				1449	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	1450	static void raid_rmw_end_io(struct bio *bio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1451	{
				1452	struct btrfs_raid_bio *rbio = bio->bi_private;
				1453
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	1454	if (bio->bi_error)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1455	fail_bio_stripe(rbio, bio);
				1456	else
				1457	set_bio_pages_uptodate(bio);
				1458
				1459	bio_put(bio);
				1460
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1461	if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1462	return;
				1463
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1464	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1465	goto cleanup;
				1466
				1467	/*
				1468	* this will normally call finish_rmw to start our write
				1469	* but if there are any failed stripes we'll reconstruct
				1470	* from parity first
				1471	*/
				1472	validate_rbio_for_rmw(rbio);
				1473	return;
				1474
				1475	cleanup:
				1476
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	1477	rbio_orig_end_io(rbio, -EIO);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1478	}
				1479
				1480	static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
				1481	{
Liu Bo	9e0af23	2014-08-15 23:36:53 +0800	[diff] [blame]	1482	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
				1483	rmw_work, NULL, NULL);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1484
Qu Wenruo	d05a33a	2014-02-28 10:46:11 +0800	[diff] [blame]	1485	btrfs_queue_work(rbio->fs_info->rmw_workers,
				1486	&rbio->work);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1487	}
				1488
				1489	static void async_read_rebuild(struct btrfs_raid_bio *rbio)
				1490	{
Liu Bo	9e0af23	2014-08-15 23:36:53 +0800	[diff] [blame]	1491	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
				1492	read_rebuild_work, NULL, NULL);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1493
Qu Wenruo	d05a33a	2014-02-28 10:46:11 +0800	[diff] [blame]	1494	btrfs_queue_work(rbio->fs_info->rmw_workers,
				1495	&rbio->work);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1496	}
				1497
				1498	/*
				1499	* the stripe must be locked by the caller. It will
				1500	* unlock after all the writes are done
				1501	*/
				1502	static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
				1503	{
				1504	int bios_to_read = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1505	struct bio_list bio_list;
				1506	int ret;
David Sterba	ed6078f	2014-06-05 01:59:57 +0200	[diff] [blame]	1507	int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1508	int pagenr;
				1509	int stripe;
				1510	struct bio *bio;
				1511
				1512	bio_list_init(&bio_list);
				1513
				1514	ret = alloc_rbio_pages(rbio);
				1515	if (ret)
				1516	goto cleanup;
				1517
				1518	index_rbio_pages(rbio);
				1519
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1520	atomic_set(&rbio->error, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1521	/*
				1522	* build a list of bios to read all the missing parts of this
				1523	* stripe
				1524	*/
				1525	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
				1526	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
				1527	struct page *page;
				1528	/*
				1529	* we want to find all the pages missing from
				1530	* the rbio and read them from the disk. If
				1531	* page_in_rbio finds a page in the bio list
				1532	* we don't need to read it off the stripe.
				1533	*/
				1534	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1535	if (page)
				1536	continue;
				1537
				1538	page = rbio_stripe_page(rbio, stripe, pagenr);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1539	/*
				1540	* the bio cache may have handed us an uptodate
				1541	* page. If so, be happy and use it
				1542	*/
				1543	if (PageUptodate(page))
				1544	continue;
				1545
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1546	ret = rbio_add_io_page(rbio, &bio_list, page,
				1547	stripe, pagenr, rbio->stripe_len);
				1548	if (ret)
				1549	goto cleanup;
				1550	}
				1551	}
				1552
				1553	bios_to_read = bio_list_size(&bio_list);
				1554	if (!bios_to_read) {
				1555	/*
				1556	* this can happen if others have merged with
				1557	* us, it means there is nothing left to read.
				1558	* But if there are missing devices it may not be
				1559	* safe to do the full stripe write yet.
				1560	*/
				1561	goto finish;
				1562	}
				1563
				1564	/*
				1565	* the bbio may be freed once we submit the last bio. Make sure
				1566	* not to touch it after that
				1567	*/
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1568	atomic_set(&rbio->stripes_pending, bios_to_read);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1569	while (1) {
				1570	bio = bio_list_pop(&bio_list);
				1571	if (!bio)
				1572	break;
				1573
				1574	bio->bi_private = rbio;
				1575	bio->bi_end_io = raid_rmw_end_io;
				1576
				1577	btrfs_bio_wq_end_io(rbio->fs_info, bio,
				1578	BTRFS_WQ_ENDIO_RAID56);
				1579
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1580	submit_bio(READ, bio);
				1581	}
				1582	/* the actual write will happen once the reads are done */
				1583	return 0;
				1584
				1585	cleanup:
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	1586	rbio_orig_end_io(rbio, -EIO);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1587	return -EIO;
				1588
				1589	finish:
				1590	validate_rbio_for_rmw(rbio);
				1591	return 0;
				1592	}
				1593
				1594	/*
				1595	* if the upper layers pass in a full stripe, we thank them by only allocating
				1596	* enough pages to hold the parity, and sending it all down quickly.
				1597	*/
				1598	static int full_stripe_write(struct btrfs_raid_bio *rbio)
				1599	{
				1600	int ret;
				1601
				1602	ret = alloc_rbio_parity_pages(rbio);
Miao Xie	3cd846d	2013-07-22 16:36:57 +0800	[diff] [blame]	1603	if (ret) {
				1604	__free_raid_bio(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1605	return ret;
Miao Xie	3cd846d	2013-07-22 16:36:57 +0800	[diff] [blame]	1606	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1607
				1608	ret = lock_stripe_add(rbio);
				1609	if (ret == 0)
				1610	finish_rmw(rbio);
				1611	return 0;
				1612	}
				1613
				1614	/*
				1615	* partial stripe writes get handed over to async helpers.
				1616	* We're really hoping to merge a few more writes into this
				1617	* rbio before calculating new parity
				1618	*/
				1619	static int partial_stripe_write(struct btrfs_raid_bio *rbio)
				1620	{
				1621	int ret;
				1622
				1623	ret = lock_stripe_add(rbio);
				1624	if (ret == 0)
				1625	async_rmw_stripe(rbio);
				1626	return 0;
				1627	}
				1628
				1629	/*
				1630	* sometimes while we were reading from the drive to
				1631	* recalculate parity, enough new bios come into create
				1632	* a full stripe. So we do a check here to see if we can
				1633	* go directly to finish_rmw
				1634	*/
				1635	static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
				1636	{
				1637	/* head off into rmw land if we don't have a full stripe */
				1638	if (!rbio_is_full(rbio))
				1639	return partial_stripe_write(rbio);
				1640	return full_stripe_write(rbio);
				1641	}
				1642
/*
 * We use plugging call backs to collect full stripes.
 * Any time we get a partial stripe write while plugged
 * we collect it into a list.  When the unplug comes down,
 * we sort the list by logical block number and merge
 * everything we can into the same rbios
 */
struct btrfs_plug_cb {
	/* block layer plug callback; container_of() maps it back to us */
	struct blk_plug_cb cb;
	/* owning fs_info; raid56_parity_write() treats NULL as "not set up yet" */
	struct btrfs_fs_info *info;
	/* rbios collected while plugged, linked through rbio->plug_list */
	struct list_head rbio_list;
	/* used to push run_plug() to rmw_workers when unplugged from schedule */
	struct btrfs_work work;
};
				1656
				1657	/*
				1658	* rbios on the plug list are sorted for easier merging.
				1659	*/
				1660	static int plug_cmp(void priv, struct list_head a, struct list_head *b)
				1661	{
				1662	struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
				1663	plug_list);
				1664	struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
				1665	plug_list);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1666	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
				1667	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1668
				1669	if (a_sector < b_sector)
				1670	return -1;
				1671	if (a_sector > b_sector)
				1672	return 1;
				1673	return 0;
				1674	}
				1675
				1676	static void run_plug(struct btrfs_plug_cb *plug)
				1677	{
				1678	struct btrfs_raid_bio *cur;
				1679	struct btrfs_raid_bio *last = NULL;
				1680
				1681	/*
				1682	* sort our plug list then try to merge
				1683	* everything we can in hopes of creating full
				1684	* stripes.
				1685	*/
				1686	list_sort(NULL, &plug->rbio_list, plug_cmp);
				1687	while (!list_empty(&plug->rbio_list)) {
				1688	cur = list_entry(plug->rbio_list.next,
				1689	struct btrfs_raid_bio, plug_list);
				1690	list_del_init(&cur->plug_list);
				1691
				1692	if (rbio_is_full(cur)) {
				1693	/* we have a full stripe, send it down */
				1694	full_stripe_write(cur);
				1695	continue;
				1696	}
				1697	if (last) {
				1698	if (rbio_can_merge(last, cur)) {
				1699	merge_rbio(last, cur);
				1700	__free_raid_bio(cur);
				1701	continue;
				1702
				1703	}
				1704	__raid56_parity_write(last);
				1705	}
				1706	last = cur;
				1707	}
				1708	if (last) {
				1709	__raid56_parity_write(last);
				1710	}
				1711	kfree(plug);
				1712	}
				1713
				1714	/*
				1715	* if the unplug comes from schedule, we have to push the
				1716	* work off to a helper thread
				1717	*/
				1718	static void unplug_work(struct btrfs_work *work)
				1719	{
				1720	struct btrfs_plug_cb *plug;
				1721	plug = container_of(work, struct btrfs_plug_cb, work);
				1722	run_plug(plug);
				1723	}
				1724
				1725	static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
				1726	{
				1727	struct btrfs_plug_cb *plug;
				1728	plug = container_of(cb, struct btrfs_plug_cb, cb);
				1729
				1730	if (from_schedule) {
Liu Bo	9e0af23	2014-08-15 23:36:53 +0800	[diff] [blame]	1731	btrfs_init_work(&plug->work, btrfs_rmw_helper,
				1732	unplug_work, NULL, NULL);
Qu Wenruo	d05a33a	2014-02-28 10:46:11 +0800	[diff] [blame]	1733	btrfs_queue_work(plug->info->rmw_workers,
				1734	&plug->work);
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1735	return;
				1736	}
				1737	run_plug(plug);
				1738	}
				1739
				1740	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1741	* our main entry point for writes from the rest of the FS.
				1742	*/
				1743	int raid56_parity_write(struct btrfs_root root, struct bio bio,
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1744	struct btrfs_bio *bbio, u64 stripe_len)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1745	{
				1746	struct btrfs_raid_bio *rbio;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1747	struct btrfs_plug_cb *plug = NULL;
				1748	struct blk_plug_cb *cb;
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1749	int ret;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1750
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1751	rbio = alloc_rbio(root, bbio, stripe_len);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	1752	if (IS_ERR(rbio)) {
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	1753	btrfs_put_bbio(bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1754	return PTR_ERR(rbio);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	1755	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1756	bio_list_add(&rbio->bio_list, bio);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1757	rbio->bio_list_bytes = bio->bi_iter.bi_size;
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1758	rbio->operation = BTRFS_RBIO_WRITE;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1759
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1760	btrfs_bio_counter_inc_noblocked(root->fs_info);
				1761	rbio->generic_bio_cnt = 1;
				1762
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1763	/*
				1764	* don't plug on full rbios, just get them out the door
				1765	* as quickly as we can
				1766	*/
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1767	if (rbio_is_full(rbio)) {
				1768	ret = full_stripe_write(rbio);
				1769	if (ret)
				1770	btrfs_bio_counter_dec(root->fs_info);
				1771	return ret;
				1772	}
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1773
				1774	cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
				1775	sizeof(*plug));
				1776	if (cb) {
				1777	plug = container_of(cb, struct btrfs_plug_cb, cb);
				1778	if (!plug->info) {
				1779	plug->info = root->fs_info;
				1780	INIT_LIST_HEAD(&plug->rbio_list);
				1781	}
				1782	list_add_tail(&rbio->plug_list, &plug->rbio_list);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1783	ret = 0;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1784	} else {
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1785	ret = __raid56_parity_write(rbio);
				1786	if (ret)
				1787	btrfs_bio_counter_dec(root->fs_info);
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1788	}
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1789	return ret;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1790	}
				1791
				1792	/*
				1793	* all parity reconstruction happens here. We've read in everything
				1794	* we can find from the drives and this does the heavy lifting of
				1795	* sorting the good from the bad.
				1796	*/
				1797	static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
				1798	{
				1799	int pagenr, stripe;
				1800	void **pointers;
				1801	int faila = -1, failb = -1;
David Sterba	ed6078f	2014-06-05 01:59:57 +0200	[diff] [blame]	1802	int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1803	struct page *page;
				1804	int err;
				1805	int i;
				1806
David Sterba	31e818f	2015-02-20 18:00:26 +0100	[diff] [blame]	1807	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1808	if (!pointers) {
				1809	err = -ENOMEM;
				1810	goto cleanup_io;
				1811	}
				1812
				1813	faila = rbio->faila;
				1814	failb = rbio->failb;
				1815
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	1816	if (rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				1817	rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1818	spin_lock_irq(&rbio->bio_list_lock);
				1819	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				1820	spin_unlock_irq(&rbio->bio_list_lock);
				1821	}
				1822
				1823	index_rbio_pages(rbio);
				1824
				1825	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	1826	/*
				1827	* Now we just use bitmap to mark the horizontal stripes in
				1828	* which we have data when doing parity scrub.
				1829	*/
				1830	if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
				1831	!test_bit(pagenr, rbio->dbitmap))
				1832	continue;
				1833
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1834	/* setup our array of pointers with pages
				1835	* from each stripe
				1836	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1837	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1838	/*
				1839	* if we're rebuilding a read, we have to use
				1840	* pages from the bio list
				1841	*/
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	1842	if ((rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				1843	rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1844	(stripe == faila \|\| stripe == failb)) {
				1845	page = page_in_rbio(rbio, stripe, pagenr, 0);
				1846	} else {
				1847	page = rbio_stripe_page(rbio, stripe, pagenr);
				1848	}
				1849	pointers[stripe] = kmap(page);
				1850	}
				1851
				1852	/* all raid6 handling here */
Zhao Lei	10f1190	2015-01-20 15:11:43 +0800	[diff] [blame]	1853	if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1854	/*
				1855	* single failure, rebuild from parity raid5
				1856	* style
				1857	*/
				1858	if (failb < 0) {
				1859	if (faila == rbio->nr_data) {
				1860	/*
				1861	* Just the P stripe has failed, without
				1862	* a bad data or Q stripe.
				1863	* TODO, we should redo the xor here.
				1864	*/
				1865	err = -EIO;
				1866	goto cleanup;
				1867	}
				1868	/*
				1869	* a single failure in raid6 is rebuilt
				1870	* in the pstripe code below
				1871	*/
				1872	goto pstripe;
				1873	}
				1874
				1875	/* make sure our ps and qs are in order */
				1876	if (faila > failb) {
				1877	int tmp = failb;
				1878	failb = faila;
				1879	faila = tmp;
				1880	}
				1881
				1882	/* if the q stripe is failed, do a pstripe reconstruction
				1883	* from the xors.
				1884	* If both the q stripe and the P stripe are failed, we're
				1885	* here due to a crc mismatch and we can't give them the
				1886	* data they want
				1887	*/
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1888	if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
				1889	if (rbio->bbio->raid_map[faila] ==
				1890	RAID5_P_STRIPE) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1891	err = -EIO;
				1892	goto cleanup;
				1893	}
				1894	/*
				1895	* otherwise we have one bad data stripe and
				1896	* a good P stripe. raid5!
				1897	*/
				1898	goto pstripe;
				1899	}
				1900
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1901	if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1902	raid6_datap_recov(rbio->real_stripes,
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1903	PAGE_SIZE, faila, pointers);
				1904	} else {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1905	raid6_2data_recov(rbio->real_stripes,
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1906	PAGE_SIZE, faila, failb,
				1907	pointers);
				1908	}
				1909	} else {
				1910	void *p;
				1911
				1912	/* rebuild from P stripe here (raid5 or raid6) */
				1913	BUG_ON(failb != -1);
				1914	pstripe:
				1915	/* Copy parity block into failed block to start with */
				1916	memcpy(pointers[faila],
				1917	pointers[rbio->nr_data],
				1918	PAGE_CACHE_SIZE);
				1919
				1920	/* rearrange the pointer array */
				1921	p = pointers[faila];
				1922	for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
				1923	pointers[stripe] = pointers[stripe + 1];
				1924	pointers[rbio->nr_data - 1] = p;
				1925
				1926	/* xor in the rest */
				1927	run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
				1928	}
				1929	/* if we're doing this rebuild as part of an rmw, go through
				1930	* and set all of our private rbio pages in the
				1931	* failed stripes as uptodate. This way finish_rmw will
				1932	* know they can be trusted. If this was a read reconstruction,
				1933	* other endio functions will fiddle the uptodate bits
				1934	*/
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1935	if (rbio->operation == BTRFS_RBIO_WRITE) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1936	for (i = 0; i < nr_pages; i++) {
				1937	if (faila != -1) {
				1938	page = rbio_stripe_page(rbio, faila, i);
				1939	SetPageUptodate(page);
				1940	}
				1941	if (failb != -1) {
				1942	page = rbio_stripe_page(rbio, failb, i);
				1943	SetPageUptodate(page);
				1944	}
				1945	}
				1946	}
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1947	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1948	/*
				1949	* if we're rebuilding a read, we have to use
				1950	* pages from the bio list
				1951	*/
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	1952	if ((rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				1953	rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1954	(stripe == faila \|\| stripe == failb)) {
				1955	page = page_in_rbio(rbio, stripe, pagenr, 0);
				1956	} else {
				1957	page = rbio_stripe_page(rbio, stripe, pagenr);
				1958	}
				1959	kunmap(page);
				1960	}
				1961	}
				1962
				1963	err = 0;
				1964	cleanup:
				1965	kfree(pointers);
				1966
				1967	cleanup_io:
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1968	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	1969	if (err == 0)
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1970	cache_rbio_pages(rbio);
				1971	else
				1972	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				1973
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	1974	rbio_orig_end_io(rbio, err);
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	1975	} else if (rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
Linus Torvalds	2236597	2015-09-05 15:14:43 -0700	[diff] [blame]	1976	rbio_orig_end_io(rbio, err);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1977	} else if (err == 0) {
				1978	rbio->faila = -1;
				1979	rbio->failb = -1;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	1980
				1981	if (rbio->operation == BTRFS_RBIO_WRITE)
				1982	finish_rmw(rbio);
				1983	else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
				1984	finish_parity_scrub(rbio, 0);
				1985	else
				1986	BUG();
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1987	} else {
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	1988	rbio_orig_end_io(rbio, err);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1989	}
				1990	}
				1991
				1992	/*
				1993	* This is called only for stripes we've read from disk to
				1994	* reconstruct the parity.
				1995	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	1996	static void raid_recover_end_io(struct bio *bio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1997	{
				1998	struct btrfs_raid_bio *rbio = bio->bi_private;
				1999
				2000	/*
				2001	* we only read stripe pages off the disk, set them
				2002	* up to date if there were no errors
				2003	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	2004	if (bio->bi_error)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2005	fail_bio_stripe(rbio, bio);
				2006	else
				2007	set_bio_pages_uptodate(bio);
				2008	bio_put(bio);
				2009
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2010	if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2011	return;
				2012
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2013	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	2014	rbio_orig_end_io(rbio, -EIO);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2015	else
				2016	__raid_recover_end_io(rbio);
				2017	}
				2018
				2019	/*
				2020	* reads everything we need off the disk to reconstruct
				2021	* the parity. endio handlers trigger final reconstruction
				2022	* when the IO is done.
				2023	*
				2024	* This is used both for reads from the higher layers and for
				2025	* parity construction required to finish a rmw cycle.
				2026	*/
				2027	static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
				2028	{
				2029	int bios_to_read = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2030	struct bio_list bio_list;
				2031	int ret;
David Sterba	ed6078f	2014-06-05 01:59:57 +0200	[diff] [blame]	2032	int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2033	int pagenr;
				2034	int stripe;
				2035	struct bio *bio;
				2036
				2037	bio_list_init(&bio_list);
				2038
				2039	ret = alloc_rbio_pages(rbio);
				2040	if (ret)
				2041	goto cleanup;
				2042
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2043	atomic_set(&rbio->error, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2044
				2045	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	2046	* read everything that hasn't failed. Thanks to the
				2047	* stripe cache, it is possible that some or all of these
				2048	* pages are going to be uptodate.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2049	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2050	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
Liu Bo	5588383	2014-06-24 15:39:16 +0800	[diff] [blame]	2051	if (rbio->faila == stripe \|\| rbio->failb == stripe) {
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2052	atomic_inc(&rbio->error);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2053	continue;
Liu Bo	5588383	2014-06-24 15:39:16 +0800	[diff] [blame]	2054	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2055
				2056	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
				2057	struct page *p;
				2058
				2059	/*
				2060	* the rmw code may have already read this
				2061	* page in
				2062	*/
				2063	p = rbio_stripe_page(rbio, stripe, pagenr);
				2064	if (PageUptodate(p))
				2065	continue;
				2066
				2067	ret = rbio_add_io_page(rbio, &bio_list,
				2068	rbio_stripe_page(rbio, stripe, pagenr),
				2069	stripe, pagenr, rbio->stripe_len);
				2070	if (ret < 0)
				2071	goto cleanup;
				2072	}
				2073	}
				2074
				2075	bios_to_read = bio_list_size(&bio_list);
				2076	if (!bios_to_read) {
				2077	/*
				2078	* we might have no bios to read just because the pages
				2079	* were up to date, or we might have no bios to read because
				2080	* the devices were gone.
				2081	*/
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2082	if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2083	__raid_recover_end_io(rbio);
				2084	goto out;
				2085	} else {
				2086	goto cleanup;
				2087	}
				2088	}
				2089
				2090	/*
				2091	* the bbio may be freed once we submit the last bio. Make sure
				2092	* not to touch it after that
				2093	*/
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2094	atomic_set(&rbio->stripes_pending, bios_to_read);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2095	while (1) {
				2096	bio = bio_list_pop(&bio_list);
				2097	if (!bio)
				2098	break;
				2099
				2100	bio->bi_private = rbio;
				2101	bio->bi_end_io = raid_recover_end_io;
				2102
				2103	btrfs_bio_wq_end_io(rbio->fs_info, bio,
				2104	BTRFS_WQ_ENDIO_RAID56);
				2105
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2106	submit_bio(READ, bio);
				2107	}
				2108	out:
				2109	return 0;
				2110
				2111	cleanup:
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2112	if (rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				2113	rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	2114	rbio_orig_end_io(rbio, -EIO);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2115	return -EIO;
				2116	}
				2117
				2118	/*
				2119	* the main entry point for reads from the higher layers. This
				2120	* is really only called when the normal read path had a failure,
				2121	* so we assume the bio they send down corresponds to a failed part
				2122	* of the drive.
				2123	*/
				2124	int raid56_parity_recover(struct btrfs_root root, struct bio bio,
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2125	struct btrfs_bio *bbio, u64 stripe_len,
				2126	int mirror_num, int generic_io)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2127	{
				2128	struct btrfs_raid_bio *rbio;
				2129	int ret;
				2130
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2131	rbio = alloc_rbio(root, bbio, stripe_len);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	2132	if (IS_ERR(rbio)) {
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	2133	if (generic_io)
				2134	btrfs_put_bbio(bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2135	return PTR_ERR(rbio);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	2136	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2137
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	2138	rbio->operation = BTRFS_RBIO_READ_REBUILD;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2139	bio_list_add(&rbio->bio_list, bio);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	2140	rbio->bio_list_bytes = bio->bi_iter.bi_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2141
				2142	rbio->faila = find_logical_bio_stripe(rbio, bio);
				2143	if (rbio->faila == -1) {
				2144	BUG();
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	2145	if (generic_io)
				2146	btrfs_put_bbio(bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2147	kfree(rbio);
				2148	return -EIO;
				2149	}
				2150
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	2151	if (generic_io) {
				2152	btrfs_bio_counter_inc_noblocked(root->fs_info);
				2153	rbio->generic_bio_cnt = 1;
				2154	} else {
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	2155	btrfs_get_bbio(bbio);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	2156	}
				2157
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2158	/*
				2159	* reconstruct from the q stripe if they are
				2160	* asking for mirror 3
				2161	*/
				2162	if (mirror_num == 3)
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2163	rbio->failb = rbio->real_stripes - 2;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2164
				2165	ret = lock_stripe_add(rbio);
				2166
				2167	/*
				2168	* __raid56_parity_recover will end the bio with
				2169	* any errors it hits. We don't want to return
				2170	* its error value up the stack because our caller
				2171	* will end up calling bio_endio with any nonzero
				2172	* return
				2173	*/
				2174	if (ret == 0)
				2175	__raid56_parity_recover(rbio);
				2176	/*
				2177	* our rbio has been added to the list of
				2178	* rbios that will be handled after the
				2179	* currently lock owner is done
				2180	*/
				2181	return 0;
				2182
				2183	}
				2184
				2185	static void rmw_work(struct btrfs_work *work)
				2186	{
				2187	struct btrfs_raid_bio *rbio;
				2188
				2189	rbio = container_of(work, struct btrfs_raid_bio, work);
				2190	raid56_rmw_stripe(rbio);
				2191	}
				2192
				2193	static void read_rebuild_work(struct btrfs_work *work)
				2194	{
				2195	struct btrfs_raid_bio *rbio;
				2196
				2197	rbio = container_of(work, struct btrfs_raid_bio, work);
				2198	__raid56_parity_recover(rbio);
				2199	}
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2200
				2201	/*
				2202	* The following code is used to scrub/replace the parity stripe
				2203	*
				2204	* Note: We need make sure all the pages that add into the scrub/replace
				2205	* raid bio are correct and not be changed during the scrub/replace. That
				2206	* is those pages just hold metadata or file data with checksum.
				2207	*/
				2208
				2209	struct btrfs_raid_bio *
				2210	raid56_parity_alloc_scrub_rbio(struct btrfs_root root, struct bio bio,
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2211	struct btrfs_bio *bbio, u64 stripe_len,
				2212	struct btrfs_device *scrub_dev,
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2213	unsigned long *dbitmap, int stripe_nsectors)
				2214	{
				2215	struct btrfs_raid_bio *rbio;
				2216	int i;
				2217
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2218	rbio = alloc_rbio(root, bbio, stripe_len);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2219	if (IS_ERR(rbio))
				2220	return NULL;
				2221	bio_list_add(&rbio->bio_list, bio);
				2222	/*
				2223	* This is a special bio which is used to hold the completion handler
				2224	* and make the scrub rbio is similar to the other types
				2225	*/
				2226	ASSERT(!bio->bi_iter.bi_size);
				2227	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
				2228
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2229	for (i = 0; i < rbio->real_stripes; i++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2230	if (bbio->stripes[i].dev == scrub_dev) {
				2231	rbio->scrubp = i;
				2232	break;
				2233	}
				2234	}
				2235
				2236	/* Now we just support the sectorsize equals to page size */
				2237	ASSERT(root->sectorsize == PAGE_SIZE);
				2238	ASSERT(rbio->stripe_npages == stripe_nsectors);
				2239	bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
				2240
				2241	return rbio;
				2242	}
				2243
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2244	/* Used for both parity scrub and missing. */
				2245	void raid56_add_scrub_pages(struct btrfs_raid_bio rbio, struct page page,
				2246	u64 logical)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2247	{
				2248	int stripe_offset;
				2249	int index;
				2250
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2251	ASSERT(logical >= rbio->bbio->raid_map[0]);
				2252	ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2253	rbio->stripe_len * rbio->nr_data);
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2254	stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2255	index = stripe_offset >> PAGE_CACHE_SHIFT;
				2256	rbio->bio_pages[index] = page;
				2257	}
				2258
				2259	/*
				2260	* We just scrub the parity that we have correct data on the same horizontal,
				2261	* so we needn't allocate all pages for all the stripes.
				2262	*/
				2263	static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
				2264	{
				2265	int i;
				2266	int bit;
				2267	int index;
				2268	struct page *page;
				2269
				2270	for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2271	for (i = 0; i < rbio->real_stripes; i++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2272	index = i * rbio->stripe_npages + bit;
				2273	if (rbio->stripe_pages[index])
				2274	continue;
				2275
				2276	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				2277	if (!page)
				2278	return -ENOMEM;
				2279	rbio->stripe_pages[index] = page;
				2280	ClearPageUptodate(page);
				2281	}
				2282	}
				2283	return 0;
				2284	}
				2285
				2286	/*
				2287	* end io function used by finish_rmw. When we finally
				2288	* get here, we've written a full stripe
				2289	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	2290	static void raid_write_parity_end_io(struct bio *bio)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2291	{
				2292	struct btrfs_raid_bio *rbio = bio->bi_private;
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	2293	int err = bio->bi_error;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2294
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	2295	if (bio->bi_error)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2296	fail_bio_stripe(rbio, bio);
				2297
				2298	bio_put(bio);
				2299
				2300	if (!atomic_dec_and_test(&rbio->stripes_pending))
				2301	return;
				2302
				2303	err = 0;
				2304
				2305	if (atomic_read(&rbio->error))
				2306	err = -EIO;
				2307
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	2308	rbio_orig_end_io(rbio, err);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2309	}
				2310
				2311	static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
				2312	int need_check)
				2313	{
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2314	struct btrfs_bio *bbio = rbio->bbio;
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2315	void *pointers[rbio->real_stripes];
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2316	DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2317	int nr_data = rbio->nr_data;
				2318	int stripe;
				2319	int pagenr;
				2320	int p_stripe = -1;
				2321	int q_stripe = -1;
				2322	struct page *p_page = NULL;
				2323	struct page *q_page = NULL;
				2324	struct bio_list bio_list;
				2325	struct bio *bio;
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2326	int is_replace = 0;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2327	int ret;
				2328
				2329	bio_list_init(&bio_list);
				2330
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2331	if (rbio->real_stripes - rbio->nr_data == 1) {
				2332	p_stripe = rbio->real_stripes - 1;
				2333	} else if (rbio->real_stripes - rbio->nr_data == 2) {
				2334	p_stripe = rbio->real_stripes - 2;
				2335	q_stripe = rbio->real_stripes - 1;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2336	} else {
				2337	BUG();
				2338	}
				2339
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2340	if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
				2341	is_replace = 1;
				2342	bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
				2343	}
				2344
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2345	/*
				2346	* Because the higher layers(scrubber) are unlikely to
				2347	* use this area of the disk again soon, so don't cache
				2348	* it.
				2349	*/
				2350	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				2351
				2352	if (!need_check)
				2353	goto writeback;
				2354
				2355	p_page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				2356	if (!p_page)
				2357	goto cleanup;
				2358	SetPageUptodate(p_page);
				2359
				2360	if (q_stripe != -1) {
				2361	q_page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				2362	if (!q_page) {
				2363	__free_page(p_page);
				2364	goto cleanup;
				2365	}
				2366	SetPageUptodate(q_page);
				2367	}
				2368
				2369	atomic_set(&rbio->error, 0);
				2370
				2371	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
				2372	struct page *p;
				2373	void *parity;
				2374	/* first collect one page from each data stripe */
				2375	for (stripe = 0; stripe < nr_data; stripe++) {
				2376	p = page_in_rbio(rbio, stripe, pagenr, 0);
				2377	pointers[stripe] = kmap(p);
				2378	}
				2379
				2380	/* then add the parity stripe */
				2381	pointers[stripe++] = kmap(p_page);
				2382
				2383	if (q_stripe != -1) {
				2384
				2385	/*
				2386	* raid6, add the qstripe and call the
				2387	* library function to fill in our p/q
				2388	*/
				2389	pointers[stripe++] = kmap(q_page);
				2390
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2391	raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2392	pointers);
				2393	} else {
				2394	/* raid5 */
				2395	memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
				2396	run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
				2397	}
				2398
				2399	/* Check scrubbing pairty and repair it */
				2400	p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
				2401	parity = kmap(p);
				2402	if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE))
				2403	memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE);
				2404	else
				2405	/* Parity is right, needn't writeback */
				2406	bitmap_clear(rbio->dbitmap, pagenr, 1);
				2407	kunmap(p);
				2408
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2409	for (stripe = 0; stripe < rbio->real_stripes; stripe++)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2410	kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
				2411	}
				2412
				2413	__free_page(p_page);
				2414	if (q_page)
				2415	__free_page(q_page);
				2416
				2417	writeback:
				2418	/*
				2419	* time to start writing. Make bios for everything from the
				2420	* higher layers (the bio_list in our rbio) and our p/q. Ignore
				2421	* everything else.
				2422	*/
				2423	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
				2424	struct page *page;
				2425
				2426	page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
				2427	ret = rbio_add_io_page(rbio, &bio_list,
				2428	page, rbio->scrubp, pagenr, rbio->stripe_len);
				2429	if (ret)
				2430	goto cleanup;
				2431	}
				2432
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2433	if (!is_replace)
				2434	goto submit_write;
				2435
				2436	for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
				2437	struct page *page;
				2438
				2439	page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
				2440	ret = rbio_add_io_page(rbio, &bio_list, page,
				2441	bbio->tgtdev_map[rbio->scrubp],
				2442	pagenr, rbio->stripe_len);
				2443	if (ret)
				2444	goto cleanup;
				2445	}
				2446
				2447	submit_write:
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2448	nr_data = bio_list_size(&bio_list);
				2449	if (!nr_data) {
				2450	/* Every parity is right */
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	2451	rbio_orig_end_io(rbio, 0);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2452	return;
				2453	}
				2454
				2455	atomic_set(&rbio->stripes_pending, nr_data);
				2456
				2457	while (1) {
				2458	bio = bio_list_pop(&bio_list);
				2459	if (!bio)
				2460	break;
				2461
				2462	bio->bi_private = rbio;
				2463	bio->bi_end_io = raid_write_parity_end_io;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2464	submit_bio(WRITE, bio);
				2465	}
				2466	return;
				2467
				2468	cleanup:
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	2469	rbio_orig_end_io(rbio, -EIO);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2470	}
				2471
				2472	static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
				2473	{
				2474	if (stripe >= 0 && stripe < rbio->nr_data)
				2475	return 1;
				2476	return 0;
				2477	}
				2478
				2479	/*
				2480	* While we're doing the parity check and repair, we could have errors
				2481	* in reading pages off the disk. This checks for errors and if we're
				2482	* not able to read the page it'll trigger parity reconstruction. The
				2483	* parity scrub will be finished after we've reconstructed the failed
				2484	* stripes
				2485	*/
				2486	static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
				2487	{
				2488	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
				2489	goto cleanup;
				2490
				2491	if (rbio->faila >= 0 \|\| rbio->failb >= 0) {
				2492	int dfail = 0, failp = -1;
				2493
				2494	if (is_data_stripe(rbio, rbio->faila))
				2495	dfail++;
				2496	else if (is_parity_stripe(rbio->faila))
				2497	failp = rbio->faila;
				2498
				2499	if (is_data_stripe(rbio, rbio->failb))
				2500	dfail++;
				2501	else if (is_parity_stripe(rbio->failb))
				2502	failp = rbio->failb;
				2503
				2504	/*
				2505	* Because we can not use a scrubbing parity to repair
				2506	* the data, so the capability of the repair is declined.
				2507	* (In the case of RAID5, we can not repair anything)
				2508	*/
				2509	if (dfail > rbio->bbio->max_errors - 1)
				2510	goto cleanup;
				2511
				2512	/*
				2513	* If all data is good, only parity is correctly, just
				2514	* repair the parity.
				2515	*/
				2516	if (dfail == 0) {
				2517	finish_parity_scrub(rbio, 0);
				2518	return;
				2519	}
				2520
				2521	/*
				2522	* Here means we got one corrupted data stripe and one
				2523	* corrupted parity on RAID6, if the corrupted parity
				2524	* is scrubbing parity, luckly, use the other one to repair
				2525	* the data, or we can not repair the data stripe.
				2526	*/
				2527	if (failp != rbio->scrubp)
				2528	goto cleanup;
				2529
				2530	__raid_recover_end_io(rbio);
				2531	} else {
				2532	finish_parity_scrub(rbio, 1);
				2533	}
				2534	return;
				2535
				2536	cleanup:
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	2537	rbio_orig_end_io(rbio, -EIO);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2538	}
				2539
				2540	/*
				2541	* end io for the read phase of the rmw cycle. All the bios here are physical
				2542	* stripe bios we've read from the disk so we can recalculate the parity of the
				2543	* stripe.
				2544	*
				2545	* This will usually kick off finish_rmw once all the bios are read in, but it
				2546	* may trigger parity reconstruction if we had any errors along the way
				2547	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	2548	static void raid56_parity_scrub_end_io(struct bio *bio)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2549	{
				2550	struct btrfs_raid_bio *rbio = bio->bi_private;
				2551
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	2552	if (bio->bi_error)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2553	fail_bio_stripe(rbio, bio);
				2554	else
				2555	set_bio_pages_uptodate(bio);
				2556
				2557	bio_put(bio);
				2558
				2559	if (!atomic_dec_and_test(&rbio->stripes_pending))
				2560	return;
				2561
				2562	/*
				2563	* this will normally call finish_rmw to start our write
				2564	* but if there are any failed stripes we'll reconstruct
				2565	* from parity first
				2566	*/
				2567	validate_rbio_for_parity_scrub(rbio);
				2568	}
				2569
				2570	static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
				2571	{
				2572	int bios_to_read = 0;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2573	struct bio_list bio_list;
				2574	int ret;
				2575	int pagenr;
				2576	int stripe;
				2577	struct bio *bio;
				2578
				2579	ret = alloc_rbio_essential_pages(rbio);
				2580	if (ret)
				2581	goto cleanup;
				2582
				2583	bio_list_init(&bio_list);
				2584
				2585	atomic_set(&rbio->error, 0);
				2586	/*
				2587	* build a list of bios to read all the missing parts of this
				2588	* stripe
				2589	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2590	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2591	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
				2592	struct page *page;
				2593	/*
				2594	* we want to find all the pages missing from
				2595	* the rbio and read them from the disk. If
				2596	* page_in_rbio finds a page in the bio list
				2597	* we don't need to read it off the stripe.
				2598	*/
				2599	page = page_in_rbio(rbio, stripe, pagenr, 1);
				2600	if (page)
				2601	continue;
				2602
				2603	page = rbio_stripe_page(rbio, stripe, pagenr);
				2604	/*
				2605	* the bio cache may have handed us an uptodate
				2606	* page. If so, be happy and use it
				2607	*/
				2608	if (PageUptodate(page))
				2609	continue;
				2610
				2611	ret = rbio_add_io_page(rbio, &bio_list, page,
				2612	stripe, pagenr, rbio->stripe_len);
				2613	if (ret)
				2614	goto cleanup;
				2615	}
				2616	}
				2617
				2618	bios_to_read = bio_list_size(&bio_list);
				2619	if (!bios_to_read) {
				2620	/*
				2621	* this can happen if others have merged with
				2622	* us, it means there is nothing left to read.
				2623	* But if there are missing devices it may not be
				2624	* safe to do the full stripe write yet.
				2625	*/
				2626	goto finish;
				2627	}
				2628
				2629	/*
				2630	* the bbio may be freed once we submit the last bio. Make sure
				2631	* not to touch it after that
				2632	*/
				2633	atomic_set(&rbio->stripes_pending, bios_to_read);
				2634	while (1) {
				2635	bio = bio_list_pop(&bio_list);
				2636	if (!bio)
				2637	break;
				2638
				2639	bio->bi_private = rbio;
				2640	bio->bi_end_io = raid56_parity_scrub_end_io;
				2641
				2642	btrfs_bio_wq_end_io(rbio->fs_info, bio,
				2643	BTRFS_WQ_ENDIO_RAID56);
				2644
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2645	submit_bio(READ, bio);
				2646	}
				2647	/* the actual write will happen once the reads are done */
				2648	return;
				2649
				2650	cleanup:
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	2651	rbio_orig_end_io(rbio, -EIO);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2652	return;
				2653
				2654	finish:
				2655	validate_rbio_for_parity_scrub(rbio);
				2656	}
				2657
				2658	static void scrub_parity_work(struct btrfs_work *work)
				2659	{
				2660	struct btrfs_raid_bio *rbio;
				2661
				2662	rbio = container_of(work, struct btrfs_raid_bio, work);
				2663	raid56_parity_scrub_stripe(rbio);
				2664	}
				2665
				2666	static void async_scrub_parity(struct btrfs_raid_bio *rbio)
				2667	{
				2668	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
				2669	scrub_parity_work, NULL, NULL);
				2670
				2671	btrfs_queue_work(rbio->fs_info->rmw_workers,
				2672	&rbio->work);
				2673	}
				2674
				2675	void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
				2676	{
				2677	if (!lock_stripe_add(rbio))
				2678	async_scrub_parity(rbio);
				2679	}
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2680
				2681	/* The following code is used for dev replace of a missing RAID 5/6 device. */
				2682
				2683	struct btrfs_raid_bio *
				2684	raid56_alloc_missing_rbio(struct btrfs_root root, struct bio bio,
				2685	struct btrfs_bio *bbio, u64 length)
				2686	{
				2687	struct btrfs_raid_bio *rbio;
				2688
				2689	rbio = alloc_rbio(root, bbio, length);
				2690	if (IS_ERR(rbio))
				2691	return NULL;
				2692
				2693	rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
				2694	bio_list_add(&rbio->bio_list, bio);
				2695	/*
				2696	* This is a special bio which is used to hold the completion handler
				2697	* and make the scrub rbio is similar to the other types
				2698	*/
				2699	ASSERT(!bio->bi_iter.bi_size);
				2700
				2701	rbio->faila = find_logical_bio_stripe(rbio, bio);
				2702	if (rbio->faila == -1) {
				2703	BUG();
				2704	kfree(rbio);
				2705	return NULL;
				2706	}
				2707
				2708	return rbio;
				2709	}
				2710
				2711	static void missing_raid56_work(struct btrfs_work *work)
				2712	{
				2713	struct btrfs_raid_bio *rbio;
				2714
				2715	rbio = container_of(work, struct btrfs_raid_bio, work);
				2716	__raid56_parity_recover(rbio);
				2717	}
				2718
				2719	static void async_missing_raid56(struct btrfs_raid_bio *rbio)
				2720	{
				2721	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
				2722	missing_raid56_work, NULL, NULL);
				2723
				2724	btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
				2725	}
				2726
				2727	void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
				2728	{
				2729	if (!lock_stripe_add(rbio))
				2730	async_missing_raid56(rbio);
				2731	}