Blame - fs/btrfs/raid56.c - kernel/msm-4.9

blob: b85d68f721b811350d0ee6155c989b53649647b6 [file] [log] [blame]

David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1	/*
				2	* Copyright (C) 2012 Fusion-io All rights reserved.
				3	* Copyright (C) 2012 Intel Corp. All rights reserved.
				4	*
				5	* This program is free software; you can redistribute it and/or
				6	* modify it under the terms of the GNU General Public
				7	* License v2 as published by the Free Software Foundation.
				8	*
				9	* This program is distributed in the hope that it will be useful,
				10	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				11	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				12	* General Public License for more details.
				13	*
				14	* You should have received a copy of the GNU General Public
				15	* License along with this program; if not, write to the
				16	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				17	* Boston, MA 021110-1307, USA.
				18	*/
				19	#include <linux/sched.h>
				20	#include <linux/wait.h>
				21	#include <linux/bio.h>
				22	#include <linux/slab.h>
				23	#include <linux/buffer_head.h>
				24	#include <linux/blkdev.h>
				25	#include <linux/random.h>
				26	#include <linux/iocontext.h>
				27	#include <linux/capability.h>
				28	#include <linux/ratelimit.h>
				29	#include <linux/kthread.h>
				30	#include <linux/raid/pq.h>
				31	#include <linux/hash.h>
				32	#include <linux/list_sort.h>
				33	#include <linux/raid/xor.h>
Geert Uytterhoeven	d7011f5	2013-03-03 04:44:41 -0700	[diff] [blame]	34	#include <linux/vmalloc.h>
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	35	#include <asm/div64.h>
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	36	#include "ctree.h"
				37	#include "extent_map.h"
				38	#include "disk-io.h"
				39	#include "transaction.h"
				40	#include "print-tree.h"
				41	#include "volumes.h"
				42	#include "raid56.h"
				43	#include "async-thread.h"
				44	#include "check-integrity.h"
				45	#include "rcu-string.h"
				46
				47	/* set when additional merges to this rbio are not allowed */
				48	#define RBIO_RMW_LOCKED_BIT 1
				49
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	50	/*
				51	* set when this rbio is sitting in the hash, but it is just a cache
				52	* of past RMW
				53	*/
				54	#define RBIO_CACHE_BIT 2
				55
				56	/*
				57	* set when it is safe to trust the stripe_pages for caching
				58	*/
				59	#define RBIO_CACHE_READY_BIT 3
				60
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	61	/*
				62	* bbio and raid_map is managed by the caller, so we shouldn't free
				63	* them here. And besides that, all rbios with this flag should not
				64	* be cached, because we need raid_map to check the rbios' stripe
				65	* is the same or not, but it is very likely that the caller has
				66	* free raid_map, so don't cache those rbios.
				67	*/
				68	#define RBIO_HOLD_BBIO_MAP_BIT 4
				69
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	70	#define RBIO_CACHE_SIZE 1024
				71
/*
 * The kind of work an rbio carries out.  unlock_stripe() uses this to
 * pick the async helper that starts the next waiting rbio.
 */
enum btrfs_rbio_ops {
	BTRFS_RBIO_WRITE		= 0,
	BTRFS_RBIO_READ_REBUILD		= 1,
	BTRFS_RBIO_PARITY_SCRUB		= 2,
};
				77
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	78	struct btrfs_raid_bio {
				79	struct btrfs_fs_info *fs_info;
				80	struct btrfs_bio *bbio;
				81
				82	/*
				83	* logical block numbers for the start of each stripe
				84	* The last one or two are p/q. These are sorted,
				85	* so raid_map[0] is the start of our full stripe
				86	*/
				87	u64 *raid_map;
				88
				89	/* while we're doing rmw on a stripe
				90	* we put it into a hash table so we can
				91	* lock the stripe and merge more rbios
				92	* into it.
				93	*/
				94	struct list_head hash_list;
				95
				96	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	97	* LRU list for the stripe cache
				98	*/
				99	struct list_head stripe_cache;
				100
				101	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	102	* for scheduling work in the helper threads
				103	*/
				104	struct btrfs_work work;
				105
				106	/*
				107	* bio list and bio_list_lock are used
				108	* to add more bios into the stripe
				109	* in hopes of avoiding the full rmw
				110	*/
				111	struct bio_list bio_list;
				112	spinlock_t bio_list_lock;
				113
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	114	/* also protected by the bio_list_lock, the
				115	* plug list is used by the plugging code
				116	* to collect partial bios while plugged. The
				117	* stripe locking code also uses it to hand off
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	118	* the stripe lock to the next pending IO
				119	*/
				120	struct list_head plug_list;
				121
				122	/*
				123	* flags that tell us if it is safe to
				124	* merge with this bio
				125	*/
				126	unsigned long flags;
				127
				128	/* size of each individual stripe on disk */
				129	int stripe_len;
				130
				131	/* number of data stripes (no p/q) */
				132	int nr_data;
				133
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame^]	134	int stripe_npages;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	135	/*
				136	* set if we're doing a parity rebuild
				137	* for a read from higher up, which is handled
				138	* differently from a parity rebuild as part of
				139	* rmw
				140	*/
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	141	enum btrfs_rbio_ops operation;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	142
				143	/* first bad stripe */
				144	int faila;
				145
				146	/* second bad stripe (for raid6 use) */
				147	int failb;
				148
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame^]	149	int scrubp;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	150	/*
				151	* number of pages needed to represent the full
				152	* stripe
				153	*/
				154	int nr_pages;
				155
				156	/*
				157	* size of all the bios in the bio_list. This
				158	* helps us decide if the rbio maps to a full
				159	* stripe or not
				160	*/
				161	int bio_list_bytes;
				162
				163	atomic_t refs;
				164
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	165	atomic_t stripes_pending;
				166
				167	atomic_t error;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	168	/*
				169	* these are two arrays of pointers. We allocate the
				170	* rbio big enough to hold them both and setup their
				171	* locations when the rbio is allocated
				172	*/
				173
				174	/* pointers to pages that we allocated for
				175	* reading/writing stripes directly from the disk (including P/Q)
				176	*/
				177	struct page **stripe_pages;
				178
				179	/*
				180	* pointers to the pages in the bio_list. Stored
				181	* here for faster lookup
				182	*/
				183	struct page **bio_pages;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame^]	184
				185	/*
				186	* bitmap to record which horizontal stripe has data
				187	*/
				188	unsigned long *dbitmap;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	189	};
				190
				191	static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
				192	static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
				193	static void rmw_work(struct btrfs_work *work);
				194	static void read_rebuild_work(struct btrfs_work *work);
				195	static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
				196	static void async_read_rebuild(struct btrfs_raid_bio *rbio);
				197	static int fail_bio_stripe(struct btrfs_raid_bio rbio, struct bio bio);
				198	static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
				199	static void __free_raid_bio(struct btrfs_raid_bio *rbio);
				200	static void index_rbio_pages(struct btrfs_raid_bio *rbio);
				201	static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
				202
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame^]	203	static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
				204	int need_check);
				205	static void async_scrub_parity(struct btrfs_raid_bio *rbio);
				206
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	207	/*
				208	* the stripe hash table is used for locking, and to collect
				209	* bios in hopes of making a full stripe
				210	*/
				211	int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
				212	{
				213	struct btrfs_stripe_hash_table *table;
				214	struct btrfs_stripe_hash_table *x;
				215	struct btrfs_stripe_hash *cur;
				216	struct btrfs_stripe_hash *h;
				217	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
				218	int i;
David Sterba	83c8266	2013-03-01 15:03:00 +0000	[diff] [blame]	219	int table_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	220
				221	if (info->stripe_hash_table)
				222	return 0;
				223
David Sterba	83c8266	2013-03-01 15:03:00 +0000	[diff] [blame]	224	/*
				225	* The table is large, starting with order 4 and can go as high as
				226	* order 7 in case lock debugging is turned on.
				227	*
				228	* Try harder to allocate and fallback to vmalloc to lower the chance
				229	* of a failing mount.
				230	*/
				231	table_size = sizeof(table) + sizeof(h) * num_entries;
				232	table = kzalloc(table_size, GFP_KERNEL \| __GFP_NOWARN \| __GFP_REPEAT);
				233	if (!table) {
				234	table = vzalloc(table_size);
				235	if (!table)
				236	return -ENOMEM;
				237	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	238
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	239	spin_lock_init(&table->cache_lock);
				240	INIT_LIST_HEAD(&table->stripe_cache);
				241
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	242	h = table->table;
				243
				244	for (i = 0; i < num_entries; i++) {
				245	cur = h + i;
				246	INIT_LIST_HEAD(&cur->hash_list);
				247	spin_lock_init(&cur->lock);
				248	init_waitqueue_head(&cur->wait);
				249	}
				250
				251	x = cmpxchg(&info->stripe_hash_table, NULL, table);
David Sterba	83c8266	2013-03-01 15:03:00 +0000	[diff] [blame]	252	if (x) {
				253	if (is_vmalloc_addr(x))
				254	vfree(x);
				255	else
				256	kfree(x);
				257	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	258	return 0;
				259	}
				260
				261	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	262	* caching an rbio means to copy anything from the
				263	* bio_pages array into the stripe_pages array. We
				264	* use the page uptodate bit in the stripe cache array
				265	* to indicate if it has valid data
				266	*
				267	* once the caching is done, we set the cache ready
				268	* bit.
				269	*/
				270	static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
				271	{
				272	int i;
				273	char *s;
				274	char *d;
				275	int ret;
				276
				277	ret = alloc_rbio_pages(rbio);
				278	if (ret)
				279	return;
				280
				281	for (i = 0; i < rbio->nr_pages; i++) {
				282	if (!rbio->bio_pages[i])
				283	continue;
				284
				285	s = kmap(rbio->bio_pages[i]);
				286	d = kmap(rbio->stripe_pages[i]);
				287
				288	memcpy(d, s, PAGE_CACHE_SIZE);
				289
				290	kunmap(rbio->bio_pages[i]);
				291	kunmap(rbio->stripe_pages[i]);
				292	SetPageUptodate(rbio->stripe_pages[i]);
				293	}
				294	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				295	}
				296
				297	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	298	* we hash on the first logical address of the stripe
				299	*/
				300	static int rbio_bucket(struct btrfs_raid_bio *rbio)
				301	{
				302	u64 num = rbio->raid_map[0];
				303
				304	/*
				305	* we shift down quite a bit. We're using byte
				306	* addressing, and most of the lower bits are zeros.
				307	* This tends to upset hash_64, and it consistently
				308	* returns just one or two different values.
				309	*
				310	* shifting off the lower bits fixes things.
				311	*/
				312	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
				313	}
				314
				315	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	316	* stealing an rbio means taking all the uptodate pages from the stripe
				317	* array in the source rbio and putting them into the destination rbio
				318	*/
				319	static void steal_rbio(struct btrfs_raid_bio src, struct btrfs_raid_bio dest)
				320	{
				321	int i;
				322	struct page *s;
				323	struct page *d;
				324
				325	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
				326	return;
				327
				328	for (i = 0; i < dest->nr_pages; i++) {
				329	s = src->stripe_pages[i];
				330	if (!s \|\| !PageUptodate(s)) {
				331	continue;
				332	}
				333
				334	d = dest->stripe_pages[i];
				335	if (d)
				336	__free_page(d);
				337
				338	dest->stripe_pages[i] = s;
				339	src->stripe_pages[i] = NULL;
				340	}
				341	}
				342
				343	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	344	* merging means we take the bio_list from the victim and
				345	* splice it into the destination. The victim should
				346	* be discarded afterwards.
				347	*
				348	* must be called with dest->rbio_list_lock held
				349	*/
				350	static void merge_rbio(struct btrfs_raid_bio *dest,
				351	struct btrfs_raid_bio *victim)
				352	{
				353	bio_list_merge(&dest->bio_list, &victim->bio_list);
				354	dest->bio_list_bytes += victim->bio_list_bytes;
				355	bio_list_init(&victim->bio_list);
				356	}
				357
				358	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	359	* used to prune items that are in the cache. The caller
				360	* must hold the hash table lock.
				361	*/
				362	static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
				363	{
				364	int bucket = rbio_bucket(rbio);
				365	struct btrfs_stripe_hash_table *table;
				366	struct btrfs_stripe_hash *h;
				367	int freeit = 0;
				368
				369	/*
				370	* check the bit again under the hash table lock.
				371	*/
				372	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
				373	return;
				374
				375	table = rbio->fs_info->stripe_hash_table;
				376	h = table->table + bucket;
				377
				378	/* hold the lock for the bucket because we may be
				379	* removing it from the hash table
				380	*/
				381	spin_lock(&h->lock);
				382
				383	/*
				384	* hold the lock for the bio list because we need
				385	* to make sure the bio list is empty
				386	*/
				387	spin_lock(&rbio->bio_list_lock);
				388
				389	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
				390	list_del_init(&rbio->stripe_cache);
				391	table->cache_size -= 1;
				392	freeit = 1;
				393
				394	/* if the bio list isn't empty, this rbio is
				395	* still involved in an IO. We take it out
				396	* of the cache list, and drop the ref that
				397	* was held for the list.
				398	*
				399	* If the bio_list was empty, we also remove
				400	* the rbio from the hash_table, and drop
				401	* the corresponding ref
				402	*/
				403	if (bio_list_empty(&rbio->bio_list)) {
				404	if (!list_empty(&rbio->hash_list)) {
				405	list_del_init(&rbio->hash_list);
				406	atomic_dec(&rbio->refs);
				407	BUG_ON(!list_empty(&rbio->plug_list));
				408	}
				409	}
				410	}
				411
				412	spin_unlock(&rbio->bio_list_lock);
				413	spin_unlock(&h->lock);
				414
				415	if (freeit)
				416	__free_raid_bio(rbio);
				417	}
				418
				419	/*
				420	* prune a given rbio from the cache
				421	*/
				422	static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
				423	{
				424	struct btrfs_stripe_hash_table *table;
				425	unsigned long flags;
				426
				427	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
				428	return;
				429
				430	table = rbio->fs_info->stripe_hash_table;
				431
				432	spin_lock_irqsave(&table->cache_lock, flags);
				433	__remove_rbio_from_cache(rbio);
				434	spin_unlock_irqrestore(&table->cache_lock, flags);
				435	}
				436
				437	/*
				438	* remove everything in the cache
				439	*/
Eric Sandeen	48a3b63	2013-04-25 20:41:01 +0000	[diff] [blame]	440	static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	441	{
				442	struct btrfs_stripe_hash_table *table;
				443	unsigned long flags;
				444	struct btrfs_raid_bio *rbio;
				445
				446	table = info->stripe_hash_table;
				447
				448	spin_lock_irqsave(&table->cache_lock, flags);
				449	while (!list_empty(&table->stripe_cache)) {
				450	rbio = list_entry(table->stripe_cache.next,
				451	struct btrfs_raid_bio,
				452	stripe_cache);
				453	__remove_rbio_from_cache(rbio);
				454	}
				455	spin_unlock_irqrestore(&table->cache_lock, flags);
				456	}
				457
				458	/*
				459	* remove all cached entries and free the hash table
				460	* used by unmount
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	461	*/
				462	void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
				463	{
				464	if (!info->stripe_hash_table)
				465	return;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	466	btrfs_clear_rbio_cache(info);
David Sterba	83c8266	2013-03-01 15:03:00 +0000	[diff] [blame]	467	if (is_vmalloc_addr(info->stripe_hash_table))
				468	vfree(info->stripe_hash_table);
				469	else
				470	kfree(info->stripe_hash_table);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	471	info->stripe_hash_table = NULL;
				472	}
				473
				474	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	475	* insert an rbio into the stripe cache. It
				476	* must have already been prepared by calling
				477	* cache_rbio_pages
				478	*
				479	* If this rbio was already cached, it gets
				480	* moved to the front of the lru.
				481	*
				482	* If the size of the rbio cache is too big, we
				483	* prune an item.
				484	*/
				485	static void cache_rbio(struct btrfs_raid_bio *rbio)
				486	{
				487	struct btrfs_stripe_hash_table *table;
				488	unsigned long flags;
				489
				490	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
				491	return;
				492
				493	table = rbio->fs_info->stripe_hash_table;
				494
				495	spin_lock_irqsave(&table->cache_lock, flags);
				496	spin_lock(&rbio->bio_list_lock);
				497
				498	/* bump our ref if we were not in the list before */
				499	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
				500	atomic_inc(&rbio->refs);
				501
				502	if (!list_empty(&rbio->stripe_cache)){
				503	list_move(&rbio->stripe_cache, &table->stripe_cache);
				504	} else {
				505	list_add(&rbio->stripe_cache, &table->stripe_cache);
				506	table->cache_size += 1;
				507	}
				508
				509	spin_unlock(&rbio->bio_list_lock);
				510
				511	if (table->cache_size > RBIO_CACHE_SIZE) {
				512	struct btrfs_raid_bio *found;
				513
				514	found = list_entry(table->stripe_cache.prev,
				515	struct btrfs_raid_bio,
				516	stripe_cache);
				517
				518	if (found != rbio)
				519	__remove_rbio_from_cache(found);
				520	}
				521
				522	spin_unlock_irqrestore(&table->cache_lock, flags);
				523	return;
				524	}
				525
				526	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	527	* helper function to run the xor_blocks api. It is only
				528	* able to do MAX_XOR_BLOCKS at a time, so we need to
				529	* loop through.
				530	*/
				531	static void run_xor(void **pages, int src_cnt, ssize_t len)
				532	{
				533	int src_off = 0;
				534	int xor_src_cnt = 0;
				535	void *dest = pages[src_cnt];
				536
				537	while(src_cnt > 0) {
				538	xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
				539	xor_blocks(xor_src_cnt, len, dest, pages + src_off);
				540
				541	src_cnt -= xor_src_cnt;
				542	src_off += xor_src_cnt;
				543	}
				544	}
				545
				546	/*
				547	* returns true if the bio list inside this rbio
				548	* covers an entire stripe (no rmw required).
				549	* Must be called with the bio list lock held, or
				550	* at a time when you know it is impossible to add
				551	* new bios into the list
				552	*/
				553	static int __rbio_is_full(struct btrfs_raid_bio *rbio)
				554	{
				555	unsigned long size = rbio->bio_list_bytes;
				556	int ret = 1;
				557
				558	if (size != rbio->nr_data * rbio->stripe_len)
				559	ret = 0;
				560
				561	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
				562	return ret;
				563	}
				564
				565	static int rbio_is_full(struct btrfs_raid_bio *rbio)
				566	{
				567	unsigned long flags;
				568	int ret;
				569
				570	spin_lock_irqsave(&rbio->bio_list_lock, flags);
				571	ret = __rbio_is_full(rbio);
				572	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
				573	return ret;
				574	}
				575
				576	/*
				577	* returns 1 if it is safe to merge two rbios together.
				578	* The merging is safe if the two rbios correspond to
				579	* the same stripe and if they are both going in the same
				580	* direction (read vs write), and if neither one is
				581	* locked for final IO
				582	*
				583	* The caller is responsible for locking such that
				584	* rmw_locked is safe to test
				585	*/
				586	static int rbio_can_merge(struct btrfs_raid_bio *last,
				587	struct btrfs_raid_bio *cur)
				588	{
				589	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) \|\|
				590	test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
				591	return 0;
				592
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	593	/*
				594	* we can't merge with cached rbios, since the
				595	* idea is that when we merge the destination
				596	* rbio is going to run our IO for us. We can
				597	* steal from cached rbio's though, other functions
				598	* handle that.
				599	*/
				600	if (test_bit(RBIO_CACHE_BIT, &last->flags) \|\|
				601	test_bit(RBIO_CACHE_BIT, &cur->flags))
				602	return 0;
				603
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	604	if (last->raid_map[0] !=
				605	cur->raid_map[0])
				606	return 0;
				607
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame^]	608	/* we can't merge with different operations */
				609	if (last->operation != cur->operation)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	610	return 0;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame^]	611	/*
				612	* We've need read the full stripe from the drive.
				613	* check and repair the parity and write the new results.
				614	*
				615	* We're not allowed to add any new bios to the
				616	* bio list here, anyone else that wants to
				617	* change this stripe needs to do their own rmw.
				618	*/
				619	if (last->operation == BTRFS_RBIO_PARITY_SCRUB \|\|
				620	cur->operation == BTRFS_RBIO_PARITY_SCRUB)
				621	return 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	622
				623	return 1;
				624	}
				625
				626	/*
				627	* helper to index into the pstripe
				628	*/
				629	static struct page rbio_pstripe_page(struct btrfs_raid_bio rbio, int index)
				630	{
				631	index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
				632	return rbio->stripe_pages[index];
				633	}
				634
				635	/*
				636	* helper to index into the qstripe, returns null
				637	* if there is no qstripe
				638	*/
				639	static struct page rbio_qstripe_page(struct btrfs_raid_bio rbio, int index)
				640	{
				641	if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
				642	return NULL;
				643
				644	index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
				645	PAGE_CACHE_SHIFT;
				646	return rbio->stripe_pages[index];
				647	}
				648
				649	/*
				650	* The first stripe in the table for a logical address
				651	* has the lock. rbios are added in one of three ways:
				652	*
				653	* 1) Nobody has the stripe locked yet. The rbio is given
				654	* the lock and 0 is returned. The caller must start the IO
				655	* themselves.
				656	*
				657	* 2) Someone has the stripe locked, but we're able to merge
				658	* with the lock owner. The rbio is freed and the IO will
				659	* start automatically along with the existing rbio. 1 is returned.
				660	*
				661	* 3) Someone has the stripe locked, but we're not able to merge.
				662	* The rbio is added to the lock owner's plug list, or merged into
				663	* an rbio already on the plug list. When the lock owner unlocks,
				664	* the next rbio on the list is run and the IO is started automatically.
				665	* 1 is returned
				666	*
				667	* If we return 0, the caller still owns the rbio and must continue with
				668	* IO submission. If we return 1, the caller must assume the rbio has
				669	* already been freed.
				670	*/
				671	static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
				672	{
				673	int bucket = rbio_bucket(rbio);
				674	struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
				675	struct btrfs_raid_bio *cur;
				676	struct btrfs_raid_bio *pending;
				677	unsigned long flags;
				678	DEFINE_WAIT(wait);
				679	struct btrfs_raid_bio *freeit = NULL;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	680	struct btrfs_raid_bio *cache_drop = NULL;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	681	int ret = 0;
				682	int walk = 0;
				683
				684	spin_lock_irqsave(&h->lock, flags);
				685	list_for_each_entry(cur, &h->hash_list, hash_list) {
				686	walk++;
				687	if (cur->raid_map[0] == rbio->raid_map[0]) {
				688	spin_lock(&cur->bio_list_lock);
				689
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	690	/* can we steal this cached rbio's pages? */
				691	if (bio_list_empty(&cur->bio_list) &&
				692	list_empty(&cur->plug_list) &&
				693	test_bit(RBIO_CACHE_BIT, &cur->flags) &&
				694	!test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
				695	list_del_init(&cur->hash_list);
				696	atomic_dec(&cur->refs);
				697
				698	steal_rbio(cur, rbio);
				699	cache_drop = cur;
				700	spin_unlock(&cur->bio_list_lock);
				701
				702	goto lockit;
				703	}
				704
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	705	/* can we merge into the lock owner? */
				706	if (rbio_can_merge(cur, rbio)) {
				707	merge_rbio(cur, rbio);
				708	spin_unlock(&cur->bio_list_lock);
				709	freeit = rbio;
				710	ret = 1;
				711	goto out;
				712	}
				713
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	714
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	715	/*
				716	* we couldn't merge with the running
				717	* rbio, see if we can merge with the
				718	* pending ones. We don't have to
				719	* check for rmw_locked because there
				720	* is no way they are inside finish_rmw
				721	* right now
				722	*/
				723	list_for_each_entry(pending, &cur->plug_list,
				724	plug_list) {
				725	if (rbio_can_merge(pending, rbio)) {
				726	merge_rbio(pending, rbio);
				727	spin_unlock(&cur->bio_list_lock);
				728	freeit = rbio;
				729	ret = 1;
				730	goto out;
				731	}
				732	}
				733
				734	/* no merging, put us on the tail of the plug list,
				735	* our rbio will be started with the currently
				736	* running rbio unlocks
				737	*/
				738	list_add_tail(&rbio->plug_list, &cur->plug_list);
				739	spin_unlock(&cur->bio_list_lock);
				740	ret = 1;
				741	goto out;
				742	}
				743	}
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	744	lockit:
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	745	atomic_inc(&rbio->refs);
				746	list_add(&rbio->hash_list, &h->hash_list);
				747	out:
				748	spin_unlock_irqrestore(&h->lock, flags);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	749	if (cache_drop)
				750	remove_rbio_from_cache(cache_drop);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	751	if (freeit)
				752	__free_raid_bio(freeit);
				753	return ret;
				754	}
				755
				756	/*
				757	* called as rmw or parity rebuild is completed. If the plug list has more
				758	* rbios waiting for this stripe, the next one on the list will be started
				759	*/
				760	static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
				761	{
				762	int bucket;
				763	struct btrfs_stripe_hash *h;
				764	unsigned long flags;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	765	int keep_cache = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	766
				767	bucket = rbio_bucket(rbio);
				768	h = rbio->fs_info->stripe_hash_table->table + bucket;
				769
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	770	if (list_empty(&rbio->plug_list))
				771	cache_rbio(rbio);
				772
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	773	spin_lock_irqsave(&h->lock, flags);
				774	spin_lock(&rbio->bio_list_lock);
				775
				776	if (!list_empty(&rbio->hash_list)) {
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	777	/*
				778	* if we're still cached and there is no other IO
				779	* to perform, just leave this rbio here for others
				780	* to steal from later
				781	*/
				782	if (list_empty(&rbio->plug_list) &&
				783	test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
				784	keep_cache = 1;
				785	clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				786	BUG_ON(!bio_list_empty(&rbio->bio_list));
				787	goto done;
				788	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	789
				790	list_del_init(&rbio->hash_list);
				791	atomic_dec(&rbio->refs);
				792
				793	/*
				794	* we use the plug list to hold all the rbios
				795	* waiting for the chance to lock this stripe.
				796	* hand the lock over to one of them.
				797	*/
				798	if (!list_empty(&rbio->plug_list)) {
				799	struct btrfs_raid_bio *next;
				800	struct list_head *head = rbio->plug_list.next;
				801
				802	next = list_entry(head, struct btrfs_raid_bio,
				803	plug_list);
				804
				805	list_del_init(&rbio->plug_list);
				806
				807	list_add(&next->hash_list, &h->hash_list);
				808	atomic_inc(&next->refs);
				809	spin_unlock(&rbio->bio_list_lock);
				810	spin_unlock_irqrestore(&h->lock, flags);
				811
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	812	if (next->operation == BTRFS_RBIO_READ_REBUILD)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	813	async_read_rebuild(next);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame^]	814	else if (next->operation == BTRFS_RBIO_WRITE) {
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	815	steal_rbio(rbio, next);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	816	async_rmw_stripe(next);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame^]	817	} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				818	steal_rbio(rbio, next);
				819	async_scrub_parity(next);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	820	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	821
				822	goto done_nolock;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	823	} else if (waitqueue_active(&h->wait)) {
				824	spin_unlock(&rbio->bio_list_lock);
				825	spin_unlock_irqrestore(&h->lock, flags);
				826	wake_up(&h->wait);
				827	goto done_nolock;
				828	}
				829	}
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	830	done:
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	831	spin_unlock(&rbio->bio_list_lock);
				832	spin_unlock_irqrestore(&h->lock, flags);
				833
				834	done_nolock:
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	835	if (!keep_cache)
				836	remove_rbio_from_cache(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	837	}
				838
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	839	static inline void
				840	__free_bbio_and_raid_map(struct btrfs_bio bbio, u64 raid_map, int need)
				841	{
				842	if (need) {
				843	kfree(raid_map);
				844	kfree(bbio);
				845	}
				846	}
				847
				848	static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio)
				849	{
				850	__free_bbio_and_raid_map(rbio->bbio, rbio->raid_map,
				851	!test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags));
				852	}
				853
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	854	static void __free_raid_bio(struct btrfs_raid_bio *rbio)
				855	{
				856	int i;
				857
				858	WARN_ON(atomic_read(&rbio->refs) < 0);
				859	if (!atomic_dec_and_test(&rbio->refs))
				860	return;
				861
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	862	WARN_ON(!list_empty(&rbio->stripe_cache));
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	863	WARN_ON(!list_empty(&rbio->hash_list));
				864	WARN_ON(!bio_list_empty(&rbio->bio_list));
				865
				866	for (i = 0; i < rbio->nr_pages; i++) {
				867	if (rbio->stripe_pages[i]) {
				868	__free_page(rbio->stripe_pages[i]);
				869	rbio->stripe_pages[i] = NULL;
				870	}
				871	}
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	872
				873	free_bbio_and_raid_map(rbio);
				874
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	875	kfree(rbio);
				876	}
				877
				878	static void free_raid_bio(struct btrfs_raid_bio *rbio)
				879	{
				880	unlock_stripe(rbio);
				881	__free_raid_bio(rbio);
				882	}
				883
				884	/*
				885	* this frees the rbio and runs through all the bios in the
				886	* bio_list and calls end_io on them
				887	*/
				888	static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
				889	{
				890	struct bio *cur = bio_list_get(&rbio->bio_list);
				891	struct bio *next;
				892	free_raid_bio(rbio);
				893
				894	while (cur) {
				895	next = cur->bi_next;
				896	cur->bi_next = NULL;
				897	if (uptodate)
				898	set_bit(BIO_UPTODATE, &cur->bi_flags);
				899	bio_endio(cur, err);
				900	cur = next;
				901	}
				902	}
				903
				904	/*
				905	* end io function used by finish_rmw. When we finally
				906	* get here, we've written a full stripe
				907	*/
				908	static void raid_write_end_io(struct bio *bio, int err)
				909	{
				910	struct btrfs_raid_bio *rbio = bio->bi_private;
				911
				912	if (err)
				913	fail_bio_stripe(rbio, bio);
				914
				915	bio_put(bio);
				916
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	917	if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	918	return;
				919
				920	err = 0;
				921
				922	/* OK, we have read all the stripes we need to. */
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	923	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	924	err = -EIO;
				925
				926	rbio_orig_end_io(rbio, err, 0);
				927	return;
				928	}
				929
				930	/*
				931	* the read/modify/write code wants to use the original bio for
				932	* any pages it included, and then use the rbio for everything
				933	* else. This function decides if a given index (stripe number)
				934	* and page number in that stripe fall inside the original bio
				935	* or the rbio.
				936	*
				937	* if you set bio_list_only, you'll get a NULL back for any ranges
				938	* that are outside the bio_list
				939	*
				940	* This doesn't take any refs on anything, you get a bare page pointer
				941	* and the caller must bump refs as required.
				942	*
				943	* You must call index_rbio_pages once before you can trust
				944	* the answers from this function.
				945	*/
				946	static struct page page_in_rbio(struct btrfs_raid_bio rbio,
				947	int index, int pagenr, int bio_list_only)
				948	{
				949	int chunk_page;
				950	struct page *p = NULL;
				951
				952	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
				953
				954	spin_lock_irq(&rbio->bio_list_lock);
				955	p = rbio->bio_pages[chunk_page];
				956	spin_unlock_irq(&rbio->bio_list_lock);
				957
				958	if (p \|\| bio_list_only)
				959	return p;
				960
				961	return rbio->stripe_pages[chunk_page];
				962	}
				963
				964	/*
				965	* number of pages we need for the entire stripe across all the
				966	* drives
				967	*/
				968	static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
				969	{
				970	unsigned long nr = stripe_len * nr_stripes;
David Sterba	ed6078f	2014-06-05 01:59:57 +0200	[diff] [blame]	971	return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	972	}
				973
				974	/*
				975	* allocation and initial setup for the btrfs_raid_bio. Not
				976	* this does not allocate any pages for rbio->pages.
				977	*/
				978	static struct btrfs_raid_bio alloc_rbio(struct btrfs_root root,
				979	struct btrfs_bio bbio, u64 raid_map,
				980	u64 stripe_len)
				981	{
				982	struct btrfs_raid_bio *rbio;
				983	int nr_data = 0;
				984	int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame^]	985	int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	986	void *p;
				987
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame^]	988	rbio = kzalloc(sizeof(rbio) + num_pages sizeof(struct page ) 2 +
				989	DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8),
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	990	GFP_NOFS);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	991	if (!rbio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	992	return ERR_PTR(-ENOMEM);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	993
				994	bio_list_init(&rbio->bio_list);
				995	INIT_LIST_HEAD(&rbio->plug_list);
				996	spin_lock_init(&rbio->bio_list_lock);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	997	INIT_LIST_HEAD(&rbio->stripe_cache);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	998	INIT_LIST_HEAD(&rbio->hash_list);
				999	rbio->bbio = bbio;
				1000	rbio->raid_map = raid_map;
				1001	rbio->fs_info = root->fs_info;
				1002	rbio->stripe_len = stripe_len;
				1003	rbio->nr_pages = num_pages;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame^]	1004	rbio->stripe_npages = stripe_npages;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1005	rbio->faila = -1;
				1006	rbio->failb = -1;
				1007	atomic_set(&rbio->refs, 1);
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1008	atomic_set(&rbio->error, 0);
				1009	atomic_set(&rbio->stripes_pending, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1010
				1011	/*
				1012	* the stripe_pages and bio_pages array point to the extra
				1013	* memory we allocated past the end of the rbio
				1014	*/
				1015	p = rbio + 1;
				1016	rbio->stripe_pages = p;
				1017	rbio->bio_pages = p + sizeof(struct page ) num_pages;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame^]	1018	rbio->dbitmap = p + sizeof(struct page ) num_pages * 2;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1019
				1020	if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
				1021	nr_data = bbio->num_stripes - 2;
				1022	else
				1023	nr_data = bbio->num_stripes - 1;
				1024
				1025	rbio->nr_data = nr_data;
				1026	return rbio;
				1027	}
				1028
				1029	/* allocate pages for all the stripes in the bio, including parity */
				1030	static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
				1031	{
				1032	int i;
				1033	struct page *page;
				1034
				1035	for (i = 0; i < rbio->nr_pages; i++) {
				1036	if (rbio->stripe_pages[i])
				1037	continue;
				1038	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				1039	if (!page)
				1040	return -ENOMEM;
				1041	rbio->stripe_pages[i] = page;
				1042	ClearPageUptodate(page);
				1043	}
				1044	return 0;
				1045	}
				1046
				1047	/* allocate pages for just the p/q stripes */
				1048	static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
				1049	{
				1050	int i;
				1051	struct page *page;
				1052
				1053	i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
				1054
				1055	for (; i < rbio->nr_pages; i++) {
				1056	if (rbio->stripe_pages[i])
				1057	continue;
				1058	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				1059	if (!page)
				1060	return -ENOMEM;
				1061	rbio->stripe_pages[i] = page;
				1062	}
				1063	return 0;
				1064	}
				1065
				1066	/*
				1067	* add a single page from a specific stripe into our list of bios for IO
				1068	* this will try to merge into existing bios if possible, and returns
				1069	* zero if all went well.
				1070	*/
Eric Sandeen	48a3b63	2013-04-25 20:41:01 +0000	[diff] [blame]	1071	static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
				1072	struct bio_list *bio_list,
				1073	struct page *page,
				1074	int stripe_nr,
				1075	unsigned long page_index,
				1076	unsigned long bio_max_len)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1077	{
				1078	struct bio *last = bio_list->tail;
				1079	u64 last_end = 0;
				1080	int ret;
				1081	struct bio *bio;
				1082	struct btrfs_bio_stripe *stripe;
				1083	u64 disk_start;
				1084
				1085	stripe = &rbio->bbio->stripes[stripe_nr];
				1086	disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
				1087
				1088	/* if the device is missing, just fail this stripe */
				1089	if (!stripe->dev->bdev)
				1090	return fail_rbio_index(rbio, stripe_nr);
				1091
				1092	/* see if we can add this page onto our existing bio */
				1093	if (last) {
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1094	last_end = (u64)last->bi_iter.bi_sector << 9;
				1095	last_end += last->bi_iter.bi_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1096
				1097	/*
				1098	* we can't merge these if they are from different
				1099	* devices or if they are not contiguous
				1100	*/
				1101	if (last_end == disk_start && stripe->dev->bdev &&
				1102	test_bit(BIO_UPTODATE, &last->bi_flags) &&
				1103	last->bi_bdev == stripe->dev->bdev) {
				1104	ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
				1105	if (ret == PAGE_CACHE_SIZE)
				1106	return 0;
				1107	}
				1108	}
				1109
				1110	/* put a new bio on the list */
Chris Mason	9be3395	2013-05-17 18:30:14 -0400	[diff] [blame]	1111	bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1112	if (!bio)
				1113	return -ENOMEM;
				1114
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1115	bio->bi_iter.bi_size = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1116	bio->bi_bdev = stripe->dev->bdev;
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1117	bio->bi_iter.bi_sector = disk_start >> 9;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1118	set_bit(BIO_UPTODATE, &bio->bi_flags);
				1119
				1120	bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
				1121	bio_list_add(bio_list, bio);
				1122	return 0;
				1123	}
				1124
				1125	/*
				1126	* while we're doing the read/modify/write cycle, we could
				1127	* have errors in reading pages off the disk. This checks
				1128	* for errors and if we're not able to read the page it'll
				1129	* trigger parity reconstruction. The rmw will be finished
				1130	* after we've reconstructed the failed stripes
				1131	*/
				1132	static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
				1133	{
				1134	if (rbio->faila >= 0 \|\| rbio->failb >= 0) {
				1135	BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
				1136	__raid56_parity_recover(rbio);
				1137	} else {
				1138	finish_rmw(rbio);
				1139	}
				1140	}
				1141
				1142	/*
				1143	* these are just the pages from the rbio array, not from anything
				1144	* the FS sent down to us
				1145	*/
				1146	static struct page rbio_stripe_page(struct btrfs_raid_bio rbio, int stripe, int page)
				1147	{
				1148	int index;
				1149	index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
				1150	index += page;
				1151	return rbio->stripe_pages[index];
				1152	}
				1153
				1154	/*
				1155	* helper function to walk our bio list and populate the bio_pages array with
				1156	* the result. This seems expensive, but it is faster than constantly
				1157	* searching through the bio list as we setup the IO in finish_rmw or stripe
				1158	* reconstruction.
				1159	*
				1160	* This must be called before you trust the answers from page_in_rbio
				1161	*/
				1162	static void index_rbio_pages(struct btrfs_raid_bio *rbio)
				1163	{
				1164	struct bio *bio;
				1165	u64 start;
				1166	unsigned long stripe_offset;
				1167	unsigned long page_index;
				1168	struct page *p;
				1169	int i;
				1170
				1171	spin_lock_irq(&rbio->bio_list_lock);
				1172	bio_list_for_each(bio, &rbio->bio_list) {
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1173	start = (u64)bio->bi_iter.bi_sector << 9;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1174	stripe_offset = start - rbio->raid_map[0];
				1175	page_index = stripe_offset >> PAGE_CACHE_SHIFT;
				1176
				1177	for (i = 0; i < bio->bi_vcnt; i++) {
				1178	p = bio->bi_io_vec[i].bv_page;
				1179	rbio->bio_pages[page_index + i] = p;
				1180	}
				1181	}
				1182	spin_unlock_irq(&rbio->bio_list_lock);
				1183	}
				1184
				1185	/*
				1186	* this is called from one of two situations. We either
				1187	* have a full stripe from the higher layers, or we've read all
				1188	* the missing bits off disk.
				1189	*
				1190	* This will calculate the parity and then send down any
				1191	* changed blocks.
				1192	*/
				1193	static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
				1194	{
				1195	struct btrfs_bio *bbio = rbio->bbio;
				1196	void *pointers[bbio->num_stripes];
				1197	int stripe_len = rbio->stripe_len;
				1198	int nr_data = rbio->nr_data;
				1199	int stripe;
				1200	int pagenr;
				1201	int p_stripe = -1;
				1202	int q_stripe = -1;
				1203	struct bio_list bio_list;
				1204	struct bio *bio;
				1205	int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
				1206	int ret;
				1207
				1208	bio_list_init(&bio_list);
				1209
				1210	if (bbio->num_stripes - rbio->nr_data == 1) {
				1211	p_stripe = bbio->num_stripes - 1;
				1212	} else if (bbio->num_stripes - rbio->nr_data == 2) {
				1213	p_stripe = bbio->num_stripes - 2;
				1214	q_stripe = bbio->num_stripes - 1;
				1215	} else {
				1216	BUG();
				1217	}
				1218
				1219	/* at this point we either have a full stripe,
				1220	* or we've read the full stripe from the drive.
				1221	* recalculate the parity and write the new results.
				1222	*
				1223	* We're not allowed to add any new bios to the
				1224	* bio list here, anyone else that wants to
				1225	* change this stripe needs to do their own rmw.
				1226	*/
				1227	spin_lock_irq(&rbio->bio_list_lock);
				1228	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				1229	spin_unlock_irq(&rbio->bio_list_lock);
				1230
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1231	atomic_set(&rbio->error, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1232
				1233	/*
				1234	* now that we've set rmw_locked, run through the
				1235	* bio list one last time and map the page pointers
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1236	*
				1237	* We don't cache full rbios because we're assuming
				1238	* the higher layers are unlikely to use this area of
				1239	* the disk again soon. If they do use it again,
				1240	* hopefully they will send another full bio.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1241	*/
				1242	index_rbio_pages(rbio);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1243	if (!rbio_is_full(rbio))
				1244	cache_rbio_pages(rbio);
				1245	else
				1246	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1247
				1248	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
				1249	struct page *p;
				1250	/* first collect one page from each data stripe */
				1251	for (stripe = 0; stripe < nr_data; stripe++) {
				1252	p = page_in_rbio(rbio, stripe, pagenr, 0);
				1253	pointers[stripe] = kmap(p);
				1254	}
				1255
				1256	/* then add the parity stripe */
				1257	p = rbio_pstripe_page(rbio, pagenr);
				1258	SetPageUptodate(p);
				1259	pointers[stripe++] = kmap(p);
				1260
				1261	if (q_stripe != -1) {
				1262
				1263	/*
				1264	* raid6, add the qstripe and call the
				1265	* library function to fill in our p/q
				1266	*/
				1267	p = rbio_qstripe_page(rbio, pagenr);
				1268	SetPageUptodate(p);
				1269	pointers[stripe++] = kmap(p);
				1270
				1271	raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
				1272	pointers);
				1273	} else {
				1274	/* raid5 */
				1275	memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
				1276	run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
				1277	}
				1278
				1279
				1280	for (stripe = 0; stripe < bbio->num_stripes; stripe++)
				1281	kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
				1282	}
				1283
				1284	/*
				1285	* time to start writing. Make bios for everything from the
				1286	* higher layers (the bio_list in our rbio) and our p/q. Ignore
				1287	* everything else.
				1288	*/
				1289	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
				1290	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
				1291	struct page *page;
				1292	if (stripe < rbio->nr_data) {
				1293	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1294	if (!page)
				1295	continue;
				1296	} else {
				1297	page = rbio_stripe_page(rbio, stripe, pagenr);
				1298	}
				1299
				1300	ret = rbio_add_io_page(rbio, &bio_list,
				1301	page, stripe, pagenr, rbio->stripe_len);
				1302	if (ret)
				1303	goto cleanup;
				1304	}
				1305	}
				1306
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1307	atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
				1308	BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1309
				1310	while (1) {
				1311	bio = bio_list_pop(&bio_list);
				1312	if (!bio)
				1313	break;
				1314
				1315	bio->bi_private = rbio;
				1316	bio->bi_end_io = raid_write_end_io;
				1317	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
				1318	submit_bio(WRITE, bio);
				1319	}
				1320	return;
				1321
				1322	cleanup:
				1323	rbio_orig_end_io(rbio, -EIO, 0);
				1324	}
				1325
				1326	/*
				1327	* helper to find the stripe number for a given bio. Used to figure out which
				1328	* stripe has failed. This expects the bio to correspond to a physical disk,
				1329	* so it looks up based on physical sector numbers.
				1330	*/
				1331	static int find_bio_stripe(struct btrfs_raid_bio *rbio,
				1332	struct bio *bio)
				1333	{
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1334	u64 physical = bio->bi_iter.bi_sector;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1335	u64 stripe_start;
				1336	int i;
				1337	struct btrfs_bio_stripe *stripe;
				1338
				1339	physical <<= 9;
				1340
				1341	for (i = 0; i < rbio->bbio->num_stripes; i++) {
				1342	stripe = &rbio->bbio->stripes[i];
				1343	stripe_start = stripe->physical;
				1344	if (physical >= stripe_start &&
				1345	physical < stripe_start + rbio->stripe_len) {
				1346	return i;
				1347	}
				1348	}
				1349	return -1;
				1350	}
				1351
				1352	/*
				1353	* helper to find the stripe number for a given
				1354	* bio (before mapping). Used to figure out which stripe has
				1355	* failed. This looks up based on logical block numbers.
				1356	*/
				1357	static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
				1358	struct bio *bio)
				1359	{
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1360	u64 logical = bio->bi_iter.bi_sector;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1361	u64 stripe_start;
				1362	int i;
				1363
				1364	logical <<= 9;
				1365
				1366	for (i = 0; i < rbio->nr_data; i++) {
				1367	stripe_start = rbio->raid_map[i];
				1368	if (logical >= stripe_start &&
				1369	logical < stripe_start + rbio->stripe_len) {
				1370	return i;
				1371	}
				1372	}
				1373	return -1;
				1374	}
				1375
				1376	/*
				1377	* returns -EIO if we had too many failures
				1378	*/
				1379	static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
				1380	{
				1381	unsigned long flags;
				1382	int ret = 0;
				1383
				1384	spin_lock_irqsave(&rbio->bio_list_lock, flags);
				1385
				1386	/* we already know this stripe is bad, move on */
				1387	if (rbio->faila == failed \|\| rbio->failb == failed)
				1388	goto out;
				1389
				1390	if (rbio->faila == -1) {
				1391	/* first failure on this rbio */
				1392	rbio->faila = failed;
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1393	atomic_inc(&rbio->error);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1394	} else if (rbio->failb == -1) {
				1395	/* second failure on this rbio */
				1396	rbio->failb = failed;
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1397	atomic_inc(&rbio->error);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1398	} else {
				1399	ret = -EIO;
				1400	}
				1401	out:
				1402	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
				1403
				1404	return ret;
				1405	}
				1406
				1407	/*
				1408	* helper to fail a stripe based on a physical disk
				1409	* bio.
				1410	*/
				1411	static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
				1412	struct bio *bio)
				1413	{
				1414	int failed = find_bio_stripe(rbio, bio);
				1415
				1416	if (failed < 0)
				1417	return -EIO;
				1418
				1419	return fail_rbio_index(rbio, failed);
				1420	}
				1421
				1422	/*
				1423	* this sets each page in the bio uptodate. It should only be used on private
				1424	* rbio pages, nothing that comes in from the higher layers
				1425	*/
				1426	static void set_bio_pages_uptodate(struct bio *bio)
				1427	{
				1428	int i;
				1429	struct page *p;
				1430
				1431	for (i = 0; i < bio->bi_vcnt; i++) {
				1432	p = bio->bi_io_vec[i].bv_page;
				1433	SetPageUptodate(p);
				1434	}
				1435	}
				1436
				1437	/*
				1438	* end io for the read phase of the rmw cycle. All the bios here are physical
				1439	* stripe bios we've read from the disk so we can recalculate the parity of the
				1440	* stripe.
				1441	*
				1442	* This will usually kick off finish_rmw once all the bios are read in, but it
				1443	* may trigger parity reconstruction if we had any errors along the way
				1444	*/
				1445	static void raid_rmw_end_io(struct bio *bio, int err)
				1446	{
				1447	struct btrfs_raid_bio *rbio = bio->bi_private;
				1448
				1449	if (err)
				1450	fail_bio_stripe(rbio, bio);
				1451	else
				1452	set_bio_pages_uptodate(bio);
				1453
				1454	bio_put(bio);
				1455
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1456	if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1457	return;
				1458
				1459	err = 0;
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1460	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1461	goto cleanup;
				1462
				1463	/*
				1464	* this will normally call finish_rmw to start our write
				1465	* but if there are any failed stripes we'll reconstruct
				1466	* from parity first
				1467	*/
				1468	validate_rbio_for_rmw(rbio);
				1469	return;
				1470
				1471	cleanup:
				1472
				1473	rbio_orig_end_io(rbio, -EIO, 0);
				1474	}
				1475
				1476	static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
				1477	{
Liu Bo	9e0af23	2014-08-15 23:36:53 +0800	[diff] [blame]	1478	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
				1479	rmw_work, NULL, NULL);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1480
Qu Wenruo	d05a33a	2014-02-28 10:46:11 +0800	[diff] [blame]	1481	btrfs_queue_work(rbio->fs_info->rmw_workers,
				1482	&rbio->work);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1483	}
				1484
				1485	static void async_read_rebuild(struct btrfs_raid_bio *rbio)
				1486	{
Liu Bo	9e0af23	2014-08-15 23:36:53 +0800	[diff] [blame]	1487	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
				1488	read_rebuild_work, NULL, NULL);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1489
Qu Wenruo	d05a33a	2014-02-28 10:46:11 +0800	[diff] [blame]	1490	btrfs_queue_work(rbio->fs_info->rmw_workers,
				1491	&rbio->work);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1492	}
				1493
				1494	/*
				1495	* the stripe must be locked by the caller. It will
				1496	* unlock after all the writes are done
				1497	*/
				1498	static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
				1499	{
				1500	int bios_to_read = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1501	struct bio_list bio_list;
				1502	int ret;
David Sterba	ed6078f	2014-06-05 01:59:57 +0200	[diff] [blame]	1503	int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1504	int pagenr;
				1505	int stripe;
				1506	struct bio *bio;
				1507
				1508	bio_list_init(&bio_list);
				1509
				1510	ret = alloc_rbio_pages(rbio);
				1511	if (ret)
				1512	goto cleanup;
				1513
				1514	index_rbio_pages(rbio);
				1515
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1516	atomic_set(&rbio->error, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1517	/*
				1518	* build a list of bios to read all the missing parts of this
				1519	* stripe
				1520	*/
				1521	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
				1522	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
				1523	struct page *page;
				1524	/*
				1525	* we want to find all the pages missing from
				1526	* the rbio and read them from the disk. If
				1527	* page_in_rbio finds a page in the bio list
				1528	* we don't need to read it off the stripe.
				1529	*/
				1530	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1531	if (page)
				1532	continue;
				1533
				1534	page = rbio_stripe_page(rbio, stripe, pagenr);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1535	/*
				1536	* the bio cache may have handed us an uptodate
				1537	* page. If so, be happy and use it
				1538	*/
				1539	if (PageUptodate(page))
				1540	continue;
				1541
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1542	ret = rbio_add_io_page(rbio, &bio_list, page,
				1543	stripe, pagenr, rbio->stripe_len);
				1544	if (ret)
				1545	goto cleanup;
				1546	}
				1547	}
				1548
				1549	bios_to_read = bio_list_size(&bio_list);
				1550	if (!bios_to_read) {
				1551	/*
				1552	* this can happen if others have merged with
				1553	* us, it means there is nothing left to read.
				1554	* But if there are missing devices it may not be
				1555	* safe to do the full stripe write yet.
				1556	*/
				1557	goto finish;
				1558	}
				1559
				1560	/*
				1561	* the bbio may be freed once we submit the last bio. Make sure
				1562	* not to touch it after that
				1563	*/
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1564	atomic_set(&rbio->stripes_pending, bios_to_read);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1565	while (1) {
				1566	bio = bio_list_pop(&bio_list);
				1567	if (!bio)
				1568	break;
				1569
				1570	bio->bi_private = rbio;
				1571	bio->bi_end_io = raid_rmw_end_io;
				1572
				1573	btrfs_bio_wq_end_io(rbio->fs_info, bio,
				1574	BTRFS_WQ_ENDIO_RAID56);
				1575
				1576	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
				1577	submit_bio(READ, bio);
				1578	}
				1579	/* the actual write will happen once the reads are done */
				1580	return 0;
				1581
				1582	cleanup:
				1583	rbio_orig_end_io(rbio, -EIO, 0);
				1584	return -EIO;
				1585
				1586	finish:
				1587	validate_rbio_for_rmw(rbio);
				1588	return 0;
				1589	}
				1590
				1591	/*
				1592	* if the upper layers pass in a full stripe, we thank them by only allocating
				1593	* enough pages to hold the parity, and sending it all down quickly.
				1594	*/
				1595	static int full_stripe_write(struct btrfs_raid_bio *rbio)
				1596	{
				1597	int ret;
				1598
				1599	ret = alloc_rbio_parity_pages(rbio);
Miao Xie	3cd846d	2013-07-22 16:36:57 +0800	[diff] [blame]	1600	if (ret) {
				1601	__free_raid_bio(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1602	return ret;
Miao Xie	3cd846d	2013-07-22 16:36:57 +0800	[diff] [blame]	1603	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1604
				1605	ret = lock_stripe_add(rbio);
				1606	if (ret == 0)
				1607	finish_rmw(rbio);
				1608	return 0;
				1609	}
				1610
				1611	/*
				1612	* partial stripe writes get handed over to async helpers.
				1613	* We're really hoping to merge a few more writes into this
				1614	* rbio before calculating new parity
				1615	*/
				1616	static int partial_stripe_write(struct btrfs_raid_bio *rbio)
				1617	{
				1618	int ret;
				1619
				1620	ret = lock_stripe_add(rbio);
				1621	if (ret == 0)
				1622	async_rmw_stripe(rbio);
				1623	return 0;
				1624	}
				1625
				1626	/*
				1627	* sometimes while we were reading from the drive to
				1628	* recalculate parity, enough new bios come into create
				1629	* a full stripe. So we do a check here to see if we can
				1630	* go directly to finish_rmw
				1631	*/
				1632	static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
				1633	{
				1634	/* head off into rmw land if we don't have a full stripe */
				1635	if (!rbio_is_full(rbio))
				1636	return partial_stripe_write(rbio);
				1637	return full_stripe_write(rbio);
				1638	}
				1639
				1640	/*
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1641	* We use plugging call backs to collect full stripes.
				1642	* Any time we get a partial stripe write while plugged
				1643	* we collect it into a list. When the unplug comes down,
				1644	* we sort the list by logical block number and merge
				1645	* everything we can into the same rbios
				1646	*/
				1647	struct btrfs_plug_cb {
				1648	struct blk_plug_cb cb;
				1649	struct btrfs_fs_info *info;
				1650	struct list_head rbio_list;
				1651	struct btrfs_work work;
				1652	};
				1653
				1654	/*
				1655	* rbios on the plug list are sorted for easier merging.
				1656	*/
				1657	static int plug_cmp(void priv, struct list_head a, struct list_head *b)
				1658	{
				1659	struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
				1660	plug_list);
				1661	struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
				1662	plug_list);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1663	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
				1664	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1665
				1666	if (a_sector < b_sector)
				1667	return -1;
				1668	if (a_sector > b_sector)
				1669	return 1;
				1670	return 0;
				1671	}
				1672
				1673	static void run_plug(struct btrfs_plug_cb *plug)
				1674	{
				1675	struct btrfs_raid_bio *cur;
				1676	struct btrfs_raid_bio *last = NULL;
				1677
				1678	/*
				1679	* sort our plug list then try to merge
				1680	* everything we can in hopes of creating full
				1681	* stripes.
				1682	*/
				1683	list_sort(NULL, &plug->rbio_list, plug_cmp);
				1684	while (!list_empty(&plug->rbio_list)) {
				1685	cur = list_entry(plug->rbio_list.next,
				1686	struct btrfs_raid_bio, plug_list);
				1687	list_del_init(&cur->plug_list);
				1688
				1689	if (rbio_is_full(cur)) {
				1690	/* we have a full stripe, send it down */
				1691	full_stripe_write(cur);
				1692	continue;
				1693	}
				1694	if (last) {
				1695	if (rbio_can_merge(last, cur)) {
				1696	merge_rbio(last, cur);
				1697	__free_raid_bio(cur);
				1698	continue;
				1699
				1700	}
				1701	__raid56_parity_write(last);
				1702	}
				1703	last = cur;
				1704	}
				1705	if (last) {
				1706	__raid56_parity_write(last);
				1707	}
				1708	kfree(plug);
				1709	}
				1710
				1711	/*
				1712	* if the unplug comes from schedule, we have to push the
				1713	* work off to a helper thread
				1714	*/
				1715	static void unplug_work(struct btrfs_work *work)
				1716	{
				1717	struct btrfs_plug_cb *plug;
				1718	plug = container_of(work, struct btrfs_plug_cb, work);
				1719	run_plug(plug);
				1720	}
				1721
				1722	static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
				1723	{
				1724	struct btrfs_plug_cb *plug;
				1725	plug = container_of(cb, struct btrfs_plug_cb, cb);
				1726
				1727	if (from_schedule) {
Liu Bo	9e0af23	2014-08-15 23:36:53 +0800	[diff] [blame]	1728	btrfs_init_work(&plug->work, btrfs_rmw_helper,
				1729	unplug_work, NULL, NULL);
Qu Wenruo	d05a33a	2014-02-28 10:46:11 +0800	[diff] [blame]	1730	btrfs_queue_work(plug->info->rmw_workers,
				1731	&plug->work);
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1732	return;
				1733	}
				1734	run_plug(plug);
				1735	}
				1736
				1737	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1738	* our main entry point for writes from the rest of the FS.
				1739	*/
				1740	int raid56_parity_write(struct btrfs_root root, struct bio bio,
				1741	struct btrfs_bio bbio, u64 raid_map,
				1742	u64 stripe_len)
				1743	{
				1744	struct btrfs_raid_bio *rbio;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1745	struct btrfs_plug_cb *plug = NULL;
				1746	struct blk_plug_cb *cb;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1747
				1748	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	1749	if (IS_ERR(rbio)) {
				1750	__free_bbio_and_raid_map(bbio, raid_map, 1);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1751	return PTR_ERR(rbio);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	1752	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1753	bio_list_add(&rbio->bio_list, bio);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1754	rbio->bio_list_bytes = bio->bi_iter.bi_size;
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1755	rbio->operation = BTRFS_RBIO_WRITE;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1756
				1757	/*
				1758	* don't plug on full rbios, just get them out the door
				1759	* as quickly as we can
				1760	*/
				1761	if (rbio_is_full(rbio))
				1762	return full_stripe_write(rbio);
				1763
				1764	cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
				1765	sizeof(*plug));
				1766	if (cb) {
				1767	plug = container_of(cb, struct btrfs_plug_cb, cb);
				1768	if (!plug->info) {
				1769	plug->info = root->fs_info;
				1770	INIT_LIST_HEAD(&plug->rbio_list);
				1771	}
				1772	list_add_tail(&rbio->plug_list, &plug->rbio_list);
				1773	} else {
				1774	return __raid56_parity_write(rbio);
				1775	}
				1776	return 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1777	}
				1778
				1779	/*
				1780	* all parity reconstruction happens here. We've read in everything
				1781	* we can find from the drives and this does the heavy lifting of
				1782	* sorting the good from the bad.
				1783	*/
				1784	static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
				1785	{
				1786	int pagenr, stripe;
				1787	void **pointers;
				1788	int faila = -1, failb = -1;
David Sterba	ed6078f	2014-06-05 01:59:57 +0200	[diff] [blame]	1789	int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1790	struct page *page;
				1791	int err;
				1792	int i;
				1793
				1794	pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
				1795	GFP_NOFS);
				1796	if (!pointers) {
				1797	err = -ENOMEM;
				1798	goto cleanup_io;
				1799	}
				1800
				1801	faila = rbio->faila;
				1802	failb = rbio->failb;
				1803
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1804	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1805	spin_lock_irq(&rbio->bio_list_lock);
				1806	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				1807	spin_unlock_irq(&rbio->bio_list_lock);
				1808	}
				1809
				1810	index_rbio_pages(rbio);
				1811
				1812	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame^]	1813	/*
				1814	* Now we just use bitmap to mark the horizontal stripes in
				1815	* which we have data when doing parity scrub.
				1816	*/
				1817	if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
				1818	!test_bit(pagenr, rbio->dbitmap))
				1819	continue;
				1820
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1821	/* setup our array of pointers with pages
				1822	* from each stripe
				1823	*/
				1824	for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
				1825	/*
				1826	* if we're rebuilding a read, we have to use
				1827	* pages from the bio list
				1828	*/
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1829	if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1830	(stripe == faila \|\| stripe == failb)) {
				1831	page = page_in_rbio(rbio, stripe, pagenr, 0);
				1832	} else {
				1833	page = rbio_stripe_page(rbio, stripe, pagenr);
				1834	}
				1835	pointers[stripe] = kmap(page);
				1836	}
				1837
				1838	/* all raid6 handling here */
				1839	if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
				1840	RAID6_Q_STRIPE) {
				1841
				1842	/*
				1843	* single failure, rebuild from parity raid5
				1844	* style
				1845	*/
				1846	if (failb < 0) {
				1847	if (faila == rbio->nr_data) {
				1848	/*
				1849	* Just the P stripe has failed, without
				1850	* a bad data or Q stripe.
				1851	* TODO, we should redo the xor here.
				1852	*/
				1853	err = -EIO;
				1854	goto cleanup;
				1855	}
				1856	/*
				1857	* a single failure in raid6 is rebuilt
				1858	* in the pstripe code below
				1859	*/
				1860	goto pstripe;
				1861	}
				1862
				1863	/* make sure our ps and qs are in order */
				1864	if (faila > failb) {
				1865	int tmp = failb;
				1866	failb = faila;
				1867	faila = tmp;
				1868	}
				1869
				1870	/* if the q stripe is failed, do a pstripe reconstruction
				1871	* from the xors.
				1872	* If both the q stripe and the P stripe are failed, we're
				1873	* here due to a crc mismatch and we can't give them the
				1874	* data they want
				1875	*/
				1876	if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
				1877	if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
				1878	err = -EIO;
				1879	goto cleanup;
				1880	}
				1881	/*
				1882	* otherwise we have one bad data stripe and
				1883	* a good P stripe. raid5!
				1884	*/
				1885	goto pstripe;
				1886	}
				1887
				1888	if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
				1889	raid6_datap_recov(rbio->bbio->num_stripes,
				1890	PAGE_SIZE, faila, pointers);
				1891	} else {
				1892	raid6_2data_recov(rbio->bbio->num_stripes,
				1893	PAGE_SIZE, faila, failb,
				1894	pointers);
				1895	}
				1896	} else {
				1897	void *p;
				1898
				1899	/* rebuild from P stripe here (raid5 or raid6) */
				1900	BUG_ON(failb != -1);
				1901	pstripe:
				1902	/* Copy parity block into failed block to start with */
				1903	memcpy(pointers[faila],
				1904	pointers[rbio->nr_data],
				1905	PAGE_CACHE_SIZE);
				1906
				1907	/* rearrange the pointer array */
				1908	p = pointers[faila];
				1909	for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
				1910	pointers[stripe] = pointers[stripe + 1];
				1911	pointers[rbio->nr_data - 1] = p;
				1912
				1913	/* xor in the rest */
				1914	run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
				1915	}
				1916	/* if we're doing this rebuild as part of an rmw, go through
				1917	* and set all of our private rbio pages in the
				1918	* failed stripes as uptodate. This way finish_rmw will
				1919	* know they can be trusted. If this was a read reconstruction,
				1920	* other endio functions will fiddle the uptodate bits
				1921	*/
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1922	if (rbio->operation == BTRFS_RBIO_WRITE) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1923	for (i = 0; i < nr_pages; i++) {
				1924	if (faila != -1) {
				1925	page = rbio_stripe_page(rbio, faila, i);
				1926	SetPageUptodate(page);
				1927	}
				1928	if (failb != -1) {
				1929	page = rbio_stripe_page(rbio, failb, i);
				1930	SetPageUptodate(page);
				1931	}
				1932	}
				1933	}
				1934	for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
				1935	/*
				1936	* if we're rebuilding a read, we have to use
				1937	* pages from the bio list
				1938	*/
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1939	if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1940	(stripe == faila \|\| stripe == failb)) {
				1941	page = page_in_rbio(rbio, stripe, pagenr, 0);
				1942	} else {
				1943	page = rbio_stripe_page(rbio, stripe, pagenr);
				1944	}
				1945	kunmap(page);
				1946	}
				1947	}
				1948
				1949	err = 0;
				1950	cleanup:
				1951	kfree(pointers);
				1952
				1953	cleanup_io:
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1954	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	1955	if (err == 0 &&
				1956	!test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1957	cache_rbio_pages(rbio);
				1958	else
				1959	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				1960
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1961	rbio_orig_end_io(rbio, err, err == 0);
				1962	} else if (err == 0) {
				1963	rbio->faila = -1;
				1964	rbio->failb = -1;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame^]	1965
				1966	if (rbio->operation == BTRFS_RBIO_WRITE)
				1967	finish_rmw(rbio);
				1968	else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
				1969	finish_parity_scrub(rbio, 0);
				1970	else
				1971	BUG();
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1972	} else {
				1973	rbio_orig_end_io(rbio, err, 0);
				1974	}
				1975	}
				1976
				1977	/*
				1978	* This is called only for stripes we've read from disk to
				1979	* reconstruct the parity.
				1980	*/
				1981	static void raid_recover_end_io(struct bio *bio, int err)
				1982	{
				1983	struct btrfs_raid_bio *rbio = bio->bi_private;
				1984
				1985	/*
				1986	* we only read stripe pages off the disk, set them
				1987	* up to date if there were no errors
				1988	*/
				1989	if (err)
				1990	fail_bio_stripe(rbio, bio);
				1991	else
				1992	set_bio_pages_uptodate(bio);
				1993	bio_put(bio);
				1994
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1995	if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1996	return;
				1997
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1998	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1999	rbio_orig_end_io(rbio, -EIO, 0);
				2000	else
				2001	__raid_recover_end_io(rbio);
				2002	}
				2003
				2004	/*
				2005	* reads everything we need off the disk to reconstruct
				2006	* the parity. endio handlers trigger final reconstruction
				2007	* when the IO is done.
				2008	*
				2009	* This is used both for reads from the higher layers and for
				2010	* parity construction required to finish a rmw cycle.
				2011	*/
				2012	static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
				2013	{
				2014	int bios_to_read = 0;
				2015	struct btrfs_bio *bbio = rbio->bbio;
				2016	struct bio_list bio_list;
				2017	int ret;
David Sterba	ed6078f	2014-06-05 01:59:57 +0200	[diff] [blame]	2018	int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2019	int pagenr;
				2020	int stripe;
				2021	struct bio *bio;
				2022
				2023	bio_list_init(&bio_list);
				2024
				2025	ret = alloc_rbio_pages(rbio);
				2026	if (ret)
				2027	goto cleanup;
				2028
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2029	atomic_set(&rbio->error, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2030
				2031	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	2032	* read everything that hasn't failed. Thanks to the
				2033	* stripe cache, it is possible that some or all of these
				2034	* pages are going to be uptodate.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2035	*/
				2036	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
Liu Bo	5588383	2014-06-24 15:39:16 +0800	[diff] [blame]	2037	if (rbio->faila == stripe \|\| rbio->failb == stripe) {
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2038	atomic_inc(&rbio->error);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2039	continue;
Liu Bo	5588383	2014-06-24 15:39:16 +0800	[diff] [blame]	2040	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2041
				2042	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
				2043	struct page *p;
				2044
				2045	/*
				2046	* the rmw code may have already read this
				2047	* page in
				2048	*/
				2049	p = rbio_stripe_page(rbio, stripe, pagenr);
				2050	if (PageUptodate(p))
				2051	continue;
				2052
				2053	ret = rbio_add_io_page(rbio, &bio_list,
				2054	rbio_stripe_page(rbio, stripe, pagenr),
				2055	stripe, pagenr, rbio->stripe_len);
				2056	if (ret < 0)
				2057	goto cleanup;
				2058	}
				2059	}
				2060
				2061	bios_to_read = bio_list_size(&bio_list);
				2062	if (!bios_to_read) {
				2063	/*
				2064	* we might have no bios to read just because the pages
				2065	* were up to date, or we might have no bios to read because
				2066	* the devices were gone.
				2067	*/
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2068	if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2069	__raid_recover_end_io(rbio);
				2070	goto out;
				2071	} else {
				2072	goto cleanup;
				2073	}
				2074	}
				2075
				2076	/*
				2077	* the bbio may be freed once we submit the last bio. Make sure
				2078	* not to touch it after that
				2079	*/
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2080	atomic_set(&rbio->stripes_pending, bios_to_read);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2081	while (1) {
				2082	bio = bio_list_pop(&bio_list);
				2083	if (!bio)
				2084	break;
				2085
				2086	bio->bi_private = rbio;
				2087	bio->bi_end_io = raid_recover_end_io;
				2088
				2089	btrfs_bio_wq_end_io(rbio->fs_info, bio,
				2090	BTRFS_WQ_ENDIO_RAID56);
				2091
				2092	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
				2093	submit_bio(READ, bio);
				2094	}
				2095	out:
				2096	return 0;
				2097
				2098	cleanup:
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	2099	if (rbio->operation == BTRFS_RBIO_READ_REBUILD)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2100	rbio_orig_end_io(rbio, -EIO, 0);
				2101	return -EIO;
				2102	}
				2103
				2104	/*
				2105	* the main entry point for reads from the higher layers. This
				2106	* is really only called when the normal read path had a failure,
				2107	* so we assume the bio they send down corresponds to a failed part
				2108	* of the drive.
				2109	*/
				2110	int raid56_parity_recover(struct btrfs_root root, struct bio bio,
				2111	struct btrfs_bio bbio, u64 raid_map,
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	2112	u64 stripe_len, int mirror_num, int hold_bbio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2113	{
				2114	struct btrfs_raid_bio *rbio;
				2115	int ret;
				2116
				2117	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	2118	if (IS_ERR(rbio)) {
				2119	__free_bbio_and_raid_map(bbio, raid_map, !hold_bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2120	return PTR_ERR(rbio);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	2121	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2122
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	2123	if (hold_bbio)
				2124	set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags);
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	2125	rbio->operation = BTRFS_RBIO_READ_REBUILD;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2126	bio_list_add(&rbio->bio_list, bio);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	2127	rbio->bio_list_bytes = bio->bi_iter.bi_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2128
				2129	rbio->faila = find_logical_bio_stripe(rbio, bio);
				2130	if (rbio->faila == -1) {
				2131	BUG();
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	2132	__free_bbio_and_raid_map(bbio, raid_map, !hold_bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2133	kfree(rbio);
				2134	return -EIO;
				2135	}
				2136
				2137	/*
				2138	* reconstruct from the q stripe if they are
				2139	* asking for mirror 3
				2140	*/
				2141	if (mirror_num == 3)
				2142	rbio->failb = bbio->num_stripes - 2;
				2143
				2144	ret = lock_stripe_add(rbio);
				2145
				2146	/*
				2147	* __raid56_parity_recover will end the bio with
				2148	* any errors it hits. We don't want to return
				2149	* its error value up the stack because our caller
				2150	* will end up calling bio_endio with any nonzero
				2151	* return
				2152	*/
				2153	if (ret == 0)
				2154	__raid56_parity_recover(rbio);
				2155	/*
				2156	* our rbio has been added to the list of
				2157	* rbios that will be handled after the
				2158	* currently lock owner is done
				2159	*/
				2160	return 0;
				2161
				2162	}
				2163
				2164	static void rmw_work(struct btrfs_work *work)
				2165	{
				2166	struct btrfs_raid_bio *rbio;
				2167
				2168	rbio = container_of(work, struct btrfs_raid_bio, work);
				2169	raid56_rmw_stripe(rbio);
				2170	}
				2171
				2172	static void read_rebuild_work(struct btrfs_work *work)
				2173	{
				2174	struct btrfs_raid_bio *rbio;
				2175
				2176	rbio = container_of(work, struct btrfs_raid_bio, work);
				2177	__raid56_parity_recover(rbio);
				2178	}
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame^]	2179
				2180	/*
				2181	* The following code is used to scrub/replace the parity stripe
				2182	*
				2183	* Note: We need to make sure that all the pages added to the scrub/replace
				2184	* raid bio are correct and do not change during the scrub/replace. That
				2185	* is, those pages hold only metadata or file data with checksums.
				2186	*/
				2187
				2188	struct btrfs_raid_bio *
				2189	raid56_parity_alloc_scrub_rbio(struct btrfs_root root, struct bio bio,
				2190	struct btrfs_bio bbio, u64 raid_map,
				2191	u64 stripe_len, struct btrfs_device *scrub_dev,
				2192	unsigned long *dbitmap, int stripe_nsectors)
				2193	{
				2194	struct btrfs_raid_bio *rbio;
				2195	int i;
				2196
				2197	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
				2198	if (IS_ERR(rbio))
				2199	return NULL;
				2200	bio_list_add(&rbio->bio_list, bio);
				2201	/*
				2202	* This is a special bio which is used to hold the completion handler
				2203	* and make the scrub rbio is similar to the other types
				2204	*/
				2205	ASSERT(!bio->bi_iter.bi_size);
				2206	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
				2207
				2208	for (i = 0; i < bbio->num_stripes; i++) {
				2209	if (bbio->stripes[i].dev == scrub_dev) {
				2210	rbio->scrubp = i;
				2211	break;
				2212	}
				2213	}
				2214
				2215	/* Now we just support the sectorsize equals to page size */
				2216	ASSERT(root->sectorsize == PAGE_SIZE);
				2217	ASSERT(rbio->stripe_npages == stripe_nsectors);
				2218	bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
				2219
				2220	return rbio;
				2221	}
				2222
				2223	void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
				2224	struct page *page, u64 logical)
				2225	{
				2226	int stripe_offset;
				2227	int index;
				2228
				2229	ASSERT(logical >= rbio->raid_map[0]);
				2230	ASSERT(logical + PAGE_SIZE <= rbio->raid_map[0] +
				2231	rbio->stripe_len * rbio->nr_data);
				2232	stripe_offset = (int)(logical - rbio->raid_map[0]);
				2233	index = stripe_offset >> PAGE_CACHE_SHIFT;
				2234	rbio->bio_pages[index] = page;
				2235	}
				2236
				2237	/*
				2238	* We just scrub the parity that we have correct data on the same horizontal,
				2239	* so we needn't allocate all pages for all the stripes.
				2240	*/
				2241	static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
				2242	{
				2243	int i;
				2244	int bit;
				2245	int index;
				2246	struct page *page;
				2247
				2248	for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
				2249	for (i = 0; i < rbio->bbio->num_stripes; i++) {
				2250	index = i * rbio->stripe_npages + bit;
				2251	if (rbio->stripe_pages[index])
				2252	continue;
				2253
				2254	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				2255	if (!page)
				2256	return -ENOMEM;
				2257	rbio->stripe_pages[index] = page;
				2258	ClearPageUptodate(page);
				2259	}
				2260	}
				2261	return 0;
				2262	}
				2263
				2264	/*
				2265	* end io function used by finish_rmw. When we finally
				2266	* get here, we've written a full stripe
				2267	*/
				2268	static void raid_write_parity_end_io(struct bio *bio, int err)
				2269	{
				2270	struct btrfs_raid_bio *rbio = bio->bi_private;
				2271
				2272	if (err)
				2273	fail_bio_stripe(rbio, bio);
				2274
				2275	bio_put(bio);
				2276
				2277	if (!atomic_dec_and_test(&rbio->stripes_pending))
				2278	return;
				2279
				2280	err = 0;
				2281
				2282	if (atomic_read(&rbio->error))
				2283	err = -EIO;
				2284
				2285	rbio_orig_end_io(rbio, err, 0);
				2286	}
				2287
				2288	static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
				2289	int need_check)
				2290	{
				2291	struct btrfs_bio *bbio = rbio->bbio;
				2292	void *pointers[bbio->num_stripes];
				2293	int nr_data = rbio->nr_data;
				2294	int stripe;
				2295	int pagenr;
				2296	int p_stripe = -1;
				2297	int q_stripe = -1;
				2298	struct page *p_page = NULL;
				2299	struct page *q_page = NULL;
				2300	struct bio_list bio_list;
				2301	struct bio *bio;
				2302	int ret;
				2303
				2304	bio_list_init(&bio_list);
				2305
				2306	if (bbio->num_stripes - rbio->nr_data == 1) {
				2307	p_stripe = bbio->num_stripes - 1;
				2308	} else if (bbio->num_stripes - rbio->nr_data == 2) {
				2309	p_stripe = bbio->num_stripes - 2;
				2310	q_stripe = bbio->num_stripes - 1;
				2311	} else {
				2312	BUG();
				2313	}
				2314
				2315	/*
				2316	* Because the higher layers(scrubber) are unlikely to
				2317	* use this area of the disk again soon, so don't cache
				2318	* it.
				2319	*/
				2320	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				2321
				2322	if (!need_check)
				2323	goto writeback;
				2324
				2325	p_page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				2326	if (!p_page)
				2327	goto cleanup;
				2328	SetPageUptodate(p_page);
				2329
				2330	if (q_stripe != -1) {
				2331	q_page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				2332	if (!q_page) {
				2333	__free_page(p_page);
				2334	goto cleanup;
				2335	}
				2336	SetPageUptodate(q_page);
				2337	}
				2338
				2339	atomic_set(&rbio->error, 0);
				2340
				2341	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
				2342	struct page *p;
				2343	void *parity;
				2344	/* first collect one page from each data stripe */
				2345	for (stripe = 0; stripe < nr_data; stripe++) {
				2346	p = page_in_rbio(rbio, stripe, pagenr, 0);
				2347	pointers[stripe] = kmap(p);
				2348	}
				2349
				2350	/* then add the parity stripe */
				2351	pointers[stripe++] = kmap(p_page);
				2352
				2353	if (q_stripe != -1) {
				2354
				2355	/*
				2356	* raid6, add the qstripe and call the
				2357	* library function to fill in our p/q
				2358	*/
				2359	pointers[stripe++] = kmap(q_page);
				2360
				2361	raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
				2362	pointers);
				2363	} else {
				2364	/* raid5 */
				2365	memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
				2366	run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
				2367	}
				2368
				2369	/* Check scrubbing pairty and repair it */
				2370	p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
				2371	parity = kmap(p);
				2372	if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE))
				2373	memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE);
				2374	else
				2375	/* Parity is right, needn't writeback */
				2376	bitmap_clear(rbio->dbitmap, pagenr, 1);
				2377	kunmap(p);
				2378
				2379	for (stripe = 0; stripe < bbio->num_stripes; stripe++)
				2380	kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
				2381	}
				2382
				2383	__free_page(p_page);
				2384	if (q_page)
				2385	__free_page(q_page);
				2386
				2387	writeback:
				2388	/*
				2389	* time to start writing. Make bios for everything from the
				2390	* higher layers (the bio_list in our rbio) and our p/q. Ignore
				2391	* everything else.
				2392	*/
				2393	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
				2394	struct page *page;
				2395
				2396	page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
				2397	ret = rbio_add_io_page(rbio, &bio_list,
				2398	page, rbio->scrubp, pagenr, rbio->stripe_len);
				2399	if (ret)
				2400	goto cleanup;
				2401	}
				2402
				2403	nr_data = bio_list_size(&bio_list);
				2404	if (!nr_data) {
				2405	/* Every parity is right */
				2406	rbio_orig_end_io(rbio, 0, 0);
				2407	return;
				2408	}
				2409
				2410	atomic_set(&rbio->stripes_pending, nr_data);
				2411
				2412	while (1) {
				2413	bio = bio_list_pop(&bio_list);
				2414	if (!bio)
				2415	break;
				2416
				2417	bio->bi_private = rbio;
				2418	bio->bi_end_io = raid_write_parity_end_io;
				2419	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
				2420	submit_bio(WRITE, bio);
				2421	}
				2422	return;
				2423
				2424	cleanup:
				2425	rbio_orig_end_io(rbio, -EIO, 0);
				2426	}
				2427
				2428	static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
				2429	{
				2430	if (stripe >= 0 && stripe < rbio->nr_data)
				2431	return 1;
				2432	return 0;
				2433	}
				2434
				2435	/*
				2436	* While we're doing the parity check and repair, we could have errors
				2437	* in reading pages off the disk. This checks for errors and if we're
				2438	* not able to read the page it'll trigger parity reconstruction. The
				2439	* parity scrub will be finished after we've reconstructed the failed
				2440	* stripes
				2441	*/
				2442	static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
				2443	{
				2444	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
				2445	goto cleanup;
				2446
				2447	if (rbio->faila >= 0 \|\| rbio->failb >= 0) {
				2448	int dfail = 0, failp = -1;
				2449
				2450	if (is_data_stripe(rbio, rbio->faila))
				2451	dfail++;
				2452	else if (is_parity_stripe(rbio->faila))
				2453	failp = rbio->faila;
				2454
				2455	if (is_data_stripe(rbio, rbio->failb))
				2456	dfail++;
				2457	else if (is_parity_stripe(rbio->failb))
				2458	failp = rbio->failb;
				2459
				2460	/*
				2461	* Because we can not use a scrubbing parity to repair
				2462	* the data, so the capability of the repair is declined.
				2463	* (In the case of RAID5, we can not repair anything)
				2464	*/
				2465	if (dfail > rbio->bbio->max_errors - 1)
				2466	goto cleanup;
				2467
				2468	/*
				2469	* If all data is good, only parity is correctly, just
				2470	* repair the parity.
				2471	*/
				2472	if (dfail == 0) {
				2473	finish_parity_scrub(rbio, 0);
				2474	return;
				2475	}
				2476
				2477	/*
				2478	* Here means we got one corrupted data stripe and one
				2479	* corrupted parity on RAID6, if the corrupted parity
				2480	* is scrubbing parity, luckly, use the other one to repair
				2481	* the data, or we can not repair the data stripe.
				2482	*/
				2483	if (failp != rbio->scrubp)
				2484	goto cleanup;
				2485
				2486	__raid_recover_end_io(rbio);
				2487	} else {
				2488	finish_parity_scrub(rbio, 1);
				2489	}
				2490	return;
				2491
				2492	cleanup:
				2493	rbio_orig_end_io(rbio, -EIO, 0);
				2494	}
				2495
				2496	/*
				2497	* end io for the read phase of the rmw cycle. All the bios here are physical
				2498	* stripe bios we've read from the disk so we can recalculate the parity of the
				2499	* stripe.
				2500	*
				2501	* This will usually kick off finish_rmw once all the bios are read in, but it
				2502	* may trigger parity reconstruction if we had any errors along the way
				2503	*/
				2504	static void raid56_parity_scrub_end_io(struct bio *bio, int err)
				2505	{
				2506	struct btrfs_raid_bio *rbio = bio->bi_private;
				2507
				2508	if (err)
				2509	fail_bio_stripe(rbio, bio);
				2510	else
				2511	set_bio_pages_uptodate(bio);
				2512
				2513	bio_put(bio);
				2514
				2515	if (!atomic_dec_and_test(&rbio->stripes_pending))
				2516	return;
				2517
				2518	/*
				2519	* this will normally call finish_rmw to start our write
				2520	* but if there are any failed stripes we'll reconstruct
				2521	* from parity first
				2522	*/
				2523	validate_rbio_for_parity_scrub(rbio);
				2524	}
				2525
				2526	static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
				2527	{
				2528	int bios_to_read = 0;
				2529	struct btrfs_bio *bbio = rbio->bbio;
				2530	struct bio_list bio_list;
				2531	int ret;
				2532	int pagenr;
				2533	int stripe;
				2534	struct bio *bio;
				2535
				2536	ret = alloc_rbio_essential_pages(rbio);
				2537	if (ret)
				2538	goto cleanup;
				2539
				2540	bio_list_init(&bio_list);
				2541
				2542	atomic_set(&rbio->error, 0);
				2543	/*
				2544	* build a list of bios to read all the missing parts of this
				2545	* stripe
				2546	*/
				2547	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
				2548	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
				2549	struct page *page;
				2550	/*
				2551	* we want to find all the pages missing from
				2552	* the rbio and read them from the disk. If
				2553	* page_in_rbio finds a page in the bio list
				2554	* we don't need to read it off the stripe.
				2555	*/
				2556	page = page_in_rbio(rbio, stripe, pagenr, 1);
				2557	if (page)
				2558	continue;
				2559
				2560	page = rbio_stripe_page(rbio, stripe, pagenr);
				2561	/*
				2562	* the bio cache may have handed us an uptodate
				2563	* page. If so, be happy and use it
				2564	*/
				2565	if (PageUptodate(page))
				2566	continue;
				2567
				2568	ret = rbio_add_io_page(rbio, &bio_list, page,
				2569	stripe, pagenr, rbio->stripe_len);
				2570	if (ret)
				2571	goto cleanup;
				2572	}
				2573	}
				2574
				2575	bios_to_read = bio_list_size(&bio_list);
				2576	if (!bios_to_read) {
				2577	/*
				2578	* this can happen if others have merged with
				2579	* us, it means there is nothing left to read.
				2580	* But if there are missing devices it may not be
				2581	* safe to do the full stripe write yet.
				2582	*/
				2583	goto finish;
				2584	}
				2585
				2586	/*
				2587	* the bbio may be freed once we submit the last bio. Make sure
				2588	* not to touch it after that
				2589	*/
				2590	atomic_set(&rbio->stripes_pending, bios_to_read);
				2591	while (1) {
				2592	bio = bio_list_pop(&bio_list);
				2593	if (!bio)
				2594	break;
				2595
				2596	bio->bi_private = rbio;
				2597	bio->bi_end_io = raid56_parity_scrub_end_io;
				2598
				2599	btrfs_bio_wq_end_io(rbio->fs_info, bio,
				2600	BTRFS_WQ_ENDIO_RAID56);
				2601
				2602	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
				2603	submit_bio(READ, bio);
				2604	}
				2605	/* the actual write will happen once the reads are done */
				2606	return;
				2607
				2608	cleanup:
				2609	rbio_orig_end_io(rbio, -EIO, 0);
				2610	return;
				2611
				2612	finish:
				2613	validate_rbio_for_parity_scrub(rbio);
				2614	}
				2615
				2616	static void scrub_parity_work(struct btrfs_work *work)
				2617	{
				2618	struct btrfs_raid_bio *rbio;
				2619
				2620	rbio = container_of(work, struct btrfs_raid_bio, work);
				2621	raid56_parity_scrub_stripe(rbio);
				2622	}
				2623
				2624	static void async_scrub_parity(struct btrfs_raid_bio *rbio)
				2625	{
				2626	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
				2627	scrub_parity_work, NULL, NULL);
				2628
				2629	btrfs_queue_work(rbio->fs_info->rmw_workers,
				2630	&rbio->work);
				2631	}
				2632
				2633	void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
				2634	{
				2635	if (!lock_stripe_add(rbio))
				2636	async_scrub_parity(rbio);
				2637	}