Blame - fs/btrfs/raid56.c - kernel/msm-5.4

blob: 7ccddca9ee713dd1766d4eace8df7819b9e64578 [file] [log] [blame]

David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1	/*
				2	* Copyright (C) 2012 Fusion-io All rights reserved.
				3	* Copyright (C) 2012 Intel Corp. All rights reserved.
				4	*
				5	* This program is free software; you can redistribute it and/or
				6	* modify it under the terms of the GNU General Public
				7	* License v2 as published by the Free Software Foundation.
				8	*
				9	* This program is distributed in the hope that it will be useful,
				10	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				11	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				12	* General Public License for more details.
				13	*
				14	* You should have received a copy of the GNU General Public
				15	* License along with this program; if not, write to the
				16	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				17	* Boston, MA 021110-1307, USA.
				18	*/
				19	#include <linux/sched.h>
				20	#include <linux/wait.h>
				21	#include <linux/bio.h>
				22	#include <linux/slab.h>
				23	#include <linux/buffer_head.h>
				24	#include <linux/blkdev.h>
				25	#include <linux/random.h>
				26	#include <linux/iocontext.h>
				27	#include <linux/capability.h>
				28	#include <linux/ratelimit.h>
				29	#include <linux/kthread.h>
				30	#include <linux/raid/pq.h>
				31	#include <linux/hash.h>
				32	#include <linux/list_sort.h>
				33	#include <linux/raid/xor.h>
				34	#include <asm/div64.h>
				35	#include "compat.h"
				36	#include "ctree.h"
				37	#include "extent_map.h"
				38	#include "disk-io.h"
				39	#include "transaction.h"
				40	#include "print-tree.h"
				41	#include "volumes.h"
				42	#include "raid56.h"
				43	#include "async-thread.h"
				44	#include "check-integrity.h"
				45	#include "rcu-string.h"
				46
				47	/* set when additional merges to this rbio are not allowed */
				48	#define RBIO_RMW_LOCKED_BIT 1
				49
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame^]	50	/*
				51	* set when this rbio is sitting in the hash, but it is just a cache
				52	* of past RMW
				53	*/
				54	#define RBIO_CACHE_BIT 2
				55
				56	/*
				57	* set when it is safe to trust the stripe_pages for caching
				58	*/
				59	#define RBIO_CACHE_READY_BIT 3
				60
				61
				62	#define RBIO_CACHE_SIZE 1024
				63
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	64	struct btrfs_raid_bio {
				65	struct btrfs_fs_info *fs_info;
				66	struct btrfs_bio *bbio;
				67
				68	/*
				69	* logical block numbers for the start of each stripe
				70	* The last one or two are p/q. These are sorted,
				71	* so raid_map[0] is the start of our full stripe
				72	*/
				73	u64 *raid_map;
				74
				75	/* while we're doing rmw on a stripe
				76	* we put it into a hash table so we can
				77	* lock the stripe and merge more rbios
				78	* into it.
				79	*/
				80	struct list_head hash_list;
				81
				82	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame^]	83	* LRU list for the stripe cache
				84	*/
				85	struct list_head stripe_cache;
				86
				87	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	88	* for scheduling work in the helper threads
				89	*/
				90	struct btrfs_work work;
				91
				92	/*
				93	* bio list and bio_list_lock are used
				94	* to add more bios into the stripe
				95	* in hopes of avoiding the full rmw
				96	*/
				97	struct bio_list bio_list;
				98	spinlock_t bio_list_lock;
				99
				100	/*
				101	* also protected by the bio_list_lock, the
				102	* stripe locking code uses plug_list to hand off
				103	* the stripe lock to the next pending IO
				104	*/
				105	struct list_head plug_list;
				106
				107	/*
				108	* flags that tell us if it is safe to
				109	* merge with this bio
				110	*/
				111	unsigned long flags;
				112
				113	/* size of each individual stripe on disk */
				114	int stripe_len;
				115
				116	/* number of data stripes (no p/q) */
				117	int nr_data;
				118
				119	/*
				120	* set if we're doing a parity rebuild
				121	* for a read from higher up, which is handled
				122	* differently from a parity rebuild as part of
				123	* rmw
				124	*/
				125	int read_rebuild;
				126
				127	/* first bad stripe */
				128	int faila;
				129
				130	/* second bad stripe (for raid6 use) */
				131	int failb;
				132
				133	/*
				134	* number of pages needed to represent the full
				135	* stripe
				136	*/
				137	int nr_pages;
				138
				139	/*
				140	* size of all the bios in the bio_list. This
				141	* helps us decide if the rbio maps to a full
				142	* stripe or not
				143	*/
				144	int bio_list_bytes;
				145
				146	atomic_t refs;
				147
				148	/*
				149	* these are two arrays of pointers. We allocate the
				150	* rbio big enough to hold them both and setup their
				151	* locations when the rbio is allocated
				152	*/
				153
				154	/* pointers to pages that we allocated for
				155	* reading/writing stripes directly from the disk (including P/Q)
				156	*/
				157	struct page **stripe_pages;
				158
				159	/*
				160	* pointers to the pages in the bio_list. Stored
				161	* here for faster lookup
				162	*/
				163	struct page **bio_pages;
				164	};
				165
				166	static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
				167	static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
				168	static void rmw_work(struct btrfs_work *work);
				169	static void read_rebuild_work(struct btrfs_work *work);
				170	static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
				171	static void async_read_rebuild(struct btrfs_raid_bio *rbio);
				172	static int fail_bio_stripe(struct btrfs_raid_bio rbio, struct bio bio);
				173	static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
				174	static void __free_raid_bio(struct btrfs_raid_bio *rbio);
				175	static void index_rbio_pages(struct btrfs_raid_bio *rbio);
				176	static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
				177
				178	/*
				179	* the stripe hash table is used for locking, and to collect
				180	* bios in hopes of making a full stripe
				181	*/
				182	int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
				183	{
				184	struct btrfs_stripe_hash_table *table;
				185	struct btrfs_stripe_hash_table *x;
				186	struct btrfs_stripe_hash *cur;
				187	struct btrfs_stripe_hash *h;
				188	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
				189	int i;
				190
				191	if (info->stripe_hash_table)
				192	return 0;
				193
				194	table = kzalloc(sizeof(table) + sizeof(h) * num_entries, GFP_NOFS);
				195	if (!table)
				196	return -ENOMEM;
				197
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame^]	198	spin_lock_init(&table->cache_lock);
				199	INIT_LIST_HEAD(&table->stripe_cache);
				200
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	201	h = table->table;
				202
				203	for (i = 0; i < num_entries; i++) {
				204	cur = h + i;
				205	INIT_LIST_HEAD(&cur->hash_list);
				206	spin_lock_init(&cur->lock);
				207	init_waitqueue_head(&cur->wait);
				208	}
				209
				210	x = cmpxchg(&info->stripe_hash_table, NULL, table);
				211	if (x)
				212	kfree(x);
				213	return 0;
				214	}
				215
				216	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame^]	217	* caching an rbio means to copy anything from the
				218	* bio_pages array into the stripe_pages array. We
				219	* use the page uptodate bit in the stripe cache array
				220	* to indicate if it has valid data
				221	*
				222	* once the caching is done, we set the cache ready
				223	* bit.
				224	*/
				225	static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
				226	{
				227	int i;
				228	char *s;
				229	char *d;
				230	int ret;
				231
				232	ret = alloc_rbio_pages(rbio);
				233	if (ret)
				234	return;
				235
				236	for (i = 0; i < rbio->nr_pages; i++) {
				237	if (!rbio->bio_pages[i])
				238	continue;
				239
				240	s = kmap(rbio->bio_pages[i]);
				241	d = kmap(rbio->stripe_pages[i]);
				242
				243	memcpy(d, s, PAGE_CACHE_SIZE);
				244
				245	kunmap(rbio->bio_pages[i]);
				246	kunmap(rbio->stripe_pages[i]);
				247	SetPageUptodate(rbio->stripe_pages[i]);
				248	}
				249	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				250	}
				251
				252	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	253	* we hash on the first logical address of the stripe
				254	*/
				255	static int rbio_bucket(struct btrfs_raid_bio *rbio)
				256	{
				257	u64 num = rbio->raid_map[0];
				258
				259	/*
				260	* we shift down quite a bit. We're using byte
				261	* addressing, and most of the lower bits are zeros.
				262	* This tends to upset hash_64, and it consistently
				263	* returns just one or two different values.
				264	*
				265	* shifting off the lower bits fixes things.
				266	*/
				267	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
				268	}
				269
				270	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame^]	271	* stealing an rbio means taking all the uptodate pages from the stripe
				272	* array in the source rbio and putting them into the destination rbio
				273	*/
				274	static void steal_rbio(struct btrfs_raid_bio src, struct btrfs_raid_bio dest)
				275	{
				276	int i;
				277	struct page *s;
				278	struct page *d;
				279
				280	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
				281	return;
				282
				283	for (i = 0; i < dest->nr_pages; i++) {
				284	s = src->stripe_pages[i];
				285	if (!s \|\| !PageUptodate(s)) {
				286	continue;
				287	}
				288
				289	d = dest->stripe_pages[i];
				290	if (d)
				291	__free_page(d);
				292
				293	dest->stripe_pages[i] = s;
				294	src->stripe_pages[i] = NULL;
				295	}
				296	}
				297
				298	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	299	* merging means we take the bio_list from the victim and
				300	* splice it into the destination. The victim should
				301	* be discarded afterwards.
				302	*
				303	* must be called with dest->rbio_list_lock held
				304	*/
				305	static void merge_rbio(struct btrfs_raid_bio *dest,
				306	struct btrfs_raid_bio *victim)
				307	{
				308	bio_list_merge(&dest->bio_list, &victim->bio_list);
				309	dest->bio_list_bytes += victim->bio_list_bytes;
				310	bio_list_init(&victim->bio_list);
				311	}
				312
				313	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame^]	314	* used to prune items that are in the cache. The caller
				315	* must hold the hash table lock.
				316	*/
				317	static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
				318	{
				319	int bucket = rbio_bucket(rbio);
				320	struct btrfs_stripe_hash_table *table;
				321	struct btrfs_stripe_hash *h;
				322	int freeit = 0;
				323
				324	/*
				325	* check the bit again under the hash table lock.
				326	*/
				327	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
				328	return;
				329
				330	table = rbio->fs_info->stripe_hash_table;
				331	h = table->table + bucket;
				332
				333	/* hold the lock for the bucket because we may be
				334	* removing it from the hash table
				335	*/
				336	spin_lock(&h->lock);
				337
				338	/*
				339	* hold the lock for the bio list because we need
				340	* to make sure the bio list is empty
				341	*/
				342	spin_lock(&rbio->bio_list_lock);
				343
				344	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
				345	list_del_init(&rbio->stripe_cache);
				346	table->cache_size -= 1;
				347	freeit = 1;
				348
				349	/* if the bio list isn't empty, this rbio is
				350	* still involved in an IO. We take it out
				351	* of the cache list, and drop the ref that
				352	* was held for the list.
				353	*
				354	* If the bio_list was empty, we also remove
				355	* the rbio from the hash_table, and drop
				356	* the corresponding ref
				357	*/
				358	if (bio_list_empty(&rbio->bio_list)) {
				359	if (!list_empty(&rbio->hash_list)) {
				360	list_del_init(&rbio->hash_list);
				361	atomic_dec(&rbio->refs);
				362	BUG_ON(!list_empty(&rbio->plug_list));
				363	}
				364	}
				365	}
				366
				367	spin_unlock(&rbio->bio_list_lock);
				368	spin_unlock(&h->lock);
				369
				370	if (freeit)
				371	__free_raid_bio(rbio);
				372	}
				373
				374	/*
				375	* prune a given rbio from the cache
				376	*/
				377	static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
				378	{
				379	struct btrfs_stripe_hash_table *table;
				380	unsigned long flags;
				381
				382	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
				383	return;
				384
				385	table = rbio->fs_info->stripe_hash_table;
				386
				387	spin_lock_irqsave(&table->cache_lock, flags);
				388	__remove_rbio_from_cache(rbio);
				389	spin_unlock_irqrestore(&table->cache_lock, flags);
				390	}
				391
				392	/*
				393	* remove everything in the cache
				394	*/
				395	void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
				396	{
				397	struct btrfs_stripe_hash_table *table;
				398	unsigned long flags;
				399	struct btrfs_raid_bio *rbio;
				400
				401	table = info->stripe_hash_table;
				402
				403	spin_lock_irqsave(&table->cache_lock, flags);
				404	while (!list_empty(&table->stripe_cache)) {
				405	rbio = list_entry(table->stripe_cache.next,
				406	struct btrfs_raid_bio,
				407	stripe_cache);
				408	__remove_rbio_from_cache(rbio);
				409	}
				410	spin_unlock_irqrestore(&table->cache_lock, flags);
				411	}
				412
				413	/*
				414	* remove all cached entries and free the hash table
				415	* used by unmount
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	416	*/
				417	void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
				418	{
				419	if (!info->stripe_hash_table)
				420	return;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame^]	421	btrfs_clear_rbio_cache(info);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	422	kfree(info->stripe_hash_table);
				423	info->stripe_hash_table = NULL;
				424	}
				425
				426	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame^]	427	* insert an rbio into the stripe cache. It
				428	* must have already been prepared by calling
				429	* cache_rbio_pages
				430	*
				431	* If this rbio was already cached, it gets
				432	* moved to the front of the lru.
				433	*
				434	* If the size of the rbio cache is too big, we
				435	* prune an item.
				436	*/
				437	static void cache_rbio(struct btrfs_raid_bio *rbio)
				438	{
				439	struct btrfs_stripe_hash_table *table;
				440	unsigned long flags;
				441
				442	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
				443	return;
				444
				445	table = rbio->fs_info->stripe_hash_table;
				446
				447	spin_lock_irqsave(&table->cache_lock, flags);
				448	spin_lock(&rbio->bio_list_lock);
				449
				450	/* bump our ref if we were not in the list before */
				451	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
				452	atomic_inc(&rbio->refs);
				453
				454	if (!list_empty(&rbio->stripe_cache)){
				455	list_move(&rbio->stripe_cache, &table->stripe_cache);
				456	} else {
				457	list_add(&rbio->stripe_cache, &table->stripe_cache);
				458	table->cache_size += 1;
				459	}
				460
				461	spin_unlock(&rbio->bio_list_lock);
				462
				463	if (table->cache_size > RBIO_CACHE_SIZE) {
				464	struct btrfs_raid_bio *found;
				465
				466	found = list_entry(table->stripe_cache.prev,
				467	struct btrfs_raid_bio,
				468	stripe_cache);
				469
				470	if (found != rbio)
				471	__remove_rbio_from_cache(found);
				472	}
				473
				474	spin_unlock_irqrestore(&table->cache_lock, flags);
				475	return;
				476	}
				477
				478	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	479	* helper function to run the xor_blocks api. It is only
				480	* able to do MAX_XOR_BLOCKS at a time, so we need to
				481	* loop through.
				482	*/
				483	static void run_xor(void **pages, int src_cnt, ssize_t len)
				484	{
				485	int src_off = 0;
				486	int xor_src_cnt = 0;
				487	void *dest = pages[src_cnt];
				488
				489	while(src_cnt > 0) {
				490	xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
				491	xor_blocks(xor_src_cnt, len, dest, pages + src_off);
				492
				493	src_cnt -= xor_src_cnt;
				494	src_off += xor_src_cnt;
				495	}
				496	}
				497
				498	/*
				499	* returns true if the bio list inside this rbio
				500	* covers an entire stripe (no rmw required).
				501	* Must be called with the bio list lock held, or
				502	* at a time when you know it is impossible to add
				503	* new bios into the list
				504	*/
				505	static int __rbio_is_full(struct btrfs_raid_bio *rbio)
				506	{
				507	unsigned long size = rbio->bio_list_bytes;
				508	int ret = 1;
				509
				510	if (size != rbio->nr_data * rbio->stripe_len)
				511	ret = 0;
				512
				513	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
				514	return ret;
				515	}
				516
				517	static int rbio_is_full(struct btrfs_raid_bio *rbio)
				518	{
				519	unsigned long flags;
				520	int ret;
				521
				522	spin_lock_irqsave(&rbio->bio_list_lock, flags);
				523	ret = __rbio_is_full(rbio);
				524	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
				525	return ret;
				526	}
				527
				528	/*
				529	* returns 1 if it is safe to merge two rbios together.
				530	* The merging is safe if the two rbios correspond to
				531	* the same stripe and if they are both going in the same
				532	* direction (read vs write), and if neither one is
				533	* locked for final IO
				534	*
				535	* The caller is responsible for locking such that
				536	* rmw_locked is safe to test
				537	*/
				538	static int rbio_can_merge(struct btrfs_raid_bio *last,
				539	struct btrfs_raid_bio *cur)
				540	{
				541	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) \|\|
				542	test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
				543	return 0;
				544
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame^]	545	/*
				546	* we can't merge with cached rbios, since the
				547	* idea is that when we merge the destination
				548	* rbio is going to run our IO for us. We can
				549	* steal from cached rbio's though, other functions
				550	* handle that.
				551	*/
				552	if (test_bit(RBIO_CACHE_BIT, &last->flags) \|\|
				553	test_bit(RBIO_CACHE_BIT, &cur->flags))
				554	return 0;
				555
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	556	if (last->raid_map[0] !=
				557	cur->raid_map[0])
				558	return 0;
				559
				560	/* reads can't merge with writes */
				561	if (last->read_rebuild !=
				562	cur->read_rebuild) {
				563	return 0;
				564	}
				565
				566	return 1;
				567	}
				568
				569	/*
				570	* helper to index into the pstripe
				571	*/
				572	static struct page rbio_pstripe_page(struct btrfs_raid_bio rbio, int index)
				573	{
				574	index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
				575	return rbio->stripe_pages[index];
				576	}
				577
				578	/*
				579	* helper to index into the qstripe, returns null
				580	* if there is no qstripe
				581	*/
				582	static struct page rbio_qstripe_page(struct btrfs_raid_bio rbio, int index)
				583	{
				584	if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
				585	return NULL;
				586
				587	index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
				588	PAGE_CACHE_SHIFT;
				589	return rbio->stripe_pages[index];
				590	}
				591
				592	/*
				593	* The first stripe in the table for a logical address
				594	* has the lock. rbios are added in one of three ways:
				595	*
				596	* 1) Nobody has the stripe locked yet. The rbio is given
				597	* the lock and 0 is returned. The caller must start the IO
				598	* themselves.
				599	*
				600	* 2) Someone has the stripe locked, but we're able to merge
				601	* with the lock owner. The rbio is freed and the IO will
				602	* start automatically along with the existing rbio. 1 is returned.
				603	*
				604	* 3) Someone has the stripe locked, but we're not able to merge.
				605	* The rbio is added to the lock owner's plug list, or merged into
				606	* an rbio already on the plug list. When the lock owner unlocks,
				607	* the next rbio on the list is run and the IO is started automatically.
				608	* 1 is returned
				609	*
				610	* If we return 0, the caller still owns the rbio and must continue with
				611	* IO submission. If we return 1, the caller must assume the rbio has
				612	* already been freed.
				613	*/
				614	static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
				615	{
				616	int bucket = rbio_bucket(rbio);
				617	struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
				618	struct btrfs_raid_bio *cur;
				619	struct btrfs_raid_bio *pending;
				620	unsigned long flags;
				621	DEFINE_WAIT(wait);
				622	struct btrfs_raid_bio *freeit = NULL;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame^]	623	struct btrfs_raid_bio *cache_drop = NULL;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	624	int ret = 0;
				625	int walk = 0;
				626
				627	spin_lock_irqsave(&h->lock, flags);
				628	list_for_each_entry(cur, &h->hash_list, hash_list) {
				629	walk++;
				630	if (cur->raid_map[0] == rbio->raid_map[0]) {
				631	spin_lock(&cur->bio_list_lock);
				632
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame^]	633	/* can we steal this cached rbio's pages? */
				634	if (bio_list_empty(&cur->bio_list) &&
				635	list_empty(&cur->plug_list) &&
				636	test_bit(RBIO_CACHE_BIT, &cur->flags) &&
				637	!test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
				638	list_del_init(&cur->hash_list);
				639	atomic_dec(&cur->refs);
				640
				641	steal_rbio(cur, rbio);
				642	cache_drop = cur;
				643	spin_unlock(&cur->bio_list_lock);
				644
				645	goto lockit;
				646	}
				647
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	648	/* can we merge into the lock owner? */
				649	if (rbio_can_merge(cur, rbio)) {
				650	merge_rbio(cur, rbio);
				651	spin_unlock(&cur->bio_list_lock);
				652	freeit = rbio;
				653	ret = 1;
				654	goto out;
				655	}
				656
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame^]	657
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	658	/*
				659	* we couldn't merge with the running
				660	* rbio, see if we can merge with the
				661	* pending ones. We don't have to
				662	* check for rmw_locked because there
				663	* is no way they are inside finish_rmw
				664	* right now
				665	*/
				666	list_for_each_entry(pending, &cur->plug_list,
				667	plug_list) {
				668	if (rbio_can_merge(pending, rbio)) {
				669	merge_rbio(pending, rbio);
				670	spin_unlock(&cur->bio_list_lock);
				671	freeit = rbio;
				672	ret = 1;
				673	goto out;
				674	}
				675	}
				676
				677	/* no merging, put us on the tail of the plug list,
				678	* our rbio will be started with the currently
				679	* running rbio unlocks
				680	*/
				681	list_add_tail(&rbio->plug_list, &cur->plug_list);
				682	spin_unlock(&cur->bio_list_lock);
				683	ret = 1;
				684	goto out;
				685	}
				686	}
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame^]	687	lockit:
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	688	atomic_inc(&rbio->refs);
				689	list_add(&rbio->hash_list, &h->hash_list);
				690	out:
				691	spin_unlock_irqrestore(&h->lock, flags);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame^]	692	if (cache_drop)
				693	remove_rbio_from_cache(cache_drop);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	694	if (freeit)
				695	__free_raid_bio(freeit);
				696	return ret;
				697	}
				698
				699	/*
				700	* called as rmw or parity rebuild is completed. If the plug list has more
				701	* rbios waiting for this stripe, the next one on the list will be started
				702	*/
				703	static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
				704	{
				705	int bucket;
				706	struct btrfs_stripe_hash *h;
				707	unsigned long flags;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame^]	708	int keep_cache = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	709
				710	bucket = rbio_bucket(rbio);
				711	h = rbio->fs_info->stripe_hash_table->table + bucket;
				712
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame^]	713	if (list_empty(&rbio->plug_list))
				714	cache_rbio(rbio);
				715
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	716	spin_lock_irqsave(&h->lock, flags);
				717	spin_lock(&rbio->bio_list_lock);
				718
				719	if (!list_empty(&rbio->hash_list)) {
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame^]	720	/*
				721	* if we're still cached and there is no other IO
				722	* to perform, just leave this rbio here for others
				723	* to steal from later
				724	*/
				725	if (list_empty(&rbio->plug_list) &&
				726	test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
				727	keep_cache = 1;
				728	clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				729	BUG_ON(!bio_list_empty(&rbio->bio_list));
				730	goto done;
				731	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	732
				733	list_del_init(&rbio->hash_list);
				734	atomic_dec(&rbio->refs);
				735
				736	/*
				737	* we use the plug list to hold all the rbios
				738	* waiting for the chance to lock this stripe.
				739	* hand the lock over to one of them.
				740	*/
				741	if (!list_empty(&rbio->plug_list)) {
				742	struct btrfs_raid_bio *next;
				743	struct list_head *head = rbio->plug_list.next;
				744
				745	next = list_entry(head, struct btrfs_raid_bio,
				746	plug_list);
				747
				748	list_del_init(&rbio->plug_list);
				749
				750	list_add(&next->hash_list, &h->hash_list);
				751	atomic_inc(&next->refs);
				752	spin_unlock(&rbio->bio_list_lock);
				753	spin_unlock_irqrestore(&h->lock, flags);
				754
				755	if (next->read_rebuild)
				756	async_read_rebuild(next);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame^]	757	else {
				758	steal_rbio(rbio, next);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	759	async_rmw_stripe(next);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame^]	760	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	761
				762	goto done_nolock;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	763	} else if (waitqueue_active(&h->wait)) {
				764	spin_unlock(&rbio->bio_list_lock);
				765	spin_unlock_irqrestore(&h->lock, flags);
				766	wake_up(&h->wait);
				767	goto done_nolock;
				768	}
				769	}
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame^]	770	done:
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	771	spin_unlock(&rbio->bio_list_lock);
				772	spin_unlock_irqrestore(&h->lock, flags);
				773
				774	done_nolock:
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame^]	775	if (!keep_cache)
				776	remove_rbio_from_cache(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	777	}
				778
				779	static void __free_raid_bio(struct btrfs_raid_bio *rbio)
				780	{
				781	int i;
				782
				783	WARN_ON(atomic_read(&rbio->refs) < 0);
				784	if (!atomic_dec_and_test(&rbio->refs))
				785	return;
				786
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame^]	787	WARN_ON(!list_empty(&rbio->stripe_cache));
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	788	WARN_ON(!list_empty(&rbio->hash_list));
				789	WARN_ON(!bio_list_empty(&rbio->bio_list));
				790
				791	for (i = 0; i < rbio->nr_pages; i++) {
				792	if (rbio->stripe_pages[i]) {
				793	__free_page(rbio->stripe_pages[i]);
				794	rbio->stripe_pages[i] = NULL;
				795	}
				796	}
				797	kfree(rbio->raid_map);
				798	kfree(rbio->bbio);
				799	kfree(rbio);
				800	}
				801
				802	static void free_raid_bio(struct btrfs_raid_bio *rbio)
				803	{
				804	unlock_stripe(rbio);
				805	__free_raid_bio(rbio);
				806	}
				807
				808	/*
				809	* this frees the rbio and runs through all the bios in the
				810	* bio_list and calls end_io on them
				811	*/
				812	static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
				813	{
				814	struct bio *cur = bio_list_get(&rbio->bio_list);
				815	struct bio *next;
				816	free_raid_bio(rbio);
				817
				818	while (cur) {
				819	next = cur->bi_next;
				820	cur->bi_next = NULL;
				821	if (uptodate)
				822	set_bit(BIO_UPTODATE, &cur->bi_flags);
				823	bio_endio(cur, err);
				824	cur = next;
				825	}
				826	}
				827
				828	/*
				829	* end io function used by finish_rmw. When we finally
				830	* get here, we've written a full stripe
				831	*/
				832	static void raid_write_end_io(struct bio *bio, int err)
				833	{
				834	struct btrfs_raid_bio *rbio = bio->bi_private;
				835
				836	if (err)
				837	fail_bio_stripe(rbio, bio);
				838
				839	bio_put(bio);
				840
				841	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
				842	return;
				843
				844	err = 0;
				845
				846	/* OK, we have read all the stripes we need to. */
				847	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
				848	err = -EIO;
				849
				850	rbio_orig_end_io(rbio, err, 0);
				851	return;
				852	}
				853
				854	/*
				855	* the read/modify/write code wants to use the original bio for
				856	* any pages it included, and then use the rbio for everything
				857	* else. This function decides if a given index (stripe number)
				858	* and page number in that stripe fall inside the original bio
				859	* or the rbio.
				860	*
				861	* if you set bio_list_only, you'll get a NULL back for any ranges
				862	* that are outside the bio_list
				863	*
				864	* This doesn't take any refs on anything, you get a bare page pointer
				865	* and the caller must bump refs as required.
				866	*
				867	* You must call index_rbio_pages once before you can trust
				868	* the answers from this function.
				869	*/
				870	static struct page page_in_rbio(struct btrfs_raid_bio rbio,
				871	int index, int pagenr, int bio_list_only)
				872	{
				873	int chunk_page;
				874	struct page *p = NULL;
				875
				876	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
				877
				878	spin_lock_irq(&rbio->bio_list_lock);
				879	p = rbio->bio_pages[chunk_page];
				880	spin_unlock_irq(&rbio->bio_list_lock);
				881
				882	if (p \|\| bio_list_only)
				883	return p;
				884
				885	return rbio->stripe_pages[chunk_page];
				886	}
				887
				888	/*
				889	* number of pages we need for the entire stripe across all the
				890	* drives
				891	*/
				892	static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
				893	{
				894	unsigned long nr = stripe_len * nr_stripes;
				895	return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
				896	}
				897
				898	/*
				899	* allocation and initial setup for the btrfs_raid_bio. Not
				900	* this does not allocate any pages for rbio->pages.
				901	*/
				902	static struct btrfs_raid_bio alloc_rbio(struct btrfs_root root,
				903	struct btrfs_bio bbio, u64 raid_map,
				904	u64 stripe_len)
				905	{
				906	struct btrfs_raid_bio *rbio;
				907	int nr_data = 0;
				908	int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
				909	void *p;
				910
				911	rbio = kzalloc(sizeof(rbio) + num_pages sizeof(struct page ) 2,
				912	GFP_NOFS);
				913	if (!rbio) {
				914	kfree(raid_map);
				915	kfree(bbio);
				916	return ERR_PTR(-ENOMEM);
				917	}
				918
				919	bio_list_init(&rbio->bio_list);
				920	INIT_LIST_HEAD(&rbio->plug_list);
				921	spin_lock_init(&rbio->bio_list_lock);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame^]	922	INIT_LIST_HEAD(&rbio->stripe_cache);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	923	INIT_LIST_HEAD(&rbio->hash_list);
				924	rbio->bbio = bbio;
				925	rbio->raid_map = raid_map;
				926	rbio->fs_info = root->fs_info;
				927	rbio->stripe_len = stripe_len;
				928	rbio->nr_pages = num_pages;
				929	rbio->faila = -1;
				930	rbio->failb = -1;
				931	atomic_set(&rbio->refs, 1);
				932
				933	/*
				934	* the stripe_pages and bio_pages array point to the extra
				935	* memory we allocated past the end of the rbio
				936	*/
				937	p = rbio + 1;
				938	rbio->stripe_pages = p;
				939	rbio->bio_pages = p + sizeof(struct page ) num_pages;
				940
				941	if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
				942	nr_data = bbio->num_stripes - 2;
				943	else
				944	nr_data = bbio->num_stripes - 1;
				945
				946	rbio->nr_data = nr_data;
				947	return rbio;
				948	}
				949
				950	/* allocate pages for all the stripes in the bio, including parity */
				951	static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
				952	{
				953	int i;
				954	struct page *page;
				955
				956	for (i = 0; i < rbio->nr_pages; i++) {
				957	if (rbio->stripe_pages[i])
				958	continue;
				959	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				960	if (!page)
				961	return -ENOMEM;
				962	rbio->stripe_pages[i] = page;
				963	ClearPageUptodate(page);
				964	}
				965	return 0;
				966	}
				967
				968	/* allocate pages for just the p/q stripes */
				969	static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
				970	{
				971	int i;
				972	struct page *page;
				973
				974	i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
				975
				976	for (; i < rbio->nr_pages; i++) {
				977	if (rbio->stripe_pages[i])
				978	continue;
				979	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				980	if (!page)
				981	return -ENOMEM;
				982	rbio->stripe_pages[i] = page;
				983	}
				984	return 0;
				985	}
				986
				987	/*
				988	* add a single page from a specific stripe into our list of bios for IO
				989	* this will try to merge into existing bios if possible, and returns
				990	* zero if all went well.
				991	*/
				992	int rbio_add_io_page(struct btrfs_raid_bio *rbio,
				993	struct bio_list *bio_list,
				994	struct page *page,
				995	int stripe_nr,
				996	unsigned long page_index,
				997	unsigned long bio_max_len)
				998	{
				999	struct bio *last = bio_list->tail;
				1000	u64 last_end = 0;
				1001	int ret;
				1002	struct bio *bio;
				1003	struct btrfs_bio_stripe *stripe;
				1004	u64 disk_start;
				1005
				1006	stripe = &rbio->bbio->stripes[stripe_nr];
				1007	disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
				1008
				1009	/* if the device is missing, just fail this stripe */
				1010	if (!stripe->dev->bdev)
				1011	return fail_rbio_index(rbio, stripe_nr);
				1012
				1013	/* see if we can add this page onto our existing bio */
				1014	if (last) {
				1015	last_end = (u64)last->bi_sector << 9;
				1016	last_end += last->bi_size;
				1017
				1018	/*
				1019	* we can't merge these if they are from different
				1020	* devices or if they are not contiguous
				1021	*/
				1022	if (last_end == disk_start && stripe->dev->bdev &&
				1023	test_bit(BIO_UPTODATE, &last->bi_flags) &&
				1024	last->bi_bdev == stripe->dev->bdev) {
				1025	ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
				1026	if (ret == PAGE_CACHE_SIZE)
				1027	return 0;
				1028	}
				1029	}
				1030
				1031	/* put a new bio on the list */
				1032	bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
				1033	if (!bio)
				1034	return -ENOMEM;
				1035
				1036	bio->bi_size = 0;
				1037	bio->bi_bdev = stripe->dev->bdev;
				1038	bio->bi_sector = disk_start >> 9;
				1039	set_bit(BIO_UPTODATE, &bio->bi_flags);
				1040
				1041	bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
				1042	bio_list_add(bio_list, bio);
				1043	return 0;
				1044	}
				1045
				1046	/*
				1047	* while we're doing the read/modify/write cycle, we could
				1048	* have errors in reading pages off the disk. This checks
				1049	* for errors and if we're not able to read the page it'll
				1050	* trigger parity reconstruction. The rmw will be finished
				1051	* after we've reconstructed the failed stripes
				1052	*/
				1053	static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
				1054	{
				1055	if (rbio->faila >= 0 \|\| rbio->failb >= 0) {
				1056	BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
				1057	__raid56_parity_recover(rbio);
				1058	} else {
				1059	finish_rmw(rbio);
				1060	}
				1061	}
				1062
				1063	/*
				1064	* these are just the pages from the rbio array, not from anything
				1065	* the FS sent down to us
				1066	*/
				1067	static struct page rbio_stripe_page(struct btrfs_raid_bio rbio, int stripe, int page)
				1068	{
				1069	int index;
				1070	index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
				1071	index += page;
				1072	return rbio->stripe_pages[index];
				1073	}
				1074
				1075	/*
				1076	* helper function to walk our bio list and populate the bio_pages array with
				1077	* the result. This seems expensive, but it is faster than constantly
				1078	* searching through the bio list as we setup the IO in finish_rmw or stripe
				1079	* reconstruction.
				1080	*
				1081	* This must be called before you trust the answers from page_in_rbio
				1082	*/
				1083	static void index_rbio_pages(struct btrfs_raid_bio *rbio)
				1084	{
				1085	struct bio *bio;
				1086	u64 start;
				1087	unsigned long stripe_offset;
				1088	unsigned long page_index;
				1089	struct page *p;
				1090	int i;
				1091
				1092	spin_lock_irq(&rbio->bio_list_lock);
				1093	bio_list_for_each(bio, &rbio->bio_list) {
				1094	start = (u64)bio->bi_sector << 9;
				1095	stripe_offset = start - rbio->raid_map[0];
				1096	page_index = stripe_offset >> PAGE_CACHE_SHIFT;
				1097
				1098	for (i = 0; i < bio->bi_vcnt; i++) {
				1099	p = bio->bi_io_vec[i].bv_page;
				1100	rbio->bio_pages[page_index + i] = p;
				1101	}
				1102	}
				1103	spin_unlock_irq(&rbio->bio_list_lock);
				1104	}
				1105
				1106	/*
				1107	* this is called from one of two situations. We either
				1108	* have a full stripe from the higher layers, or we've read all
				1109	* the missing bits off disk.
				1110	*
				1111	* This will calculate the parity and then send down any
				1112	* changed blocks.
				1113	*/
				1114	static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
				1115	{
				1116	struct btrfs_bio *bbio = rbio->bbio;
				1117	void *pointers[bbio->num_stripes];
				1118	int stripe_len = rbio->stripe_len;
				1119	int nr_data = rbio->nr_data;
				1120	int stripe;
				1121	int pagenr;
				1122	int p_stripe = -1;
				1123	int q_stripe = -1;
				1124	struct bio_list bio_list;
				1125	struct bio *bio;
				1126	int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
				1127	int ret;
				1128
				1129	bio_list_init(&bio_list);
				1130
				1131	if (bbio->num_stripes - rbio->nr_data == 1) {
				1132	p_stripe = bbio->num_stripes - 1;
				1133	} else if (bbio->num_stripes - rbio->nr_data == 2) {
				1134	p_stripe = bbio->num_stripes - 2;
				1135	q_stripe = bbio->num_stripes - 1;
				1136	} else {
				1137	BUG();
				1138	}
				1139
				1140	/* at this point we either have a full stripe,
				1141	* or we've read the full stripe from the drive.
				1142	* recalculate the parity and write the new results.
				1143	*
				1144	* We're not allowed to add any new bios to the
				1145	* bio list here, anyone else that wants to
				1146	* change this stripe needs to do their own rmw.
				1147	*/
				1148	spin_lock_irq(&rbio->bio_list_lock);
				1149	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				1150	spin_unlock_irq(&rbio->bio_list_lock);
				1151
				1152	atomic_set(&rbio->bbio->error, 0);
				1153
				1154	/*
				1155	* now that we've set rmw_locked, run through the
				1156	* bio list one last time and map the page pointers
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame^]	1157	*
				1158	* We don't cache full rbios because we're assuming
				1159	* the higher layers are unlikely to use this area of
				1160	* the disk again soon. If they do use it again,
				1161	* hopefully they will send another full bio.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1162	*/
				1163	index_rbio_pages(rbio);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame^]	1164	if (!rbio_is_full(rbio))
				1165	cache_rbio_pages(rbio);
				1166	else
				1167	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1168
				1169	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
				1170	struct page *p;
				1171	/* first collect one page from each data stripe */
				1172	for (stripe = 0; stripe < nr_data; stripe++) {
				1173	p = page_in_rbio(rbio, stripe, pagenr, 0);
				1174	pointers[stripe] = kmap(p);
				1175	}
				1176
				1177	/* then add the parity stripe */
				1178	p = rbio_pstripe_page(rbio, pagenr);
				1179	SetPageUptodate(p);
				1180	pointers[stripe++] = kmap(p);
				1181
				1182	if (q_stripe != -1) {
				1183
				1184	/*
				1185	* raid6, add the qstripe and call the
				1186	* library function to fill in our p/q
				1187	*/
				1188	p = rbio_qstripe_page(rbio, pagenr);
				1189	SetPageUptodate(p);
				1190	pointers[stripe++] = kmap(p);
				1191
				1192	raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
				1193	pointers);
				1194	} else {
				1195	/* raid5 */
				1196	memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
				1197	run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
				1198	}
				1199
				1200
				1201	for (stripe = 0; stripe < bbio->num_stripes; stripe++)
				1202	kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
				1203	}
				1204
				1205	/*
				1206	* time to start writing. Make bios for everything from the
				1207	* higher layers (the bio_list in our rbio) and our p/q. Ignore
				1208	* everything else.
				1209	*/
				1210	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
				1211	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
				1212	struct page *page;
				1213	if (stripe < rbio->nr_data) {
				1214	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1215	if (!page)
				1216	continue;
				1217	} else {
				1218	page = rbio_stripe_page(rbio, stripe, pagenr);
				1219	}
				1220
				1221	ret = rbio_add_io_page(rbio, &bio_list,
				1222	page, stripe, pagenr, rbio->stripe_len);
				1223	if (ret)
				1224	goto cleanup;
				1225	}
				1226	}
				1227
				1228	atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
				1229	BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
				1230
				1231	while (1) {
				1232	bio = bio_list_pop(&bio_list);
				1233	if (!bio)
				1234	break;
				1235
				1236	bio->bi_private = rbio;
				1237	bio->bi_end_io = raid_write_end_io;
				1238	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
				1239	submit_bio(WRITE, bio);
				1240	}
				1241	return;
				1242
				1243	cleanup:
				1244	rbio_orig_end_io(rbio, -EIO, 0);
				1245	}
				1246
				1247	/*
				1248	* helper to find the stripe number for a given bio. Used to figure out which
				1249	* stripe has failed. This expects the bio to correspond to a physical disk,
				1250	* so it looks up based on physical sector numbers.
				1251	*/
				1252	static int find_bio_stripe(struct btrfs_raid_bio *rbio,
				1253	struct bio *bio)
				1254	{
				1255	u64 physical = bio->bi_sector;
				1256	u64 stripe_start;
				1257	int i;
				1258	struct btrfs_bio_stripe *stripe;
				1259
				1260	physical <<= 9;
				1261
				1262	for (i = 0; i < rbio->bbio->num_stripes; i++) {
				1263	stripe = &rbio->bbio->stripes[i];
				1264	stripe_start = stripe->physical;
				1265	if (physical >= stripe_start &&
				1266	physical < stripe_start + rbio->stripe_len) {
				1267	return i;
				1268	}
				1269	}
				1270	return -1;
				1271	}
				1272
				1273	/*
				1274	* helper to find the stripe number for a given
				1275	* bio (before mapping). Used to figure out which stripe has
				1276	* failed. This looks up based on logical block numbers.
				1277	*/
				1278	static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
				1279	struct bio *bio)
				1280	{
				1281	u64 logical = bio->bi_sector;
				1282	u64 stripe_start;
				1283	int i;
				1284
				1285	logical <<= 9;
				1286
				1287	for (i = 0; i < rbio->nr_data; i++) {
				1288	stripe_start = rbio->raid_map[i];
				1289	if (logical >= stripe_start &&
				1290	logical < stripe_start + rbio->stripe_len) {
				1291	return i;
				1292	}
				1293	}
				1294	return -1;
				1295	}
				1296
				1297	/*
				1298	* returns -EIO if we had too many failures
				1299	*/
				1300	static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
				1301	{
				1302	unsigned long flags;
				1303	int ret = 0;
				1304
				1305	spin_lock_irqsave(&rbio->bio_list_lock, flags);
				1306
				1307	/* we already know this stripe is bad, move on */
				1308	if (rbio->faila == failed \|\| rbio->failb == failed)
				1309	goto out;
				1310
				1311	if (rbio->faila == -1) {
				1312	/* first failure on this rbio */
				1313	rbio->faila = failed;
				1314	atomic_inc(&rbio->bbio->error);
				1315	} else if (rbio->failb == -1) {
				1316	/* second failure on this rbio */
				1317	rbio->failb = failed;
				1318	atomic_inc(&rbio->bbio->error);
				1319	} else {
				1320	ret = -EIO;
				1321	}
				1322	out:
				1323	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
				1324
				1325	return ret;
				1326	}
				1327
				1328	/*
				1329	* helper to fail a stripe based on a physical disk
				1330	* bio.
				1331	*/
				1332	static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
				1333	struct bio *bio)
				1334	{
				1335	int failed = find_bio_stripe(rbio, bio);
				1336
				1337	if (failed < 0)
				1338	return -EIO;
				1339
				1340	return fail_rbio_index(rbio, failed);
				1341	}
				1342
				1343	/*
				1344	* this sets each page in the bio uptodate. It should only be used on private
				1345	* rbio pages, nothing that comes in from the higher layers
				1346	*/
				1347	static void set_bio_pages_uptodate(struct bio *bio)
				1348	{
				1349	int i;
				1350	struct page *p;
				1351
				1352	for (i = 0; i < bio->bi_vcnt; i++) {
				1353	p = bio->bi_io_vec[i].bv_page;
				1354	SetPageUptodate(p);
				1355	}
				1356	}
				1357
				1358	/*
				1359	* end io for the read phase of the rmw cycle. All the bios here are physical
				1360	* stripe bios we've read from the disk so we can recalculate the parity of the
				1361	* stripe.
				1362	*
				1363	* This will usually kick off finish_rmw once all the bios are read in, but it
				1364	* may trigger parity reconstruction if we had any errors along the way
				1365	*/
				1366	static void raid_rmw_end_io(struct bio *bio, int err)
				1367	{
				1368	struct btrfs_raid_bio *rbio = bio->bi_private;
				1369
				1370	if (err)
				1371	fail_bio_stripe(rbio, bio);
				1372	else
				1373	set_bio_pages_uptodate(bio);
				1374
				1375	bio_put(bio);
				1376
				1377	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
				1378	return;
				1379
				1380	err = 0;
				1381	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
				1382	goto cleanup;
				1383
				1384	/*
				1385	* this will normally call finish_rmw to start our write
				1386	* but if there are any failed stripes we'll reconstruct
				1387	* from parity first
				1388	*/
				1389	validate_rbio_for_rmw(rbio);
				1390	return;
				1391
				1392	cleanup:
				1393
				1394	rbio_orig_end_io(rbio, -EIO, 0);
				1395	}
				1396
				1397	static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
				1398	{
				1399	rbio->work.flags = 0;
				1400	rbio->work.func = rmw_work;
				1401
				1402	btrfs_queue_worker(&rbio->fs_info->rmw_workers,
				1403	&rbio->work);
				1404	}
				1405
				1406	static void async_read_rebuild(struct btrfs_raid_bio *rbio)
				1407	{
				1408	rbio->work.flags = 0;
				1409	rbio->work.func = read_rebuild_work;
				1410
				1411	btrfs_queue_worker(&rbio->fs_info->rmw_workers,
				1412	&rbio->work);
				1413	}
				1414
				1415	/*
				1416	* the stripe must be locked by the caller. It will
				1417	* unlock after all the writes are done
				1418	*/
				1419	static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
				1420	{
				1421	int bios_to_read = 0;
				1422	struct btrfs_bio *bbio = rbio->bbio;
				1423	struct bio_list bio_list;
				1424	int ret;
				1425	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
				1426	int pagenr;
				1427	int stripe;
				1428	struct bio *bio;
				1429
				1430	bio_list_init(&bio_list);
				1431
				1432	ret = alloc_rbio_pages(rbio);
				1433	if (ret)
				1434	goto cleanup;
				1435
				1436	index_rbio_pages(rbio);
				1437
				1438	atomic_set(&rbio->bbio->error, 0);
				1439	/*
				1440	* build a list of bios to read all the missing parts of this
				1441	* stripe
				1442	*/
				1443	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
				1444	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
				1445	struct page *page;
				1446	/*
				1447	* we want to find all the pages missing from
				1448	* the rbio and read them from the disk. If
				1449	* page_in_rbio finds a page in the bio list
				1450	* we don't need to read it off the stripe.
				1451	*/
				1452	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1453	if (page)
				1454	continue;
				1455
				1456	page = rbio_stripe_page(rbio, stripe, pagenr);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame^]	1457	/*
				1458	* the bio cache may have handed us an uptodate
				1459	* page. If so, be happy and use it
				1460	*/
				1461	if (PageUptodate(page))
				1462	continue;
				1463
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1464	ret = rbio_add_io_page(rbio, &bio_list, page,
				1465	stripe, pagenr, rbio->stripe_len);
				1466	if (ret)
				1467	goto cleanup;
				1468	}
				1469	}
				1470
				1471	bios_to_read = bio_list_size(&bio_list);
				1472	if (!bios_to_read) {
				1473	/*
				1474	* this can happen if others have merged with
				1475	* us, it means there is nothing left to read.
				1476	* But if there are missing devices it may not be
				1477	* safe to do the full stripe write yet.
				1478	*/
				1479	goto finish;
				1480	}
				1481
				1482	/*
				1483	* the bbio may be freed once we submit the last bio. Make sure
				1484	* not to touch it after that
				1485	*/
				1486	atomic_set(&bbio->stripes_pending, bios_to_read);
				1487	while (1) {
				1488	bio = bio_list_pop(&bio_list);
				1489	if (!bio)
				1490	break;
				1491
				1492	bio->bi_private = rbio;
				1493	bio->bi_end_io = raid_rmw_end_io;
				1494
				1495	btrfs_bio_wq_end_io(rbio->fs_info, bio,
				1496	BTRFS_WQ_ENDIO_RAID56);
				1497
				1498	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
				1499	submit_bio(READ, bio);
				1500	}
				1501	/* the actual write will happen once the reads are done */
				1502	return 0;
				1503
				1504	cleanup:
				1505	rbio_orig_end_io(rbio, -EIO, 0);
				1506	return -EIO;
				1507
				1508	finish:
				1509	validate_rbio_for_rmw(rbio);
				1510	return 0;
				1511	}
				1512
				/*
				 * the upper layers gave us a complete stripe, so only the parity
				 * pages need allocating before the whole thing is sent down.
				 *
				 * NOTE(review): a nonzero return from lock_stripe_add is not passed
				 * up, the function still returns 0 in that case.
				 * NOTE(review): the rbio is not released here when the parity page
				 * allocation fails; confirm who owns it on that path.
				 */
				static int full_stripe_write(struct btrfs_raid_bio *rbio)
				{
					int err = alloc_rbio_parity_pages(rbio);

					if (err)
						return err;

					if (lock_stripe_add(rbio) == 0)
						finish_rmw(rbio);
					return 0;
				}
				1530
				/*
				 * a write that does not cover the full stripe is pushed to the async
				 * rmw helper.  The hope is that a few more writes get merged into
				 * this rbio before the new parity is calculated.  Always returns 0.
				 */
				static int partial_stripe_write(struct btrfs_raid_bio *rbio)
				{
					if (lock_stripe_add(rbio) == 0)
						async_rmw_stripe(rbio);
					return 0;
				}
				1545
				/*
				 * while we were reading from the drive to recalculate parity, enough
				 * new bios may have come in to make up a full stripe.  Check for that
				 * here: a full rbio takes the full stripe path, anything else goes
				 * off into rmw land.
				 */
				static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
				{
					if (rbio_is_full(rbio))
						return full_stripe_write(rbio);

					return partial_stripe_write(rbio);
				}
				1559
				1560	/*
				1561	* our main entry point for writes from the rest of the FS.
				1562	*/
				1563	int raid56_parity_write(struct btrfs_root root, struct bio bio,
				1564	struct btrfs_bio bbio, u64 raid_map,
				1565	u64 stripe_len)
				1566	{
				1567	struct btrfs_raid_bio *rbio;
				1568
				1569	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
				1570	if (IS_ERR(rbio)) {
				1571	kfree(raid_map);
				1572	kfree(bbio);
				1573	return PTR_ERR(rbio);
				1574	}
				1575	bio_list_add(&rbio->bio_list, bio);
				1576	rbio->bio_list_bytes = bio->bi_size;
				1577	return __raid56_parity_write(rbio);
				1578	}
				1579
				1580	/*
				1581	* all parity reconstruction happens here. We've read in everything
				1582	* we can find from the drives and this does the heavy lifting of
				1583	* sorting the good from the bad.
				1584	*/
				1585	static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
				1586	{
				1587	int pagenr, stripe;
				1588	void **pointers;
				1589	int faila = -1, failb = -1;
				1590	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
				1591	struct page *page;
				1592	int err;
				1593	int i;
				1594
				1595	pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
				1596	GFP_NOFS);
				1597	if (!pointers) {
				1598	err = -ENOMEM;
				1599	goto cleanup_io;
				1600	}
				1601
				1602	faila = rbio->faila;
				1603	failb = rbio->failb;
				1604
				1605	if (rbio->read_rebuild) {
				1606	spin_lock_irq(&rbio->bio_list_lock);
				1607	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				1608	spin_unlock_irq(&rbio->bio_list_lock);
				1609	}
				1610
				1611	index_rbio_pages(rbio);
				1612
				1613	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
				1614	/* setup our array of pointers with pages
				1615	* from each stripe
				1616	*/
				1617	for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
				1618	/*
				1619	* if we're rebuilding a read, we have to use
				1620	* pages from the bio list
				1621	*/
				1622	if (rbio->read_rebuild &&
				1623	(stripe == faila \|\| stripe == failb)) {
				1624	page = page_in_rbio(rbio, stripe, pagenr, 0);
				1625	} else {
				1626	page = rbio_stripe_page(rbio, stripe, pagenr);
				1627	}
				1628	pointers[stripe] = kmap(page);
				1629	}
				1630
				1631	/* all raid6 handling here */
				1632	if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
				1633	RAID6_Q_STRIPE) {
				1634
				1635	/*
				1636	* single failure, rebuild from parity raid5
				1637	* style
				1638	*/
				1639	if (failb < 0) {
				1640	if (faila == rbio->nr_data) {
				1641	/*
				1642	* Just the P stripe has failed, without
				1643	* a bad data or Q stripe.
				1644	* TODO, we should redo the xor here.
				1645	*/
				1646	err = -EIO;
				1647	goto cleanup;
				1648	}
				1649	/*
				1650	* a single failure in raid6 is rebuilt
				1651	* in the pstripe code below
				1652	*/
				1653	goto pstripe;
				1654	}
				1655
				1656	/* make sure our ps and qs are in order */
				1657	if (faila > failb) {
				1658	int tmp = failb;
				1659	failb = faila;
				1660	faila = tmp;
				1661	}
				1662
				1663	/* if the q stripe is failed, do a pstripe reconstruction
				1664	* from the xors.
				1665	* If both the q stripe and the P stripe are failed, we're
				1666	* here due to a crc mismatch and we can't give them the
				1667	* data they want
				1668	*/
				1669	if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
				1670	if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
				1671	err = -EIO;
				1672	goto cleanup;
				1673	}
				1674	/*
				1675	* otherwise we have one bad data stripe and
				1676	* a good P stripe. raid5!
				1677	*/
				1678	goto pstripe;
				1679	}
				1680
				1681	if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
				1682	raid6_datap_recov(rbio->bbio->num_stripes,
				1683	PAGE_SIZE, faila, pointers);
				1684	} else {
				1685	raid6_2data_recov(rbio->bbio->num_stripes,
				1686	PAGE_SIZE, faila, failb,
				1687	pointers);
				1688	}
				1689	} else {
				1690	void *p;
				1691
				1692	/* rebuild from P stripe here (raid5 or raid6) */
				1693	BUG_ON(failb != -1);
				1694	pstripe:
				1695	/* Copy parity block into failed block to start with */
				1696	memcpy(pointers[faila],
				1697	pointers[rbio->nr_data],
				1698	PAGE_CACHE_SIZE);
				1699
				1700	/* rearrange the pointer array */
				1701	p = pointers[faila];
				1702	for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
				1703	pointers[stripe] = pointers[stripe + 1];
				1704	pointers[rbio->nr_data - 1] = p;
				1705
				1706	/* xor in the rest */
				1707	run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
				1708	}
				1709	/* if we're doing this rebuild as part of an rmw, go through
				1710	* and set all of our private rbio pages in the
				1711	* failed stripes as uptodate. This way finish_rmw will
				1712	* know they can be trusted. If this was a read reconstruction,
				1713	* other endio functions will fiddle the uptodate bits
				1714	*/
				1715	if (!rbio->read_rebuild) {
				1716	for (i = 0; i < nr_pages; i++) {
				1717	if (faila != -1) {
				1718	page = rbio_stripe_page(rbio, faila, i);
				1719	SetPageUptodate(page);
				1720	}
				1721	if (failb != -1) {
				1722	page = rbio_stripe_page(rbio, failb, i);
				1723	SetPageUptodate(page);
				1724	}
				1725	}
				1726	}
				1727	for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
				1728	/*
				1729	* if we're rebuilding a read, we have to use
				1730	* pages from the bio list
				1731	*/
				1732	if (rbio->read_rebuild &&
				1733	(stripe == faila \|\| stripe == failb)) {
				1734	page = page_in_rbio(rbio, stripe, pagenr, 0);
				1735	} else {
				1736	page = rbio_stripe_page(rbio, stripe, pagenr);
				1737	}
				1738	kunmap(page);
				1739	}
				1740	}
				1741
				1742	err = 0;
				1743	cleanup:
				1744	kfree(pointers);
				1745
				1746	cleanup_io:
				1747
				1748	if (rbio->read_rebuild) {
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame^]	1749	if (err == 0)
				1750	cache_rbio_pages(rbio);
				1751	else
				1752	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				1753
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1754	rbio_orig_end_io(rbio, err, err == 0);
				1755	} else if (err == 0) {
				1756	rbio->faila = -1;
				1757	rbio->failb = -1;
				1758	finish_rmw(rbio);
				1759	} else {
				1760	rbio_orig_end_io(rbio, err, 0);
				1761	}
				1762	}
				1763
				1764	/*
				1765	* This is called only for stripes we've read from disk to
				1766	* reconstruct the parity.
				1767	*/
				1768	static void raid_recover_end_io(struct bio *bio, int err)
				1769	{
				1770	struct btrfs_raid_bio *rbio = bio->bi_private;
				1771
				1772	/*
				1773	* we only read stripe pages off the disk, set them
				1774	* up to date if there were no errors
				1775	*/
				1776	if (err)
				1777	fail_bio_stripe(rbio, bio);
				1778	else
				1779	set_bio_pages_uptodate(bio);
				1780	bio_put(bio);
				1781
				1782	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
				1783	return;
				1784
				1785	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
				1786	rbio_orig_end_io(rbio, -EIO, 0);
				1787	else
				1788	__raid_recover_end_io(rbio);
				1789	}
				1790
				1791	/*
				1792	* reads everything we need off the disk to reconstruct
				1793	* the parity. endio handlers trigger final reconstruction
				1794	* when the IO is done.
				1795	*
				1796	* This is used both for reads from the higher layers and for
				1797	* parity construction required to finish a rmw cycle.
				1798	*/
				1799	static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
				1800	{
				1801	int bios_to_read = 0;
				1802	struct btrfs_bio *bbio = rbio->bbio;
				1803	struct bio_list bio_list;
				1804	int ret;
				1805	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
				1806	int pagenr;
				1807	int stripe;
				1808	struct bio *bio;
				1809
				1810	bio_list_init(&bio_list);
				1811
				1812	ret = alloc_rbio_pages(rbio);
				1813	if (ret)
				1814	goto cleanup;
				1815
				1816	atomic_set(&rbio->bbio->error, 0);
				1817
				1818	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame^]	1819	* read everything that hasn't failed. Thanks to the
				1820	* stripe cache, it is possible that some or all of these
				1821	* pages are going to be uptodate.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1822	*/
				1823	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
				1824	if (rbio->faila == stripe \|\|
				1825	rbio->failb == stripe)
				1826	continue;
				1827
				1828	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
				1829	struct page *p;
				1830
				1831	/*
				1832	* the rmw code may have already read this
				1833	* page in
				1834	*/
				1835	p = rbio_stripe_page(rbio, stripe, pagenr);
				1836	if (PageUptodate(p))
				1837	continue;
				1838
				1839	ret = rbio_add_io_page(rbio, &bio_list,
				1840	rbio_stripe_page(rbio, stripe, pagenr),
				1841	stripe, pagenr, rbio->stripe_len);
				1842	if (ret < 0)
				1843	goto cleanup;
				1844	}
				1845	}
				1846
				1847	bios_to_read = bio_list_size(&bio_list);
				1848	if (!bios_to_read) {
				1849	/*
				1850	* we might have no bios to read just because the pages
				1851	* were up to date, or we might have no bios to read because
				1852	* the devices were gone.
				1853	*/
				1854	if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
				1855	__raid_recover_end_io(rbio);
				1856	goto out;
				1857	} else {
				1858	goto cleanup;
				1859	}
				1860	}
				1861
				1862	/*
				1863	* the bbio may be freed once we submit the last bio. Make sure
				1864	* not to touch it after that
				1865	*/
				1866	atomic_set(&bbio->stripes_pending, bios_to_read);
				1867	while (1) {
				1868	bio = bio_list_pop(&bio_list);
				1869	if (!bio)
				1870	break;
				1871
				1872	bio->bi_private = rbio;
				1873	bio->bi_end_io = raid_recover_end_io;
				1874
				1875	btrfs_bio_wq_end_io(rbio->fs_info, bio,
				1876	BTRFS_WQ_ENDIO_RAID56);
				1877
				1878	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
				1879	submit_bio(READ, bio);
				1880	}
				1881	out:
				1882	return 0;
				1883
				1884	cleanup:
				1885	if (rbio->read_rebuild)
				1886	rbio_orig_end_io(rbio, -EIO, 0);
				1887	return -EIO;
				1888	}
				1889
				1890	/*
				1891	* the main entry point for reads from the higher layers. This
				1892	* is really only called when the normal read path had a failure,
				1893	* so we assume the bio they send down corresponds to a failed part
				1894	* of the drive.
				1895	*/
				1896	int raid56_parity_recover(struct btrfs_root root, struct bio bio,
				1897	struct btrfs_bio bbio, u64 raid_map,
				1898	u64 stripe_len, int mirror_num)
				1899	{
				1900	struct btrfs_raid_bio *rbio;
				1901	int ret;
				1902
				1903	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
				1904	if (IS_ERR(rbio)) {
				1905	return PTR_ERR(rbio);
				1906	}
				1907
				1908	rbio->read_rebuild = 1;
				1909	bio_list_add(&rbio->bio_list, bio);
				1910	rbio->bio_list_bytes = bio->bi_size;
				1911
				1912	rbio->faila = find_logical_bio_stripe(rbio, bio);
				1913	if (rbio->faila == -1) {
				1914	BUG();
				1915	kfree(rbio);
				1916	return -EIO;
				1917	}
				1918
				1919	/*
				1920	* reconstruct from the q stripe if they are
				1921	* asking for mirror 3
				1922	*/
				1923	if (mirror_num == 3)
				1924	rbio->failb = bbio->num_stripes - 2;
				1925
				1926	ret = lock_stripe_add(rbio);
				1927
				1928	/*
				1929	* __raid56_parity_recover will end the bio with
				1930	* any errors it hits. We don't want to return
				1931	* its error value up the stack because our caller
				1932	* will end up calling bio_endio with any nonzero
				1933	* return
				1934	*/
				1935	if (ret == 0)
				1936	__raid56_parity_recover(rbio);
				1937	/*
				1938	* our rbio has been added to the list of
				1939	* rbios that will be handled after the
				1940	* currently lock owner is done
				1941	*/
				1942	return 0;
				1943
				1944	}
				1945
				1946	static void rmw_work(struct btrfs_work *work)
				1947	{
				1948	struct btrfs_raid_bio *rbio;
				1949
				1950	rbio = container_of(work, struct btrfs_raid_bio, work);
				1951	raid56_rmw_stripe(rbio);
				1952	}
				1953
				1954	static void read_rebuild_work(struct btrfs_work *work)
				1955	{
				1956	struct btrfs_raid_bio *rbio;
				1957
				1958	rbio = container_of(work, struct btrfs_raid_bio, work);
				1959	__raid56_parity_recover(rbio);
				1960	}