Blame - fs/btrfs/raid56.c - kernel/msm-4.9

blob: 0525e1389f5b16658ccea028da6408da812c974b [file] [log] [blame]

David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1	/*
				2	* Copyright (C) 2012 Fusion-io All rights reserved.
				3	* Copyright (C) 2012 Intel Corp. All rights reserved.
				4	*
				5	* This program is free software; you can redistribute it and/or
				6	* modify it under the terms of the GNU General Public
				7	* License v2 as published by the Free Software Foundation.
				8	*
				9	* This program is distributed in the hope that it will be useful,
				10	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				11	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				12	* General Public License for more details.
				13	*
				14	* You should have received a copy of the GNU General Public
				15	* License along with this program; if not, write to the
				16	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				17	* Boston, MA 021110-1307, USA.
				18	*/
				19	#include <linux/sched.h>
				20	#include <linux/wait.h>
				21	#include <linux/bio.h>
				22	#include <linux/slab.h>
				23	#include <linux/buffer_head.h>
				24	#include <linux/blkdev.h>
				25	#include <linux/random.h>
				26	#include <linux/iocontext.h>
				27	#include <linux/capability.h>
				28	#include <linux/ratelimit.h>
				29	#include <linux/kthread.h>
				30	#include <linux/raid/pq.h>
				31	#include <linux/hash.h>
				32	#include <linux/list_sort.h>
				33	#include <linux/raid/xor.h>
Geert Uytterhoeven	d7011f5	2013-03-03 04:44:41 -0700	[diff] [blame]	34	#include <linux/vmalloc.h>
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	35	#include <asm/div64.h>
				36	#include "compat.h"
				37	#include "ctree.h"
				38	#include "extent_map.h"
				39	#include "disk-io.h"
				40	#include "transaction.h"
				41	#include "print-tree.h"
				42	#include "volumes.h"
				43	#include "raid56.h"
				44	#include "async-thread.h"
				45	#include "check-integrity.h"
				46	#include "rcu-string.h"
				47
				48	/* set when additional merges to this rbio are not allowed */
				49	#define RBIO_RMW_LOCKED_BIT 1
				50
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	51	/*
				52	* set when this rbio is sitting in the hash, but it is just a cache
				53	* of past RMW
				54	*/
				55	#define RBIO_CACHE_BIT 2
				56
				57	/*
				58	* set when it is safe to trust the stripe_pages for caching
				59	*/
				60	#define RBIO_CACHE_READY_BIT 3
				61
				62
				63	#define RBIO_CACHE_SIZE 1024
				64
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	65	struct btrfs_raid_bio {
				66	struct btrfs_fs_info *fs_info;
				67	struct btrfs_bio *bbio;
				68
				69	/*
				70	* logical block numbers for the start of each stripe
				71	* The last one or two are p/q. These are sorted,
				72	* so raid_map[0] is the start of our full stripe
				73	*/
				74	u64 *raid_map;
				75
				76	/* while we're doing rmw on a stripe
				77	* we put it into a hash table so we can
				78	* lock the stripe and merge more rbios
				79	* into it.
				80	*/
				81	struct list_head hash_list;
				82
				83	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	84	* LRU list for the stripe cache
				85	*/
				86	struct list_head stripe_cache;
				87
				88	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	89	* for scheduling work in the helper threads
				90	*/
				91	struct btrfs_work work;
				92
				93	/*
				94	* bio list and bio_list_lock are used
				95	* to add more bios into the stripe
				96	* in hopes of avoiding the full rmw
				97	*/
				98	struct bio_list bio_list;
				99	spinlock_t bio_list_lock;
				100
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	101	/* also protected by the bio_list_lock, the
				102	* plug list is used by the plugging code
				103	* to collect partial bios while plugged. The
				104	* stripe locking code also uses it to hand off
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	105	* the stripe lock to the next pending IO
				106	*/
				107	struct list_head plug_list;
				108
				109	/*
				110	* flags that tell us if it is safe to
				111	* merge with this bio
				112	*/
				113	unsigned long flags;
				114
				115	/* size of each individual stripe on disk */
				116	int stripe_len;
				117
				118	/* number of data stripes (no p/q) */
				119	int nr_data;
				120
				121	/*
				122	* set if we're doing a parity rebuild
				123	* for a read from higher up, which is handled
				124	* differently from a parity rebuild as part of
				125	* rmw
				126	*/
				127	int read_rebuild;
				128
				129	/* first bad stripe */
				130	int faila;
				131
				132	/* second bad stripe (for raid6 use) */
				133	int failb;
				134
				135	/*
				136	* number of pages needed to represent the full
				137	* stripe
				138	*/
				139	int nr_pages;
				140
				141	/*
				142	* size of all the bios in the bio_list. This
				143	* helps us decide if the rbio maps to a full
				144	* stripe or not
				145	*/
				146	int bio_list_bytes;
				147
				148	atomic_t refs;
				149
				150	/*
				151	* these are two arrays of pointers. We allocate the
				152	* rbio big enough to hold them both and setup their
				153	* locations when the rbio is allocated
				154	*/
				155
				156	/* pointers to pages that we allocated for
				157	* reading/writing stripes directly from the disk (including P/Q)
				158	*/
				159	struct page **stripe_pages;
				160
				161	/*
				162	* pointers to the pages in the bio_list. Stored
				163	* here for faster lookup
				164	*/
				165	struct page **bio_pages;
				166	};
				167
				168	static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
				169	static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
				170	static void rmw_work(struct btrfs_work *work);
				171	static void read_rebuild_work(struct btrfs_work *work);
				172	static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
				173	static void async_read_rebuild(struct btrfs_raid_bio *rbio);
				174	static int fail_bio_stripe(struct btrfs_raid_bio rbio, struct bio bio);
				175	static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
				176	static void __free_raid_bio(struct btrfs_raid_bio *rbio);
				177	static void index_rbio_pages(struct btrfs_raid_bio *rbio);
				178	static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
				179
				180	/*
				181	* the stripe hash table is used for locking, and to collect
				182	* bios in hopes of making a full stripe
				183	*/
				184	int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
				185	{
				186	struct btrfs_stripe_hash_table *table;
				187	struct btrfs_stripe_hash_table *x;
				188	struct btrfs_stripe_hash *cur;
				189	struct btrfs_stripe_hash *h;
				190	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
				191	int i;
David Sterba	83c8266	2013-03-01 15:03:00 +0000	[diff] [blame]	192	int table_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	193
				194	if (info->stripe_hash_table)
				195	return 0;
				196
David Sterba	83c8266	2013-03-01 15:03:00 +0000	[diff] [blame]	197	/*
				198	* The table is large, starting with order 4 and can go as high as
				199	* order 7 in case lock debugging is turned on.
				200	*
				201	* Try harder to allocate and fallback to vmalloc to lower the chance
				202	* of a failing mount.
				203	*/
				204	table_size = sizeof(table) + sizeof(h) * num_entries;
				205	table = kzalloc(table_size, GFP_KERNEL \| __GFP_NOWARN \| __GFP_REPEAT);
				206	if (!table) {
				207	table = vzalloc(table_size);
				208	if (!table)
				209	return -ENOMEM;
				210	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	211
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	212	spin_lock_init(&table->cache_lock);
				213	INIT_LIST_HEAD(&table->stripe_cache);
				214
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	215	h = table->table;
				216
				217	for (i = 0; i < num_entries; i++) {
				218	cur = h + i;
				219	INIT_LIST_HEAD(&cur->hash_list);
				220	spin_lock_init(&cur->lock);
				221	init_waitqueue_head(&cur->wait);
				222	}
				223
				224	x = cmpxchg(&info->stripe_hash_table, NULL, table);
David Sterba	83c8266	2013-03-01 15:03:00 +0000	[diff] [blame]	225	if (x) {
				226	if (is_vmalloc_addr(x))
				227	vfree(x);
				228	else
				229	kfree(x);
				230	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	231	return 0;
				232	}
				233
				234	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	235	* caching an rbio means to copy anything from the
				236	* bio_pages array into the stripe_pages array. We
				237	* use the page uptodate bit in the stripe cache array
				238	* to indicate if it has valid data
				239	*
				240	* once the caching is done, we set the cache ready
				241	* bit.
				242	*/
				243	static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
				244	{
				245	int i;
				246	char *s;
				247	char *d;
				248	int ret;
				249
				250	ret = alloc_rbio_pages(rbio);
				251	if (ret)
				252	return;
				253
				254	for (i = 0; i < rbio->nr_pages; i++) {
				255	if (!rbio->bio_pages[i])
				256	continue;
				257
				258	s = kmap(rbio->bio_pages[i]);
				259	d = kmap(rbio->stripe_pages[i]);
				260
				261	memcpy(d, s, PAGE_CACHE_SIZE);
				262
				263	kunmap(rbio->bio_pages[i]);
				264	kunmap(rbio->stripe_pages[i]);
				265	SetPageUptodate(rbio->stripe_pages[i]);
				266	}
				267	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				268	}
				269
				270	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	271	* we hash on the first logical address of the stripe
				272	*/
				273	static int rbio_bucket(struct btrfs_raid_bio *rbio)
				274	{
				275	u64 num = rbio->raid_map[0];
				276
				277	/*
				278	* we shift down quite a bit. We're using byte
				279	* addressing, and most of the lower bits are zeros.
				280	* This tends to upset hash_64, and it consistently
				281	* returns just one or two different values.
				282	*
				283	* shifting off the lower bits fixes things.
				284	*/
				285	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
				286	}
				287
				288	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	289	* stealing an rbio means taking all the uptodate pages from the stripe
				290	* array in the source rbio and putting them into the destination rbio
				291	*/
				292	static void steal_rbio(struct btrfs_raid_bio src, struct btrfs_raid_bio dest)
				293	{
				294	int i;
				295	struct page *s;
				296	struct page *d;
				297
				298	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
				299	return;
				300
				301	for (i = 0; i < dest->nr_pages; i++) {
				302	s = src->stripe_pages[i];
				303	if (!s \|\| !PageUptodate(s)) {
				304	continue;
				305	}
				306
				307	d = dest->stripe_pages[i];
				308	if (d)
				309	__free_page(d);
				310
				311	dest->stripe_pages[i] = s;
				312	src->stripe_pages[i] = NULL;
				313	}
				314	}
				315
				316	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	317	* merging means we take the bio_list from the victim and
				318	* splice it into the destination. The victim should
				319	* be discarded afterwards.
				320	*
				321	* must be called with dest->rbio_list_lock held
				322	*/
				323	static void merge_rbio(struct btrfs_raid_bio *dest,
				324	struct btrfs_raid_bio *victim)
				325	{
				326	bio_list_merge(&dest->bio_list, &victim->bio_list);
				327	dest->bio_list_bytes += victim->bio_list_bytes;
				328	bio_list_init(&victim->bio_list);
				329	}
				330
				331	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	332	* used to prune items that are in the cache. The caller
				333	* must hold the hash table lock.
				334	*/
				335	static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
				336	{
				337	int bucket = rbio_bucket(rbio);
				338	struct btrfs_stripe_hash_table *table;
				339	struct btrfs_stripe_hash *h;
				340	int freeit = 0;
				341
				342	/*
				343	* check the bit again under the hash table lock.
				344	*/
				345	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
				346	return;
				347
				348	table = rbio->fs_info->stripe_hash_table;
				349	h = table->table + bucket;
				350
				351	/* hold the lock for the bucket because we may be
				352	* removing it from the hash table
				353	*/
				354	spin_lock(&h->lock);
				355
				356	/*
				357	* hold the lock for the bio list because we need
				358	* to make sure the bio list is empty
				359	*/
				360	spin_lock(&rbio->bio_list_lock);
				361
				362	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
				363	list_del_init(&rbio->stripe_cache);
				364	table->cache_size -= 1;
				365	freeit = 1;
				366
				367	/* if the bio list isn't empty, this rbio is
				368	* still involved in an IO. We take it out
				369	* of the cache list, and drop the ref that
				370	* was held for the list.
				371	*
				372	* If the bio_list was empty, we also remove
				373	* the rbio from the hash_table, and drop
				374	* the corresponding ref
				375	*/
				376	if (bio_list_empty(&rbio->bio_list)) {
				377	if (!list_empty(&rbio->hash_list)) {
				378	list_del_init(&rbio->hash_list);
				379	atomic_dec(&rbio->refs);
				380	BUG_ON(!list_empty(&rbio->plug_list));
				381	}
				382	}
				383	}
				384
				385	spin_unlock(&rbio->bio_list_lock);
				386	spin_unlock(&h->lock);
				387
				388	if (freeit)
				389	__free_raid_bio(rbio);
				390	}
				391
				392	/*
				393	* prune a given rbio from the cache
				394	*/
				395	static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
				396	{
				397	struct btrfs_stripe_hash_table *table;
				398	unsigned long flags;
				399
				400	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
				401	return;
				402
				403	table = rbio->fs_info->stripe_hash_table;
				404
				405	spin_lock_irqsave(&table->cache_lock, flags);
				406	__remove_rbio_from_cache(rbio);
				407	spin_unlock_irqrestore(&table->cache_lock, flags);
				408	}
				409
				410	/*
				411	* remove everything in the cache
				412	*/
Eric Sandeen	48a3b63	2013-04-25 20:41:01 +0000	[diff] [blame]	413	static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	414	{
				415	struct btrfs_stripe_hash_table *table;
				416	unsigned long flags;
				417	struct btrfs_raid_bio *rbio;
				418
				419	table = info->stripe_hash_table;
				420
				421	spin_lock_irqsave(&table->cache_lock, flags);
				422	while (!list_empty(&table->stripe_cache)) {
				423	rbio = list_entry(table->stripe_cache.next,
				424	struct btrfs_raid_bio,
				425	stripe_cache);
				426	__remove_rbio_from_cache(rbio);
				427	}
				428	spin_unlock_irqrestore(&table->cache_lock, flags);
				429	}
				430
				431	/*
				432	* remove all cached entries and free the hash table
				433	* used by unmount
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	434	*/
				435	void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
				436	{
				437	if (!info->stripe_hash_table)
				438	return;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	439	btrfs_clear_rbio_cache(info);
David Sterba	83c8266	2013-03-01 15:03:00 +0000	[diff] [blame]	440	if (is_vmalloc_addr(info->stripe_hash_table))
				441	vfree(info->stripe_hash_table);
				442	else
				443	kfree(info->stripe_hash_table);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	444	info->stripe_hash_table = NULL;
				445	}
				446
				447	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	448	* insert an rbio into the stripe cache. It
				449	* must have already been prepared by calling
				450	* cache_rbio_pages
				451	*
				452	* If this rbio was already cached, it gets
				453	* moved to the front of the lru.
				454	*
				455	* If the size of the rbio cache is too big, we
				456	* prune an item.
				457	*/
				458	static void cache_rbio(struct btrfs_raid_bio *rbio)
				459	{
				460	struct btrfs_stripe_hash_table *table;
				461	unsigned long flags;
				462
				463	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
				464	return;
				465
				466	table = rbio->fs_info->stripe_hash_table;
				467
				468	spin_lock_irqsave(&table->cache_lock, flags);
				469	spin_lock(&rbio->bio_list_lock);
				470
				471	/* bump our ref if we were not in the list before */
				472	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
				473	atomic_inc(&rbio->refs);
				474
				475	if (!list_empty(&rbio->stripe_cache)){
				476	list_move(&rbio->stripe_cache, &table->stripe_cache);
				477	} else {
				478	list_add(&rbio->stripe_cache, &table->stripe_cache);
				479	table->cache_size += 1;
				480	}
				481
				482	spin_unlock(&rbio->bio_list_lock);
				483
				484	if (table->cache_size > RBIO_CACHE_SIZE) {
				485	struct btrfs_raid_bio *found;
				486
				487	found = list_entry(table->stripe_cache.prev,
				488	struct btrfs_raid_bio,
				489	stripe_cache);
				490
				491	if (found != rbio)
				492	__remove_rbio_from_cache(found);
				493	}
				494
				495	spin_unlock_irqrestore(&table->cache_lock, flags);
				496	return;
				497	}
				498
				499	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	500	* helper function to run the xor_blocks api. It is only
				501	* able to do MAX_XOR_BLOCKS at a time, so we need to
				502	* loop through.
				503	*/
				504	static void run_xor(void **pages, int src_cnt, ssize_t len)
				505	{
				506	int src_off = 0;
				507	int xor_src_cnt = 0;
				508	void *dest = pages[src_cnt];
				509
				510	while(src_cnt > 0) {
				511	xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
				512	xor_blocks(xor_src_cnt, len, dest, pages + src_off);
				513
				514	src_cnt -= xor_src_cnt;
				515	src_off += xor_src_cnt;
				516	}
				517	}
				518
				519	/*
				520	* returns true if the bio list inside this rbio
				521	* covers an entire stripe (no rmw required).
				522	* Must be called with the bio list lock held, or
				523	* at a time when you know it is impossible to add
				524	* new bios into the list
				525	*/
				526	static int __rbio_is_full(struct btrfs_raid_bio *rbio)
				527	{
				528	unsigned long size = rbio->bio_list_bytes;
				529	int ret = 1;
				530
				531	if (size != rbio->nr_data * rbio->stripe_len)
				532	ret = 0;
				533
				534	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
				535	return ret;
				536	}
				537
				538	static int rbio_is_full(struct btrfs_raid_bio *rbio)
				539	{
				540	unsigned long flags;
				541	int ret;
				542
				543	spin_lock_irqsave(&rbio->bio_list_lock, flags);
				544	ret = __rbio_is_full(rbio);
				545	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
				546	return ret;
				547	}
				548
				549	/*
				550	* returns 1 if it is safe to merge two rbios together.
				551	* The merging is safe if the two rbios correspond to
				552	* the same stripe and if they are both going in the same
				553	* direction (read vs write), and if neither one is
				554	* locked for final IO
				555	*
				556	* The caller is responsible for locking such that
				557	* rmw_locked is safe to test
				558	*/
				559	static int rbio_can_merge(struct btrfs_raid_bio *last,
				560	struct btrfs_raid_bio *cur)
				561	{
				562	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) \|\|
				563	test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
				564	return 0;
				565
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	566	/*
				567	* we can't merge with cached rbios, since the
				568	* idea is that when we merge the destination
				569	* rbio is going to run our IO for us. We can
				570	* steal from cached rbio's though, other functions
				571	* handle that.
				572	*/
				573	if (test_bit(RBIO_CACHE_BIT, &last->flags) \|\|
				574	test_bit(RBIO_CACHE_BIT, &cur->flags))
				575	return 0;
				576
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	577	if (last->raid_map[0] !=
				578	cur->raid_map[0])
				579	return 0;
				580
				581	/* reads can't merge with writes */
				582	if (last->read_rebuild !=
				583	cur->read_rebuild) {
				584	return 0;
				585	}
				586
				587	return 1;
				588	}
				589
				590	/*
				591	* helper to index into the pstripe
				592	*/
				593	static struct page rbio_pstripe_page(struct btrfs_raid_bio rbio, int index)
				594	{
				595	index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
				596	return rbio->stripe_pages[index];
				597	}
				598
				599	/*
				600	* helper to index into the qstripe, returns null
				601	* if there is no qstripe
				602	*/
				603	static struct page rbio_qstripe_page(struct btrfs_raid_bio rbio, int index)
				604	{
				605	if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
				606	return NULL;
				607
				608	index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
				609	PAGE_CACHE_SHIFT;
				610	return rbio->stripe_pages[index];
				611	}
				612
				613	/*
				614	* The first stripe in the table for a logical address
				615	* has the lock. rbios are added in one of three ways:
				616	*
				617	* 1) Nobody has the stripe locked yet. The rbio is given
				618	* the lock and 0 is returned. The caller must start the IO
				619	* themselves.
				620	*
				621	* 2) Someone has the stripe locked, but we're able to merge
				622	* with the lock owner. The rbio is freed and the IO will
				623	* start automatically along with the existing rbio. 1 is returned.
				624	*
				625	* 3) Someone has the stripe locked, but we're not able to merge.
				626	* The rbio is added to the lock owner's plug list, or merged into
				627	* an rbio already on the plug list. When the lock owner unlocks,
				628	* the next rbio on the list is run and the IO is started automatically.
				629	* 1 is returned
				630	*
				631	* If we return 0, the caller still owns the rbio and must continue with
				632	* IO submission. If we return 1, the caller must assume the rbio has
				633	* already been freed.
				634	*/
				635	static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
				636	{
				637	int bucket = rbio_bucket(rbio);
				638	struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
				639	struct btrfs_raid_bio *cur;
				640	struct btrfs_raid_bio *pending;
				641	unsigned long flags;
				642	DEFINE_WAIT(wait);
				643	struct btrfs_raid_bio *freeit = NULL;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	644	struct btrfs_raid_bio *cache_drop = NULL;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	645	int ret = 0;
				646	int walk = 0;
				647
				648	spin_lock_irqsave(&h->lock, flags);
				649	list_for_each_entry(cur, &h->hash_list, hash_list) {
				650	walk++;
				651	if (cur->raid_map[0] == rbio->raid_map[0]) {
				652	spin_lock(&cur->bio_list_lock);
				653
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	654	/* can we steal this cached rbio's pages? */
				655	if (bio_list_empty(&cur->bio_list) &&
				656	list_empty(&cur->plug_list) &&
				657	test_bit(RBIO_CACHE_BIT, &cur->flags) &&
				658	!test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
				659	list_del_init(&cur->hash_list);
				660	atomic_dec(&cur->refs);
				661
				662	steal_rbio(cur, rbio);
				663	cache_drop = cur;
				664	spin_unlock(&cur->bio_list_lock);
				665
				666	goto lockit;
				667	}
				668
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	669	/* can we merge into the lock owner? */
				670	if (rbio_can_merge(cur, rbio)) {
				671	merge_rbio(cur, rbio);
				672	spin_unlock(&cur->bio_list_lock);
				673	freeit = rbio;
				674	ret = 1;
				675	goto out;
				676	}
				677
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	678
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	679	/*
				680	* we couldn't merge with the running
				681	* rbio, see if we can merge with the
				682	* pending ones. We don't have to
				683	* check for rmw_locked because there
				684	* is no way they are inside finish_rmw
				685	* right now
				686	*/
				687	list_for_each_entry(pending, &cur->plug_list,
				688	plug_list) {
				689	if (rbio_can_merge(pending, rbio)) {
				690	merge_rbio(pending, rbio);
				691	spin_unlock(&cur->bio_list_lock);
				692	freeit = rbio;
				693	ret = 1;
				694	goto out;
				695	}
				696	}
				697
				698	/* no merging, put us on the tail of the plug list,
				699	* our rbio will be started with the currently
				700	* running rbio unlocks
				701	*/
				702	list_add_tail(&rbio->plug_list, &cur->plug_list);
				703	spin_unlock(&cur->bio_list_lock);
				704	ret = 1;
				705	goto out;
				706	}
				707	}
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	708	lockit:
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	709	atomic_inc(&rbio->refs);
				710	list_add(&rbio->hash_list, &h->hash_list);
				711	out:
				712	spin_unlock_irqrestore(&h->lock, flags);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	713	if (cache_drop)
				714	remove_rbio_from_cache(cache_drop);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	715	if (freeit)
				716	__free_raid_bio(freeit);
				717	return ret;
				718	}
				719
				720	/*
				721	* called as rmw or parity rebuild is completed. If the plug list has more
				722	* rbios waiting for this stripe, the next one on the list will be started
				723	*/
				724	static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
				725	{
				726	int bucket;
				727	struct btrfs_stripe_hash *h;
				728	unsigned long flags;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	729	int keep_cache = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	730
				731	bucket = rbio_bucket(rbio);
				732	h = rbio->fs_info->stripe_hash_table->table + bucket;
				733
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	734	if (list_empty(&rbio->plug_list))
				735	cache_rbio(rbio);
				736
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	737	spin_lock_irqsave(&h->lock, flags);
				738	spin_lock(&rbio->bio_list_lock);
				739
				740	if (!list_empty(&rbio->hash_list)) {
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	741	/*
				742	* if we're still cached and there is no other IO
				743	* to perform, just leave this rbio here for others
				744	* to steal from later
				745	*/
				746	if (list_empty(&rbio->plug_list) &&
				747	test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
				748	keep_cache = 1;
				749	clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				750	BUG_ON(!bio_list_empty(&rbio->bio_list));
				751	goto done;
				752	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	753
				754	list_del_init(&rbio->hash_list);
				755	atomic_dec(&rbio->refs);
				756
				757	/*
				758	* we use the plug list to hold all the rbios
				759	* waiting for the chance to lock this stripe.
				760	* hand the lock over to one of them.
				761	*/
				762	if (!list_empty(&rbio->plug_list)) {
				763	struct btrfs_raid_bio *next;
				764	struct list_head *head = rbio->plug_list.next;
				765
				766	next = list_entry(head, struct btrfs_raid_bio,
				767	plug_list);
				768
				769	list_del_init(&rbio->plug_list);
				770
				771	list_add(&next->hash_list, &h->hash_list);
				772	atomic_inc(&next->refs);
				773	spin_unlock(&rbio->bio_list_lock);
				774	spin_unlock_irqrestore(&h->lock, flags);
				775
				776	if (next->read_rebuild)
				777	async_read_rebuild(next);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	778	else {
				779	steal_rbio(rbio, next);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	780	async_rmw_stripe(next);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	781	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	782
				783	goto done_nolock;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	784	} else if (waitqueue_active(&h->wait)) {
				785	spin_unlock(&rbio->bio_list_lock);
				786	spin_unlock_irqrestore(&h->lock, flags);
				787	wake_up(&h->wait);
				788	goto done_nolock;
				789	}
				790	}
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	791	done:
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	792	spin_unlock(&rbio->bio_list_lock);
				793	spin_unlock_irqrestore(&h->lock, flags);
				794
				795	done_nolock:
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	796	if (!keep_cache)
				797	remove_rbio_from_cache(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	798	}
				799
				800	static void __free_raid_bio(struct btrfs_raid_bio *rbio)
				801	{
				802	int i;
				803
				804	WARN_ON(atomic_read(&rbio->refs) < 0);
				805	if (!atomic_dec_and_test(&rbio->refs))
				806	return;
				807
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	808	WARN_ON(!list_empty(&rbio->stripe_cache));
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	809	WARN_ON(!list_empty(&rbio->hash_list));
				810	WARN_ON(!bio_list_empty(&rbio->bio_list));
				811
				812	for (i = 0; i < rbio->nr_pages; i++) {
				813	if (rbio->stripe_pages[i]) {
				814	__free_page(rbio->stripe_pages[i]);
				815	rbio->stripe_pages[i] = NULL;
				816	}
				817	}
				818	kfree(rbio->raid_map);
				819	kfree(rbio->bbio);
				820	kfree(rbio);
				821	}
				822
				823	static void free_raid_bio(struct btrfs_raid_bio *rbio)
				824	{
				825	unlock_stripe(rbio);
				826	__free_raid_bio(rbio);
				827	}
				828
				829	/*
				830	* this frees the rbio and runs through all the bios in the
				831	* bio_list and calls end_io on them
				832	*/
				833	static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
				834	{
				835	struct bio *cur = bio_list_get(&rbio->bio_list);
				836	struct bio *next;
				837	free_raid_bio(rbio);
				838
				839	while (cur) {
				840	next = cur->bi_next;
				841	cur->bi_next = NULL;
				842	if (uptodate)
				843	set_bit(BIO_UPTODATE, &cur->bi_flags);
				844	bio_endio(cur, err);
				845	cur = next;
				846	}
				847	}
				848
				849	/*
				850	* end io function used by finish_rmw. When we finally
				851	* get here, we've written a full stripe
				852	*/
				853	static void raid_write_end_io(struct bio *bio, int err)
				854	{
				855	struct btrfs_raid_bio *rbio = bio->bi_private;
				856
				857	if (err)
				858	fail_bio_stripe(rbio, bio);
				859
				860	bio_put(bio);
				861
				862	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
				863	return;
				864
				865	err = 0;
				866
				867	/* OK, we have read all the stripes we need to. */
				868	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
				869	err = -EIO;
				870
				871	rbio_orig_end_io(rbio, err, 0);
				872	return;
				873	}
				874
				875	/*
				876	* the read/modify/write code wants to use the original bio for
				877	* any pages it included, and then use the rbio for everything
				878	* else. This function decides if a given index (stripe number)
				879	* and page number in that stripe fall inside the original bio
				880	* or the rbio.
				881	*
				882	* if you set bio_list_only, you'll get a NULL back for any ranges
				883	* that are outside the bio_list
				884	*
				885	* This doesn't take any refs on anything, you get a bare page pointer
				886	* and the caller must bump refs as required.
				887	*
				888	* You must call index_rbio_pages once before you can trust
				889	* the answers from this function.
				890	*/
				891	static struct page page_in_rbio(struct btrfs_raid_bio rbio,
				892	int index, int pagenr, int bio_list_only)
				893	{
				894	int chunk_page;
				895	struct page *p = NULL;
				896
				897	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
				898
				899	spin_lock_irq(&rbio->bio_list_lock);
				900	p = rbio->bio_pages[chunk_page];
				901	spin_unlock_irq(&rbio->bio_list_lock);
				902
				903	if (p \|\| bio_list_only)
				904	return p;
				905
				906	return rbio->stripe_pages[chunk_page];
				907	}
				908
				909	/*
				910	* number of pages we need for the entire stripe across all the
				911	* drives
				912	*/
				913	static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
				914	{
				915	unsigned long nr = stripe_len * nr_stripes;
				916	return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
				917	}
				918
				919	/*
				920	* allocation and initial setup for the btrfs_raid_bio. Not
				921	* this does not allocate any pages for rbio->pages.
				922	*/
				923	static struct btrfs_raid_bio alloc_rbio(struct btrfs_root root,
				924	struct btrfs_bio bbio, u64 raid_map,
				925	u64 stripe_len)
				926	{
				927	struct btrfs_raid_bio *rbio;
				928	int nr_data = 0;
				929	int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
				930	void *p;
				931
				932	rbio = kzalloc(sizeof(rbio) + num_pages sizeof(struct page ) 2,
				933	GFP_NOFS);
				934	if (!rbio) {
				935	kfree(raid_map);
				936	kfree(bbio);
				937	return ERR_PTR(-ENOMEM);
				938	}
				939
				940	bio_list_init(&rbio->bio_list);
				941	INIT_LIST_HEAD(&rbio->plug_list);
				942	spin_lock_init(&rbio->bio_list_lock);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	943	INIT_LIST_HEAD(&rbio->stripe_cache);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	944	INIT_LIST_HEAD(&rbio->hash_list);
				945	rbio->bbio = bbio;
				946	rbio->raid_map = raid_map;
				947	rbio->fs_info = root->fs_info;
				948	rbio->stripe_len = stripe_len;
				949	rbio->nr_pages = num_pages;
				950	rbio->faila = -1;
				951	rbio->failb = -1;
				952	atomic_set(&rbio->refs, 1);
				953
				954	/*
				955	* the stripe_pages and bio_pages array point to the extra
				956	* memory we allocated past the end of the rbio
				957	*/
				958	p = rbio + 1;
				959	rbio->stripe_pages = p;
				960	rbio->bio_pages = p + sizeof(struct page ) num_pages;
				961
				962	if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
				963	nr_data = bbio->num_stripes - 2;
				964	else
				965	nr_data = bbio->num_stripes - 1;
				966
				967	rbio->nr_data = nr_data;
				968	return rbio;
				969	}
				970
				971	/* allocate pages for all the stripes in the bio, including parity */
				972	static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
				973	{
				974	int i;
				975	struct page *page;
				976
				977	for (i = 0; i < rbio->nr_pages; i++) {
				978	if (rbio->stripe_pages[i])
				979	continue;
				980	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				981	if (!page)
				982	return -ENOMEM;
				983	rbio->stripe_pages[i] = page;
				984	ClearPageUptodate(page);
				985	}
				986	return 0;
				987	}
				988
				989	/* allocate pages for just the p/q stripes */
				990	static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
				991	{
				992	int i;
				993	struct page *page;
				994
				995	i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
				996
				997	for (; i < rbio->nr_pages; i++) {
				998	if (rbio->stripe_pages[i])
				999	continue;
				1000	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				1001	if (!page)
				1002	return -ENOMEM;
				1003	rbio->stripe_pages[i] = page;
				1004	}
				1005	return 0;
				1006	}
				1007
				1008	/*
				1009	* add a single page from a specific stripe into our list of bios for IO
				1010	* this will try to merge into existing bios if possible, and returns
				1011	* zero if all went well.
				1012	*/
Eric Sandeen	48a3b63	2013-04-25 20:41:01 +0000	[diff] [blame]	1013	static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
				1014	struct bio_list *bio_list,
				1015	struct page *page,
				1016	int stripe_nr,
				1017	unsigned long page_index,
				1018	unsigned long bio_max_len)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1019	{
				1020	struct bio *last = bio_list->tail;
				1021	u64 last_end = 0;
				1022	int ret;
				1023	struct bio *bio;
				1024	struct btrfs_bio_stripe *stripe;
				1025	u64 disk_start;
				1026
				1027	stripe = &rbio->bbio->stripes[stripe_nr];
				1028	disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
				1029
				1030	/* if the device is missing, just fail this stripe */
				1031	if (!stripe->dev->bdev)
				1032	return fail_rbio_index(rbio, stripe_nr);
				1033
				1034	/* see if we can add this page onto our existing bio */
				1035	if (last) {
				1036	last_end = (u64)last->bi_sector << 9;
				1037	last_end += last->bi_size;
				1038
				1039	/*
				1040	* we can't merge these if they are from different
				1041	* devices or if they are not contiguous
				1042	*/
				1043	if (last_end == disk_start && stripe->dev->bdev &&
				1044	test_bit(BIO_UPTODATE, &last->bi_flags) &&
				1045	last->bi_bdev == stripe->dev->bdev) {
				1046	ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
				1047	if (ret == PAGE_CACHE_SIZE)
				1048	return 0;
				1049	}
				1050	}
				1051
				1052	/* put a new bio on the list */
Chris Mason	9be3395	2013-05-17 18:30:14 -0400	[diff] [blame]	1053	bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1054	if (!bio)
				1055	return -ENOMEM;
				1056
				1057	bio->bi_size = 0;
				1058	bio->bi_bdev = stripe->dev->bdev;
				1059	bio->bi_sector = disk_start >> 9;
				1060	set_bit(BIO_UPTODATE, &bio->bi_flags);
				1061
				1062	bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
				1063	bio_list_add(bio_list, bio);
				1064	return 0;
				1065	}
				1066
				1067	/*
				1068	* while we're doing the read/modify/write cycle, we could
				1069	* have errors in reading pages off the disk. This checks
				1070	* for errors and if we're not able to read the page it'll
				1071	* trigger parity reconstruction. The rmw will be finished
				1072	* after we've reconstructed the failed stripes
				1073	*/
				1074	static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
				1075	{
				1076	if (rbio->faila >= 0 \|\| rbio->failb >= 0) {
				1077	BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
				1078	__raid56_parity_recover(rbio);
				1079	} else {
				1080	finish_rmw(rbio);
				1081	}
				1082	}
				1083
				1084	/*
				1085	* these are just the pages from the rbio array, not from anything
				1086	* the FS sent down to us
				1087	*/
				1088	static struct page rbio_stripe_page(struct btrfs_raid_bio rbio, int stripe, int page)
				1089	{
				1090	int index;
				1091	index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
				1092	index += page;
				1093	return rbio->stripe_pages[index];
				1094	}
				1095
				1096	/*
				1097	* helper function to walk our bio list and populate the bio_pages array with
				1098	* the result. This seems expensive, but it is faster than constantly
				1099	* searching through the bio list as we setup the IO in finish_rmw or stripe
				1100	* reconstruction.
				1101	*
				1102	* This must be called before you trust the answers from page_in_rbio
				1103	*/
				1104	static void index_rbio_pages(struct btrfs_raid_bio *rbio)
				1105	{
				1106	struct bio *bio;
				1107	u64 start;
				1108	unsigned long stripe_offset;
				1109	unsigned long page_index;
				1110	struct page *p;
				1111	int i;
				1112
				1113	spin_lock_irq(&rbio->bio_list_lock);
				1114	bio_list_for_each(bio, &rbio->bio_list) {
				1115	start = (u64)bio->bi_sector << 9;
				1116	stripe_offset = start - rbio->raid_map[0];
				1117	page_index = stripe_offset >> PAGE_CACHE_SHIFT;
				1118
				1119	for (i = 0; i < bio->bi_vcnt; i++) {
				1120	p = bio->bi_io_vec[i].bv_page;
				1121	rbio->bio_pages[page_index + i] = p;
				1122	}
				1123	}
				1124	spin_unlock_irq(&rbio->bio_list_lock);
				1125	}
				1126
				1127	/*
				1128	* this is called from one of two situations. We either
				1129	* have a full stripe from the higher layers, or we've read all
				1130	* the missing bits off disk.
				1131	*
				1132	* This will calculate the parity and then send down any
				1133	* changed blocks.
				1134	*/
				1135	static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
				1136	{
				1137	struct btrfs_bio *bbio = rbio->bbio;
				1138	void *pointers[bbio->num_stripes];
				1139	int stripe_len = rbio->stripe_len;
				1140	int nr_data = rbio->nr_data;
				1141	int stripe;
				1142	int pagenr;
				1143	int p_stripe = -1;
				1144	int q_stripe = -1;
				1145	struct bio_list bio_list;
				1146	struct bio *bio;
				1147	int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
				1148	int ret;
				1149
				1150	bio_list_init(&bio_list);
				1151
				1152	if (bbio->num_stripes - rbio->nr_data == 1) {
				1153	p_stripe = bbio->num_stripes - 1;
				1154	} else if (bbio->num_stripes - rbio->nr_data == 2) {
				1155	p_stripe = bbio->num_stripes - 2;
				1156	q_stripe = bbio->num_stripes - 1;
				1157	} else {
				1158	BUG();
				1159	}
				1160
				1161	/* at this point we either have a full stripe,
				1162	* or we've read the full stripe from the drive.
				1163	* recalculate the parity and write the new results.
				1164	*
				1165	* We're not allowed to add any new bios to the
				1166	* bio list here, anyone else that wants to
				1167	* change this stripe needs to do their own rmw.
				1168	*/
				1169	spin_lock_irq(&rbio->bio_list_lock);
				1170	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				1171	spin_unlock_irq(&rbio->bio_list_lock);
				1172
				1173	atomic_set(&rbio->bbio->error, 0);
				1174
				1175	/*
				1176	* now that we've set rmw_locked, run through the
				1177	* bio list one last time and map the page pointers
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1178	*
				1179	* We don't cache full rbios because we're assuming
				1180	* the higher layers are unlikely to use this area of
				1181	* the disk again soon. If they do use it again,
				1182	* hopefully they will send another full bio.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1183	*/
				1184	index_rbio_pages(rbio);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1185	if (!rbio_is_full(rbio))
				1186	cache_rbio_pages(rbio);
				1187	else
				1188	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1189
				1190	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
				1191	struct page *p;
				1192	/* first collect one page from each data stripe */
				1193	for (stripe = 0; stripe < nr_data; stripe++) {
				1194	p = page_in_rbio(rbio, stripe, pagenr, 0);
				1195	pointers[stripe] = kmap(p);
				1196	}
				1197
				1198	/* then add the parity stripe */
				1199	p = rbio_pstripe_page(rbio, pagenr);
				1200	SetPageUptodate(p);
				1201	pointers[stripe++] = kmap(p);
				1202
				1203	if (q_stripe != -1) {
				1204
				1205	/*
				1206	* raid6, add the qstripe and call the
				1207	* library function to fill in our p/q
				1208	*/
				1209	p = rbio_qstripe_page(rbio, pagenr);
				1210	SetPageUptodate(p);
				1211	pointers[stripe++] = kmap(p);
				1212
				1213	raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
				1214	pointers);
				1215	} else {
				1216	/* raid5 */
				1217	memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
				1218	run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
				1219	}
				1220
				1221
				1222	for (stripe = 0; stripe < bbio->num_stripes; stripe++)
				1223	kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
				1224	}
				1225
				1226	/*
				1227	* time to start writing. Make bios for everything from the
				1228	* higher layers (the bio_list in our rbio) and our p/q. Ignore
				1229	* everything else.
				1230	*/
				1231	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
				1232	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
				1233	struct page *page;
				1234	if (stripe < rbio->nr_data) {
				1235	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1236	if (!page)
				1237	continue;
				1238	} else {
				1239	page = rbio_stripe_page(rbio, stripe, pagenr);
				1240	}
				1241
				1242	ret = rbio_add_io_page(rbio, &bio_list,
				1243	page, stripe, pagenr, rbio->stripe_len);
				1244	if (ret)
				1245	goto cleanup;
				1246	}
				1247	}
				1248
				1249	atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
				1250	BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
				1251
				1252	while (1) {
				1253	bio = bio_list_pop(&bio_list);
				1254	if (!bio)
				1255	break;
				1256
				1257	bio->bi_private = rbio;
				1258	bio->bi_end_io = raid_write_end_io;
				1259	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
				1260	submit_bio(WRITE, bio);
				1261	}
				1262	return;
				1263
				1264	cleanup:
				1265	rbio_orig_end_io(rbio, -EIO, 0);
				1266	}
				1267
				1268	/*
				1269	* helper to find the stripe number for a given bio. Used to figure out which
				1270	* stripe has failed. This expects the bio to correspond to a physical disk,
				1271	* so it looks up based on physical sector numbers.
				1272	*/
				1273	static int find_bio_stripe(struct btrfs_raid_bio *rbio,
				1274	struct bio *bio)
				1275	{
				1276	u64 physical = bio->bi_sector;
				1277	u64 stripe_start;
				1278	int i;
				1279	struct btrfs_bio_stripe *stripe;
				1280
				1281	physical <<= 9;
				1282
				1283	for (i = 0; i < rbio->bbio->num_stripes; i++) {
				1284	stripe = &rbio->bbio->stripes[i];
				1285	stripe_start = stripe->physical;
				1286	if (physical >= stripe_start &&
				1287	physical < stripe_start + rbio->stripe_len) {
				1288	return i;
				1289	}
				1290	}
				1291	return -1;
				1292	}
				1293
				1294	/*
				1295	* helper to find the stripe number for a given
				1296	* bio (before mapping). Used to figure out which stripe has
				1297	* failed. This looks up based on logical block numbers.
				1298	*/
				1299	static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
				1300	struct bio *bio)
				1301	{
				1302	u64 logical = bio->bi_sector;
				1303	u64 stripe_start;
				1304	int i;
				1305
				1306	logical <<= 9;
				1307
				1308	for (i = 0; i < rbio->nr_data; i++) {
				1309	stripe_start = rbio->raid_map[i];
				1310	if (logical >= stripe_start &&
				1311	logical < stripe_start + rbio->stripe_len) {
				1312	return i;
				1313	}
				1314	}
				1315	return -1;
				1316	}
				1317
				1318	/*
				1319	* returns -EIO if we had too many failures
				1320	*/
				1321	static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
				1322	{
				1323	unsigned long flags;
				1324	int ret = 0;
				1325
				1326	spin_lock_irqsave(&rbio->bio_list_lock, flags);
				1327
				1328	/* we already know this stripe is bad, move on */
				1329	if (rbio->faila == failed \|\| rbio->failb == failed)
				1330	goto out;
				1331
				1332	if (rbio->faila == -1) {
				1333	/* first failure on this rbio */
				1334	rbio->faila = failed;
				1335	atomic_inc(&rbio->bbio->error);
				1336	} else if (rbio->failb == -1) {
				1337	/* second failure on this rbio */
				1338	rbio->failb = failed;
				1339	atomic_inc(&rbio->bbio->error);
				1340	} else {
				1341	ret = -EIO;
				1342	}
				1343	out:
				1344	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
				1345
				1346	return ret;
				1347	}
				1348
				1349	/*
				1350	* helper to fail a stripe based on a physical disk
				1351	* bio.
				1352	*/
				1353	static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
				1354	struct bio *bio)
				1355	{
				1356	int failed = find_bio_stripe(rbio, bio);
				1357
				1358	if (failed < 0)
				1359	return -EIO;
				1360
				1361	return fail_rbio_index(rbio, failed);
				1362	}
				1363
				1364	/*
				1365	* this sets each page in the bio uptodate. It should only be used on private
				1366	* rbio pages, nothing that comes in from the higher layers
				1367	*/
				1368	static void set_bio_pages_uptodate(struct bio *bio)
				1369	{
				1370	int i;
				1371	struct page *p;
				1372
				1373	for (i = 0; i < bio->bi_vcnt; i++) {
				1374	p = bio->bi_io_vec[i].bv_page;
				1375	SetPageUptodate(p);
				1376	}
				1377	}
				1378
				1379	/*
				1380	* end io for the read phase of the rmw cycle. All the bios here are physical
				1381	* stripe bios we've read from the disk so we can recalculate the parity of the
				1382	* stripe.
				1383	*
				1384	* This will usually kick off finish_rmw once all the bios are read in, but it
				1385	* may trigger parity reconstruction if we had any errors along the way
				1386	*/
				1387	static void raid_rmw_end_io(struct bio *bio, int err)
				1388	{
				1389	struct btrfs_raid_bio *rbio = bio->bi_private;
				1390
				1391	if (err)
				1392	fail_bio_stripe(rbio, bio);
				1393	else
				1394	set_bio_pages_uptodate(bio);
				1395
				1396	bio_put(bio);
				1397
				1398	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
				1399	return;
				1400
				1401	err = 0;
				1402	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
				1403	goto cleanup;
				1404
				1405	/*
				1406	* this will normally call finish_rmw to start our write
				1407	* but if there are any failed stripes we'll reconstruct
				1408	* from parity first
				1409	*/
				1410	validate_rbio_for_rmw(rbio);
				1411	return;
				1412
				1413	cleanup:
				1414
				1415	rbio_orig_end_io(rbio, -EIO, 0);
				1416	}
				1417
				1418	static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
				1419	{
				1420	rbio->work.flags = 0;
				1421	rbio->work.func = rmw_work;
				1422
				1423	btrfs_queue_worker(&rbio->fs_info->rmw_workers,
				1424	&rbio->work);
				1425	}
				1426
				1427	static void async_read_rebuild(struct btrfs_raid_bio *rbio)
				1428	{
				1429	rbio->work.flags = 0;
				1430	rbio->work.func = read_rebuild_work;
				1431
				1432	btrfs_queue_worker(&rbio->fs_info->rmw_workers,
				1433	&rbio->work);
				1434	}
				1435
				1436	/*
				1437	* the stripe must be locked by the caller. It will
				1438	* unlock after all the writes are done
				1439	*/
				1440	static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
				1441	{
				1442	int bios_to_read = 0;
				1443	struct btrfs_bio *bbio = rbio->bbio;
				1444	struct bio_list bio_list;
				1445	int ret;
				1446	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
				1447	int pagenr;
				1448	int stripe;
				1449	struct bio *bio;
				1450
				1451	bio_list_init(&bio_list);
				1452
				1453	ret = alloc_rbio_pages(rbio);
				1454	if (ret)
				1455	goto cleanup;
				1456
				1457	index_rbio_pages(rbio);
				1458
				1459	atomic_set(&rbio->bbio->error, 0);
				1460	/*
				1461	* build a list of bios to read all the missing parts of this
				1462	* stripe
				1463	*/
				1464	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
				1465	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
				1466	struct page *page;
				1467	/*
				1468	* we want to find all the pages missing from
				1469	* the rbio and read them from the disk. If
				1470	* page_in_rbio finds a page in the bio list
				1471	* we don't need to read it off the stripe.
				1472	*/
				1473	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1474	if (page)
				1475	continue;
				1476
				1477	page = rbio_stripe_page(rbio, stripe, pagenr);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1478	/*
				1479	* the bio cache may have handed us an uptodate
				1480	* page. If so, be happy and use it
				1481	*/
				1482	if (PageUptodate(page))
				1483	continue;
				1484
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1485	ret = rbio_add_io_page(rbio, &bio_list, page,
				1486	stripe, pagenr, rbio->stripe_len);
				1487	if (ret)
				1488	goto cleanup;
				1489	}
				1490	}
				1491
				1492	bios_to_read = bio_list_size(&bio_list);
				1493	if (!bios_to_read) {
				1494	/*
				1495	* this can happen if others have merged with
				1496	* us, it means there is nothing left to read.
				1497	* But if there are missing devices it may not be
				1498	* safe to do the full stripe write yet.
				1499	*/
				1500	goto finish;
				1501	}
				1502
				1503	/*
				1504	* the bbio may be freed once we submit the last bio. Make sure
				1505	* not to touch it after that
				1506	*/
				1507	atomic_set(&bbio->stripes_pending, bios_to_read);
				1508	while (1) {
				1509	bio = bio_list_pop(&bio_list);
				1510	if (!bio)
				1511	break;
				1512
				1513	bio->bi_private = rbio;
				1514	bio->bi_end_io = raid_rmw_end_io;
				1515
				1516	btrfs_bio_wq_end_io(rbio->fs_info, bio,
				1517	BTRFS_WQ_ENDIO_RAID56);
				1518
				1519	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
				1520	submit_bio(READ, bio);
				1521	}
				1522	/* the actual write will happen once the reads are done */
				1523	return 0;
				1524
				1525	cleanup:
				1526	rbio_orig_end_io(rbio, -EIO, 0);
				1527	return -EIO;
				1528
				1529	finish:
				1530	validate_rbio_for_rmw(rbio);
				1531	return 0;
				1532	}
				1533
				1534	/*
				1535	* if the upper layers pass in a full stripe, we thank them by only allocating
				1536	* enough pages to hold the parity, and sending it all down quickly.
				1537	*/
				1538	static int full_stripe_write(struct btrfs_raid_bio *rbio)
				1539	{
				1540	int ret;
				1541
				1542	ret = alloc_rbio_parity_pages(rbio);
				1543	if (ret)
				1544	return ret;
				1545
				1546	ret = lock_stripe_add(rbio);
				1547	if (ret == 0)
				1548	finish_rmw(rbio);
				1549	return 0;
				1550	}
				1551
				1552	/*
				1553	* partial stripe writes get handed over to async helpers.
				1554	* We're really hoping to merge a few more writes into this
				1555	* rbio before calculating new parity
				1556	*/
				1557	static int partial_stripe_write(struct btrfs_raid_bio *rbio)
				1558	{
				1559	int ret;
				1560
				1561	ret = lock_stripe_add(rbio);
				1562	if (ret == 0)
				1563	async_rmw_stripe(rbio);
				1564	return 0;
				1565	}
				1566
				1567	/*
				1568	* sometimes while we were reading from the drive to
				1569	* recalculate parity, enough new bios come into create
				1570	* a full stripe. So we do a check here to see if we can
				1571	* go directly to finish_rmw
				1572	*/
				1573	static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
				1574	{
				1575	/* head off into rmw land if we don't have a full stripe */
				1576	if (!rbio_is_full(rbio))
				1577	return partial_stripe_write(rbio);
				1578	return full_stripe_write(rbio);
				1579	}
				1580
				1581	/*
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1582	* We use plugging call backs to collect full stripes.
				1583	* Any time we get a partial stripe write while plugged
				1584	* we collect it into a list. When the unplug comes down,
				1585	* we sort the list by logical block number and merge
				1586	* everything we can into the same rbios
				1587	*/
				1588	struct btrfs_plug_cb {
				1589	struct blk_plug_cb cb;
				1590	struct btrfs_fs_info *info;
				1591	struct list_head rbio_list;
				1592	struct btrfs_work work;
				1593	};
				1594
				1595	/*
				1596	* rbios on the plug list are sorted for easier merging.
				1597	*/
				1598	static int plug_cmp(void priv, struct list_head a, struct list_head *b)
				1599	{
				1600	struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
				1601	plug_list);
				1602	struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
				1603	plug_list);
				1604	u64 a_sector = ra->bio_list.head->bi_sector;
				1605	u64 b_sector = rb->bio_list.head->bi_sector;
				1606
				1607	if (a_sector < b_sector)
				1608	return -1;
				1609	if (a_sector > b_sector)
				1610	return 1;
				1611	return 0;
				1612	}
				1613
				1614	static void run_plug(struct btrfs_plug_cb *plug)
				1615	{
				1616	struct btrfs_raid_bio *cur;
				1617	struct btrfs_raid_bio *last = NULL;
				1618
				1619	/*
				1620	* sort our plug list then try to merge
				1621	* everything we can in hopes of creating full
				1622	* stripes.
				1623	*/
				1624	list_sort(NULL, &plug->rbio_list, plug_cmp);
				1625	while (!list_empty(&plug->rbio_list)) {
				1626	cur = list_entry(plug->rbio_list.next,
				1627	struct btrfs_raid_bio, plug_list);
				1628	list_del_init(&cur->plug_list);
				1629
				1630	if (rbio_is_full(cur)) {
				1631	/* we have a full stripe, send it down */
				1632	full_stripe_write(cur);
				1633	continue;
				1634	}
				1635	if (last) {
				1636	if (rbio_can_merge(last, cur)) {
				1637	merge_rbio(last, cur);
				1638	__free_raid_bio(cur);
				1639	continue;
				1640
				1641	}
				1642	__raid56_parity_write(last);
				1643	}
				1644	last = cur;
				1645	}
				1646	if (last) {
				1647	__raid56_parity_write(last);
				1648	}
				1649	kfree(plug);
				1650	}
				1651
				1652	/*
				1653	* if the unplug comes from schedule, we have to push the
				1654	* work off to a helper thread
				1655	*/
				1656	static void unplug_work(struct btrfs_work *work)
				1657	{
				1658	struct btrfs_plug_cb *plug;
				1659	plug = container_of(work, struct btrfs_plug_cb, work);
				1660	run_plug(plug);
				1661	}
				1662
				1663	static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
				1664	{
				1665	struct btrfs_plug_cb *plug;
				1666	plug = container_of(cb, struct btrfs_plug_cb, cb);
				1667
				1668	if (from_schedule) {
				1669	plug->work.flags = 0;
				1670	plug->work.func = unplug_work;
				1671	btrfs_queue_worker(&plug->info->rmw_workers,
				1672	&plug->work);
				1673	return;
				1674	}
				1675	run_plug(plug);
				1676	}
				1677
				1678	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1679	* our main entry point for writes from the rest of the FS.
				1680	*/
				1681	int raid56_parity_write(struct btrfs_root root, struct bio bio,
				1682	struct btrfs_bio bbio, u64 raid_map,
				1683	u64 stripe_len)
				1684	{
				1685	struct btrfs_raid_bio *rbio;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1686	struct btrfs_plug_cb *plug = NULL;
				1687	struct blk_plug_cb *cb;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1688
				1689	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
				1690	if (IS_ERR(rbio)) {
				1691	kfree(raid_map);
				1692	kfree(bbio);
				1693	return PTR_ERR(rbio);
				1694	}
				1695	bio_list_add(&rbio->bio_list, bio);
				1696	rbio->bio_list_bytes = bio->bi_size;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1697
				1698	/*
				1699	* don't plug on full rbios, just get them out the door
				1700	* as quickly as we can
				1701	*/
				1702	if (rbio_is_full(rbio))
				1703	return full_stripe_write(rbio);
				1704
				1705	cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
				1706	sizeof(*plug));
				1707	if (cb) {
				1708	plug = container_of(cb, struct btrfs_plug_cb, cb);
				1709	if (!plug->info) {
				1710	plug->info = root->fs_info;
				1711	INIT_LIST_HEAD(&plug->rbio_list);
				1712	}
				1713	list_add_tail(&rbio->plug_list, &plug->rbio_list);
				1714	} else {
				1715	return __raid56_parity_write(rbio);
				1716	}
				1717	return 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1718	}
				1719
				1720	/*
				1721	* all parity reconstruction happens here. We've read in everything
				1722	* we can find from the drives and this does the heavy lifting of
				1723	* sorting the good from the bad.
				1724	*/
				1725	static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
				1726	{
				1727	int pagenr, stripe;
				1728	void **pointers;
				1729	int faila = -1, failb = -1;
				1730	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
				1731	struct page *page;
				1732	int err;
				1733	int i;
				1734
				1735	pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
				1736	GFP_NOFS);
				1737	if (!pointers) {
				1738	err = -ENOMEM;
				1739	goto cleanup_io;
				1740	}
				1741
				1742	faila = rbio->faila;
				1743	failb = rbio->failb;
				1744
				1745	if (rbio->read_rebuild) {
				1746	spin_lock_irq(&rbio->bio_list_lock);
				1747	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				1748	spin_unlock_irq(&rbio->bio_list_lock);
				1749	}
				1750
				1751	index_rbio_pages(rbio);
				1752
				1753	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
				1754	/* setup our array of pointers with pages
				1755	* from each stripe
				1756	*/
				1757	for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
				1758	/*
				1759	* if we're rebuilding a read, we have to use
				1760	* pages from the bio list
				1761	*/
				1762	if (rbio->read_rebuild &&
				1763	(stripe == faila \|\| stripe == failb)) {
				1764	page = page_in_rbio(rbio, stripe, pagenr, 0);
				1765	} else {
				1766	page = rbio_stripe_page(rbio, stripe, pagenr);
				1767	}
				1768	pointers[stripe] = kmap(page);
				1769	}
				1770
				1771	/* all raid6 handling here */
				1772	if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
				1773	RAID6_Q_STRIPE) {
				1774
				1775	/*
				1776	* single failure, rebuild from parity raid5
				1777	* style
				1778	*/
				1779	if (failb < 0) {
				1780	if (faila == rbio->nr_data) {
				1781	/*
				1782	* Just the P stripe has failed, without
				1783	* a bad data or Q stripe.
				1784	* TODO, we should redo the xor here.
				1785	*/
				1786	err = -EIO;
				1787	goto cleanup;
				1788	}
				1789	/*
				1790	* a single failure in raid6 is rebuilt
				1791	* in the pstripe code below
				1792	*/
				1793	goto pstripe;
				1794	}
				1795
				1796	/* make sure our ps and qs are in order */
				1797	if (faila > failb) {
				1798	int tmp = failb;
				1799	failb = faila;
				1800	faila = tmp;
				1801	}
				1802
				1803	/* if the q stripe is failed, do a pstripe reconstruction
				1804	* from the xors.
				1805	* If both the q stripe and the P stripe are failed, we're
				1806	* here due to a crc mismatch and we can't give them the
				1807	* data they want
				1808	*/
				1809	if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
				1810	if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
				1811	err = -EIO;
				1812	goto cleanup;
				1813	}
				1814	/*
				1815	* otherwise we have one bad data stripe and
				1816	* a good P stripe. raid5!
				1817	*/
				1818	goto pstripe;
				1819	}
				1820
				1821	if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
				1822	raid6_datap_recov(rbio->bbio->num_stripes,
				1823	PAGE_SIZE, faila, pointers);
				1824	} else {
				1825	raid6_2data_recov(rbio->bbio->num_stripes,
				1826	PAGE_SIZE, faila, failb,
				1827	pointers);
				1828	}
				1829	} else {
				1830	void *p;
				1831
				1832	/* rebuild from P stripe here (raid5 or raid6) */
				1833	BUG_ON(failb != -1);
				1834	pstripe:
				1835	/* Copy parity block into failed block to start with */
				1836	memcpy(pointers[faila],
				1837	pointers[rbio->nr_data],
				1838	PAGE_CACHE_SIZE);
				1839
				1840	/* rearrange the pointer array */
				1841	p = pointers[faila];
				1842	for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
				1843	pointers[stripe] = pointers[stripe + 1];
				1844	pointers[rbio->nr_data - 1] = p;
				1845
				1846	/* xor in the rest */
				1847	run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
				1848	}
				1849	/* if we're doing this rebuild as part of an rmw, go through
				1850	* and set all of our private rbio pages in the
				1851	* failed stripes as uptodate. This way finish_rmw will
				1852	* know they can be trusted. If this was a read reconstruction,
				1853	* other endio functions will fiddle the uptodate bits
				1854	*/
				1855	if (!rbio->read_rebuild) {
				1856	for (i = 0; i < nr_pages; i++) {
				1857	if (faila != -1) {
				1858	page = rbio_stripe_page(rbio, faila, i);
				1859	SetPageUptodate(page);
				1860	}
				1861	if (failb != -1) {
				1862	page = rbio_stripe_page(rbio, failb, i);
				1863	SetPageUptodate(page);
				1864	}
				1865	}
				1866	}
				1867	for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
				1868	/*
				1869	* if we're rebuilding a read, we have to use
				1870	* pages from the bio list
				1871	*/
				1872	if (rbio->read_rebuild &&
				1873	(stripe == faila \|\| stripe == failb)) {
				1874	page = page_in_rbio(rbio, stripe, pagenr, 0);
				1875	} else {
				1876	page = rbio_stripe_page(rbio, stripe, pagenr);
				1877	}
				1878	kunmap(page);
				1879	}
				1880	}
				1881
				1882	err = 0;
				1883	cleanup:
				1884	kfree(pointers);
				1885
				1886	cleanup_io:
				1887
				1888	if (rbio->read_rebuild) {
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1889	if (err == 0)
				1890	cache_rbio_pages(rbio);
				1891	else
				1892	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				1893
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1894	rbio_orig_end_io(rbio, err, err == 0);
				1895	} else if (err == 0) {
				1896	rbio->faila = -1;
				1897	rbio->failb = -1;
				1898	finish_rmw(rbio);
				1899	} else {
				1900	rbio_orig_end_io(rbio, err, 0);
				1901	}
				1902	}
				1903
				1904	/*
				1905	* This is called only for stripes we've read from disk to
				1906	* reconstruct the parity.
				1907	*/
				1908	static void raid_recover_end_io(struct bio *bio, int err)
				1909	{
				1910	struct btrfs_raid_bio *rbio = bio->bi_private;
				1911
				1912	/*
				1913	* we only read stripe pages off the disk, set them
				1914	* up to date if there were no errors
				1915	*/
				1916	if (err)
				1917	fail_bio_stripe(rbio, bio);
				1918	else
				1919	set_bio_pages_uptodate(bio);
				1920	bio_put(bio);
				1921
				1922	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
				1923	return;
				1924
				1925	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
				1926	rbio_orig_end_io(rbio, -EIO, 0);
				1927	else
				1928	__raid_recover_end_io(rbio);
				1929	}
				1930
				1931	/*
				1932	* reads everything we need off the disk to reconstruct
				1933	* the parity. endio handlers trigger final reconstruction
				1934	* when the IO is done.
				1935	*
				1936	* This is used both for reads from the higher layers and for
				1937	* parity construction required to finish a rmw cycle.
				1938	*/
				1939	static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
				1940	{
				1941	int bios_to_read = 0;
				1942	struct btrfs_bio *bbio = rbio->bbio;
				1943	struct bio_list bio_list;
				1944	int ret;
				1945	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
				1946	int pagenr;
				1947	int stripe;
				1948	struct bio *bio;
				1949
				1950	bio_list_init(&bio_list);
				1951
				1952	ret = alloc_rbio_pages(rbio);
				1953	if (ret)
				1954	goto cleanup;
				1955
				1956	atomic_set(&rbio->bbio->error, 0);
				1957
				1958	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1959	* read everything that hasn't failed. Thanks to the
				1960	* stripe cache, it is possible that some or all of these
				1961	* pages are going to be uptodate.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1962	*/
				1963	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
				1964	if (rbio->faila == stripe \|\|
				1965	rbio->failb == stripe)
				1966	continue;
				1967
				1968	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
				1969	struct page *p;
				1970
				1971	/*
				1972	* the rmw code may have already read this
				1973	* page in
				1974	*/
				1975	p = rbio_stripe_page(rbio, stripe, pagenr);
				1976	if (PageUptodate(p))
				1977	continue;
				1978
				1979	ret = rbio_add_io_page(rbio, &bio_list,
				1980	rbio_stripe_page(rbio, stripe, pagenr),
				1981	stripe, pagenr, rbio->stripe_len);
				1982	if (ret < 0)
				1983	goto cleanup;
				1984	}
				1985	}
				1986
				1987	bios_to_read = bio_list_size(&bio_list);
				1988	if (!bios_to_read) {
				1989	/*
				1990	* we might have no bios to read just because the pages
				1991	* were up to date, or we might have no bios to read because
				1992	* the devices were gone.
				1993	*/
				1994	if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
				1995	__raid_recover_end_io(rbio);
				1996	goto out;
				1997	} else {
				1998	goto cleanup;
				1999	}
				2000	}
				2001
				2002	/*
				2003	* the bbio may be freed once we submit the last bio. Make sure
				2004	* not to touch it after that
				2005	*/
				2006	atomic_set(&bbio->stripes_pending, bios_to_read);
				2007	while (1) {
				2008	bio = bio_list_pop(&bio_list);
				2009	if (!bio)
				2010	break;
				2011
				2012	bio->bi_private = rbio;
				2013	bio->bi_end_io = raid_recover_end_io;
				2014
				2015	btrfs_bio_wq_end_io(rbio->fs_info, bio,
				2016	BTRFS_WQ_ENDIO_RAID56);
				2017
				2018	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
				2019	submit_bio(READ, bio);
				2020	}
				2021	out:
				2022	return 0;
				2023
				2024	cleanup:
				2025	if (rbio->read_rebuild)
				2026	rbio_orig_end_io(rbio, -EIO, 0);
				2027	return -EIO;
				2028	}
				2029
				2030	/*
				2031	* the main entry point for reads from the higher layers. This
				2032	* is really only called when the normal read path had a failure,
				2033	* so we assume the bio they send down corresponds to a failed part
				2034	* of the drive.
				2035	*/
				2036	int raid56_parity_recover(struct btrfs_root root, struct bio bio,
				2037	struct btrfs_bio bbio, u64 raid_map,
				2038	u64 stripe_len, int mirror_num)
				2039	{
				2040	struct btrfs_raid_bio *rbio;
				2041	int ret;
				2042
				2043	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
				2044	if (IS_ERR(rbio)) {
				2045	return PTR_ERR(rbio);
				2046	}
				2047
				2048	rbio->read_rebuild = 1;
				2049	bio_list_add(&rbio->bio_list, bio);
				2050	rbio->bio_list_bytes = bio->bi_size;
				2051
				2052	rbio->faila = find_logical_bio_stripe(rbio, bio);
				2053	if (rbio->faila == -1) {
				2054	BUG();
				2055	kfree(rbio);
				2056	return -EIO;
				2057	}
				2058
				2059	/*
				2060	* reconstruct from the q stripe if they are
				2061	* asking for mirror 3
				2062	*/
				2063	if (mirror_num == 3)
				2064	rbio->failb = bbio->num_stripes - 2;
				2065
				2066	ret = lock_stripe_add(rbio);
				2067
				2068	/*
				2069	* __raid56_parity_recover will end the bio with
				2070	* any errors it hits. We don't want to return
				2071	* its error value up the stack because our caller
				2072	* will end up calling bio_endio with any nonzero
				2073	* return
				2074	*/
				2075	if (ret == 0)
				2076	__raid56_parity_recover(rbio);
				2077	/*
				2078	* our rbio has been added to the list of
				2079	* rbios that will be handled after the
				2080	* currently lock owner is done
				2081	*/
				2082	return 0;
				2083
				2084	}
				2085
				2086	static void rmw_work(struct btrfs_work *work)
				2087	{
				2088	struct btrfs_raid_bio *rbio;
				2089
				2090	rbio = container_of(work, struct btrfs_raid_bio, work);
				2091	raid56_rmw_stripe(rbio);
				2092	}
				2093
				2094	static void read_rebuild_work(struct btrfs_work *work)
				2095	{
				2096	struct btrfs_raid_bio *rbio;
				2097
				2098	rbio = container_of(work, struct btrfs_raid_bio, work);
				2099	__raid56_parity_recover(rbio);
				2100	}