Blame - fs/btrfs/raid56.c - kernel/msm-4.9

blob: 9af0b25d991a8c64653b4fa20f4dc31c2794b943 [file] [log] [blame]

David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1	/*
				2	* Copyright (C) 2012 Fusion-io All rights reserved.
				3	* Copyright (C) 2012 Intel Corp. All rights reserved.
				4	*
				5	* This program is free software; you can redistribute it and/or
				6	* modify it under the terms of the GNU General Public
				7	* License v2 as published by the Free Software Foundation.
				8	*
				9	* This program is distributed in the hope that it will be useful,
				10	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				11	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				12	* General Public License for more details.
				13	*
				14	* You should have received a copy of the GNU General Public
				15	* License along with this program; if not, write to the
				16	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				17	* Boston, MA 021110-1307, USA.
				18	*/
				19	#include <linux/sched.h>
				20	#include <linux/wait.h>
				21	#include <linux/bio.h>
				22	#include <linux/slab.h>
				23	#include <linux/buffer_head.h>
				24	#include <linux/blkdev.h>
				25	#include <linux/random.h>
				26	#include <linux/iocontext.h>
				27	#include <linux/capability.h>
				28	#include <linux/ratelimit.h>
				29	#include <linux/kthread.h>
				30	#include <linux/raid/pq.h>
				31	#include <linux/hash.h>
				32	#include <linux/list_sort.h>
				33	#include <linux/raid/xor.h>
Geert Uytterhoeven	d7011f5	2013-03-03 04:44:41 -0700	[diff] [blame]	34	#include <linux/vmalloc.h>
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	35	#include <asm/div64.h>
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	36	#include "ctree.h"
				37	#include "extent_map.h"
				38	#include "disk-io.h"
				39	#include "transaction.h"
				40	#include "print-tree.h"
				41	#include "volumes.h"
				42	#include "raid56.h"
				43	#include "async-thread.h"
				44	#include "check-integrity.h"
				45	#include "rcu-string.h"
				46
				47	/* set when additional merges to this rbio are not allowed */
				48	#define RBIO_RMW_LOCKED_BIT 1
				49
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	50	/*
				51	* set when this rbio is sitting in the hash, but it is just a cache
				52	* of past RMW
				53	*/
				54	#define RBIO_CACHE_BIT 2
				55
				56	/*
				57	* set when it is safe to trust the stripe_pages for caching
				58	*/
				59	#define RBIO_CACHE_READY_BIT 3
				60
				61
				62	#define RBIO_CACHE_SIZE 1024
				63
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	64	struct btrfs_raid_bio {
				65	struct btrfs_fs_info *fs_info;
				66	struct btrfs_bio *bbio;
				67
				68	/*
				69	* logical block numbers for the start of each stripe
				70	* The last one or two are p/q. These are sorted,
				71	* so raid_map[0] is the start of our full stripe
				72	*/
				73	u64 *raid_map;
				74
				75	/* while we're doing rmw on a stripe
				76	* we put it into a hash table so we can
				77	* lock the stripe and merge more rbios
				78	* into it.
				79	*/
				80	struct list_head hash_list;
				81
				82	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	83	* LRU list for the stripe cache
				84	*/
				85	struct list_head stripe_cache;
				86
				87	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	88	* for scheduling work in the helper threads
				89	*/
				90	struct btrfs_work work;
				91
				92	/*
				93	* bio list and bio_list_lock are used
				94	* to add more bios into the stripe
				95	* in hopes of avoiding the full rmw
				96	*/
				97	struct bio_list bio_list;
				98	spinlock_t bio_list_lock;
				99
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	100	/* also protected by the bio_list_lock, the
				101	* plug list is used by the plugging code
				102	* to collect partial bios while plugged. The
				103	* stripe locking code also uses it to hand off
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	104	* the stripe lock to the next pending IO
				105	*/
				106	struct list_head plug_list;
				107
				108	/*
				109	* flags that tell us if it is safe to
				110	* merge with this bio
				111	*/
				112	unsigned long flags;
				113
				114	/* size of each individual stripe on disk */
				115	int stripe_len;
				116
				117	/* number of data stripes (no p/q) */
				118	int nr_data;
				119
				120	/*
				121	* set if we're doing a parity rebuild
				122	* for a read from higher up, which is handled
				123	* differently from a parity rebuild as part of
				124	* rmw
				125	*/
				126	int read_rebuild;
				127
				128	/* first bad stripe */
				129	int faila;
				130
				131	/* second bad stripe (for raid6 use) */
				132	int failb;
				133
				134	/*
				135	* number of pages needed to represent the full
				136	* stripe
				137	*/
				138	int nr_pages;
				139
				140	/*
				141	* size of all the bios in the bio_list. This
				142	* helps us decide if the rbio maps to a full
				143	* stripe or not
				144	*/
				145	int bio_list_bytes;
				146
				147	atomic_t refs;
				148
				149	/*
				150	* these are two arrays of pointers. We allocate the
				151	* rbio big enough to hold them both and setup their
				152	* locations when the rbio is allocated
				153	*/
				154
				155	/* pointers to pages that we allocated for
				156	* reading/writing stripes directly from the disk (including P/Q)
				157	*/
				158	struct page **stripe_pages;
				159
				160	/*
				161	* pointers to the pages in the bio_list. Stored
				162	* here for faster lookup
				163	*/
				164	struct page **bio_pages;
				165	};
				166
				167	static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
				168	static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
				169	static void rmw_work(struct btrfs_work *work);
				170	static void read_rebuild_work(struct btrfs_work *work);
				171	static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
				172	static void async_read_rebuild(struct btrfs_raid_bio *rbio);
				173	static int fail_bio_stripe(struct btrfs_raid_bio rbio, struct bio bio);
				174	static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
				175	static void __free_raid_bio(struct btrfs_raid_bio *rbio);
				176	static void index_rbio_pages(struct btrfs_raid_bio *rbio);
				177	static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
				178
				179	/*
				180	* the stripe hash table is used for locking, and to collect
				181	* bios in hopes of making a full stripe
				182	*/
				183	int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
				184	{
				185	struct btrfs_stripe_hash_table *table;
				186	struct btrfs_stripe_hash_table *x;
				187	struct btrfs_stripe_hash *cur;
				188	struct btrfs_stripe_hash *h;
				189	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
				190	int i;
David Sterba	83c8266	2013-03-01 15:03:00 +0000	[diff] [blame]	191	int table_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	192
				193	if (info->stripe_hash_table)
				194	return 0;
				195
David Sterba	83c8266	2013-03-01 15:03:00 +0000	[diff] [blame]	196	/*
				197	* The table is large, starting with order 4 and can go as high as
				198	* order 7 in case lock debugging is turned on.
				199	*
				200	* Try harder to allocate and fallback to vmalloc to lower the chance
				201	* of a failing mount.
				202	*/
				203	table_size = sizeof(table) + sizeof(h) * num_entries;
				204	table = kzalloc(table_size, GFP_KERNEL \| __GFP_NOWARN \| __GFP_REPEAT);
				205	if (!table) {
				206	table = vzalloc(table_size);
				207	if (!table)
				208	return -ENOMEM;
				209	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	210
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	211	spin_lock_init(&table->cache_lock);
				212	INIT_LIST_HEAD(&table->stripe_cache);
				213
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	214	h = table->table;
				215
				216	for (i = 0; i < num_entries; i++) {
				217	cur = h + i;
				218	INIT_LIST_HEAD(&cur->hash_list);
				219	spin_lock_init(&cur->lock);
				220	init_waitqueue_head(&cur->wait);
				221	}
				222
				223	x = cmpxchg(&info->stripe_hash_table, NULL, table);
David Sterba	83c8266	2013-03-01 15:03:00 +0000	[diff] [blame]	224	if (x) {
				225	if (is_vmalloc_addr(x))
				226	vfree(x);
				227	else
				228	kfree(x);
				229	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	230	return 0;
				231	}
				232
				233	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	234	* caching an rbio means to copy anything from the
				235	* bio_pages array into the stripe_pages array. We
				236	* use the page uptodate bit in the stripe cache array
				237	* to indicate if it has valid data
				238	*
				239	* once the caching is done, we set the cache ready
				240	* bit.
				241	*/
				242	static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
				243	{
				244	int i;
				245	char *s;
				246	char *d;
				247	int ret;
				248
				249	ret = alloc_rbio_pages(rbio);
				250	if (ret)
				251	return;
				252
				253	for (i = 0; i < rbio->nr_pages; i++) {
				254	if (!rbio->bio_pages[i])
				255	continue;
				256
				257	s = kmap(rbio->bio_pages[i]);
				258	d = kmap(rbio->stripe_pages[i]);
				259
				260	memcpy(d, s, PAGE_CACHE_SIZE);
				261
				262	kunmap(rbio->bio_pages[i]);
				263	kunmap(rbio->stripe_pages[i]);
				264	SetPageUptodate(rbio->stripe_pages[i]);
				265	}
				266	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				267	}
				268
				269	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	270	* we hash on the first logical address of the stripe
				271	*/
				272	static int rbio_bucket(struct btrfs_raid_bio *rbio)
				273	{
				274	u64 num = rbio->raid_map[0];
				275
				276	/*
				277	* we shift down quite a bit. We're using byte
				278	* addressing, and most of the lower bits are zeros.
				279	* This tends to upset hash_64, and it consistently
				280	* returns just one or two different values.
				281	*
				282	* shifting off the lower bits fixes things.
				283	*/
				284	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
				285	}
				286
				287	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	288	* stealing an rbio means taking all the uptodate pages from the stripe
				289	* array in the source rbio and putting them into the destination rbio
				290	*/
				291	static void steal_rbio(struct btrfs_raid_bio src, struct btrfs_raid_bio dest)
				292	{
				293	int i;
				294	struct page *s;
				295	struct page *d;
				296
				297	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
				298	return;
				299
				300	for (i = 0; i < dest->nr_pages; i++) {
				301	s = src->stripe_pages[i];
				302	if (!s \|\| !PageUptodate(s)) {
				303	continue;
				304	}
				305
				306	d = dest->stripe_pages[i];
				307	if (d)
				308	__free_page(d);
				309
				310	dest->stripe_pages[i] = s;
				311	src->stripe_pages[i] = NULL;
				312	}
				313	}
				314
				315	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	316	* merging means we take the bio_list from the victim and
				317	* splice it into the destination. The victim should
				318	* be discarded afterwards.
				319	*
				320	* must be called with dest->rbio_list_lock held
				321	*/
				322	static void merge_rbio(struct btrfs_raid_bio *dest,
				323	struct btrfs_raid_bio *victim)
				324	{
				325	bio_list_merge(&dest->bio_list, &victim->bio_list);
				326	dest->bio_list_bytes += victim->bio_list_bytes;
				327	bio_list_init(&victim->bio_list);
				328	}
				329
				330	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	331	* used to prune items that are in the cache. The caller
				332	* must hold the hash table lock.
				333	*/
				334	static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
				335	{
				336	int bucket = rbio_bucket(rbio);
				337	struct btrfs_stripe_hash_table *table;
				338	struct btrfs_stripe_hash *h;
				339	int freeit = 0;
				340
				341	/*
				342	* check the bit again under the hash table lock.
				343	*/
				344	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
				345	return;
				346
				347	table = rbio->fs_info->stripe_hash_table;
				348	h = table->table + bucket;
				349
				350	/* hold the lock for the bucket because we may be
				351	* removing it from the hash table
				352	*/
				353	spin_lock(&h->lock);
				354
				355	/*
				356	* hold the lock for the bio list because we need
				357	* to make sure the bio list is empty
				358	*/
				359	spin_lock(&rbio->bio_list_lock);
				360
				361	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
				362	list_del_init(&rbio->stripe_cache);
				363	table->cache_size -= 1;
				364	freeit = 1;
				365
				366	/* if the bio list isn't empty, this rbio is
				367	* still involved in an IO. We take it out
				368	* of the cache list, and drop the ref that
				369	* was held for the list.
				370	*
				371	* If the bio_list was empty, we also remove
				372	* the rbio from the hash_table, and drop
				373	* the corresponding ref
				374	*/
				375	if (bio_list_empty(&rbio->bio_list)) {
				376	if (!list_empty(&rbio->hash_list)) {
				377	list_del_init(&rbio->hash_list);
				378	atomic_dec(&rbio->refs);
				379	BUG_ON(!list_empty(&rbio->plug_list));
				380	}
				381	}
				382	}
				383
				384	spin_unlock(&rbio->bio_list_lock);
				385	spin_unlock(&h->lock);
				386
				387	if (freeit)
				388	__free_raid_bio(rbio);
				389	}
				390
				391	/*
				392	* prune a given rbio from the cache
				393	*/
				394	static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
				395	{
				396	struct btrfs_stripe_hash_table *table;
				397	unsigned long flags;
				398
				399	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
				400	return;
				401
				402	table = rbio->fs_info->stripe_hash_table;
				403
				404	spin_lock_irqsave(&table->cache_lock, flags);
				405	__remove_rbio_from_cache(rbio);
				406	spin_unlock_irqrestore(&table->cache_lock, flags);
				407	}
				408
				409	/*
				410	* remove everything in the cache
				411	*/
Eric Sandeen	48a3b63	2013-04-25 20:41:01 +0000	[diff] [blame]	412	static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	413	{
				414	struct btrfs_stripe_hash_table *table;
				415	unsigned long flags;
				416	struct btrfs_raid_bio *rbio;
				417
				418	table = info->stripe_hash_table;
				419
				420	spin_lock_irqsave(&table->cache_lock, flags);
				421	while (!list_empty(&table->stripe_cache)) {
				422	rbio = list_entry(table->stripe_cache.next,
				423	struct btrfs_raid_bio,
				424	stripe_cache);
				425	__remove_rbio_from_cache(rbio);
				426	}
				427	spin_unlock_irqrestore(&table->cache_lock, flags);
				428	}
				429
				430	/*
				431	* remove all cached entries and free the hash table
				432	* used by unmount
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	433	*/
				434	void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
				435	{
				436	if (!info->stripe_hash_table)
				437	return;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	438	btrfs_clear_rbio_cache(info);
David Sterba	83c8266	2013-03-01 15:03:00 +0000	[diff] [blame]	439	if (is_vmalloc_addr(info->stripe_hash_table))
				440	vfree(info->stripe_hash_table);
				441	else
				442	kfree(info->stripe_hash_table);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	443	info->stripe_hash_table = NULL;
				444	}
				445
				446	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	447	* insert an rbio into the stripe cache. It
				448	* must have already been prepared by calling
				449	* cache_rbio_pages
				450	*
				451	* If this rbio was already cached, it gets
				452	* moved to the front of the lru.
				453	*
				454	* If the size of the rbio cache is too big, we
				455	* prune an item.
				456	*/
				457	static void cache_rbio(struct btrfs_raid_bio *rbio)
				458	{
				459	struct btrfs_stripe_hash_table *table;
				460	unsigned long flags;
				461
				462	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
				463	return;
				464
				465	table = rbio->fs_info->stripe_hash_table;
				466
				467	spin_lock_irqsave(&table->cache_lock, flags);
				468	spin_lock(&rbio->bio_list_lock);
				469
				470	/* bump our ref if we were not in the list before */
				471	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
				472	atomic_inc(&rbio->refs);
				473
				474	if (!list_empty(&rbio->stripe_cache)){
				475	list_move(&rbio->stripe_cache, &table->stripe_cache);
				476	} else {
				477	list_add(&rbio->stripe_cache, &table->stripe_cache);
				478	table->cache_size += 1;
				479	}
				480
				481	spin_unlock(&rbio->bio_list_lock);
				482
				483	if (table->cache_size > RBIO_CACHE_SIZE) {
				484	struct btrfs_raid_bio *found;
				485
				486	found = list_entry(table->stripe_cache.prev,
				487	struct btrfs_raid_bio,
				488	stripe_cache);
				489
				490	if (found != rbio)
				491	__remove_rbio_from_cache(found);
				492	}
				493
				494	spin_unlock_irqrestore(&table->cache_lock, flags);
				495	return;
				496	}
				497
				498	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	499	* helper function to run the xor_blocks api. It is only
				500	* able to do MAX_XOR_BLOCKS at a time, so we need to
				501	* loop through.
				502	*/
				503	static void run_xor(void **pages, int src_cnt, ssize_t len)
				504	{
				505	int src_off = 0;
				506	int xor_src_cnt = 0;
				507	void *dest = pages[src_cnt];
				508
				509	while(src_cnt > 0) {
				510	xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
				511	xor_blocks(xor_src_cnt, len, dest, pages + src_off);
				512
				513	src_cnt -= xor_src_cnt;
				514	src_off += xor_src_cnt;
				515	}
				516	}
				517
				518	/*
				519	* returns true if the bio list inside this rbio
				520	* covers an entire stripe (no rmw required).
				521	* Must be called with the bio list lock held, or
				522	* at a time when you know it is impossible to add
				523	* new bios into the list
				524	*/
				525	static int __rbio_is_full(struct btrfs_raid_bio *rbio)
				526	{
				527	unsigned long size = rbio->bio_list_bytes;
				528	int ret = 1;
				529
				530	if (size != rbio->nr_data * rbio->stripe_len)
				531	ret = 0;
				532
				533	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
				534	return ret;
				535	}
				536
				537	static int rbio_is_full(struct btrfs_raid_bio *rbio)
				538	{
				539	unsigned long flags;
				540	int ret;
				541
				542	spin_lock_irqsave(&rbio->bio_list_lock, flags);
				543	ret = __rbio_is_full(rbio);
				544	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
				545	return ret;
				546	}
				547
				548	/*
				549	* returns 1 if it is safe to merge two rbios together.
				550	* The merging is safe if the two rbios correspond to
				551	* the same stripe and if they are both going in the same
				552	* direction (read vs write), and if neither one is
				553	* locked for final IO
				554	*
				555	* The caller is responsible for locking such that
				556	* rmw_locked is safe to test
				557	*/
				558	static int rbio_can_merge(struct btrfs_raid_bio *last,
				559	struct btrfs_raid_bio *cur)
				560	{
				561	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) \|\|
				562	test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
				563	return 0;
				564
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	565	/*
				566	* we can't merge with cached rbios, since the
				567	* idea is that when we merge the destination
				568	* rbio is going to run our IO for us. We can
				569	* steal from cached rbio's though, other functions
				570	* handle that.
				571	*/
				572	if (test_bit(RBIO_CACHE_BIT, &last->flags) \|\|
				573	test_bit(RBIO_CACHE_BIT, &cur->flags))
				574	return 0;
				575
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	576	if (last->raid_map[0] !=
				577	cur->raid_map[0])
				578	return 0;
				579
				580	/* reads can't merge with writes */
				581	if (last->read_rebuild !=
				582	cur->read_rebuild) {
				583	return 0;
				584	}
				585
				586	return 1;
				587	}
				588
				589	/*
				590	* helper to index into the pstripe
				591	*/
				592	static struct page rbio_pstripe_page(struct btrfs_raid_bio rbio, int index)
				593	{
				594	index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
				595	return rbio->stripe_pages[index];
				596	}
				597
				598	/*
				599	* helper to index into the qstripe, returns null
				600	* if there is no qstripe
				601	*/
				602	static struct page rbio_qstripe_page(struct btrfs_raid_bio rbio, int index)
				603	{
				604	if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
				605	return NULL;
				606
				607	index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
				608	PAGE_CACHE_SHIFT;
				609	return rbio->stripe_pages[index];
				610	}
				611
				612	/*
				613	* The first stripe in the table for a logical address
				614	* has the lock. rbios are added in one of three ways:
				615	*
				616	* 1) Nobody has the stripe locked yet. The rbio is given
				617	* the lock and 0 is returned. The caller must start the IO
				618	* themselves.
				619	*
				620	* 2) Someone has the stripe locked, but we're able to merge
				621	* with the lock owner. The rbio is freed and the IO will
				622	* start automatically along with the existing rbio. 1 is returned.
				623	*
				624	* 3) Someone has the stripe locked, but we're not able to merge.
				625	* The rbio is added to the lock owner's plug list, or merged into
				626	* an rbio already on the plug list. When the lock owner unlocks,
				627	* the next rbio on the list is run and the IO is started automatically.
				628	* 1 is returned
				629	*
				630	* If we return 0, the caller still owns the rbio and must continue with
				631	* IO submission. If we return 1, the caller must assume the rbio has
				632	* already been freed.
				633	*/
				634	static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
				635	{
				636	int bucket = rbio_bucket(rbio);
				637	struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
				638	struct btrfs_raid_bio *cur;
				639	struct btrfs_raid_bio *pending;
				640	unsigned long flags;
				641	DEFINE_WAIT(wait);
				642	struct btrfs_raid_bio *freeit = NULL;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	643	struct btrfs_raid_bio *cache_drop = NULL;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	644	int ret = 0;
				645	int walk = 0;
				646
				647	spin_lock_irqsave(&h->lock, flags);
				648	list_for_each_entry(cur, &h->hash_list, hash_list) {
				649	walk++;
				650	if (cur->raid_map[0] == rbio->raid_map[0]) {
				651	spin_lock(&cur->bio_list_lock);
				652
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	653	/* can we steal this cached rbio's pages? */
				654	if (bio_list_empty(&cur->bio_list) &&
				655	list_empty(&cur->plug_list) &&
				656	test_bit(RBIO_CACHE_BIT, &cur->flags) &&
				657	!test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
				658	list_del_init(&cur->hash_list);
				659	atomic_dec(&cur->refs);
				660
				661	steal_rbio(cur, rbio);
				662	cache_drop = cur;
				663	spin_unlock(&cur->bio_list_lock);
				664
				665	goto lockit;
				666	}
				667
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	668	/* can we merge into the lock owner? */
				669	if (rbio_can_merge(cur, rbio)) {
				670	merge_rbio(cur, rbio);
				671	spin_unlock(&cur->bio_list_lock);
				672	freeit = rbio;
				673	ret = 1;
				674	goto out;
				675	}
				676
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	677
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	678	/*
				679	* we couldn't merge with the running
				680	* rbio, see if we can merge with the
				681	* pending ones. We don't have to
				682	* check for rmw_locked because there
				683	* is no way they are inside finish_rmw
				684	* right now
				685	*/
				686	list_for_each_entry(pending, &cur->plug_list,
				687	plug_list) {
				688	if (rbio_can_merge(pending, rbio)) {
				689	merge_rbio(pending, rbio);
				690	spin_unlock(&cur->bio_list_lock);
				691	freeit = rbio;
				692	ret = 1;
				693	goto out;
				694	}
				695	}
				696
				697	/* no merging, put us on the tail of the plug list,
				698	* our rbio will be started with the currently
				699	* running rbio unlocks
				700	*/
				701	list_add_tail(&rbio->plug_list, &cur->plug_list);
				702	spin_unlock(&cur->bio_list_lock);
				703	ret = 1;
				704	goto out;
				705	}
				706	}
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	707	lockit:
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	708	atomic_inc(&rbio->refs);
				709	list_add(&rbio->hash_list, &h->hash_list);
				710	out:
				711	spin_unlock_irqrestore(&h->lock, flags);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	712	if (cache_drop)
				713	remove_rbio_from_cache(cache_drop);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	714	if (freeit)
				715	__free_raid_bio(freeit);
				716	return ret;
				717	}
				718
				719	/*
				720	* called as rmw or parity rebuild is completed. If the plug list has more
				721	* rbios waiting for this stripe, the next one on the list will be started
				722	*/
				723	static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
				724	{
				725	int bucket;
				726	struct btrfs_stripe_hash *h;
				727	unsigned long flags;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	728	int keep_cache = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	729
				730	bucket = rbio_bucket(rbio);
				731	h = rbio->fs_info->stripe_hash_table->table + bucket;
				732
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	733	if (list_empty(&rbio->plug_list))
				734	cache_rbio(rbio);
				735
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	736	spin_lock_irqsave(&h->lock, flags);
				737	spin_lock(&rbio->bio_list_lock);
				738
				739	if (!list_empty(&rbio->hash_list)) {
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	740	/*
				741	* if we're still cached and there is no other IO
				742	* to perform, just leave this rbio here for others
				743	* to steal from later
				744	*/
				745	if (list_empty(&rbio->plug_list) &&
				746	test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
				747	keep_cache = 1;
				748	clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				749	BUG_ON(!bio_list_empty(&rbio->bio_list));
				750	goto done;
				751	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	752
				753	list_del_init(&rbio->hash_list);
				754	atomic_dec(&rbio->refs);
				755
				756	/*
				757	* we use the plug list to hold all the rbios
				758	* waiting for the chance to lock this stripe.
				759	* hand the lock over to one of them.
				760	*/
				761	if (!list_empty(&rbio->plug_list)) {
				762	struct btrfs_raid_bio *next;
				763	struct list_head *head = rbio->plug_list.next;
				764
				765	next = list_entry(head, struct btrfs_raid_bio,
				766	plug_list);
				767
				768	list_del_init(&rbio->plug_list);
				769
				770	list_add(&next->hash_list, &h->hash_list);
				771	atomic_inc(&next->refs);
				772	spin_unlock(&rbio->bio_list_lock);
				773	spin_unlock_irqrestore(&h->lock, flags);
				774
				775	if (next->read_rebuild)
				776	async_read_rebuild(next);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	777	else {
				778	steal_rbio(rbio, next);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	779	async_rmw_stripe(next);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	780	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	781
				782	goto done_nolock;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	783	} else if (waitqueue_active(&h->wait)) {
				784	spin_unlock(&rbio->bio_list_lock);
				785	spin_unlock_irqrestore(&h->lock, flags);
				786	wake_up(&h->wait);
				787	goto done_nolock;
				788	}
				789	}
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	790	done:
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	791	spin_unlock(&rbio->bio_list_lock);
				792	spin_unlock_irqrestore(&h->lock, flags);
				793
				794	done_nolock:
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	795	if (!keep_cache)
				796	remove_rbio_from_cache(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	797	}
				798
				799	static void __free_raid_bio(struct btrfs_raid_bio *rbio)
				800	{
				801	int i;
				802
				803	WARN_ON(atomic_read(&rbio->refs) < 0);
				804	if (!atomic_dec_and_test(&rbio->refs))
				805	return;
				806
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	807	WARN_ON(!list_empty(&rbio->stripe_cache));
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	808	WARN_ON(!list_empty(&rbio->hash_list));
				809	WARN_ON(!bio_list_empty(&rbio->bio_list));
				810
				811	for (i = 0; i < rbio->nr_pages; i++) {
				812	if (rbio->stripe_pages[i]) {
				813	__free_page(rbio->stripe_pages[i]);
				814	rbio->stripe_pages[i] = NULL;
				815	}
				816	}
				817	kfree(rbio->raid_map);
				818	kfree(rbio->bbio);
				819	kfree(rbio);
				820	}
				821
				822	static void free_raid_bio(struct btrfs_raid_bio *rbio)
				823	{
				824	unlock_stripe(rbio);
				825	__free_raid_bio(rbio);
				826	}
				827
				828	/*
				829	* this frees the rbio and runs through all the bios in the
				830	* bio_list and calls end_io on them
				831	*/
				832	static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
				833	{
				834	struct bio *cur = bio_list_get(&rbio->bio_list);
				835	struct bio *next;
				836	free_raid_bio(rbio);
				837
				838	while (cur) {
				839	next = cur->bi_next;
				840	cur->bi_next = NULL;
				841	if (uptodate)
				842	set_bit(BIO_UPTODATE, &cur->bi_flags);
				843	bio_endio(cur, err);
				844	cur = next;
				845	}
				846	}
				847
				848	/*
				849	* end io function used by finish_rmw. When we finally
				850	* get here, we've written a full stripe
				851	*/
				852	static void raid_write_end_io(struct bio *bio, int err)
				853	{
				854	struct btrfs_raid_bio *rbio = bio->bi_private;
				855
				856	if (err)
				857	fail_bio_stripe(rbio, bio);
				858
				859	bio_put(bio);
				860
				861	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
				862	return;
				863
				864	err = 0;
				865
				866	/* OK, we have read all the stripes we need to. */
				867	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
				868	err = -EIO;
				869
				870	rbio_orig_end_io(rbio, err, 0);
				871	return;
				872	}
				873
				874	/*
				875	* the read/modify/write code wants to use the original bio for
				876	* any pages it included, and then use the rbio for everything
				877	* else. This function decides if a given index (stripe number)
				878	* and page number in that stripe fall inside the original bio
				879	* or the rbio.
				880	*
				881	* if you set bio_list_only, you'll get a NULL back for any ranges
				882	* that are outside the bio_list
				883	*
				884	* This doesn't take any refs on anything, you get a bare page pointer
				885	* and the caller must bump refs as required.
				886	*
				887	* You must call index_rbio_pages once before you can trust
				888	* the answers from this function.
				889	*/
				890	static struct page page_in_rbio(struct btrfs_raid_bio rbio,
				891	int index, int pagenr, int bio_list_only)
				892	{
				893	int chunk_page;
				894	struct page *p = NULL;
				895
				896	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
				897
				898	spin_lock_irq(&rbio->bio_list_lock);
				899	p = rbio->bio_pages[chunk_page];
				900	spin_unlock_irq(&rbio->bio_list_lock);
				901
				902	if (p \|\| bio_list_only)
				903	return p;
				904
				905	return rbio->stripe_pages[chunk_page];
				906	}
				907
				908	/*
				909	* number of pages we need for the entire stripe across all the
				910	* drives
				911	*/
				912	static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
				913	{
				914	unsigned long nr = stripe_len * nr_stripes;
				915	return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
				916	}
				917
				918	/*
				919	* allocation and initial setup for the btrfs_raid_bio. Not
				920	* this does not allocate any pages for rbio->pages.
				921	*/
				922	static struct btrfs_raid_bio alloc_rbio(struct btrfs_root root,
				923	struct btrfs_bio bbio, u64 raid_map,
				924	u64 stripe_len)
				925	{
				926	struct btrfs_raid_bio *rbio;
				927	int nr_data = 0;
				928	int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
				929	void *p;
				930
				931	rbio = kzalloc(sizeof(rbio) + num_pages sizeof(struct page ) 2,
				932	GFP_NOFS);
				933	if (!rbio) {
				934	kfree(raid_map);
				935	kfree(bbio);
				936	return ERR_PTR(-ENOMEM);
				937	}
				938
				939	bio_list_init(&rbio->bio_list);
				940	INIT_LIST_HEAD(&rbio->plug_list);
				941	spin_lock_init(&rbio->bio_list_lock);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	942	INIT_LIST_HEAD(&rbio->stripe_cache);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	943	INIT_LIST_HEAD(&rbio->hash_list);
				944	rbio->bbio = bbio;
				945	rbio->raid_map = raid_map;
				946	rbio->fs_info = root->fs_info;
				947	rbio->stripe_len = stripe_len;
				948	rbio->nr_pages = num_pages;
				949	rbio->faila = -1;
				950	rbio->failb = -1;
				951	atomic_set(&rbio->refs, 1);
				952
				953	/*
				954	* the stripe_pages and bio_pages array point to the extra
				955	* memory we allocated past the end of the rbio
				956	*/
				957	p = rbio + 1;
				958	rbio->stripe_pages = p;
				959	rbio->bio_pages = p + sizeof(struct page ) num_pages;
				960
				961	if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
				962	nr_data = bbio->num_stripes - 2;
				963	else
				964	nr_data = bbio->num_stripes - 1;
				965
				966	rbio->nr_data = nr_data;
				967	return rbio;
				968	}
				969
				970	/* allocate pages for all the stripes in the bio, including parity */
				971	static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
				972	{
				973	int i;
				974	struct page *page;
				975
				976	for (i = 0; i < rbio->nr_pages; i++) {
				977	if (rbio->stripe_pages[i])
				978	continue;
				979	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				980	if (!page)
				981	return -ENOMEM;
				982	rbio->stripe_pages[i] = page;
				983	ClearPageUptodate(page);
				984	}
				985	return 0;
				986	}
				987
				988	/* allocate pages for just the p/q stripes */
				989	static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
				990	{
				991	int i;
				992	struct page *page;
				993
				994	i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
				995
				996	for (; i < rbio->nr_pages; i++) {
				997	if (rbio->stripe_pages[i])
				998	continue;
				999	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				1000	if (!page)
				1001	return -ENOMEM;
				1002	rbio->stripe_pages[i] = page;
				1003	}
				1004	return 0;
				1005	}
				1006
				1007	/*
				1008	* add a single page from a specific stripe into our list of bios for IO
				1009	* this will try to merge into existing bios if possible, and returns
				1010	* zero if all went well.
				1011	*/
Eric Sandeen	48a3b63	2013-04-25 20:41:01 +0000	[diff] [blame]	1012	static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
				1013	struct bio_list *bio_list,
				1014	struct page *page,
				1015	int stripe_nr,
				1016	unsigned long page_index,
				1017	unsigned long bio_max_len)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1018	{
				1019	struct bio *last = bio_list->tail;
				1020	u64 last_end = 0;
				1021	int ret;
				1022	struct bio *bio;
				1023	struct btrfs_bio_stripe *stripe;
				1024	u64 disk_start;
				1025
				1026	stripe = &rbio->bbio->stripes[stripe_nr];
				1027	disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
				1028
				1029	/* if the device is missing, just fail this stripe */
				1030	if (!stripe->dev->bdev)
				1031	return fail_rbio_index(rbio, stripe_nr);
				1032
				1033	/* see if we can add this page onto our existing bio */
				1034	if (last) {
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1035	last_end = (u64)last->bi_iter.bi_sector << 9;
				1036	last_end += last->bi_iter.bi_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1037
				1038	/*
				1039	* we can't merge these if they are from different
				1040	* devices or if they are not contiguous
				1041	*/
				1042	if (last_end == disk_start && stripe->dev->bdev &&
				1043	test_bit(BIO_UPTODATE, &last->bi_flags) &&
				1044	last->bi_bdev == stripe->dev->bdev) {
				1045	ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
				1046	if (ret == PAGE_CACHE_SIZE)
				1047	return 0;
				1048	}
				1049	}
				1050
				1051	/* put a new bio on the list */
Chris Mason	9be3395	2013-05-17 18:30:14 -0400	[diff] [blame]	1052	bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1053	if (!bio)
				1054	return -ENOMEM;
				1055
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1056	bio->bi_iter.bi_size = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1057	bio->bi_bdev = stripe->dev->bdev;
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1058	bio->bi_iter.bi_sector = disk_start >> 9;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1059	set_bit(BIO_UPTODATE, &bio->bi_flags);
				1060
				1061	bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
				1062	bio_list_add(bio_list, bio);
				1063	return 0;
				1064	}
				1065
				1066	/*
				1067	* while we're doing the read/modify/write cycle, we could
				1068	* have errors in reading pages off the disk. This checks
				1069	* for errors and if we're not able to read the page it'll
				1070	* trigger parity reconstruction. The rmw will be finished
				1071	* after we've reconstructed the failed stripes
				1072	*/
				1073	static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
				1074	{
				1075	if (rbio->faila >= 0 \|\| rbio->failb >= 0) {
				1076	BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
				1077	__raid56_parity_recover(rbio);
				1078	} else {
				1079	finish_rmw(rbio);
				1080	}
				1081	}
				1082
				1083	/*
				1084	* these are just the pages from the rbio array, not from anything
				1085	* the FS sent down to us
				1086	*/
				1087	static struct page rbio_stripe_page(struct btrfs_raid_bio rbio, int stripe, int page)
				1088	{
				1089	int index;
				1090	index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
				1091	index += page;
				1092	return rbio->stripe_pages[index];
				1093	}
				1094
				1095	/*
				1096	* helper function to walk our bio list and populate the bio_pages array with
				1097	* the result. This seems expensive, but it is faster than constantly
				1098	* searching through the bio list as we setup the IO in finish_rmw or stripe
				1099	* reconstruction.
				1100	*
				1101	* This must be called before you trust the answers from page_in_rbio
				1102	*/
				1103	static void index_rbio_pages(struct btrfs_raid_bio *rbio)
				1104	{
				1105	struct bio *bio;
				1106	u64 start;
				1107	unsigned long stripe_offset;
				1108	unsigned long page_index;
				1109	struct page *p;
				1110	int i;
				1111
				1112	spin_lock_irq(&rbio->bio_list_lock);
				1113	bio_list_for_each(bio, &rbio->bio_list) {
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1114	start = (u64)bio->bi_iter.bi_sector << 9;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1115	stripe_offset = start - rbio->raid_map[0];
				1116	page_index = stripe_offset >> PAGE_CACHE_SHIFT;
				1117
				1118	for (i = 0; i < bio->bi_vcnt; i++) {
				1119	p = bio->bi_io_vec[i].bv_page;
				1120	rbio->bio_pages[page_index + i] = p;
				1121	}
				1122	}
				1123	spin_unlock_irq(&rbio->bio_list_lock);
				1124	}
				1125
				1126	/*
				1127	* this is called from one of two situations. We either
				1128	* have a full stripe from the higher layers, or we've read all
				1129	* the missing bits off disk.
				1130	*
				1131	* This will calculate the parity and then send down any
				1132	* changed blocks.
				1133	*/
				1134	static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
				1135	{
				1136	struct btrfs_bio *bbio = rbio->bbio;
				1137	void *pointers[bbio->num_stripes];
				1138	int stripe_len = rbio->stripe_len;
				1139	int nr_data = rbio->nr_data;
				1140	int stripe;
				1141	int pagenr;
				1142	int p_stripe = -1;
				1143	int q_stripe = -1;
				1144	struct bio_list bio_list;
				1145	struct bio *bio;
				1146	int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
				1147	int ret;
				1148
				1149	bio_list_init(&bio_list);
				1150
				1151	if (bbio->num_stripes - rbio->nr_data == 1) {
				1152	p_stripe = bbio->num_stripes - 1;
				1153	} else if (bbio->num_stripes - rbio->nr_data == 2) {
				1154	p_stripe = bbio->num_stripes - 2;
				1155	q_stripe = bbio->num_stripes - 1;
				1156	} else {
				1157	BUG();
				1158	}
				1159
				1160	/* at this point we either have a full stripe,
				1161	* or we've read the full stripe from the drive.
				1162	* recalculate the parity and write the new results.
				1163	*
				1164	* We're not allowed to add any new bios to the
				1165	* bio list here, anyone else that wants to
				1166	* change this stripe needs to do their own rmw.
				1167	*/
				1168	spin_lock_irq(&rbio->bio_list_lock);
				1169	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				1170	spin_unlock_irq(&rbio->bio_list_lock);
				1171
				1172	atomic_set(&rbio->bbio->error, 0);
				1173
				1174	/*
				1175	* now that we've set rmw_locked, run through the
				1176	* bio list one last time and map the page pointers
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1177	*
				1178	* We don't cache full rbios because we're assuming
				1179	* the higher layers are unlikely to use this area of
				1180	* the disk again soon. If they do use it again,
				1181	* hopefully they will send another full bio.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1182	*/
				1183	index_rbio_pages(rbio);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1184	if (!rbio_is_full(rbio))
				1185	cache_rbio_pages(rbio);
				1186	else
				1187	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1188
				1189	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
				1190	struct page *p;
				1191	/* first collect one page from each data stripe */
				1192	for (stripe = 0; stripe < nr_data; stripe++) {
				1193	p = page_in_rbio(rbio, stripe, pagenr, 0);
				1194	pointers[stripe] = kmap(p);
				1195	}
				1196
				1197	/* then add the parity stripe */
				1198	p = rbio_pstripe_page(rbio, pagenr);
				1199	SetPageUptodate(p);
				1200	pointers[stripe++] = kmap(p);
				1201
				1202	if (q_stripe != -1) {
				1203
				1204	/*
				1205	* raid6, add the qstripe and call the
				1206	* library function to fill in our p/q
				1207	*/
				1208	p = rbio_qstripe_page(rbio, pagenr);
				1209	SetPageUptodate(p);
				1210	pointers[stripe++] = kmap(p);
				1211
				1212	raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
				1213	pointers);
				1214	} else {
				1215	/* raid5 */
				1216	memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
				1217	run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
				1218	}
				1219
				1220
				1221	for (stripe = 0; stripe < bbio->num_stripes; stripe++)
				1222	kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
				1223	}
				1224
				1225	/*
				1226	* time to start writing. Make bios for everything from the
				1227	* higher layers (the bio_list in our rbio) and our p/q. Ignore
				1228	* everything else.
				1229	*/
				1230	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
				1231	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
				1232	struct page *page;
				1233	if (stripe < rbio->nr_data) {
				1234	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1235	if (!page)
				1236	continue;
				1237	} else {
				1238	page = rbio_stripe_page(rbio, stripe, pagenr);
				1239	}
				1240
				1241	ret = rbio_add_io_page(rbio, &bio_list,
				1242	page, stripe, pagenr, rbio->stripe_len);
				1243	if (ret)
				1244	goto cleanup;
				1245	}
				1246	}
				1247
				1248	atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
				1249	BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
				1250
				1251	while (1) {
				1252	bio = bio_list_pop(&bio_list);
				1253	if (!bio)
				1254	break;
				1255
				1256	bio->bi_private = rbio;
				1257	bio->bi_end_io = raid_write_end_io;
				1258	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
				1259	submit_bio(WRITE, bio);
				1260	}
				1261	return;
				1262
				1263	cleanup:
				1264	rbio_orig_end_io(rbio, -EIO, 0);
				1265	}
				1266
				1267	/*
				1268	* helper to find the stripe number for a given bio. Used to figure out which
				1269	* stripe has failed. This expects the bio to correspond to a physical disk,
				1270	* so it looks up based on physical sector numbers.
				1271	*/
				1272	static int find_bio_stripe(struct btrfs_raid_bio *rbio,
				1273	struct bio *bio)
				1274	{
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1275	u64 physical = bio->bi_iter.bi_sector;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1276	u64 stripe_start;
				1277	int i;
				1278	struct btrfs_bio_stripe *stripe;
				1279
				1280	physical <<= 9;
				1281
				1282	for (i = 0; i < rbio->bbio->num_stripes; i++) {
				1283	stripe = &rbio->bbio->stripes[i];
				1284	stripe_start = stripe->physical;
				1285	if (physical >= stripe_start &&
				1286	physical < stripe_start + rbio->stripe_len) {
				1287	return i;
				1288	}
				1289	}
				1290	return -1;
				1291	}
				1292
				1293	/*
				1294	* helper to find the stripe number for a given
				1295	* bio (before mapping). Used to figure out which stripe has
				1296	* failed. This looks up based on logical block numbers.
				1297	*/
				1298	static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
				1299	struct bio *bio)
				1300	{
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1301	u64 logical = bio->bi_iter.bi_sector;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1302	u64 stripe_start;
				1303	int i;
				1304
				1305	logical <<= 9;
				1306
				1307	for (i = 0; i < rbio->nr_data; i++) {
				1308	stripe_start = rbio->raid_map[i];
				1309	if (logical >= stripe_start &&
				1310	logical < stripe_start + rbio->stripe_len) {
				1311	return i;
				1312	}
				1313	}
				1314	return -1;
				1315	}
				1316
				1317	/*
				1318	* returns -EIO if we had too many failures
				1319	*/
				1320	static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
				1321	{
				1322	unsigned long flags;
				1323	int ret = 0;
				1324
				1325	spin_lock_irqsave(&rbio->bio_list_lock, flags);
				1326
				1327	/* we already know this stripe is bad, move on */
				1328	if (rbio->faila == failed \|\| rbio->failb == failed)
				1329	goto out;
				1330
				1331	if (rbio->faila == -1) {
				1332	/* first failure on this rbio */
				1333	rbio->faila = failed;
				1334	atomic_inc(&rbio->bbio->error);
				1335	} else if (rbio->failb == -1) {
				1336	/* second failure on this rbio */
				1337	rbio->failb = failed;
				1338	atomic_inc(&rbio->bbio->error);
				1339	} else {
				1340	ret = -EIO;
				1341	}
				1342	out:
				1343	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
				1344
				1345	return ret;
				1346	}
				1347
				1348	/*
				1349	* helper to fail a stripe based on a physical disk
				1350	* bio.
				1351	*/
				1352	static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
				1353	struct bio *bio)
				1354	{
				1355	int failed = find_bio_stripe(rbio, bio);
				1356
				1357	if (failed < 0)
				1358	return -EIO;
				1359
				1360	return fail_rbio_index(rbio, failed);
				1361	}
				1362
				1363	/*
				1364	* this sets each page in the bio uptodate. It should only be used on private
				1365	* rbio pages, nothing that comes in from the higher layers
				1366	*/
				1367	static void set_bio_pages_uptodate(struct bio *bio)
				1368	{
				1369	int i;
				1370	struct page *p;
				1371
				1372	for (i = 0; i < bio->bi_vcnt; i++) {
				1373	p = bio->bi_io_vec[i].bv_page;
				1374	SetPageUptodate(p);
				1375	}
				1376	}
				1377
				1378	/*
				1379	* end io for the read phase of the rmw cycle. All the bios here are physical
				1380	* stripe bios we've read from the disk so we can recalculate the parity of the
				1381	* stripe.
				1382	*
				1383	* This will usually kick off finish_rmw once all the bios are read in, but it
				1384	* may trigger parity reconstruction if we had any errors along the way
				1385	*/
				1386	static void raid_rmw_end_io(struct bio *bio, int err)
				1387	{
				1388	struct btrfs_raid_bio *rbio = bio->bi_private;
				1389
				1390	if (err)
				1391	fail_bio_stripe(rbio, bio);
				1392	else
				1393	set_bio_pages_uptodate(bio);
				1394
				1395	bio_put(bio);
				1396
				1397	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
				1398	return;
				1399
				1400	err = 0;
				1401	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
				1402	goto cleanup;
				1403
				1404	/*
				1405	* this will normally call finish_rmw to start our write
				1406	* but if there are any failed stripes we'll reconstruct
				1407	* from parity first
				1408	*/
				1409	validate_rbio_for_rmw(rbio);
				1410	return;
				1411
				1412	cleanup:
				1413
				1414	rbio_orig_end_io(rbio, -EIO, 0);
				1415	}
				1416
				1417	static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
				1418	{
				1419	rbio->work.flags = 0;
				1420	rbio->work.func = rmw_work;
				1421
				1422	btrfs_queue_worker(&rbio->fs_info->rmw_workers,
				1423	&rbio->work);
				1424	}
				1425
				1426	static void async_read_rebuild(struct btrfs_raid_bio *rbio)
				1427	{
				1428	rbio->work.flags = 0;
				1429	rbio->work.func = read_rebuild_work;
				1430
				1431	btrfs_queue_worker(&rbio->fs_info->rmw_workers,
				1432	&rbio->work);
				1433	}
				1434
				1435	/*
				1436	* the stripe must be locked by the caller. It will
				1437	* unlock after all the writes are done
				1438	*/
				1439	static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
				1440	{
				1441	int bios_to_read = 0;
				1442	struct btrfs_bio *bbio = rbio->bbio;
				1443	struct bio_list bio_list;
				1444	int ret;
				1445	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
				1446	int pagenr;
				1447	int stripe;
				1448	struct bio *bio;
				1449
				1450	bio_list_init(&bio_list);
				1451
				1452	ret = alloc_rbio_pages(rbio);
				1453	if (ret)
				1454	goto cleanup;
				1455
				1456	index_rbio_pages(rbio);
				1457
				1458	atomic_set(&rbio->bbio->error, 0);
				1459	/*
				1460	* build a list of bios to read all the missing parts of this
				1461	* stripe
				1462	*/
				1463	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
				1464	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
				1465	struct page *page;
				1466	/*
				1467	* we want to find all the pages missing from
				1468	* the rbio and read them from the disk. If
				1469	* page_in_rbio finds a page in the bio list
				1470	* we don't need to read it off the stripe.
				1471	*/
				1472	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1473	if (page)
				1474	continue;
				1475
				1476	page = rbio_stripe_page(rbio, stripe, pagenr);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1477	/*
				1478	* the bio cache may have handed us an uptodate
				1479	* page. If so, be happy and use it
				1480	*/
				1481	if (PageUptodate(page))
				1482	continue;
				1483
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1484	ret = rbio_add_io_page(rbio, &bio_list, page,
				1485	stripe, pagenr, rbio->stripe_len);
				1486	if (ret)
				1487	goto cleanup;
				1488	}
				1489	}
				1490
				1491	bios_to_read = bio_list_size(&bio_list);
				1492	if (!bios_to_read) {
				1493	/*
				1494	* this can happen if others have merged with
				1495	* us, it means there is nothing left to read.
				1496	* But if there are missing devices it may not be
				1497	* safe to do the full stripe write yet.
				1498	*/
				1499	goto finish;
				1500	}
				1501
				1502	/*
				1503	* the bbio may be freed once we submit the last bio. Make sure
				1504	* not to touch it after that
				1505	*/
				1506	atomic_set(&bbio->stripes_pending, bios_to_read);
				1507	while (1) {
				1508	bio = bio_list_pop(&bio_list);
				1509	if (!bio)
				1510	break;
				1511
				1512	bio->bi_private = rbio;
				1513	bio->bi_end_io = raid_rmw_end_io;
				1514
				1515	btrfs_bio_wq_end_io(rbio->fs_info, bio,
				1516	BTRFS_WQ_ENDIO_RAID56);
				1517
				1518	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
				1519	submit_bio(READ, bio);
				1520	}
				1521	/* the actual write will happen once the reads are done */
				1522	return 0;
				1523
				1524	cleanup:
				1525	rbio_orig_end_io(rbio, -EIO, 0);
				1526	return -EIO;
				1527
				1528	finish:
				1529	validate_rbio_for_rmw(rbio);
				1530	return 0;
				1531	}
				1532
				1533	/*
				1534	* if the upper layers pass in a full stripe, we thank them by only allocating
				1535	* enough pages to hold the parity, and sending it all down quickly.
				1536	*/
				1537	static int full_stripe_write(struct btrfs_raid_bio *rbio)
				1538	{
				1539	int ret;
				1540
				1541	ret = alloc_rbio_parity_pages(rbio);
Miao Xie	3cd846d	2013-07-22 16:36:57 +0800	[diff] [blame]	1542	if (ret) {
				1543	__free_raid_bio(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1544	return ret;
Miao Xie	3cd846d	2013-07-22 16:36:57 +0800	[diff] [blame]	1545	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1546
				1547	ret = lock_stripe_add(rbio);
				1548	if (ret == 0)
				1549	finish_rmw(rbio);
				1550	return 0;
				1551	}
				1552
				1553	/*
				1554	* partial stripe writes get handed over to async helpers.
				1555	* We're really hoping to merge a few more writes into this
				1556	* rbio before calculating new parity
				1557	*/
				1558	static int partial_stripe_write(struct btrfs_raid_bio *rbio)
				1559	{
				1560	int ret;
				1561
				1562	ret = lock_stripe_add(rbio);
				1563	if (ret == 0)
				1564	async_rmw_stripe(rbio);
				1565	return 0;
				1566	}
				1567
				1568	/*
				1569	* sometimes while we were reading from the drive to
				1570	* recalculate parity, enough new bios come into create
				1571	* a full stripe. So we do a check here to see if we can
				1572	* go directly to finish_rmw
				1573	*/
				1574	static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
				1575	{
				1576	/* head off into rmw land if we don't have a full stripe */
				1577	if (!rbio_is_full(rbio))
				1578	return partial_stripe_write(rbio);
				1579	return full_stripe_write(rbio);
				1580	}
				1581
				1582	/*
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1583	* We use plugging call backs to collect full stripes.
				1584	* Any time we get a partial stripe write while plugged
				1585	* we collect it into a list. When the unplug comes down,
				1586	* we sort the list by logical block number and merge
				1587	* everything we can into the same rbios
				1588	*/
				1589	struct btrfs_plug_cb {
				1590	struct blk_plug_cb cb;
				1591	struct btrfs_fs_info *info;
				1592	struct list_head rbio_list;
				1593	struct btrfs_work work;
				1594	};
				1595
				1596	/*
				1597	* rbios on the plug list are sorted for easier merging.
				1598	*/
				1599	static int plug_cmp(void priv, struct list_head a, struct list_head *b)
				1600	{
				1601	struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
				1602	plug_list);
				1603	struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
				1604	plug_list);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1605	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
				1606	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1607
				1608	if (a_sector < b_sector)
				1609	return -1;
				1610	if (a_sector > b_sector)
				1611	return 1;
				1612	return 0;
				1613	}
				1614
				1615	static void run_plug(struct btrfs_plug_cb *plug)
				1616	{
				1617	struct btrfs_raid_bio *cur;
				1618	struct btrfs_raid_bio *last = NULL;
				1619
				1620	/*
				1621	* sort our plug list then try to merge
				1622	* everything we can in hopes of creating full
				1623	* stripes.
				1624	*/
				1625	list_sort(NULL, &plug->rbio_list, plug_cmp);
				1626	while (!list_empty(&plug->rbio_list)) {
				1627	cur = list_entry(plug->rbio_list.next,
				1628	struct btrfs_raid_bio, plug_list);
				1629	list_del_init(&cur->plug_list);
				1630
				1631	if (rbio_is_full(cur)) {
				1632	/* we have a full stripe, send it down */
				1633	full_stripe_write(cur);
				1634	continue;
				1635	}
				1636	if (last) {
				1637	if (rbio_can_merge(last, cur)) {
				1638	merge_rbio(last, cur);
				1639	__free_raid_bio(cur);
				1640	continue;
				1641
				1642	}
				1643	__raid56_parity_write(last);
				1644	}
				1645	last = cur;
				1646	}
				1647	if (last) {
				1648	__raid56_parity_write(last);
				1649	}
				1650	kfree(plug);
				1651	}
				1652
				1653	/*
				1654	* if the unplug comes from schedule, we have to push the
				1655	* work off to a helper thread
				1656	*/
				1657	static void unplug_work(struct btrfs_work *work)
				1658	{
				1659	struct btrfs_plug_cb *plug;
				1660	plug = container_of(work, struct btrfs_plug_cb, work);
				1661	run_plug(plug);
				1662	}
				1663
				1664	static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
				1665	{
				1666	struct btrfs_plug_cb *plug;
				1667	plug = container_of(cb, struct btrfs_plug_cb, cb);
				1668
				1669	if (from_schedule) {
				1670	plug->work.flags = 0;
				1671	plug->work.func = unplug_work;
				1672	btrfs_queue_worker(&plug->info->rmw_workers,
				1673	&plug->work);
				1674	return;
				1675	}
				1676	run_plug(plug);
				1677	}
				1678
				1679	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1680	* our main entry point for writes from the rest of the FS.
				1681	*/
				1682	int raid56_parity_write(struct btrfs_root root, struct bio bio,
				1683	struct btrfs_bio bbio, u64 raid_map,
				1684	u64 stripe_len)
				1685	{
				1686	struct btrfs_raid_bio *rbio;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1687	struct btrfs_plug_cb *plug = NULL;
				1688	struct blk_plug_cb *cb;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1689
				1690	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
Dan Carpenter	3dc0e81	2013-07-22 09:55:15 +0300	[diff] [blame]	1691	if (IS_ERR(rbio))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1692	return PTR_ERR(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1693	bio_list_add(&rbio->bio_list, bio);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1694	rbio->bio_list_bytes = bio->bi_iter.bi_size;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1695
				1696	/*
				1697	* don't plug on full rbios, just get them out the door
				1698	* as quickly as we can
				1699	*/
				1700	if (rbio_is_full(rbio))
				1701	return full_stripe_write(rbio);
				1702
				1703	cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
				1704	sizeof(*plug));
				1705	if (cb) {
				1706	plug = container_of(cb, struct btrfs_plug_cb, cb);
				1707	if (!plug->info) {
				1708	plug->info = root->fs_info;
				1709	INIT_LIST_HEAD(&plug->rbio_list);
				1710	}
				1711	list_add_tail(&rbio->plug_list, &plug->rbio_list);
				1712	} else {
				1713	return __raid56_parity_write(rbio);
				1714	}
				1715	return 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1716	}
				1717
				1718	/*
				1719	* all parity reconstruction happens here. We've read in everything
				1720	* we can find from the drives and this does the heavy lifting of
				1721	* sorting the good from the bad.
				1722	*/
				1723	static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
				1724	{
				1725	int pagenr, stripe;
				1726	void **pointers;
				1727	int faila = -1, failb = -1;
				1728	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
				1729	struct page *page;
				1730	int err;
				1731	int i;
				1732
				1733	pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
				1734	GFP_NOFS);
				1735	if (!pointers) {
				1736	err = -ENOMEM;
				1737	goto cleanup_io;
				1738	}
				1739
				1740	faila = rbio->faila;
				1741	failb = rbio->failb;
				1742
				1743	if (rbio->read_rebuild) {
				1744	spin_lock_irq(&rbio->bio_list_lock);
				1745	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				1746	spin_unlock_irq(&rbio->bio_list_lock);
				1747	}
				1748
				1749	index_rbio_pages(rbio);
				1750
				1751	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
				1752	/* setup our array of pointers with pages
				1753	* from each stripe
				1754	*/
				1755	for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
				1756	/*
				1757	* if we're rebuilding a read, we have to use
				1758	* pages from the bio list
				1759	*/
				1760	if (rbio->read_rebuild &&
				1761	(stripe == faila \|\| stripe == failb)) {
				1762	page = page_in_rbio(rbio, stripe, pagenr, 0);
				1763	} else {
				1764	page = rbio_stripe_page(rbio, stripe, pagenr);
				1765	}
				1766	pointers[stripe] = kmap(page);
				1767	}
				1768
				1769	/* all raid6 handling here */
				1770	if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
				1771	RAID6_Q_STRIPE) {
				1772
				1773	/*
				1774	* single failure, rebuild from parity raid5
				1775	* style
				1776	*/
				1777	if (failb < 0) {
				1778	if (faila == rbio->nr_data) {
				1779	/*
				1780	* Just the P stripe has failed, without
				1781	* a bad data or Q stripe.
				1782	* TODO, we should redo the xor here.
				1783	*/
				1784	err = -EIO;
				1785	goto cleanup;
				1786	}
				1787	/*
				1788	* a single failure in raid6 is rebuilt
				1789	* in the pstripe code below
				1790	*/
				1791	goto pstripe;
				1792	}
				1793
				1794	/* make sure our ps and qs are in order */
				1795	if (faila > failb) {
				1796	int tmp = failb;
				1797	failb = faila;
				1798	faila = tmp;
				1799	}
				1800
				1801	/* if the q stripe is failed, do a pstripe reconstruction
				1802	* from the xors.
				1803	* If both the q stripe and the P stripe are failed, we're
				1804	* here due to a crc mismatch and we can't give them the
				1805	* data they want
				1806	*/
				1807	if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
				1808	if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
				1809	err = -EIO;
				1810	goto cleanup;
				1811	}
				1812	/*
				1813	* otherwise we have one bad data stripe and
				1814	* a good P stripe. raid5!
				1815	*/
				1816	goto pstripe;
				1817	}
				1818
				1819	if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
				1820	raid6_datap_recov(rbio->bbio->num_stripes,
				1821	PAGE_SIZE, faila, pointers);
				1822	} else {
				1823	raid6_2data_recov(rbio->bbio->num_stripes,
				1824	PAGE_SIZE, faila, failb,
				1825	pointers);
				1826	}
				1827	} else {
				1828	void *p;
				1829
				1830	/* rebuild from P stripe here (raid5 or raid6) */
				1831	BUG_ON(failb != -1);
				1832	pstripe:
				1833	/* Copy parity block into failed block to start with */
				1834	memcpy(pointers[faila],
				1835	pointers[rbio->nr_data],
				1836	PAGE_CACHE_SIZE);
				1837
				1838	/* rearrange the pointer array */
				1839	p = pointers[faila];
				1840	for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
				1841	pointers[stripe] = pointers[stripe + 1];
				1842	pointers[rbio->nr_data - 1] = p;
				1843
				1844	/* xor in the rest */
				1845	run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
				1846	}
				1847	/* if we're doing this rebuild as part of an rmw, go through
				1848	* and set all of our private rbio pages in the
				1849	* failed stripes as uptodate. This way finish_rmw will
				1850	* know they can be trusted. If this was a read reconstruction,
				1851	* other endio functions will fiddle the uptodate bits
				1852	*/
				1853	if (!rbio->read_rebuild) {
				1854	for (i = 0; i < nr_pages; i++) {
				1855	if (faila != -1) {
				1856	page = rbio_stripe_page(rbio, faila, i);
				1857	SetPageUptodate(page);
				1858	}
				1859	if (failb != -1) {
				1860	page = rbio_stripe_page(rbio, failb, i);
				1861	SetPageUptodate(page);
				1862	}
				1863	}
				1864	}
				1865	for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
				1866	/*
				1867	* if we're rebuilding a read, we have to use
				1868	* pages from the bio list
				1869	*/
				1870	if (rbio->read_rebuild &&
				1871	(stripe == faila \|\| stripe == failb)) {
				1872	page = page_in_rbio(rbio, stripe, pagenr, 0);
				1873	} else {
				1874	page = rbio_stripe_page(rbio, stripe, pagenr);
				1875	}
				1876	kunmap(page);
				1877	}
				1878	}
				1879
				1880	err = 0;
				1881	cleanup:
				1882	kfree(pointers);
				1883
				1884	cleanup_io:
				1885
				1886	if (rbio->read_rebuild) {
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1887	if (err == 0)
				1888	cache_rbio_pages(rbio);
				1889	else
				1890	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				1891
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1892	rbio_orig_end_io(rbio, err, err == 0);
				1893	} else if (err == 0) {
				1894	rbio->faila = -1;
				1895	rbio->failb = -1;
				1896	finish_rmw(rbio);
				1897	} else {
				1898	rbio_orig_end_io(rbio, err, 0);
				1899	}
				1900	}
				1901
				1902	/*
				1903	* This is called only for stripes we've read from disk to
				1904	* reconstruct the parity.
				1905	*/
				1906	static void raid_recover_end_io(struct bio *bio, int err)
				1907	{
				1908	struct btrfs_raid_bio *rbio = bio->bi_private;
				1909
				1910	/*
				1911	* we only read stripe pages off the disk, set them
				1912	* up to date if there were no errors
				1913	*/
				1914	if (err)
				1915	fail_bio_stripe(rbio, bio);
				1916	else
				1917	set_bio_pages_uptodate(bio);
				1918	bio_put(bio);
				1919
				1920	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
				1921	return;
				1922
				1923	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
				1924	rbio_orig_end_io(rbio, -EIO, 0);
				1925	else
				1926	__raid_recover_end_io(rbio);
				1927	}
				1928
				1929	/*
				1930	* reads everything we need off the disk to reconstruct
				1931	* the parity. endio handlers trigger final reconstruction
				1932	* when the IO is done.
				1933	*
				1934	* This is used both for reads from the higher layers and for
				1935	* parity construction required to finish a rmw cycle.
				1936	*/
				1937	static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
				1938	{
				1939	int bios_to_read = 0;
				1940	struct btrfs_bio *bbio = rbio->bbio;
				1941	struct bio_list bio_list;
				1942	int ret;
				1943	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
				1944	int pagenr;
				1945	int stripe;
				1946	struct bio *bio;
				1947
				1948	bio_list_init(&bio_list);
				1949
				1950	ret = alloc_rbio_pages(rbio);
				1951	if (ret)
				1952	goto cleanup;
				1953
				1954	atomic_set(&rbio->bbio->error, 0);
				1955
				1956	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1957	* read everything that hasn't failed. Thanks to the
				1958	* stripe cache, it is possible that some or all of these
				1959	* pages are going to be uptodate.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1960	*/
				1961	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
				1962	if (rbio->faila == stripe \|\|
				1963	rbio->failb == stripe)
				1964	continue;
				1965
				1966	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
				1967	struct page *p;
				1968
				1969	/*
				1970	* the rmw code may have already read this
				1971	* page in
				1972	*/
				1973	p = rbio_stripe_page(rbio, stripe, pagenr);
				1974	if (PageUptodate(p))
				1975	continue;
				1976
				1977	ret = rbio_add_io_page(rbio, &bio_list,
				1978	rbio_stripe_page(rbio, stripe, pagenr),
				1979	stripe, pagenr, rbio->stripe_len);
				1980	if (ret < 0)
				1981	goto cleanup;
				1982	}
				1983	}
				1984
				1985	bios_to_read = bio_list_size(&bio_list);
				1986	if (!bios_to_read) {
				1987	/*
				1988	* we might have no bios to read just because the pages
				1989	* were up to date, or we might have no bios to read because
				1990	* the devices were gone.
				1991	*/
				1992	if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
				1993	__raid_recover_end_io(rbio);
				1994	goto out;
				1995	} else {
				1996	goto cleanup;
				1997	}
				1998	}
				1999
				2000	/*
				2001	* the bbio may be freed once we submit the last bio. Make sure
				2002	* not to touch it after that
				2003	*/
				2004	atomic_set(&bbio->stripes_pending, bios_to_read);
				2005	while (1) {
				2006	bio = bio_list_pop(&bio_list);
				2007	if (!bio)
				2008	break;
				2009
				2010	bio->bi_private = rbio;
				2011	bio->bi_end_io = raid_recover_end_io;
				2012
				2013	btrfs_bio_wq_end_io(rbio->fs_info, bio,
				2014	BTRFS_WQ_ENDIO_RAID56);
				2015
				2016	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
				2017	submit_bio(READ, bio);
				2018	}
				2019	out:
				2020	return 0;
				2021
				2022	cleanup:
				2023	if (rbio->read_rebuild)
				2024	rbio_orig_end_io(rbio, -EIO, 0);
				2025	return -EIO;
				2026	}
				2027
				2028	/*
				2029	* the main entry point for reads from the higher layers. This
				2030	* is really only called when the normal read path had a failure,
				2031	* so we assume the bio they send down corresponds to a failed part
				2032	* of the drive.
				2033	*/
				2034	int raid56_parity_recover(struct btrfs_root root, struct bio bio,
				2035	struct btrfs_bio bbio, u64 raid_map,
				2036	u64 stripe_len, int mirror_num)
				2037	{
				2038	struct btrfs_raid_bio *rbio;
				2039	int ret;
				2040
				2041	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
Dan Carpenter	3dc0e81	2013-07-22 09:55:15 +0300	[diff] [blame]	2042	if (IS_ERR(rbio))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2043	return PTR_ERR(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2044
				2045	rbio->read_rebuild = 1;
				2046	bio_list_add(&rbio->bio_list, bio);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	2047	rbio->bio_list_bytes = bio->bi_iter.bi_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2048
				2049	rbio->faila = find_logical_bio_stripe(rbio, bio);
				2050	if (rbio->faila == -1) {
				2051	BUG();
Dan Carpenter	3dc0e81	2013-07-22 09:55:15 +0300	[diff] [blame]	2052	kfree(raid_map);
				2053	kfree(bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2054	kfree(rbio);
				2055	return -EIO;
				2056	}
				2057
				2058	/*
				2059	* reconstruct from the q stripe if they are
				2060	* asking for mirror 3
				2061	*/
				2062	if (mirror_num == 3)
				2063	rbio->failb = bbio->num_stripes - 2;
				2064
				2065	ret = lock_stripe_add(rbio);
				2066
				2067	/*
				2068	* __raid56_parity_recover will end the bio with
				2069	* any errors it hits. We don't want to return
				2070	* its error value up the stack because our caller
				2071	* will end up calling bio_endio with any nonzero
				2072	* return
				2073	*/
				2074	if (ret == 0)
				2075	__raid56_parity_recover(rbio);
				2076	/*
				2077	* our rbio has been added to the list of
				2078	* rbios that will be handled after the
				2079	* currently lock owner is done
				2080	*/
				2081	return 0;
				2082
				2083	}
				2084
				2085	static void rmw_work(struct btrfs_work *work)
				2086	{
				2087	struct btrfs_raid_bio *rbio;
				2088
				2089	rbio = container_of(work, struct btrfs_raid_bio, work);
				2090	raid56_rmw_stripe(rbio);
				2091	}
				2092
				2093	static void read_rebuild_work(struct btrfs_work *work)
				2094	{
				2095	struct btrfs_raid_bio *rbio;
				2096
				2097	rbio = container_of(work, struct btrfs_raid_bio, work);
				2098	__raid56_parity_recover(rbio);
				2099	}