David Woodhouse53b381b2013-01-29 18:40:14 -05001/*
2 * Copyright (C) 2012 Fusion-io All rights reserved.
3 * Copyright (C) 2012 Intel Corp. All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public
7 * License v2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public
15 * License along with this program; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 021110-1307, USA.
18 */
19#include <linux/sched.h>
20#include <linux/wait.h>
21#include <linux/bio.h>
22#include <linux/slab.h>
23#include <linux/buffer_head.h>
24#include <linux/blkdev.h>
25#include <linux/random.h>
26#include <linux/iocontext.h>
27#include <linux/capability.h>
28#include <linux/ratelimit.h>
29#include <linux/kthread.h>
30#include <linux/raid/pq.h>
31#include <linux/hash.h>
32#include <linux/list_sort.h>
33#include <linux/raid/xor.h>
Geert Uytterhoevend7011f52013-03-03 04:44:41 -070034#include <linux/vmalloc.h>
David Woodhouse53b381b2013-01-29 18:40:14 -050035#include <asm/div64.h>
David Woodhouse53b381b2013-01-29 18:40:14 -050036#include "ctree.h"
37#include "extent_map.h"
38#include "disk-io.h"
39#include "transaction.h"
40#include "print-tree.h"
41#include "volumes.h"
42#include "raid56.h"
43#include "async-thread.h"
44#include "check-integrity.h"
45#include "rcu-string.h"
46
47/* set when additional merges to this rbio are not allowed */
48#define RBIO_RMW_LOCKED_BIT 1
49
Chris Mason4ae10b32013-01-31 14:42:09 -050050/*
51 * set when this rbio is sitting in the hash, but it is just a cache
52 * of past RMW
53 */
54#define RBIO_CACHE_BIT 2
55
56/*
57 * set when it is safe to trust the stripe_pages for caching
58 */
59#define RBIO_CACHE_READY_BIT 3
60
Miao Xieaf8e2d12014-10-23 14:42:50 +080061/*
62 * bbio and raid_map are managed by the caller, so we shouldn't free
63 * them here. Beyond that, rbios with this flag set must not be
64 * cached, because we need the raid_map to check whether two rbios
65 * cover the same stripe, and the caller has very likely freed the
66 * raid_map already, so don't cache those rbios.
67 */
68#define RBIO_HOLD_BBIO_MAP_BIT 4
69
Chris Mason4ae10b32013-01-31 14:42:09 -050070#define RBIO_CACHE_SIZE 1024
71
Miao Xie1b94b552014-11-06 16:14:21 +080072enum btrfs_rbio_ops {
73 BTRFS_RBIO_WRITE = 0,
74 BTRFS_RBIO_READ_REBUILD = 1,
75};
76
David Woodhouse53b381b2013-01-29 18:40:14 -050077struct btrfs_raid_bio {
78 struct btrfs_fs_info *fs_info;
79 struct btrfs_bio *bbio;
80
81 /*
82 * logical block numbers for the start of each stripe
83 * The last one or two are p/q. These are sorted,
84 * so raid_map[0] is the start of our full stripe
85 */
86 u64 *raid_map;
87
88 /* while we're doing rmw on a stripe
89 * we put it into a hash table so we can
90 * lock the stripe and merge more rbios
91 * into it.
92 */
93 struct list_head hash_list;
94
95 /*
Chris Mason4ae10b32013-01-31 14:42:09 -050096 * LRU list for the stripe cache
97 */
98 struct list_head stripe_cache;
99
100 /*
David Woodhouse53b381b2013-01-29 18:40:14 -0500101 * for scheduling work in the helper threads
102 */
103 struct btrfs_work work;
104
105 /*
106 * bio list and bio_list_lock are used
107 * to add more bios into the stripe
108 * in hopes of avoiding the full rmw
109 */
110 struct bio_list bio_list;
111 spinlock_t bio_list_lock;
112
Chris Mason6ac0f482013-01-31 14:42:28 -0500113 /* also protected by the bio_list_lock, the
114 * plug list is used by the plugging code
115 * to collect partial bios while plugged. The
116 * stripe locking code also uses it to hand off
David Woodhouse53b381b2013-01-29 18:40:14 -0500117 * the stripe lock to the next pending IO
118 */
119 struct list_head plug_list;
120
121 /*
122 * flags that tell us if it is safe to
123 * merge with this bio
124 */
125 unsigned long flags;
126
127 /* size of each individual stripe on disk */
128 int stripe_len;
129
130 /* number of data stripes (no p/q) */
131 int nr_data;
132
133 /*
134 * which operation this rbio is servicing: a write, or a
135 * parity rebuild for a read from higher up, which is handled
136 * differently from a parity rebuild done as part of
137 * rmw
138 */
Miao Xie1b94b552014-11-06 16:14:21 +0800139 enum btrfs_rbio_ops operation;
David Woodhouse53b381b2013-01-29 18:40:14 -0500140
141 /* first bad stripe */
142 int faila;
143
144 /* second bad stripe (for raid6 use) */
145 int failb;
146
147 /*
148 * number of pages needed to represent the full
149 * stripe
150 */
151 int nr_pages;
152
153 /*
154 * size of all the bios in the bio_list. This
155 * helps us decide if the rbio maps to a full
156 * stripe or not
157 */
158 int bio_list_bytes;
159
160 atomic_t refs;
161
Miao Xieb89e1b02014-10-15 11:18:44 +0800162 atomic_t stripes_pending;
163
164 atomic_t error;
David Woodhouse53b381b2013-01-29 18:40:14 -0500165 /*
166 * these are two arrays of pointers. We allocate the
167 * rbio big enough to hold them both and set up their
168 * locations when the rbio is allocated
169 */
170
171 /* pointers to pages that we allocated for
172 * reading/writing stripes directly from the disk (including P/Q)
173 */
174 struct page **stripe_pages;
175
176 /*
177 * pointers to the pages in the bio_list. Stored
178 * here for faster lookup
179 */
180 struct page **bio_pages;
181};
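/*
 * note: stripe_pages and bio_pages are not allocated separately; alloc_rbio()
 * sizes its kzalloc() to hold the struct plus both arrays and points them at
 * the memory just past the end of the struct
 */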
182
183static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
184static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
185static void rmw_work(struct btrfs_work *work);
186static void read_rebuild_work(struct btrfs_work *work);
187static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
188static void async_read_rebuild(struct btrfs_raid_bio *rbio);
189static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
190static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
191static void __free_raid_bio(struct btrfs_raid_bio *rbio);
192static void index_rbio_pages(struct btrfs_raid_bio *rbio);
193static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
194
195/*
196 * the stripe hash table is used for locking, and to collect
197 * bios in hopes of making a full stripe
198 */
199int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
200{
201 struct btrfs_stripe_hash_table *table;
202 struct btrfs_stripe_hash_table *x;
203 struct btrfs_stripe_hash *cur;
204 struct btrfs_stripe_hash *h;
205 int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
206 int i;
David Sterba83c82662013-03-01 15:03:00 +0000207 int table_size;
David Woodhouse53b381b2013-01-29 18:40:14 -0500208
209 if (info->stripe_hash_table)
210 return 0;
211
David Sterba83c82662013-03-01 15:03:00 +0000212 /*
213 * The table is large, starting with order 4 and can go as high as
214 * order 7 in case lock debugging is turned on.
215 *
216 * Try harder to allocate and fallback to vmalloc to lower the chance
217 * of a failing mount.
218 */
219 table_size = sizeof(*table) + sizeof(*h) * num_entries;
220 table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
221 if (!table) {
222 table = vzalloc(table_size);
223 if (!table)
224 return -ENOMEM;
225 }
David Woodhouse53b381b2013-01-29 18:40:14 -0500226
Chris Mason4ae10b32013-01-31 14:42:09 -0500227 spin_lock_init(&table->cache_lock);
228 INIT_LIST_HEAD(&table->stripe_cache);
229
David Woodhouse53b381b2013-01-29 18:40:14 -0500230 h = table->table;
231
232 for (i = 0; i < num_entries; i++) {
233 cur = h + i;
234 INIT_LIST_HEAD(&cur->hash_list);
235 spin_lock_init(&cur->lock);
236 init_waitqueue_head(&cur->wait);
237 }
238
239 x = cmpxchg(&info->stripe_hash_table, NULL, table);
David Sterba83c82662013-03-01 15:03:00 +0000240 if (x) {
241 if (is_vmalloc_addr(x))
242 vfree(x);
243 else
244 kfree(x);
245 }
David Woodhouse53b381b2013-01-29 18:40:14 -0500246 return 0;
247}
248
249/*
Chris Mason4ae10b32013-01-31 14:42:09 -0500250 * caching an rbio means to copy anything from the
251 * bio_pages array into the stripe_pages array. We
252 * use the page uptodate bit in the stripe cache array
253 * to indicate if it has valid data
254 *
255 * once the caching is done, we set the cache ready
256 * bit.
257 */
258static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
259{
260 int i;
261 char *s;
262 char *d;
263 int ret;
264
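	/* if we can't get pages, skip caching; RBIO_CACHE_READY_BIT stays clear */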
265 ret = alloc_rbio_pages(rbio);
266 if (ret)
267 return;
268
269 for (i = 0; i < rbio->nr_pages; i++) {
270 if (!rbio->bio_pages[i])
271 continue;
272
273 s = kmap(rbio->bio_pages[i]);
274 d = kmap(rbio->stripe_pages[i]);
275
276 memcpy(d, s, PAGE_CACHE_SIZE);
277
278 kunmap(rbio->bio_pages[i]);
279 kunmap(rbio->stripe_pages[i]);
280 SetPageUptodate(rbio->stripe_pages[i]);
281 }
282 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
283}
284
285/*
David Woodhouse53b381b2013-01-29 18:40:14 -0500286 * we hash on the first logical address of the stripe
287 */
288static int rbio_bucket(struct btrfs_raid_bio *rbio)
289{
290 u64 num = rbio->raid_map[0];
291
292 /*
293 * we shift down quite a bit. We're using byte
294 * addressing, and most of the lower bits are zeros.
295 * This tends to upset hash_64, and it consistently
296 * returns just one or two different values.
297 *
298 * shifting off the lower bits fixes things.
299 */
300 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
301}
302
303/*
Chris Mason4ae10b32013-01-31 14:42:09 -0500304 * stealing an rbio means taking all the uptodate pages from the stripe
305 * array in the source rbio and putting them into the destination rbio
306 */
307static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
308{
309 int i;
310 struct page *s;
311 struct page *d;
312
313 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
314 return;
315
316 for (i = 0; i < dest->nr_pages; i++) {
317 s = src->stripe_pages[i];
318 if (!s || !PageUptodate(s)) {
319 continue;
320 }
321
322 d = dest->stripe_pages[i];
323 if (d)
324 __free_page(d);
325
326 dest->stripe_pages[i] = s;
327 src->stripe_pages[i] = NULL;
328 }
329}
330
331/*
David Woodhouse53b381b2013-01-29 18:40:14 -0500332 * merging means we take the bio_list from the victim and
333 * splice it into the destination. The victim should
334 * be discarded afterwards.
335 *
336 * must be called with dest->bio_list_lock held
337 */
338static void merge_rbio(struct btrfs_raid_bio *dest,
339 struct btrfs_raid_bio *victim)
340{
341 bio_list_merge(&dest->bio_list, &victim->bio_list);
342 dest->bio_list_bytes += victim->bio_list_bytes;
343 bio_list_init(&victim->bio_list);
344}
345
346/*
Chris Mason4ae10b32013-01-31 14:42:09 -0500347 * used to prune items that are in the cache. The caller
348 * must hold the hash table lock.
349 */
350static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
351{
352 int bucket = rbio_bucket(rbio);
353 struct btrfs_stripe_hash_table *table;
354 struct btrfs_stripe_hash *h;
355 int freeit = 0;
356
357 /*
358 * check the bit again under the hash table lock.
359 */
360 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
361 return;
362
363 table = rbio->fs_info->stripe_hash_table;
364 h = table->table + bucket;
365
366 /* hold the lock for the bucket because we may be
367 * removing it from the hash table
368 */
369 spin_lock(&h->lock);
370
371 /*
372 * hold the lock for the bio list because we need
373 * to make sure the bio list is empty
374 */
375 spin_lock(&rbio->bio_list_lock);
376
377 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
378 list_del_init(&rbio->stripe_cache);
379 table->cache_size -= 1;
380 freeit = 1;
381
382 /* if the bio list isn't empty, this rbio is
383 * still involved in an IO. We take it out
384 * of the cache list, and drop the ref that
385 * was held for the list.
386 *
387 * If the bio_list was empty, we also remove
388 * the rbio from the hash_table, and drop
389 * the corresponding ref
390 */
391 if (bio_list_empty(&rbio->bio_list)) {
392 if (!list_empty(&rbio->hash_list)) {
393 list_del_init(&rbio->hash_list);
394 atomic_dec(&rbio->refs);
395 BUG_ON(!list_empty(&rbio->plug_list));
396 }
397 }
398 }
399
400 spin_unlock(&rbio->bio_list_lock);
401 spin_unlock(&h->lock);
402
403 if (freeit)
404 __free_raid_bio(rbio);
405}
406
407/*
408 * prune a given rbio from the cache
409 */
410static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
411{
412 struct btrfs_stripe_hash_table *table;
413 unsigned long flags;
414
415 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
416 return;
417
418 table = rbio->fs_info->stripe_hash_table;
419
420 spin_lock_irqsave(&table->cache_lock, flags);
421 __remove_rbio_from_cache(rbio);
422 spin_unlock_irqrestore(&table->cache_lock, flags);
423}
424
425/*
426 * remove everything in the cache
427 */
Eric Sandeen48a3b632013-04-25 20:41:01 +0000428static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
Chris Mason4ae10b32013-01-31 14:42:09 -0500429{
430 struct btrfs_stripe_hash_table *table;
431 unsigned long flags;
432 struct btrfs_raid_bio *rbio;
433
434 table = info->stripe_hash_table;
435
436 spin_lock_irqsave(&table->cache_lock, flags);
437 while (!list_empty(&table->stripe_cache)) {
438 rbio = list_entry(table->stripe_cache.next,
439 struct btrfs_raid_bio,
440 stripe_cache);
441 __remove_rbio_from_cache(rbio);
442 }
443 spin_unlock_irqrestore(&table->cache_lock, flags);
444}
445
446/*
447 * remove all cached entries and free the hash table
448 * used by unmount
David Woodhouse53b381b2013-01-29 18:40:14 -0500449 */
450void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
451{
452 if (!info->stripe_hash_table)
453 return;
Chris Mason4ae10b32013-01-31 14:42:09 -0500454 btrfs_clear_rbio_cache(info);
David Sterba83c82662013-03-01 15:03:00 +0000455 if (is_vmalloc_addr(info->stripe_hash_table))
456 vfree(info->stripe_hash_table);
457 else
458 kfree(info->stripe_hash_table);
David Woodhouse53b381b2013-01-29 18:40:14 -0500459 info->stripe_hash_table = NULL;
460}
461
462/*
Chris Mason4ae10b32013-01-31 14:42:09 -0500463 * insert an rbio into the stripe cache. It
464 * must have already been prepared by calling
465 * cache_rbio_pages
466 *
467 * If this rbio was already cached, it gets
468 * moved to the front of the lru.
469 *
470 * If the size of the rbio cache is too big, we
471 * prune an item.
472 */
473static void cache_rbio(struct btrfs_raid_bio *rbio)
474{
475 struct btrfs_stripe_hash_table *table;
476 unsigned long flags;
477
478 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
479 return;
480
481 table = rbio->fs_info->stripe_hash_table;
482
483 spin_lock_irqsave(&table->cache_lock, flags);
484 spin_lock(&rbio->bio_list_lock);
485
486 /* bump our ref if we were not in the list before */
487 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
488 atomic_inc(&rbio->refs);
489
490	if (!list_empty(&rbio->stripe_cache)) {
491 list_move(&rbio->stripe_cache, &table->stripe_cache);
492 } else {
493 list_add(&rbio->stripe_cache, &table->stripe_cache);
494 table->cache_size += 1;
495 }
496
497 spin_unlock(&rbio->bio_list_lock);
498
499 if (table->cache_size > RBIO_CACHE_SIZE) {
500 struct btrfs_raid_bio *found;
501
502 found = list_entry(table->stripe_cache.prev,
503 struct btrfs_raid_bio,
504 stripe_cache);
505
506 if (found != rbio)
507 __remove_rbio_from_cache(found);
508 }
509
510 spin_unlock_irqrestore(&table->cache_lock, flags);
511 return;
512}
513
514/*
David Woodhouse53b381b2013-01-29 18:40:14 -0500515 * helper function to run the xor_blocks api. It is only
516 * able to do MAX_XOR_BLOCKS at a time, so we need to
517 * loop through.
518 */
519static void run_xor(void **pages, int src_cnt, ssize_t len)
520{
521 int src_off = 0;
522 int xor_src_cnt = 0;
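	/* the caller appends the destination buffer after the src_cnt sources */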
523 void *dest = pages[src_cnt];
524
525	while (src_cnt > 0) {
526 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
527 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
528
529 src_cnt -= xor_src_cnt;
530 src_off += xor_src_cnt;
531 }
532}
533
534/*
535 * returns true if the bio list inside this rbio
536 * covers an entire stripe (no rmw required).
537 * Must be called with the bio list lock held, or
538 * at a time when you know it is impossible to add
539 * new bios into the list
540 */
541static int __rbio_is_full(struct btrfs_raid_bio *rbio)
542{
543 unsigned long size = rbio->bio_list_bytes;
544 int ret = 1;
545
546 if (size != rbio->nr_data * rbio->stripe_len)
547 ret = 0;
548
549 BUG_ON(size > rbio->nr_data * rbio->stripe_len);
550 return ret;
551}
552
553static int rbio_is_full(struct btrfs_raid_bio *rbio)
554{
555 unsigned long flags;
556 int ret;
557
558 spin_lock_irqsave(&rbio->bio_list_lock, flags);
559 ret = __rbio_is_full(rbio);
560 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
561 return ret;
562}
563
564/*
565 * returns 1 if it is safe to merge two rbios together.
566 * The merging is safe if the two rbios correspond to
567 * the same stripe and if they are both going in the same
568 * direction (read vs write), and if neither one is
569 * locked for final IO
570 *
571 * The caller is responsible for locking such that
572 * rmw_locked is safe to test
573 */
574static int rbio_can_merge(struct btrfs_raid_bio *last,
575 struct btrfs_raid_bio *cur)
576{
577 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
578 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
579 return 0;
580
Chris Mason4ae10b32013-01-31 14:42:09 -0500581 /*
582 * we can't merge with cached rbios, since the
583 * idea is that when we merge the destination
584 * rbio is going to run our IO for us. We can
585 * steal from cached rbios though, other functions
586 * handle that.
587 */
588 if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
589 test_bit(RBIO_CACHE_BIT, &cur->flags))
590 return 0;
591
David Woodhouse53b381b2013-01-29 18:40:14 -0500592 if (last->raid_map[0] !=
593 cur->raid_map[0])
594 return 0;
595
596 /* reads can't merge with writes */
Miao Xie1b94b552014-11-06 16:14:21 +0800597 if (last->operation != cur->operation) {
David Woodhouse53b381b2013-01-29 18:40:14 -0500598 return 0;
599 }
600
601 return 1;
602}
603
604/*
605 * helper to index into the pstripe
606 */
607static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
608{
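	/* the P pages start right after the pages of the nr_data data stripes */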
609 index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
610 return rbio->stripe_pages[index];
611}
612
613/*
614 * helper to index into the qstripe, returns null
615 * if there is no qstripe
616 */
617static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
618{
619 if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
620 return NULL;
621
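	/* the Q pages start right after the P pages, nr_data + 1 stripes in */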
622 index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
623 PAGE_CACHE_SHIFT;
624 return rbio->stripe_pages[index];
625}
626
627/*
628 * The first stripe in the table for a logical address
629 * has the lock. rbios are added in one of three ways:
630 *
631 * 1) Nobody has the stripe locked yet. The rbio is given
632 * the lock and 0 is returned. The caller must start the IO
633 * themselves.
634 *
635 * 2) Someone has the stripe locked, but we're able to merge
636 * with the lock owner. The rbio is freed and the IO will
637 * start automatically along with the existing rbio. 1 is returned.
638 *
639 * 3) Someone has the stripe locked, but we're not able to merge.
640 * The rbio is added to the lock owner's plug list, or merged into
641 * an rbio already on the plug list. When the lock owner unlocks,
642 * the next rbio on the list is run and the IO is started automatically.
643 * 1 is returned
644 *
645 * If we return 0, the caller still owns the rbio and must continue with
646 * IO submission. If we return 1, the caller must assume the rbio has
647 * already been freed.
648 */
649static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
650{
651 int bucket = rbio_bucket(rbio);
652 struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
653 struct btrfs_raid_bio *cur;
654 struct btrfs_raid_bio *pending;
655 unsigned long flags;
656 DEFINE_WAIT(wait);
657 struct btrfs_raid_bio *freeit = NULL;
Chris Mason4ae10b32013-01-31 14:42:09 -0500658 struct btrfs_raid_bio *cache_drop = NULL;
David Woodhouse53b381b2013-01-29 18:40:14 -0500659 int ret = 0;
660 int walk = 0;
661
662 spin_lock_irqsave(&h->lock, flags);
663 list_for_each_entry(cur, &h->hash_list, hash_list) {
664 walk++;
665 if (cur->raid_map[0] == rbio->raid_map[0]) {
666 spin_lock(&cur->bio_list_lock);
667
Chris Mason4ae10b32013-01-31 14:42:09 -0500668 /* can we steal this cached rbio's pages? */
669 if (bio_list_empty(&cur->bio_list) &&
670 list_empty(&cur->plug_list) &&
671 test_bit(RBIO_CACHE_BIT, &cur->flags) &&
672 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
673 list_del_init(&cur->hash_list);
674 atomic_dec(&cur->refs);
675
676 steal_rbio(cur, rbio);
677 cache_drop = cur;
678 spin_unlock(&cur->bio_list_lock);
679
680 goto lockit;
681 }
682
David Woodhouse53b381b2013-01-29 18:40:14 -0500683 /* can we merge into the lock owner? */
684 if (rbio_can_merge(cur, rbio)) {
685 merge_rbio(cur, rbio);
686 spin_unlock(&cur->bio_list_lock);
687 freeit = rbio;
688 ret = 1;
689 goto out;
690 }
691
Chris Mason4ae10b32013-01-31 14:42:09 -0500692
David Woodhouse53b381b2013-01-29 18:40:14 -0500693 /*
694 * we couldn't merge with the running
695 * rbio, see if we can merge with the
696 * pending ones. We don't have to
697 * check for rmw_locked because there
698 * is no way they are inside finish_rmw
699 * right now
700 */
701 list_for_each_entry(pending, &cur->plug_list,
702 plug_list) {
703 if (rbio_can_merge(pending, rbio)) {
704 merge_rbio(pending, rbio);
705 spin_unlock(&cur->bio_list_lock);
706 freeit = rbio;
707 ret = 1;
708 goto out;
709 }
710 }
711
712 /* no merging, put us on the tail of the plug list,
713 * our rbio will be started when the currently
714 * running rbio unlocks
715 */
716 list_add_tail(&rbio->plug_list, &cur->plug_list);
717 spin_unlock(&cur->bio_list_lock);
718 ret = 1;
719 goto out;
720 }
721 }
Chris Mason4ae10b32013-01-31 14:42:09 -0500722lockit:
David Woodhouse53b381b2013-01-29 18:40:14 -0500723 atomic_inc(&rbio->refs);
724 list_add(&rbio->hash_list, &h->hash_list);
725out:
726 spin_unlock_irqrestore(&h->lock, flags);
Chris Mason4ae10b32013-01-31 14:42:09 -0500727 if (cache_drop)
728 remove_rbio_from_cache(cache_drop);
David Woodhouse53b381b2013-01-29 18:40:14 -0500729 if (freeit)
730 __free_raid_bio(freeit);
731 return ret;
732}
733
734/*
735 * called as rmw or parity rebuild is completed. If the plug list has more
736 * rbios waiting for this stripe, the next one on the list will be started
737 */
738static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
739{
740 int bucket;
741 struct btrfs_stripe_hash *h;
742 unsigned long flags;
Chris Mason4ae10b32013-01-31 14:42:09 -0500743 int keep_cache = 0;
David Woodhouse53b381b2013-01-29 18:40:14 -0500744
745 bucket = rbio_bucket(rbio);
746 h = rbio->fs_info->stripe_hash_table->table + bucket;
747
Chris Mason4ae10b32013-01-31 14:42:09 -0500748 if (list_empty(&rbio->plug_list))
749 cache_rbio(rbio);
750
David Woodhouse53b381b2013-01-29 18:40:14 -0500751 spin_lock_irqsave(&h->lock, flags);
752 spin_lock(&rbio->bio_list_lock);
753
754 if (!list_empty(&rbio->hash_list)) {
Chris Mason4ae10b32013-01-31 14:42:09 -0500755 /*
756 * if we're still cached and there is no other IO
757 * to perform, just leave this rbio here for others
758 * to steal from later
759 */
760 if (list_empty(&rbio->plug_list) &&
761 test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
762 keep_cache = 1;
763 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
764 BUG_ON(!bio_list_empty(&rbio->bio_list));
765 goto done;
766 }
David Woodhouse53b381b2013-01-29 18:40:14 -0500767
768 list_del_init(&rbio->hash_list);
769 atomic_dec(&rbio->refs);
770
771 /*
772 * we use the plug list to hold all the rbios
773 * waiting for the chance to lock this stripe.
774 * hand the lock over to one of them.
775 */
776 if (!list_empty(&rbio->plug_list)) {
777 struct btrfs_raid_bio *next;
778 struct list_head *head = rbio->plug_list.next;
779
780 next = list_entry(head, struct btrfs_raid_bio,
781 plug_list);
782
783 list_del_init(&rbio->plug_list);
784
785 list_add(&next->hash_list, &h->hash_list);
786 atomic_inc(&next->refs);
787 spin_unlock(&rbio->bio_list_lock);
788 spin_unlock_irqrestore(&h->lock, flags);
789
Miao Xie1b94b552014-11-06 16:14:21 +0800790 if (next->operation == BTRFS_RBIO_READ_REBUILD)
David Woodhouse53b381b2013-01-29 18:40:14 -0500791 async_read_rebuild(next);
Miao Xie1b94b552014-11-06 16:14:21 +0800792 else if (next->operation == BTRFS_RBIO_WRITE){
Chris Mason4ae10b32013-01-31 14:42:09 -0500793 steal_rbio(rbio, next);
David Woodhouse53b381b2013-01-29 18:40:14 -0500794 async_rmw_stripe(next);
Chris Mason4ae10b32013-01-31 14:42:09 -0500795 }
David Woodhouse53b381b2013-01-29 18:40:14 -0500796
797 goto done_nolock;
David Woodhouse53b381b2013-01-29 18:40:14 -0500798 } else if (waitqueue_active(&h->wait)) {
799 spin_unlock(&rbio->bio_list_lock);
800 spin_unlock_irqrestore(&h->lock, flags);
801 wake_up(&h->wait);
802 goto done_nolock;
803 }
804 }
Chris Mason4ae10b32013-01-31 14:42:09 -0500805done:
David Woodhouse53b381b2013-01-29 18:40:14 -0500806 spin_unlock(&rbio->bio_list_lock);
807 spin_unlock_irqrestore(&h->lock, flags);
808
809done_nolock:
Chris Mason4ae10b32013-01-31 14:42:09 -0500810 if (!keep_cache)
811 remove_rbio_from_cache(rbio);
David Woodhouse53b381b2013-01-29 18:40:14 -0500812}
813
Miao Xieaf8e2d12014-10-23 14:42:50 +0800814static inline void
815__free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need)
816{
817 if (need) {
818 kfree(raid_map);
819 kfree(bbio);
820 }
821}
822
823static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio)
824{
825 __free_bbio_and_raid_map(rbio->bbio, rbio->raid_map,
826 !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags));
827}
828
David Woodhouse53b381b2013-01-29 18:40:14 -0500829static void __free_raid_bio(struct btrfs_raid_bio *rbio)
830{
831 int i;
832
833 WARN_ON(atomic_read(&rbio->refs) < 0);
834 if (!atomic_dec_and_test(&rbio->refs))
835 return;
836
Chris Mason4ae10b32013-01-31 14:42:09 -0500837 WARN_ON(!list_empty(&rbio->stripe_cache));
David Woodhouse53b381b2013-01-29 18:40:14 -0500838 WARN_ON(!list_empty(&rbio->hash_list));
839 WARN_ON(!bio_list_empty(&rbio->bio_list));
840
841 for (i = 0; i < rbio->nr_pages; i++) {
842 if (rbio->stripe_pages[i]) {
843 __free_page(rbio->stripe_pages[i]);
844 rbio->stripe_pages[i] = NULL;
845 }
846 }
Miao Xieaf8e2d12014-10-23 14:42:50 +0800847
848 free_bbio_and_raid_map(rbio);
849
David Woodhouse53b381b2013-01-29 18:40:14 -0500850 kfree(rbio);
851}
852
853static void free_raid_bio(struct btrfs_raid_bio *rbio)
854{
855 unlock_stripe(rbio);
856 __free_raid_bio(rbio);
857}
858
859/*
860 * this frees the rbio and runs through all the bios in the
861 * bio_list and calls end_io on them
862 */
863static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
864{
865 struct bio *cur = bio_list_get(&rbio->bio_list);
866 struct bio *next;
867 free_raid_bio(rbio);
868
869 while (cur) {
870 next = cur->bi_next;
871 cur->bi_next = NULL;
872 if (uptodate)
873 set_bit(BIO_UPTODATE, &cur->bi_flags);
874 bio_endio(cur, err);
875 cur = next;
876 }
877}
878
879/*
880 * end io function used by finish_rmw. When we finally
881 * get here, we've written a full stripe
882 */
883static void raid_write_end_io(struct bio *bio, int err)
884{
885 struct btrfs_raid_bio *rbio = bio->bi_private;
886
887 if (err)
888 fail_bio_stripe(rbio, bio);
889
890 bio_put(bio);
891
Miao Xieb89e1b02014-10-15 11:18:44 +0800892 if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse53b381b2013-01-29 18:40:14 -0500893 return;
894
895 err = 0;
896
897	/* OK, we have written all the stripes we need to. */
Miao Xieb89e1b02014-10-15 11:18:44 +0800898 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
David Woodhouse53b381b2013-01-29 18:40:14 -0500899 err = -EIO;
900
901 rbio_orig_end_io(rbio, err, 0);
902 return;
903}
904
905/*
906 * the read/modify/write code wants to use the original bio for
907 * any pages it included, and then use the rbio for everything
908 * else. This function decides if a given index (stripe number)
909 * and page number in that stripe fall inside the original bio
910 * or the rbio.
911 *
912 * if you set bio_list_only, you'll get a NULL back for any ranges
913 * that are outside the bio_list
914 *
915 * This doesn't take any refs on anything, you get a bare page pointer
916 * and the caller must bump refs as required.
917 *
918 * You must call index_rbio_pages once before you can trust
919 * the answers from this function.
920 */
921static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
922 int index, int pagenr, int bio_list_only)
923{
924 int chunk_page;
925 struct page *p = NULL;
926
927 chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
928
929 spin_lock_irq(&rbio->bio_list_lock);
930 p = rbio->bio_pages[chunk_page];
931 spin_unlock_irq(&rbio->bio_list_lock);
932
933 if (p || bio_list_only)
934 return p;
935
936 return rbio->stripe_pages[chunk_page];
937}
938
939/*
940 * number of pages we need for the entire stripe across all the
941 * drives
942 */
943static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
944{
945 unsigned long nr = stripe_len * nr_stripes;
David Sterbaed6078f2014-06-05 01:59:57 +0200946 return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE);
David Woodhouse53b381b2013-01-29 18:40:14 -0500947}
948
949/*
950 * allocation and initial setup for the btrfs_raid_bio. Note that
951 * this does not allocate any pages for rbio->stripe_pages.
952 */
953static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
954 struct btrfs_bio *bbio, u64 *raid_map,
955 u64 stripe_len)
956{
957 struct btrfs_raid_bio *rbio;
958 int nr_data = 0;
959 int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
960 void *p;
961
962 rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
963 GFP_NOFS);
Miao Xieaf8e2d12014-10-23 14:42:50 +0800964 if (!rbio)
David Woodhouse53b381b2013-01-29 18:40:14 -0500965 return ERR_PTR(-ENOMEM);
David Woodhouse53b381b2013-01-29 18:40:14 -0500966
967 bio_list_init(&rbio->bio_list);
968 INIT_LIST_HEAD(&rbio->plug_list);
969 spin_lock_init(&rbio->bio_list_lock);
Chris Mason4ae10b32013-01-31 14:42:09 -0500970 INIT_LIST_HEAD(&rbio->stripe_cache);
David Woodhouse53b381b2013-01-29 18:40:14 -0500971 INIT_LIST_HEAD(&rbio->hash_list);
972 rbio->bbio = bbio;
973 rbio->raid_map = raid_map;
974 rbio->fs_info = root->fs_info;
975 rbio->stripe_len = stripe_len;
976 rbio->nr_pages = num_pages;
977 rbio->faila = -1;
978 rbio->failb = -1;
979 atomic_set(&rbio->refs, 1);
Miao Xieb89e1b02014-10-15 11:18:44 +0800980 atomic_set(&rbio->error, 0);
981 atomic_set(&rbio->stripes_pending, 0);
David Woodhouse53b381b2013-01-29 18:40:14 -0500982
983 /*
984 * the stripe_pages and bio_pages array point to the extra
985 * memory we allocated past the end of the rbio
986 */
987 p = rbio + 1;
988 rbio->stripe_pages = p;
989 rbio->bio_pages = p + sizeof(struct page *) * num_pages;
990
991 if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
992 nr_data = bbio->num_stripes - 2;
993 else
994 nr_data = bbio->num_stripes - 1;
995
996 rbio->nr_data = nr_data;
997 return rbio;
998}
999
1000/* allocate pages for all the stripes in the bio, including parity */
1001static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
1002{
1003 int i;
1004 struct page *page;
1005
1006 for (i = 0; i < rbio->nr_pages; i++) {
1007 if (rbio->stripe_pages[i])
1008 continue;
1009 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1010 if (!page)
1011 return -ENOMEM;
1012 rbio->stripe_pages[i] = page;
1013 ClearPageUptodate(page);
1014 }
1015 return 0;
1016}
1017
1018/* allocate pages for just the p/q stripes */
1019static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
1020{
1021 int i;
1022 struct page *page;
1023
1024 i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
1025
1026 for (; i < rbio->nr_pages; i++) {
1027 if (rbio->stripe_pages[i])
1028 continue;
1029 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1030 if (!page)
1031 return -ENOMEM;
1032 rbio->stripe_pages[i] = page;
1033 }
1034 return 0;
1035}
1036
1037/*
1038 * add a single page from a specific stripe into our list of bios for IO.
1039 * This will try to merge into existing bios if possible, and returns
1040 * zero if all went well.
1041 */
Eric Sandeen48a3b632013-04-25 20:41:01 +00001042static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1043 struct bio_list *bio_list,
1044 struct page *page,
1045 int stripe_nr,
1046 unsigned long page_index,
1047 unsigned long bio_max_len)
David Woodhouse53b381b2013-01-29 18:40:14 -05001048{
1049 struct bio *last = bio_list->tail;
1050 u64 last_end = 0;
1051 int ret;
1052 struct bio *bio;
1053 struct btrfs_bio_stripe *stripe;
1054 u64 disk_start;
1055
1056 stripe = &rbio->bbio->stripes[stripe_nr];
1057 disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
1058
1059 /* if the device is missing, just fail this stripe */
1060 if (!stripe->dev->bdev)
1061 return fail_rbio_index(rbio, stripe_nr);
1062
1063 /* see if we can add this page onto our existing bio */
1064 if (last) {
Kent Overstreet4f024f32013-10-11 15:44:27 -07001065 last_end = (u64)last->bi_iter.bi_sector << 9;
1066 last_end += last->bi_iter.bi_size;
David Woodhouse53b381b2013-01-29 18:40:14 -05001067
1068 /*
1069 * we can't merge these if they are from different
1070 * devices or if they are not contiguous
1071 */
1072 if (last_end == disk_start && stripe->dev->bdev &&
1073 test_bit(BIO_UPTODATE, &last->bi_flags) &&
1074 last->bi_bdev == stripe->dev->bdev) {
1075 ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
1076 if (ret == PAGE_CACHE_SIZE)
1077 return 0;
1078 }
1079 }
1080
1081 /* put a new bio on the list */
Chris Mason9be33952013-05-17 18:30:14 -04001082 bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
David Woodhouse53b381b2013-01-29 18:40:14 -05001083 if (!bio)
1084 return -ENOMEM;
1085
Kent Overstreet4f024f32013-10-11 15:44:27 -07001086 bio->bi_iter.bi_size = 0;
David Woodhouse53b381b2013-01-29 18:40:14 -05001087 bio->bi_bdev = stripe->dev->bdev;
Kent Overstreet4f024f32013-10-11 15:44:27 -07001088 bio->bi_iter.bi_sector = disk_start >> 9;
David Woodhouse53b381b2013-01-29 18:40:14 -05001089 set_bit(BIO_UPTODATE, &bio->bi_flags);
1090
1091 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
1092 bio_list_add(bio_list, bio);
1093 return 0;
1094}
1095
1096/*
1097 * while we're doing the read/modify/write cycle, we could
1098 * have errors in reading pages off the disk. This checks
1099 * for errors and if we're not able to read the page it'll
1100 * trigger parity reconstruction. The rmw will be finished
1101 * after we've reconstructed the failed stripes
1102 */
1103static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1104{
1105 if (rbio->faila >= 0 || rbio->failb >= 0) {
1106 BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
1107 __raid56_parity_recover(rbio);
1108 } else {
1109 finish_rmw(rbio);
1110 }
1111}
1112
1113/*
1114 * these are just the pages from the rbio array, not from anything
1115 * the FS sent down to us
1116 */
1117static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
1118{
1119 int index;
1120 index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
1121 index += page;
1122 return rbio->stripe_pages[index];
1123}
1124
1125/*
1126 * helper function to walk our bio list and populate the bio_pages array with
1127 * the result. This seems expensive, but it is faster than constantly
1128 * searching through the bio list as we setup the IO in finish_rmw or stripe
1129 * reconstruction.
1130 *
1131 * This must be called before you trust the answers from page_in_rbio
1132 */
1133static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1134{
1135 struct bio *bio;
1136 u64 start;
1137 unsigned long stripe_offset;
1138 unsigned long page_index;
1139 struct page *p;
1140 int i;
1141
1142 spin_lock_irq(&rbio->bio_list_lock);
1143 bio_list_for_each(bio, &rbio->bio_list) {
Kent Overstreet4f024f32013-10-11 15:44:27 -07001144 start = (u64)bio->bi_iter.bi_sector << 9;
David Woodhouse53b381b2013-01-29 18:40:14 -05001145 stripe_offset = start - rbio->raid_map[0];
1146 page_index = stripe_offset >> PAGE_CACHE_SHIFT;
1147
1148 for (i = 0; i < bio->bi_vcnt; i++) {
1149 p = bio->bi_io_vec[i].bv_page;
1150 rbio->bio_pages[page_index + i] = p;
1151 }
1152 }
1153 spin_unlock_irq(&rbio->bio_list_lock);
1154}
1155
1156/*
1157 * this is called from one of two situations. We either
1158 * have a full stripe from the higher layers, or we've read all
1159 * the missing bits off disk.
1160 *
1161 * This will calculate the parity and then send down any
1162 * changed blocks.
1163 */
1164static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1165{
1166 struct btrfs_bio *bbio = rbio->bbio;
1167 void *pointers[bbio->num_stripes];
1168 int stripe_len = rbio->stripe_len;
1169 int nr_data = rbio->nr_data;
1170 int stripe;
1171 int pagenr;
1172 int p_stripe = -1;
1173 int q_stripe = -1;
1174 struct bio_list bio_list;
1175 struct bio *bio;
1176 int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
1177 int ret;
1178
1179 bio_list_init(&bio_list);
1180
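	/* one extra stripe means raid5 (P only), two means raid6 (P and Q) */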
1181 if (bbio->num_stripes - rbio->nr_data == 1) {
1182 p_stripe = bbio->num_stripes - 1;
1183 } else if (bbio->num_stripes - rbio->nr_data == 2) {
1184 p_stripe = bbio->num_stripes - 2;
1185 q_stripe = bbio->num_stripes - 1;
1186 } else {
1187 BUG();
1188 }
1189
1190 /* at this point we either have a full stripe,
1191 * or we've read the full stripe from the drive.
1192 * recalculate the parity and write the new results.
1193 *
1194 * We're not allowed to add any new bios to the
1195 * bio list here, anyone else that wants to
1196 * change this stripe needs to do their own rmw.
1197 */
1198 spin_lock_irq(&rbio->bio_list_lock);
1199 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1200 spin_unlock_irq(&rbio->bio_list_lock);
1201
Miao Xieb89e1b02014-10-15 11:18:44 +08001202 atomic_set(&rbio->error, 0);
David Woodhouse53b381b2013-01-29 18:40:14 -05001203
1204 /*
1205 * now that we've set rmw_locked, run through the
1206 * bio list one last time and map the page pointers
Chris Mason4ae10b32013-01-31 14:42:09 -05001207 *
1208 * We don't cache full rbios because we're assuming
1209 * the higher layers are unlikely to use this area of
1210 * the disk again soon. If they do use it again,
1211 * hopefully they will send another full bio.
David Woodhouse53b381b2013-01-29 18:40:14 -05001212 */
1213 index_rbio_pages(rbio);
Chris Mason4ae10b32013-01-31 14:42:09 -05001214 if (!rbio_is_full(rbio))
1215 cache_rbio_pages(rbio);
1216 else
1217 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
David Woodhouse53b381b2013-01-29 18:40:14 -05001218
1219 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1220 struct page *p;
1221 /* first collect one page from each data stripe */
1222 for (stripe = 0; stripe < nr_data; stripe++) {
1223 p = page_in_rbio(rbio, stripe, pagenr, 0);
1224 pointers[stripe] = kmap(p);
1225 }
1226
1227 /* then add the parity stripe */
1228 p = rbio_pstripe_page(rbio, pagenr);
1229 SetPageUptodate(p);
1230 pointers[stripe++] = kmap(p);
1231
1232 if (q_stripe != -1) {
1233
1234 /*
1235 * raid6, add the qstripe and call the
1236 * library function to fill in our p/q
1237 */
1238 p = rbio_qstripe_page(rbio, pagenr);
1239 SetPageUptodate(p);
1240 pointers[stripe++] = kmap(p);
1241
1242 raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
1243 pointers);
1244 } else {
1245 /* raid5 */
1246 memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
1247 run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
1248 }
1249
1250
1251 for (stripe = 0; stripe < bbio->num_stripes; stripe++)
1252 kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
1253 }
1254
1255 /*
1256 * time to start writing. Make bios for everything from the
1257 * higher layers (the bio_list in our rbio) and our p/q. Ignore
1258 * everything else.
1259 */
1260 for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1261 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1262 struct page *page;
1263 if (stripe < rbio->nr_data) {
1264 page = page_in_rbio(rbio, stripe, pagenr, 1);
1265 if (!page)
1266 continue;
1267 } else {
1268 page = rbio_stripe_page(rbio, stripe, pagenr);
1269 }
1270
1271 ret = rbio_add_io_page(rbio, &bio_list,
1272 page, stripe, pagenr, rbio->stripe_len);
1273 if (ret)
1274 goto cleanup;
1275 }
1276 }
1277
Miao Xieb89e1b02014-10-15 11:18:44 +08001278 atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
1279 BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
David Woodhouse53b381b2013-01-29 18:40:14 -05001280
1281 while (1) {
1282 bio = bio_list_pop(&bio_list);
1283 if (!bio)
1284 break;
1285
1286 bio->bi_private = rbio;
1287 bio->bi_end_io = raid_write_end_io;
1288 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1289 submit_bio(WRITE, bio);
1290 }
1291 return;
1292
1293cleanup:
1294 rbio_orig_end_io(rbio, -EIO, 0);
1295}
1296
1297/*
1298 * helper to find the stripe number for a given bio. Used to figure out which
1299 * stripe has failed. This expects the bio to correspond to a physical disk,
1300 * so it looks up based on physical sector numbers.
1301 */
1302static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1303 struct bio *bio)
1304{
Kent Overstreet4f024f32013-10-11 15:44:27 -07001305 u64 physical = bio->bi_iter.bi_sector;
David Woodhouse53b381b2013-01-29 18:40:14 -05001306 u64 stripe_start;
1307 int i;
1308 struct btrfs_bio_stripe *stripe;
1309
1310 physical <<= 9;
1311
1312 for (i = 0; i < rbio->bbio->num_stripes; i++) {
1313 stripe = &rbio->bbio->stripes[i];
1314 stripe_start = stripe->physical;
1315 if (physical >= stripe_start &&
1316 physical < stripe_start + rbio->stripe_len) {
1317 return i;
1318 }
1319 }
1320 return -1;
1321}
1322
1323/*
1324 * helper to find the stripe number for a given
1325 * bio (before mapping). Used to figure out which stripe has
1326 * failed. This looks up based on logical block numbers.
1327 */
1328static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1329 struct bio *bio)
1330{
Kent Overstreet4f024f32013-10-11 15:44:27 -07001331 u64 logical = bio->bi_iter.bi_sector;
David Woodhouse53b381b2013-01-29 18:40:14 -05001332 u64 stripe_start;
1333 int i;
1334
1335 logical <<= 9;
1336
1337 for (i = 0; i < rbio->nr_data; i++) {
1338 stripe_start = rbio->raid_map[i];
1339 if (logical >= stripe_start &&
1340 logical < stripe_start + rbio->stripe_len) {
1341 return i;
1342 }
1343 }
1344 return -1;
1345}
1346
1347/*
1348 * returns -EIO if we had too many failures
1349 */
1350static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1351{
1352 unsigned long flags;
1353 int ret = 0;
1354
1355 spin_lock_irqsave(&rbio->bio_list_lock, flags);
1356
1357 /* we already know this stripe is bad, move on */
1358 if (rbio->faila == failed || rbio->failb == failed)
1359 goto out;
1360
1361 if (rbio->faila == -1) {
1362 /* first failure on this rbio */
1363 rbio->faila = failed;
Miao Xieb89e1b02014-10-15 11:18:44 +08001364 atomic_inc(&rbio->error);
David Woodhouse53b381b2013-01-29 18:40:14 -05001365 } else if (rbio->failb == -1) {
1366 /* second failure on this rbio */
1367 rbio->failb = failed;
Miao Xieb89e1b02014-10-15 11:18:44 +08001368 atomic_inc(&rbio->error);
David Woodhouse53b381b2013-01-29 18:40:14 -05001369 } else {
1370 ret = -EIO;
1371 }
1372out:
1373 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1374
1375 return ret;
1376}
1377
1378/*
1379 * helper to fail a stripe based on a physical disk
1380 * bio.
1381 */
1382static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1383 struct bio *bio)
1384{
1385 int failed = find_bio_stripe(rbio, bio);
1386
1387 if (failed < 0)
1388 return -EIO;
1389
1390 return fail_rbio_index(rbio, failed);
1391}
1392
1393/*
1394 * this sets each page in the bio uptodate. It should only be used on private
1395 * rbio pages, nothing that comes in from the higher layers
1396 */
1397static void set_bio_pages_uptodate(struct bio *bio)
1398{
1399 int i;
1400 struct page *p;
1401
1402 for (i = 0; i < bio->bi_vcnt; i++) {
1403 p = bio->bi_io_vec[i].bv_page;
1404 SetPageUptodate(p);
1405 }
1406}
1407
1408/*
1409 * end io for the read phase of the rmw cycle. All the bios here are physical
1410 * stripe bios we've read from the disk so we can recalculate the parity of the
1411 * stripe.
1412 *
1413 * This will usually kick off finish_rmw once all the bios are read in, but it
1414 * may trigger parity reconstruction if we had any errors along the way
1415 */
1416static void raid_rmw_end_io(struct bio *bio, int err)
1417{
1418 struct btrfs_raid_bio *rbio = bio->bi_private;
1419
1420 if (err)
1421 fail_bio_stripe(rbio, bio);
1422 else
1423 set_bio_pages_uptodate(bio);
1424
1425 bio_put(bio);
1426
Miao Xieb89e1b02014-10-15 11:18:44 +08001427 if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse53b381b2013-01-29 18:40:14 -05001428 return;
1429
1430 err = 0;
Miao Xieb89e1b02014-10-15 11:18:44 +08001431 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
David Woodhouse53b381b2013-01-29 18:40:14 -05001432 goto cleanup;
1433
1434 /*
1435 * this will normally call finish_rmw to start our write
1436 * but if there are any failed stripes we'll reconstruct
1437 * from parity first
1438 */
1439 validate_rbio_for_rmw(rbio);
1440 return;
1441
1442cleanup:
1443
1444 rbio_orig_end_io(rbio, -EIO, 0);
1445}
1446
1447static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1448{
Liu Bo9e0af232014-08-15 23:36:53 +08001449 btrfs_init_work(&rbio->work, btrfs_rmw_helper,
1450 rmw_work, NULL, NULL);
David Woodhouse53b381b2013-01-29 18:40:14 -05001451
Qu Wenruod05a33a2014-02-28 10:46:11 +08001452 btrfs_queue_work(rbio->fs_info->rmw_workers,
1453 &rbio->work);
David Woodhouse53b381b2013-01-29 18:40:14 -05001454}
1455
1456static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1457{
Liu Bo9e0af232014-08-15 23:36:53 +08001458 btrfs_init_work(&rbio->work, btrfs_rmw_helper,
1459 read_rebuild_work, NULL, NULL);
David Woodhouse53b381b2013-01-29 18:40:14 -05001460
Qu Wenruod05a33a2014-02-28 10:46:11 +08001461 btrfs_queue_work(rbio->fs_info->rmw_workers,
1462 &rbio->work);
David Woodhouse53b381b2013-01-29 18:40:14 -05001463}
1464
1465/*
1466 * the stripe must be locked by the caller. It will
1467 * unlock after all the writes are done
1468 */
1469static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1470{
1471 int bios_to_read = 0;
David Woodhouse53b381b2013-01-29 18:40:14 -05001472 struct bio_list bio_list;
1473 int ret;
David Sterbaed6078f2014-06-05 01:59:57 +02001474 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
David Woodhouse53b381b2013-01-29 18:40:14 -05001475 int pagenr;
1476 int stripe;
1477 struct bio *bio;
1478
1479 bio_list_init(&bio_list);
1480
1481 ret = alloc_rbio_pages(rbio);
1482 if (ret)
1483 goto cleanup;
1484
1485 index_rbio_pages(rbio);
1486
Miao Xieb89e1b02014-10-15 11:18:44 +08001487 atomic_set(&rbio->error, 0);
David Woodhouse53b381b2013-01-29 18:40:14 -05001488 /*
1489 * build a list of bios to read all the missing parts of this
1490 * stripe
1491 */
1492 for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1493 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1494 struct page *page;
1495 /*
1496 * we want to find all the pages missing from
1497 * the rbio and read them from the disk. If
1498 * page_in_rbio finds a page in the bio list
1499 * we don't need to read it off the stripe.
1500 */
1501 page = page_in_rbio(rbio, stripe, pagenr, 1);
1502 if (page)
1503 continue;
1504
1505 page = rbio_stripe_page(rbio, stripe, pagenr);
Chris Mason4ae10b32013-01-31 14:42:09 -05001506 /*
1507 * the bio cache may have handed us an uptodate
1508 * page. If so, be happy and use it
1509 */
1510 if (PageUptodate(page))
1511 continue;
1512
David Woodhouse53b381b2013-01-29 18:40:14 -05001513 ret = rbio_add_io_page(rbio, &bio_list, page,
1514 stripe, pagenr, rbio->stripe_len);
1515 if (ret)
1516 goto cleanup;
1517 }
1518 }
1519
1520 bios_to_read = bio_list_size(&bio_list);
1521 if (!bios_to_read) {
1522 /*
1523 * this can happen if others have merged with
1524 * us, it means there is nothing left to read.
1525 * But if there are missing devices it may not be
1526 * safe to do the full stripe write yet.
1527 */
1528 goto finish;
1529 }
1530
1531 /*
1532 * the bbio may be freed once we submit the last bio. Make sure
1533 * not to touch it after that
1534 */
Miao Xieb89e1b02014-10-15 11:18:44 +08001535 atomic_set(&rbio->stripes_pending, bios_to_read);
David Woodhouse53b381b2013-01-29 18:40:14 -05001536 while (1) {
1537 bio = bio_list_pop(&bio_list);
1538 if (!bio)
1539 break;
1540
1541 bio->bi_private = rbio;
1542 bio->bi_end_io = raid_rmw_end_io;
1543
1544 btrfs_bio_wq_end_io(rbio->fs_info, bio,
1545 BTRFS_WQ_ENDIO_RAID56);
1546
1547 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1548 submit_bio(READ, bio);
1549 }
1550 /* the actual write will happen once the reads are done */
1551 return 0;
1552
1553cleanup:
1554 rbio_orig_end_io(rbio, -EIO, 0);
1555 return -EIO;
1556
1557finish:
1558 validate_rbio_for_rmw(rbio);
1559 return 0;
1560}
1561
1562/*
1563 * if the upper layers pass in a full stripe, we thank them by only allocating
1564 * enough pages to hold the parity, and sending it all down quickly.
1565 */
1566static int full_stripe_write(struct btrfs_raid_bio *rbio)
1567{
1568 int ret;
1569
1570 ret = alloc_rbio_parity_pages(rbio);
Miao Xie3cd846d2013-07-22 16:36:57 +08001571 if (ret) {
1572 __free_raid_bio(rbio);
David Woodhouse53b381b2013-01-29 18:40:14 -05001573 return ret;
Miao Xie3cd846d2013-07-22 16:36:57 +08001574 }
David Woodhouse53b381b2013-01-29 18:40:14 -05001575
1576 ret = lock_stripe_add(rbio);
1577 if (ret == 0)
1578 finish_rmw(rbio);
1579 return 0;
1580}
1581
1582/*
1583 * partial stripe writes get handed over to async helpers.
1584 * We're really hoping to merge a few more writes into this
1585 * rbio before calculating new parity
1586 */
1587static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1588{
1589 int ret;
1590
1591 ret = lock_stripe_add(rbio);
1592 if (ret == 0)
1593 async_rmw_stripe(rbio);
1594 return 0;
1595}
1596
1597/*
1598 * sometimes while we're reading from the drive to
1599 * recalculate parity, enough new bios come in to create
1600 * a full stripe. So we do a check here to see if we can
1601 * go directly to finish_rmw
1602 */
1603static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1604{
1605 /* head off into rmw land if we don't have a full stripe */
1606 if (!rbio_is_full(rbio))
1607 return partial_stripe_write(rbio);
1608 return full_stripe_write(rbio);
1609}
1610
1611/*
Chris Mason6ac0f482013-01-31 14:42:28 -05001612 * We use plugging callbacks to collect full stripes.
1613 * Any time we get a partial stripe write while plugged
1614 * we collect it into a list. When the unplug comes down,
1615 * we sort the list by logical block number and merge
1616 * everything we can into the same rbios
1617 */
1618struct btrfs_plug_cb {
1619 struct blk_plug_cb cb;
1620 struct btrfs_fs_info *info;
1621 struct list_head rbio_list;
1622 struct btrfs_work work;
1623};
1624
1625/*
1626 * rbios on the plug list are sorted for easier merging.
1627 */
1628static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
1629{
1630 struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1631 plug_list);
1632 struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1633 plug_list);
Kent Overstreet4f024f32013-10-11 15:44:27 -07001634 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1635 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
Chris Mason6ac0f482013-01-31 14:42:28 -05001636
1637 if (a_sector < b_sector)
1638 return -1;
1639 if (a_sector > b_sector)
1640 return 1;
1641 return 0;
1642}
1643
1644static void run_plug(struct btrfs_plug_cb *plug)
1645{
1646 struct btrfs_raid_bio *cur;
1647 struct btrfs_raid_bio *last = NULL;
1648
1649 /*
1650 * sort our plug list then try to merge
1651 * everything we can in hopes of creating full
1652 * stripes.
1653 */
1654 list_sort(NULL, &plug->rbio_list, plug_cmp);
1655 while (!list_empty(&plug->rbio_list)) {
1656 cur = list_entry(plug->rbio_list.next,
1657 struct btrfs_raid_bio, plug_list);
1658 list_del_init(&cur->plug_list);
1659
1660 if (rbio_is_full(cur)) {
1661 /* we have a full stripe, send it down */
1662 full_stripe_write(cur);
1663 continue;
1664 }
1665 if (last) {
1666 if (rbio_can_merge(last, cur)) {
1667 merge_rbio(last, cur);
1668 __free_raid_bio(cur);
1669 continue;
1670
1671 }
1672 __raid56_parity_write(last);
1673 }
1674 last = cur;
1675 }
1676 if (last) {
1677 __raid56_parity_write(last);
1678 }
1679 kfree(plug);
1680}
1681
1682/*
1683 * if the unplug comes from schedule, we have to push the
1684 * work off to a helper thread
1685 */
1686static void unplug_work(struct btrfs_work *work)
1687{
1688 struct btrfs_plug_cb *plug;
1689 plug = container_of(work, struct btrfs_plug_cb, work);
1690 run_plug(plug);
1691}
1692
1693static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1694{
1695 struct btrfs_plug_cb *plug;
1696 plug = container_of(cb, struct btrfs_plug_cb, cb);
1697
1698 if (from_schedule) {
Liu Bo9e0af232014-08-15 23:36:53 +08001699 btrfs_init_work(&plug->work, btrfs_rmw_helper,
1700 unplug_work, NULL, NULL);
Qu Wenruod05a33a2014-02-28 10:46:11 +08001701 btrfs_queue_work(plug->info->rmw_workers,
1702 &plug->work);
Chris Mason6ac0f482013-01-31 14:42:28 -05001703 return;
1704 }
1705 run_plug(plug);
1706}
1707
1708/*
David Woodhouse53b381b2013-01-29 18:40:14 -05001709 * our main entry point for writes from the rest of the FS.
1710 */
1711int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1712 struct btrfs_bio *bbio, u64 *raid_map,
1713 u64 stripe_len)
1714{
1715 struct btrfs_raid_bio *rbio;
Chris Mason6ac0f482013-01-31 14:42:28 -05001716 struct btrfs_plug_cb *plug = NULL;
1717 struct blk_plug_cb *cb;
David Woodhouse53b381b2013-01-29 18:40:14 -05001718
1719 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001720 if (IS_ERR(rbio)) {
1721 __free_bbio_and_raid_map(bbio, raid_map, 1);
David Woodhouse53b381b2013-01-29 18:40:14 -05001722 return PTR_ERR(rbio);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001723 }
David Woodhouse53b381b2013-01-29 18:40:14 -05001724 bio_list_add(&rbio->bio_list, bio);
Kent Overstreet4f024f32013-10-11 15:44:27 -07001725 rbio->bio_list_bytes = bio->bi_iter.bi_size;
Miao Xie1b94b552014-11-06 16:14:21 +08001726 rbio->operation = BTRFS_RBIO_WRITE;
Chris Mason6ac0f482013-01-31 14:42:28 -05001727
1728 /*
1729 * don't plug on full rbios, just get them out the door
1730 * as quickly as we can
1731 */
1732 if (rbio_is_full(rbio))
1733 return full_stripe_write(rbio);
1734
1735 cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
1736 sizeof(*plug));
1737 if (cb) {
1738 plug = container_of(cb, struct btrfs_plug_cb, cb);
1739 if (!plug->info) {
1740 plug->info = root->fs_info;
1741 INIT_LIST_HEAD(&plug->rbio_list);
1742 }
1743 list_add_tail(&rbio->plug_list, &plug->rbio_list);
1744 } else {
1745 return __raid56_parity_write(rbio);
1746 }
1747 return 0;
David Woodhouse53b381b2013-01-29 18:40:14 -05001748}
1749
1750/*
1751 * all parity reconstruction happens here. We've read in everything
1752 * we can find from the drives and this does the heavy lifting of
1753 * sorting the good from the bad.
1754 */
1755static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1756{
1757 int pagenr, stripe;
1758 void **pointers;
1759 int faila = -1, failb = -1;
David Sterbaed6078f2014-06-05 01:59:57 +02001760 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
David Woodhouse53b381b2013-01-29 18:40:14 -05001761 struct page *page;
1762 int err;
1763 int i;
1764
1765 pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
1766 GFP_NOFS);
1767 if (!pointers) {
1768 err = -ENOMEM;
1769 goto cleanup_io;
1770 }
1771
1772 faila = rbio->faila;
1773 failb = rbio->failb;
1774
Miao Xie1b94b552014-11-06 16:14:21 +08001775 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
David Woodhouse53b381b2013-01-29 18:40:14 -05001776 spin_lock_irq(&rbio->bio_list_lock);
1777 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1778 spin_unlock_irq(&rbio->bio_list_lock);
1779 }
1780
1781 index_rbio_pages(rbio);
1782
1783 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1784 /* setup our array of pointers with pages
1785 * from each stripe
1786 */
1787 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1788 /*
1789 * if we're rebuilding a read, we have to use
1790 * pages from the bio list
1791 */
Miao Xie1b94b552014-11-06 16:14:21 +08001792 if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
David Woodhouse53b381b2013-01-29 18:40:14 -05001793 (stripe == faila || stripe == failb)) {
1794 page = page_in_rbio(rbio, stripe, pagenr, 0);
1795 } else {
1796 page = rbio_stripe_page(rbio, stripe, pagenr);
1797 }
1798 pointers[stripe] = kmap(page);
1799 }
1800
1801 /* all raid6 handling here */
1802 if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
1803 RAID6_Q_STRIPE) {
1804
1805 /*
1806 * single failure, rebuild from parity raid5
1807 * style
1808 */
1809 if (failb < 0) {
1810 if (faila == rbio->nr_data) {
1811 /*
1812 * Just the P stripe has failed, without
1813 * a bad data or Q stripe.
 1814	 * TODO: we should redo the xor here.
1815 */
1816 err = -EIO;
1817 goto cleanup;
1818 }
1819 /*
1820 * a single failure in raid6 is rebuilt
1821 * in the pstripe code below
1822 */
1823 goto pstripe;
1824 }
1825
1826 /* make sure our ps and qs are in order */
1827 if (faila > failb) {
1828 int tmp = failb;
1829 failb = faila;
1830 faila = tmp;
1831 }
1832
 1833	/* if the q stripe has failed, do a pstripe reconstruction
 1834	 * from the xors.
 1835	 * If both the q stripe and the P stripe have failed, we're
 1836	 * here due to a crc mismatch and we can't give the caller
 1837	 * the data they want
 1838	 */
1839 if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
1840 if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
1841 err = -EIO;
1842 goto cleanup;
1843 }
1844 /*
1845 * otherwise we have one bad data stripe and
1846 * a good P stripe. raid5!
1847 */
1848 goto pstripe;
1849 }
1850
1851 if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
1852 raid6_datap_recov(rbio->bbio->num_stripes,
1853 PAGE_SIZE, faila, pointers);
1854 } else {
1855 raid6_2data_recov(rbio->bbio->num_stripes,
1856 PAGE_SIZE, faila, failb,
1857 pointers);
1858 }
1859 } else {
1860 void *p;
1861
1862 /* rebuild from P stripe here (raid5 or raid6) */
1863 BUG_ON(failb != -1);
1864pstripe:
1865 /* Copy parity block into failed block to start with */
1866 memcpy(pointers[faila],
1867 pointers[rbio->nr_data],
1868 PAGE_CACHE_SIZE);
1869
1870 /* rearrange the pointer array */
1871 p = pointers[faila];
1872 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1873 pointers[stripe] = pointers[stripe + 1];
1874 pointers[rbio->nr_data - 1] = p;
1875
1876 /* xor in the rest */
1877 run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
1878 }
 1879	/* if we're doing this rebuild as part of an rmw, go through
 1880	 * and mark all of our private rbio pages in the
 1881	 * failed stripes as uptodate. This way finish_rmw will
 1882	 * know they can be trusted. If this was a read reconstruction,
 1883	 * other endio functions will handle the uptodate bits
 1884	 */
Miao Xie1b94b552014-11-06 16:14:21 +08001885 if (rbio->operation == BTRFS_RBIO_WRITE) {
David Woodhouse53b381b2013-01-29 18:40:14 -05001886 for (i = 0; i < nr_pages; i++) {
1887 if (faila != -1) {
1888 page = rbio_stripe_page(rbio, faila, i);
1889 SetPageUptodate(page);
1890 }
1891 if (failb != -1) {
1892 page = rbio_stripe_page(rbio, failb, i);
1893 SetPageUptodate(page);
1894 }
1895 }
1896 }
1897 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1898 /*
1899 * if we're rebuilding a read, we have to use
1900 * pages from the bio list
1901 */
Miao Xie1b94b552014-11-06 16:14:21 +08001902 if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
David Woodhouse53b381b2013-01-29 18:40:14 -05001903 (stripe == faila || stripe == failb)) {
1904 page = page_in_rbio(rbio, stripe, pagenr, 0);
1905 } else {
1906 page = rbio_stripe_page(rbio, stripe, pagenr);
1907 }
1908 kunmap(page);
1909 }
1910 }
1911
1912 err = 0;
1913cleanup:
1914 kfree(pointers);
1915
1916cleanup_io:
Miao Xie1b94b552014-11-06 16:14:21 +08001917 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
Miao Xieaf8e2d12014-10-23 14:42:50 +08001918 if (err == 0 &&
1919 !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))
Chris Mason4ae10b32013-01-31 14:42:09 -05001920 cache_rbio_pages(rbio);
1921 else
1922 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1923
David Woodhouse53b381b2013-01-29 18:40:14 -05001924 rbio_orig_end_io(rbio, err, err == 0);
1925 } else if (err == 0) {
1926 rbio->faila = -1;
1927 rbio->failb = -1;
1928 finish_rmw(rbio);
1929 } else {
1930 rbio_orig_end_io(rbio, err, 0);
1931 }
1932}
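
/*
 * Stand-alone illustration (hypothetical helper, not used by the code
 * above) of the P-stripe rebuild that __raid_recover_end_io() performs
 * one page at a time: parity is the xor of all the data blocks, so a
 * single missing block is recovered by xoring the parity block with
 * every surviving data block.  The memcpy/pointer-rotation/run_xor()
 * sequence above is the same computation done in place on kmapped pages.
 */
static void example_raid5_rebuild(u8 *missing, u8 **survivors,
				  int nr_survivors, size_t len)
{
	size_t off;
	int s;

	/* survivors[] holds the parity block plus every good data block */
	memcpy(missing, survivors[0], len);
	for (s = 1; s < nr_survivors; s++)
		for (off = 0; off < len; off++)
			missing[off] ^= survivors[s][off];
}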
1933
1934/*
1935 * This is called only for stripes we've read from disk to
1936 * reconstruct the parity.
1937 */
1938static void raid_recover_end_io(struct bio *bio, int err)
1939{
1940 struct btrfs_raid_bio *rbio = bio->bi_private;
1941
1942 /*
 1943	 * we only read stripe pages off the disk, so set them
1944 * up to date if there were no errors
1945 */
1946 if (err)
1947 fail_bio_stripe(rbio, bio);
1948 else
1949 set_bio_pages_uptodate(bio);
1950 bio_put(bio);
1951
Miao Xieb89e1b02014-10-15 11:18:44 +08001952 if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse53b381b2013-01-29 18:40:14 -05001953 return;
1954
Miao Xieb89e1b02014-10-15 11:18:44 +08001955 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
David Woodhouse53b381b2013-01-29 18:40:14 -05001956 rbio_orig_end_io(rbio, -EIO, 0);
1957 else
1958 __raid_recover_end_io(rbio);
1959}
1960
1961/*
1962 * reads everything we need off the disk to reconstruct
1963 * the parity. endio handlers trigger final reconstruction
1964 * when the IO is done.
1965 *
1966 * This is used both for reads from the higher layers and for
 1967	 * parity construction required to finish an rmw cycle.
1968 */
1969static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1970{
1971 int bios_to_read = 0;
1972 struct btrfs_bio *bbio = rbio->bbio;
1973 struct bio_list bio_list;
1974 int ret;
David Sterbaed6078f2014-06-05 01:59:57 +02001975 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
David Woodhouse53b381b2013-01-29 18:40:14 -05001976 int pagenr;
1977 int stripe;
1978 struct bio *bio;
1979
1980 bio_list_init(&bio_list);
1981
1982 ret = alloc_rbio_pages(rbio);
1983 if (ret)
1984 goto cleanup;
1985
Miao Xieb89e1b02014-10-15 11:18:44 +08001986 atomic_set(&rbio->error, 0);
David Woodhouse53b381b2013-01-29 18:40:14 -05001987
1988 /*
Chris Mason4ae10b32013-01-31 14:42:09 -05001989 * read everything that hasn't failed. Thanks to the
1990 * stripe cache, it is possible that some or all of these
1991 * pages are going to be uptodate.
David Woodhouse53b381b2013-01-29 18:40:14 -05001992 */
1993 for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
Liu Bo55883832014-06-24 15:39:16 +08001994 if (rbio->faila == stripe || rbio->failb == stripe) {
Miao Xieb89e1b02014-10-15 11:18:44 +08001995 atomic_inc(&rbio->error);
David Woodhouse53b381b2013-01-29 18:40:14 -05001996 continue;
Liu Bo55883832014-06-24 15:39:16 +08001997 }
David Woodhouse53b381b2013-01-29 18:40:14 -05001998
1999 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
2000 struct page *p;
2001
2002 /*
2003 * the rmw code may have already read this
2004 * page in
2005 */
2006 p = rbio_stripe_page(rbio, stripe, pagenr);
2007 if (PageUptodate(p))
2008 continue;
2009
2010 ret = rbio_add_io_page(rbio, &bio_list,
2011 rbio_stripe_page(rbio, stripe, pagenr),
2012 stripe, pagenr, rbio->stripe_len);
2013 if (ret < 0)
2014 goto cleanup;
2015 }
2016 }
2017
2018 bios_to_read = bio_list_size(&bio_list);
2019 if (!bios_to_read) {
2020 /*
 2021	 * we might have no bios to read just because the pages
 2022	 * were already up to date, or because the devices
 2023	 * were gone.
2024 */
Miao Xieb89e1b02014-10-15 11:18:44 +08002025 if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
David Woodhouse53b381b2013-01-29 18:40:14 -05002026 __raid_recover_end_io(rbio);
2027 goto out;
2028 } else {
2029 goto cleanup;
2030 }
2031 }
2032
2033 /*
2034 * the bbio may be freed once we submit the last bio. Make sure
2035 * not to touch it after that
2036 */
Miao Xieb89e1b02014-10-15 11:18:44 +08002037 atomic_set(&rbio->stripes_pending, bios_to_read);
David Woodhouse53b381b2013-01-29 18:40:14 -05002038 while (1) {
2039 bio = bio_list_pop(&bio_list);
2040 if (!bio)
2041 break;
2042
2043 bio->bi_private = rbio;
2044 bio->bi_end_io = raid_recover_end_io;
2045
2046 btrfs_bio_wq_end_io(rbio->fs_info, bio,
2047 BTRFS_WQ_ENDIO_RAID56);
2048
2049 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
2050 submit_bio(READ, bio);
2051 }
2052out:
2053 return 0;
2054
2055cleanup:
Miao Xie1b94b552014-11-06 16:14:21 +08002056 if (rbio->operation == BTRFS_RBIO_READ_REBUILD)
David Woodhouse53b381b2013-01-29 18:40:14 -05002057 rbio_orig_end_io(rbio, -EIO, 0);
2058 return -EIO;
2059}
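
/*
 * Sketch of the completion counting used above (hypothetical helper,
 * kept separate for illustration): stripes_pending must be set to the
 * full number of bios *before* the first one is submitted, because a
 * bio can complete (and run raid_recover_end_io) before the submit loop
 * finishes.  Counting as we go would let the counter hit zero early and
 * start the rebuild with reads still in flight.
 */
static void example_submit_counted(struct btrfs_raid_bio *rbio,
				   struct bio_list *bios)
{
	struct bio *bio;

	atomic_set(&rbio->stripes_pending, bio_list_size(bios));
	while ((bio = bio_list_pop(bios)) != NULL)
		submit_bio(READ, bio);	/* each end_io drops the count */
}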
2060
2061/*
2062 * the main entry point for reads from the higher layers. This
2063 * is really only called when the normal read path had a failure,
2064 * so we assume the bio they send down corresponds to a failed part
2065 * of the drive.
2066 */
2067int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
2068 struct btrfs_bio *bbio, u64 *raid_map,
Miao Xieaf8e2d12014-10-23 14:42:50 +08002069 u64 stripe_len, int mirror_num, int hold_bbio)
David Woodhouse53b381b2013-01-29 18:40:14 -05002070{
2071 struct btrfs_raid_bio *rbio;
2072 int ret;
2073
2074 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
Miao Xieaf8e2d12014-10-23 14:42:50 +08002075 if (IS_ERR(rbio)) {
2076 __free_bbio_and_raid_map(bbio, raid_map, !hold_bbio);
David Woodhouse53b381b2013-01-29 18:40:14 -05002077 return PTR_ERR(rbio);
Miao Xieaf8e2d12014-10-23 14:42:50 +08002078 }
David Woodhouse53b381b2013-01-29 18:40:14 -05002079
Miao Xieaf8e2d12014-10-23 14:42:50 +08002080 if (hold_bbio)
2081 set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags);
Miao Xie1b94b552014-11-06 16:14:21 +08002082 rbio->operation = BTRFS_RBIO_READ_REBUILD;
David Woodhouse53b381b2013-01-29 18:40:14 -05002083 bio_list_add(&rbio->bio_list, bio);
Kent Overstreet4f024f32013-10-11 15:44:27 -07002084 rbio->bio_list_bytes = bio->bi_iter.bi_size;
David Woodhouse53b381b2013-01-29 18:40:14 -05002085
2086 rbio->faila = find_logical_bio_stripe(rbio, bio);
2087 if (rbio->faila == -1) {
2088 BUG();
Miao Xieaf8e2d12014-10-23 14:42:50 +08002089 __free_bbio_and_raid_map(bbio, raid_map, !hold_bbio);
David Woodhouse53b381b2013-01-29 18:40:14 -05002090 kfree(rbio);
2091 return -EIO;
2092 }
2093
2094 /*
 2095	 * reconstruct from the q stripe if the caller is
 2096	 * asking for mirror 3
2097 */
2098 if (mirror_num == 3)
2099 rbio->failb = bbio->num_stripes - 2;
2100
2101 ret = lock_stripe_add(rbio);
2102
2103 /*
2104 * __raid56_parity_recover will end the bio with
2105 * any errors it hits. We don't want to return
2106 * its error value up the stack because our caller
 2107	 * would end up calling bio_endio again on any
 2108	 * nonzero return value
2109 */
2110 if (ret == 0)
2111 __raid56_parity_recover(rbio);
2112 /*
2113 * our rbio has been added to the list of
2114 * rbios that will be handled after the
 2115	 * current lock owner is done
2116 */
2117 return 0;
2118
2119}
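
/*
 * Hypothetical helper, shown only to spell out the mirror_num convention
 * raid56_parity_recover() relies on above: mirror 3 additionally marks
 * the P slot (the second to last stripe in the bbio, the Q stripe being
 * last) as failed, which forces the rebuild to run through the Q stripe.
 */
static int example_forced_failb(struct btrfs_bio *bbio, int mirror_num)
{
	if (mirror_num == 3)
		return bbio->num_stripes - 2;	/* the P stripe slot */
	return -1;				/* no extra failure forced */
}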
2120
2121static void rmw_work(struct btrfs_work *work)
2122{
2123 struct btrfs_raid_bio *rbio;
2124
2125 rbio = container_of(work, struct btrfs_raid_bio, work);
2126 raid56_rmw_stripe(rbio);
2127}
2128
2129static void read_rebuild_work(struct btrfs_work *work)
2130{
2131 struct btrfs_raid_bio *rbio;
2132
2133 rbio = container_of(work, struct btrfs_raid_bio, work);
2134 __raid56_parity_recover(rbio);
2135}