// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2012 Fusion-io All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"

/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT	1

/*
 * set when this rbio is sitting in the hash, but it is just a cache
 * of past RMW
 */
#define RBIO_CACHE_BIT		2

/*
 * set when it is safe to trust the stripe_pages for caching
 */
#define RBIO_CACHE_READY_BIT	3

#define RBIO_CACHE_SIZE 1024

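/* the type of IO an rbio is carrying out; stored in btrfs_raid_bio::operation */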
enum btrfs_rbio_ops {
	BTRFS_RBIO_WRITE,
	BTRFS_RBIO_READ_REBUILD,
	BTRFS_RBIO_PARITY_SCRUB,
	BTRFS_RBIO_REBUILD_MISSING,
};

struct btrfs_raid_bio {
	struct btrfs_fs_info *fs_info;
	struct btrfs_bio *bbio;

	/* while we're doing rmw on a stripe
	 * we put it into a hash table so we can
	 * lock the stripe and merge more rbios
	 * into it.
	 */
	struct list_head hash_list;

	/*
	 * LRU list for the stripe cache
	 */
	struct list_head stripe_cache;

	/*
	 * for scheduling work in the helper threads
	 */
	struct btrfs_work work;

	/*
	 * bio list and bio_list_lock are used
	 * to add more bios into the stripe
	 * in hopes of avoiding the full rmw
	 */
	struct bio_list bio_list;
	spinlock_t bio_list_lock;

	/* also protected by the bio_list_lock, the
	 * plug list is used by the plugging code
	 * to collect partial bios while plugged. The
	 * stripe locking code also uses it to hand off
	 * the stripe lock to the next pending IO
	 */
	struct list_head plug_list;

	/*
	 * flags that tell us if it is safe to
	 * merge with this bio
	 */
	unsigned long flags;

	/* size of each individual stripe on disk */
	int stripe_len;

	/* number of data stripes (no p/q) */
	int nr_data;

	int real_stripes;

	int stripe_npages;
	/*
	 * set if we're doing a parity rebuild
	 * for a read from higher up, which is handled
	 * differently from a parity rebuild as part of
	 * rmw
	 */
	enum btrfs_rbio_ops operation;

	/* first bad stripe */
	int faila;

	/* second bad stripe (for raid6 use) */
	int failb;

	int scrubp;
	/*
	 * number of pages needed to represent the full
	 * stripe
	 */
	int nr_pages;

	/*
	 * size of all the bios in the bio_list. This
	 * helps us decide if the rbio maps to a full
	 * stripe or not
	 */
	int bio_list_bytes;

	int generic_bio_cnt;

	refcount_t refs;

	atomic_t stripes_pending;

	atomic_t error;
	/*
	 * these are two arrays of pointers. We allocate the
	 * rbio big enough to hold them both and setup their
	 * locations when the rbio is allocated
	 */

	/* pointers to pages that we allocated for
	 * reading/writing stripes directly from the disk (including P/Q)
	 */
	struct page **stripe_pages;

	/*
	 * pointers to the pages in the bio_list. Stored
	 * here for faster lookup
	 */
	struct page **bio_pages;

	/*
	 * bitmap to record which horizontal stripe has data
	 */
	unsigned long *dbitmap;

	/* allocated with real_stripes-many pointers for finish_*() calls */
	void **finish_pointers;

	/* allocated with stripe_npages-many bits for finish_*() calls */
	unsigned long *finish_pbitmap;
};

static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
static void rmw_work(struct btrfs_work *work);
static void read_rebuild_work(struct btrfs_work *work);
static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
static void async_read_rebuild(struct btrfs_raid_bio *rbio);
static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
static void __free_raid_bio(struct btrfs_raid_bio *rbio);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
					 int need_check);
static void async_scrub_parity(struct btrfs_raid_bio *rbio);

/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash_table *x;
	struct btrfs_stripe_hash *cur;
	struct btrfs_stripe_hash *h;
	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
	int i;
	int table_size;

	if (info->stripe_hash_table)
		return 0;

	/*
	 * The table is large, starting with order 4 and can go as high as
	 * order 7 in case lock debugging is turned on.
	 *
	 * Try harder to allocate and fallback to vmalloc to lower the chance
	 * of a failing mount.
	 */
	table_size = sizeof(*table) + sizeof(*h) * num_entries;
	table = kvzalloc(table_size, GFP_KERNEL);
	if (!table)
		return -ENOMEM;

	spin_lock_init(&table->cache_lock);
	INIT_LIST_HEAD(&table->stripe_cache);

	h = table->table;

	for (i = 0; i < num_entries; i++) {
		cur = h + i;
		INIT_LIST_HEAD(&cur->hash_list);
		spin_lock_init(&cur->lock);
	}

	x = cmpxchg(&info->stripe_hash_table, NULL, table);
	if (x)
		kvfree(x);
	return 0;
}

/*
 * caching an rbio means to copy anything from the
 * bio_pages array into the stripe_pages array. We
 * use the page uptodate bit in the stripe cache array
 * to indicate if it has valid data
 *
 * once the caching is done, we set the cache ready
 * bit.
 */
static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	char *s;
	char *d;
	int ret;

	ret = alloc_rbio_pages(rbio);
	if (ret)
		return;

	for (i = 0; i < rbio->nr_pages; i++) {
		if (!rbio->bio_pages[i])
			continue;

		s = kmap(rbio->bio_pages[i]);
		d = kmap(rbio->stripe_pages[i]);

		memcpy(d, s, PAGE_SIZE);

		kunmap(rbio->bio_pages[i]);
		kunmap(rbio->stripe_pages[i]);
		SetPageUptodate(rbio->stripe_pages[i]);
	}
	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}

/*
 * we hash on the first logical address of the stripe
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
	u64 num = rbio->bbio->raid_map[0];

	/*
	 * we shift down quite a bit. We're using byte
	 * addressing, and most of the lower bits are zeros.
	 * This tends to upset hash_64, and it consistently
	 * returns just one or two different values.
	 *
	 * shifting off the lower bits fixes things.
	 */
	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}

/*
 * stealing an rbio means taking all the uptodate pages from the stripe
 * array in the source rbio and putting them into the destination rbio
 */
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
	int i;
	struct page *s;
	struct page *d;

	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
		return;

	for (i = 0; i < dest->nr_pages; i++) {
		s = src->stripe_pages[i];
		if (!s || !PageUptodate(s)) {
			continue;
		}

		d = dest->stripe_pages[i];
		if (d)
			__free_page(d);

		dest->stripe_pages[i] = s;
		src->stripe_pages[i] = NULL;
	}
}

/*
 * merging means we take the bio_list from the victim and
 * splice it into the destination. The victim should
 * be discarded afterwards.
 *
 * must be called with dest->rbio_list_lock held
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
{
	bio_list_merge(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;
	dest->generic_bio_cnt += victim->generic_bio_cnt;
	bio_list_init(&victim->bio_list);
}

/*
 * used to prune items that are in the cache. The caller
 * must hold the hash table lock.
 */
static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash *h;
	int freeit = 0;

	/*
	 * check the bit again under the hash table lock.
	 */
	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->fs_info->stripe_hash_table;
	h = table->table + bucket;

	/* hold the lock for the bucket because we may be
	 * removing it from the hash table
	 */
	spin_lock(&h->lock);

	/*
	 * hold the lock for the bio list because we need
	 * to make sure the bio list is empty
	 */
	spin_lock(&rbio->bio_list_lock);

	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
		list_del_init(&rbio->stripe_cache);
		table->cache_size -= 1;
		freeit = 1;

		/* if the bio list isn't empty, this rbio is
		 * still involved in an IO. We take it out
		 * of the cache list, and drop the ref that
		 * was held for the list.
		 *
		 * If the bio_list was empty, we also remove
		 * the rbio from the hash_table, and drop
		 * the corresponding ref
		 */
		if (bio_list_empty(&rbio->bio_list)) {
			if (!list_empty(&rbio->hash_list)) {
				list_del_init(&rbio->hash_list);
				refcount_dec(&rbio->refs);
				BUG_ON(!list_empty(&rbio->plug_list));
			}
		}
	}

	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

	if (freeit)
		__free_raid_bio(rbio);
}

/*
 * prune a given rbio from the cache
 */
static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;

	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->fs_info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	__remove_rbio_from_cache(rbio);
	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * remove everything in the cache
 */
static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;
	struct btrfs_raid_bio *rbio;

	table = info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	while (!list_empty(&table->stripe_cache)) {
		rbio = list_entry(table->stripe_cache.next,
				  struct btrfs_raid_bio,
				  stripe_cache);
		__remove_rbio_from_cache(rbio);
	}
	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * remove all cached entries and free the hash table
 * used by unmount
 */
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
{
	if (!info->stripe_hash_table)
		return;
	btrfs_clear_rbio_cache(info);
	kvfree(info->stripe_hash_table);
	info->stripe_hash_table = NULL;
}

/*
 * insert an rbio into the stripe cache. It
 * must have already been prepared by calling
 * cache_rbio_pages
 *
 * If this rbio was already cached, it gets
 * moved to the front of the lru.
 *
 * If the size of the rbio cache is too big, we
 * prune an item.
 */
static void cache_rbio(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;

	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
		return;

	table = rbio->fs_info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	spin_lock(&rbio->bio_list_lock);

	/* bump our ref if we were not in the list before */
	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
		refcount_inc(&rbio->refs);

	if (!list_empty(&rbio->stripe_cache)) {
		list_move(&rbio->stripe_cache, &table->stripe_cache);
	} else {
		list_add(&rbio->stripe_cache, &table->stripe_cache);
		table->cache_size += 1;
	}

	spin_unlock(&rbio->bio_list_lock);

	if (table->cache_size > RBIO_CACHE_SIZE) {
		struct btrfs_raid_bio *found;

		found = list_entry(table->stripe_cache.prev,
				   struct btrfs_raid_bio,
				   stripe_cache);

		if (found != rbio)
			__remove_rbio_from_cache(found);
	}

	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * helper function to run the xor_blocks api. It is only
 * able to do MAX_XOR_BLOCKS at a time, so we need to
 * loop through.
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	int xor_src_cnt = 0;
	void *dest = pages[src_cnt];

	while (src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}

/*
 * returns true if the bio list inside this rbio
 * covers an entire stripe (no rmw required).
 * Must be called with the bio list lock held, or
 * at a time when you know it is impossible to add
 * new bios into the list
 */
static int __rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	if (size != rbio->nr_data * rbio->stripe_len)
		ret = 0;

	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
	return ret;
}

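/*
 * same check as __rbio_is_full(), but taking the bio_list_lock for
 * callers that don't already hold it
 */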
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);
	ret = __rbio_is_full(rbio);
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
	return ret;
}

/*
 * returns 1 if it is safe to merge two rbios together.
 * The merging is safe if the two rbios correspond to
 * the same stripe and if they are both going in the same
 * direction (read vs write), and if neither one is
 * locked for final IO
 *
 * The caller is responsible for locking such that
 * rmw_locked is safe to test
 */
static int rbio_can_merge(struct btrfs_raid_bio *last,
			  struct btrfs_raid_bio *cur)
{
	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
		return 0;

	/*
	 * we can't merge with cached rbios, since the
	 * idea is that when we merge the destination
	 * rbio is going to run our IO for us. We can
	 * steal from cached rbios though, other functions
	 * handle that.
	 */
	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
	    test_bit(RBIO_CACHE_BIT, &cur->flags))
		return 0;

	if (last->bbio->raid_map[0] !=
	    cur->bbio->raid_map[0])
		return 0;

	/* we can't merge with different operations */
	if (last->operation != cur->operation)
		return 0;
	/*
	 * A parity scrub rbio needs to read the full stripe from the
	 * drive, check and repair the parity and write the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
		return 0;

	if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
		return 0;

	if (last->operation == BTRFS_RBIO_READ_REBUILD) {
		int fa = last->faila;
		int fb = last->failb;
		int cur_fa = cur->faila;
		int cur_fb = cur->failb;

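		/*
		 * sort the failed stripe indexes so the comparison below
		 * does not depend on the order faila/failb were recorded in
		 */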
		if (last->faila >= last->failb) {
			fa = last->failb;
			fb = last->faila;
		}

		if (cur->faila >= cur->failb) {
			cur_fa = cur->failb;
			cur_fb = cur->faila;
		}

		if (fa != cur_fa || fb != cur_fb)
			return 0;
	}
	return 1;
}

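/* index into the flat stripe_pages array for a given stripe and page number */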
static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
				  int index)
{
	return stripe * rbio->stripe_npages + index;
}

/*
 * these are just the pages from the rbio array, not from anything
 * the FS sent down to us
 */
static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe,
				     int index)
{
	return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
}

/*
 * helper to index into the pstripe
 */
static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
{
	return rbio_stripe_page(rbio, rbio->nr_data, index);
}

/*
 * helper to index into the qstripe, returns null
 * if there is no qstripe
 */
static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
{
	if (rbio->nr_data + 1 == rbio->real_stripes)
		return NULL;
	return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
}

/*
 * The first stripe in the table for a logical address
 * has the lock. rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet. The rbio is given
 * the lock and 0 is returned. The caller must start the IO
 * themselves.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * with the lock owner. The rbio is freed and the IO will
 * start automatically along with the existing rbio. 1 is returned.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list. When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started automatically.
 * 1 is returned
 *
 * If we return 0, the caller still owns the rbio and must continue with
 * IO submission. If we return 1, the caller must assume the rbio has
 * already been freed.
 */
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *pending;
	unsigned long flags;
	struct btrfs_raid_bio *freeit = NULL;
	struct btrfs_raid_bio *cache_drop = NULL;
	int ret = 0;

	spin_lock_irqsave(&h->lock, flags);
	list_for_each_entry(cur, &h->hash_list, hash_list) {
		if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) {
			spin_lock(&cur->bio_list_lock);

			/* can we steal this cached rbio's pages? */
			if (bio_list_empty(&cur->bio_list) &&
			    list_empty(&cur->plug_list) &&
			    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
			    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
				list_del_init(&cur->hash_list);
				refcount_dec(&cur->refs);

				steal_rbio(cur, rbio);
				cache_drop = cur;
				spin_unlock(&cur->bio_list_lock);

				goto lockit;
			}

			/* can we merge into the lock owner? */
			if (rbio_can_merge(cur, rbio)) {
				merge_rbio(cur, rbio);
				spin_unlock(&cur->bio_list_lock);
				freeit = rbio;
				ret = 1;
				goto out;
			}

			/*
			 * we couldn't merge with the running
			 * rbio, see if we can merge with the
			 * pending ones. We don't have to
			 * check for rmw_locked because there
			 * is no way they are inside finish_rmw
			 * right now
			 */
			list_for_each_entry(pending, &cur->plug_list,
					    plug_list) {
				if (rbio_can_merge(pending, rbio)) {
					merge_rbio(pending, rbio);
					spin_unlock(&cur->bio_list_lock);
					freeit = rbio;
					ret = 1;
					goto out;
				}
			}

			/* no merging, put us on the tail of the plug list;
			 * our rbio will be started when the currently
			 * running rbio unlocks
			 */
			list_add_tail(&rbio->plug_list, &cur->plug_list);
			spin_unlock(&cur->bio_list_lock);
			ret = 1;
			goto out;
		}
	}
lockit:
	refcount_inc(&rbio->refs);
	list_add(&rbio->hash_list, &h->hash_list);
out:
	spin_unlock_irqrestore(&h->lock, flags);
	if (cache_drop)
		remove_rbio_from_cache(cache_drop);
	if (freeit)
		__free_raid_bio(freeit);
	return ret;
}

/*
 * called as rmw or parity rebuild is completed. If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started
 */
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
	int bucket;
	struct btrfs_stripe_hash *h;
	unsigned long flags;
	int keep_cache = 0;

	bucket = rbio_bucket(rbio);
	h = rbio->fs_info->stripe_hash_table->table + bucket;

	if (list_empty(&rbio->plug_list))
		cache_rbio(rbio);

	spin_lock_irqsave(&h->lock, flags);
	spin_lock(&rbio->bio_list_lock);

	if (!list_empty(&rbio->hash_list)) {
		/*
		 * if we're still cached and there is no other IO
		 * to perform, just leave this rbio here for others
		 * to steal from later
		 */
		if (list_empty(&rbio->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
			keep_cache = 1;
			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
			BUG_ON(!bio_list_empty(&rbio->bio_list));
			goto done;
		}

		list_del_init(&rbio->hash_list);
		refcount_dec(&rbio->refs);

		/*
		 * we use the plug list to hold all the rbios
		 * waiting for the chance to lock this stripe.
		 * hand the lock over to one of them.
		 */
		if (!list_empty(&rbio->plug_list)) {
			struct btrfs_raid_bio *next;
			struct list_head *head = rbio->plug_list.next;

			next = list_entry(head, struct btrfs_raid_bio,
					  plug_list);

			list_del_init(&rbio->plug_list);

			list_add(&next->hash_list, &h->hash_list);
			refcount_inc(&next->refs);
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock_irqrestore(&h->lock, flags);

			if (next->operation == BTRFS_RBIO_READ_REBUILD)
				async_read_rebuild(next);
			else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
				steal_rbio(rbio, next);
				async_read_rebuild(next);
			} else if (next->operation == BTRFS_RBIO_WRITE) {
				steal_rbio(rbio, next);
				async_rmw_stripe(next);
			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				steal_rbio(rbio, next);
				async_scrub_parity(next);
			}

			goto done_nolock;
		}
	}
done:
	spin_unlock(&rbio->bio_list_lock);
	spin_unlock_irqrestore(&h->lock, flags);

done_nolock:
	if (!keep_cache)
		remove_rbio_from_cache(rbio);
}

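/*
 * drop a reference on the rbio; once the last reference is gone, free
 * its stripe pages, its bbio and the rbio itself
 */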
static void __free_raid_bio(struct btrfs_raid_bio *rbio)
{
	int i;

	if (!refcount_dec_and_test(&rbio->refs))
		return;

	WARN_ON(!list_empty(&rbio->stripe_cache));
	WARN_ON(!list_empty(&rbio->hash_list));
	WARN_ON(!bio_list_empty(&rbio->bio_list));

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i]) {
			__free_page(rbio->stripe_pages[i]);
			rbio->stripe_pages[i] = NULL;
		}
	}

	btrfs_put_bbio(rbio->bbio);
	kfree(rbio);
}

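/*
 * walk a chain of bios linked through bi_next and end each one with
 * the given status
 */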
static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
{
	struct bio *next;

	while (cur) {
		next = cur->bi_next;
		cur->bi_next = NULL;
		cur->bi_status = err;
		bio_endio(cur);
		cur = next;
	}
}

/*
 * this frees the rbio and runs through all the bios in the
 * bio_list and calls end_io on them
 */
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
{
	struct bio *cur = bio_list_get(&rbio->bio_list);
	struct bio *extra;

	if (rbio->generic_bio_cnt)
		btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);

	/*
	 * At this moment, rbio->bio_list is empty, however since rbio does not
	 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
	 * hash list, rbio may be merged with others so that rbio->bio_list
	 * becomes non-empty.
	 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
	 * more and we can call bio_endio() on all queued bios.
	 */
	unlock_stripe(rbio);
	extra = bio_list_get(&rbio->bio_list);
	__free_raid_bio(rbio);

	rbio_endio_bio_list(cur, err);
	if (extra)
		rbio_endio_bio_list(extra, err);
}

/*
 * end io function used by finish_rmw. When we finally
 * get here, we've written a full stripe
 */
static void raid_write_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;
	blk_status_t err = bio->bi_status;
	int max_errors;

	if (err)
		fail_bio_stripe(rbio, bio);

	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	err = BLK_STS_OK;

	/* OK, we have finished writing all the stripes we need to. */
	max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
		     0 : rbio->bbio->max_errors;
	if (atomic_read(&rbio->error) > max_errors)
		err = BLK_STS_IOERR;

	rbio_orig_end_io(rbio, err);
}

/*
 * the read/modify/write code wants to use the original bio for
 * any pages it included, and then use the rbio for everything
 * else. This function decides if a given index (stripe number)
 * and page number in that stripe fall inside the original bio
 * or the rbio.
 *
 * if you set bio_list_only, you'll get a NULL back for any ranges
 * that are outside the bio_list
 *
 * This doesn't take any refs on anything, you get a bare page pointer
 * and the caller must bump refs as required.
 *
 * You must call index_rbio_pages once before you can trust
 * the answers from this function.
 */
static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
				 int index, int pagenr, int bio_list_only)
{
	int chunk_page;
	struct page *p = NULL;

	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;

	spin_lock_irq(&rbio->bio_list_lock);
	p = rbio->bio_pages[chunk_page];
	spin_unlock_irq(&rbio->bio_list_lock);

	if (p || bio_list_only)
		return p;

	return rbio->stripe_pages[chunk_page];
}

/*
 * number of pages we need for the entire stripe across all the
 * drives
 */
static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
{
	return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes;
}

/*
 * allocation and initial setup for the btrfs_raid_bio. Note that
 * this does not allocate any pages for rbio->pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
					 struct btrfs_bio *bbio,
					 u64 stripe_len)
{
	struct btrfs_raid_bio *rbio;
	int nr_data = 0;
	int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
	int num_pages = rbio_nr_pages(stripe_len, real_stripes);
	int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
	void *p;

	rbio = kzalloc(sizeof(*rbio) +
		       sizeof(*rbio->stripe_pages) * num_pages +
		       sizeof(*rbio->bio_pages) * num_pages +
		       sizeof(*rbio->finish_pointers) * real_stripes +
		       sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) +
		       sizeof(*rbio->finish_pbitmap) *
				BITS_TO_LONGS(stripe_npages),
		       GFP_NOFS);
	if (!rbio)
		return ERR_PTR(-ENOMEM);

	bio_list_init(&rbio->bio_list);
	INIT_LIST_HEAD(&rbio->plug_list);
	spin_lock_init(&rbio->bio_list_lock);
	INIT_LIST_HEAD(&rbio->stripe_cache);
	INIT_LIST_HEAD(&rbio->hash_list);
	rbio->bbio = bbio;
	rbio->fs_info = fs_info;
	rbio->stripe_len = stripe_len;
	rbio->nr_pages = num_pages;
	rbio->real_stripes = real_stripes;
	rbio->stripe_npages = stripe_npages;
	rbio->faila = -1;
	rbio->failb = -1;
	refcount_set(&rbio->refs, 1);
	atomic_set(&rbio->error, 0);
	atomic_set(&rbio->stripes_pending, 0);

	/*
	 * the stripe_pages, bio_pages, etc arrays point to the extra
	 * memory we allocated past the end of the rbio
	 */
	p = rbio + 1;
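	/* CONSUME_ALLOC() hands out that extra tail memory to each array in turn */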
#define CONSUME_ALLOC(ptr, count)	do {				\
		ptr = p;						\
		p = (unsigned char *)p + sizeof(*(ptr)) * (count);	\
	} while (0)
	CONSUME_ALLOC(rbio->stripe_pages, num_pages);
	CONSUME_ALLOC(rbio->bio_pages, num_pages);
	CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
	CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages));
	CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
#undef CONSUME_ALLOC

	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
		nr_data = real_stripes - 1;
	else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
		nr_data = real_stripes - 2;
	else
		BUG();

	rbio->nr_data = nr_data;
	return rbio;
}

/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	struct page *page;

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i])
			continue;
		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
		if (!page)
			return -ENOMEM;
		rbio->stripe_pages[i] = page;
	}
	return 0;
}

/* only allocate pages for p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	struct page *page;

	i = rbio_stripe_page_index(rbio, rbio->nr_data, 0);

	for (; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i])
			continue;
		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
		if (!page)
			return -ENOMEM;
		rbio->stripe_pages[i] = page;
	}
	return 0;
}

/*
 * add a single page from a specific stripe into our list of bios for IO
 * this will try to merge into existing bios if possible, and returns
 * zero if all went well.
 */
static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
			    struct bio_list *bio_list,
			    struct page *page,
			    int stripe_nr,
			    unsigned long page_index,
			    unsigned long bio_max_len)
{
	struct bio *last = bio_list->tail;
	u64 last_end = 0;
	int ret;
	struct bio *bio;
	struct btrfs_bio_stripe *stripe;
	u64 disk_start;

	stripe = &rbio->bbio->stripes[stripe_nr];
	disk_start = stripe->physical + (page_index << PAGE_SHIFT);

	/* if the device is missing, just fail this stripe */
	if (!stripe->dev->bdev)
		return fail_rbio_index(rbio, stripe_nr);

	/* see if we can add this page onto our existing bio */
	if (last) {
		last_end = (u64)last->bi_iter.bi_sector << 9;
		last_end += last->bi_iter.bi_size;

		/*
		 * we can't merge these if they are from different
		 * devices or if they are not contiguous
		 */
		if (last_end == disk_start && stripe->dev->bdev &&
		    !last->bi_status &&
		    last->bi_disk == stripe->dev->bdev->bd_disk &&
		    last->bi_partno == stripe->dev->bdev->bd_partno) {
			ret = bio_add_page(last, page, PAGE_SIZE, 0);
			if (ret == PAGE_SIZE)
				return 0;
		}
	}

	/* put a new bio on the list */
	bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
	bio->bi_iter.bi_size = 0;
	bio_set_dev(bio, stripe->dev->bdev);
	bio->bi_iter.bi_sector = disk_start >> 9;

	bio_add_page(bio, page, PAGE_SIZE, 0);
	bio_list_add(bio_list, bio);
	return 0;
}

/*
 * while we're doing the read/modify/write cycle, we could
 * have errors in reading pages off the disk. This checks
 * for errors and if we're not able to read the page it'll
 * trigger parity reconstruction. The rmw will be finished
 * after we've reconstructed the failed stripes
 */
static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
{
	if (rbio->faila >= 0 || rbio->failb >= 0) {
		BUG_ON(rbio->faila == rbio->real_stripes - 1);
		__raid56_parity_recover(rbio);
	} else {
		finish_rmw(rbio);
	}
}

/*
 * helper function to walk our bio list and populate the bio_pages array with
 * the result. This seems expensive, but it is faster than constantly
 * searching through the bio list as we setup the IO in finish_rmw or stripe
 * reconstruction.
 *
 * This must be called before you trust the answers from page_in_rbio
 */
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{
	struct bio *bio;
	u64 start;
	unsigned long stripe_offset;
	unsigned long page_index;

	spin_lock_irq(&rbio->bio_list_lock);
	bio_list_for_each(bio, &rbio->bio_list) {
		struct bio_vec bvec;
		struct bvec_iter iter;
		int i = 0;

		start = (u64)bio->bi_iter.bi_sector << 9;
		stripe_offset = start - rbio->bbio->raid_map[0];
		page_index = stripe_offset >> PAGE_SHIFT;

		if (bio_flagged(bio, BIO_CLONED))
			bio->bi_iter = btrfs_io_bio(bio)->iter;

		bio_for_each_segment(bvec, bio, iter) {
			rbio->bio_pages[page_index + i] = bvec.bv_page;
			i++;
		}
	}
	spin_unlock_irq(&rbio->bio_list_lock);
}

/*
 * this is called from one of two situations. We either
 * have a full stripe from the higher layers, or we've read all
 * the missing bits off disk.
 *
 * This will calculate the parity and then send down any
 * changed blocks.
 */
static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
	struct btrfs_bio *bbio = rbio->bbio;
	void **pointers = rbio->finish_pointers;
	int nr_data = rbio->nr_data;
	int stripe;
	int pagenr;
	int p_stripe = -1;
	int q_stripe = -1;
	struct bio_list bio_list;
	struct bio *bio;
	int ret;

	bio_list_init(&bio_list);

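	/*
	 * the last stripe holds P for raid5; the last two hold P and Q
	 * for raid6
	 */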
	if (rbio->real_stripes - rbio->nr_data == 1) {
		p_stripe = rbio->real_stripes - 1;
	} else if (rbio->real_stripes - rbio->nr_data == 2) {
		p_stripe = rbio->real_stripes - 2;
		q_stripe = rbio->real_stripes - 1;
	} else {
		BUG();
	}

	/* at this point we either have a full stripe,
	 * or we've read the full stripe from the drive.
	 * recalculate the parity and write the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	spin_lock_irq(&rbio->bio_list_lock);
	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
	spin_unlock_irq(&rbio->bio_list_lock);

	atomic_set(&rbio->error, 0);

	/*
	 * now that we've set rmw_locked, run through the
	 * bio list one last time and map the page pointers
	 *
	 * We don't cache full rbios because we're assuming
	 * the higher layers are unlikely to use this area of
	 * the disk again soon. If they do use it again,
	 * hopefully they will send another full bio.
	 */
	index_rbio_pages(rbio);
	if (!rbio_is_full(rbio))
		cache_rbio_pages(rbio);
	else
		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
		struct page *p;
		/* first collect one page from each data stripe */
		for (stripe = 0; stripe < nr_data; stripe++) {
			p = page_in_rbio(rbio, stripe, pagenr, 0);
			pointers[stripe] = kmap(p);
		}

		/* then add the parity stripe */
		p = rbio_pstripe_page(rbio, pagenr);
		SetPageUptodate(p);
		pointers[stripe++] = kmap(p);

		if (q_stripe != -1) {

			/*
			 * raid6, add the qstripe and call the
			 * library function to fill in our p/q
			 */
			p = rbio_qstripe_page(rbio, pagenr);
			SetPageUptodate(p);
			pointers[stripe++] = kmap(p);

			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
						pointers);
		} else {
			/* raid5 */
			memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
			run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
		}

		for (stripe = 0; stripe < rbio->real_stripes; stripe++)
			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
	}

	/*
	 * time to start writing. Make bios for everything from the
	 * higher layers (the bio_list in our rbio) and our p/q. Ignore
	 * everything else.
	 */
	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
			struct page *page;
			if (stripe < rbio->nr_data) {
				page = page_in_rbio(rbio, stripe, pagenr, 1);
				if (!page)
					continue;
			} else {
				page = rbio_stripe_page(rbio, stripe, pagenr);
			}

			ret = rbio_add_io_page(rbio, &bio_list,
				       page, stripe, pagenr, rbio->stripe_len);
			if (ret)
				goto cleanup;
		}
	}

	if (likely(!bbio->num_tgtdevs))
		goto write_data;

	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
		if (!bbio->tgtdev_map[stripe])
			continue;

		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
			struct page *page;
			if (stripe < rbio->nr_data) {
				page = page_in_rbio(rbio, stripe, pagenr, 1);
				if (!page)
					continue;
			} else {
				page = rbio_stripe_page(rbio, stripe, pagenr);
			}

			ret = rbio_add_io_page(rbio, &bio_list, page,
					       rbio->bbio->tgtdev_map[stripe],
					       pagenr, rbio->stripe_len);
			if (ret)
				goto cleanup;
		}
	}

write_data:
	atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
	BUG_ON(atomic_read(&rbio->stripes_pending) == 0);

	while (1) {
		bio = bio_list_pop(&bio_list);
		if (!bio)
			break;

		bio->bi_private = rbio;
		bio->bi_end_io = raid_write_end_io;
		bio_set_op_attrs(bio, REQ_OP_WRITE, 0);

		submit_bio(bio);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);
}

/*
 * helper to find the stripe number for a given bio. Used to figure out which
 * stripe has failed. This expects the bio to correspond to a physical disk,
 * so it looks up based on physical sector numbers.
 */
static int find_bio_stripe(struct btrfs_raid_bio *rbio,
			   struct bio *bio)
{
	u64 physical = bio->bi_iter.bi_sector;
	u64 stripe_start;
	int i;
	struct btrfs_bio_stripe *stripe;

	physical <<= 9;

	for (i = 0; i < rbio->bbio->num_stripes; i++) {
		stripe = &rbio->bbio->stripes[i];
		stripe_start = stripe->physical;
		if (physical >= stripe_start &&
		    physical < stripe_start + rbio->stripe_len &&
		    stripe->dev->bdev &&
		    bio->bi_disk == stripe->dev->bdev->bd_disk &&
		    bio->bi_partno == stripe->dev->bdev->bd_partno) {
			return i;
		}
	}
	return -1;
}

/*
 * helper to find the stripe number for a given
 * bio (before mapping). Used to figure out which stripe has
 * failed. This looks up based on logical block numbers.
 */
static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
				   struct bio *bio)
{
	u64 logical = bio->bi_iter.bi_sector;
	u64 stripe_start;
	int i;

	logical <<= 9;

	for (i = 0; i < rbio->nr_data; i++) {
		stripe_start = rbio->bbio->raid_map[i];
		if (logical >= stripe_start &&
		    logical < stripe_start + rbio->stripe_len) {
			return i;
		}
	}
	return -1;
}

/*
 * returns -EIO if we had too many failures
 */
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
{
	unsigned long flags;
	int ret = 0;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);

	/* we already know this stripe is bad, move on */
	if (rbio->faila == failed || rbio->failb == failed)
		goto out;

	if (rbio->faila == -1) {
		/* first failure on this rbio */
		rbio->faila = failed;
		atomic_inc(&rbio->error);
	} else if (rbio->failb == -1) {
		/* second failure on this rbio */
		rbio->failb = failed;
		atomic_inc(&rbio->error);
	} else {
		ret = -EIO;
	}
out:
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);

	return ret;
}

/*
 * helper to fail a stripe based on a physical disk
 * bio.
 */
static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
			   struct bio *bio)
{
	int failed = find_bio_stripe(rbio, bio);

	if (failed < 0)
		return -EIO;

	return fail_rbio_index(rbio, failed);
}

/*
 * this sets each page in the bio uptodate. It should only be used on private
 * rbio pages, nothing that comes in from the higher layers
 */
static void set_bio_pages_uptodate(struct bio *bio)
{
	struct bio_vec *bvec;
	int i;

	ASSERT(!bio_flagged(bio, BIO_CLONED));

	bio_for_each_segment_all(bvec, bio, i)
		SetPageUptodate(bvec->bv_page);
}

/*
 * end io for the read phase of the rmw cycle. All the bios here are physical
 * stripe bios we've read from the disk so we can recalculate the parity of the
 * stripe.
 *
 * This will usually kick off finish_rmw once all the bios are read in, but it
 * may trigger parity reconstruction if we had any errors along the way
 */
static void raid_rmw_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	if (bio->bi_status)
		fail_bio_stripe(rbio, bio);
	else
		set_bio_pages_uptodate(bio);

	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
		goto cleanup;

	/*
	 * this will normally call finish_rmw to start our write
	 * but if there are any failed stripes we'll reconstruct
	 * from parity first
	 */
	validate_rbio_for_rmw(rbio);
	return;

cleanup:

	rbio_orig_end_io(rbio, BLK_STS_IOERR);
}

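/* queue this rbio's rmw work on the rmw_workers workqueue */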
static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
{
	btrfs_init_work(&rbio->work, btrfs_rmw_helper, rmw_work, NULL, NULL);
	btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
}

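/* queue the read/rebuild work for this rbio on the rmw_workers workqueue */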
static void async_read_rebuild(struct btrfs_raid_bio *rbio)
{
	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
			read_rebuild_work, NULL, NULL);

	btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
}

1525/*
1526 * the stripe must be locked by the caller. It will
1527 * unlock after all the writes are done
1528 */
1529static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1530{
1531 int bios_to_read = 0;
David Woodhouse53b381b2013-01-29 18:40:14 -05001532 struct bio_list bio_list;
1533 int ret;
David Woodhouse53b381b2013-01-29 18:40:14 -05001534 int pagenr;
1535 int stripe;
1536 struct bio *bio;
1537
1538 bio_list_init(&bio_list);
1539
1540 ret = alloc_rbio_pages(rbio);
1541 if (ret)
1542 goto cleanup;
1543
1544 index_rbio_pages(rbio);
1545
Miao Xieb89e1b02014-10-15 11:18:44 +08001546 atomic_set(&rbio->error, 0);
David Woodhouse53b381b2013-01-29 18:40:14 -05001547 /*
1548 * build a list of bios to read all the missing parts of this
1549 * stripe
1550 */
1551 for (stripe = 0; stripe < rbio->nr_data; stripe++) {
Zhao Lei915e2292015-03-03 20:42:48 +08001552 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
David Woodhouse53b381b2013-01-29 18:40:14 -05001553 struct page *page;
1554 /*
1555 * we want to find all the pages missing from
1556 * the rbio and read them from the disk. If
1557 * page_in_rbio finds a page in the bio list
1558 * we don't need to read it off the stripe.
1559 */
1560 page = page_in_rbio(rbio, stripe, pagenr, 1);
1561 if (page)
1562 continue;
1563
1564 page = rbio_stripe_page(rbio, stripe, pagenr);
Chris Mason4ae10b32013-01-31 14:42:09 -05001565 /*
1566 * the bio cache may have handed us an uptodate
1567 * page. If so, be happy and use it
1568 */
1569 if (PageUptodate(page))
1570 continue;
1571
David Woodhouse53b381b2013-01-29 18:40:14 -05001572 ret = rbio_add_io_page(rbio, &bio_list, page,
1573 stripe, pagenr, rbio->stripe_len);
1574 if (ret)
1575 goto cleanup;
1576 }
1577 }
1578
1579 bios_to_read = bio_list_size(&bio_list);
1580 if (!bios_to_read) {
1581 /*
1582 * this can happen if others have merged with
 1583		 * us; it means there is nothing left to read.
1584 * But if there are missing devices it may not be
1585 * safe to do the full stripe write yet.
1586 */
1587 goto finish;
1588 }
1589
1590 /*
1591 * the bbio may be freed once we submit the last bio. Make sure
1592 * not to touch it after that
1593 */
Miao Xieb89e1b02014-10-15 11:18:44 +08001594 atomic_set(&rbio->stripes_pending, bios_to_read);
David Woodhouse53b381b2013-01-29 18:40:14 -05001595 while (1) {
1596 bio = bio_list_pop(&bio_list);
1597 if (!bio)
1598 break;
1599
1600 bio->bi_private = rbio;
1601 bio->bi_end_io = raid_rmw_end_io;
Mike Christie37226b22016-06-05 14:31:52 -05001602 bio_set_op_attrs(bio, REQ_OP_READ, 0);
David Woodhouse53b381b2013-01-29 18:40:14 -05001603
Jeff Mahoney0b246af2016-06-22 18:54:23 -04001604 btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
David Woodhouse53b381b2013-01-29 18:40:14 -05001605
Mike Christie4e49ea42016-06-05 14:31:41 -05001606 submit_bio(bio);
David Woodhouse53b381b2013-01-29 18:40:14 -05001607 }
1608 /* the actual write will happen once the reads are done */
1609 return 0;
1610
1611cleanup:
Omar Sandoval58efbc92017-08-22 23:45:59 -07001612 rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo785884f2017-09-22 12:11:18 -06001613
1614 while ((bio = bio_list_pop(&bio_list)))
1615 bio_put(bio);
1616
David Woodhouse53b381b2013-01-29 18:40:14 -05001617 return -EIO;
1618
1619finish:
1620 validate_rbio_for_rmw(rbio);
1621 return 0;
1622}
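
/*
 * The read-modify-write above exists because parity covers the whole
 * horizontal stripe.  As a rough example with three data stripes and one
 * parity stripe (RAID5), parity is P = D0 ^ D1 ^ D2; updating a single
 * page of D1 still needs the matching pages of D0 and D2, which are read
 * here unless they are already in the bio list or the stripe cache, so
 * that finish_rmw can recompute P before anything is written to disk.
 */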
1623
1624/*
1625 * if the upper layers pass in a full stripe, we thank them by only allocating
1626 * enough pages to hold the parity, and sending it all down quickly.
1627 */
1628static int full_stripe_write(struct btrfs_raid_bio *rbio)
1629{
1630 int ret;
1631
1632 ret = alloc_rbio_parity_pages(rbio);
Miao Xie3cd846d2013-07-22 16:36:57 +08001633 if (ret) {
1634 __free_raid_bio(rbio);
David Woodhouse53b381b2013-01-29 18:40:14 -05001635 return ret;
Miao Xie3cd846d2013-07-22 16:36:57 +08001636 }
David Woodhouse53b381b2013-01-29 18:40:14 -05001637
1638 ret = lock_stripe_add(rbio);
1639 if (ret == 0)
1640 finish_rmw(rbio);
1641 return 0;
1642}
1643
1644/*
1645 * partial stripe writes get handed over to async helpers.
1646 * We're really hoping to merge a few more writes into this
1647 * rbio before calculating new parity
1648 */
1649static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1650{
1651 int ret;
1652
1653 ret = lock_stripe_add(rbio);
1654 if (ret == 0)
1655 async_rmw_stripe(rbio);
1656 return 0;
1657}
1658
1659/*
 1660	 * sometimes while we are reading from the drive to
 1661	 * recalculate parity, enough new bios come in to create
1662 * a full stripe. So we do a check here to see if we can
1663 * go directly to finish_rmw
1664 */
1665static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1666{
1667 /* head off into rmw land if we don't have a full stripe */
1668 if (!rbio_is_full(rbio))
1669 return partial_stripe_write(rbio);
1670 return full_stripe_write(rbio);
1671}
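
/*
 * In short, the write path above picks one of two routes (a sketch of
 * the existing logic, nothing new):
 *
 *	if (rbio_is_full(rbio))
 *		full_stripe_write(rbio);	(no reads, only parity pages)
 *	else
 *		partial_stripe_write(rbio);	(async RMW, hoping for merges)
 */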
1672
1673/*
Chris Mason6ac0f482013-01-31 14:42:28 -05001674 * We use plugging callbacks to collect full stripes.
1675 * Any time we get a partial stripe write while plugged
1676 * we collect it into a list. When the unplug comes down,
1677 * we sort the list by logical block number and merge
1678 * everything we can into the same rbios
1679 */
1680struct btrfs_plug_cb {
1681 struct blk_plug_cb cb;
1682 struct btrfs_fs_info *info;
1683 struct list_head rbio_list;
1684 struct btrfs_work work;
1685};
1686
1687/*
1688 * rbios on the plug list are sorted for easier merging.
1689 */
1690static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
1691{
1692 struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1693 plug_list);
1694 struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1695 plug_list);
Kent Overstreet4f024f32013-10-11 15:44:27 -07001696 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1697 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
Chris Mason6ac0f482013-01-31 14:42:28 -05001698
1699 if (a_sector < b_sector)
1700 return -1;
1701 if (a_sector > b_sector)
1702 return 1;
1703 return 0;
1704}
1705
1706static void run_plug(struct btrfs_plug_cb *plug)
1707{
1708 struct btrfs_raid_bio *cur;
1709 struct btrfs_raid_bio *last = NULL;
1710
1711 /*
1712 * sort our plug list then try to merge
1713 * everything we can in hopes of creating full
1714 * stripes.
1715 */
1716 list_sort(NULL, &plug->rbio_list, plug_cmp);
1717 while (!list_empty(&plug->rbio_list)) {
1718 cur = list_entry(plug->rbio_list.next,
1719 struct btrfs_raid_bio, plug_list);
1720 list_del_init(&cur->plug_list);
1721
1722 if (rbio_is_full(cur)) {
1723 /* we have a full stripe, send it down */
1724 full_stripe_write(cur);
1725 continue;
1726 }
1727 if (last) {
1728 if (rbio_can_merge(last, cur)) {
1729 merge_rbio(last, cur);
1730 __free_raid_bio(cur);
1731 continue;
1732
1733 }
1734 __raid56_parity_write(last);
1735 }
1736 last = cur;
1737 }
1738 if (last) {
1739 __raid56_parity_write(last);
1740 }
1741 kfree(plug);
1742}
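
/*
 * A worked example of the plugging above: if the plug collected rbios
 * whose first bios start at sectors 4096, 0 and 2048, plug_cmp sorts them
 * to 0, 2048, 4096 so rbios that belong to the same full stripe end up
 * next to each other; rbio_can_merge()/merge_rbio() then fold neighbours
 * together, anything that becomes a full stripe is written immediately,
 * and the rest goes through the normal partial-stripe RMW path.
 */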
1743
1744/*
1745 * if the unplug comes from schedule, we have to push the
1746 * work off to a helper thread
1747 */
1748static void unplug_work(struct btrfs_work *work)
1749{
1750 struct btrfs_plug_cb *plug;
1751 plug = container_of(work, struct btrfs_plug_cb, work);
1752 run_plug(plug);
1753}
1754
1755static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1756{
1757 struct btrfs_plug_cb *plug;
1758 plug = container_of(cb, struct btrfs_plug_cb, cb);
1759
1760 if (from_schedule) {
Liu Bo9e0af232014-08-15 23:36:53 +08001761 btrfs_init_work(&plug->work, btrfs_rmw_helper,
1762 unplug_work, NULL, NULL);
Qu Wenruod05a33a2014-02-28 10:46:11 +08001763 btrfs_queue_work(plug->info->rmw_workers,
1764 &plug->work);
Chris Mason6ac0f482013-01-31 14:42:28 -05001765 return;
1766 }
1767 run_plug(plug);
1768}
1769
1770/*
David Woodhouse53b381b2013-01-29 18:40:14 -05001771 * our main entry point for writes from the rest of the FS.
1772 */
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04001773int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
Zhao Lei8e5cfb52015-01-20 15:11:33 +08001774 struct btrfs_bio *bbio, u64 stripe_len)
David Woodhouse53b381b2013-01-29 18:40:14 -05001775{
1776 struct btrfs_raid_bio *rbio;
Chris Mason6ac0f482013-01-31 14:42:28 -05001777 struct btrfs_plug_cb *plug = NULL;
1778 struct blk_plug_cb *cb;
Miao Xie42452152014-11-25 16:39:28 +08001779 int ret;
David Woodhouse53b381b2013-01-29 18:40:14 -05001780
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04001781 rbio = alloc_rbio(fs_info, bbio, stripe_len);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001782 if (IS_ERR(rbio)) {
Zhao Lei6e9606d2015-01-20 15:11:34 +08001783 btrfs_put_bbio(bbio);
David Woodhouse53b381b2013-01-29 18:40:14 -05001784 return PTR_ERR(rbio);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001785 }
David Woodhouse53b381b2013-01-29 18:40:14 -05001786 bio_list_add(&rbio->bio_list, bio);
Kent Overstreet4f024f32013-10-11 15:44:27 -07001787 rbio->bio_list_bytes = bio->bi_iter.bi_size;
Miao Xie1b94b552014-11-06 16:14:21 +08001788 rbio->operation = BTRFS_RBIO_WRITE;
Chris Mason6ac0f482013-01-31 14:42:28 -05001789
Jeff Mahoney0b246af2016-06-22 18:54:23 -04001790 btrfs_bio_counter_inc_noblocked(fs_info);
Miao Xie42452152014-11-25 16:39:28 +08001791 rbio->generic_bio_cnt = 1;
1792
Chris Mason6ac0f482013-01-31 14:42:28 -05001793 /*
1794 * don't plug on full rbios, just get them out the door
1795 * as quickly as we can
1796 */
Miao Xie42452152014-11-25 16:39:28 +08001797 if (rbio_is_full(rbio)) {
1798 ret = full_stripe_write(rbio);
1799 if (ret)
Jeff Mahoney0b246af2016-06-22 18:54:23 -04001800 btrfs_bio_counter_dec(fs_info);
Miao Xie42452152014-11-25 16:39:28 +08001801 return ret;
1802 }
Chris Mason6ac0f482013-01-31 14:42:28 -05001803
Jeff Mahoney0b246af2016-06-22 18:54:23 -04001804 cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
Chris Mason6ac0f482013-01-31 14:42:28 -05001805 if (cb) {
1806 plug = container_of(cb, struct btrfs_plug_cb, cb);
1807 if (!plug->info) {
Jeff Mahoney0b246af2016-06-22 18:54:23 -04001808 plug->info = fs_info;
Chris Mason6ac0f482013-01-31 14:42:28 -05001809 INIT_LIST_HEAD(&plug->rbio_list);
1810 }
1811 list_add_tail(&rbio->plug_list, &plug->rbio_list);
Miao Xie42452152014-11-25 16:39:28 +08001812 ret = 0;
Chris Mason6ac0f482013-01-31 14:42:28 -05001813 } else {
Miao Xie42452152014-11-25 16:39:28 +08001814 ret = __raid56_parity_write(rbio);
1815 if (ret)
Jeff Mahoney0b246af2016-06-22 18:54:23 -04001816 btrfs_bio_counter_dec(fs_info);
Chris Mason6ac0f482013-01-31 14:42:28 -05001817 }
Miao Xie42452152014-11-25 16:39:28 +08001818 return ret;
David Woodhouse53b381b2013-01-29 18:40:14 -05001819}
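
/*
 * Summing up the entry point above (a restatement, not new behaviour):
 * a full rbio is written straight away, a partial rbio submitted under a
 * block plug is parked on the plug's rbio_list to be merged at unplug
 * time, and a partial rbio without a plug goes directly through
 * __raid56_parity_write(), which either starts the RMW or queues the
 * rbio behind whoever currently holds the stripe lock.
 */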
1820
1821/*
1822 * all parity reconstruction happens here. We've read in everything
1823 * we can find from the drives and this does the heavy lifting of
1824 * sorting the good from the bad.
1825 */
1826static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1827{
1828 int pagenr, stripe;
1829 void **pointers;
1830 int faila = -1, failb = -1;
David Woodhouse53b381b2013-01-29 18:40:14 -05001831 struct page *page;
Omar Sandoval58efbc92017-08-22 23:45:59 -07001832 blk_status_t err;
David Woodhouse53b381b2013-01-29 18:40:14 -05001833 int i;
1834
David Sterba31e818f2015-02-20 18:00:26 +01001835 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
David Woodhouse53b381b2013-01-29 18:40:14 -05001836 if (!pointers) {
Omar Sandoval58efbc92017-08-22 23:45:59 -07001837 err = BLK_STS_RESOURCE;
David Woodhouse53b381b2013-01-29 18:40:14 -05001838 goto cleanup_io;
1839 }
1840
1841 faila = rbio->faila;
1842 failb = rbio->failb;
1843
Omar Sandovalb4ee1782015-06-19 11:52:50 -07001844 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1845 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
David Woodhouse53b381b2013-01-29 18:40:14 -05001846 spin_lock_irq(&rbio->bio_list_lock);
1847 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1848 spin_unlock_irq(&rbio->bio_list_lock);
1849 }
1850
1851 index_rbio_pages(rbio);
1852
Zhao Lei915e2292015-03-03 20:42:48 +08001853 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
Miao Xie5a6ac9e2014-11-06 17:20:58 +08001854 /*
 1855		 * Now we just use the bitmap to mark the horizontal stripes in
1856 * which we have data when doing parity scrub.
1857 */
1858 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1859 !test_bit(pagenr, rbio->dbitmap))
1860 continue;
1861
David Woodhouse53b381b2013-01-29 18:40:14 -05001862 /* setup our array of pointers with pages
1863 * from each stripe
1864 */
Miao Xie2c8cdd62014-11-14 16:06:25 +08001865 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
David Woodhouse53b381b2013-01-29 18:40:14 -05001866 /*
1867 * if we're rebuilding a read, we have to use
1868 * pages from the bio list
1869 */
Omar Sandovalb4ee1782015-06-19 11:52:50 -07001870 if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1871 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
David Woodhouse53b381b2013-01-29 18:40:14 -05001872 (stripe == faila || stripe == failb)) {
1873 page = page_in_rbio(rbio, stripe, pagenr, 0);
1874 } else {
1875 page = rbio_stripe_page(rbio, stripe, pagenr);
1876 }
1877 pointers[stripe] = kmap(page);
1878 }
1879
1880 /* all raid6 handling here */
Zhao Lei10f11902015-01-20 15:11:43 +08001881 if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
David Woodhouse53b381b2013-01-29 18:40:14 -05001882 /*
1883 * single failure, rebuild from parity raid5
1884 * style
1885 */
1886 if (failb < 0) {
1887 if (faila == rbio->nr_data) {
1888 /*
1889 * Just the P stripe has failed, without
1890 * a bad data or Q stripe.
1891 * TODO, we should redo the xor here.
1892 */
Omar Sandoval58efbc92017-08-22 23:45:59 -07001893 err = BLK_STS_IOERR;
David Woodhouse53b381b2013-01-29 18:40:14 -05001894 goto cleanup;
1895 }
1896 /*
1897 * a single failure in raid6 is rebuilt
1898 * in the pstripe code below
1899 */
1900 goto pstripe;
1901 }
1902
1903 /* make sure our ps and qs are in order */
1904 if (faila > failb) {
1905 int tmp = failb;
1906 failb = faila;
1907 faila = tmp;
1908 }
1909
 1910			/* if the q stripe failed, do a pstripe reconstruction
 1911			 * from the xors.
 1912			 * If both the q stripe and the P stripe failed, we're
1913 * here due to a crc mismatch and we can't give them the
1914 * data they want
1915 */
Zhao Lei8e5cfb52015-01-20 15:11:33 +08001916 if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
1917 if (rbio->bbio->raid_map[faila] ==
1918 RAID5_P_STRIPE) {
Omar Sandoval58efbc92017-08-22 23:45:59 -07001919 err = BLK_STS_IOERR;
David Woodhouse53b381b2013-01-29 18:40:14 -05001920 goto cleanup;
1921 }
1922 /*
1923 * otherwise we have one bad data stripe and
1924 * a good P stripe. raid5!
1925 */
1926 goto pstripe;
1927 }
1928
Zhao Lei8e5cfb52015-01-20 15:11:33 +08001929 if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
Miao Xie2c8cdd62014-11-14 16:06:25 +08001930 raid6_datap_recov(rbio->real_stripes,
David Woodhouse53b381b2013-01-29 18:40:14 -05001931 PAGE_SIZE, faila, pointers);
1932 } else {
Miao Xie2c8cdd62014-11-14 16:06:25 +08001933 raid6_2data_recov(rbio->real_stripes,
David Woodhouse53b381b2013-01-29 18:40:14 -05001934 PAGE_SIZE, faila, failb,
1935 pointers);
1936 }
1937 } else {
1938 void *p;
1939
1940 /* rebuild from P stripe here (raid5 or raid6) */
1941 BUG_ON(failb != -1);
1942pstripe:
1943 /* Copy parity block into failed block to start with */
1944 memcpy(pointers[faila],
1945 pointers[rbio->nr_data],
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03001946 PAGE_SIZE);
David Woodhouse53b381b2013-01-29 18:40:14 -05001947
1948 /* rearrange the pointer array */
1949 p = pointers[faila];
1950 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1951 pointers[stripe] = pointers[stripe + 1];
1952 pointers[rbio->nr_data - 1] = p;
1953
1954 /* xor in the rest */
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03001955 run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE);
David Woodhouse53b381b2013-01-29 18:40:14 -05001956 }
1957 /* if we're doing this rebuild as part of an rmw, go through
1958 * and set all of our private rbio pages in the
1959 * failed stripes as uptodate. This way finish_rmw will
1960 * know they can be trusted. If this was a read reconstruction,
1961 * other endio functions will fiddle the uptodate bits
1962 */
Miao Xie1b94b552014-11-06 16:14:21 +08001963 if (rbio->operation == BTRFS_RBIO_WRITE) {
Zhao Lei915e2292015-03-03 20:42:48 +08001964 for (i = 0; i < rbio->stripe_npages; i++) {
David Woodhouse53b381b2013-01-29 18:40:14 -05001965 if (faila != -1) {
1966 page = rbio_stripe_page(rbio, faila, i);
1967 SetPageUptodate(page);
1968 }
1969 if (failb != -1) {
1970 page = rbio_stripe_page(rbio, failb, i);
1971 SetPageUptodate(page);
1972 }
1973 }
1974 }
Miao Xie2c8cdd62014-11-14 16:06:25 +08001975 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
David Woodhouse53b381b2013-01-29 18:40:14 -05001976 /*
1977 * if we're rebuilding a read, we have to use
1978 * pages from the bio list
1979 */
Omar Sandovalb4ee1782015-06-19 11:52:50 -07001980 if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1981 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
David Woodhouse53b381b2013-01-29 18:40:14 -05001982 (stripe == faila || stripe == failb)) {
1983 page = page_in_rbio(rbio, stripe, pagenr, 0);
1984 } else {
1985 page = rbio_stripe_page(rbio, stripe, pagenr);
1986 }
1987 kunmap(page);
1988 }
1989 }
1990
Omar Sandoval58efbc92017-08-22 23:45:59 -07001991 err = BLK_STS_OK;
David Woodhouse53b381b2013-01-29 18:40:14 -05001992cleanup:
1993 kfree(pointers);
1994
1995cleanup_io:
Liu Bo580c6ef2018-03-22 09:20:11 +08001996 /*
1997 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
1998 * valid rbio which is consistent with ondisk content, thus such a
1999 * valid rbio can be cached to avoid further disk reads.
2000 */
2001 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
2002 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
Liu Bo44ac4742018-01-12 18:07:02 -07002003 /*
2004 * - In case of two failures, where rbio->failb != -1:
2005 *
2006 * Do not cache this rbio since the above read reconstruction
2007 * (raid6_datap_recov() or raid6_2data_recov()) may have
2008 * changed some content of stripes which are not identical to
2009 * on-disk content any more, otherwise, a later write/recover
2010 * may steal stripe_pages from this rbio and end up with
2011 * corruptions or rebuild failures.
2012 *
2013 * - In case of single failure, where rbio->failb == -1:
2014 *
2015 * Cache this rbio iff the above read reconstruction is
 2016		 * executed without problems.
2017 */
2018 if (err == BLK_STS_OK && rbio->failb < 0)
Chris Mason4ae10b32013-01-31 14:42:09 -05002019 cache_rbio_pages(rbio);
2020 else
2021 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2022
Christoph Hellwig4246a0b2015-07-20 15:29:37 +02002023 rbio_orig_end_io(rbio, err);
Omar Sandoval58efbc92017-08-22 23:45:59 -07002024 } else if (err == BLK_STS_OK) {
David Woodhouse53b381b2013-01-29 18:40:14 -05002025 rbio->faila = -1;
2026 rbio->failb = -1;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002027
2028 if (rbio->operation == BTRFS_RBIO_WRITE)
2029 finish_rmw(rbio);
2030 else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
2031 finish_parity_scrub(rbio, 0);
2032 else
2033 BUG();
David Woodhouse53b381b2013-01-29 18:40:14 -05002034 } else {
Christoph Hellwig4246a0b2015-07-20 15:29:37 +02002035 rbio_orig_end_io(rbio, err);
David Woodhouse53b381b2013-01-29 18:40:14 -05002036 }
2037}
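
/*
 * The reconstruction above relies on standard RAID5/6 algebra.  A compact
 * example, assuming one parity block P over three data blocks:
 *
 *	P  = D0 ^ D1 ^ D2
 *	D1 = P ^ D0 ^ D2	(single failure, the "pstripe" path)
 *
 * which is exactly what the memcpy of the parity block plus run_xor()
 * over the remaining data pointers computes.  For RAID6 the Q block is a
 * Reed-Solomon syndrome over GF(2^8), so the lib/raid6 helpers
 * raid6_datap_recov() and raid6_2data_recov() are used instead of
 * open-coding that math here.
 */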
2038
2039/*
2040 * This is called only for stripes we've read from disk to
2041 * reconstruct the parity.
2042 */
Christoph Hellwig4246a0b2015-07-20 15:29:37 +02002043static void raid_recover_end_io(struct bio *bio)
David Woodhouse53b381b2013-01-29 18:40:14 -05002044{
2045 struct btrfs_raid_bio *rbio = bio->bi_private;
2046
2047 /*
2048 * we only read stripe pages off the disk, set them
2049 * up to date if there were no errors
2050 */
Christoph Hellwig4e4cbee2017-06-03 09:38:06 +02002051 if (bio->bi_status)
David Woodhouse53b381b2013-01-29 18:40:14 -05002052 fail_bio_stripe(rbio, bio);
2053 else
2054 set_bio_pages_uptodate(bio);
2055 bio_put(bio);
2056
Miao Xieb89e1b02014-10-15 11:18:44 +08002057 if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse53b381b2013-01-29 18:40:14 -05002058 return;
2059
Miao Xieb89e1b02014-10-15 11:18:44 +08002060 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
Omar Sandoval58efbc92017-08-22 23:45:59 -07002061 rbio_orig_end_io(rbio, BLK_STS_IOERR);
David Woodhouse53b381b2013-01-29 18:40:14 -05002062 else
2063 __raid_recover_end_io(rbio);
2064}
2065
2066/*
2067 * reads everything we need off the disk to reconstruct
2068 * the parity. endio handlers trigger final reconstruction
2069 * when the IO is done.
2070 *
2071 * This is used both for reads from the higher layers and for
2072 * parity construction required to finish a rmw cycle.
2073 */
2074static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
2075{
2076 int bios_to_read = 0;
David Woodhouse53b381b2013-01-29 18:40:14 -05002077 struct bio_list bio_list;
2078 int ret;
David Woodhouse53b381b2013-01-29 18:40:14 -05002079 int pagenr;
2080 int stripe;
2081 struct bio *bio;
2082
2083 bio_list_init(&bio_list);
2084
2085 ret = alloc_rbio_pages(rbio);
2086 if (ret)
2087 goto cleanup;
2088
Miao Xieb89e1b02014-10-15 11:18:44 +08002089 atomic_set(&rbio->error, 0);
David Woodhouse53b381b2013-01-29 18:40:14 -05002090
2091 /*
Chris Mason4ae10b32013-01-31 14:42:09 -05002092 * read everything that hasn't failed. Thanks to the
2093 * stripe cache, it is possible that some or all of these
2094 * pages are going to be uptodate.
David Woodhouse53b381b2013-01-29 18:40:14 -05002095 */
Miao Xie2c8cdd62014-11-14 16:06:25 +08002096 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
Liu Bo55883832014-06-24 15:39:16 +08002097 if (rbio->faila == stripe || rbio->failb == stripe) {
Miao Xieb89e1b02014-10-15 11:18:44 +08002098 atomic_inc(&rbio->error);
David Woodhouse53b381b2013-01-29 18:40:14 -05002099 continue;
Liu Bo55883832014-06-24 15:39:16 +08002100 }
David Woodhouse53b381b2013-01-29 18:40:14 -05002101
Zhao Lei915e2292015-03-03 20:42:48 +08002102 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
David Woodhouse53b381b2013-01-29 18:40:14 -05002103 struct page *p;
2104
2105 /*
2106 * the rmw code may have already read this
2107 * page in
2108 */
2109 p = rbio_stripe_page(rbio, stripe, pagenr);
2110 if (PageUptodate(p))
2111 continue;
2112
2113 ret = rbio_add_io_page(rbio, &bio_list,
2114 rbio_stripe_page(rbio, stripe, pagenr),
2115 stripe, pagenr, rbio->stripe_len);
2116 if (ret < 0)
2117 goto cleanup;
2118 }
2119 }
2120
2121 bios_to_read = bio_list_size(&bio_list);
2122 if (!bios_to_read) {
2123 /*
2124 * we might have no bios to read just because the pages
2125 * were up to date, or we might have no bios to read because
2126 * the devices were gone.
2127 */
Miao Xieb89e1b02014-10-15 11:18:44 +08002128 if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
David Woodhouse53b381b2013-01-29 18:40:14 -05002129 __raid_recover_end_io(rbio);
2130 goto out;
2131 } else {
2132 goto cleanup;
2133 }
2134 }
2135
2136 /*
2137 * the bbio may be freed once we submit the last bio. Make sure
2138 * not to touch it after that
2139 */
Miao Xieb89e1b02014-10-15 11:18:44 +08002140 atomic_set(&rbio->stripes_pending, bios_to_read);
David Woodhouse53b381b2013-01-29 18:40:14 -05002141 while (1) {
2142 bio = bio_list_pop(&bio_list);
2143 if (!bio)
2144 break;
2145
2146 bio->bi_private = rbio;
2147 bio->bi_end_io = raid_recover_end_io;
Mike Christie37226b22016-06-05 14:31:52 -05002148 bio_set_op_attrs(bio, REQ_OP_READ, 0);
David Woodhouse53b381b2013-01-29 18:40:14 -05002149
Jeff Mahoney0b246af2016-06-22 18:54:23 -04002150 btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
David Woodhouse53b381b2013-01-29 18:40:14 -05002151
Mike Christie4e49ea42016-06-05 14:31:41 -05002152 submit_bio(bio);
David Woodhouse53b381b2013-01-29 18:40:14 -05002153 }
2154out:
2155 return 0;
2156
2157cleanup:
Omar Sandovalb4ee1782015-06-19 11:52:50 -07002158 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
2159 rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
Omar Sandoval58efbc92017-08-22 23:45:59 -07002160 rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo785884f2017-09-22 12:11:18 -06002161
2162 while ((bio = bio_list_pop(&bio_list)))
2163 bio_put(bio);
2164
David Woodhouse53b381b2013-01-29 18:40:14 -05002165 return -EIO;
2166}
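
/*
 * Note on the read loop above: the stripes recorded in faila/failb are
 * never read at all, they are simply counted as errors up front, so the
 * error vs bbio->max_errors checks (both here and in raid_recover_end_io)
 * cover "device returned an error" and "device was already known bad"
 * with the same arithmetic.
 */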
2167
2168/*
2169 * the main entry point for reads from the higher layers. This
2170 * is really only called when the normal read path had a failure,
2171 * so we assume the bio they send down corresponds to a failed part
2172 * of the drive.
2173 */
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04002174int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
Zhao Lei8e5cfb52015-01-20 15:11:33 +08002175 struct btrfs_bio *bbio, u64 stripe_len,
2176 int mirror_num, int generic_io)
David Woodhouse53b381b2013-01-29 18:40:14 -05002177{
2178 struct btrfs_raid_bio *rbio;
2179 int ret;
2180
Liu Boabad60c2017-03-29 10:54:26 -07002181 if (generic_io) {
2182 ASSERT(bbio->mirror_num == mirror_num);
2183 btrfs_io_bio(bio)->mirror_num = mirror_num;
2184 }
2185
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04002186 rbio = alloc_rbio(fs_info, bbio, stripe_len);
Miao Xieaf8e2d12014-10-23 14:42:50 +08002187 if (IS_ERR(rbio)) {
Zhao Lei6e9606d2015-01-20 15:11:34 +08002188 if (generic_io)
2189 btrfs_put_bbio(bbio);
David Woodhouse53b381b2013-01-29 18:40:14 -05002190 return PTR_ERR(rbio);
Miao Xieaf8e2d12014-10-23 14:42:50 +08002191 }
David Woodhouse53b381b2013-01-29 18:40:14 -05002192
Miao Xie1b94b552014-11-06 16:14:21 +08002193 rbio->operation = BTRFS_RBIO_READ_REBUILD;
David Woodhouse53b381b2013-01-29 18:40:14 -05002194 bio_list_add(&rbio->bio_list, bio);
Kent Overstreet4f024f32013-10-11 15:44:27 -07002195 rbio->bio_list_bytes = bio->bi_iter.bi_size;
David Woodhouse53b381b2013-01-29 18:40:14 -05002196
2197 rbio->faila = find_logical_bio_stripe(rbio, bio);
2198 if (rbio->faila == -1) {
Jeff Mahoney0b246af2016-06-22 18:54:23 -04002199 btrfs_warn(fs_info,
Liu Boe46a28c2016-07-29 10:57:55 -07002200 "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)",
2201 __func__, (u64)bio->bi_iter.bi_sector << 9,
2202 (u64)bio->bi_iter.bi_size, bbio->map_type);
Zhao Lei6e9606d2015-01-20 15:11:34 +08002203 if (generic_io)
2204 btrfs_put_bbio(bbio);
David Woodhouse53b381b2013-01-29 18:40:14 -05002205 kfree(rbio);
2206 return -EIO;
2207 }
2208
Miao Xie42452152014-11-25 16:39:28 +08002209 if (generic_io) {
Jeff Mahoney0b246af2016-06-22 18:54:23 -04002210 btrfs_bio_counter_inc_noblocked(fs_info);
Miao Xie42452152014-11-25 16:39:28 +08002211 rbio->generic_bio_cnt = 1;
2212 } else {
Zhao Lei6e9606d2015-01-20 15:11:34 +08002213 btrfs_get_bbio(bbio);
Miao Xie42452152014-11-25 16:39:28 +08002214 }
2215
David Woodhouse53b381b2013-01-29 18:40:14 -05002216 /*
Liu Bo8810f752018-01-02 13:36:41 -07002217 * Loop retry:
 2218	 * for 'mirror_num == 2', reconstruct from all other stripes.
2219 * for 'mirror_num > 2', select a stripe to fail on every retry.
David Woodhouse53b381b2013-01-29 18:40:14 -05002220 */
Liu Bo8810f752018-01-02 13:36:41 -07002221 if (mirror_num > 2) {
2222 /*
 2223		 * 'mirror_num == 3' is to fail the p stripe and
 2224		 * reconstruct from the q stripe. 'mirror_num > 3' is to
2225 * fail a data stripe and reconstruct from p+q stripe.
2226 */
2227 rbio->failb = rbio->real_stripes - (mirror_num - 1);
2228 ASSERT(rbio->failb > 0);
2229 if (rbio->failb <= rbio->faila)
2230 rbio->failb--;
2231 }
David Woodhouse53b381b2013-01-29 18:40:14 -05002232
2233 ret = lock_stripe_add(rbio);
2234
2235 /*
2236 * __raid56_parity_recover will end the bio with
2237 * any errors it hits. We don't want to return
2238 * its error value up the stack because our caller
2239 * will end up calling bio_endio with any nonzero
2240 * return
2241 */
2242 if (ret == 0)
2243 __raid56_parity_recover(rbio);
2244 /*
2245 * our rbio has been added to the list of
2246 * rbios that will be handled after the
2247 * currently lock owner is done
2248 */
2249 return 0;
2250
2251}
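
/*
 * A worked example of the mirror_num mapping above, assuming a RAID6
 * chunk with two data stripes plus P and Q (real_stripes == 4):
 *
 *	mirror_num == 2: failb stays -1, only rebuild the stripe in faila
 *	mirror_num == 3: failb = 4 - (3 - 1) = 2, i.e. the P stripe
 *	mirror_num == 4: failb = 4 - (4 - 1) = 1, i.e. a data stripe
 *
 * and when failb lands on or before faila it is shifted down by one so
 * the two failure slots do not collide.
 */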
2252
2253static void rmw_work(struct btrfs_work *work)
2254{
2255 struct btrfs_raid_bio *rbio;
2256
2257 rbio = container_of(work, struct btrfs_raid_bio, work);
2258 raid56_rmw_stripe(rbio);
2259}
2260
2261static void read_rebuild_work(struct btrfs_work *work)
2262{
2263 struct btrfs_raid_bio *rbio;
2264
2265 rbio = container_of(work, struct btrfs_raid_bio, work);
2266 __raid56_parity_recover(rbio);
2267}
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002268
2269/*
2270 * The following code is used to scrub/replace the parity stripe
2271 *
Qu Wenruoae6529c2017-03-29 09:33:21 +08002272 * Caller must have already increased bio_counter for getting @bbio.
2273 *
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002274 * Note: We need to make sure all the pages added to the scrub/replace
 2275 * raid bio are correct and are not changed during the scrub/replace,
 2276 * i.e. those pages hold only metadata or file data protected by checksums.
2277 */
2278
2279struct btrfs_raid_bio *
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04002280raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
Zhao Lei8e5cfb52015-01-20 15:11:33 +08002281 struct btrfs_bio *bbio, u64 stripe_len,
2282 struct btrfs_device *scrub_dev,
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002283 unsigned long *dbitmap, int stripe_nsectors)
2284{
2285 struct btrfs_raid_bio *rbio;
2286 int i;
2287
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04002288 rbio = alloc_rbio(fs_info, bbio, stripe_len);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002289 if (IS_ERR(rbio))
2290 return NULL;
2291 bio_list_add(&rbio->bio_list, bio);
2292 /*
2293 * This is a special bio which is used to hold the completion handler
 2294	 * and make the scrub rbio similar to the other types
2295 */
2296 ASSERT(!bio->bi_iter.bi_size);
2297 rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2298
Liu Bo9cd3a7e2017-08-03 13:53:31 -06002299 /*
2300 * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted
2301 * to the end position, so this search can start from the first parity
2302 * stripe.
2303 */
2304 for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002305 if (bbio->stripes[i].dev == scrub_dev) {
2306 rbio->scrubp = i;
2307 break;
2308 }
2309 }
Liu Bo9cd3a7e2017-08-03 13:53:31 -06002310 ASSERT(i < rbio->real_stripes);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002311
2312 /* Now we just support the sectorsize equals to page size */
Jeff Mahoney0b246af2016-06-22 18:54:23 -04002313 ASSERT(fs_info->sectorsize == PAGE_SIZE);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002314 ASSERT(rbio->stripe_npages == stripe_nsectors);
2315 bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
2316
Qu Wenruoae6529c2017-03-29 09:33:21 +08002317 /*
2318 * We have already increased bio_counter when getting bbio, record it
2319 * so we can free it at rbio_orig_end_io().
2320 */
2321 rbio->generic_bio_cnt = 1;
2322
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002323 return rbio;
2324}
2325
Omar Sandovalb4ee1782015-06-19 11:52:50 -07002326/* Used for both parity scrub and missing. */
2327void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
2328 u64 logical)
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002329{
2330 int stripe_offset;
2331 int index;
2332
Zhao Lei8e5cfb52015-01-20 15:11:33 +08002333 ASSERT(logical >= rbio->bbio->raid_map[0]);
2334 ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002335 rbio->stripe_len * rbio->nr_data);
Zhao Lei8e5cfb52015-01-20 15:11:33 +08002336 stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03002337 index = stripe_offset >> PAGE_SHIFT;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002338 rbio->bio_pages[index] = page;
2339}
2340
2341/*
 2342 * We only scrub the parity for the horizontal stripes where we have
 2343 * correct data, so we needn't allocate pages for all the stripes.
2344 */
2345static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2346{
2347 int i;
2348 int bit;
2349 int index;
2350 struct page *page;
2351
2352 for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
Miao Xie2c8cdd62014-11-14 16:06:25 +08002353 for (i = 0; i < rbio->real_stripes; i++) {
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002354 index = i * rbio->stripe_npages + bit;
2355 if (rbio->stripe_pages[index])
2356 continue;
2357
2358 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2359 if (!page)
2360 return -ENOMEM;
2361 rbio->stripe_pages[index] = page;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002362 }
2363 }
2364 return 0;
2365}
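
/*
 * The index math above assumes stripe_pages[] is laid out stripe by
 * stripe: the pages of stripe i occupy slots i * stripe_npages up to
 * (i + 1) * stripe_npages - 1.  For example, with stripe_npages == 16,
 * page 3 of stripe 2 sits at index 2 * 16 + 3 = 35, and it is only
 * allocated when bit 3 of dbitmap says that horizontal stripe actually
 * needs scrubbing.
 */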
2366
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002367static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
2368 int need_check)
2369{
Miao Xie76035972014-11-14 17:45:42 +08002370 struct btrfs_bio *bbio = rbio->bbio;
Kees Cook13890532018-05-29 16:44:59 -07002371 void **pointers = rbio->finish_pointers;
2372 unsigned long *pbitmap = rbio->finish_pbitmap;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002373 int nr_data = rbio->nr_data;
2374 int stripe;
2375 int pagenr;
2376 int p_stripe = -1;
2377 int q_stripe = -1;
2378 struct page *p_page = NULL;
2379 struct page *q_page = NULL;
2380 struct bio_list bio_list;
2381 struct bio *bio;
Miao Xie76035972014-11-14 17:45:42 +08002382 int is_replace = 0;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002383 int ret;
2384
2385 bio_list_init(&bio_list);
2386
Miao Xie2c8cdd62014-11-14 16:06:25 +08002387 if (rbio->real_stripes - rbio->nr_data == 1) {
2388 p_stripe = rbio->real_stripes - 1;
2389 } else if (rbio->real_stripes - rbio->nr_data == 2) {
2390 p_stripe = rbio->real_stripes - 2;
2391 q_stripe = rbio->real_stripes - 1;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002392 } else {
2393 BUG();
2394 }
2395
Miao Xie76035972014-11-14 17:45:42 +08002396 if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
2397 is_replace = 1;
2398 bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
2399 }
2400
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002401 /*
 2402	 * The higher layers (scrubber) are unlikely to use this
 2403	 * area of the disk again soon, so don't cache
 2404	 * it.
2405 */
2406 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2407
2408 if (!need_check)
2409 goto writeback;
2410
2411 p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2412 if (!p_page)
2413 goto cleanup;
2414 SetPageUptodate(p_page);
2415
2416 if (q_stripe != -1) {
2417 q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2418 if (!q_page) {
2419 __free_page(p_page);
2420 goto cleanup;
2421 }
2422 SetPageUptodate(q_page);
2423 }
2424
2425 atomic_set(&rbio->error, 0);
2426
2427 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2428 struct page *p;
2429 void *parity;
2430 /* first collect one page from each data stripe */
2431 for (stripe = 0; stripe < nr_data; stripe++) {
2432 p = page_in_rbio(rbio, stripe, pagenr, 0);
2433 pointers[stripe] = kmap(p);
2434 }
2435
2436 /* then add the parity stripe */
2437 pointers[stripe++] = kmap(p_page);
2438
2439 if (q_stripe != -1) {
2440
2441 /*
2442 * raid6, add the qstripe and call the
2443 * library function to fill in our p/q
2444 */
2445 pointers[stripe++] = kmap(q_page);
2446
Miao Xie2c8cdd62014-11-14 16:06:25 +08002447 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002448 pointers);
2449 } else {
2450 /* raid5 */
2451 memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03002452 run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002453 }
2454
Nicholas D Steeves01327612016-05-19 21:18:45 -04002455 /* Check scrubbing parity and repair it */
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002456 p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2457 parity = kmap(p);
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03002458 if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
2459 memcpy(parity, pointers[rbio->scrubp], PAGE_SIZE);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002460 else
2461 /* Parity is right, needn't writeback */
2462 bitmap_clear(rbio->dbitmap, pagenr, 1);
2463 kunmap(p);
2464
Miao Xie2c8cdd62014-11-14 16:06:25 +08002465 for (stripe = 0; stripe < rbio->real_stripes; stripe++)
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002466 kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
2467 }
2468
2469 __free_page(p_page);
2470 if (q_page)
2471 __free_page(q_page);
2472
2473writeback:
2474 /*
 2475	 * time to start writing. Make bios only for the pages of the parity
 2476	 * stripe we are scrubbing (the bits still set in dbitmap) and, for
 2477	 * dev-replace, for the target device. Ignore everything else.
2478 */
2479 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2480 struct page *page;
2481
2482 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2483 ret = rbio_add_io_page(rbio, &bio_list,
2484 page, rbio->scrubp, pagenr, rbio->stripe_len);
2485 if (ret)
2486 goto cleanup;
2487 }
2488
Miao Xie76035972014-11-14 17:45:42 +08002489 if (!is_replace)
2490 goto submit_write;
2491
2492 for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
2493 struct page *page;
2494
2495 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2496 ret = rbio_add_io_page(rbio, &bio_list, page,
2497 bbio->tgtdev_map[rbio->scrubp],
2498 pagenr, rbio->stripe_len);
2499 if (ret)
2500 goto cleanup;
2501 }
2502
2503submit_write:
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002504 nr_data = bio_list_size(&bio_list);
2505 if (!nr_data) {
2506 /* Every parity is right */
Omar Sandoval58efbc92017-08-22 23:45:59 -07002507 rbio_orig_end_io(rbio, BLK_STS_OK);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002508 return;
2509 }
2510
2511 atomic_set(&rbio->stripes_pending, nr_data);
2512
2513 while (1) {
2514 bio = bio_list_pop(&bio_list);
2515 if (!bio)
2516 break;
2517
2518 bio->bi_private = rbio;
Zhao Leia6111d12016-01-12 17:52:13 +08002519 bio->bi_end_io = raid_write_end_io;
Mike Christie37226b22016-06-05 14:31:52 -05002520 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
Mike Christie4e49ea42016-06-05 14:31:41 -05002521
2522 submit_bio(bio);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002523 }
2524 return;
2525
2526cleanup:
Omar Sandoval58efbc92017-08-22 23:45:59 -07002527 rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo785884f2017-09-22 12:11:18 -06002528
2529 while ((bio = bio_list_pop(&bio_list)))
2530 bio_put(bio);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002531}
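
/*
 * The check step in finish_parity_scrub() boils down to, per page
 * (a sketch of the existing flow, one page at a time):
 *
 *	recompute parity from the data pages into a scratch page
 *	if the scratch page matches the cached on-disk parity page
 *		clear the dbitmap bit		(nothing to write back)
 *	else
 *		copy the recomputed bytes over the cached parity page
 *
 * so only pages whose dbitmap bit survives the loop are written out,
 * to the device being scrubbed and, for dev-replace, to the target
 * device as well.
 */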
2532
2533static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2534{
2535 if (stripe >= 0 && stripe < rbio->nr_data)
2536 return 1;
2537 return 0;
2538}
2539
2540/*
2541 * While we're doing the parity check and repair, we could have errors
2542 * in reading pages off the disk. This checks for errors and if we're
2543 * not able to read the page it'll trigger parity reconstruction. The
2544 * parity scrub will be finished after we've reconstructed the failed
2545 * stripes
2546 */
2547static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
2548{
2549 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
2550 goto cleanup;
2551
2552 if (rbio->faila >= 0 || rbio->failb >= 0) {
2553 int dfail = 0, failp = -1;
2554
2555 if (is_data_stripe(rbio, rbio->faila))
2556 dfail++;
2557 else if (is_parity_stripe(rbio->faila))
2558 failp = rbio->faila;
2559
2560 if (is_data_stripe(rbio, rbio->failb))
2561 dfail++;
2562 else if (is_parity_stripe(rbio->failb))
2563 failp = rbio->failb;
2564
2565 /*
 2566		 * Because we cannot use the parity being scrubbed to repair
 2567		 * the data, our ability to repair is reduced.
 2568		 * (In the case of RAID5, we cannot repair anything.)
2569 */
2570 if (dfail > rbio->bbio->max_errors - 1)
2571 goto cleanup;
2572
2573 /*
 2574		 * If all the data is good and only the parity is bad,
 2575		 * just repair the parity.
2576 */
2577 if (dfail == 0) {
2578 finish_parity_scrub(rbio, 0);
2579 return;
2580 }
2581
2582 /*
 2583		 * Here we have one corrupted data stripe and one
 2584		 * corrupted parity on RAID6. If the corrupted parity
Nicholas D Steeves01327612016-05-19 21:18:45 -04002585		 * is the one being scrubbed, luckily we can use the other one to repair
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002586		 * the data; otherwise we cannot repair the data stripe.
2587 */
2588 if (failp != rbio->scrubp)
2589 goto cleanup;
2590
2591 __raid_recover_end_io(rbio);
2592 } else {
2593 finish_parity_scrub(rbio, 1);
2594 }
2595 return;
2596
2597cleanup:
Omar Sandoval58efbc92017-08-22 23:45:59 -07002598 rbio_orig_end_io(rbio, BLK_STS_IOERR);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002599}
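
/*
 * The branches above, in table form (dfail = number of failed data
 * stripes, failp = index of a failed parity stripe or -1):
 *
 *	dfail > bbio->max_errors - 1	-> give up
 *	dfail == 0			-> only parity is damaged, rewrite it
 *	failp == rbio->scrubp		-> rebuild the data from what is left
 *	anything else			-> give up
 */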
2600
2601/*
 2602 * end io for the read phase of the parity scrub. All the bios here are physical
 2603 * stripe bios we've read from the disk so we can recalculate the parity of the
 2604 * stripe.
 2605 *
 2606 * This will usually kick off finish_parity_scrub once all the bios are read
 2607 * in, but it may trigger parity reconstruction if we had any errors along the way
2608 */
Christoph Hellwig4246a0b2015-07-20 15:29:37 +02002609static void raid56_parity_scrub_end_io(struct bio *bio)
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002610{
2611 struct btrfs_raid_bio *rbio = bio->bi_private;
2612
Christoph Hellwig4e4cbee2017-06-03 09:38:06 +02002613 if (bio->bi_status)
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002614 fail_bio_stripe(rbio, bio);
2615 else
2616 set_bio_pages_uptodate(bio);
2617
2618 bio_put(bio);
2619
2620 if (!atomic_dec_and_test(&rbio->stripes_pending))
2621 return;
2622
2623 /*
2624 * this will normally call finish_rmw to start our write
2625 * but if there are any failed stripes we'll reconstruct
2626 * from parity first
2627 */
2628 validate_rbio_for_parity_scrub(rbio);
2629}
2630
2631static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
2632{
2633 int bios_to_read = 0;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002634 struct bio_list bio_list;
2635 int ret;
2636 int pagenr;
2637 int stripe;
2638 struct bio *bio;
2639
Liu Bo785884f2017-09-22 12:11:18 -06002640 bio_list_init(&bio_list);
2641
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002642 ret = alloc_rbio_essential_pages(rbio);
2643 if (ret)
2644 goto cleanup;
2645
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002646 atomic_set(&rbio->error, 0);
2647 /*
2648 * build a list of bios to read all the missing parts of this
2649 * stripe
2650 */
Miao Xie2c8cdd62014-11-14 16:06:25 +08002651 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002652 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2653 struct page *page;
2654 /*
2655 * we want to find all the pages missing from
2656 * the rbio and read them from the disk. If
2657 * page_in_rbio finds a page in the bio list
2658 * we don't need to read it off the stripe.
2659 */
2660 page = page_in_rbio(rbio, stripe, pagenr, 1);
2661 if (page)
2662 continue;
2663
2664 page = rbio_stripe_page(rbio, stripe, pagenr);
2665 /*
2666 * the bio cache may have handed us an uptodate
2667 * page. If so, be happy and use it
2668 */
2669 if (PageUptodate(page))
2670 continue;
2671
2672 ret = rbio_add_io_page(rbio, &bio_list, page,
2673 stripe, pagenr, rbio->stripe_len);
2674 if (ret)
2675 goto cleanup;
2676 }
2677 }
2678
2679 bios_to_read = bio_list_size(&bio_list);
2680 if (!bios_to_read) {
2681 /*
2682 * this can happen if others have merged with
 2683		 * us; it means there is nothing left to read.
2684 * But if there are missing devices it may not be
2685 * safe to do the full stripe write yet.
2686 */
2687 goto finish;
2688 }
2689
2690 /*
2691 * the bbio may be freed once we submit the last bio. Make sure
2692 * not to touch it after that
2693 */
2694 atomic_set(&rbio->stripes_pending, bios_to_read);
2695 while (1) {
2696 bio = bio_list_pop(&bio_list);
2697 if (!bio)
2698 break;
2699
2700 bio->bi_private = rbio;
2701 bio->bi_end_io = raid56_parity_scrub_end_io;
Mike Christie37226b22016-06-05 14:31:52 -05002702 bio_set_op_attrs(bio, REQ_OP_READ, 0);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002703
Jeff Mahoney0b246af2016-06-22 18:54:23 -04002704 btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002705
Mike Christie4e49ea42016-06-05 14:31:41 -05002706 submit_bio(bio);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002707 }
2708 /* the actual write will happen once the reads are done */
2709 return;
2710
2711cleanup:
Omar Sandoval58efbc92017-08-22 23:45:59 -07002712 rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo785884f2017-09-22 12:11:18 -06002713
2714 while ((bio = bio_list_pop(&bio_list)))
2715 bio_put(bio);
2716
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002717 return;
2718
2719finish:
2720 validate_rbio_for_parity_scrub(rbio);
2721}
2722
2723static void scrub_parity_work(struct btrfs_work *work)
2724{
2725 struct btrfs_raid_bio *rbio;
2726
2727 rbio = container_of(work, struct btrfs_raid_bio, work);
2728 raid56_parity_scrub_stripe(rbio);
2729}
2730
2731static void async_scrub_parity(struct btrfs_raid_bio *rbio)
2732{
2733 btrfs_init_work(&rbio->work, btrfs_rmw_helper,
2734 scrub_parity_work, NULL, NULL);
2735
Jeff Mahoney0b246af2016-06-22 18:54:23 -04002736 btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002737}
2738
2739void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2740{
2741 if (!lock_stripe_add(rbio))
2742 async_scrub_parity(rbio);
2743}
Omar Sandovalb4ee1782015-06-19 11:52:50 -07002744
2745/* The following code is used for dev replace of a missing RAID 5/6 device. */
2746
2747struct btrfs_raid_bio *
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04002748raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
Omar Sandovalb4ee1782015-06-19 11:52:50 -07002749 struct btrfs_bio *bbio, u64 length)
2750{
2751 struct btrfs_raid_bio *rbio;
2752
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04002753 rbio = alloc_rbio(fs_info, bbio, length);
Omar Sandovalb4ee1782015-06-19 11:52:50 -07002754 if (IS_ERR(rbio))
2755 return NULL;
2756
2757 rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
2758 bio_list_add(&rbio->bio_list, bio);
2759 /*
2760 * This is a special bio which is used to hold the completion handler
 2761	 * and make the scrub rbio similar to the other types
2762 */
2763 ASSERT(!bio->bi_iter.bi_size);
2764
2765 rbio->faila = find_logical_bio_stripe(rbio, bio);
2766 if (rbio->faila == -1) {
2767 BUG();
2768 kfree(rbio);
2769 return NULL;
2770 }
2771
Qu Wenruoae6529c2017-03-29 09:33:21 +08002772 /*
2773 * When we get bbio, we have already increased bio_counter, record it
2774 * so we can free it at rbio_orig_end_io()
2775 */
2776 rbio->generic_bio_cnt = 1;
2777
Omar Sandovalb4ee1782015-06-19 11:52:50 -07002778 return rbio;
2779}
2780
Omar Sandovalb4ee1782015-06-19 11:52:50 -07002781void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
2782{
2783 if (!lock_stripe_add(rbio))
Liu Bod6a69132018-03-02 16:10:39 -07002784 async_read_rebuild(rbio);
Omar Sandovalb4ee1782015-06-19 11:52:50 -07002785}