/*
 * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include "ctree.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
#include "transaction.h"
#include "backref.h"
#include "extent_io.h"
#include "dev-replace.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "raid56.h"

/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies the checksums. In case a bad checksum
 * is found or the extent cannot be read, good data will be written back if
 * any can be found.
 *
 * Future enhancements:
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - track and record media errors, throw out bad devices
 *  - add a mode to also read unallocated space
 */

struct scrub_block;
struct scrub_ctx;

/*
 * the following three values only influence the performance.
 * The last one configures the number of parallel and outstanding I/O
 * operations. The first two values configure an upper limit for the number
 * of (dynamically allocated) pages that are added to a bio.
 */
#define SCRUB_PAGES_PER_RD_BIO	32	/* 128k per bio */
#define SCRUB_PAGES_PER_WR_BIO	32	/* 128k per bio */
#define SCRUB_BIOS_PER_SCTX	64	/* 8MB per device in flight */
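/*
 * With the usual 4KiB page size the limits above work out to
 * 32 * 4KiB = 128KiB per bio and 64 * 128KiB = 8MiB of read I/O in
 * flight per device, as noted in the per-define comments.
 */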

/*
 * the following value times PAGE_SIZE needs to be large enough to match the
 * largest node/leaf/sector size that shall be supported.
 * Values larger than BTRFS_STRIPE_LEN are not supported.
 */
#define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */

struct scrub_recover {
	refcount_t		refs;
	struct btrfs_bio	*bbio;
	u64			map_length;
};

struct scrub_page {
	struct scrub_block	*sblock;
	struct page		*page;
	struct btrfs_device	*dev;
	struct list_head	list;
	u64			flags;  /* extent flags */
	u64			generation;
	u64			logical;
	u64			physical;
	u64			physical_for_dev_replace;
	atomic_t		refs;
	struct {
		unsigned int	mirror_num:8;
		unsigned int	have_csum:1;
		unsigned int	io_error:1;
	};
	u8			csum[BTRFS_CSUM_SIZE];

	struct scrub_recover	*recover;
};

struct scrub_bio {
	int			index;
	struct scrub_ctx	*sctx;
	struct btrfs_device	*dev;
	struct bio		*bio;
	blk_status_t		status;
	u64			logical;
	u64			physical;
#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
	struct scrub_page	*pagev[SCRUB_PAGES_PER_WR_BIO];
#else
	struct scrub_page	*pagev[SCRUB_PAGES_PER_RD_BIO];
#endif
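	/*
	 * pagev[] is sized for the larger of the read and write limits so
	 * that the same scrub_bio structure can back either a read bio or
	 * a dev-replace write bio.
	 */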
	int			page_count;
	int			next_free;
	struct btrfs_work	work;
};

struct scrub_block {
	struct scrub_page	*pagev[SCRUB_MAX_PAGES_PER_BLOCK];
	int			page_count;
	atomic_t		outstanding_pages;
	refcount_t		refs; /* free mem on transition to zero */
	struct scrub_ctx	*sctx;
	struct scrub_parity	*sparity;
	struct {
		unsigned int	header_error:1;
		unsigned int	checksum_error:1;
		unsigned int	no_io_error_seen:1;
		unsigned int	generation_error:1; /* also sets header_error */

		/*
		 * The following is for the data used to check parity,
		 * i.e. the data with a checksum.
		 */
		unsigned int	data_corrected:1;
	};
	struct btrfs_work	work;
};

/* Used for the chunks with parity stripes, such as RAID5/6 */
struct scrub_parity {
	struct scrub_ctx	*sctx;

	struct btrfs_device	*scrub_dev;

	u64			logic_start;

	u64			logic_end;

	int			nsectors;

	u64			stripe_len;

	refcount_t		refs;

	struct list_head	spages;

	/* Work of parity check and repair */
	struct btrfs_work	work;

	/* Mark the parity blocks which have data */
	unsigned long		*dbitmap;

	/*
	 * Mark the parity blocks which have data, but for which errors
	 * happened when reading or checking that data
	 */
	unsigned long		*ebitmap;

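	/*
	 * Trailing storage for the two bitmaps above; dbitmap and ebitmap
	 * are expected to point into this area, which is sized when a
	 * scrub_parity is allocated.
	 */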
	unsigned long		bitmap[0];
};

struct scrub_ctx {
	struct scrub_bio	*bios[SCRUB_BIOS_PER_SCTX];
	struct btrfs_fs_info	*fs_info;
	int			first_free;
	int			curr;
	atomic_t		bios_in_flight;
	atomic_t		workers_pending;
	spinlock_t		list_lock;
	wait_queue_head_t	list_wait;
	u16			csum_size;
	struct list_head	csum_list;
	atomic_t		cancel_req;
	int			readonly;
	int			pages_per_rd_bio;

	int			is_dev_replace;

	struct scrub_bio	*wr_curr_bio;
	struct mutex		wr_lock;
	int			pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
	struct btrfs_device	*wr_tgtdev;
	bool			flush_all_writes;

	/*
	 * statistics
	 */
	struct btrfs_scrub_progress stat;
	spinlock_t		stat_lock;

	/*
	 * Use a ref counter to avoid use-after-free issues. Scrub workers
	 * decrement bios_in_flight and workers_pending and then do a wakeup
	 * on the list_wait wait queue. We must ensure the main scrub task
	 * doesn't free the scrub context before or while the workers are
	 * doing the wakeup() call.
	 */
	refcount_t		refs;
};

struct scrub_fixup_nodatasum {
	struct scrub_ctx	*sctx;
	struct btrfs_device	*dev;
	u64			logical;
	struct btrfs_root	*root;
	struct btrfs_work	work;
	int			mirror_num;
};

struct scrub_nocow_inode {
	u64			inum;
	u64			offset;
	u64			root;
	struct list_head	list;
};

struct scrub_copy_nocow_ctx {
	struct scrub_ctx	*sctx;
	u64			logical;
	u64			len;
	int			mirror_num;
	u64			physical_for_dev_replace;
	struct list_head	inodes;
	struct btrfs_work	work;
};

struct scrub_warning {
	struct btrfs_path	*path;
	u64			extent_item_size;
	const char		*errstr;
	u64			physical;
	u64			logical;
	struct btrfs_device	*dev;
};

struct full_stripe_lock {
	struct rb_node		node;
	u64			logical;
	u64			refs;
	struct mutex		mutex;
};

static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
				     struct scrub_block *sblocks_for_recheck);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
				struct scrub_block *sblock,
				int retry_failed_mirror);
static void scrub_recheck_block_checksum(struct scrub_block *sblock);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
					     struct scrub_block *sblock_good);
static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
					    struct scrub_block *sblock_good,
					    int page_num, int force_write);
static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
					   int page_num);
static int scrub_checksum_data(struct scrub_block *sblock);
static int scrub_checksum_tree_block(struct scrub_block *sblock);
static int scrub_checksum_super(struct scrub_block *sblock);
static void scrub_block_get(struct scrub_block *sblock);
static void scrub_block_put(struct scrub_block *sblock);
static void scrub_page_get(struct scrub_page *spage);
static void scrub_page_put(struct scrub_page *spage);
static void scrub_parity_get(struct scrub_parity *sparity);
static void scrub_parity_put(struct scrub_parity *sparity);
static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
				    struct scrub_page *spage);
static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
		       u64 physical, struct btrfs_device *dev, u64 flags,
		       u64 gen, int mirror_num, u8 *csum, int force,
		       u64 physical_for_dev_replace);
static void scrub_bio_end_io(struct bio *bio);
static void scrub_bio_end_io_worker(struct btrfs_work *work);
static void scrub_block_complete(struct scrub_block *sblock);
static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
			       u64 extent_logical, u64 extent_len,
			       u64 *extent_physical,
			       struct btrfs_device **extent_dev,
			       int *extent_mirror_num);
static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
				    struct scrub_page *spage);
static void scrub_wr_submit(struct scrub_ctx *sctx);
static void scrub_wr_bio_end_io(struct bio *bio);
static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
static int write_page_nocow(struct scrub_ctx *sctx,
			    u64 physical_for_dev_replace, struct page *page);
static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
				      struct scrub_copy_nocow_ctx *ctx);
static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
			    int mirror_num, u64 physical_for_dev_replace);
static void copy_nocow_pages_worker(struct btrfs_work *work);
static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
static void scrub_put_ctx(struct scrub_ctx *sctx);

static inline int scrub_is_page_on_raid56(struct scrub_page *page)
{
	return page->recover &&
	       (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
}

static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
{
	refcount_inc(&sctx->refs);
	atomic_inc(&sctx->bios_in_flight);
}

static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
{
	atomic_dec(&sctx->bios_in_flight);
	wake_up(&sctx->list_wait);
	scrub_put_ctx(sctx);
}

static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	while (atomic_read(&fs_info->scrub_pause_req)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrub_pause_req) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
}

static void scrub_pause_on(struct btrfs_fs_info *fs_info)
{
	atomic_inc(&fs_info->scrubs_paused);
	wake_up(&fs_info->scrub_pause_wait);
}

static void scrub_pause_off(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->scrub_lock);
	__scrub_blocked_if_needed(fs_info);
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);

	wake_up(&fs_info->scrub_pause_wait);
}

static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	scrub_pause_on(fs_info);
	scrub_pause_off(fs_info);
}

/*
 * Insert new full stripe lock into full stripe locks tree
 *
 * Return pointer to existing or newly inserted full_stripe_lock structure if
 * everything works well.
 * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
 *
 * NOTE: caller must hold full_stripe_locks_root->lock before calling this
 * function
 */
static struct full_stripe_lock *insert_full_stripe_lock(
		struct btrfs_full_stripe_locks_tree *locks_root,
		u64 fstripe_logical)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct full_stripe_lock *entry;
	struct full_stripe_lock *ret;

	lockdep_assert_held(&locks_root->lock);

	p = &locks_root->root.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct full_stripe_lock, node);
		if (fstripe_logical < entry->logical) {
			p = &(*p)->rb_left;
		} else if (fstripe_logical > entry->logical) {
			p = &(*p)->rb_right;
		} else {
			entry->refs++;
			return entry;
		}
	}

	/* Insert new lock */
	ret = kmalloc(sizeof(*ret), GFP_KERNEL);
	if (!ret)
		return ERR_PTR(-ENOMEM);
	ret->logical = fstripe_logical;
	ret->refs = 1;
	mutex_init(&ret->mutex);

	rb_link_node(&ret->node, parent, p);
	rb_insert_color(&ret->node, &locks_root->root);
	return ret;
}

/*
 * Search for a full stripe lock of a block group
 *
 * Return pointer to existing full stripe lock if found
 * Return NULL if not found
 */
static struct full_stripe_lock *search_full_stripe_lock(
		struct btrfs_full_stripe_locks_tree *locks_root,
		u64 fstripe_logical)
{
	struct rb_node *node;
	struct full_stripe_lock *entry;

	lockdep_assert_held(&locks_root->lock);

	node = locks_root->root.rb_node;
	while (node) {
		entry = rb_entry(node, struct full_stripe_lock, node);
		if (fstripe_logical < entry->logical)
			node = node->rb_left;
		else if (fstripe_logical > entry->logical)
			node = node->rb_right;
		else
			return entry;
	}
	return NULL;
}

/*
 * Helper to get full stripe logical from a normal bytenr.
 *
 * Caller must ensure @cache is a RAID56 block group.
 */
static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
				   u64 bytenr)
{
	u64 ret;

	/*
	 * Due to chunk item size limit, full stripe length should not be
	 * larger than U32_MAX. Just a sanity check here.
	 */
	WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);

	/*
	 * round_down() can only handle power of 2, while RAID56 full
	 * stripe length can be 64KiB * n, so we need to manually round down.
	 */
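	/*
	 * Example: with a block group starting at 1GiB and a full stripe
	 * length of 192KiB (three 64KiB data stripes), a bytenr at
	 * 1GiB + 300KiB maps to the full stripe starting at 1GiB + 192KiB.
	 */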
	ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) *
		cache->full_stripe_len + cache->key.objectid;
	return ret;
}

/*
 * Lock a full stripe to avoid concurrency of recovery and read
 *
 * It's only used for profiles with parities (RAID5/6), for other profiles it
 * does nothing.
 *
 * Return 0 if we locked the full stripe covering @bytenr, with a mutex held,
 * so the caller must call unlock_full_stripe() in the same context.
 *
 * Return <0 if an error is encountered.
 */
static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
			    bool *locked_ret)
{
	struct btrfs_block_group_cache *bg_cache;
	struct btrfs_full_stripe_locks_tree *locks_root;
	struct full_stripe_lock *existing;
	u64 fstripe_start;
	int ret = 0;

	*locked_ret = false;
	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
	if (!bg_cache) {
		ASSERT(0);
		return -ENOENT;
	}

	/* Profiles not based on parity don't need full stripe lock */
	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
		goto out;
	locks_root = &bg_cache->full_stripe_locks_root;

	fstripe_start = get_full_stripe_logical(bg_cache, bytenr);

	/* Now insert the full stripe lock */
	mutex_lock(&locks_root->lock);
	existing = insert_full_stripe_lock(locks_root, fstripe_start);
	mutex_unlock(&locks_root->lock);
	if (IS_ERR(existing)) {
		ret = PTR_ERR(existing);
		goto out;
	}
	mutex_lock(&existing->mutex);
	*locked_ret = true;
out:
	btrfs_put_block_group(bg_cache);
	return ret;
}

/*
 * Unlock a full stripe.
 *
 * NOTE: Caller must ensure it's the same context that called the
 * corresponding lock_full_stripe().
 *
 * Return 0 if we unlocked the full stripe without problem.
 * Return <0 for error
 */
static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
			      bool locked)
{
	struct btrfs_block_group_cache *bg_cache;
	struct btrfs_full_stripe_locks_tree *locks_root;
	struct full_stripe_lock *fstripe_lock;
	u64 fstripe_start;
	bool freeit = false;
	int ret = 0;

	/* If we didn't acquire full stripe lock, no need to continue */
	if (!locked)
		return 0;

	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
	if (!bg_cache) {
		ASSERT(0);
		return -ENOENT;
	}
	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
		goto out;

	locks_root = &bg_cache->full_stripe_locks_root;
	fstripe_start = get_full_stripe_logical(bg_cache, bytenr);

	mutex_lock(&locks_root->lock);
	fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
	/* Unpaired unlock_full_stripe() detected */
	if (!fstripe_lock) {
		WARN_ON(1);
		ret = -ENOENT;
		mutex_unlock(&locks_root->lock);
		goto out;
	}

	if (fstripe_lock->refs == 0) {
		WARN_ON(1);
		btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
			fstripe_lock->logical);
	} else {
		fstripe_lock->refs--;
	}

	if (fstripe_lock->refs == 0) {
		rb_erase(&fstripe_lock->node, &locks_root->root);
		freeit = true;
	}
	mutex_unlock(&locks_root->lock);

	mutex_unlock(&fstripe_lock->mutex);
	if (freeit)
		kfree(fstripe_lock);
out:
	btrfs_put_block_group(bg_cache);
	return ret;
}
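
/*
 * Typical usage of the two helpers above, as in scrub_handle_errored_block()
 * below:
 *
 *	bool locked;
 *
 *	ret = lock_full_stripe(fs_info, logical, &locked);
 *	if (ret < 0)
 *		return ret;
 *	... recheck/repair the block ...
 *	unlock_full_stripe(fs_info, logical, locked);
 *
 * The @locked flag makes the unlock side a no-op for block groups that are
 * not RAID5/6.
 */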

/*
 * used for workers that require transaction commits (i.e., for the
 * NOCOW case)
 */
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;

	refcount_inc(&sctx->refs);
	/*
	 * increment scrubs_running to prevent cancel requests from
	 * completing as long as a worker is running. we must also
	 * increment scrubs_paused to prevent deadlocking on pause
	 * requests used for transaction commits (as the worker uses a
	 * transaction context). it is safe to regard the worker
	 * as paused for all practical matters. effectively, we only
	 * prevent cancellation requests from completing.
	 */
	mutex_lock(&fs_info->scrub_lock);
	atomic_inc(&fs_info->scrubs_running);
	atomic_inc(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);

	/*
	 * The check of the @scrubs_running == @scrubs_paused condition
	 * inside wait_event() is not an atomic operation, which means
	 * we may inc/dec @scrubs_running/@scrubs_paused at any time.
	 * Wake up @scrub_pause_wait as often as we can so that a
	 * blocked transaction commit waits less.
	 */
	wake_up(&fs_info->scrub_pause_wait);

	atomic_inc(&sctx->workers_pending);
}

/* used for workers that require transaction commits */
static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;

	/*
	 * see scrub_pending_trans_workers_inc() for why we're pretending
	 * to be paused in the scrub counters
	 */
	mutex_lock(&fs_info->scrub_lock);
	atomic_dec(&fs_info->scrubs_running);
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);
	atomic_dec(&sctx->workers_pending);
	wake_up(&fs_info->scrub_pause_wait);
	wake_up(&sctx->list_wait);
	scrub_put_ctx(sctx);
}

static void scrub_free_csums(struct scrub_ctx *sctx)
{
	while (!list_empty(&sctx->csum_list)) {
		struct btrfs_ordered_sum *sum;
		sum = list_first_entry(&sctx->csum_list,
				       struct btrfs_ordered_sum, list);
		list_del(&sum->list);
		kfree(sum);
	}
}

static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
{
	int i;

	if (!sctx)
		return;

	/* this can happen when scrub is cancelled */
	if (sctx->curr != -1) {
		struct scrub_bio *sbio = sctx->bios[sctx->curr];

		for (i = 0; i < sbio->page_count; i++) {
			WARN_ON(!sbio->pagev[i]->page);
			scrub_block_put(sbio->pagev[i]->sblock);
		}
		bio_put(sbio->bio);
	}

	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
		struct scrub_bio *sbio = sctx->bios[i];

		if (!sbio)
			break;
		kfree(sbio);
	}

	kfree(sctx->wr_curr_bio);
	scrub_free_csums(sctx);
	kfree(sctx);
}

static void scrub_put_ctx(struct scrub_ctx *sctx)
{
	if (refcount_dec_and_test(&sctx->refs))
		scrub_free_ctx(sctx);
}

static noinline_for_stack
struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
{
	struct scrub_ctx *sctx;
	int i;
	struct btrfs_fs_info *fs_info = dev->fs_info;

	sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
	if (!sctx)
		goto nomem;
	refcount_set(&sctx->refs, 1);
	sctx->is_dev_replace = is_dev_replace;
	sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
	sctx->curr = -1;
	sctx->fs_info = dev->fs_info;
	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
		struct scrub_bio *sbio;

		sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
		if (!sbio)
			goto nomem;
		sctx->bios[i] = sbio;

		sbio->index = i;
		sbio->sctx = sctx;
		sbio->page_count = 0;
		btrfs_init_work(&sbio->work, btrfs_scrub_helper,
				scrub_bio_end_io_worker, NULL, NULL);

		if (i != SCRUB_BIOS_PER_SCTX - 1)
			sctx->bios[i]->next_free = i + 1;
		else
			sctx->bios[i]->next_free = -1;
	}
	sctx->first_free = 0;
	atomic_set(&sctx->bios_in_flight, 0);
	atomic_set(&sctx->workers_pending, 0);
	atomic_set(&sctx->cancel_req, 0);
	sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
	INIT_LIST_HEAD(&sctx->csum_list);

	spin_lock_init(&sctx->list_lock);
	spin_lock_init(&sctx->stat_lock);
	init_waitqueue_head(&sctx->list_wait);

	WARN_ON(sctx->wr_curr_bio != NULL);
	mutex_init(&sctx->wr_lock);
	sctx->wr_curr_bio = NULL;
	if (is_dev_replace) {
		WARN_ON(!fs_info->dev_replace.tgtdev);
		sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
		sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
		sctx->flush_all_writes = false;
	}

	return sctx;

nomem:
	scrub_free_ctx(sctx);
	return ERR_PTR(-ENOMEM);
}

static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
				     void *warn_ctx)
{
	u64 isize;
	u32 nlink;
	int ret;
	int i;
	unsigned nofs_flag;
	struct extent_buffer *eb;
	struct btrfs_inode_item *inode_item;
	struct scrub_warning *swarn = warn_ctx;
	struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
	struct inode_fs_paths *ipath = NULL;
	struct btrfs_root *local_root;
	struct btrfs_key root_key;
	struct btrfs_key key;

	root_key.objectid = root;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = (u64)-1;
	local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
	if (IS_ERR(local_root)) {
		ret = PTR_ERR(local_root);
		goto err;
	}

	/*
	 * this makes the path point to (inum INODE_ITEM ioff)
	 */
	key.objectid = inum;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
	if (ret) {
		btrfs_release_path(swarn->path);
		goto err;
	}

	eb = swarn->path->nodes[0];
	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
				    struct btrfs_inode_item);
	isize = btrfs_inode_size(eb, inode_item);
	nlink = btrfs_inode_nlink(eb, inode_item);
	btrfs_release_path(swarn->path);

	/*
	 * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub
	 * uses GFP_NOFS in this context, so we keep it consistent but it does
	 * not seem to be strictly necessary.
	 */
	nofs_flag = memalloc_nofs_save();
	ipath = init_ipath(4096, local_root, swarn->path);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(ipath)) {
		ret = PTR_ERR(ipath);
		ipath = NULL;
		goto err;
	}
	ret = paths_from_inode(inum, ipath);

	if (ret < 0)
		goto err;

	/*
	 * we deliberately ignore the fact that ipath might have been too
	 * small to hold all of the paths here
	 */
	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
		btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)",
				  swarn->errstr, swarn->logical,
				  rcu_str_deref(swarn->dev->name),
				  swarn->physical,
				  root, inum, offset,
				  min(isize - offset, (u64)PAGE_SIZE), nlink,
				  (char *)(unsigned long)ipath->fspath->val[i]);

	free_ipath(ipath);
	return 0;

err:
	btrfs_warn_in_rcu(fs_info,
			  "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
			  swarn->errstr, swarn->logical,
			  rcu_str_deref(swarn->dev->name),
			  swarn->physical,
			  root, inum, offset, ret);

	free_ipath(ipath);
	return 0;
}

static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
{
	struct btrfs_device *dev;
	struct btrfs_fs_info *fs_info;
	struct btrfs_path *path;
	struct btrfs_key found_key;
	struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	struct scrub_warning swarn;
	unsigned long ptr = 0;
	u64 extent_item_pos;
	u64 flags = 0;
	u64 ref_root;
	u32 item_size;
	u8 ref_level = 0;
	int ret;

	WARN_ON(sblock->page_count < 1);
	dev = sblock->pagev[0]->dev;
	fs_info = sblock->sctx->fs_info;

	path = btrfs_alloc_path();
	if (!path)
		return;

	swarn.physical = sblock->pagev[0]->physical;
	swarn.logical = sblock->pagev[0]->logical;
	swarn.errstr = errstr;
	swarn.dev = NULL;

	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
				  &flags);
	if (ret < 0)
		goto out;

	extent_item_pos = swarn.logical - found_key.objectid;
	swarn.extent_item_size = found_key.offset;

	eb = path->nodes[0];
	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
	item_size = btrfs_item_size_nr(eb, path->slots[0]);

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		do {
			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
						      item_size, &ref_root,
						      &ref_level);
			btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
				errstr, swarn.logical,
				rcu_str_deref(dev->name),
				swarn.physical,
				ref_level ? "node" : "leaf",
				ret < 0 ? -1 : ref_level,
				ret < 0 ? -1 : ref_root);
		} while (ret != 1);
		btrfs_release_path(path);
	} else {
		btrfs_release_path(path);
		swarn.path = path;
		swarn.dev = dev;
		iterate_extent_inodes(fs_info, found_key.objectid,
				      extent_item_pos, 1,
				      scrub_print_warning_inode, &swarn, false);
	}

out:
	btrfs_free_path(path);
}

static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
{
	struct page *page = NULL;
	unsigned long index;
	struct scrub_fixup_nodatasum *fixup = fixup_ctx;
	int ret;
	int corrected = 0;
	struct btrfs_key key;
	struct inode *inode = NULL;
	struct btrfs_fs_info *fs_info;
	u64 end = offset + PAGE_SIZE - 1;
	struct btrfs_root *local_root;
	int srcu_index;

	key.objectid = root;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	fs_info = fixup->root->fs_info;
	srcu_index = srcu_read_lock(&fs_info->subvol_srcu);

	local_root = btrfs_read_fs_root_no_name(fs_info, &key);
	if (IS_ERR(local_root)) {
		srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
		return PTR_ERR(local_root);
	}

	key.type = BTRFS_INODE_ITEM_KEY;
	key.objectid = inum;
	key.offset = 0;
	inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
	srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	index = offset >> PAGE_SHIFT;

	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
	if (!page) {
		ret = -ENOMEM;
		goto out;
	}

	if (PageUptodate(page)) {
		if (PageDirty(page)) {
			/*
			 * we need to write the data to the defect sector. the
			 * data that was in that sector is not in memory,
			 * because the page was modified. we must not write the
			 * modified page to that sector.
			 *
			 * TODO: what could be done here: wait for the delalloc
			 *       runner to write out that page (might involve
			 *       COW) and see whether the sector is still
			 *       referenced afterwards.
			 *
			 * For the meantime, we'll treat this error as
			 * uncorrectable, although there is a chance that a
			 * later scrub will find the bad sector again and that
			 * there's no dirty page in memory then.
			 */
			ret = -EIO;
			goto out;
		}
		ret = repair_io_failure(fs_info, inum, offset, PAGE_SIZE,
					fixup->logical, page,
					offset - page_offset(page),
					fixup->mirror_num);
		unlock_page(page);
		corrected = !ret;
	} else {
		/*
		 * we need to get good data first. the general readpage path
		 * will call repair_io_failure for us, we just have to make
		 * sure we read the bad mirror.
		 */
		ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
				      EXTENT_DAMAGED);
		if (ret) {
			/* set_extent_bits should give proper error */
			WARN_ON(ret > 0);
			if (ret > 0)
				ret = -EFAULT;
			goto out;
		}

		ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
					    btrfs_get_extent,
					    fixup->mirror_num);
		wait_on_page_locked(page);

		corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
					    end, EXTENT_DAMAGED, 0, NULL);
		if (!corrected)
			clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
					  EXTENT_DAMAGED);
	}

out:
	if (page)
		put_page(page);

	iput(inode);

	if (ret < 0)
		return ret;

	if (ret == 0 && corrected) {
		/*
		 * we only need to call readpage for one of the inodes belonging
		 * to this extent. so make iterate_extent_inodes stop
		 */
		return 1;
	}

	return -EIO;
}

static void scrub_fixup_nodatasum(struct btrfs_work *work)
{
	struct btrfs_fs_info *fs_info;
	int ret;
	struct scrub_fixup_nodatasum *fixup;
	struct scrub_ctx *sctx;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_path *path;
	int uncorrectable = 0;

	fixup = container_of(work, struct scrub_fixup_nodatasum, work);
	sctx = fixup->sctx;
	fs_info = fixup->root->fs_info;

	path = btrfs_alloc_path();
	if (!path) {
		spin_lock(&sctx->stat_lock);
		++sctx->stat.malloc_errors;
		spin_unlock(&sctx->stat_lock);
		uncorrectable = 1;
		goto out;
	}

	trans = btrfs_join_transaction(fixup->root);
	if (IS_ERR(trans)) {
		uncorrectable = 1;
		goto out;
	}

	/*
	 * the idea is to trigger a regular read through the standard path. we
	 * read a page from the (failed) logical address by specifying the
	 * corresponding copynum of the failed sector. thus, that readpage is
	 * expected to fail.
	 * that is the point where on-the-fly error correction will kick in
	 * (once it's finished) and rewrite the failed sector if a good copy
	 * can be found.
	 */
	ret = iterate_inodes_from_logical(fixup->logical, fs_info, path,
					  scrub_fixup_readpage, fixup, false);
	if (ret < 0) {
		uncorrectable = 1;
		goto out;
	}
	WARN_ON(ret != 1);

	spin_lock(&sctx->stat_lock);
	++sctx->stat.corrected_errors;
	spin_unlock(&sctx->stat_lock);

out:
	if (trans && !IS_ERR(trans))
		btrfs_end_transaction(trans);
	if (uncorrectable) {
		spin_lock(&sctx->stat_lock);
		++sctx->stat.uncorrectable_errors;
		spin_unlock(&sctx->stat_lock);
		btrfs_dev_replace_stats_inc(
			&fs_info->dev_replace.num_uncorrectable_read_errors);
		btrfs_err_rl_in_rcu(fs_info,
			"unable to fixup (nodatasum) error at logical %llu on dev %s",
			fixup->logical, rcu_str_deref(fixup->dev->name));
	}

	btrfs_free_path(path);
	kfree(fixup);

	scrub_pending_trans_workers_dec(sctx);
}

static inline void scrub_get_recover(struct scrub_recover *recover)
{
	refcount_inc(&recover->refs);
}

static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
				     struct scrub_recover *recover)
{
	if (refcount_dec_and_test(&recover->refs)) {
		btrfs_bio_counter_dec(fs_info);
		btrfs_put_bbio(recover->bbio);
		kfree(recover);
	}
}

/*
 * scrub_handle_errored_block gets called when either verification of the
 * pages failed or the bio failed to read, e.g. with EIO. In the latter
 * case, this function handles all pages in the bio, even though only one
 * may be bad.
 * The goal of this function is to repair the errored block by using the
 * contents of one of the mirrors.
 */
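/*
 * Note: for RAID5/6 profiles the repair is additionally serialized against
 * other scrub threads via lock_full_stripe()/unlock_full_stripe() on the
 * affected full stripe (see the race comment below).
 */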
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001109static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
Arne Jansena2de7332011-03-08 14:14:00 +01001110{
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001111 struct scrub_ctx *sctx = sblock_to_check->sctx;
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01001112 struct btrfs_device *dev;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001113 struct btrfs_fs_info *fs_info;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001114 u64 logical;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001115 unsigned int failed_mirror_index;
1116 unsigned int is_metadata;
1117 unsigned int have_csum;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001118 struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
1119 struct scrub_block *sblock_bad;
Arne Jansena2de7332011-03-08 14:14:00 +01001120 int ret;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001121 int mirror_index;
1122 int page_num;
1123 int success;
Qu Wenruo28d70e22017-04-14 08:35:55 +08001124 bool full_stripe_locked;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001125 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
1126 DEFAULT_RATELIMIT_BURST);
Arne Jansena2de7332011-03-08 14:14:00 +01001127
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001128 BUG_ON(sblock_to_check->page_count < 1);
Jeff Mahoneyfb456252016-06-22 18:54:56 -04001129 fs_info = sctx->fs_info;
Stefan Behrens4ded4f62012-11-14 18:57:29 +00001130 if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
1131 /*
1132 * if we find an error in a super block, we just report it.
1133 * They will get written with the next transaction commit
1134 * anyway
1135 */
1136 spin_lock(&sctx->stat_lock);
1137 ++sctx->stat.super_errors;
1138 spin_unlock(&sctx->stat_lock);
1139 return 0;
1140 }
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001141 logical = sblock_to_check->pagev[0]->logical;
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001142 BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
1143 failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
1144 is_metadata = !(sblock_to_check->pagev[0]->flags &
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001145 BTRFS_EXTENT_FLAG_DATA);
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001146 have_csum = sblock_to_check->pagev[0]->have_csum;
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001147 dev = sblock_to_check->pagev[0]->dev;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001148
Qu Wenruo28d70e22017-04-14 08:35:55 +08001149 /*
1150 * For RAID5/6, race can happen for a different device scrub thread.
1151 * For data corruption, Parity and Data threads will both try
1152 * to recovery the data.
1153 * Race can lead to doubly added csum error, or even unrecoverable
1154 * error.
1155 */
1156 ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
1157 if (ret < 0) {
1158 spin_lock(&sctx->stat_lock);
1159 if (ret == -ENOMEM)
1160 sctx->stat.malloc_errors++;
1161 sctx->stat.read_errors++;
1162 sctx->stat.uncorrectable_errors++;
1163 spin_unlock(&sctx->stat_lock);
1164 return ret;
1165 }
1166
Stefan Behrensff023aa2012-11-06 11:43:11 +01001167 if (sctx->is_dev_replace && !is_metadata && !have_csum) {
1168 sblocks_for_recheck = NULL;
1169 goto nodatasum_case;
1170 }
1171
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001172 /*
1173 * read all mirrors one after the other. This includes to
1174 * re-read the extent or metadata block that failed (that was
1175 * the cause that this fixup code is called) another time,
1176 * page by page this time in order to know which pages
1177 * caused I/O errors and which ones are good (for all mirrors).
1178 * It is the goal to handle the situation when more than one
1179 * mirror contains I/O errors, but the errors do not
1180 * overlap, i.e. the data can be repaired by selecting the
1181 * pages from those mirrors without I/O error on the
1182 * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
1183 * would be that mirror #1 has an I/O error on the first page,
1184 * the second page is good, and mirror #2 has an I/O error on
1185 * the second page, but the first page is good.
1186 * Then the first page of the first mirror can be repaired by
1187 * taking the first page of the second mirror, and the
1188 * second page of the second mirror can be repaired by
1189 * copying the contents of the 2nd page of the 1st mirror.
1190 * One more note: if the pages of one mirror contain I/O
1191 * errors, the checksum cannot be verified. In order to get
1192 * the best data for repairing, the first attempt is to find
1193 * a mirror without I/O errors and with a validated checksum.
1194 * Only if this is not possible, the pages are picked from
1195 * mirrors with I/O errors without considering the checksum.
1196 * If the latter is the case, at the end, the checksum of the
1197 * repaired area is verified in order to correctly maintain
1198 * the statistics.
1199 */
1200
David Sterba31e818f2015-02-20 18:00:26 +01001201 sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
1202 sizeof(*sblocks_for_recheck), GFP_NOFS);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001203 if (!sblocks_for_recheck) {
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001204 spin_lock(&sctx->stat_lock);
1205 sctx->stat.malloc_errors++;
1206 sctx->stat.read_errors++;
1207 sctx->stat.uncorrectable_errors++;
1208 spin_unlock(&sctx->stat_lock);
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01001209 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001210 goto out;
1211 }
1212
1213 /* setup the context, map the logical blocks and alloc the pages */
Zhao Leibe50a8d2015-01-20 15:11:42 +08001214 ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001215 if (ret) {
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001216 spin_lock(&sctx->stat_lock);
1217 sctx->stat.read_errors++;
1218 sctx->stat.uncorrectable_errors++;
1219 spin_unlock(&sctx->stat_lock);
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01001220 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001221 goto out;
1222 }
1223 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
1224 sblock_bad = sblocks_for_recheck + failed_mirror_index;
1225
1226 /* build and submit the bios for the failed mirror, check checksums */
Zhao Leiaffe4a52015-08-24 21:32:06 +08001227 scrub_recheck_block(fs_info, sblock_bad, 1);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001228
1229 if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
1230 sblock_bad->no_io_error_seen) {
1231 /*
1232 * the error disappeared after reading page by page, or
1233 * the area was part of a huge bio and other parts of the
1234 * bio caused I/O errors, or the block layer merged several
1235 * read requests into one and the error is caused by a
1236 * different bio (usually one of the two latter cases is
1237 * the cause)
1238 */
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001239 spin_lock(&sctx->stat_lock);
1240 sctx->stat.unverified_errors++;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08001241 sblock_to_check->data_corrected = 1;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001242 spin_unlock(&sctx->stat_lock);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001243
Stefan Behrensff023aa2012-11-06 11:43:11 +01001244 if (sctx->is_dev_replace)
1245 scrub_write_block_to_dev_replace(sblock_bad);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001246 goto out;
1247 }
1248
1249 if (!sblock_bad->no_io_error_seen) {
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001250 spin_lock(&sctx->stat_lock);
1251 sctx->stat.read_errors++;
1252 spin_unlock(&sctx->stat_lock);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001253 if (__ratelimit(&_rs))
1254 scrub_print_warning("i/o error", sblock_to_check);
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01001255 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001256 } else if (sblock_bad->checksum_error) {
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001257 spin_lock(&sctx->stat_lock);
1258 sctx->stat.csum_errors++;
1259 spin_unlock(&sctx->stat_lock);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001260 if (__ratelimit(&_rs))
1261 scrub_print_warning("checksum error", sblock_to_check);
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01001262 btrfs_dev_stat_inc_and_print(dev,
Stefan Behrens442a4f62012-05-25 16:06:08 +02001263 BTRFS_DEV_STAT_CORRUPTION_ERRS);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001264 } else if (sblock_bad->header_error) {
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001265 spin_lock(&sctx->stat_lock);
1266 sctx->stat.verify_errors++;
1267 spin_unlock(&sctx->stat_lock);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001268 if (__ratelimit(&_rs))
1269 scrub_print_warning("checksum/header error",
1270 sblock_to_check);
Stefan Behrens442a4f62012-05-25 16:06:08 +02001271 if (sblock_bad->generation_error)
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01001272 btrfs_dev_stat_inc_and_print(dev,
Stefan Behrens442a4f62012-05-25 16:06:08 +02001273 BTRFS_DEV_STAT_GENERATION_ERRS);
1274 else
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01001275 btrfs_dev_stat_inc_and_print(dev,
Stefan Behrens442a4f62012-05-25 16:06:08 +02001276 BTRFS_DEV_STAT_CORRUPTION_ERRS);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001277 }
1278
Ilya Dryomov33ef30a2013-11-03 19:06:38 +02001279 if (sctx->readonly) {
1280 ASSERT(!sctx->is_dev_replace);
1281 goto out;
1282 }
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001283
1284 if (!is_metadata && !have_csum) {
1285 struct scrub_fixup_nodatasum *fixup_nodatasum;
1286
Stefan Behrensff023aa2012-11-06 11:43:11 +01001287 WARN_ON(sctx->is_dev_replace);
1288
Zhao Leib25c94c2015-01-20 15:11:35 +08001289nodatasum_case:
1290
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001291 /*
1292 * !is_metadata and !have_csum, this means that the data
Nicholas D Steeves01327612016-05-19 21:18:45 -04001293 * might not be COWed and might be modified
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001294 * concurrently. The general strategy of working on the
1295 * commit root does not help in the case when COW is not
1296 * used.
1297 */
1298 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
1299 if (!fixup_nodatasum)
1300 goto did_not_correct_error;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001301 fixup_nodatasum->sctx = sctx;
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01001302 fixup_nodatasum->dev = dev;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001303 fixup_nodatasum->logical = logical;
1304 fixup_nodatasum->root = fs_info->extent_root;
1305 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
Stefan Behrensb6bfebc2012-11-02 16:44:58 +01001306 scrub_pending_trans_workers_inc(sctx);
Liu Bo9e0af232014-08-15 23:36:53 +08001307 btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
1308 scrub_fixup_nodatasum, NULL, NULL);
Qu Wenruo0339ef22014-02-28 10:46:17 +08001309 btrfs_queue_work(fs_info->scrub_workers,
1310 &fixup_nodatasum->work);
Arne Jansena2de7332011-03-08 14:14:00 +01001311 goto out;
1312 }
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001313
1314 /*
1315 * now build and submit the bios for the other mirrors, check
Stefan Behrenscb2ced72012-11-02 16:14:21 +01001316 * checksums.
1317 * First try to pick the mirror which is completely without I/O
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001318 * errors and also does not have a checksum error.
1319 * If one is found, and if a checksum is present, the full block
1320 * that is known to contain an error is rewritten. Afterwards
1321 * the block is known to be corrected.
1322 * If a mirror is found which is completely correct, and no
1323 * checksum is present, only those pages are rewritten that had
1324 * an I/O error in the block to be repaired, since it cannot be
1325 * determined which copy of the other pages is better (and it
1326 * could happen otherwise that a correct page would be
1327 * overwritten by a bad one).
1328 */
Liu Bo762221f2018-01-02 13:36:42 -07001329 for (mirror_index = 0; ;mirror_index++) {
Stefan Behrenscb2ced72012-11-02 16:14:21 +01001330 struct scrub_block *sblock_other;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001331
Stefan Behrenscb2ced72012-11-02 16:14:21 +01001332 if (mirror_index == failed_mirror_index)
1333 continue;
Liu Bo762221f2018-01-02 13:36:42 -07001334
1335 /* for raid56, the number of mirrors can exceed BTRFS_MAX_MIRRORS */
1336 if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1337 if (mirror_index >= BTRFS_MAX_MIRRORS)
1338 break;
1339 if (!sblocks_for_recheck[mirror_index].page_count)
1340 break;
1341
1342 sblock_other = sblocks_for_recheck + mirror_index;
1343 } else {
1344 struct scrub_recover *r = sblock_bad->pagev[0]->recover;
1345 int max_allowed = r->bbio->num_stripes -
1346 r->bbio->num_tgtdevs;
1347
1348 if (mirror_index >= max_allowed)
1349 break;
1350 if (!sblocks_for_recheck[1].page_count)
1351 break;
1352
1353 ASSERT(failed_mirror_index == 0);
1354 sblock_other = sblocks_for_recheck + 1;
1355 sblock_other->pagev[0]->mirror_num = 1 + mirror_index;
1356 }
Stefan Behrenscb2ced72012-11-02 16:14:21 +01001357
1358 /* build and submit the bios, check checksums */
Zhao Leiaffe4a52015-08-24 21:32:06 +08001359 scrub_recheck_block(fs_info, sblock_other, 0);
Stefan Behrens34f5c8e2012-11-02 16:16:26 +01001360
1361 if (!sblock_other->header_error &&
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001362 !sblock_other->checksum_error &&
1363 sblock_other->no_io_error_seen) {
Stefan Behrensff023aa2012-11-06 11:43:11 +01001364 if (sctx->is_dev_replace) {
1365 scrub_write_block_to_dev_replace(sblock_other);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001366 goto corrected_error;
Zhao Lei114ab502015-01-20 15:11:36 +08001367 } else {
1368 ret = scrub_repair_block_from_good_copy(
1369 sblock_bad, sblock_other);
1370 if (!ret)
1371 goto corrected_error;
1372 }
Arne Jansena2de7332011-03-08 14:14:00 +01001373 }
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001374 }
1375
Zhao Leib968fed2015-01-20 15:11:41 +08001376 if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1377 goto did_not_correct_error;
Stefan Behrensff023aa2012-11-06 11:43:11 +01001378
1379 /*
Stefan Behrensff023aa2012-11-06 11:43:11 +01001380 * In case of I/O errors in the area that is supposed to be
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001381 * repaired, continue by picking good copies of those pages.
1382 * Select the good pages from mirrors to rewrite bad pages from
1383 * the area to fix; a simplified sketch of this page picking
1384 * follows after this function. Afterwards verify the checksum of
1385 * the block that is supposed to be repaired. This verification is
1386 * only done to maintain the statistics and report remaining errors.
1387 * A perfect algorithm could make use of the checksum and try
1388 * all possible combinations of pages from the different mirrors
1389 * until the checksum verification succeeds. For example, when
1390 * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1391 * of mirror #2 is readable but the final checksum test fails,
1392 * then the 2nd page of mirror #3 could be tried to see whether
Nicholas D Steeves01327612016-05-19 21:18:45 -04001393 * the final checksum then succeeds. But this would be a rare
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001394 * exception and is therefore not implemented. At least
1395 * overwriting the good copy is avoided.
1396 * A more useful improvement would be to pick the sectors
1397 * without I/O error based on sector sizes (512 bytes on legacy
1398 * disks) instead of on PAGE_SIZE. Then maybe 512 bytes of one
1399 * mirror could be repaired by taking 512 bytes of a different
1400 * mirror, even if other 512 byte sectors in the same PAGE_SIZE
1401 * area are unreadable.
1402 */
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001403 success = 1;
Zhao Leib968fed2015-01-20 15:11:41 +08001404 for (page_num = 0; page_num < sblock_bad->page_count;
1405 page_num++) {
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001406 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
Zhao Leib968fed2015-01-20 15:11:41 +08001407 struct scrub_block *sblock_other = NULL;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001408
Zhao Leib968fed2015-01-20 15:11:41 +08001409 /* in plain scrub mode, skip pages without I/O errors */
1410 if (!page_bad->io_error && !sctx->is_dev_replace)
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001411 continue;
1412
Liu Bo47597002018-03-02 16:10:41 -07001413 if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1414 /*
1415 * In case of dev replace, if the raid56 rebuild process
1416 * did not produce correct data, then copy the content
1417 * of sblock_bad to make sure the target device is identical
1418 * to the source device, instead of writing garbage data from
1419 * the sblock_for_recheck array to the target device.
1420 */
1421 sblock_other = NULL;
1422 } else if (page_bad->io_error) {
1423 /* try to find no-io-error page in mirrors */
Zhao Leib968fed2015-01-20 15:11:41 +08001424 for (mirror_index = 0;
1425 mirror_index < BTRFS_MAX_MIRRORS &&
1426 sblocks_for_recheck[mirror_index].page_count > 0;
1427 mirror_index++) {
1428 if (!sblocks_for_recheck[mirror_index].
1429 pagev[page_num]->io_error) {
1430 sblock_other = sblocks_for_recheck +
1431 mirror_index;
1432 break;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001433 }
Jan Schmidt13db62b2011-06-13 19:56:13 +02001434 }
Zhao Leib968fed2015-01-20 15:11:41 +08001435 if (!sblock_other)
1436 success = 0;
Jan Schmidt13db62b2011-06-13 19:56:13 +02001437 }
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001438
Zhao Leib968fed2015-01-20 15:11:41 +08001439 if (sctx->is_dev_replace) {
1440 /*
1441 * did not find a mirror to fetch the page
1442 * from. scrub_write_page_to_dev_replace()
1443 * handles this case (page->io_error) by
1444 * filling the block with zeros before
1445 * submitting the write request
1446 */
1447 if (!sblock_other)
1448 sblock_other = sblock_bad;
1449
1450 if (scrub_write_page_to_dev_replace(sblock_other,
1451 page_num) != 0) {
1452 btrfs_dev_replace_stats_inc(
Jeff Mahoney0b246af2016-06-22 18:54:23 -04001453 &fs_info->dev_replace.num_write_errors);
Zhao Leib968fed2015-01-20 15:11:41 +08001454 success = 0;
1455 }
1456 } else if (sblock_other) {
1457 ret = scrub_repair_page_from_good_copy(sblock_bad,
1458 sblock_other,
1459 page_num, 0);
1460 if (0 == ret)
1461 page_bad->io_error = 0;
1462 else
1463 success = 0;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001464 }
1465 }
1466
Zhao Leib968fed2015-01-20 15:11:41 +08001467 if (success && !sctx->is_dev_replace) {
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001468 if (is_metadata || have_csum) {
1469 /*
1470 * need to verify the checksum now that all
1471 * sectors on disk are repaired (the write
1472 * request for data to be repaired is on its way).
1473 * Just be lazy and use scrub_recheck_block()
1474 * which re-reads the data before the checksum
1475 * is verified, but most likely the data comes out
1476 * of the page cache.
1477 */
Zhao Leiaffe4a52015-08-24 21:32:06 +08001478 scrub_recheck_block(fs_info, sblock_bad, 1);
Stefan Behrens34f5c8e2012-11-02 16:16:26 +01001479 if (!sblock_bad->header_error &&
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001480 !sblock_bad->checksum_error &&
1481 sblock_bad->no_io_error_seen)
1482 goto corrected_error;
1483 else
1484 goto did_not_correct_error;
1485 } else {
1486corrected_error:
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001487 spin_lock(&sctx->stat_lock);
1488 sctx->stat.corrected_errors++;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08001489 sblock_to_check->data_corrected = 1;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001490 spin_unlock(&sctx->stat_lock);
David Sterbab14af3b2015-10-08 10:43:10 +02001491 btrfs_err_rl_in_rcu(fs_info,
1492 "fixed up error at logical %llu on dev %s",
Geert Uytterhoevenc1c9ff72013-08-20 13:20:07 +02001493 logical, rcu_str_deref(dev->name));
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001494 }
1495 } else {
1496did_not_correct_error:
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001497 spin_lock(&sctx->stat_lock);
1498 sctx->stat.uncorrectable_errors++;
1499 spin_unlock(&sctx->stat_lock);
David Sterbab14af3b2015-10-08 10:43:10 +02001500 btrfs_err_rl_in_rcu(fs_info,
1501 "unable to fixup (regular) error at logical %llu on dev %s",
Geert Uytterhoevenc1c9ff72013-08-20 13:20:07 +02001502 logical, rcu_str_deref(dev->name));
Arne Jansena2de7332011-03-08 14:14:00 +01001503 }
1504
1505out:
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001506 if (sblocks_for_recheck) {
1507 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1508 mirror_index++) {
1509 struct scrub_block *sblock = sblocks_for_recheck +
1510 mirror_index;
Miao Xieaf8e2d12014-10-23 14:42:50 +08001511 struct scrub_recover *recover;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001512 int page_index;
1513
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001514 for (page_index = 0; page_index < sblock->page_count;
1515 page_index++) {
1516 sblock->pagev[page_index]->sblock = NULL;
Miao Xieaf8e2d12014-10-23 14:42:50 +08001517 recover = sblock->pagev[page_index]->recover;
1518 if (recover) {
Qu Wenruoe501bfe2017-03-29 09:33:22 +08001519 scrub_put_recover(fs_info, recover);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001520 sblock->pagev[page_index]->recover =
1521 NULL;
1522 }
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001523 scrub_page_put(sblock->pagev[page_index]);
1524 }
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001525 }
1526 kfree(sblocks_for_recheck);
1527 }
1528
Qu Wenruo28d70e22017-04-14 08:35:55 +08001529 ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1530 if (ret < 0)
1531 return ret;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001532 return 0;
Arne Jansena2de7332011-03-08 14:14:00 +01001533}
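
/*
 * Illustrative sketch of the per-page picking done above, in isolation and
 * without the raid56 and dev-replace special cases: for every page of the bad
 * mirror that had an I/O error, take the same page from the first mirror
 * whose copy of that page read without error. The function name is made up
 * for this example and the function is not called anywhere.
 */
static int __maybe_unused scrub_pick_good_pages_sketch(
				struct scrub_block *sblock_bad,
				struct scrub_block *sblocks_for_recheck)
{
	int success = 1;
	int page_num;
	int mirror_index;

	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
		struct scrub_block *sblock_other = NULL;

		/* pages that read fine do not need to be repaired */
		if (!sblock_bad->pagev[page_num]->io_error)
			continue;

		/* find a mirror whose copy of this page read without error */
		for (mirror_index = 0;
		     mirror_index < BTRFS_MAX_MIRRORS &&
		     sblocks_for_recheck[mirror_index].page_count > 0;
		     mirror_index++) {
			struct scrub_block *sblock = sblocks_for_recheck +
						     mirror_index;

			if (!sblock->pagev[page_num]->io_error) {
				sblock_other = sblock;
				break;
			}
		}

		/* no readable copy found, or rewriting the bad page failed */
		if (!sblock_other ||
		    scrub_repair_page_from_good_copy(sblock_bad, sblock_other,
						     page_num, 0))
			success = 0;
	}

	return success;
}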
1534
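/*
 * Number of independent ways one stripe element can be read back: for RAID5
 * the data can be read directly or rebuilt from the remaining data plus
 * parity (2), for RAID6 the second parity adds a third way (3). For all
 * other profiles the number of stripes returned by the mapping already is
 * the mirror count.
 */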
Zhao Lei8e5cfb52015-01-20 15:11:33 +08001535static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
Miao Xieaf8e2d12014-10-23 14:42:50 +08001536{
Zhao Lei10f11902015-01-20 15:11:43 +08001537 if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
1538 return 2;
1539 else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1540 return 3;
1541 else
Miao Xieaf8e2d12014-10-23 14:42:50 +08001542 return (int)bbio->num_stripes;
Miao Xieaf8e2d12014-10-23 14:42:50 +08001543}
1544
Zhao Lei10f11902015-01-20 15:11:43 +08001545static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1546 u64 *raid_map,
Miao Xieaf8e2d12014-10-23 14:42:50 +08001547 u64 mapped_length,
1548 int nstripes, int mirror,
1549 int *stripe_index,
1550 u64 *stripe_offset)
1551{
1552 int i;
1553
Zhao Leiffe2d202015-01-20 15:11:44 +08001554 if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
Miao Xieaf8e2d12014-10-23 14:42:50 +08001555 /* RAID5/6 */
1556 for (i = 0; i < nstripes; i++) {
1557 if (raid_map[i] == RAID6_Q_STRIPE ||
1558 raid_map[i] == RAID5_P_STRIPE)
1559 continue;
1560
1561 if (logical >= raid_map[i] &&
1562 logical < raid_map[i] + mapped_length)
1563 break;
1564 }
1565
1566 *stripe_index = i;
1567 *stripe_offset = logical - raid_map[i];
1568 } else {
1569 /* The other RAID type */
1570 *stripe_index = mirror;
1571 *stripe_offset = 0;
1572 }
1573}
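
/*
 * Illustrative sketch exercising the helper above for a made-up RAID5 layout
 * with two 64K data stripes followed by the parity stripe. The function name
 * and all numbers are invented for this example; real raid_map entries hold
 * full logical addresses.
 */
static void __maybe_unused scrub_stripe_index_and_offset_example(void)
{
	u64 raid_map[3] = { 0, 65536, RAID5_P_STRIPE };
	int stripe_index;
	u64 stripe_offset;

	/* 68K falls 4K into the second data stripe; the parity is skipped */
	scrub_stripe_index_and_offset(69632, BTRFS_BLOCK_GROUP_RAID5,
				      raid_map, 65536, 3, 0,
				      &stripe_index, &stripe_offset);
	/* here stripe_index == 1 and stripe_offset == 4096 */
}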
1574
Zhao Leibe50a8d2015-01-20 15:11:42 +08001575static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001576 struct scrub_block *sblocks_for_recheck)
Arne Jansena2de7332011-03-08 14:14:00 +01001577{
Zhao Leibe50a8d2015-01-20 15:11:42 +08001578 struct scrub_ctx *sctx = original_sblock->sctx;
Jeff Mahoneyfb456252016-06-22 18:54:56 -04001579 struct btrfs_fs_info *fs_info = sctx->fs_info;
Zhao Leibe50a8d2015-01-20 15:11:42 +08001580 u64 length = original_sblock->page_count * PAGE_SIZE;
1581 u64 logical = original_sblock->pagev[0]->logical;
Zhao Lei4734b7e2015-08-19 22:39:18 +08001582 u64 generation = original_sblock->pagev[0]->generation;
1583 u64 flags = original_sblock->pagev[0]->flags;
1584 u64 have_csum = original_sblock->pagev[0]->have_csum;
Miao Xieaf8e2d12014-10-23 14:42:50 +08001585 struct scrub_recover *recover;
1586 struct btrfs_bio *bbio;
Miao Xieaf8e2d12014-10-23 14:42:50 +08001587 u64 sublen;
1588 u64 mapped_length;
1589 u64 stripe_offset;
1590 int stripe_index;
Zhao Leibe50a8d2015-01-20 15:11:42 +08001591 int page_index = 0;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001592 int mirror_index;
Miao Xieaf8e2d12014-10-23 14:42:50 +08001593 int nmirrors;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001594 int ret;
1595
1596 /*
Zhao Lei57019342015-01-20 15:11:45 +08001597 * note: the two members refs and outstanding_pages
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001598 * are not used (and not set) in the blocks that are used for
1599 * the recheck procedure
1600 */
1601
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001602 while (length > 0) {
Miao Xieaf8e2d12014-10-23 14:42:50 +08001603 sublen = min_t(u64, length, PAGE_SIZE);
1604 mapped_length = sublen;
1605 bbio = NULL;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001606
1607 /*
1608 * with a length of PAGE_SIZE, each returned stripe
1609 * represents one mirror
1610 */
Qu Wenruoe501bfe2017-03-29 09:33:22 +08001611 btrfs_bio_counter_inc_blocked(fs_info);
Christoph Hellwigcf8cddd2016-10-27 09:27:36 +02001612 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
David Sterba825ad4c2017-03-28 14:45:22 +02001613 logical, &mapped_length, &bbio);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001614 if (ret || !bbio || mapped_length < sublen) {
Zhao Lei6e9606d2015-01-20 15:11:34 +08001615 btrfs_put_bbio(bbio);
Qu Wenruoe501bfe2017-03-29 09:33:22 +08001616 btrfs_bio_counter_dec(fs_info);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001617 return -EIO;
1618 }
1619
Miao Xieaf8e2d12014-10-23 14:42:50 +08001620 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1621 if (!recover) {
Zhao Lei6e9606d2015-01-20 15:11:34 +08001622 btrfs_put_bbio(bbio);
Qu Wenruoe501bfe2017-03-29 09:33:22 +08001623 btrfs_bio_counter_dec(fs_info);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001624 return -ENOMEM;
1625 }
1626
Elena Reshetova6f615012017-03-03 10:55:21 +02001627 refcount_set(&recover->refs, 1);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001628 recover->bbio = bbio;
Miao Xieaf8e2d12014-10-23 14:42:50 +08001629 recover->map_length = mapped_length;
1630
Ashish Samant24731142016-04-29 18:33:59 -07001631 BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001632
Zhao Leibe50a8d2015-01-20 15:11:42 +08001633 nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
Zhao Lei10f11902015-01-20 15:11:43 +08001634
Miao Xieaf8e2d12014-10-23 14:42:50 +08001635 for (mirror_index = 0; mirror_index < nmirrors;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001636 mirror_index++) {
1637 struct scrub_block *sblock;
1638 struct scrub_page *page;
1639
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001640 sblock = sblocks_for_recheck + mirror_index;
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001641 sblock->sctx = sctx;
Zhao Lei4734b7e2015-08-19 22:39:18 +08001642
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001643 page = kzalloc(sizeof(*page), GFP_NOFS);
1644 if (!page) {
1645leave_nomem:
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001646 spin_lock(&sctx->stat_lock);
1647 sctx->stat.malloc_errors++;
1648 spin_unlock(&sctx->stat_lock);
Qu Wenruoe501bfe2017-03-29 09:33:22 +08001649 scrub_put_recover(fs_info, recover);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001650 return -ENOMEM;
1651 }
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001652 scrub_page_get(page);
1653 sblock->pagev[page_index] = page;
Zhao Lei4734b7e2015-08-19 22:39:18 +08001654 page->sblock = sblock;
1655 page->flags = flags;
1656 page->generation = generation;
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001657 page->logical = logical;
Zhao Lei4734b7e2015-08-19 22:39:18 +08001658 page->have_csum = have_csum;
1659 if (have_csum)
1660 memcpy(page->csum,
1661 original_sblock->pagev[0]->csum,
1662 sctx->csum_size);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001663
Zhao Lei10f11902015-01-20 15:11:43 +08001664 scrub_stripe_index_and_offset(logical,
1665 bbio->map_type,
1666 bbio->raid_map,
Miao Xieaf8e2d12014-10-23 14:42:50 +08001667 mapped_length,
Zhao Leie34c3302015-01-20 15:11:31 +08001668 bbio->num_stripes -
1669 bbio->num_tgtdevs,
Miao Xieaf8e2d12014-10-23 14:42:50 +08001670 mirror_index,
1671 &stripe_index,
1672 &stripe_offset);
1673 page->physical = bbio->stripes[stripe_index].physical +
1674 stripe_offset;
1675 page->dev = bbio->stripes[stripe_index].dev;
1676
Stefan Behrensff023aa2012-11-06 11:43:11 +01001677 BUG_ON(page_index >= original_sblock->page_count);
1678 page->physical_for_dev_replace =
1679 original_sblock->pagev[page_index]->
1680 physical_for_dev_replace;
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001681 /* for missing devices, dev->bdev is NULL */
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001682 page->mirror_num = mirror_index + 1;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001683 sblock->page_count++;
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001684 page->page = alloc_page(GFP_NOFS);
1685 if (!page->page)
1686 goto leave_nomem;
Miao Xieaf8e2d12014-10-23 14:42:50 +08001687
1688 scrub_get_recover(recover);
1689 page->recover = recover;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001690 }
Qu Wenruoe501bfe2017-03-29 09:33:22 +08001691 scrub_put_recover(fs_info, recover);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001692 length -= sublen;
1693 logical += sublen;
1694 page_index++;
1695 }
1696
1697 return 0;
1698}
1699
Christoph Hellwig4246a0b2015-07-20 15:29:37 +02001700static void scrub_bio_wait_endio(struct bio *bio)
Miao Xieaf8e2d12014-10-23 14:42:50 +08001701{
Liu Bob4ff5ad2017-11-30 17:26:39 -07001702 complete(bio->bi_private);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001703}
1704
Miao Xieaf8e2d12014-10-23 14:42:50 +08001705static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1706 struct bio *bio,
1707 struct scrub_page *page)
1708{
Liu Bob4ff5ad2017-11-30 17:26:39 -07001709 DECLARE_COMPLETION_ONSTACK(done);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001710 int ret;
Liu Bo762221f2018-01-02 13:36:42 -07001711 int mirror_num;
Miao Xieaf8e2d12014-10-23 14:42:50 +08001712
Miao Xieaf8e2d12014-10-23 14:42:50 +08001713 bio->bi_iter.bi_sector = page->logical >> 9;
1714 bio->bi_private = &done;
1715 bio->bi_end_io = scrub_bio_wait_endio;
1716
Liu Bo762221f2018-01-02 13:36:42 -07001717 mirror_num = page->sblock->pagev[0]->mirror_num;
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04001718 ret = raid56_parity_recover(fs_info, bio, page->recover->bbio,
Miao Xieaf8e2d12014-10-23 14:42:50 +08001719 page->recover->map_length,
Liu Bo762221f2018-01-02 13:36:42 -07001720 mirror_num, 0);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001721 if (ret)
1722 return ret;
1723
Liu Bob4ff5ad2017-11-30 17:26:39 -07001724 wait_for_completion_io(&done);
1725 return blk_status_to_errno(bio->bi_status);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001726}
1727
Liu Bo6ca17652018-03-07 12:08:09 -07001728static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
1729 struct scrub_block *sblock)
1730{
1731 struct scrub_page *first_page = sblock->pagev[0];
1732 struct bio *bio;
1733 int page_num;
1734
1735 /* All pages in sblock belong to the same stripe on the same device. */
1736 ASSERT(first_page->dev);
1737 if (!first_page->dev->bdev)
1738 goto out;
1739
1740 bio = btrfs_io_bio_alloc(BIO_MAX_PAGES);
1741 bio_set_dev(bio, first_page->dev->bdev);
1742
1743 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1744 struct scrub_page *page = sblock->pagev[page_num];
1745
1746 WARN_ON(!page->page);
1747 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1748 }
1749
1750 if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
1751 bio_put(bio);
1752 goto out;
1753 }
1754
1755 bio_put(bio);
1756
1757 scrub_recheck_block_checksum(sblock);
1758
1759 return;
1760out:
1761 for (page_num = 0; page_num < sblock->page_count; page_num++)
1762 sblock->pagev[page_num]->io_error = 1;
1763
1764 sblock->no_io_error_seen = 0;
1765}
1766
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001767/*
1768 * this function will check the on disk data for checksum errors, header
1769 * errors and read I/O errors. If any I/O error happens, the exact pages
1770 * that errored are marked as bad. The goal is to enable scrub to take
1771 * the non-errored pages from all the mirrors so that the pages that
1772 * errored in the just handled mirror can be repaired.
1773 */
Stefan Behrens34f5c8e2012-11-02 16:16:26 +01001774static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
Zhao Leiaffe4a52015-08-24 21:32:06 +08001775 struct scrub_block *sblock,
1776 int retry_failed_mirror)
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001777{
1778 int page_num;
1779
1780 sblock->no_io_error_seen = 1;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001781
Liu Bo6ca17652018-03-07 12:08:09 -07001782 /* shortcut for raid56: recheck the block via the raid56 recovery path */
1783 if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0]))
1784 return scrub_recheck_block_on_raid56(fs_info, sblock);
1785
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001786 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1787 struct bio *bio;
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001788 struct scrub_page *page = sblock->pagev[page_num];
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001789
Stefan Behrens442a4f62012-05-25 16:06:08 +02001790 if (page->dev->bdev == NULL) {
Stefan Behrensea9947b2012-05-04 15:16:07 -04001791 page->io_error = 1;
1792 sblock->no_io_error_seen = 0;
1793 continue;
1794 }
1795
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001796 WARN_ON(!page->page);
David Sterbac5e4c3d2017-06-12 17:29:41 +02001797 bio = btrfs_io_bio_alloc(1);
Christoph Hellwig74d46992017-08-23 19:10:32 +02001798 bio_set_dev(bio, page->dev->bdev);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001799
Stefan Behrens34f5c8e2012-11-02 16:16:26 +01001800 bio_add_page(bio, page->page, PAGE_SIZE, 0);
Liu Bo6ca17652018-03-07 12:08:09 -07001801 bio->bi_iter.bi_sector = page->physical >> 9;
1802 bio->bi_opf = REQ_OP_READ;
Miao Xieaf8e2d12014-10-23 14:42:50 +08001803
Liu Bo6ca17652018-03-07 12:08:09 -07001804 if (btrfsic_submit_bio_wait(bio)) {
1805 page->io_error = 1;
1806 sblock->no_io_error_seen = 0;
Miao Xieaf8e2d12014-10-23 14:42:50 +08001807 }
Kent Overstreet33879d42013-11-23 22:33:32 -08001808
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001809 bio_put(bio);
1810 }
1811
1812 if (sblock->no_io_error_seen)
Zhao Leiba7cf982015-08-24 21:18:02 +08001813 scrub_recheck_block_checksum(sblock);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001814}
1815
Miao Xie17a9be22014-07-24 11:37:08 +08001816static inline int scrub_check_fsid(u8 fsid[],
1817 struct scrub_page *spage)
1818{
1819 struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1820 int ret;
1821
Anand Jain44880fd2017-07-29 17:50:09 +08001822 ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
Miao Xie17a9be22014-07-24 11:37:08 +08001823 return !ret;
1824}
1825
Zhao Leiba7cf982015-08-24 21:18:02 +08001826static void scrub_recheck_block_checksum(struct scrub_block *sblock)
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001827{
Zhao Leiba7cf982015-08-24 21:18:02 +08001828 sblock->header_error = 0;
1829 sblock->checksum_error = 0;
1830 sblock->generation_error = 0;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001831
Zhao Leiba7cf982015-08-24 21:18:02 +08001832 if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1833 scrub_checksum_data(sblock);
1834 else
1835 scrub_checksum_tree_block(sblock);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001836}
1837
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001838static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
Zhao Lei114ab502015-01-20 15:11:36 +08001839 struct scrub_block *sblock_good)
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001840{
1841 int page_num;
1842 int ret = 0;
1843
1844 for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1845 int ret_sub;
1846
1847 ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1848 sblock_good,
Zhao Lei114ab502015-01-20 15:11:36 +08001849 page_num, 1);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001850 if (ret_sub)
1851 ret = ret_sub;
1852 }
1853
1854 return ret;
1855}
1856
1857static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1858 struct scrub_block *sblock_good,
1859 int page_num, int force_write)
1860{
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001861 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1862 struct scrub_page *page_good = sblock_good->pagev[page_num];
Jeff Mahoney0b246af2016-06-22 18:54:23 -04001863 struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001864
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001865 BUG_ON(page_bad->page == NULL);
1866 BUG_ON(page_good->page == NULL);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001867 if (force_write || sblock_bad->header_error ||
1868 sblock_bad->checksum_error || page_bad->io_error) {
1869 struct bio *bio;
1870 int ret;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001871
Stefan Behrensff023aa2012-11-06 11:43:11 +01001872 if (!page_bad->dev->bdev) {
Jeff Mahoney0b246af2016-06-22 18:54:23 -04001873 btrfs_warn_rl(fs_info,
Jeff Mahoney5d163e02016-09-20 10:05:00 -04001874 "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
Stefan Behrensff023aa2012-11-06 11:43:11 +01001875 return -EIO;
1876 }
1877
David Sterbac5e4c3d2017-06-12 17:29:41 +02001878 bio = btrfs_io_bio_alloc(1);
Christoph Hellwig74d46992017-08-23 19:10:32 +02001879 bio_set_dev(bio, page_bad->dev->bdev);
Kent Overstreet4f024f32013-10-11 15:44:27 -07001880 bio->bi_iter.bi_sector = page_bad->physical >> 9;
Mike Christie37226b22016-06-05 14:31:52 -05001881 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001882
1883 ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1884 if (PAGE_SIZE != ret) {
1885 bio_put(bio);
1886 return -EIO;
1887 }
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001888
Mike Christie4e49ea42016-06-05 14:31:41 -05001889 if (btrfsic_submit_bio_wait(bio)) {
Stefan Behrens442a4f62012-05-25 16:06:08 +02001890 btrfs_dev_stat_inc_and_print(page_bad->dev,
1891 BTRFS_DEV_STAT_WRITE_ERRS);
Stefan Behrensff023aa2012-11-06 11:43:11 +01001892 btrfs_dev_replace_stats_inc(
Jeff Mahoney0b246af2016-06-22 18:54:23 -04001893 &fs_info->dev_replace.num_write_errors);
Stefan Behrens442a4f62012-05-25 16:06:08 +02001894 bio_put(bio);
1895 return -EIO;
1896 }
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001897 bio_put(bio);
1898 }
1899
1900 return 0;
1901}
1902
Stefan Behrensff023aa2012-11-06 11:43:11 +01001903static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1904{
Jeff Mahoney0b246af2016-06-22 18:54:23 -04001905 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
Stefan Behrensff023aa2012-11-06 11:43:11 +01001906 int page_num;
1907
Miao Xie5a6ac9e2014-11-06 17:20:58 +08001908 /*
1909 * This block is used for checking the parity on the source device,
1910 * so the data needn't be written into the destination device.
1911 */
1912 if (sblock->sparity)
1913 return;
1914
Stefan Behrensff023aa2012-11-06 11:43:11 +01001915 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1916 int ret;
1917
1918 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1919 if (ret)
1920 btrfs_dev_replace_stats_inc(
Jeff Mahoney0b246af2016-06-22 18:54:23 -04001921 &fs_info->dev_replace.num_write_errors);
Stefan Behrensff023aa2012-11-06 11:43:11 +01001922 }
1923}
1924
1925static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1926 int page_num)
1927{
1928 struct scrub_page *spage = sblock->pagev[page_num];
1929
1930 BUG_ON(spage->page == NULL);
1931 if (spage->io_error) {
1932 void *mapped_buffer = kmap_atomic(spage->page);
1933
David Sterba619a9742017-03-29 20:48:44 +02001934 clear_page(mapped_buffer);
Stefan Behrensff023aa2012-11-06 11:43:11 +01001935 flush_dcache_page(spage->page);
1936 kunmap_atomic(mapped_buffer);
1937 }
1938 return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1939}
1940
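/*
 * Queue one page for writing to the dev-replace target. Pages are collected
 * in sctx->wr_curr_bio as long as they are physically and logically
 * contiguous; the bio is submitted once it is full or when the next page
 * does not continue the run.
 */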
1941static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1942 struct scrub_page *spage)
1943{
Stefan Behrensff023aa2012-11-06 11:43:11 +01001944 struct scrub_bio *sbio;
1945 int ret;
1946
David Sterba3fb99302017-05-16 19:10:32 +02001947 mutex_lock(&sctx->wr_lock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01001948again:
David Sterba3fb99302017-05-16 19:10:32 +02001949 if (!sctx->wr_curr_bio) {
1950 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
David Sterba58c4e172016-02-11 10:49:42 +01001951 GFP_KERNEL);
David Sterba3fb99302017-05-16 19:10:32 +02001952 if (!sctx->wr_curr_bio) {
1953 mutex_unlock(&sctx->wr_lock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01001954 return -ENOMEM;
1955 }
David Sterba3fb99302017-05-16 19:10:32 +02001956 sctx->wr_curr_bio->sctx = sctx;
1957 sctx->wr_curr_bio->page_count = 0;
Stefan Behrensff023aa2012-11-06 11:43:11 +01001958 }
David Sterba3fb99302017-05-16 19:10:32 +02001959 sbio = sctx->wr_curr_bio;
Stefan Behrensff023aa2012-11-06 11:43:11 +01001960 if (sbio->page_count == 0) {
1961 struct bio *bio;
1962
1963 sbio->physical = spage->physical_for_dev_replace;
1964 sbio->logical = spage->logical;
David Sterba3fb99302017-05-16 19:10:32 +02001965 sbio->dev = sctx->wr_tgtdev;
Stefan Behrensff023aa2012-11-06 11:43:11 +01001966 bio = sbio->bio;
1967 if (!bio) {
David Sterbac5e4c3d2017-06-12 17:29:41 +02001968 bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio);
Stefan Behrensff023aa2012-11-06 11:43:11 +01001969 sbio->bio = bio;
1970 }
1971
1972 bio->bi_private = sbio;
1973 bio->bi_end_io = scrub_wr_bio_end_io;
Christoph Hellwig74d46992017-08-23 19:10:32 +02001974 bio_set_dev(bio, sbio->dev->bdev);
Kent Overstreet4f024f32013-10-11 15:44:27 -07001975 bio->bi_iter.bi_sector = sbio->physical >> 9;
Mike Christie37226b22016-06-05 14:31:52 -05001976 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
Christoph Hellwig4e4cbee2017-06-03 09:38:06 +02001977 sbio->status = 0;
Stefan Behrensff023aa2012-11-06 11:43:11 +01001978 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1979 spage->physical_for_dev_replace ||
1980 sbio->logical + sbio->page_count * PAGE_SIZE !=
1981 spage->logical) {
1982 scrub_wr_submit(sctx);
1983 goto again;
1984 }
1985
1986 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1987 if (ret != PAGE_SIZE) {
1988 if (sbio->page_count < 1) {
1989 bio_put(sbio->bio);
1990 sbio->bio = NULL;
David Sterba3fb99302017-05-16 19:10:32 +02001991 mutex_unlock(&sctx->wr_lock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01001992 return -EIO;
1993 }
1994 scrub_wr_submit(sctx);
1995 goto again;
1996 }
1997
1998 sbio->pagev[sbio->page_count] = spage;
1999 scrub_page_get(spage);
2000 sbio->page_count++;
David Sterba3fb99302017-05-16 19:10:32 +02002001 if (sbio->page_count == sctx->pages_per_wr_bio)
Stefan Behrensff023aa2012-11-06 11:43:11 +01002002 scrub_wr_submit(sctx);
David Sterba3fb99302017-05-16 19:10:32 +02002003 mutex_unlock(&sctx->wr_lock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01002004
2005 return 0;
2006}
2007
2008static void scrub_wr_submit(struct scrub_ctx *sctx)
2009{
Stefan Behrensff023aa2012-11-06 11:43:11 +01002010 struct scrub_bio *sbio;
2011
David Sterba3fb99302017-05-16 19:10:32 +02002012 if (!sctx->wr_curr_bio)
Stefan Behrensff023aa2012-11-06 11:43:11 +01002013 return;
2014
David Sterba3fb99302017-05-16 19:10:32 +02002015 sbio = sctx->wr_curr_bio;
2016 sctx->wr_curr_bio = NULL;
Christoph Hellwig74d46992017-08-23 19:10:32 +02002017 WARN_ON(!sbio->bio->bi_disk);
Stefan Behrensff023aa2012-11-06 11:43:11 +01002018 scrub_pending_bio_inc(sctx);
2019 /* process all writes in a single worker thread. The block layer then
2020 * orders the requests before sending them to the driver, which
2021 * doubled the write performance on spinning disks when measured
2022 * with Linux 3.5 */
Mike Christie4e49ea42016-06-05 14:31:41 -05002023 btrfsic_submit_bio(sbio->bio);
Stefan Behrensff023aa2012-11-06 11:43:11 +01002024}
2025
Christoph Hellwig4246a0b2015-07-20 15:29:37 +02002026static void scrub_wr_bio_end_io(struct bio *bio)
Stefan Behrensff023aa2012-11-06 11:43:11 +01002027{
2028 struct scrub_bio *sbio = bio->bi_private;
Jeff Mahoneyfb456252016-06-22 18:54:56 -04002029 struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
Stefan Behrensff023aa2012-11-06 11:43:11 +01002030
Christoph Hellwig4e4cbee2017-06-03 09:38:06 +02002031 sbio->status = bio->bi_status;
Stefan Behrensff023aa2012-11-06 11:43:11 +01002032 sbio->bio = bio;
2033
Liu Bo9e0af232014-08-15 23:36:53 +08002034 btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
2035 scrub_wr_bio_end_io_worker, NULL, NULL);
Qu Wenruo0339ef22014-02-28 10:46:17 +08002036 btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
Stefan Behrensff023aa2012-11-06 11:43:11 +01002037}
2038
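/*
 * Runs in the scrub_wr_completion_workers thread: if the write bio failed,
 * mark every page it carried as errored and account them as dev-replace
 * write errors, then drop the page references and the bio and decrease the
 * pending-bio count.
 */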
2039static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
2040{
2041 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2042 struct scrub_ctx *sctx = sbio->sctx;
2043 int i;
2044
2045 WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
Christoph Hellwig4e4cbee2017-06-03 09:38:06 +02002046 if (sbio->status) {
Stefan Behrensff023aa2012-11-06 11:43:11 +01002047 struct btrfs_dev_replace *dev_replace =
Jeff Mahoneyfb456252016-06-22 18:54:56 -04002048 &sbio->sctx->fs_info->dev_replace;
Stefan Behrensff023aa2012-11-06 11:43:11 +01002049
2050 for (i = 0; i < sbio->page_count; i++) {
2051 struct scrub_page *spage = sbio->pagev[i];
2052
2053 spage->io_error = 1;
2054 btrfs_dev_replace_stats_inc(&dev_replace->
2055 num_write_errors);
2056 }
2057 }
2058
2059 for (i = 0; i < sbio->page_count; i++)
2060 scrub_page_put(sbio->pagev[i]);
2061
2062 bio_put(sbio->bio);
2063 kfree(sbio);
2064 scrub_pending_bio_dec(sctx);
2065}
2066
2067static int scrub_checksum(struct scrub_block *sblock)
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002068{
2069 u64 flags;
2070 int ret;
2071
Zhao Leiba7cf982015-08-24 21:18:02 +08002072 /*
2073 * No need to initialize these stats currently,
2074 * because this function only uses the return value
2075 * instead of these stats values.
2076 *
2077 * Todo:
2078 * always use stats
2079 */
2080 sblock->header_error = 0;
2081 sblock->generation_error = 0;
2082 sblock->checksum_error = 0;
2083
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002084 WARN_ON(sblock->page_count < 1);
2085 flags = sblock->pagev[0]->flags;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002086 ret = 0;
2087 if (flags & BTRFS_EXTENT_FLAG_DATA)
2088 ret = scrub_checksum_data(sblock);
2089 else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
2090 ret = scrub_checksum_tree_block(sblock);
2091 else if (flags & BTRFS_EXTENT_FLAG_SUPER)
2092 (void)scrub_checksum_super(sblock);
2093 else
2094 WARN_ON(1);
2095 if (ret)
2096 scrub_handle_errored_block(sblock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01002097
2098 return ret;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002099}
2100
2101static int scrub_checksum_data(struct scrub_block *sblock)
2102{
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002103 struct scrub_ctx *sctx = sblock->sctx;
Arne Jansena2de7332011-03-08 14:14:00 +01002104 u8 csum[BTRFS_CSUM_SIZE];
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002105 u8 *on_disk_csum;
2106 struct page *page;
2107 void *buffer;
Arne Jansena2de7332011-03-08 14:14:00 +01002108 u32 crc = ~(u32)0;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002109 u64 len;
2110 int index;
Arne Jansena2de7332011-03-08 14:14:00 +01002111
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002112 BUG_ON(sblock->page_count < 1);
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002113 if (!sblock->pagev[0]->have_csum)
Arne Jansena2de7332011-03-08 14:14:00 +01002114 return 0;
2115
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002116 on_disk_csum = sblock->pagev[0]->csum;
2117 page = sblock->pagev[0]->page;
Linus Torvalds9613beb2012-03-30 12:44:29 -07002118 buffer = kmap_atomic(page);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002119
David Sterba25cc1222017-05-16 19:10:41 +02002120 len = sctx->fs_info->sectorsize;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002121 index = 0;
2122 for (;;) {
2123 u64 l = min_t(u64, len, PAGE_SIZE);
2124
Liu Bob0496682013-03-14 14:57:45 +00002125 crc = btrfs_csum_data(buffer, crc, l);
Linus Torvalds9613beb2012-03-30 12:44:29 -07002126 kunmap_atomic(buffer);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002127 len -= l;
2128 if (len == 0)
2129 break;
2130 index++;
2131 BUG_ON(index >= sblock->page_count);
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002132 BUG_ON(!sblock->pagev[index]->page);
2133 page = sblock->pagev[index]->page;
Linus Torvalds9613beb2012-03-30 12:44:29 -07002134 buffer = kmap_atomic(page);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002135 }
2136
Arne Jansena2de7332011-03-08 14:14:00 +01002137 btrfs_csum_final(crc, csum);
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002138 if (memcmp(csum, on_disk_csum, sctx->csum_size))
Zhao Leiba7cf982015-08-24 21:18:02 +08002139 sblock->checksum_error = 1;
Arne Jansena2de7332011-03-08 14:14:00 +01002140
Zhao Leiba7cf982015-08-24 21:18:02 +08002141 return sblock->checksum_error;
Arne Jansena2de7332011-03-08 14:14:00 +01002142}
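
/*
 * Illustrative sketch of the data checksum above for the common case of a
 * sector that fits into the first page, without the page-crossing loop. The
 * function name is made up for this example and the sketch assumes
 * sectorsize <= PAGE_SIZE.
 */
static int __maybe_unused scrub_checksum_data_single_page_sketch(
					struct scrub_block *sblock)
{
	struct scrub_ctx *sctx = sblock->sctx;
	u8 csum[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;
	void *buffer;

	if (!sblock->pagev[0]->have_csum)
		return 0;

	buffer = kmap_atomic(sblock->pagev[0]->page);
	crc = btrfs_csum_data(buffer, crc, sctx->fs_info->sectorsize);
	kunmap_atomic(buffer);
	btrfs_csum_final(crc, csum);

	/* a mismatch against the checksum stored at write time is an error */
	return memcmp(csum, sblock->pagev[0]->csum, sctx->csum_size) ? 1 : 0;
}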
2143
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002144static int scrub_checksum_tree_block(struct scrub_block *sblock)
Arne Jansena2de7332011-03-08 14:14:00 +01002145{
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002146 struct scrub_ctx *sctx = sblock->sctx;
Arne Jansena2de7332011-03-08 14:14:00 +01002147 struct btrfs_header *h;
Jeff Mahoney0b246af2016-06-22 18:54:23 -04002148 struct btrfs_fs_info *fs_info = sctx->fs_info;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002149 u8 calculated_csum[BTRFS_CSUM_SIZE];
2150 u8 on_disk_csum[BTRFS_CSUM_SIZE];
2151 struct page *page;
2152 void *mapped_buffer;
2153 u64 mapped_size;
2154 void *p;
Arne Jansena2de7332011-03-08 14:14:00 +01002155 u32 crc = ~(u32)0;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002156 u64 len;
2157 int index;
2158
2159 BUG_ON(sblock->page_count < 1);
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002160 page = sblock->pagev[0]->page;
Linus Torvalds9613beb2012-03-30 12:44:29 -07002161 mapped_buffer = kmap_atomic(page);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002162 h = (struct btrfs_header *)mapped_buffer;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002163 memcpy(on_disk_csum, h->csum, sctx->csum_size);
Arne Jansena2de7332011-03-08 14:14:00 +01002164
2165 /*
2166 * we don't use the getter functions here, as we
2167 * a) don't have an extent buffer and
2168 * b) the page is already kmapped
2169 */
Qu Wenruo3cae2102013-07-16 11:19:18 +08002170 if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
Zhao Leiba7cf982015-08-24 21:18:02 +08002171 sblock->header_error = 1;
Arne Jansena2de7332011-03-08 14:14:00 +01002172
Zhao Leiba7cf982015-08-24 21:18:02 +08002173 if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) {
2174 sblock->header_error = 1;
2175 sblock->generation_error = 1;
2176 }
Arne Jansena2de7332011-03-08 14:14:00 +01002177
Miao Xie17a9be22014-07-24 11:37:08 +08002178 if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
Zhao Leiba7cf982015-08-24 21:18:02 +08002179 sblock->header_error = 1;
Arne Jansena2de7332011-03-08 14:14:00 +01002180
2181 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
2182 BTRFS_UUID_SIZE))
Zhao Leiba7cf982015-08-24 21:18:02 +08002183 sblock->header_error = 1;
Arne Jansena2de7332011-03-08 14:14:00 +01002184
David Sterba25cc1222017-05-16 19:10:41 +02002185 len = sctx->fs_info->nodesize - BTRFS_CSUM_SIZE;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002186 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
2187 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
2188 index = 0;
2189 for (;;) {
2190 u64 l = min_t(u64, len, mapped_size);
2191
Liu Bob0496682013-03-14 14:57:45 +00002192 crc = btrfs_csum_data(p, crc, l);
Linus Torvalds9613beb2012-03-30 12:44:29 -07002193 kunmap_atomic(mapped_buffer);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002194 len -= l;
2195 if (len == 0)
2196 break;
2197 index++;
2198 BUG_ON(index >= sblock->page_count);
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002199 BUG_ON(!sblock->pagev[index]->page);
2200 page = sblock->pagev[index]->page;
Linus Torvalds9613beb2012-03-30 12:44:29 -07002201 mapped_buffer = kmap_atomic(page);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002202 mapped_size = PAGE_SIZE;
2203 p = mapped_buffer;
2204 }
2205
2206 btrfs_csum_final(crc, calculated_csum);
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002207 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
Zhao Leiba7cf982015-08-24 21:18:02 +08002208 sblock->checksum_error = 1;
Arne Jansena2de7332011-03-08 14:14:00 +01002209
Zhao Leiba7cf982015-08-24 21:18:02 +08002210 return sblock->header_error || sblock->checksum_error;
Arne Jansena2de7332011-03-08 14:14:00 +01002211}
2212
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002213static int scrub_checksum_super(struct scrub_block *sblock)
Arne Jansena2de7332011-03-08 14:14:00 +01002214{
2215 struct btrfs_super_block *s;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002216 struct scrub_ctx *sctx = sblock->sctx;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002217 u8 calculated_csum[BTRFS_CSUM_SIZE];
2218 u8 on_disk_csum[BTRFS_CSUM_SIZE];
2219 struct page *page;
2220 void *mapped_buffer;
2221 u64 mapped_size;
2222 void *p;
Arne Jansena2de7332011-03-08 14:14:00 +01002223 u32 crc = ~(u32)0;
Stefan Behrens442a4f62012-05-25 16:06:08 +02002224 int fail_gen = 0;
2225 int fail_cor = 0;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002226 u64 len;
2227 int index;
Arne Jansena2de7332011-03-08 14:14:00 +01002228
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002229 BUG_ON(sblock->page_count < 1);
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002230 page = sblock->pagev[0]->page;
Linus Torvalds9613beb2012-03-30 12:44:29 -07002231 mapped_buffer = kmap_atomic(page);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002232 s = (struct btrfs_super_block *)mapped_buffer;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002233 memcpy(on_disk_csum, s->csum, sctx->csum_size);
Arne Jansena2de7332011-03-08 14:14:00 +01002234
Qu Wenruo3cae2102013-07-16 11:19:18 +08002235 if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
Stefan Behrens442a4f62012-05-25 16:06:08 +02002236 ++fail_cor;
Arne Jansena2de7332011-03-08 14:14:00 +01002237
Qu Wenruo3cae2102013-07-16 11:19:18 +08002238 if (sblock->pagev[0]->generation != btrfs_super_generation(s))
Stefan Behrens442a4f62012-05-25 16:06:08 +02002239 ++fail_gen;
Arne Jansena2de7332011-03-08 14:14:00 +01002240
Miao Xie17a9be22014-07-24 11:37:08 +08002241 if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
Stefan Behrens442a4f62012-05-25 16:06:08 +02002242 ++fail_cor;
Arne Jansena2de7332011-03-08 14:14:00 +01002243
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002244 len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
2245 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
2246 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
2247 index = 0;
2248 for (;;) {
2249 u64 l = min_t(u64, len, mapped_size);
2250
Liu Bob0496682013-03-14 14:57:45 +00002251 crc = btrfs_csum_data(p, crc, l);
Linus Torvalds9613beb2012-03-30 12:44:29 -07002252 kunmap_atomic(mapped_buffer);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002253 len -= l;
2254 if (len == 0)
2255 break;
2256 index++;
2257 BUG_ON(index >= sblock->page_count);
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002258 BUG_ON(!sblock->pagev[index]->page);
2259 page = sblock->pagev[index]->page;
Linus Torvalds9613beb2012-03-30 12:44:29 -07002260 mapped_buffer = kmap_atomic(page);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002261 mapped_size = PAGE_SIZE;
2262 p = mapped_buffer;
2263 }
2264
2265 btrfs_csum_final(crc, calculated_csum);
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002266 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
Stefan Behrens442a4f62012-05-25 16:06:08 +02002267 ++fail_cor;
Arne Jansena2de7332011-03-08 14:14:00 +01002268
Stefan Behrens442a4f62012-05-25 16:06:08 +02002269 if (fail_cor + fail_gen) {
Arne Jansena2de7332011-03-08 14:14:00 +01002270 /*
2271 * if we find an error in a super block, we just report it.
2272 * Super blocks get rewritten with the next transaction commit
2273 * anyway.
2274 */
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002275 spin_lock(&sctx->stat_lock);
2276 ++sctx->stat.super_errors;
2277 spin_unlock(&sctx->stat_lock);
Stefan Behrens442a4f62012-05-25 16:06:08 +02002278 if (fail_cor)
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002279 btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
Stefan Behrens442a4f62012-05-25 16:06:08 +02002280 BTRFS_DEV_STAT_CORRUPTION_ERRS);
2281 else
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002282 btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
Stefan Behrens442a4f62012-05-25 16:06:08 +02002283 BTRFS_DEV_STAT_GENERATION_ERRS);
Arne Jansena2de7332011-03-08 14:14:00 +01002284 }
2285
Stefan Behrens442a4f62012-05-25 16:06:08 +02002286 return fail_cor + fail_gen;
Arne Jansena2de7332011-03-08 14:14:00 +01002287}
2288
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002289static void scrub_block_get(struct scrub_block *sblock)
2290{
Elena Reshetova186debd2017-03-03 10:55:23 +02002291 refcount_inc(&sblock->refs);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002292}
2293
2294static void scrub_block_put(struct scrub_block *sblock)
2295{
Elena Reshetova186debd2017-03-03 10:55:23 +02002296 if (refcount_dec_and_test(&sblock->refs)) {
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002297 int i;
2298
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002299 if (sblock->sparity)
2300 scrub_parity_put(sblock->sparity);
2301
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002302 for (i = 0; i < sblock->page_count; i++)
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002303 scrub_page_put(sblock->pagev[i]);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002304 kfree(sblock);
2305 }
2306}
2307
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002308static void scrub_page_get(struct scrub_page *spage)
2309{
Zhao Lei57019342015-01-20 15:11:45 +08002310 atomic_inc(&spage->refs);
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002311}
2312
2313static void scrub_page_put(struct scrub_page *spage)
2314{
Zhao Lei57019342015-01-20 15:11:45 +08002315 if (atomic_dec_and_test(&spage->refs)) {
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002316 if (spage->page)
2317 __free_page(spage->page);
2318 kfree(spage);
2319 }
2320}
2321
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002322static void scrub_submit(struct scrub_ctx *sctx)
Arne Jansena2de7332011-03-08 14:14:00 +01002323{
2324 struct scrub_bio *sbio;
2325
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002326 if (sctx->curr == -1)
Stefan Behrens1623ede2012-03-27 14:21:26 -04002327 return;
Arne Jansena2de7332011-03-08 14:14:00 +01002328
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002329 sbio = sctx->bios[sctx->curr];
2330 sctx->curr = -1;
Stefan Behrensb6bfebc2012-11-02 16:44:58 +01002331 scrub_pending_bio_inc(sctx);
Mike Christie4e49ea42016-06-05 14:31:41 -05002332 btrfsic_submit_bio(sbio->bio);
Arne Jansena2de7332011-03-08 14:14:00 +01002333}
2334
Stefan Behrensff023aa2012-11-06 11:43:11 +01002335static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
2336 struct scrub_page *spage)
Arne Jansena2de7332011-03-08 14:14:00 +01002337{
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002338 struct scrub_block *sblock = spage->sblock;
Arne Jansena2de7332011-03-08 14:14:00 +01002339 struct scrub_bio *sbio;
Arne Jansen69f4cb52011-11-11 08:17:10 -05002340 int ret;
Arne Jansena2de7332011-03-08 14:14:00 +01002341
2342again:
2343 /*
2344 * grab a fresh bio or wait for one to become available
2345 */
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002346 while (sctx->curr == -1) {
2347 spin_lock(&sctx->list_lock);
2348 sctx->curr = sctx->first_free;
2349 if (sctx->curr != -1) {
2350 sctx->first_free = sctx->bios[sctx->curr]->next_free;
2351 sctx->bios[sctx->curr]->next_free = -1;
2352 sctx->bios[sctx->curr]->page_count = 0;
2353 spin_unlock(&sctx->list_lock);
Arne Jansena2de7332011-03-08 14:14:00 +01002354 } else {
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002355 spin_unlock(&sctx->list_lock);
2356 wait_event(sctx->list_wait, sctx->first_free != -1);
Arne Jansena2de7332011-03-08 14:14:00 +01002357 }
2358 }
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002359 sbio = sctx->bios[sctx->curr];
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002360 if (sbio->page_count == 0) {
Arne Jansen69f4cb52011-11-11 08:17:10 -05002361 struct bio *bio;
2362
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002363 sbio->physical = spage->physical;
2364 sbio->logical = spage->logical;
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01002365 sbio->dev = spage->dev;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002366 bio = sbio->bio;
2367 if (!bio) {
David Sterbac5e4c3d2017-06-12 17:29:41 +02002368 bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002369 sbio->bio = bio;
2370 }
Arne Jansen69f4cb52011-11-11 08:17:10 -05002371
2372 bio->bi_private = sbio;
2373 bio->bi_end_io = scrub_bio_end_io;
Christoph Hellwig74d46992017-08-23 19:10:32 +02002374 bio_set_dev(bio, sbio->dev->bdev);
Kent Overstreet4f024f32013-10-11 15:44:27 -07002375 bio->bi_iter.bi_sector = sbio->physical >> 9;
Mike Christie37226b22016-06-05 14:31:52 -05002376 bio_set_op_attrs(bio, REQ_OP_READ, 0);
Christoph Hellwig4e4cbee2017-06-03 09:38:06 +02002377 sbio->status = 0;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002378 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
2379 spage->physical ||
2380 sbio->logical + sbio->page_count * PAGE_SIZE !=
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01002381 spage->logical ||
2382 sbio->dev != spage->dev) {
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002383 scrub_submit(sctx);
Arne Jansen69f4cb52011-11-11 08:17:10 -05002384 goto again;
2385 }
2386
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002387 sbio->pagev[sbio->page_count] = spage;
2388 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
2389 if (ret != PAGE_SIZE) {
2390 if (sbio->page_count < 1) {
2391 bio_put(sbio->bio);
2392 sbio->bio = NULL;
2393 return -EIO;
2394 }
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002395 scrub_submit(sctx);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002396 goto again;
Arne Jansena2de7332011-03-08 14:14:00 +01002397 }
Arne Jansen1bc87792011-05-28 21:57:55 +02002398
Stefan Behrensff023aa2012-11-06 11:43:11 +01002399 scrub_block_get(sblock); /* one for the page added to the bio */
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002400 atomic_inc(&sblock->outstanding_pages);
2401 sbio->page_count++;
Stefan Behrensff023aa2012-11-06 11:43:11 +01002402 if (sbio->page_count == sctx->pages_per_rd_bio)
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002403 scrub_submit(sctx);
Arne Jansena2de7332011-03-08 14:14:00 +01002404
2405 return 0;
2406}
2407
Linus Torvalds22365972015-09-05 15:14:43 -07002408static void scrub_missing_raid56_end_io(struct bio *bio)
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002409{
2410 struct scrub_block *sblock = bio->bi_private;
Jeff Mahoneyfb456252016-06-22 18:54:56 -04002411 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002412
Christoph Hellwig4e4cbee2017-06-03 09:38:06 +02002413 if (bio->bi_status)
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002414 sblock->no_io_error_seen = 0;
2415
Scott Talbert46732722016-05-09 09:14:28 -04002416 bio_put(bio);
2417
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002418 btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
2419}
2420
2421static void scrub_missing_raid56_worker(struct btrfs_work *work)
2422{
2423 struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2424 struct scrub_ctx *sctx = sblock->sctx;
Jeff Mahoney0b246af2016-06-22 18:54:23 -04002425 struct btrfs_fs_info *fs_info = sctx->fs_info;
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002426 u64 logical;
2427 struct btrfs_device *dev;
2428
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002429 logical = sblock->pagev[0]->logical;
2430 dev = sblock->pagev[0]->dev;
2431
Zhao Leiaffe4a52015-08-24 21:32:06 +08002432 if (sblock->no_io_error_seen)
Zhao Leiba7cf982015-08-24 21:18:02 +08002433 scrub_recheck_block_checksum(sblock);
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002434
2435 if (!sblock->no_io_error_seen) {
2436 spin_lock(&sctx->stat_lock);
2437 sctx->stat.read_errors++;
2438 spin_unlock(&sctx->stat_lock);
Jeff Mahoney0b246af2016-06-22 18:54:23 -04002439 btrfs_err_rl_in_rcu(fs_info,
David Sterbab14af3b2015-10-08 10:43:10 +02002440 "IO error rebuilding logical %llu for dev %s",
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002441 logical, rcu_str_deref(dev->name));
2442 } else if (sblock->header_error || sblock->checksum_error) {
2443 spin_lock(&sctx->stat_lock);
2444 sctx->stat.uncorrectable_errors++;
2445 spin_unlock(&sctx->stat_lock);
Jeff Mahoney0b246af2016-06-22 18:54:23 -04002446 btrfs_err_rl_in_rcu(fs_info,
David Sterbab14af3b2015-10-08 10:43:10 +02002447 "failed to rebuild valid logical %llu for dev %s",
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002448 logical, rcu_str_deref(dev->name));
2449 } else {
2450 scrub_write_block_to_dev_replace(sblock);
2451 }
2452
2453 scrub_block_put(sblock);
2454
David Sterba2073c4c2017-03-31 17:12:51 +02002455 if (sctx->is_dev_replace && sctx->flush_all_writes) {
David Sterba3fb99302017-05-16 19:10:32 +02002456 mutex_lock(&sctx->wr_lock);
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002457 scrub_wr_submit(sctx);
David Sterba3fb99302017-05-16 19:10:32 +02002458 mutex_unlock(&sctx->wr_lock);
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002459 }
2460
2461 scrub_pending_bio_dec(sctx);
2462}
2463
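/*
 * The device holding this block is missing, so it cannot simply be
 * read. Build a "missing" rbio and let the RAID56 layer reconstruct
 * the pages from the remaining data/parity stripes; the result is
 * checked in scrub_missing_raid56_worker().
 */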
2464static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2465{
2466 struct scrub_ctx *sctx = sblock->sctx;
Jeff Mahoneyfb456252016-06-22 18:54:56 -04002467 struct btrfs_fs_info *fs_info = sctx->fs_info;
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002468 u64 length = sblock->page_count * PAGE_SIZE;
2469 u64 logical = sblock->pagev[0]->logical;
Zhao Leif1fee652016-05-17 17:37:38 +08002470 struct btrfs_bio *bbio = NULL;
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002471 struct bio *bio;
2472 struct btrfs_raid_bio *rbio;
2473 int ret;
2474 int i;
2475
Qu Wenruoae6529c2017-03-29 09:33:21 +08002476 btrfs_bio_counter_inc_blocked(fs_info);
Christoph Hellwigcf8cddd2016-10-27 09:27:36 +02002477 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
David Sterba825ad4c2017-03-28 14:45:22 +02002478 &length, &bbio);
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002479 if (ret || !bbio || !bbio->raid_map)
2480 goto bbio_out;
2481
2482 if (WARN_ON(!sctx->is_dev_replace ||
2483 !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2484 /*
2485 * We shouldn't be scrubbing a missing device. Even for dev
2486 * replace, we should only get here for RAID 5/6. We either
2487 * managed to mount something with no mirrors remaining or
2488 * there's a bug in scrub_remap_extent()/btrfs_map_block().
2489 */
2490 goto bbio_out;
2491 }
2492
David Sterbac5e4c3d2017-06-12 17:29:41 +02002493 bio = btrfs_io_bio_alloc(0);
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002494 bio->bi_iter.bi_sector = logical >> 9;
2495 bio->bi_private = sblock;
2496 bio->bi_end_io = scrub_missing_raid56_end_io;
2497
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04002498 rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002499 if (!rbio)
2500 goto rbio_out;
2501
2502 for (i = 0; i < sblock->page_count; i++) {
2503 struct scrub_page *spage = sblock->pagev[i];
2504
2505 raid56_add_scrub_pages(rbio, spage->page, spage->logical);
2506 }
2507
2508 btrfs_init_work(&sblock->work, btrfs_scrub_helper,
2509 scrub_missing_raid56_worker, NULL, NULL);
2510 scrub_block_get(sblock);
2511 scrub_pending_bio_inc(sctx);
2512 raid56_submit_missing_rbio(rbio);
2513 return;
2514
2515rbio_out:
2516 bio_put(bio);
2517bbio_out:
Qu Wenruoae6529c2017-03-29 09:33:21 +08002518 btrfs_bio_counter_dec(fs_info);
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002519 btrfs_put_bbio(bbio);
2520 spin_lock(&sctx->stat_lock);
2521 sctx->stat.malloc_errors++;
2522 spin_unlock(&sctx->stat_lock);
2523}
2524
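/*
 * Split the range [logical, logical + len) into PAGE_SIZE pieces,
 * collect them in one scrub_block and queue each page for reading.
 * If the source device is missing (RAID5/6 dev-replace), the block is
 * rebuilt via scrub_missing_raid56_pages() instead of being read.
 */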
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002525static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01002526 u64 physical, struct btrfs_device *dev, u64 flags,
Stefan Behrensff023aa2012-11-06 11:43:11 +01002527 u64 gen, int mirror_num, u8 *csum, int force,
2528 u64 physical_for_dev_replace)
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002529{
2530 struct scrub_block *sblock;
2531 int index;
2532
David Sterba58c4e172016-02-11 10:49:42 +01002533 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002534 if (!sblock) {
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002535 spin_lock(&sctx->stat_lock);
2536 sctx->stat.malloc_errors++;
2537 spin_unlock(&sctx->stat_lock);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002538 return -ENOMEM;
2539 }
2540
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002541 /* one ref inside this function, plus one for each page added to
2542 * a bio later on */
Elena Reshetova186debd2017-03-03 10:55:23 +02002543 refcount_set(&sblock->refs, 1);
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002544 sblock->sctx = sctx;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002545 sblock->no_io_error_seen = 1;
2546
2547 for (index = 0; len > 0; index++) {
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002548 struct scrub_page *spage;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002549 u64 l = min_t(u64, len, PAGE_SIZE);
2550
David Sterba58c4e172016-02-11 10:49:42 +01002551 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002552 if (!spage) {
2553leave_nomem:
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002554 spin_lock(&sctx->stat_lock);
2555 sctx->stat.malloc_errors++;
2556 spin_unlock(&sctx->stat_lock);
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002557 scrub_block_put(sblock);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002558 return -ENOMEM;
2559 }
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002560 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2561 scrub_page_get(spage);
2562 sblock->pagev[index] = spage;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002563 spage->sblock = sblock;
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01002564 spage->dev = dev;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002565 spage->flags = flags;
2566 spage->generation = gen;
2567 spage->logical = logical;
2568 spage->physical = physical;
Stefan Behrensff023aa2012-11-06 11:43:11 +01002569 spage->physical_for_dev_replace = physical_for_dev_replace;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002570 spage->mirror_num = mirror_num;
2571 if (csum) {
2572 spage->have_csum = 1;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002573 memcpy(spage->csum, csum, sctx->csum_size);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002574 } else {
2575 spage->have_csum = 0;
2576 }
2577 sblock->page_count++;
David Sterba58c4e172016-02-11 10:49:42 +01002578 spage->page = alloc_page(GFP_KERNEL);
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002579 if (!spage->page)
2580 goto leave_nomem;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002581 len -= l;
2582 logical += l;
2583 physical += l;
Stefan Behrensff023aa2012-11-06 11:43:11 +01002584 physical_for_dev_replace += l;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002585 }
2586
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002587 WARN_ON(sblock->page_count == 0);
Anand Jaine6e674b2017-12-04 12:54:54 +08002588 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002589 /*
2590 * This case should only be hit for RAID 5/6 device replace. See
2591 * the comment in scrub_missing_raid56_pages() for details.
2592 */
2593 scrub_missing_raid56_pages(sblock);
2594 } else {
2595 for (index = 0; index < sblock->page_count; index++) {
2596 struct scrub_page *spage = sblock->pagev[index];
2597 int ret;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002598
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002599 ret = scrub_add_page_to_rd_bio(sctx, spage);
2600 if (ret) {
2601 scrub_block_put(sblock);
2602 return ret;
2603 }
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002604 }
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002605
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002606 if (force)
2607 scrub_submit(sctx);
2608 }
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002609
2610 /* last one frees, either here or in bio completion for last page */
2611 scrub_block_put(sblock);
2612 return 0;
2613}
2614
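/* Read-bio completion: stash the status and defer to the worker. */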
Christoph Hellwig4246a0b2015-07-20 15:29:37 +02002615static void scrub_bio_end_io(struct bio *bio)
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002616{
2617 struct scrub_bio *sbio = bio->bi_private;
Jeff Mahoneyfb456252016-06-22 18:54:56 -04002618 struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002619
Christoph Hellwig4e4cbee2017-06-03 09:38:06 +02002620 sbio->status = bio->bi_status;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002621 sbio->bio = bio;
2622
Qu Wenruo0339ef22014-02-28 10:46:17 +08002623 btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002624}
2625
2626static void scrub_bio_end_io_worker(struct btrfs_work *work)
2627{
2628 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002629 struct scrub_ctx *sctx = sbio->sctx;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002630 int i;
2631
Stefan Behrensff023aa2012-11-06 11:43:11 +01002632 BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
Christoph Hellwig4e4cbee2017-06-03 09:38:06 +02002633 if (sbio->status) {
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002634 for (i = 0; i < sbio->page_count; i++) {
2635 struct scrub_page *spage = sbio->pagev[i];
2636
2637 spage->io_error = 1;
2638 spage->sblock->no_io_error_seen = 0;
2639 }
2640 }
2641
2642 /* now complete the scrub_block items that have all pages completed */
2643 for (i = 0; i < sbio->page_count; i++) {
2644 struct scrub_page *spage = sbio->pagev[i];
2645 struct scrub_block *sblock = spage->sblock;
2646
2647 if (atomic_dec_and_test(&sblock->outstanding_pages))
2648 scrub_block_complete(sblock);
2649 scrub_block_put(sblock);
2650 }
2651
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002652 bio_put(sbio->bio);
2653 sbio->bio = NULL;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002654 spin_lock(&sctx->list_lock);
2655 sbio->next_free = sctx->first_free;
2656 sctx->first_free = sbio->index;
2657 spin_unlock(&sctx->list_lock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01002658
David Sterba2073c4c2017-03-31 17:12:51 +02002659 if (sctx->is_dev_replace && sctx->flush_all_writes) {
David Sterba3fb99302017-05-16 19:10:32 +02002660 mutex_lock(&sctx->wr_lock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01002661 scrub_wr_submit(sctx);
David Sterba3fb99302017-05-16 19:10:32 +02002662 mutex_unlock(&sctx->wr_lock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01002663 }
2664
Stefan Behrensb6bfebc2012-11-02 16:44:58 +01002665 scrub_pending_bio_dec(sctx);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002666}
2667
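/*
 * Mark the sectors covered by [start, start + len) in the given
 * per-stripe bitmap. The bitmap describes a single stripe of
 * sparity->stripe_len bytes, so the offset is taken modulo the stripe
 * length and a range crossing the stripe end wraps around to bit 0.
 */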
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002668static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2669 unsigned long *bitmap,
2670 u64 start, u64 len)
2671{
Liu Bo972d7212017-04-03 13:45:33 -07002672 u64 offset;
David Sterba7736b0a2017-03-31 18:02:48 +02002673 u64 nsectors64;
2674 u32 nsectors;
Jeff Mahoneyda170662016-06-15 09:22:56 -04002675 int sectorsize = sparity->sctx->fs_info->sectorsize;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002676
2677 if (len >= sparity->stripe_len) {
2678 bitmap_set(bitmap, 0, sparity->nsectors);
2679 return;
2680 }
2681
2682 start -= sparity->logic_start;
Liu Bo972d7212017-04-03 13:45:33 -07002683 start = div64_u64_rem(start, sparity->stripe_len, &offset);
2684 offset = div_u64(offset, sectorsize);
David Sterba7736b0a2017-03-31 18:02:48 +02002685 nsectors64 = div_u64(len, sectorsize);
2686
2687 ASSERT(nsectors64 < UINT_MAX);
2688 nsectors = (u32)nsectors64;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002689
2690 if (offset + nsectors <= sparity->nsectors) {
2691 bitmap_set(bitmap, offset, nsectors);
2692 return;
2693 }
2694
2695 bitmap_set(bitmap, offset, sparity->nsectors - offset);
2696 bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2697}
2698
2699static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2700 u64 start, u64 len)
2701{
2702 __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2703}
2704
2705static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2706 u64 start, u64 len)
2707{
2708 __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2709}
2710
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002711static void scrub_block_complete(struct scrub_block *sblock)
2712{
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002713 int corrupted = 0;
2714
Stefan Behrensff023aa2012-11-06 11:43:11 +01002715 if (!sblock->no_io_error_seen) {
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002716 corrupted = 1;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002717 scrub_handle_errored_block(sblock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01002718 } else {
2719 /*
2720		 * If the block has a checksum error it is rewritten via the
2721		 * repair path (scrub_handle_errored_block()); otherwise, in
2722		 * the dev-replace case, copy it to the target device here.
2723 */
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002724 corrupted = scrub_checksum(sblock);
2725 if (!corrupted && sblock->sctx->is_dev_replace)
Stefan Behrensff023aa2012-11-06 11:43:11 +01002726 scrub_write_block_to_dev_replace(sblock);
2727 }
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002728
2729 if (sblock->sparity && corrupted && !sblock->data_corrected) {
2730 u64 start = sblock->pagev[0]->logical;
2731 u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2732 PAGE_SIZE;
2733
2734 scrub_parity_mark_sectors_error(sblock->sparity,
2735 start, end - start);
2736 }
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002737}
2738
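/*
 * Look up the data checksum for @logical in sctx->csum_list (filled by
 * btrfs_lookup_csums_range() and ordered by bytenr). Sums that end
 * before @logical are discarded on the way. Returns 1 and copies the
 * checksum into @csum if one is found, 0 otherwise.
 */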
Zhao Lei3b5753e2015-08-24 22:03:02 +08002739static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
Arne Jansena2de7332011-03-08 14:14:00 +01002740{
2741 struct btrfs_ordered_sum *sum = NULL;
Miao Xief51a4a12013-06-19 10:36:09 +08002742 unsigned long index;
Arne Jansena2de7332011-03-08 14:14:00 +01002743 unsigned long num_sectors;
Arne Jansena2de7332011-03-08 14:14:00 +01002744
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002745 while (!list_empty(&sctx->csum_list)) {
2746 sum = list_first_entry(&sctx->csum_list,
Arne Jansena2de7332011-03-08 14:14:00 +01002747 struct btrfs_ordered_sum, list);
2748 if (sum->bytenr > logical)
2749 return 0;
2750 if (sum->bytenr + sum->len > logical)
2751 break;
2752
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002753 ++sctx->stat.csum_discards;
Arne Jansena2de7332011-03-08 14:14:00 +01002754 list_del(&sum->list);
2755 kfree(sum);
2756 sum = NULL;
2757 }
2758 if (!sum)
2759 return 0;
2760
David Sterba1d1bf922017-03-31 18:02:48 +02002761 index = div_u64(logical - sum->bytenr, sctx->fs_info->sectorsize);
2762 ASSERT(index < UINT_MAX);
2763
David Sterba25cc1222017-05-16 19:10:41 +02002764 num_sectors = sum->len / sctx->fs_info->sectorsize;
Miao Xief51a4a12013-06-19 10:36:09 +08002765 memcpy(csum, sum->sums + index, sctx->csum_size);
2766 if (index == num_sectors - 1) {
Arne Jansena2de7332011-03-08 14:14:00 +01002767 list_del(&sum->list);
2768 kfree(sum);
2769 }
Miao Xief51a4a12013-06-19 10:36:09 +08002770 return 1;
Arne Jansena2de7332011-03-08 14:14:00 +01002771}
2772
2773/* scrub_extent() tries to collect up to 64 kB for each bio */
Liu Bo6ca17652018-03-07 12:08:09 -07002774static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
2775 u64 logical, u64 len,
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01002776 u64 physical, struct btrfs_device *dev, u64 flags,
Stefan Behrensff023aa2012-11-06 11:43:11 +01002777 u64 gen, int mirror_num, u64 physical_for_dev_replace)
Arne Jansena2de7332011-03-08 14:14:00 +01002778{
2779 int ret;
2780 u8 csum[BTRFS_CSUM_SIZE];
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002781 u32 blocksize;
2782
2783 if (flags & BTRFS_EXTENT_FLAG_DATA) {
Liu Bo6ca17652018-03-07 12:08:09 -07002784 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2785 blocksize = map->stripe_len;
2786 else
2787 blocksize = sctx->fs_info->sectorsize;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002788 spin_lock(&sctx->stat_lock);
2789 sctx->stat.data_extents_scrubbed++;
2790 sctx->stat.data_bytes_scrubbed += len;
2791 spin_unlock(&sctx->stat_lock);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002792 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
Liu Bo6ca17652018-03-07 12:08:09 -07002793 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2794 blocksize = map->stripe_len;
2795 else
2796 blocksize = sctx->fs_info->nodesize;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002797 spin_lock(&sctx->stat_lock);
2798 sctx->stat.tree_extents_scrubbed++;
2799 sctx->stat.tree_bytes_scrubbed += len;
2800 spin_unlock(&sctx->stat_lock);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002801 } else {
David Sterba25cc1222017-05-16 19:10:41 +02002802 blocksize = sctx->fs_info->sectorsize;
Stefan Behrensff023aa2012-11-06 11:43:11 +01002803 WARN_ON(1);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002804 }
Arne Jansena2de7332011-03-08 14:14:00 +01002805
2806 while (len) {
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002807 u64 l = min_t(u64, len, blocksize);
Arne Jansena2de7332011-03-08 14:14:00 +01002808 int have_csum = 0;
2809
2810 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2811 /* push csums to sbio */
Zhao Lei3b5753e2015-08-24 22:03:02 +08002812 have_csum = scrub_find_csum(sctx, logical, csum);
Arne Jansena2de7332011-03-08 14:14:00 +01002813 if (have_csum == 0)
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002814 ++sctx->stat.no_csum;
Stefan Behrensff023aa2012-11-06 11:43:11 +01002815 if (sctx->is_dev_replace && !have_csum) {
2816 ret = copy_nocow_pages(sctx, logical, l,
2817 mirror_num,
2818 physical_for_dev_replace);
2819 goto behind_scrub_pages;
2820 }
Arne Jansena2de7332011-03-08 14:14:00 +01002821 }
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01002822 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
Stefan Behrensff023aa2012-11-06 11:43:11 +01002823 mirror_num, have_csum ? csum : NULL, 0,
2824 physical_for_dev_replace);
2825behind_scrub_pages:
Arne Jansena2de7332011-03-08 14:14:00 +01002826 if (ret)
2827 return ret;
2828 len -= l;
2829 logical += l;
2830 physical += l;
Stefan Behrensff023aa2012-11-06 11:43:11 +01002831 physical_for_dev_replace += l;
Arne Jansena2de7332011-03-08 14:14:00 +01002832 }
2833 return 0;
2834}
2835
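/*
 * Like scrub_pages(), but for blocks inside a RAID5/6 stripe that is
 * being parity-checked: each page takes an extra reference and is
 * linked into sparity->spages so it stays around until the parity of
 * the whole stripe has been checked.
 */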
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002836static int scrub_pages_for_parity(struct scrub_parity *sparity,
2837 u64 logical, u64 len,
2838 u64 physical, struct btrfs_device *dev,
2839 u64 flags, u64 gen, int mirror_num, u8 *csum)
2840{
2841 struct scrub_ctx *sctx = sparity->sctx;
2842 struct scrub_block *sblock;
2843 int index;
2844
David Sterba58c4e172016-02-11 10:49:42 +01002845 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002846 if (!sblock) {
2847 spin_lock(&sctx->stat_lock);
2848 sctx->stat.malloc_errors++;
2849 spin_unlock(&sctx->stat_lock);
2850 return -ENOMEM;
2851 }
2852
2853 /* one ref inside this function, plus one for each page added to
2854 * a bio later on */
Elena Reshetova186debd2017-03-03 10:55:23 +02002855 refcount_set(&sblock->refs, 1);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002856 sblock->sctx = sctx;
2857 sblock->no_io_error_seen = 1;
2858 sblock->sparity = sparity;
2859 scrub_parity_get(sparity);
2860
2861 for (index = 0; len > 0; index++) {
2862 struct scrub_page *spage;
2863 u64 l = min_t(u64, len, PAGE_SIZE);
2864
David Sterba58c4e172016-02-11 10:49:42 +01002865 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002866 if (!spage) {
2867leave_nomem:
2868 spin_lock(&sctx->stat_lock);
2869 sctx->stat.malloc_errors++;
2870 spin_unlock(&sctx->stat_lock);
2871 scrub_block_put(sblock);
2872 return -ENOMEM;
2873 }
2874 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2875 /* For scrub block */
2876 scrub_page_get(spage);
2877 sblock->pagev[index] = spage;
2878 /* For scrub parity */
2879 scrub_page_get(spage);
2880 list_add_tail(&spage->list, &sparity->spages);
2881 spage->sblock = sblock;
2882 spage->dev = dev;
2883 spage->flags = flags;
2884 spage->generation = gen;
2885 spage->logical = logical;
2886 spage->physical = physical;
2887 spage->mirror_num = mirror_num;
2888 if (csum) {
2889 spage->have_csum = 1;
2890 memcpy(spage->csum, csum, sctx->csum_size);
2891 } else {
2892 spage->have_csum = 0;
2893 }
2894 sblock->page_count++;
David Sterba58c4e172016-02-11 10:49:42 +01002895 spage->page = alloc_page(GFP_KERNEL);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002896 if (!spage->page)
2897 goto leave_nomem;
2898 len -= l;
2899 logical += l;
2900 physical += l;
2901 }
2902
2903 WARN_ON(sblock->page_count == 0);
2904 for (index = 0; index < sblock->page_count; index++) {
2905 struct scrub_page *spage = sblock->pagev[index];
2906 int ret;
2907
2908 ret = scrub_add_page_to_rd_bio(sctx, spage);
2909 if (ret) {
2910 scrub_block_put(sblock);
2911 return ret;
2912 }
2913 }
2914
2915 /* last one frees, either here or in bio completion for last page */
2916 scrub_block_put(sblock);
2917 return 0;
2918}
2919
2920static int scrub_extent_for_parity(struct scrub_parity *sparity,
2921 u64 logical, u64 len,
2922 u64 physical, struct btrfs_device *dev,
2923 u64 flags, u64 gen, int mirror_num)
2924{
2925 struct scrub_ctx *sctx = sparity->sctx;
2926 int ret;
2927 u8 csum[BTRFS_CSUM_SIZE];
2928 u32 blocksize;
2929
Anand Jaine6e674b2017-12-04 12:54:54 +08002930 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
Omar Sandoval4a770892015-06-19 11:52:52 -07002931 scrub_parity_mark_sectors_error(sparity, logical, len);
2932 return 0;
2933 }
2934
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002935 if (flags & BTRFS_EXTENT_FLAG_DATA) {
Liu Bo6ca17652018-03-07 12:08:09 -07002936 blocksize = sparity->stripe_len;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002937 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
Liu Bo6ca17652018-03-07 12:08:09 -07002938 blocksize = sparity->stripe_len;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002939 } else {
David Sterba25cc1222017-05-16 19:10:41 +02002940 blocksize = sctx->fs_info->sectorsize;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002941 WARN_ON(1);
2942 }
2943
2944 while (len) {
2945 u64 l = min_t(u64, len, blocksize);
2946 int have_csum = 0;
2947
2948 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2949 /* push csums to sbio */
Zhao Lei3b5753e2015-08-24 22:03:02 +08002950 have_csum = scrub_find_csum(sctx, logical, csum);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002951 if (have_csum == 0)
2952 goto skip;
2953 }
2954 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2955 flags, gen, mirror_num,
2956 have_csum ? csum : NULL);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002957 if (ret)
2958 return ret;
Dan Carpenter6b6d24b2014-12-12 22:30:00 +03002959skip:
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002960 len -= l;
2961 logical += l;
2962 physical += l;
2963 }
2964 return 0;
2965}
2966
Wang Shilong3b080b22014-04-01 18:01:43 +08002967/*
2968 * Given a physical address, this will calculate its
2969 * logical offset. If this is a parity stripe, it will return
2970 * the leftmost data stripe's logical offset.
2971 *
2972 * Return 0 if it is a data stripe, 1 if it is a parity stripe.
2973 */
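/*
 * Worked example (hypothetical layout, assuming a 3-device RAID5 chunk
 * with a 64K stripe_len, i.e. nr_data_stripes() == 2): a physical
 * offset of 128K into stripe `num`'s device extent lies in rotation
 * row 2, whose data lives on stripes 2 and 0 and whose parity lives on
 * stripe 1. So num == 2 yields offset 256K (return 0), num == 0 yields
 * offset 320K (return 0), and num == 1 is the parity stripe (return 1).
 */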
2974static int get_raid56_logic_offset(u64 physical, int num,
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002975 struct map_lookup *map, u64 *offset,
2976 u64 *stripe_start)
Wang Shilong3b080b22014-04-01 18:01:43 +08002977{
2978 int i;
2979 int j = 0;
2980 u64 stripe_nr;
2981 u64 last_offset;
David Sterba9d644a62015-02-20 18:42:11 +01002982 u32 stripe_index;
2983 u32 rot;
Wang Shilong3b080b22014-04-01 18:01:43 +08002984
2985 last_offset = (physical - map->stripes[num].physical) *
2986 nr_data_stripes(map);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002987 if (stripe_start)
2988 *stripe_start = last_offset;
2989
Wang Shilong3b080b22014-04-01 18:01:43 +08002990 *offset = last_offset;
2991 for (i = 0; i < nr_data_stripes(map); i++) {
2992 *offset = last_offset + i * map->stripe_len;
2993
Liu Bo42c61ab2017-04-03 13:45:24 -07002994 stripe_nr = div64_u64(*offset, map->stripe_len);
David Sterbab8b93ad2015-01-16 17:26:13 +01002995 stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
Wang Shilong3b080b22014-04-01 18:01:43 +08002996
2997 /* Work out the disk rotation on this stripe-set */
David Sterba47c57132015-02-20 18:43:47 +01002998 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
Wang Shilong3b080b22014-04-01 18:01:43 +08002999		/* calculate which stripe this data is located on */
3000 rot += i;
Wang Shilonge4fbaee2014-04-11 18:32:25 +08003001 stripe_index = rot % map->num_stripes;
Wang Shilong3b080b22014-04-01 18:01:43 +08003002 if (stripe_index == num)
3003 return 0;
3004 if (stripe_index < num)
3005 j++;
3006 }
3007 *offset = last_offset + j * map->stripe_len;
3008 return 1;
3009}
3010
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003011static void scrub_free_parity(struct scrub_parity *sparity)
3012{
3013 struct scrub_ctx *sctx = sparity->sctx;
3014 struct scrub_page *curr, *next;
3015 int nbits;
3016
3017 nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
3018 if (nbits) {
3019 spin_lock(&sctx->stat_lock);
3020 sctx->stat.read_errors += nbits;
3021 sctx->stat.uncorrectable_errors += nbits;
3022 spin_unlock(&sctx->stat_lock);
3023 }
3024
3025 list_for_each_entry_safe(curr, next, &sparity->spages, list) {
3026 list_del_init(&curr->list);
3027 scrub_page_put(curr);
3028 }
3029
3030 kfree(sparity);
3031}
3032
Zhao Lei20b2e302015-06-04 20:09:15 +08003033static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
3034{
3035 struct scrub_parity *sparity = container_of(work, struct scrub_parity,
3036 work);
3037 struct scrub_ctx *sctx = sparity->sctx;
3038
3039 scrub_free_parity(sparity);
3040 scrub_pending_bio_dec(sctx);
3041}
3042
Christoph Hellwig4246a0b2015-07-20 15:29:37 +02003043static void scrub_parity_bio_endio(struct bio *bio)
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003044{
3045 struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
Jeff Mahoney0b246af2016-06-22 18:54:23 -04003046 struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003047
Christoph Hellwig4e4cbee2017-06-03 09:38:06 +02003048 if (bio->bi_status)
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003049 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
3050 sparity->nsectors);
3051
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003052 bio_put(bio);
Zhao Lei20b2e302015-06-04 20:09:15 +08003053
3054 btrfs_init_work(&sparity->work, btrfs_scrubparity_helper,
3055 scrub_parity_bio_endio_worker, NULL, NULL);
Jeff Mahoney0b246af2016-06-22 18:54:23 -04003056 btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003057}
3058
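/*
 * dbitmap marks the data sectors that were scrubbed, ebitmap the ones
 * that had errors. Drop the errored sectors from dbitmap; if any
 * checked sectors remain, hand them to the RAID56 layer so the parity
 * of the stripe can be verified and, if necessary, rewritten.
 */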
3059static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
3060{
3061 struct scrub_ctx *sctx = sparity->sctx;
Jeff Mahoney0b246af2016-06-22 18:54:23 -04003062 struct btrfs_fs_info *fs_info = sctx->fs_info;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003063 struct bio *bio;
3064 struct btrfs_raid_bio *rbio;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003065 struct btrfs_bio *bbio = NULL;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003066 u64 length;
3067 int ret;
3068
3069 if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
3070 sparity->nsectors))
3071 goto out;
3072
Zhao Leia0dd59d2015-07-21 15:42:26 +08003073 length = sparity->logic_end - sparity->logic_start;
Qu Wenruoae6529c2017-03-29 09:33:21 +08003074
3075 btrfs_bio_counter_inc_blocked(fs_info);
Jeff Mahoney0b246af2016-06-22 18:54:23 -04003076 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
David Sterba825ad4c2017-03-28 14:45:22 +02003077 &length, &bbio);
Zhao Lei8e5cfb52015-01-20 15:11:33 +08003078 if (ret || !bbio || !bbio->raid_map)
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003079 goto bbio_out;
3080
David Sterbac5e4c3d2017-06-12 17:29:41 +02003081 bio = btrfs_io_bio_alloc(0);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003082 bio->bi_iter.bi_sector = sparity->logic_start >> 9;
3083 bio->bi_private = sparity;
3084 bio->bi_end_io = scrub_parity_bio_endio;
3085
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04003086 rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio,
Zhao Lei8e5cfb52015-01-20 15:11:33 +08003087 length, sparity->scrub_dev,
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003088 sparity->dbitmap,
3089 sparity->nsectors);
3090 if (!rbio)
3091 goto rbio_out;
3092
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003093 scrub_pending_bio_inc(sctx);
3094 raid56_parity_submit_scrub_rbio(rbio);
3095 return;
3096
3097rbio_out:
3098 bio_put(bio);
3099bbio_out:
Qu Wenruoae6529c2017-03-29 09:33:21 +08003100 btrfs_bio_counter_dec(fs_info);
Zhao Lei6e9606d2015-01-20 15:11:34 +08003101 btrfs_put_bbio(bbio);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003102 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
3103 sparity->nsectors);
3104 spin_lock(&sctx->stat_lock);
3105 sctx->stat.malloc_errors++;
3106 spin_unlock(&sctx->stat_lock);
3107out:
3108 scrub_free_parity(sparity);
3109}
3110
3111static inline int scrub_calc_parity_bitmap_len(int nsectors)
3112{
Zhao Leibfca9a62014-12-08 19:55:57 +08003113 return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003114}
3115
3116static void scrub_parity_get(struct scrub_parity *sparity)
3117{
Elena Reshetova78a76452017-03-03 10:55:24 +02003118 refcount_inc(&sparity->refs);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003119}
3120
3121static void scrub_parity_put(struct scrub_parity *sparity)
3122{
Elena Reshetova78a76452017-03-03 10:55:24 +02003123 if (!refcount_dec_and_test(&sparity->refs))
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003124 return;
3125
3126 scrub_parity_check_and_repair(sparity);
3127}
3128
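/*
 * Scrub the parity of one RAID5/6 stripe range: walk the extent tree
 * for data between logic_start and logic_end, scrub those extents and
 * record which sectors held data or produced errors, then let
 * scrub_parity_check_and_repair() verify the parity once the last
 * reference to the scrub_parity is dropped.
 */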
3129static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
3130 struct map_lookup *map,
3131 struct btrfs_device *sdev,
3132 struct btrfs_path *path,
3133 u64 logic_start,
3134 u64 logic_end)
3135{
Jeff Mahoneyfb456252016-06-22 18:54:56 -04003136 struct btrfs_fs_info *fs_info = sctx->fs_info;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003137 struct btrfs_root *root = fs_info->extent_root;
3138 struct btrfs_root *csum_root = fs_info->csum_root;
3139 struct btrfs_extent_item *extent;
Omar Sandoval4a770892015-06-19 11:52:52 -07003140 struct btrfs_bio *bbio = NULL;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003141 u64 flags;
3142 int ret;
3143 int slot;
3144 struct extent_buffer *l;
3145 struct btrfs_key key;
3146 u64 generation;
3147 u64 extent_logical;
3148 u64 extent_physical;
3149 u64 extent_len;
Omar Sandoval4a770892015-06-19 11:52:52 -07003150 u64 mapped_length;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003151 struct btrfs_device *extent_dev;
3152 struct scrub_parity *sparity;
3153 int nsectors;
3154 int bitmap_len;
3155 int extent_mirror_num;
3156 int stop_loop = 0;
3157
Jeff Mahoney0b246af2016-06-22 18:54:23 -04003158 nsectors = div_u64(map->stripe_len, fs_info->sectorsize);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003159 bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
3160 sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
3161 GFP_NOFS);
3162 if (!sparity) {
3163 spin_lock(&sctx->stat_lock);
3164 sctx->stat.malloc_errors++;
3165 spin_unlock(&sctx->stat_lock);
3166 return -ENOMEM;
3167 }
3168
3169 sparity->stripe_len = map->stripe_len;
3170 sparity->nsectors = nsectors;
3171 sparity->sctx = sctx;
3172 sparity->scrub_dev = sdev;
3173 sparity->logic_start = logic_start;
3174 sparity->logic_end = logic_end;
Elena Reshetova78a76452017-03-03 10:55:24 +02003175 refcount_set(&sparity->refs, 1);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003176 INIT_LIST_HEAD(&sparity->spages);
3177 sparity->dbitmap = sparity->bitmap;
3178 sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
3179
3180 ret = 0;
3181 while (logic_start < logic_end) {
3182 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3183 key.type = BTRFS_METADATA_ITEM_KEY;
3184 else
3185 key.type = BTRFS_EXTENT_ITEM_KEY;
3186 key.objectid = logic_start;
3187 key.offset = (u64)-1;
3188
3189 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3190 if (ret < 0)
3191 goto out;
3192
3193 if (ret > 0) {
3194 ret = btrfs_previous_extent_item(root, path, 0);
3195 if (ret < 0)
3196 goto out;
3197 if (ret > 0) {
3198 btrfs_release_path(path);
3199 ret = btrfs_search_slot(NULL, root, &key,
3200 path, 0, 0);
3201 if (ret < 0)
3202 goto out;
3203 }
3204 }
3205
3206 stop_loop = 0;
3207 while (1) {
3208 u64 bytes;
3209
3210 l = path->nodes[0];
3211 slot = path->slots[0];
3212 if (slot >= btrfs_header_nritems(l)) {
3213 ret = btrfs_next_leaf(root, path);
3214 if (ret == 0)
3215 continue;
3216 if (ret < 0)
3217 goto out;
3218
3219 stop_loop = 1;
3220 break;
3221 }
3222 btrfs_item_key_to_cpu(l, &key, slot);
3223
Zhao Leid7cad232015-07-22 13:14:48 +08003224 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3225 key.type != BTRFS_METADATA_ITEM_KEY)
3226 goto next;
3227
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003228 if (key.type == BTRFS_METADATA_ITEM_KEY)
Jeff Mahoney0b246af2016-06-22 18:54:23 -04003229 bytes = fs_info->nodesize;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003230 else
3231 bytes = key.offset;
3232
3233 if (key.objectid + bytes <= logic_start)
3234 goto next;
3235
Zhao Leia0dd59d2015-07-21 15:42:26 +08003236 if (key.objectid >= logic_end) {
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003237 stop_loop = 1;
3238 break;
3239 }
3240
3241 while (key.objectid >= logic_start + map->stripe_len)
3242 logic_start += map->stripe_len;
3243
3244 extent = btrfs_item_ptr(l, slot,
3245 struct btrfs_extent_item);
3246 flags = btrfs_extent_flags(l, extent);
3247 generation = btrfs_extent_generation(l, extent);
3248
Zhao Leia323e812015-07-23 12:29:49 +08003249 if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3250 (key.objectid < logic_start ||
3251 key.objectid + bytes >
3252 logic_start + map->stripe_len)) {
Jeff Mahoney5d163e02016-09-20 10:05:00 -04003253 btrfs_err(fs_info,
3254 "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
Zhao Leia323e812015-07-23 12:29:49 +08003255 key.objectid, logic_start);
Zhao Lei9799d2c32015-08-25 21:31:40 +08003256 spin_lock(&sctx->stat_lock);
3257 sctx->stat.uncorrectable_errors++;
3258 spin_unlock(&sctx->stat_lock);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003259 goto next;
3260 }
3261again:
3262 extent_logical = key.objectid;
3263 extent_len = bytes;
3264
3265 if (extent_logical < logic_start) {
3266 extent_len -= logic_start - extent_logical;
3267 extent_logical = logic_start;
3268 }
3269
3270 if (extent_logical + extent_len >
3271 logic_start + map->stripe_len)
3272 extent_len = logic_start + map->stripe_len -
3273 extent_logical;
3274
3275 scrub_parity_mark_sectors_data(sparity, extent_logical,
3276 extent_len);
3277
Omar Sandoval4a770892015-06-19 11:52:52 -07003278 mapped_length = extent_len;
Zhao Leif1fee652016-05-17 17:37:38 +08003279 bbio = NULL;
Christoph Hellwigcf8cddd2016-10-27 09:27:36 +02003280 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
3281 extent_logical, &mapped_length, &bbio,
3282 0);
Omar Sandoval4a770892015-06-19 11:52:52 -07003283 if (!ret) {
3284 if (!bbio || mapped_length < extent_len)
3285 ret = -EIO;
3286 }
3287 if (ret) {
3288 btrfs_put_bbio(bbio);
3289 goto out;
3290 }
3291 extent_physical = bbio->stripes[0].physical;
3292 extent_mirror_num = bbio->mirror_num;
3293 extent_dev = bbio->stripes[0].dev;
3294 btrfs_put_bbio(bbio);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003295
3296 ret = btrfs_lookup_csums_range(csum_root,
3297 extent_logical,
3298 extent_logical + extent_len - 1,
3299 &sctx->csum_list, 1);
3300 if (ret)
3301 goto out;
3302
3303 ret = scrub_extent_for_parity(sparity, extent_logical,
3304 extent_len,
3305 extent_physical,
3306 extent_dev, flags,
3307 generation,
3308 extent_mirror_num);
Zhao Lei6fa96d72015-07-21 12:22:30 +08003309
3310 scrub_free_csums(sctx);
3311
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003312 if (ret)
3313 goto out;
3314
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003315 if (extent_logical + extent_len <
3316 key.objectid + bytes) {
3317 logic_start += map->stripe_len;
3318
3319 if (logic_start >= logic_end) {
3320 stop_loop = 1;
3321 break;
3322 }
3323
3324 if (logic_start < key.objectid + bytes) {
3325 cond_resched();
3326 goto again;
3327 }
3328 }
3329next:
3330 path->slots[0]++;
3331 }
3332
3333 btrfs_release_path(path);
3334
3335 if (stop_loop)
3336 break;
3337
3338 logic_start += map->stripe_len;
3339 }
3340out:
3341 if (ret < 0)
3342 scrub_parity_mark_sectors_error(sparity, logic_start,
Zhao Leia0dd59d2015-07-21 15:42:26 +08003343 logic_end - logic_start);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003344 scrub_parity_put(sparity);
3345 scrub_submit(sctx);
David Sterba3fb99302017-05-16 19:10:32 +02003346 mutex_lock(&sctx->wr_lock);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003347 scrub_wr_submit(sctx);
David Sterba3fb99302017-05-16 19:10:32 +02003348 mutex_unlock(&sctx->wr_lock);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003349
3350 btrfs_release_path(path);
3351 return ret < 0 ? ret : 0;
3352}
3353
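/*
 * Scrub one device stripe of a chunk: derive the stripe offset,
 * increment and mirror number from the RAID profile, then walk the
 * extent tree commit root stripe by stripe, collecting checksums and
 * submitting the extents found within each stripe.
 */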
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003354static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01003355 struct map_lookup *map,
3356 struct btrfs_device *scrub_dev,
Stefan Behrensff023aa2012-11-06 11:43:11 +01003357 int num, u64 base, u64 length,
3358 int is_dev_replace)
Arne Jansena2de7332011-03-08 14:14:00 +01003359{
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003360 struct btrfs_path *path, *ppath;
Jeff Mahoneyfb456252016-06-22 18:54:56 -04003361 struct btrfs_fs_info *fs_info = sctx->fs_info;
Arne Jansena2de7332011-03-08 14:14:00 +01003362 struct btrfs_root *root = fs_info->extent_root;
3363 struct btrfs_root *csum_root = fs_info->csum_root;
3364 struct btrfs_extent_item *extent;
Arne Jansene7786c32011-05-28 20:58:38 +00003365 struct blk_plug plug;
Arne Jansena2de7332011-03-08 14:14:00 +01003366 u64 flags;
3367 int ret;
3368 int slot;
Arne Jansena2de7332011-03-08 14:14:00 +01003369 u64 nstripes;
Arne Jansena2de7332011-03-08 14:14:00 +01003370 struct extent_buffer *l;
Arne Jansena2de7332011-03-08 14:14:00 +01003371 u64 physical;
3372 u64 logical;
Liu Bo625f1c8d2013-04-27 02:56:57 +00003373 u64 logic_end;
Wang Shilong3b080b22014-04-01 18:01:43 +08003374 u64 physical_end;
Arne Jansena2de7332011-03-08 14:14:00 +01003375 u64 generation;
Jan Schmidte12fa9c2011-06-17 15:55:21 +02003376 int mirror_num;
Arne Jansen7a262852011-06-10 12:39:23 +02003377 struct reada_control *reada1;
3378 struct reada_control *reada2;
David Sterbae6c11f92016-03-24 18:00:53 +01003379 struct btrfs_key key;
Arne Jansen7a262852011-06-10 12:39:23 +02003380 struct btrfs_key key_end;
Arne Jansena2de7332011-03-08 14:14:00 +01003381 u64 increment = map->stripe_len;
3382 u64 offset;
Stefan Behrensff023aa2012-11-06 11:43:11 +01003383 u64 extent_logical;
3384 u64 extent_physical;
3385 u64 extent_len;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003386 u64 stripe_logical;
3387 u64 stripe_end;
Stefan Behrensff023aa2012-11-06 11:43:11 +01003388 struct btrfs_device *extent_dev;
3389 int extent_mirror_num;
Wang Shilong3b080b22014-04-01 18:01:43 +08003390 int stop_loop = 0;
David Woodhouse53b381b2013-01-29 18:40:14 -05003391
Wang Shilong3b080b22014-04-01 18:01:43 +08003392 physical = map->stripes[num].physical;
Arne Jansena2de7332011-03-08 14:14:00 +01003393 offset = 0;
Liu Bo42c61ab2017-04-03 13:45:24 -07003394 nstripes = div64_u64(length, map->stripe_len);
Arne Jansena2de7332011-03-08 14:14:00 +01003395 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3396 offset = map->stripe_len * num;
3397 increment = map->stripe_len * map->num_stripes;
Jan Schmidt193ea742011-06-13 19:56:54 +02003398 mirror_num = 1;
Arne Jansena2de7332011-03-08 14:14:00 +01003399 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3400 int factor = map->num_stripes / map->sub_stripes;
3401 offset = map->stripe_len * (num / map->sub_stripes);
3402 increment = map->stripe_len * factor;
Jan Schmidt193ea742011-06-13 19:56:54 +02003403 mirror_num = num % map->sub_stripes + 1;
Arne Jansena2de7332011-03-08 14:14:00 +01003404 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3405 increment = map->stripe_len;
Jan Schmidt193ea742011-06-13 19:56:54 +02003406 mirror_num = num % map->num_stripes + 1;
Arne Jansena2de7332011-03-08 14:14:00 +01003407 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3408 increment = map->stripe_len;
Jan Schmidt193ea742011-06-13 19:56:54 +02003409 mirror_num = num % map->num_stripes + 1;
Zhao Leiffe2d202015-01-20 15:11:44 +08003410 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003411 get_raid56_logic_offset(physical, num, map, &offset, NULL);
Wang Shilong3b080b22014-04-01 18:01:43 +08003412 increment = map->stripe_len * nr_data_stripes(map);
3413 mirror_num = 1;
Arne Jansena2de7332011-03-08 14:14:00 +01003414 } else {
3415 increment = map->stripe_len;
Jan Schmidt193ea742011-06-13 19:56:54 +02003416 mirror_num = 1;
Arne Jansena2de7332011-03-08 14:14:00 +01003417 }
3418
3419 path = btrfs_alloc_path();
3420 if (!path)
3421 return -ENOMEM;
3422
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003423 ppath = btrfs_alloc_path();
3424 if (!ppath) {
Tsutomu Itoh379d6852015-01-09 17:37:52 +09003425 btrfs_free_path(path);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003426 return -ENOMEM;
3427 }
3428
Stefan Behrensb5d67f62012-03-27 14:21:27 -04003429 /*
3430	 * Work on the commit root. The related disk blocks are static as
3431	 * long as COW is applied. This means it is safe to rewrite
3432	 * them to repair disk errors without any race conditions.
3433 */
Arne Jansena2de7332011-03-08 14:14:00 +01003434 path->search_commit_root = 1;
3435 path->skip_locking = 1;
3436
Gui Hecheng063c54d2015-01-09 09:39:40 +08003437 ppath->search_commit_root = 1;
3438 ppath->skip_locking = 1;
Arne Jansena2de7332011-03-08 14:14:00 +01003439 /*
Arne Jansen7a262852011-06-10 12:39:23 +02003440	 * trigger readahead for the extent tree and csum tree and wait for
3441 * completion. During readahead, the scrub is officially paused
3442 * to not hold off transaction commits
Arne Jansena2de7332011-03-08 14:14:00 +01003443 */
3444 logical = base + offset;
Wang Shilong3b080b22014-04-01 18:01:43 +08003445 physical_end = physical + nstripes * map->stripe_len;
Zhao Leiffe2d202015-01-20 15:11:44 +08003446 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
Wang Shilong3b080b22014-04-01 18:01:43 +08003447 get_raid56_logic_offset(physical_end, num,
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003448 map, &logic_end, NULL);
Wang Shilong3b080b22014-04-01 18:01:43 +08003449 logic_end += base;
3450 } else {
3451 logic_end = logical + increment * nstripes;
3452 }
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003453 wait_event(sctx->list_wait,
Stefan Behrensb6bfebc2012-11-02 16:44:58 +01003454 atomic_read(&sctx->bios_in_flight) == 0);
Wang Shilongcb7ab022013-12-04 21:16:53 +08003455 scrub_blocked_if_needed(fs_info);
Arne Jansena2de7332011-03-08 14:14:00 +01003456
Arne Jansen7a262852011-06-10 12:39:23 +02003457 /* FIXME it might be better to start readahead at commit root */
David Sterbae6c11f92016-03-24 18:00:53 +01003458 key.objectid = logical;
3459 key.type = BTRFS_EXTENT_ITEM_KEY;
3460 key.offset = (u64)0;
Wang Shilong3b080b22014-04-01 18:01:43 +08003461 key_end.objectid = logic_end;
Josef Bacik3173a182013-03-07 14:22:04 -05003462 key_end.type = BTRFS_METADATA_ITEM_KEY;
3463 key_end.offset = (u64)-1;
David Sterbae6c11f92016-03-24 18:00:53 +01003464 reada1 = btrfs_reada_add(root, &key, &key_end);
Arne Jansena2de7332011-03-08 14:14:00 +01003465
David Sterbae6c11f92016-03-24 18:00:53 +01003466 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3467 key.type = BTRFS_EXTENT_CSUM_KEY;
3468 key.offset = logical;
Arne Jansen7a262852011-06-10 12:39:23 +02003469 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3470 key_end.type = BTRFS_EXTENT_CSUM_KEY;
Wang Shilong3b080b22014-04-01 18:01:43 +08003471 key_end.offset = logic_end;
David Sterbae6c11f92016-03-24 18:00:53 +01003472 reada2 = btrfs_reada_add(csum_root, &key, &key_end);
Arne Jansena2de7332011-03-08 14:14:00 +01003473
Arne Jansen7a262852011-06-10 12:39:23 +02003474 if (!IS_ERR(reada1))
3475 btrfs_reada_wait(reada1);
3476 if (!IS_ERR(reada2))
3477 btrfs_reada_wait(reada2);
Arne Jansena2de7332011-03-08 14:14:00 +01003478
Arne Jansena2de7332011-03-08 14:14:00 +01003479
3480 /*
3481 * collect all data csums for the stripe to avoid seeking during
3482	 * the scrub. This might currently (crc32) end up being about 1MB
3483 */
Arne Jansene7786c32011-05-28 20:58:38 +00003484 blk_start_plug(&plug);
Arne Jansena2de7332011-03-08 14:14:00 +01003485
Arne Jansena2de7332011-03-08 14:14:00 +01003486 /*
3487 * now find all extents for each stripe and scrub them
3488 */
Arne Jansena2de7332011-03-08 14:14:00 +01003489 ret = 0;
Wang Shilong3b080b22014-04-01 18:01:43 +08003490 while (physical < physical_end) {
Arne Jansena2de7332011-03-08 14:14:00 +01003491 /*
3492 * canceled?
3493 */
3494 if (atomic_read(&fs_info->scrub_cancel_req) ||
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003495 atomic_read(&sctx->cancel_req)) {
Arne Jansena2de7332011-03-08 14:14:00 +01003496 ret = -ECANCELED;
3497 goto out;
3498 }
3499 /*
3500 * check to see if we have to pause
3501 */
3502 if (atomic_read(&fs_info->scrub_pause_req)) {
3503 /* push queued extents */
David Sterba2073c4c2017-03-31 17:12:51 +02003504 sctx->flush_all_writes = true;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003505 scrub_submit(sctx);
David Sterba3fb99302017-05-16 19:10:32 +02003506 mutex_lock(&sctx->wr_lock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01003507 scrub_wr_submit(sctx);
David Sterba3fb99302017-05-16 19:10:32 +02003508 mutex_unlock(&sctx->wr_lock);
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003509 wait_event(sctx->list_wait,
Stefan Behrensb6bfebc2012-11-02 16:44:58 +01003510 atomic_read(&sctx->bios_in_flight) == 0);
David Sterba2073c4c2017-03-31 17:12:51 +02003511 sctx->flush_all_writes = false;
Wang Shilong3cb09292013-12-04 21:15:19 +08003512 scrub_blocked_if_needed(fs_info);
Arne Jansena2de7332011-03-08 14:14:00 +01003513 }
3514
Zhao Leif2f66a22015-07-21 12:22:29 +08003515 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3516 ret = get_raid56_logic_offset(physical, num, map,
3517 &logical,
3518 &stripe_logical);
3519 logical += base;
3520 if (ret) {
Zhao Lei79553232015-08-18 17:54:30 +08003521				/* it is a parity stripe */
Zhao Leif2f66a22015-07-21 12:22:29 +08003522 stripe_logical += base;
Zhao Leia0dd59d2015-07-21 15:42:26 +08003523 stripe_end = stripe_logical + increment;
Zhao Leif2f66a22015-07-21 12:22:29 +08003524 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3525 ppath, stripe_logical,
3526 stripe_end);
3527 if (ret)
3528 goto out;
3529 goto skip;
3530 }
3531 }
3532
Wang Shilong7c76edb2014-01-12 21:38:32 +08003533 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3534 key.type = BTRFS_METADATA_ITEM_KEY;
3535 else
3536 key.type = BTRFS_EXTENT_ITEM_KEY;
Arne Jansena2de7332011-03-08 14:14:00 +01003537 key.objectid = logical;
Liu Bo625f1c8d2013-04-27 02:56:57 +00003538 key.offset = (u64)-1;
Arne Jansena2de7332011-03-08 14:14:00 +01003539
3540 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3541 if (ret < 0)
3542 goto out;
Josef Bacik3173a182013-03-07 14:22:04 -05003543
Arne Jansen8c510322011-06-03 10:09:26 +02003544 if (ret > 0) {
Wang Shilongade2e0b2014-01-12 21:38:33 +08003545 ret = btrfs_previous_extent_item(root, path, 0);
Arne Jansena2de7332011-03-08 14:14:00 +01003546 if (ret < 0)
3547 goto out;
Arne Jansen8c510322011-06-03 10:09:26 +02003548 if (ret > 0) {
3549 /* there's no smaller item, so stick with the
3550 * larger one */
3551 btrfs_release_path(path);
3552 ret = btrfs_search_slot(NULL, root, &key,
3553 path, 0, 0);
3554 if (ret < 0)
3555 goto out;
3556 }
Arne Jansena2de7332011-03-08 14:14:00 +01003557 }
3558
Liu Bo625f1c8d2013-04-27 02:56:57 +00003559 stop_loop = 0;
Arne Jansena2de7332011-03-08 14:14:00 +01003560 while (1) {
Josef Bacik3173a182013-03-07 14:22:04 -05003561 u64 bytes;
3562
Arne Jansena2de7332011-03-08 14:14:00 +01003563 l = path->nodes[0];
3564 slot = path->slots[0];
3565 if (slot >= btrfs_header_nritems(l)) {
3566 ret = btrfs_next_leaf(root, path);
3567 if (ret == 0)
3568 continue;
3569 if (ret < 0)
3570 goto out;
3571
Liu Bo625f1c8d2013-04-27 02:56:57 +00003572 stop_loop = 1;
Arne Jansena2de7332011-03-08 14:14:00 +01003573 break;
3574 }
3575 btrfs_item_key_to_cpu(l, &key, slot);
3576
Zhao Leid7cad232015-07-22 13:14:48 +08003577 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3578 key.type != BTRFS_METADATA_ITEM_KEY)
3579 goto next;
3580
Josef Bacik3173a182013-03-07 14:22:04 -05003581 if (key.type == BTRFS_METADATA_ITEM_KEY)
Jeff Mahoney0b246af2016-06-22 18:54:23 -04003582 bytes = fs_info->nodesize;
Josef Bacik3173a182013-03-07 14:22:04 -05003583 else
3584 bytes = key.offset;
3585
3586 if (key.objectid + bytes <= logical)
Arne Jansena2de7332011-03-08 14:14:00 +01003587 goto next;
3588
Liu Bo625f1c8d2013-04-27 02:56:57 +00003589 if (key.objectid >= logical + map->stripe_len) {
3590 /* out of this device extent */
3591 if (key.objectid >= logic_end)
3592 stop_loop = 1;
3593 break;
3594 }
Arne Jansena2de7332011-03-08 14:14:00 +01003595
3596 extent = btrfs_item_ptr(l, slot,
3597 struct btrfs_extent_item);
3598 flags = btrfs_extent_flags(l, extent);
3599 generation = btrfs_extent_generation(l, extent);
3600
Zhao Leia323e812015-07-23 12:29:49 +08003601 if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3602 (key.objectid < logical ||
3603 key.objectid + bytes >
3604 logical + map->stripe_len)) {
Frank Holtonefe120a2013-12-20 11:37:06 -05003605 btrfs_err(fs_info,
Jeff Mahoney5d163e02016-09-20 10:05:00 -04003606 "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
Geert Uytterhoevenc1c9ff72013-08-20 13:20:07 +02003607 key.objectid, logical);
Zhao Lei9799d2c32015-08-25 21:31:40 +08003608 spin_lock(&sctx->stat_lock);
3609 sctx->stat.uncorrectable_errors++;
3610 spin_unlock(&sctx->stat_lock);
Arne Jansena2de7332011-03-08 14:14:00 +01003611 goto next;
3612 }
3613
Liu Bo625f1c8d2013-04-27 02:56:57 +00003614again:
3615 extent_logical = key.objectid;
3616 extent_len = bytes;
3617
Arne Jansena2de7332011-03-08 14:14:00 +01003618 /*
3619 * trim extent to this stripe
3620 */
Liu Bo625f1c8d2013-04-27 02:56:57 +00003621 if (extent_logical < logical) {
3622 extent_len -= logical - extent_logical;
3623 extent_logical = logical;
Arne Jansena2de7332011-03-08 14:14:00 +01003624 }
Liu Bo625f1c8d2013-04-27 02:56:57 +00003625 if (extent_logical + extent_len >
Arne Jansena2de7332011-03-08 14:14:00 +01003626 logical + map->stripe_len) {
Liu Bo625f1c8d2013-04-27 02:56:57 +00003627 extent_len = logical + map->stripe_len -
3628 extent_logical;
Arne Jansena2de7332011-03-08 14:14:00 +01003629 }
3630
Liu Bo625f1c8d2013-04-27 02:56:57 +00003631 extent_physical = extent_logical - logical + physical;
Stefan Behrensff023aa2012-11-06 11:43:11 +01003632 extent_dev = scrub_dev;
3633 extent_mirror_num = mirror_num;
3634 if (is_dev_replace)
3635 scrub_remap_extent(fs_info, extent_logical,
3636 extent_len, &extent_physical,
3637 &extent_dev,
3638 &extent_mirror_num);
Liu Bo625f1c8d2013-04-27 02:56:57 +00003639
Zhao Leife8cf652015-07-22 13:14:47 +08003640 ret = btrfs_lookup_csums_range(csum_root,
3641 extent_logical,
3642 extent_logical +
3643 extent_len - 1,
3644 &sctx->csum_list, 1);
Arne Jansena2de7332011-03-08 14:14:00 +01003645 if (ret)
3646 goto out;
3647
Liu Bo6ca17652018-03-07 12:08:09 -07003648 ret = scrub_extent(sctx, map, extent_logical, extent_len,
Liu Bo625f1c8d2013-04-27 02:56:57 +00003649 extent_physical, extent_dev, flags,
3650 generation, extent_mirror_num,
Stefan Behrens115930c2013-07-04 16:14:23 +02003651 extent_logical - logical + physical);
Zhao Lei6fa96d72015-07-21 12:22:30 +08003652
3653 scrub_free_csums(sctx);
3654
Liu Bo625f1c8d2013-04-27 02:56:57 +00003655 if (ret)
3656 goto out;
3657
3658 if (extent_logical + extent_len <
3659 key.objectid + bytes) {
Zhao Leiffe2d202015-01-20 15:11:44 +08003660 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
Wang Shilong3b080b22014-04-01 18:01:43 +08003661 /*
3662					 * loop until we find the next data stripe
3663 * or we have finished all stripes.
3664 */
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003665loop:
3666 physical += map->stripe_len;
3667 ret = get_raid56_logic_offset(physical,
3668 num, map, &logical,
3669 &stripe_logical);
3670 logical += base;
3671
3672 if (ret && physical < physical_end) {
3673 stripe_logical += base;
3674 stripe_end = stripe_logical +
Zhao Leia0dd59d2015-07-21 15:42:26 +08003675 increment;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003676 ret = scrub_raid56_parity(sctx,
3677 map, scrub_dev, ppath,
3678 stripe_logical,
3679 stripe_end);
3680 if (ret)
3681 goto out;
3682 goto loop;
3683 }
Wang Shilong3b080b22014-04-01 18:01:43 +08003684 } else {
3685 physical += map->stripe_len;
3686 logical += increment;
3687 }
Liu Bo625f1c8d2013-04-27 02:56:57 +00003688 if (logical < key.objectid + bytes) {
3689 cond_resched();
3690 goto again;
3691 }
3692
Wang Shilong3b080b22014-04-01 18:01:43 +08003693 if (physical >= physical_end) {
Liu Bo625f1c8d2013-04-27 02:56:57 +00003694 stop_loop = 1;
3695 break;
3696 }
3697 }
Arne Jansena2de7332011-03-08 14:14:00 +01003698next:
3699 path->slots[0]++;
3700 }
Chris Mason71267332011-05-23 06:30:52 -04003701 btrfs_release_path(path);
Wang Shilong3b080b22014-04-01 18:01:43 +08003702skip:
Arne Jansena2de7332011-03-08 14:14:00 +01003703 logical += increment;
3704 physical += map->stripe_len;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003705 spin_lock(&sctx->stat_lock);
Liu Bo625f1c8d2013-04-27 02:56:57 +00003706 if (stop_loop)
3707 sctx->stat.last_physical = map->stripes[num].physical +
3708 length;
3709 else
3710 sctx->stat.last_physical = physical;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003711 spin_unlock(&sctx->stat_lock);
Liu Bo625f1c8d2013-04-27 02:56:57 +00003712 if (stop_loop)
3713 break;
Arne Jansena2de7332011-03-08 14:14:00 +01003714 }
Stefan Behrensff023aa2012-11-06 11:43:11 +01003715out:
Arne Jansena2de7332011-03-08 14:14:00 +01003716 /* push queued extents */
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003717 scrub_submit(sctx);
David Sterba3fb99302017-05-16 19:10:32 +02003718 mutex_lock(&sctx->wr_lock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01003719 scrub_wr_submit(sctx);
David Sterba3fb99302017-05-16 19:10:32 +02003720 mutex_unlock(&sctx->wr_lock);
Arne Jansena2de7332011-03-08 14:14:00 +01003721
Arne Jansene7786c32011-05-28 20:58:38 +00003722 blk_finish_plug(&plug);
Arne Jansena2de7332011-03-08 14:14:00 +01003723 btrfs_free_path(path);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003724 btrfs_free_path(ppath);
Arne Jansena2de7332011-03-08 14:14:00 +01003725 return ret < 0 ? ret : 0;
3726}
3727
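/*
 * Map the chunk at @chunk_offset and scrub every stripe of it that
 * lives on @scrub_dev at @dev_offset. A missing extent map means the
 * block group was removed (e.g. by the cleaner thread) after it was
 * looked up, which is only tolerated if the group is marked removed.
 */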
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003728static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01003729 struct btrfs_device *scrub_dev,
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01003730 u64 chunk_offset, u64 length,
Filipe Manana020d5b72015-11-19 10:57:20 +00003731 u64 dev_offset,
3732 struct btrfs_block_group_cache *cache,
3733 int is_dev_replace)
Arne Jansena2de7332011-03-08 14:14:00 +01003734{
Jeff Mahoneyfb456252016-06-22 18:54:56 -04003735 struct btrfs_fs_info *fs_info = sctx->fs_info;
3736 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
Arne Jansena2de7332011-03-08 14:14:00 +01003737 struct map_lookup *map;
3738 struct extent_map *em;
3739 int i;
Stefan Behrensff023aa2012-11-06 11:43:11 +01003740 int ret = 0;
Arne Jansena2de7332011-03-08 14:14:00 +01003741
3742 read_lock(&map_tree->map_tree.lock);
3743 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
3744 read_unlock(&map_tree->map_tree.lock);
3745
Filipe Manana020d5b72015-11-19 10:57:20 +00003746 if (!em) {
3747 /*
3748 * Might have been an unused block group deleted by the cleaner
3749 * kthread or relocation.
3750 */
3751 spin_lock(&cache->lock);
3752 if (!cache->removed)
3753 ret = -EINVAL;
3754 spin_unlock(&cache->lock);
3755
3756 return ret;
3757 }
Arne Jansena2de7332011-03-08 14:14:00 +01003758
Jeff Mahoney95617d62015-06-03 10:55:48 -04003759 map = em->map_lookup;
Arne Jansena2de7332011-03-08 14:14:00 +01003760 if (em->start != chunk_offset)
3761 goto out;
3762
3763 if (em->len < length)
3764 goto out;
3765
3766 for (i = 0; i < map->num_stripes; ++i) {
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01003767 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
Arne Jansen859acaf2012-02-09 15:09:02 +01003768 map->stripes[i].physical == dev_offset) {
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01003769 ret = scrub_stripe(sctx, map, scrub_dev, i,
Stefan Behrensff023aa2012-11-06 11:43:11 +01003770 chunk_offset, length,
3771 is_dev_replace);
Arne Jansena2de7332011-03-08 14:14:00 +01003772 if (ret)
3773 goto out;
3774 }
3775 }
3776out:
3777 free_extent_map(em);
3778
3779 return ret;
3780}
3781
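/*
 * Walk the dev extents of @scrub_dev between @start and @end and scrub
 * the corresponding chunks one by one, taking a reference on each
 * block group so it cannot vanish while it is being scrubbed.
 */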
3782static noinline_for_stack
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01003783int scrub_enumerate_chunks(struct scrub_ctx *sctx,
Stefan Behrensff023aa2012-11-06 11:43:11 +01003784 struct btrfs_device *scrub_dev, u64 start, u64 end,
3785 int is_dev_replace)
Arne Jansena2de7332011-03-08 14:14:00 +01003786{
3787 struct btrfs_dev_extent *dev_extent = NULL;
3788 struct btrfs_path *path;
Jeff Mahoney0b246af2016-06-22 18:54:23 -04003789 struct btrfs_fs_info *fs_info = sctx->fs_info;
3790 struct btrfs_root *root = fs_info->dev_root;
Arne Jansena2de7332011-03-08 14:14:00 +01003791 u64 length;
Arne Jansena2de7332011-03-08 14:14:00 +01003792 u64 chunk_offset;
Zhaolei55e3a602015-08-05 16:43:30 +08003793 int ret = 0;
Zhaolei76a8efa2015-11-17 18:46:17 +08003794 int ro_set;
Arne Jansena2de7332011-03-08 14:14:00 +01003795 int slot;
3796 struct extent_buffer *l;
3797 struct btrfs_key key;
3798 struct btrfs_key found_key;
3799 struct btrfs_block_group_cache *cache;
Stefan Behrensff023aa2012-11-06 11:43:11 +01003800 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
Arne Jansena2de7332011-03-08 14:14:00 +01003801
3802 path = btrfs_alloc_path();
3803 if (!path)
3804 return -ENOMEM;
3805
David Sterbae4058b52015-11-27 16:31:35 +01003806 path->reada = READA_FORWARD;
Arne Jansena2de7332011-03-08 14:14:00 +01003807 path->search_commit_root = 1;
3808 path->skip_locking = 1;
3809
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01003810 key.objectid = scrub_dev->devid;
Arne Jansena2de7332011-03-08 14:14:00 +01003811 key.offset = 0ull;
3812 key.type = BTRFS_DEV_EXTENT_KEY;
3813
Arne Jansena2de7332011-03-08 14:14:00 +01003814 while (1) {
3815 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3816 if (ret < 0)
Arne Jansen8c510322011-06-03 10:09:26 +02003817 break;
3818 if (ret > 0) {
3819 if (path->slots[0] >=
3820 btrfs_header_nritems(path->nodes[0])) {
3821 ret = btrfs_next_leaf(root, path);
Zhaolei55e3a602015-08-05 16:43:30 +08003822 if (ret < 0)
Arne Jansen8c510322011-06-03 10:09:26 +02003823 break;
Zhaolei55e3a602015-08-05 16:43:30 +08003824 if (ret > 0) {
3825 ret = 0;
3826 break;
3827 }
3828 } else {
3829 ret = 0;
Arne Jansen8c510322011-06-03 10:09:26 +02003830 }
3831 }
Arne Jansena2de7332011-03-08 14:14:00 +01003832
3833 l = path->nodes[0];
3834 slot = path->slots[0];
3835
3836 btrfs_item_key_to_cpu(l, &found_key, slot);
3837
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01003838 if (found_key.objectid != scrub_dev->devid)
Arne Jansena2de7332011-03-08 14:14:00 +01003839 break;
3840
David Sterba962a2982014-06-04 18:41:45 +02003841 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
Arne Jansena2de7332011-03-08 14:14:00 +01003842 break;
3843
3844 if (found_key.offset >= end)
3845 break;
3846
3847 if (found_key.offset < key.offset)
3848 break;
3849
3850 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3851 length = btrfs_dev_extent_length(l, dev_extent);
3852
Qu Wenruoced96ed2014-06-19 10:42:51 +08003853 if (found_key.offset + length <= start)
3854 goto skip;
Arne Jansena2de7332011-03-08 14:14:00 +01003855
Arne Jansena2de7332011-03-08 14:14:00 +01003856 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3857
3858 /*
3859 * get a reference on the corresponding block group to prevent
3860 * the chunk from going away while we scrub it
3861 */
3862 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
Qu Wenruoced96ed2014-06-19 10:42:51 +08003863
3864 /* Some chunks are removed but not yet committed to disk;
3865 * continue scrubbing. */
3866 if (!cache)
3867 goto skip;
3868
Zhaolei55e3a602015-08-05 16:43:30 +08003869 /*
3870 * we need to call btrfs_inc_block_group_ro() with scrubs_paused,
3871 * to avoid deadlock caused by:
3872 * btrfs_inc_block_group_ro()
3873 * -> btrfs_wait_for_commit()
3874 * -> btrfs_commit_transaction()
3875 * -> btrfs_scrub_pause()
3876 */
3877 scrub_pause_on(fs_info);
Jeff Mahoney5e00f192017-02-15 16:28:29 -05003878 ret = btrfs_inc_block_group_ro(fs_info, cache);
Filipe Mananaf0e9b7d2016-05-14 09:12:53 +01003879 if (!ret && is_dev_replace) {
3880 /*
3881 * If we are doing a device replace wait for any tasks
3882 * that started delalloc right before we set the block
3883 * group to RO mode, as they might have just allocated
3884 * an extent from it or decided they could do a nocow
3885 * write. And if any such tasks did that, wait for their
3886 * ordered extents to complete and then commit the
3887 * current transaction, so that we can later see the new
3888 * extent items in the extent tree - the ordered extents
3889 * create delayed data references (for cow writes) when
3890 * they complete, which will be run and insert the
3891 * corresponding extent items into the extent tree when
3892 * we commit the transaction they used when running
3893 * inode.c:btrfs_finish_ordered_io(). We later use
3894 * the commit root of the extent tree to find extents
3895 * to copy from the srcdev into the tgtdev, and we don't
3896 * want to miss any new extents.
3897 */
3898 btrfs_wait_block_group_reservations(cache);
3899 btrfs_wait_nocow_writers(cache);
Chris Mason6374e57a2017-06-23 09:48:21 -07003900 ret = btrfs_wait_ordered_roots(fs_info, U64_MAX,
Filipe Mananaf0e9b7d2016-05-14 09:12:53 +01003901 cache->key.objectid,
3902 cache->key.offset);
3903 if (ret > 0) {
3904 struct btrfs_trans_handle *trans;
3905
3906 trans = btrfs_join_transaction(root);
3907 if (IS_ERR(trans))
3908 ret = PTR_ERR(trans);
3909 else
Jeff Mahoney3a45bb22016-09-09 21:39:03 -04003910 ret = btrfs_commit_transaction(trans);
Filipe Mananaf0e9b7d2016-05-14 09:12:53 +01003911 if (ret) {
3912 scrub_pause_off(fs_info);
3913 btrfs_put_block_group(cache);
3914 break;
3915 }
3916 }
3917 }
Zhaolei55e3a602015-08-05 16:43:30 +08003918 scrub_pause_off(fs_info);
Zhaolei76a8efa2015-11-17 18:46:17 +08003919
3920 if (ret == 0) {
3921 ro_set = 1;
3922 } else if (ret == -ENOSPC) {
3923 /*
3924 * btrfs_inc_block_group_ro() returns -ENOSPC when it
3925 * fails to create a new chunk for metadata.
3926 * That is not a problem for scrub/replace, because
3927 * metadata is always COWed, and our scrub pauses
3928 * transaction commits.
3929 */
3930 ro_set = 0;
3931 } else {
Jeff Mahoney5d163e02016-09-20 10:05:00 -04003932 btrfs_warn(fs_info,
David Sterba913e1532017-07-13 15:32:18 +02003933 "failed setting block group ro: %d", ret);
Zhaolei55e3a602015-08-05 16:43:30 +08003934 btrfs_put_block_group(cache);
3935 break;
3936 }
3937
David Sterba7e79cb82018-03-24 02:11:38 +01003938 btrfs_dev_replace_write_lock(&fs_info->dev_replace);
Stefan Behrensff023aa2012-11-06 11:43:11 +01003939 dev_replace->cursor_right = found_key.offset + length;
3940 dev_replace->cursor_left = found_key.offset;
3941 dev_replace->item_needs_writeback = 1;
David Sterba7e79cb82018-03-24 02:11:38 +01003942 btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
Zhao Lei8c204c92015-08-19 15:02:40 +08003943 ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
Filipe Manana020d5b72015-11-19 10:57:20 +00003944 found_key.offset, cache, is_dev_replace);
Stefan Behrensff023aa2012-11-06 11:43:11 +01003945
3946 /*
3947 * Flush and submit all pending read and write bios, and
3948 * afterwards wait for them.
3949 * Note that in the dev replace case, a read request causes
3950 * write requests that are submitted in the read completion
3951 * worker. Therefore in the current situation, it is required
3952 * that all write requests are flushed, so that all read and
3953 * write requests are really completed when bios_in_flight
3954 * changes to 0.
3955 */
David Sterba2073c4c2017-03-31 17:12:51 +02003956 sctx->flush_all_writes = true;
Stefan Behrensff023aa2012-11-06 11:43:11 +01003957 scrub_submit(sctx);
David Sterba3fb99302017-05-16 19:10:32 +02003958 mutex_lock(&sctx->wr_lock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01003959 scrub_wr_submit(sctx);
David Sterba3fb99302017-05-16 19:10:32 +02003960 mutex_unlock(&sctx->wr_lock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01003961
3962 wait_event(sctx->list_wait,
3963 atomic_read(&sctx->bios_in_flight) == 0);
Zhaoleib708ce92015-08-05 16:43:29 +08003964
3965 scrub_pause_on(fs_info);
Wang Shilong12cf9372014-02-19 19:24:17 +08003966
3967 /*
3968 * This must be done before we decrease @scrub_paused, to
3969 * make sure we don't block transaction commit while we
3970 * are waiting for pending workers to finish.
3971 */
Stefan Behrensff023aa2012-11-06 11:43:11 +01003972 wait_event(sctx->list_wait,
3973 atomic_read(&sctx->workers_pending) == 0);
David Sterba2073c4c2017-03-31 17:12:51 +02003974 sctx->flush_all_writes = false;
Wang Shilong12cf9372014-02-19 19:24:17 +08003975
Zhaoleib708ce92015-08-05 16:43:29 +08003976 scrub_pause_off(fs_info);
Stefan Behrensff023aa2012-11-06 11:43:11 +01003977
David Sterba7e79cb82018-03-24 02:11:38 +01003978 btrfs_dev_replace_write_lock(&fs_info->dev_replace);
Filipe Manana1a1a8b72016-05-14 19:44:40 +01003979 dev_replace->cursor_left = dev_replace->cursor_right;
3980 dev_replace->item_needs_writeback = 1;
David Sterba7e79cb82018-03-24 02:11:38 +01003981 btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
Filipe Manana1a1a8b72016-05-14 19:44:40 +01003982
Zhaolei76a8efa2015-11-17 18:46:17 +08003983 if (ro_set)
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04003984 btrfs_dec_block_group_ro(cache);
Stefan Behrensff023aa2012-11-06 11:43:11 +01003985
Filipe Manana758f2df2015-11-19 11:45:48 +00003986 /*
3987 * We might have prevented the cleaner kthread from deleting
3988 * this block group if it was already unused because we raced
3989 * and set it to RO mode first. So add it back to the unused
3990 * list, otherwise it might not ever be deleted unless a manual
3991 * balance is triggered or it becomes used and unused again.
3992 */
3993 spin_lock(&cache->lock);
3994 if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3995 btrfs_block_group_used(&cache->item) == 0) {
3996 spin_unlock(&cache->lock);
3997 spin_lock(&fs_info->unused_bgs_lock);
3998 if (list_empty(&cache->bg_list)) {
3999 btrfs_get_block_group(cache);
4000 list_add_tail(&cache->bg_list,
4001 &fs_info->unused_bgs);
4002 }
4003 spin_unlock(&fs_info->unused_bgs_lock);
4004 } else {
4005 spin_unlock(&cache->lock);
4006 }
4007
Arne Jansena2de7332011-03-08 14:14:00 +01004008 btrfs_put_block_group(cache);
4009 if (ret)
4010 break;
Stefan Behrensaf1be4f2012-11-27 17:39:51 +00004011 if (is_dev_replace &&
4012 atomic64_read(&dev_replace->num_write_errors) > 0) {
Stefan Behrensff023aa2012-11-06 11:43:11 +01004013 ret = -EIO;
4014 break;
4015 }
4016 if (sctx->stat.malloc_errors > 0) {
4017 ret = -ENOMEM;
4018 break;
4019 }
Qu Wenruoced96ed2014-06-19 10:42:51 +08004020skip:
Arne Jansena2de7332011-03-08 14:14:00 +01004021 key.offset = found_key.offset + length;
Chris Mason71267332011-05-23 06:30:52 -04004022 btrfs_release_path(path);
Arne Jansena2de7332011-03-08 14:14:00 +01004023 }
4024
Arne Jansena2de7332011-03-08 14:14:00 +01004025 btrfs_free_path(path);
Arne Jansen8c510322011-06-03 10:09:26 +02004026
Zhaolei55e3a602015-08-05 16:43:30 +08004027 return ret;
Arne Jansena2de7332011-03-08 14:14:00 +01004028}
4029
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01004030static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
4031 struct btrfs_device *scrub_dev)
Arne Jansena2de7332011-03-08 14:14:00 +01004032{
4033 int i;
4034 u64 bytenr;
4035 u64 gen;
4036 int ret;
Jeff Mahoney0b246af2016-06-22 18:54:23 -04004037 struct btrfs_fs_info *fs_info = sctx->fs_info;
Arne Jansena2de7332011-03-08 14:14:00 +01004038
Jeff Mahoney0b246af2016-06-22 18:54:23 -04004039 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
Jeff Mahoney79787ea2012-03-12 16:03:00 +01004040 return -EIO;
4041
Miao Xie5f546062014-07-24 11:37:09 +08004042 /* Seed devices of a new filesystem have their own generation. */
Jeff Mahoney0b246af2016-06-22 18:54:23 -04004043 if (scrub_dev->fs_devices != fs_info->fs_devices)
Miao Xie5f546062014-07-24 11:37:09 +08004044 gen = scrub_dev->generation;
4045 else
Jeff Mahoney0b246af2016-06-22 18:54:23 -04004046 gen = fs_info->last_trans_committed;
Arne Jansena2de7332011-03-08 14:14:00 +01004047
4048 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
4049 bytenr = btrfs_sb_offset(i);
Miao Xie935e5cc2014-09-03 21:35:33 +08004050 if (bytenr + BTRFS_SUPER_INFO_SIZE >
4051 scrub_dev->commit_total_bytes)
Arne Jansena2de7332011-03-08 14:14:00 +01004052 break;
4053
Stefan Behrensd9d181c2012-11-02 09:58:09 +01004054 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01004055 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
Stefan Behrensff023aa2012-11-06 11:43:11 +01004056 NULL, 1, bytenr);
Arne Jansena2de7332011-03-08 14:14:00 +01004057 if (ret)
4058 return ret;
4059 }
Stefan Behrensb6bfebc2012-11-02 16:44:58 +01004060 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
Arne Jansena2de7332011-03-08 14:14:00 +01004061
4062 return 0;
4063}
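/*
 * A userspace sketch of the bounds check in scrub_supers() above: walk the
 * fixed superblock mirror offsets and skip every copy that would not fit on
 * the device. The offsets (64 KiB, 64 MiB, 256 GiB), the mirror count and
 * the 4 KiB superblock size are quoted from memory of the on-disk format
 * and should be treated as assumptions, not authoritative values.
 */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_SUPER_MIRROR_MAX 3
#define SKETCH_SUPER_INFO_SIZE  4096

static const uint64_t sketch_sb_offset[SKETCH_SUPER_MIRROR_MAX] = {
        64ULL * 1024,                   /* primary copy, 64 KiB */
        64ULL * 1024 * 1024,            /* mirror 1, 64 MiB     */
        256ULL * 1024 * 1024 * 1024,    /* mirror 2, 256 GiB    */
};

static void sketch_scrub_supers(uint64_t dev_total_bytes)
{
        int i;

        for (i = 0; i < SKETCH_SUPER_MIRROR_MAX; i++) {
                uint64_t bytenr = sketch_sb_offset[i];

                if (bytenr + SKETCH_SUPER_INFO_SIZE > dev_total_bytes)
                        break;  /* this copy does not fit on the device */
                printf("would scrub super mirror %d at %llu\n",
                       i, (unsigned long long)bytenr);
        }
}

int main(void)
{
        sketch_scrub_supers(10ULL * 1024 * 1024 * 1024);        /* 10 GiB */
        return 0;
}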
4064
4065/*
4066 * get a reference count on fs_info->scrub_workers; start the workers if necessary
4067 */
Stefan Behrensff023aa2012-11-06 11:43:11 +01004068static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
4069 int is_dev_replace)
Arne Jansena2de7332011-03-08 14:14:00 +01004070{
David Sterba6f011052015-02-16 18:34:01 +01004071 unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
Qu Wenruo0339ef22014-02-28 10:46:17 +08004072 int max_active = fs_info->thread_pool_size;
Arne Jansena2de7332011-03-08 14:14:00 +01004073
Arne Jansen632dd772011-06-10 12:07:07 +02004074 if (fs_info->scrub_workers_refcnt == 0) {
David Sterbaaf1cbe02017-03-31 18:42:57 +02004075 fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
4076 flags, is_dev_replace ? 1 : max_active, 4);
Zhao Leie82afc52015-06-12 20:36:58 +08004077 if (!fs_info->scrub_workers)
4078 goto fail_scrub_workers;
4079
Qu Wenruo0339ef22014-02-28 10:46:17 +08004080 fs_info->scrub_wr_completion_workers =
Jeff Mahoneycb001092016-06-09 16:22:11 -04004081 btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
Qu Wenruo0339ef22014-02-28 10:46:17 +08004082 max_active, 2);
Zhao Leie82afc52015-06-12 20:36:58 +08004083 if (!fs_info->scrub_wr_completion_workers)
4084 goto fail_scrub_wr_completion_workers;
4085
Qu Wenruo0339ef22014-02-28 10:46:17 +08004086 fs_info->scrub_nocow_workers =
Jeff Mahoneycb001092016-06-09 16:22:11 -04004087 btrfs_alloc_workqueue(fs_info, "scrubnc", flags, 1, 0);
Zhao Leie82afc52015-06-12 20:36:58 +08004088 if (!fs_info->scrub_nocow_workers)
4089 goto fail_scrub_nocow_workers;
Zhao Lei20b2e302015-06-04 20:09:15 +08004090 fs_info->scrub_parity_workers =
Jeff Mahoneycb001092016-06-09 16:22:11 -04004091 btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
Zhao Lei20b2e302015-06-04 20:09:15 +08004092 max_active, 2);
Zhao Leie82afc52015-06-12 20:36:58 +08004093 if (!fs_info->scrub_parity_workers)
4094 goto fail_scrub_parity_workers;
Arne Jansen632dd772011-06-10 12:07:07 +02004095 }
Arne Jansena2de7332011-03-08 14:14:00 +01004096 ++fs_info->scrub_workers_refcnt;
Zhao Leie82afc52015-06-12 20:36:58 +08004097 return 0;
4098
4099fail_scrub_parity_workers:
4100 btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
4101fail_scrub_nocow_workers:
4102 btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
4103fail_scrub_wr_completion_workers:
4104 btrfs_destroy_workqueue(fs_info->scrub_workers);
4105fail_scrub_workers:
4106 return -ENOMEM;
Arne Jansena2de7332011-03-08 14:14:00 +01004107}
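/*
 * scrub_workers_get() above allocates several workqueues and, on the first
 * failure, unwinds the ones already created through a chain of goto labels
 * in reverse order. A self-contained sketch of that unwinding idiom, using
 * plain malloc/free instead of the btrfs workqueue API:
 */
#include <stdlib.h>

struct sketch_pools {
        void *scrub;
        void *wr_completion;
        void *nocow;
        void *parity;
};

static int sketch_pools_get(struct sketch_pools *p)
{
        p->scrub = malloc(64);
        if (!p->scrub)
                goto fail_scrub;
        p->wr_completion = malloc(64);
        if (!p->wr_completion)
                goto fail_wr_completion;
        p->nocow = malloc(64);
        if (!p->nocow)
                goto fail_nocow;
        p->parity = malloc(64);
        if (!p->parity)
                goto fail_parity;
        return 0;

fail_parity:                    /* each label frees what earlier steps built */
        free(p->nocow);
fail_nocow:
        free(p->wr_completion);
fail_wr_completion:
        free(p->scrub);
fail_scrub:
        return -1;              /* stands in for -ENOMEM */
}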
4108
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01004109static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
Arne Jansena2de7332011-03-08 14:14:00 +01004110{
Stefan Behrensff023aa2012-11-06 11:43:11 +01004111 if (--fs_info->scrub_workers_refcnt == 0) {
Qu Wenruo0339ef22014-02-28 10:46:17 +08004112 btrfs_destroy_workqueue(fs_info->scrub_workers);
4113 btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
4114 btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
Zhao Lei20b2e302015-06-04 20:09:15 +08004115 btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
Stefan Behrensff023aa2012-11-06 11:43:11 +01004116 }
Arne Jansena2de7332011-03-08 14:14:00 +01004117 WARN_ON(fs_info->scrub_workers_refcnt < 0);
Arne Jansena2de7332011-03-08 14:14:00 +01004118}
4119
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01004120int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
4121 u64 end, struct btrfs_scrub_progress *progress,
Stefan Behrens63a212a2012-11-05 18:29:28 +01004122 int readonly, int is_dev_replace)
Arne Jansena2de7332011-03-08 14:14:00 +01004123{
Stefan Behrensd9d181c2012-11-02 09:58:09 +01004124 struct scrub_ctx *sctx;
Arne Jansena2de7332011-03-08 14:14:00 +01004125 int ret;
4126 struct btrfs_device *dev;
Miao Xie5d68da32014-07-24 11:37:07 +08004127 struct rcu_string *name;
Arne Jansena2de7332011-03-08 14:14:00 +01004128
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01004129 if (btrfs_fs_closing(fs_info))
Arne Jansena2de7332011-03-08 14:14:00 +01004130 return -EINVAL;
4131
Jeff Mahoneyda170662016-06-15 09:22:56 -04004132 if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
Stefan Behrensb5d67f62012-03-27 14:21:27 -04004133 /*
4134 * in this case scrub is unable to calculate the checksum,
4135 * due to the way scrub is implemented. Do not handle this
4136 * situation at all because it won't ever happen.
4137 */
Frank Holtonefe120a2013-12-20 11:37:06 -05004138 btrfs_err(fs_info,
4139 "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
Jeff Mahoneyda170662016-06-15 09:22:56 -04004140 fs_info->nodesize,
4141 BTRFS_STRIPE_LEN);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04004142 return -EINVAL;
4143 }
4144
Jeff Mahoneyda170662016-06-15 09:22:56 -04004145 if (fs_info->sectorsize != PAGE_SIZE) {
Stefan Behrensb5d67f62012-03-27 14:21:27 -04004146 /* not supported for data w/o checksums */
Chandan Rajendra751bebb2016-07-04 10:04:39 +05304147 btrfs_err_rl(fs_info,
Jeff Mahoney5d163e02016-09-20 10:05:00 -04004148 "scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails",
Jeff Mahoneyda170662016-06-15 09:22:56 -04004149 fs_info->sectorsize, PAGE_SIZE);
Arne Jansena2de7332011-03-08 14:14:00 +01004150 return -EINVAL;
4151 }
4152
Jeff Mahoneyda170662016-06-15 09:22:56 -04004153 if (fs_info->nodesize >
Stefan Behrens7a9e9982012-11-02 14:58:04 +01004154 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
Jeff Mahoneyda170662016-06-15 09:22:56 -04004155 fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
Stefan Behrens7a9e9982012-11-02 14:58:04 +01004156 /*
4157 * would exhaust the array bounds of the pagev member in
4158 * struct scrub_block
4159 */
Jeff Mahoney5d163e02016-09-20 10:05:00 -04004160 btrfs_err(fs_info,
4161 "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
Jeff Mahoneyda170662016-06-15 09:22:56 -04004162 fs_info->nodesize,
Stefan Behrens7a9e9982012-11-02 14:58:04 +01004163 SCRUB_MAX_PAGES_PER_BLOCK,
Jeff Mahoneyda170662016-06-15 09:22:56 -04004164 fs_info->sectorsize,
Stefan Behrens7a9e9982012-11-02 14:58:04 +01004165 SCRUB_MAX_PAGES_PER_BLOCK);
4166 return -EINVAL;
4167 }
4168
Arne Jansena2de7332011-03-08 14:14:00 +01004169
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01004170 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4171 dev = btrfs_find_device(fs_info, devid, NULL, NULL);
Anand Jaine6e674b2017-12-04 12:54:54 +08004172 if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
4173 !is_dev_replace)) {
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01004174 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
Arne Jansena2de7332011-03-08 14:14:00 +01004175 return -ENODEV;
4176 }
Arne Jansena2de7332011-03-08 14:14:00 +01004177
Anand Jainebbede42017-12-04 12:54:52 +08004178 if (!is_dev_replace && !readonly &&
4179 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
Miao Xie5d68da32014-07-24 11:37:07 +08004180 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4181 rcu_read_lock();
4182 name = rcu_dereference(dev->name);
4183 btrfs_err(fs_info, "scrub: device %s is not writable",
4184 name->str);
4185 rcu_read_unlock();
4186 return -EROFS;
4187 }
4188
Wang Shilong3b7a0162013-10-12 02:11:12 +08004189 mutex_lock(&fs_info->scrub_lock);
Anand Jaine12c9622017-12-04 12:54:53 +08004190 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
Anand Jain401e29c2017-12-04 12:54:55 +08004191 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
Arne Jansena2de7332011-03-08 14:14:00 +01004192 mutex_unlock(&fs_info->scrub_lock);
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01004193 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01004194 return -EIO;
Arne Jansena2de7332011-03-08 14:14:00 +01004195 }
4196
David Sterba7e79cb82018-03-24 02:11:38 +01004197 btrfs_dev_replace_read_lock(&fs_info->dev_replace);
Anand Jaincadbc0a2018-01-03 16:08:30 +08004198 if (dev->scrub_ctx ||
Stefan Behrens8dabb742012-11-06 13:15:27 +01004199 (!is_dev_replace &&
4200 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
David Sterba7e79cb82018-03-24 02:11:38 +01004201 btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
Arne Jansena2de7332011-03-08 14:14:00 +01004202 mutex_unlock(&fs_info->scrub_lock);
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01004203 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
Arne Jansena2de7332011-03-08 14:14:00 +01004204 return -EINPROGRESS;
4205 }
David Sterba7e79cb82018-03-24 02:11:38 +01004206 btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
Wang Shilong3b7a0162013-10-12 02:11:12 +08004207
4208 ret = scrub_workers_get(fs_info, is_dev_replace);
4209 if (ret) {
4210 mutex_unlock(&fs_info->scrub_lock);
4211 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4212 return ret;
4213 }
4214
Stefan Behrens63a212a2012-11-05 18:29:28 +01004215 sctx = scrub_setup_ctx(dev, is_dev_replace);
Stefan Behrensd9d181c2012-11-02 09:58:09 +01004216 if (IS_ERR(sctx)) {
Arne Jansena2de7332011-03-08 14:14:00 +01004217 mutex_unlock(&fs_info->scrub_lock);
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01004218 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4219 scrub_workers_put(fs_info);
Stefan Behrensd9d181c2012-11-02 09:58:09 +01004220 return PTR_ERR(sctx);
Arne Jansena2de7332011-03-08 14:14:00 +01004221 }
Stefan Behrensd9d181c2012-11-02 09:58:09 +01004222 sctx->readonly = readonly;
Anand Jaincadbc0a2018-01-03 16:08:30 +08004223 dev->scrub_ctx = sctx;
Wang Shilong3cb09292013-12-04 21:15:19 +08004224 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
Arne Jansena2de7332011-03-08 14:14:00 +01004225
Wang Shilong3cb09292013-12-04 21:15:19 +08004226 /*
4227 * By checking @scrub_pause_req here, we can avoid a
4228 * race between committing a transaction and scrubbing.
4229 */
Wang Shilongcb7ab022013-12-04 21:16:53 +08004230 __scrub_blocked_if_needed(fs_info);
Arne Jansena2de7332011-03-08 14:14:00 +01004231 atomic_inc(&fs_info->scrubs_running);
4232 mutex_unlock(&fs_info->scrub_lock);
Arne Jansena2de7332011-03-08 14:14:00 +01004233
Stefan Behrensff023aa2012-11-06 11:43:11 +01004234 if (!is_dev_replace) {
Wang Shilong9b011ad2013-10-25 19:12:02 +08004235 /*
4236 * by holding the device list mutex, we can
4237 * kick off writing the super in a log tree sync.
4238 */
Wang Shilong3cb09292013-12-04 21:15:19 +08004239 mutex_lock(&fs_info->fs_devices->device_list_mutex);
Stefan Behrensff023aa2012-11-06 11:43:11 +01004240 ret = scrub_supers(sctx, dev);
Wang Shilong3cb09292013-12-04 21:15:19 +08004241 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
Stefan Behrensff023aa2012-11-06 11:43:11 +01004242 }
Arne Jansena2de7332011-03-08 14:14:00 +01004243
4244 if (!ret)
Stefan Behrensff023aa2012-11-06 11:43:11 +01004245 ret = scrub_enumerate_chunks(sctx, dev, start, end,
4246 is_dev_replace);
Arne Jansena2de7332011-03-08 14:14:00 +01004247
Stefan Behrensb6bfebc2012-11-02 16:44:58 +01004248 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
Arne Jansena2de7332011-03-08 14:14:00 +01004249 atomic_dec(&fs_info->scrubs_running);
4250 wake_up(&fs_info->scrub_pause_wait);
4251
Stefan Behrensb6bfebc2012-11-02 16:44:58 +01004252 wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
Jan Schmidt0ef8e452011-06-13 20:04:15 +02004253
Arne Jansena2de7332011-03-08 14:14:00 +01004254 if (progress)
Stefan Behrensd9d181c2012-11-02 09:58:09 +01004255 memcpy(progress, &sctx->stat, sizeof(*progress));
Arne Jansena2de7332011-03-08 14:14:00 +01004256
4257 mutex_lock(&fs_info->scrub_lock);
Anand Jaincadbc0a2018-01-03 16:08:30 +08004258 dev->scrub_ctx = NULL;
Wang Shilong3b7a0162013-10-12 02:11:12 +08004259 scrub_workers_put(fs_info);
Arne Jansena2de7332011-03-08 14:14:00 +01004260 mutex_unlock(&fs_info->scrub_lock);
4261
Filipe Mananaf55985f2015-02-09 21:14:24 +00004262 scrub_put_ctx(sctx);
Arne Jansena2de7332011-03-08 14:14:00 +01004263
4264 return ret;
4265}
4266
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04004267void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
Arne Jansena2de7332011-03-08 14:14:00 +01004268{
Arne Jansena2de7332011-03-08 14:14:00 +01004269 mutex_lock(&fs_info->scrub_lock);
4270 atomic_inc(&fs_info->scrub_pause_req);
4271 while (atomic_read(&fs_info->scrubs_paused) !=
4272 atomic_read(&fs_info->scrubs_running)) {
4273 mutex_unlock(&fs_info->scrub_lock);
4274 wait_event(fs_info->scrub_pause_wait,
4275 atomic_read(&fs_info->scrubs_paused) ==
4276 atomic_read(&fs_info->scrubs_running));
4277 mutex_lock(&fs_info->scrub_lock);
4278 }
4279 mutex_unlock(&fs_info->scrub_lock);
Arne Jansena2de7332011-03-08 14:14:00 +01004280}
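/*
 * btrfs_scrub_pause()/btrfs_scrub_continue() above implement "ask all scrubs
 * to park, wait until every one of them reports paused, then release them"
 * with atomics and a wait queue. A minimal pthread sketch of the same
 * handshake, assuming scrub workers call sketch_scrub_checkpoint() at safe
 * points and that a worker count is kept up to date in sketch_running
 * (hypothetical helpers, not kernel code):
 */
#include <pthread.h>

static pthread_mutex_t sketch_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t sketch_cond = PTHREAD_COND_INITIALIZER;
static int sketch_pause_req, sketch_running, sketch_paused;

void sketch_scrub_pause(void)
{
        pthread_mutex_lock(&sketch_lock);
        sketch_pause_req++;
        while (sketch_paused != sketch_running) /* wait until all parked */
                pthread_cond_wait(&sketch_cond, &sketch_lock);
        pthread_mutex_unlock(&sketch_lock);
}

void sketch_scrub_continue(void)
{
        pthread_mutex_lock(&sketch_lock);
        sketch_pause_req--;
        pthread_cond_broadcast(&sketch_cond);   /* wake parked workers */
        pthread_mutex_unlock(&sketch_lock);
}

/* called by each scrub worker at safe points */
void sketch_scrub_checkpoint(void)
{
        pthread_mutex_lock(&sketch_lock);
        if (sketch_pause_req) {
                sketch_paused++;
                pthread_cond_broadcast(&sketch_cond);   /* report "parked" */
                while (sketch_pause_req)
                        pthread_cond_wait(&sketch_cond, &sketch_lock);
                sketch_paused--;
        }
        pthread_mutex_unlock(&sketch_lock);
}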
4281
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04004282void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
Arne Jansena2de7332011-03-08 14:14:00 +01004283{
Arne Jansena2de7332011-03-08 14:14:00 +01004284 atomic_dec(&fs_info->scrub_pause_req);
4285 wake_up(&fs_info->scrub_pause_wait);
Arne Jansena2de7332011-03-08 14:14:00 +01004286}
4287
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01004288int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
Arne Jansena2de7332011-03-08 14:14:00 +01004289{
Arne Jansena2de7332011-03-08 14:14:00 +01004290 mutex_lock(&fs_info->scrub_lock);
4291 if (!atomic_read(&fs_info->scrubs_running)) {
4292 mutex_unlock(&fs_info->scrub_lock);
4293 return -ENOTCONN;
4294 }
4295
4296 atomic_inc(&fs_info->scrub_cancel_req);
4297 while (atomic_read(&fs_info->scrubs_running)) {
4298 mutex_unlock(&fs_info->scrub_lock);
4299 wait_event(fs_info->scrub_pause_wait,
4300 atomic_read(&fs_info->scrubs_running) == 0);
4301 mutex_lock(&fs_info->scrub_lock);
4302 }
4303 atomic_dec(&fs_info->scrub_cancel_req);
4304 mutex_unlock(&fs_info->scrub_lock);
4305
4306 return 0;
4307}
4308
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01004309int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
4310 struct btrfs_device *dev)
Jeff Mahoney49b25e02012-03-01 17:24:58 +01004311{
Stefan Behrensd9d181c2012-11-02 09:58:09 +01004312 struct scrub_ctx *sctx;
Arne Jansena2de7332011-03-08 14:14:00 +01004313
4314 mutex_lock(&fs_info->scrub_lock);
Anand Jaincadbc0a2018-01-03 16:08:30 +08004315 sctx = dev->scrub_ctx;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01004316 if (!sctx) {
Arne Jansena2de7332011-03-08 14:14:00 +01004317 mutex_unlock(&fs_info->scrub_lock);
4318 return -ENOTCONN;
4319 }
Stefan Behrensd9d181c2012-11-02 09:58:09 +01004320 atomic_inc(&sctx->cancel_req);
Anand Jaincadbc0a2018-01-03 16:08:30 +08004321 while (dev->scrub_ctx) {
Arne Jansena2de7332011-03-08 14:14:00 +01004322 mutex_unlock(&fs_info->scrub_lock);
4323 wait_event(fs_info->scrub_pause_wait,
Anand Jaincadbc0a2018-01-03 16:08:30 +08004324 dev->scrub_ctx == NULL);
Arne Jansena2de7332011-03-08 14:14:00 +01004325 mutex_lock(&fs_info->scrub_lock);
4326 }
4327 mutex_unlock(&fs_info->scrub_lock);
4328
4329 return 0;
4330}
Stefan Behrens1623ede2012-03-27 14:21:26 -04004331
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04004332int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
Arne Jansena2de7332011-03-08 14:14:00 +01004333 struct btrfs_scrub_progress *progress)
4334{
4335 struct btrfs_device *dev;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01004336 struct scrub_ctx *sctx = NULL;
Arne Jansena2de7332011-03-08 14:14:00 +01004337
Jeff Mahoney0b246af2016-06-22 18:54:23 -04004338 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4339 dev = btrfs_find_device(fs_info, devid, NULL, NULL);
Arne Jansena2de7332011-03-08 14:14:00 +01004340 if (dev)
Anand Jaincadbc0a2018-01-03 16:08:30 +08004341 sctx = dev->scrub_ctx;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01004342 if (sctx)
4343 memcpy(progress, &sctx->stat, sizeof(*progress));
Jeff Mahoney0b246af2016-06-22 18:54:23 -04004344 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
Arne Jansena2de7332011-03-08 14:14:00 +01004345
Stefan Behrensd9d181c2012-11-02 09:58:09 +01004346 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
Arne Jansena2de7332011-03-08 14:14:00 +01004347}
Stefan Behrensff023aa2012-11-06 11:43:11 +01004348
4349static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
4350 u64 extent_logical, u64 extent_len,
4351 u64 *extent_physical,
4352 struct btrfs_device **extent_dev,
4353 int *extent_mirror_num)
4354{
4355 u64 mapped_length;
4356 struct btrfs_bio *bbio = NULL;
4357 int ret;
4358
4359 mapped_length = extent_len;
Christoph Hellwigcf8cddd2016-10-27 09:27:36 +02004360 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
Stefan Behrensff023aa2012-11-06 11:43:11 +01004361 &mapped_length, &bbio, 0);
4362 if (ret || !bbio || mapped_length < extent_len ||
4363 !bbio->stripes[0].dev->bdev) {
Zhao Lei6e9606d2015-01-20 15:11:34 +08004364 btrfs_put_bbio(bbio);
Stefan Behrensff023aa2012-11-06 11:43:11 +01004365 return;
4366 }
4367
4368 *extent_physical = bbio->stripes[0].physical;
4369 *extent_mirror_num = bbio->mirror_num;
4370 *extent_dev = bbio->stripes[0].dev;
Zhao Lei6e9606d2015-01-20 15:11:34 +08004371 btrfs_put_bbio(bbio);
Stefan Behrensff023aa2012-11-06 11:43:11 +01004372}
4373
Stefan Behrensff023aa2012-11-06 11:43:11 +01004374static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
4375 int mirror_num, u64 physical_for_dev_replace)
4376{
4377 struct scrub_copy_nocow_ctx *nocow_ctx;
Jeff Mahoneyfb456252016-06-22 18:54:56 -04004378 struct btrfs_fs_info *fs_info = sctx->fs_info;
Stefan Behrensff023aa2012-11-06 11:43:11 +01004379
4380 nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
4381 if (!nocow_ctx) {
4382 spin_lock(&sctx->stat_lock);
4383 sctx->stat.malloc_errors++;
4384 spin_unlock(&sctx->stat_lock);
4385 return -ENOMEM;
4386 }
4387
4388 scrub_pending_trans_workers_inc(sctx);
4389
4390 nocow_ctx->sctx = sctx;
4391 nocow_ctx->logical = logical;
4392 nocow_ctx->len = len;
4393 nocow_ctx->mirror_num = mirror_num;
4394 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
Liu Bo9e0af232014-08-15 23:36:53 +08004395 btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper,
4396 copy_nocow_pages_worker, NULL, NULL);
Josef Bacik652f25a2013-09-12 16:58:28 -04004397 INIT_LIST_HEAD(&nocow_ctx->inodes);
Qu Wenruo0339ef22014-02-28 10:46:17 +08004398 btrfs_queue_work(fs_info->scrub_nocow_workers,
4399 &nocow_ctx->work);
Stefan Behrensff023aa2012-11-06 11:43:11 +01004400
4401 return 0;
4402}
4403
Josef Bacik652f25a2013-09-12 16:58:28 -04004404static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
4405{
4406 struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
4407 struct scrub_nocow_inode *nocow_inode;
4408
4409 nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS);
4410 if (!nocow_inode)
4411 return -ENOMEM;
4412 nocow_inode->inum = inum;
4413 nocow_inode->offset = offset;
4414 nocow_inode->root = root;
4415 list_add_tail(&nocow_inode->list, &nocow_ctx->inodes);
4416 return 0;
4417}
4418
4419#define COPY_COMPLETE 1
4420
Stefan Behrensff023aa2012-11-06 11:43:11 +01004421static void copy_nocow_pages_worker(struct btrfs_work *work)
4422{
4423 struct scrub_copy_nocow_ctx *nocow_ctx =
4424 container_of(work, struct scrub_copy_nocow_ctx, work);
4425 struct scrub_ctx *sctx = nocow_ctx->sctx;
Jeff Mahoney0b246af2016-06-22 18:54:23 -04004426 struct btrfs_fs_info *fs_info = sctx->fs_info;
4427 struct btrfs_root *root = fs_info->extent_root;
Stefan Behrensff023aa2012-11-06 11:43:11 +01004428 u64 logical = nocow_ctx->logical;
4429 u64 len = nocow_ctx->len;
4430 int mirror_num = nocow_ctx->mirror_num;
4431 u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
4432 int ret;
4433 struct btrfs_trans_handle *trans = NULL;
Stefan Behrensff023aa2012-11-06 11:43:11 +01004434 struct btrfs_path *path;
Stefan Behrensff023aa2012-11-06 11:43:11 +01004435 int not_written = 0;
4436
Stefan Behrensff023aa2012-11-06 11:43:11 +01004437 path = btrfs_alloc_path();
4438 if (!path) {
4439 spin_lock(&sctx->stat_lock);
4440 sctx->stat.malloc_errors++;
4441 spin_unlock(&sctx->stat_lock);
4442 not_written = 1;
4443 goto out;
4444 }
4445
4446 trans = btrfs_join_transaction(root);
4447 if (IS_ERR(trans)) {
4448 not_written = 1;
4449 goto out;
4450 }
4451
4452 ret = iterate_inodes_from_logical(logical, fs_info, path,
Zygo Blaxellc995ab32017-09-22 13:58:45 -04004453 record_inode_for_nocow, nocow_ctx, false);
Stefan Behrensff023aa2012-11-06 11:43:11 +01004454 if (ret != 0 && ret != -ENOENT) {
Jeff Mahoney5d163e02016-09-20 10:05:00 -04004455 btrfs_warn(fs_info,
4456 "iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d",
4457 logical, physical_for_dev_replace, len, mirror_num,
4458 ret);
Stefan Behrensff023aa2012-11-06 11:43:11 +01004459 not_written = 1;
4460 goto out;
4461 }
4462
Jeff Mahoney3a45bb22016-09-09 21:39:03 -04004463 btrfs_end_transaction(trans);
Josef Bacik652f25a2013-09-12 16:58:28 -04004464 trans = NULL;
4465 while (!list_empty(&nocow_ctx->inodes)) {
4466 struct scrub_nocow_inode *entry;
4467 entry = list_first_entry(&nocow_ctx->inodes,
4468 struct scrub_nocow_inode,
4469 list);
4470 list_del_init(&entry->list);
4471 ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
4472 entry->root, nocow_ctx);
4473 kfree(entry);
4474 if (ret == COPY_COMPLETE) {
4475 ret = 0;
4476 break;
4477 } else if (ret) {
4478 break;
4479 }
4480 }
Stefan Behrensff023aa2012-11-06 11:43:11 +01004481out:
Josef Bacik652f25a2013-09-12 16:58:28 -04004482 while (!list_empty(&nocow_ctx->inodes)) {
4483 struct scrub_nocow_inode *entry;
4484 entry = list_first_entry(&nocow_ctx->inodes,
4485 struct scrub_nocow_inode,
4486 list);
4487 list_del_init(&entry->list);
4488 kfree(entry);
4489 }
Stefan Behrensff023aa2012-11-06 11:43:11 +01004490 if (trans && !IS_ERR(trans))
Jeff Mahoney3a45bb22016-09-09 21:39:03 -04004491 btrfs_end_transaction(trans);
Stefan Behrensff023aa2012-11-06 11:43:11 +01004492 if (not_written)
4493 btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
4494 num_uncorrectable_read_errors);
4495
4496 btrfs_free_path(path);
4497 kfree(nocow_ctx);
4498
4499 scrub_pending_trans_workers_dec(sctx);
4500}
4501
Nikolay Borisov1c8c9c52017-02-20 13:51:05 +02004502static int check_extent_to_block(struct btrfs_inode *inode, u64 start, u64 len,
Gui Hecheng32159242014-11-10 15:36:08 +08004503 u64 logical)
4504{
4505 struct extent_state *cached_state = NULL;
4506 struct btrfs_ordered_extent *ordered;
4507 struct extent_io_tree *io_tree;
4508 struct extent_map *em;
4509 u64 lockstart = start, lockend = start + len - 1;
4510 int ret = 0;
4511
Nikolay Borisov1c8c9c52017-02-20 13:51:05 +02004512 io_tree = &inode->io_tree;
Gui Hecheng32159242014-11-10 15:36:08 +08004513
David Sterbaff13db42015-12-03 14:30:40 +01004514 lock_extent_bits(io_tree, lockstart, lockend, &cached_state);
Nikolay Borisov1c8c9c52017-02-20 13:51:05 +02004515 ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
Gui Hecheng32159242014-11-10 15:36:08 +08004516 if (ordered) {
4517 btrfs_put_ordered_extent(ordered);
4518 ret = 1;
4519 goto out_unlock;
4520 }
4521
4522 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
4523 if (IS_ERR(em)) {
4524 ret = PTR_ERR(em);
4525 goto out_unlock;
4526 }
4527
4528 /*
4529 * This extent does not actually cover the logical extent anymore;
4530 * move on to the next inode.
4531 */
4532 if (em->block_start > logical ||
Liu Boed5d5f32018-02-27 18:10:58 -07004533 em->block_start + em->block_len < logical + len ||
4534 test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
Gui Hecheng32159242014-11-10 15:36:08 +08004535 free_extent_map(em);
4536 ret = 1;
4537 goto out_unlock;
4538 }
4539 free_extent_map(em);
4540
4541out_unlock:
David Sterbae43bbe52017-12-12 21:43:52 +01004542 unlock_extent_cached(io_tree, lockstart, lockend, &cached_state);
Gui Hecheng32159242014-11-10 15:36:08 +08004543 return ret;
4544}
4545
Josef Bacik652f25a2013-09-12 16:58:28 -04004546static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
4547 struct scrub_copy_nocow_ctx *nocow_ctx)
Stefan Behrensff023aa2012-11-06 11:43:11 +01004548{
Jeff Mahoneyfb456252016-06-22 18:54:56 -04004549 struct btrfs_fs_info *fs_info = nocow_ctx->sctx->fs_info;
Stefan Behrensff023aa2012-11-06 11:43:11 +01004550 struct btrfs_key key;
Miao Xie826aa0a2013-06-27 18:50:59 +08004551 struct inode *inode;
4552 struct page *page;
Stefan Behrensff023aa2012-11-06 11:43:11 +01004553 struct btrfs_root *local_root;
Josef Bacik652f25a2013-09-12 16:58:28 -04004554 struct extent_io_tree *io_tree;
Stefan Behrensff023aa2012-11-06 11:43:11 +01004555 u64 physical_for_dev_replace;
Gui Hecheng32159242014-11-10 15:36:08 +08004556 u64 nocow_ctx_logical;
Josef Bacik652f25a2013-09-12 16:58:28 -04004557 u64 len = nocow_ctx->len;
Miao Xie826aa0a2013-06-27 18:50:59 +08004558 unsigned long index;
Liu Bo6f1c3602013-01-29 03:22:10 +00004559 int srcu_index;
Josef Bacik652f25a2013-09-12 16:58:28 -04004560 int ret = 0;
4561 int err = 0;
Stefan Behrensff023aa2012-11-06 11:43:11 +01004562
4563 key.objectid = root;
4564 key.type = BTRFS_ROOT_ITEM_KEY;
4565 key.offset = (u64)-1;
Liu Bo6f1c3602013-01-29 03:22:10 +00004566
4567 srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
4568
Stefan Behrensff023aa2012-11-06 11:43:11 +01004569 local_root = btrfs_read_fs_root_no_name(fs_info, &key);
Liu Bo6f1c3602013-01-29 03:22:10 +00004570 if (IS_ERR(local_root)) {
4571 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
Stefan Behrensff023aa2012-11-06 11:43:11 +01004572 return PTR_ERR(local_root);
Liu Bo6f1c3602013-01-29 03:22:10 +00004573 }
Stefan Behrensff023aa2012-11-06 11:43:11 +01004574
4575 key.type = BTRFS_INODE_ITEM_KEY;
4576 key.objectid = inum;
4577 key.offset = 0;
4578 inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
Liu Bo6f1c3602013-01-29 03:22:10 +00004579 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
Stefan Behrensff023aa2012-11-06 11:43:11 +01004580 if (IS_ERR(inode))
4581 return PTR_ERR(inode);
4582
Miao Xieedd14002013-06-27 18:51:00 +08004583 /* Avoid truncate/dio/punch hole.. */
Al Viro59551022016-01-22 15:40:57 -05004584 inode_lock(inode);
Miao Xieedd14002013-06-27 18:51:00 +08004585 inode_dio_wait(inode);
4586
Stefan Behrensff023aa2012-11-06 11:43:11 +01004587 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
Josef Bacik652f25a2013-09-12 16:58:28 -04004588 io_tree = &BTRFS_I(inode)->io_tree;
Gui Hecheng32159242014-11-10 15:36:08 +08004589 nocow_ctx_logical = nocow_ctx->logical;
Josef Bacik652f25a2013-09-12 16:58:28 -04004590
Nikolay Borisov1c8c9c52017-02-20 13:51:05 +02004591 ret = check_extent_to_block(BTRFS_I(inode), offset, len,
4592 nocow_ctx_logical);
Gui Hecheng32159242014-11-10 15:36:08 +08004593 if (ret) {
4594 ret = ret > 0 ? 0 : ret;
4595 goto out;
Josef Bacik652f25a2013-09-12 16:58:28 -04004596 }
4597
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03004598 while (len >= PAGE_SIZE) {
4599 index = offset >> PAGE_SHIFT;
Miao Xieedd14002013-06-27 18:51:00 +08004600again:
Stefan Behrensff023aa2012-11-06 11:43:11 +01004601 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
4602 if (!page) {
Frank Holtonefe120a2013-12-20 11:37:06 -05004603 btrfs_err(fs_info, "find_or_create_page() failed");
Stefan Behrensff023aa2012-11-06 11:43:11 +01004604 ret = -ENOMEM;
Miao Xie826aa0a2013-06-27 18:50:59 +08004605 goto out;
Stefan Behrensff023aa2012-11-06 11:43:11 +01004606 }
4607
4608 if (PageUptodate(page)) {
4609 if (PageDirty(page))
4610 goto next_page;
4611 } else {
4612 ClearPageError(page);
Gui Hecheng32159242014-11-10 15:36:08 +08004613 err = extent_read_full_page(io_tree, page,
Josef Bacik652f25a2013-09-12 16:58:28 -04004614 btrfs_get_extent,
4615 nocow_ctx->mirror_num);
Miao Xie826aa0a2013-06-27 18:50:59 +08004616 if (err) {
4617 ret = err;
Stefan Behrensff023aa2012-11-06 11:43:11 +01004618 goto next_page;
4619 }
Miao Xieedd14002013-06-27 18:51:00 +08004620
Miao Xie26b258912013-06-27 18:50:58 +08004621 lock_page(page);
Miao Xieedd14002013-06-27 18:51:00 +08004622 /*
4623 * If the page has been removed from the page cache,
4624 * the data on it is meaningless, because it may be an
4625 * old copy; the new data may have been written into a
4626 * new page in the page cache.
4627 */
4628 if (page->mapping != inode->i_mapping) {
Josef Bacik652f25a2013-09-12 16:58:28 -04004629 unlock_page(page);
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03004630 put_page(page);
Miao Xieedd14002013-06-27 18:51:00 +08004631 goto again;
4632 }
Stefan Behrensff023aa2012-11-06 11:43:11 +01004633 if (!PageUptodate(page)) {
4634 ret = -EIO;
4635 goto next_page;
4636 }
4637 }
Gui Hecheng32159242014-11-10 15:36:08 +08004638
Nikolay Borisov1c8c9c52017-02-20 13:51:05 +02004639 ret = check_extent_to_block(BTRFS_I(inode), offset, len,
Gui Hecheng32159242014-11-10 15:36:08 +08004640 nocow_ctx_logical);
4641 if (ret) {
4642 ret = ret > 0 ? 0 : ret;
4643 goto next_page;
4644 }
4645
Miao Xie826aa0a2013-06-27 18:50:59 +08004646 err = write_page_nocow(nocow_ctx->sctx,
4647 physical_for_dev_replace, page);
4648 if (err)
4649 ret = err;
Stefan Behrensff023aa2012-11-06 11:43:11 +01004650next_page:
Miao Xie826aa0a2013-06-27 18:50:59 +08004651 unlock_page(page);
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03004652 put_page(page);
Miao Xie826aa0a2013-06-27 18:50:59 +08004653
4654 if (ret)
4655 break;
4656
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03004657 offset += PAGE_SIZE;
4658 physical_for_dev_replace += PAGE_SIZE;
4659 nocow_ctx_logical += PAGE_SIZE;
4660 len -= PAGE_SIZE;
Stefan Behrensff023aa2012-11-06 11:43:11 +01004661 }
Josef Bacik652f25a2013-09-12 16:58:28 -04004662 ret = COPY_COMPLETE;
Miao Xie826aa0a2013-06-27 18:50:59 +08004663out:
Al Viro59551022016-01-22 15:40:57 -05004664 inode_unlock(inode);
Miao Xie826aa0a2013-06-27 18:50:59 +08004665 iput(inode);
Stefan Behrensff023aa2012-11-06 11:43:11 +01004666 return ret;
4667}
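/*
 * The copy loop above walks the range one page at a time and advances three
 * cursors (file offset, target physical, source logical) in lockstep. A
 * stripped-down sketch of that stepping, with the actual page read/write
 * replaced by a hypothetical callback:
 */
#include <stdint.h>

#define SKETCH_PAGE_SIZE 4096ULL

typedef int (*sketch_copy_page_fn)(uint64_t file_offset, uint64_t physical,
                                   uint64_t logical);

static int sketch_copy_range(uint64_t offset, uint64_t physical,
                             uint64_t logical, uint64_t len,
                             sketch_copy_page_fn copy_page)
{
        while (len >= SKETCH_PAGE_SIZE) {
                int ret = copy_page(offset, physical, logical);

                if (ret)
                        return ret;     /* stop on the first failed page */
                offset += SKETCH_PAGE_SIZE;
                physical += SKETCH_PAGE_SIZE;
                logical += SKETCH_PAGE_SIZE;
                len -= SKETCH_PAGE_SIZE;
        }
        return 0;
}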
4668
4669static int write_page_nocow(struct scrub_ctx *sctx,
4670 u64 physical_for_dev_replace, struct page *page)
4671{
4672 struct bio *bio;
4673 struct btrfs_device *dev;
Stefan Behrensff023aa2012-11-06 11:43:11 +01004674
David Sterba3fb99302017-05-16 19:10:32 +02004675 dev = sctx->wr_tgtdev;
Stefan Behrensff023aa2012-11-06 11:43:11 +01004676 if (!dev)
4677 return -EIO;
4678 if (!dev->bdev) {
Jeff Mahoneyfb456252016-06-22 18:54:56 -04004679 btrfs_warn_rl(dev->fs_info,
David Sterba94647322015-10-08 11:01:36 +02004680 "scrub write_page_nocow(bdev == NULL) is unexpected");
Stefan Behrensff023aa2012-11-06 11:43:11 +01004681 return -EIO;
4682 }
David Sterbac5e4c3d2017-06-12 17:29:41 +02004683 bio = btrfs_io_bio_alloc(1);
Kent Overstreet4f024f32013-10-11 15:44:27 -07004684 bio->bi_iter.bi_size = 0;
4685 bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
Christoph Hellwig74d46992017-08-23 19:10:32 +02004686 bio_set_dev(bio, dev->bdev);
Christoph Hellwig70fd7612016-11-01 07:40:10 -06004687 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
Anand Jain7ef2d6a72018-01-05 10:47:07 +08004688 /* bio_add_page won't fail on a freshly allocated bio */
4689 bio_add_page(bio, page, PAGE_SIZE, 0);
4690
4691 if (btrfsic_submit_bio_wait(bio)) {
Stefan Behrensff023aa2012-11-06 11:43:11 +01004692 bio_put(bio);
4693 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
4694 return -EIO;
4695 }
Stefan Behrensff023aa2012-11-06 11:43:11 +01004696
Stefan Behrensff023aa2012-11-06 11:43:11 +01004697 bio_put(bio);
4698 return 0;
4699}
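/*
 * write_page_nocow() above issues a synchronous single-page write at a raw
 * physical offset on the replacement device. A userspace analogue using
 * pwrite(2) on an already opened block device file descriptor (a sketch
 * only; real code would also need O_DIRECT alignment handling):
 */
#include <errno.h>
#include <unistd.h>

#define SKETCH_PAGE_SIZE 4096

static int sketch_write_page(int dev_fd, off_t physical,
                             const char page[SKETCH_PAGE_SIZE])
{
        ssize_t ret = pwrite(dev_fd, page, SKETCH_PAGE_SIZE, physical);

        if (ret != SKETCH_PAGE_SIZE)
                return ret < 0 ? -errno : -EIO;
        return 0;
}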