Btrfs: Retry metadata reads in the face of checksum failures
Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index efce173..ff4e9c6 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -569,7 +569,16 @@
static int noinline check_block(struct btrfs_root *root,
struct btrfs_path *path, int level)
{
- return 0;
+ u64 found_start;
+ if (btrfs_header_level(path->nodes[level]) != level)
+ printk("warning: bad level %Lu wanted %d found %d\n",
+ path->nodes[level]->start, level,
+ btrfs_header_level(path->nodes[level]));
+ found_start = btrfs_header_bytenr(path->nodes[level]);
+ if (found_start != path->nodes[level]->start) {
+ printk("warning: bad bytentr %Lu found %Lu\n",
+ path->nodes[level]->start, found_start);
+ }
#if 0
struct extent_buffer *buf = path->nodes[level];
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 94ff87d..59bdf04 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -156,7 +156,6 @@
memcpy(&found, result, BTRFS_CRC32_SIZE);
read_extent_buffer(buf, &val, 0, BTRFS_CRC32_SIZE);
- WARN_ON(1);
printk("btrfs: %s checksum verify failed on %llu "
"wanted %X found %X from_this_trans %d "
"level %d\n",
@@ -171,6 +170,40 @@
return 0;
}
+static int btree_read_extent_buffer_pages(struct btrfs_root *root, /* read eb, retrying with successive mirror numbers until a read succeeds */
+ struct extent_buffer *eb,
+ u64 start) /* returns 0 on success, otherwise the error from the last read_extent_buffer_pages() call */
+{
+ struct extent_io_tree *io_tree;
+ int ret;
+ int num_copies = 0;
+ int mirror_num = 0; /* 0 on the first attempt: no specific mirror is requested */
+
+ io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; /* io tree of the btree inode */
+ while (1) {
+ ret = read_extent_buffer_pages(io_tree, eb, start, 1, /* 1 is passed as the 'wait' argument */
+ btree_get_extent, mirror_num);
+ if (!ret) {
+ if (mirror_num) /* only log when a retry on a specific mirror succeeded */
+printk("good read %Lu mirror %d total %d\n", eb->start, mirror_num, num_copies);
+ return ret;
+ }
+ num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, /* read failed: find out how many copies exist */
+ eb->start, eb->len);
+printk("failed to read %Lu mirror %d total %d\n", eb->start, mirror_num, num_copies);
+ if (num_copies == 1) { /* nothing else to try */
+printk("reading %Lu failed only one copy\n", eb->start);
+ return ret;
+ }
+ mirror_num++; /* next attempt asks for mirror 1, 2, ... explicitly */
+ if (mirror_num > num_copies) { /* every mirror number up to num_copies has been tried */
+printk("bailing at mirror %d of %d\n", mirror_num, num_copies);
+ return ret;
+ }
+ }
+printk("read extent buffer page last\n"); /* NOTE(review): appears unreachable - the loop above has no break and leaves only via return */
+ return -EIO;
+}
int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
{
@@ -180,6 +213,8 @@
int found_level;
unsigned long len;
struct extent_buffer *eb;
+ int ret;
+
tree = &BTRFS_I(page->mapping->host)->io_tree;
if (page->private == EXTENT_PAGE_PRIVATE)
@@ -191,8 +226,8 @@
WARN_ON(1);
}
eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
- read_extent_buffer_pages(tree, eb, start + PAGE_CACHE_SIZE, 1,
- btree_get_extent);
+ ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE);
+ BUG_ON(ret);
btrfs_clear_buffer_defrag(eb);
found_start = btrfs_header_bytenr(eb);
if (found_start != start) {
@@ -240,7 +275,7 @@
unsigned long len;
struct extent_buffer *eb;
struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
- int ret;
+ int ret = 0;
tree = &BTRFS_I(page->mapping->host)->io_tree;
if (page->private == EXTENT_PAGE_PRIVATE)
@@ -252,25 +287,26 @@
WARN_ON(1);
}
eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
- read_extent_buffer_pages(tree, eb, start + PAGE_CACHE_SIZE, 1,
- btree_get_extent);
+
btrfs_clear_buffer_defrag(eb);
found_start = btrfs_header_bytenr(eb);
if (found_start != start) {
- printk("warning: eb start incorrect %Lu buffer %Lu len %lu\n",
- start, found_start, len);
- WARN_ON(1);
+printk("bad start on %Lu found %Lu\n", eb->start, found_start);
+ ret = -EIO;
goto err;
}
if (eb->first_page != page) {
printk("bad first page %lu %lu\n", eb->first_page->index,
page->index);
WARN_ON(1);
+ ret = -EIO;
goto err;
}
found_level = btrfs_header_level(eb);
ret = csum_tree_block(root, eb, 1);
+ if (ret)
+ ret = -EIO;
end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
end = eb->start + end - 1;
@@ -278,7 +314,7 @@
err:
free_extent_buffer(eb);
out:
- return 0;
+ return ret;
}
#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
@@ -329,7 +365,8 @@
return 0;
}
-static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio)
+static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
+ int mirror_num)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 offset;
@@ -338,7 +375,7 @@
offset = bio->bi_sector << 9;
if (rw & (1 << BIO_RW)) {
- return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio);
+ return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num);
}
ret = btrfs_bio_wq_end_io(root->fs_info, bio, 1);
@@ -349,7 +386,7 @@
submit_bio(rw, bio);
return 0;
}
- return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio);
+ return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num);
}
static int btree_writepage(struct page *page, struct writeback_control *wbc)
@@ -459,7 +496,7 @@
if (!buf)
return 0;
read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
- buf, 0, 0, btree_get_extent);
+ buf, 0, 0, btree_get_extent, 0);
free_extent_buffer(buf);
return ret;
}
@@ -522,8 +559,7 @@
if (!buf)
return NULL;
- ret = read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, 0,
- 1, btree_get_extent);
+ ret = btree_read_extent_buffer_pages(root, buf, 0);
if (ret == 0) {
buf->flags |= EXTENT_UPTODATE;
@@ -1366,10 +1402,8 @@
int btrfs_read_buffer(struct extent_buffer *buf)
{
struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
- struct inode *btree_inode = root->fs_info->btree_inode;
int ret;
- ret = read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
- buf, 0, 1, btree_get_extent);
+ ret = btree_read_extent_buffer_pages(root, buf, 0);
if (ret == 0) {
buf->flags |= EXTENT_UPTODATE;
}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index cfc383c..2f15937 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1546,7 +1546,7 @@
!(state->state & EXTENT_LOCKED))
state = NULL;
}
- if (!state) {
+ if (!state && uptodate) {
spin_unlock_irqrestore(&tree->lock, flags);
set_extent_uptodate(tree, start, end,
GFP_ATOMIC);
@@ -1567,8 +1567,10 @@
} else {
state = NULL;
}
- set_state_cb(tree, clear, EXTENT_UPTODATE);
- clear->state |= EXTENT_UPTODATE;
+ if (uptodate) {
+ set_state_cb(tree, clear, EXTENT_UPTODATE);
+ clear->state |= EXTENT_UPTODATE;
+ }
clear_state_bit(tree, clear, EXTENT_LOCKED,
1, 0);
if (cur == start)
@@ -1685,7 +1687,7 @@
return bio;
}
-static int submit_one_bio(int rw, struct bio *bio)
+static int submit_one_bio(int rw, struct bio *bio, int mirror_num)
{
u64 maxsector;
int ret = 0;
@@ -1722,7 +1724,8 @@
WARN_ON(1);
}
if (tree->ops && tree->ops->submit_bio_hook)
- tree->ops->submit_bio_hook(page->mapping->host, rw, bio);
+ tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
+ mirror_num);
else
submit_bio(rw, bio);
if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -1737,7 +1740,8 @@
struct block_device *bdev,
struct bio **bio_ret,
unsigned long max_pages,
- bio_end_io_t end_io_func)
+ bio_end_io_t end_io_func,
+ int mirror_num)
{
int ret = 0;
struct bio *bio;
@@ -1749,7 +1753,7 @@
(tree->ops && tree->ops->merge_bio_hook &&
tree->ops->merge_bio_hook(page, offset, size, bio)) ||
bio_add_page(bio, page, size, offset) < size) {
- ret = submit_one_bio(rw, bio);
+ ret = submit_one_bio(rw, bio, mirror_num);
bio = NULL;
} else {
return 0;
@@ -1769,7 +1773,7 @@
if (bio_ret) {
*bio_ret = bio;
} else {
- ret = submit_one_bio(rw, bio);
+ ret = submit_one_bio(rw, bio, mirror_num);
}
return ret;
@@ -1798,7 +1802,7 @@
static int __extent_read_full_page(struct extent_io_tree *tree,
struct page *page,
get_extent_t *get_extent,
- struct bio **bio)
+ struct bio **bio, int mirror_num)
{
struct inode *inode = page->mapping->host;
u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
@@ -1901,7 +1905,7 @@
ret = submit_extent_page(READ, tree, page,
sector, iosize, page_offset,
bdev, bio, nr,
- end_bio_extent_readpage);
+ end_bio_extent_readpage, mirror_num);
}
if (ret)
SetPageError(page);
@@ -1923,9 +1927,9 @@
struct bio *bio = NULL;
int ret;
- ret = __extent_read_full_page(tree, page, get_extent, &bio);
+ ret = __extent_read_full_page(tree, page, get_extent, &bio, 0);
if (bio)
- submit_one_bio(READ, bio);
+ submit_one_bio(READ, bio, 0);
return ret;
}
EXPORT_SYMBOL(extent_read_full_page);
@@ -2077,7 +2081,7 @@
ret = submit_extent_page(WRITE, tree, page, sector,
iosize, page_offset, bdev,
&epd->bio, max_nr,
- end_bio_extent_writepage);
+ end_bio_extent_writepage, 0);
if (ret)
SetPageError(page);
}
@@ -2244,7 +2248,7 @@
write_cache_pages(mapping, &wbc_writepages, __extent_writepage, &epd);
if (epd.bio) {
- submit_one_bio(WRITE, epd.bio);
+ submit_one_bio(WRITE, epd.bio, 0);
}
return ret;
}
@@ -2265,7 +2269,7 @@
ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd);
if (epd.bio) {
- submit_one_bio(WRITE, epd.bio);
+ submit_one_bio(WRITE, epd.bio, 0);
}
return ret;
}
@@ -2297,7 +2301,8 @@
page_cache_get(page);
if (!pagevec_add(&pvec, page))
__pagevec_lru_add(&pvec);
- __extent_read_full_page(tree, page, get_extent, &bio);
+ __extent_read_full_page(tree, page, get_extent,
+ &bio, 0);
}
page_cache_release(page);
}
@@ -2305,7 +2310,7 @@
__pagevec_lru_add(&pvec);
BUG_ON(!list_empty(pages));
if (bio)
- submit_one_bio(READ, bio);
+ submit_one_bio(READ, bio, 0);
return 0;
}
EXPORT_SYMBOL(extent_readpages);
@@ -2430,7 +2435,7 @@
ret = submit_extent_page(READ, tree, page,
sector, iosize, page_offset, em->bdev,
NULL, 1,
- end_bio_extent_preparewrite);
+ end_bio_extent_preparewrite, 0);
iocount++;
block_start = block_start + iosize;
} else {
@@ -2696,6 +2701,7 @@
mark_page_accessed(page0);
set_page_extent_mapped(page0);
set_page_extent_head(page0, len);
+ uptodate = PageUptodate(page0);
} else {
i = 0;
}
@@ -3006,7 +3012,7 @@
int read_extent_buffer_pages(struct extent_io_tree *tree,
struct extent_buffer *eb,
u64 start, int wait,
- get_extent_t *get_extent)
+ get_extent_t *get_extent, int mirror_num)
{
unsigned long i;
unsigned long start_i;
@@ -3062,8 +3068,10 @@
if (!PageUptodate(page)) {
if (start_i == 0)
inc_all_pages = 1;
+ ClearPageError(page);
err = __extent_read_full_page(tree, page,
- get_extent, &bio);
+ get_extent, &bio,
+ mirror_num);
if (err) {
ret = err;
}
@@ -3073,7 +3081,7 @@
}
if (bio)
- submit_one_bio(READ, bio);
+ submit_one_bio(READ, bio, mirror_num);
if (ret || !wait) {
return ret;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 9d2991d..8d6b8a1 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -27,7 +27,8 @@
struct extent_io_ops {
int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
- int (*submit_bio_hook)(struct inode *inode, int rw, struct bio *bio);
+ int (*submit_bio_hook)(struct inode *inode, int rw, struct bio *bio,
+ int mirror_num);
int (*merge_bio_hook)(struct page *page, unsigned long offset,
size_t size, struct bio *bio);
int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
@@ -172,7 +173,7 @@
void free_extent_buffer(struct extent_buffer *eb);
int read_extent_buffer_pages(struct extent_io_tree *tree,
struct extent_buffer *eb, u64 start, int wait,
- get_extent_t *get_extent);
+ get_extent_t *get_extent, int mirror_num);
static inline void extent_buffer_get(struct extent_buffer *eb)
{
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e1ef1ac..8c2d5d0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -314,7 +314,7 @@
map_tree = &root->fs_info->mapping_tree;
map_length = length;
ret = btrfs_map_block(map_tree, READ, logical,
- &map_length, NULL);
+ &map_length, NULL, 0);
if (map_length < length + size) {
return 1;
@@ -322,7 +322,8 @@
return 0;
}
-int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio)
+int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
+ int mirror_num)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
@@ -347,7 +348,7 @@
BUG_ON(ret);
mutex_unlock(&root->fs_info->fs_mutex);
mapit:
- return btrfs_map_bio(root, rw, bio);
+ return btrfs_map_bio(root, rw, bio, mirror_num);
}
int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 008d364..3b927f6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -788,9 +788,31 @@
}
}
+int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) /* returns num_stripes for DUP/RAID1 mappings, 1 otherwise */
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct extent_map_tree *em_tree = &map_tree->map_tree;
+ int ret;
+
+ spin_lock(&em_tree->lock); /* lookup and use of em happen under the tree lock */
+ em = lookup_extent_mapping(em_tree, logical, len);
+ BUG_ON(!em); /* a missing mapping is treated as fatal */
+
+ BUG_ON(em->start > logical || em->start + em->len < logical); /* NOTE(review): '<' lets logical == em->start + em->len pass - confirm the end bound is intended */
+ map = (struct map_lookup *)em->bdev; /* the bdev field is reinterpreted as the map_lookup for this mapping */
+ if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
+ ret = map->num_stripes; /* for these two types the stripe count is used as the copy count */
+ else
+ ret = 1;
+ free_extent_map(em); /* done with em */
+ spin_unlock(&em_tree->lock);
+ return ret;
+}
+
int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
u64 logical, u64 *length,
- struct btrfs_multi_bio **multi_ret)
+ struct btrfs_multi_bio **multi_ret, int mirror_num)
{
struct extent_map *em;
struct map_lookup *map;
@@ -822,6 +844,9 @@
map = (struct map_lookup *)em->bdev;
offset = logical - em->start;
+ if (mirror_num > map->num_stripes)
+ mirror_num = 0;
+
/* if our multi bio struct is too small, back off and try again */
if (multi_ret && (rw & (1 << BIO_RW)) &&
stripes_allocated < map->num_stripes &&
@@ -862,7 +887,9 @@
if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
if (rw & (1 << BIO_RW))
multi->num_stripes = map->num_stripes;
- else {
+ else if (mirror_num) {
+ stripe_index = mirror_num - 1;
+ } else {
int i;
u64 least = (u64)-1;
struct btrfs_device *cur;
@@ -880,6 +907,8 @@
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
if (rw & (1 << BIO_RW))
multi->num_stripes = map->num_stripes;
+ else if (mirror_num)
+ stripe_index = mirror_num - 1;
} else {
/*
* after this do_div call, stripe_nr is the number of stripes
@@ -938,7 +967,8 @@
#endif
}
-int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
+int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
+ int mirror_num)
{
struct btrfs_mapping_tree *map_tree;
struct btrfs_device *dev;
@@ -960,7 +990,8 @@
map_tree = &root->fs_info->mapping_tree;
map_length = length;
- ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi);
+ ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
+ mirror_num);
BUG_ON(ret);
total_devs = multi->num_stripes;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 10ca010..3d5d0a9 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -93,7 +93,7 @@
u64 owner, u64 num_bytes, u64 *start);
int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
u64 logical, u64 *length,
- struct btrfs_multi_bio **multi_ret);
+ struct btrfs_multi_bio **multi_ret, int mirror_num);
int btrfs_read_sys_array(struct btrfs_root *root);
int btrfs_read_chunk_tree(struct btrfs_root *root);
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
@@ -101,7 +101,8 @@
u64 *num_bytes, u64 type);
void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
-int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio);
+int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
+ int mirror_num);
int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
int flags, void *holder);
@@ -112,4 +113,5 @@
struct btrfs_root *root,
struct btrfs_device *device);
int btrfs_cleanup_fs_uuids(void);
+int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
#endif