Btrfs: Use async helpers to deal with pages that have been improperly dirtied
Higher layers sometimes call set_page_dirty without asking the filesystem
to help. This causes many problems for the data=ordered and cow code.
This commit detects pages that haven't been properly setup for IO and
kicks off an async helper to deal with them.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ceebc05..4ddc8a8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -546,6 +546,12 @@
struct btrfs_workers endio_workers;
struct btrfs_workers endio_write_workers;
struct btrfs_workers submit_workers;
+ /*
+ * fixup workers take dirty pages that didn't properly go through
+ * the cow mechanism and make them safe to write. It happens
+ * for the sys_munmap function call path
+ */
+ struct btrfs_workers fixup_workers;
struct task_struct *transaction_kthread;
struct task_struct *cleaner_kthread;
int thread_pool_size;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4a5ebaf..66466d1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1329,11 +1329,13 @@
*/
btrfs_init_workers(&fs_info->workers, fs_info->thread_pool_size);
btrfs_init_workers(&fs_info->submit_workers, fs_info->thread_pool_size);
+ btrfs_init_workers(&fs_info->fixup_workers, 1);
btrfs_init_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
btrfs_init_workers(&fs_info->endio_write_workers,
fs_info->thread_pool_size);
btrfs_start_workers(&fs_info->workers, 1);
btrfs_start_workers(&fs_info->submit_workers, 1);
+ btrfs_start_workers(&fs_info->fixup_workers, 1);
btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
btrfs_start_workers(&fs_info->endio_write_workers,
fs_info->thread_pool_size);
@@ -1454,6 +1456,7 @@
fail_sys_array:
fail_sb_buffer:
extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
+ btrfs_stop_workers(&fs_info->fixup_workers);
btrfs_stop_workers(&fs_info->workers);
btrfs_stop_workers(&fs_info->endio_workers);
btrfs_stop_workers(&fs_info->endio_write_workers);
@@ -1710,6 +1713,7 @@
truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
+ btrfs_stop_workers(&fs_info->fixup_workers);
btrfs_stop_workers(&fs_info->workers);
btrfs_stop_workers(&fs_info->endio_workers);
btrfs_stop_workers(&fs_info->endio_write_workers);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3f82a6e..feff16c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2050,6 +2050,16 @@
lock_extent(tree, start, page_end, GFP_NOFS);
unlock_start = start;
+ if (tree->ops && tree->ops->writepage_start_hook) {
+ ret = tree->ops->writepage_start_hook(page, start, page_end);
+ if (ret == -EAGAIN) {
+ unlock_extent(tree, start, page_end, GFP_NOFS);
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ }
+
end = page_end;
if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
printk("found delalloc bits after lock_extent\n");
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 2268a79..23affd2 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -30,6 +30,7 @@
struct bio *bio, int mirror_num);
struct extent_io_ops {
int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
+ int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
extent_submit_bio_hook_t *submit_bio_hook;
int (*merge_bio_hook)(struct page *page, unsigned long offset,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 12e765f..2092863 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -313,6 +313,7 @@
for (i = 0; i < num_pages; i++) {
struct page *p = pages[i];
SetPageUptodate(p);
+ ClearPageChecked(p);
set_page_dirty(p);
}
} else {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c5a62f0..47a008c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -403,6 +403,87 @@
return 0;
}
+struct btrfs_writepage_fixup {
+ struct page *page;
+ struct btrfs_work work;
+};
+
+/* see btrfs_writepage_start_hook for details on why this is required */
+void btrfs_writepage_fixup_worker(struct btrfs_work *work)
+{
+ struct btrfs_writepage_fixup *fixup;
+ struct btrfs_ordered_extent *ordered;
+ struct page *page;
+ struct inode *inode;
+ u64 page_start;
+ u64 page_end;
+
+ fixup = container_of(work, struct btrfs_writepage_fixup, work);
+ page = fixup->page;
+
+ lock_page(page);
+ if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
+ ClearPageChecked(page);
+ goto out_page;
+ }
+
+ inode = page->mapping->host;
+ page_start = page_offset(page);
+ page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
+
+ lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
+ ordered = btrfs_lookup_ordered_extent(inode, page_start);
+ if (ordered)
+ goto out;
+
+ set_extent_delalloc(&BTRFS_I(inode)->io_tree, page_start, page_end,
+ GFP_NOFS);
+ ClearPageChecked(page);
+out:
+ unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
+out_page:
+ unlock_page(page);
+ page_cache_release(page);
+}
+
+/*
+ * There are a few paths in the higher layers of the kernel that directly
+ * set the page dirty bit without asking the filesystem if it is a
+ * good idea. This causes problems because we want to make sure COW
+ * properly happens and the data=ordered rules are followed.
+ *
+ * In our case any range that doesn't have the EXTENT_ORDERED bit set
+ * hasn't been properly setup for IO. We kick off an async process
+ * to fix it up. The async helper will wait for ordered extents, set
+ * the delalloc bit and make it safe to write the page.
+ */
+int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
+{
+ struct inode *inode = page->mapping->host;
+ struct btrfs_writepage_fixup *fixup;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
+ EXTENT_ORDERED, 0);
+ if (ret)
+ return 0;
+
+ if (PageChecked(page))
+ return -EAGAIN;
+
+ fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
+ if (!fixup)
+ return -EAGAIN;
+printk("queueing worker to fixup page %lu %Lu\n", inode->i_ino, page_offset(page));
+ SetPageChecked(page);
+ page_cache_get(page);
+ fixup->work.func = btrfs_writepage_fixup_worker;
+ fixup->page = page;
+ btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
+ return -EAGAIN;
+}
+
int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
struct extent_state *state, int uptodate)
{
@@ -1263,6 +1344,7 @@
flush_dcache_page(page);
kunmap(page);
}
+ ClearPageChecked(page);
set_page_dirty(page);
unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
@@ -2658,6 +2740,7 @@
flush_dcache_page(page);
kunmap(page);
}
+ ClearPageChecked(page);
set_page_dirty(page);
unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
@@ -3039,15 +3122,6 @@
static int btrfs_set_page_dirty(struct page *page)
{
- struct inode *inode = page->mapping->host;
- u64 page_start = page_offset(page);
- u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
-
- if (!test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
- EXTENT_DELALLOC, 0)) {
-printk("inode %lu page %Lu not delalloc\n", inode->i_ino, page_offset(page));
-WARN_ON(1);
- }
return __set_page_dirty_nobuffers(page);
}
@@ -3098,6 +3172,7 @@
.readpage_io_hook = btrfs_readpage_io_hook,
.readpage_end_io_hook = btrfs_readpage_end_io_hook,
.writepage_end_io_hook = btrfs_writepage_end_io_hook,
+ .writepage_start_hook = btrfs_writepage_start_hook,
.readpage_io_failed_hook = btrfs_io_failed_hook,
.set_bit_hook = btrfs_set_bit_hook,
.clear_bit_hook = btrfs_clear_bit_hook,