Btrfs: finish ordered extents in their own thread
We noticed that the ordered extent completion doesn't really rely on having
a page and that it could be done independently of ending the writeback on a
page. This patch makes us not do the threaded endio stuff for normal
buffered writes and direct writes so we can end page writeback as soon as
possible (in irq context) and only start threads to do the ordered work when
it is actually done. Compression needs to be reworked somewhat to take
advantage of this as well, but at the moment it has to do a find_get_page in
its endio handler, so it must be done in its own thread. This makes direct
writes quite a bit faster. Thanks,
Signed-off-by: Josef Bacik <josef@redhat.com>
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 9565c02..9e138cd 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -196,7 +196,7 @@
entry->len = len;
entry->disk_len = disk_len;
entry->bytes_left = len;
- entry->inode = inode;
+ entry->inode = igrab(inode);
entry->compress_type = compress_type;
if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
set_bit(type, &entry->flags);
@@ -212,12 +212,12 @@
trace_btrfs_ordered_extent_add(inode, entry);
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
node = tree_insert(&tree->tree, file_offset,
&entry->rb_node);
if (node)
ordered_data_tree_panic(inode, -EEXIST, file_offset);
- spin_unlock(&tree->lock);
+ spin_unlock_irq(&tree->lock);
spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
list_add_tail(&entry->root_extent_list,
@@ -264,9 +264,9 @@
struct btrfs_ordered_inode_tree *tree;
tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
list_add_tail(&sum->list, &entry->list);
- spin_unlock(&tree->lock);
+ spin_unlock_irq(&tree->lock);
}
/*
@@ -283,18 +283,19 @@
*/
int btrfs_dec_test_first_ordered_pending(struct inode *inode,
struct btrfs_ordered_extent **cached,
- u64 *file_offset, u64 io_size)
+ u64 *file_offset, u64 io_size, int uptodate)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
int ret;
+ unsigned long flags;
u64 dec_end;
u64 dec_start;
u64 to_dec;
tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
+ spin_lock_irqsave(&tree->lock, flags);
node = tree_search(tree, *file_offset);
if (!node) {
ret = 1;
@@ -323,6 +324,9 @@
(unsigned long long)to_dec);
}
entry->bytes_left -= to_dec;
+ if (!uptodate)
+ set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
+
if (entry->bytes_left == 0)
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
else
@@ -332,7 +336,7 @@
*cached = entry;
atomic_inc(&entry->refs);
}
- spin_unlock(&tree->lock);
+ spin_unlock_irqrestore(&tree->lock, flags);
return ret == 0;
}
@@ -347,15 +351,21 @@
*/
int btrfs_dec_test_ordered_pending(struct inode *inode,
struct btrfs_ordered_extent **cached,
- u64 file_offset, u64 io_size)
+ u64 file_offset, u64 io_size, int uptodate)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
+ unsigned long flags;
int ret;
tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
+ spin_lock_irqsave(&tree->lock, flags);
+ if (cached && *cached) {
+ entry = *cached;
+ goto have_entry;
+ }
+
node = tree_search(tree, file_offset);
if (!node) {
ret = 1;
@@ -363,6 +373,7 @@
}
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+have_entry:
if (!offset_in_entry(entry, file_offset)) {
ret = 1;
goto out;
@@ -374,6 +385,9 @@
(unsigned long long)io_size);
}
entry->bytes_left -= io_size;
+ if (!uptodate)
+ set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
+
if (entry->bytes_left == 0)
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
else
@@ -383,7 +397,7 @@
*cached = entry;
atomic_inc(&entry->refs);
}
- spin_unlock(&tree->lock);
+ spin_unlock_irqrestore(&tree->lock, flags);
return ret == 0;
}
@@ -399,6 +413,8 @@
trace_btrfs_ordered_extent_put(entry->inode, entry);
if (atomic_dec_and_test(&entry->refs)) {
+ if (entry->inode)
+ btrfs_add_delayed_iput(entry->inode);
while (!list_empty(&entry->list)) {
cur = entry->list.next;
sum = list_entry(cur, struct btrfs_ordered_sum, list);
@@ -411,21 +427,22 @@
/*
* remove an ordered extent from the tree. No references are dropped
- * and you must wake_up entry->wait. You must hold the tree lock
- * while you call this function.
+ * and waiters are woken up.
*/
-static void __btrfs_remove_ordered_extent(struct inode *inode,
- struct btrfs_ordered_extent *entry)
+void btrfs_remove_ordered_extent(struct inode *inode,
+ struct btrfs_ordered_extent *entry)
{
struct btrfs_ordered_inode_tree *tree;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct rb_node *node;
tree = &BTRFS_I(inode)->ordered_tree;
+ spin_lock_irq(&tree->lock);
node = &entry->rb_node;
rb_erase(node, &tree->tree);
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+ spin_unlock_irq(&tree->lock);
spin_lock(&root->fs_info->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
@@ -442,21 +459,6 @@
list_del_init(&BTRFS_I(inode)->ordered_operations);
}
spin_unlock(&root->fs_info->ordered_extent_lock);
-}
-
-/*
- * remove an ordered extent from the tree. No references are dropped
- * but any waiters are woken.
- */
-void btrfs_remove_ordered_extent(struct inode *inode,
- struct btrfs_ordered_extent *entry)
-{
- struct btrfs_ordered_inode_tree *tree;
-
- tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
- __btrfs_remove_ordered_extent(inode, entry);
- spin_unlock(&tree->lock);
wake_up(&entry->wait);
}
@@ -663,7 +665,7 @@
struct btrfs_ordered_extent *entry = NULL;
tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
node = tree_search(tree, file_offset);
if (!node)
goto out;
@@ -674,7 +676,7 @@
if (entry)
atomic_inc(&entry->refs);
out:
- spin_unlock(&tree->lock);
+ spin_unlock_irq(&tree->lock);
return entry;
}
@@ -690,7 +692,7 @@
struct btrfs_ordered_extent *entry = NULL;
tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
node = tree_search(tree, file_offset);
if (!node) {
node = tree_search(tree, file_offset + len);
@@ -715,7 +717,7 @@
out:
if (entry)
atomic_inc(&entry->refs);
- spin_unlock(&tree->lock);
+ spin_unlock_irq(&tree->lock);
return entry;
}
@@ -731,7 +733,7 @@
struct btrfs_ordered_extent *entry = NULL;
tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
node = tree_search(tree, file_offset);
if (!node)
goto out;
@@ -739,7 +741,7 @@
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
atomic_inc(&entry->refs);
out:
- spin_unlock(&tree->lock);
+ spin_unlock_irq(&tree->lock);
return entry;
}
@@ -765,7 +767,7 @@
else
offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
disk_i_size = BTRFS_I(inode)->disk_i_size;
/* truncate file */
@@ -803,15 +805,18 @@
}
node = prev;
}
- while (node) {
+ for (; node; node = rb_prev(node)) {
test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+
+ /* We treat this entry as if it doesn't exist */
+ if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
+ continue;
if (test->file_offset + test->len <= disk_i_size)
break;
if (test->file_offset >= i_size)
break;
if (test->file_offset >= disk_i_size)
goto out;
- node = rb_prev(node);
}
new_i_size = min_t(u64, offset, i_size);
@@ -829,17 +834,27 @@
else
node = rb_first(&tree->tree);
}
- i_size_test = 0;
- if (node) {
- /*
- * do we have an area where IO might have finished
- * between our ordered extent and the next one.
- */
+
+ /*
+ * We are looking for an area between our current extent and the next
+ * ordered extent to update the i_size to. There are 3 cases here
+ *
+ * 1) We don't actually have anything and we can update to i_size.
+ * 2) We have stuff but they already did their i_size update so again we
+ * can just update to i_size.
+ * 3) We have an outstanding ordered extent so the most we can update
+ * our disk_i_size to is the start of the next offset.
+ */
+ i_size_test = i_size;
+ for (; node; node = rb_next(node)) {
test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
- if (test->file_offset > offset)
+
+ if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
+ continue;
+ if (test->file_offset > offset) {
i_size_test = test->file_offset;
- } else {
- i_size_test = i_size;
+ break;
+ }
}
/*
@@ -853,15 +868,15 @@
ret = 0;
out:
/*
- * we need to remove the ordered extent with the tree lock held
- * so that other people calling this function don't find our fully
- * processed ordered entry and skip updating the i_size
+ * We need to do this because we can't remove ordered extents until
+ * after the disk_i_size has been updated and then the inode has been
+ * updated to reflect the change, so we need to tell anybody who finds
+ * this ordered extent that we've already done all the real work, we
+ * just haven't completed all the other work.
*/
if (ordered)
- __btrfs_remove_ordered_extent(inode, ordered);
- spin_unlock(&tree->lock);
- if (ordered)
- wake_up(&ordered->wait);
+ set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
+ spin_unlock_irq(&tree->lock);
return ret;
}
@@ -886,7 +901,7 @@
if (!ordered)
return 1;
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
if (disk_bytenr >= ordered_sum->bytenr) {
num_sectors = ordered_sum->len / sectorsize;
@@ -901,7 +916,7 @@
}
}
out:
- spin_unlock(&tree->lock);
+ spin_unlock_irq(&tree->lock);
btrfs_put_ordered_extent(ordered);
return ret;
}