Btrfs: Leaf reference cache update
This changes the reference cache to keep a single cache per root
instead of one cache per transaction, and to key entries by the byte
number of the disk block instead of by the first key inside the leaf.
This makes cache misses much less likely when a snapshot (or anything
else) holds an extra reference on a higher node or a leaf while the
first transaction that added the leaf to the cache is being dropped.
Some throttling is added to functions that free blocks heavily so they
wait for old transactions to drop.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 34ed23d..4eca0aa 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -594,7 +594,6 @@
spinlock_t ref_cache_lock;
u64 total_ref_cache_size;
- u64 running_ref_cache_size;
u64 avail_data_alloc_bits;
u64 avail_metadata_alloc_bits;
@@ -606,10 +605,18 @@
void *bdev_holder;
};
+struct btrfs_leaf_ref_tree {
+ struct rb_root root;
+ struct btrfs_leaf_ref *last;
+ struct list_head list;
+ spinlock_t lock;
+};
+
/*
* in ram representation of the tree. extent_root is used for all allocations
* and for the extent tree extent_root root.
*/
+struct dirty_root;
struct btrfs_root {
struct extent_buffer *node;
@@ -618,6 +625,8 @@
struct extent_buffer *commit_root;
struct btrfs_leaf_ref_tree *ref_tree;
+ struct btrfs_leaf_ref_tree ref_tree_struct;
+ struct dirty_root *dirty_root;
struct btrfs_root_item root_item;
struct btrfs_key root_key;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4f0e1d0..eccdf13 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -40,6 +40,7 @@
#include "print-tree.h"
#include "async-thread.h"
#include "locking.h"
+#include "ref-cache.h"
#if 0
static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
@@ -737,6 +738,10 @@
spin_lock_init(&root->node_lock);
spin_lock_init(&root->orphan_lock);
mutex_init(&root->objectid_mutex);
+
+ btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
+ root->ref_tree = &root->ref_tree_struct;
+
memset(&root->root_key, 0, sizeof(root->root_key));
memset(&root->root_item, 0, sizeof(root->root_item));
memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
@@ -1176,9 +1181,6 @@
goto sleep;
}
- printk("btrfs: running reference cache size %Lu\n",
- root->fs_info->running_ref_cache_size);
-
now = get_seconds();
if (now < cur->start_time || now - cur->start_time < 30) {
mutex_unlock(&root->fs_info->trans_mutex);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7b24f15..0e294cf 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1004,8 +1004,6 @@
goto out;
}
- btrfs_item_key_to_cpu(buf, &ref->key, 0);
-
ref->bytenr = buf->start;
ref->owner = btrfs_header_owner(buf);
ref->generation = btrfs_header_generation(buf);
@@ -2387,19 +2385,15 @@
}
}
-/*
- * we want to avoid as much random IO as we can with the alloc mutex
- * held, so drop the lock and do the lookup, then do it again with the
- * lock held.
- */
int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len,
u32 *refs)
{
+ int ret;
mutex_unlock(&root->fs_info->alloc_mutex);
- lookup_extent_ref(NULL, root, start, len, refs);
+ ret = lookup_extent_ref(NULL, root, start, len, refs);
cond_resched();
mutex_lock(&root->fs_info->alloc_mutex);
- return lookup_extent_ref(NULL, root, start, len, refs);
+ return ret;
}
/*
@@ -2468,11 +2462,11 @@
BUG_ON(ret);
continue;
}
-
+
if (*level == 1) {
struct btrfs_key key;
btrfs_node_key_to_cpu(cur, &key, path->slots[*level]);
- ref = btrfs_lookup_leaf_ref(root, &key);
+ ref = btrfs_lookup_leaf_ref(root, bytenr);
if (ref) {
ret = drop_leaf_ref(trans, root, ref);
BUG_ON(ret);
@@ -2482,7 +2476,6 @@
break;
}
}
-
next = btrfs_find_tree_block(root, bytenr, blocksize);
if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
free_extent_buffer(next);
@@ -2672,6 +2665,7 @@
ret = -EAGAIN;
break;
}
+ wake_up(&root->fs_info->transaction_throttle);
}
for (i = 0; i <= orig_level; i++) {
if (path->nodes[i]) {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e5ffb66..3efec25 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -347,7 +347,7 @@
btrfs_update_inode(trans, root, inode);
}
failed:
- err = btrfs_end_transaction_throttle(trans, root);
+ err = btrfs_end_transaction(trans, root);
out_unlock:
unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
return err;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index cf9534b..4f977ea 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2482,7 +2482,7 @@
btrfs_update_inode_block_group(trans, dir);
out_unlock:
nr = trans->blocks_used;
- btrfs_end_transaction_throttle(trans, root);
+ btrfs_end_transaction(trans, root);
fail:
if (drop_inode) {
inode_dec_link_count(inode);
@@ -2535,7 +2535,7 @@
drop_inode = 1;
nr = trans->blocks_used;
- btrfs_end_transaction_throttle(trans, root);
+ btrfs_end_transaction(trans, root);
fail:
if (drop_inode) {
inode_dec_link_count(inode);
@@ -2609,7 +2609,7 @@
out_fail:
nr = trans->blocks_used;
- btrfs_end_transaction_throttle(trans, root);
+ btrfs_end_transaction(trans, root);
out_unlock:
if (drop_on_err)
@@ -3548,7 +3548,7 @@
out_unlock:
nr = trans->blocks_used;
- btrfs_end_transaction_throttle(trans, root);
+ btrfs_end_transaction(trans, root);
out_fail:
if (drop_inode) {
inode_dec_link_count(inode);
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index 95a9fae..ec95877 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -29,6 +29,7 @@
if (ref) {
memset(ref, 0, sizeof(*ref));
atomic_set(&ref->usage, 1);
+ INIT_LIST_HEAD(&ref->list);
}
return ref;
}
@@ -44,40 +45,21 @@
}
}
-static int comp_keys(struct btrfs_key *k1, struct btrfs_key *k2)
-{
- if (k1->objectid > k2->objectid)
- return 1;
- if (k1->objectid < k2->objectid)
- return -1;
- if (k1->type > k2->type)
- return 1;
- if (k1->type < k2->type)
- return -1;
- if (k1->offset > k2->offset)
- return 1;
- if (k1->offset < k2->offset)
- return -1;
- return 0;
-}
-
-static struct rb_node *tree_insert(struct rb_root *root, struct btrfs_key *key,
+static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
struct rb_node *node)
{
struct rb_node ** p = &root->rb_node;
struct rb_node * parent = NULL;
struct btrfs_leaf_ref *entry;
- int ret;
while(*p) {
parent = *p;
entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node);
WARN_ON(!entry->in_tree);
- ret = comp_keys(key, &entry->key);
- if (ret < 0)
+ if (bytenr < entry->bytenr)
p = &(*p)->rb_left;
- else if (ret > 0)
+ else if (bytenr > entry->bytenr)
p = &(*p)->rb_right;
else
return parent;
@@ -90,20 +72,18 @@
return NULL;
}
-static struct rb_node *tree_search(struct rb_root *root, struct btrfs_key *key)
+static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
{
struct rb_node * n = root->rb_node;
struct btrfs_leaf_ref *entry;
- int ret;
while(n) {
entry = rb_entry(n, struct btrfs_leaf_ref, rb_node);
WARN_ON(!entry->in_tree);
- ret = comp_keys(key, &entry->key);
- if (ret < 0)
+ if (bytenr < entry->bytenr)
n = n->rb_left;
- else if (ret > 0)
+ else if (bytenr > entry->bytenr)
n = n->rb_right;
else
return n;
@@ -122,11 +102,11 @@
spin_lock(&tree->lock);
while(!btrfs_leaf_ref_tree_empty(tree)) {
- tree->last = NULL;
rb = rb_first(&tree->root);
ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
rb_erase(&ref->rb_node, &tree->root);
ref->in_tree = 0;
+ list_del_init(&ref->list);
spin_unlock(&tree->lock);
@@ -140,7 +120,7 @@
}
struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
- struct btrfs_key *key)
+ u64 bytenr)
{
struct rb_node *rb;
struct btrfs_leaf_ref *ref = NULL;
@@ -150,15 +130,9 @@
return NULL;
spin_lock(&tree->lock);
- if (tree->last && comp_keys(key, &tree->last->key) == 0) {
- ref = tree->last;
- } else {
- rb = tree_search(&tree->root, key);
- if (rb) {
- ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
- tree->last = ref;
- }
- }
+ rb = tree_search(&tree->root, bytenr);
+ if (rb)
+ ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
if (ref)
atomic_inc(&ref->usage);
spin_unlock(&tree->lock);
@@ -171,21 +145,17 @@
struct rb_node *rb;
size_t size = btrfs_leaf_ref_size(ref->nritems);
struct btrfs_leaf_ref_tree *tree = root->ref_tree;
- struct btrfs_transaction *trans = root->fs_info->running_transaction;
spin_lock(&tree->lock);
- rb = tree_insert(&tree->root, &ref->key, &ref->rb_node);
+ rb = tree_insert(&tree->root, ref->bytenr, &ref->rb_node);
if (rb) {
ret = -EEXIST;
} else {
spin_lock(&root->fs_info->ref_cache_lock);
root->fs_info->total_ref_cache_size += size;
- if (trans && tree->generation == trans->transid)
- root->fs_info->running_ref_cache_size += size;
spin_unlock(&root->fs_info->ref_cache_lock);
-
- tree->last = ref;
atomic_inc(&ref->usage);
+ list_add_tail(&ref->list, &tree->list);
}
spin_unlock(&tree->lock);
return ret;
@@ -195,28 +165,17 @@
{
size_t size = btrfs_leaf_ref_size(ref->nritems);
struct btrfs_leaf_ref_tree *tree = root->ref_tree;
- struct btrfs_transaction *trans = root->fs_info->running_transaction;
BUG_ON(!ref->in_tree);
spin_lock(&tree->lock);
spin_lock(&root->fs_info->ref_cache_lock);
root->fs_info->total_ref_cache_size -= size;
- if (trans && tree->generation == trans->transid)
- root->fs_info->running_ref_cache_size -= size;
spin_unlock(&root->fs_info->ref_cache_lock);
- if (tree->last == ref) {
- struct rb_node *next = rb_next(&ref->rb_node);
- if (next) {
- tree->last = rb_entry(next, struct btrfs_leaf_ref,
- rb_node);
- } else
- tree->last = NULL;
- }
-
rb_erase(&ref->rb_node, &tree->root);
ref->in_tree = 0;
+ list_del_init(&ref->list);
spin_unlock(&tree->lock);
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index 79ecc47..823c049 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -15,6 +15,8 @@
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
+#ifndef __REFCACHE__
+#define __REFCACHE__
struct btrfs_extent_info {
u64 bytenr;
@@ -25,7 +27,6 @@
struct btrfs_leaf_ref {
struct rb_node rb_node;
- struct btrfs_key key;
int in_tree;
atomic_t usage;
@@ -33,14 +34,9 @@
u64 owner;
u64 generation;
int nritems;
- struct btrfs_extent_info extents[];
-};
-struct btrfs_leaf_ref_tree {
- struct rb_root root;
- struct btrfs_leaf_ref *last;
- u64 generation;
- spinlock_t lock;
+ struct list_head list;
+ struct btrfs_extent_info extents[];
};
static inline size_t btrfs_leaf_ref_size(int nr_extents)
@@ -53,7 +49,7 @@
{
tree->root.rb_node = NULL;
tree->last = NULL;
- tree->generation = 0;
+ INIT_LIST_HEAD(&tree->list);
spin_lock_init(&tree->lock);
}
@@ -66,7 +62,9 @@
struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(int nr_extents);
void btrfs_free_leaf_ref(struct btrfs_leaf_ref *ref);
struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
- struct btrfs_key *key);
+ u64 bytenr);
int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
int btrfs_remove_leaf_refs(struct btrfs_root *root);
int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
+
+#endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 543e5ee..fcef3ca 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -36,7 +36,6 @@
struct list_head list;
struct btrfs_root *root;
struct btrfs_root *latest_root;
- struct btrfs_leaf_ref_tree ref_tree;
};
static noinline void put_transaction(struct btrfs_transaction *transaction)
@@ -108,13 +107,13 @@
dirty->latest_root = root;
INIT_LIST_HEAD(&dirty->list);
- btrfs_leaf_ref_tree_init(&dirty->ref_tree);
- dirty->ref_tree.generation = running_trans_id;
root->commit_root = btrfs_root_node(root);
- root->ref_tree = &dirty->ref_tree;
+ root->dirty_root = dirty;
memcpy(dirty->root, root, sizeof(*root));
+ dirty->root->ref_tree = &root->ref_tree_struct;
+
spin_lock_init(&dirty->root->node_lock);
mutex_init(&dirty->root->objectid_mutex);
dirty->root->node = root->commit_root;
@@ -217,12 +216,13 @@
if (waitqueue_active(&cur_trans->writer_wait))
wake_up(&cur_trans->writer_wait);
- if (0 && cur_trans->in_commit && throttle) {
+ if (throttle && atomic_read(&root->fs_info->throttles)) {
DEFINE_WAIT(wait);
mutex_unlock(&root->fs_info->trans_mutex);
prepare_to_wait(&root->fs_info->transaction_throttle, &wait,
TASK_UNINTERRUPTIBLE);
- schedule();
+ if (atomic_read(&root->fs_info->throttles))
+ schedule();
finish_wait(&root->fs_info->transaction_throttle, &wait);
mutex_lock(&root->fs_info->trans_mutex);
}
@@ -333,6 +333,8 @@
list_del_init(next);
root = list_entry(next, struct btrfs_root, dirty_list);
update_cowonly_root(trans, root);
+ if (root->fs_info->closing)
+ btrfs_remove_leaf_refs(root);
}
return 0;
}
@@ -346,10 +348,8 @@
dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
if (!dirty)
return -ENOMEM;
- btrfs_leaf_ref_tree_init(&dirty->ref_tree);
dirty->root = root;
dirty->latest_root = latest;
- root->ref_tree = NULL;
list_add(&dirty->list, dead_list);
return 0;
}
@@ -379,18 +379,14 @@
BTRFS_ROOT_TRANS_TAG);
BUG_ON(!root->ref_tree);
- dirty = container_of(root->ref_tree, struct dirty_root,
- ref_tree);
+ dirty = root->dirty_root;
if (root->commit_root == root->node) {
WARN_ON(root->node->start !=
btrfs_root_bytenr(&root->root_item));
- BUG_ON(!btrfs_leaf_ref_tree_empty(
- root->ref_tree));
free_extent_buffer(root->commit_root);
root->commit_root = NULL;
- root->ref_tree = NULL;
kfree(dirty->root);
kfree(dirty);
@@ -410,7 +406,6 @@
sizeof(struct btrfs_disk_key));
root->root_item.drop_level = 0;
root->commit_root = NULL;
- root->ref_tree = NULL;
root->root_key.offset = root->fs_info->generation;
btrfs_set_root_bytenr(&root->root_item,
root->node->start);
@@ -485,7 +480,7 @@
while(!list_empty(list)) {
struct btrfs_root *root;
- dirty = list_entry(list->next, struct dirty_root, list);
+ dirty = list_entry(list->prev, struct dirty_root, list);
list_del_init(&dirty->list);
num_bytes = btrfs_root_used(&dirty->root->root_item);
@@ -507,7 +502,7 @@
if (err)
ret = err;
nr = trans->blocks_used;
- ret = btrfs_end_transaction_throttle(trans, tree_root);
+ ret = btrfs_end_transaction(trans, tree_root);
BUG_ON(ret);
mutex_unlock(&root->fs_info->drop_mutex);
@@ -517,6 +512,7 @@
}
BUG_ON(ret);
atomic_dec(&root->fs_info->throttles);
+ wake_up(&root->fs_info->transaction_throttle);
mutex_lock(&root->fs_info->alloc_mutex);
num_bytes -= btrfs_root_used(&dirty->root->root_item);
@@ -539,8 +535,6 @@
ret = btrfs_end_transaction(trans, tree_root);
BUG_ON(ret);
- btrfs_remove_leaf_refs(dirty->root);
-
free_extent_buffer(dirty->root->node);
kfree(dirty->root);
kfree(dirty);
@@ -725,10 +719,6 @@
&dirty_fs_roots);
BUG_ON(ret);
- spin_lock(&root->fs_info->ref_cache_lock);
- root->fs_info->running_ref_cache_size = 0;
- spin_unlock(&root->fs_info->ref_cache_lock);
-
ret = btrfs_commit_tree_roots(trans, root);
BUG_ON(ret);