Btrfs: kill trans_mutex
We use trans_mutex for lots of things, here's a basic list
1) To serialize trans_handles joining the currently running transaction
2) To make sure that no new trans handles are started while we are committing
3) To protect the dead_roots list and the transaction lists
Really the serializing trans_handles joining is not too hard, and can really get
bogged down in acquiring a reference to the transaction. So replace the
trans_mutex with a trans_lock spinlock and use it to do the following
1) Protect fs_info->running_transaction. All trans handles have to do is check
this, and then take a reference of the transaction and keep on going.
2) Protect the fs_info->trans_list. This doesn't get used too much, basically
it just holds the current transactions, which will usually just be the currently
committing transaction and the currently running transaction at most.
3) Protect the dead roots list. This is only ever processed by splicing the
list so this is relatively simple.
4) Protect the fs_info->reloc_ctl stuff. This is very lightweight and was using
the trans_mutex before, so this is a pretty straightforward change.
5) Protect fs_info->no_trans_join. Because we don't hold the trans_lock over
the entirety of the commit we need to have a way to block new people from
creating a new transaction while we're doing our work. So we set no_trans_join
and in join_transaction we test to see if that is set, and if it is we do a
wait_on_commit.
6) Make the transaction use count atomic so we don't need to take locks to
modify it when we're dropping references.
7) Add a commit_lock to the transaction to make sure multiple people trying to
commit the same transaction don't race and commit at the same time.
8) Make open_ioctl_trans an atomic so we don't have to take any locks for ioctl
trans.
I have tested this with xfstests, but obviously it is a pretty hairy change so
lots of testing is greatly appreciated. Thanks,
Signed-off-by: Josef Bacik <josef@redhat.com>
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 46f4056..43816f8 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -34,6 +34,7 @@
{
WARN_ON(atomic_read(&transaction->use_count) == 0);
if (atomic_dec_and_test(&transaction->use_count)) {
+ BUG_ON(!list_empty(&transaction->list));
memset(transaction, 0, sizeof(*transaction));
kmem_cache_free(btrfs_transaction_cachep, transaction);
}
@@ -48,47 +49,73 @@
/*
* either allocate a new transaction or hop into the existing one
*/
-static noinline int join_transaction(struct btrfs_root *root)
+static noinline int join_transaction(struct btrfs_root *root, int nofail)
{
struct btrfs_transaction *cur_trans;
+
+ spin_lock(&root->fs_info->trans_lock);
+ if (root->fs_info->trans_no_join) {
+ if (!nofail) {
+ spin_unlock(&root->fs_info->trans_lock);
+ return -EBUSY;
+ }
+ }
+
cur_trans = root->fs_info->running_transaction;
- if (!cur_trans) {
- cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
- GFP_NOFS);
- if (!cur_trans)
- return -ENOMEM;
- root->fs_info->generation++;
- atomic_set(&cur_trans->num_writers, 1);
- cur_trans->num_joined = 0;
- cur_trans->transid = root->fs_info->generation;
- init_waitqueue_head(&cur_trans->writer_wait);
- init_waitqueue_head(&cur_trans->commit_wait);
- cur_trans->in_commit = 0;
- cur_trans->blocked = 0;
- atomic_set(&cur_trans->use_count, 1);
- cur_trans->commit_done = 0;
- cur_trans->start_time = get_seconds();
-
- cur_trans->delayed_refs.root = RB_ROOT;
- cur_trans->delayed_refs.num_entries = 0;
- cur_trans->delayed_refs.num_heads_ready = 0;
- cur_trans->delayed_refs.num_heads = 0;
- cur_trans->delayed_refs.flushing = 0;
- cur_trans->delayed_refs.run_delayed_start = 0;
- spin_lock_init(&cur_trans->delayed_refs.lock);
-
- INIT_LIST_HEAD(&cur_trans->pending_snapshots);
- list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
- extent_io_tree_init(&cur_trans->dirty_pages,
- root->fs_info->btree_inode->i_mapping,
- GFP_NOFS);
- spin_lock(&root->fs_info->new_trans_lock);
- root->fs_info->running_transaction = cur_trans;
- spin_unlock(&root->fs_info->new_trans_lock);
- } else {
+ if (cur_trans) {
+ atomic_inc(&cur_trans->use_count);
atomic_inc(&cur_trans->num_writers);
cur_trans->num_joined++;
+ spin_unlock(&root->fs_info->trans_lock);
+ return 0;
}
+ spin_unlock(&root->fs_info->trans_lock);
+
+ cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
+ if (!cur_trans)
+ return -ENOMEM;
+ spin_lock(&root->fs_info->trans_lock);
+ if (root->fs_info->running_transaction) {
+ kmem_cache_free(btrfs_transaction_cachep, cur_trans);
+ cur_trans = root->fs_info->running_transaction;
+ atomic_inc(&cur_trans->use_count);
+ atomic_inc(&cur_trans->num_writers);
+ cur_trans->num_joined++;
+ spin_unlock(&root->fs_info->trans_lock);
+ return 0;
+ }
+ atomic_set(&cur_trans->num_writers, 1);
+ cur_trans->num_joined = 0;
+ init_waitqueue_head(&cur_trans->writer_wait);
+ init_waitqueue_head(&cur_trans->commit_wait);
+ cur_trans->in_commit = 0;
+ cur_trans->blocked = 0;
+ /*
+ * One for this trans handle, one so it will live on until we
+ * commit the transaction.
+ */
+ atomic_set(&cur_trans->use_count, 2);
+ cur_trans->commit_done = 0;
+ cur_trans->start_time = get_seconds();
+
+ cur_trans->delayed_refs.root = RB_ROOT;
+ cur_trans->delayed_refs.num_entries = 0;
+ cur_trans->delayed_refs.num_heads_ready = 0;
+ cur_trans->delayed_refs.num_heads = 0;
+ cur_trans->delayed_refs.flushing = 0;
+ cur_trans->delayed_refs.run_delayed_start = 0;
+ spin_lock_init(&cur_trans->commit_lock);
+ spin_lock_init(&cur_trans->delayed_refs.lock);
+
+ INIT_LIST_HEAD(&cur_trans->pending_snapshots);
+ list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
+ extent_io_tree_init(&cur_trans->dirty_pages,
+ root->fs_info->btree_inode->i_mapping,
+ GFP_NOFS);
+ root->fs_info->generation++;
+ cur_trans->transid = root->fs_info->generation;
+ root->fs_info->running_transaction = cur_trans;
+ spin_unlock(&root->fs_info->trans_lock);
return 0;
}
@@ -99,39 +126,28 @@
* to make sure the old root from before we joined the transaction is deleted
* when the transaction commits
*/
-static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
{
if (root->ref_cows && root->last_trans < trans->transid) {
WARN_ON(root == root->fs_info->extent_root);
WARN_ON(root->commit_root != root->node);
+ spin_lock(&root->fs_info->fs_roots_radix_lock);
+ if (root->last_trans == trans->transid) {
+ spin_unlock(&root->fs_info->fs_roots_radix_lock);
+ return 0;
+ }
+ root->last_trans = trans->transid;
radix_tree_tag_set(&root->fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
BTRFS_ROOT_TRANS_TAG);
- root->last_trans = trans->transid;
+ spin_unlock(&root->fs_info->fs_roots_radix_lock);
btrfs_init_reloc_root(trans, root);
}
return 0;
}
-int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
-{
- if (!root->ref_cows)
- return 0;
-
- mutex_lock(&root->fs_info->trans_mutex);
- if (root->last_trans == trans->transid) {
- mutex_unlock(&root->fs_info->trans_mutex);
- return 0;
- }
-
- record_root_in_trans(trans, root);
- mutex_unlock(&root->fs_info->trans_mutex);
- return 0;
-}
-
/* wait for commit against the current transaction to become unblocked
* when this is done, it is safe to start a new transaction, but the current
* transaction might not be fully on disk.
@@ -140,21 +156,23 @@
{
struct btrfs_transaction *cur_trans;
+ spin_lock(&root->fs_info->trans_lock);
cur_trans = root->fs_info->running_transaction;
if (cur_trans && cur_trans->blocked) {
DEFINE_WAIT(wait);
atomic_inc(&cur_trans->use_count);
+ spin_unlock(&root->fs_info->trans_lock);
while (1) {
prepare_to_wait(&root->fs_info->transaction_wait, &wait,
TASK_UNINTERRUPTIBLE);
if (!cur_trans->blocked)
break;
- mutex_unlock(&root->fs_info->trans_mutex);
schedule();
- mutex_lock(&root->fs_info->trans_mutex);
}
finish_wait(&root->fs_info->transaction_wait, &wait);
put_transaction(cur_trans);
+ } else {
+ spin_unlock(&root->fs_info->trans_lock);
}
}
@@ -167,10 +185,16 @@
static int may_wait_transaction(struct btrfs_root *root, int type)
{
- if (!root->fs_info->log_root_recovering &&
- ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
- type == TRANS_USERSPACE))
+ if (root->fs_info->log_root_recovering)
+ return 0;
+
+ if (type == TRANS_USERSPACE)
return 1;
+
+ if (type == TRANS_START &&
+ !atomic_read(&root->fs_info->open_ioctl_trans))
+ return 1;
+
return 0;
}
@@ -198,23 +222,21 @@
if (!h)
return ERR_PTR(-ENOMEM);
- if (type != TRANS_JOIN_NOLOCK)
- mutex_lock(&root->fs_info->trans_mutex);
if (may_wait_transaction(root, type))
wait_current_trans(root);
- ret = join_transaction(root);
+ do {
+ ret = join_transaction(root, type == TRANS_JOIN_NOLOCK);
+ if (ret == -EBUSY)
+ wait_current_trans(root);
+ } while (ret == -EBUSY);
+
if (ret < 0) {
kmem_cache_free(btrfs_trans_handle_cachep, h);
- if (type != TRANS_JOIN_NOLOCK)
- mutex_unlock(&root->fs_info->trans_mutex);
return ERR_PTR(ret);
}
cur_trans = root->fs_info->running_transaction;
- atomic_inc(&cur_trans->use_count);
- if (type != TRANS_JOIN_NOLOCK)
- mutex_unlock(&root->fs_info->trans_mutex);
h->transid = cur_trans->transid;
h->transaction = cur_trans;
@@ -253,11 +275,7 @@
}
got_it:
- if (type != TRANS_JOIN_NOLOCK)
- mutex_lock(&root->fs_info->trans_mutex);
- record_root_in_trans(h, root);
- if (type != TRANS_JOIN_NOLOCK)
- mutex_unlock(&root->fs_info->trans_mutex);
+ btrfs_record_root_in_trans(h, root);
if (!current->journal_info && type != TRANS_USERSPACE)
current->journal_info = h;
@@ -289,17 +307,13 @@
struct btrfs_transaction *commit)
{
DEFINE_WAIT(wait);
- mutex_lock(&root->fs_info->trans_mutex);
while (!commit->commit_done) {
prepare_to_wait(&commit->commit_wait, &wait,
TASK_UNINTERRUPTIBLE);
if (commit->commit_done)
break;
- mutex_unlock(&root->fs_info->trans_mutex);
schedule();
- mutex_lock(&root->fs_info->trans_mutex);
}
- mutex_unlock(&root->fs_info->trans_mutex);
finish_wait(&commit->commit_wait, &wait);
return 0;
}
@@ -309,50 +323,49 @@
struct btrfs_transaction *cur_trans = NULL, *t;
int ret;
- mutex_lock(&root->fs_info->trans_mutex);
-
ret = 0;
if (transid) {
if (transid <= root->fs_info->last_trans_committed)
- goto out_unlock;
+ goto out;
/* find specified transaction */
+ spin_lock(&root->fs_info->trans_lock);
list_for_each_entry(t, &root->fs_info->trans_list, list) {
if (t->transid == transid) {
cur_trans = t;
+ atomic_inc(&cur_trans->use_count);
break;
}
if (t->transid > transid)
break;
}
+ spin_unlock(&root->fs_info->trans_lock);
ret = -EINVAL;
if (!cur_trans)
- goto out_unlock; /* bad transid */
+ goto out; /* bad transid */
} else {
/* find newest transaction that is committing | committed */
+ spin_lock(&root->fs_info->trans_lock);
list_for_each_entry_reverse(t, &root->fs_info->trans_list,
list) {
if (t->in_commit) {
if (t->commit_done)
- goto out_unlock;
+ goto out;
cur_trans = t;
+ atomic_inc(&cur_trans->use_count);
break;
}
}
+ spin_unlock(&root->fs_info->trans_lock);
if (!cur_trans)
- goto out_unlock; /* nothing committing|committed */
+ goto out; /* nothing committing|committed */
}
- atomic_inc(&cur_trans->use_count);
- mutex_unlock(&root->fs_info->trans_mutex);
-
wait_for_commit(root, cur_trans);
- mutex_lock(&root->fs_info->trans_mutex);
put_transaction(cur_trans);
ret = 0;
-out_unlock:
- mutex_unlock(&root->fs_info->trans_mutex);
+out:
return ret;
}
@@ -401,10 +414,8 @@
void btrfs_throttle(struct btrfs_root *root)
{
- mutex_lock(&root->fs_info->trans_mutex);
- if (!root->fs_info->open_ioctl_trans)
+ if (!atomic_read(&root->fs_info->open_ioctl_trans))
wait_current_trans(root);
- mutex_unlock(&root->fs_info->trans_mutex);
}
static int should_end_transaction(struct btrfs_trans_handle *trans,
@@ -422,6 +433,7 @@
struct btrfs_transaction *cur_trans = trans->transaction;
int updates;
+ smp_mb();
if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
return 1;
@@ -467,9 +479,11 @@
btrfs_trans_release_metadata(trans, root);
- if (lock && !root->fs_info->open_ioctl_trans &&
- should_end_transaction(trans, root))
+ if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
+ should_end_transaction(trans, root)) {
trans->transaction->blocked = 1;
+ smp_wmb();
+ }
if (lock && cur_trans->blocked && !cur_trans->in_commit) {
if (throttle)
@@ -739,9 +753,9 @@
*/
int btrfs_add_dead_root(struct btrfs_root *root)
{
- mutex_lock(&root->fs_info->trans_mutex);
+ spin_lock(&root->fs_info->trans_lock);
list_add(&root->root_list, &root->fs_info->dead_roots);
- mutex_unlock(&root->fs_info->trans_mutex);
+ spin_unlock(&root->fs_info->trans_lock);
return 0;
}
@@ -757,6 +771,7 @@
int ret;
int err = 0;
+ spin_lock(&fs_info->fs_roots_radix_lock);
while (1) {
ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
(void **)gang, 0,
@@ -769,6 +784,7 @@
radix_tree_tag_clear(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
BTRFS_ROOT_TRANS_TAG);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
btrfs_free_log(trans, root);
btrfs_update_reloc_root(trans, root);
@@ -783,10 +799,12 @@
err = btrfs_update_root(trans, fs_info->tree_root,
&root->root_key,
&root->root_item);
+ spin_lock(&fs_info->fs_roots_radix_lock);
if (err)
break;
}
}
+ spin_unlock(&fs_info->fs_roots_radix_lock);
return err;
}
@@ -972,7 +990,7 @@
parent = dget_parent(dentry);
parent_inode = parent->d_inode;
parent_root = BTRFS_I(parent_inode)->root;
- record_root_in_trans(trans, parent_root);
+ btrfs_record_root_in_trans(trans, parent_root);
/*
* insert the directory item
@@ -990,7 +1008,7 @@
ret = btrfs_update_inode(trans, parent_root, parent_inode);
BUG_ON(ret);
- record_root_in_trans(trans, root);
+ btrfs_record_root_in_trans(trans, root);
btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
btrfs_check_and_init_root_item(new_root_item);
@@ -1080,20 +1098,20 @@
int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
{
int ret = 0;
- spin_lock(&info->new_trans_lock);
+ spin_lock(&info->trans_lock);
if (info->running_transaction)
ret = info->running_transaction->in_commit;
- spin_unlock(&info->new_trans_lock);
+ spin_unlock(&info->trans_lock);
return ret;
}
int btrfs_transaction_blocked(struct btrfs_fs_info *info)
{
int ret = 0;
- spin_lock(&info->new_trans_lock);
+ spin_lock(&info->trans_lock);
if (info->running_transaction)
ret = info->running_transaction->blocked;
- spin_unlock(&info->new_trans_lock);
+ spin_unlock(&info->trans_lock);
return ret;
}
@@ -1117,9 +1135,7 @@
&wait);
break;
}
- mutex_unlock(&root->fs_info->trans_mutex);
schedule();
- mutex_lock(&root->fs_info->trans_mutex);
finish_wait(&root->fs_info->transaction_blocked_wait, &wait);
}
}
@@ -1145,9 +1161,7 @@
&wait);
break;
}
- mutex_unlock(&root->fs_info->trans_mutex);
schedule();
- mutex_lock(&root->fs_info->trans_mutex);
finish_wait(&root->fs_info->transaction_wait,
&wait);
}
@@ -1193,22 +1207,18 @@
}
/* take transaction reference */
- mutex_lock(&root->fs_info->trans_mutex);
cur_trans = trans->transaction;
atomic_inc(&cur_trans->use_count);
- mutex_unlock(&root->fs_info->trans_mutex);
btrfs_end_transaction(trans, root);
schedule_delayed_work(&ac->work, 0);
/* wait for transaction to start and unblock */
- mutex_lock(&root->fs_info->trans_mutex);
if (wait_for_unblock)
wait_current_trans_commit_start_and_unblock(root, cur_trans);
else
wait_current_trans_commit_start(root, cur_trans);
put_transaction(cur_trans);
- mutex_unlock(&root->fs_info->trans_mutex);
return 0;
}
@@ -1252,38 +1262,41 @@
ret = btrfs_run_delayed_refs(trans, root, 0);
BUG_ON(ret);
- mutex_lock(&root->fs_info->trans_mutex);
+ spin_lock(&cur_trans->commit_lock);
if (cur_trans->in_commit) {
+ spin_unlock(&cur_trans->commit_lock);
atomic_inc(&cur_trans->use_count);
- mutex_unlock(&root->fs_info->trans_mutex);
btrfs_end_transaction(trans, root);
ret = wait_for_commit(root, cur_trans);
BUG_ON(ret);
- mutex_lock(&root->fs_info->trans_mutex);
put_transaction(cur_trans);
- mutex_unlock(&root->fs_info->trans_mutex);
return 0;
}
trans->transaction->in_commit = 1;
trans->transaction->blocked = 1;
+ spin_unlock(&cur_trans->commit_lock);
wake_up(&root->fs_info->transaction_blocked_wait);
+ spin_lock(&root->fs_info->trans_lock);
if (cur_trans->list.prev != &root->fs_info->trans_list) {
prev_trans = list_entry(cur_trans->list.prev,
struct btrfs_transaction, list);
if (!prev_trans->commit_done) {
atomic_inc(&prev_trans->use_count);
- mutex_unlock(&root->fs_info->trans_mutex);
+ spin_unlock(&root->fs_info->trans_lock);
wait_for_commit(root, prev_trans);
- mutex_lock(&root->fs_info->trans_mutex);
put_transaction(prev_trans);
+ } else {
+ spin_unlock(&root->fs_info->trans_lock);
}
+ } else {
+ spin_unlock(&root->fs_info->trans_lock);
}
if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
@@ -1291,12 +1304,12 @@
do {
int snap_pending = 0;
+
joined = cur_trans->num_joined;
if (!list_empty(&trans->transaction->pending_snapshots))
snap_pending = 1;
WARN_ON(cur_trans != trans->transaction);
- mutex_unlock(&root->fs_info->trans_mutex);
if (flush_on_commit || snap_pending) {
btrfs_start_delalloc_inodes(root, 1);
@@ -1316,14 +1329,15 @@
prepare_to_wait(&cur_trans->writer_wait, &wait,
TASK_UNINTERRUPTIBLE);
- smp_mb();
if (atomic_read(&cur_trans->num_writers) > 1)
schedule_timeout(MAX_SCHEDULE_TIMEOUT);
else if (should_grow)
schedule_timeout(1);
- mutex_lock(&root->fs_info->trans_mutex);
finish_wait(&cur_trans->writer_wait, &wait);
+ spin_lock(&root->fs_info->trans_lock);
+ root->fs_info->trans_no_join = 1;
+ spin_unlock(&root->fs_info->trans_lock);
} while (atomic_read(&cur_trans->num_writers) > 1 ||
(should_grow && cur_trans->num_joined != joined));
@@ -1364,9 +1378,6 @@
btrfs_prepare_extent_commit(trans, root);
cur_trans = root->fs_info->running_transaction;
- spin_lock(&root->fs_info->new_trans_lock);
- root->fs_info->running_transaction = NULL;
- spin_unlock(&root->fs_info->new_trans_lock);
btrfs_set_root_node(&root->fs_info->tree_root->root_item,
root->fs_info->tree_root->node);
@@ -1387,10 +1398,13 @@
sizeof(root->fs_info->super_copy));
trans->transaction->blocked = 0;
+ spin_lock(&root->fs_info->trans_lock);
+ root->fs_info->running_transaction = NULL;
+ root->fs_info->trans_no_join = 0;
+ spin_unlock(&root->fs_info->trans_lock);
wake_up(&root->fs_info->transaction_wait);
- mutex_unlock(&root->fs_info->trans_mutex);
ret = btrfs_write_and_wait_transaction(trans, root);
BUG_ON(ret);
write_ctree_super(trans, root, 0);
@@ -1403,22 +1417,21 @@
btrfs_finish_extent_commit(trans, root);
- mutex_lock(&root->fs_info->trans_mutex);
-
cur_trans->commit_done = 1;
root->fs_info->last_trans_committed = cur_trans->transid;
wake_up(&cur_trans->commit_wait);
+ spin_lock(&root->fs_info->trans_lock);
list_del_init(&cur_trans->list);
+ spin_unlock(&root->fs_info->trans_lock);
+
put_transaction(cur_trans);
put_transaction(cur_trans);
trace_btrfs_transaction_commit(root);
- mutex_unlock(&root->fs_info->trans_mutex);
-
if (current->journal_info == trans)
current->journal_info = NULL;
@@ -1438,9 +1451,9 @@
LIST_HEAD(list);
struct btrfs_fs_info *fs_info = root->fs_info;
- mutex_lock(&fs_info->trans_mutex);
+ spin_lock(&fs_info->trans_lock);
list_splice_init(&fs_info->dead_roots, &list);
- mutex_unlock(&fs_info->trans_mutex);
+ spin_unlock(&fs_info->trans_lock);
while (!list_empty(&list)) {
root = list_entry(list.next, struct btrfs_root, root_list);