Btrfs: Start btree concurrency work.

The allocation trees and the chunk trees are each serialized via their own
dedicated mutex.  This means allocation locking is still not very
fine grained.

The main FS btree is protected by locks on each block in the btree.  Locks
are taken top-down: once processing finishes on a given level of the
tree, that level's lock is released after the lower level has been locked.

The end result of a search is now a path where only the lowest level
is locked.  Releasing or freeing the path drops any locks held.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 7ed6b39..0e2dcc7 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,7 +6,7 @@
 	   hash.o file-item.o inode-item.o inode-map.o disk-io.o \
 	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
-	   extent_io.o volumes.o async-thread.o ioctl.o
+	   extent_io.o volumes.o async-thread.o ioctl.o locking.o
 
 btrfs-$(CONFIG_FS_POSIX_ACL)	+= acl.o
 else
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 1d404bd..75625c6 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -21,6 +21,7 @@
 #include "disk-io.h"
 #include "transaction.h"
 #include "print-tree.h"
+#include "locking.h"
 
 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_path *path, int level);
@@ -64,12 +65,47 @@
 	int i;
 	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
 		if (!p->nodes[i])
-			break;
+			continue;
+		if (p->locks[i]) {
+			btrfs_tree_unlock(p->nodes[i]);
+			p->locks[i] = 0;
+		}
 		free_extent_buffer(p->nodes[i]);
 	}
 	memset(p, 0, sizeof(*p));
 }
 
+struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
+{
+	struct extent_buffer *eb;	/* snapshot of root->node */
+	spin_lock(&root->node_lock);	/* root->node is changed under this lock */
+	eb = root->node;
+	extent_buffer_get(eb);	/* done under node_lock, before eb is returned */
+	spin_unlock(&root->node_lock);
+	return eb;	/* returned without taking the tree lock */
+}
+
+struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
+{
+	struct extent_buffer *eb;
+	/* lock the current root, retrying if root->node changed meanwhile */
+	while(1) {
+		eb = btrfs_root_node(root);
+		btrfs_tree_lock(eb);
+		/* re-check under node_lock that eb is still the root */
+		spin_lock(&root->node_lock);
+		if (eb == root->node) {
+			spin_unlock(&root->node_lock);
+			break;
+		}
+		spin_unlock(&root->node_lock);
+		/* stale root: undo the tree lock and the get, then retry */
+		btrfs_tree_unlock(eb);
+		free_extent_buffer(eb);
+	}
+	return eb;
+}
+
 static void add_root_to_dirty_list(struct btrfs_root *root)
 {
 	if (root->track_dirty && list_empty(&root->dirty_list)) {
@@ -111,7 +147,7 @@
 	} else {
 		first_key.objectid = 0;
 	}
-	cow = __btrfs_alloc_free_block(trans, new_root, buf->len,
+	cow = btrfs_alloc_free_block(trans, new_root, buf->len,
 				       new_root_objectid,
 				       trans->transid, first_key.objectid,
 				       level, buf->start, 0);
@@ -151,8 +187,14 @@
 	int ret = 0;
 	int different_trans = 0;
 	int level;
+	int unlock_orig = 0;
 	struct btrfs_key first_key;
 
+	if (*cow_ret == buf)
+		unlock_orig = 1;
+
+	WARN_ON(!btrfs_tree_locked(buf));
+
 	if (root->ref_cows) {
 		root_gen = trans->transid;
 	} else {
@@ -172,7 +214,7 @@
 	} else {
 		first_key.objectid = 0;
 	}
-	cow = __btrfs_alloc_free_block(trans, root, buf->len,
+	cow = btrfs_alloc_free_block(trans, root, buf->len,
 				     root->root_key.objectid,
 				     root_gen, first_key.objectid, level,
 				     search_start, empty_size);
@@ -196,9 +238,14 @@
 	}
 
 	if (buf == root->node) {
+		WARN_ON(parent && parent != buf);
 		root_gen = btrfs_header_generation(buf);
+
+		spin_lock(&root->node_lock);
 		root->node = cow;
 		extent_buffer_get(cow);
+		spin_unlock(&root->node_lock);
+
 		if (buf != root->commit_root) {
 			btrfs_free_extent(trans, root, buf->start,
 					  buf->len, root->root_key.objectid,
@@ -219,6 +266,8 @@
 				  btrfs_header_owner(parent), root_gen,
 				  0, 0, 1);
 	}
+	if (unlock_orig)
+		btrfs_tree_unlock(buf);
 	free_extent_buffer(buf);
 	btrfs_mark_buffer_dirty(cow);
 	*cow_ret = cow;
@@ -316,6 +365,9 @@
 	int progress_passed = 0;
 	struct btrfs_disk_key disk_key;
 
+	/* FIXME this code needs locking */
+	return 0;
+
 	parent_level = btrfs_header_level(parent);
 	if (cache_only && parent_level != 1)
 		return 0;
@@ -729,6 +781,7 @@
 		return 0;
 
 	mid = path->nodes[level];
+	WARN_ON(!path->locks[level]);
 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
 
 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
@@ -749,14 +802,21 @@
 
 		/* promote the child to a root */
 		child = read_node_slot(root, mid, 0);
+		btrfs_tree_lock(child);
 		BUG_ON(!child);
 		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
 		BUG_ON(ret);
 
+		spin_lock(&root->node_lock);
 		root->node = child;
+		spin_unlock(&root->node_lock);
+
 		add_root_to_dirty_list(root);
+		btrfs_tree_unlock(child);
+		path->locks[level] = 0;
 		path->nodes[level] = NULL;
 		clean_tree_block(trans, root, mid);
+		btrfs_tree_unlock(mid);
 		/* once for the path */
 		free_extent_buffer(mid);
 		ret = btrfs_free_extent(trans, root, mid->start, mid->len,
@@ -775,6 +835,7 @@
 
 	left = read_node_slot(root, parent, pslot - 1);
 	if (left) {
+		btrfs_tree_lock(left);
 		wret = btrfs_cow_block(trans, root, left,
 				       parent, pslot - 1, &left);
 		if (wret) {
@@ -784,6 +845,7 @@
 	}
 	right = read_node_slot(root, parent, pslot + 1);
 	if (right) {
+		btrfs_tree_lock(right);
 		wret = btrfs_cow_block(trans, root, right,
 				       parent, pslot + 1, &right);
 		if (wret) {
@@ -815,6 +877,7 @@
 			u32 blocksize = right->len;
 
 			clean_tree_block(trans, root, right);
+			btrfs_tree_unlock(right);
 			free_extent_buffer(right);
 			right = NULL;
 			wret = del_ptr(trans, root, path, level + 1, pslot +
@@ -862,7 +925,9 @@
 		u64 root_gen = btrfs_header_generation(parent);
 		u64 bytenr = mid->start;
 		u32 blocksize = mid->len;
+
 		clean_tree_block(trans, root, mid);
+		btrfs_tree_unlock(mid);
 		free_extent_buffer(mid);
 		mid = NULL;
 		wret = del_ptr(trans, root, path, level + 1, pslot);
@@ -885,11 +950,14 @@
 	if (left) {
 		if (btrfs_header_nritems(left) > orig_slot) {
 			extent_buffer_get(left);
+			/* left was locked after cow */
 			path->nodes[level] = left;
 			path->slots[level + 1] -= 1;
 			path->slots[level] = orig_slot;
-			if (mid)
+			if (mid) {
+				btrfs_tree_unlock(mid);
 				free_extent_buffer(mid);
+			}
 		} else {
 			orig_slot -= btrfs_header_nritems(left);
 			path->slots[level] = orig_slot;
@@ -901,10 +969,15 @@
 	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
 		BUG();
 enospc:
-	if (right)
+	if (right) {
+		btrfs_tree_unlock(right);
 		free_extent_buffer(right);
-	if (left)
+	}
+	if (left) {
+		if (path->nodes[level] != left)
+			btrfs_tree_unlock(left);
 		free_extent_buffer(left);
+	}
 	return ret;
 }
 
@@ -942,6 +1015,8 @@
 	/* first, try to make some room in the middle buffer */
 	if (left) {
 		u32 left_nr;
+
+		btrfs_tree_lock(left);
 		left_nr = btrfs_header_nritems(left);
 		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
 			wret = 1;
@@ -967,24 +1042,28 @@
 				path->nodes[level] = left;
 				path->slots[level + 1] -= 1;
 				path->slots[level] = orig_slot;
+				btrfs_tree_unlock(mid);
 				free_extent_buffer(mid);
 			} else {
 				orig_slot -=
 					btrfs_header_nritems(left);
 				path->slots[level] = orig_slot;
+				btrfs_tree_unlock(left);
 				free_extent_buffer(left);
 			}
 			return 0;
 		}
+		btrfs_tree_unlock(left);
 		free_extent_buffer(left);
 	}
-	right= read_node_slot(root, parent, pslot + 1);
+	right = read_node_slot(root, parent, pslot + 1);
 
 	/*
 	 * then try to empty the right most buffer into the middle
 	 */
 	if (right) {
 		u32 right_nr;
+		btrfs_tree_lock(right);
 		right_nr = btrfs_header_nritems(right);
 		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
 			wret = 1;
@@ -1013,12 +1092,15 @@
 				path->slots[level + 1] += 1;
 				path->slots[level] = orig_slot -
 					btrfs_header_nritems(mid);
+				btrfs_tree_unlock(mid);
 				free_extent_buffer(mid);
 			} else {
+				btrfs_tree_unlock(right);
 				free_extent_buffer(right);
 			}
 			return 0;
 		}
+		btrfs_tree_unlock(right);
 		free_extent_buffer(right);
 	}
 	return 1;
@@ -1050,6 +1132,8 @@
 		return;
 
 	node = path->nodes[level];
+	WARN_ON(!path->skip_locking && !btrfs_tree_locked(node));
+
 	search = btrfs_node_blockptr(node, slot);
 	blocksize = btrfs_level_size(root, level - 1);
 	eb = btrfs_find_tree_block(root, search, blocksize);
@@ -1098,6 +1182,39 @@
 			highest_read = search;
 	}
 }
+
+static void unlock_up(struct btrfs_path *path, int level, int lowest_unlock)
+{
+	int i;
+	int skip_level = level;	/* levels <= skip_level are not unlocked */
+	struct extent_buffer *t;
+	/* walk up from level, releasing path locks that need not be held */
+	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
+		if (!path->nodes[i])
+			break;
+		if (!path->locks[i])
+			break;
+		if (path->slots[i] == 0) {	/* keep this level and its parent */
+			skip_level = i + 1;
+			continue;
+		}
+		if (path->keep_locks) {
+			u32 nritems;
+			t = path->nodes[i];
+			nritems = btrfs_header_nritems(t);
+			if (path->slots[i] >= nritems - 1) { /* last slot: same */
+				skip_level = i + 1;
+				continue;
+			}
+		}
+		t = path->nodes[i];
+		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
+			btrfs_tree_unlock(t);
+			path->locks[i] = 0;
+		}
+	}
+}
+
 /*
  * look for key in the tree.  path is filled in with nodes along the way
  * if key is found, we return zero and you can find the item in the leaf
@@ -1120,15 +1237,27 @@
 	int ret;
 	int level;
 	int should_reada = p->reada;
+	int lowest_unlock = 1;
 	u8 lowest_level = 0;
 
 	lowest_level = p->lowest_level;
 	WARN_ON(lowest_level && ins_len);
 	WARN_ON(p->nodes[0] != NULL);
-	WARN_ON(!mutex_is_locked(&root->fs_info->fs_mutex));
+	// WARN_ON(!mutex_is_locked(&root->fs_info->fs_mutex));
+	WARN_ON(root == root->fs_info->extent_root &&
+		!mutex_is_locked(&root->fs_info->alloc_mutex));
+	WARN_ON(root == root->fs_info->chunk_root &&
+		!mutex_is_locked(&root->fs_info->chunk_mutex));
+	WARN_ON(root == root->fs_info->dev_root &&
+		!mutex_is_locked(&root->fs_info->chunk_mutex));
+	if (ins_len < 0)
+		lowest_unlock = 2;
 again:
-	b = root->node;
-	extent_buffer_get(b);
+	if (!p->skip_locking)
+		b = btrfs_lock_root_node(root);
+	else
+		b = btrfs_root_node(root);
+
 	while (b) {
 		level = btrfs_header_level(b);
 		if (cow) {
@@ -1147,9 +1276,12 @@
 			WARN_ON(1);
 		level = btrfs_header_level(b);
 		p->nodes[level] = b;
+		if (!p->skip_locking)
+			p->locks[level] = 1;
 		ret = check_block(root, p, level);
 		if (ret)
 			return -1;
+
 		ret = bin_search(b, key, level, &slot);
 		if (level != 0) {
 			if (ret && slot > 0)
@@ -1177,14 +1309,19 @@
 				BUG_ON(btrfs_header_nritems(b) == 1);
 			}
 			/* this is only true while dropping a snapshot */
-			if (level == lowest_level)
+			if (level == lowest_level) {
+				unlock_up(p, level, lowest_unlock);
 				break;
+			}
 
 			if (should_reada)
 				reada_for_search(root, p, level, slot,
 						 key->objectid);
 
 			b = read_node_slot(root, b, slot);
+			if (!p->skip_locking)
+				btrfs_tree_lock(b);
+			unlock_up(p, level, lowest_unlock);
 		} else {
 			p->slots[level] = slot;
 			if (ins_len > 0 && btrfs_leaf_free_space(root, b) <
@@ -1195,6 +1332,7 @@
 				if (sret)
 					return sret;
 			}
+			unlock_up(p, level, lowest_unlock);
 			return ret;
 		}
 	}
@@ -1225,6 +1363,13 @@
 			break;
 		t = path->nodes[i];
 		btrfs_set_node_key(t, key, tslot);
+		if (!btrfs_tree_locked(path->nodes[i])) {
+			int ii;
+printk("fixup without lock on level %d\n", btrfs_header_level(path->nodes[i]));
+			for (ii = 0; ii < BTRFS_MAX_LEVEL; ii++) {
+printk("level %d slot %d\n", ii, path->slots[ii]);
+			}
+		}
 		btrfs_mark_buffer_dirty(path->nodes[i]);
 		if (tslot != 0)
 			break;
@@ -1370,6 +1515,7 @@
 	u64 lower_gen;
 	struct extent_buffer *lower;
 	struct extent_buffer *c;
+	struct extent_buffer *old;
 	struct btrfs_disk_key lower_key;
 
 	BUG_ON(path->nodes[level]);
@@ -1386,12 +1532,13 @@
 	else
 		btrfs_node_key(lower, &lower_key, 0);
 
-	c = __btrfs_alloc_free_block(trans, root, root->nodesize,
+	c = btrfs_alloc_free_block(trans, root, root->nodesize,
 				   root->root_key.objectid,
 				   root_gen, lower_key.objectid, level,
 				   root->node->start, 0);
 	if (IS_ERR(c))
 		return PTR_ERR(c);
+
 	memset_extent_buffer(c, 0, 0, root->nodesize);
 	btrfs_set_header_nritems(c, 1);
 	btrfs_set_header_level(c, level);
@@ -1416,23 +1563,31 @@
 
 	btrfs_mark_buffer_dirty(c);
 
-	/* the super has an extra ref to root->node */
-	free_extent_buffer(root->node);
+	spin_lock(&root->node_lock);
+	old = root->node;
 	root->node = c;
+	spin_unlock(&root->node_lock);
+
+	/* the super has an extra ref to root->node */
+	free_extent_buffer(old);
+
 	add_root_to_dirty_list(root);
 	extent_buffer_get(c);
 	path->nodes[level] = c;
+	path->locks[level] = 1;
 	path->slots[level] = 0;
 
 	if (root->ref_cows && lower_gen != trans->transid) {
 		struct btrfs_path *back_path = btrfs_alloc_path();
 		int ret;
+		mutex_lock(&root->fs_info->alloc_mutex);
 		ret = btrfs_insert_extent_backref(trans,
 						  root->fs_info->extent_root,
 						  path, lower->start,
 						  root->root_key.objectid,
 						  trans->transid, 0, 0);
 		BUG_ON(ret);
+		mutex_unlock(&root->fs_info->alloc_mutex);
 		btrfs_free_path(back_path);
 	}
 	return 0;
@@ -1521,7 +1676,7 @@
 		root_gen = 0;
 
 	btrfs_node_key(c, &disk_key, 0);
-	split = __btrfs_alloc_free_block(trans, root, root->nodesize,
+	split = btrfs_alloc_free_block(trans, root, root->nodesize,
 					 root->root_key.objectid,
 					 root_gen,
 					 btrfs_disk_key_objectid(&disk_key),
@@ -1564,10 +1719,12 @@
 
 	if (path->slots[level] >= mid) {
 		path->slots[level] -= mid;
+		btrfs_tree_unlock(c);
 		free_extent_buffer(c);
 		path->nodes[level] = split;
 		path->slots[level + 1] += 1;
 	} else {
+		btrfs_tree_unlock(split);
 		free_extent_buffer(split);
 	}
 	return ret;
@@ -1648,30 +1805,24 @@
 		return 1;
 
 	right = read_node_slot(root, upper, slot + 1);
+	btrfs_tree_lock(right);
 	free_space = btrfs_leaf_free_space(root, right);
-	if (free_space < data_size + sizeof(struct btrfs_item)) {
-		free_extent_buffer(right);
-		return 1;
-	}
+	if (free_space < data_size + sizeof(struct btrfs_item))
+		goto out_unlock;
 
 	/* cow and double check */
 	ret = btrfs_cow_block(trans, root, right, upper,
 			      slot + 1, &right);
-	if (ret) {
-		free_extent_buffer(right);
-		return 1;
-	}
+	if (ret)
+		goto out_unlock;
+
 	free_space = btrfs_leaf_free_space(root, right);
-	if (free_space < data_size + sizeof(struct btrfs_item)) {
-		free_extent_buffer(right);
-		return 1;
-	}
+	if (free_space < data_size + sizeof(struct btrfs_item))
+		goto out_unlock;
 
 	left_nritems = btrfs_header_nritems(left);
-	if (left_nritems == 0) {
-		free_extent_buffer(right);
-		return 1;
-	}
+	if (left_nritems == 0)
+		goto out_unlock;
 
 	if (empty)
 		nr = 0;
@@ -1707,10 +1858,8 @@
 		left->map_token = NULL;
 	}
 
-	if (push_items == 0) {
-		free_extent_buffer(right);
-		return 1;
-	}
+	if (push_items == 0)
+		goto out_unlock;
 
 	if (!empty && push_items == left_nritems)
 		WARN_ON(1);
@@ -1778,14 +1927,24 @@
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[0] >= left_nritems) {
 		path->slots[0] -= left_nritems;
+		if (btrfs_header_nritems(path->nodes[0]) == 0)
+			clean_tree_block(trans, root, path->nodes[0]);
+		btrfs_tree_unlock(path->nodes[0]);
 		free_extent_buffer(path->nodes[0]);
 		path->nodes[0] = right;
 		path->slots[1] += 1;
 	} else {
+		btrfs_tree_unlock(right);
 		free_extent_buffer(right);
 	}
 	return 0;
+
+out_unlock:
+	btrfs_tree_unlock(right);
+	free_extent_buffer(right);
+	return 1;
 }
+
 /*
  * push some data in the path leaf to the left, trying to free up at
  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
@@ -1823,10 +1982,11 @@
 	}
 
 	left = read_node_slot(root, path->nodes[1], slot - 1);
+	btrfs_tree_lock(left);
 	free_space = btrfs_leaf_free_space(root, left);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
-		free_extent_buffer(left);
-		return 1;
+		ret = 1;
+		goto out;
 	}
 
 	/* cow and double check */
@@ -1834,14 +1994,14 @@
 			      path->nodes[1], slot - 1, &left);
 	if (ret) {
 		/* we hit -ENOSPC, but it isn't fatal here */
-		free_extent_buffer(left);
-		return 1;
+		ret = 1;
+		goto out;
 	}
 
 	free_space = btrfs_leaf_free_space(root, left);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
-		free_extent_buffer(left);
-		return 1;
+		ret = 1;
+		goto out;
 	}
 
 	if (empty)
@@ -1876,8 +2036,8 @@
 	}
 
 	if (push_items == 0) {
-		free_extent_buffer(left);
-		return 1;
+		ret = 1;
+		goto out;
 	}
 	if (!empty && push_items == btrfs_header_nritems(right))
 		WARN_ON(1);
@@ -1975,15 +2135,23 @@
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[0] < push_items) {
 		path->slots[0] += old_left_nritems;
+		if (btrfs_header_nritems(path->nodes[0]) == 0)
+			clean_tree_block(trans, root, path->nodes[0]);
+		btrfs_tree_unlock(path->nodes[0]);
 		free_extent_buffer(path->nodes[0]);
 		path->nodes[0] = left;
 		path->slots[1] -= 1;
 	} else {
+		btrfs_tree_unlock(left);
 		free_extent_buffer(left);
 		path->slots[0] -= push_items;
 	}
 	BUG_ON(path->slots[0] < 0);
 	return ret;
+out:
+	btrfs_tree_unlock(left);
+	free_extent_buffer(left);
+	return ret;
 }
 
 /*
@@ -2052,7 +2220,7 @@
 
 	btrfs_item_key(l, &disk_key, 0);
 
-	right = __btrfs_alloc_free_block(trans, root, root->leafsize,
+	right = btrfs_alloc_free_block(trans, root, root->leafsize,
 					 root->root_key.objectid,
 					 root_gen, disk_key.objectid, 0,
 					 l->start, 0);
@@ -2085,6 +2253,8 @@
 						  path->slots[1] + 1, 1);
 				if (wret)
 					ret = wret;
+
+				btrfs_tree_unlock(path->nodes[0]);
 				free_extent_buffer(path->nodes[0]);
 				path->nodes[0] = right;
 				path->slots[0] = 0;
@@ -2111,6 +2281,7 @@
 						  path->slots[1], 1);
 				if (wret)
 					ret = wret;
+				btrfs_tree_unlock(path->nodes[0]);
 				free_extent_buffer(path->nodes[0]);
 				path->nodes[0] = right;
 				path->slots[0] = 0;
@@ -2184,12 +2355,15 @@
 	BUG_ON(path->slots[0] != slot);
 
 	if (mid <= slot) {
+		btrfs_tree_unlock(path->nodes[0]);
 		free_extent_buffer(path->nodes[0]);
 		path->nodes[0] = right;
 		path->slots[0] -= mid;
 		path->slots[1] += 1;
-	} else
+	} else {
+		btrfs_tree_unlock(right);
 		free_extent_buffer(right);
+	}
 
 	BUG_ON(path->slots[0] < 0);
 
@@ -2418,10 +2592,6 @@
 		total_data += data_size[i];
 	}
 
-	/* create a root if there isn't one */
-	if (!root->node)
-		BUG();
-
 	total_size = total_data + (nr - 1) * sizeof(struct btrfs_item);
 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
 	if (ret == 0) {
@@ -2516,7 +2686,6 @@
 		btrfs_print_leaf(root, leaf);
 		BUG();
 	}
-
 out:
 	return ret;
 }
@@ -2655,7 +2824,6 @@
 			btrfs_set_header_level(leaf, 0);
 		} else {
 			u64 root_gen = btrfs_header_generation(path->nodes[1]);
-			clean_tree_block(trans, root, leaf);
 			wret = del_ptr(trans, root, path, 1, path->slots[1]);
 			if (wret)
 				ret = wret;
@@ -2706,8 +2874,6 @@
 				root_gen = btrfs_header_generation(
 							   path->nodes[1]);
 
-				clean_tree_block(trans, root, leaf);
-
 				wret = del_ptr(trans, root, path, 1, slot);
 				if (wret)
 					ret = wret;
@@ -2720,7 +2886,13 @@
 				if (wret)
 					ret = wret;
 			} else {
-				btrfs_mark_buffer_dirty(leaf);
+				/* if we're still in the path, make sure
+				 * we're dirty.  Otherwise, one of the
+				 * push_leaf functions must have already
+				 * dirtied this buffer
+				 */
+				if (path->nodes[0] == leaf)
+					btrfs_mark_buffer_dirty(leaf);
 				free_extent_buffer(leaf);
 			}
 		} else {
@@ -2731,56 +2903,40 @@
 }
 
 /*
- * walk up the tree as far as required to find the previous leaf.
+ * search the tree again to find a leaf with lesser keys
  * returns 0 if it found something or 1 if there are no lesser leaves.
  * returns < 0 on io errors.
  */
 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
 {
-	int slot;
-	int level = 1;
-	struct extent_buffer *c;
-	struct extent_buffer *next = NULL;
+	struct btrfs_key key;
+	struct btrfs_disk_key found_key;
+	int ret;
 
-	while(level < BTRFS_MAX_LEVEL) {
-		if (!path->nodes[level])
-			return 1;
+	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);	/* first key in leaf */
 
-		slot = path->slots[level];
-		c = path->nodes[level];
-		if (slot == 0) {
-			level++;
-			if (level == BTRFS_MAX_LEVEL)
-				return 1;
-			continue;
-		}
-		slot--;
+	if (key.offset > 0) {	/* step back to the immediately preceding key */
+		key.offset--;
+	} else if (key.type > 0) {
+		key.type--;
+		key.offset = (u64)-1;	/* largest key of the lower type */
+	} else if (key.objectid > 0) {
+		key.objectid--;
+		key.type = (u8)-1;	/* largest key of the lower objectid */
+		key.offset = (u64)-1;
+	} else
+		return 1;	/* already at the smallest possible key */
 
-		if (next)
-			free_extent_buffer(next);
-
-		next = read_node_slot(root, c, slot);
-		break;
-	}
-	path->slots[level] = slot;
-	while(1) {
-		level--;
-		c = path->nodes[level];
-		free_extent_buffer(c);
-		slot = btrfs_header_nritems(next);
-		if (slot != 0)
-			slot--;
-		path->nodes[level] = next;
-		path->slots[level] = slot;
-		if (!level)
-			break;
-		next = read_node_slot(root, next, slot);
-	}
-	return 0;
+	btrfs_release_path(root, path);	/* drops the nodes and locks held */
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		return ret;
+	btrfs_item_key(path->nodes[0], &found_key, 0);
+	return comp_keys(&found_key, &key) < 0 ? 0 : 1; /* 0: leaf starts below key */
 }
 
 /*
- * walk up the tree as far as required to find the next leaf.
+ * search the tree again to find a leaf with greater keys
  * returns 0 if it found something or 1 if there are no greater leaves.
  * returns < 0 on io errors.
  */
@@ -2790,6 +2946,28 @@
 	int level = 1;
 	struct extent_buffer *c;
 	struct extent_buffer *next = NULL;
+	struct btrfs_key key;
+	u32 nritems;
+	int ret;
+
+	nritems = btrfs_header_nritems(path->nodes[0]);
+	if (nritems == 0) {
+		return 1;
+	}
+
+	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
+
+	path->keep_locks = 1;
+	btrfs_release_path(root, path);
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	path->keep_locks = 0;
+
+	if (ret < 0)
+		return ret;
+
+	if (path->slots[0] < nritems - 1) {
+		goto done;
+	}
 
 	while(level < BTRFS_MAX_LEVEL) {
 		if (!path->nodes[level])
@@ -2799,33 +2977,45 @@
 		c = path->nodes[level];
 		if (slot >= btrfs_header_nritems(c)) {
 			level++;
-			if (level == BTRFS_MAX_LEVEL)
+			if (level == BTRFS_MAX_LEVEL) {
 				return 1;
+			}
 			continue;
 		}
 
-		if (next)
+		if (next) {
+			btrfs_tree_unlock(next);
 			free_extent_buffer(next);
+		}
 
-		if (path->reada)
+		if (level == 1 && path->locks[1] && path->reada)
 			reada_for_search(root, path, level, slot, 0);
 
 		next = read_node_slot(root, c, slot);
+		if (!path->skip_locking)
+			btrfs_tree_lock(next);
 		break;
 	}
 	path->slots[level] = slot;
 	while(1) {
 		level--;
 		c = path->nodes[level];
+		if (path->locks[level])
+			btrfs_tree_unlock(c);
 		free_extent_buffer(c);
 		path->nodes[level] = next;
 		path->slots[level] = 0;
+		path->locks[level] = 1;
 		if (!level)
 			break;
-		if (path->reada)
-			reada_for_search(root, path, level, 0, 0);
+		if (level == 1 && path->locks[1] && path->reada)
+			reada_for_search(root, path, level, slot, 0);
 		next = read_node_slot(root, next, 0);
+		if (!path->skip_locking)
+			btrfs_tree_lock(next);
 	}
+done:
+	unlock_up(path, 0, 1);
 	return 0;
 }
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index dcea9d7..50891b3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -330,8 +330,13 @@
 struct btrfs_path {
 	struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
 	int slots[BTRFS_MAX_LEVEL];
+	/* if there is real range locking, this locks field will change */
+	int locks[BTRFS_MAX_LEVEL];	/* nonzero while nodes[i] is tree-locked */
 	int reada;
+	/* keep some upper locks as we walk down */
+	int keep_locks;
 	int lowest_level;
+	int skip_locking;	/* searches take no per-block tree locks when set */
 };
 
 /*
@@ -515,6 +520,8 @@
 	spinlock_t hash_lock;
 	struct mutex trans_mutex;
 	struct mutex fs_mutex;
+	struct mutex alloc_mutex;
+	struct mutex chunk_mutex;
 	struct list_head trans_list;
 	struct list_head hashers;
 	struct list_head dead_roots;
@@ -576,6 +583,10 @@
  */
 struct btrfs_root {
 	struct extent_buffer *node;
+
+	/* the node lock is held while changing the node pointer */
+	spinlock_t node_lock;
+
 	struct extent_buffer *commit_root;
 	struct btrfs_root_item root_item;
 	struct btrfs_key root_key;
@@ -1353,13 +1364,7 @@
 						 struct btrfs_block_group_cache
 						 *hint, u64 search_start,
 						 int data, int owner);
-int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root, u64 owner_objectid);
 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
-					    struct btrfs_root *root, u32 size,
-					    u64 root_objectid,
-					    u64 hint, u64 empty_size);
-struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					     struct btrfs_root *root,
 					     u32 blocksize,
 					     u64 root_objectid,
@@ -1368,8 +1373,6 @@
 					     int level,
 					     u64 hint,
 					     u64 empty_size);
-int btrfs_grow_extent_tree(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, u64 new_size);
 int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 new_size);
 int btrfs_insert_extent_backref(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
@@ -1409,6 +1412,10 @@
 int btrfs_previous_item(struct btrfs_root *root,
 			struct btrfs_path *path, u64 min_objectid,
 			int type);
+
+struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
+struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
+
 int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, struct extent_buffer *buf,
 		    struct extent_buffer *parent, int parent_slot,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e5c758e..fe40bdd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -32,6 +32,7 @@
 #include "volumes.h"
 #include "print-tree.h"
 #include "async-thread.h"
+#include "locking.h"
 
 #if 0
 static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
@@ -681,9 +682,11 @@
 {
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	if (btrfs_header_generation(buf) ==
-	    root->fs_info->running_transaction->transid)
+	    root->fs_info->running_transaction->transid) {
+		WARN_ON(!btrfs_tree_locked(buf));
 		clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
 					  buf);
+	}
 	return 0;
 }
 
@@ -720,6 +723,7 @@
 	root->in_sysfs = 0;
 
 	INIT_LIST_HEAD(&root->dirty_list);
+	spin_lock_init(&root->node_lock);
 	memset(&root->root_key, 0, sizeof(root->root_key));
 	memset(&root->root_item, 0, sizeof(root->root_item));
 	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
@@ -1196,6 +1200,8 @@
 
 	mutex_init(&fs_info->trans_mutex);
 	mutex_init(&fs_info->fs_mutex);
+	mutex_init(&fs_info->alloc_mutex);
+	mutex_init(&fs_info->chunk_mutex);
 
 #if 0
 	ret = add_hasher(fs_info, "crc32c");
@@ -1274,7 +1280,9 @@
 
 	mutex_lock(&fs_info->fs_mutex);
 
+	mutex_lock(&fs_info->chunk_mutex);
 	ret = btrfs_read_sys_array(tree_root);
+	mutex_unlock(&fs_info->chunk_mutex);
 	if (ret) {
 		printk("btrfs: failed to read the system array on %s\n",
 		       sb->s_id);
@@ -1296,7 +1304,9 @@
 	         (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
 		 BTRFS_UUID_SIZE);
 
+	mutex_lock(&fs_info->chunk_mutex);
 	ret = btrfs_read_chunk_tree(chunk_root);
+	mutex_unlock(&fs_info->chunk_mutex);
 	BUG_ON(ret);
 
 	btrfs_close_extra_devices(fs_devices);
@@ -1654,6 +1664,7 @@
 	u64 transid = btrfs_header_generation(buf);
 	struct inode *btree_inode = root->fs_info->btree_inode;
 
+	WARN_ON(!btrfs_tree_locked(buf));
 	if (transid != root->fs_info->generation) {
 		printk(KERN_CRIT "transid mismatch buffer %llu, found %Lu running %Lu\n",
 			(unsigned long long)buf->start,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 41a6346..7e40c51 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -25,6 +25,7 @@
 #include "print-tree.h"
 #include "transaction.h"
 #include "volumes.h"
+#include "locking.h"
 
 #define BLOCK_GROUP_DATA     EXTENT_WRITEBACK
 #define BLOCK_GROUP_METADATA EXTENT_UPTODATE
@@ -36,7 +37,28 @@
 				 btrfs_root *extent_root);
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *extent_root);
+static struct btrfs_block_group_cache *
+__btrfs_find_block_group(struct btrfs_root *root,
+			 struct btrfs_block_group_cache *hint,
+			 u64 search_start, int data, int owner);
 
+void maybe_lock_mutex(struct btrfs_root *root)
+{
+	if (root != root->fs_info->extent_root &&
+	    root != root->fs_info->chunk_root &&
+	    root != root->fs_info->dev_root) {
+		mutex_lock(&root->fs_info->alloc_mutex);
+	}	/* extent/chunk/dev roots: alloc_mutex is left untouched here */
+}
+
+void maybe_unlock_mutex(struct btrfs_root *root)
+{
+	if (root != root->fs_info->extent_root &&
+	    root != root->fs_info->chunk_root &&
+	    root != root->fs_info->dev_root) {
+		mutex_unlock(&root->fs_info->alloc_mutex);
+	}	/* same root test as maybe_lock_mutex, so the two pair up */
+}
 
 static int cache_block_group(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *block_group)
@@ -66,6 +88,7 @@
 		return -ENOMEM;
 
 	path->reada = 2;
+	path->skip_locking = 1;
 	first_free = block_group->key.objectid;
 	key.objectid = block_group->key.objectid;
 	key.offset = 0;
@@ -290,7 +313,7 @@
 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
 	}
 	cache_miss = 0;
-	cache = btrfs_find_block_group(root, cache, last, data, 0);
+	cache = __btrfs_find_block_group(root, cache, last, data, 0);
 	if (!cache)
 		goto no_cache;
 	*cache_ret = cache;
@@ -318,10 +341,10 @@
 	return bits;
 }
 
-struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
-						 struct btrfs_block_group_cache
-						 *hint, u64 search_start,
-						 int data, int owner)
+static struct btrfs_block_group_cache *
+__btrfs_find_block_group(struct btrfs_root *root,
+			 struct btrfs_block_group_cache *hint,
+			 u64 search_start, int data, int owner)
 {
 	struct btrfs_block_group_cache *cache;
 	struct extent_io_tree *block_group_cache;
@@ -411,6 +434,18 @@
 	return found_group;
 }
 
+struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
+						 struct btrfs_block_group_cache
+						 *hint, u64 search_start,
+						 int data, int owner)
+{
+	/* locked wrapper: run __btrfs_find_block_group under alloc_mutex */
+	struct btrfs_block_group_cache *ret;
+	mutex_lock(&root->fs_info->alloc_mutex);
+	ret = __btrfs_find_block_group(root, hint, search_start, data, owner);
+	mutex_unlock(&root->fs_info->alloc_mutex);
+	return ret;
+}
 static u64 hash_extent_ref(u64 root_objectid, u64 ref_generation,
 			   u64 owner, u64 owner_offset)
 {
@@ -646,7 +681,7 @@
 	return ret;
 }
 
-int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				u64 bytenr, u64 num_bytes,
 				u64 root_objectid, u64 ref_generation,
@@ -696,6 +731,22 @@
 	return 0;
 }
 
+int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				u64 bytenr, u64 num_bytes,
+				u64 root_objectid, u64 ref_generation,
+				u64 owner, u64 owner_offset)
+{
+	int ret;
+	/* locked wrapper: run __btrfs_inc_extent_ref under alloc_mutex */
+	mutex_lock(&root->fs_info->alloc_mutex);
+	ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
+				     root_objectid, ref_generation,
+				     owner, owner_offset);
+	mutex_unlock(&root->fs_info->alloc_mutex);
+	return ret;
+}
+
 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root)
 {
@@ -760,6 +811,10 @@
 	struct btrfs_extent_ref *ref_item;
 	int level = -1;
 
+	/* FIXME, needs locking */
+	BUG();
+
+	mutex_lock(&root->fs_info->alloc_mutex);
 	path = btrfs_alloc_path();
 again:
 	if (level == -1)
@@ -854,33 +909,9 @@
 
 out:
 	btrfs_free_path(path);
+	mutex_unlock(&root->fs_info->alloc_mutex);
 	return total_count;
 }
-int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root, u64 owner_objectid)
-{
-	u64 generation;
-	u64 key_objectid;
-	u64 level;
-	u32 nritems;
-	struct btrfs_disk_key disk_key;
-
-	level = btrfs_header_level(root->node);
-	generation = trans->transid;
-	nritems = btrfs_header_nritems(root->node);
-	if (nritems > 0) {
-		if (level == 0)
-			btrfs_item_key(root->node, &disk_key, 0);
-		else
-			btrfs_node_key(root->node, &disk_key, 0);
-		key_objectid = btrfs_disk_key_objectid(&disk_key);
-	} else {
-		key_objectid = 0;
-	}
-	return btrfs_inc_extent_ref(trans, root, root->node->start,
-				    root->node->len, owner_objectid,
-				    generation, level, key_objectid);
-}
 
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct extent_buffer *buf)
@@ -897,6 +928,7 @@
 	if (!root->ref_cows)
 		return 0;
 
+	mutex_lock(&root->fs_info->alloc_mutex);
 	level = btrfs_header_level(buf);
 	nritems = btrfs_header_nritems(buf);
 	for (i = 0; i < nritems; i++) {
@@ -913,7 +945,7 @@
 			disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
 			if (disk_bytenr == 0)
 				continue;
-			ret = btrfs_inc_extent_ref(trans, root, disk_bytenr,
+			ret = __btrfs_inc_extent_ref(trans, root, disk_bytenr,
 				    btrfs_file_extent_disk_num_bytes(buf, fi),
 				    root->root_key.objectid, trans->transid,
 				    key.objectid, key.offset);
@@ -924,7 +956,7 @@
 		} else {
 			bytenr = btrfs_node_blockptr(buf, i);
 			btrfs_node_key_to_cpu(buf, &key, i);
-			ret = btrfs_inc_extent_ref(trans, root, bytenr,
+			ret = __btrfs_inc_extent_ref(trans, root, bytenr,
 					   btrfs_level_size(root, level - 1),
 					   root->root_key.objectid,
 					   trans->transid,
@@ -935,6 +967,7 @@
 			}
 		}
 	}
+	mutex_unlock(&root->fs_info->alloc_mutex);
 	return 0;
 fail:
 	WARN_ON(1);
@@ -965,6 +998,7 @@
 		}
 	}
 #endif
+	mutex_unlock(&root->fs_info->alloc_mutex);
 	return ret;
 }
 
@@ -1019,6 +1053,7 @@
 	if (!path)
 		return -ENOMEM;
 
+	mutex_lock(&root->fs_info->alloc_mutex);
 	while(1) {
 		ret = find_first_extent_bit(block_group_cache, last,
 					    &start, &end, BLOCK_GROUP_DIRTY);
@@ -1045,6 +1080,7 @@
 				  BLOCK_GROUP_DIRTY, GFP_NOFS);
 	}
 	btrfs_free_path(path);
+	mutex_unlock(&root->fs_info->alloc_mutex);
 	return werr;
 }
 
@@ -1162,26 +1198,28 @@
 		space_info->force_alloc = 0;
 	}
 	if (space_info->full)
 		return 0;
 
 	thresh = div_factor(space_info->total_bytes, 6);
 	if (!force &&
 	   (space_info->bytes_used + space_info->bytes_pinned + alloc_bytes) <
 	    thresh)
 		return 0;
 
+	mutex_lock(&extent_root->fs_info->chunk_mutex);
 	ret = btrfs_alloc_chunk(trans, extent_root, &start, &num_bytes, flags);
 	if (ret == -ENOSPC) {
 printk("space info full %Lu\n", flags);
 		space_info->full = 1;
-		return 0;
+		goto out;	/* chunk_mutex is held, leave via the unlock */
 	}
 	BUG_ON(ret);
 
 	ret = btrfs_make_block_group(trans, extent_root, 0, flags,
 		     BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes);
 	BUG_ON(ret);
-
+out:
+	mutex_unlock(&extent_root->fs_info->chunk_mutex);
 	return 0;
 }
 
@@ -1318,6 +1356,7 @@
 	struct extent_io_tree *free_space_cache;
 	free_space_cache = &root->fs_info->free_space_cache;
 
+	mutex_lock(&root->fs_info->alloc_mutex);
 	while(1) {
 		ret = find_first_extent_bit(unpin, 0, &start, &end,
 					    EXTENT_DIRTY);
@@ -1327,6 +1366,7 @@
 		clear_extent_dirty(unpin, start, end, GFP_NOFS);
 		set_extent_dirty(free_space_cache, start, end, GFP_NOFS);
 	}
+	mutex_unlock(&root->fs_info->alloc_mutex);
 	return 0;
 }
 
@@ -1363,18 +1403,24 @@
 				  GFP_NOFS);
 		eb = read_tree_block(extent_root, ins.objectid, ins.offset,
 				     trans->transid);
+		btrfs_tree_lock(eb);
 		level = btrfs_header_level(eb);
 		if (level == 0) {
 			btrfs_item_key(eb, &first, 0);
 		} else {
 			btrfs_node_key(eb, &first, 0);
 		}
+		btrfs_tree_unlock(eb);
+		free_extent_buffer(eb);
+		/*
+		 * the first key is just a hint, so the race we've created
+		 * against reading it is fine
+		 */
 		err = btrfs_insert_extent_backref(trans, extent_root, path,
 					  start, extent_root->root_key.objectid,
 					  0, level,
 					  btrfs_disk_key_objectid(&first));
 		BUG_ON(err);
-		free_extent_buffer(eb);
 	}
 	btrfs_free_path(path);
 	return 0;
@@ -1384,12 +1430,14 @@
 			  int pending)
 {
 	int err = 0;
-	struct extent_buffer *buf;
 
 	if (!pending) {
+#if 0
+		struct extent_buffer *buf;
 		buf = btrfs_find_tree_block(root, bytenr, num_bytes);
 		if (buf) {
-			if (btrfs_buffer_uptodate(buf, 0)) {
+			if (!btrfs_try_tree_lock(buf) &&
+			    btrfs_buffer_uptodate(buf, 0)) {
 				u64 transid =
 				    root->fs_info->running_transaction->transid;
 				u64 header_transid =
@@ -1398,12 +1446,15 @@
 				    !btrfs_header_flag(buf,
 					       BTRFS_HEADER_FLAG_WRITTEN)) {
 					clean_tree_block(NULL, root, buf);
+					btrfs_tree_unlock(buf);
 					free_extent_buffer(buf);
 					return 1;
 				}
+				btrfs_tree_unlock(buf);
 			}
 			free_extent_buffer(buf);
 		}
+#endif
 		update_pinned_extents(root, bytenr, num_bytes, 1);
 	} else {
 		set_extent_bits(&root->fs_info->pending_del,
@@ -1586,10 +1637,11 @@
 /*
  * remove an extent from the root, returns 0 on success
  */
-int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
-		      *root, u64 bytenr, u64 num_bytes,
-		      u64 root_objectid, u64 ref_generation,
-		      u64 owner_objectid, u64 owner_offset, int pin)
+static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root, u64 bytenr,
+			       u64 num_bytes, u64 root_objectid,
+			       u64 ref_generation, u64 owner_objectid,
+			       u64 owner_offset, int pin)
 {
 	struct btrfs_root *extent_root = root->fs_info->extent_root;
 	int pending_ret;
@@ -1610,6 +1662,22 @@
 	return ret ? ret : pending_ret;
 }
 
+/* locked wrapper: __btrfs_free_extent() between maybe_lock/unlock_mutex */
+int btrfs_free_extent(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *root, u64 bytenr, u64 num_bytes,
+		      u64 root_objectid, u64 ref_generation,
+		      u64 owner_objectid, u64 owner_offset, int pin)
+{
+	int err;
+
+	maybe_lock_mutex(root);
+	err = __btrfs_free_extent(trans, root, bytenr, num_bytes,
+				  root_objectid, ref_generation,
+				  owner_objectid, owner_offset, pin);
+	maybe_unlock_mutex(root);
+	return err;
+}
+
 static u64 stripe_align(struct btrfs_root *root, u64 val)
 {
 	u64 mask = ((u64)root->stripesize - 1);
@@ -1679,12 +1747,12 @@
 		block_group = btrfs_lookup_first_block_group(info, hint_byte);
 		if (!block_group)
 			hint_byte = search_start;
-		block_group = btrfs_find_block_group(root, block_group,
+		block_group = __btrfs_find_block_group(root, block_group,
 						     hint_byte, data, 1);
 		if (last_ptr && *last_ptr == 0 && block_group)
 			hint_byte = block_group->key.objectid;
 	} else {
-		block_group = btrfs_find_block_group(root,
+		block_group = __btrfs_find_block_group(root,
 						     trans->block_group,
 						     search_start, data, 1);
 	}
@@ -1806,7 +1874,7 @@
 	}
 	block_group = btrfs_lookup_first_block_group(info, search_start);
 	cond_resched();
-	block_group = btrfs_find_block_group(root, block_group,
+	block_group = __btrfs_find_block_group(root, block_group,
 					     search_start, data, 0);
 	goto check_failed;
 
@@ -1843,6 +1911,8 @@
 	struct btrfs_path *path;
 	struct btrfs_key keys[2];
 
+	maybe_lock_mutex(root);
+
 	if (data) {
 		alloc_profile = info->avail_data_alloc_bits &
 			        info->data_alloc_profile;
@@ -1892,9 +1962,10 @@
 	if (ret) {
 		printk("allocation failed flags %Lu\n", data);
 	}
-	BUG_ON(ret);
-	if (ret)
-		return ret;
+	if (ret) {
+		BUG();
+		goto out;
+	}
 
 	/* block accounting for super block */
 	super_used = btrfs_super_bytes_used(&info->super_copy);
@@ -1953,11 +2024,11 @@
 	finish_current_insert(trans, extent_root);
 	pending_ret = del_pending_extents(trans, extent_root);
 
-	if (ret) {
-		return ret;
-	}
+	if (ret)
+		goto out;
 	if (pending_ret) {
-		return pending_ret;
+		ret = pending_ret;
+		goto out;
 	}
 
 update_block:
@@ -1967,9 +2038,10 @@
 		       ins->objectid, ins->offset);
 		BUG();
 	}
-	return 0;
+out:
+	maybe_unlock_mutex(root);
+	return ret;
 }
-
 /*
  * helper function to allocate a block for a given tree
  * returns the tree buffer or NULL.
@@ -1977,28 +2049,6 @@
 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					     struct btrfs_root *root,
 					     u32 blocksize,
-					     u64 root_objectid, u64 hint,
-					     u64 empty_size)
-{
-	u64 ref_generation;
-
-	if (root->ref_cows)
-		ref_generation = trans->transid;
-	else
-		ref_generation = 0;
-
-
-	return __btrfs_alloc_free_block(trans, root, blocksize, root_objectid,
-					ref_generation, 0, 0, hint, empty_size);
-}
-
-/*
- * helper function to allocate a block for a given tree
- * returns the tree buffer or NULL.
- */
-struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
-					     struct btrfs_root *root,
-					     u32 blocksize,
 					     u64 root_objectid,
 					     u64 ref_generation,
 					     u64 first_objectid,
@@ -2026,6 +2076,7 @@
 		return ERR_PTR(-ENOMEM);
 	}
 	btrfs_set_header_generation(buf, trans->transid);
+	btrfs_tree_lock(buf);
 	clean_tree_block(trans, root, buf);
 	btrfs_set_buffer_uptodate(buf);
 
@@ -2076,7 +2127,7 @@
 		disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
 		if (disk_bytenr == 0)
 			continue;
-		ret = btrfs_free_extent(trans, root, disk_bytenr,
+		ret = __btrfs_free_extent(trans, root, disk_bytenr,
 				btrfs_file_extent_disk_num_bytes(leaf, fi),
 				leaf_owner, leaf_generation,
 				key.objectid, key.offset, 0);
@@ -2151,6 +2202,8 @@
 	int ret;
 	u32 refs;
 
+	mutex_lock(&root->fs_info->alloc_mutex);
+
 	WARN_ON(*level < 0);
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
 	ret = lookup_extent_ref(trans, root,
@@ -2182,6 +2235,7 @@
 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
 		blocksize = btrfs_level_size(root, *level - 1);
+
 		ret = lookup_extent_ref(trans, root, bytenr, blocksize, &refs);
 		BUG_ON(ret);
 		if (refs != 1) {
@@ -2189,7 +2243,7 @@
 			root_owner = btrfs_header_owner(parent);
 			root_gen = btrfs_header_generation(parent);
 			path->slots[*level]++;
-			ret = btrfs_free_extent(trans, root, bytenr,
+			ret = __btrfs_free_extent(trans, root, bytenr,
 						blocksize, root_owner,
 						root_gen, 0, 0, 1);
 			BUG_ON(ret);
@@ -2201,9 +2255,11 @@
 			reada_walk_down(root, cur, path->slots[*level]);
 
 			mutex_unlock(&root->fs_info->fs_mutex);
+			mutex_unlock(&root->fs_info->alloc_mutex);
 			next = read_tree_block(root, bytenr, blocksize,
 					       ptr_gen);
 			mutex_lock(&root->fs_info->fs_mutex);
+			mutex_lock(&root->fs_info->alloc_mutex);
 
 			/* we've dropped the lock, double check */
 			ret = lookup_extent_ref(trans, root, bytenr,
@@ -2216,7 +2272,7 @@
 
 				path->slots[*level]++;
 				free_extent_buffer(next);
-				ret = btrfs_free_extent(trans, root, bytenr,
+				ret = __btrfs_free_extent(trans, root, bytenr,
 							blocksize,
 							root_owner,
 							root_gen, 0, 0, 1);
@@ -2244,13 +2300,14 @@
 	}
 
 	root_gen = btrfs_header_generation(parent);
-	ret = btrfs_free_extent(trans, root, path->nodes[*level]->start,
+	ret = __btrfs_free_extent(trans, root, path->nodes[*level]->start,
 				path->nodes[*level]->len,
 				root_owner, root_gen, 0, 0, 1);
 	free_extent_buffer(path->nodes[*level]);
 	path->nodes[*level] = NULL;
 	*level += 1;
 	BUG_ON(ret);
+	mutex_unlock(&root->fs_info->alloc_mutex);
 	return 0;
 }
 
@@ -2350,6 +2407,12 @@
 		btrfs_node_key(node, &found_key, path->slots[level]);
 		WARN_ON(memcmp(&found_key, &root_item->drop_progress,
 			       sizeof(found_key)));
+		for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+			if (path->nodes[i] && path->locks[i]) {
+				path->locks[i] = 0;
+				btrfs_tree_unlock(path->nodes[i]);
+			}
+		}
 	}
 	while(1) {
 		wret = walk_down_tree(trans, root, path, &level);
@@ -2383,6 +2446,8 @@
 	u64 end;
 	u64 ptr;
 	int ret;
+
+	mutex_lock(&info->alloc_mutex);
 	while(1) {
 		ret = find_first_extent_bit(&info->block_group_cache, 0,
 					    &start, &end, (unsigned int)-1);
@@ -2402,6 +2467,7 @@
 		clear_extent_dirty(&info->free_space_cache, start,
 				   end, GFP_NOFS);
 	}
+	mutex_unlock(&info->alloc_mutex);
 	return 0;
 }
 
@@ -2678,6 +2744,7 @@
 
 		eb = read_tree_block(found_root, extent_key->objectid,
 				     extent_key->offset, 0);
+		btrfs_tree_lock(eb);
 		level = btrfs_header_level(eb);
 
 		if (level == 0)
@@ -2685,6 +2752,7 @@
 		else
 			btrfs_node_key_to_cpu(eb, &found_key, 0);
 
+		btrfs_tree_unlock(eb);
 		free_extent_buffer(eb);
 
 		ret = find_root_for_ref(extent_root, path, &found_key,
@@ -2888,6 +2956,7 @@
 	int ret;
 	int progress;
 
+	mutex_lock(&root->fs_info->alloc_mutex);
 	shrink_block_group = btrfs_lookup_block_group(root->fs_info,
 						      shrink_start);
 	BUG_ON(!shrink_block_group);
@@ -3044,20 +3113,22 @@
 			   (unsigned int)-1, GFP_NOFS);
 out:
 	btrfs_free_path(path);
+	mutex_unlock(&root->fs_info->alloc_mutex);
 	return ret;
 }
 
 int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path,
 			   struct btrfs_key *key)
 {
-	int ret;
+	int ret = 0;
 	struct btrfs_key found_key;
 	struct extent_buffer *leaf;
 	int slot;
 
 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
 	if (ret < 0)
-		return ret;
+		goto out;
+
 	while(1) {
 		slot = path->slots[0];
 		leaf = path->nodes[0];
@@ -3066,18 +3137,20 @@
 			if (ret == 0)
 				continue;
 			if (ret < 0)
-				goto error;
+				goto out;
 			break;
 		}
 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
 
 		if (found_key.objectid >= key->objectid &&
-		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY)
-			return 0;
+		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
+			ret = 0;
+			goto out;
+		}
 		path->slots[0]++;
 	}
 	ret = -ENOENT;
-error:
+out:
 	return ret;
 }
 
@@ -3103,6 +3176,7 @@
 	if (!path)
 		return -ENOMEM;
 
+	mutex_lock(&root->fs_info->alloc_mutex);
 	while(1) {
 		ret = find_first_block_group(root, path, &key);
 		if (ret > 0) {
@@ -3158,6 +3232,7 @@
 	ret = 0;
 error:
 	btrfs_free_path(path);
+	mutex_unlock(&root->fs_info->alloc_mutex);
 	return ret;
 }
 
@@ -3205,5 +3280,6 @@
 	ret = del_pending_extents(trans, extent_root);
 	BUG_ON(ret);
 	set_avail_alloc_bits(extent_root->fs_info, type);
+
 	return 0;
 }
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 17c508a..bd15cdc 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2889,7 +2889,6 @@
 
 	for (i = 0; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
-		lock_page(page);
 		if (i == 0)
 			set_page_extent_head(page, eb->len);
 		else
@@ -2907,7 +2906,6 @@
 			end  = start + PAGE_CACHE_SIZE - 1;
 			if (test_range_bit(tree, start, end,
 					   EXTENT_DIRTY, 0)) {
-				unlock_page(page);
 				continue;
 			}
 		}
@@ -2919,7 +2917,6 @@
 						PAGECACHE_TAG_DIRTY);
 		}
 		read_unlock_irq(&page->mapping->tree_lock);
-		unlock_page(page);
 	}
 	return 0;
 }
@@ -2948,17 +2945,12 @@
 		 * on us if the page isn't already dirty.
 		 */
 		if (i == 0) {
-			lock_page(page);
 			set_page_extent_head(page, eb->len);
 		} else if (PagePrivate(page) &&
 			   page->private != EXTENT_PAGE_PRIVATE) {
-			lock_page(page);
 			set_page_extent_mapped(page);
-			unlock_page(page);
 		}
 		__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
-		if (i == 0)
-			unlock_page(page);
 	}
 	return set_extent_dirty(tree, eb->start,
 				eb->start + eb->len - 1, GFP_NOFS);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0c79346..61bd895 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -115,6 +115,7 @@
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(!trans);
 	btrfs_set_trans_block_group(trans, inode);
+	mutex_unlock(&root->fs_info->fs_mutex);
 
 	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
 	num_bytes = max(blocksize,  num_bytes);
@@ -159,6 +160,7 @@
 	btrfs_add_ordered_inode(inode);
 	btrfs_update_inode(trans, root, inode);
 out:
+	mutex_lock(&root->fs_info->fs_mutex);
 	btrfs_end_transaction(trans, root);
 	return ret;
 }
@@ -349,10 +351,12 @@
 
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
+	mutex_unlock(&root->fs_info->fs_mutex);
 
 	btrfs_set_trans_block_group(trans, inode);
 	btrfs_csum_file_blocks(trans, root, inode, bio, sums);
 
+	mutex_lock(&root->fs_info->fs_mutex);
 	ret = btrfs_end_transaction(trans, root);
 	BUG_ON(ret);
 	mutex_unlock(&root->fs_info->fs_mutex);
@@ -807,6 +811,7 @@
 		goto err;
 	}
 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
+	btrfs_release_path(root, path);
 
 	dentry->d_inode->i_ctime = dir->i_ctime;
 	ret = btrfs_del_inode_ref(trans, root, name, name_len,
@@ -881,8 +886,9 @@
 	struct btrfs_trans_handle *trans;
 	unsigned long nr = 0;
 
-	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
+	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
 		return -ENOTEMPTY;
+	}
 
 	mutex_lock(&root->fs_info->fs_mutex);
 	ret = btrfs_check_free_space(root, 1, 1);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 6fb4558..3fbf74e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -43,6 +43,7 @@
 #include "ioctl.h"
 #include "print-tree.h"
 #include "volumes.h"
+#include "locking.h"
 
 
 
@@ -75,9 +76,9 @@
 	if (ret)
 		goto fail;
 
-	leaf = __btrfs_alloc_free_block(trans, root, root->leafsize,
-					objectid, trans->transid, 0, 0,
-					0, 0);
+	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
+				      objectid, trans->transid, 0, 0,
+				      0, 0);
 	if (IS_ERR(leaf))
 		return PTR_ERR(leaf);
 
@@ -108,6 +109,7 @@
 	memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
 	root_item.drop_level = 0;
 
+	btrfs_tree_unlock(leaf);
 	free_extent_buffer(leaf);
 	leaf = NULL;
 
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
new file mode 100644
index 0000000..80813a3
--- /dev/null
+++ b/fs/btrfs/locking.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include <linux/pagemap.h>
+#include <linux/spinlock.h>
+#include <linux/page-flags.h>
+#include <linux/bug.h>
+#include "ctree.h"
+#include "extent_io.h"
+#include "locking.h"
+
+/*
+ * The tree lock of an extent buffer is the page lock of its first page.
+ * Blocks until the lock is taken; always returns 0.
+ */
+int btrfs_tree_lock(struct extent_buffer *eb)
+{
+	lock_page(eb->first_page);
+	return 0;
+}
+
+/*
+ * Non-blocking lock attempt.  Returns 0 when the lock was taken and
+ * nonzero when the page was already locked, so callers test
+ * !btrfs_try_tree_lock(eb) for success.
+ */
+int btrfs_try_tree_lock(struct extent_buffer *eb)
+{
+	return TestSetPageLocked(eb->first_page);
+}
+
+/* Release the tree lock; warns if the first page is not locked. */
+int btrfs_tree_unlock(struct extent_buffer *eb)
+{
+	WARN_ON(!PageLocked(eb->first_page));
+	unlock_page(eb->first_page);
+	return 0;
+}
+
+/* Returns nonzero while the first page is locked, whoever the holder is. */
+int btrfs_tree_locked(struct extent_buffer *eb)
+{
+	return PageLocked(eb->first_page);
+}
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
new file mode 100644
index 0000000..2dab96d
--- /dev/null
+++ b/fs/btrfs/locking.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef __BTRFS_LOCKING_
+#define __BTRFS_LOCKING_
+
+int btrfs_tree_lock(struct extent_buffer *eb);
+int btrfs_tree_unlock(struct extent_buffer *eb);
+int btrfs_tree_locked(struct extent_buffer *eb);
+int btrfs_try_tree_lock(struct extent_buffer *eb);
+#endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f04684f..1ed433a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -23,6 +23,7 @@
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
+#include "locking.h"
 
 static int total_trans = 0;
 extern struct kmem_cache *btrfs_trans_handle_cachep;
@@ -96,8 +97,7 @@
 			radix_tree_tag_set(&root->fs_info->fs_roots_radix,
 				   (unsigned long)root->root_key.objectid,
 				   BTRFS_ROOT_DEFRAG_TAG);
-			root->commit_root = root->node;
-			extent_buffer_get(root->node);
+			root->commit_root = btrfs_root_node(root);
 		} else {
 			WARN_ON(1);
 		}
@@ -559,6 +559,7 @@
 	struct btrfs_root *tree_root = fs_info->tree_root;
 	struct btrfs_root *root = pending->root;
 	struct extent_buffer *tmp;
+	struct extent_buffer *old;
 	int ret;
 	int namelen;
 	u64 objectid;
@@ -578,16 +579,18 @@
 	key.offset = 1;
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
 
-	extent_buffer_get(root->node);
-	btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp);
-	free_extent_buffer(tmp);
+	old = btrfs_lock_root_node(root);
+	btrfs_cow_block(trans, root, old, NULL, 0, &old);
 
-	btrfs_copy_root(trans, root, root->node, &tmp, objectid);
+	btrfs_copy_root(trans, root, old, &tmp, objectid);
+	btrfs_tree_unlock(old);
+	free_extent_buffer(old);
 
 	btrfs_set_root_bytenr(new_root_item, tmp->start);
 	btrfs_set_root_level(new_root_item, btrfs_header_level(tmp));
 	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
 				new_root_item);
+	btrfs_tree_unlock(tmp);
 	free_extent_buffer(tmp);
 	if (ret)
 		goto fail;
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 155961c..fab851d 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -181,6 +181,8 @@
 	if (root->fs_info->extent_root == root)
 		is_extent = 1;
 
+	goto out;
+
 	if (root->ref_cows == 0 && !is_extent)
 		goto out;