Btrfs: Improve space balancing code

This patch improves the space balancing code to keep more sharing
of tree blocks. The only case that breaks sharing of tree blocks is
data extents get fragmented during balancing. The main changes in
this patch are:

Add a 'drop sub-tree' function. This solves the problem in old code
that BTRFS_HEADER_FLAG_WRITTEN check breaks sharing of tree block.

Remove relocation mapping tree. Relocation mappings are stored in
struct btrfs_ref_path and updated dynamically during walking up/down
the reference path. This reduces CPU usage and simplifies code.

This patch also fixes a bug. Root items for reloc trees should be
updated in btrfs_free_reloc_root.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>


diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9caeb37..73899d0 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -287,7 +287,7 @@
 		/*
 		 * There are only two places that can drop reference to
 		 * tree blocks owned by living reloc trees, one is here,
-		 * the other place is btrfs_merge_path. In both places,
+		 * the other place is btrfs_drop_subtree. In both places,
 		 * we check reference count while tree block is locked.
 		 * Furthermore, if reference count is one, it won't get
 		 * increased by someone else.
@@ -312,9 +312,6 @@
 	}
 
 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
-		ret = btrfs_add_reloc_mapping(root, buf->start,
-					      buf->len, cow->start);
-		BUG_ON(ret);
 		ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
 		WARN_ON(ret);
 	}
@@ -1627,61 +1624,57 @@
 		btrfs_node_key_to_cpu(eb, &key, slot);
 		key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));
 
-		/*
-		 * if node keys match and node pointer hasn't been modified
-		 * in the running transaction, we can merge the path. for
-		 * reloc trees, the node pointer check is skipped, this is
-		 * because the reloc trees are fully controlled by the space
-		 * balance code, no one else can modify them.
-		 */
-		if (!nodes[level - 1] || !key_match ||
-		    (generation == trans->transid &&
-		     root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)) {
-next_level:
-			if (level == 1 || level == lowest_level + 1)
-				break;
-
+		if (generation == trans->transid) {
 			eb = read_tree_block(root, bytenr, blocksize,
 					     generation);
 			btrfs_tree_lock(eb);
+		}
+
+		/*
+		 * if node keys match and node pointer hasn't been modified
+		 * in the running transaction, we can merge the path. for
+		 * blocks owened by reloc trees, the node pointer check is
+		 * skipped, this is because these blocks are fully controlled
+		 * by the space balance code, no one else can modify them.
+		 */
+		if (!nodes[level - 1] || !key_match ||
+		    (generation == trans->transid &&
+		     btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) {
+			if (level == 1 || level == lowest_level + 1) {
+				if (generation == trans->transid) {
+					btrfs_tree_unlock(eb);
+					free_extent_buffer(eb);
+				}
+				break;
+			}
+
+			if (generation != trans->transid) {
+				eb = read_tree_block(root, bytenr, blocksize,
+						generation);
+				btrfs_tree_lock(eb);
+			}
 
 			ret = btrfs_cow_block(trans, root, eb, parent, slot,
 					      &eb, 0);
 			BUG_ON(ret);
 
+			if (root->root_key.objectid ==
+			    BTRFS_TREE_RELOC_OBJECTID) {
+				if (!nodes[level - 1]) {
+					nodes[level - 1] = eb->start;
+					memcpy(&node_keys[level - 1], &key,
+					       sizeof(node_keys[0]));
+				} else {
+					WARN_ON(1);
+				}
+			}
+
 			btrfs_tree_unlock(parent);
 			free_extent_buffer(parent);
 			parent = eb;
 			continue;
 		}
 
-		if (generation == trans->transid) {
-			u32 refs;
-			BUG_ON(btrfs_header_owner(eb) !=
-			       BTRFS_TREE_RELOC_OBJECTID);
-			/*
-			 * lock the block to keep __btrfs_cow_block from
-			 * changing the reference count.
-			 */
-			eb = read_tree_block(root, bytenr, blocksize,
-					     generation);
-			btrfs_tree_lock(eb);
-
-			ret = btrfs_lookup_extent_ref(trans, root, bytenr,
-						      blocksize, &refs);
-			BUG_ON(ret);
-			/*
-			 * if replace block whose reference count is one,
-			 * we have to "drop the subtree". so skip it for
-			 * simplicity
-			 */
-			if (refs == 1) {
-				btrfs_tree_unlock(eb);
-				free_extent_buffer(eb);
-				goto next_level;
-			}
-		}
-
 		btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
 		btrfs_set_node_ptr_generation(parent, slot, trans->transid);
 		btrfs_mark_buffer_dirty(parent);
@@ -1693,16 +1686,24 @@
 					btrfs_header_generation(parent),
 					level - 1);
 		BUG_ON(ret);
-		ret = btrfs_free_extent(trans, root, bytenr,
+
+		/*
+		 * If the block was created in the running transaction,
+		 * it's possible this is the last reference to it, so we
+		 * should drop the subtree.
+		 */
+		if (generation == trans->transid) {
+			ret = btrfs_drop_subtree(trans, root, eb, parent);
+			BUG_ON(ret);
+			btrfs_tree_unlock(eb);
+			free_extent_buffer(eb);
+		} else {
+			ret = btrfs_free_extent(trans, root, bytenr,
 					blocksize, parent->start,
 					btrfs_header_owner(parent),
 					btrfs_header_generation(parent),
 					level - 1, 1);
-		BUG_ON(ret);
-
-		if (generation == trans->transid) {
-			btrfs_tree_unlock(eb);
-			free_extent_buffer(eb);
+			BUG_ON(ret);
 		}
 		break;
 	}