Btrfs: Add run time btree defrag, and an ioctl to force btree defrag

This adds two types of btree defrag: a runtime form that tries to defrag
recently allocated btree blocks while they are still in RAM, and an ioctl
that forces a defrag of all btree blocks.

File data blocks are not defragged yet, but this can make a huge difference
in sequential btree reads.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index a4e2df6..9321438 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -4,7 +4,7 @@
 obj-m  := btrfs.o
 btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   hash.o file-item.o inode-item.o inode-map.o disk-io.o \
-	   transaction.o bit-radix.o inode.o file.o
+	   transaction.o bit-radix.o inode.o file.o tree-defrag.o
 
 #btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \
 #	  root-tree.o dir-item.o hash.o file-item.o inode-item.o \
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 7a08491..c7e47e7 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -65,16 +65,71 @@
 	memset(p, 0, sizeof(*p));
 }
 
-static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
+static int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
+			   *root, struct buffer_head *buf, struct buffer_head
+			   *parent, int parent_slot, struct buffer_head
+			   **cow_ret, u64 search_start, u64 empty_size) /* copy buf into a newly allocated block and repoint parent[parent_slot] (or root->node) at the copy */
+{
+	struct buffer_head *cow;
+	struct btrfs_node *cow_node;
+	int ret = 0;
+	int different_trans = 0;
+
+	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
+	WARN_ON(!buffer_uptodate(buf));
+	cow = btrfs_alloc_free_block(trans, root, search_start, empty_size); /* search_start and empty_size are handed straight to the allocator */
+	if (IS_ERR(cow))
+		return PTR_ERR(cow);
+
+	cow_node = btrfs_buffer_node(cow);
+	if (buf->b_size != root->blocksize || cow->b_size != root->blocksize)
+		WARN_ON(1);
+
+	memcpy(cow_node, btrfs_buffer_node(buf), root->blocksize); /* the copy starts as a byte image of buf; the header is then patched */
+	btrfs_set_header_blocknr(&cow_node->header, bh_blocknr(cow));
+	btrfs_set_header_generation(&cow_node->header, trans->transid);
+	btrfs_set_header_owner(&cow_node->header, root->root_key.objectid);
+
+	WARN_ON(btrfs_header_generation(btrfs_buffer_header(buf)) >
+		trans->transid);
+	if (btrfs_header_generation(btrfs_buffer_header(buf)) !=
+				    trans->transid) {
+		different_trans = 1; /* set here but never read again in this function */
+		ret = btrfs_inc_ref(trans, root, buf);
+		if (ret)
+			return ret; /* NOTE(review): cow is neither released nor freed on this path */
+	} else {
+		WARN_ON(!root->ref_cows);
+		clean_tree_block(trans, root, buf); /* buf carries this transaction's generation */
+	}
+
+	if (buf == root->node) {
+		root->node = cow;
+		get_bh(cow); /* additional reference taken after installing cow as root->node */
+		if (buf != root->commit_root) {
+			btrfs_free_extent(trans, root, bh_blocknr(buf), 1, 1);
+		}
+		btrfs_block_release(root, buf); /* buf is released here and once more below */
+	} else {
+		btrfs_set_node_blockptr(btrfs_buffer_node(parent), parent_slot,
+					bh_blocknr(cow));
+		btrfs_mark_buffer_dirty(parent);
+		WARN_ON(btrfs_header_generation(btrfs_buffer_header(parent)) !=
+				    trans->transid);
+		btrfs_free_extent(trans, root, bh_blocknr(buf), 1, 1);
+	}
+	btrfs_block_release(root, buf);
+	btrfs_mark_buffer_dirty(cow);
+	*cow_ret = cow; /* the copy is returned to the caller through cow_ret */
+	return 0;
+}
+
+int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
 			   *root, struct buffer_head *buf, struct buffer_head
 			   *parent, int parent_slot, struct buffer_head
 			   **cow_ret)
 {
-	struct buffer_head *cow;
-	struct btrfs_node *cow_node;
-	int ret;
-
-	WARN_ON(!buffer_uptodate(buf));
+	u64 search_start;
 	if (trans->transaction != root->fs_info->running_transaction) {
 		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
 		       root->fs_info->running_transaction->transid);
@@ -90,38 +145,94 @@
 		*cow_ret = buf;
 		return 0;
 	}
-	cow = btrfs_alloc_free_block(trans, root, buf->b_blocknr);
-	if (IS_ERR(cow))
-		return PTR_ERR(cow);
-	cow_node = btrfs_buffer_node(cow);
-	if (buf->b_size != root->blocksize || cow->b_size != root->blocksize)
-		WARN_ON(1);
-	memcpy(cow_node, btrfs_buffer_node(buf), root->blocksize);
-	btrfs_set_header_blocknr(&cow_node->header, bh_blocknr(cow));
-	btrfs_set_header_generation(&cow_node->header, trans->transid);
-	btrfs_set_header_owner(&cow_node->header, root->root_key.objectid);
-	ret = btrfs_inc_ref(trans, root, buf);
-	if (ret)
-		return ret;
-	if (buf == root->node) {
-		root->node = cow;
-		get_bh(cow);
-		if (buf != root->commit_root) {
-			btrfs_free_extent(trans, root, bh_blocknr(buf), 1, 1);
-		}
-		btrfs_block_release(root, buf);
-	} else {
-		btrfs_set_node_blockptr(btrfs_buffer_node(parent), parent_slot,
-					bh_blocknr(cow));
-		btrfs_mark_buffer_dirty(parent);
-		btrfs_free_extent(trans, root, bh_blocknr(buf), 1, 1);
-	}
-	btrfs_block_release(root, buf);
-	btrfs_mark_buffer_dirty(cow);
-	*cow_ret = cow;
+
+	search_start = bh_blocknr(buf) & ~((u64)65535);
+	return __btrfs_cow_block(trans, root, buf, parent,
+				 parent_slot, cow_ret, search_start, 0);
+}
+
+static int close_blocks(u64 blocknr, u64 other) /* returns 1 when the two block numbers differ by 1..7, otherwise 0 */
+{
+	if (blocknr < other && other - blocknr < 8)
+		return 1;
+	if (blocknr > other && blocknr - other < 8)
+		return 1;
 	return 0; /* also reached when blocknr == other */
 }
 
+int btrfs_realloc_node(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root, struct buffer_head *parent,
+		       int cache_only) /* re-COW the children of parent whose block numbers are not close to their neighbours; returns 0 or the first COW error */
+{
+	struct btrfs_node *parent_node;
+	struct buffer_head *cur_bh;
+	struct buffer_head *tmp_bh;
+	u64 blocknr;
+	u64 search_start = 0;
+	u64 other;
+	u32 parent_nritems;
+	int start_slot;
+	int end_slot;
+	int i;
+	int err = 0;
+
+	if (trans->transaction != root->fs_info->running_transaction) {
+		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
+		       root->fs_info->running_transaction->transid);
+		WARN_ON(1);
+	}
+	if (trans->transid != root->fs_info->generation) {
+		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
+		       root->fs_info->generation);
+		WARN_ON(1);
+	}
+	parent_node = btrfs_buffer_node(parent);
+	parent_nritems = btrfs_header_nritems(&parent_node->header);
+
+	start_slot = 0;
+	end_slot = parent_nritems;
+
+	if (parent_nritems == 1) /* nothing is done for a node with a single pointer */
+		return 0;
+
+	for (i = start_slot; i < end_slot; i++) {
+		int close = 1; /* stays 1 only if the block is close to every neighbour that exists */
+		blocknr = btrfs_node_blockptr(parent_node, i);
+		if (i > 0) {
+			other = btrfs_node_blockptr(parent_node, i - 1);
+			close = close_blocks(blocknr, other);
+		}
+		if (close && i < end_slot - 1) {
+			other = btrfs_node_blockptr(parent_node, i + 1);
+			close = close_blocks(blocknr, other);
+		}
+		if (close)
+			continue; /* already near its neighbours: leave it in place */
+
+		cur_bh = btrfs_find_tree_block(root, blocknr);
+		if (!cur_bh || !buffer_uptodate(cur_bh) ||
+		    buffer_locked(cur_bh)) {
+			if (cache_only) { /* cache_only: skip blocks that are missing, not up to date, or locked */
+				brelse(cur_bh);
+				continue;
+			}
+			brelse(cur_bh);
+			cur_bh = read_tree_block(root, blocknr); /* NOTE(review): the result is used below without being checked */
+		}
+		if (search_start == 0) {
+			search_start = bh_blocknr(cur_bh) & ~((u64)65535); /* first move: block number with its low 16 bits cleared */
+		}
+		err = __btrfs_cow_block(trans, root, cur_bh, parent, i,
+					&tmp_bh, search_start,
+					min(8, end_slot - i)); /* empty_size: at most 8, fewer for the last slots */
+		if (err)
+			break;
+		search_start = bh_blocknr(tmp_bh); /* the next search starts at the block just allocated */
+		brelse(tmp_bh);
+	}
+	return err;
+}
+
 /*
  * The leaf data grows from end-to-front in the node.
  * this returns the address of the start of the last item,
@@ -221,6 +332,7 @@
 
 		parent_slot = path->slots[level + 1];
 		parent_key = &parent->ptrs[parent_slot].key;
+
 		BUG_ON(memcmp(parent_key, &leaf->items[0].key,
 		       sizeof(struct btrfs_disk_key)));
 		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
@@ -643,7 +755,7 @@
  * readahead one full node of leaves
  */
 static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
-			     int slot)
+			     int level, int slot)
 {
 	struct btrfs_node *node;
 	int i;
@@ -659,10 +771,13 @@
 	unsigned long gang[8];
 	struct buffer_head *bh;
 
-	if (!path->nodes[1])
+	if (level == 0)
 		return;
 
-	node = btrfs_buffer_node(path->nodes[1]);
+	if (!path->nodes[level])
+		return;
+
+	node = btrfs_buffer_node(path->nodes[level]);
 	search = btrfs_node_blockptr(node, slot);
 	bh = btrfs_find_tree_block(root, search);
 	if (bh) {
@@ -690,7 +805,7 @@
 		for (i = 0; i < ret; i++) {
 			blocknr = gang[i];
 			clear_radix_bit(&found, blocknr);
-			if (nread > 64)
+			if (nread > 32)
 				continue;
 			if (direction > 0 && cluster_start <= blocknr &&
 			    cluster_start + 8 > blocknr) {
@@ -726,7 +841,6 @@
 	struct buffer_head *b;
 	struct buffer_head *cow_buf;
 	struct btrfs_node *c;
-	struct btrfs_root_item *root_item = &root->root_item;
 	u64 blocknr;
 	int slot;
 	int ret;
@@ -734,11 +848,8 @@
 	int should_reada = p->reada;
 	u8 lowest_level = 0;
 
-	if (btrfs_root_refs(root_item) == 0 && root->ref_cows) {
-		lowest_level = root_item->drop_level;
-		WARN_ON(ins_len || cow);
-	}
-
+	lowest_level = p->lowest_level;
+	WARN_ON(lowest_level && ins_len);
 	WARN_ON(p->nodes[0] != NULL);
 	WARN_ON(!mutex_is_locked(&root->fs_info->fs_mutex));
 again:
@@ -798,8 +909,8 @@
 			if (level == lowest_level)
 				break;
 			blocknr = btrfs_node_blockptr(c, slot);
-			if (level == 1 && should_reada)
-				reada_for_search(root, p, slot);
+			if (should_reada)
+				reada_for_search(root, p, level, slot);
 			b = read_tree_block(root, btrfs_node_blockptr(c, slot));
 
 		} else {
@@ -960,7 +1071,7 @@
 	BUG_ON(path->nodes[level]);
 	BUG_ON(path->nodes[level-1] != root->node);
 
-	t = btrfs_alloc_free_block(trans, root, root->node->b_blocknr);
+	t = btrfs_alloc_free_block(trans, root, root->node->b_blocknr, 0);
 	if (IS_ERR(t))
 		return PTR_ERR(t);
 	c = btrfs_buffer_node(t);
@@ -1070,7 +1181,7 @@
 	}
 
 	c_nritems = btrfs_header_nritems(&c->header);
-	split_buffer = btrfs_alloc_free_block(trans, root, t->b_blocknr);
+	split_buffer = btrfs_alloc_free_block(trans, root, t->b_blocknr, 0);
 	if (IS_ERR(split_buffer))
 		return PTR_ERR(split_buffer);
 
@@ -1461,7 +1572,7 @@
 	nritems = btrfs_header_nritems(&l->header);
 	mid = (nritems + 1)/ 2;
 
-	right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr);
+	right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr, 0);
 	if (IS_ERR(right_buffer))
 		return PTR_ERR(right_buffer);
 
@@ -1560,7 +1671,7 @@
 
 	if (!double_split)
 		return ret;
-	right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr);
+	right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr, 0);
 	if (IS_ERR(right_buffer))
 		return PTR_ERR(right_buffer);
 
@@ -1988,8 +2099,8 @@
 		blocknr = btrfs_node_blockptr(c_node, slot);
 		if (next)
 			btrfs_block_release(root, next);
-		if (level == 1 && path->reada)
-			reada_for_search(root, path, slot);
+		if (path->reada)
+			reada_for_search(root, path, level, slot);
 		next = read_tree_block(root, blocknr);
 		break;
 	}
@@ -2002,8 +2113,8 @@
 		path->slots[level] = 0;
 		if (!level)
 			break;
-		if (level == 1 && path->reada)
-			reada_for_search(root, path, slot);
+		if (path->reada)
+			reada_for_search(root, path, level, slot);
 		next = read_tree_block(root,
 		       btrfs_node_blockptr(btrfs_buffer_node(next), 0));
 	}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c5a18d5..42aa203 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -178,6 +178,7 @@
 	struct buffer_head *nodes[BTRFS_MAX_LEVEL];
 	int slots[BTRFS_MAX_LEVEL];
 	int reada;
+	int lowest_level;
 };
 
 /*
@@ -338,6 +339,9 @@
 	u64 highest_inode;
 	u64 last_inode_alloc;
 	int ref_cows;
+	struct btrfs_key defrag_progress;
+	int defrag_running;
+	int defrag_level;
 };
 
 /* the lower bits in the key flags defines the item type */
@@ -1031,10 +1035,11 @@
 int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root);
 struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
-					    struct btrfs_root *root, u64 hint);
+					    struct btrfs_root *root, u64 hint,
+					    u64 empty_size);
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, u64 owner,
-		       u64 num_blocks, u64 search_start,
+		       u64 num_blocks, u64 empty_size, u64 search_start,
 		       u64 search_end, struct btrfs_key *ins, int data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct buffer_head *buf);
@@ -1051,6 +1056,10 @@
 int btrfs_free_block_groups(struct btrfs_fs_info *info);
 int btrfs_read_block_groups(struct btrfs_root *root);
 /* ctree.c */
+int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
+			   *root, struct buffer_head *buf, struct buffer_head
+			   *parent, int parent_slot, struct buffer_head
+			   **cow_ret);
 int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_path *path, u32 data_size);
 int btrfs_truncate_item(struct btrfs_trans_handle *trans,
@@ -1060,6 +1069,9 @@
 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_key *key, struct btrfs_path *p, int
 		      ins_len, int cow);
+int btrfs_realloc_node(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root, struct buffer_head *parent,
+		       int cache_only);
 void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
 void btrfs_free_path(struct btrfs_path *p);
@@ -1171,4 +1183,7 @@
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
 		       u64 start, u64 end, u64 *hint_block);
+/* tree-defrag.c */
+int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root, int cache_only);
 #endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 60db85b..c948416 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -273,7 +273,9 @@
 		     struct buffer_head *buf)
 {
 	WARN_ON(atomic_read(&buf->b_count) == 0);
+	lock_buffer(buf);
 	clear_buffer_dirty(buf);
+	unlock_buffer(buf);
 	return 0;
 }
 
@@ -294,6 +296,9 @@
 	root->last_inode_alloc = 0;
 	memset(&root->root_key, 0, sizeof(root->root_key));
 	memset(&root->root_item, 0, sizeof(root->root_item));
+	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
+	root->defrag_running = 0;
+	root->defrag_level = 0;
 	root->root_key.objectid = objectid;
 	return 0;
 }
@@ -585,6 +590,7 @@
 	fs_info->closing = 1;
 	btrfs_transaction_flush_work(root);
 	mutex_lock(&fs_info->fs_mutex);
+	btrfs_defrag_dirty_roots(root->fs_info);
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
 	/* run commit again to  drop the original snapshot */
@@ -616,7 +622,9 @@
 {
 	struct btrfs_root *root = BTRFS_I(bh->b_page->mapping->host)->root;
 	u64 transid = btrfs_header_generation(btrfs_buffer_header(bh));
+
 	WARN_ON(!atomic_read(&bh->b_count));
+
 	if (transid != root->fs_info->generation) {
 		printk(KERN_CRIT "transid mismatch buffer %llu, found %Lu running %Lu\n",
 			(unsigned long long)bh->b_blocknr,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5d4d5d8..26b8d340 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -23,7 +23,8 @@
 #include "transaction.h"
 
 static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
-			    *orig_root, u64 num_blocks, u64 search_start,
+			    *orig_root, u64 num_blocks, u64 empty_size,
+			    u64 search_start,
 			    u64 search_end, u64 hint_block,
 			    struct btrfs_key *ins, u64 exclude_start,
 			    u64 exclude_nr, int data);
@@ -379,7 +380,7 @@
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-	ret = find_free_extent(trans, root->fs_info->extent_root, 0, 0,
+	ret = find_free_extent(trans, root->fs_info->extent_root, 0, 0, 0,
 			       (u64)-1, 0, &ins, 0, 0, 0);
 	if (ret) {
 		btrfs_free_path(path);
@@ -533,7 +534,7 @@
 	struct btrfs_block_group_item *bi;
 	struct btrfs_key ins;
 
-	ret = find_free_extent(trans, extent_root, 0, 0, (u64)-1, 0, &ins,
+	ret = find_free_extent(trans, extent_root, 0, 0, 0, (u64)-1, 0, &ins,
 			       0, 0, 0);
 	/* FIXME, set bit to recalc cache groups on next mount */
 	if (ret)
@@ -708,6 +709,7 @@
 static int try_remove_page(struct address_space *mapping, unsigned long index)
 {
 	int ret;
+	return 0; /* unconditional early return: the invalidate call below is now unreachable */
 	ret = invalidate_mapping_pages(mapping, index, index);
 	return ret;
 }
@@ -866,7 +868,7 @@
 	if (!path)
 		return -ENOMEM;
 
-	ret = find_free_extent(trans, root, 0, 0, (u64)-1, 0, &ins, 0, 0, 0);
+	ret = find_free_extent(trans, root, 0, 0, 0, (u64)-1, 0, &ins, 0, 0, 0);
 	if (ret) {
 		btrfs_free_path(path);
 		return ret;
@@ -983,8 +985,8 @@
  * Any available blocks before search_start are skipped.
  */
 static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
-			    *orig_root, u64 num_blocks, u64 search_start, u64
-			    search_end, u64 hint_block,
+			    *orig_root, u64 num_blocks, u64 empty_size,
+			    u64 search_start, u64 search_end, u64 hint_block,
 			    struct btrfs_key *ins, u64 exclude_start,
 			    u64 exclude_nr, int data)
 {
@@ -1042,6 +1044,7 @@
 						     data, 1);
 	}
 
+	total_needed += empty_size;
 	path = btrfs_alloc_path();
 
 check_failed:
@@ -1157,9 +1160,11 @@
 			goto error;
 		}
 		search_start = orig_search_start;
-		if (wrapped)
+		if (wrapped) {
+			if (!full_scan)
+				total_needed -= empty_size;
 			full_scan = 1;
-		else
+		} else
 			wrapped = 1;
 		goto new_group;
 	}
@@ -1238,9 +1243,11 @@
 			ret = -ENOSPC;
 			goto error;
 		}
-		if (wrapped)
+		if (wrapped) {
+			if (!full_scan)
+				total_needed -= empty_size;
 			full_scan = 1;
-		else
+		} else
 			wrapped = 1;
 	}
 	block_group = btrfs_lookup_block_group(info, search_start);
@@ -1264,7 +1271,7 @@
  */
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, u64 owner,
-		       u64 num_blocks, u64 hint_block,
+		       u64 num_blocks, u64 empty_size, u64 hint_block,
 		       u64 search_end, struct btrfs_key *ins, int data)
 {
 	int ret;
@@ -1303,7 +1310,7 @@
 	 * in the correct block group.
 	 */
 	if (data) {
-		ret = find_free_extent(trans, root, 0, 0,
+		ret = find_free_extent(trans, root, 0, 0, 0,
 				       search_end, 0, &prealloc_key, 0, 0, 0);
 		BUG_ON(ret);
 		if (ret)
@@ -1313,8 +1320,8 @@
 	}
 
 	/* do the real allocation */
-	ret = find_free_extent(trans, root, num_blocks, search_start,
-			       search_end, hint_block, ins,
+	ret = find_free_extent(trans, root, num_blocks, empty_size,
+			       search_start, search_end, hint_block, ins,
 			       exclude_start, exclude_nr, data);
 	BUG_ON(ret);
 	if (ret)
@@ -1333,7 +1340,7 @@
 		exclude_start = ins->objectid;
 		exclude_nr = ins->offset;
 		hint_block = exclude_start + exclude_nr;
-		ret = find_free_extent(trans, root, 0, search_start,
+		ret = find_free_extent(trans, root, 0, 0, search_start,
 				       search_end, hint_block,
 				       &prealloc_key, exclude_start,
 				       exclude_nr, 0);
@@ -1368,14 +1375,16 @@
  * returns the tree buffer or NULL.
  */
 struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
-					   struct btrfs_root *root, u64 hint)
+					   struct btrfs_root *root, u64 hint,
+					   u64 empty_size)
 {
 	struct btrfs_key ins;
 	int ret;
 	struct buffer_head *buf;
 
 	ret = btrfs_alloc_extent(trans, root, root->root_key.objectid,
-				 1, hint, (unsigned long)-1, &ins, 0);
+				 1, empty_size, hint,
+				 (unsigned long)-1, &ins, 0);
 	if (ret) {
 		BUG_ON(ret > 0);
 		return ERR_PTR(ret);
@@ -1385,6 +1394,7 @@
 		btrfs_free_extent(trans, root, ins.objectid, 1, 0);
 		return ERR_PTR(-ENOMEM);
 	}
+	WARN_ON(buffer_dirty(buf));
 	set_buffer_uptodate(buf);
 	set_buffer_checked(buf);
 	set_radix_bit(&trans->transaction->dirty_pages, buf->b_page->index);
@@ -1591,13 +1601,15 @@
 		struct btrfs_key key;
 		struct btrfs_disk_key *found_key;
 		struct btrfs_node *node;
+
 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
+		level = root_item->drop_level;
+		path->lowest_level = level;
 		wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-		if (ret < 0) {
+		if (wret < 0) {
 			ret = wret;
 			goto out;
 		}
-		level = root_item->drop_level;
 		node = btrfs_buffer_node(path->nodes[level]);
 		found_key = &node->ptrs[path->slots[level]].key;
 		WARN_ON(memcmp(found_key, &root_item->drop_progress,
@@ -1617,8 +1629,6 @@
 			ret = wret;
 		num_walks++;
 		if (num_walks > 10) {
-			struct btrfs_key key;
-			btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
 			ret = -EAGAIN;
 			get_bh(root->node);
 			break;
@@ -1627,6 +1637,7 @@
 	for (i = 0; i <= orig_level; i++) {
 		if (path->nodes[i]) {
 			btrfs_block_release(root, path->nodes[i]);
+			path->nodes[i] = 0;
 		}
 	}
 out:
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1fe38fe..00b118a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -512,7 +512,7 @@
 	if (isize >= PAGE_CACHE_SIZE || pos + write_bytes < inode->i_size ||
 	    pos + write_bytes - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) {
 		err = btrfs_alloc_extent(trans, root, inode->i_ino,
-					 num_blocks, hint_block, (u64)-1,
+					 num_blocks, 0, hint_block, (u64)-1,
 					 &ins, 1);
 		if (err)
 			goto failed_truncate;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3889032..12aa043 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -554,7 +554,7 @@
 				 &alloc_hint);
 	if (ret)
 		goto out;
-	ret = btrfs_alloc_extent(trans, root, inode->i_ino, 1,
+	ret = btrfs_alloc_extent(trans, root, inode->i_ino, 1, 0,
 				 alloc_hint, (u64)-1, &ins, 1);
 	if (ret)
 		goto out;
@@ -1360,7 +1360,7 @@
 	if (create & BTRFS_GET_BLOCK_CREATE) {
 		struct btrfs_key ins;
 		ret = btrfs_alloc_extent(trans, root, inode->i_ino,
-					 1, alloc_hint, (u64)-1,
+					 1, 0, alloc_hint, (u64)-1,
 					 &ins, 1);
 		if (ret) {
 			err = ret;
@@ -1998,7 +1998,7 @@
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(!trans);
 
-	subvol = btrfs_alloc_free_block(trans, root, 0);
+	subvol = btrfs_alloc_free_block(trans, root, 0, 0);
 	if (IS_ERR(subvol))
 		return PTR_ERR(subvol);
 	leaf = btrfs_buffer_leaf(subvol);
@@ -2159,7 +2159,9 @@
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_ioctl_vol_args vol_args;
+	struct btrfs_trans_handle *trans;
 	int ret = 0;
+	int err;
 	struct btrfs_dir_item *di;
 	int namelen;
 	struct btrfs_path *path;
@@ -2196,6 +2198,31 @@
 		else
 			ret = create_snapshot(root, vol_args.name, namelen);
 		break;
+
+	case BTRFS_IOC_DEFRAG:
+		mutex_lock(&root->fs_info->fs_mutex);
+		trans = btrfs_start_transaction(root, 1);
+		memset(&root->defrag_progress, 0,
+		       sizeof(root->defrag_progress));
+		while (1) {
+			root->defrag_running = 1;
+			err = btrfs_defrag_leaves(trans, root, 0);
+
+			btrfs_end_transaction(trans, root);
+			mutex_unlock(&root->fs_info->fs_mutex);
+
+			btrfs_btree_balance_dirty(root);
+
+			mutex_lock(&root->fs_info->fs_mutex);
+			trans = btrfs_start_transaction(root, 1);
+			if (err != -EAGAIN)
+				break;
+		}
+		root->defrag_running = 0;
+		btrfs_end_transaction(trans, root);
+		mutex_unlock(&root->fs_info->fs_mutex);
+		ret = 0;
+		break;
 	default:
 		return -ENOTTY;
 	}
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 23bed48..8bc47dec 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -28,6 +28,6 @@
 
 #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
 				   struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_ADD_DISK _IOW(BTRFS_IOCTL_MAGIC, 2, \
+#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
 				   struct btrfs_ioctl_vol_args)
 #endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4986264..338a719 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -29,6 +29,7 @@
 static struct workqueue_struct *trans_wq;
 
 #define BTRFS_ROOT_TRANS_TAG 0
+#define BTRFS_ROOT_DEFRAG_TAG 1
 
 static void put_transaction(struct btrfs_transaction *transaction)
 {
@@ -69,35 +70,41 @@
 	return 0;
 }
 
+static int record_root_in_trans(struct btrfs_root *root) /* on a ref_cows root's first use in the running transaction, tag it and pin its commit root; always returns 0 */
+{
+	u64 running_trans_id = root->fs_info->running_transaction->transid;
+	if (root->ref_cows && root->last_trans < running_trans_id) { /* body runs only while last_trans is behind the running transaction */
+		WARN_ON(root == root->fs_info->extent_root);
+		if (root->root_item.refs != 0) {
+			radix_tree_tag_set(&root->fs_info->fs_roots_radix,
+				   (unsigned long)root->root_key.objectid,
+				   BTRFS_ROOT_TRANS_TAG);
+			radix_tree_tag_set(&root->fs_info->fs_roots_radix,
+				   (unsigned long)root->root_key.objectid,
+				   BTRFS_ROOT_DEFRAG_TAG); /* this tag is what btrfs_defrag_dirty_roots looks up */
+			root->commit_root = root->node;
+			get_bh(root->node); /* reference taken for the commit_root pointer set above */
+		} else {
+			WARN_ON(1); /* a root with zero refs is not expected here */
+		}
+		root->last_trans = running_trans_id; /* updated even when the WARN_ON branch was taken */
+	}
+	return 0;
+}
+
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 						   int num_blocks)
 {
 	struct btrfs_trans_handle *h =
 		kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
 	int ret;
-	u64 running_trans_id;
 
 	mutex_lock(&root->fs_info->trans_mutex);
 	ret = join_transaction(root);
 	BUG_ON(ret);
-	running_trans_id = root->fs_info->running_transaction->transid;
 
-	if (root != root->fs_info->tree_root && root->last_trans <
-	    running_trans_id) {
-		WARN_ON(root == root->fs_info->extent_root);
-		WARN_ON(root->ref_cows != 1);
-		if (root->root_item.refs != 0) {
-			radix_tree_tag_set(&root->fs_info->fs_roots_radix,
-					   (unsigned long)root->root_key.objectid,
-					   BTRFS_ROOT_TRANS_TAG);
-			root->commit_root = root->node;
-			get_bh(root->node);
-		} else {
-			WARN_ON(1);
-		}
-	}
-	root->last_trans = running_trans_id;
-	h->transid = running_trans_id;
+	record_root_in_trans(root);
+	h->transid = root->fs_info->running_transaction->transid;
 	h->transaction = root->fs_info->running_transaction;
 	h->blocks_reserved = num_blocks;
 	h->blocks_used = 0;
@@ -155,6 +162,15 @@
 					      gang[i]);
 			if (!page)
 				continue;
+			if (PageWriteback(page)) {
+				if (PageDirty(page))
+					wait_on_page_writeback(page);
+				else {
+					unlock_page(page);
+					page_cache_release(page);
+					continue;
+				}
+			}
 			err = write_one_page(page, 0);
 			if (err)
 				werr = err;
@@ -299,6 +315,58 @@
 	return err;
 }
 
+int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info) /* run a cache-only defrag pass over every root tagged BTRFS_ROOT_DEFRAG_TAG */
+{
+	struct btrfs_root *gang[1]; /* roots are looked up one at a time */
+	struct btrfs_root *root;
+	struct btrfs_root *tree_root = info->tree_root;
+	struct btrfs_trans_handle *trans;
+	int i;
+	int ret;
+	int err = 0;
+	u64 last = 0;
+
+	trans = btrfs_start_transaction(tree_root, 1);
+	while(1) {
+		ret = radix_tree_gang_lookup_tag(&info->fs_roots_radix,
+						 (void **)gang, last,
+						 ARRAY_SIZE(gang),
+						 BTRFS_ROOT_DEFRAG_TAG);
+		if (ret == 0)
+			break;
+		for (i = 0; i < ret; i++) {
+			root = gang[i];
+			last = root->root_key.objectid + 1; /* the next lookup resumes after this root */
+			radix_tree_tag_clear(&info->fs_roots_radix,
+				     (unsigned long)root->root_key.objectid,
+				     BTRFS_ROOT_DEFRAG_TAG);
+			if (root->defrag_running) /* a defrag is already in progress on this root */
+				continue;
+
+			while (1) {
+				mutex_lock(&root->fs_info->trans_mutex);
+				record_root_in_trans(root);
+				mutex_unlock(&root->fs_info->trans_mutex);
+
+				root->defrag_running = 1;
+				err = btrfs_defrag_leaves(trans, root, 1); /* third argument is cache_only */
+				btrfs_end_transaction(trans, tree_root);
+				mutex_unlock(&info->fs_mutex); /* NOTE: fs_mutex is dropped without being taken here, so the caller must hold it on entry */
+
+				btrfs_btree_balance_dirty(root);
+
+				mutex_lock(&info->fs_mutex);
+				trans = btrfs_start_transaction(tree_root, 1);
+				if (err != -EAGAIN) /* -EAGAIN is what btrfs_defrag_leaves returns when it stops early */
+					break;
+			}
+			root->defrag_running = 0;
+		}
+	}
+	btrfs_end_transaction(trans, tree_root);
+	return err; /* result of the last btrfs_defrag_leaves call, or 0 if none ran */
+}
+
 static int drop_dirty_roots(struct btrfs_root *tree_root,
 			    struct list_head *list)
 {
@@ -475,6 +543,7 @@
 		goto out;
 	}
 	mutex_unlock(&root->fs_info->trans_mutex);
+	btrfs_defrag_dirty_roots(root->fs_info);
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
 out:
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index ebf44f3..8b2714e 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -70,5 +70,6 @@
 void btrfs_init_transaction_sys(void);
 void btrfs_exit_transaction_sys(void);
 int btrfs_add_dead_root(struct btrfs_root *root, struct list_head *dead_list);
+int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info);
 
 #endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
new file mode 100644
index 0000000..15d0a48
--- /dev/null
+++ b/fs/btrfs/tree-defrag.c
@@ -0,0 +1,222 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+#include "transaction.h"
+
+static void reada_defrag(struct btrfs_root *root,
+			 struct btrfs_node *node) /* request readahead for every block pointer stored in node */
+{
+	int i;
+	u32 nritems;
+	u64 blocknr;
+	int ret;
+
+	nritems = btrfs_header_nritems(&node->header);
+	for (i = 0; i < nritems; i++) {
+		blocknr = btrfs_node_blockptr(node, i);
+		ret = readahead_tree_block(root, blocknr);
+		if (ret) /* stop at the first nonzero return */
+			break;
+	}
+}
+
+static int defrag_walk_down(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root,
+			    struct btrfs_path *path, int *level,
+			    int cache_only) /* descend from path->nodes[*level], COWing and reallocating nodes on the way; always returns 0 */
+{
+	struct buffer_head *next;
+	struct buffer_head *cur;
+	u64 blocknr;
+	int ret = 0;
+
+	WARN_ON(*level < 0);
+	WARN_ON(*level >= BTRFS_MAX_LEVEL);
+
+	while(*level > 0) {
+		WARN_ON(*level < 0);
+		WARN_ON(*level >= BTRFS_MAX_LEVEL);
+		cur = path->nodes[*level];
+
+		if (!cache_only && *level > 1 && path->slots[*level] == 0) /* readahead only when first entering a node above level 1 */
+			reada_defrag(root, btrfs_buffer_node(cur));
+
+		if (btrfs_header_level(btrfs_buffer_header(cur)) != *level)
+			WARN_ON(1);
+
+		if (path->slots[*level] >=
+		    btrfs_header_nritems(btrfs_buffer_header(cur)))
+			break; /* every slot of this node has been visited */
+
+		if (*level == 1) { /* level 1: reallocate its children and stop descending */
+			ret = btrfs_realloc_node(trans, root,
+						 path->nodes[*level],
+						 cache_only);
+			break;
+		}
+		blocknr = btrfs_node_blockptr(btrfs_buffer_node(cur),
+					      path->slots[*level]);
+
+		if (cache_only) {
+			next = btrfs_find_tree_block(root, blocknr);
+			if (!next || !buffer_uptodate(next) ||
+			   buffer_locked(next)) { /* child is missing, not up to date, or locked: skip to the next slot */
+				brelse(next);
+				path->slots[*level]++;
+				continue;
+			}
+		} else {
+			next = read_tree_block(root, blocknr);
+		}
+		ret = btrfs_cow_block(trans, root, next, path->nodes[*level],
+				      path->slots[*level], &next); /* next is replaced by the block returned through cow_ret */
+		BUG_ON(ret);
+		ret = btrfs_realloc_node(trans, root, next, cache_only);
+		BUG_ON(ret);
+		WARN_ON(*level <= 0);
+		if (path->nodes[*level-1])
+			btrfs_block_release(root, path->nodes[*level-1]);
+		path->nodes[*level-1] = next;
+		*level = btrfs_header_level(btrfs_buffer_header(next)); /* level is taken from the child's own header */
+		path->slots[*level] = 0;
+	}
+	WARN_ON(*level < 0);
+	WARN_ON(*level >= BTRFS_MAX_LEVEL);
+	btrfs_block_release(root, path->nodes[*level]); /* drop the node where the descent stopped and report one level up */
+	path->nodes[*level] = NULL;
+	*level += 1;
+	WARN_ON(ret); /* a nonzero ret is only warned about; it is not returned */
+	return 0;
+}
+
+static int defrag_walk_up(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  struct btrfs_path *path, int *level,
+			  int cache_only) /* returns 0 after advancing to the next slot at some level, 1 when no level has one; trans and cache_only are unused */
+{
+	int i;
+	int slot;
+	struct btrfs_node *node;
+
+	for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
+		slot = path->slots[i];
+		if (slot < btrfs_header_nritems(
+		    btrfs_buffer_header(path->nodes[i])) - 1) { /* this node still has a slot to the right */
+			path->slots[i]++;
+			*level = i;
+			node = btrfs_buffer_node(path->nodes[i]);
+			WARN_ON(i == 0);
+			btrfs_disk_key_to_cpu(&root->defrag_progress,
+					      &node->ptrs[path->slots[i]].key); /* save the new slot's key as the resume point */
+			root->defrag_level = i;
+			return 0;
+		} else {
+			btrfs_block_release(root, path->nodes[*level]); /* node exhausted: drop it and look one level higher */
+			path->nodes[*level] = NULL;
+			*level = i + 1;
+		}
+	}
+	return 1;
+}
+
+int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root, int cache_only) /* do a bounded amount of defrag on root; returns -EAGAIN when it stopped early and should be called again */
+{
+	struct btrfs_path *path = NULL;
+	struct buffer_head *tmp;
+	int ret = 0;
+	int wret;
+	int level;
+	int orig_level;
+	int i;
+	int num_runs = 0;
+
+	if (root->ref_cows == 0) { /* roots without ref_cows are not defragged */
+		goto out;
+	}
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM; /* returns directly, bypassing the progress reset at out: */
+
+	level = btrfs_header_level(btrfs_buffer_header(root->node));
+	orig_level = level;
+	if (level == 0) { /* the root node is at level 0: nothing to do */
+		goto out;
+	}
+	if (root->defrag_progress.objectid == 0) { /* no saved progress: start from the root node */
+		get_bh(root->node);
+		ret = btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp);
+		BUG_ON(ret);
+		ret = btrfs_realloc_node(trans, root, root->node, cache_only);
+		BUG_ON(ret);
+		path->nodes[level] = root->node;
+		path->slots[level] = 0;
+	} else { /* resume at the key and level saved by defrag_walk_up */
+		level = root->defrag_level;
+		path->lowest_level = level;
+		wret = btrfs_search_slot(trans, root, &root->defrag_progress,
+					 path, 0, 1);
+
+		if (wret < 0) {
+			ret = wret;
+			goto out;
+		}
+		while(level > 0 && !path->nodes[level]) /* fall back to the highest level the search filled in */
+			level--;
+		if (!path->nodes[level]) {
+			ret = 0;
+			goto out;
+		}
+	}
+
+	while(1) {
+		wret = defrag_walk_down(trans, root, path, &level, cache_only);
+		if (wret > 0)
+			break;
+		if (wret < 0)
+			ret = wret;
+
+		wret = defrag_walk_up(trans, root, path, &level, cache_only);
+		if (wret > 0) /* defrag_walk_up returns 1 when no level has a further slot */
+			break;
+		if (wret < 0)
+			ret = wret;
+		if (num_runs++ > 8) { /* bound the work done per call */
+			ret = -EAGAIN;
+			break;
+		}
+	}
+	for (i = 0; i <= orig_level; i++) {
+		if (path->nodes[i]) {
+			btrfs_block_release(root, path->nodes[i]);
+			path->nodes[i] = 0;
+		}
+	}
+out:
+	if (path)
+		btrfs_free_path(path);
+	if (ret != -EAGAIN) { /* any result other than -EAGAIN clears the saved progress */
+		memset(&root->defrag_progress, 0,
+		       sizeof(root->defrag_progress));
+	}
+	return ret;
+}