Btrfs: allocator optimizations, truncate readahead

Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d8caeba..5830b43 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1081,7 +1081,7 @@
 struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 						 struct btrfs_block_group_cache
 						 *hint, u64 search_start,
-						 int data);
+						 int data, int owner);
 int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root);
 struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 678456d..4c0262b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -257,24 +257,30 @@
 int readahead_tree_block(struct btrfs_root *root, u64 blocknr)
 {
 	struct buffer_head *bh = NULL;
+	int ret = 0;
 
 	bh = btrfs_find_create_tree_block(root, blocknr);
 	if (!bh)
 		return 0;
-	if (buffer_uptodate(bh))
+	if (buffer_uptodate(bh)) {
+		ret = 1;
 		goto done;
-	if (test_set_buffer_locked(bh))
+	}
+	if (test_set_buffer_locked(bh)) {
+		ret = 1;
 		goto done;
+	}
 	if (!buffer_uptodate(bh)) {
 		get_bh(bh);
 		bh->b_end_io = end_buffer_read_sync;
 		submit_bh(READ, bh);
 	} else {
 		unlock_buffer(bh);
+		ret = 1;
 	}
 done:
 	brelse(bh);
-	return 0;
+	return ret;
 }
 
 struct buffer_head *read_tree_block(struct btrfs_root *root, u64 blocknr)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1497ff9..e3c6bfe 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -12,6 +12,33 @@
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *extent_root);
 
+static void reada_extent_leaves(struct btrfs_root *root,
+				struct btrfs_path *path, u64 limit)
+{
+	struct btrfs_node *node;
+	int i;
+	int nritems;
+	u64 item_objectid;
+	u64 blocknr;
+	int slot;
+	int ret;
+
+	if (!path->nodes[1])
+		return;
+	node = btrfs_buffer_node(path->nodes[1]);
+	slot = path->slots[1] + 1;
+	nritems = btrfs_header_nritems(&node->header);
+	for (i = slot; i < nritems && i < slot + 8; i++) {
+		item_objectid = btrfs_disk_key_objectid(&node->ptrs[i].key);
+		if (item_objectid > limit)
+			break;
+		blocknr = btrfs_node_blockptr(node, i);
+		ret = readahead_tree_block(root, blocknr);
+		if (ret)
+			break;
+	}
+}
+
 static int cache_block_group(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *block_group)
 {
@@ -24,6 +51,7 @@
 	u64 i;
 	u64 last = 0;
 	u64 hole_size;
+	u64 limit;
 	int found = 0;
 
 	root = root->fs_info->extent_root;
@@ -46,14 +74,17 @@
 		return ret;
 	if (ret && path->slots[0] > 0)
 		path->slots[0]--;
+	limit = block_group->key.objectid + block_group->key.offset;
+	reada_extent_leaves(root, path, limit);
 	while(1) {
 		leaf = btrfs_buffer_leaf(path->nodes[0]);
 		slot = path->slots[0];
 		if (slot >= btrfs_header_nritems(&leaf->header)) {
+			reada_extent_leaves(root, path, limit);
 			ret = btrfs_next_leaf(root, path);
-			if (ret == 0)
+			if (ret == 0) {
 				continue;
-			else {
+			} else {
 				if (found) {
 					hole_size = block_group->key.objectid +
 						block_group->key.offset - last;
@@ -187,7 +218,7 @@
 		return max((*cache_ret)->last_alloc, search_start);
 	}
 	cache = btrfs_find_block_group(root, cache,
-				       last + cache->key.offset - 1, 0);
+				       last + cache->key.offset - 1, 0, 0);
 	*cache_ret = cache;
 	goto again;
 }
@@ -195,7 +226,7 @@
 struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 						 struct btrfs_block_group_cache
 						 *hint, u64 search_start,
-						 int data)
+						 int data, int owner)
 {
 	struct btrfs_block_group_cache *cache[8];
 	struct btrfs_block_group_cache *found_group = NULL;
@@ -207,6 +238,10 @@
 	int i;
 	int ret;
 	int full_search = 0;
+	int factor = 8;
+
+	if (!owner)
+		factor = 5;
 
 	if (data)
 		radix = &info->block_group_data_radix;
@@ -219,14 +254,14 @@
 		if (shint->data == data) {
 			used = btrfs_block_group_used(&shint->item);
 			if (used + shint->pinned <
-			    (shint->key.offset * 8) / 10) {
+			    (shint->key.offset * factor) / 10) {
 				return shint;
 			}
 		}
 	}
 	if (hint && hint->data == data) {
 		used = btrfs_block_group_used(&hint->item);
-		if (used + hint->pinned < (hint->key.offset * 8) / 10) {
+		if (used + hint->pinned < (hint->key.offset * factor) / 10) {
 			return hint;
 		}
 		if (used >= (hint->key.offset * 8) / 10) {
@@ -261,7 +296,7 @@
 				cache[i]->key.offset;
 			used = btrfs_block_group_used(&cache[i]->item);
 			if (used + cache[i]->pinned <
-			    (cache[i]->key.offset * 8) / 10) {
+			    (cache[i]->key.offset * factor) / 10) {
 				found_group = cache[i];
 				goto found;
 			}
@@ -272,6 +307,7 @@
 						     BTRFS_BLOCK_GROUP_AVAIL);
 			}
 		}
+		cond_resched();
 	}
 	last = hint_last;
 again:
@@ -295,13 +331,16 @@
 						     BTRFS_BLOCK_GROUP_AVAIL);
 			}
 		}
+		cond_resched();
 	}
 	if (!full_search) {
+printk("find block group doing full search data %d start %Lu\n", data, search_start);
 		last = search_start;
 		full_search = 1;
 		goto again;
 	}
 	if (!found_group) {
+printk("find block group bailing to zero data %d\n", data);
 		ret = radix_tree_gang_lookup(radix,
 					     (void **)&found_group, 0, 1);
 		BUG_ON(ret != 1);
@@ -554,8 +593,8 @@
 						      blocknr + i);
 				}
 			}
-			if (old_val < (cache->key.offset * 6) / 10 &&
-			    old_val + num >= (cache->key.offset * 6) / 10) {
+			if (old_val < (cache->key.offset * 5) / 10 &&
+			    old_val + num >= (cache->key.offset * 5) / 10) {
 printk("group %Lu now available\n", cache->key.objectid);
 				radix_tree_tag_set(cache->radix,
 						   cache->key.objectid +
@@ -842,6 +881,7 @@
 	int level;
 	struct btrfs_block_group_cache *block_group;
 	int full_scan = 0;
+	u64 limit;
 
 	path = btrfs_alloc_path();
 	ins->flags = 0;
@@ -858,11 +898,11 @@
 	if (search_start) {
 		block_group = lookup_block_group(info, search_start);
 		block_group = btrfs_find_block_group(root, block_group,
-						     search_start, data);
+						     search_start, data, 1);
 	} else {
 		block_group = btrfs_find_block_group(root,
 						     trans->block_group, 0,
-						     data);
+						     data, 1);
 	}
 
 check_failed:
@@ -916,6 +956,12 @@
 				info->extent_tree_prealloc_nr = 0;
 				total_found = 0;
 			}
+			if (start_found)
+				limit = last_block +
+					block_group->key.offset / 2;
+			else
+				limit = search_start +
+					block_group->key.offset / 2;
 			ret = btrfs_next_leaf(root, path);
 			if (ret == 0)
 				continue;
@@ -960,6 +1006,7 @@
 		}
 next:
 		path->slots[0]++;
+		cond_resched();
 	}
 	// FIXME -ENOSPC
 check_pending:
@@ -1049,7 +1096,8 @@
 	block_group = lookup_block_group(info, search_start);
 	if (!full_scan)
 		block_group = btrfs_find_block_group(root, block_group,
-						     search_start, data);
+						     search_start, data, 0);
+	cond_resched();
 	goto check_failed;
 
 error:
@@ -1102,7 +1150,7 @@
 	 * in the correct block group.
 	 */
 	if (data) {
-		ret = find_free_extent(trans, root, 0, search_start,
+		ret = find_free_extent(trans, root, 0, 0,
 				       search_end, &prealloc_key, 0);
 		if (ret) {
 			return ret;
@@ -1173,7 +1221,7 @@
 	struct buffer_head *buf;
 
 	ret = btrfs_alloc_extent(trans, root, root->root_key.objectid,
-				 1, 0, (unsigned long)-1, &ins, 0);
+				 1, hint, (unsigned long)-1, &ins, 0);
 	if (ret) {
 		BUG();
 		return NULL;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index eaa48f0..0f79490 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -351,6 +351,35 @@
 	return ret;
 }
 
+static void reada_truncate(struct btrfs_root *root, struct btrfs_path *path,
+			   u64 objectid)
+{
+	struct btrfs_node *node;
+	int i;
+	int nritems;
+	u64 item_objectid;
+	u64 blocknr;
+	int slot;
+	int ret;
+
+	if (!path->nodes[1])
+		return;
+	node = btrfs_buffer_node(path->nodes[1]);
+	slot = path->slots[1];
+	if (slot == 0)
+		return;
+	nritems = btrfs_header_nritems(&node->header);
+	for (i = slot - 1; i >= 0; i--) {
+		item_objectid = btrfs_disk_key_objectid(&node->ptrs[i].key);
+		if (item_objectid != objectid)
+			break;
+		blocknr = btrfs_node_blockptr(node, i);
+		ret = readahead_tree_block(root, blocknr);
+		if (ret)
+			break;
+	}
+}
+
 static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct inode *inode)
@@ -386,6 +415,7 @@
 			BUG_ON(path->slots[0] == 0);
 			path->slots[0]--;
 		}
+		reada_truncate(root, path, inode->i_ino);
 		leaf = btrfs_buffer_leaf(path->nodes[0]);
 		found_key = &leaf->items[path->slots[0]].key;
 		if (btrfs_disk_key_objectid(found_key) != inode->i_ino)
@@ -587,28 +617,30 @@
 	return d_splice_alias(inode, dentry);
 }
 
-static void reada_leaves(struct btrfs_root *root, struct btrfs_path *path)
+static void reada_leaves(struct btrfs_root *root, struct btrfs_path *path,
+			 u64 objectid)
 {
 	struct btrfs_node *node;
 	int i;
-	int nritems;
-	u64 objectid;
+	u32 nritems;
 	u64 item_objectid;
 	u64 blocknr;
 	int slot;
+	int ret;
 
 	if (!path->nodes[1])
 		return;
 	node = btrfs_buffer_node(path->nodes[1]);
 	slot = path->slots[1];
-	objectid = btrfs_disk_key_objectid(&node->ptrs[slot].key);
 	nritems = btrfs_header_nritems(&node->header);
-	for (i = slot; i < nritems; i++) {
+	for (i = slot + 1; i < nritems; i++) {
 		item_objectid = btrfs_disk_key_objectid(&node->ptrs[i].key);
 		if (item_objectid != objectid)
 			break;
 		blocknr = btrfs_node_blockptr(node, i);
-		readahead_tree_block(root, blocknr);
+		ret = readahead_tree_block(root, blocknr);
+		if (ret)
+			break;
 	}
 }
 
@@ -646,21 +678,20 @@
 	if (ret < 0)
 		goto err;
 	advance = 0;
-	reada_leaves(root, path);
+	reada_leaves(root, path, inode->i_ino);
 	while(1) {
 		leaf = btrfs_buffer_leaf(path->nodes[0]);
 		nritems = btrfs_header_nritems(&leaf->header);
 		slot = path->slots[0];
 		if (advance || slot >= nritems) {
 			if (slot >= nritems -1) {
+				reada_leaves(root, path, inode->i_ino);
 				ret = btrfs_next_leaf(root, path);
 				if (ret)
 					break;
 				leaf = btrfs_buffer_leaf(path->nodes[0]);
 				nritems = btrfs_header_nritems(&leaf->header);
 				slot = path->slots[0];
-				if (path->slots[1] == 0)
-					reada_leaves(root, path);
 			} else {
 				slot++;
 				path->slots[0]++;
@@ -805,13 +836,18 @@
 	struct btrfs_inode_item inode_item;
 	struct btrfs_key *location;
 	int ret;
+	int owner;
 
 	inode = new_inode(root->fs_info->sb);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
 	BTRFS_I(inode)->root = root;
-	group = btrfs_find_block_group(root, group, 0, 0);
+	if (mode & S_IFDIR)
+		owner = 0;
+	else
+		owner = 1;
+	group = btrfs_find_block_group(root, group, 0, 0, owner);
 	BTRFS_I(inode)->block_group = group;
 
 	inode->i_uid = current->fsuid;
@@ -1562,7 +1598,7 @@
 static int drop_extents(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  struct inode *inode,
-			  u64 start, u64 end)
+			  u64 start, u64 end, u64 *hint_block)
 {
 	int ret;
 	struct btrfs_key key;
@@ -1659,17 +1695,14 @@
 				new_num = (start - key.offset) >>
 					inode->i_blkbits;
 				old_num = btrfs_file_extent_num_blocks(extent);
+				*hint_block =
+					btrfs_file_extent_disk_blocknr(extent);
 				inode->i_blocks -= (old_num - new_num) << 3;
 				btrfs_set_file_extent_num_blocks(extent,
 								 new_num);
 				mark_buffer_dirty(path->nodes[0]);
 			} else {
 				WARN_ON(1);
-				/*
-				ret = btrfs_truncate_item(trans, root, path,
-							  start - key.offset);
-				BUG_ON(ret);
-				*/
 			}
 		}
 		if (!keep) {
@@ -1683,6 +1716,8 @@
 				      btrfs_file_extent_disk_num_blocks(extent);
 				extent_num_blocks =
 				      btrfs_file_extent_num_blocks(extent);
+				*hint_block =
+					btrfs_file_extent_disk_blocknr(extent);
 			}
 			ret = btrfs_del_item(trans, root, path);
 			BUG_ON(ret);
@@ -1831,6 +1866,7 @@
 	u64 start_pos;
 	u64 num_blocks;
 	u64 alloc_extent_start;
+	u64 hint_block;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_key ins;
 	pinned[0] = NULL;
@@ -1871,6 +1907,7 @@
 	}
 	if (first_index != last_index &&
 	    (last_index << PAGE_CACHE_SHIFT) < inode->i_size &&
+	    pos + count < inode->i_size &&
 	    (count & (PAGE_CACHE_SIZE - 1))) {
 		pinned[1] = grab_cache_page(inode->i_mapping, last_index);
 		if (!PageUptodate(pinned[1])) {
@@ -1892,18 +1929,20 @@
 	btrfs_set_trans_block_group(trans, inode);
 	/* FIXME blocksize != 4096 */
 	inode->i_blocks += num_blocks << 3;
+	hint_block = 0;
 	if (start_pos < inode->i_size) {
 		/* FIXME blocksize != pagesize */
 		ret = drop_extents(trans, root, inode,
 				   start_pos,
 				   (pos + count + root->blocksize -1) &
-				   ~((u64)root->blocksize - 1));
+				   ~((u64)root->blocksize - 1), &hint_block);
 		BUG_ON(ret);
 	}
 	if (inode->i_size >= PAGE_CACHE_SIZE || pos + count < inode->i_size ||
 	    pos + count - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) {
 		ret = btrfs_alloc_extent(trans, root, inode->i_ino,
-					 num_blocks, 1, (u64)-1, &ins, 1);
+					 num_blocks, hint_block, (u64)-1,
+					 &ins, 1);
 		BUG_ON(ret);
 		ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
 				       start_pos, ins.objectid, ins.offset);
@@ -2455,7 +2494,6 @@
 			   btrfs_fill_super, mnt);
 }
 
-
 static int btrfs_getattr(struct vfsmount *mnt,
 			 struct dentry *dentry, struct kstat *stat)
 {