Btrfs: improve free space cache management and space allocation

While under random IO, a block group's free space cache eventually reaches
a state where it has a mix of extent entries and bitmap entries representing
free space regions.

As free space regions are later returned to the cache, some of them are merged
with existing extent entries when they are contiguous with them. Others are not
merged: even though adjacent free space regions exist in the cache, they are
represented by bitmap entries, and we do not merge an extent entry with a
bitmap entry. Moreover, even when a new free space region is merged with an
existing extent entry (enlarging the free space range it represents), we can
end up with an enlarged region that is contiguous with some other region
represented by a bitmap entry.

Both clustered and non-clustered space allocation work by iterating over our
extent and bitmap entries and skipping any that represents a region smaller
than the allocation request (and giving preference to extent entries over
bitmap entries). When a contiguous free space region is represented by 2 (or
more) entries (a mix of extent and bitmap entries), we end up unable to satisfy
an allocation request whose size is larger than the size of any single entry
but no larger than the sum of their sizes. This makes the caller assume we are
under an ENOSPC condition, or forces it to allocate multiple smaller space
regions (as we do for file data writes), which adds extra overhead and more
chances of causing fragmentation, since the smaller regions can end up spread
apart from each other (more likely when under concurrency).
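To make that limitation concrete, here is a minimal standalone sketch of the
selection behaviour described above (hypothetical toy_entry/pick_entry names,
not the kernel's find_free_space() code): an entry is only usable if it alone
is large enough, extent entries are preferred, and a request that would fit
only across two adjacent entries is never satisfied.

   #include <stdbool.h>
   #include <stddef.h>
   #include <stdint.h>

   /* Simplified cache entry: offset/size of a free region plus a flag
    * telling whether it is backed by a bitmap (illustration only). */
   struct toy_entry {
       uint64_t offset;
       uint64_t bytes;
       bool bitmap;
   };

   /*
    * Pick an entry that alone satisfies the request, preferring extent
    * entries over bitmap entries. A region split across two entries is
    * never considered, which is exactly the problem described above.
    */
   struct toy_entry *pick_entry(struct toy_entry *entries, int nr,
                                uint64_t wanted)
   {
       struct toy_entry *bitmap_hit = NULL;
       int i;

       for (i = 0; i < nr; i++) {
           if (entries[i].bytes < wanted)
               continue;               /* too small on its own, skip */
           if (!entries[i].bitmap)
               return &entries[i];     /* extent entry, preferred */
           if (!bitmap_hit)
               bitmap_hit = &entries[i];
       }
       return bitmap_hit; /* NULL here looks like ENOSPC to the caller */
   }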

For example, if we have the following in the cache:

* extent entry representing free space range: [128Mb - 256Kb, 128Mb[

* bitmap entry covering the range [128Mb, 256Mb[, but only with the bits
  representing the range [128Mb, 128Mb + 768Kb[ set - that is, only that
  space in this 128Mb area is marked as free

An allocation request for 1Mb, starting at an offset not greater than
128Mb - 256Kb, would fail before this change, despite the existence of such a
contiguous free space area in the cache. The caller could only allocate up to
768Kb of space at once and later another 256Kb (or vice-versa). In between each
smaller allocation request, another task working on a different file/inode
might come in and take that space, preventing the former task from getting a
contiguous 1Mb region of free space.
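The same scenario as a trivial standalone check (plain C using the constants
from the example above, not kernel code):

   #include <assert.h>
   #include <stdint.h>

   int main(void)
   {
       const uint64_t extent = 256 * 1024;  /* extent entry: [128Mb - 256Kb, 128Mb[ */
       const uint64_t bitmap = 768 * 1024;  /* set bits:     [128Mb, 128Mb + 768Kb[ */
       const uint64_t wanted = 1024 * 1024; /* 1Mb allocation request */

       /* Before the change: each entry is checked on its own and both are
        * smaller than the request, so the allocation fails. */
       assert(extent < wanted && bitmap < wanted);

       /* After the change: the 768Kb of adjacent bits are stolen into the
        * extent entry, giving one 1Mb entry that satisfies the request. */
       assert(extent + bitmap >= wanted);
       return 0;
   }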

Therefore this change implements the ability to move free space from bitmap
entries into existing and new free space regions represented by extent
entries. This is done when a space region is added to the cache.
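Reduced to a minimal standalone sketch (a toy array-backed bitmap and a
hypothetical steal_to_end() helper, not the btrfs_free_space_ctl structures the
patch below operates on), the idea is: starting where the new extent entry
ends, take the run of set (free) bits in the adjacent bitmap, clear them, and
grow the extent by that many units.

   #include <stdbool.h>
   #include <stdint.h>

   #define TOY_BITS 64 /* toy bitmap covering 64 allocation units */

   struct toy_extent {
       uint64_t offset; /* start, in allocation units */
       uint64_t units;  /* length, in allocation units */
   };

   /*
    * Steal the run of free (set) bits that starts right where the extent
    * ends. Assumes the extent ends inside the range the bitmap covers.
    * Mirrors steal_from_bitmap_to_end() in spirit only.
    */
   void steal_to_end(struct toy_extent *ext, bool bitmap[TOY_BITS],
                     uint64_t bitmap_start)
   {
       uint64_t i = ext->offset + ext->units - bitmap_start;

       while (i < TOY_BITS && bitmap[i]) {
           bitmap[i] = false; /* the space leaves the bitmap... */
           ext->units++;      /* ...and moves into the enlarged extent */
           i++;
       }
   }

The patch does this in both directions, towards the end of the new extent entry
and towards its front, before linking the entry into the cache.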

A test that explains the issue in detail was also added to the sanity tests.

Some performance test results follow, obtained with compilebench on a machine
with 4 cores, 32Gb of RAM and an HDD.

Test: compilebench -D /mnt -i 30 -r 1000 --makej

Before this change:

   intial create total runs 30 avg 69.02 MB/s (user 0.28s sys 0.57s)
   compile total runs 30 avg 314.96 MB/s (user 0.12s sys 0.25s)
   read compiled tree total runs 3 avg 27.14 MB/s (user 1.52s sys 0.90s)
   delete compiled tree total runs 30 avg 3.14 seconds (user 0.15s sys 0.66s)

After this change:

   intial create total runs 30 avg 68.37 MB/s (user 0.29s sys 0.55s)
   compile total runs 30 avg 382.83 MB/s (user 0.12s sys 0.24s)
   read compiled tree total runs 3 avg 27.82 MB/s (user 1.45s sys 0.97s)
   delete compiled tree total runs 30 avg 3.18 seconds (user 0.17s sys 0.65s)

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 2f0fe10..3384819 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1997,6 +1997,128 @@
 	return merged;
 }
 
+static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
+				     struct btrfs_free_space *info,
+				     bool update_stat)
+{
+	struct btrfs_free_space *bitmap;
+	unsigned long i;
+	unsigned long j;
+	const u64 end = info->offset + info->bytes;
+	const u64 bitmap_offset = offset_to_bitmap(ctl, end);
+	u64 bytes;
+
+	bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
+	if (!bitmap)
+		return false;
+
+	i = offset_to_bit(bitmap->offset, ctl->unit, end);
+	j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
+	if (j == i)
+		return false;
+	bytes = (j - i) * ctl->unit;
+	info->bytes += bytes;
+
+	if (update_stat)
+		bitmap_clear_bits(ctl, bitmap, end, bytes);
+	else
+		__bitmap_clear_bits(ctl, bitmap, end, bytes);
+
+	if (!bitmap->bytes)
+		free_bitmap(ctl, bitmap);
+
+	return true;
+}
+
+static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
+				       struct btrfs_free_space *info,
+				       bool update_stat)
+{
+	struct btrfs_free_space *bitmap;
+	u64 bitmap_offset;
+	unsigned long i;
+	unsigned long j;
+	unsigned long prev_j;
+	u64 bytes;
+
+	bitmap_offset = offset_to_bitmap(ctl, info->offset);
+	/* If we're on a boundary, try the previous logical bitmap. */
+	if (bitmap_offset == info->offset) {
+		if (info->offset == 0)
+			return false;
+		bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
+	}
+
+	bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
+	if (!bitmap)
+		return false;
+
+	i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
+	j = 0;
+	prev_j = (unsigned long)-1;
+	for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
+		if (j > i)
+			break;
+		prev_j = j;
+	}
+	if (prev_j == i)
+		return false;
+
+	if (prev_j == (unsigned long)-1)
+		bytes = (i + 1) * ctl->unit;
+	else
+		bytes = (i - prev_j) * ctl->unit;
+
+	info->offset -= bytes;
+	info->bytes += bytes;
+
+	if (update_stat)
+		bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
+	else
+		__bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
+
+	if (!bitmap->bytes)
+		free_bitmap(ctl, bitmap);
+
+	return true;
+}
+
+/*
+ * We prefer always to allocate from extent entries, both for clustered and
+ * non-clustered allocation requests. So when attempting to add a new extent
+ * entry, try to see if there's adjacent free space in bitmap entries, and if
+ * there is, migrate that space from the bitmaps to the extent.
+ * Like this we get better chances of satisfying space allocation requests
+ * because we attempt to satisfy them based on a single cache entry, and never
+ * on 2 or more entries - even if the entries represent a contiguous free space
+ * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
+ * ends).
+ */
+static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
+			      struct btrfs_free_space *info,
+			      bool update_stat)
+{
+	/*
+	 * Only work with disconnected entries, as we can change their offset,
+	 * and must be extent entries.
+	 */
+	ASSERT(!info->bitmap);
+	ASSERT(RB_EMPTY_NODE(&info->offset_index));
+
+	if (ctl->total_bitmaps > 0) {
+		bool stole_end;
+		bool stole_front = false;
+
+		stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
+		if (ctl->total_bitmaps > 0)
+			stole_front = steal_from_bitmap_to_front(ctl, info,
+								 update_stat);
+
+		if (stole_end || stole_front)
+			try_merge_free_space(ctl, info, update_stat);
+	}
+}
+
 int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
 			   u64 offset, u64 bytes)
 {
@@ -2009,6 +2131,7 @@
 
 	info->offset = offset;
 	info->bytes = bytes;
+	RB_CLEAR_NODE(&info->offset_index);
 
 	spin_lock(&ctl->tree_lock);
 
@@ -2028,6 +2151,14 @@
 		goto out;
 	}
 link:
+	/*
+	 * Only steal free space from adjacent bitmaps if we're sure we're not
+	 * going to add the new free space to existing bitmap entries - because
+	 * that would mean unnecessary work that would be reverted. Therefore
+	 * attempt to steal space from bitmaps if we're adding an extent entry.
+	 */
+	steal_from_bitmap(ctl, info, true);
+
 	ret = link_free_space(ctl, info);
 	if (ret)
 		kmem_cache_free(btrfs_free_space_cachep, info);
@@ -2204,10 +2335,13 @@
 		entry = rb_entry(node, struct btrfs_free_space, offset_index);
 		node = rb_next(&entry->offset_index);
 		rb_erase(&entry->offset_index, &cluster->root);
+		RB_CLEAR_NODE(&entry->offset_index);
 
 		bitmap = (entry->bitmap != NULL);
-		if (!bitmap)
+		if (!bitmap) {
 			try_merge_free_space(ctl, entry, false);
+			steal_from_bitmap(ctl, entry, false);
+		}
 		tree_insert_offset(&ctl->free_space_offset,
 				   entry->offset, &entry->offset_index, bitmap);
 	}
@@ -3175,6 +3309,7 @@
 		map = NULL;
 		add_new_bitmap(ctl, info, offset);
 		bitmap_info = info;
+		info = NULL;
 	}
 
 	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
@@ -3185,6 +3320,8 @@
 	if (bytes)
 		goto again;
 
+	if (info)
+		kmem_cache_free(btrfs_free_space_cachep, info);
 	if (map)
 		kfree(map);
 	return 0;
@@ -3259,6 +3396,7 @@
 			goto have_info;
 		}
 
+		ret = 0;
 		goto out;
 	}