bcache: Kill unused freelist

This was originally added as an optimization that, for various reasons, isn't
needed anymore, but it does add a lot of nasty corner cases (and it was
responsible for some recently fixed bugs). Just get rid of it now.

The unused list held buckets that GC found nothing pointing into, which could
be reused without first rewriting their prios and gens; that shortcut is what
required the parallel disk_gen/need_save_prio bookkeeping to keep gens from
drifting too far from what's on disk. All of that goes away here.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index a59ef61..443d03f 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -78,12 +78,6 @@
 	ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b));
 	WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX);
 
-	if (CACHE_SYNC(&ca->set->sb)) {
-		ca->need_save_prio = max(ca->need_save_prio,
-					 bucket_disk_gen(b));
-		WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX);
-	}
-
 	return ret;
 }
 
@@ -120,58 +114,46 @@
 	mutex_unlock(&c->bucket_lock);
 }
 
-/* Allocation */
+/*
+ * Background allocation thread: scans for buckets to be invalidated,
+ * invalidates them, rewrites prios/gens (marking them as invalidated on disk),
+ * then optionally issues discard commands to the newly free buckets, then puts
+ * them on the various freelists.
+ */
 
 static inline bool can_inc_bucket_gen(struct bucket *b)
 {
-	return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX &&
-		bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX;
+	return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX;
 }
 
-bool bch_bucket_add_unused(struct cache *ca, struct bucket *b)
+bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b)
 {
-	BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b));
+	BUG_ON(!ca->set->gc_mark_valid);
 
-	if (CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) {
-		unsigned i;
-
-		for (i = 0; i < RESERVE_NONE; i++)
-			if (!fifo_full(&ca->free[i]))
-				goto add;
-
-		return false;
-	}
-add:
-	b->prio = 0;
-
-	if (can_inc_bucket_gen(b) &&
-	    fifo_push(&ca->unused, b - ca->buckets)) {
-		atomic_inc(&b->pin);
-		return true;
-	}
-
-	return false;
-}
-
-static bool can_invalidate_bucket(struct cache *ca, struct bucket *b)
-{
 	return (!GC_MARK(b) ||
 		GC_MARK(b) == GC_MARK_RECLAIMABLE) &&
 		!atomic_read(&b->pin) &&
 		can_inc_bucket_gen(b);
 }
 
-static void invalidate_one_bucket(struct cache *ca, struct bucket *b)
+void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
 {
-	size_t bucket = b - ca->buckets;
+	lockdep_assert_held(&ca->set->bucket_lock);
+	BUG_ON(GC_MARK(b) && GC_MARK(b) != GC_MARK_RECLAIMABLE);
 
 	if (GC_SECTORS_USED(b))
-		trace_bcache_invalidate(ca, bucket);
+		trace_bcache_invalidate(ca, b - ca->buckets);
 
 	bch_inc_gen(ca, b);
 	b->prio = INITIAL_PRIO;
 	atomic_inc(&b->pin);
-	fifo_push(&ca->free_inc, bucket);
+}
+
+static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
+{
+	__bch_invalidate_one_bucket(ca, b);
+
+	fifo_push(&ca->free_inc, b - ca->buckets);
 }
 
 /*
@@ -201,20 +183,7 @@
 	ca->heap.used = 0;
 
 	for_each_bucket(b, ca) {
-		/*
-		 * If we fill up the unused list, if we then return before
-		 * adding anything to the free_inc list we'll skip writing
-		 * prios/gens and just go back to allocating from the unused
-		 * list:
-		 */
-		if (fifo_full(&ca->unused))
-			return;
-
-		if (!can_invalidate_bucket(ca, b))
-			continue;
-
-		if (!GC_SECTORS_USED(b) &&
-		    bch_bucket_add_unused(ca, b))
+		if (!bch_can_invalidate_bucket(ca, b))
 			continue;
 
 		if (!heap_full(&ca->heap))
@@ -239,7 +208,7 @@
 			return;
 		}
 
-		invalidate_one_bucket(ca, b);
+		bch_invalidate_one_bucket(ca, b);
 	}
 }
 
@@ -255,8 +224,8 @@
 
 		b = ca->buckets + ca->fifo_last_bucket++;
 
-		if (can_invalidate_bucket(ca, b))
-			invalidate_one_bucket(ca, b);
+		if (bch_can_invalidate_bucket(ca, b))
+			bch_invalidate_one_bucket(ca, b);
 
 		if (++checked >= ca->sb.nbuckets) {
 			ca->invalidate_needs_gc = 1;
@@ -280,8 +249,8 @@
 
 		b = ca->buckets + n;
 
-		if (can_invalidate_bucket(ca, b))
-			invalidate_one_bucket(ca, b);
+		if (bch_can_invalidate_bucket(ca, b))
+			bch_invalidate_one_bucket(ca, b);
 
 		if (++checked >= ca->sb.nbuckets / 2) {
 			ca->invalidate_needs_gc = 1;
@@ -293,8 +262,7 @@
 
 static void invalidate_buckets(struct cache *ca)
 {
-	if (ca->invalidate_needs_gc)
-		return;
+	BUG_ON(ca->invalidate_needs_gc);
 
 	switch (CACHE_REPLACEMENT(&ca->sb)) {
 	case CACHE_REPLACEMENT_LRU:
@@ -354,17 +322,10 @@
 		 * possibly issue discards to them, then we add the bucket to
 		 * the free list:
 		 */
-		while (1) {
+		while (!fifo_empty(&ca->free_inc)) {
 			long bucket;
 
-			if ((!atomic_read(&ca->set->prio_blocked) ||
-			     !CACHE_SYNC(&ca->set->sb)) &&
-			    !fifo_empty(&ca->unused))
-				fifo_pop(&ca->unused, bucket);
-			else if (!fifo_empty(&ca->free_inc))
-				fifo_pop(&ca->free_inc, bucket);
-			else
-				break;
+			fifo_pop(&ca->free_inc, bucket);
 
 			if (ca->discard) {
 				mutex_unlock(&ca->set->bucket_lock);
@@ -385,9 +346,9 @@
 		 * them to the free_inc list:
 		 */
 
+retry_invalidate:
 		allocator_wait(ca, ca->set->gc_mark_valid &&
-			       (ca->need_save_prio > 64 ||
-				!ca->invalidate_needs_gc));
+			       !ca->invalidate_needs_gc);
 		invalidate_buckets(ca);
 
 		/*
@@ -395,13 +356,28 @@
 		 * new stuff to them:
 		 */
 		allocator_wait(ca, !atomic_read(&ca->set->prio_blocked));
-		if (CACHE_SYNC(&ca->set->sb) &&
-		    (!fifo_empty(&ca->free_inc) ||
-		     ca->need_save_prio > 64))
+		if (CACHE_SYNC(&ca->set->sb)) {
+			/*
+			 * This could deadlock if an allocation with a btree
+			 * node locked ever blocked - having the btree node
+			 * locked would block garbage collection, but here we're
+			 * waiting on garbage collection before we invalidate
+			 * and free anything.
+			 *
+			 * But this should be safe since the btree code always
+			 * uses btree_check_reserve() before allocating now, and
+			 * if it fails it blocks without btree nodes locked.
+			 */
+			if (!fifo_full(&ca->free_inc))
+				goto retry_invalidate;
+
 			bch_prio_write(ca);
+		}
 	}
 }
 
+/* Allocation */
+
 long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
 {
 	DEFINE_WAIT(w);
@@ -447,8 +423,6 @@
 				BUG_ON(i == r);
 		fifo_for_each(i, &ca->free_inc, iter)
 			BUG_ON(i == r);
-		fifo_for_each(i, &ca->unused, iter)
-			BUG_ON(i == r);
 	}
 
 	b = ca->buckets + r;
@@ -470,17 +444,19 @@
 	return r;
 }
 
+void __bch_bucket_free(struct cache *ca, struct bucket *b)
+{
+	SET_GC_MARK(b, 0);
+	SET_GC_SECTORS_USED(b, 0);
+}
+
 void bch_bucket_free(struct cache_set *c, struct bkey *k)
 {
 	unsigned i;
 
-	for (i = 0; i < KEY_PTRS(k); i++) {
-		struct bucket *b = PTR_BUCKET(c, k, i);
-
-		SET_GC_MARK(b, 0);
-		SET_GC_SECTORS_USED(b, 0);
-		bch_bucket_add_unused(PTR_CACHE(c, k, i), b);
-	}
+	for (i = 0; i < KEY_PTRS(k); i++)
+		__bch_bucket_free(PTR_CACHE(c, k, i),
+				  PTR_BUCKET(c, k, i));
 }
 
 int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
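Taken together, the alloc.c hunks above leave a single pipeline, as described
in the comment added near the top of the file: invalidate a bucket, persist
the new prios/gens, optionally discard, then put it on a freelist. A minimal
userspace model of that flow follows; it is a sketch only, with toy types and
no locking or discards, and nothing in it is kernel API:

  #include <stdint.h>
  #include <stdio.h>

  #define NBUCKETS 16

  /* toy stand-in for the kernel fifo -- illustrative, not the real API */
  struct toy_fifo { long data[NBUCKETS]; int head, tail; };

  static int toy_push(struct toy_fifo *f, long v)
  {
          if (f->tail == NBUCKETS)
                  return 0;
          f->data[f->tail++] = v;
          return 1;
  }

  static int toy_pop(struct toy_fifo *f, long *v)
  {
          if (f->head == f->tail)
                  return 0;
          *v = f->data[f->head++];
          return 1;
  }

  int main(void)
  {
          uint8_t gens[NBUCKETS] = { 0 };
          struct toy_fifo free_inc = { { 0 } }, freelist = { { 0 } };
          long b;

          /* invalidate: bump the gen so stale pointers quit matching,
           * then queue the bucket for the prios/gens write */
          for (b = 0; b < 4; b++) {
                  gens[b]++;
                  toy_push(&free_inc, b);
          }

          /* the real thread calls bch_prio_write() here once free_inc
           * is full (and CACHE_SYNC is set); only after the new gens
           * are on disk do buckets move to the freelist proper */
          while (toy_pop(&free_inc, &b))
                  toy_push(&freelist, b);

          printf("%d buckets ready to allocate\n",
                 freelist.tail - freelist.head);
          return 0;
  }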
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 171cda8..200efc1 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -195,7 +195,6 @@
 	atomic_t	pin;
 	uint16_t	prio;
 	uint8_t		gen;
-	uint8_t		disk_gen;
 	uint8_t		last_gc; /* Most out of date gen in the btree */
 	uint8_t		gc_gen;
 	uint16_t	gc_mark; /* Bitfield used by GC. See below for field */
@@ -426,14 +425,9 @@
 	 * their new gen to disk. After prio_write() finishes writing the new
 	 * gens/prios, they'll be moved to the free list (and possibly discarded
 	 * in the process)
-	 *
-	 * unused: GC found nothing pointing into these buckets (possibly
-	 * because all the data they contained was overwritten), so we only
-	 * need to discard them before they can be moved to the free list.
 	 */
 	DECLARE_FIFO(long, free)[RESERVE_NR];
 	DECLARE_FIFO(long, free_inc);
-	DECLARE_FIFO(long, unused);
 
 	size_t			fifo_last_bucket;
 
@@ -443,12 +437,6 @@
 	DECLARE_HEAP(struct bucket *, heap);
 
 	/*
-	 * max(gen - disk_gen) for all buckets. When it gets too big we have to
-	 * call prio_write() to keep gens from wrapping.
-	 */
-	uint8_t			need_save_prio;
-
-	/*
 	 * If nonzero, we know we aren't going to find any buckets to invalidate
 	 * until a gc finishes - otherwise we could pointlessly burn a ton of
 	 * cpu
@@ -848,9 +836,6 @@
 /*
  * bucket_gc_gen() returns the difference between the bucket's current gen and
  * the oldest gen of any pointer into that bucket in the btree (last_gc).
- *
- * bucket_disk_gen() returns the difference between the current gen and the gen
- * on disk; they're both used to make sure gens don't wrap around.
  */
 
 static inline uint8_t bucket_gc_gen(struct bucket *b)
@@ -858,13 +843,7 @@
 	return b->gen - b->last_gc;
 }
 
-static inline uint8_t bucket_disk_gen(struct bucket *b)
-{
-	return b->gen - b->disk_gen;
-}
-
 #define BUCKET_GC_GEN_MAX	96U
-#define BUCKET_DISK_GEN_MAX	64U
 
 #define kobj_attribute_write(n, fn)					\
 	static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn)
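With disk_gen gone, bucket_gc_gen() is the only remaining guard against gen
wraparound. The idiom works because gens are uint8_t and the subtraction is
modulo 256, which stays unambiguous as long as the delta is capped well below
the wrap point -- hence BUCKET_GC_GEN_MAX at 96. A standalone sketch of the
arithmetic (userspace illustration, not kernel code):

  #include <assert.h>
  #include <stdint.h>

  /* same idiom as bucket_gc_gen(): an unsigned 8-bit subtraction gives
   * the true distance even after the current gen wraps past 255,
   * provided the distance never approaches 256 */
  static uint8_t gen_delta(uint8_t cur, uint8_t old)
  {
          return cur - old;
  }

  int main(void)
  {
          assert(gen_delta(100, 40) == 60);   /* no wrap */
          assert(gen_delta(5, 250) == 11);    /* cur wrapped past 255 */
          return 0;
  }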
@@ -897,11 +876,14 @@
 
 uint8_t bch_inc_gen(struct cache *, struct bucket *);
 void bch_rescale_priorities(struct cache_set *, int);
-bool bch_bucket_add_unused(struct cache *, struct bucket *);
 
-long bch_bucket_alloc(struct cache *, unsigned, bool);
+bool bch_can_invalidate_bucket(struct cache *, struct bucket *);
+void __bch_invalidate_one_bucket(struct cache *, struct bucket *);
+
+void __bch_bucket_free(struct cache *, struct bucket *);
 void bch_bucket_free(struct cache_set *, struct bkey *);
 
+long bch_bucket_alloc(struct cache *, unsigned, bool);
 int __bch_bucket_alloc_set(struct cache_set *, unsigned,
 			   struct bkey *, int, bool);
 int bch_bucket_alloc_set(struct cache_set *, unsigned,
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index be90596..4c340c8 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1641,7 +1641,7 @@
 	mutex_unlock(&c->bucket_lock);
 }
 
-size_t bch_btree_gc_finish(struct cache_set *c)
+static size_t bch_btree_gc_finish(struct cache_set *c)
 {
 	size_t available = 0;
 	struct bucket *b;
@@ -1703,9 +1703,6 @@
 
 			if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
 				available++;
-
-			if (!GC_MARK(b))
-				bch_bucket_add_unused(ca, b);
 		}
 	}
 
@@ -1836,6 +1833,42 @@
 	return btree_root(check_recurse, c, &op);
 }
 
+void bch_initial_gc_finish(struct cache_set *c)
+{
+	struct cache *ca;
+	struct bucket *b;
+	unsigned i;
+
+	bch_btree_gc_finish(c);
+
+	mutex_lock(&c->bucket_lock);
+
+	/*
+	 * We need to put some unused buckets directly on the prio freelist in
+	 * order to get the allocator thread started - it needs freed buckets in
+	 * order to rewrite the prios and gens, and it needs to rewrite prios
+	 * and gens in order to free buckets.
+	 *
+	 * This is only safe for buckets that have no live data in them, which
+	 * there should always be some of.
+	 */
+	for_each_cache(ca, c, i) {
+		for_each_bucket(b, ca) {
+			if (fifo_full(&ca->free[RESERVE_PRIO]))
+				break;
+
+			if (bch_can_invalidate_bucket(ca, b) &&
+			    !GC_MARK(b)) {
+				__bch_invalidate_one_bucket(ca, b);
+				fifo_push(&ca->free[RESERVE_PRIO],
+					  b - ca->buckets);
+			}
+		}
+	}
+
+	mutex_unlock(&c->bucket_lock);
+}
+
 /* Btree insertion */
 
 static bool btree_insert_key(struct btree *b, struct bkey *k,
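The chicken-and-egg that bch_initial_gc_finish() breaks can be stated
concretely: a prios/gens write consumes a free bucket, and queued buckets
only become allocatable after such a write. A toy model (illustrative only;
the counters and helper below are made up for the sketch):

  #include <stdbool.h>
  #include <stdio.h>

  /* a prio write consumes one free bucket and is the only way
   * queued buckets become allocatable */
  static bool prio_write(int *reserve, int *queued, int *freed)
  {
          if (!*reserve)
                  return false;   /* the deadlock the comment describes */
          (*reserve)--;
          *freed += *queued;
          *queued = 0;
          return true;
  }

  int main(void)
  {
          int reserve = 0, queued = 8, freed = 0;

          if (!prio_write(&reserve, &queued, &freed))
                  puts("unseeded: allocator can never start");

          reserve = 2;            /* bch_initial_gc_finish() seeds these */
          if (prio_write(&reserve, &queued, &freed))
                  printf("seeded: freed %d buckets\n", freed);

          return 0;
  }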
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 3ce371f..91dfa5e 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -252,7 +252,7 @@
 		     atomic_t *, struct bkey *);
 
 int bch_gc_thread_start(struct cache_set *);
-size_t bch_btree_gc_finish(struct cache_set *);
+void bch_initial_gc_finish(struct cache_set *);
 void bch_moving_gc(struct cache_set *);
 int bch_btree_check(struct cache_set *);
 void bch_initial_mark_key(struct cache_set *, int, struct bkey *);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 2d4a562..a8c57d5 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -541,9 +541,6 @@
 	closure_sync(cl);
 }
 
-#define buckets_free(c)	"free %zu, free_inc %zu, unused %zu",		\
-	fifo_used(&c->free), fifo_used(&c->free_inc), fifo_used(&c->unused)
-
 void bch_prio_write(struct cache *ca)
 {
 	int i;
@@ -554,10 +551,6 @@
 
 	lockdep_assert_held(&ca->set->bucket_lock);
 
-	for (b = ca->buckets;
-	     b < ca->buckets + ca->sb.nbuckets; b++)
-		b->disk_gen = b->gen;
-
 	ca->disk_buckets->seq++;
 
 	atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
@@ -601,14 +594,17 @@
 
 	mutex_lock(&ca->set->bucket_lock);
 
-	ca->need_save_prio = 0;
-
 	/*
 	 * Don't want the old priorities to get garbage collected until after we
 	 * finish writing the new ones, and they're journalled
 	 */
-	for (i = 0; i < prio_buckets(ca); i++)
+	for (i = 0; i < prio_buckets(ca); i++) {
+		if (ca->prio_last_buckets[i])
+			__bch_bucket_free(ca,
+				&ca->buckets[ca->prio_last_buckets[i]]);
+
 		ca->prio_last_buckets[i] = ca->prio_buckets[i];
+	}
 }
 
 static void prio_read(struct cache *ca, uint64_t bucket)
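The loop added above is a double-buffer rotation: ca->prio_buckets[] holds
the generation just written, ca->prio_last_buckets[] keeps the previous one
until the new write is journalled, and anything older is now returned with
__bch_bucket_free(). A minimal sketch of the rotation (userspace
illustration; the arrays are stand-ins):

  #include <stdio.h>

  #define PRIO_BUCKETS 2

  int main(void)
  {
          /* cur: prio buckets just written; last: previous generation,
           * kept while the journal may still point at it */
          long cur[PRIO_BUCKETS]  = { 30, 31 };
          long last[PRIO_BUCKETS] = { 20, 21 };
          int i;

          for (i = 0; i < PRIO_BUCKETS; i++) {
                  if (last[i])    /* zero until the first rotation */
                          printf("free old prio bucket %ld\n", last[i]);

                  last[i] = cur[i];   /* just-written copy is the fallback */
          }

          return 0;
  }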
@@ -639,7 +635,7 @@
 		}
 
 		b->prio = le16_to_cpu(d->prio);
-		b->gen = b->disk_gen = b->last_gc = b->gc_gen = d->gen;
+		b->gen = b->last_gc = b->gc_gen = d->gen;
 	}
 }
 
@@ -1606,7 +1602,7 @@
 			goto err;
 
 		bch_journal_mark(c, &journal);
-		bch_btree_gc_finish(c);
+		bch_initial_gc_finish(c);
 		pr_debug("btree_check() done");
 
 		/*
@@ -1648,7 +1644,7 @@
 				ca->sb.d[j] = ca->sb.first_bucket + j;
 		}
 
-		bch_btree_gc_finish(c);
+		bch_initial_gc_finish(c);
 
 		err = "error starting allocator thread";
 		for_each_cache(ca, c, i)
@@ -1794,7 +1790,6 @@
 	vfree(ca->buckets);
 
 	free_heap(&ca->heap);
-	free_fifo(&ca->unused);
 	free_fifo(&ca->free_inc);
 
 	for (i = 0; i < RESERVE_NR; i++)
@@ -1831,7 +1826,6 @@
 	    !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
 	    !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
 	    !init_fifo(&ca->free_inc,	free << 2, GFP_KERNEL) ||
-	    !init_fifo(&ca->unused,	free << 2, GFP_KERNEL) ||
 	    !init_heap(&ca->heap,	free << 3, GFP_KERNEL) ||
 	    !(ca->buckets	= vzalloc(sizeof(struct bucket) *
 					  ca->sb.nbuckets)) ||