Blame - drivers/md/bcache/bset.c - kernel/msm-4.9

blob: cb4578a327b9d2841d43b58afd9be238d5d38a6c [file] [log] [blame]

Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	1	/*
				2	* Code for working with individual keys, and sorted sets of keys within a
				3	* btree node
				4	*
				5	* Copyright 2012 Google, Inc.
				6	*/
				7
				8	#include "bcache.h"
				9	#include "btree.h"
				10	#include "debug.h"
				11
				12	#include <linux/random.h>
Geert Uytterhoeven	cd953ed	2013-03-27 18:56:28 +0100	[diff] [blame]	13	#include <linux/prefetch.h>
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	14
				15	/* Keylists */
				16
				17	void bch_keylist_copy(struct keylist dest, struct keylist src)
				18	{
				19	dest = src;
				20
				21	if (src->list == src->d) {
				22	size_t n = (uint64_t *) src->top - src->d;
				23	dest->top = (struct bkey *) &dest->d[n];
				24	dest->list = dest->d;
				25	}
				26	}
				27
				28	int bch_keylist_realloc(struct keylist l, int nptrs, struct cache_set c)
				29	{
				30	unsigned oldsize = (uint64_t *) l->top - l->list;
				31	unsigned newsize = oldsize + 2 + nptrs;
				32	uint64_t *new;
				33
				34	/* The journalling code doesn't handle the case where the keys to insert
				35	* is bigger than an empty write: If we just return -ENOMEM here,
				36	* bio_insert() and bio_invalidate() will insert the keys created so far
				37	* and finish the rest when the keylist is empty.
				38	*/
				39	if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
				40	return -ENOMEM;
				41
				42	newsize = roundup_pow_of_two(newsize);
				43
				44	if (newsize <= KEYLIST_INLINE \|\|
				45	roundup_pow_of_two(oldsize) == newsize)
				46	return 0;
				47
				48	new = krealloc(l->list == l->d ? NULL : l->list,
				49	sizeof(uint64_t) * newsize, GFP_NOIO);
				50
				51	if (!new)
				52	return -ENOMEM;
				53
				54	if (l->list == l->d)
				55	memcpy(new, l->list, sizeof(uint64_t) * KEYLIST_INLINE);
				56
				57	l->list = new;
				58	l->top = (struct bkey *) (&l->list[oldsize]);
				59
				60	return 0;
				61	}
				62
				63	struct bkey bch_keylist_pop(struct keylist l)
				64	{
				65	struct bkey *k = l->bottom;
				66
				67	if (k == l->top)
				68	return NULL;
				69
				70	while (bkey_next(k) != l->top)
				71	k = bkey_next(k);
				72
				73	return l->top = k;
				74	}
				75
				76	/* Pointer validation */
				77
				78	bool __bch_ptr_invalid(struct cache_set c, int level, const struct bkey k)
				79	{
				80	unsigned i;
				81
				82	if (level && (!KEY_PTRS(k) \|\| !KEY_SIZE(k) \|\| KEY_DIRTY(k)))
				83	goto bad;
				84
				85	if (!level && KEY_SIZE(k) > KEY_OFFSET(k))
				86	goto bad;
				87
				88	if (!KEY_SIZE(k))
				89	return true;
				90
				91	for (i = 0; i < KEY_PTRS(k); i++)
				92	if (ptr_available(c, k, i)) {
				93	struct cache *ca = PTR_CACHE(c, k, i);
				94	size_t bucket = PTR_BUCKET_NR(c, k, i);
				95	size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
				96
				97	if (KEY_SIZE(k) + r > c->sb.bucket_size \|\|
				98	bucket < ca->sb.first_bucket \|\|
				99	bucket >= ca->sb.nbuckets)
				100	goto bad;
				101	}
				102
				103	return false;
				104	bad:
				105	cache_bug(c, "spotted bad key %s: %s", pkey(k), bch_ptr_status(c, k));
				106	return true;
				107	}
				108
				109	bool bch_ptr_bad(struct btree b, const struct bkey k)
				110	{
				111	struct bucket *g;
				112	unsigned i, stale;
				113
				114	if (!bkey_cmp(k, &ZERO_KEY) \|\|
				115	!KEY_PTRS(k) \|\|
				116	bch_ptr_invalid(b, k))
				117	return true;
				118
				119	if (KEY_PTRS(k) && PTR_DEV(k, 0) == PTR_CHECK_DEV)
				120	return true;
				121
				122	for (i = 0; i < KEY_PTRS(k); i++)
				123	if (ptr_available(b->c, k, i)) {
				124	g = PTR_BUCKET(b->c, k, i);
				125	stale = ptr_stale(b->c, k, i);
				126
				127	btree_bug_on(stale > 96, b,
				128	"key too stale: %i, need_gc %u",
				129	stale, b->c->need_gc);
				130
				131	btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
				132	b, "stale dirty pointer");
				133
				134	if (stale)
				135	return true;
				136
				137	#ifdef CONFIG_BCACHE_EDEBUG
				138	if (!mutex_trylock(&b->c->bucket_lock))
				139	continue;
				140
				141	if (b->level) {
				142	if (KEY_DIRTY(k) \|\|
				143	g->prio != BTREE_PRIO \|\|
				144	(b->c->gc_mark_valid &&
				145	GC_MARK(g) != GC_MARK_METADATA))
				146	goto bug;
				147
				148	} else {
				149	if (g->prio == BTREE_PRIO)
				150	goto bug;
				151
				152	if (KEY_DIRTY(k) &&
				153	b->c->gc_mark_valid &&
				154	GC_MARK(g) != GC_MARK_DIRTY)
				155	goto bug;
				156	}
				157	mutex_unlock(&b->c->bucket_lock);
				158	#endif
				159	}
				160
				161	return false;
				162	#ifdef CONFIG_BCACHE_EDEBUG
				163	bug:
				164	mutex_unlock(&b->c->bucket_lock);
Kent Overstreet	b1a67b0	2013-03-25 11:46:44 -0700	[diff] [blame]	165	btree_bug(b,
				166	"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
				167	pkey(k), PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	168	g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
				169	return true;
				170	#endif
				171	}
				172
				173	/* Key/pointer manipulation */
				174
				175	void bch_bkey_copy_single_ptr(struct bkey dest, const struct bkey src,
				176	unsigned i)
				177	{
				178	BUG_ON(i > KEY_PTRS(src));
				179
				180	/* Only copy the header, key, and one pointer. */
				181	memcpy(dest, src, 2 * sizeof(uint64_t));
				182	dest->ptr[0] = src->ptr[i];
				183	SET_KEY_PTRS(dest, 1);
				184	/* We didn't copy the checksum so clear that bit. */
				185	SET_KEY_CSUM(dest, 0);
				186	}
				187
				188	bool __bch_cut_front(const struct bkey where, struct bkey k)
				189	{
				190	unsigned i, len = 0;
				191
				192	if (bkey_cmp(where, &START_KEY(k)) <= 0)
				193	return false;
				194
				195	if (bkey_cmp(where, k) < 0)
				196	len = KEY_OFFSET(k) - KEY_OFFSET(where);
				197	else
				198	bkey_copy_key(k, where);
				199
				200	for (i = 0; i < KEY_PTRS(k); i++)
				201	SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + KEY_SIZE(k) - len);
				202
				203	BUG_ON(len > KEY_SIZE(k));
				204	SET_KEY_SIZE(k, len);
				205	return true;
				206	}
				207
				208	bool __bch_cut_back(const struct bkey where, struct bkey k)
				209	{
				210	unsigned len = 0;
				211
				212	if (bkey_cmp(where, k) >= 0)
				213	return false;
				214
				215	BUG_ON(KEY_INODE(where) != KEY_INODE(k));
				216
				217	if (bkey_cmp(where, &START_KEY(k)) > 0)
				218	len = KEY_OFFSET(where) - KEY_START(k);
				219
				220	bkey_copy_key(k, where);
				221
				222	BUG_ON(len > KEY_SIZE(k));
				223	SET_KEY_SIZE(k, len);
				224	return true;
				225	}
				226
				227	static uint64_t merge_chksums(struct bkey l, struct bkey r)
				228	{
				229	return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) &
				230	~((uint64_t)1 << 63);
				231	}
				232
				233	/* Tries to merge l and r: l should be lower than r
				234	* Returns true if we were able to merge. If we did merge, l will be the merged
				235	* key, r will be untouched.
				236	*/
				237	bool bch_bkey_try_merge(struct btree b, struct bkey l, struct bkey *r)
				238	{
				239	unsigned i;
				240
				241	if (key_merging_disabled(b->c))
				242	return false;
				243
				244	if (KEY_PTRS(l) != KEY_PTRS(r) \|\|
				245	KEY_DIRTY(l) != KEY_DIRTY(r) \|\|
				246	bkey_cmp(l, &START_KEY(r)))
				247	return false;
				248
				249	for (i = 0; i < KEY_PTRS(l); i++)
				250	if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] \|\|
				251	PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
				252	return false;
				253
				254	/* Keys with no pointers aren't restricted to one bucket and could
				255	* overflow KEY_SIZE
				256	*/
				257	if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) {
				258	SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l));
				259	SET_KEY_SIZE(l, USHRT_MAX);
				260
				261	bch_cut_front(l, r);
				262	return false;
				263	}
				264
				265	if (KEY_CSUM(l)) {
				266	if (KEY_CSUM(r))
				267	l->ptr[KEY_PTRS(l)] = merge_chksums(l, r);
				268	else
				269	SET_KEY_CSUM(l, 0);
				270	}
				271
				272	SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r));
				273	SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r));
				274
				275	return true;
				276	}
				277
				278	/* Binary tree stuff for auxiliary search trees */
				279
				/*
				 * In-order successor of node @j in an array-embedded binary tree whose
				 * valid indices are 1 .. @size - 1. Yields 0 when there is no successor.
				 */
				static unsigned inorder_next(unsigned j, unsigned size)
				{
					unsigned right = j * 2 + 1;

					/* No right child: climb until we leave a left subtree */
					if (right >= size)
						return j >> (ffz(j) + 1);

					/* Otherwise: leftmost node of the right subtree */
					for (j = right; j * 2 < size; j *= 2)
						;

					return j;
				}
				292
				/*
				 * In-order predecessor of node @j in an array-embedded binary tree whose
				 * valid indices are 1 .. @size - 1. Yields 0 when there is no predecessor.
				 */
				static unsigned inorder_prev(unsigned j, unsigned size)
				{
					unsigned left = j * 2;

					/* No left child: climb until we leave a right subtree */
					if (left >= size)
						return j >> ffs(j);

					/* Otherwise: rightmost node of the left subtree */
					for (j = left; j * 2 + 1 < size; j = j * 2 + 1)
						;

					return j;
				}
				305
				/* I have no idea why this code works... and I'm the one who wrote it
				 *
				 * However, I do know what it does:
				 * Given a binary tree constructed in an array (i.e. how you normally implement
				 * a heap), it converts a node in the tree - referenced by array index - to the
				 * index it would have if you did an inorder traversal.
				 *
				 * Also tested for every j, size up to size somewhere around 6 million.
				 *
				 * The binary tree starts at array index 1, not 0
				 * extra is a function of size:
				 *   extra = (size - rounddown_pow_of_two(size - 1)) << 1;
				 */
				static unsigned __to_inorder(unsigned j, unsigned size, unsigned extra)
				{
					unsigned b = fls(j);
					unsigned shift = fls(size - 1) - b;

					/* Drop j's top set bit, append a 1, then scale by the levels below */
					j  ^= 1U << (b - 1);
					j <<= 1;
					j  |= 1;
					j <<= shift;

					if (j > extra)
						j -= (j - extra) >> 1;

					return j;
				}
				334
				335	static unsigned to_inorder(unsigned j, struct bset_tree *t)
				336	{
				337	return __to_inorder(j, t->size, t->extra);
				338	}
				339
				/*
				 * Inverse of __to_inorder(): convert inorder position @j back to the
				 * array index of the node, for a tree of @size with the matching @extra.
				 */
				static unsigned __inorder_to_tree(unsigned j, unsigned size, unsigned extra)
				{
					unsigned shift;

					if (j > extra)
						j += j - extra;

					shift = ffs(j);

					j >>= shift;
					j  |= roundup_pow_of_two(size) >> shift;

					return j;
				}
				354
				355	static unsigned inorder_to_tree(unsigned j, struct bset_tree *t)
				356	{
				357	return __inorder_to_tree(j, t->size, t->extra);
				358	}
				359
				#if 0
				/*
				 * Compiled-out exhaustive check: for every tree size, walk the nodes in
				 * inorder and verify that __to_inorder()/__inorder_to_tree() agree and
				 * that inorder_prev() undoes inorder_next().
				 */
				void inorder_test(void)
				{
					unsigned long done = 0;
					ktime_t start = ktime_get();
					unsigned size;

					for (size = 2; size < 65536000; size++) {
						unsigned extra = (size - rounddown_pow_of_two(size - 1)) << 1;
						unsigned i = 1, j = rounddown_pow_of_two(size - 1);

						if (!(size % 4096))
							printk(KERN_NOTICE "loop %u, %llu per us\n", size,
							       done / ktime_us_delta(ktime_get(), start));

						for (;;) {
							if (__inorder_to_tree(i, size, extra) != j)
								panic("size %10u j %10u i %10u", size, j, i);

							if (__to_inorder(j, size, extra) != i)
								panic("size %10u j %10u i %10u", size, j, i);

							if (j == rounddown_pow_of_two(size) - 1)
								break;

							BUG_ON(inorder_prev(inorder_next(j, size), size) != j);

							j = inorder_next(j, size);
							i++;
						}

						done += size - 1;
					}
				}
				#endif
				396
				397	/*
				398	* Cacheline/offset <-> bkey pointer arithmetic:
				399	*
				400	* t->tree is a binary search tree in an array; each node corresponds to a key
				401	* in one cacheline in t->set (BSET_CACHELINE bytes).
				402	*
				403	* This means we don't have to store the full index of the key that a node in
				404	* the binary tree points to; to_inorder() gives us the cacheline, and then
				405	* bkey_float->m gives us the offset within that cacheline, in units of 8 bytes.
				406	*
				407	* cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
				408	* make this work.
				409	*
				410	* To construct the bfloat for an arbitrary key we need to know what the key
				411	* immediately preceding it is: we have to check if the two keys differ in the
				412	* bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size
				413	* of the previous key so we can walk backwards to it from t->tree[j]'s key.
				414	*/
				415
				416	static struct bkey cacheline_to_bkey(struct bset_tree t, unsigned cacheline,
				417	unsigned offset)
				418	{
				419	return ((void ) t->data) + cacheline BSET_CACHELINE + offset * 8;
				420	}
				421
				422	static unsigned bkey_to_cacheline(struct bset_tree t, struct bkey k)
				423	{
				424	return ((void ) k - (void ) t->data) / BSET_CACHELINE;
				425	}
				426
				427	static unsigned bkey_to_cacheline_offset(struct bkey *k)
				428	{
				429	return ((size_t) k & (BSET_CACHELINE - 1)) / sizeof(uint64_t);
				430	}
				431
				432	static struct bkey tree_to_bkey(struct bset_tree t, unsigned j)
				433	{
				434	return cacheline_to_bkey(t, to_inorder(j, t), t->tree[j].m);
				435	}
				436
				437	static struct bkey tree_to_prev_bkey(struct bset_tree t, unsigned j)
				438	{
				439	return (void ) (((uint64_t ) tree_to_bkey(t, j)) - t->prev[j]);
				440	}
				441
				442	/*
				443	* For the write set - the one we're currently inserting keys into - we don't
				444	* maintain a full search tree, we just keep a simple lookup table in t->prev.
				445	*/
				446	static struct bkey table_to_bkey(struct bset_tree t, unsigned cacheline)
				447	{
				448	return cacheline_to_bkey(t, cacheline, t->prev[cacheline]);
				449	}
				450
				/*
				 * Shift the 128-bit value @high:@low right by @shift bits and return the
				 * low 64 bits of the result. Callers in this file pass shift in 0..63.
				 */
				static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift)
				{
				#ifdef CONFIG_X86_64
					asm("shrd %[shift],%[high],%[low]"
					    : [low] "+Rm" (low)
					    : [high] "R" (high),
					    [shift] "ci" (shift)
					    : "cc");
				#else
					low >>= shift;
					/*
					 * Split into two shifts so that shift == 0 never shifts by 64,
					 * which would be undefined.
					 */
					low |= (high << 1) << (63U - shift);
				#endif
					return low;
				}
				465
				466	static inline unsigned bfloat_mantissa(const struct bkey *k,
				467	struct bkey_float *f)
				468	{
				469	const uint64_t *p = &k->low - (f->exponent >> 6);
				470	return shrd128(p[-1], p[0], f->exponent & 63) & BKEY_MANTISSA_MASK;
				471	}
				472
				473	static void make_bfloat(struct bset_tree *t, unsigned j)
				474	{
				475	struct bkey_float *f = &t->tree[j];
				476	struct bkey *m = tree_to_bkey(t, j);
				477	struct bkey *p = tree_to_prev_bkey(t, j);
				478
				479	struct bkey *l = is_power_of_2(j)
				480	? t->data->start
				481	: tree_to_prev_bkey(t, j >> ffs(j));
				482
				483	struct bkey *r = is_power_of_2(j + 1)
				484	? node(t->data, t->data->keys - bkey_u64s(&t->end))
				485	: tree_to_bkey(t, j >> (ffz(j) + 1));
				486
				487	BUG_ON(m < l \|\| m > r);
				488	BUG_ON(bkey_next(p) != m);
				489
				490	if (KEY_INODE(l) != KEY_INODE(r))
				491	f->exponent = fls64(KEY_INODE(r) ^ KEY_INODE(l)) + 64;
				492	else
				493	f->exponent = fls64(r->low ^ l->low);
				494
				495	f->exponent = max_t(int, f->exponent - BKEY_MANTISSA_BITS, 0);
				496
				497	/*
				498	* Setting f->exponent = 127 flags this node as failed, and causes the
				499	* lookup code to fall back to comparing against the original key.
				500	*/
				501
				502	if (bfloat_mantissa(m, f) != bfloat_mantissa(p, f))
				503	f->mantissa = bfloat_mantissa(m, f) - 1;
				504	else
				505	f->exponent = 127;
				506	}
				507
				508	static void bset_alloc_tree(struct btree b, struct bset_tree t)
				509	{
				510	if (t != b->sets) {
				511	unsigned j = roundup(t[-1].size,
				512	64 / sizeof(struct bkey_float));
				513
				514	t->tree = t[-1].tree + j;
				515	t->prev = t[-1].prev + j;
				516	}
				517
				518	while (t < b->sets + MAX_BSETS)
				519	t++->size = 0;
				520	}
				521
				522	static void bset_build_unwritten_tree(struct btree *b)
				523	{
				524	struct bset_tree *t = b->sets + b->nsets;
				525
				526	bset_alloc_tree(b, t);
				527
				528	if (t->tree != b->sets->tree + bset_tree_space(b)) {
				529	t->prev[0] = bkey_to_cacheline_offset(t->data->start);
				530	t->size = 1;
				531	}
				532	}
				533
				534	static void bset_build_written_tree(struct btree *b)
				535	{
				536	struct bset_tree *t = b->sets + b->nsets;
				537	struct bkey *k = t->data->start;
				538	unsigned j, cacheline = 1;
				539
				540	bset_alloc_tree(b, t);
				541
				542	t->size = min_t(unsigned,
				543	bkey_to_cacheline(t, end(t->data)),
				544	b->sets->tree + bset_tree_space(b) - t->tree);
				545
				546	if (t->size < 2) {
				547	t->size = 0;
				548	return;
				549	}
				550
				551	t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
				552
				553	/* First we figure out where the first key in each cacheline is */
				554	for (j = inorder_next(0, t->size);
				555	j;
				556	j = inorder_next(j, t->size)) {
				557	while (bkey_to_cacheline(t, k) != cacheline)
				558	k = bkey_next(k);
				559
				560	t->prev[j] = bkey_u64s(k);
				561	k = bkey_next(k);
				562	cacheline++;
				563	t->tree[j].m = bkey_to_cacheline_offset(k);
				564	}
				565
				566	while (bkey_next(k) != end(t->data))
				567	k = bkey_next(k);
				568
				569	t->end = *k;
				570
				571	/* Then we build the tree */
				572	for (j = inorder_next(0, t->size);
				573	j;
				574	j = inorder_next(j, t->size))
				575	make_bfloat(t, j);
				576	}
				577
				578	void bch_bset_fix_invalidated_key(struct btree b, struct bkey k)
				579	{
				580	struct bset_tree *t;
				581	unsigned inorder, j = 1;
				582
				583	for (t = b->sets; t <= &b->sets[b->nsets]; t++)
				584	if (k < end(t->data))
				585	goto found_set;
				586
				587	BUG();
				588	found_set:
				589	if (!t->size \|\| !bset_written(b, t))
				590	return;
				591
				592	inorder = bkey_to_cacheline(t, k);
				593
				594	if (k == t->data->start)
				595	goto fix_left;
				596
				597	if (bkey_next(k) == end(t->data)) {
				598	t->end = *k;
				599	goto fix_right;
				600	}
				601
				602	j = inorder_to_tree(inorder, t);
				603
				604	if (j &&
				605	j < t->size &&
				606	k == tree_to_bkey(t, j))
				607	fix_left: do {
				608	make_bfloat(t, j);
				609	j = j * 2;
				610	} while (j < t->size);
				611
				612	j = inorder_to_tree(inorder + 1, t);
				613
				614	if (j &&
				615	j < t->size &&
				616	k == tree_to_prev_bkey(t, j))
				617	fix_right: do {
				618	make_bfloat(t, j);
				619	j = j * 2 + 1;
				620	} while (j < t->size);
				621	}
				622
				623	void bch_bset_fix_lookup_table(struct btree b, struct bkey k)
				624	{
				625	struct bset_tree *t = &b->sets[b->nsets];
				626	unsigned shift = bkey_u64s(k);
				627	unsigned j = bkey_to_cacheline(t, k);
				628
				629	/* We're getting called from btree_split() or btree_gc, just bail out */
				630	if (!t->size)
				631	return;
				632
				633	/* k is the key we just inserted; we need to find the entry in the
				634	* lookup table for the first key that is strictly greater than k:
				635	* it's either k's cacheline or the next one
				636	*/
				637	if (j < t->size &&
				638	table_to_bkey(t, j) <= k)
				639	j++;
				640
				641	/* Adjust all the lookup table entries, and find a new key for any that
				642	* have gotten too big
				643	*/
				644	for (; j < t->size; j++) {
				645	t->prev[j] += shift;
				646
				647	if (t->prev[j] > 7) {
				648	k = table_to_bkey(t, j - 1);
				649
				650	while (k < cacheline_to_bkey(t, j, 0))
				651	k = bkey_next(k);
				652
				653	t->prev[j] = bkey_to_cacheline_offset(k);
				654	}
				655	}
				656
				657	if (t->size == b->sets->tree + bset_tree_space(b) - t->tree)
				658	return;
				659
				660	/* Possibly add a new entry to the end of the lookup table */
				661
				662	for (k = table_to_bkey(t, t->size - 1);
				663	k != end(t->data);
				664	k = bkey_next(k))
				665	if (t->size == bkey_to_cacheline(t, k)) {
				666	t->prev[t->size] = bkey_to_cacheline_offset(k);
				667	t->size++;
				668	}
				669	}
				670
				671	void bch_bset_init_next(struct btree *b)
				672	{
				673	struct bset *i = write_block(b);
				674
				675	if (i != b->sets[0].data) {
				676	b->sets[++b->nsets].data = i;
				677	i->seq = b->sets[0].data->seq;
				678	} else
				679	get_random_bytes(&i->seq, sizeof(uint64_t));
				680
				681	i->magic = bset_magic(b->c);
				682	i->version = 0;
				683	i->keys = 0;
				684
				685	bset_build_unwritten_tree(b);
				686	}
				687
				/*
				 * Result of the coarse search step: the final linear scan runs from l
				 * and stops at r.
				 */
				struct bset_search_iter {
					struct bkey *l, *r;
				};
				691
				692	static struct bset_search_iter bset_search_write_set(struct btree *b,
				693	struct bset_tree *t,
				694	const struct bkey *search)
				695	{
				696	unsigned li = 0, ri = t->size;
				697
				698	BUG_ON(!b->nsets &&
				699	t->size < bkey_to_cacheline(t, end(t->data)));
				700
				701	while (li + 1 != ri) {
				702	unsigned m = (li + ri) >> 1;
				703
				704	if (bkey_cmp(table_to_bkey(t, m), search) > 0)
				705	ri = m;
				706	else
				707	li = m;
				708	}
				709
				710	return (struct bset_search_iter) {
				711	table_to_bkey(t, li),
				712	ri < t->size ? table_to_bkey(t, ri) : end(t->data)
				713	};
				714	}
				715
				716	static struct bset_search_iter bset_search_tree(struct btree *b,
				717	struct bset_tree *t,
				718	const struct bkey *search)
				719	{
				720	struct bkey l, r;
				721	struct bkey_float *f;
				722	unsigned inorder, j, n = 1;
				723
				724	do {
				725	unsigned p = n << 4;
				726	p &= ((int) (p - t->size)) >> 31;
				727
				728	prefetch(&t->tree[p]);
				729
				730	j = n;
				731	f = &t->tree[j];
				732
				733	/*
				734	* n = (f->mantissa > bfloat_mantissa())
				735	* ? j * 2
				736	* : j * 2 + 1;
				737	*
				738	* We need to subtract 1 from f->mantissa for the sign bit trick
				739	* to work - that's done in make_bfloat()
				740	*/
				741	if (likely(f->exponent != 127))
				742	n = j * 2 + (((unsigned)
				743	(f->mantissa -
				744	bfloat_mantissa(search, f))) >> 31);
				745	else
				746	n = (bkey_cmp(tree_to_bkey(t, j), search) > 0)
				747	? j * 2
				748	: j * 2 + 1;
				749	} while (n < t->size);
				750
				751	inorder = to_inorder(j, t);
				752
				753	/*
				754	* n would have been the node we recursed to - the low bit tells us if
				755	* we recursed left or recursed right.
				756	*/
				757	if (n & 1) {
				758	l = cacheline_to_bkey(t, inorder, f->m);
				759
				760	if (++inorder != t->size) {
				761	f = &t->tree[inorder_next(j, t->size)];
				762	r = cacheline_to_bkey(t, inorder, f->m);
				763	} else
				764	r = end(t->data);
				765	} else {
				766	r = cacheline_to_bkey(t, inorder, f->m);
				767
				768	if (--inorder) {
				769	f = &t->tree[inorder_prev(j, t->size)];
				770	l = cacheline_to_bkey(t, inorder, f->m);
				771	} else
				772	l = t->data->start;
				773	}
				774
				775	return (struct bset_search_iter) {l, r};
				776	}
				777
				778	struct bkey __bch_bset_search(struct btree b, struct bset_tree *t,
				779	const struct bkey *search)
				780	{
				781	struct bset_search_iter i;
				782
				783	/*
				784	* First, we search for a cacheline, then lastly we do a linear search
				785	* within that cacheline.
				786	*
				787	* To search for the cacheline, there's three different possibilities:
				788	* * The set is too small to have a search tree, so we just do a linear
				789	* search over the whole set.
				790	* * The set is the one we're currently inserting into; keeping a full
				791	* auxiliary search tree up to date would be too expensive, so we
				792	* use a much simpler lookup table to do a binary search -
				793	* bset_search_write_set().
				794	* * Or we use the auxiliary search tree we constructed earlier -
				795	* bset_search_tree()
				796	*/
				797
				798	if (unlikely(!t->size)) {
				799	i.l = t->data->start;
				800	i.r = end(t->data);
				801	} else if (bset_written(b, t)) {
				802	/*
				803	* Each node in the auxiliary search tree covers a certain range
				804	* of bits, and keys above and below the set it covers might
				805	* differ outside those bits - so we have to special case the
				806	* start and end - handle that here:
				807	*/
				808
				809	if (unlikely(bkey_cmp(search, &t->end) >= 0))
				810	return end(t->data);
				811
				812	if (unlikely(bkey_cmp(search, t->data->start) < 0))
				813	return t->data->start;
				814
				815	i = bset_search_tree(b, t, search);
				816	} else
				817	i = bset_search_write_set(b, t, search);
				818
				819	#ifdef CONFIG_BCACHE_EDEBUG
				820	BUG_ON(bset_written(b, t) &&
				821	i.l != t->data->start &&
				822	bkey_cmp(tree_to_prev_bkey(t,
				823	inorder_to_tree(bkey_to_cacheline(t, i.l), t)),
				824	search) > 0);
				825
				826	BUG_ON(i.r != end(t->data) &&
				827	bkey_cmp(i.r, search) <= 0);
				828	#endif
				829
				830	while (likely(i.l != i.r) &&
				831	bkey_cmp(i.l, search) <= 0)
				832	i.l = bkey_next(i.l);
				833
				834	return i.l;
				835	}
				836
				837	/* Btree iterator */
				838
				839	static inline bool btree_iter_cmp(struct btree_iter_set l,
				840	struct btree_iter_set r)
				841	{
				842	int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k));
				843
				844	return c ? c > 0 : l.k < r.k;
				845	}
				846
				847	static inline bool btree_iter_end(struct btree_iter *iter)
				848	{
				849	return !iter->used;
				850	}
				851
				852	void bch_btree_iter_push(struct btree_iter iter, struct bkey k,
				853	struct bkey *end)
				854	{
				855	if (k != end)
				856	BUG_ON(!heap_add(iter,
				857	((struct btree_iter_set) { k, end }),
				858	btree_iter_cmp));
				859	}
				860
				861	struct bkey __bch_btree_iter_init(struct btree b, struct btree_iter *iter,
				862	struct bkey search, struct bset_tree start)
				863	{
				864	struct bkey *ret = NULL;
				865	iter->size = ARRAY_SIZE(iter->data);
				866	iter->used = 0;
				867
				868	for (; start <= &b->sets[b->nsets]; start++) {
				869	ret = bch_bset_search(b, start, search);
				870	bch_btree_iter_push(iter, ret, end(start->data));
				871	}
				872
				873	return ret;
				874	}
				875
				876	struct bkey bch_btree_iter_next(struct btree_iter iter)
				877	{
				878	struct btree_iter_set unused;
				879	struct bkey *ret = NULL;
				880
				881	if (!btree_iter_end(iter)) {
				882	ret = iter->data->k;
				883	iter->data->k = bkey_next(iter->data->k);
				884
				885	if (iter->data->k > iter->data->end) {
Kent Overstreet	cc0f4ea	2013-03-27 12:47:45 -0700	[diff] [blame]	886	WARN_ONCE(1, "bset was corrupt!\n");
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	887	iter->data->k = iter->data->end;
				888	}
				889
				890	if (iter->data->k == iter->data->end)
				891	heap_pop(iter, unused, btree_iter_cmp);
				892	else
				893	heap_sift(iter, 0, btree_iter_cmp);
				894	}
				895
				896	return ret;
				897	}
				898
				899	struct bkey bch_btree_iter_next_filter(struct btree_iter iter,
				900	struct btree *b, ptr_filter_fn fn)
				901	{
				902	struct bkey *ret;
				903
				904	do {
				905	ret = bch_btree_iter_next(iter);
				906	} while (ret && fn(b, ret));
				907
				908	return ret;
				909	}
				910
				911	struct bkey bch_next_recurse_key(struct btree b, struct bkey *search)
				912	{
				913	struct btree_iter iter;
				914
				915	bch_btree_iter_init(b, &iter, search);
				916	return bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
				917	}
				918
				919	/* Mergesort */
				920
				921	static void btree_sort_fixup(struct btree_iter *iter)
				922	{
				923	while (iter->used > 1) {
				924	struct btree_iter_set top = iter->data, i = top + 1;
				925	struct bkey *k;
				926
				927	if (iter->used > 2 &&
				928	btree_iter_cmp(i[0], i[1]))
				929	i++;
				930
				931	for (k = i->k;
				932	k != i->end && bkey_cmp(top->k, &START_KEY(k)) > 0;
				933	k = bkey_next(k))
				934	if (top->k > i->k)
				935	__bch_cut_front(top->k, k);
				936	else if (KEY_SIZE(k))
				937	bch_cut_back(&START_KEY(k), top->k);
				938
				939	if (top->k < i->k \|\| k == i->k)
				940	break;
				941
				942	heap_sift(iter, i - top, btree_iter_cmp);
				943	}
				944	}
				945
				946	static void btree_mergesort(struct btree b, struct bset out,
				947	struct btree_iter *iter,
				948	bool fixup, bool remove_stale)
				949	{
				950	struct bkey k, last = NULL;
				951	bool (bad)(struct btree , const struct bkey *) = remove_stale
				952	? bch_ptr_bad
				953	: bch_ptr_invalid;
				954
				955	while (!btree_iter_end(iter)) {
				956	if (fixup && !b->level)
				957	btree_sort_fixup(iter);
				958
				959	k = bch_btree_iter_next(iter);
				960	if (bad(b, k))
				961	continue;
				962
				963	if (!last) {
				964	last = out->start;
				965	bkey_copy(last, k);
				966	} else if (b->level \|\|
				967	!bch_bkey_try_merge(b, last, k)) {
				968	last = bkey_next(last);
				969	bkey_copy(last, k);
				970	}
				971	}
				972
				973	out->keys = last ? (uint64_t *) bkey_next(last) - out->d : 0;
				974
				975	pr_debug("sorted %i keys", out->keys);
				976	bch_check_key_order(b, out);
				977	}
				978
				979	static void __btree_sort(struct btree b, struct btree_iter iter,
				980	unsigned start, unsigned order, bool fixup)
				981	{
				982	uint64_t start_time;
				983	bool remove_stale = !b->written;
				984	struct bset out = (void ) __get_free_pages(__GFP_NOWARN\|GFP_NOIO,
				985	order);
				986	if (!out) {
				987	mutex_lock(&b->c->sort_lock);
				988	out = b->c->sort;
				989	order = ilog2(bucket_pages(b->c));
				990	}
				991
				992	start_time = local_clock();
				993
				994	btree_mergesort(b, out, iter, fixup, remove_stale);
				995	b->nsets = start;
				996
				997	if (!fixup && !start && b->written)
				998	bch_btree_verify(b, out);
				999
				1000	if (!start && order == b->page_order) {
				1001	/*
				1002	* Our temporary buffer is the same size as the btree node's
				1003	* buffer, we can just swap buffers instead of doing a big
				1004	* memcpy()
				1005	*/
				1006
				1007	out->magic = bset_magic(b->c);
				1008	out->seq = b->sets[0].data->seq;
				1009	out->version = b->sets[0].data->version;
				1010	swap(out, b->sets[0].data);
				1011
				1012	if (b->c->sort == b->sets[0].data)
				1013	b->c->sort = out;
				1014	} else {
				1015	b->sets[start].data->keys = out->keys;
				1016	memcpy(b->sets[start].data->start, out->start,
				1017	(void ) end(out) - (void ) out->start);
				1018	}
				1019
				1020	if (out == b->c->sort)
				1021	mutex_unlock(&b->c->sort_lock);
				1022	else
				1023	free_pages((unsigned long) out, order);
				1024
				1025	if (b->written)
				1026	bset_build_written_tree(b);
				1027
				1028	if (!start) {
				1029	spin_lock(&b->c->sort_time_lock);
Kent Overstreet	169ef1c	2013-03-28 12:50:55 -0600	[diff] [blame]	1030	bch_time_stats_update(&b->c->sort_time, start_time);
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	1031	spin_unlock(&b->c->sort_time_lock);
				1032	}
				1033	}
				1034
				1035	void bch_btree_sort_partial(struct btree *b, unsigned start)
				1036	{
				1037	size_t oldsize = 0, order = b->page_order, keys = 0;
				1038	struct btree_iter iter;
				1039	__bch_btree_iter_init(b, &iter, NULL, &b->sets[start]);
				1040
				1041	BUG_ON(b->sets[b->nsets].data == write_block(b) &&
				1042	(b->sets[b->nsets].size \|\| b->nsets));
				1043
				1044	if (b->written)
				1045	oldsize = bch_count_data(b);
				1046
				1047	if (start) {
				1048	unsigned i;
				1049
				1050	for (i = start; i <= b->nsets; i++)
				1051	keys += b->sets[i].data->keys;
				1052
Kent Overstreet	b1a67b0	2013-03-25 11:46:44 -0700	[diff] [blame]	1053	order = roundup_pow_of_two(__set_bytes(b->sets->data,
				1054	keys)) / PAGE_SIZE;
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	1055	if (order)
				1056	order = ilog2(order);
				1057	}
				1058
				1059	__btree_sort(b, &iter, start, order, false);
				1060
				1061	EBUG_ON(b->written && bch_count_data(b) != oldsize);
				1062	}
				1063
				1064	void bch_btree_sort_and_fix_extents(struct btree b, struct btree_iter iter)
				1065	{
				1066	BUG_ON(!b->written);
				1067	__btree_sort(b, iter, 0, b->page_order, true);
				1068	}
				1069
				1070	void bch_btree_sort_into(struct btree b, struct btree new)
				1071	{
				1072	uint64_t start_time = local_clock();
				1073
				1074	struct btree_iter iter;
				1075	bch_btree_iter_init(b, &iter, NULL);
				1076
				1077	btree_mergesort(b, new->sets->data, &iter, false, true);
				1078
				1079	spin_lock(&b->c->sort_time_lock);
Kent Overstreet	169ef1c	2013-03-28 12:50:55 -0600	[diff] [blame]	1080	bch_time_stats_update(&b->c->sort_time, start_time);
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	1081	spin_unlock(&b->c->sort_time_lock);
				1082
				1083	bkey_copy_key(&new->key, &b->key);
				1084	new->sets->size = 0;
				1085	}
				1086
				1087	void bch_btree_sort_lazy(struct btree *b)
				1088	{
				1089	if (b->nsets) {
				1090	unsigned i, j, keys = 0, total;
				1091
				1092	for (i = 0; i <= b->nsets; i++)
				1093	keys += b->sets[i].data->keys;
				1094
				1095	total = keys;
				1096
				1097	for (j = 0; j < b->nsets; j++) {
				1098	if (keys * 2 < total \|\|
				1099	keys < 1000) {
				1100	bch_btree_sort_partial(b, j);
				1101	return;
				1102	}
				1103
				1104	keys -= b->sets[j].data->keys;
				1105	}
				1106
				1107	/* Must sort if b->nsets == 3 or we'll overflow */
				1108	if (b->nsets >= (MAX_BSETS - 1) - b->level) {
				1109	bch_btree_sort(b);
				1110	return;
				1111	}
				1112	}
				1113
				1114	bset_build_written_tree(b);
				1115	}
				1116
				1117	/* Sysfs stuff */
				1118
				/* Counters accumulated by bch_btree_bset_stats() for the sysfs report. */
				struct bset_stats {
					size_t nodes;		/* btree nodes visited */
					size_t sets_written;
					size_t sets_unwritten;
					size_t bytes_written;	/* key bytes in written sets */
					size_t bytes_unwritten;	/* key bytes in unwritten sets */
					size_t floats;		/* search tree nodes in written sets */
					size_t failed;		/* of those, nodes with exponent 127 */
				};
				1125
				1126	static int bch_btree_bset_stats(struct btree b, struct btree_op op,
				1127	struct bset_stats *stats)
				1128	{
				1129	struct bkey *k;
				1130	unsigned i;
				1131
				1132	stats->nodes++;
				1133
				1134	for (i = 0; i <= b->nsets; i++) {
				1135	struct bset_tree *t = &b->sets[i];
				1136	size_t bytes = t->data->keys * sizeof(uint64_t);
				1137	size_t j;
				1138
				1139	if (bset_written(b, t)) {
				1140	stats->sets_written++;
				1141	stats->bytes_written += bytes;
				1142
				1143	stats->floats += t->size - 1;
				1144
				1145	for (j = 1; j < t->size; j++)
				1146	if (t->tree[j].exponent == 127)
				1147	stats->failed++;
				1148	} else {
				1149	stats->sets_unwritten++;
				1150	stats->bytes_unwritten += bytes;
				1151	}
				1152	}
				1153
				1154	if (b->level) {
				1155	struct btree_iter iter;
				1156
				1157	for_each_key_filter(b, k, &iter, bch_ptr_bad) {
				1158	int ret = btree(bset_stats, k, b, op, stats);
				1159	if (ret)
				1160	return ret;
				1161	}
				1162	}
				1163
				1164	return 0;
				1165	}
				1166
				1167	int bch_bset_print_stats(struct cache_set c, char buf)
				1168	{
				1169	struct btree_op op;
				1170	struct bset_stats t;
				1171	int ret;
				1172
				1173	bch_btree_op_init_stack(&op);
				1174	memset(&t, 0, sizeof(struct bset_stats));
				1175
				1176	ret = btree_root(bset_stats, c, &op, &t);
				1177	if (ret)
				1178	return ret;
				1179
				1180	return snprintf(buf, PAGE_SIZE,
				1181	"btree nodes: %zu\n"
				1182	"written sets: %zu\n"
				1183	"unwritten sets: %zu\n"
				1184	"written key bytes: %zu\n"
				1185	"unwritten key bytes: %zu\n"
				1186	"floats: %zu\n"
				1187	"failed: %zu\n",
				1188	t.nodes,
				1189	t.sets_written, t.sets_unwritten,
				1190	t.bytes_written, t.bytes_unwritten,
				1191	t.floats, t.failed);
				1192	}