1/*
2 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
3 *
4 * Uses a block device as cache for other block devices; optimized for SSDs.
5 * All allocation is done in buckets, which should match the erase block size
6 * of the device.
7 *
8 * Buckets containing cached data are kept on a heap sorted by priority;
9 * bucket priority is increased on cache hit, and periodically all the buckets
10 * on the heap have their priority scaled down. This currently is just used as
11 * an LRU but in the future should allow for more intelligent heuristics.
12 *
13 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
14 * counter. Garbage collection is used to remove stale pointers.
15 *
16 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
17 * as keys are inserted we only sort the pages that have not yet been written.
 18 * When garbage collection is run, we re-sort the entire node.
19 *
20 * All configuration is done via sysfs; see Documentation/bcache.txt.
21 */
22
23#include "bcache.h"
24#include "btree.h"
25#include "debug.h"
Kent Overstreet279afba2013-06-05 06:21:07 -070026#include "writeback.h"
Kent Overstreetcafe5632013-03-23 16:11:31 -070027
28#include <linux/slab.h>
29#include <linux/bitops.h>
Kent Overstreet72a44512013-10-24 17:19:26 -070030#include <linux/freezer.h>
Kent Overstreetcafe5632013-03-23 16:11:31 -070031#include <linux/hash.h>
Kent Overstreet72a44512013-10-24 17:19:26 -070032#include <linux/kthread.h>
Geert Uytterhoevencd953ed2013-03-27 18:56:28 +010033#include <linux/prefetch.h>
Kent Overstreetcafe5632013-03-23 16:11:31 -070034#include <linux/random.h>
35#include <linux/rcupdate.h>
36#include <trace/events/bcache.h>
37
38/*
39 * Todo:
40 * register_bcache: Return errors out to userspace correctly
41 *
42 * Writeback: don't undirty key until after a cache flush
43 *
44 * Create an iterator for key pointers
45 *
46 * On btree write error, mark bucket such that it won't be freed from the cache
47 *
48 * Journalling:
49 * Check for bad keys in replay
50 * Propagate barriers
51 * Refcount journal entries in journal_replay
52 *
53 * Garbage collection:
54 * Finish incremental gc
55 * Gc should free old UUIDs, data for invalid UUIDs
56 *
57 * Provide a way to list backing device UUIDs we have data cached for, and
58 * probably how long it's been since we've seen them, and a way to invalidate
59 * dirty data for devices that will never be attached again
60 *
61 * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
62 * that based on that and how much dirty data we have we can keep writeback
63 * from being starved
64 *
65 * Add a tracepoint or somesuch to watch for writeback starvation
66 *
67 * When btree depth > 1 and splitting an interior node, we have to make sure
68 * alloc_bucket() cannot fail. This should be true but is not completely
69 * obvious.
70 *
71 * Make sure all allocations get charged to the root cgroup
72 *
73 * Plugging?
74 *
75 * If data write is less than hard sector size of ssd, round up offset in open
76 * bucket to the next whole sector
77 *
78 * Also lookup by cgroup in get_open_bucket()
79 *
80 * Superblock needs to be fleshed out for multiple cache devices
81 *
82 * Add a sysfs tunable for the number of writeback IOs in flight
83 *
84 * Add a sysfs tunable for the number of open data buckets
85 *
86 * IO tracking: Can we track when one process is doing io on behalf of another?
87 * IO tracking: Don't use just an average, weigh more recent stuff higher
88 *
89 * Test module load/unload
90 */
91
Kent Overstreetdf8e8972013-07-24 17:37:59 -070092enum {
93 BTREE_INSERT_STATUS_INSERT,
94 BTREE_INSERT_STATUS_BACK_MERGE,
95 BTREE_INSERT_STATUS_OVERWROTE,
96 BTREE_INSERT_STATUS_FRONT_MERGE,
97};
98
Kent Overstreetcafe5632013-03-23 16:11:31 -070099#define MAX_NEED_GC 64
100#define MAX_SAVE_PRIO 72
101
102#define PTR_DIRTY_BIT (((uint64_t) 1 << 36))
103
104#define PTR_HASH(c, k) \
105 (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
106
Kent Overstreetcafe5632013-03-23 16:11:31 -0700107static struct workqueue_struct *btree_io_wq;
108
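/*
 * Split once the bsets already written fill the whole btree bucket, or once
 * writing out the current in-memory set would overflow it (the "+ 15"
 * presumably leaves a little slack for keys about to be inserted).
 */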
Kent Overstreetdf8e8972013-07-24 17:37:59 -0700109static inline bool should_split(struct btree *b)
110{
111 struct bset *i = write_block(b);
112 return b->written >= btree_blocks(b) ||
113 (b->written + __set_blocks(i, i->keys + 15, b->c)
114 > btree_blocks(b));
115}
116
117#define insert_lock(s, b) ((b)->level <= (s)->lock)
118
119/*
120 * These macros are for recursing down the btree - they handle the details of
121 * locking and looking up nodes in the cache for you. They're best treated as
122 * mere syntax when reading code that uses them.
123 *
124 * op->lock determines whether we take a read or a write lock at a given depth.
125 * If you've got a read lock and find that you need a write lock (i.e. you're
126 * going to have to split), set op->lock and return -EINTR; btree_root() will
127 * call you again and you'll have the correct lock.
128 */
129
130/**
131 * btree - recurse down the btree on a specified key
132 * @fn: function to call, which will be passed the child node
133 * @key: key to recurse on
134 * @b: parent btree node
135 * @op: pointer to struct btree_op
136 */
137#define btree(fn, key, b, op, ...) \
138({ \
139 int _r, l = (b)->level - 1; \
140 bool _w = l <= (op)->lock; \
141 struct btree *_child = bch_btree_node_get((b)->c, key, l, _w); \
142 if (!IS_ERR(_child)) { \
143 _child->parent = (b); \
144 _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \
145 rw_unlock(_w, _child); \
146 } else \
147 _r = PTR_ERR(_child); \
148 _r; \
149})
150
151/**
152 * btree_root - call a function on the root of the btree
153 * @fn: function to call, which will be passed the child node
154 * @c: cache set
155 * @op: pointer to struct btree_op
156 */
157#define btree_root(fn, c, op, ...) \
158({ \
159 int _r = -EINTR; \
160 do { \
161 struct btree *_b = (c)->root; \
162 bool _w = insert_lock(op, _b); \
163 rw_lock(_w, _b, _b->level); \
164 if (_b == (c)->root && \
165 _w == insert_lock(op, _b)) { \
166 _b->parent = NULL; \
167 _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \
168 } \
169 rw_unlock(_w, _b); \
170 bch_cannibalize_unlock(c); \
171 if (_r == -ENOSPC) { \
172 wait_event((c)->try_wait, \
173 !(c)->try_harder); \
174 _r = -EINTR; \
175 } \
176 } while (_r == -EINTR); \
177 \
178 _r; \
179})
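/*
 * Illustrative (hypothetical) usage of the two macros above: given a
 * per-node callback bch_btree_frob(struct btree *b, struct btree_op *op, ...),
 * a traversal would start at the root with
 *
 *	ret = btree_root(frob, c, &op);
 *
 * and recurse into the child covering @key with
 *
 *	ret = btree(frob, key, b, op);
 *
 * btree_root() retries on -EINTR (e.g. after a lock upgrade or while waiting
 * for the cannibalize lock), so callbacks must be safe to re-run.
 */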
180
Kent Overstreetcafe5632013-03-23 16:11:31 -0700181/* Btree key manipulation */
182
Kent Overstreet3a3b6a42013-07-24 16:46:42 -0700183void bkey_put(struct cache_set *c, struct bkey *k)
Kent Overstreete7c590e2013-09-10 18:39:16 -0700184{
185 unsigned i;
186
187 for (i = 0; i < KEY_PTRS(k); i++)
188 if (ptr_available(c, k, i))
189 atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
190}
191
Kent Overstreetcafe5632013-03-23 16:11:31 -0700192/* Btree IO */
193
194static uint64_t btree_csum_set(struct btree *b, struct bset *i)
195{
196 uint64_t crc = b->key.ptr[0];
197 void *data = (void *) i + 8, *end = end(i);
198
Kent Overstreet169ef1c2013-03-28 12:50:55 -0600199 crc = bch_crc64_update(crc, data, end - data);
Kent Overstreetc19ed232013-03-26 13:49:02 -0700200 return crc ^ 0xffffffffffffffffULL;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700201}
202
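/*
 * Parse a btree node that has just been read from disk: walk the bsets that
 * were appended to the bucket one after another, validating each one's
 * version, magic and checksum, push them onto a btree_iter, then sort
 * everything into a single in-memory set (fixing up overlapping extents).
 */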
Kent Overstreetf3059a52013-05-15 17:13:45 -0700203static void bch_btree_node_read_done(struct btree *b)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700204{
Kent Overstreetcafe5632013-03-23 16:11:31 -0700205 const char *err = "bad btree header";
Kent Overstreet57943512013-04-25 13:58:35 -0700206 struct bset *i = b->sets[0].data;
207 struct btree_iter *iter;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700208
Kent Overstreet57943512013-04-25 13:58:35 -0700209 iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
210 iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700211 iter->used = 0;
212
Kent Overstreet280481d2013-10-24 16:36:03 -0700213#ifdef CONFIG_BCACHE_DEBUG
214 iter->b = b;
215#endif
216
Kent Overstreet57943512013-04-25 13:58:35 -0700217 if (!i->seq)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700218 goto err;
219
220 for (;
221 b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq;
222 i = write_block(b)) {
223 err = "unsupported bset version";
224 if (i->version > BCACHE_BSET_VERSION)
225 goto err;
226
227 err = "bad btree header";
228 if (b->written + set_blocks(i, b->c) > btree_blocks(b))
229 goto err;
230
231 err = "bad magic";
Kent Overstreet81ab4192013-10-31 15:46:42 -0700232 if (i->magic != bset_magic(&b->c->sb))
Kent Overstreetcafe5632013-03-23 16:11:31 -0700233 goto err;
234
235 err = "bad checksum";
236 switch (i->version) {
237 case 0:
238 if (i->csum != csum_set(i))
239 goto err;
240 break;
241 case BCACHE_BSET_VERSION:
242 if (i->csum != btree_csum_set(b, i))
243 goto err;
244 break;
245 }
246
247 err = "empty set";
248 if (i != b->sets[0].data && !i->keys)
249 goto err;
250
251 bch_btree_iter_push(iter, i->start, end(i));
252
253 b->written += set_blocks(i, b->c);
254 }
255
256 err = "corrupted btree";
257 for (i = write_block(b);
258 index(i, b) < btree_blocks(b);
259 i = ((void *) i) + block_bytes(b->c))
260 if (i->seq == b->sets[0].data->seq)
261 goto err;
262
263 bch_btree_sort_and_fix_extents(b, iter);
264
265 i = b->sets[0].data;
266 err = "short btree key";
267 if (b->sets[0].size &&
268 bkey_cmp(&b->key, &b->sets[0].end) < 0)
269 goto err;
270
271 if (b->written < btree_blocks(b))
272 bch_bset_init_next(b);
273out:
Kent Overstreet57943512013-04-25 13:58:35 -0700274 mempool_free(iter, b->c->fill_iter);
275 return;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700276err:
277 set_btree_node_io_error(b);
Kent Overstreet07e86cc2013-03-25 11:46:43 -0700278 bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys",
Kent Overstreetcafe5632013-03-23 16:11:31 -0700279 err, PTR_BUCKET_NR(b->c, &b->key, 0),
280 index(i, b), i->keys);
281 goto out;
282}
283
Kent Overstreet57943512013-04-25 13:58:35 -0700284static void btree_node_read_endio(struct bio *bio, int error)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700285{
Kent Overstreet57943512013-04-25 13:58:35 -0700286 struct closure *cl = bio->bi_private;
287 closure_put(cl);
288}
Kent Overstreetcafe5632013-03-23 16:11:31 -0700289
Kent Overstreet57943512013-04-25 13:58:35 -0700290void bch_btree_node_read(struct btree *b)
291{
292 uint64_t start_time = local_clock();
293 struct closure cl;
294 struct bio *bio;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700295
Kent Overstreetc37511b2013-04-26 15:39:55 -0700296 trace_bcache_btree_read(b);
297
Kent Overstreet57943512013-04-25 13:58:35 -0700298 closure_init_stack(&cl);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700299
Kent Overstreet57943512013-04-25 13:58:35 -0700300 bio = bch_bbio_alloc(b->c);
301 bio->bi_rw = REQ_META|READ_SYNC;
Kent Overstreet4f024f32013-10-11 15:44:27 -0700302 bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9;
Kent Overstreet57943512013-04-25 13:58:35 -0700303 bio->bi_end_io = btree_node_read_endio;
304 bio->bi_private = &cl;
305
306 bch_bio_map(bio, b->sets[0].data);
307
Kent Overstreet57943512013-04-25 13:58:35 -0700308 bch_submit_bbio(bio, b->c, &b->key, 0);
309 closure_sync(&cl);
310
311 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
312 set_btree_node_io_error(b);
313
314 bch_bbio_free(bio, b->c);
315
316 if (btree_node_io_error(b))
317 goto err;
318
319 bch_btree_node_read_done(b);
Kent Overstreet57943512013-04-25 13:58:35 -0700320 bch_time_stats_update(&b->c->btree_read_time, start_time);
Kent Overstreet57943512013-04-25 13:58:35 -0700321
322 return;
323err:
Geert Uytterhoeven61cbd252013-09-23 23:17:30 -0700324 bch_cache_set_error(b->c, "io error reading bucket %zu",
Kent Overstreet57943512013-04-25 13:58:35 -0700325 PTR_BUCKET_NR(b->c, &b->key, 0));
Kent Overstreetcafe5632013-03-23 16:11:31 -0700326}
327
328static void btree_complete_write(struct btree *b, struct btree_write *w)
329{
330 if (w->prio_blocked &&
331 !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
Kent Overstreet119ba0f2013-04-24 19:01:12 -0700332 wake_up_allocators(b->c);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700333
334 if (w->journal) {
335 atomic_dec_bug(w->journal);
336 __closure_wake_up(&b->c->journal.wait);
337 }
338
Kent Overstreetcafe5632013-03-23 16:11:31 -0700339 w->prio_blocked = 0;
340 w->journal = NULL;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700341}
342
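/*
 * Write completion path: __btree_node_write_done() is the common tail - free
 * the bbio, finish the btree_write and requeue the delayed work if the node
 * is still dirty. btree_node_write_done() additionally frees the pages that
 * do_btree_node_write() allocated when it bounced the write.
 */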
Kent Overstreet57943512013-04-25 13:58:35 -0700343static void __btree_node_write_done(struct closure *cl)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700344{
345 struct btree *b = container_of(cl, struct btree, io.cl);
346 struct btree_write *w = btree_prev_write(b);
347
348 bch_bbio_free(b->bio, b->c);
349 b->bio = NULL;
350 btree_complete_write(b, w);
351
352 if (btree_node_dirty(b))
353 queue_delayed_work(btree_io_wq, &b->work,
354 msecs_to_jiffies(30000));
355
356 closure_return(cl);
357}
358
Kent Overstreet57943512013-04-25 13:58:35 -0700359static void btree_node_write_done(struct closure *cl)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700360{
361 struct btree *b = container_of(cl, struct btree, io.cl);
362 struct bio_vec *bv;
363 int n;
364
Kent Overstreet79886132013-11-23 17:19:00 -0800365 bio_for_each_segment_all(bv, b->bio, n)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700366 __free_page(bv->bv_page);
367
Kent Overstreet57943512013-04-25 13:58:35 -0700368 __btree_node_write_done(cl);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700369}
370
Kent Overstreet57943512013-04-25 13:58:35 -0700371static void btree_node_write_endio(struct bio *bio, int error)
372{
373 struct closure *cl = bio->bi_private;
374 struct btree *b = container_of(cl, struct btree, io.cl);
375
376 if (error)
377 set_btree_node_io_error(b);
378
379 bch_bbio_count_io_errors(b->c, bio, error, "writing btree");
380 closure_put(cl);
381}
382
383static void do_btree_node_write(struct btree *b)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700384{
385 struct closure *cl = &b->io.cl;
386 struct bset *i = b->sets[b->nsets].data;
387 BKEY_PADDED(key) k;
388
389 i->version = BCACHE_BSET_VERSION;
390 i->csum = btree_csum_set(b, i);
391
Kent Overstreet57943512013-04-25 13:58:35 -0700392 BUG_ON(b->bio);
393 b->bio = bch_bbio_alloc(b->c);
394
395 b->bio->bi_end_io = btree_node_write_endio;
Kent Overstreetfaadf0c2013-11-01 18:03:08 -0700396 b->bio->bi_private = cl;
Kent Overstreete49c7c32013-06-26 17:25:38 -0700397 b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA;
Kent Overstreet4f024f32013-10-11 15:44:27 -0700398 b->bio->bi_iter.bi_size = set_blocks(i, b->c) * block_bytes(b->c);
Kent Overstreet169ef1c2013-03-28 12:50:55 -0600399 bch_bio_map(b->bio, i);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700400
Kent Overstreete49c7c32013-06-26 17:25:38 -0700401 /*
402 * If we're appending to a leaf node, we don't technically need FUA -
403 * this write just needs to be persisted before the next journal write,
404 * which will be marked FLUSH|FUA.
405 *
406 * Similarly if we're writing a new btree root - the pointer is going to
407 * be in the next journal entry.
408 *
409 * But if we're writing a new btree node (that isn't a root) or
410 * appending to a non leaf btree node, we need either FUA or a flush
411 * when we write the parent with the new pointer. FUA is cheaper than a
412 * flush, and writes appending to leaf nodes aren't blocking anything so
413 * just make all btree node writes FUA to keep things sane.
414 */
415
Kent Overstreetcafe5632013-03-23 16:11:31 -0700416 bkey_copy(&k.key, &b->key);
417 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));
418
Kent Overstreet8e51e412013-06-06 18:15:57 -0700419 if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
Kent Overstreetcafe5632013-03-23 16:11:31 -0700420 int j;
421 struct bio_vec *bv;
422 void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
423
Kent Overstreet79886132013-11-23 17:19:00 -0800424 bio_for_each_segment_all(bv, b->bio, j)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700425 memcpy(page_address(bv->bv_page),
426 base + j * PAGE_SIZE, PAGE_SIZE);
427
Kent Overstreetcafe5632013-03-23 16:11:31 -0700428 bch_submit_bbio(b->bio, b->c, &k.key, 0);
429
Kent Overstreet57943512013-04-25 13:58:35 -0700430 continue_at(cl, btree_node_write_done, NULL);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700431 } else {
432 b->bio->bi_vcnt = 0;
Kent Overstreet169ef1c2013-03-28 12:50:55 -0600433 bch_bio_map(b->bio, i);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700434
Kent Overstreetcafe5632013-03-23 16:11:31 -0700435 bch_submit_bbio(b->bio, b->c, &k.key, 0);
436
437 closure_sync(cl);
Kent Overstreet57943512013-04-25 13:58:35 -0700438 __btree_node_write_done(cl);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700439 }
440}
441
Kent Overstreet57943512013-04-25 13:58:35 -0700442void bch_btree_node_write(struct btree *b, struct closure *parent)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700443{
444 struct bset *i = b->sets[b->nsets].data;
445
Kent Overstreetc37511b2013-04-26 15:39:55 -0700446 trace_bcache_btree_write(b);
447
Kent Overstreetcafe5632013-03-23 16:11:31 -0700448 BUG_ON(current->bio_list);
Kent Overstreet57943512013-04-25 13:58:35 -0700449 BUG_ON(b->written >= btree_blocks(b));
450 BUG_ON(b->written && !i->keys);
451 BUG_ON(b->sets->data->seq != i->seq);
Kent Overstreet280481d2013-10-24 16:36:03 -0700452 bch_check_keys(b, "writing");
Kent Overstreetcafe5632013-03-23 16:11:31 -0700453
Kent Overstreetcafe5632013-03-23 16:11:31 -0700454 cancel_delayed_work(&b->work);
455
Kent Overstreet57943512013-04-25 13:58:35 -0700456 /* If caller isn't waiting for write, parent refcount is cache set */
457 closure_lock(&b->io, parent ?: &b->c->cl);
458
Kent Overstreetcafe5632013-03-23 16:11:31 -0700459 clear_bit(BTREE_NODE_dirty, &b->flags);
460 change_bit(BTREE_NODE_write_idx, &b->flags);
461
Kent Overstreet57943512013-04-25 13:58:35 -0700462 do_btree_node_write(b);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700463
Kent Overstreetcafe5632013-03-23 16:11:31 -0700464 b->written += set_blocks(i, b->c);
465 atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
466 &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
467
468 bch_btree_sort_lazy(b);
469
470 if (b->written < btree_blocks(b))
471 bch_bset_init_next(b);
472}
473
Kent Overstreetf269af52013-07-23 20:48:29 -0700474static void bch_btree_node_write_sync(struct btree *b)
475{
476 struct closure cl;
477
478 closure_init_stack(&cl);
479 bch_btree_node_write(b, &cl);
480 closure_sync(&cl);
481}
482
Kent Overstreet57943512013-04-25 13:58:35 -0700483static void btree_node_write_work(struct work_struct *w)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700484{
485 struct btree *b = container_of(to_delayed_work(w), struct btree, work);
486
Kent Overstreet57943512013-04-25 13:58:35 -0700487 rw_lock(true, b, b->level);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700488
489 if (btree_node_dirty(b))
Kent Overstreet57943512013-04-25 13:58:35 -0700490 bch_btree_node_write(b, NULL);
491 rw_unlock(true, b);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700492}
493
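/*
 * Called after inserting into a leaf: mark the node dirty, schedule a
 * delayed write, and pin the journal entry covering the new keys (dropping
 * any older pin). If the current set has grown too large, write the node
 * out immediately instead of waiting.
 */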
Kent Overstreetc18536a2013-07-24 17:44:17 -0700494static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700495{
496 struct bset *i = b->sets[b->nsets].data;
497 struct btree_write *w = btree_current_write(b);
498
Kent Overstreet57943512013-04-25 13:58:35 -0700499 BUG_ON(!b->written);
500 BUG_ON(!i->keys);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700501
Kent Overstreet57943512013-04-25 13:58:35 -0700502 if (!btree_node_dirty(b))
503 queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700504
Kent Overstreet57943512013-04-25 13:58:35 -0700505 set_btree_node_dirty(b);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700506
Kent Overstreetc18536a2013-07-24 17:44:17 -0700507 if (journal_ref) {
Kent Overstreetcafe5632013-03-23 16:11:31 -0700508 if (w->journal &&
Kent Overstreetc18536a2013-07-24 17:44:17 -0700509 journal_pin_cmp(b->c, w->journal, journal_ref)) {
Kent Overstreetcafe5632013-03-23 16:11:31 -0700510 atomic_dec_bug(w->journal);
511 w->journal = NULL;
512 }
513
514 if (!w->journal) {
Kent Overstreetc18536a2013-07-24 17:44:17 -0700515 w->journal = journal_ref;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700516 atomic_inc(w->journal);
517 }
518 }
519
Kent Overstreetcafe5632013-03-23 16:11:31 -0700520 /* Force write if set is too big */
Kent Overstreet57943512013-04-25 13:58:35 -0700521 if (set_bytes(i) > PAGE_SIZE - 48 &&
522 !current->bio_list)
523 bch_btree_node_write(b, NULL);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700524}
525
526/*
527 * Btree in memory cache - allocation/freeing
528 * mca -> memory cache
529 */
530
531static void mca_reinit(struct btree *b)
532{
533 unsigned i;
534
535 b->flags = 0;
536 b->written = 0;
537 b->nsets = 0;
538
539 for (i = 0; i < MAX_BSETS; i++)
540 b->sets[i].size = 0;
541 /*
 542 * Second loop starts at 1 because b->sets[0].data is the memory we
543 * allocated
544 */
545 for (i = 1; i < MAX_BSETS; i++)
546 b->sets[i].data = NULL;
547}
548
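/*
 * Number of in-memory btree nodes we always keep in reserve; only what is
 * above the reserve may be freed by the shrinker. The sizing (8 nodes per
 * level of the btree, plus 16) looks like a heuristic chosen so that splits
 * all the way up to the root can always find nodes.
 */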
549#define mca_reserve(c) (((c->root && c->root->level) \
550 ? c->root->level : 1) * 8 + 16)
551#define mca_can_free(c) \
552 max_t(int, 0, c->bucket_cache_used - mca_reserve(c))
553
554static void mca_data_free(struct btree *b)
555{
556 struct bset_tree *t = b->sets;
557 BUG_ON(!closure_is_unlocked(&b->io.cl));
558
559 if (bset_prev_bytes(b) < PAGE_SIZE)
560 kfree(t->prev);
561 else
562 free_pages((unsigned long) t->prev,
563 get_order(bset_prev_bytes(b)));
564
565 if (bset_tree_bytes(b) < PAGE_SIZE)
566 kfree(t->tree);
567 else
568 free_pages((unsigned long) t->tree,
569 get_order(bset_tree_bytes(b)));
570
571 free_pages((unsigned long) t->data, b->page_order);
572
573 t->prev = NULL;
574 t->tree = NULL;
575 t->data = NULL;
576 list_move(&b->list, &b->c->btree_cache_freed);
577 b->c->bucket_cache_used--;
578}
579
580static void mca_bucket_free(struct btree *b)
581{
582 BUG_ON(btree_node_dirty(b));
583
584 b->key.ptr[0] = 0;
585 hlist_del_init_rcu(&b->hash);
586 list_move(&b->list, &b->c->btree_cache_freeable);
587}
588
589static unsigned btree_order(struct bkey *k)
590{
591 return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1);
592}
593
594static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
595{
596 struct bset_tree *t = b->sets;
597 BUG_ON(t->data);
598
599 b->page_order = max_t(unsigned,
600 ilog2(b->c->btree_pages),
601 btree_order(k));
602
603 t->data = (void *) __get_free_pages(gfp, b->page_order);
604 if (!t->data)
605 goto err;
606
607 t->tree = bset_tree_bytes(b) < PAGE_SIZE
608 ? kmalloc(bset_tree_bytes(b), gfp)
609 : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
610 if (!t->tree)
611 goto err;
612
613 t->prev = bset_prev_bytes(b) < PAGE_SIZE
614 ? kmalloc(bset_prev_bytes(b), gfp)
615 : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
616 if (!t->prev)
617 goto err;
618
619 list_move(&b->list, &b->c->btree_cache);
620 b->c->bucket_cache_used++;
621 return;
622err:
623 mca_data_free(b);
624}
625
626static struct btree *mca_bucket_alloc(struct cache_set *c,
627 struct bkey *k, gfp_t gfp)
628{
629 struct btree *b = kzalloc(sizeof(struct btree), gfp);
630 if (!b)
631 return NULL;
632
633 init_rwsem(&b->lock);
634 lockdep_set_novalidate_class(&b->lock);
635 INIT_LIST_HEAD(&b->list);
Kent Overstreet57943512013-04-25 13:58:35 -0700636 INIT_DELAYED_WORK(&b->work, btree_node_write_work);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700637 b->c = c;
638 closure_init_unlocked(&b->io);
639
640 mca_data_alloc(b, k, gfp);
641 return b;
642}
643
Kent Overstreete8e1d462013-07-24 17:27:07 -0700644static int mca_reap(struct btree *b, unsigned min_order, bool flush)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700645{
Kent Overstreete8e1d462013-07-24 17:27:07 -0700646 struct closure cl;
647
648 closure_init_stack(&cl);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700649 lockdep_assert_held(&b->c->bucket_lock);
650
651 if (!down_write_trylock(&b->lock))
652 return -ENOMEM;
653
Kent Overstreete8e1d462013-07-24 17:27:07 -0700654 BUG_ON(btree_node_dirty(b) && !b->sets[0].data);
655
656 if (b->page_order < min_order ||
657 (!flush &&
658 (btree_node_dirty(b) ||
659 atomic_read(&b->io.cl.remaining) != -1))) {
Kent Overstreetcafe5632013-03-23 16:11:31 -0700660 rw_unlock(true, b);
661 return -ENOMEM;
662 }
663
Kent Overstreetf269af52013-07-23 20:48:29 -0700664 if (btree_node_dirty(b))
665 bch_btree_node_write_sync(b);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700666
Kent Overstreete8e1d462013-07-24 17:27:07 -0700667 /* wait for any in flight btree write */
Kent Overstreetfaadf0c2013-11-01 18:03:08 -0700668 closure_wait_event(&b->io.wait, &cl,
669 atomic_read(&b->io.cl.remaining) == -1);
Kent Overstreete8e1d462013-07-24 17:27:07 -0700670
Kent Overstreetcafe5632013-03-23 16:11:31 -0700671 return 0;
672}
673
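/*
 * Shrinker scan callback: try to free roughly sc->nr_to_scan pages worth of
 * btree node memory - first from the freeable list, then by evicting clean,
 * unaccessed nodes from the front of the LRU-ordered btree_cache list.
 */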
Dave Chinner7dc19d52013-08-28 10:18:11 +1000674static unsigned long bch_mca_scan(struct shrinker *shrink,
675 struct shrink_control *sc)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700676{
677 struct cache_set *c = container_of(shrink, struct cache_set, shrink);
678 struct btree *b, *t;
679 unsigned long i, nr = sc->nr_to_scan;
Dave Chinner7dc19d52013-08-28 10:18:11 +1000680 unsigned long freed = 0;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700681
682 if (c->shrinker_disabled)
Dave Chinner7dc19d52013-08-28 10:18:11 +1000683 return SHRINK_STOP;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700684
685 if (c->try_harder)
Dave Chinner7dc19d52013-08-28 10:18:11 +1000686 return SHRINK_STOP;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700687
688 /* Return -1 if we can't do anything right now */
Kent Overstreeta698e082013-09-23 23:17:34 -0700689 if (sc->gfp_mask & __GFP_IO)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700690 mutex_lock(&c->bucket_lock);
691 else if (!mutex_trylock(&c->bucket_lock))
692 return -1;
693
Kent Overstreet36c9ea92013-06-03 13:04:56 -0700694 /*
695 * It's _really_ critical that we don't free too many btree nodes - we
696 * have to always leave ourselves a reserve. The reserve is how we
697 * guarantee that allocating memory for a new btree node can always
698 * succeed, so that inserting keys into the btree can always succeed and
699 * IO can always make forward progress:
700 */
Kent Overstreetcafe5632013-03-23 16:11:31 -0700701 nr /= c->btree_pages;
702 nr = min_t(unsigned long, nr, mca_can_free(c));
703
704 i = 0;
705 list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
Dave Chinner7dc19d52013-08-28 10:18:11 +1000706 if (freed >= nr)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700707 break;
708
709 if (++i > 3 &&
Kent Overstreete8e1d462013-07-24 17:27:07 -0700710 !mca_reap(b, 0, false)) {
Kent Overstreetcafe5632013-03-23 16:11:31 -0700711 mca_data_free(b);
712 rw_unlock(true, b);
Dave Chinner7dc19d52013-08-28 10:18:11 +1000713 freed++;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700714 }
715 }
716
Dave Chinner7dc19d52013-08-28 10:18:11 +1000717 for (i = 0; (nr--) && i < c->bucket_cache_used; i++) {
Kent Overstreetb0f32a52013-12-10 13:24:26 -0800718 if (list_empty(&c->btree_cache))
719 goto out;
720
Kent Overstreetcafe5632013-03-23 16:11:31 -0700721 b = list_first_entry(&c->btree_cache, struct btree, list);
722 list_rotate_left(&c->btree_cache);
723
724 if (!b->accessed &&
Kent Overstreete8e1d462013-07-24 17:27:07 -0700725 !mca_reap(b, 0, false)) {
Kent Overstreetcafe5632013-03-23 16:11:31 -0700726 mca_bucket_free(b);
727 mca_data_free(b);
728 rw_unlock(true, b);
Dave Chinner7dc19d52013-08-28 10:18:11 +1000729 freed++;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700730 } else
731 b->accessed = 0;
732 }
733out:
Kent Overstreetcafe5632013-03-23 16:11:31 -0700734 mutex_unlock(&c->bucket_lock);
Dave Chinner7dc19d52013-08-28 10:18:11 +1000735 return freed;
736}
737
738static unsigned long bch_mca_count(struct shrinker *shrink,
739 struct shrink_control *sc)
740{
741 struct cache_set *c = container_of(shrink, struct cache_set, shrink);
742
743 if (c->shrinker_disabled)
744 return 0;
745
746 if (c->try_harder)
747 return 0;
748
749 return mca_can_free(c) * c->btree_pages;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700750}
751
752void bch_btree_cache_free(struct cache_set *c)
753{
754 struct btree *b;
755 struct closure cl;
756 closure_init_stack(&cl);
757
758 if (c->shrink.list.next)
759 unregister_shrinker(&c->shrink);
760
761 mutex_lock(&c->bucket_lock);
762
763#ifdef CONFIG_BCACHE_DEBUG
764 if (c->verify_data)
765 list_move(&c->verify_data->list, &c->btree_cache);
766#endif
767
768 list_splice(&c->btree_cache_freeable,
769 &c->btree_cache);
770
771 while (!list_empty(&c->btree_cache)) {
772 b = list_first_entry(&c->btree_cache, struct btree, list);
773
774 if (btree_node_dirty(b))
775 btree_complete_write(b, btree_current_write(b));
776 clear_bit(BTREE_NODE_dirty, &b->flags);
777
778 mca_data_free(b);
779 }
780
781 while (!list_empty(&c->btree_cache_freed)) {
782 b = list_first_entry(&c->btree_cache_freed,
783 struct btree, list);
784 list_del(&b->list);
785 cancel_delayed_work_sync(&b->work);
786 kfree(b);
787 }
788
789 mutex_unlock(&c->bucket_lock);
790}
791
792int bch_btree_cache_alloc(struct cache_set *c)
793{
794 unsigned i;
795
Kent Overstreetcafe5632013-03-23 16:11:31 -0700796 for (i = 0; i < mca_reserve(c); i++)
Kent Overstreet72a44512013-10-24 17:19:26 -0700797 if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL))
798 return -ENOMEM;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700799
800 list_splice_init(&c->btree_cache,
801 &c->btree_cache_freeable);
802
803#ifdef CONFIG_BCACHE_DEBUG
804 mutex_init(&c->verify_lock);
805
806 c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
807
808 if (c->verify_data &&
809 c->verify_data->sets[0].data)
810 list_del_init(&c->verify_data->list);
811 else
812 c->verify_data = NULL;
813#endif
814
Dave Chinner7dc19d52013-08-28 10:18:11 +1000815 c->shrink.count_objects = bch_mca_count;
816 c->shrink.scan_objects = bch_mca_scan;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700817 c->shrink.seeks = 4;
818 c->shrink.batch = c->btree_pages * 2;
819 register_shrinker(&c->shrink);
820
821 return 0;
822}
823
824/* Btree in memory cache - hash table */
825
826static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k)
827{
828 return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)];
829}
830
831static struct btree *mca_find(struct cache_set *c, struct bkey *k)
832{
833 struct btree *b;
834
835 rcu_read_lock();
836 hlist_for_each_entry_rcu(b, mca_hash(c, k), hash)
837 if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
838 goto out;
839 b = NULL;
840out:
841 rcu_read_unlock();
842 return b;
843}
844
Kent Overstreete8e1d462013-07-24 17:27:07 -0700845static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700846{
Kent Overstreete8e1d462013-07-24 17:27:07 -0700847 struct btree *b;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700848
Kent Overstreetc37511b2013-04-26 15:39:55 -0700849 trace_bcache_btree_cache_cannibalize(c);
850
Kent Overstreete8e1d462013-07-24 17:27:07 -0700851 if (!c->try_harder) {
852 c->try_harder = current;
853 c->try_harder_start = local_clock();
854 } else if (c->try_harder != current)
855 return ERR_PTR(-ENOSPC);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700856
Kent Overstreete8e1d462013-07-24 17:27:07 -0700857 list_for_each_entry_reverse(b, &c->btree_cache, list)
858 if (!mca_reap(b, btree_order(k), false))
859 return b;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700860
Kent Overstreete8e1d462013-07-24 17:27:07 -0700861 list_for_each_entry_reverse(b, &c->btree_cache, list)
862 if (!mca_reap(b, btree_order(k), true))
863 return b;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700864
Kent Overstreete8e1d462013-07-24 17:27:07 -0700865 return ERR_PTR(-ENOMEM);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700866}
867
868/*
869 * We can only have one thread cannibalizing other cached btree nodes at a time,
 870 * or we'll deadlock. We use an open coded mutex to ensure that, which
 871 * mca_cannibalize() takes. This means every time we unlock the root of
872 * the btree, we need to release this lock if we have it held.
873 */
Kent Overstreetdf8e8972013-07-24 17:37:59 -0700874static void bch_cannibalize_unlock(struct cache_set *c)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700875{
Kent Overstreete8e1d462013-07-24 17:27:07 -0700876 if (c->try_harder == current) {
Kent Overstreet169ef1c2013-03-28 12:50:55 -0600877 bch_time_stats_update(&c->try_harder_time, c->try_harder_start);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700878 c->try_harder = NULL;
Kent Overstreete8e1d462013-07-24 17:27:07 -0700879 wake_up(&c->try_wait);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700880 }
881}
882
Kent Overstreete8e1d462013-07-24 17:27:07 -0700883static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700884{
885 struct btree *b;
886
Kent Overstreete8e1d462013-07-24 17:27:07 -0700887 BUG_ON(current->bio_list);
888
Kent Overstreetcafe5632013-03-23 16:11:31 -0700889 lockdep_assert_held(&c->bucket_lock);
890
891 if (mca_find(c, k))
892 return NULL;
893
 894 /* btree_node_free() doesn't free memory; it sticks the node on the end of
 895 * the list. Check if there are any freed nodes there:
896 */
897 list_for_each_entry(b, &c->btree_cache_freeable, list)
Kent Overstreete8e1d462013-07-24 17:27:07 -0700898 if (!mca_reap(b, btree_order(k), false))
Kent Overstreetcafe5632013-03-23 16:11:31 -0700899 goto out;
900
901 /* We never free struct btree itself, just the memory that holds the on
902 * disk node. Check the freed list before allocating a new one:
903 */
904 list_for_each_entry(b, &c->btree_cache_freed, list)
Kent Overstreete8e1d462013-07-24 17:27:07 -0700905 if (!mca_reap(b, 0, false)) {
Kent Overstreetcafe5632013-03-23 16:11:31 -0700906 mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
907 if (!b->sets[0].data)
908 goto err;
909 else
910 goto out;
911 }
912
913 b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO);
914 if (!b)
915 goto err;
916
917 BUG_ON(!down_write_trylock(&b->lock));
918 if (!b->sets->data)
919 goto err;
920out:
921 BUG_ON(!closure_is_unlocked(&b->io.cl));
922
923 bkey_copy(&b->key, k);
924 list_move(&b->list, &c->btree_cache);
925 hlist_del_init_rcu(&b->hash);
926 hlist_add_head_rcu(&b->hash, mca_hash(c, k));
927
928 lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
929 b->level = level;
Kent Overstreetd6fd3b12013-07-24 17:20:19 -0700930 b->parent = (void *) ~0UL;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700931
932 mca_reinit(b);
933
934 return b;
935err:
936 if (b)
937 rw_unlock(true, b);
938
Kent Overstreete8e1d462013-07-24 17:27:07 -0700939 b = mca_cannibalize(c, k);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700940 if (!IS_ERR(b))
941 goto out;
942
943 return b;
944}
945
946/**
947 * bch_btree_node_get - find a btree node in the cache and lock it, reading it
948 * in from disk if necessary.
949 *
Kent Overstreetb54d6932013-07-24 18:04:18 -0700950 * If IO is necessary and running under generic_make_request, returns -EAGAIN.
Kent Overstreetcafe5632013-03-23 16:11:31 -0700951 *
952 * The btree node will have either a read or a write lock held, depending on
953 * level and op->lock.
954 */
955struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k,
Kent Overstreete8e1d462013-07-24 17:27:07 -0700956 int level, bool write)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700957{
958 int i = 0;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700959 struct btree *b;
960
961 BUG_ON(level < 0);
962retry:
963 b = mca_find(c, k);
964
965 if (!b) {
Kent Overstreet57943512013-04-25 13:58:35 -0700966 if (current->bio_list)
967 return ERR_PTR(-EAGAIN);
968
Kent Overstreetcafe5632013-03-23 16:11:31 -0700969 mutex_lock(&c->bucket_lock);
Kent Overstreete8e1d462013-07-24 17:27:07 -0700970 b = mca_alloc(c, k, level);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700971 mutex_unlock(&c->bucket_lock);
972
973 if (!b)
974 goto retry;
975 if (IS_ERR(b))
976 return b;
977
Kent Overstreet57943512013-04-25 13:58:35 -0700978 bch_btree_node_read(b);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700979
980 if (!write)
981 downgrade_write(&b->lock);
982 } else {
983 rw_lock(write, b, level);
984 if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) {
985 rw_unlock(write, b);
986 goto retry;
987 }
988 BUG_ON(b->level != level);
989 }
990
991 b->accessed = 1;
992
993 for (; i <= b->nsets && b->sets[i].size; i++) {
994 prefetch(b->sets[i].tree);
995 prefetch(b->sets[i].data);
996 }
997
998 for (; i <= b->nsets; i++)
999 prefetch(b->sets[i].data);
1000
Kent Overstreet57943512013-04-25 13:58:35 -07001001 if (btree_node_io_error(b)) {
Kent Overstreetcafe5632013-03-23 16:11:31 -07001002 rw_unlock(write, b);
Kent Overstreet57943512013-04-25 13:58:35 -07001003 return ERR_PTR(-EIO);
1004 }
1005
1006 BUG_ON(!b->written);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001007
1008 return b;
1009}
1010
1011static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
1012{
1013 struct btree *b;
1014
1015 mutex_lock(&c->bucket_lock);
Kent Overstreete8e1d462013-07-24 17:27:07 -07001016 b = mca_alloc(c, k, level);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001017 mutex_unlock(&c->bucket_lock);
1018
1019 if (!IS_ERR_OR_NULL(b)) {
Kent Overstreet57943512013-04-25 13:58:35 -07001020 bch_btree_node_read(b);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001021 rw_unlock(true, b);
1022 }
1023}
1024
1025/* Btree alloc */
1026
Kent Overstreete8e1d462013-07-24 17:27:07 -07001027static void btree_node_free(struct btree *b)
Kent Overstreetcafe5632013-03-23 16:11:31 -07001028{
1029 unsigned i;
1030
Kent Overstreetc37511b2013-04-26 15:39:55 -07001031 trace_bcache_btree_node_free(b);
1032
Kent Overstreetcafe5632013-03-23 16:11:31 -07001033 BUG_ON(b == b->c->root);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001034
1035 if (btree_node_dirty(b))
1036 btree_complete_write(b, btree_current_write(b));
1037 clear_bit(BTREE_NODE_dirty, &b->flags);
1038
Kent Overstreetcafe5632013-03-23 16:11:31 -07001039 cancel_delayed_work(&b->work);
1040
1041 mutex_lock(&b->c->bucket_lock);
1042
1043 for (i = 0; i < KEY_PTRS(&b->key); i++) {
1044 BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin));
1045
1046 bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
1047 PTR_BUCKET(b->c, &b->key, i));
1048 }
1049
1050 bch_bucket_free(b->c, &b->key);
1051 mca_bucket_free(b);
1052 mutex_unlock(&b->c->bucket_lock);
1053}
1054
Kent Overstreetbc9389e2013-09-10 19:07:35 -07001055struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait)
Kent Overstreetcafe5632013-03-23 16:11:31 -07001056{
1057 BKEY_PADDED(key) k;
1058 struct btree *b = ERR_PTR(-EAGAIN);
1059
1060 mutex_lock(&c->bucket_lock);
1061retry:
Kent Overstreetbc9389e2013-09-10 19:07:35 -07001062 if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, wait))
Kent Overstreetcafe5632013-03-23 16:11:31 -07001063 goto err;
1064
Kent Overstreet3a3b6a42013-07-24 16:46:42 -07001065 bkey_put(c, &k.key);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001066 SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
1067
Kent Overstreete8e1d462013-07-24 17:27:07 -07001068 b = mca_alloc(c, &k.key, level);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001069 if (IS_ERR(b))
1070 goto err_free;
1071
1072 if (!b) {
Kent Overstreetb1a67b02013-03-25 11:46:44 -07001073 cache_bug(c,
1074 "Tried to allocate bucket that was in btree cache");
Kent Overstreetcafe5632013-03-23 16:11:31 -07001075 goto retry;
1076 }
1077
Kent Overstreetcafe5632013-03-23 16:11:31 -07001078 b->accessed = 1;
1079 bch_bset_init_next(b);
1080
1081 mutex_unlock(&c->bucket_lock);
Kent Overstreetc37511b2013-04-26 15:39:55 -07001082
1083 trace_bcache_btree_node_alloc(b);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001084 return b;
1085err_free:
1086 bch_bucket_free(c, &k.key);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001087err:
1088 mutex_unlock(&c->bucket_lock);
Kent Overstreetc37511b2013-04-26 15:39:55 -07001089
1090 trace_bcache_btree_node_alloc_fail(b);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001091 return b;
1092}
1093
Kent Overstreetbc9389e2013-09-10 19:07:35 -07001094static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait)
Kent Overstreetcafe5632013-03-23 16:11:31 -07001095{
Kent Overstreetbc9389e2013-09-10 19:07:35 -07001096 struct btree *n = bch_btree_node_alloc(b->c, b->level, wait);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001097 if (!IS_ERR_OR_NULL(n))
1098 bch_btree_sort_into(b, n);
1099
1100 return n;
1101}
1102
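/*
 * Build the key that marks an old btree node as freed: a copy of the node's
 * key with a zeroed search key and every bucket generation bumped by one.
 * The prio_blocked reference appears to keep those buckets from being
 * reused before the new pointers are safely on disk.
 */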
Kent Overstreet8835c122013-07-24 23:18:05 -07001103static void make_btree_freeing_key(struct btree *b, struct bkey *k)
1104{
1105 unsigned i;
1106
1107 bkey_copy(k, &b->key);
1108 bkey_copy_key(k, &ZERO_KEY);
1109
1110 for (i = 0; i < KEY_PTRS(k); i++) {
1111 uint8_t g = PTR_BUCKET(b->c, k, i)->gen + 1;
1112
1113 SET_PTR_GEN(k, i, g);
1114 }
1115
1116 atomic_inc(&b->c->prio_blocked);
1117}
1118
Kent Overstreetcafe5632013-03-23 16:11:31 -07001119/* Garbage collection */
1120
1121uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
1122{
1123 uint8_t stale = 0;
1124 unsigned i;
1125 struct bucket *g;
1126
1127 /*
1128 * ptr_invalid() can't return true for the keys that mark btree nodes as
1129 * freed, but since ptr_bad() returns true we'll never actually use them
 1130 * for anything and thus we don't want to mark their pointers here
1131 */
1132 if (!bkey_cmp(k, &ZERO_KEY))
1133 return stale;
1134
1135 for (i = 0; i < KEY_PTRS(k); i++) {
1136 if (!ptr_available(c, k, i))
1137 continue;
1138
1139 g = PTR_BUCKET(c, k, i);
1140
1141 if (gen_after(g->gc_gen, PTR_GEN(k, i)))
1142 g->gc_gen = PTR_GEN(k, i);
1143
1144 if (ptr_stale(c, k, i)) {
1145 stale = max(stale, ptr_stale(c, k, i));
1146 continue;
1147 }
1148
1149 cache_bug_on(GC_MARK(g) &&
1150 (GC_MARK(g) == GC_MARK_METADATA) != (level != 0),
1151 c, "inconsistent ptrs: mark = %llu, level = %i",
1152 GC_MARK(g), level);
1153
1154 if (level)
1155 SET_GC_MARK(g, GC_MARK_METADATA);
1156 else if (KEY_DIRTY(k))
1157 SET_GC_MARK(g, GC_MARK_DIRTY);
1158
1159 /* guard against overflow */
1160 SET_GC_SECTORS_USED(g, min_t(unsigned,
1161 GC_SECTORS_USED(g) + KEY_SIZE(k),
1162 (1 << 14) - 1));
1163
1164 BUG_ON(!GC_SECTORS_USED(g));
1165 }
1166
1167 return stale;
1168}
1169
1170#define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k)
1171
Kent Overstreeta1f03582013-09-10 19:07:00 -07001172static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
Kent Overstreetcafe5632013-03-23 16:11:31 -07001173{
1174 uint8_t stale = 0;
Kent Overstreeta1f03582013-09-10 19:07:00 -07001175 unsigned keys = 0, good_keys = 0;
Kent Overstreetcafe5632013-03-23 16:11:31 -07001176 struct bkey *k;
1177 struct btree_iter iter;
1178 struct bset_tree *t;
1179
1180 gc->nodes++;
1181
1182 for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
Kent Overstreetcafe5632013-03-23 16:11:31 -07001183 stale = max(stale, btree_mark_key(b, k));
Kent Overstreeta1f03582013-09-10 19:07:00 -07001184 keys++;
Kent Overstreetcafe5632013-03-23 16:11:31 -07001185
1186 if (bch_ptr_bad(b, k))
1187 continue;
1188
Kent Overstreetcafe5632013-03-23 16:11:31 -07001189 gc->key_bytes += bkey_u64s(k);
1190 gc->nkeys++;
Kent Overstreeta1f03582013-09-10 19:07:00 -07001191 good_keys++;
Kent Overstreetcafe5632013-03-23 16:11:31 -07001192
1193 gc->data += KEY_SIZE(k);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001194 }
1195
1196 for (t = b->sets; t <= &b->sets[b->nsets]; t++)
1197 btree_bug_on(t->size &&
1198 bset_written(b, t) &&
1199 bkey_cmp(&b->key, &t->end) < 0,
1200 b, "found short btree key in gc");
1201
Kent Overstreeta1f03582013-09-10 19:07:00 -07001202 if (b->c->gc_always_rewrite)
1203 return true;
1204
1205 if (stale > 10)
1206 return true;
1207
1208 if ((keys - good_keys) * 2 > keys)
1209 return true;
1210
1211 return false;
Kent Overstreetcafe5632013-03-23 16:11:31 -07001212}
1213
Kent Overstreeta1f03582013-09-10 19:07:00 -07001214#define GC_MERGE_NODES 4U
Kent Overstreetcafe5632013-03-23 16:11:31 -07001215
1216struct gc_merge_info {
1217 struct btree *b;
Kent Overstreetcafe5632013-03-23 16:11:31 -07001218 unsigned keys;
1219};
1220
Kent Overstreeta1f03582013-09-10 19:07:00 -07001221static int bch_btree_insert_node(struct btree *, struct btree_op *,
1222 struct keylist *, atomic_t *, struct bkey *);
Kent Overstreetb54d6932013-07-24 18:04:18 -07001223
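/*
 * Try to coalesce a window of up to GC_MERGE_NODES adjacent nodes during gc:
 * if their keys fit in fewer nodes, allocate replacements, repack the keys
 * into them, insert the new pointers (plus freeing keys for the originals)
 * into the parent, and free the old nodes. Returns -EINTR on success, since
 * the caller's iterator over the parent is invalidated, and 0 if nothing
 * was coalesced.
 */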
Kent Overstreeta1f03582013-09-10 19:07:00 -07001224static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1225 struct keylist *keylist, struct gc_stat *gc,
1226 struct gc_merge_info *r)
1227{
1228 unsigned i, nodes = 0, keys = 0, blocks;
1229 struct btree *new_nodes[GC_MERGE_NODES];
1230 struct closure cl;
1231 struct bkey *k;
1232
1233 memset(new_nodes, 0, sizeof(new_nodes));
Kent Overstreetb54d6932013-07-24 18:04:18 -07001234 closure_init_stack(&cl);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001235
Kent Overstreeta1f03582013-09-10 19:07:00 -07001236 while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b))
Kent Overstreetcafe5632013-03-23 16:11:31 -07001237 keys += r[nodes++].keys;
1238
1239 blocks = btree_default_blocks(b->c) * 2 / 3;
1240
1241 if (nodes < 2 ||
1242 __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1))
Kent Overstreeta1f03582013-09-10 19:07:00 -07001243 return 0;
Kent Overstreetcafe5632013-03-23 16:11:31 -07001244
Kent Overstreeta1f03582013-09-10 19:07:00 -07001245 for (i = 0; i < nodes; i++) {
Kent Overstreetbc9389e2013-09-10 19:07:35 -07001246 new_nodes[i] = btree_node_alloc_replacement(r[i].b, false);
Kent Overstreeta1f03582013-09-10 19:07:00 -07001247 if (IS_ERR_OR_NULL(new_nodes[i]))
1248 goto out_nocoalesce;
Kent Overstreetcafe5632013-03-23 16:11:31 -07001249 }
1250
1251 for (i = nodes - 1; i > 0; --i) {
Kent Overstreeta1f03582013-09-10 19:07:00 -07001252 struct bset *n1 = new_nodes[i]->sets->data;
1253 struct bset *n2 = new_nodes[i - 1]->sets->data;
Kent Overstreetcafe5632013-03-23 16:11:31 -07001254 struct bkey *k, *last = NULL;
1255
1256 keys = 0;
1257
Kent Overstreeta1f03582013-09-10 19:07:00 -07001258 if (i > 1) {
Kent Overstreetcafe5632013-03-23 16:11:31 -07001259 for (k = n2->start;
1260 k < end(n2);
1261 k = bkey_next(k)) {
1262 if (__set_blocks(n1, n1->keys + keys +
1263 bkey_u64s(k), b->c) > blocks)
1264 break;
1265
1266 last = k;
1267 keys += bkey_u64s(k);
1268 }
Kent Overstreeta1f03582013-09-10 19:07:00 -07001269 } else {
1270 /*
 1271 * This is the last node we're keeping - the node at r[0]
 1272 * is the one being freed. We have to try to fit all of
 1273 * the remaining keys into this node; we can't guarantee
 1274 * they will always fit, due to rounding and variable
 1275 * length keys (though overflow shouldn't happen in
 1276 * practice)
1277 */
1278 if (__set_blocks(n1, n1->keys + n2->keys,
1279 b->c) > btree_blocks(new_nodes[i]))
1280 goto out_nocoalesce;
1281
1282 keys = n2->keys;
1283 /* Take the key of the node we're getting rid of */
1284 last = &r->b->key;
1285 }
Kent Overstreetcafe5632013-03-23 16:11:31 -07001286
1287 BUG_ON(__set_blocks(n1, n1->keys + keys,
Kent Overstreeta1f03582013-09-10 19:07:00 -07001288 b->c) > btree_blocks(new_nodes[i]));
Kent Overstreetcafe5632013-03-23 16:11:31 -07001289
Kent Overstreeta1f03582013-09-10 19:07:00 -07001290 if (last)
1291 bkey_copy_key(&new_nodes[i]->key, last);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001292
1293 memcpy(end(n1),
1294 n2->start,
1295 (void *) node(n2, keys) - (void *) n2->start);
1296
1297 n1->keys += keys;
Kent Overstreeta1f03582013-09-10 19:07:00 -07001298 r[i].keys = n1->keys;
Kent Overstreetcafe5632013-03-23 16:11:31 -07001299
1300 memmove(n2->start,
1301 node(n2, keys),
1302 (void *) end(n2) - (void *) node(n2, keys));
1303
1304 n2->keys -= keys;
1305
Kent Overstreeta1f03582013-09-10 19:07:00 -07001306 if (bch_keylist_realloc(keylist,
1307 KEY_PTRS(&new_nodes[i]->key), b->c))
1308 goto out_nocoalesce;
1309
1310 bch_btree_node_write(new_nodes[i], &cl);
1311 bch_keylist_add(keylist, &new_nodes[i]->key);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001312 }
1313
Kent Overstreeta1f03582013-09-10 19:07:00 -07001314 for (i = 0; i < nodes; i++) {
1315 if (bch_keylist_realloc(keylist, KEY_PTRS(&r[i].b->key), b->c))
1316 goto out_nocoalesce;
1317
1318 make_btree_freeing_key(r[i].b, keylist->top);
1319 bch_keylist_push(keylist);
1320 }
1321
1322 /* We emptied out this node */
1323 BUG_ON(new_nodes[0]->sets->data->keys);
1324 btree_node_free(new_nodes[0]);
1325 rw_unlock(true, new_nodes[0]);
1326
1327 closure_sync(&cl);
1328
1329 for (i = 0; i < nodes; i++) {
1330 btree_node_free(r[i].b);
1331 rw_unlock(true, r[i].b);
1332
1333 r[i].b = new_nodes[i];
1334 }
1335
1336 bch_btree_insert_node(b, op, keylist, NULL, NULL);
1337 BUG_ON(!bch_keylist_empty(keylist));
1338
1339 memmove(r, r + 1, sizeof(r[0]) * (nodes - 1));
1340 r[nodes - 1].b = ERR_PTR(-EINTR);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001341
Kent Overstreetc37511b2013-04-26 15:39:55 -07001342 trace_bcache_btree_gc_coalesce(nodes);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001343 gc->nodes--;
Kent Overstreetcafe5632013-03-23 16:11:31 -07001344
Kent Overstreeta1f03582013-09-10 19:07:00 -07001345 /* Invalidated our iterator */
1346 return -EINTR;
1347
1348out_nocoalesce:
1349 closure_sync(&cl);
1350
1351 while ((k = bch_keylist_pop(keylist)))
1352 if (!bkey_cmp(k, &ZERO_KEY))
1353 atomic_dec(&b->c->prio_blocked);
1354
1355 for (i = 0; i < nodes; i++)
1356 if (!IS_ERR_OR_NULL(new_nodes[i])) {
1357 btree_node_free(new_nodes[i]);
1358 rw_unlock(true, new_nodes[i]);
1359 }
1360 return 0;
1361}
1362
1363static unsigned btree_gc_count_keys(struct btree *b)
1364{
1365 struct bkey *k;
1366 struct btree_iter iter;
1367 unsigned ret = 0;
1368
1369 for_each_key_filter(b, k, &iter, bch_ptr_bad)
1370 ret += bkey_u64s(k);
1371
1372 return ret;
Kent Overstreetcafe5632013-03-23 16:11:31 -07001373}
1374
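/*
 * Garbage collect one level of the btree: walk the children from where the
 * last (interrupted) pass left off (gc_done), keeping a sliding window of
 * GC_MERGE_NODES nodes so neighbours can be coalesced, rewriting nodes that
 * btree_gc_mark_node() says are worth rewriting, and recursing into
 * interior children.
 */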
1375static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1376 struct closure *writes, struct gc_stat *gc)
1377{
Kent Overstreetcafe5632013-03-23 16:11:31 -07001378 unsigned i;
Kent Overstreeta1f03582013-09-10 19:07:00 -07001379 int ret = 0;
1380 bool should_rewrite;
1381 struct btree *n;
1382 struct bkey *k;
1383 struct keylist keys;
1384 struct btree_iter iter;
Kent Overstreetcafe5632013-03-23 16:11:31 -07001385 struct gc_merge_info r[GC_MERGE_NODES];
Kent Overstreeta1f03582013-09-10 19:07:00 -07001386 struct gc_merge_info *last = r + GC_MERGE_NODES - 1;
Kent Overstreetcafe5632013-03-23 16:11:31 -07001387
Kent Overstreeta1f03582013-09-10 19:07:00 -07001388 bch_keylist_init(&keys);
1389 bch_btree_iter_init(b, &iter, &b->c->gc_done);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001390
Kent Overstreeta1f03582013-09-10 19:07:00 -07001391 for (i = 0; i < GC_MERGE_NODES; i++)
1392 r[i].b = ERR_PTR(-EINTR);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001393
Kent Overstreeta1f03582013-09-10 19:07:00 -07001394 while (1) {
1395 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
1396 if (k) {
1397 r->b = bch_btree_node_get(b->c, k, b->level - 1, true);
1398 if (IS_ERR(r->b)) {
1399 ret = PTR_ERR(r->b);
1400 break;
1401 }
1402
1403 r->keys = btree_gc_count_keys(r->b);
1404
1405 ret = btree_gc_coalesce(b, op, &keys, gc, r);
1406 if (ret)
1407 break;
Kent Overstreetcafe5632013-03-23 16:11:31 -07001408 }
1409
Kent Overstreeta1f03582013-09-10 19:07:00 -07001410 if (!last->b)
Kent Overstreetcafe5632013-03-23 16:11:31 -07001411 break;
Kent Overstreeta1f03582013-09-10 19:07:00 -07001412
1413 if (!IS_ERR(last->b)) {
1414 should_rewrite = btree_gc_mark_node(last->b, gc);
1415 if (should_rewrite) {
Kent Overstreetbc9389e2013-09-10 19:07:35 -07001416 n = btree_node_alloc_replacement(last->b,
1417 false);
Kent Overstreeta1f03582013-09-10 19:07:00 -07001418
1419 if (!IS_ERR_OR_NULL(n)) {
1420 bch_btree_node_write_sync(n);
1421 bch_keylist_add(&keys, &n->key);
1422
1423 make_btree_freeing_key(last->b,
1424 keys.top);
1425 bch_keylist_push(&keys);
1426
1427 btree_node_free(last->b);
1428
1429 bch_btree_insert_node(b, op, &keys,
1430 NULL, NULL);
1431 BUG_ON(!bch_keylist_empty(&keys));
1432
1433 rw_unlock(true, last->b);
1434 last->b = n;
1435
1436 /* Invalidated our iterator */
1437 ret = -EINTR;
1438 break;
1439 }
1440 }
1441
1442 if (last->b->level) {
1443 ret = btree_gc_recurse(last->b, op, writes, gc);
1444 if (ret)
1445 break;
1446 }
1447
1448 bkey_copy_key(&b->c->gc_done, &last->b->key);
1449
1450 /*
1451 * Must flush leaf nodes before gc ends, since replace
1452 * operations aren't journalled
1453 */
1454 if (btree_node_dirty(last->b))
1455 bch_btree_node_write(last->b, writes);
1456 rw_unlock(true, last->b);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001457 }
1458
Kent Overstreeta1f03582013-09-10 19:07:00 -07001459 memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
1460 r->b = NULL;
Kent Overstreetcafe5632013-03-23 16:11:31 -07001461
Kent Overstreetcafe5632013-03-23 16:11:31 -07001462 if (need_resched()) {
1463 ret = -EAGAIN;
1464 break;
1465 }
Kent Overstreetcafe5632013-03-23 16:11:31 -07001466 }
1467
Kent Overstreeta1f03582013-09-10 19:07:00 -07001468 for (i = 0; i < GC_MERGE_NODES; i++)
1469 if (!IS_ERR_OR_NULL(r[i].b)) {
1470 if (btree_node_dirty(r[i].b))
1471 bch_btree_node_write(r[i].b, writes);
1472 rw_unlock(true, r[i].b);
1473 }
Kent Overstreetcafe5632013-03-23 16:11:31 -07001474
Kent Overstreeta1f03582013-09-10 19:07:00 -07001475 bch_keylist_free(&keys);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001476
1477 return ret;
1478}
1479
1480static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
1481 struct closure *writes, struct gc_stat *gc)
1482{
1483 struct btree *n = NULL;
Kent Overstreeta1f03582013-09-10 19:07:00 -07001484 int ret = 0;
1485 bool should_rewrite;
Kent Overstreetcafe5632013-03-23 16:11:31 -07001486
Kent Overstreeta1f03582013-09-10 19:07:00 -07001487 should_rewrite = btree_gc_mark_node(b, gc);
1488 if (should_rewrite) {
Kent Overstreetbc9389e2013-09-10 19:07:35 -07001489 n = btree_node_alloc_replacement(b, false);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001490
Kent Overstreeta1f03582013-09-10 19:07:00 -07001491 if (!IS_ERR_OR_NULL(n)) {
1492 bch_btree_node_write_sync(n);
1493 bch_btree_set_root(n);
1494 btree_node_free(b);
1495 rw_unlock(true, n);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001496
Kent Overstreeta1f03582013-09-10 19:07:00 -07001497 return -EINTR;
1498 }
Kent Overstreetcafe5632013-03-23 16:11:31 -07001499 }
1500
Kent Overstreeta1f03582013-09-10 19:07:00 -07001501 if (b->level) {
1502 ret = btree_gc_recurse(b, op, writes, gc);
1503 if (ret)
1504 return ret;
1505 }
1506
1507 bkey_copy_key(&b->c->gc_done, &b->key);
1508
Kent Overstreetcafe5632013-03-23 16:11:31 -07001509 return ret;
1510}
1511
1512static void btree_gc_start(struct cache_set *c)
1513{
1514 struct cache *ca;
1515 struct bucket *b;
Kent Overstreetcafe5632013-03-23 16:11:31 -07001516 unsigned i;
1517
1518 if (!c->gc_mark_valid)
1519 return;
1520
1521 mutex_lock(&c->bucket_lock);
1522
1523 c->gc_mark_valid = 0;
1524 c->gc_done = ZERO_KEY;
1525
1526 for_each_cache(ca, c, i)
1527 for_each_bucket(b, ca) {
1528 b->gc_gen = b->gen;
Kent Overstreet29ebf462013-07-11 19:43:21 -07001529 if (!atomic_read(&b->pin)) {
Kent Overstreetcafe5632013-03-23 16:11:31 -07001530 SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
Kent Overstreet29ebf462013-07-11 19:43:21 -07001531 SET_GC_SECTORS_USED(b, 0);
1532 }
Kent Overstreetcafe5632013-03-23 16:11:31 -07001533 }
1534
Kent Overstreetcafe5632013-03-23 16:11:31 -07001535 mutex_unlock(&c->bucket_lock);
1536}
1537
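/*
 * Finish a gc pass: under bucket_lock, re-mark the buckets that hold
 * metadata (btree root, uuids, journal and prio buckets) and the buckets
 * pointed to by dirty writeback keys, then count and return how many
 * buckets are reclaimable.
 */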
size_t bch_btree_gc_finish(struct cache_set *c)
{
	size_t available = 0;
	struct bucket *b;
	struct cache *ca;
	unsigned i;

	mutex_lock(&c->bucket_lock);

	set_gc_sectors(c);
	c->gc_mark_valid = 1;
	c->need_gc = 0;

	if (c->root)
		for (i = 0; i < KEY_PTRS(&c->root->key); i++)
			SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i),
				    GC_MARK_METADATA);

	for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
		SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
			    GC_MARK_METADATA);

	/* don't reclaim buckets to which writeback keys point */
	rcu_read_lock();
	for (i = 0; i < c->nr_uuids; i++) {
		struct bcache_device *d = c->devices[i];
		struct cached_dev *dc;
		struct keybuf_key *w, *n;
		unsigned j;

		if (!d || UUID_FLASH_ONLY(&c->uuids[i]))
			continue;
		dc = container_of(d, struct cached_dev, disk);

		spin_lock(&dc->writeback_keys.lock);
		rbtree_postorder_for_each_entry_safe(w, n,
					&dc->writeback_keys.keys, node)
			for (j = 0; j < KEY_PTRS(&w->key); j++)
				SET_GC_MARK(PTR_BUCKET(c, &w->key, j),
					    GC_MARK_DIRTY);
		spin_unlock(&dc->writeback_keys.lock);
	}
	rcu_read_unlock();

	for_each_cache(ca, c, i) {
		uint64_t *i;

		ca->invalidate_needs_gc = 0;

		for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++)
			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);

		for (i = ca->prio_buckets;
		     i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);

		for_each_bucket(b, ca) {
			b->last_gc = b->gc_gen;
			c->need_gc = max(c->need_gc, bucket_gc_gen(b));

			if (!atomic_read(&b->pin) &&
			    GC_MARK(b) == GC_MARK_RECLAIMABLE) {
				available++;
				if (!GC_SECTORS_USED(b))
					bch_bucket_add_unused(ca, b);
			}
		}
	}

	mutex_unlock(&c->bucket_lock);
	return available;
}

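/*
 * One full garbage collection pass: mark everything reachable from the
 * root, retrying the tree walk until it completes, then finish the mark
 * pass, wake the allocators, update the gc statistics and kick off moving
 * gc.
 */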
static void bch_btree_gc(struct cache_set *c)
{
	int ret;
	unsigned long available;
	struct gc_stat stats;
	struct closure writes;
	struct btree_op op;
	uint64_t start_time = local_clock();

	trace_bcache_gc_start(c);

	memset(&stats, 0, sizeof(struct gc_stat));
	closure_init_stack(&writes);
	bch_btree_op_init(&op, SHRT_MAX);

	btree_gc_start(c);

	do {
		ret = btree_root(gc_root, c, &op, &writes, &stats);
		closure_sync(&writes);

		if (ret && ret != -EAGAIN)
			pr_warn("gc failed!");
	} while (ret);

	available = bch_btree_gc_finish(c);
	wake_up_allocators(c);

	bch_time_stats_update(&c->btree_gc_time, start_time);

	stats.key_bytes *= sizeof(uint64_t);
	stats.data <<= 9;
	stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets;
	memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));

	trace_bcache_gc_end(c);

	bch_moving_gc(c);
}

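/*
 * The gc kthread: run a gc pass, then sleep until woken - or run another
 * pass immediately if any cache device signalled that it needs gc before it
 * can invalidate more buckets.
 */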
static int bch_gc_thread(void *arg)
{
	struct cache_set *c = arg;
	struct cache *ca;
	unsigned i;

	while (1) {
again:
		bch_btree_gc(c);

		set_current_state(TASK_INTERRUPTIBLE);
		if (kthread_should_stop())
			break;

		mutex_lock(&c->bucket_lock);

		for_each_cache(ca, c, i)
			if (ca->invalidate_needs_gc) {
				mutex_unlock(&c->bucket_lock);
				set_current_state(TASK_RUNNING);
				goto again;
			}

		mutex_unlock(&c->bucket_lock);

		try_to_freeze();
		schedule();
	}

	return 0;
}

int bch_gc_thread_start(struct cache_set *c)
{
	c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc");
	if (IS_ERR(c->gc_thread))
		return PTR_ERR(c->gc_thread);

	set_task_state(c->gc_thread, TASK_INTERRUPTIBLE);
	return 0;
}

/* Initial partial gc */

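/*
 * Walk the btree at startup and initialize bucket gens and priorities from
 * the pointers in the keys: the first pointer seen for a bucket (tracked in
 * the per-cache seen bitmaps) or any non-stale pointer updates the bucket's
 * gen. Interior nodes prefetch the next child while recursing into the
 * previous one.
 */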
static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
				   unsigned long **seen)
{
	int ret = 0;
	unsigned i;
	struct bkey *k, *p = NULL;
	struct bucket *g;
	struct btree_iter iter;

	for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
		for (i = 0; i < KEY_PTRS(k); i++) {
			if (!ptr_available(b->c, k, i))
				continue;

			g = PTR_BUCKET(b->c, k, i);

			if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i),
						seen[PTR_DEV(k, i)]) ||
			    !ptr_stale(b->c, k, i)) {
				g->gen = PTR_GEN(k, i);

				if (b->level)
					g->prio = BTREE_PRIO;
				else if (g->prio == BTREE_PRIO)
					g->prio = INITIAL_PRIO;
			}
		}

		btree_mark_key(b, k);
	}

	if (b->level) {
		bch_btree_iter_init(b, &iter, NULL);

		do {
			k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
			if (k)
				btree_node_prefetch(b->c, k, b->level - 1);

			if (p)
				ret = btree(check_recurse, p, b, op, seen);

			p = k;
		} while (p && !ret);
	}

	return ret;
}

int bch_btree_check(struct cache_set *c)
{
	int ret = -ENOMEM;
	unsigned i;
	unsigned long *seen[MAX_CACHES_PER_SET];
	struct btree_op op;

	memset(seen, 0, sizeof(seen));
	bch_btree_op_init(&op, SHRT_MAX);

	for (i = 0; c->cache[i]; i++) {
		size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8);
		seen[i] = kmalloc(n, GFP_KERNEL);
		if (!seen[i])
			goto err;

		/* Disables the seen array until prio_read() uses it too */
		memset(seen[i], 0xFF, n);
	}

	ret = btree_root(check_recurse, c, &op, seen);
err:
	for (i = 0; i < MAX_CACHES_PER_SET; i++)
		kfree(seen[i]);
	return ret;
}

/* Btree insertion */

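/*
 * Make room for @insert at @where in the last (still unwritten) bset of @b
 * by shifting the keys after it up, then copy the new key in and patch the
 * lookup table.
 */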
static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert)
{
	struct bset *i = b->sets[b->nsets].data;

	memmove((uint64_t *) where + bkey_u64s(insert),
		where,
		(void *) end(i) - (void *) where);

	i->keys += bkey_u64s(insert);
	bkey_copy(where, insert);
	bch_bset_fix_lookup_table(b, where);
}

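/*
 * Handle extents that overlap the key being inserted: trim or split the
 * existing keys so they no longer overlap @insert, keeping the dirty sector
 * accounting in sync. For replace operations (@replace_key != NULL) also
 * verify that the data being overwritten is still the data we expect;
 * returns true when none of the expected data is found, in which case the
 * insert is dropped as a collision.
 */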
static bool fix_overlapping_extents(struct btree *b, struct bkey *insert,
				    struct btree_iter *iter,
				    struct bkey *replace_key)
{
	void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
	{
		if (KEY_DIRTY(k))
			bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
						     offset, -sectors);
	}

	uint64_t old_offset;
	unsigned old_size, sectors_found = 0;

	while (1) {
		struct bkey *k = bch_btree_iter_next(iter);
		if (!k ||
		    bkey_cmp(&START_KEY(k), insert) >= 0)
			break;

		if (bkey_cmp(k, &START_KEY(insert)) <= 0)
			continue;

		old_offset = KEY_START(k);
		old_size = KEY_SIZE(k);

		/*
		 * We might overlap with 0 size extents; we can't skip these
		 * because if they're in the set we're inserting to we have to
		 * adjust them so they don't overlap with the key we're
		 * inserting. But we don't want to check them for replace
		 * operations.
		 */

		if (replace_key && KEY_SIZE(k)) {
			/*
			 * k might have been split since we inserted/found the
			 * key we're replacing
			 */
			unsigned i;
			uint64_t offset = KEY_START(k) -
				KEY_START(replace_key);

			/* But it must be a subset of the replace key */
			if (KEY_START(k) < KEY_START(replace_key) ||
			    KEY_OFFSET(k) > KEY_OFFSET(replace_key))
				goto check_failed;

			/* We didn't find a key that we were supposed to */
			if (KEY_START(k) > KEY_START(insert) + sectors_found)
				goto check_failed;

			if (KEY_PTRS(k) != KEY_PTRS(replace_key) ||
			    KEY_DIRTY(k) != KEY_DIRTY(replace_key))
				goto check_failed;

			/* skip past gen */
			offset <<= 8;

			BUG_ON(!KEY_PTRS(replace_key));

			for (i = 0; i < KEY_PTRS(replace_key); i++)
				if (k->ptr[i] != replace_key->ptr[i] + offset)
					goto check_failed;

			sectors_found = KEY_OFFSET(k) - KEY_START(insert);
		}

		if (bkey_cmp(insert, k) < 0 &&
		    bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
			/*
			 * We overlapped in the middle of an existing key: that
			 * means we have to split the old key. But we have to do
			 * slightly different things depending on whether the
			 * old key has been written out yet.
			 */

			struct bkey *top;

			subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));

			if (bkey_written(b, k)) {
				/*
				 * We insert a new key to cover the top of the
				 * old key, and the old key is modified in place
				 * to represent the bottom split.
				 *
				 * It's completely arbitrary whether the new key
				 * is the top or the bottom, but it has to match
				 * up with what btree_sort_fixup() does - it
				 * doesn't check for this kind of overlap, it
				 * depends on us inserting a new key for the top
				 * here.
				 */
				top = bch_bset_search(b, &b->sets[b->nsets],
						      insert);
				shift_keys(b, top, k);
			} else {
				BKEY_PADDED(key) temp;
				bkey_copy(&temp.key, k);
				shift_keys(b, k, &temp.key);
				top = bkey_next(k);
			}

			bch_cut_front(insert, top);
			bch_cut_back(&START_KEY(insert), k);
			bch_bset_fix_invalidated_key(b, k);
			return false;
		}

		if (bkey_cmp(insert, k) < 0) {
			bch_cut_front(insert, k);
		} else {
			if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
				old_offset = KEY_START(insert);

			if (bkey_written(b, k) &&
			    bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
				/*
				 * Completely overwrote, so we don't have to
				 * invalidate the binary search tree
				 */
				bch_cut_front(k, k);
			} else {
				__bch_cut_back(&START_KEY(insert), k);
				bch_bset_fix_invalidated_key(b, k);
			}
		}

		subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
	}

check_failed:
	if (replace_key) {
		if (!sectors_found) {
			return true;
		} else if (sectors_found < KEY_SIZE(insert)) {
			SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
				       (KEY_SIZE(insert) - sectors_found));
			SET_KEY_SIZE(insert, sectors_found);
		}
	}

	return false;
}

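/*
 * Insert a single key into node @b. For leaf nodes this resolves overlaps
 * with existing extents, updates the dirty sector accounting and tries to
 * merge with the neighbouring keys; for interior nodes the key is simply
 * inserted at the right position. Returns false if a replace operation
 * detected a collision and the key was not inserted.
 */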
static bool btree_insert_key(struct btree *b, struct btree_op *op,
			     struct bkey *k, struct bkey *replace_key)
{
	struct bset *i = b->sets[b->nsets].data;
	struct bkey *m, *prev;
	unsigned status = BTREE_INSERT_STATUS_INSERT;

	BUG_ON(bkey_cmp(k, &b->key) > 0);
	BUG_ON(b->level && !KEY_PTRS(k));
	BUG_ON(!b->level && !KEY_OFFSET(k));

	if (!b->level) {
		struct btree_iter iter;

		/*
		 * bset_search() returns the first key that is strictly greater
		 * than the search key - but for back merging, we want to find
		 * the previous key.
		 */
		prev = NULL;
		m = bch_btree_iter_init(b, &iter, PRECEDING_KEY(&START_KEY(k)));

		if (fix_overlapping_extents(b, k, &iter, replace_key)) {
			op->insert_collision = true;
			return false;
		}

		if (KEY_DIRTY(k))
			bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
						     KEY_START(k), KEY_SIZE(k));

		while (m != end(i) &&
		       bkey_cmp(k, &START_KEY(m)) > 0)
			prev = m, m = bkey_next(m);

		if (key_merging_disabled(b->c))
			goto insert;

		/* prev is in the tree, if we merge we're done */
		status = BTREE_INSERT_STATUS_BACK_MERGE;
		if (prev &&
		    bch_bkey_try_merge(b, prev, k))
			goto merged;

		status = BTREE_INSERT_STATUS_OVERWROTE;
		if (m != end(i) &&
		    KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
			goto copy;

		status = BTREE_INSERT_STATUS_FRONT_MERGE;
		if (m != end(i) &&
		    bch_bkey_try_merge(b, k, m))
			goto copy;
	} else {
		BUG_ON(replace_key);
		m = bch_bset_search(b, &b->sets[b->nsets], k);
	}

insert:	shift_keys(b, m, k);
copy:	bkey_copy(m, k);
merged:
	bch_check_keys(b, "%u for %s", status,
		       replace_key ? "replace" : "insert");

	if (b->level && !KEY_OFFSET(k))
		btree_current_write(b)->prio_blocked++;

	trace_bcache_btree_insert_key(b, k, replace_key != NULL, status);

	return true;
}

static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
				  struct keylist *insert_keys,
				  struct bkey *replace_key)
{
	bool ret = false;
	int oldsize = bch_count_data(b);

	while (!bch_keylist_empty(insert_keys)) {
		struct bset *i = write_block(b);
		struct bkey *k = insert_keys->keys;

		if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c)
		    > btree_blocks(b))
			break;

		if (bkey_cmp(k, &b->key) <= 0) {
			if (!b->level)
				bkey_put(b->c, k);

			ret |= btree_insert_key(b, op, k, replace_key);
			bch_keylist_pop_front(insert_keys);
		} else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
			BKEY_PADDED(key) temp;
			bkey_copy(&temp.key, insert_keys->keys);

			bch_cut_back(&b->key, &temp.key);
			bch_cut_front(&b->key, insert_keys->keys);

			ret |= btree_insert_key(b, op, &temp.key, replace_key);
			break;
		} else {
			break;
		}
	}

	BUG_ON(!bch_keylist_empty(insert_keys) && b->level);

	BUG_ON(bch_count_data(b) < oldsize);
	return ret;
}

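/*
 * Split (or just rewrite) node @b while inserting @insert_keys. A fresh
 * copy of the node is allocated and, if it would end up more than about 80%
 * full, it is divided into two nodes at roughly the 3/5 point; the keys
 * pointing at the new node(s) are then inserted into the parent (or into a
 * brand new root if @b was the root), and the old node is freed.
 */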
static int btree_split(struct btree *b, struct btree_op *op,
		       struct keylist *insert_keys,
		       struct bkey *replace_key)
{
	bool split;
	struct btree *n1, *n2 = NULL, *n3 = NULL;
	uint64_t start_time = local_clock();
	struct closure cl;
	struct keylist parent_keys;

	closure_init_stack(&cl);
	bch_keylist_init(&parent_keys);

	n1 = btree_node_alloc_replacement(b, true);
	if (IS_ERR(n1))
		goto err;

	split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5;

	if (split) {
		unsigned keys = 0;

		trace_bcache_btree_node_split(b, n1->sets[0].data->keys);

		n2 = bch_btree_node_alloc(b->c, b->level, true);
		if (IS_ERR(n2))
			goto err_free1;

		if (!b->parent) {
			n3 = bch_btree_node_alloc(b->c, b->level + 1, true);
			if (IS_ERR(n3))
				goto err_free2;
		}

		bch_btree_insert_keys(n1, op, insert_keys, replace_key);

		/*
		 * Has to be a linear search because we don't have an auxiliary
		 * search tree yet
		 */

		while (keys < (n1->sets[0].data->keys * 3) / 5)
			keys += bkey_u64s(node(n1->sets[0].data, keys));

		bkey_copy_key(&n1->key, node(n1->sets[0].data, keys));
		keys += bkey_u64s(node(n1->sets[0].data, keys));

		n2->sets[0].data->keys = n1->sets[0].data->keys - keys;
		n1->sets[0].data->keys = keys;

		memcpy(n2->sets[0].data->start,
		       end(n1->sets[0].data),
		       n2->sets[0].data->keys * sizeof(uint64_t));

		bkey_copy_key(&n2->key, &b->key);

		bch_keylist_add(&parent_keys, &n2->key);
		bch_btree_node_write(n2, &cl);
		rw_unlock(true, n2);
	} else {
		trace_bcache_btree_node_compact(b, n1->sets[0].data->keys);

		bch_btree_insert_keys(n1, op, insert_keys, replace_key);
	}

	bch_keylist_add(&parent_keys, &n1->key);
	bch_btree_node_write(n1, &cl);

	if (n3) {
		/* Depth increases, make a new root */
		bkey_copy_key(&n3->key, &MAX_KEY);
		bch_btree_insert_keys(n3, op, &parent_keys, NULL);
		bch_btree_node_write(n3, &cl);

		closure_sync(&cl);
		bch_btree_set_root(n3);
		rw_unlock(true, n3);

		btree_node_free(b);
	} else if (!b->parent) {
		/* Root filled up but didn't need to be split */
		closure_sync(&cl);
		bch_btree_set_root(n1);

		btree_node_free(b);
	} else {
		/* Split a non root node */
		closure_sync(&cl);
		make_btree_freeing_key(b, parent_keys.top);
		bch_keylist_push(&parent_keys);

		btree_node_free(b);

		bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL);
		BUG_ON(!bch_keylist_empty(&parent_keys));
	}

	rw_unlock(true, n1);

	bch_time_stats_update(&b->c->btree_split_time, start_time);

	return 0;
err_free2:
	btree_node_free(n2);
	rw_unlock(true, n2);
err_free1:
	btree_node_free(n1);
	rw_unlock(true, n1);
err:
	if (n3 == ERR_PTR(-EAGAIN) ||
	    n2 == ERR_PTR(-EAGAIN) ||
	    n1 == ERR_PTR(-EAGAIN))
		return -EAGAIN;

	pr_warn("couldn't split");
	return -ENOMEM;
}

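/*
 * Insert @insert_keys into node @b, splitting it first if the keys won't
 * fit. Splitting can require retaking locks higher up the tree, so this may
 * return -EAGAIN (caller is in a bio submission path and must punt to
 * process context) or -EINTR (caller must retry the traversal holding a
 * bigger lock).
 */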
static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
				 struct keylist *insert_keys,
				 atomic_t *journal_ref,
				 struct bkey *replace_key)
{
	BUG_ON(b->level && replace_key);

	if (should_split(b)) {
		if (current->bio_list) {
			op->lock = b->c->root->level + 1;
			return -EAGAIN;
		} else if (op->lock <= b->c->root->level) {
			op->lock = b->c->root->level + 1;
			return -EINTR;
		} else {
			/* Invalidated all iterators */
			return btree_split(b, op, insert_keys, replace_key) ?:
				-EINTR;
		}
	} else {
		BUG_ON(write_block(b) != b->sets[b->nsets].data);

		if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
			if (!b->level)
				bch_btree_leaf_dirty(b, journal_ref);
			else
				bch_btree_node_write_sync(b);
		}

		return 0;
	}
}

int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
			       struct bkey *check_key)
{
	int ret = -EINTR;
	uint64_t btree_ptr = b->key.ptr[0];
	unsigned long seq = b->seq;
	struct keylist insert;
	bool upgrade = op->lock == -1;

	bch_keylist_init(&insert);

	if (upgrade) {
		rw_unlock(false, b);
		rw_lock(true, b, b->level);

		if (b->key.ptr[0] != btree_ptr ||
		    b->seq != seq + 1)
			goto out;
	}

	SET_KEY_PTRS(check_key, 1);
	get_random_bytes(&check_key->ptr[0], sizeof(uint64_t));

	SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV);

	bch_keylist_add(&insert, check_key);

	ret = bch_btree_insert_node(b, op, &insert, NULL, NULL);

	BUG_ON(!ret && !bch_keylist_empty(&insert));
out:
	if (upgrade)
		downgrade_write(&b->lock);
	return ret;
}

struct btree_insert_op {
	struct btree_op op;
	struct keylist *keys;
	atomic_t *journal_ref;
	struct bkey *replace_key;
};

static int btree_insert_fn(struct btree_op *b_op, struct btree *b)
{
	struct btree_insert_op *op = container_of(b_op,
					struct btree_insert_op, op);

	int ret = bch_btree_insert_node(b, &op->op, op->keys,
					op->journal_ref, op->replace_key);
	if (ret && !bch_keylist_empty(op->keys))
		return ret;
	else
		return MAP_DONE;
}

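/*
 * Top level insert: walk the leaf nodes covering @keys and insert into each
 * in turn, looping until the keylist is drained. On failure the remaining
 * keys are dropped (their pointers put back); an insert collision on a
 * replace operation is reported as -ESRCH.
 *
 * Rough usage sketch (the surrounding setup is illustrative, not taken from
 * a specific caller):
 *
 *	struct keylist keys;
 *
 *	bch_keylist_init(&keys);
 *	... append the extent keys to insert ...
 *	ret = bch_btree_insert(c, &keys, journal_ref, NULL);
 */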
int bch_btree_insert(struct cache_set *c, struct keylist *keys,
		     atomic_t *journal_ref, struct bkey *replace_key)
{
	struct btree_insert_op op;
	int ret = 0;

	BUG_ON(current->bio_list);
	BUG_ON(bch_keylist_empty(keys));

	bch_btree_op_init(&op.op, 0);
	op.keys = keys;
	op.journal_ref = journal_ref;
	op.replace_key = replace_key;

	while (!ret && !bch_keylist_empty(keys)) {
		op.op.lock = 0;
		ret = bch_btree_map_leaf_nodes(&op.op, c,
					       &START_KEY(keys->keys),
					       btree_insert_fn);
	}

	if (ret) {
		struct bkey *k;

		pr_err("error %i", ret);

		while ((k = bch_keylist_pop(keys)))
			bkey_put(c, k);
	} else if (op.op.insert_collision)
		ret = -ESRCH;

	return ret;
}

void bch_btree_set_root(struct btree *b)
{
	unsigned i;
	struct closure cl;

	closure_init_stack(&cl);

	trace_bcache_btree_set_root(b);

	BUG_ON(!b->written);

	for (i = 0; i < KEY_PTRS(&b->key); i++)
		BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO);

	mutex_lock(&b->c->bucket_lock);
	list_del_init(&b->list);
	mutex_unlock(&b->c->bucket_lock);

	b->c->root = b;

	bch_journal_meta(b->c, &cl);
	closure_sync(&cl);
}

/* Map across nodes or keys */

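/*
 * Generic traversal helpers: walk the btree from @from, calling @fn on
 * every node (or only on leaves, depending on @flags) or on every key in
 * the leaves. The callback's MAP_DONE/MAP_CONTINUE return value controls
 * whether the walk stops early.
 */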
static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
				       struct bkey *from,
				       btree_map_nodes_fn *fn, int flags)
{
	int ret = MAP_CONTINUE;

	if (b->level) {
		struct bkey *k;
		struct btree_iter iter;

		bch_btree_iter_init(b, &iter, from);

		while ((k = bch_btree_iter_next_filter(&iter, b,
						       bch_ptr_bad))) {
			ret = btree(map_nodes_recurse, k, b,
				    op, from, fn, flags);
			from = NULL;

			if (ret != MAP_CONTINUE)
				return ret;
		}
	}

	if (!b->level || flags == MAP_ALL_NODES)
		ret = fn(op, b);

	return ret;
}

int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
			  struct bkey *from, btree_map_nodes_fn *fn, int flags)
{
	return btree_root(map_nodes_recurse, c, op, from, fn, flags);
}

static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
				      struct bkey *from, btree_map_keys_fn *fn,
				      int flags)
{
	int ret = MAP_CONTINUE;
	struct bkey *k;
	struct btree_iter iter;

	bch_btree_iter_init(b, &iter, from);

	while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) {
		ret = !b->level
			? fn(op, b, k)
			: btree(map_keys_recurse, k, b, op, from, fn, flags);
		from = NULL;

		if (ret != MAP_CONTINUE)
			return ret;
	}

	if (!b->level && (flags & MAP_END_KEY))
		ret = fn(op, b, &KEY(KEY_INODE(&b->key),
				     KEY_OFFSET(&b->key), 0));

	return ret;
}

int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
		       struct bkey *from, btree_map_keys_fn *fn, int flags)
{
	return btree_root(map_keys_recurse, c, op, from, fn, flags);
}

/* Keybuf code */

static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
{
	/* Overlapping keys compare equal */
	if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0)
		return -1;
	if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0)
		return 1;
	return 0;
}

static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
					    struct keybuf_key *r)
{
	return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
}

struct refill {
	struct btree_op op;
	unsigned nr_found;
	struct keybuf *buf;
	struct bkey *end;
	keybuf_pred_fn *pred;
};

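/*
 * map_keys callback for bch_refill_keybuf(): for every key that matches the
 * caller's predicate, grab a free keybuf_key and add it to the buffer's
 * rbtree, stopping when the freelist runs dry or we pass @end.
 */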
static int refill_keybuf_fn(struct btree_op *op, struct btree *b,
			    struct bkey *k)
{
	struct refill *refill = container_of(op, struct refill, op);
	struct keybuf *buf = refill->buf;
	int ret = MAP_CONTINUE;

	if (bkey_cmp(k, refill->end) >= 0) {
		ret = MAP_DONE;
		goto out;
	}

	if (!KEY_SIZE(k)) /* end key */
		goto out;

	if (refill->pred(buf, k)) {
		struct keybuf_key *w;

		spin_lock(&buf->lock);

		w = array_alloc(&buf->freelist);
		if (!w) {
			spin_unlock(&buf->lock);
			return MAP_DONE;
		}

		w->private = NULL;
		bkey_copy(&w->key, k);

		if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
			array_free(&buf->freelist, w);
		else
			refill->nr_found++;

		if (array_freelist_empty(&buf->freelist))
			ret = MAP_DONE;

		spin_unlock(&buf->lock);
	}
out:
	buf->last_scanned = *k;
	return ret;
}

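/*
 * Scan the btree from buf->last_scanned towards @end and refill @buf with
 * keys matching @pred; afterwards buf->start and buf->end are updated to
 * the range actually buffered. Users such as writeback and moving gc drive
 * this via bch_keybuf_next_rescan().
 *
 * Minimal sketch of a predicate (illustrative only):
 *
 *	static bool keep_dirty_pred(struct keybuf *buf, struct bkey *k)
 *	{
 *		return KEY_DIRTY(k);
 *	}
 */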
void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
		       struct bkey *end, keybuf_pred_fn *pred)
{
	struct bkey start = buf->last_scanned;
	struct refill refill;

	cond_resched();

	bch_btree_op_init(&refill.op, -1);
	refill.nr_found = 0;
	refill.buf = buf;
	refill.end = end;
	refill.pred = pred;

	bch_btree_map_keys(&refill.op, c, &buf->last_scanned,
			   refill_keybuf_fn, MAP_END_KEY);

	trace_bcache_keyscan(refill.nr_found,
			     KEY_INODE(&start), KEY_OFFSET(&start),
			     KEY_INODE(&buf->last_scanned),
			     KEY_OFFSET(&buf->last_scanned));

	spin_lock(&buf->lock);

	if (!RB_EMPTY_ROOT(&buf->keys)) {
		struct keybuf_key *w;
		w = RB_FIRST(&buf->keys, struct keybuf_key, node);
		buf->start = START_KEY(&w->key);

		w = RB_LAST(&buf->keys, struct keybuf_key, node);
		buf->end = w->key;
	} else {
		buf->start = MAX_KEY;
		buf->end = MAX_KEY;
	}

	spin_unlock(&buf->lock);
}

static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
{
	rb_erase(&w->node, &buf->keys);
	array_free(&buf->freelist, w);
}

void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
{
	spin_lock(&buf->lock);
	__bch_keybuf_del(buf, w);
	spin_unlock(&buf->lock);
}

bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
				  struct bkey *end)
{
	bool ret = false;
	struct keybuf_key *p, *w, s;
	s.key = *start;

	if (bkey_cmp(end, &buf->start) <= 0 ||
	    bkey_cmp(start, &buf->end) >= 0)
		return false;

	spin_lock(&buf->lock);
	w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);

	while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) {
		p = w;
		w = RB_NEXT(w, node);

		if (p->private)
			ret = true;
		else
			__bch_keybuf_del(buf, p);
	}

	spin_unlock(&buf->lock);
	return ret;
}

struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
{
	struct keybuf_key *w;
	spin_lock(&buf->lock);

	w = RB_FIRST(&buf->keys, struct keybuf_key, node);

	while (w && w->private)
		w = RB_NEXT(w, node);

	if (w)
		w->private = ERR_PTR(-EINTR);

	spin_unlock(&buf->lock);
	return w;
}

struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
					  struct keybuf *buf,
					  struct bkey *end,
					  keybuf_pred_fn *pred)
{
	struct keybuf_key *ret;

	while (1) {
		ret = bch_keybuf_next(buf);
		if (ret)
			break;

		if (bkey_cmp(&buf->last_scanned, end) >= 0) {
			pr_debug("scan finished");
			break;
		}

		bch_refill_keybuf(c, buf, end, pred);
	}

	return ret;
}

void bch_keybuf_init(struct keybuf *buf)
{
	buf->last_scanned = MAX_KEY;
	buf->keys = RB_ROOT;

	spin_lock_init(&buf->lock);
	array_allocator_init(&buf->freelist);
}

void bch_btree_exit(void)
{
	if (btree_io_wq)
		destroy_workqueue(btree_io_wq);
}

int __init bch_btree_init(void)
{
	btree_io_wq = create_singlethread_workqueue("bch_btree_io");
	if (!btree_io_wq)
		return -ENOMEM;

	return 0;
}