blob: 935d90df397b8481e3de2f2a453356d7637567d3 [file] [log] [blame]
/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 *
 * Uses a block device as cache for other block devices; optimized for SSDs.
 * All allocation is done in buckets, which should match the erase block size
 * of the device.
 *
 * Buckets containing cached data are kept on a heap sorted by priority;
 * bucket priority is increased on cache hit, and periodically all the buckets
 * on the heap have their priority scaled down. This currently is just used as
 * an LRU but in the future should allow for more intelligent heuristics.
 *
 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
 * counter. Garbage collection is used to remove stale pointers.
 *
 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
 * as keys are inserted we only sort the pages that have not yet been written.
 * When garbage collection is run, we re-sort the entire node.
 *
 * All configuration is done via sysfs; see Documentation/bcache.txt.
 */
22
23#include "bcache.h"
24#include "btree.h"
25#include "debug.h"
26#include "request.h"
Kent Overstreet279afba2013-06-05 06:21:07 -070027#include "writeback.h"
Kent Overstreetcafe5632013-03-23 16:11:31 -070028
29#include <linux/slab.h>
30#include <linux/bitops.h>
31#include <linux/hash.h>
Geert Uytterhoevencd953ed2013-03-27 18:56:28 +010032#include <linux/prefetch.h>
Kent Overstreetcafe5632013-03-23 16:11:31 -070033#include <linux/random.h>
34#include <linux/rcupdate.h>
35#include <trace/events/bcache.h>
36
37/*
38 * Todo:
39 * register_bcache: Return errors out to userspace correctly
40 *
41 * Writeback: don't undirty key until after a cache flush
42 *
43 * Create an iterator for key pointers
44 *
45 * On btree write error, mark bucket such that it won't be freed from the cache
46 *
47 * Journalling:
48 * Check for bad keys in replay
49 * Propagate barriers
50 * Refcount journal entries in journal_replay
51 *
52 * Garbage collection:
53 * Finish incremental gc
54 * Gc should free old UUIDs, data for invalid UUIDs
55 *
56 * Provide a way to list backing device UUIDs we have data cached for, and
57 * probably how long it's been since we've seen them, and a way to invalidate
58 * dirty data for devices that will never be attached again
59 *
60 * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
61 * that based on that and how much dirty data we have we can keep writeback
62 * from being starved
63 *
64 * Add a tracepoint or somesuch to watch for writeback starvation
65 *
66 * When btree depth > 1 and splitting an interior node, we have to make sure
67 * alloc_bucket() cannot fail. This should be true but is not completely
68 * obvious.
69 *
70 * Make sure all allocations get charged to the root cgroup
71 *
72 * Plugging?
73 *
74 * If data write is less than hard sector size of ssd, round up offset in open
75 * bucket to the next whole sector
76 *
77 * Also lookup by cgroup in get_open_bucket()
78 *
79 * Superblock needs to be fleshed out for multiple cache devices
80 *
81 * Add a sysfs tunable for the number of writeback IOs in flight
82 *
83 * Add a sysfs tunable for the number of open data buckets
84 *
85 * IO tracking: Can we track when one process is doing io on behalf of another?
86 * IO tracking: Don't use just an average, weigh more recent stuff higher
87 *
88 * Test module load/unload
89 */
90
91static const char * const op_types[] = {
92 "insert", "replace"
93};
94
95static const char *op_type(struct btree_op *op)
96{
97 return op_types[op->type];
98}
99
#define MAX_NEED_GC		64
#define MAX_SAVE_PRIO		72

#define PTR_DIRTY_BIT		(((uint64_t) 1 << 36))

/*
 * Value used to hash and compare btree nodes in the in-memory cache: the
 * first pointer of key @k shifted right by the cache set's bucket_bits,
 * OR'd with the generation of that pointer.
 *
 * @c is parenthesized so that expressions such as "sets + 1" can be passed
 * safely.
 */
#define PTR_HASH(c, k)							\
	(((k)->ptr[0] >> (c)->bucket_bits) | PTR_GEN(k, 0))

/* Workqueue handle shared with other files (non-static). */
struct workqueue_struct *bch_gc_wq;
/* Workqueue on which delayed btree node writes are queued. */
static struct workqueue_struct *btree_io_wq;
110
111void bch_btree_op_init_stack(struct btree_op *op)
112{
113 memset(op, 0, sizeof(struct btree_op));
114 closure_init_stack(&op->cl);
115 op->lock = -1;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700116}
117
118/* Btree key manipulation */
119
Kent Overstreete7c590e2013-09-10 18:39:16 -0700120void __bkey_put(struct cache_set *c, struct bkey *k)
121{
122 unsigned i;
123
124 for (i = 0; i < KEY_PTRS(k); i++)
125 if (ptr_available(c, k, i))
126 atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
127}
128
/*
 * Unpin the buckets referenced by @k. For level 0 this is unconditional;
 * for level > 0 it is done only when the key has a nonzero offset.
 */
static void bkey_put(struct cache_set *c, struct bkey *k, int level)
{
	if (level && !KEY_OFFSET(k))
		return;

	__bkey_put(c, k);
}
134
135/* Btree IO */
136
137static uint64_t btree_csum_set(struct btree *b, struct bset *i)
138{
139 uint64_t crc = b->key.ptr[0];
140 void *data = (void *) i + 8, *end = end(i);
141
Kent Overstreet169ef1c2013-03-28 12:50:55 -0600142 crc = bch_crc64_update(crc, data, end - data);
Kent Overstreetc19ed232013-03-26 13:49:02 -0700143 return crc ^ 0xffffffffffffffffULL;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700144}
145
Kent Overstreetf3059a52013-05-15 17:13:45 -0700146static void bch_btree_node_read_done(struct btree *b)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700147{
Kent Overstreetcafe5632013-03-23 16:11:31 -0700148 const char *err = "bad btree header";
Kent Overstreet57943512013-04-25 13:58:35 -0700149 struct bset *i = b->sets[0].data;
150 struct btree_iter *iter;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700151
Kent Overstreet57943512013-04-25 13:58:35 -0700152 iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
153 iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700154 iter->used = 0;
155
Kent Overstreet57943512013-04-25 13:58:35 -0700156 if (!i->seq)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700157 goto err;
158
159 for (;
160 b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq;
161 i = write_block(b)) {
162 err = "unsupported bset version";
163 if (i->version > BCACHE_BSET_VERSION)
164 goto err;
165
166 err = "bad btree header";
167 if (b->written + set_blocks(i, b->c) > btree_blocks(b))
168 goto err;
169
170 err = "bad magic";
171 if (i->magic != bset_magic(b->c))
172 goto err;
173
174 err = "bad checksum";
175 switch (i->version) {
176 case 0:
177 if (i->csum != csum_set(i))
178 goto err;
179 break;
180 case BCACHE_BSET_VERSION:
181 if (i->csum != btree_csum_set(b, i))
182 goto err;
183 break;
184 }
185
186 err = "empty set";
187 if (i != b->sets[0].data && !i->keys)
188 goto err;
189
190 bch_btree_iter_push(iter, i->start, end(i));
191
192 b->written += set_blocks(i, b->c);
193 }
194
195 err = "corrupted btree";
196 for (i = write_block(b);
197 index(i, b) < btree_blocks(b);
198 i = ((void *) i) + block_bytes(b->c))
199 if (i->seq == b->sets[0].data->seq)
200 goto err;
201
202 bch_btree_sort_and_fix_extents(b, iter);
203
204 i = b->sets[0].data;
205 err = "short btree key";
206 if (b->sets[0].size &&
207 bkey_cmp(&b->key, &b->sets[0].end) < 0)
208 goto err;
209
210 if (b->written < btree_blocks(b))
211 bch_bset_init_next(b);
212out:
Kent Overstreet57943512013-04-25 13:58:35 -0700213 mempool_free(iter, b->c->fill_iter);
214 return;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700215err:
216 set_btree_node_io_error(b);
Kent Overstreet07e86cc2013-03-25 11:46:43 -0700217 bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys",
Kent Overstreetcafe5632013-03-23 16:11:31 -0700218 err, PTR_BUCKET_NR(b->c, &b->key, 0),
219 index(i, b), i->keys);
220 goto out;
221}
222
Kent Overstreet57943512013-04-25 13:58:35 -0700223static void btree_node_read_endio(struct bio *bio, int error)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700224{
Kent Overstreet57943512013-04-25 13:58:35 -0700225 struct closure *cl = bio->bi_private;
226 closure_put(cl);
227}
Kent Overstreetcafe5632013-03-23 16:11:31 -0700228
Kent Overstreet57943512013-04-25 13:58:35 -0700229void bch_btree_node_read(struct btree *b)
230{
231 uint64_t start_time = local_clock();
232 struct closure cl;
233 struct bio *bio;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700234
Kent Overstreetc37511b2013-04-26 15:39:55 -0700235 trace_bcache_btree_read(b);
236
Kent Overstreet57943512013-04-25 13:58:35 -0700237 closure_init_stack(&cl);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700238
Kent Overstreet57943512013-04-25 13:58:35 -0700239 bio = bch_bbio_alloc(b->c);
240 bio->bi_rw = REQ_META|READ_SYNC;
241 bio->bi_size = KEY_SIZE(&b->key) << 9;
242 bio->bi_end_io = btree_node_read_endio;
243 bio->bi_private = &cl;
244
245 bch_bio_map(bio, b->sets[0].data);
246
Kent Overstreet57943512013-04-25 13:58:35 -0700247 bch_submit_bbio(bio, b->c, &b->key, 0);
248 closure_sync(&cl);
249
250 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
251 set_btree_node_io_error(b);
252
253 bch_bbio_free(bio, b->c);
254
255 if (btree_node_io_error(b))
256 goto err;
257
258 bch_btree_node_read_done(b);
259
260 spin_lock(&b->c->btree_read_time_lock);
261 bch_time_stats_update(&b->c->btree_read_time, start_time);
262 spin_unlock(&b->c->btree_read_time_lock);
263
264 return;
265err:
Geert Uytterhoeven61cbd252013-09-23 23:17:30 -0700266 bch_cache_set_error(b->c, "io error reading bucket %zu",
Kent Overstreet57943512013-04-25 13:58:35 -0700267 PTR_BUCKET_NR(b->c, &b->key, 0));
Kent Overstreetcafe5632013-03-23 16:11:31 -0700268}
269
270static void btree_complete_write(struct btree *b, struct btree_write *w)
271{
272 if (w->prio_blocked &&
273 !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
Kent Overstreet119ba0f2013-04-24 19:01:12 -0700274 wake_up_allocators(b->c);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700275
276 if (w->journal) {
277 atomic_dec_bug(w->journal);
278 __closure_wake_up(&b->c->journal.wait);
279 }
280
Kent Overstreetcafe5632013-03-23 16:11:31 -0700281 w->prio_blocked = 0;
282 w->journal = NULL;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700283}
284
Kent Overstreet57943512013-04-25 13:58:35 -0700285static void __btree_node_write_done(struct closure *cl)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700286{
287 struct btree *b = container_of(cl, struct btree, io.cl);
288 struct btree_write *w = btree_prev_write(b);
289
290 bch_bbio_free(b->bio, b->c);
291 b->bio = NULL;
292 btree_complete_write(b, w);
293
294 if (btree_node_dirty(b))
295 queue_delayed_work(btree_io_wq, &b->work,
296 msecs_to_jiffies(30000));
297
298 closure_return(cl);
299}
300
Kent Overstreet57943512013-04-25 13:58:35 -0700301static void btree_node_write_done(struct closure *cl)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700302{
303 struct btree *b = container_of(cl, struct btree, io.cl);
304 struct bio_vec *bv;
305 int n;
306
307 __bio_for_each_segment(bv, b->bio, n, 0)
308 __free_page(bv->bv_page);
309
Kent Overstreet57943512013-04-25 13:58:35 -0700310 __btree_node_write_done(cl);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700311}
312
Kent Overstreet57943512013-04-25 13:58:35 -0700313static void btree_node_write_endio(struct bio *bio, int error)
314{
315 struct closure *cl = bio->bi_private;
316 struct btree *b = container_of(cl, struct btree, io.cl);
317
318 if (error)
319 set_btree_node_io_error(b);
320
321 bch_bbio_count_io_errors(b->c, bio, error, "writing btree");
322 closure_put(cl);
323}
324
325static void do_btree_node_write(struct btree *b)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700326{
327 struct closure *cl = &b->io.cl;
328 struct bset *i = b->sets[b->nsets].data;
329 BKEY_PADDED(key) k;
330
331 i->version = BCACHE_BSET_VERSION;
332 i->csum = btree_csum_set(b, i);
333
Kent Overstreet57943512013-04-25 13:58:35 -0700334 BUG_ON(b->bio);
335 b->bio = bch_bbio_alloc(b->c);
336
337 b->bio->bi_end_io = btree_node_write_endio;
338 b->bio->bi_private = &b->io.cl;
Kent Overstreete49c7c32013-06-26 17:25:38 -0700339 b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA;
340 b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c);
Kent Overstreet169ef1c2013-03-28 12:50:55 -0600341 bch_bio_map(b->bio, i);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700342
Kent Overstreete49c7c32013-06-26 17:25:38 -0700343 /*
344 * If we're appending to a leaf node, we don't technically need FUA -
345 * this write just needs to be persisted before the next journal write,
346 * which will be marked FLUSH|FUA.
347 *
348 * Similarly if we're writing a new btree root - the pointer is going to
349 * be in the next journal entry.
350 *
351 * But if we're writing a new btree node (that isn't a root) or
352 * appending to a non leaf btree node, we need either FUA or a flush
353 * when we write the parent with the new pointer. FUA is cheaper than a
354 * flush, and writes appending to leaf nodes aren't blocking anything so
355 * just make all btree node writes FUA to keep things sane.
356 */
357
Kent Overstreetcafe5632013-03-23 16:11:31 -0700358 bkey_copy(&k.key, &b->key);
359 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));
360
Kent Overstreet8e51e412013-06-06 18:15:57 -0700361 if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
Kent Overstreetcafe5632013-03-23 16:11:31 -0700362 int j;
363 struct bio_vec *bv;
364 void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
365
366 bio_for_each_segment(bv, b->bio, j)
367 memcpy(page_address(bv->bv_page),
368 base + j * PAGE_SIZE, PAGE_SIZE);
369
Kent Overstreetcafe5632013-03-23 16:11:31 -0700370 bch_submit_bbio(b->bio, b->c, &k.key, 0);
371
Kent Overstreet57943512013-04-25 13:58:35 -0700372 continue_at(cl, btree_node_write_done, NULL);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700373 } else {
374 b->bio->bi_vcnt = 0;
Kent Overstreet169ef1c2013-03-28 12:50:55 -0600375 bch_bio_map(b->bio, i);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700376
Kent Overstreetcafe5632013-03-23 16:11:31 -0700377 bch_submit_bbio(b->bio, b->c, &k.key, 0);
378
379 closure_sync(cl);
Kent Overstreet57943512013-04-25 13:58:35 -0700380 __btree_node_write_done(cl);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700381 }
382}
383
Kent Overstreet57943512013-04-25 13:58:35 -0700384void bch_btree_node_write(struct btree *b, struct closure *parent)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700385{
386 struct bset *i = b->sets[b->nsets].data;
387
Kent Overstreetc37511b2013-04-26 15:39:55 -0700388 trace_bcache_btree_write(b);
389
Kent Overstreetcafe5632013-03-23 16:11:31 -0700390 BUG_ON(current->bio_list);
Kent Overstreet57943512013-04-25 13:58:35 -0700391 BUG_ON(b->written >= btree_blocks(b));
392 BUG_ON(b->written && !i->keys);
393 BUG_ON(b->sets->data->seq != i->seq);
Kent Overstreetc37511b2013-04-26 15:39:55 -0700394 bch_check_key_order(b, i);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700395
Kent Overstreetcafe5632013-03-23 16:11:31 -0700396 cancel_delayed_work(&b->work);
397
Kent Overstreet57943512013-04-25 13:58:35 -0700398 /* If caller isn't waiting for write, parent refcount is cache set */
399 closure_lock(&b->io, parent ?: &b->c->cl);
400
Kent Overstreetcafe5632013-03-23 16:11:31 -0700401 clear_bit(BTREE_NODE_dirty, &b->flags);
402 change_bit(BTREE_NODE_write_idx, &b->flags);
403
Kent Overstreet57943512013-04-25 13:58:35 -0700404 do_btree_node_write(b);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700405
Kent Overstreetcafe5632013-03-23 16:11:31 -0700406 b->written += set_blocks(i, b->c);
407 atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
408 &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
409
410 bch_btree_sort_lazy(b);
411
412 if (b->written < btree_blocks(b))
413 bch_bset_init_next(b);
414}
415
Kent Overstreet57943512013-04-25 13:58:35 -0700416static void btree_node_write_work(struct work_struct *w)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700417{
418 struct btree *b = container_of(to_delayed_work(w), struct btree, work);
419
Kent Overstreet57943512013-04-25 13:58:35 -0700420 rw_lock(true, b, b->level);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700421
422 if (btree_node_dirty(b))
Kent Overstreet57943512013-04-25 13:58:35 -0700423 bch_btree_node_write(b, NULL);
424 rw_unlock(true, b);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700425}
426
Kent Overstreet57943512013-04-25 13:58:35 -0700427static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700428{
429 struct bset *i = b->sets[b->nsets].data;
430 struct btree_write *w = btree_current_write(b);
431
Kent Overstreet57943512013-04-25 13:58:35 -0700432 BUG_ON(!b->written);
433 BUG_ON(!i->keys);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700434
Kent Overstreet57943512013-04-25 13:58:35 -0700435 if (!btree_node_dirty(b))
436 queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700437
Kent Overstreet57943512013-04-25 13:58:35 -0700438 set_btree_node_dirty(b);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700439
Kent Overstreete8e1d462013-07-24 17:27:07 -0700440 if (op->journal) {
Kent Overstreetcafe5632013-03-23 16:11:31 -0700441 if (w->journal &&
442 journal_pin_cmp(b->c, w, op)) {
443 atomic_dec_bug(w->journal);
444 w->journal = NULL;
445 }
446
447 if (!w->journal) {
448 w->journal = op->journal;
449 atomic_inc(w->journal);
450 }
451 }
452
Kent Overstreetcafe5632013-03-23 16:11:31 -0700453 /* Force write if set is too big */
Kent Overstreet57943512013-04-25 13:58:35 -0700454 if (set_bytes(i) > PAGE_SIZE - 48 &&
455 !current->bio_list)
456 bch_btree_node_write(b, NULL);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700457}
458
459/*
460 * Btree in memory cache - allocation/freeing
461 * mca -> memory cache
462 */
463
464static void mca_reinit(struct btree *b)
465{
466 unsigned i;
467
468 b->flags = 0;
469 b->written = 0;
470 b->nsets = 0;
471
472 for (i = 0; i < MAX_BSETS; i++)
473 b->sets[i].size = 0;
474 /*
475 * Second loop starts at 1 because b->sets[0]->data is the memory we
476 * allocated
477 */
478 for (i = 1; i < MAX_BSETS; i++)
479 b->sets[i].data = NULL;
480}
481
/*
 * Number of cached btree nodes that must always be kept around: 8 per
 * level of the root (treating a missing root or level 0 as 1) plus 16.
 * The macro argument is parenthesized so any pointer expression is safe.
 */
#define mca_reserve(c)	((((c)->root && (c)->root->level)		\
			  ? (c)->root->level : 1) * 8 + 16)
/* How many cached nodes are in use beyond the reserve (never negative). */
#define mca_can_free(c)						\
	max_t(int, 0, (c)->bucket_cache_used - mca_reserve(c))
486
487static void mca_data_free(struct btree *b)
488{
489 struct bset_tree *t = b->sets;
490 BUG_ON(!closure_is_unlocked(&b->io.cl));
491
492 if (bset_prev_bytes(b) < PAGE_SIZE)
493 kfree(t->prev);
494 else
495 free_pages((unsigned long) t->prev,
496 get_order(bset_prev_bytes(b)));
497
498 if (bset_tree_bytes(b) < PAGE_SIZE)
499 kfree(t->tree);
500 else
501 free_pages((unsigned long) t->tree,
502 get_order(bset_tree_bytes(b)));
503
504 free_pages((unsigned long) t->data, b->page_order);
505
506 t->prev = NULL;
507 t->tree = NULL;
508 t->data = NULL;
509 list_move(&b->list, &b->c->btree_cache_freed);
510 b->c->bucket_cache_used--;
511}
512
513static void mca_bucket_free(struct btree *b)
514{
515 BUG_ON(btree_node_dirty(b));
516
517 b->key.ptr[0] = 0;
518 hlist_del_init_rcu(&b->hash);
519 list_move(&b->list, &b->c->btree_cache_freeable);
520}
521
522static unsigned btree_order(struct bkey *k)
523{
524 return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1);
525}
526
527static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
528{
529 struct bset_tree *t = b->sets;
530 BUG_ON(t->data);
531
532 b->page_order = max_t(unsigned,
533 ilog2(b->c->btree_pages),
534 btree_order(k));
535
536 t->data = (void *) __get_free_pages(gfp, b->page_order);
537 if (!t->data)
538 goto err;
539
540 t->tree = bset_tree_bytes(b) < PAGE_SIZE
541 ? kmalloc(bset_tree_bytes(b), gfp)
542 : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
543 if (!t->tree)
544 goto err;
545
546 t->prev = bset_prev_bytes(b) < PAGE_SIZE
547 ? kmalloc(bset_prev_bytes(b), gfp)
548 : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
549 if (!t->prev)
550 goto err;
551
552 list_move(&b->list, &b->c->btree_cache);
553 b->c->bucket_cache_used++;
554 return;
555err:
556 mca_data_free(b);
557}
558
559static struct btree *mca_bucket_alloc(struct cache_set *c,
560 struct bkey *k, gfp_t gfp)
561{
562 struct btree *b = kzalloc(sizeof(struct btree), gfp);
563 if (!b)
564 return NULL;
565
566 init_rwsem(&b->lock);
567 lockdep_set_novalidate_class(&b->lock);
568 INIT_LIST_HEAD(&b->list);
Kent Overstreet57943512013-04-25 13:58:35 -0700569 INIT_DELAYED_WORK(&b->work, btree_node_write_work);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700570 b->c = c;
571 closure_init_unlocked(&b->io);
572
573 mca_data_alloc(b, k, gfp);
574 return b;
575}
576
Kent Overstreete8e1d462013-07-24 17:27:07 -0700577static int mca_reap(struct btree *b, unsigned min_order, bool flush)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700578{
Kent Overstreete8e1d462013-07-24 17:27:07 -0700579 struct closure cl;
580
581 closure_init_stack(&cl);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700582 lockdep_assert_held(&b->c->bucket_lock);
583
584 if (!down_write_trylock(&b->lock))
585 return -ENOMEM;
586
Kent Overstreete8e1d462013-07-24 17:27:07 -0700587 BUG_ON(btree_node_dirty(b) && !b->sets[0].data);
588
589 if (b->page_order < min_order ||
590 (!flush &&
591 (btree_node_dirty(b) ||
592 atomic_read(&b->io.cl.remaining) != -1))) {
Kent Overstreetcafe5632013-03-23 16:11:31 -0700593 rw_unlock(true, b);
594 return -ENOMEM;
595 }
596
Kent Overstreete8e1d462013-07-24 17:27:07 -0700597 if (btree_node_dirty(b)) {
598 bch_btree_node_write(b, &cl);
599 closure_sync(&cl);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700600 }
601
Kent Overstreete8e1d462013-07-24 17:27:07 -0700602 /* wait for any in flight btree write */
603 closure_wait_event_sync(&b->io.wait, &cl,
604 atomic_read(&b->io.cl.remaining) == -1);
605
Kent Overstreetcafe5632013-03-23 16:11:31 -0700606 return 0;
607}
608
Dave Chinner7dc19d52013-08-28 10:18:11 +1000609static unsigned long bch_mca_scan(struct shrinker *shrink,
610 struct shrink_control *sc)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700611{
612 struct cache_set *c = container_of(shrink, struct cache_set, shrink);
613 struct btree *b, *t;
614 unsigned long i, nr = sc->nr_to_scan;
Dave Chinner7dc19d52013-08-28 10:18:11 +1000615 unsigned long freed = 0;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700616
617 if (c->shrinker_disabled)
Dave Chinner7dc19d52013-08-28 10:18:11 +1000618 return SHRINK_STOP;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700619
620 if (c->try_harder)
Dave Chinner7dc19d52013-08-28 10:18:11 +1000621 return SHRINK_STOP;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700622
623 /* Return -1 if we can't do anything right now */
Kent Overstreeta698e082013-09-23 23:17:34 -0700624 if (sc->gfp_mask & __GFP_IO)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700625 mutex_lock(&c->bucket_lock);
626 else if (!mutex_trylock(&c->bucket_lock))
627 return -1;
628
Kent Overstreet36c9ea92013-06-03 13:04:56 -0700629 /*
630 * It's _really_ critical that we don't free too many btree nodes - we
631 * have to always leave ourselves a reserve. The reserve is how we
632 * guarantee that allocating memory for a new btree node can always
633 * succeed, so that inserting keys into the btree can always succeed and
634 * IO can always make forward progress:
635 */
Kent Overstreetcafe5632013-03-23 16:11:31 -0700636 nr /= c->btree_pages;
637 nr = min_t(unsigned long, nr, mca_can_free(c));
638
639 i = 0;
640 list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
Dave Chinner7dc19d52013-08-28 10:18:11 +1000641 if (freed >= nr)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700642 break;
643
644 if (++i > 3 &&
Kent Overstreete8e1d462013-07-24 17:27:07 -0700645 !mca_reap(b, 0, false)) {
Kent Overstreetcafe5632013-03-23 16:11:31 -0700646 mca_data_free(b);
647 rw_unlock(true, b);
Dave Chinner7dc19d52013-08-28 10:18:11 +1000648 freed++;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700649 }
650 }
651
652 /*
653 * Can happen right when we first start up, before we've read in any
654 * btree nodes
655 */
656 if (list_empty(&c->btree_cache))
657 goto out;
658
Dave Chinner7dc19d52013-08-28 10:18:11 +1000659 for (i = 0; (nr--) && i < c->bucket_cache_used; i++) {
Kent Overstreetcafe5632013-03-23 16:11:31 -0700660 b = list_first_entry(&c->btree_cache, struct btree, list);
661 list_rotate_left(&c->btree_cache);
662
663 if (!b->accessed &&
Kent Overstreete8e1d462013-07-24 17:27:07 -0700664 !mca_reap(b, 0, false)) {
Kent Overstreetcafe5632013-03-23 16:11:31 -0700665 mca_bucket_free(b);
666 mca_data_free(b);
667 rw_unlock(true, b);
Dave Chinner7dc19d52013-08-28 10:18:11 +1000668 freed++;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700669 } else
670 b->accessed = 0;
671 }
672out:
Kent Overstreetcafe5632013-03-23 16:11:31 -0700673 mutex_unlock(&c->bucket_lock);
Dave Chinner7dc19d52013-08-28 10:18:11 +1000674 return freed;
675}
676
677static unsigned long bch_mca_count(struct shrinker *shrink,
678 struct shrink_control *sc)
679{
680 struct cache_set *c = container_of(shrink, struct cache_set, shrink);
681
682 if (c->shrinker_disabled)
683 return 0;
684
685 if (c->try_harder)
686 return 0;
687
688 return mca_can_free(c) * c->btree_pages;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700689}
690
691void bch_btree_cache_free(struct cache_set *c)
692{
693 struct btree *b;
694 struct closure cl;
695 closure_init_stack(&cl);
696
697 if (c->shrink.list.next)
698 unregister_shrinker(&c->shrink);
699
700 mutex_lock(&c->bucket_lock);
701
702#ifdef CONFIG_BCACHE_DEBUG
703 if (c->verify_data)
704 list_move(&c->verify_data->list, &c->btree_cache);
705#endif
706
707 list_splice(&c->btree_cache_freeable,
708 &c->btree_cache);
709
710 while (!list_empty(&c->btree_cache)) {
711 b = list_first_entry(&c->btree_cache, struct btree, list);
712
713 if (btree_node_dirty(b))
714 btree_complete_write(b, btree_current_write(b));
715 clear_bit(BTREE_NODE_dirty, &b->flags);
716
717 mca_data_free(b);
718 }
719
720 while (!list_empty(&c->btree_cache_freed)) {
721 b = list_first_entry(&c->btree_cache_freed,
722 struct btree, list);
723 list_del(&b->list);
724 cancel_delayed_work_sync(&b->work);
725 kfree(b);
726 }
727
728 mutex_unlock(&c->bucket_lock);
729}
730
731int bch_btree_cache_alloc(struct cache_set *c)
732{
733 unsigned i;
734
735 /* XXX: doesn't check for errors */
736
737 closure_init_unlocked(&c->gc);
738
739 for (i = 0; i < mca_reserve(c); i++)
740 mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
741
742 list_splice_init(&c->btree_cache,
743 &c->btree_cache_freeable);
744
745#ifdef CONFIG_BCACHE_DEBUG
746 mutex_init(&c->verify_lock);
747
748 c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
749
750 if (c->verify_data &&
751 c->verify_data->sets[0].data)
752 list_del_init(&c->verify_data->list);
753 else
754 c->verify_data = NULL;
755#endif
756
Dave Chinner7dc19d52013-08-28 10:18:11 +1000757 c->shrink.count_objects = bch_mca_count;
758 c->shrink.scan_objects = bch_mca_scan;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700759 c->shrink.seeks = 4;
760 c->shrink.batch = c->btree_pages * 2;
761 register_shrinker(&c->shrink);
762
763 return 0;
764}
765
766/* Btree in memory cache - hash table */
767
768static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k)
769{
770 return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)];
771}
772
773static struct btree *mca_find(struct cache_set *c, struct bkey *k)
774{
775 struct btree *b;
776
777 rcu_read_lock();
778 hlist_for_each_entry_rcu(b, mca_hash(c, k), hash)
779 if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
780 goto out;
781 b = NULL;
782out:
783 rcu_read_unlock();
784 return b;
785}
786
Kent Overstreete8e1d462013-07-24 17:27:07 -0700787static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700788{
Kent Overstreete8e1d462013-07-24 17:27:07 -0700789 struct btree *b;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700790
Kent Overstreetc37511b2013-04-26 15:39:55 -0700791 trace_bcache_btree_cache_cannibalize(c);
792
Kent Overstreete8e1d462013-07-24 17:27:07 -0700793 if (!c->try_harder) {
794 c->try_harder = current;
795 c->try_harder_start = local_clock();
796 } else if (c->try_harder != current)
797 return ERR_PTR(-ENOSPC);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700798
Kent Overstreete8e1d462013-07-24 17:27:07 -0700799 list_for_each_entry_reverse(b, &c->btree_cache, list)
800 if (!mca_reap(b, btree_order(k), false))
801 return b;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700802
Kent Overstreete8e1d462013-07-24 17:27:07 -0700803 list_for_each_entry_reverse(b, &c->btree_cache, list)
804 if (!mca_reap(b, btree_order(k), true))
805 return b;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700806
Kent Overstreete8e1d462013-07-24 17:27:07 -0700807 return ERR_PTR(-ENOMEM);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700808}
809
810/*
811 * We can only have one thread cannibalizing other cached btree nodes at a time,
812 * or we'll deadlock. We use an open coded mutex to ensure that, which a
813 * cannibalize_bucket() will take. This means every time we unlock the root of
814 * the btree, we need to release this lock if we have it held.
815 */
Kent Overstreet35fcd842013-07-24 17:29:09 -0700816void bch_cannibalize_unlock(struct cache_set *c)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700817{
Kent Overstreete8e1d462013-07-24 17:27:07 -0700818 if (c->try_harder == current) {
Kent Overstreet169ef1c2013-03-28 12:50:55 -0600819 bch_time_stats_update(&c->try_harder_time, c->try_harder_start);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700820 c->try_harder = NULL;
Kent Overstreete8e1d462013-07-24 17:27:07 -0700821 wake_up(&c->try_wait);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700822 }
823}
824
Kent Overstreete8e1d462013-07-24 17:27:07 -0700825static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700826{
827 struct btree *b;
828
Kent Overstreete8e1d462013-07-24 17:27:07 -0700829 BUG_ON(current->bio_list);
830
Kent Overstreetcafe5632013-03-23 16:11:31 -0700831 lockdep_assert_held(&c->bucket_lock);
832
833 if (mca_find(c, k))
834 return NULL;
835
836 /* btree_free() doesn't free memory; it sticks the node on the end of
837 * the list. Check if there's any freed nodes there:
838 */
839 list_for_each_entry(b, &c->btree_cache_freeable, list)
Kent Overstreete8e1d462013-07-24 17:27:07 -0700840 if (!mca_reap(b, btree_order(k), false))
Kent Overstreetcafe5632013-03-23 16:11:31 -0700841 goto out;
842
843 /* We never free struct btree itself, just the memory that holds the on
844 * disk node. Check the freed list before allocating a new one:
845 */
846 list_for_each_entry(b, &c->btree_cache_freed, list)
Kent Overstreete8e1d462013-07-24 17:27:07 -0700847 if (!mca_reap(b, 0, false)) {
Kent Overstreetcafe5632013-03-23 16:11:31 -0700848 mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
849 if (!b->sets[0].data)
850 goto err;
851 else
852 goto out;
853 }
854
855 b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO);
856 if (!b)
857 goto err;
858
859 BUG_ON(!down_write_trylock(&b->lock));
860 if (!b->sets->data)
861 goto err;
862out:
863 BUG_ON(!closure_is_unlocked(&b->io.cl));
864
865 bkey_copy(&b->key, k);
866 list_move(&b->list, &c->btree_cache);
867 hlist_del_init_rcu(&b->hash);
868 hlist_add_head_rcu(&b->hash, mca_hash(c, k));
869
870 lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
871 b->level = level;
Kent Overstreetd6fd3b12013-07-24 17:20:19 -0700872 b->parent = (void *) ~0UL;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700873
874 mca_reinit(b);
875
876 return b;
877err:
878 if (b)
879 rw_unlock(true, b);
880
Kent Overstreete8e1d462013-07-24 17:27:07 -0700881 b = mca_cannibalize(c, k);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700882 if (!IS_ERR(b))
883 goto out;
884
885 return b;
886}
887
888/**
889 * bch_btree_node_get - find a btree node in the cache and lock it, reading it
890 * in from disk if necessary.
891 *
892 * If IO is necessary, it uses the closure embedded in struct btree_op to wait;
893 * if that closure is in non blocking mode, will return -EAGAIN.
894 *
895 * The btree node will have either a read or a write lock held, depending on
896 * level and op->lock.
897 */
898struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k,
Kent Overstreete8e1d462013-07-24 17:27:07 -0700899 int level, bool write)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700900{
901 int i = 0;
Kent Overstreetcafe5632013-03-23 16:11:31 -0700902 struct btree *b;
903
904 BUG_ON(level < 0);
905retry:
906 b = mca_find(c, k);
907
908 if (!b) {
Kent Overstreet57943512013-04-25 13:58:35 -0700909 if (current->bio_list)
910 return ERR_PTR(-EAGAIN);
911
Kent Overstreetcafe5632013-03-23 16:11:31 -0700912 mutex_lock(&c->bucket_lock);
Kent Overstreete8e1d462013-07-24 17:27:07 -0700913 b = mca_alloc(c, k, level);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700914 mutex_unlock(&c->bucket_lock);
915
916 if (!b)
917 goto retry;
918 if (IS_ERR(b))
919 return b;
920
Kent Overstreet57943512013-04-25 13:58:35 -0700921 bch_btree_node_read(b);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700922
923 if (!write)
924 downgrade_write(&b->lock);
925 } else {
926 rw_lock(write, b, level);
927 if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) {
928 rw_unlock(write, b);
929 goto retry;
930 }
931 BUG_ON(b->level != level);
932 }
933
934 b->accessed = 1;
935
936 for (; i <= b->nsets && b->sets[i].size; i++) {
937 prefetch(b->sets[i].tree);
938 prefetch(b->sets[i].data);
939 }
940
941 for (; i <= b->nsets; i++)
942 prefetch(b->sets[i].data);
943
Kent Overstreet57943512013-04-25 13:58:35 -0700944 if (btree_node_io_error(b)) {
Kent Overstreetcafe5632013-03-23 16:11:31 -0700945 rw_unlock(write, b);
Kent Overstreet57943512013-04-25 13:58:35 -0700946 return ERR_PTR(-EIO);
947 }
948
949 BUG_ON(!b->written);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700950
951 return b;
952}
953
954static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
955{
956 struct btree *b;
957
958 mutex_lock(&c->bucket_lock);
Kent Overstreete8e1d462013-07-24 17:27:07 -0700959 b = mca_alloc(c, k, level);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700960 mutex_unlock(&c->bucket_lock);
961
962 if (!IS_ERR_OR_NULL(b)) {
Kent Overstreet57943512013-04-25 13:58:35 -0700963 bch_btree_node_read(b);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700964 rw_unlock(true, b);
965 }
966}
967
968/* Btree alloc */
969
Kent Overstreete8e1d462013-07-24 17:27:07 -0700970static void btree_node_free(struct btree *b)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700971{
972 unsigned i;
973
Kent Overstreetc37511b2013-04-26 15:39:55 -0700974 trace_bcache_btree_node_free(b);
975
Kent Overstreetcafe5632013-03-23 16:11:31 -0700976 BUG_ON(b == b->c->root);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700977
978 if (btree_node_dirty(b))
979 btree_complete_write(b, btree_current_write(b));
980 clear_bit(BTREE_NODE_dirty, &b->flags);
981
Kent Overstreetcafe5632013-03-23 16:11:31 -0700982 cancel_delayed_work(&b->work);
983
984 mutex_lock(&b->c->bucket_lock);
985
986 for (i = 0; i < KEY_PTRS(&b->key); i++) {
987 BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin));
988
989 bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
990 PTR_BUCKET(b->c, &b->key, i));
991 }
992
993 bch_bucket_free(b->c, &b->key);
994 mca_bucket_free(b);
995 mutex_unlock(&b->c->bucket_lock);
996}
997
Kent Overstreet35fcd842013-07-24 17:29:09 -0700998struct btree *bch_btree_node_alloc(struct cache_set *c, int level)
Kent Overstreetcafe5632013-03-23 16:11:31 -0700999{
1000 BKEY_PADDED(key) k;
1001 struct btree *b = ERR_PTR(-EAGAIN);
1002
1003 mutex_lock(&c->bucket_lock);
1004retry:
Kent Overstreet35fcd842013-07-24 17:29:09 -07001005 if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true))
Kent Overstreetcafe5632013-03-23 16:11:31 -07001006 goto err;
1007
1008 SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
1009
Kent Overstreete8e1d462013-07-24 17:27:07 -07001010 b = mca_alloc(c, &k.key, level);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001011 if (IS_ERR(b))
1012 goto err_free;
1013
1014 if (!b) {
Kent Overstreetb1a67b02013-03-25 11:46:44 -07001015 cache_bug(c,
1016 "Tried to allocate bucket that was in btree cache");
Kent Overstreetcafe5632013-03-23 16:11:31 -07001017 __bkey_put(c, &k.key);
1018 goto retry;
1019 }
1020
Kent Overstreetcafe5632013-03-23 16:11:31 -07001021 b->accessed = 1;
1022 bch_bset_init_next(b);
1023
1024 mutex_unlock(&c->bucket_lock);
Kent Overstreetc37511b2013-04-26 15:39:55 -07001025
1026 trace_bcache_btree_node_alloc(b);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001027 return b;
1028err_free:
1029 bch_bucket_free(c, &k.key);
1030 __bkey_put(c, &k.key);
1031err:
1032 mutex_unlock(&c->bucket_lock);
Kent Overstreetc37511b2013-04-26 15:39:55 -07001033
1034 trace_bcache_btree_node_alloc_fail(b);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001035 return b;
1036}
1037
Kent Overstreet35fcd842013-07-24 17:29:09 -07001038static struct btree *btree_node_alloc_replacement(struct btree *b)
Kent Overstreetcafe5632013-03-23 16:11:31 -07001039{
Kent Overstreet35fcd842013-07-24 17:29:09 -07001040 struct btree *n = bch_btree_node_alloc(b->c, b->level);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001041 if (!IS_ERR_OR_NULL(n))
1042 bch_btree_sort_into(b, n);
1043
1044 return n;
1045}
1046
1047/* Garbage collection */
1048
1049uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
1050{
1051 uint8_t stale = 0;
1052 unsigned i;
1053 struct bucket *g;
1054
1055 /*
1056 * ptr_invalid() can't return true for the keys that mark btree nodes as
1057 * freed, but since ptr_bad() returns true we'll never actually use them
1058 * for anything and thus we don't want mark their pointers here
1059 */
1060 if (!bkey_cmp(k, &ZERO_KEY))
1061 return stale;
1062
1063 for (i = 0; i < KEY_PTRS(k); i++) {
1064 if (!ptr_available(c, k, i))
1065 continue;
1066
1067 g = PTR_BUCKET(c, k, i);
1068
1069 if (gen_after(g->gc_gen, PTR_GEN(k, i)))
1070 g->gc_gen = PTR_GEN(k, i);
1071
1072 if (ptr_stale(c, k, i)) {
1073 stale = max(stale, ptr_stale(c, k, i));
1074 continue;
1075 }
1076
1077 cache_bug_on(GC_MARK(g) &&
1078 (GC_MARK(g) == GC_MARK_METADATA) != (level != 0),
1079 c, "inconsistent ptrs: mark = %llu, level = %i",
1080 GC_MARK(g), level);
1081
1082 if (level)
1083 SET_GC_MARK(g, GC_MARK_METADATA);
1084 else if (KEY_DIRTY(k))
1085 SET_GC_MARK(g, GC_MARK_DIRTY);
1086
1087 /* guard against overflow */
1088 SET_GC_SECTORS_USED(g, min_t(unsigned,
1089 GC_SECTORS_USED(g) + KEY_SIZE(k),
1090 (1 << 14) - 1));
1091
1092 BUG_ON(!GC_SECTORS_USED(g));
1093 }
1094
1095 return stale;
1096}
1097
1098#define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k)
1099
1100static int btree_gc_mark_node(struct btree *b, unsigned *keys,
1101 struct gc_stat *gc)
1102{
1103 uint8_t stale = 0;
1104 unsigned last_dev = -1;
1105 struct bcache_device *d = NULL;
1106 struct bkey *k;
1107 struct btree_iter iter;
1108 struct bset_tree *t;
1109
1110 gc->nodes++;
1111
1112 for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
1113 if (last_dev != KEY_INODE(k)) {
1114 last_dev = KEY_INODE(k);
1115
1116 d = KEY_INODE(k) < b->c->nr_uuids
1117 ? b->c->devices[last_dev]
1118 : NULL;
1119 }
1120
1121 stale = max(stale, btree_mark_key(b, k));
1122
1123 if (bch_ptr_bad(b, k))
1124 continue;
1125
1126 *keys += bkey_u64s(k);
1127
1128 gc->key_bytes += bkey_u64s(k);
1129 gc->nkeys++;
1130
1131 gc->data += KEY_SIZE(k);
Kent Overstreet444fc0b2013-05-11 17:07:26 -07001132 if (KEY_DIRTY(k))
Kent Overstreetcafe5632013-03-23 16:11:31 -07001133 gc->dirty += KEY_SIZE(k);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001134 }
1135
1136 for (t = b->sets; t <= &b->sets[b->nsets]; t++)
1137 btree_bug_on(t->size &&
1138 bset_written(b, t) &&
1139 bkey_cmp(&b->key, &t->end) < 0,
1140 b, "found short btree key in gc");
1141
1142 return stale;
1143}
1144
Kent Overstreete8e1d462013-07-24 17:27:07 -07001145static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k)
Kent Overstreetcafe5632013-03-23 16:11:31 -07001146{
1147 /*
1148 * We block priorities from being written for the duration of garbage
1149 * collection, so we can't sleep in btree_alloc() ->
1150 * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it
1151 * our closure.
1152 */
Kent Overstreet35fcd842013-07-24 17:29:09 -07001153 struct btree *n = btree_node_alloc_replacement(b);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001154
1155 if (!IS_ERR_OR_NULL(n)) {
1156 swap(b, n);
Kent Overstreet57943512013-04-25 13:58:35 -07001157 __bkey_put(b->c, &b->key);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001158
1159 memcpy(k->ptr, b->key.ptr,
1160 sizeof(uint64_t) * KEY_PTRS(&b->key));
1161
Kent Overstreete8e1d462013-07-24 17:27:07 -07001162 btree_node_free(n);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001163 up_write(&n->lock);
1164 }
1165
1166 return b;
1167}
1168
1169/*
1170 * Leaving this at 2 until we've got incremental garbage collection done; it
1171 * could be higher (and has been tested with 4) except that garbage collection
1172 * could take much longer, adversely affecting latency.
1173 */
1174#define GC_MERGE_NODES 2U
1175
/*
 * One slot of the window of recently visited child nodes that
 * btree_gc_recurse() hands to btree_gc_coalesce().
 */
1176struct gc_merge_info {
/* The child node; a NULL entry marks an unused slot */
1177 struct btree *b;
/* Key for that child, as returned by bch_next_recurse_key() on the parent */
1178 struct bkey *k;
/* Size in u64s of the node's keys, as counted by btree_gc_mark_node() */
1179 unsigned keys;
1180};
1181
/*
 * btree_gc_coalesce - try to squeeze the nodes in the window @r into one
 * node fewer.
 * @b:	parent of the nodes in @r
 * @gc:	gc statistics; the node count is decremented when a node is freed
 * @r:	window of up to GC_MERGE_NODES nodes, r[0] being the candidate that
 *	gets emptied and freed
 *
 * Gives up without freeing anything if fewer than two slots are in use, if
 * the combined keys would not fit, or if a node that has already been written
 * could not be replaced by a fresh copy. On success the keys of r[0] are
 * moved into the higher slots, r[0] is freed and the window is shifted down
 * by one with the last used slot zeroed.
 */
Kent Overstreete8e1d462013-07-24 17:27:07 -07001182static void btree_gc_coalesce(struct btree *b, struct gc_stat *gc,
1183 struct gc_merge_info *r)
Kent Overstreetcafe5632013-03-23 16:11:31 -07001184{
1185 unsigned nodes = 0, keys = 0, blocks;
1186 int i;
1187
/* Count the slots in use and total up their keys */
1188 while (nodes < GC_MERGE_NODES && r[nodes].b)
1189 keys += r[nodes++].keys;
1190
/* Target fill per node: two thirds of the default node size */
1191 blocks = btree_default_blocks(b->c) * 2 / 3;
1192
1193 if (nodes < 2 ||
1194 __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1))
1195 return;
1196
/*
 * Nodes that have already been written are swapped for fresh replacements;
 * bail out if a replacement could not be had.
 */
1197 for (i = nodes - 1; i >= 0; --i) {
1198 if (r[i].b->written)
Kent Overstreete8e1d462013-07-24 17:27:07 -07001199 r[i].b = btree_gc_alloc(r[i].b, r[i].k);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001200
1201 if (r[i].b->written)
1202 return;
1203 }
1204
/* Move keys from the front of r[i - 1] (n2) onto the end of r[i] (n1) */
1205 for (i = nodes - 1; i > 0; --i) {
1206 struct bset *n1 = r[i].b->sets->data;
1207 struct bset *n2 = r[i - 1].b->sets->data;
1208 struct bkey *k, *last = NULL;
1209
1210 keys = 0;
1211
1212 if (i == 1) {
1213 /*
1214 * Last node we're not getting rid of - we're getting
1215 * rid of the node at r[0]. Have to try and fit all of
1216 * the remaining keys into this node; we can't ensure
1217 * they will always fit due to rounding and variable
1218 * length keys (shouldn't be possible in practice,
1219 * though)
1220 */
1221 if (__set_blocks(n1, n1->keys + r->keys,
1222 b->c) > btree_blocks(r[i].b))
1223 return;
1224
1225 keys = n2->keys;
1226 last = &r->b->key;
1227 } else
/* Take as many whole keys from n2 as fit within the target fill */
1228 for (k = n2->start;
1229 k < end(n2);
1230 k = bkey_next(k)) {
1231 if (__set_blocks(n1, n1->keys + keys +
1232 bkey_u64s(k), b->c) > blocks)
1233 break;
1234
1235 last = k;
1236 keys += bkey_u64s(k);
1237 }
1238
1239 BUG_ON(__set_blocks(n1, n1->keys + keys,
1240 b->c) > btree_blocks(r[i].b));
1241
/* r[i] now ends at the last key moved in; update the node key and the slot's key */
1242 if (last) {
1243 bkey_copy_key(&r[i].b->key, last);
1244 bkey_copy_key(r[i].k, last);
1245 }
1246
1247 memcpy(end(n1),
1248 n2->start,
1249 (void *) node(n2, keys) - (void *) n2->start);
1250
1251 n1->keys += keys;
1252
/* Close the gap left at the front of n2 */
1253 memmove(n2->start,
1254 node(n2, keys),
1255 (void *) end(n2) - (void *) node(n2, keys));
1256
1257 n2->keys -= keys;
1258
1259 r[i].keys = n1->keys;
1260 r[i - 1].keys = n2->keys;
1261 }
1262
/* r[0] has been emptied above: free it and drop its write lock */
Kent Overstreete8e1d462013-07-24 17:27:07 -07001263 btree_node_free(r->b);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001264 up_write(&r->b->lock);
1265
Kent Overstreetc37511b2013-04-26 15:39:55 -07001266 trace_bcache_btree_gc_coalesce(nodes);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001267
1268 gc->nodes--;
1269 nodes--;
1270
/* Shift the surviving slots down and clear the one that became free */
1271 memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes);
1272 memset(&r[nodes], 0, sizeof(struct gc_merge_info));
1273}
1274
/*
 * btree_gc_recurse - garbage collect the children of interior node @b.
 * @b:		interior node whose children are visited
 * @op:		btree op; its closure is used for writes of never-written nodes
 * @writes:	closure used for writes of nodes that are merely dirty
 * @gc:		statistics being accumulated
 *
 * Visits the children after c->gc_done in order, marks each one, optionally
 * rewrites it, recurses into it if it is itself an interior node, and keeps a
 * sliding window of GC_MERGE_NODES children for btree_gc_coalesce().
 * Rewriting and coalescing only happen while b->written is zero.
 *
 * Returns 0, or the first error from bch_btree_node_get() or a recursive
 * call.
 */
1275static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1276 struct closure *writes, struct gc_stat *gc)
1277{
/*
 * Nested helper (GCC extension): write out @r if it needs it, then drop its
 * write lock. Never-written nodes are written against op->cl, dirty ones
 * against @writes.
 */
1278 void write(struct btree *r)
1279 {
1280 if (!r->written)
Kent Overstreet57943512013-04-25 13:58:35 -07001281 bch_btree_node_write(r, &op->cl);
1282 else if (btree_node_dirty(r))
1283 bch_btree_node_write(r, writes);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001284
1285 up_write(&r->lock);
1286 }
1287
1288 int ret = 0, stale;
1289 unsigned i;
1290 struct gc_merge_info r[GC_MERGE_NODES];
1291
1292 memset(r, 0, sizeof(r));
1293
/* r[0] is always the child currently being processed */
1294 while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) {
Kent Overstreete8e1d462013-07-24 17:27:07 -07001295 r->b = bch_btree_node_get(b->c, r->k, b->level - 1, true);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001296
1297 if (IS_ERR(r->b)) {
1298 ret = PTR_ERR(r->b);
1299 break;
1300 }
1301
1302 r->keys = 0;
1303 stale = btree_gc_mark_node(r->b, &r->keys, gc);
1304
/* Rewrite interior, very stale, or (if configured) all children */
1305 if (!b->written &&
1306 (r->b->level || stale > 10 ||
1307 b->c->gc_always_rewrite))
Kent Overstreete8e1d462013-07-24 17:27:07 -07001308 r->b = btree_gc_alloc(r->b, r->k);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001309
1310 if (r->b->level)
1311 ret = btree_gc_recurse(r->b, op, writes, gc);
1312
1313 if (ret) {
1314 write(r->b);
1315 break;
1316 }
1317
/* Record progress: everything up to this child's key is done */
1318 bkey_copy_key(&b->c->gc_done, r->k);
1319
1320 if (!b->written)
Kent Overstreete8e1d462013-07-24 17:27:07 -07001321 btree_gc_coalesce(b, gc, r);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001322
/* The oldest slot is about to fall out of the window: write and unlock it */
1323 if (r[GC_MERGE_NODES - 1].b)
1324 write(r[GC_MERGE_NODES - 1].b);
1325
/* Slide the window up by one so r[0] is free for the next child */
1326 memmove(&r[1], &r[0],
1327 sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1));
1328
1329 /* When we've got incremental GC working, we'll want to do
1330 * if (should_resched())
1331 * return -EAGAIN;
1332 */
1333 cond_resched();
1334#if 0
1335 if (need_resched()) {
1336 ret = -EAGAIN;
1337 break;
1338 }
1339#endif
1340 }
1341
/* Flush and unlock whatever is still held in slots 1 and up */
1342 for (i = 1; i < GC_MERGE_NODES && r[i].b; i++)
1343 write(r[i].b);
1344
1345 /* Might have freed some children, must remove their keys */
1346 if (!b->written)
1347 bch_btree_sort(b);
1348
1349 return ret;
1350}
1351
1352static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
1353 struct closure *writes, struct gc_stat *gc)
1354{
1355 struct btree *n = NULL;
1356 unsigned keys = 0;
1357 int ret = 0, stale = btree_gc_mark_node(b, &keys, gc);
1358
1359 if (b->level || stale > 10)
Kent Overstreet35fcd842013-07-24 17:29:09 -07001360 n = btree_node_alloc_replacement(b);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001361
1362 if (!IS_ERR_OR_NULL(n))
1363 swap(b, n);
1364
1365 if (b->level)
1366 ret = btree_gc_recurse(b, op, writes, gc);
1367
1368 if (!b->written || btree_node_dirty(b)) {
Kent Overstreet57943512013-04-25 13:58:35 -07001369 bch_btree_node_write(b, n ? &op->cl : NULL);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001370 }
1371
1372 if (!IS_ERR_OR_NULL(n)) {
1373 closure_sync(&op->cl);
1374 bch_btree_set_root(b);
Kent Overstreete8e1d462013-07-24 17:27:07 -07001375 btree_node_free(n);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001376 rw_unlock(true, b);
1377 }
1378
1379 return ret;
1380}
1381
1382static void btree_gc_start(struct cache_set *c)
1383{
1384 struct cache *ca;
1385 struct bucket *b;
Kent Overstreetcafe5632013-03-23 16:11:31 -07001386 unsigned i;
1387
1388 if (!c->gc_mark_valid)
1389 return;
1390
1391 mutex_lock(&c->bucket_lock);
1392
1393 c->gc_mark_valid = 0;
1394 c->gc_done = ZERO_KEY;
1395
1396 for_each_cache(ca, c, i)
1397 for_each_bucket(b, ca) {
1398 b->gc_gen = b->gen;
Kent Overstreet29ebf462013-07-11 19:43:21 -07001399 if (!atomic_read(&b->pin)) {
Kent Overstreetcafe5632013-03-23 16:11:31 -07001400 SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
Kent Overstreet29ebf462013-07-11 19:43:21 -07001401 SET_GC_SECTORS_USED(b, 0);
1402 }
Kent Overstreetcafe5632013-03-23 16:11:31 -07001403 }
1404
Kent Overstreetcafe5632013-03-23 16:11:31 -07001405 mutex_unlock(&c->bucket_lock);
1406}
1407
1408size_t bch_btree_gc_finish(struct cache_set *c)
1409{
1410 size_t available = 0;
1411 struct bucket *b;
1412 struct cache *ca;
Kent Overstreetcafe5632013-03-23 16:11:31 -07001413 unsigned i;
1414
1415 mutex_lock(&c->bucket_lock);
1416
1417 set_gc_sectors(c);
1418 c->gc_mark_valid = 1;
1419 c->need_gc = 0;
1420
1421 if (c->root)
1422 for (i = 0; i < KEY_PTRS(&c->root->key); i++)
1423 SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i),
1424 GC_MARK_METADATA);
1425
1426 for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
1427 SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
1428 GC_MARK_METADATA);
1429
1430 for_each_cache(ca, c, i) {
1431 uint64_t *i;
1432
1433 ca->invalidate_needs_gc = 0;
1434
1435 for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++)
1436 SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
1437
1438 for (i = ca->prio_buckets;
1439 i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
1440 SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
1441
1442 for_each_bucket(b, ca) {
1443 b->last_gc = b->gc_gen;
1444 c->need_gc = max(c->need_gc, bucket_gc_gen(b));
1445
1446 if (!atomic_read(&b->pin) &&
1447 GC_MARK(b) == GC_MARK_RECLAIMABLE) {
1448 available++;
1449 if (!GC_SECTORS_USED(b))
1450 bch_bucket_add_unused(ca, b);
1451 }
1452 }
1453 }
1454
Kent Overstreetcafe5632013-03-23 16:11:31 -07001455 mutex_unlock(&c->bucket_lock);
1456 return available;
1457}
1458
/*
 * bch_btree_gc - run one garbage collection pass over the btree.
 * @cl:	the gc closure embedded in the cache set (c->gc.cl)
 *
 * Resets the bucket marks, walks the tree from the root with prio writes
 * blocked (c->prio_blocked raised), then publishes the new marks and
 * statistics and continues with bch_moving_gc. If the walk fails the function
 * re-queues itself on bch_gc_wq.
 */
1459static void bch_btree_gc(struct closure *cl)
1460{
1461 struct cache_set *c = container_of(cl, struct cache_set, gc.cl);
1462 int ret;
1463 unsigned long available;
1464 struct gc_stat stats;
1465 struct closure writes;
1466 struct btree_op op;
Kent Overstreetcafe5632013-03-23 16:11:31 -07001467 uint64_t start_time = local_clock();
Kent Overstreet57943512013-04-25 13:58:35 -07001468
Kent Overstreetc37511b2013-04-26 15:39:55 -07001469 trace_bcache_gc_start(c);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001470
1471 memset(&stats, 0, sizeof(struct gc_stat));
1472 closure_init_stack(&writes);
1473 bch_btree_op_init_stack(&op);
/* presumably makes the traversal take write locks at every level - TODO confirm against the btree_root()/btree() macros */
1474 op.lock = SHRT_MAX;
1475
1476 btree_gc_start(c);
1477
Kent Overstreet57943512013-04-25 13:58:35 -07001478 atomic_inc(&c->prio_blocked);
1479
Kent Overstreetcafe5632013-03-23 16:11:31 -07001480 ret = btree_root(gc_root, c, &op, &writes, &stats);
/* Wait for every node write issued during the walk */
1481 closure_sync(&op.cl);
1482 closure_sync(&writes);
1483
/*
 * NOTE(review): continue_at() appears to return from this function (the
 * code below would otherwise run after a failed pass). If so, prio_blocked
 * is incremented again on the retry without having been decremented here -
 * confirm whether that imbalance is intended.
 */
1484 if (ret) {
Kent Overstreetcafe5632013-03-23 16:11:31 -07001485 pr_warn("gc failed!");
Kent Overstreetcafe5632013-03-23 16:11:31 -07001486 continue_at(cl, bch_btree_gc, bch_gc_wq);
1487 }
1488
1489 /* Possibly wait for new UUIDs or whatever to hit disk */
1490 bch_journal_meta(c, &op.cl);
1491 closure_sync(&op.cl);
1492
1493 available = bch_btree_gc_finish(c);
1494
Kent Overstreet57943512013-04-25 13:58:35 -07001495 atomic_dec(&c->prio_blocked);
1496 wake_up_allocators(c);
1497
Kent Overstreet169ef1c2013-03-28 12:50:55 -06001498 bch_time_stats_update(&c->btree_gc_time, start_time);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001499
/* Convert the raw counters: u64s to bytes, and data/dirty shifted left by 9 (presumably 512-byte sectors to bytes) */
1500 stats.key_bytes *= sizeof(uint64_t);
1501 stats.dirty <<= 9;
1502 stats.data <<= 9;
/* Percentage of buckets that are not available for reuse */
1503 stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets;
1504 memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
Kent Overstreetcafe5632013-03-23 16:11:31 -07001505
Kent Overstreetc37511b2013-04-26 15:39:55 -07001506 trace_bcache_gc_end(c);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001507
1508 continue_at(cl, bch_moving_gc, bch_gc_wq);
1509}
1510
/*
 * bch_queue_gc - request a garbage collection pass for cache set @c.
 *
 * Hands bch_btree_gc to closure_trylock_call() on the cache set's gc closure,
 * with bch_gc_wq as the workqueue and c->cl as the parent closure.
 * Presumably a no-op when the gc closure is already in use - confirm against
 * closure_trylock_call().
 */
1511void bch_queue_gc(struct cache_set *c)
1512{
1513 closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl);
1514}
1515
1516/* Initial partial gc */
1517
1518static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
1519 unsigned long **seen)
1520{
1521 int ret;
1522 unsigned i;
1523 struct bkey *k;
1524 struct bucket *g;
1525 struct btree_iter iter;
1526
1527 for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
1528 for (i = 0; i < KEY_PTRS(k); i++) {
1529 if (!ptr_available(b->c, k, i))
1530 continue;
1531
1532 g = PTR_BUCKET(b->c, k, i);
1533
1534 if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i),
1535 seen[PTR_DEV(k, i)]) ||
1536 !ptr_stale(b->c, k, i)) {
1537 g->gen = PTR_GEN(k, i);
1538
1539 if (b->level)
1540 g->prio = BTREE_PRIO;
1541 else if (g->prio == BTREE_PRIO)
1542 g->prio = INITIAL_PRIO;
1543 }
1544 }
1545
1546 btree_mark_key(b, k);
1547 }
1548
1549 if (b->level) {
1550 k = bch_next_recurse_key(b, &ZERO_KEY);
1551
1552 while (k) {
1553 struct bkey *p = bch_next_recurse_key(b, k);
1554 if (p)
1555 btree_node_prefetch(b->c, p, b->level - 1);
1556
1557 ret = btree(check_recurse, k, b, op, seen);
1558 if (ret)
1559 return ret;
1560
1561 k = p;
1562 }
1563 }
1564
1565 return 0;
1566}
1567
1568int bch_btree_check(struct cache_set *c, struct btree_op *op)
1569{
1570 int ret = -ENOMEM;
1571 unsigned i;
1572 unsigned long *seen[MAX_CACHES_PER_SET];
1573
1574 memset(seen, 0, sizeof(seen));
1575
1576 for (i = 0; c->cache[i]; i++) {
1577 size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8);
1578 seen[i] = kmalloc(n, GFP_KERNEL);
1579 if (!seen[i])
1580 goto err;
1581
1582 /* Disables the seen array until prio_read() uses it too */
1583 memset(seen[i], 0xFF, n);
1584 }
1585
1586 ret = btree_root(check_recurse, c, op, seen);
1587err:
1588 for (i = 0; i < MAX_CACHES_PER_SET; i++)
1589 kfree(seen[i]);
1590 return ret;
1591}
1592
1593/* Btree insertion */
1594
1595static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert)
1596{
1597 struct bset *i = b->sets[b->nsets].data;
1598
1599 memmove((uint64_t *) where + bkey_u64s(insert),
1600 where,
1601 (void *) end(i) - (void *) where);
1602
1603 i->keys += bkey_u64s(insert);
1604 bkey_copy(where, insert);
1605 bch_bset_fix_lookup_table(b, where);
1606}
1607
/*
 * fix_overlapping_extents - make room for @insert among the existing extents.
 * @b:		leaf node being inserted into
 * @insert:	key about to be inserted
 * @iter:	iterator positioned by the caller near the start of @insert
 * @op:		btree op; for BTREE_REPLACE, op->replace is the key that the
 *		overlapped extents are checked against
 *
 * Every existing key that overlaps @insert is trimmed, split or cut down to
 * zero size, and the dirty sector accounting is reduced by the amount removed
 * from dirty keys.
 *
 * For BTREE_REPLACE, sectors_found tracks how much of @insert (from its
 * start) was covered by keys matching op->replace. If nothing matched,
 * op->insert_collision is set and true is returned; if only part matched,
 * @insert is shortened to that part.
 *
 * Returns true if @insert must not be inserted, false otherwise.
 */
1608static bool fix_overlapping_extents(struct btree *b,
1609 struct bkey *insert,
1610 struct btree_iter *iter,
1611 struct btree_op *op)
1612{
/* Nested helper (GCC extension): take @sectors off the dirty count if @k is dirty */
Kent Overstreet279afba2013-06-05 06:21:07 -07001613 void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
Kent Overstreetcafe5632013-03-23 16:11:31 -07001614 {
Kent Overstreet279afba2013-06-05 06:21:07 -07001615 if (KEY_DIRTY(k))
1616 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1617 offset, -sectors);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001618 }
1619
Kent Overstreet279afba2013-06-05 06:21:07 -07001620 uint64_t old_offset;
Kent Overstreetcafe5632013-03-23 16:11:31 -07001621 unsigned old_size, sectors_found = 0;
1622
1623 while (1) {
1624 struct bkey *k = bch_btree_iter_next(iter);
/* Stop at the first key that starts at or after the end of insert */
1625 if (!k ||
1626 bkey_cmp(&START_KEY(k), insert) >= 0)
1627 break;
1628
/* Skip keys that end at or before the start of insert */
1629 if (bkey_cmp(k, &START_KEY(insert)) <= 0)
1630 continue;
1631
/* Remember k's extent before it is modified, for the accounting below */
Kent Overstreet279afba2013-06-05 06:21:07 -07001632 old_offset = KEY_START(k);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001633 old_size = KEY_SIZE(k);
1634
1635 /*
1636 * We might overlap with 0 size extents; we can't skip these
1637 * because if they're in the set we're inserting to we have to
1638 * adjust them so they don't overlap with the key we're
1639 * inserting. But we don't want to check them for BTREE_REPLACE
1640 * operations.
1641 */
1642
1643 if (op->type == BTREE_REPLACE &&
1644 KEY_SIZE(k)) {
1645 /*
1646 * k might have been split since we inserted/found the
1647 * key we're replacing
1648 */
1649 unsigned i;
1650 uint64_t offset = KEY_START(k) -
1651 KEY_START(&op->replace);
1652
1653 /* But it must be a subset of the replace key */
1654 if (KEY_START(k) < KEY_START(&op->replace) ||
1655 KEY_OFFSET(k) > KEY_OFFSET(&op->replace))
1656 goto check_failed;
1657
1658 /* We didn't find a key that we were supposed to */
1659 if (KEY_START(k) > KEY_START(insert) + sectors_found)
1660 goto check_failed;
1661
1662 if (KEY_PTRS(&op->replace) != KEY_PTRS(k))
1663 goto check_failed;
1664
1665 /* skip past gen */
1666 offset <<= 8;
1667
1668 BUG_ON(!KEY_PTRS(&op->replace));
1669
/* Each pointer of k must be the replace key's pointer advanced by offset */
1670 for (i = 0; i < KEY_PTRS(&op->replace); i++)
1671 if (k->ptr[i] != op->replace.ptr[i] + offset)
1672 goto check_failed;
1673
1674 sectors_found = KEY_OFFSET(k) - KEY_START(insert);
1675 }
1676
1677 if (bkey_cmp(insert, k) < 0 &&
1678 bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
1679 /*
1680 * We overlapped in the middle of an existing key: that
1681 * means we have to split the old key. But we have to do
1682 * slightly different things depending on whether the
1683 * old key has been written out yet.
1684 */
1685
1686 struct bkey *top;
1687
Kent Overstreet279afba2013-06-05 06:21:07 -07001688 subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
Kent Overstreetcafe5632013-03-23 16:11:31 -07001689
1690 if (bkey_written(b, k)) {
1691 /*
1692 * We insert a new key to cover the top of the
1693 * old key, and the old key is modified in place
1694 * to represent the bottom split.
1695 *
1696 * It's completely arbitrary whether the new key
1697 * is the top or the bottom, but it has to match
1698 * up with what btree_sort_fixup() does - it
1699 * doesn't check for this kind of overlap, it
1700 * depends on us inserting a new key for the top
1701 * here.
1702 */
1703 top = bch_bset_search(b, &b->sets[b->nsets],
1704 insert);
1705 shift_keys(b, top, k);
1706 } else {
/* k is in the set being inserted into: duplicate it in place via a temporary */
1707 BKEY_PADDED(key) temp;
1708 bkey_copy(&temp.key, k);
1709 shift_keys(b, k, &temp.key);
1710 top = bkey_next(k);
1711 }
1712
/* top keeps the part after insert, k the part before it */
1713 bch_cut_front(insert, top);
1714 bch_cut_back(&START_KEY(insert), k);
1715 bch_bset_fix_invalidated_key(b, k);
1716 return false;
1717 }
1718
1719 if (bkey_cmp(insert, k) < 0) {
/* insert covers the front of k: trim k's front */
1720 bch_cut_front(insert, k);
1721 } else {
/* insert covers the back of k (or all of it) */
Kent Overstreet1fa84552013-11-10 21:55:27 -08001722 if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
1723 old_offset = KEY_START(insert);
1724
Kent Overstreetcafe5632013-03-23 16:11:31 -07001725 if (bkey_written(b, k) &&
1726 bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
1727 /*
1728 * Completely overwrote, so we don't have to
1729 * invalidate the binary search tree
1730 */
1731 bch_cut_front(k, k);
1732 } else {
1733 __bch_cut_back(&START_KEY(insert), k);
1734 bch_bset_fix_invalidated_key(b, k);
1735 }
1736 }
1737
/* Account for however many sectors k just lost */
Kent Overstreet279afba2013-06-05 06:21:07 -07001738 subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
Kent Overstreetcafe5632013-03-23 16:11:31 -07001739 }
1740
1741check_failed:
1742 if (op->type == BTREE_REPLACE) {
1743 if (!sectors_found) {
1744 op->insert_collision = true;
1745 return true;
1746 } else if (sectors_found < KEY_SIZE(insert)) {
/* Only the first sectors_found sectors matched: shrink insert to those */
1747 SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
1748 (KEY_SIZE(insert) - sectors_found));
1749 SET_KEY_SIZE(insert, sectors_found);
1750 }
1751 }
1752
1753 return false;
1754}
1755
1756static bool btree_insert_key(struct btree *b, struct btree_op *op,
1757 struct bkey *k)
1758{
1759 struct bset *i = b->sets[b->nsets].data;
1760 struct bkey *m, *prev;
Kent Overstreet85b14922013-05-14 20:33:16 -07001761 unsigned status = BTREE_INSERT_STATUS_INSERT;
Kent Overstreetcafe5632013-03-23 16:11:31 -07001762
1763 BUG_ON(bkey_cmp(k, &b->key) > 0);
1764 BUG_ON(b->level && !KEY_PTRS(k));
1765 BUG_ON(!b->level && !KEY_OFFSET(k));
1766
1767 if (!b->level) {
1768 struct btree_iter iter;
1769 struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0);
1770
1771 /*
1772 * bset_search() returns the first key that is strictly greater
1773 * than the search key - but for back merging, we want to find
1774 * the first key that is greater than or equal to KEY_START(k) -
1775 * unless KEY_START(k) is 0.
1776 */
1777 if (KEY_OFFSET(&search))
1778 SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1);
1779
1780 prev = NULL;
1781 m = bch_btree_iter_init(b, &iter, &search);
1782
1783 if (fix_overlapping_extents(b, k, &iter, op))
1784 return false;
1785
Kent Overstreet1fa84552013-11-10 21:55:27 -08001786 if (KEY_DIRTY(k))
1787 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1788 KEY_START(k), KEY_SIZE(k));
1789
Kent Overstreetcafe5632013-03-23 16:11:31 -07001790 while (m != end(i) &&
1791 bkey_cmp(k, &START_KEY(m)) > 0)
1792 prev = m, m = bkey_next(m);
1793
1794 if (key_merging_disabled(b->c))
1795 goto insert;
1796
1797 /* prev is in the tree, if we merge we're done */
Kent Overstreet85b14922013-05-14 20:33:16 -07001798 status = BTREE_INSERT_STATUS_BACK_MERGE;
Kent Overstreetcafe5632013-03-23 16:11:31 -07001799 if (prev &&
1800 bch_bkey_try_merge(b, prev, k))
1801 goto merged;
1802
Kent Overstreet85b14922013-05-14 20:33:16 -07001803 status = BTREE_INSERT_STATUS_OVERWROTE;
Kent Overstreetcafe5632013-03-23 16:11:31 -07001804 if (m != end(i) &&
1805 KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
1806 goto copy;
1807
Kent Overstreet85b14922013-05-14 20:33:16 -07001808 status = BTREE_INSERT_STATUS_FRONT_MERGE;
Kent Overstreetcafe5632013-03-23 16:11:31 -07001809 if (m != end(i) &&
1810 bch_bkey_try_merge(b, k, m))
1811 goto copy;
1812 } else
1813 m = bch_bset_search(b, &b->sets[b->nsets], k);
1814
1815insert: shift_keys(b, m, k);
1816copy: bkey_copy(m, k);
1817merged:
Kent Overstreet85b14922013-05-14 20:33:16 -07001818 bch_check_keys(b, "%u for %s", status, op_type(op));
Kent Overstreetcafe5632013-03-23 16:11:31 -07001819
1820 if (b->level && !KEY_OFFSET(k))
Kent Overstreet57943512013-04-25 13:58:35 -07001821 btree_current_write(b)->prio_blocked++;
Kent Overstreetcafe5632013-03-23 16:11:31 -07001822
Kent Overstreet85b14922013-05-14 20:33:16 -07001823 trace_bcache_btree_insert_key(b, k, op->type, status);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001824
1825 return true;
1826}
1827
Kent Overstreet26c949f2013-09-10 18:41:15 -07001828static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
1829 struct keylist *insert_keys)
Kent Overstreetcafe5632013-03-23 16:11:31 -07001830{
1831 bool ret = false;
Kent Overstreetcafe5632013-03-23 16:11:31 -07001832 unsigned oldsize = bch_count_data(b);
1833
Kent Overstreet26c949f2013-09-10 18:41:15 -07001834 while (!bch_keylist_empty(insert_keys)) {
Kent Overstreet403b6cd2013-07-24 17:22:44 -07001835 struct bset *i = write_block(b);
Kent Overstreetc2f95ae2013-07-24 17:24:25 -07001836 struct bkey *k = insert_keys->keys;
Kent Overstreet26c949f2013-09-10 18:41:15 -07001837
Kent Overstreet403b6cd2013-07-24 17:22:44 -07001838 if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c)
1839 > btree_blocks(b))
1840 break;
1841
1842 if (bkey_cmp(k, &b->key) <= 0) {
Kent Overstreet26c949f2013-09-10 18:41:15 -07001843 bkey_put(b->c, k, b->level);
1844
1845 ret |= btree_insert_key(b, op, k);
1846 bch_keylist_pop_front(insert_keys);
1847 } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
1848#if 0
1849 if (op->type == BTREE_REPLACE) {
1850 bkey_put(b->c, k, b->level);
1851 bch_keylist_pop_front(insert_keys);
1852 op->insert_collision = true;
1853 break;
1854 }
1855#endif
1856 BKEY_PADDED(key) temp;
Kent Overstreetc2f95ae2013-07-24 17:24:25 -07001857 bkey_copy(&temp.key, insert_keys->keys);
Kent Overstreet26c949f2013-09-10 18:41:15 -07001858
1859 bch_cut_back(&b->key, &temp.key);
Kent Overstreetc2f95ae2013-07-24 17:24:25 -07001860 bch_cut_front(&b->key, insert_keys->keys);
Kent Overstreet26c949f2013-09-10 18:41:15 -07001861
1862 ret |= btree_insert_key(b, op, &temp.key);
1863 break;
1864 } else {
1865 break;
1866 }
Kent Overstreetcafe5632013-03-23 16:11:31 -07001867 }
1868
Kent Overstreet403b6cd2013-07-24 17:22:44 -07001869 BUG_ON(!bch_keylist_empty(insert_keys) && b->level);
1870
Kent Overstreetcafe5632013-03-23 16:11:31 -07001871 BUG_ON(bch_count_data(b) < oldsize);
1872 return ret;
1873}
1874
Kent Overstreet26c949f2013-09-10 18:41:15 -07001875static int btree_split(struct btree *b, struct btree_op *op,
1876 struct keylist *insert_keys,
1877 struct keylist *parent_keys)
Kent Overstreetcafe5632013-03-23 16:11:31 -07001878{
Kent Overstreetd6fd3b12013-07-24 17:20:19 -07001879 bool split;
Kent Overstreetcafe5632013-03-23 16:11:31 -07001880 struct btree *n1, *n2 = NULL, *n3 = NULL;
1881 uint64_t start_time = local_clock();
1882
Kent Overstreet35fcd842013-07-24 17:29:09 -07001883 n1 = btree_node_alloc_replacement(b);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001884 if (IS_ERR(n1))
1885 goto err;
1886
1887 split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5;
1888
Kent Overstreetcafe5632013-03-23 16:11:31 -07001889 if (split) {
1890 unsigned keys = 0;
1891
Kent Overstreetc37511b2013-04-26 15:39:55 -07001892 trace_bcache_btree_node_split(b, n1->sets[0].data->keys);
1893
Kent Overstreet35fcd842013-07-24 17:29:09 -07001894 n2 = bch_btree_node_alloc(b->c, b->level);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001895 if (IS_ERR(n2))
1896 goto err_free1;
1897
Kent Overstreetd6fd3b12013-07-24 17:20:19 -07001898 if (!b->parent) {
Kent Overstreet35fcd842013-07-24 17:29:09 -07001899 n3 = bch_btree_node_alloc(b->c, b->level + 1);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001900 if (IS_ERR(n3))
1901 goto err_free2;
1902 }
1903
Kent Overstreet26c949f2013-09-10 18:41:15 -07001904 bch_btree_insert_keys(n1, op, insert_keys);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001905
Kent Overstreetd6fd3b12013-07-24 17:20:19 -07001906 /*
1907 * Has to be a linear search because we don't have an auxiliary
Kent Overstreetcafe5632013-03-23 16:11:31 -07001908 * search tree yet
1909 */
1910
1911 while (keys < (n1->sets[0].data->keys * 3) / 5)
1912 keys += bkey_u64s(node(n1->sets[0].data, keys));
1913
1914 bkey_copy_key(&n1->key, node(n1->sets[0].data, keys));
1915 keys += bkey_u64s(node(n1->sets[0].data, keys));
1916
1917 n2->sets[0].data->keys = n1->sets[0].data->keys - keys;
1918 n1->sets[0].data->keys = keys;
1919
1920 memcpy(n2->sets[0].data->start,
1921 end(n1->sets[0].data),
1922 n2->sets[0].data->keys * sizeof(uint64_t));
1923
1924 bkey_copy_key(&n2->key, &b->key);
1925
Kent Overstreet26c949f2013-09-10 18:41:15 -07001926 bch_keylist_add(parent_keys, &n2->key);
Kent Overstreet57943512013-04-25 13:58:35 -07001927 bch_btree_node_write(n2, &op->cl);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001928 rw_unlock(true, n2);
Kent Overstreetc37511b2013-04-26 15:39:55 -07001929 } else {
1930 trace_bcache_btree_node_compact(b, n1->sets[0].data->keys);
1931
Kent Overstreet26c949f2013-09-10 18:41:15 -07001932 bch_btree_insert_keys(n1, op, insert_keys);
Kent Overstreetc37511b2013-04-26 15:39:55 -07001933 }
Kent Overstreetcafe5632013-03-23 16:11:31 -07001934
Kent Overstreet26c949f2013-09-10 18:41:15 -07001935 bch_keylist_add(parent_keys, &n1->key);
Kent Overstreet57943512013-04-25 13:58:35 -07001936 bch_btree_node_write(n1, &op->cl);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001937
1938 if (n3) {
Kent Overstreetd6fd3b12013-07-24 17:20:19 -07001939 /* Depth increases, make a new root */
1940
Kent Overstreetcafe5632013-03-23 16:11:31 -07001941 bkey_copy_key(&n3->key, &MAX_KEY);
Kent Overstreet26c949f2013-09-10 18:41:15 -07001942 bch_btree_insert_keys(n3, op, parent_keys);
Kent Overstreet57943512013-04-25 13:58:35 -07001943 bch_btree_node_write(n3, &op->cl);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001944
1945 closure_sync(&op->cl);
1946 bch_btree_set_root(n3);
1947 rw_unlock(true, n3);
Kent Overstreetd6fd3b12013-07-24 17:20:19 -07001948 } else if (!b->parent) {
1949 /* Root filled up but didn't need to be split */
1950
Kent Overstreetc2f95ae2013-07-24 17:24:25 -07001951 bch_keylist_reset(parent_keys);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001952 closure_sync(&op->cl);
1953 bch_btree_set_root(n1);
1954 } else {
1955 unsigned i;
1956
Kent Overstreet26c949f2013-09-10 18:41:15 -07001957 bkey_copy(parent_keys->top, &b->key);
1958 bkey_copy_key(parent_keys->top, &ZERO_KEY);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001959
1960 for (i = 0; i < KEY_PTRS(&b->key); i++) {
1961 uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1;
1962
Kent Overstreet26c949f2013-09-10 18:41:15 -07001963 SET_PTR_GEN(parent_keys->top, i, g);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001964 }
1965
Kent Overstreet26c949f2013-09-10 18:41:15 -07001966 bch_keylist_push(parent_keys);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001967 closure_sync(&op->cl);
1968 atomic_inc(&b->c->prio_blocked);
1969 }
1970
1971 rw_unlock(true, n1);
Kent Overstreete8e1d462013-07-24 17:27:07 -07001972 btree_node_free(b);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001973
Kent Overstreet169ef1c2013-03-28 12:50:55 -06001974 bch_time_stats_update(&b->c->btree_split_time, start_time);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001975
1976 return 0;
1977err_free2:
1978 __bkey_put(n2->c, &n2->key);
Kent Overstreete8e1d462013-07-24 17:27:07 -07001979 btree_node_free(n2);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001980 rw_unlock(true, n2);
1981err_free1:
1982 __bkey_put(n1->c, &n1->key);
Kent Overstreete8e1d462013-07-24 17:27:07 -07001983 btree_node_free(n1);
Kent Overstreetcafe5632013-03-23 16:11:31 -07001984 rw_unlock(true, n1);
1985err:
1986 if (n3 == ERR_PTR(-EAGAIN) ||
1987 n2 == ERR_PTR(-EAGAIN) ||
1988 n1 == ERR_PTR(-EAGAIN))
1989 return -EAGAIN;
1990
1991 pr_warn("couldn't split");
1992 return -ENOMEM;
1993}
1994
Kent Overstreet26c949f2013-09-10 18:41:15 -07001995static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
1996 struct keylist *insert_keys)
1997{
1998 int ret = 0;
1999 struct keylist split_keys;
2000
2001 bch_keylist_init(&split_keys);
2002
2003 BUG_ON(b->level);
2004
2005 do {
2006 if (should_split(b)) {
2007 if (current->bio_list) {
2008 op->lock = b->c->root->level + 1;
2009 ret = -EAGAIN;
2010 } else if (op->lock <= b->c->root->level) {
2011 op->lock = b->c->root->level + 1;
2012 ret = -EINTR;
2013 } else {
2014 struct btree *parent = b->parent;
2015
2016 ret = btree_split(b, op, insert_keys,
2017 &split_keys);
2018 insert_keys = &split_keys;
2019 b = parent;
Kent Overstreet403b6cd2013-07-24 17:22:44 -07002020 if (!ret)
2021 ret = -EINTR;
Kent Overstreet26c949f2013-09-10 18:41:15 -07002022 }
2023 } else {
2024 BUG_ON(write_block(b) != b->sets[b->nsets].data);
2025
2026 if (bch_btree_insert_keys(b, op, insert_keys)) {
2027 if (!b->level)
2028 bch_btree_leaf_dirty(b, op);
2029 else
2030 bch_btree_node_write(b, &op->cl);
2031 }
2032 }
2033 } while (!bch_keylist_empty(&split_keys));
2034
2035 return ret;
2036}
2037
Kent Overstreete7c590e2013-09-10 18:39:16 -07002038int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
2039 struct bkey *check_key)
2040{
2041 int ret = -EINTR;
2042 uint64_t btree_ptr = b->key.ptr[0];
2043 unsigned long seq = b->seq;
2044 struct keylist insert;
2045 bool upgrade = op->lock == -1;
2046
2047 bch_keylist_init(&insert);
2048
2049 if (upgrade) {
2050 rw_unlock(false, b);
2051 rw_lock(true, b, b->level);
2052
2053 if (b->key.ptr[0] != btree_ptr ||
2054 b->seq != seq + 1)
2055 goto out;
2056 }
2057
2058 SET_KEY_PTRS(check_key, 1);
2059 get_random_bytes(&check_key->ptr[0], sizeof(uint64_t));
2060
2061 SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV);
2062
2063 bch_keylist_add(&insert, check_key);
2064
2065 BUG_ON(op->type != BTREE_INSERT);
2066
2067 ret = bch_btree_insert_node(b, op, &insert);
2068
2069 BUG_ON(!ret && !bch_keylist_empty(&insert));
2070out:
2071 if (upgrade)
2072 downgrade_write(&b->lock);
2073 return ret;
2074}
2075
Kent Overstreet4f3d4012013-09-10 18:46:36 -07002076static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
2077 struct keylist *keys)
Kent Overstreetcafe5632013-03-23 16:11:31 -07002078{
Kent Overstreet4f3d4012013-09-10 18:46:36 -07002079 if (bch_keylist_empty(keys))
Kent Overstreet403b6cd2013-07-24 17:22:44 -07002080 return 0;
2081
Kent Overstreetcafe5632013-03-23 16:11:31 -07002082 if (b->level) {
Kent Overstreet4f3d4012013-09-10 18:46:36 -07002083 struct bkey *k;
Kent Overstreetcafe5632013-03-23 16:11:31 -07002084
Kent Overstreetc2f95ae2013-07-24 17:24:25 -07002085 k = bch_next_recurse_key(b, &START_KEY(keys->keys));
Kent Overstreetcafe5632013-03-23 16:11:31 -07002086 if (!k) {
2087 btree_bug(b, "no key to recurse on at level %i/%i",
2088 b->level, b->c->root->level);
2089
Kent Overstreetc2f95ae2013-07-24 17:24:25 -07002090 bch_keylist_reset(keys);
Kent Overstreetcafe5632013-03-23 16:11:31 -07002091 return -EIO;
2092 }
2093
Kent Overstreet4f3d4012013-09-10 18:46:36 -07002094 return btree(insert_recurse, k, b, op, keys);
Kent Overstreet26c949f2013-09-10 18:41:15 -07002095 } else {
Kent Overstreet4f3d4012013-09-10 18:46:36 -07002096 return bch_btree_insert_node(b, op, keys);
Kent Overstreetcafe5632013-03-23 16:11:31 -07002097 }
Kent Overstreetcafe5632013-03-23 16:11:31 -07002098}
2099
Kent Overstreet4f3d4012013-09-10 18:46:36 -07002100int bch_btree_insert(struct btree_op *op, struct cache_set *c,
2101 struct keylist *keys)
Kent Overstreetcafe5632013-03-23 16:11:31 -07002102{
2103 int ret = 0;
Kent Overstreetcafe5632013-03-23 16:11:31 -07002104
2105 /*
2106 * Don't want to block with the btree locked unless we have to,
2107 * otherwise we get deadlocks with try_harder and between split/gc
2108 */
2109 clear_closure_blocking(&op->cl);
2110
Kent Overstreet4f3d4012013-09-10 18:46:36 -07002111 BUG_ON(bch_keylist_empty(keys));
Kent Overstreetcafe5632013-03-23 16:11:31 -07002112
Kent Overstreet4f3d4012013-09-10 18:46:36 -07002113 while (!bch_keylist_empty(keys)) {
Kent Overstreet403b6cd2013-07-24 17:22:44 -07002114 op->lock = 0;
Kent Overstreet4f3d4012013-09-10 18:46:36 -07002115 ret = btree_root(insert_recurse, c, op, keys);
Kent Overstreetcafe5632013-03-23 16:11:31 -07002116
2117 if (ret == -EAGAIN) {
2118 ret = 0;
2119 closure_sync(&op->cl);
2120 } else if (ret) {
2121 struct bkey *k;
2122
2123 pr_err("error %i trying to insert key for %s",
2124 ret, op_type(op));
2125
Kent Overstreet4f3d4012013-09-10 18:46:36 -07002126 while ((k = bch_keylist_pop(keys)))
Kent Overstreetcafe5632013-03-23 16:11:31 -07002127 bkey_put(c, k, 0);
2128 }
2129 }
2130
Kent Overstreetcafe5632013-03-23 16:11:31 -07002131 return ret;
2132}
2133
2134void bch_btree_set_root(struct btree *b)
2135{
2136 unsigned i;
Kent Overstreete49c7c32013-06-26 17:25:38 -07002137 struct closure cl;
2138
2139 closure_init_stack(&cl);
Kent Overstreetcafe5632013-03-23 16:11:31 -07002140
Kent Overstreetc37511b2013-04-26 15:39:55 -07002141 trace_bcache_btree_set_root(b);
2142
Kent Overstreetcafe5632013-03-23 16:11:31 -07002143 BUG_ON(!b->written);
2144
2145 for (i = 0; i < KEY_PTRS(&b->key); i++)
2146 BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO);
2147
2148 mutex_lock(&b->c->bucket_lock);
2149 list_del_init(&b->list);
2150 mutex_unlock(&b->c->bucket_lock);
2151
2152 b->c->root = b;
2153 __bkey_put(b->c, &b->key);
2154
Kent Overstreete49c7c32013-06-26 17:25:38 -07002155 bch_journal_meta(b->c, &cl);
2156 closure_sync(&cl);
Kent Overstreetcafe5632013-03-23 16:11:31 -07002157}
2158
/* Cache lookup */
2160
2161static int submit_partial_cache_miss(struct btree *b, struct btree_op *op,
2162 struct bkey *k)
2163{
2164 struct search *s = container_of(op, struct search, op);
2165 struct bio *bio = &s->bio.bio;
2166 int ret = 0;
2167
2168 while (!ret &&
2169 !op->lookup_done) {
2170 unsigned sectors = INT_MAX;
2171
2172 if (KEY_INODE(k) == op->inode) {
2173 if (KEY_START(k) <= bio->bi_sector)
2174 break;
2175
2176 sectors = min_t(uint64_t, sectors,
2177 KEY_START(k) - bio->bi_sector);
2178 }
2179
2180 ret = s->d->cache_miss(b, s, bio, sectors);
2181 }
2182
2183 return ret;
2184}
2185
2186/*
2187 * Read from a single key, handling the initial cache miss if the key starts in
2188 * the middle of the bio
2189 */
2190static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
2191 struct bkey *k)
2192{
2193 struct search *s = container_of(op, struct search, op);
2194 struct bio *bio = &s->bio.bio;
2195 unsigned ptr;
2196 struct bio *n;
2197
2198 int ret = submit_partial_cache_miss(b, op, k);
2199 if (ret || op->lookup_done)
2200 return ret;
2201
2202 /* XXX: figure out best pointer - for multiple cache devices */
2203 ptr = 0;
2204
2205 PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;
2206
2207 while (!op->lookup_done &&
2208 KEY_INODE(k) == op->inode &&
2209 bio->bi_sector < KEY_OFFSET(k)) {
2210 struct bkey *bio_key;
2211 sector_t sector = PTR_OFFSET(k, ptr) +
2212 (bio->bi_sector - KEY_START(k));
2213 unsigned sectors = min_t(uint64_t, INT_MAX,
2214 KEY_OFFSET(k) - bio->bi_sector);
2215
2216 n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
Kent Overstreetcafe5632013-03-23 16:11:31 -07002217 if (n == bio)
2218 op->lookup_done = true;
2219
2220 bio_key = &container_of(n, struct bbio, bio)->key;
2221
2222 /*
2223 * The bucket we're reading from might be reused while our bio
2224 * is in flight, and we could then end up reading the wrong
2225 * data.
2226 *
2227 * We guard against this by checking (in cache_read_endio()) if
2228 * the pointer is stale again; if so, we treat it as an error
2229 * and reread from the backing device (but we don't pass that
2230 * error up anywhere).
2231 */
2232
2233 bch_bkey_copy_single_ptr(bio_key, k, ptr);
2234 SET_PTR_OFFSET(bio_key, 0, sector);
2235
2236 n->bi_end_io = bch_cache_read_endio;
2237 n->bi_private = &s->cl;
2238
Kent Overstreetcafe5632013-03-23 16:11:31 -07002239 __bch_submit_bbio(n, b->c);
2240 }
2241
2242 return 0;
2243}
2244
2245int bch_btree_search_recurse(struct btree *b, struct btree_op *op)
2246{
2247 struct search *s = container_of(op, struct search, op);
2248 struct bio *bio = &s->bio.bio;
2249
2250 int ret = 0;
2251 struct bkey *k;
2252 struct btree_iter iter;
2253 bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0));
2254
Kent Overstreetcafe5632013-03-23 16:11:31 -07002255 do {
2256 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
2257 if (!k) {
2258 /*
2259 * b->key would be exactly what we want, except that
2260 * pointers to btree nodes have nonzero size - we
2261 * wouldn't go far enough
2262 */
2263
2264 ret = submit_partial_cache_miss(b, op,
2265 &KEY(KEY_INODE(&b->key),
2266 KEY_OFFSET(&b->key), 0));
2267 break;
2268 }
2269
2270 ret = b->level
2271 ? btree(search_recurse, k, b, op)
2272 : submit_partial_cache_hit(b, op, k);
2273 } while (!ret &&
2274 !op->lookup_done);
2275
2276 return ret;
2277}
2278
/* Keybuf code */
2280
2281static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
2282{
2283 /* Overlapping keys compare equal */
2284 if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0)
2285 return -1;
2286 if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0)
2287 return 1;
2288 return 0;
2289}
2290
2291static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
2292 struct keybuf_key *r)
2293{
2294 return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
2295}
2296
2297static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
Kent Overstreet72c27062013-06-05 06:24:39 -07002298 struct keybuf *buf, struct bkey *end,
2299 keybuf_pred_fn *pred)
Kent Overstreetcafe5632013-03-23 16:11:31 -07002300{
2301 struct btree_iter iter;
2302 bch_btree_iter_init(b, &iter, &buf->last_scanned);
2303
2304 while (!array_freelist_empty(&buf->freelist)) {
2305 struct bkey *k = bch_btree_iter_next_filter(&iter, b,
2306 bch_ptr_bad);
2307
2308 if (!b->level) {
2309 if (!k) {
2310 buf->last_scanned = b->key;
2311 break;
2312 }
2313
2314 buf->last_scanned = *k;
2315 if (bkey_cmp(&buf->last_scanned, end) >= 0)
2316 break;
2317
Kent Overstreet72c27062013-06-05 06:24:39 -07002318 if (pred(buf, k)) {
Kent Overstreetcafe5632013-03-23 16:11:31 -07002319 struct keybuf_key *w;
2320
Kent Overstreetcafe5632013-03-23 16:11:31 -07002321 spin_lock(&buf->lock);
2322
2323 w = array_alloc(&buf->freelist);
2324
2325 w->private = NULL;
2326 bkey_copy(&w->key, k);
2327
2328 if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
2329 array_free(&buf->freelist, w);
2330
2331 spin_unlock(&buf->lock);
2332 }
2333 } else {
2334 if (!k)
2335 break;
2336
Kent Overstreet72c27062013-06-05 06:24:39 -07002337 btree(refill_keybuf, k, b, op, buf, end, pred);
Kent Overstreetcafe5632013-03-23 16:11:31 -07002338 /*
2339 * Might get an error here, but can't really do anything
2340 * and it'll get logged elsewhere. Just read what we
2341 * can.
2342 */
2343
2344 if (bkey_cmp(&buf->last_scanned, end) >= 0)
2345 break;
2346
2347 cond_resched();
2348 }
2349 }
2350
2351 return 0;
2352}
2353
2354void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
Kent Overstreet72c27062013-06-05 06:24:39 -07002355 struct bkey *end, keybuf_pred_fn *pred)
Kent Overstreetcafe5632013-03-23 16:11:31 -07002356{
2357 struct bkey start = buf->last_scanned;
2358 struct btree_op op;
2359 bch_btree_op_init_stack(&op);
2360
2361 cond_resched();
2362
Kent Overstreet72c27062013-06-05 06:24:39 -07002363 btree_root(refill_keybuf, c, &op, buf, end, pred);
Kent Overstreetcafe5632013-03-23 16:11:31 -07002364 closure_sync(&op.cl);
2365
2366 pr_debug("found %s keys from %llu:%llu to %llu:%llu",
2367 RB_EMPTY_ROOT(&buf->keys) ? "no" :
2368 array_freelist_empty(&buf->freelist) ? "some" : "a few",
2369 KEY_INODE(&start), KEY_OFFSET(&start),
2370 KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned));
2371
2372 spin_lock(&buf->lock);
2373
2374 if (!RB_EMPTY_ROOT(&buf->keys)) {
2375 struct keybuf_key *w;
2376 w = RB_FIRST(&buf->keys, struct keybuf_key, node);
2377 buf->start = START_KEY(&w->key);
2378
2379 w = RB_LAST(&buf->keys, struct keybuf_key, node);
2380 buf->end = w->key;
2381 } else {
2382 buf->start = MAX_KEY;
2383 buf->end = MAX_KEY;
2384 }
2385
2386 spin_unlock(&buf->lock);
2387}
2388
2389static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
2390{
2391 rb_erase(&w->node, &buf->keys);
2392 array_free(&buf->freelist, w);
2393}
2394
2395void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
2396{
2397 spin_lock(&buf->lock);
2398 __bch_keybuf_del(buf, w);
2399 spin_unlock(&buf->lock);
2400}
2401
2402bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
2403 struct bkey *end)
2404{
2405 bool ret = false;
2406 struct keybuf_key *p, *w, s;
2407 s.key = *start;
2408
2409 if (bkey_cmp(end, &buf->start) <= 0 ||
2410 bkey_cmp(start, &buf->end) >= 0)
2411 return false;
2412
2413 spin_lock(&buf->lock);
2414 w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);
2415
2416 while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) {
2417 p = w;
2418 w = RB_NEXT(w, node);
2419
2420 if (p->private)
2421 ret = true;
2422 else
2423 __bch_keybuf_del(buf, p);
2424 }
2425
2426 spin_unlock(&buf->lock);
2427 return ret;
2428}
2429
2430struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
2431{
2432 struct keybuf_key *w;
2433 spin_lock(&buf->lock);
2434
2435 w = RB_FIRST(&buf->keys, struct keybuf_key, node);
2436
2437 while (w && w->private)
2438 w = RB_NEXT(w, node);
2439
2440 if (w)
2441 w->private = ERR_PTR(-EINTR);
2442
2443 spin_unlock(&buf->lock);
2444 return w;
2445}
2446
2447struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
2448 struct keybuf *buf,
Kent Overstreet72c27062013-06-05 06:24:39 -07002449 struct bkey *end,
2450 keybuf_pred_fn *pred)
Kent Overstreetcafe5632013-03-23 16:11:31 -07002451{
2452 struct keybuf_key *ret;
2453
2454 while (1) {
2455 ret = bch_keybuf_next(buf);
2456 if (ret)
2457 break;
2458
2459 if (bkey_cmp(&buf->last_scanned, end) >= 0) {
2460 pr_debug("scan finished");
2461 break;
2462 }
2463
Kent Overstreet72c27062013-06-05 06:24:39 -07002464 bch_refill_keybuf(c, buf, end, pred);
Kent Overstreetcafe5632013-03-23 16:11:31 -07002465 }
2466
2467 return ret;
2468}
2469
Kent Overstreet72c27062013-06-05 06:24:39 -07002470void bch_keybuf_init(struct keybuf *buf)
Kent Overstreetcafe5632013-03-23 16:11:31 -07002471{
Kent Overstreetcafe5632013-03-23 16:11:31 -07002472 buf->last_scanned = MAX_KEY;
2473 buf->keys = RB_ROOT;
2474
2475 spin_lock_init(&buf->lock);
2476 array_allocator_init(&buf->freelist);
2477}
2478
2479void bch_btree_exit(void)
2480{
2481 if (btree_io_wq)
2482 destroy_workqueue(btree_io_wq);
2483 if (bch_gc_wq)
2484 destroy_workqueue(bch_gc_wq);
2485}
2486
2487int __init bch_btree_init(void)
2488{
2489 if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) ||
2490 !(btree_io_wq = create_singlethread_workqueue("bch_btree_io")))
2491 return -ENOMEM;
2492
2493 return 0;
2494}