blob: 2879487d036a8d11b6e807fe70e1f91f7457fcf2 [file] [log] [blame]
Kent Overstreetcafe5632013-03-23 16:11:31 -07001/*
2 * Primary bucket allocation code
3 *
4 * Copyright 2012 Google, Inc.
5 *
6 * Allocation in bcache is done in terms of buckets:
7 *
8 * Each bucket has an associated 8 bit gen; this gen corresponds to the gen in
9 * btree pointers - they must match for the pointer to be considered valid.
10 *
11 * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
12 * bucket simply by incrementing its gen.
13 *
14 * The gens (along with the priorities; it's really the gens that are important but
15 * the code is named as if it's the priorities) are written in an arbitrary list
16 * of buckets on disk, with a pointer to them in the journal header.
17 *
18 * When we invalidate a bucket, we have to write its new gen to disk and wait
19 * for that write to complete before we use it - otherwise after a crash we
20 * could have pointers that appeared to be good but pointed to data that had
21 * been overwritten.
22 *
23 * Since the gens and priorities are all stored contiguously on disk, we can
24 * batch this up: We fill up the free_inc list with freshly invalidated buckets,
25 * call prio_write(), and when prio_write() finishes we pull buckets off the
26 * free_inc list and optionally discard them.
27 *
28 * free_inc isn't the only freelist - if it was, we'd often have to sleep while
29 * priorities and gens were being written before we could allocate. c->free is a
30 * smaller freelist, and buckets on that list are always ready to be used.
31 *
32 * If we've got discards enabled, that happens when a bucket moves from the
33 * free_inc list to the free list.
34 *
35 * There is another freelist, because sometimes we have buckets that we know
36 * have nothing pointing into them - these we can reuse without waiting for
37 * priorities to be rewritten. These come from freed btree nodes and buckets
38 * that garbage collection discovered no longer had valid keys pointing into
39 * them (because they were overwritten). That's the unused list - buckets on the
40 * unused list move to the free list, optionally being discarded in the process.
41 *
42 * It's also important to ensure that gens don't wrap around - with respect to
43 * either the oldest gen in the btree or the gen on disk. This is quite
44 * difficult to do in practice, but we explicitly guard against it anyways - if
45 * a bucket is in danger of wrapping around we simply skip invalidating it that
46 * time around, and we garbage collect or rewrite the priorities sooner than we
47 * would have otherwise.
48 *
49 * bch_bucket_alloc() allocates a single bucket from a specific cache.
50 *
51 * bch_bucket_alloc_set() allocates one or more buckets from different caches
52 * out of a cache set.
53 *
54 * free_some_buckets() drives all the processes described above. It's called
55 * from bch_bucket_alloc() and a few other places that need to make sure free
56 * buckets are ready.
57 *
58 * invalidate_buckets_(lru|fifo)() find buckets that are available to be
59 * invalidated, and then invalidate them and stick them on the free_inc list -
60 * in either lru or fifo order.
61 */
62
63#include "bcache.h"
64#include "btree.h"
65
66#include <linux/random.h>
67
/*
 * Number of struct discard descriptors preallocated for each cache by
 * bch_cache_allocator_init(). do_discard() takes one descriptor per
 * submitted discard, so this also bounds how many can be outstanding.
 */
#define MAX_IN_FLIGHT_DISCARDS		8U
69
70/* Bucket heap / gen */
71
72uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
73{
74 uint8_t ret = ++b->gen;
75
76 ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b));
77 WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX);
78
79 if (CACHE_SYNC(&ca->set->sb)) {
80 ca->need_save_prio = max(ca->need_save_prio,
81 bucket_disk_gen(b));
82 WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX);
83 }
84
85 return ret;
86}
87
88void bch_rescale_priorities(struct cache_set *c, int sectors)
89{
90 struct cache *ca;
91 struct bucket *b;
92 unsigned next = c->nbuckets * c->sb.bucket_size / 1024;
93 unsigned i;
94 int r;
95
96 atomic_sub(sectors, &c->rescale);
97
98 do {
99 r = atomic_read(&c->rescale);
100
101 if (r >= 0)
102 return;
103 } while (atomic_cmpxchg(&c->rescale, r, r + next) != r);
104
105 mutex_lock(&c->bucket_lock);
106
107 c->min_prio = USHRT_MAX;
108
109 for_each_cache(ca, c, i)
110 for_each_bucket(b, ca)
111 if (b->prio &&
112 b->prio != BTREE_PRIO &&
113 !atomic_read(&b->pin)) {
114 b->prio--;
115 c->min_prio = min(c->min_prio, b->prio);
116 }
117
118 mutex_unlock(&c->bucket_lock);
119}
120
121/* Discard/TRIM */
122
123struct discard {
124 struct list_head list;
125 struct work_struct work;
126 struct cache *ca;
127 long bucket;
128
129 struct bio bio;
130 struct bio_vec bv;
131};
132
133static void discard_finish(struct work_struct *w)
134{
135 struct discard *d = container_of(w, struct discard, work);
136 struct cache *ca = d->ca;
137 char buf[BDEVNAME_SIZE];
138
139 if (!test_bit(BIO_UPTODATE, &d->bio.bi_flags)) {
140 pr_notice("discard error on %s, disabling",
141 bdevname(ca->bdev, buf));
142 d->ca->discard = 0;
143 }
144
145 mutex_lock(&ca->set->bucket_lock);
146
147 fifo_push(&ca->free, d->bucket);
148 list_add(&d->list, &ca->discards);
149 atomic_dec(&ca->discards_in_flight);
150
151 mutex_unlock(&ca->set->bucket_lock);
152
153 closure_wake_up(&ca->set->bucket_wait);
154 wake_up(&ca->set->alloc_wait);
155
156 closure_put(&ca->set->cl);
157}
158
159static void discard_endio(struct bio *bio, int error)
160{
161 struct discard *d = container_of(bio, struct discard, bio);
162 schedule_work(&d->work);
163}
164
165static void do_discard(struct cache *ca, long bucket)
166{
167 struct discard *d = list_first_entry(&ca->discards,
168 struct discard, list);
169
170 list_del(&d->list);
171 d->bucket = bucket;
172
173 atomic_inc(&ca->discards_in_flight);
174 closure_get(&ca->set->cl);
175
176 bio_init(&d->bio);
177
178 d->bio.bi_sector = bucket_to_sector(ca->set, d->bucket);
179 d->bio.bi_bdev = ca->bdev;
180 d->bio.bi_rw = REQ_WRITE|REQ_DISCARD;
181 d->bio.bi_max_vecs = 1;
182 d->bio.bi_io_vec = d->bio.bi_inline_vecs;
183 d->bio.bi_size = bucket_bytes(ca);
184 d->bio.bi_end_io = discard_endio;
185 bio_set_prio(&d->bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
186
187 submit_bio(0, &d->bio);
188}
189
190/* Allocation */
191
192static inline bool can_inc_bucket_gen(struct bucket *b)
193{
194 return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX &&
195 bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX;
196}
197
198bool bch_bucket_add_unused(struct cache *ca, struct bucket *b)
199{
200 BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b));
201
202 if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] &&
203 CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO)
204 return false;
205
206 b->prio = 0;
207
208 if (can_inc_bucket_gen(b) &&
209 fifo_push(&ca->unused, b - ca->buckets)) {
210 atomic_inc(&b->pin);
211 return true;
212 }
213
214 return false;
215}
216
217static bool can_invalidate_bucket(struct cache *ca, struct bucket *b)
218{
219 return GC_MARK(b) == GC_MARK_RECLAIMABLE &&
220 !atomic_read(&b->pin) &&
221 can_inc_bucket_gen(b);
222}
223
224static void invalidate_one_bucket(struct cache *ca, struct bucket *b)
225{
226 bch_inc_gen(ca, b);
227 b->prio = INITIAL_PRIO;
228 atomic_inc(&b->pin);
229 fifo_push(&ca->free_inc, b - ca->buckets);
230}
231
/*
 * Weight of a bucket for LRU invalidation: its prio relative to the cache
 * set's min_prio, multiplied by the number of sectors GC counted as used.
 *
 * These macros expect a variable named `ca` (struct cache *) to be in scope
 * at the point of expansion. The argument is parenthesized so that arbitrary
 * pointer expressions (e.g. `p + 1`) expand correctly.
 */
#define bucket_prio(b)							\
	(((unsigned) ((b)->prio - ca->set->min_prio)) * GC_SECTORS_USED(b))

/* Comparison callbacks handed to the heap_*() helpers. */
#define bucket_max_cmp(l, r)	(bucket_prio(l) < bucket_prio(r))
#define bucket_min_cmp(l, r)	(bucket_prio(l) > bucket_prio(r))
237
Kent Overstreetcafe5632013-03-23 16:11:31 -0700238static void invalidate_buckets_lru(struct cache *ca)
239{
Kent Overstreetcafe5632013-03-23 16:11:31 -0700240 struct bucket *b;
241 ssize_t i;
242
243 ca->heap.used = 0;
244
245 for_each_bucket(b, ca) {
246 if (!can_invalidate_bucket(ca, b))
247 continue;
248
249 if (!GC_SECTORS_USED(b)) {
250 if (!bch_bucket_add_unused(ca, b))
251 return;
252 } else {
253 if (!heap_full(&ca->heap))
254 heap_add(&ca->heap, b, bucket_max_cmp);
255 else if (bucket_max_cmp(b, heap_peek(&ca->heap))) {
256 ca->heap.data[0] = b;
257 heap_sift(&ca->heap, 0, bucket_max_cmp);
258 }
259 }
260 }
261
262 if (ca->heap.used * 2 < ca->heap.size)
263 bch_queue_gc(ca->set);
264
265 for (i = ca->heap.used / 2 - 1; i >= 0; --i)
266 heap_sift(&ca->heap, i, bucket_min_cmp);
267
268 while (!fifo_full(&ca->free_inc)) {
269 if (!heap_pop(&ca->heap, b, bucket_min_cmp)) {
270 /* We don't want to be calling invalidate_buckets()
271 * multiple times when it can't do anything
272 */
273 ca->invalidate_needs_gc = 1;
274 bch_queue_gc(ca->set);
275 return;
276 }
277
278 invalidate_one_bucket(ca, b);
279 }
280}
281
282static void invalidate_buckets_fifo(struct cache *ca)
283{
284 struct bucket *b;
285 size_t checked = 0;
286
287 while (!fifo_full(&ca->free_inc)) {
288 if (ca->fifo_last_bucket < ca->sb.first_bucket ||
289 ca->fifo_last_bucket >= ca->sb.nbuckets)
290 ca->fifo_last_bucket = ca->sb.first_bucket;
291
292 b = ca->buckets + ca->fifo_last_bucket++;
293
294 if (can_invalidate_bucket(ca, b))
295 invalidate_one_bucket(ca, b);
296
297 if (++checked >= ca->sb.nbuckets) {
298 ca->invalidate_needs_gc = 1;
299 bch_queue_gc(ca->set);
300 return;
301 }
302 }
303}
304
305static void invalidate_buckets_random(struct cache *ca)
306{
307 struct bucket *b;
308 size_t checked = 0;
309
310 while (!fifo_full(&ca->free_inc)) {
311 size_t n;
312 get_random_bytes(&n, sizeof(n));
313
314 n %= (size_t) (ca->sb.nbuckets - ca->sb.first_bucket);
315 n += ca->sb.first_bucket;
316
317 b = ca->buckets + n;
318
319 if (can_invalidate_bucket(ca, b))
320 invalidate_one_bucket(ca, b);
321
322 if (++checked >= ca->sb.nbuckets / 2) {
323 ca->invalidate_needs_gc = 1;
324 bch_queue_gc(ca->set);
325 return;
326 }
327 }
328}
329
330static void invalidate_buckets(struct cache *ca)
331{
332 if (ca->invalidate_needs_gc)
333 return;
334
335 switch (CACHE_REPLACEMENT(&ca->sb)) {
336 case CACHE_REPLACEMENT_LRU:
337 invalidate_buckets_lru(ca);
338 break;
339 case CACHE_REPLACEMENT_FIFO:
340 invalidate_buckets_fifo(ca);
341 break;
342 case CACHE_REPLACEMENT_RANDOM:
343 invalidate_buckets_random(ca);
344 break;
345 }
346}
347
/*
 * Sleep on the cache set's alloc_wait queue until @cond becomes true.
 *
 * Must be used with the set's bucket_lock held: @cond is evaluated under the
 * lock, and the lock is dropped around schedule() and re-taken afterwards.
 * A `struct closure *cl` must be in scope at the expansion site; if the
 * CACHE_SET_STOPPING_2 flag is set, the macro finishes the wait and invokes
 * closure_return(cl) with bucket_lock released.
 * NOTE(review): closure_return() presumably leaves the enclosing function -
 * confirm against the closure implementation.
 *
 * @ca is parenthesized at every use so any pointer expression may be passed.
 */
#define allocator_wait(ca, cond)					\
do {									\
	DEFINE_WAIT(__wait);						\
									\
	while (!(cond)) {						\
		prepare_to_wait(&(ca)->set->alloc_wait,			\
				&__wait, TASK_INTERRUPTIBLE);		\
									\
		mutex_unlock(&(ca)->set->bucket_lock);			\
		if (test_bit(CACHE_SET_STOPPING_2,			\
			     &(ca)->set->flags)) {			\
			finish_wait(&(ca)->set->alloc_wait, &__wait);	\
			closure_return(cl);				\
		}							\
									\
		schedule();						\
		__set_current_state(TASK_RUNNING);			\
		mutex_lock(&(ca)->set->bucket_lock);			\
	}								\
									\
	finish_wait(&(ca)->set->alloc_wait, &__wait);			\
} while (0)
369
370void bch_allocator_thread(struct closure *cl)
371{
372 struct cache *ca = container_of(cl, struct cache, alloc);
373
374 mutex_lock(&ca->set->bucket_lock);
375
376 while (1) {
377 while (1) {
378 long bucket;
379
380 if ((!atomic_read(&ca->set->prio_blocked) ||
381 !CACHE_SYNC(&ca->set->sb)) &&
382 !fifo_empty(&ca->unused))
383 fifo_pop(&ca->unused, bucket);
384 else if (!fifo_empty(&ca->free_inc))
385 fifo_pop(&ca->free_inc, bucket);
386 else
387 break;
388
389 allocator_wait(ca, (int) fifo_free(&ca->free) >
390 atomic_read(&ca->discards_in_flight));
391
392 if (ca->discard) {
393 allocator_wait(ca, !list_empty(&ca->discards));
394 do_discard(ca, bucket);
395 } else {
396 fifo_push(&ca->free, bucket);
397 closure_wake_up(&ca->set->bucket_wait);
398 }
399 }
400
401 allocator_wait(ca, ca->set->gc_mark_valid);
402 invalidate_buckets(ca);
403
404 allocator_wait(ca, !atomic_read(&ca->set->prio_blocked) ||
405 !CACHE_SYNC(&ca->set->sb));
406
407 if (CACHE_SYNC(&ca->set->sb) &&
408 (!fifo_empty(&ca->free_inc) ||
409 ca->need_save_prio > 64)) {
410 bch_prio_write(ca);
411 }
412 }
413}
414
415long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl)
416{
417 long r = -1;
418again:
419 wake_up(&ca->set->alloc_wait);
420
421 if (fifo_used(&ca->free) > ca->watermark[watermark] &&
422 fifo_pop(&ca->free, r)) {
423 struct bucket *b = ca->buckets + r;
424#ifdef CONFIG_BCACHE_EDEBUG
425 size_t iter;
426 long i;
427
428 for (iter = 0; iter < prio_buckets(ca) * 2; iter++)
429 BUG_ON(ca->prio_buckets[iter] == (uint64_t) r);
430
431 fifo_for_each(i, &ca->free, iter)
432 BUG_ON(i == r);
433 fifo_for_each(i, &ca->free_inc, iter)
434 BUG_ON(i == r);
435 fifo_for_each(i, &ca->unused, iter)
436 BUG_ON(i == r);
437#endif
438 BUG_ON(atomic_read(&b->pin) != 1);
439
440 SET_GC_SECTORS_USED(b, ca->sb.bucket_size);
441
442 if (watermark <= WATERMARK_METADATA) {
443 SET_GC_MARK(b, GC_MARK_METADATA);
444 b->prio = BTREE_PRIO;
445 } else {
446 SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
447 b->prio = INITIAL_PRIO;
448 }
449
450 return r;
451 }
452
453 pr_debug("alloc failure: blocked %i free %zu free_inc %zu unused %zu",
454 atomic_read(&ca->set->prio_blocked), fifo_used(&ca->free),
455 fifo_used(&ca->free_inc), fifo_used(&ca->unused));
456
457 if (cl) {
458 closure_wait(&ca->set->bucket_wait, cl);
459
460 if (closure_blocking(cl)) {
461 mutex_unlock(&ca->set->bucket_lock);
462 closure_sync(cl);
463 mutex_lock(&ca->set->bucket_lock);
464 goto again;
465 }
466 }
467
468 return -1;
469}
470
/*
 * Release every bucket that key @k points to: clear its GC mark and
 * sectors-used count, then offer it to the owning cache's unused list.
 * The result of bch_bucket_add_unused() is not checked.
 */
void bch_bucket_free(struct cache_set *c, struct bkey *k)
{
	unsigned i = 0;

	while (i < KEY_PTRS(k)) {
		struct cache *ca = PTR_CACHE(c, k, i);
		struct bucket *b = PTR_BUCKET(c, k, i);

		SET_GC_SECTORS_USED(b, 0);
		SET_GC_MARK(b, 0);
		bch_bucket_add_unused(ca, b);

		i++;
	}
}
483
484int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
485 struct bkey *k, int n, struct closure *cl)
486{
487 int i;
488
489 lockdep_assert_held(&c->bucket_lock);
490 BUG_ON(!n || n > c->caches_loaded || n > 8);
491
492 bkey_init(k);
493
494 /* sort by free space/prio of oldest data in caches */
495
496 for (i = 0; i < n; i++) {
497 struct cache *ca = c->cache_by_alloc[i];
498 long b = bch_bucket_alloc(ca, watermark, cl);
499
500 if (b == -1)
501 goto err;
502
503 k->ptr[i] = PTR(ca->buckets[b].gen,
504 bucket_to_sector(c, b),
505 ca->sb.nr_this_dev);
506
507 SET_KEY_PTRS(k, i + 1);
508 }
509
510 return 0;
511err:
512 bch_bucket_free(c, k);
513 __bkey_put(c, k);
514 return -1;
515}
516
517int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
518 struct bkey *k, int n, struct closure *cl)
519{
520 int ret;
521 mutex_lock(&c->bucket_lock);
522 ret = __bch_bucket_alloc_set(c, watermark, k, n, cl);
523 mutex_unlock(&c->bucket_lock);
524 return ret;
525}
526
527/* Init */
528
529void bch_cache_allocator_exit(struct cache *ca)
530{
531 struct discard *d;
532
533 while (!list_empty(&ca->discards)) {
534 d = list_first_entry(&ca->discards, struct discard, list);
535 cancel_work_sync(&d->work);
536 list_del(&d->list);
537 kfree(d);
538 }
539}
540
541int bch_cache_allocator_init(struct cache *ca)
542{
543 unsigned i;
544
545 /*
546 * Reserve:
547 * Prio/gen writes first
548 * Then 8 for btree allocations
549 * Then half for the moving garbage collector
550 */
551
552 ca->watermark[WATERMARK_PRIO] = 0;
553
554 ca->watermark[WATERMARK_METADATA] = prio_buckets(ca);
555
556 ca->watermark[WATERMARK_MOVINGGC] = 8 +
557 ca->watermark[WATERMARK_METADATA];
558
559 ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
560 ca->watermark[WATERMARK_MOVINGGC];
561
562 for (i = 0; i < MAX_IN_FLIGHT_DISCARDS; i++) {
563 struct discard *d = kzalloc(sizeof(*d), GFP_KERNEL);
564 if (!d)
565 return -ENOMEM;
566
567 d->ca = ca;
568 INIT_WORK(&d->work, discard_finish);
569 list_add(&d->list, &ca->discards);
570 }
571
572 return 0;
573}