Merge branch 'bcache' (bcache fixes from Kent Overstreet)
Merge bcache fixes from Kent Overstreet:
"There's fixes for _three_ different data corruption bugs, all of which
were found by users hitting them in the wild.
The first one isn't bcache specific - in 3.11 bcache was switched to
the bio_copy_data in fs/bio.c, and that's when the bug in that code
was discovered, but it's also used by raid1 and pktcdvd. (That was my
code too, so the bug's doubly embarassing given that it was or
should've been just a cut and paste from bcache code. Dunno what
happened there).
Most of these (all the non data corruption bugs, actually) were ready
before the merge window and have been sitting in Jens' tree, but I
don't know what's been up with him lately..."
* emailed patches from Kent Overstreet <kmo@daterainc.com>:
bcache: Fix flushes in writeback mode
bcache: Fix for handling overlapping extents when reading in a btree node
bcache: Fix a shrinker deadlock
bcache: Fix a dumb CPU spinning bug in writeback
bcache: Fix a flush/fua performance bug
bcache: Fix a writeback performance regression
bcache: Correct printf()-style format length modifier
bcache: Fix for when no journal entries are found
bcache: Strip endline when writing the label through sysfs
bcache: Fix a dumb journal discard bug
block: Fix bio_copy_data()
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index b39f6f0..0f12382 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -498,7 +498,7 @@
*/
atomic_t has_dirty;
- struct ratelimit writeback_rate;
+ struct bch_ratelimit writeback_rate;
struct delayed_work writeback_rate_update;
/*
@@ -507,10 +507,9 @@
*/
sector_t last_read;
- /* Number of writeback bios in flight */
- atomic_t in_flight;
+ /* Limit number of writeback bios in flight */
+ struct semaphore in_flight;
struct closure_with_timer writeback;
- struct closure_waitlist writeback_wait;
struct keybuf writeback_keys;
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 8010eed..22d1ae7 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -926,28 +926,45 @@
/* Mergesort */
+static void sort_key_next(struct btree_iter *iter,
+ struct btree_iter_set *i)
+{
+ i->k = bkey_next(i->k);
+
+ if (i->k == i->end)
+ *i = iter->data[--iter->used];
+}
+
static void btree_sort_fixup(struct btree_iter *iter)
{
while (iter->used > 1) {
struct btree_iter_set *top = iter->data, *i = top + 1;
- struct bkey *k;
if (iter->used > 2 &&
btree_iter_cmp(i[0], i[1]))
i++;
- for (k = i->k;
- k != i->end && bkey_cmp(top->k, &START_KEY(k)) > 0;
- k = bkey_next(k))
- if (top->k > i->k)
- __bch_cut_front(top->k, k);
- else if (KEY_SIZE(k))
- bch_cut_back(&START_KEY(k), top->k);
-
- if (top->k < i->k || k == i->k)
+ if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0)
break;
- heap_sift(iter, i - top, btree_iter_cmp);
+ if (!KEY_SIZE(i->k)) {
+ sort_key_next(iter, i);
+ heap_sift(iter, i - top, btree_iter_cmp);
+ continue;
+ }
+
+ if (top->k > i->k) {
+ if (bkey_cmp(top->k, i->k) >= 0)
+ sort_key_next(iter, i);
+ else
+ bch_cut_front(top->k, i->k);
+
+ heap_sift(iter, i - top, btree_iter_cmp);
+ } else {
+ /* can't happen because of comparison func */
+ BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));
+ bch_cut_back(&START_KEY(i->k), top->k);
+ }
}
}
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index f9764e6..f42fc7e 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -255,7 +255,7 @@
return;
err:
- bch_cache_set_error(b->c, "io error reading bucket %lu",
+ bch_cache_set_error(b->c, "io error reading bucket %zu",
PTR_BUCKET_NR(b->c, &b->key, 0));
}
@@ -612,7 +612,7 @@
return SHRINK_STOP;
/* Return -1 if we can't do anything right now */
- if (sc->gfp_mask & __GFP_WAIT)
+ if (sc->gfp_mask & __GFP_IO)
mutex_lock(&c->bucket_lock);
else if (!mutex_trylock(&c->bucket_lock))
return -1;
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index ba95ab8..8435f81 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -153,7 +153,8 @@
bitmap_zero(bitmap, SB_JOURNAL_BUCKETS);
pr_debug("%u journal buckets", ca->sb.njournal_buckets);
- /* Read journal buckets ordered by golden ratio hash to quickly
+ /*
+ * Read journal buckets ordered by golden ratio hash to quickly
* find a sequence of buckets with valid journal entries
*/
for (i = 0; i < ca->sb.njournal_buckets; i++) {
@@ -166,18 +167,20 @@
goto bsearch;
}
- /* If that fails, check all the buckets we haven't checked
+ /*
+ * If that fails, check all the buckets we haven't checked
* already
*/
pr_debug("falling back to linear search");
- for (l = 0; l < ca->sb.njournal_buckets; l++) {
- if (test_bit(l, bitmap))
- continue;
-
+ for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets);
+ l < ca->sb.njournal_buckets;
+ l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets, l + 1))
if (read_bucket(l))
goto bsearch;
- }
+
+ if (list_empty(list))
+ continue;
bsearch:
/* Binary search */
m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
@@ -197,10 +200,12 @@
r = m;
}
- /* Read buckets in reverse order until we stop finding more
+ /*
+ * Read buckets in reverse order until we stop finding more
* journal entries
*/
- pr_debug("finishing up");
+ pr_debug("finishing up: m %u njournal_buckets %u",
+ m, ca->sb.njournal_buckets);
l = m;
while (1) {
@@ -228,9 +233,10 @@
}
}
- c->journal.seq = list_entry(list->prev,
- struct journal_replay,
- list)->j.seq;
+ if (!list_empty(list))
+ c->journal.seq = list_entry(list->prev,
+ struct journal_replay,
+ list)->j.seq;
return 0;
#undef read_bucket
@@ -428,7 +434,7 @@
return;
}
- switch (atomic_read(&ja->discard_in_flight) == DISCARD_IN_FLIGHT) {
+ switch (atomic_read(&ja->discard_in_flight)) {
case DISCARD_IN_FLIGHT:
return;
@@ -689,6 +695,7 @@
if (cl)
BUG_ON(!closure_wait(&w->wait, cl));
+ closure_flush(&c->journal.io);
__journal_try_write(c, true);
}
}
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 786a1a4..71eb233 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -997,14 +997,17 @@
} else {
bch_writeback_add(dc);
- if (s->op.flush_journal) {
+ if (bio->bi_rw & REQ_FLUSH) {
/* Also need to send a flush to the backing device */
- s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
- dc->disk.bio_split);
+ struct bio *flush = bio_alloc_bioset(0, GFP_NOIO,
+ dc->disk.bio_split);
- bio->bi_size = 0;
- bio->bi_vcnt = 0;
- closure_bio_submit(bio, cl, s->d);
+ flush->bi_rw = WRITE_FLUSH;
+ flush->bi_bdev = bio->bi_bdev;
+ flush->bi_end_io = request_endio;
+ flush->bi_private = cl;
+
+ closure_bio_submit(flush, cl, s->d);
} else {
s->op.cache_bio = bio;
}
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 4fe6ab2..924dcfd 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -223,8 +223,13 @@
}
if (attr == &sysfs_label) {
- /* note: endlines are preserved */
- memcpy(dc->sb.label, buf, SB_LABEL_SIZE);
+ if (size > SB_LABEL_SIZE)
+ return -EINVAL;
+ memcpy(dc->sb.label, buf, size);
+ if (size < SB_LABEL_SIZE)
+ dc->sb.label[size] = '\0';
+ if (size && dc->sb.label[size - 1] == '\n')
+ dc->sb.label[size - 1] = '\0';
bch_write_bdev_super(dc, NULL);
if (dc->disk.c) {
memcpy(dc->disk.c->uuids[dc->disk.id].label,
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index 98eb811..420dad5 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -190,7 +190,16 @@
stats->last = now ?: 1;
}
-unsigned bch_next_delay(struct ratelimit *d, uint64_t done)
+/**
+ * bch_next_delay() - increment @d by the amount of work done, and return how
+ * long to delay until the next time to do some work.
+ *
+ * @d - the struct bch_ratelimit to update
+ * @done - the amount of work done, in arbitrary units
+ *
+ * Returns the amount of time to delay by, in jiffies
+ */
+uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
{
uint64_t now = local_clock();
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 1ae2a73..ea345c6 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -450,17 +450,23 @@
(ewma) >> factor; \
})
-struct ratelimit {
+struct bch_ratelimit {
+ /* Next time we want to do some work, in nanoseconds */
uint64_t next;
+
+ /*
+ * Rate at which we want to do work, in units per nanosecond
+ * The units here correspond to the units passed to bch_next_delay()
+ */
unsigned rate;
};
-static inline void ratelimit_reset(struct ratelimit *d)
+static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
{
d->next = local_clock();
}
-unsigned bch_next_delay(struct ratelimit *d, uint64_t done);
+uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done);
#define __DIV_SAFE(n, d, zero) \
({ \
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 22cbff5..ba3ee48 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -94,11 +94,15 @@
static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
{
+ uint64_t ret;
+
if (atomic_read(&dc->disk.detaching) ||
!dc->writeback_percent)
return 0;
- return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
+ ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
+
+ return min_t(uint64_t, ret, HZ);
}
/* Background writeback */
@@ -208,7 +212,7 @@
up_write(&dc->writeback_lock);
- ratelimit_reset(&dc->writeback_rate);
+ bch_ratelimit_reset(&dc->writeback_rate);
/* Punt to workqueue only so we don't recurse and blow the stack */
continue_at(cl, read_dirty, dirty_wq);
@@ -318,9 +322,7 @@
}
bch_keybuf_del(&dc->writeback_keys, w);
- atomic_dec_bug(&dc->in_flight);
-
- closure_wake_up(&dc->writeback_wait);
+ up(&dc->in_flight);
closure_return_with_destructor(cl, dirty_io_destructor);
}
@@ -349,7 +351,7 @@
closure_bio_submit(&io->bio, cl, &io->dc->disk);
- continue_at(cl, write_dirty_finish, dirty_wq);
+ continue_at(cl, write_dirty_finish, system_wq);
}
static void read_dirty_endio(struct bio *bio, int error)
@@ -369,7 +371,7 @@
closure_bio_submit(&io->bio, cl, &io->dc->disk);
- continue_at(cl, write_dirty, dirty_wq);
+ continue_at(cl, write_dirty, system_wq);
}
static void read_dirty(struct closure *cl)
@@ -394,12 +396,8 @@
if (delay > 0 &&
(KEY_START(&w->key) != dc->last_read ||
- jiffies_to_msecs(delay) > 50)) {
- w->private = NULL;
-
- closure_delay(&dc->writeback, delay);
- continue_at(cl, read_dirty, dirty_wq);
- }
+ jiffies_to_msecs(delay) > 50))
+ delay = schedule_timeout_uninterruptible(delay);
dc->last_read = KEY_OFFSET(&w->key);
@@ -424,15 +422,10 @@
trace_bcache_writeback(&w->key);
- closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl);
+ down(&dc->in_flight);
+ closure_call(&io->cl, read_dirty_submit, NULL, cl);
delay = writeback_delay(dc, KEY_SIZE(&w->key));
-
- atomic_inc(&dc->in_flight);
-
- if (!closure_wait_event(&dc->writeback_wait, cl,
- atomic_read(&dc->in_flight) < 64))
- continue_at(cl, read_dirty, dirty_wq);
}
if (0) {
@@ -442,7 +435,11 @@
bch_keybuf_del(&dc->writeback_keys, w);
}
- refill_dirty(cl);
+ /*
+ * Wait for outstanding writeback IOs to finish (and keybuf slots to be
+ * freed) before refilling again
+ */
+ continue_at(cl, refill_dirty, dirty_wq);
}
/* Init */
@@ -484,6 +481,7 @@
void bch_cached_dev_writeback_init(struct cached_dev *dc)
{
+ sema_init(&dc->in_flight, 64);
closure_init_unlocked(&dc->writeback);
init_rwsem(&dc->writeback_lock);
@@ -513,7 +511,7 @@
int __init bch_writeback_init(void)
{
- dirty_wq = create_singlethread_workqueue("bcache_writeback");
+ dirty_wq = create_workqueue("bcache_writeback");
if (!dirty_wq)
return -ENOMEM;
diff --git a/fs/bio.c b/fs/bio.c
index b3b20ed..ea5035d 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -917,8 +917,8 @@
src_p = kmap_atomic(src_bv->bv_page);
dst_p = kmap_atomic(dst_bv->bv_page);
- memcpy(dst_p + dst_bv->bv_offset,
- src_p + src_bv->bv_offset,
+ memcpy(dst_p + dst_offset,
+ src_p + src_offset,
bytes);
kunmap_atomic(dst_p);