Blame - drivers/md/bcache/journal.c - kernel/msm-4.19

blob: 7c9e6bf6aababae1c94abd3971f49af2b375c0c7 [file] [log] [blame]

Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	1	/*
				2	* bcache journalling code, for btree insertions
				3	*
				4	* Copyright 2012 Google, Inc.
				5	*/
				6
				7	#include "bcache.h"
				8	#include "btree.h"
				9	#include "debug.h"
				10	#include "request.h"
				11
Kent Overstreet	c37511b	2013-04-26 15:39:55 -0700	[diff] [blame]	12	#include <trace/events/bcache.h>
				13
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	14	/*
				15	* Journal replay/recovery:
				16	*
				17	* This code is all driven from run_cache_set(); we first read the journal
				18	* entries, do some other stuff, then we mark all the keys in the journal
				19	* entries (same as garbage collection would), then we replay them - reinserting
				20	* them into the cache in precisely the same order as they appear in the
				21	* journal.
				22	*
				23	* We only journal keys that go in leaf nodes, which simplifies things quite a
				24	* bit.
				25	*/
				26
				27	static void journal_read_endio(struct bio *bio, int error)
				28	{
				29	struct closure *cl = bio->bi_private;
				30	closure_put(cl);
				31	}
				32
				33	static int journal_read_bucket(struct cache ca, struct list_head list,
				34	struct btree_op *op, unsigned bucket_index)
				35	{
				36	struct journal_device *ja = &ca->journal;
				37	struct bio *bio = &ja->bio;
				38
				39	struct journal_replay *i;
				40	struct jset j, data = ca->set->journal.w[0].data;
				41	unsigned len, left, offset = 0;
				42	int ret = 0;
				43	sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]);
				44
				45	pr_debug("reading %llu", (uint64_t) bucket);
				46
				47	while (offset < ca->sb.bucket_size) {
				48	reread: left = ca->sb.bucket_size - offset;
				49	len = min_t(unsigned, left, PAGE_SECTORS * 8);
				50
				51	bio_reset(bio);
				52	bio->bi_sector = bucket + offset;
				53	bio->bi_bdev = ca->bdev;
				54	bio->bi_rw = READ;
				55	bio->bi_size = len << 9;
				56
				57	bio->bi_end_io = journal_read_endio;
				58	bio->bi_private = &op->cl;
Kent Overstreet	169ef1c	2013-03-28 12:50:55 -0600	[diff] [blame]	59	bch_bio_map(bio, data);
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	60
				61	closure_bio_submit(bio, &op->cl, ca);
				62	closure_sync(&op->cl);
				63
				64	/* This function could be simpler now since we no longer write
				65	* journal entries that overlap bucket boundaries; this means
				66	* the start of a bucket will always have a valid journal entry
				67	* if it has any journal entries at all.
				68	*/
				69
				70	j = data;
				71	while (len) {
				72	struct list_head *where;
				73	size_t blocks, bytes = set_bytes(j);
				74
				75	if (j->magic != jset_magic(ca->set))
				76	return ret;
				77
				78	if (bytes > left << 9)
				79	return ret;
				80
				81	if (bytes > len << 9)
				82	goto reread;
				83
				84	if (j->csum != csum_set(j))
				85	return ret;
				86
				87	blocks = set_blocks(j, ca->set);
				88
				89	while (!list_empty(list)) {
				90	i = list_first_entry(list,
				91	struct journal_replay, list);
				92	if (i->j.seq >= j->last_seq)
				93	break;
				94	list_del(&i->list);
				95	kfree(i);
				96	}
				97
				98	list_for_each_entry_reverse(i, list, list) {
				99	if (j->seq == i->j.seq)
				100	goto next_set;
				101
				102	if (j->seq < i->j.last_seq)
				103	goto next_set;
				104
				105	if (j->seq > i->j.seq) {
				106	where = &i->list;
				107	goto add;
				108	}
				109	}
				110
				111	where = list;
				112	add:
				113	i = kmalloc(offsetof(struct journal_replay, j) +
				114	bytes, GFP_KERNEL);
				115	if (!i)
				116	return -ENOMEM;
				117	memcpy(&i->j, j, bytes);
				118	list_add(&i->list, where);
				119	ret = 1;
				120
				121	ja->seq[bucket_index] = j->seq;
				122	next_set:
				123	offset += blocks * ca->sb.block_size;
				124	len -= blocks * ca->sb.block_size;
				125	j = ((void ) j) + blocks block_bytes(ca);
				126	}
				127	}
				128
				129	return ret;
				130	}
				131
				132	int bch_journal_read(struct cache_set c, struct list_head list,
				133	struct btree_op *op)
				134	{
				135	#define read_bucket(b) \
				136	({ \
				137	int ret = journal_read_bucket(ca, list, op, b); \
				138	__set_bit(b, bitmap); \
				139	if (ret < 0) \
				140	return ret; \
				141	ret; \
				142	})
				143
				144	struct cache *ca;
				145	unsigned iter;
				146
				147	for_each_cache(ca, c, iter) {
				148	struct journal_device *ja = &ca->journal;
				149	unsigned long bitmap[SB_JOURNAL_BUCKETS / BITS_PER_LONG];
				150	unsigned i, l, r, m;
				151	uint64_t seq;
				152
				153	bitmap_zero(bitmap, SB_JOURNAL_BUCKETS);
				154	pr_debug("%u journal buckets", ca->sb.njournal_buckets);
				155
Kent Overstreet	c426c4f	2013-09-23 23:17:29 -0700	[diff] [blame]	156	/*
				157	* Read journal buckets ordered by golden ratio hash to quickly
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	158	* find a sequence of buckets with valid journal entries
				159	*/
				160	for (i = 0; i < ca->sb.njournal_buckets; i++) {
				161	l = (i * 2654435769U) % ca->sb.njournal_buckets;
				162
				163	if (test_bit(l, bitmap))
				164	break;
				165
				166	if (read_bucket(l))
				167	goto bsearch;
				168	}
				169
Kent Overstreet	c426c4f	2013-09-23 23:17:29 -0700	[diff] [blame]	170	/*
				171	* If that fails, check all the buckets we haven't checked
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	172	* already
				173	*/
				174	pr_debug("falling back to linear search");
				175
Kent Overstreet	c426c4f	2013-09-23 23:17:29 -0700	[diff] [blame]	176	for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets);
				177	l < ca->sb.njournal_buckets;
				178	l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets, l + 1))
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	179	if (read_bucket(l))
				180	goto bsearch;
Kent Overstreet	c426c4f	2013-09-23 23:17:29 -0700	[diff] [blame]	181
				182	if (list_empty(list))
				183	continue;
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	184	bsearch:
				185	/* Binary search */
				186	m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
				187	pr_debug("starting binary search, l %u r %u", l, r);
				188
				189	while (l + 1 < r) {
Kent Overstreet	faa5673	2013-07-11 22:42:14 -0700	[diff] [blame]	190	seq = list_entry(list->prev, struct journal_replay,
				191	list)->j.seq;
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	192
Kent Overstreet	faa5673	2013-07-11 22:42:14 -0700	[diff] [blame]	193	m = (l + r) >> 1;
				194	read_bucket(m);
				195
				196	if (seq != list_entry(list->prev, struct journal_replay,
				197	list)->j.seq)
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	198	l = m;
				199	else
				200	r = m;
				201	}
				202
Kent Overstreet	c426c4f	2013-09-23 23:17:29 -0700	[diff] [blame]	203	/*
				204	* Read buckets in reverse order until we stop finding more
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	205	* journal entries
				206	*/
Kent Overstreet	c426c4f	2013-09-23 23:17:29 -0700	[diff] [blame]	207	pr_debug("finishing up: m %u njournal_buckets %u",
				208	m, ca->sb.njournal_buckets);
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	209	l = m;
				210
				211	while (1) {
				212	if (!l--)
				213	l = ca->sb.njournal_buckets - 1;
				214
				215	if (l == m)
				216	break;
				217
				218	if (test_bit(l, bitmap))
				219	continue;
				220
				221	if (!read_bucket(l))
				222	break;
				223	}
				224
				225	seq = 0;
				226
				227	for (i = 0; i < ca->sb.njournal_buckets; i++)
				228	if (ja->seq[i] > seq) {
				229	seq = ja->seq[i];
				230	ja->cur_idx = ja->discard_idx =
				231	ja->last_idx = i;
				232
				233	}
				234	}
				235
Kent Overstreet	c426c4f	2013-09-23 23:17:29 -0700	[diff] [blame]	236	if (!list_empty(list))
				237	c->journal.seq = list_entry(list->prev,
				238	struct journal_replay,
				239	list)->j.seq;
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	240
				241	return 0;
				242	#undef read_bucket
				243	}
				244
				245	void bch_journal_mark(struct cache_set c, struct list_head list)
				246	{
				247	atomic_t p = { 0 };
				248	struct bkey *k;
				249	struct journal_replay *i;
				250	struct journal *j = &c->journal;
				251	uint64_t last = j->seq;
				252
				253	/*
				254	* journal.pin should never fill up - we never write a journal
				255	* entry when it would fill up. But if for some reason it does, we
				256	* iterate over the list in reverse order so that we can just skip that
				257	* refcount instead of bugging.
				258	*/
				259
				260	list_for_each_entry_reverse(i, list, list) {
				261	BUG_ON(last < i->j.seq);
				262	i->pin = NULL;
				263
				264	while (last-- != i->j.seq)
				265	if (fifo_free(&j->pin) > 1) {
				266	fifo_push_front(&j->pin, p);
				267	atomic_set(&fifo_front(&j->pin), 0);
				268	}
				269
				270	if (fifo_free(&j->pin) > 1) {
				271	fifo_push_front(&j->pin, p);
				272	i->pin = &fifo_front(&j->pin);
				273	atomic_set(i->pin, 1);
				274	}
				275
				276	for (k = i->j.start;
				277	k < end(&i->j);
				278	k = bkey_next(k)) {
				279	unsigned j;
				280
				281	for (j = 0; j < KEY_PTRS(k); j++) {
				282	struct bucket *g = PTR_BUCKET(c, k, j);
				283	atomic_inc(&g->pin);
				284
				285	if (g->prio == BTREE_PRIO &&
				286	!ptr_stale(c, k, j))
				287	g->prio = INITIAL_PRIO;
				288	}
				289
				290	__bch_btree_mark_key(c, 0, k);
				291	}
				292	}
				293	}
				294
				295	int bch_journal_replay(struct cache_set s, struct list_head list,
				296	struct btree_op *op)
				297	{
				298	int ret = 0, keys = 0, entries = 0;
				299	struct bkey *k;
				300	struct journal_replay *i =
				301	list_entry(list->prev, struct journal_replay, list);
				302
				303	uint64_t start = i->j.last_seq, end = i->j.seq, n = start;
				304
				305	list_for_each_entry(i, list, list) {
				306	BUG_ON(i->pin && atomic_read(i->pin) != 1);
				307
				308	if (n != i->j.seq)
Kent Overstreet	b1a67b0	2013-03-25 11:46:44 -0700	[diff] [blame]	309	pr_err(
				310	"journal entries %llu-%llu missing! (replaying %llu-%llu)\n",
				311	n, i->j.seq - 1, start, end);
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	312
				313	for (k = i->j.start;
				314	k < end(&i->j);
				315	k = bkey_next(k)) {
Kent Overstreet	c37511b	2013-04-26 15:39:55 -0700	[diff] [blame]	316	trace_bcache_journal_replay_key(k);
				317
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	318	bkey_copy(op->keys.top, k);
				319	bch_keylist_push(&op->keys);
				320
				321	op->journal = i->pin;
				322	atomic_inc(op->journal);
				323
				324	ret = bch_btree_insert(op, s);
				325	if (ret)
				326	goto err;
				327
				328	BUG_ON(!bch_keylist_empty(&op->keys));
				329	keys++;
				330
				331	cond_resched();
				332	}
				333
				334	if (i->pin)
				335	atomic_dec(i->pin);
				336	n = i->j.seq + 1;
				337	entries++;
				338	}
				339
				340	pr_info("journal replay done, %i keys in %i entries, seq %llu",
				341	keys, entries, end);
				342
				343	while (!list_empty(list)) {
				344	i = list_first_entry(list, struct journal_replay, list);
				345	list_del(&i->list);
				346	kfree(i);
				347	}
				348	err:
				349	closure_sync(&op->cl);
				350	return ret;
				351	}
				352
				353	/* Journalling */
				354
				355	static void btree_flush_write(struct cache_set *c)
				356	{
				357	/*
				358	* Try to find the btree node with that references the oldest journal
				359	* entry, best is our current candidate and is locked if non NULL:
				360	*/
				361	struct btree b, best = NULL;
				362	unsigned iter;
				363
				364	for_each_cached_btree(b, c, iter) {
				365	if (!down_write_trylock(&b->lock))
				366	continue;
				367
				368	if (!btree_node_dirty(b) \|\|
				369	!btree_current_write(b)->journal) {
				370	rw_unlock(true, b);
				371	continue;
				372	}
				373
				374	if (!best)
				375	best = b;
				376	else if (journal_pin_cmp(c,
				377	btree_current_write(best),
				378	btree_current_write(b))) {
				379	rw_unlock(true, best);
				380	best = b;
				381	} else
				382	rw_unlock(true, b);
				383	}
				384
				385	if (best)
				386	goto out;
				387
				388	/* We can't find the best btree node, just pick the first */
				389	list_for_each_entry(b, &c->btree_cache, list)
				390	if (!b->level && btree_node_dirty(b)) {
				391	best = b;
				392	rw_lock(true, best, best->level);
				393	goto found;
				394	}
				395
				396	out:
				397	if (!best)
				398	return;
				399	found:
				400	if (btree_node_dirty(best))
Kent Overstreet	5794351	2013-04-25 13:58:35 -0700	[diff] [blame]	401	bch_btree_node_write(best, NULL);
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	402	rw_unlock(true, best);
				403	}
				404
				/*
				 * Evaluates to j->seq minus the number of used slots in j->pin, plus one.
				 * Presumably the sequence number of the oldest journal entry that still
				 * has a pin slot - bch_journal_next() pushes one pin each time seq is
				 * incremented. Note that the argument j is evaluated twice.
				 */
				405	#define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1)
				406
				407	static void journal_discard_endio(struct bio *bio, int error)
				408	{
				409	struct journal_device *ja =
				410	container_of(bio, struct journal_device, discard_bio);
				411	struct cache *ca = container_of(ja, struct cache, journal);
				412
				413	atomic_set(&ja->discard_in_flight, DISCARD_DONE);
				414
				415	closure_wake_up(&ca->set->journal.wait);
				416	closure_put(&ca->set->cl);
				417	}
				418
				419	static void journal_discard_work(struct work_struct *work)
				420	{
				421	struct journal_device *ja =
				422	container_of(work, struct journal_device, discard_work);
				423
				424	submit_bio(0, &ja->discard_bio);
				425	}
				426
				427	static void do_journal_discard(struct cache *ca)
				428	{
				429	struct journal_device *ja = &ca->journal;
				430	struct bio *bio = &ja->discard_bio;
				431
				432	if (!ca->discard) {
				433	ja->discard_idx = ja->last_idx;
				434	return;
				435	}
				436
Kent Overstreet	6d9d21e	2013-09-23 23:17:27 -0700	[diff] [blame]	437	switch (atomic_read(&ja->discard_in_flight)) {
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	438	case DISCARD_IN_FLIGHT:
				439	return;
				440
				441	case DISCARD_DONE:
				442	ja->discard_idx = (ja->discard_idx + 1) %
				443	ca->sb.njournal_buckets;
				444
				445	atomic_set(&ja->discard_in_flight, DISCARD_READY);
				446	/* fallthrough */
				447
				448	case DISCARD_READY:
				449	if (ja->discard_idx == ja->last_idx)
				450	return;
				451
				452	atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT);
				453
				454	bio_init(bio);
				455	bio->bi_sector = bucket_to_sector(ca->set,
Kent Overstreet	b1a67b0	2013-03-25 11:46:44 -0700	[diff] [blame]	456	ca->sb.d[ja->discard_idx]);
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	457	bio->bi_bdev = ca->bdev;
				458	bio->bi_rw = REQ_WRITE\|REQ_DISCARD;
				459	bio->bi_max_vecs = 1;
				460	bio->bi_io_vec = bio->bi_inline_vecs;
				461	bio->bi_size = bucket_bytes(ca);
				462	bio->bi_end_io = journal_discard_endio;
				463
				464	closure_get(&ca->set->cl);
				465	INIT_WORK(&ja->discard_work, journal_discard_work);
				466	schedule_work(&ja->discard_work);
				467	}
				468	}
				469
				470	static void journal_reclaim(struct cache_set *c)
				471	{
				472	struct bkey *k = &c->journal.key;
				473	struct cache *ca;
				474	uint64_t last_seq;
				475	unsigned iter, n = 0;
				476	atomic_t p;
				477
				478	while (!atomic_read(&fifo_front(&c->journal.pin)))
				479	fifo_pop(&c->journal.pin, p);
				480
				481	last_seq = last_seq(&c->journal);
				482
				483	/* Update last_idx */
				484
				485	for_each_cache(ca, c, iter) {
				486	struct journal_device *ja = &ca->journal;
				487
				488	while (ja->last_idx != ja->cur_idx &&
				489	ja->seq[ja->last_idx] < last_seq)
				490	ja->last_idx = (ja->last_idx + 1) %
				491	ca->sb.njournal_buckets;
				492	}
				493
				494	for_each_cache(ca, c, iter)
				495	do_journal_discard(ca);
				496
				497	if (c->journal.blocks_free)
				498	return;
				499
				500	/*
				501	* Allocate:
				502	* XXX: Sort by free journal space
				503	*/
				504
				505	for_each_cache(ca, c, iter) {
				506	struct journal_device *ja = &ca->journal;
				507	unsigned next = (ja->cur_idx + 1) % ca->sb.njournal_buckets;
				508
				509	/* No space available on this device */
				510	if (next == ja->discard_idx)
				511	continue;
				512
				513	ja->cur_idx = next;
				514	k->ptr[n++] = PTR(0,
				515	bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
				516	ca->sb.nr_this_dev);
				517	}
				518
				519	bkey_init(k);
				520	SET_KEY_PTRS(k, n);
				521
				522	if (n)
				523	c->journal.blocks_free = c->sb.bucket_size >> c->block_bits;
				524
				525	if (!journal_full(&c->journal))
				526	__closure_wake_up(&c->journal.wait);
				527	}
				528
				529	void bch_journal_next(struct journal *j)
				530	{
				531	atomic_t p = { 1 };
				532
				533	j->cur = (j->cur == j->w)
				534	? &j->w[1]
				535	: &j->w[0];
				536
				537	/*
				538	* The fifo_push() needs to happen at the same time as j->seq is
				539	* incremented for last_seq() to be calculated correctly
				540	*/
				541	BUG_ON(!fifo_push(&j->pin, p));
				542	atomic_set(&fifo_back(&j->pin), 1);
				543
				544	j->cur->data->seq = ++j->seq;
				545	j->cur->need_write = false;
				546	j->cur->data->keys = 0;
				547
				548	if (fifo_full(&j->pin))
				549	pr_debug("journal_pin full (%zu)", fifo_used(&j->pin));
				550	}
				551
				552	static void journal_write_endio(struct bio *bio, int error)
				553	{
				554	struct journal_write *w = bio->bi_private;
				555
				556	cache_set_err_on(error, w->c, "journal io error");
Kent Overstreet	7857d5d4	2013-10-08 15:50:46 -0700	[diff] [blame^]	557	closure_put(&w->c->journal.io);
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	558	}
				559
				560	static void journal_write(struct closure *);
				561
				562	static void journal_write_done(struct closure *cl)
				563	{
Kent Overstreet	7857d5d4	2013-10-08 15:50:46 -0700	[diff] [blame^]	564	struct journal *j = container_of(cl, struct journal, io);
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	565	struct journal_write *w = (j->cur == j->w)
				566	? &j->w[1]
				567	: &j->w[0];
				568
				569	__closure_wake_up(&w->wait);
Kent Overstreet	7857d5d4	2013-10-08 15:50:46 -0700	[diff] [blame^]	570	continue_at_nobarrier(cl, journal_write, system_wq);
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	571	}
				572
				573	static void journal_write_unlocked(struct closure *cl)
Kent Overstreet	c19ed23	2013-03-26 13:49:02 -0700	[diff] [blame]	574	__releases(c->journal.lock)
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	575	{
Kent Overstreet	7857d5d4	2013-10-08 15:50:46 -0700	[diff] [blame^]	576	struct cache_set *c = container_of(cl, struct cache_set, journal.io);
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	577	struct cache *ca;
				578	struct journal_write *w = c->journal.cur;
				579	struct bkey *k = &c->journal.key;
				580	unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size;
				581
				582	struct bio *bio;
				583	struct bio_list list;
				584	bio_list_init(&list);
				585
				586	if (!w->need_write) {
				587	/*
				588	* XXX: have to unlock closure before we unlock journal lock,
				589	* else we race with bch_journal(). But this way we race
				590	* against cache set unregister. Doh.
				591	*/
				592	set_closure_fn(cl, NULL, NULL);
				593	closure_sub(cl, CLOSURE_RUNNING + 1);
				594	spin_unlock(&c->journal.lock);
				595	return;
				596	} else if (journal_full(&c->journal)) {
				597	journal_reclaim(c);
				598	spin_unlock(&c->journal.lock);
				599
				600	btree_flush_write(c);
				601	continue_at(cl, journal_write, system_wq);
				602	}
				603
				604	c->journal.blocks_free -= set_blocks(w->data, c);
				605
				606	w->data->btree_level = c->root->level;
				607
				608	bkey_copy(&w->data->btree_root, &c->root->key);
				609	bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket);
				610
				611	for_each_cache(ca, c, i)
				612	w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];
				613
				614	w->data->magic = jset_magic(c);
				615	w->data->version = BCACHE_JSET_VERSION;
				616	w->data->last_seq = last_seq(&c->journal);
				617	w->data->csum = csum_set(w->data);
				618
				619	for (i = 0; i < KEY_PTRS(k); i++) {
				620	ca = PTR_CACHE(c, k, i);
				621	bio = &ca->journal.bio;
				622
				623	atomic_long_add(sectors, &ca->meta_sectors_written);
				624
				625	bio_reset(bio);
				626	bio->bi_sector = PTR_OFFSET(k, i);
				627	bio->bi_bdev = ca->bdev;
Kent Overstreet	e49c7c3	2013-06-26 17:25:38 -0700	[diff] [blame]	628	bio->bi_rw = REQ_WRITE\|REQ_SYNC\|REQ_META\|REQ_FLUSH\|REQ_FUA;
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	629	bio->bi_size = sectors << 9;
				630
				631	bio->bi_end_io = journal_write_endio;
				632	bio->bi_private = w;
Kent Overstreet	169ef1c	2013-03-28 12:50:55 -0600	[diff] [blame]	633	bch_bio_map(bio, w->data);
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	634
				635	trace_bcache_journal_write(bio);
				636	bio_list_add(&list, bio);
				637
				638	SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors);
				639
				640	ca->journal.seq[ca->journal.cur_idx] = w->data->seq;
				641	}
				642
				643	atomic_dec_bug(&fifo_back(&c->journal.pin));
				644	bch_journal_next(&c->journal);
				645	journal_reclaim(c);
				646
				647	spin_unlock(&c->journal.lock);
				648
				649	while ((bio = bio_list_pop(&list)))
				650	closure_bio_submit(bio, cl, c->cache[0]);
				651
				652	continue_at(cl, journal_write_done, NULL);
				653	}
				654
				655	static void journal_write(struct closure *cl)
				656	{
Kent Overstreet	7857d5d4	2013-10-08 15:50:46 -0700	[diff] [blame^]	657	struct cache_set *c = container_of(cl, struct cache_set, journal.io);
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	658
				659	spin_lock(&c->journal.lock);
				660	journal_write_unlocked(cl);
				661	}
				662
				663	static void __journal_try_write(struct cache_set *c, bool noflush)
Kent Overstreet	c19ed23	2013-03-26 13:49:02 -0700	[diff] [blame]	664	__releases(c->journal.lock)
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	665	{
Kent Overstreet	7857d5d4	2013-10-08 15:50:46 -0700	[diff] [blame^]	666	struct closure *cl = &c->journal.io;
				667	struct journal_write *w = c->journal.cur;
				668
				669	w->need_write = true;
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	670
				671	if (!closure_trylock(cl, &c->cl))
				672	spin_unlock(&c->journal.lock);
				673	else if (noflush && journal_full(&c->journal)) {
				674	spin_unlock(&c->journal.lock);
				675	continue_at(cl, journal_write, system_wq);
				676	} else
				677	journal_write_unlocked(cl);
				678	}
				679
				680	#define journal_try_write(c) __journal_try_write(c, false)
				681
				682	void bch_journal_meta(struct cache_set c, struct closure cl)
				683	{
				684	struct journal_write *w;
				685
				686	if (CACHE_SYNC(&c->sb)) {
				687	spin_lock(&c->journal.lock);
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	688	w = c->journal.cur;
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	689
				690	if (cl)
				691	BUG_ON(!closure_wait(&w->wait, cl));
				692
				693	__journal_try_write(c, true);
				694	}
				695	}
				696
Kent Overstreet	7857d5d4	2013-10-08 15:50:46 -0700	[diff] [blame^]	697	static void journal_write_work(struct work_struct *work)
				698	{
				699	struct cache_set *c = container_of(to_delayed_work(work),
				700	struct cache_set,
				701	journal.work);
				702	spin_lock(&c->journal.lock);
				703	journal_try_write(c);
				704	}
				705
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	706	/*
				707	* Entry point to the journalling code - bio_insert() and btree_invalidate()
				708	* pass bch_journal() a list of keys to be journalled, and then
				709	* bch_journal() hands those same keys off to btree_insert_async()
				710	*/
				711
				712	void bch_journal(struct closure *cl)
				713	{
				714	struct btree_op *op = container_of(cl, struct btree_op, cl);
				715	struct cache_set *c = op->c;
				716	struct journal_write *w;
				717	size_t b, n = ((uint64_t *) op->keys.top) - op->keys.list;
				718
				719	if (op->type != BTREE_INSERT \|\|
				720	!CACHE_SYNC(&c->sb))
				721	goto out;
				722
				723	/*
				724	* If we're looping because we errored, might already be waiting on
				725	* another journal write:
				726	*/
				727	while (atomic_read(&cl->parent->remaining) & CLOSURE_WAITING)
				728	closure_sync(cl->parent);
				729
				730	spin_lock(&c->journal.lock);
				731
				732	if (journal_full(&c->journal)) {
Kent Overstreet	c37511b	2013-04-26 15:39:55 -0700	[diff] [blame]	733	trace_bcache_journal_full(c);
				734
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	735	closure_wait(&c->journal.wait, cl);
				736
				737	journal_reclaim(c);
				738	spin_unlock(&c->journal.lock);
				739
				740	btree_flush_write(c);
				741	continue_at(cl, bch_journal, bcache_wq);
				742	}
				743
				744	w = c->journal.cur;
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	745	b = __set_blocks(w->data, w->data->keys + n, c);
				746
				747	if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS \|\|
				748	b > c->journal.blocks_free) {
Kent Overstreet	c37511b	2013-04-26 15:39:55 -0700	[diff] [blame]	749	trace_bcache_journal_entry_full(c);
				750
				751	/*
				752	* XXX: If we were inserting so many keys that they won't fit in
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	753	* an _empty_ journal write, we'll deadlock. For now, handle
				754	* this in bch_keylist_realloc() - but something to think about.
				755	*/
				756	BUG_ON(!w->data->keys);
				757
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	758	BUG_ON(!closure_wait(&w->wait, cl));
				759
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	760	journal_try_write(c);
				761	continue_at(cl, bch_journal, bcache_wq);
				762	}
				763
				764	memcpy(end(w->data), op->keys.list, n * sizeof(uint64_t));
				765	w->data->keys += n;
				766
				767	op->journal = &fifo_back(&c->journal.pin);
				768	atomic_inc(op->journal);
				769
				770	if (op->flush_journal) {
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	771	closure_wait(&w->wait, cl->parent);
Kent Overstreet	7857d5d4	2013-10-08 15:50:46 -0700	[diff] [blame^]	772	journal_try_write(c);
				773	} else if (!w->need_write) {
				774	schedule_delayed_work(&c->journal.work,
				775	msecs_to_jiffies(c->journal_delay_ms));
				776	spin_unlock(&c->journal.lock);
				777	} else {
				778	spin_unlock(&c->journal.lock);
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	779	}
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	780	out:
				781	bch_btree_insert_async(cl);
				782	}
				783
				784	void bch_journal_free(struct cache_set *c)
				785	{
				786	free_pages((unsigned long) c->journal.w[1].data, JSET_BITS);
				787	free_pages((unsigned long) c->journal.w[0].data, JSET_BITS);
				788	free_fifo(&c->journal.pin);
				789	}
				790
				791	int bch_journal_alloc(struct cache_set *c)
				792	{
				793	struct journal *j = &c->journal;
				794
				795	closure_init_unlocked(&j->io);
				796	spin_lock_init(&j->lock);
Kent Overstreet	7857d5d4	2013-10-08 15:50:46 -0700	[diff] [blame^]	797	INIT_DELAYED_WORK(&j->work, journal_write_work);
Kent Overstreet	cafe563	2013-03-23 16:11:31 -0700	[diff] [blame]	798
				799	c->journal_delay_ms = 100;
				800
				801	j->w[0].c = c;
				802	j->w[1].c = c;
				803
				804	if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) \|\|
				805	!(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) \|\|
				806	!(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
				807	return -ENOMEM;
				808
				809	return 0;
				810	}