Blame - drivers/md/dm-cache-target.c - kernel/msm-4.9

blob: 76cc910557f049b59a0b81fc384167cf82af6669 [file] [log] [blame]

Joe Thornber	c6b4fcb	2013-03-01 22:45:51 +0000	[diff] [blame]	1	/*
				2	* Copyright (C) 2012 Red Hat. All rights reserved.
				3	*
				4	* This file is released under the GPL.
				5	*/
				6
				7	#include "dm.h"
				8	#include "dm-bio-prison.h"
				9	#include "dm-cache-metadata.h"
				10
				11	#include <linux/dm-io.h>
				12	#include <linux/dm-kcopyd.h>
				13	#include <linux/init.h>
				14	#include <linux/mempool.h>
				15	#include <linux/module.h>
				16	#include <linux/slab.h>
				17	#include <linux/vmalloc.h>
				18
				19	#define DM_MSG_PREFIX "cache"
				20
				21	DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
				22	"A percentage of time allocated for copying to and/or from cache");
				23
				24	/*----------------------------------------------------------------*/
				25
				26	/*
				27	* Glossary:
				28	*
				29	* oblock: index of an origin block
				30	* cblock: index of a cache block
				31	* promotion: movement of a block from origin to cache
				32	* demotion: movement of a block from cache to origin
				33	* migration: movement of a block between the origin and cache device,
				34	* either direction
				35	*/
				36
				37	/*----------------------------------------------------------------*/
				38
				39	static size_t bitset_size_in_bytes(unsigned nr_entries)
				40	{
				41	return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
				42	}
				43
				44	static unsigned long *alloc_bitset(unsigned nr_entries)
				45	{
				46	size_t s = bitset_size_in_bytes(nr_entries);
				47	return vzalloc(s);
				48	}
				49
				50	static void clear_bitset(void *bitset, unsigned nr_entries)
				51	{
				52	size_t s = bitset_size_in_bytes(nr_entries);
				53	memset(bitset, 0, s);
				54	}
				55
				56	static void free_bitset(unsigned long *bits)
				57	{
				58	vfree(bits);
				59	}
				60
				61	/*----------------------------------------------------------------*/
				62
				63	#define PRISON_CELLS 1024
					/* NOTE(review): the users of these four tunables are not in the visible part of the file — confirm their units before changing them. */
				64	#define MIGRATION_POOL_SIZE 128
				65	#define COMMIT_PERIOD HZ
				66	#define MIGRATION_COUNT_WINDOW 10
				67
				68	/*
				69	* The block size of the device holding cache data must be >= 32KB
				70	*/
				71	#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
				72
				73	/*
				74	* FIXME: the cache is read/write for the time being.
				75	*/
				76	enum cache_mode {
					/* Whether the metadata may be modified; stored in struct cache_features.mode. */
				77	CM_WRITE, /* metadata may be changed */
				78	CM_READ_ONLY, /* metadata may not be changed */
				79	};
				80
				81	struct cache_features {
					/* Feature settings of a cache; embedded in struct cache as 'features'. */
				82	enum cache_mode mode;
				83	bool write_through:1; /* consulted by is_writethrough_io() for writes to clean blocks */
				84	};
				85
				86	struct cache_stats {
					/*
					 * Event counters. The four hit/miss counters are the ones that
					 * load_stats()/save_stats() exchange with the metadata.
					 */
				87	atomic_t read_hit;
				88	atomic_t read_miss;
				89	atomic_t write_hit;
				90	atomic_t write_miss;
				91	atomic_t demotion;
				92	atomic_t promotion;
				93	atomic_t copies_avoided; /* incremented by avoid_copy() */
				94	atomic_t cache_cell_clash;
				95	atomic_t commit_count;
				96	atomic_t discard_count; /* incremented by set_discard() */
				97	};
				98
				99	struct cache {
					/*
					 * State of one cache target: the three underlying devices,
					 * block geometry, deferred bios and migration lists, the
					 * dirty/discard bitsets and the statistics counters.
					 */
				100	struct dm_target *ti;
				101	struct dm_target_callbacks callbacks;
				102
				103	/*
				104	* Metadata is written to this device.
				105	*/
				106	struct dm_dev *metadata_dev;
				107
				108	/*
				109	* The slower of the two data devices. Typically a spindle.
				110	*/
				111	struct dm_dev *origin_dev;
				112
				113	/*
				114	* The faster of the two data devices. Typically an SSD.
				115	*/
				116	struct dm_dev *cache_dev;
				117
				118	/*
				119	* Cache features such as write-through.
				120	*/
				121	struct cache_features features;
				122
				123	/*
				124	* Size of the origin device in _complete_ blocks and native sectors.
				125	*/
				126	dm_oblock_t origin_blocks;
				127	sector_t origin_sectors;
				128
				129	/*
				130	* Size of the cache device in blocks.
				131	*/
				132	dm_cblock_t cache_size;
				133
				134	/*
				135	* Fields for converting from sectors to blocks.
				136	*/
				137	uint32_t sectors_per_block;
				138	int sectors_per_block_shift; /* negative when the block size is not a power of two, see block_size_is_power_of_two() */
				139
				140	struct dm_cache_metadata *cmd;
				141
				142	spinlock_t lock; /* taken around the bio lists, the migration lists, discard_bitset, need_tick_bio and commit_requested */
				143	struct bio_list deferred_bios;
				144	struct bio_list deferred_flush_bios; /* FLUSH/FUA bios batched by issue() */
				145	struct list_head quiesced_migrations;
				146	struct list_head completed_migrations; /* filled by copy_complete() */
				147	struct list_head need_commit_migrations; /* filled by migration_success_pre_commit() */
				148	sector_t migration_threshold; /* compared against in-flight volume in spare_migration_bandwidth() */
				149	atomic_t nr_migrations; /* see inc_nr_migrations()/dec_nr_migrations() */
				150	wait_queue_head_t migration_wait; /* woken by dec_nr_migrations() */
				151
				152	/*
				153	* cache_size entries, dirty if set
				154	*/
				155	dm_cblock_t nr_dirty;
				156	unsigned long *dirty_bitset;
				157
				158	/*
				159	* origin_blocks entries, discarded if set.
				160	*/
Joe Thornber	414dd67	2013-03-20 17:21:25 +0000	[diff] [blame]	161	uint32_t discard_block_size; /* a power of 2 times sectors per block */
Joe Thornber	c6b4fcb	2013-03-01 22:45:51 +0000	[diff] [blame]	162	dm_dblock_t discard_nr_blocks;
				163	unsigned long *discard_bitset;
				164
				165	struct dm_kcopyd_client *copier; /* used by issue_copy_real() for block copies */
				166	struct workqueue_struct *wq;
				167	struct work_struct worker; /* queued on wq by wake_worker() */
				168
				169	struct delayed_work waker;
				170	unsigned long last_commit_jiffies;
				171
				172	struct dm_bio_prison *prison;
				173	struct dm_deferred_set *all_io_ds;
				174
				175	mempool_t *migration_pool; /* source of struct dm_cache_migration objects */
				176	struct dm_cache_migration *next_migration;
				177
				178	struct dm_cache_policy *policy;
				179	unsigned policy_nr_args;
				180
				181	bool need_tick_bio:1;
				182	bool sized:1;
				183	bool quiescing:1;
				184	bool commit_requested:1;
				185	bool loaded_mappings:1;
				186	bool loaded_discards:1;
				187
				188	struct cache_stats stats;
				189
				190	/*
				191	* Rather than reconstructing the table line for the status we just
				192	* save it and regurgitate.
				193	*/
				194	unsigned nr_ctr_args;
				195	const char **ctr_args;
				196	};
				197
				198	struct per_bio_data {
					/* Per-bio state, looked up with get_per_bio_data() and reset by init_per_bio_data(). */
				199	bool tick:1; /* set by check_if_tick_bio_needed() */
				200	unsigned req_nr:2; /* from dm_bio_get_target_bio_nr(); process_flush_bio() sends 0 to the origin, others to the cache */
				201	struct dm_deferred_entry *all_io_entry; /* dropped in check_for_quiesced_migrations() */
				202	};
				203
				204	struct dm_cache_migration {
					/*
					 * One in-flight migration (promotion, demotion, demote-then-promote
					 * or writeback), set up by promote()/writeback()/demote_then_promote().
					 */
				205	struct list_head list; /* links the migration on one of the cache's migration lists */
				206	struct cache *cache;
				207
				208	unsigned long start_jiffies;
				209	dm_oblock_t old_oblock; /* origin block copied back to on writeback/demote */
				210	dm_oblock_t new_oblock; /* origin block copied from on promote */
				211	dm_cblock_t cblock;
				212
				213	bool err:1; /* set by copy_complete() when the copy reported an error */
				214	bool writeback:1;
				215	bool demote:1;
				216	bool promote:1;
				217
				218	struct dm_bio_prison_cell *old_ocell;
				219	struct dm_bio_prison_cell *new_ocell;
				220	};
				221
				222	/*
				223	* Processing a bio in the worker thread may require these memory
				224	* allocations. We prealloc to avoid deadlocks (the same worker thread
				225	* frees them back to the mempool).
				226	*/
				227	struct prealloc {
				228	struct dm_cache_migration *mg; /* handed out by prealloc_get_migration() */
				229	struct dm_bio_prison_cell *cell1; /* cell1/cell2 are handed out by prealloc_get_cell() */
				230	struct dm_bio_prison_cell *cell2;
				231	};
				232
				233	static void wake_worker(struct cache *cache)
				234	{
				235	queue_work(cache->wq, &cache->worker);
				236	}
				237
				238	/*----------------------------------------------------------------*/
				239
				240	static struct dm_bio_prison_cell alloc_prison_cell(struct cache cache)
				241	{
				242	/* FIXME: change to use a local slab. */
				243	return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
				244	}
				245
				246	static void free_prison_cell(struct cache cache, struct dm_bio_prison_cell cell)
				247	{
				248	dm_bio_prison_free_cell(cache->prison, cell);
				249	}
				250
				251	static int prealloc_data_structs(struct cache cache, struct prealloc p)
				252	{
				253	if (!p->mg) {
				254	p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
				255	if (!p->mg)
				256	return -ENOMEM;
				257	}
				258
				259	if (!p->cell1) {
				260	p->cell1 = alloc_prison_cell(cache);
				261	if (!p->cell1)
				262	return -ENOMEM;
				263	}
				264
				265	if (!p->cell2) {
				266	p->cell2 = alloc_prison_cell(cache);
				267	if (!p->cell2)
				268	return -ENOMEM;
				269	}
				270
				271	return 0;
				272	}
				273
				274	static void prealloc_free_structs(struct cache cache, struct prealloc p)
				275	{
				276	if (p->cell2)
				277	free_prison_cell(cache, p->cell2);
				278
				279	if (p->cell1)
				280	free_prison_cell(cache, p->cell1);
				281
				282	if (p->mg)
				283	mempool_free(p->mg, cache->migration_pool);
				284	}
				285
				286	static struct dm_cache_migration prealloc_get_migration(struct prealloc p)
				287	{
				288	struct dm_cache_migration *mg = p->mg;
				289
				290	BUG_ON(!mg);
				291	p->mg = NULL;
				292
				293	return mg;
				294	}
				295
				296	/*
				297	* You must have a cell within the prealloc struct to return. If not this
				298	* function will BUG() rather than returning NULL.
				299	*/
				300	static struct dm_bio_prison_cell prealloc_get_cell(struct prealloc p)
				301	{
				302	struct dm_bio_prison_cell *r = NULL;
				303
				304	if (p->cell1) {
				305	r = p->cell1;
				306	p->cell1 = NULL;
				307
				308	} else if (p->cell2) {
				309	r = p->cell2;
				310	p->cell2 = NULL;
				311	} else
				312	BUG();
				313
				314	return r;
				315	}
				316
				317	/*
				318	* You can't have more than two cells in a prealloc struct. BUG() will be
				319	* called if you try and overfill.
				320	*/
				321	static void prealloc_put_cell(struct prealloc p, struct dm_bio_prison_cell cell)
				322	{
				323	if (!p->cell2)
				324	p->cell2 = cell;
				325
				326	else if (!p->cell1)
				327	p->cell1 = cell;
				328
				329	else
				330	BUG();
				331	}
				332
				333	/*----------------------------------------------------------------*/
				334
				335	static void build_key(dm_oblock_t oblock, struct dm_cell_key *key)
				336	{
				337	key->virtual = 0;
				338	key->dev = 0;
				339	key->block = from_oblock(oblock);
				340	}
				341
				342	/*
				343	* The caller hands in a preallocated cell, and a free function for it.
				344	* The cell will be freed if there's an error, or if it wasn't used because
				345	* a cell with that key already exists.
				346	*/
				347	typedef void (cell_free_fn)(void context, struct dm_bio_prison_cell *cell);
				348
				349	static int bio_detain(struct cache *cache, dm_oblock_t oblock,
				350	struct bio bio, struct dm_bio_prison_cell cell_prealloc,
				351	cell_free_fn free_fn, void *free_context,
				352	struct dm_bio_prison_cell **cell_result)
				353	{
				354	int r;
				355	struct dm_cell_key key;
				356
				357	build_key(oblock, &key);
				358	r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
				359	if (r)
				360	free_fn(free_context, cell_prealloc);
				361
				362	return r;
				363	}
				364
				365	static int get_cell(struct cache *cache,
				366	dm_oblock_t oblock,
				367	struct prealloc *structs,
				368	struct dm_bio_prison_cell **cell_result)
				369	{
				370	int r;
				371	struct dm_cell_key key;
				372	struct dm_bio_prison_cell *cell_prealloc;
				373
				374	cell_prealloc = prealloc_get_cell(structs);
				375
				376	build_key(oblock, &key);
				377	r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
				378	if (r)
				379	prealloc_put_cell(structs, cell_prealloc);
				380
				381	return r;
				382	}
				383
				384	/*----------------------------------------------------------------*/
				385
				386	static bool is_dirty(struct cache *cache, dm_cblock_t b)
				387	{
				388	return test_bit(from_cblock(b), cache->dirty_bitset);
				389	}
				390
				391	static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
				392	{
				393	if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
				394	cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) + 1);
				395	policy_set_dirty(cache->policy, oblock);
				396	}
				397	}
				398
				399	static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
				400	{
				401	if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
				402	policy_clear_dirty(cache->policy, oblock);
				403	cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) - 1);
				404	if (!from_cblock(cache->nr_dirty))
				405	dm_table_event(cache->ti->table);
				406	}
				407	}
				408
				409	/*----------------------------------------------------------------*/
				410	static bool block_size_is_power_of_two(struct cache *cache)
				411	{
				412	return cache->sectors_per_block_shift >= 0;
				413	}
				414
Joe Thornber	414dd67	2013-03-20 17:21:25 +0000	[diff] [blame]	415	static dm_block_t block_div(dm_block_t b, uint32_t n)
				416	{
				417	do_div(b, n);
				418
				419	return b;
				420	}
				421
Joe Thornber	c6b4fcb	2013-03-01 22:45:51 +0000	[diff] [blame]	422	static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
				423	{
Joe Thornber	414dd67	2013-03-20 17:21:25 +0000	[diff] [blame]	424	uint32_t discard_blocks = cache->discard_block_size;
Joe Thornber	c6b4fcb	2013-03-01 22:45:51 +0000	[diff] [blame]	425	dm_block_t b = from_oblock(oblock);
				426
				427	if (!block_size_is_power_of_two(cache))
Joe Thornber	414dd67	2013-03-20 17:21:25 +0000	[diff] [blame]	428	discard_blocks = discard_blocks / cache->sectors_per_block;
Joe Thornber	c6b4fcb	2013-03-01 22:45:51 +0000	[diff] [blame]	429	else
				430	discard_blocks >>= cache->sectors_per_block_shift;
				431
Joe Thornber	414dd67	2013-03-20 17:21:25 +0000	[diff] [blame]	432	b = block_div(b, discard_blocks);
Joe Thornber	c6b4fcb	2013-03-01 22:45:51 +0000	[diff] [blame]	433
				434	return to_dblock(b);
				435	}
				436
				437	static void set_discard(struct cache *cache, dm_dblock_t b)
				438	{
				439	unsigned long flags;
				440
				441	atomic_inc(&cache->stats.discard_count);
				442
				443	spin_lock_irqsave(&cache->lock, flags);
				444	set_bit(from_dblock(b), cache->discard_bitset);
				445	spin_unlock_irqrestore(&cache->lock, flags);
				446	}
				447
				448	static void clear_discard(struct cache *cache, dm_dblock_t b)
				449	{
				450	unsigned long flags;
				451
				452	spin_lock_irqsave(&cache->lock, flags);
				453	clear_bit(from_dblock(b), cache->discard_bitset);
				454	spin_unlock_irqrestore(&cache->lock, flags);
				455	}
				456
				457	static bool is_discarded(struct cache *cache, dm_dblock_t b)
				458	{
				459	int r;
				460	unsigned long flags;
				461
				462	spin_lock_irqsave(&cache->lock, flags);
				463	r = test_bit(from_dblock(b), cache->discard_bitset);
				464	spin_unlock_irqrestore(&cache->lock, flags);
				465
				466	return r;
				467	}
				468
				469	static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
				470	{
				471	int r;
				472	unsigned long flags;
				473
				474	spin_lock_irqsave(&cache->lock, flags);
				475	r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
				476	cache->discard_bitset);
				477	spin_unlock_irqrestore(&cache->lock, flags);
				478
				479	return r;
				480	}
				481
				482	/*----------------------------------------------------------------*/
				483
				484	static void load_stats(struct cache *cache)
				485	{
				486	struct dm_cache_statistics stats;
				487
				488	dm_cache_metadata_get_stats(cache->cmd, &stats);
				489	atomic_set(&cache->stats.read_hit, stats.read_hits);
				490	atomic_set(&cache->stats.read_miss, stats.read_misses);
				491	atomic_set(&cache->stats.write_hit, stats.write_hits);
				492	atomic_set(&cache->stats.write_miss, stats.write_misses);
				493	}
				494
				495	static void save_stats(struct cache *cache)
				496	{
				497	struct dm_cache_statistics stats;
				498
				499	stats.read_hits = atomic_read(&cache->stats.read_hit);
				500	stats.read_misses = atomic_read(&cache->stats.read_miss);
				501	stats.write_hits = atomic_read(&cache->stats.write_hit);
				502	stats.write_misses = atomic_read(&cache->stats.write_miss);
				503
				504	dm_cache_metadata_set_stats(cache->cmd, &stats);
				505	}
				506
				507	/*----------------------------------------------------------------
				508	* Per bio data
				509	*--------------------------------------------------------------*/
				510	static struct per_bio_data get_per_bio_data(struct bio bio)
				511	{
				512	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
				513	BUG_ON(!pb);
				514	return pb;
				515	}
				516
				517	static struct per_bio_data init_per_bio_data(struct bio bio)
				518	{
				519	struct per_bio_data *pb = get_per_bio_data(bio);
				520
				521	pb->tick = false;
				522	pb->req_nr = dm_bio_get_target_bio_nr(bio);
				523	pb->all_io_entry = NULL;
				524
				525	return pb;
				526	}
				527
				528	/*----------------------------------------------------------------
				529	* Remapping
				530	*--------------------------------------------------------------*/
				531	static void remap_to_origin(struct cache cache, struct bio bio)
				532	{
				533	bio->bi_bdev = cache->origin_dev->bdev;
				534	}
				535
				536	static void remap_to_cache(struct cache cache, struct bio bio,
				537	dm_cblock_t cblock)
				538	{
				539	sector_t bi_sector = bio->bi_sector;
				540
				541	bio->bi_bdev = cache->cache_dev->bdev;
				542	if (!block_size_is_power_of_two(cache))
				543	bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) +
				544	sector_div(bi_sector, cache->sectors_per_block);
				545	else
				546	bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) \|
				547	(bi_sector & (cache->sectors_per_block - 1));
				548	}
				549
				550	static void check_if_tick_bio_needed(struct cache cache, struct bio bio)
				551	{
				552	unsigned long flags;
				553	struct per_bio_data *pb = get_per_bio_data(bio);
				554
				555	spin_lock_irqsave(&cache->lock, flags);
				556	if (cache->need_tick_bio &&
				557	!(bio->bi_rw & (REQ_FUA \| REQ_FLUSH \| REQ_DISCARD))) {
				558	pb->tick = true;
				559	cache->need_tick_bio = false;
				560	}
				561	spin_unlock_irqrestore(&cache->lock, flags);
				562	}
				563
				564	static void remap_to_origin_clear_discard(struct cache cache, struct bio bio,
				565	dm_oblock_t oblock)
				566	{
				567	check_if_tick_bio_needed(cache, bio);
				568	remap_to_origin(cache, bio);
				569	if (bio_data_dir(bio) == WRITE)
				570	clear_discard(cache, oblock_to_dblock(cache, oblock));
				571	}
				572
				573	static void remap_to_cache_dirty(struct cache cache, struct bio bio,
				574	dm_oblock_t oblock, dm_cblock_t cblock)
				575	{
				576	remap_to_cache(cache, bio, cblock);
				577	if (bio_data_dir(bio) == WRITE) {
				578	set_dirty(cache, oblock, cblock);
				579	clear_discard(cache, oblock_to_dblock(cache, oblock));
				580	}
				581	}
				582
				583	static dm_oblock_t get_bio_block(struct cache cache, struct bio bio)
				584	{
				585	sector_t block_nr = bio->bi_sector;
				586
				587	if (!block_size_is_power_of_two(cache))
				588	(void) sector_div(block_nr, cache->sectors_per_block);
				589	else
				590	block_nr >>= cache->sectors_per_block_shift;
				591
				592	return to_oblock(block_nr);
				593	}
				594
				595	static int bio_triggers_commit(struct cache cache, struct bio bio)
				596	{
				597	return bio->bi_rw & (REQ_FLUSH \| REQ_FUA);
				598	}
				599
				600	static void issue(struct cache cache, struct bio bio)
				601	{
				602	unsigned long flags;
				603
				604	if (!bio_triggers_commit(cache, bio)) {
				605	generic_make_request(bio);
				606	return;
				607	}
				608
				609	/*
				610	* Batch together any bios that trigger commits and then issue a
				611	* single commit for them in do_worker().
				612	*/
				613	spin_lock_irqsave(&cache->lock, flags);
				614	cache->commit_requested = true;
				615	bio_list_add(&cache->deferred_flush_bios, bio);
				616	spin_unlock_irqrestore(&cache->lock, flags);
				617	}
				618
				619	/*----------------------------------------------------------------
				620	* Migration processing
				621	*
				622	* Migration covers moving data from the origin device to the cache, or
				623	* vice versa.
				624	*--------------------------------------------------------------*/
				625	static void free_migration(struct dm_cache_migration *mg)
				626	{
				627	mempool_free(mg, mg->cache->migration_pool);
				628	}
				629
				630	static void inc_nr_migrations(struct cache *cache)
				631	{
				632	atomic_inc(&cache->nr_migrations);
				633	}
				634
				635	static void dec_nr_migrations(struct cache *cache)
				636	{
				637	atomic_dec(&cache->nr_migrations);
				638
				639	/*
				640	* Wake the worker in case we're suspending the target.
				641	*/
				642	wake_up(&cache->migration_wait);
				643	}
				644
				645	static void __cell_defer(struct cache cache, struct dm_bio_prison_cell cell,
				646	bool holder)
				647	{
				648	(holder ? dm_cell_release : dm_cell_release_no_holder)
				649	(cache->prison, cell, &cache->deferred_bios);
				650	free_prison_cell(cache, cell);
				651	}
				652
				653	static void cell_defer(struct cache cache, struct dm_bio_prison_cell cell,
				654	bool holder)
				655	{
				656	unsigned long flags;
				657
				658	spin_lock_irqsave(&cache->lock, flags);
				659	__cell_defer(cache, cell, holder);
				660	spin_unlock_irqrestore(&cache->lock, flags);
				661
				662	wake_worker(cache);
				663	}
				664
				665	static void cleanup_migration(struct dm_cache_migration *mg)
				666	{
				667	dec_nr_migrations(mg->cache);
				668	free_migration(mg);
				669	}
				670
				671	static void migration_failure(struct dm_cache_migration *mg)
				672	{
				673	struct cache *cache = mg->cache;
				674
				675	if (mg->writeback) {
				676	DMWARN_LIMIT("writeback failed; couldn't copy block");
				677	set_dirty(cache, mg->old_oblock, mg->cblock);
				678	cell_defer(cache, mg->old_ocell, false);
				679
				680	} else if (mg->demote) {
				681	DMWARN_LIMIT("demotion failed; couldn't copy block");
				682	policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
				683
				684	cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
				685	if (mg->promote)
				686	cell_defer(cache, mg->new_ocell, 1);
				687	} else {
				688	DMWARN_LIMIT("promotion failed; couldn't copy block");
				689	policy_remove_mapping(cache->policy, mg->new_oblock);
				690	cell_defer(cache, mg->new_ocell, 1);
				691	}
				692
				693	cleanup_migration(mg);
				694	}
				695
				696	static void migration_success_pre_commit(struct dm_cache_migration *mg)
				697	{
				698	unsigned long flags;
				699	struct cache *cache = mg->cache;
				700
				701	if (mg->writeback) {
				702	cell_defer(cache, mg->old_ocell, false);
				703	clear_dirty(cache, mg->old_oblock, mg->cblock);
				704	cleanup_migration(mg);
				705	return;
				706
				707	} else if (mg->demote) {
				708	if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
				709	DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
				710	policy_force_mapping(cache->policy, mg->new_oblock,
				711	mg->old_oblock);
				712	if (mg->promote)
				713	cell_defer(cache, mg->new_ocell, true);
				714	cleanup_migration(mg);
				715	return;
				716	}
				717	} else {
				718	if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
				719	DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
				720	policy_remove_mapping(cache->policy, mg->new_oblock);
				721	cleanup_migration(mg);
				722	return;
				723	}
				724	}
				725
				726	spin_lock_irqsave(&cache->lock, flags);
				727	list_add_tail(&mg->list, &cache->need_commit_migrations);
				728	cache->commit_requested = true;
				729	spin_unlock_irqrestore(&cache->lock, flags);
				730	}
				731
				732	static void migration_success_post_commit(struct dm_cache_migration *mg)
				733	{
				734	unsigned long flags;
				735	struct cache *cache = mg->cache;
				736
				737	if (mg->writeback) {
				738	DMWARN("writeback unexpectedly triggered commit");
				739	return;
				740
				741	} else if (mg->demote) {
				742	cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
				743
				744	if (mg->promote) {
				745	mg->demote = false;
				746
				747	spin_lock_irqsave(&cache->lock, flags);
				748	list_add_tail(&mg->list, &cache->quiesced_migrations);
				749	spin_unlock_irqrestore(&cache->lock, flags);
				750
				751	} else
				752	cleanup_migration(mg);
				753
				754	} else {
				755	cell_defer(cache, mg->new_ocell, true);
				756	clear_dirty(cache, mg->new_oblock, mg->cblock);
				757	cleanup_migration(mg);
				758	}
				759	}
				760
				761	static void copy_complete(int read_err, unsigned long write_err, void *context)
				762	{
				763	unsigned long flags;
				764	struct dm_cache_migration mg = (struct dm_cache_migration ) context;
				765	struct cache *cache = mg->cache;
				766
				767	if (read_err \|\| write_err)
				768	mg->err = true;
				769
				770	spin_lock_irqsave(&cache->lock, flags);
				771	list_add_tail(&mg->list, &cache->completed_migrations);
				772	spin_unlock_irqrestore(&cache->lock, flags);
				773
				774	wake_worker(cache);
				775	}
				776
				777	static void issue_copy_real(struct dm_cache_migration *mg)
				778	{
				779	int r;
				780	struct dm_io_region o_region, c_region;
				781	struct cache *cache = mg->cache;
				782
				783	o_region.bdev = cache->origin_dev->bdev;
				784	o_region.count = cache->sectors_per_block;
				785
				786	c_region.bdev = cache->cache_dev->bdev;
				787	c_region.sector = from_cblock(mg->cblock) * cache->sectors_per_block;
				788	c_region.count = cache->sectors_per_block;
				789
				790	if (mg->writeback \|\| mg->demote) {
				791	/* demote */
				792	o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
				793	r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
				794	} else {
				795	/* promote */
				796	o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
				797	r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
				798	}
				799
				800	if (r < 0)
				801	migration_failure(mg);
				802	}
				803
				804	static void avoid_copy(struct dm_cache_migration *mg)
				805	{
				806	atomic_inc(&mg->cache->stats.copies_avoided);
				807	migration_success_pre_commit(mg);
				808	}
				809
				810	static void issue_copy(struct dm_cache_migration *mg)
				811	{
				812	bool avoid;
				813	struct cache *cache = mg->cache;
				814
				815	if (mg->writeback \|\| mg->demote)
				816	avoid = !is_dirty(cache, mg->cblock) \|\|
				817	is_discarded_oblock(cache, mg->old_oblock);
				818	else
				819	avoid = is_discarded_oblock(cache, mg->new_oblock);
				820
				821	avoid ? avoid_copy(mg) : issue_copy_real(mg);
				822	}
				823
				824	static void complete_migration(struct dm_cache_migration *mg)
				825	{
				826	if (mg->err)
				827	migration_failure(mg);
				828	else
				829	migration_success_pre_commit(mg);
				830	}
				831
				832	static void process_migrations(struct cache cache, struct list_head head,
				833	void (fn)(struct dm_cache_migration ))
				834	{
				835	unsigned long flags;
				836	struct list_head list;
				837	struct dm_cache_migration mg, tmp;
				838
				839	INIT_LIST_HEAD(&list);
				840	spin_lock_irqsave(&cache->lock, flags);
				841	list_splice_init(head, &list);
				842	spin_unlock_irqrestore(&cache->lock, flags);
				843
				844	list_for_each_entry_safe(mg, tmp, &list, list)
				845	fn(mg);
				846	}
				847
				848	static void __queue_quiesced_migration(struct dm_cache_migration *mg)
				849	{
				850	list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
				851	}
				852
				853	static void queue_quiesced_migration(struct dm_cache_migration *mg)
				854	{
				855	unsigned long flags;
				856	struct cache *cache = mg->cache;
				857
				858	spin_lock_irqsave(&cache->lock, flags);
				859	__queue_quiesced_migration(mg);
				860	spin_unlock_irqrestore(&cache->lock, flags);
				861
				862	wake_worker(cache);
				863	}
				864
				865	static void queue_quiesced_migrations(struct cache cache, struct list_head work)
				866	{
				867	unsigned long flags;
				868	struct dm_cache_migration mg, tmp;
				869
				870	spin_lock_irqsave(&cache->lock, flags);
				871	list_for_each_entry_safe(mg, tmp, work, list)
				872	__queue_quiesced_migration(mg);
				873	spin_unlock_irqrestore(&cache->lock, flags);
				874
				875	wake_worker(cache);
				876	}
				877
				878	static void check_for_quiesced_migrations(struct cache *cache,
				879	struct per_bio_data *pb)
				880	{
				881	struct list_head work;
				882
				883	if (!pb->all_io_entry)
				884	return;
				885
				886	INIT_LIST_HEAD(&work);
				887	if (pb->all_io_entry)
				888	dm_deferred_entry_dec(pb->all_io_entry, &work);
				889
				890	if (!list_empty(&work))
				891	queue_quiesced_migrations(cache, &work);
				892	}
				893
				894	static void quiesce_migration(struct dm_cache_migration *mg)
				895	{
				896	if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
				897	queue_quiesced_migration(mg);
				898	}
				899
				900	static void promote(struct cache cache, struct prealloc structs,
				901	dm_oblock_t oblock, dm_cblock_t cblock,
				902	struct dm_bio_prison_cell *cell)
				903	{
				904	struct dm_cache_migration *mg = prealloc_get_migration(structs);
				905
				906	mg->err = false;
				907	mg->writeback = false;
				908	mg->demote = false;
				909	mg->promote = true;
				910	mg->cache = cache;
				911	mg->new_oblock = oblock;
				912	mg->cblock = cblock;
				913	mg->old_ocell = NULL;
				914	mg->new_ocell = cell;
				915	mg->start_jiffies = jiffies;
				916
				917	inc_nr_migrations(cache);
				918	quiesce_migration(mg);
				919	}
				920
				921	static void writeback(struct cache cache, struct prealloc structs,
				922	dm_oblock_t oblock, dm_cblock_t cblock,
				923	struct dm_bio_prison_cell *cell)
				924	{
				925	struct dm_cache_migration *mg = prealloc_get_migration(structs);
				926
				927	mg->err = false;
				928	mg->writeback = true;
				929	mg->demote = false;
				930	mg->promote = false;
				931	mg->cache = cache;
				932	mg->old_oblock = oblock;
				933	mg->cblock = cblock;
				934	mg->old_ocell = cell;
				935	mg->new_ocell = NULL;
				936	mg->start_jiffies = jiffies;
				937
				938	inc_nr_migrations(cache);
				939	quiesce_migration(mg);
				940	}
				941
				942	static void demote_then_promote(struct cache cache, struct prealloc structs,
				943	dm_oblock_t old_oblock, dm_oblock_t new_oblock,
				944	dm_cblock_t cblock,
				945	struct dm_bio_prison_cell *old_ocell,
				946	struct dm_bio_prison_cell *new_ocell)
				947	{
				948	struct dm_cache_migration *mg = prealloc_get_migration(structs);
				949
				950	mg->err = false;
				951	mg->writeback = false;
				952	mg->demote = true;
				953	mg->promote = true;
				954	mg->cache = cache;
				955	mg->old_oblock = old_oblock;
				956	mg->new_oblock = new_oblock;
				957	mg->cblock = cblock;
				958	mg->old_ocell = old_ocell;
				959	mg->new_ocell = new_ocell;
				960	mg->start_jiffies = jiffies;
				961
				962	inc_nr_migrations(cache);
				963	quiesce_migration(mg);
				964	}
				965
				966	/*----------------------------------------------------------------
				967	* bio processing
				968	*--------------------------------------------------------------*/
				969	static void defer_bio(struct cache cache, struct bio bio)
				970	{
				971	unsigned long flags;
				972
				973	spin_lock_irqsave(&cache->lock, flags);
				974	bio_list_add(&cache->deferred_bios, bio);
				975	spin_unlock_irqrestore(&cache->lock, flags);
				976
				977	wake_worker(cache);
				978	}
				979
				980	static void process_flush_bio(struct cache cache, struct bio bio)
				981	{
				982	struct per_bio_data *pb = get_per_bio_data(bio);
				983
				984	BUG_ON(bio->bi_size);
				985	if (!pb->req_nr)
				986	remap_to_origin(cache, bio);
				987	else
				988	remap_to_cache(cache, bio, 0);
				989
				990	issue(cache, bio);
				991	}
				992
				993	/*
				994	* People generally discard large parts of a device, eg, the whole device
				995	* when formatting. Splitting these large discards up into cache block
				996	* sized ios and then quiescing (always necessary for discard) takes too
				997	* long.
				998	*
				999	* We keep it simple, and allow any size of discard to come in, and just
				1000	* mark off blocks on the discard bitset. No passdown occurs!
				1001	*
				1002	* To implement passdown we need to change the bio_prison such that a cell
				1003	* can have a key that spans many blocks.
				1004	*/
				1005	static void process_discard_bio(struct cache cache, struct bio bio)
				1006	{
				1007	dm_block_t start_block = dm_sector_div_up(bio->bi_sector,
				1008	cache->discard_block_size);
				1009	dm_block_t end_block = bio->bi_sector + bio_sectors(bio);
				1010	dm_block_t b;
				1011
Joe Thornber	414dd67	2013-03-20 17:21:25 +0000	[diff] [blame]	1012	end_block = block_div(end_block, cache->discard_block_size);
Joe Thornber	c6b4fcb	2013-03-01 22:45:51 +0000	[diff] [blame]	1013
				1014	for (b = start_block; b < end_block; b++)
				1015	set_discard(cache, to_dblock(b));
				1016
				1017	bio_endio(bio, 0);
				1018	}
				1019
				1020	static bool spare_migration_bandwidth(struct cache *cache)
				1021	{
				1022	sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) *
				1023	cache->sectors_per_block;
				1024	return current_volume < cache->migration_threshold;
				1025	}
				1026
				1027	static bool is_writethrough_io(struct cache cache, struct bio bio,
				1028	dm_cblock_t cblock)
				1029	{
				1030	return bio_data_dir(bio) == WRITE &&
				1031	cache->features.write_through && !is_dirty(cache, cblock);
				1032	}
				1033
				1034	static void inc_hit_counter(struct cache cache, struct bio bio)
				1035	{
				1036	atomic_inc(bio_data_dir(bio) == READ ?
				1037	&cache->stats.read_hit : &cache->stats.write_hit);
				1038	}
				1039
				1040	static void inc_miss_counter(struct cache cache, struct bio bio)
				1041	{
				1042	atomic_inc(bio_data_dir(bio) == READ ?
				1043	&cache->stats.read_miss : &cache->stats.write_miss);
				1044	}
				1045
				1046	static void process_bio(struct cache cache, struct prealloc structs,
				1047	struct bio *bio)
				1048	{
				1049	int r;
				1050	bool release_cell = true;
				1051	dm_oblock_t block = get_bio_block(cache, bio);
				1052	struct dm_bio_prison_cell cell_prealloc, old_ocell, *new_ocell;
				1053	struct policy_result lookup_result;
				1054	struct per_bio_data *pb = get_per_bio_data(bio);
				1055	bool discarded_block = is_discarded_oblock(cache, block);
				1056	bool can_migrate = discarded_block \|\| spare_migration_bandwidth(cache);
				1057
				1058	/*
				1059	* Check to see if that block is currently migrating.
				1060	*/
				1061	cell_prealloc = prealloc_get_cell(structs);
				1062	r = bio_detain(cache, block, bio, cell_prealloc,
				1063	(cell_free_fn) prealloc_put_cell,
				1064	structs, &new_ocell);
				1065	if (r > 0)
				1066	return;
				1067
				1068	r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
				1069	bio, &lookup_result);
				1070
				1071	if (r == -EWOULDBLOCK)
				1072	/* migration has been denied */
				1073	lookup_result.op = POLICY_MISS;
				1074
				1075	switch (lookup_result.op) {
				1076	case POLICY_HIT:
				1077	inc_hit_counter(cache, bio);
				1078	pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
				1079
				1080	if (is_writethrough_io(cache, bio, lookup_result.cblock)) {
				1081	/*
				1082	* No need to mark anything dirty in write through mode.
				1083	*/
				1084	pb->req_nr == 0 ?
				1085	remap_to_cache(cache, bio, lookup_result.cblock) :
				1086	remap_to_origin_clear_discard(cache, bio, block);
				1087	} else
				1088	remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
				1089
				1090	issue(cache, bio);
				1091	break;
				1092
				1093	case POLICY_MISS:
				1094	inc_miss_counter(cache, bio);
				1095	pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
				1096
				1097	if (pb->req_nr != 0) {
				1098	/*
				1099	* This is a duplicate writethrough io that is no
				1100	* longer needed because the block has been demoted.
				1101	*/
				1102	bio_endio(bio, 0);
				1103	} else {
				1104	remap_to_origin_clear_discard(cache, bio, block);
				1105	issue(cache, bio);
				1106	}
				1107	break;
				1108
				1109	case POLICY_NEW:
				1110	atomic_inc(&cache->stats.promotion);
				1111	promote(cache, structs, block, lookup_result.cblock, new_ocell);
				1112	release_cell = false;
				1113	break;
				1114
				1115	case POLICY_REPLACE:
				1116	cell_prealloc = prealloc_get_cell(structs);
				1117	r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
				1118	(cell_free_fn) prealloc_put_cell,
				1119	structs, &old_ocell);
				1120	if (r > 0) {
				1121	/*
				1122	* We have to be careful to avoid lock inversion of
				1123	* the cells. So we back off, and wait for the
				1124	* old_ocell to become free.
				1125	*/
				1126	policy_force_mapping(cache->policy, block,
				1127	lookup_result.old_oblock);
				1128	atomic_inc(&cache->stats.cache_cell_clash);
				1129	break;
				1130	}
				1131	atomic_inc(&cache->stats.demotion);
				1132	atomic_inc(&cache->stats.promotion);
				1133
				1134	demote_then_promote(cache, structs, lookup_result.old_oblock,
				1135	block, lookup_result.cblock,
				1136	old_ocell, new_ocell);
				1137	release_cell = false;
				1138	break;
				1139
				1140	default:
				1141	DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
				1142	(unsigned) lookup_result.op);
				1143	bio_io_error(bio);
				1144	}
				1145
				1146	if (release_cell)
				1147	cell_defer(cache, new_ocell, false);
				1148	}
				1149
				1150	static int need_commit_due_to_time(struct cache *cache)
				1151	{
				1152	return jiffies < cache->last_commit_jiffies \|\|
				1153	jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
				1154	}
				1155
				1156	static int commit_if_needed(struct cache *cache)
				1157	{
				1158	if (dm_cache_changed_this_transaction(cache->cmd) &&
				1159	(cache->commit_requested \|\| need_commit_due_to_time(cache))) {
				1160	atomic_inc(&cache->stats.commit_count);
				1161	cache->last_commit_jiffies = jiffies;
				1162	cache->commit_requested = false;
				1163	return dm_cache_commit(cache->cmd, false);
				1164	}
				1165
				1166	return 0;
				1167	}
				1168
				1169	static void process_deferred_bios(struct cache *cache)
				1170	{
				1171	unsigned long flags;
				1172	struct bio_list bios;
				1173	struct bio *bio;
				1174	struct prealloc structs;
				1175
				1176	memset(&structs, 0, sizeof(structs));
				1177	bio_list_init(&bios);
				1178
				1179	spin_lock_irqsave(&cache->lock, flags);
				1180	bio_list_merge(&bios, &cache->deferred_bios);
				1181	bio_list_init(&cache->deferred_bios);
				1182	spin_unlock_irqrestore(&cache->lock, flags);
				1183
				1184	while (!bio_list_empty(&bios)) {
				1185	/*
				1186	* If we've got no free migration structs, and processing
				1187	* this bio might require one, we pause until there are some
				1188	* prepared mappings to process.
				1189	*/
				1190	if (prealloc_data_structs(cache, &structs)) {
				1191	spin_lock_irqsave(&cache->lock, flags);
				1192	bio_list_merge(&cache->deferred_bios, &bios);
				1193	spin_unlock_irqrestore(&cache->lock, flags);
				1194	break;
				1195	}
				1196
				1197	bio = bio_list_pop(&bios);
				1198
				1199	if (bio->bi_rw & REQ_FLUSH)
				1200	process_flush_bio(cache, bio);
				1201	else if (bio->bi_rw & REQ_DISCARD)
				1202	process_discard_bio(cache, bio);
				1203	else
				1204	process_bio(cache, &structs, bio);
				1205	}
				1206
				1207	prealloc_free_structs(cache, &structs);
				1208	}
				1209
				1210	static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
				1211	{
				1212	unsigned long flags;
				1213	struct bio_list bios;
				1214	struct bio *bio;
				1215
				1216	bio_list_init(&bios);
				1217
				1218	spin_lock_irqsave(&cache->lock, flags);
				1219	bio_list_merge(&bios, &cache->deferred_flush_bios);
				1220	bio_list_init(&cache->deferred_flush_bios);
				1221	spin_unlock_irqrestore(&cache->lock, flags);
				1222
				1223	while ((bio = bio_list_pop(&bios)))
				1224	submit_bios ? generic_make_request(bio) : bio_io_error(bio);
				1225	}
				1226
				1227	static void writeback_some_dirty_blocks(struct cache *cache)
				1228	{
				1229	int r = 0;
				1230	dm_oblock_t oblock;
				1231	dm_cblock_t cblock;
				1232	struct prealloc structs;
				1233	struct dm_bio_prison_cell *old_ocell;
				1234
				1235	memset(&structs, 0, sizeof(structs));
				1236
				1237	while (spare_migration_bandwidth(cache)) {
				1238	if (prealloc_data_structs(cache, &structs))
				1239	break;
				1240
				1241	r = policy_writeback_work(cache->policy, &oblock, &cblock);
				1242	if (r)
				1243	break;
				1244
				1245	r = get_cell(cache, oblock, &structs, &old_ocell);
				1246	if (r) {
				1247	policy_set_dirty(cache->policy, oblock);
				1248	break;
				1249	}
				1250
				1251	writeback(cache, &structs, oblock, cblock, old_ocell);
				1252	}
				1253
				1254	prealloc_free_structs(cache, &structs);
				1255	}
				1256
				1257	/*----------------------------------------------------------------
				1258	* Main worker loop
				1259	*--------------------------------------------------------------*/
				1260	static void start_quiescing(struct cache *cache)
				1261	{
				1262	unsigned long flags;
				1263
				1264	spin_lock_irqsave(&cache->lock, flags);
				1265	cache->quiescing = 1;
				1266	spin_unlock_irqrestore(&cache->lock, flags);
				1267	}
				1268
				1269	static void stop_quiescing(struct cache *cache)
				1270	{
				1271	unsigned long flags;
				1272
				1273	spin_lock_irqsave(&cache->lock, flags);
				1274	cache->quiescing = 0;
				1275	spin_unlock_irqrestore(&cache->lock, flags);
				1276	}
				1277
				1278	static bool is_quiescing(struct cache *cache)
				1279	{
				1280	int r;
				1281	unsigned long flags;
				1282
				1283	spin_lock_irqsave(&cache->lock, flags);
				1284	r = cache->quiescing;
				1285	spin_unlock_irqrestore(&cache->lock, flags);
				1286
				1287	return r;
				1288	}
				1289
				1290	static void wait_for_migrations(struct cache *cache)
				1291	{
				1292	wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations));
				1293	}
				1294
				1295	static void stop_worker(struct cache *cache)
				1296	{
				1297	cancel_delayed_work(&cache->waker);
				1298	flush_workqueue(cache->wq);
				1299	}
				1300
				1301	static void requeue_deferred_io(struct cache *cache)
				1302	{
				1303	struct bio *bio;
				1304	struct bio_list bios;
				1305
				1306	bio_list_init(&bios);
				1307	bio_list_merge(&bios, &cache->deferred_bios);
				1308	bio_list_init(&cache->deferred_bios);
				1309
				1310	while ((bio = bio_list_pop(&bios)))
				1311	bio_endio(bio, DM_ENDIO_REQUEUE);
				1312	}
				1313
				1314	static int more_work(struct cache *cache)
				1315	{
				1316	if (is_quiescing(cache))
				1317	return !list_empty(&cache->quiesced_migrations) \|\|
				1318	!list_empty(&cache->completed_migrations) \|\|
				1319	!list_empty(&cache->need_commit_migrations);
				1320	else
				1321	return !bio_list_empty(&cache->deferred_bios) \|\|
				1322	!bio_list_empty(&cache->deferred_flush_bios) \|\|
				1323	!list_empty(&cache->quiesced_migrations) \|\|
				1324	!list_empty(&cache->completed_migrations) \|\|
				1325	!list_empty(&cache->need_commit_migrations);
				1326	}
				1327
				1328	static void do_worker(struct work_struct *ws)
				1329	{
				1330	struct cache *cache = container_of(ws, struct cache, worker);
				1331
				1332	do {
				1333	if (!is_quiescing(cache))
				1334	process_deferred_bios(cache);
				1335
				1336	process_migrations(cache, &cache->quiesced_migrations, issue_copy);
				1337	process_migrations(cache, &cache->completed_migrations, complete_migration);
				1338
				1339	writeback_some_dirty_blocks(cache);
				1340
				1341	if (commit_if_needed(cache)) {
				1342	process_deferred_flush_bios(cache, false);
				1343
				1344	/*
				1345	* FIXME: rollback metadata or just go into a
				1346	* failure mode and error everything
				1347	*/
				1348	} else {
				1349	process_deferred_flush_bios(cache, true);
				1350	process_migrations(cache, &cache->need_commit_migrations,
				1351	migration_success_post_commit);
				1352	}
				1353	} while (more_work(cache));
				1354	}
				1355
				1356	/*
				1357	* We want to commit periodically so that not too much
				1358	* unwritten metadata builds up.
				1359	*/
				1360	static void do_waker(struct work_struct *ws)
				1361	{
				1362	struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
				1363	wake_worker(cache);
				1364	queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
				1365	}
				1366
				1367	/*----------------------------------------------------------------*/
				1368
				1369	static int is_congested(struct dm_dev *dev, int bdi_bits)
				1370	{
				1371	struct request_queue *q = bdev_get_queue(dev->bdev);
				1372	return bdi_congested(&q->backing_dev_info, bdi_bits);
				1373	}
				1374
				1375	static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
				1376	{
				1377	struct cache *cache = container_of(cb, struct cache, callbacks);
				1378
				1379	return is_congested(cache->origin_dev, bdi_bits) \|\|
				1380	is_congested(cache->cache_dev, bdi_bits);
				1381	}
				1382
				1383	/*----------------------------------------------------------------
				1384	* Target methods
				1385	*--------------------------------------------------------------*/
				1386
				1387	/*
				1388	* This function gets called on the error paths of the constructor, so we
				1389	* have to cope with a partially initialised struct.
				1390	*/
				1391	static void destroy(struct cache *cache)
				1392	{
				1393	unsigned i;
				1394
				1395	if (cache->next_migration)
				1396	mempool_free(cache->next_migration, cache->migration_pool);
				1397
				1398	if (cache->migration_pool)
				1399	mempool_destroy(cache->migration_pool);
				1400
				1401	if (cache->all_io_ds)
				1402	dm_deferred_set_destroy(cache->all_io_ds);
				1403
				1404	if (cache->prison)
				1405	dm_bio_prison_destroy(cache->prison);
				1406
				1407	if (cache->wq)
				1408	destroy_workqueue(cache->wq);
				1409
				1410	if (cache->dirty_bitset)
				1411	free_bitset(cache->dirty_bitset);
				1412
				1413	if (cache->discard_bitset)
				1414	free_bitset(cache->discard_bitset);
				1415
				1416	if (cache->copier)
				1417	dm_kcopyd_client_destroy(cache->copier);
				1418
				1419	if (cache->cmd)
				1420	dm_cache_metadata_close(cache->cmd);
				1421
				1422	if (cache->metadata_dev)
				1423	dm_put_device(cache->ti, cache->metadata_dev);
				1424
				1425	if (cache->origin_dev)
				1426	dm_put_device(cache->ti, cache->origin_dev);
				1427
				1428	if (cache->cache_dev)
				1429	dm_put_device(cache->ti, cache->cache_dev);
				1430
				1431	if (cache->policy)
				1432	dm_cache_policy_destroy(cache->policy);
				1433
				1434	for (i = 0; i < cache->nr_ctr_args ; i++)
				1435	kfree(cache->ctr_args[i]);
				1436	kfree(cache->ctr_args);
				1437
				1438	kfree(cache);
				1439	}
				1440
				1441	static void cache_dtr(struct dm_target *ti)
				1442	{
				1443	struct cache *cache = ti->private;
				1444
				1445	destroy(cache);
				1446	}
				1447
				1448	static sector_t get_dev_size(struct dm_dev *dev)
				1449	{
				1450	return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
				1451	}
				1452
				1453	/*----------------------------------------------------------------*/
				1454
				1455	/*
				1456	* Construct a cache device mapping.
				1457	*
				1458	* cache <metadata dev> <cache dev> <origin dev> <block size>
				1459	* <#feature args> [<feature arg>]*
				1460	* <policy> <#policy args> [<policy arg>]*
				1461	*
				1462	* metadata dev : fast device holding the persistent metadata
				1463	* cache dev : fast device holding cached data blocks
				1464	* origin dev : slow device holding original data blocks
				1465	* block size : cache unit size in sectors
				1466	*
				1467	* #feature args : number of feature arguments passed
				1468	* feature args : writethrough. (The default is writeback.)
				1469	*
				1470	* policy : the replacement policy to use
				1471	* #policy args : an even number of policy arguments corresponding
				1472	* to key/value pairs passed to the policy
				1473	* policy args : key/value pairs passed to the policy
				1474	* E.g. 'sequential_threshold 1024'
				1475	* See cache-policies.txt for details.
				1476	*
				1477	* Optional feature arguments are:
				1478	* writethrough : write through caching that prohibits cache block
				1479	* content from being different from origin block content.
				1480	* Without this argument, the default behaviour is to write
				1481	* back cache block contents later for performance reasons,
				1482	* so they may differ from the corresponding origin blocks.
				1483	*/
				1484	struct cache_args {
				1485	struct dm_target *ti;
				1486
				1487	struct dm_dev *metadata_dev;
				1488
				1489	struct dm_dev *cache_dev;
				1490	sector_t cache_sectors;
				1491
				1492	struct dm_dev *origin_dev;
				1493	sector_t origin_sectors;
				1494
				1495	uint32_t block_size;
				1496
				1497	const char *policy_name;
				1498	int policy_argc;
				1499	const char **policy_argv;
				1500
				1501	struct cache_features features;
				1502	};
				1503
				1504	static void destroy_cache_args(struct cache_args *ca)
				1505	{
				1506	if (ca->metadata_dev)
				1507	dm_put_device(ca->ti, ca->metadata_dev);
				1508
				1509	if (ca->cache_dev)
				1510	dm_put_device(ca->ti, ca->cache_dev);
				1511
				1512	if (ca->origin_dev)
				1513	dm_put_device(ca->ti, ca->origin_dev);
				1514
				1515	kfree(ca);
				1516	}
				1517
				1518	static bool at_least_one_arg(struct dm_arg_set as, char *error)
				1519	{
				1520	if (!as->argc) {
				1521	*error = "Insufficient args";
				1522	return false;
				1523	}
				1524
				1525	return true;
				1526	}
				1527
				1528	static int parse_metadata_dev(struct cache_args ca, struct dm_arg_set as,
				1529	char **error)
				1530	{
				1531	int r;
				1532	sector_t metadata_dev_size;
				1533	char b[BDEVNAME_SIZE];
				1534
				1535	if (!at_least_one_arg(as, error))
				1536	return -EINVAL;
				1537
				1538	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ \| FMODE_WRITE,
				1539	&ca->metadata_dev);
				1540	if (r) {
				1541	*error = "Error opening metadata device";
				1542	return r;
				1543	}
				1544
				1545	metadata_dev_size = get_dev_size(ca->metadata_dev);
				1546	if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
				1547	DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
				1548	bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
				1549
				1550	return 0;
				1551	}
				1552
				1553	static int parse_cache_dev(struct cache_args ca, struct dm_arg_set as,
				1554	char **error)
				1555	{
				1556	int r;
				1557
				1558	if (!at_least_one_arg(as, error))
				1559	return -EINVAL;
				1560
				1561	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ \| FMODE_WRITE,
				1562	&ca->cache_dev);
				1563	if (r) {
				1564	*error = "Error opening cache device";
				1565	return r;
				1566	}
				1567	ca->cache_sectors = get_dev_size(ca->cache_dev);
				1568
				1569	return 0;
				1570	}
				1571
				1572	static int parse_origin_dev(struct cache_args ca, struct dm_arg_set as,
				1573	char **error)
				1574	{
				1575	int r;
				1576
				1577	if (!at_least_one_arg(as, error))
				1578	return -EINVAL;
				1579
				1580	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ \| FMODE_WRITE,
				1581	&ca->origin_dev);
				1582	if (r) {
				1583	*error = "Error opening origin device";
				1584	return r;
				1585	}
				1586
				1587	ca->origin_sectors = get_dev_size(ca->origin_dev);
				1588	if (ca->ti->len > ca->origin_sectors) {
				1589	*error = "Device size larger than cached device";
				1590	return -EINVAL;
				1591	}
				1592
				1593	return 0;
				1594	}
				1595
				1596	static int parse_block_size(struct cache_args ca, struct dm_arg_set as,
				1597	char **error)
				1598	{
				1599	unsigned long tmp;
				1600
				1601	if (!at_least_one_arg(as, error))
				1602	return -EINVAL;
				1603
				1604	if (kstrtoul(dm_shift_arg(as), 10, &tmp) \|\| !tmp \|\|
				1605	tmp < DATA_DEV_BLOCK_SIZE_MIN_SECTORS \|\|
				1606	tmp & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
				1607	*error = "Invalid data block size";
				1608	return -EINVAL;
				1609	}
				1610
				1611	if (tmp > ca->cache_sectors) {
				1612	*error = "Data block size is larger than the cache device";
				1613	return -EINVAL;
				1614	}
				1615
				1616	ca->block_size = tmp;
				1617
				1618	return 0;
				1619	}
				1620
				1621	static void init_features(struct cache_features *cf)
				1622	{
				1623	cf->mode = CM_WRITE;
				1624	cf->write_through = false;
				1625	}
				1626
				1627	static int parse_features(struct cache_args ca, struct dm_arg_set as,
				1628	char **error)
				1629	{
				1630	static struct dm_arg _args[] = {
				1631	{0, 1, "Invalid number of cache feature arguments"},
				1632	};
				1633
				1634	int r;
				1635	unsigned argc;
				1636	const char *arg;
				1637	struct cache_features *cf = &ca->features;
				1638
				1639	init_features(cf);
				1640
				1641	r = dm_read_arg_group(_args, as, &argc, error);
				1642	if (r)
				1643	return -EINVAL;
				1644
				1645	while (argc--) {
				1646	arg = dm_shift_arg(as);
				1647
				1648	if (!strcasecmp(arg, "writeback"))
				1649	cf->write_through = false;
				1650
				1651	else if (!strcasecmp(arg, "writethrough"))
				1652	cf->write_through = true;
				1653
				1654	else {
				1655	*error = "Unrecognised cache feature requested";
				1656	return -EINVAL;
				1657	}
				1658	}
				1659
				1660	return 0;
				1661	}
				1662
				1663	static int parse_policy(struct cache_args ca, struct dm_arg_set as,
				1664	char **error)
				1665	{
				1666	static struct dm_arg _args[] = {
				1667	{0, 1024, "Invalid number of policy arguments"},
				1668	};
				1669
				1670	int r;
				1671
				1672	if (!at_least_one_arg(as, error))
				1673	return -EINVAL;
				1674
				1675	ca->policy_name = dm_shift_arg(as);
				1676
				1677	r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
				1678	if (r)
				1679	return -EINVAL;
				1680
				1681	ca->policy_argv = (const char **)as->argv;
				1682	dm_consume_args(as, ca->policy_argc);
				1683
				1684	return 0;
				1685	}
				1686
				1687	static int parse_cache_args(struct cache_args ca, int argc, char *argv,
				1688	char **error)
				1689	{
				1690	int r;
				1691	struct dm_arg_set as;
				1692
				1693	as.argc = argc;
				1694	as.argv = argv;
				1695
				1696	r = parse_metadata_dev(ca, &as, error);
				1697	if (r)
				1698	return r;
				1699
				1700	r = parse_cache_dev(ca, &as, error);
				1701	if (r)
				1702	return r;
				1703
				1704	r = parse_origin_dev(ca, &as, error);
				1705	if (r)
				1706	return r;
				1707
				1708	r = parse_block_size(ca, &as, error);
				1709	if (r)
				1710	return r;
				1711
				1712	r = parse_features(ca, &as, error);
				1713	if (r)
				1714	return r;
				1715
				1716	r = parse_policy(ca, &as, error);
				1717	if (r)
				1718	return r;
				1719
				1720	return 0;
				1721	}
				1722
				1723	/*----------------------------------------------------------------*/
				1724
				/* Slab cache that backs each cache's migration mempool. */
				static struct kmem_cache *migration_cache;
				1726
				1727	static int set_config_values(struct dm_cache_policy p, int argc, const char *argv)
				1728	{
				1729	int r = 0;
				1730
				1731	if (argc & 1) {
				1732	DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
				1733	return -EINVAL;
				1734	}
				1735
				1736	while (argc) {
				1737	r = policy_set_config_value(p, argv[0], argv[1]);
				1738	if (r) {
				1739	DMWARN("policy_set_config_value failed: key = '%s', value = '%s'",
				1740	argv[0], argv[1]);
				1741	return r;
				1742	}
				1743
				1744	argc -= 2;
				1745	argv += 2;
				1746	}
				1747
				1748	return r;
				1749	}
				1750
				1751	static int create_cache_policy(struct cache cache, struct cache_args ca,
				1752	char **error)
				1753	{
				1754	int r;
				1755
				1756	cache->policy = dm_cache_policy_create(ca->policy_name,
				1757	cache->cache_size,
				1758	cache->origin_sectors,
				1759	cache->sectors_per_block);
				1760	if (!cache->policy) {
				1761	*error = "Error creating cache's policy";
				1762	return -ENOMEM;
				1763	}
				1764
				1765	r = set_config_values(cache->policy, ca->policy_argc, ca->policy_argv);
				1766	if (r)
				1767	dm_cache_policy_destroy(cache->policy);
				1768
				1769	return r;
				1770	}
				1771
				1772	/*
				1773	* We want the discard block size to be a power of two, at least the size
				1774	* of the cache block size, and have no more than 2^14 discard blocks
				1775	* across the origin.
				1776	*/
				1777	#define MAX_DISCARD_BLOCKS (1 << 14)
				1778
				1779	static bool too_many_discard_blocks(sector_t discard_block_size,
				1780	sector_t origin_size)
				1781	{
				1782	(void) sector_div(origin_size, discard_block_size);
				1783
				1784	return origin_size > MAX_DISCARD_BLOCKS;
				1785	}
				1786
				1787	static sector_t calculate_discard_block_size(sector_t cache_block_size,
				1788	sector_t origin_size)
				1789	{
				1790	sector_t discard_block_size;
				1791
				1792	discard_block_size = roundup_pow_of_two(cache_block_size);
				1793
				1794	if (origin_size)
				1795	while (too_many_discard_blocks(discard_block_size, origin_size))
				1796	discard_block_size *= 2;
				1797
				1798	return discard_block_size;
				1799	}
				1800
				/* Initial value of cache->migration_threshold, set in cache_create(). */
				#define DEFAULT_MIGRATION_THRESHOLD (2048 * 100)

				/* Defined below; cache_create() installs it for write-through caches. */
				static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio);
				1804
				1805	static int cache_create(struct cache_args ca, struct cache *result)
				1806	{
				1807	int r = 0;
				1808	char **error = &ca->ti->error;
				1809	struct cache *cache;
				1810	struct dm_target *ti = ca->ti;
				1811	dm_block_t origin_blocks;
				1812	struct dm_cache_metadata *cmd;
				1813	bool may_format = ca->features.mode == CM_WRITE;
				1814
				1815	cache = kzalloc(sizeof(*cache), GFP_KERNEL);
				1816	if (!cache)
				1817	return -ENOMEM;
				1818
				1819	cache->ti = ca->ti;
				1820	ti->private = cache;
				1821	ti->per_bio_data_size = sizeof(struct per_bio_data);
				1822	ti->num_flush_bios = 2;
				1823	ti->flush_supported = true;
				1824
				1825	ti->num_discard_bios = 1;
				1826	ti->discards_supported = true;
				1827	ti->discard_zeroes_data_unsupported = true;
				1828
				1829	memcpy(&cache->features, &ca->features, sizeof(cache->features));
				1830
				1831	if (cache->features.write_through)
				1832	ti->num_write_bios = cache_num_write_bios;
				1833
				1834	cache->callbacks.congested_fn = cache_is_congested;
				1835	dm_table_add_target_callbacks(ti->table, &cache->callbacks);
				1836
				1837	cache->metadata_dev = ca->metadata_dev;
				1838	cache->origin_dev = ca->origin_dev;
				1839	cache->cache_dev = ca->cache_dev;
				1840
				1841	ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
				1842
				1843	/* FIXME: factor out this whole section */
				1844	origin_blocks = cache->origin_sectors = ca->origin_sectors;
Joe Thornber	414dd67	2013-03-20 17:21:25 +0000	[diff] [blame]	1845	origin_blocks = block_div(origin_blocks, ca->block_size);
Joe Thornber	c6b4fcb	2013-03-01 22:45:51 +0000	[diff] [blame]	1846	cache->origin_blocks = to_oblock(origin_blocks);
				1847
				1848	cache->sectors_per_block = ca->block_size;
				1849	if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
				1850	r = -EINVAL;
				1851	goto bad;
				1852	}
				1853
				1854	if (ca->block_size & (ca->block_size - 1)) {
				1855	dm_block_t cache_size = ca->cache_sectors;
				1856
				1857	cache->sectors_per_block_shift = -1;
Joe Thornber	414dd67	2013-03-20 17:21:25 +0000	[diff] [blame]	1858	cache_size = block_div(cache_size, ca->block_size);
Joe Thornber	c6b4fcb	2013-03-01 22:45:51 +0000	[diff] [blame]	1859	cache->cache_size = to_cblock(cache_size);
				1860	} else {
				1861	cache->sectors_per_block_shift = __ffs(ca->block_size);
				1862	cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift);
				1863	}
				1864
				1865	r = create_cache_policy(cache, ca, error);
				1866	if (r)
				1867	goto bad;
				1868	cache->policy_nr_args = ca->policy_argc;
				1869
				1870	cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
				1871	ca->block_size, may_format,
				1872	dm_cache_policy_get_hint_size(cache->policy));
				1873	if (IS_ERR(cmd)) {
				1874	*error = "Error creating metadata object";
				1875	r = PTR_ERR(cmd);
				1876	goto bad;
				1877	}
				1878	cache->cmd = cmd;
				1879
				1880	spin_lock_init(&cache->lock);
				1881	bio_list_init(&cache->deferred_bios);
				1882	bio_list_init(&cache->deferred_flush_bios);
				1883	INIT_LIST_HEAD(&cache->quiesced_migrations);
				1884	INIT_LIST_HEAD(&cache->completed_migrations);
				1885	INIT_LIST_HEAD(&cache->need_commit_migrations);
				1886	cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
				1887	atomic_set(&cache->nr_migrations, 0);
				1888	init_waitqueue_head(&cache->migration_wait);
				1889
				1890	cache->nr_dirty = 0;
				1891	cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
				1892	if (!cache->dirty_bitset) {
				1893	*error = "could not allocate dirty bitset";
				1894	goto bad;
				1895	}
				1896	clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
				1897
				1898	cache->discard_block_size =
				1899	calculate_discard_block_size(cache->sectors_per_block,
				1900	cache->origin_sectors);
				1901	cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks);
				1902	cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
				1903	if (!cache->discard_bitset) {
				1904	*error = "could not allocate discard bitset";
				1905	goto bad;
				1906	}
				1907	clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
				1908
				1909	cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
				1910	if (IS_ERR(cache->copier)) {
				1911	*error = "could not create kcopyd client";
				1912	r = PTR_ERR(cache->copier);
				1913	goto bad;
				1914	}
				1915
				1916	cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
				1917	if (!cache->wq) {
				1918	*error = "could not create workqueue for metadata object";
				1919	goto bad;
				1920	}
				1921	INIT_WORK(&cache->worker, do_worker);
				1922	INIT_DELAYED_WORK(&cache->waker, do_waker);
				1923	cache->last_commit_jiffies = jiffies;
				1924
				1925	cache->prison = dm_bio_prison_create(PRISON_CELLS);
				1926	if (!cache->prison) {
				1927	*error = "could not create bio prison";
				1928	goto bad;
				1929	}
				1930
				1931	cache->all_io_ds = dm_deferred_set_create();
				1932	if (!cache->all_io_ds) {
				1933	*error = "could not create all_io deferred set";
				1934	goto bad;
				1935	}
				1936
				1937	cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
				1938	migration_cache);
				1939	if (!cache->migration_pool) {
				1940	*error = "Error creating cache's migration mempool";
				1941	goto bad;
				1942	}
				1943
				1944	cache->next_migration = NULL;
				1945
				1946	cache->need_tick_bio = true;
				1947	cache->sized = false;
				1948	cache->quiescing = false;
				1949	cache->commit_requested = false;
				1950	cache->loaded_mappings = false;
				1951	cache->loaded_discards = false;
				1952
				1953	load_stats(cache);
				1954
				1955	atomic_set(&cache->stats.demotion, 0);
				1956	atomic_set(&cache->stats.promotion, 0);
				1957	atomic_set(&cache->stats.copies_avoided, 0);
				1958	atomic_set(&cache->stats.cache_cell_clash, 0);
				1959	atomic_set(&cache->stats.commit_count, 0);
				1960	atomic_set(&cache->stats.discard_count, 0);
				1961
				1962	*result = cache;
				1963	return 0;
				1964
				1965	bad:
				1966	destroy(cache);
				1967	return r;
				1968	}
				1969
				1970	static int copy_ctr_args(struct cache cache, int argc, const char *argv)
				1971	{
				1972	unsigned i;
				1973	const char **copy;
				1974
				1975	copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
				1976	if (!copy)
				1977	return -ENOMEM;
				1978	for (i = 0; i < argc; i++) {
				1979	copy[i] = kstrdup(argv[i], GFP_KERNEL);
				1980	if (!copy[i]) {
				1981	while (i--)
				1982	kfree(copy[i]);
				1983	kfree(copy);
				1984	return -ENOMEM;
				1985	}
				1986	}
				1987
				1988	cache->nr_ctr_args = argc;
				1989	cache->ctr_args = copy;
				1990
				1991	return 0;
				1992	}
				1993
				1994	static int cache_ctr(struct dm_target ti, unsigned argc, char *argv)
				1995	{
				1996	int r = -EINVAL;
				1997	struct cache_args *ca;
				1998	struct cache *cache = NULL;
				1999
				2000	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
				2001	if (!ca) {
				2002	ti->error = "Error allocating memory for cache";
				2003	return -ENOMEM;
				2004	}
				2005	ca->ti = ti;
				2006
				2007	r = parse_cache_args(ca, argc, argv, &ti->error);
				2008	if (r)
				2009	goto out;
				2010
				2011	r = cache_create(ca, &cache);
Heinz Mauelshagen	617a0b8	2013-03-20 17:21:26 +0000	[diff] [blame^]	2012	if (r)
				2013	goto out;
Joe Thornber	c6b4fcb	2013-03-01 22:45:51 +0000	[diff] [blame]	2014
				2015	r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
				2016	if (r) {
				2017	destroy(cache);
				2018	goto out;
				2019	}
				2020
				2021	ti->private = cache;
				2022
				2023	out:
				2024	destroy_cache_args(ca);
				2025	return r;
				2026	}
				2027
				2028	static unsigned cache_num_write_bios(struct dm_target ti, struct bio bio)
				2029	{
				2030	int r;
				2031	struct cache *cache = ti->private;
				2032	dm_oblock_t block = get_bio_block(cache, bio);
				2033	dm_cblock_t cblock;
				2034
				2035	r = policy_lookup(cache->policy, block, &cblock);
				2036	if (r < 0)
				2037	return 2; /* assume the worst */
				2038
				2039	return (!r && !is_dirty(cache, cblock)) ? 2 : 1;
				2040	}
				2041
				2042	static int cache_map(struct dm_target ti, struct bio bio)
				2043	{
				2044	struct cache *cache = ti->private;
				2045
				2046	int r;
				2047	dm_oblock_t block = get_bio_block(cache, bio);
				2048	bool can_migrate = false;
				2049	bool discarded_block;
				2050	struct dm_bio_prison_cell *cell;
				2051	struct policy_result lookup_result;
				2052	struct per_bio_data *pb;
				2053
				2054	if (from_oblock(block) > from_oblock(cache->origin_blocks)) {
				2055	/*
				2056	* This can only occur if the io goes to a partial block at
				2057	* the end of the origin device. We don't cache these.
				2058	* Just remap to the origin and carry on.
				2059	*/
				2060	remap_to_origin_clear_discard(cache, bio, block);
				2061	return DM_MAPIO_REMAPPED;
				2062	}
				2063
				2064	pb = init_per_bio_data(bio);
				2065
				2066	if (bio->bi_rw & (REQ_FLUSH \| REQ_FUA \| REQ_DISCARD)) {
				2067	defer_bio(cache, bio);
				2068	return DM_MAPIO_SUBMITTED;
				2069	}
				2070
				2071	/*
				2072	* Check to see if that block is currently migrating.
				2073	*/
				2074	cell = alloc_prison_cell(cache);
				2075	if (!cell) {
				2076	defer_bio(cache, bio);
				2077	return DM_MAPIO_SUBMITTED;
				2078	}
				2079
				2080	r = bio_detain(cache, block, bio, cell,
				2081	(cell_free_fn) free_prison_cell,
				2082	cache, &cell);
				2083	if (r) {
				2084	if (r < 0)
				2085	defer_bio(cache, bio);
				2086
				2087	return DM_MAPIO_SUBMITTED;
				2088	}
				2089
				2090	discarded_block = is_discarded_oblock(cache, block);
				2091
				2092	r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
				2093	bio, &lookup_result);
				2094	if (r == -EWOULDBLOCK) {
				2095	cell_defer(cache, cell, true);
				2096	return DM_MAPIO_SUBMITTED;
				2097
				2098	} else if (r) {
				2099	DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
				2100	bio_io_error(bio);
				2101	return DM_MAPIO_SUBMITTED;
				2102	}
				2103
				2104	switch (lookup_result.op) {
				2105	case POLICY_HIT:
				2106	inc_hit_counter(cache, bio);
				2107	pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
				2108
				2109	if (is_writethrough_io(cache, bio, lookup_result.cblock)) {
				2110	/*
				2111	* No need to mark anything dirty in write through mode.
				2112	*/
				2113	pb->req_nr == 0 ?
				2114	remap_to_cache(cache, bio, lookup_result.cblock) :
				2115	remap_to_origin_clear_discard(cache, bio, block);
				2116	cell_defer(cache, cell, false);
				2117	} else {
				2118	remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
				2119	cell_defer(cache, cell, false);
				2120	}
				2121	break;
				2122
				2123	case POLICY_MISS:
				2124	inc_miss_counter(cache, bio);
				2125	pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
				2126
				2127	if (pb->req_nr != 0) {
				2128	/*
				2129	* This is a duplicate writethrough io that is no
				2130	* longer needed because the block has been demoted.
				2131	*/
				2132	bio_endio(bio, 0);
				2133	cell_defer(cache, cell, false);
				2134	return DM_MAPIO_SUBMITTED;
				2135	} else {
				2136	remap_to_origin_clear_discard(cache, bio, block);
				2137	cell_defer(cache, cell, false);
				2138	}
				2139	break;
				2140
				2141	default:
				2142	DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
				2143	(unsigned) lookup_result.op);
				2144	bio_io_error(bio);
				2145	return DM_MAPIO_SUBMITTED;
				2146	}
				2147
				2148	return DM_MAPIO_REMAPPED;
				2149	}
				2150
				2151	static int cache_end_io(struct dm_target ti, struct bio bio, int error)
				2152	{
				2153	struct cache *cache = ti->private;
				2154	unsigned long flags;
				2155	struct per_bio_data *pb = get_per_bio_data(bio);
				2156
				2157	if (pb->tick) {
				2158	policy_tick(cache->policy);
				2159
				2160	spin_lock_irqsave(&cache->lock, flags);
				2161	cache->need_tick_bio = true;
				2162	spin_unlock_irqrestore(&cache->lock, flags);
				2163	}
				2164
				2165	check_for_quiesced_migrations(cache, pb);
				2166
				2167	return 0;
				2168	}
				2169
				2170	static int write_dirty_bitset(struct cache *cache)
				2171	{
				2172	unsigned i, r;
				2173
				2174	for (i = 0; i < from_cblock(cache->cache_size); i++) {
				2175	r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
				2176	is_dirty(cache, to_cblock(i)));
				2177	if (r)
				2178	return r;
				2179	}
				2180
				2181	return 0;
				2182	}
				2183
				2184	static int write_discard_bitset(struct cache *cache)
				2185	{
				2186	unsigned i, r;
				2187
				2188	r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
				2189	cache->discard_nr_blocks);
				2190	if (r) {
				2191	DMERR("could not resize on-disk discard bitset");
				2192	return r;
				2193	}
				2194
				2195	for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
				2196	r = dm_cache_set_discard(cache->cmd, to_dblock(i),
				2197	is_discarded(cache, to_dblock(i)));
				2198	if (r)
				2199	return r;
				2200	}
				2201
				2202	return 0;
				2203	}
				2204
				2205	static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock,
				2206	uint32_t hint)
				2207	{
				2208	struct cache *cache = context;
				2209	return dm_cache_save_hint(cache->cmd, cblock, hint);
				2210	}
				2211
				2212	static int write_hints(struct cache *cache)
				2213	{
				2214	int r;
				2215
				2216	r = dm_cache_begin_hints(cache->cmd, cache->policy);
				2217	if (r) {
				2218	DMERR("dm_cache_begin_hints failed");
				2219	return r;
				2220	}
				2221
				2222	r = policy_walk_mappings(cache->policy, save_hint, cache);
				2223	if (r)
				2224	DMERR("policy_walk_mappings failed");
				2225
				2226	return r;
				2227	}
				2228
				2229	/*
				2230	* returns true on success
				2231	*/
				2232	static bool sync_metadata(struct cache *cache)
				2233	{
				2234	int r1, r2, r3, r4;
				2235
				2236	r1 = write_dirty_bitset(cache);
				2237	if (r1)
				2238	DMERR("could not write dirty bitset");
				2239
				2240	r2 = write_discard_bitset(cache);
				2241	if (r2)
				2242	DMERR("could not write discard bitset");
				2243
				2244	save_stats(cache);
				2245
				2246	r3 = write_hints(cache);
				2247	if (r3)
				2248	DMERR("could not write hints");
				2249
				2250	/*
				2251	* If writing the above metadata failed, we still commit, but don't
				2252	* set the clean shutdown flag. This will effectively force every
				2253	* dirty bit to be set on reload.
				2254	*/
				2255	r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
				2256	if (r4)
				2257	DMERR("could not write cache metadata. Data loss may occur.");
				2258
				2259	return !r1 && !r2 && !r3 && !r4;
				2260	}
				2261
				2262	static void cache_postsuspend(struct dm_target *ti)
				2263	{
				2264	struct cache *cache = ti->private;
				2265
				2266	start_quiescing(cache);
				2267	wait_for_migrations(cache);
				2268	stop_worker(cache);
				2269	requeue_deferred_io(cache);
				2270	stop_quiescing(cache);
				2271
				2272	(void) sync_metadata(cache);
				2273	}
				2274
				2275	static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
				2276	bool dirty, uint32_t hint, bool hint_valid)
				2277	{
				2278	int r;
				2279	struct cache *cache = context;
				2280
				2281	r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
				2282	if (r)
				2283	return r;
				2284
				2285	if (dirty)
				2286	set_dirty(cache, oblock, cblock);
				2287	else
				2288	clear_dirty(cache, oblock, cblock);
				2289
				2290	return 0;
				2291	}
				2292
				2293	static int load_discard(void *context, sector_t discard_block_size,
				2294	dm_dblock_t dblock, bool discard)
				2295	{
				2296	struct cache *cache = context;
				2297
				2298	/* FIXME: handle mis-matched block size */
				2299
				2300	if (discard)
				2301	set_discard(cache, dblock);
				2302	else
				2303	clear_discard(cache, dblock);
				2304
				2305	return 0;
				2306	}
				2307
				2308	static int cache_preresume(struct dm_target *ti)
				2309	{
				2310	int r = 0;
				2311	struct cache *cache = ti->private;
				2312	sector_t actual_cache_size = get_dev_size(cache->cache_dev);
				2313	(void) sector_div(actual_cache_size, cache->sectors_per_block);
				2314
				2315	/*
				2316	* Check to see if the cache has resized.
				2317	*/
				2318	if (from_cblock(cache->cache_size) != actual_cache_size \|\| !cache->sized) {
				2319	cache->cache_size = to_cblock(actual_cache_size);
				2320
				2321	r = dm_cache_resize(cache->cmd, cache->cache_size);
				2322	if (r) {
				2323	DMERR("could not resize cache metadata");
				2324	return r;
				2325	}
				2326
				2327	cache->sized = true;
				2328	}
				2329
				2330	if (!cache->loaded_mappings) {
				2331	r = dm_cache_load_mappings(cache->cmd,
				2332	dm_cache_policy_get_name(cache->policy),
				2333	load_mapping, cache);
				2334	if (r) {
				2335	DMERR("could not load cache mappings");
				2336	return r;
				2337	}
				2338
				2339	cache->loaded_mappings = true;
				2340	}
				2341
				2342	if (!cache->loaded_discards) {
				2343	r = dm_cache_load_discards(cache->cmd, load_discard, cache);
				2344	if (r) {
				2345	DMERR("could not load origin discards");
				2346	return r;
				2347	}
				2348
				2349	cache->loaded_discards = true;
				2350	}
				2351
				2352	return r;
				2353	}
				2354
				2355	static void cache_resume(struct dm_target *ti)
				2356	{
				2357	struct cache *cache = ti->private;
				2358
				2359	cache->need_tick_bio = true;
				2360	do_waker(&cache->waker.work);
				2361	}
				2362
				2363	/*
				2364	* Status format:
				2365	*
				2366	* <#used metadata blocks>/<#total metadata blocks>
				2367	* <#read hits> <#read misses> <#write hits> <#write misses>
				2368	* <#demotions> <#promotions> <#blocks in cache> <#dirty>
				2369	* <#features> <features>*
				2370	* <#core args> <core args>
				2371	* <#policy args> <policy args>*
				2372	*/
				2373	static void cache_status(struct dm_target *ti, status_type_t type,
				2374	unsigned status_flags, char *result, unsigned maxlen)
				2375	{
				2376	int r = 0;
				2377	unsigned i;
				2378	ssize_t sz = 0;
				2379	dm_block_t nr_free_blocks_metadata = 0;
				2380	dm_block_t nr_blocks_metadata = 0;
				2381	char buf[BDEVNAME_SIZE];
				2382	struct cache *cache = ti->private;
				2383	dm_cblock_t residency;
				2384
				2385	switch (type) {
				2386	case STATUSTYPE_INFO:
				2387	/* Commit to ensure statistics aren't out-of-date */
				2388	if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
				2389	r = dm_cache_commit(cache->cmd, false);
				2390	if (r)
				2391	DMERR("could not commit metadata for accurate status");
				2392	}
				2393
				2394	r = dm_cache_get_free_metadata_block_count(cache->cmd,
				2395	&nr_free_blocks_metadata);
				2396	if (r) {
				2397	DMERR("could not get metadata free block count");
				2398	goto err;
				2399	}
				2400
				2401	r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
				2402	if (r) {
				2403	DMERR("could not get metadata device size");
				2404	goto err;
				2405	}
				2406
				2407	residency = policy_residency(cache->policy);
				2408
				2409	DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ",
				2410	(unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
				2411	(unsigned long long)nr_blocks_metadata,
				2412	(unsigned) atomic_read(&cache->stats.read_hit),
				2413	(unsigned) atomic_read(&cache->stats.read_miss),
				2414	(unsigned) atomic_read(&cache->stats.write_hit),
				2415	(unsigned) atomic_read(&cache->stats.write_miss),
				2416	(unsigned) atomic_read(&cache->stats.demotion),
				2417	(unsigned) atomic_read(&cache->stats.promotion),
				2418	(unsigned long long) from_cblock(residency),
				2419	cache->nr_dirty);
				2420
				2421	if (cache->features.write_through)
				2422	DMEMIT("1 writethrough ");
				2423	else
				2424	DMEMIT("0 ");
				2425
				2426	DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
				2427	if (sz < maxlen) {
				2428	r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
				2429	if (r)
				2430	DMERR("policy_emit_config_values returned %d", r);
				2431	}
				2432
				2433	break;
				2434
				2435	case STATUSTYPE_TABLE:
				2436	format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
				2437	DMEMIT("%s ", buf);
				2438	format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
				2439	DMEMIT("%s ", buf);
				2440	format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
				2441	DMEMIT("%s", buf);
				2442
				2443	for (i = 0; i < cache->nr_ctr_args - 1; i++)
				2444	DMEMIT(" %s", cache->ctr_args[i]);
				2445	if (cache->nr_ctr_args)
				2446	DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
				2447	}
				2448
				2449	return;
				2450
				2451	err:
				2452	DMEMIT("Error");
				2453	}
				2454
				2455	#define NOT_CORE_OPTION 1
				2456
				2457	static int process_config_option(struct cache cache, char *argv)
				2458	{
				2459	unsigned long tmp;
				2460
				2461	if (!strcasecmp(argv[0], "migration_threshold")) {
				2462	if (kstrtoul(argv[1], 10, &tmp))
				2463	return -EINVAL;
				2464
				2465	cache->migration_threshold = tmp;
				2466	return 0;
				2467	}
				2468
				2469	return NOT_CORE_OPTION;
				2470	}
				2471
				2472	/*
				2473	* Supports <key> <value>.
				2474	*
				2475	* The key migration_threshold is supported by the cache target core.
				2476	*/
				2477	static int cache_message(struct dm_target ti, unsigned argc, char *argv)
				2478	{
				2479	int r;
				2480	struct cache *cache = ti->private;
				2481
				2482	if (argc != 2)
				2483	return -EINVAL;
				2484
				2485	r = process_config_option(cache, argv);
				2486	if (r == NOT_CORE_OPTION)
				2487	return policy_set_config_value(cache->policy, argv[0], argv[1]);
				2488
				2489	return r;
				2490	}
				2491
				2492	static int cache_iterate_devices(struct dm_target *ti,
				2493	iterate_devices_callout_fn fn, void *data)
				2494	{
				2495	int r = 0;
				2496	struct cache *cache = ti->private;
				2497
				2498	r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
				2499	if (!r)
				2500	r = fn(ti, cache->origin_dev, 0, ti->len, data);
				2501
				2502	return r;
				2503	}
				2504
				2505	/*
				2506	* We assume I/O is going to the origin (which is the volume
				2507	* more likely to have restrictions e.g. by being striped).
				2508	* (Looking up the exact location of the data would be expensive
				2509	* and could always be out of date by the time the bio is submitted.)
				2510	*/
				2511	static int cache_bvec_merge(struct dm_target *ti,
				2512	struct bvec_merge_data *bvm,
				2513	struct bio_vec *biovec, int max_size)
				2514	{
				2515	struct cache *cache = ti->private;
				2516	struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);
				2517
				2518	if (!q->merge_bvec_fn)
				2519	return max_size;
				2520
				2521	bvm->bi_bdev = cache->origin_dev->bdev;
				2522	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
				2523	}
				2524
				2525	static void set_discard_limits(struct cache cache, struct queue_limits limits)
				2526	{
				2527	/*
				2528	* FIXME: these limits may be incompatible with the cache device
				2529	*/
				2530	limits->max_discard_sectors = cache->discard_block_size * 1024;
				2531	limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
				2532	}
				2533
				2534	static void cache_io_hints(struct dm_target ti, struct queue_limits limits)
				2535	{
				2536	struct cache *cache = ti->private;
				2537
				2538	blk_limits_io_min(limits, 0);
				2539	blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
				2540	set_discard_limits(cache, limits);
				2541	}
				2542
/*----------------------------------------------------------------*/
				2544
				2545	static struct target_type cache_target = {
				2546	.name = "cache",
				2547	.version = {1, 0, 0},
				2548	.module = THIS_MODULE,
				2549	.ctr = cache_ctr,
				2550	.dtr = cache_dtr,
				2551	.map = cache_map,
				2552	.end_io = cache_end_io,
				2553	.postsuspend = cache_postsuspend,
				2554	.preresume = cache_preresume,
				2555	.resume = cache_resume,
				2556	.status = cache_status,
				2557	.message = cache_message,
				2558	.iterate_devices = cache_iterate_devices,
				2559	.merge = cache_bvec_merge,
				2560	.io_hints = cache_io_hints,
				2561	};
				2562
				2563	static int __init dm_cache_init(void)
				2564	{
				2565	int r;
				2566
				2567	r = dm_register_target(&cache_target);
				2568	if (r) {
				2569	DMERR("cache target registration failed: %d", r);
				2570	return r;
				2571	}
				2572
				2573	migration_cache = KMEM_CACHE(dm_cache_migration, 0);
				2574	if (!migration_cache) {
				2575	dm_unregister_target(&cache_target);
				2576	return -ENOMEM;
				2577	}
				2578
				2579	return 0;
				2580	}
				2581
				2582	static void __exit dm_cache_exit(void)
				2583	{
				2584	dm_unregister_target(&cache_target);
				2585	kmem_cache_destroy(migration_cache);
				2586	}
				2587
				2588	module_init(dm_cache_init);
				2589	module_exit(dm_cache_exit);
				2590
				2591	MODULE_DESCRIPTION(DM_NAME " cache target");
				2592	MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
				2593	MODULE_LICENSE("GPL");