Blame - drivers/md/dm-cache-target.c - kernel/msm-4.9

blob: 5ad227f0cea3f8af09c9131e95f4e7f3a1497d31 [file] [log] [blame]

Joe Thornber	c6b4fcb	2013-03-01 22:45:51 +0000	[diff] [blame]	1	/*
				2	* Copyright (C) 2012 Red Hat. All rights reserved.
				3	*
				4	* This file is released under the GPL.
				5	*/
				6
				7	#include "dm.h"
				8	#include "dm-bio-prison.h"
				9	#include "dm-cache-metadata.h"
				10
				11	#include <linux/dm-io.h>
				12	#include <linux/dm-kcopyd.h>
				13	#include <linux/init.h>
				14	#include <linux/mempool.h>
				15	#include <linux/module.h>
				16	#include <linux/slab.h>
				17	#include <linux/vmalloc.h>
				18
				19	#define DM_MSG_PREFIX "cache"
				20
				21	DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
				22	"A percentage of time allocated for copying to and/or from cache");
				23
				24	/*----------------------------------------------------------------*/
				25
				26	/*
				27	* Glossary:
				28	*
				29	* oblock: index of an origin block
				30	* cblock: index of a cache block
				31	* promotion: movement of a block from origin to cache
				32	* demotion: movement of a block from cache to origin
				33	* migration: movement of a block between the origin and cache device,
				34	* either direction
				35	*/
				36
				37	/*----------------------------------------------------------------*/
				38
				39	static size_t bitset_size_in_bytes(unsigned nr_entries)
				40	{
				41	return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
				42	}
				43
				44	static unsigned long *alloc_bitset(unsigned nr_entries)
				45	{
				46	size_t s = bitset_size_in_bytes(nr_entries);
				47	return vzalloc(s);
				48	}
				49
				50	static void clear_bitset(void *bitset, unsigned nr_entries)
				51	{
				52	size_t s = bitset_size_in_bytes(nr_entries);
				53	memset(bitset, 0, s);
				54	}
				55
				56	static void free_bitset(unsigned long *bits)
				57	{
				58	vfree(bits);
				59	}
				60
				61	/*----------------------------------------------------------------*/
				62
				63	#define PRISON_CELLS 1024
				64	#define MIGRATION_POOL_SIZE 128
				65	#define COMMIT_PERIOD HZ
				66	#define MIGRATION_COUNT_WINDOW 10
				67
				68	/*
				69	* The block size of the device holding cache data must be >= 32KB
				70	*/
				71	#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
				72
				73	/*
				74	* FIXME: the cache is read/write for the time being.
				75	*/
				76	enum cache_mode {
				77	CM_WRITE, /* metadata may be changed */
				78	CM_READ_ONLY, /* metadata may not be changed */
				79	};
				80
				81	struct cache_features {
				82	enum cache_mode mode;
				83	bool write_through:1;
				84	};
				85
				86	struct cache_stats {
				87	atomic_t read_hit;
				88	atomic_t read_miss;
				89	atomic_t write_hit;
				90	atomic_t write_miss;
				91	atomic_t demotion;
				92	atomic_t promotion;
				93	atomic_t copies_avoided;
				94	atomic_t cache_cell_clash;
				95	atomic_t commit_count;
				96	atomic_t discard_count;
				97	};
				98
				99	struct cache {
				100	struct dm_target *ti;
				101	struct dm_target_callbacks callbacks;
				102
				103	/*
				104	* Metadata is written to this device.
				105	*/
				106	struct dm_dev *metadata_dev;
				107
				108	/*
				109	* The slower of the two data devices. Typically a spindle.
				110	*/
				111	struct dm_dev *origin_dev;
				112
				113	/*
				114	* The faster of the two data devices. Typically an SSD.
				115	*/
				116	struct dm_dev *cache_dev;
				117
				118	/*
				119	* Cache features such as write-through.
				120	*/
				121	struct cache_features features;
				122
				123	/*
				124	* Size of the origin device in _complete_ blocks and native sectors.
				125	*/
				126	dm_oblock_t origin_blocks;
				127	sector_t origin_sectors;
				128
				129	/*
				130	* Size of the cache device in blocks.
				131	*/
				132	dm_cblock_t cache_size;
				133
				134	/*
				135	* Fields for converting from sectors to blocks.
				136	*/
				137	uint32_t sectors_per_block;
				138	int sectors_per_block_shift;
				139
				140	struct dm_cache_metadata *cmd;
				141
				142	spinlock_t lock;
				143	struct bio_list deferred_bios;
				144	struct bio_list deferred_flush_bios;
				145	struct list_head quiesced_migrations;
				146	struct list_head completed_migrations;
				147	struct list_head need_commit_migrations;
				148	sector_t migration_threshold;
				149	atomic_t nr_migrations;
				150	wait_queue_head_t migration_wait;
				151
				152	/*
				153	* cache_size entries, dirty if set
				154	*/
				155	dm_cblock_t nr_dirty;
				156	unsigned long *dirty_bitset;
				157
				158	/*
				159	* origin_blocks entries, discarded if set.
				160	*/
Joe Thornber	414dd67	2013-03-20 17:21:25 +0000	[diff] [blame^]	161	uint32_t discard_block_size; /* a power of 2 times sectors per block */
Joe Thornber	c6b4fcb	2013-03-01 22:45:51 +0000	[diff] [blame]	162	dm_dblock_t discard_nr_blocks;
				163	unsigned long *discard_bitset;
				164
				165	struct dm_kcopyd_client *copier;
				166	struct workqueue_struct *wq;
				167	struct work_struct worker;
				168
				169	struct delayed_work waker;
				170	unsigned long last_commit_jiffies;
				171
				172	struct dm_bio_prison *prison;
				173	struct dm_deferred_set *all_io_ds;
				174
				175	mempool_t *migration_pool;
				176	struct dm_cache_migration *next_migration;
				177
				178	struct dm_cache_policy *policy;
				179	unsigned policy_nr_args;
				180
				181	bool need_tick_bio:1;
				182	bool sized:1;
				183	bool quiescing:1;
				184	bool commit_requested:1;
				185	bool loaded_mappings:1;
				186	bool loaded_discards:1;
				187
				188	struct cache_stats stats;
				189
				190	/*
				191	* Rather than reconstructing the table line for the status we just
				192	* save it and regurgitate.
				193	*/
				194	unsigned nr_ctr_args;
				195	const char **ctr_args;
				196	};
				197
				198	struct per_bio_data {
				199	bool tick:1;
				200	unsigned req_nr:2;
				201	struct dm_deferred_entry *all_io_entry;
				202	};
				203
				204	struct dm_cache_migration {
				205	struct list_head list;
				206	struct cache *cache;
				207
				208	unsigned long start_jiffies;
				209	dm_oblock_t old_oblock;
				210	dm_oblock_t new_oblock;
				211	dm_cblock_t cblock;
				212
				213	bool err:1;
				214	bool writeback:1;
				215	bool demote:1;
				216	bool promote:1;
				217
				218	struct dm_bio_prison_cell *old_ocell;
				219	struct dm_bio_prison_cell *new_ocell;
				220	};
				221
				222	/*
				223	* Processing a bio in the worker thread may require these memory
				224	* allocations. We prealloc to avoid deadlocks (the same worker thread
				225	* frees them back to the mempool).
				226	*/
				227	struct prealloc {
				228	struct dm_cache_migration *mg;
				229	struct dm_bio_prison_cell *cell1;
				230	struct dm_bio_prison_cell *cell2;
				231	};
				232
				233	static void wake_worker(struct cache *cache)
				234	{
				235	queue_work(cache->wq, &cache->worker);
				236	}
				237
				238	/*----------------------------------------------------------------*/
				239
				240	static struct dm_bio_prison_cell alloc_prison_cell(struct cache cache)
				241	{
				242	/* FIXME: change to use a local slab. */
				243	return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
				244	}
				245
				246	static void free_prison_cell(struct cache cache, struct dm_bio_prison_cell cell)
				247	{
				248	dm_bio_prison_free_cell(cache->prison, cell);
				249	}
				250
				251	static int prealloc_data_structs(struct cache cache, struct prealloc p)
				252	{
				253	if (!p->mg) {
				254	p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
				255	if (!p->mg)
				256	return -ENOMEM;
				257	}
				258
				259	if (!p->cell1) {
				260	p->cell1 = alloc_prison_cell(cache);
				261	if (!p->cell1)
				262	return -ENOMEM;
				263	}
				264
				265	if (!p->cell2) {
				266	p->cell2 = alloc_prison_cell(cache);
				267	if (!p->cell2)
				268	return -ENOMEM;
				269	}
				270
				271	return 0;
				272	}
				273
				274	static void prealloc_free_structs(struct cache cache, struct prealloc p)
				275	{
				276	if (p->cell2)
				277	free_prison_cell(cache, p->cell2);
				278
				279	if (p->cell1)
				280	free_prison_cell(cache, p->cell1);
				281
				282	if (p->mg)
				283	mempool_free(p->mg, cache->migration_pool);
				284	}
				285
				286	static struct dm_cache_migration prealloc_get_migration(struct prealloc p)
				287	{
				288	struct dm_cache_migration *mg = p->mg;
				289
				290	BUG_ON(!mg);
				291	p->mg = NULL;
				292
				293	return mg;
				294	}
				295
				296	/*
				297	* You must have a cell within the prealloc struct to return. If not this
				298	* function will BUG() rather than returning NULL.
				299	*/
				300	static struct dm_bio_prison_cell prealloc_get_cell(struct prealloc p)
				301	{
				302	struct dm_bio_prison_cell *r = NULL;
				303
				304	if (p->cell1) {
				305	r = p->cell1;
				306	p->cell1 = NULL;
				307
				308	} else if (p->cell2) {
				309	r = p->cell2;
				310	p->cell2 = NULL;
				311	} else
				312	BUG();
				313
				314	return r;
				315	}
				316
				317	/*
				318	* You can't have more than two cells in a prealloc struct. BUG() will be
				319	* called if you try and overfill.
				320	*/
				321	static void prealloc_put_cell(struct prealloc p, struct dm_bio_prison_cell cell)
				322	{
				323	if (!p->cell2)
				324	p->cell2 = cell;
				325
				326	else if (!p->cell1)
				327	p->cell1 = cell;
				328
				329	else
				330	BUG();
				331	}
				332
				333	/*----------------------------------------------------------------*/
				334
				335	static void build_key(dm_oblock_t oblock, struct dm_cell_key *key)
				336	{
				337	key->virtual = 0;
				338	key->dev = 0;
				339	key->block = from_oblock(oblock);
				340	}
				341
				342	/*
				343	* The caller hands in a preallocated cell, and a free function for it.
				344	* The cell will be freed if there's an error, or if it wasn't used because
				345	* a cell with that key already exists.
				346	*/
				347	typedef void (cell_free_fn)(void context, struct dm_bio_prison_cell *cell);
				348
				349	static int bio_detain(struct cache *cache, dm_oblock_t oblock,
				350	struct bio bio, struct dm_bio_prison_cell cell_prealloc,
				351	cell_free_fn free_fn, void *free_context,
				352	struct dm_bio_prison_cell **cell_result)
				353	{
				354	int r;
				355	struct dm_cell_key key;
				356
				357	build_key(oblock, &key);
				358	r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
				359	if (r)
				360	free_fn(free_context, cell_prealloc);
				361
				362	return r;
				363	}
				364
				365	static int get_cell(struct cache *cache,
				366	dm_oblock_t oblock,
				367	struct prealloc *structs,
				368	struct dm_bio_prison_cell **cell_result)
				369	{
				370	int r;
				371	struct dm_cell_key key;
				372	struct dm_bio_prison_cell *cell_prealloc;
				373
				374	cell_prealloc = prealloc_get_cell(structs);
				375
				376	build_key(oblock, &key);
				377	r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
				378	if (r)
				379	prealloc_put_cell(structs, cell_prealloc);
				380
				381	return r;
				382	}
				383
				384	/*----------------------------------------------------------------*/
				385
				386	static bool is_dirty(struct cache *cache, dm_cblock_t b)
				387	{
				388	return test_bit(from_cblock(b), cache->dirty_bitset);
				389	}
				390
				391	static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
				392	{
				393	if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
				394	cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) + 1);
				395	policy_set_dirty(cache->policy, oblock);
				396	}
				397	}
				398
				399	static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
				400	{
				401	if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
				402	policy_clear_dirty(cache->policy, oblock);
				403	cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) - 1);
				404	if (!from_cblock(cache->nr_dirty))
				405	dm_table_event(cache->ti->table);
				406	}
				407	}
				408
				409	/*----------------------------------------------------------------*/
				410	static bool block_size_is_power_of_two(struct cache *cache)
				411	{
				412	return cache->sectors_per_block_shift >= 0;
				413	}
				414
Joe Thornber	414dd67	2013-03-20 17:21:25 +0000	[diff] [blame^]	415	static dm_block_t block_div(dm_block_t b, uint32_t n)
				416	{
				417	do_div(b, n);
				418
				419	return b;
				420	}
				421
Joe Thornber	c6b4fcb	2013-03-01 22:45:51 +0000	[diff] [blame]	422	static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
				423	{
Joe Thornber	414dd67	2013-03-20 17:21:25 +0000	[diff] [blame^]	424	uint32_t discard_blocks = cache->discard_block_size;
Joe Thornber	c6b4fcb	2013-03-01 22:45:51 +0000	[diff] [blame]	425	dm_block_t b = from_oblock(oblock);
				426
				427	if (!block_size_is_power_of_two(cache))
Joe Thornber	414dd67	2013-03-20 17:21:25 +0000	[diff] [blame^]	428	discard_blocks = discard_blocks / cache->sectors_per_block;
Joe Thornber	c6b4fcb	2013-03-01 22:45:51 +0000	[diff] [blame]	429	else
				430	discard_blocks >>= cache->sectors_per_block_shift;
				431
Joe Thornber	414dd67	2013-03-20 17:21:25 +0000	[diff] [blame^]	432	b = block_div(b, discard_blocks);
Joe Thornber	c6b4fcb	2013-03-01 22:45:51 +0000	[diff] [blame]	433
				434	return to_dblock(b);
				435	}
				436
				437	static void set_discard(struct cache *cache, dm_dblock_t b)
				438	{
				439	unsigned long flags;
				440
				441	atomic_inc(&cache->stats.discard_count);
				442
				443	spin_lock_irqsave(&cache->lock, flags);
				444	set_bit(from_dblock(b), cache->discard_bitset);
				445	spin_unlock_irqrestore(&cache->lock, flags);
				446	}
				447
				448	static void clear_discard(struct cache *cache, dm_dblock_t b)
				449	{
				450	unsigned long flags;
				451
				452	spin_lock_irqsave(&cache->lock, flags);
				453	clear_bit(from_dblock(b), cache->discard_bitset);
				454	spin_unlock_irqrestore(&cache->lock, flags);
				455	}
				456
				457	static bool is_discarded(struct cache *cache, dm_dblock_t b)
				458	{
				459	int r;
				460	unsigned long flags;
				461
				462	spin_lock_irqsave(&cache->lock, flags);
				463	r = test_bit(from_dblock(b), cache->discard_bitset);
				464	spin_unlock_irqrestore(&cache->lock, flags);
				465
				466	return r;
				467	}
				468
				469	static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
				470	{
				471	int r;
				472	unsigned long flags;
				473
				474	spin_lock_irqsave(&cache->lock, flags);
				475	r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
				476	cache->discard_bitset);
				477	spin_unlock_irqrestore(&cache->lock, flags);
				478
				479	return r;
				480	}
				481
				482	/*----------------------------------------------------------------*/
				483
				484	static void load_stats(struct cache *cache)
				485	{
				486	struct dm_cache_statistics stats;
				487
				488	dm_cache_metadata_get_stats(cache->cmd, &stats);
				489	atomic_set(&cache->stats.read_hit, stats.read_hits);
				490	atomic_set(&cache->stats.read_miss, stats.read_misses);
				491	atomic_set(&cache->stats.write_hit, stats.write_hits);
				492	atomic_set(&cache->stats.write_miss, stats.write_misses);
				493	}
				494
				495	static void save_stats(struct cache *cache)
				496	{
				497	struct dm_cache_statistics stats;
				498
				499	stats.read_hits = atomic_read(&cache->stats.read_hit);
				500	stats.read_misses = atomic_read(&cache->stats.read_miss);
				501	stats.write_hits = atomic_read(&cache->stats.write_hit);
				502	stats.write_misses = atomic_read(&cache->stats.write_miss);
				503
				504	dm_cache_metadata_set_stats(cache->cmd, &stats);
				505	}
				506
				507	/*----------------------------------------------------------------
				508	 * Per bio data
				509	 *--------------------------------------------------------------*/
				510	static struct per_bio_data get_per_bio_data(struct bio bio)
				511	{
				512	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
				513	BUG_ON(!pb);
				514	return pb;
				515	}
				516
				517	static struct per_bio_data init_per_bio_data(struct bio bio)
				518	{
				519	struct per_bio_data *pb = get_per_bio_data(bio);
				520
				521	pb->tick = false;
				522	pb->req_nr = dm_bio_get_target_bio_nr(bio);
				523	pb->all_io_entry = NULL;
				524
				525	return pb;
				526	}
				527
				528	/*----------------------------------------------------------------
				529	 * Remapping
				530	 *--------------------------------------------------------------*/
				531	static void remap_to_origin(struct cache cache, struct bio bio)
				532	{
				533	bio->bi_bdev = cache->origin_dev->bdev;
				534	}
				535
				536	static void remap_to_cache(struct cache cache, struct bio bio,
				537	dm_cblock_t cblock)
				538	{
				539	sector_t bi_sector = bio->bi_sector;
				540
				541	bio->bi_bdev = cache->cache_dev->bdev;
				542	if (!block_size_is_power_of_two(cache))
				543	bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) +
				544	sector_div(bi_sector, cache->sectors_per_block);
				545	else
				546	bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) \|
				547	(bi_sector & (cache->sectors_per_block - 1));
				548	}
				549
				550	static void check_if_tick_bio_needed(struct cache cache, struct bio bio)
				551	{
				552	unsigned long flags;
				553	struct per_bio_data *pb = get_per_bio_data(bio);
				554
				555	spin_lock_irqsave(&cache->lock, flags);
				556	if (cache->need_tick_bio &&
				557	!(bio->bi_rw & (REQ_FUA \| REQ_FLUSH \| REQ_DISCARD))) {
				558	pb->tick = true;
				559	cache->need_tick_bio = false;
				560	}
				561	spin_unlock_irqrestore(&cache->lock, flags);
				562	}
				563
				564	static void remap_to_origin_clear_discard(struct cache cache, struct bio bio,
				565	dm_oblock_t oblock)
				566	{
				567	check_if_tick_bio_needed(cache, bio);
				568	remap_to_origin(cache, bio);
				569	if (bio_data_dir(bio) == WRITE)
				570	clear_discard(cache, oblock_to_dblock(cache, oblock));
				571	}
				572
				573	static void remap_to_cache_dirty(struct cache cache, struct bio bio,
				574	dm_oblock_t oblock, dm_cblock_t cblock)
				575	{
				576	remap_to_cache(cache, bio, cblock);
				577	if (bio_data_dir(bio) == WRITE) {
				578	set_dirty(cache, oblock, cblock);
				579	clear_discard(cache, oblock_to_dblock(cache, oblock));
				580	}
				581	}
				582
				583	static dm_oblock_t get_bio_block(struct cache cache, struct bio bio)
				584	{
				585	sector_t block_nr = bio->bi_sector;
				586
				587	if (!block_size_is_power_of_two(cache))
				588	(void) sector_div(block_nr, cache->sectors_per_block);
				589	else
				590	block_nr >>= cache->sectors_per_block_shift;
				591
				592	return to_oblock(block_nr);
				593	}
				594
				595	static int bio_triggers_commit(struct cache cache, struct bio bio)
				596	{
				597	return bio->bi_rw & (REQ_FLUSH \| REQ_FUA);
				598	}
				599
				600	static void issue(struct cache cache, struct bio bio)
				601	{
				602	unsigned long flags;
				603
				604	if (!bio_triggers_commit(cache, bio)) {
				605	generic_make_request(bio);
				606	return;
				607	}
				608
				609	/*
				610	* Batch together any bios that trigger commits and then issue a
				611	* single commit for them in do_worker().
				612	*/
				613	spin_lock_irqsave(&cache->lock, flags);
				614	cache->commit_requested = true;
				615	bio_list_add(&cache->deferred_flush_bios, bio);
				616	spin_unlock_irqrestore(&cache->lock, flags);
				617	}
				618
				619	/*----------------------------------------------------------------
				620	 * Migration processing
				621	 *
				622	 * Migration covers moving data from the origin device to the cache, or
				623	 * vice versa.
				624	 *--------------------------------------------------------------*/
				625	static void free_migration(struct dm_cache_migration *mg)
				626	{
				627	mempool_free(mg, mg->cache->migration_pool);
				628	}
				629
				630	static void inc_nr_migrations(struct cache *cache)
				631	{
				632	atomic_inc(&cache->nr_migrations);
				633	}
				634
				635	static void dec_nr_migrations(struct cache *cache)
				636	{
				637	atomic_dec(&cache->nr_migrations);
				638
				639	/*
				640	* Wake the worker in case we're suspending the target.
				641	*/
				642	wake_up(&cache->migration_wait);
				643	}
				644
				645	static void __cell_defer(struct cache cache, struct dm_bio_prison_cell cell,
				646	bool holder)
				647	{
				648	(holder ? dm_cell_release : dm_cell_release_no_holder)
				649	(cache->prison, cell, &cache->deferred_bios);
				650	free_prison_cell(cache, cell);
				651	}
				652
				653	static void cell_defer(struct cache cache, struct dm_bio_prison_cell cell,
				654	bool holder)
				655	{
				656	unsigned long flags;
				657
				658	spin_lock_irqsave(&cache->lock, flags);
				659	__cell_defer(cache, cell, holder);
				660	spin_unlock_irqrestore(&cache->lock, flags);
				661
				662	wake_worker(cache);
				663	}
				664
				665	static void cleanup_migration(struct dm_cache_migration *mg)
				666	{
				667	dec_nr_migrations(mg->cache);
				668	free_migration(mg);
				669	}
				670
				671	static void migration_failure(struct dm_cache_migration *mg)
				672	{
				673	struct cache *cache = mg->cache;
				674
				675	if (mg->writeback) {
				676	DMWARN_LIMIT("writeback failed; couldn't copy block");
				677	set_dirty(cache, mg->old_oblock, mg->cblock);
				678	cell_defer(cache, mg->old_ocell, false);
				679
				680	} else if (mg->demote) {
				681	DMWARN_LIMIT("demotion failed; couldn't copy block");
				682	policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
				683
				684	cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
				685	if (mg->promote)
				686	cell_defer(cache, mg->new_ocell, 1);
				687	} else {
				688	DMWARN_LIMIT("promotion failed; couldn't copy block");
				689	policy_remove_mapping(cache->policy, mg->new_oblock);
				690	cell_defer(cache, mg->new_ocell, 1);
				691	}
				692
				693	cleanup_migration(mg);
				694	}
				695
				696	static void migration_success_pre_commit(struct dm_cache_migration *mg)
				697	{
				698	unsigned long flags;
				699	struct cache *cache = mg->cache;
				700
				701	if (mg->writeback) {
				702	cell_defer(cache, mg->old_ocell, false);
				703	clear_dirty(cache, mg->old_oblock, mg->cblock);
				704	cleanup_migration(mg);
				705	return;
				706
				707	} else if (mg->demote) {
				708	if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
				709	DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
				710	policy_force_mapping(cache->policy, mg->new_oblock,
				711	mg->old_oblock);
				712	if (mg->promote)
				713	cell_defer(cache, mg->new_ocell, true);
				714	cleanup_migration(mg);
				715	return;
				716	}
				717	} else {
				718	if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
				719	DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
				720	policy_remove_mapping(cache->policy, mg->new_oblock);
				721	cleanup_migration(mg);
				722	return;
				723	}
				724	}
				725
				726	spin_lock_irqsave(&cache->lock, flags);
				727	list_add_tail(&mg->list, &cache->need_commit_migrations);
				728	cache->commit_requested = true;
				729	spin_unlock_irqrestore(&cache->lock, flags);
				730	}
				731
				732	static void migration_success_post_commit(struct dm_cache_migration *mg)
				733	{
				734	unsigned long flags;
				735	struct cache *cache = mg->cache;
				736
				737	if (mg->writeback) {
				738	DMWARN("writeback unexpectedly triggered commit");
				739	return;
				740
				741	} else if (mg->demote) {
				742	cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
				743
				744	if (mg->promote) {
				745	mg->demote = false;
				746
				747	spin_lock_irqsave(&cache->lock, flags);
				748	list_add_tail(&mg->list, &cache->quiesced_migrations);
				749	spin_unlock_irqrestore(&cache->lock, flags);
				750
				751	} else
				752	cleanup_migration(mg);
				753
				754	} else {
				755	cell_defer(cache, mg->new_ocell, true);
				756	clear_dirty(cache, mg->new_oblock, mg->cblock);
				757	cleanup_migration(mg);
				758	}
				759	}
				760
				761	static void copy_complete(int read_err, unsigned long write_err, void *context)
				762	{
				763	unsigned long flags;
				764	struct dm_cache_migration mg = (struct dm_cache_migration ) context;
				765	struct cache *cache = mg->cache;
				766
				767	if (read_err \|\| write_err)
				768	mg->err = true;
				769
				770	spin_lock_irqsave(&cache->lock, flags);
				771	list_add_tail(&mg->list, &cache->completed_migrations);
				772	spin_unlock_irqrestore(&cache->lock, flags);
				773
				774	wake_worker(cache);
				775	}
				776
				777	static void issue_copy_real(struct dm_cache_migration *mg)
				778	{
				779	int r;
				780	struct dm_io_region o_region, c_region;
				781	struct cache *cache = mg->cache;
				782
				783	o_region.bdev = cache->origin_dev->bdev;
				784	o_region.count = cache->sectors_per_block;
				785
				786	c_region.bdev = cache->cache_dev->bdev;
				787	c_region.sector = from_cblock(mg->cblock) * cache->sectors_per_block;
				788	c_region.count = cache->sectors_per_block;
				789
				790	if (mg->writeback \|\| mg->demote) {
				791	/* demote */
				792	o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
				793	r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
				794	} else {
				795	/* promote */
				796	o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
				797	r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
				798	}
				799
				800	if (r < 0)
				801	migration_failure(mg);
				802	}
				803
				804	static void avoid_copy(struct dm_cache_migration *mg)
				805	{
				806	atomic_inc(&mg->cache->stats.copies_avoided);
				807	migration_success_pre_commit(mg);
				808	}
				809
				810	static void issue_copy(struct dm_cache_migration *mg)
				811	{
				812	bool avoid;
				813	struct cache *cache = mg->cache;
				814
				815	if (mg->writeback \|\| mg->demote)
				816	avoid = !is_dirty(cache, mg->cblock) \|\|
				817	is_discarded_oblock(cache, mg->old_oblock);
				818	else
				819	avoid = is_discarded_oblock(cache, mg->new_oblock);
				820
				821	avoid ? avoid_copy(mg) : issue_copy_real(mg);
				822	}
				823
				824	static void complete_migration(struct dm_cache_migration *mg)
				825	{
				826	if (mg->err)
				827	migration_failure(mg);
				828	else
				829	migration_success_pre_commit(mg);
				830	}
				831
				832	static void process_migrations(struct cache cache, struct list_head head,
				833	void (fn)(struct dm_cache_migration ))
				834	{
				835	unsigned long flags;
				836	struct list_head list;
				837	struct dm_cache_migration mg, tmp;
				838
				839	INIT_LIST_HEAD(&list);
				840	spin_lock_irqsave(&cache->lock, flags);
				841	list_splice_init(head, &list);
				842	spin_unlock_irqrestore(&cache->lock, flags);
				843
				844	list_for_each_entry_safe(mg, tmp, &list, list)
				845	fn(mg);
				846	}
				847
				848	static void __queue_quiesced_migration(struct dm_cache_migration *mg)
				849	{
				850	list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
				851	}
				852
				853	static void queue_quiesced_migration(struct dm_cache_migration *mg)
				854	{
				855	unsigned long flags;
				856	struct cache *cache = mg->cache;
				857
				858	spin_lock_irqsave(&cache->lock, flags);
				859	__queue_quiesced_migration(mg);
				860	spin_unlock_irqrestore(&cache->lock, flags);
				861
				862	wake_worker(cache);
				863	}
				864
				865	static void queue_quiesced_migrations(struct cache cache, struct list_head work)
				866	{
				867	unsigned long flags;
				868	struct dm_cache_migration mg, tmp;
				869
				870	spin_lock_irqsave(&cache->lock, flags);
				871	list_for_each_entry_safe(mg, tmp, work, list)
				872	__queue_quiesced_migration(mg);
				873	spin_unlock_irqrestore(&cache->lock, flags);
				874
				875	wake_worker(cache);
				876	}
				877
				878	static void check_for_quiesced_migrations(struct cache *cache,
				879	struct per_bio_data *pb)
				880	{
				881	struct list_head work;
				882
				883	if (!pb->all_io_entry)
				884	return;
				885
				886	INIT_LIST_HEAD(&work);
				887	if (pb->all_io_entry)
				888	dm_deferred_entry_dec(pb->all_io_entry, &work);
				889
				890	if (!list_empty(&work))
				891	queue_quiesced_migrations(cache, &work);
				892	}
				893
				894	static void quiesce_migration(struct dm_cache_migration *mg)
				895	{
				896	if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
				897	queue_quiesced_migration(mg);
				898	}
				899
				900	static void promote(struct cache cache, struct prealloc structs,
				901	dm_oblock_t oblock, dm_cblock_t cblock,
				902	struct dm_bio_prison_cell *cell)
				903	{
				904	struct dm_cache_migration *mg = prealloc_get_migration(structs);
				905
				906	mg->err = false;
				907	mg->writeback = false;
				908	mg->demote = false;
				909	mg->promote = true;
				910	mg->cache = cache;
				911	mg->new_oblock = oblock;
				912	mg->cblock = cblock;
				913	mg->old_ocell = NULL;
				914	mg->new_ocell = cell;
				915	mg->start_jiffies = jiffies;
				916
				917	inc_nr_migrations(cache);
				918	quiesce_migration(mg);
				919	}
				920
				921	static void writeback(struct cache cache, struct prealloc structs,
				922	dm_oblock_t oblock, dm_cblock_t cblock,
				923	struct dm_bio_prison_cell *cell)
				924	{
				925	struct dm_cache_migration *mg = prealloc_get_migration(structs);
				926
				927	mg->err = false;
				928	mg->writeback = true;
				929	mg->demote = false;
				930	mg->promote = false;
				931	mg->cache = cache;
				932	mg->old_oblock = oblock;
				933	mg->cblock = cblock;
				934	mg->old_ocell = cell;
				935	mg->new_ocell = NULL;
				936	mg->start_jiffies = jiffies;
				937
				938	inc_nr_migrations(cache);
				939	quiesce_migration(mg);
				940	}
				941
				942	static void demote_then_promote(struct cache cache, struct prealloc structs,
				943	dm_oblock_t old_oblock, dm_oblock_t new_oblock,
				944	dm_cblock_t cblock,
				945	struct dm_bio_prison_cell *old_ocell,
				946	struct dm_bio_prison_cell *new_ocell)
				947	{
				948	struct dm_cache_migration *mg = prealloc_get_migration(structs);
				949
				950	mg->err = false;
				951	mg->writeback = false;
				952	mg->demote = true;
				953	mg->promote = true;
				954	mg->cache = cache;
				955	mg->old_oblock = old_oblock;
				956	mg->new_oblock = new_oblock;
				957	mg->cblock = cblock;
				958	mg->old_ocell = old_ocell;
				959	mg->new_ocell = new_ocell;
				960	mg->start_jiffies = jiffies;
				961
				962	inc_nr_migrations(cache);
				963	quiesce_migration(mg);
				964	}
				965
				966	/*----------------------------------------------------------------
				967	 * bio processing
				968	 *--------------------------------------------------------------*/
				969	static void defer_bio(struct cache cache, struct bio bio)
				970	{
				971	unsigned long flags;
				972
				973	spin_lock_irqsave(&cache->lock, flags);
				974	bio_list_add(&cache->deferred_bios, bio);
				975	spin_unlock_irqrestore(&cache->lock, flags);
				976
				977	wake_worker(cache);
				978	}
				979
				980	static void process_flush_bio(struct cache cache, struct bio bio)
				981	{
				982	struct per_bio_data *pb = get_per_bio_data(bio);
				983
				984	BUG_ON(bio->bi_size);
				985	if (!pb->req_nr)
				986	remap_to_origin(cache, bio);
				987	else
				988	remap_to_cache(cache, bio, 0);
				989
				990	issue(cache, bio);
				991	}
				992
				993	/*
				994	* People generally discard large parts of a device, eg, the whole device
				995	* when formatting. Splitting these large discards up into cache block
				996	* sized ios and then quiescing (always neccessary for discard) takes too
				997	* long.
				998	*
				999	* We keep it simple, and allow any size of discard to come in, and just
				1000	* mark off blocks on the discard bitset. No passdown occurs!
				1001	*
				1002	* To implement passdown we need to change the bio_prison such that a cell
				1003	* can have a key that spans many blocks.
				1004	*/
				1005	static void process_discard_bio(struct cache cache, struct bio bio)
				1006	{
				1007	dm_block_t start_block = dm_sector_div_up(bio->bi_sector,
				1008	cache->discard_block_size);
				1009	dm_block_t end_block = bio->bi_sector + bio_sectors(bio);
				1010	dm_block_t b;
				1011
Joe Thornber	414dd67	2013-03-20 17:21:25 +0000	[diff] [blame^]	1012	end_block = block_div(end_block, cache->discard_block_size);
Joe Thornber	c6b4fcb	2013-03-01 22:45:51 +0000	[diff] [blame]	1013
				1014	for (b = start_block; b < end_block; b++)
				1015	set_discard(cache, to_dblock(b));
				1016
				1017	bio_endio(bio, 0);
				1018	}
				1019
				1020	static bool spare_migration_bandwidth(struct cache *cache)
				1021	{
				1022	sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) *
				1023	cache->sectors_per_block;
				1024	return current_volume < cache->migration_threshold;
				1025	}
				1026
				1027	static bool is_writethrough_io(struct cache cache, struct bio bio,
				1028	dm_cblock_t cblock)
				1029	{
				1030	return bio_data_dir(bio) == WRITE &&
				1031	cache->features.write_through && !is_dirty(cache, cblock);
				1032	}
				1033
				1034	static void inc_hit_counter(struct cache cache, struct bio bio)
				1035	{
				1036	atomic_inc(bio_data_dir(bio) == READ ?
				1037	&cache->stats.read_hit : &cache->stats.write_hit);
				1038	}
				1039
				1040	static void inc_miss_counter(struct cache cache, struct bio bio)
				1041	{
				1042	atomic_inc(bio_data_dir(bio) == READ ?
				1043	&cache->stats.read_miss : &cache->stats.write_miss);
				1044	}
				1045
				1046	static void process_bio(struct cache cache, struct prealloc structs,
				1047	struct bio *bio)
				1048	{
				1049	int r;
				1050	bool release_cell = true;
				1051	dm_oblock_t block = get_bio_block(cache, bio);
				1052	struct dm_bio_prison_cell cell_prealloc, old_ocell, *new_ocell;
				1053	struct policy_result lookup_result;
				1054	struct per_bio_data *pb = get_per_bio_data(bio);
				1055	bool discarded_block = is_discarded_oblock(cache, block);
				1056	bool can_migrate = discarded_block \|\| spare_migration_bandwidth(cache);
				1057
				1058	/*
				1059	* Check to see if that block is currently migrating.
				1060	*/
				1061	cell_prealloc = prealloc_get_cell(structs);
				1062	r = bio_detain(cache, block, bio, cell_prealloc,
				1063	(cell_free_fn) prealloc_put_cell,
				1064	structs, &new_ocell);
				1065	if (r > 0)
				1066	return;
				1067
				1068	r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
				1069	bio, &lookup_result);
				1070
				1071	if (r == -EWOULDBLOCK)
				1072	/* migration has been denied */
				1073	lookup_result.op = POLICY_MISS;
				1074
				1075	switch (lookup_result.op) {
				1076	case POLICY_HIT:
				1077	inc_hit_counter(cache, bio);
				1078	pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
				1079
				1080	if (is_writethrough_io(cache, bio, lookup_result.cblock)) {
				1081	/*
				1082	* No need to mark anything dirty in write through mode.
				1083	*/
				1084	pb->req_nr == 0 ?
				1085	remap_to_cache(cache, bio, lookup_result.cblock) :
				1086	remap_to_origin_clear_discard(cache, bio, block);
				1087	} else
				1088	remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
				1089
				1090	issue(cache, bio);
				1091	break;
				1092
				1093	case POLICY_MISS:
				1094	inc_miss_counter(cache, bio);
				1095	pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
				1096
				1097	if (pb->req_nr != 0) {
				1098	/*
				1099	* This is a duplicate writethrough io that is no
				1100	* longer needed because the block has been demoted.
				1101	*/
				1102	bio_endio(bio, 0);
				1103	} else {
				1104	remap_to_origin_clear_discard(cache, bio, block);
				1105	issue(cache, bio);
				1106	}
				1107	break;
				1108
				1109	case POLICY_NEW:
				1110	atomic_inc(&cache->stats.promotion);
				1111	promote(cache, structs, block, lookup_result.cblock, new_ocell);
				1112	release_cell = false;
				1113	break;
				1114
				1115	case POLICY_REPLACE:
				1116	cell_prealloc = prealloc_get_cell(structs);
				1117	r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
				1118	(cell_free_fn) prealloc_put_cell,
				1119	structs, &old_ocell);
				1120	if (r > 0) {
				1121	/*
				1122	* We have to be careful to avoid lock inversion of
				1123	* the cells. So we back off, and wait for the
				1124	* old_ocell to become free.
				1125	*/
				1126	policy_force_mapping(cache->policy, block,
				1127	lookup_result.old_oblock);
				1128	atomic_inc(&cache->stats.cache_cell_clash);
				1129	break;
				1130	}
				1131	atomic_inc(&cache->stats.demotion);
				1132	atomic_inc(&cache->stats.promotion);
				1133
				1134	demote_then_promote(cache, structs, lookup_result.old_oblock,
				1135	block, lookup_result.cblock,
				1136	old_ocell, new_ocell);
				1137	release_cell = false;
				1138	break;
				1139
				1140	default:
				1141	DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
				1142	(unsigned) lookup_result.op);
				1143	bio_io_error(bio);
				1144	}
				1145
				1146	if (release_cell)
				1147	cell_defer(cache, new_ocell, false);
				1148	}
				1149
				1150	static int need_commit_due_to_time(struct cache *cache)
				1151	{
				1152	return jiffies < cache->last_commit_jiffies \|\|
				1153	jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
				1154	}
				1155
				1156	static int commit_if_needed(struct cache *cache)
				1157	{
				1158	if (dm_cache_changed_this_transaction(cache->cmd) &&
				1159	(cache->commit_requested \|\| need_commit_due_to_time(cache))) {
				1160	atomic_inc(&cache->stats.commit_count);
				1161	cache->last_commit_jiffies = jiffies;
				1162	cache->commit_requested = false;
				1163	return dm_cache_commit(cache->cmd, false);
				1164	}
				1165
				1166	return 0;
				1167	}
				1168
				1169	static void process_deferred_bios(struct cache *cache)
				1170	{
				1171	unsigned long flags;
				1172	struct bio_list bios;
				1173	struct bio *bio;
				1174	struct prealloc structs;
				1175
				1176	memset(&structs, 0, sizeof(structs));
				1177	bio_list_init(&bios);
				1178
				1179	spin_lock_irqsave(&cache->lock, flags);
				1180	bio_list_merge(&bios, &cache->deferred_bios);
				1181	bio_list_init(&cache->deferred_bios);
				1182	spin_unlock_irqrestore(&cache->lock, flags);
				1183
				1184	while (!bio_list_empty(&bios)) {
				1185	/*
				1186	* If we've got no free migration structs, and processing
				1187	* this bio might require one, we pause until there are some
				1188	* prepared mappings to process.
				1189	*/
				1190	if (prealloc_data_structs(cache, &structs)) {
				1191	spin_lock_irqsave(&cache->lock, flags);
				1192	bio_list_merge(&cache->deferred_bios, &bios);
				1193	spin_unlock_irqrestore(&cache->lock, flags);
				1194	break;
				1195	}
				1196
				1197	bio = bio_list_pop(&bios);
				1198
				1199	if (bio->bi_rw & REQ_FLUSH)
				1200	process_flush_bio(cache, bio);
				1201	else if (bio->bi_rw & REQ_DISCARD)
				1202	process_discard_bio(cache, bio);
				1203	else
				1204	process_bio(cache, &structs, bio);
				1205	}
				1206
				1207	prealloc_free_structs(cache, &structs);
				1208	}
				1209
				1210	static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
				1211	{
				1212	unsigned long flags;
				1213	struct bio_list bios;
				1214	struct bio *bio;
				1215
				1216	bio_list_init(&bios);
				1217
				1218	spin_lock_irqsave(&cache->lock, flags);
				1219	bio_list_merge(&bios, &cache->deferred_flush_bios);
				1220	bio_list_init(&cache->deferred_flush_bios);
				1221	spin_unlock_irqrestore(&cache->lock, flags);
				1222
				1223	while ((bio = bio_list_pop(&bios)))
				1224	submit_bios ? generic_make_request(bio) : bio_io_error(bio);
				1225	}
				1226
				1227	static void writeback_some_dirty_blocks(struct cache *cache)
				1228	{
				1229	int r = 0;
				1230	dm_oblock_t oblock;
				1231	dm_cblock_t cblock;
				1232	struct prealloc structs;
				1233	struct dm_bio_prison_cell *old_ocell;
				1234
				1235	memset(&structs, 0, sizeof(structs));
				1236
				1237	while (spare_migration_bandwidth(cache)) {
				1238	if (prealloc_data_structs(cache, &structs))
				1239	break;
				1240
				1241	r = policy_writeback_work(cache->policy, &oblock, &cblock);
				1242	if (r)
				1243	break;
				1244
				1245	r = get_cell(cache, oblock, &structs, &old_ocell);
				1246	if (r) {
				1247	policy_set_dirty(cache->policy, oblock);
				1248	break;
				1249	}
				1250
				1251	writeback(cache, &structs, oblock, cblock, old_ocell);
				1252	}
				1253
				1254	prealloc_free_structs(cache, &structs);
				1255	}
				1256
/*----------------------------------------------------------------
 * Main worker loop
 *--------------------------------------------------------------*/
				1260	static void start_quiescing(struct cache *cache)
				1261	{
				1262	unsigned long flags;
				1263
				1264	spin_lock_irqsave(&cache->lock, flags);
				1265	cache->quiescing = 1;
				1266	spin_unlock_irqrestore(&cache->lock, flags);
				1267	}
				1268
				1269	static void stop_quiescing(struct cache *cache)
				1270	{
				1271	unsigned long flags;
				1272
				1273	spin_lock_irqsave(&cache->lock, flags);
				1274	cache->quiescing = 0;
				1275	spin_unlock_irqrestore(&cache->lock, flags);
				1276	}
				1277
				1278	static bool is_quiescing(struct cache *cache)
				1279	{
				1280	int r;
				1281	unsigned long flags;
				1282
				1283	spin_lock_irqsave(&cache->lock, flags);
				1284	r = cache->quiescing;
				1285	spin_unlock_irqrestore(&cache->lock, flags);
				1286
				1287	return r;
				1288	}
				1289
				1290	static void wait_for_migrations(struct cache *cache)
				1291	{
				1292	wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations));
				1293	}
				1294
				1295	static void stop_worker(struct cache *cache)
				1296	{
				1297	cancel_delayed_work(&cache->waker);
				1298	flush_workqueue(cache->wq);
				1299	}
				1300
				1301	static void requeue_deferred_io(struct cache *cache)
				1302	{
				1303	struct bio *bio;
				1304	struct bio_list bios;
				1305
				1306	bio_list_init(&bios);
				1307	bio_list_merge(&bios, &cache->deferred_bios);
				1308	bio_list_init(&cache->deferred_bios);
				1309
				1310	while ((bio = bio_list_pop(&bios)))
				1311	bio_endio(bio, DM_ENDIO_REQUEUE);
				1312	}
				1313
				1314	static int more_work(struct cache *cache)
				1315	{
				1316	if (is_quiescing(cache))
				1317	return !list_empty(&cache->quiesced_migrations) \|\|
				1318	!list_empty(&cache->completed_migrations) \|\|
				1319	!list_empty(&cache->need_commit_migrations);
				1320	else
				1321	return !bio_list_empty(&cache->deferred_bios) \|\|
				1322	!bio_list_empty(&cache->deferred_flush_bios) \|\|
				1323	!list_empty(&cache->quiesced_migrations) \|\|
				1324	!list_empty(&cache->completed_migrations) \|\|
				1325	!list_empty(&cache->need_commit_migrations);
				1326	}
				1327
				1328	static void do_worker(struct work_struct *ws)
				1329	{
				1330	struct cache *cache = container_of(ws, struct cache, worker);
				1331
				1332	do {
				1333	if (!is_quiescing(cache))
				1334	process_deferred_bios(cache);
				1335
				1336	process_migrations(cache, &cache->quiesced_migrations, issue_copy);
				1337	process_migrations(cache, &cache->completed_migrations, complete_migration);
				1338
				1339	writeback_some_dirty_blocks(cache);
				1340
				1341	if (commit_if_needed(cache)) {
				1342	process_deferred_flush_bios(cache, false);
				1343
				1344	/*
				1345	* FIXME: rollback metadata or just go into a
				1346	* failure mode and error everything
				1347	*/
				1348	} else {
				1349	process_deferred_flush_bios(cache, true);
				1350	process_migrations(cache, &cache->need_commit_migrations,
				1351	migration_success_post_commit);
				1352	}
				1353	} while (more_work(cache));
				1354	}
				1355
				1356	/*
				1357	* We want to commit periodically so that not too much
				1358	* unwritten metadata builds up.
				1359	*/
				1360	static void do_waker(struct work_struct *ws)
				1361	{
				1362	struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
				1363	wake_worker(cache);
				1364	queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
				1365	}
				1366
/*----------------------------------------------------------------*/
				1368
				1369	static int is_congested(struct dm_dev *dev, int bdi_bits)
				1370	{
				1371	struct request_queue *q = bdev_get_queue(dev->bdev);
				1372	return bdi_congested(&q->backing_dev_info, bdi_bits);
				1373	}
				1374
				1375	static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
				1376	{
				1377	struct cache *cache = container_of(cb, struct cache, callbacks);
				1378
				1379	return is_congested(cache->origin_dev, bdi_bits) \|\|
				1380	is_congested(cache->cache_dev, bdi_bits);
				1381	}
				1382
/*----------------------------------------------------------------
 * Target methods
 *--------------------------------------------------------------*/
				1386
				1387	/*
				1388	* This function gets called on the error paths of the constructor, so we
				1389	* have to cope with a partially initialised struct.
				1390	*/
				1391	static void destroy(struct cache *cache)
				1392	{
				1393	unsigned i;
				1394
				1395	if (cache->next_migration)
				1396	mempool_free(cache->next_migration, cache->migration_pool);
				1397
				1398	if (cache->migration_pool)
				1399	mempool_destroy(cache->migration_pool);
				1400
				1401	if (cache->all_io_ds)
				1402	dm_deferred_set_destroy(cache->all_io_ds);
				1403
				1404	if (cache->prison)
				1405	dm_bio_prison_destroy(cache->prison);
				1406
				1407	if (cache->wq)
				1408	destroy_workqueue(cache->wq);
				1409
				1410	if (cache->dirty_bitset)
				1411	free_bitset(cache->dirty_bitset);
				1412
				1413	if (cache->discard_bitset)
				1414	free_bitset(cache->discard_bitset);
				1415
				1416	if (cache->copier)
				1417	dm_kcopyd_client_destroy(cache->copier);
				1418
				1419	if (cache->cmd)
				1420	dm_cache_metadata_close(cache->cmd);
				1421
				1422	if (cache->metadata_dev)
				1423	dm_put_device(cache->ti, cache->metadata_dev);
				1424
				1425	if (cache->origin_dev)
				1426	dm_put_device(cache->ti, cache->origin_dev);
				1427
				1428	if (cache->cache_dev)
				1429	dm_put_device(cache->ti, cache->cache_dev);
				1430
				1431	if (cache->policy)
				1432	dm_cache_policy_destroy(cache->policy);
				1433
				1434	for (i = 0; i < cache->nr_ctr_args ; i++)
				1435	kfree(cache->ctr_args[i]);
				1436	kfree(cache->ctr_args);
				1437
				1438	kfree(cache);
				1439	}
				1440
				1441	static void cache_dtr(struct dm_target *ti)
				1442	{
				1443	struct cache *cache = ti->private;
				1444
				1445	destroy(cache);
				1446	}
				1447
				1448	static sector_t get_dev_size(struct dm_dev *dev)
				1449	{
				1450	return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
				1451	}
				1452
/*----------------------------------------------------------------*/
				1454
				1455	/*
				1456	* Construct a cache device mapping.
				1457	*
				1458	* cache <metadata dev> <cache dev> <origin dev> <block size>
				1459	* <#feature args> [<feature arg>]*
				1460	* <policy> <#policy args> [<policy arg>]*
				1461	*
				1462	* metadata dev : fast device holding the persistent metadata
				1463	* cache dev : fast device holding cached data blocks
				1464	* origin dev : slow device holding original data blocks
				1465	* block size : cache unit size in sectors
				1466	*
				1467	* #feature args : number of feature arguments passed
				1468	* feature args : writethrough. (The default is writeback.)
				1469	*
				1470	* policy : the replacement policy to use
				1471	* #policy args : an even number of policy arguments corresponding
				1472	* to key/value pairs passed to the policy
				1473	* policy args : key/value pairs passed to the policy
				1474	* E.g. 'sequential_threshold 1024'
				1475	* See cache-policies.txt for details.
				1476	*
				1477	* Optional feature arguments are:
				1478	* writethrough : write through caching that prohibits cache block
				1479	* content from being different from origin block content.
				1480	* Without this argument, the default behaviour is to write
				1481	* back cache block contents later for performance reasons,
				1482	* so they may differ from the corresponding origin blocks.
				1483	*/
/*
 * Parsed constructor arguments.  Filled in by the parse_*() helpers and
 * consumed by cache_create(); device references still held here are
 * dropped by destroy_cache_args().
 */
struct cache_args {
	struct dm_target *ti;		/* target being constructed */

	struct dm_dev *metadata_dev;	/* opened metadata device */

	struct dm_dev *cache_dev;	/* opened cache device */
	sector_t cache_sectors;		/* size of cache_dev in sectors */

	struct dm_dev *origin_dev;	/* opened origin device */
	sector_t origin_sectors;	/* size of origin_dev in sectors */

	uint32_t block_size;		/* cache block size in sectors */

	const char *policy_name;	/* points into the ctr argv */
	int policy_argc;		/* number of policy arguments */
	const char **policy_argv;	/* points into the ctr argv */

	struct cache_features features;
};
				1503
				1504	static void destroy_cache_args(struct cache_args *ca)
				1505	{
				1506	if (ca->metadata_dev)
				1507	dm_put_device(ca->ti, ca->metadata_dev);
				1508
				1509	if (ca->cache_dev)
				1510	dm_put_device(ca->ti, ca->cache_dev);
				1511
				1512	if (ca->origin_dev)
				1513	dm_put_device(ca->ti, ca->origin_dev);
				1514
				1515	kfree(ca);
				1516	}
				1517
				1518	static bool at_least_one_arg(struct dm_arg_set as, char *error)
				1519	{
				1520	if (!as->argc) {
				1521	*error = "Insufficient args";
				1522	return false;
				1523	}
				1524
				1525	return true;
				1526	}
				1527
				1528	static int parse_metadata_dev(struct cache_args ca, struct dm_arg_set as,
				1529	char **error)
				1530	{
				1531	int r;
				1532	sector_t metadata_dev_size;
				1533	char b[BDEVNAME_SIZE];
				1534
				1535	if (!at_least_one_arg(as, error))
				1536	return -EINVAL;
				1537
				1538	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ \| FMODE_WRITE,
				1539	&ca->metadata_dev);
				1540	if (r) {
				1541	*error = "Error opening metadata device";
				1542	return r;
				1543	}
				1544
				1545	metadata_dev_size = get_dev_size(ca->metadata_dev);
				1546	if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
				1547	DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
				1548	bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
				1549
				1550	return 0;
				1551	}
				1552
				1553	static int parse_cache_dev(struct cache_args ca, struct dm_arg_set as,
				1554	char **error)
				1555	{
				1556	int r;
				1557
				1558	if (!at_least_one_arg(as, error))
				1559	return -EINVAL;
				1560
				1561	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ \| FMODE_WRITE,
				1562	&ca->cache_dev);
				1563	if (r) {
				1564	*error = "Error opening cache device";
				1565	return r;
				1566	}
				1567	ca->cache_sectors = get_dev_size(ca->cache_dev);
				1568
				1569	return 0;
				1570	}
				1571
				1572	static int parse_origin_dev(struct cache_args ca, struct dm_arg_set as,
				1573	char **error)
				1574	{
				1575	int r;
				1576
				1577	if (!at_least_one_arg(as, error))
				1578	return -EINVAL;
				1579
				1580	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ \| FMODE_WRITE,
				1581	&ca->origin_dev);
				1582	if (r) {
				1583	*error = "Error opening origin device";
				1584	return r;
				1585	}
				1586
				1587	ca->origin_sectors = get_dev_size(ca->origin_dev);
				1588	if (ca->ti->len > ca->origin_sectors) {
				1589	*error = "Device size larger than cached device";
				1590	return -EINVAL;
				1591	}
				1592
				1593	return 0;
				1594	}
				1595
				1596	static int parse_block_size(struct cache_args ca, struct dm_arg_set as,
				1597	char **error)
				1598	{
				1599	unsigned long tmp;
				1600
				1601	if (!at_least_one_arg(as, error))
				1602	return -EINVAL;
				1603
				1604	if (kstrtoul(dm_shift_arg(as), 10, &tmp) \|\| !tmp \|\|
				1605	tmp < DATA_DEV_BLOCK_SIZE_MIN_SECTORS \|\|
				1606	tmp & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
				1607	*error = "Invalid data block size";
				1608	return -EINVAL;
				1609	}
				1610
				1611	if (tmp > ca->cache_sectors) {
				1612	*error = "Data block size is larger than the cache device";
				1613	return -EINVAL;
				1614	}
				1615
				1616	ca->block_size = tmp;
				1617
				1618	return 0;
				1619	}
				1620
				1621	static void init_features(struct cache_features *cf)
				1622	{
				1623	cf->mode = CM_WRITE;
				1624	cf->write_through = false;
				1625	}
				1626
				1627	static int parse_features(struct cache_args ca, struct dm_arg_set as,
				1628	char **error)
				1629	{
				1630	static struct dm_arg _args[] = {
				1631	{0, 1, "Invalid number of cache feature arguments"},
				1632	};
				1633
				1634	int r;
				1635	unsigned argc;
				1636	const char *arg;
				1637	struct cache_features *cf = &ca->features;
				1638
				1639	init_features(cf);
				1640
				1641	r = dm_read_arg_group(_args, as, &argc, error);
				1642	if (r)
				1643	return -EINVAL;
				1644
				1645	while (argc--) {
				1646	arg = dm_shift_arg(as);
				1647
				1648	if (!strcasecmp(arg, "writeback"))
				1649	cf->write_through = false;
				1650
				1651	else if (!strcasecmp(arg, "writethrough"))
				1652	cf->write_through = true;
				1653
				1654	else {
				1655	*error = "Unrecognised cache feature requested";
				1656	return -EINVAL;
				1657	}
				1658	}
				1659
				1660	return 0;
				1661	}
				1662
				1663	static int parse_policy(struct cache_args ca, struct dm_arg_set as,
				1664	char **error)
				1665	{
				1666	static struct dm_arg _args[] = {
				1667	{0, 1024, "Invalid number of policy arguments"},
				1668	};
				1669
				1670	int r;
				1671
				1672	if (!at_least_one_arg(as, error))
				1673	return -EINVAL;
				1674
				1675	ca->policy_name = dm_shift_arg(as);
				1676
				1677	r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
				1678	if (r)
				1679	return -EINVAL;
				1680
				1681	ca->policy_argv = (const char **)as->argv;
				1682	dm_consume_args(as, ca->policy_argc);
				1683
				1684	return 0;
				1685	}
				1686
				1687	static int parse_cache_args(struct cache_args ca, int argc, char *argv,
				1688	char **error)
				1689	{
				1690	int r;
				1691	struct dm_arg_set as;
				1692
				1693	as.argc = argc;
				1694	as.argv = argv;
				1695
				1696	r = parse_metadata_dev(ca, &as, error);
				1697	if (r)
				1698	return r;
				1699
				1700	r = parse_cache_dev(ca, &as, error);
				1701	if (r)
				1702	return r;
				1703
				1704	r = parse_origin_dev(ca, &as, error);
				1705	if (r)
				1706	return r;
				1707
				1708	r = parse_block_size(ca, &as, error);
				1709	if (r)
				1710	return r;
				1711
				1712	r = parse_features(ca, &as, error);
				1713	if (r)
				1714	return r;
				1715
				1716	r = parse_policy(ca, &as, error);
				1717	if (r)
				1718	return r;
				1719
				1720	return 0;
				1721	}
				1722
/*----------------------------------------------------------------*/
				1724
/* Slab cache that backs each cache's migration mempool. */
static struct kmem_cache *migration_cache;
				1726
				1727	static int set_config_values(struct dm_cache_policy p, int argc, const char *argv)
				1728	{
				1729	int r = 0;
				1730
				1731	if (argc & 1) {
				1732	DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
				1733	return -EINVAL;
				1734	}
				1735
				1736	while (argc) {
				1737	r = policy_set_config_value(p, argv[0], argv[1]);
				1738	if (r) {
				1739	DMWARN("policy_set_config_value failed: key = '%s', value = '%s'",
				1740	argv[0], argv[1]);
				1741	return r;
				1742	}
				1743
				1744	argc -= 2;
				1745	argv += 2;
				1746	}
				1747
				1748	return r;
				1749	}
				1750
				1751	static int create_cache_policy(struct cache cache, struct cache_args ca,
				1752	char **error)
				1753	{
				1754	int r;
				1755
				1756	cache->policy = dm_cache_policy_create(ca->policy_name,
				1757	cache->cache_size,
				1758	cache->origin_sectors,
				1759	cache->sectors_per_block);
				1760	if (!cache->policy) {
				1761	*error = "Error creating cache's policy";
				1762	return -ENOMEM;
				1763	}
				1764
				1765	r = set_config_values(cache->policy, ca->policy_argc, ca->policy_argv);
				1766	if (r)
				1767	dm_cache_policy_destroy(cache->policy);
				1768
				1769	return r;
				1770	}
				1771
				1772	/*
				1773	* We want the discard block size to be a power of two, at least the size
				1774	* of the cache block size, and have no more than 2^14 discard blocks
				1775	* across the origin.
				1776	*/
				1777	#define MAX_DISCARD_BLOCKS (1 << 14)
				1778
				1779	static bool too_many_discard_blocks(sector_t discard_block_size,
				1780	sector_t origin_size)
				1781	{
				1782	(void) sector_div(origin_size, discard_block_size);
				1783
				1784	return origin_size > MAX_DISCARD_BLOCKS;
				1785	}
				1786
				1787	static sector_t calculate_discard_block_size(sector_t cache_block_size,
				1788	sector_t origin_size)
				1789	{
				1790	sector_t discard_block_size;
				1791
				1792	discard_block_size = roundup_pow_of_two(cache_block_size);
				1793
				1794	if (origin_size)
				1795	while (too_many_discard_blocks(discard_block_size, origin_size))
				1796	discard_block_size *= 2;
				1797
				1798	return discard_block_size;
				1799	}
				1800
				1801	#define DEFAULT_MIGRATION_THRESHOLD (2048 * 100)
				1802
				1803	static unsigned cache_num_write_bios(struct dm_target ti, struct bio bio);
				1804
				1805	static int cache_create(struct cache_args ca, struct cache *result)
				1806	{
				1807	int r = 0;
				1808	char **error = &ca->ti->error;
				1809	struct cache *cache;
				1810	struct dm_target *ti = ca->ti;
				1811	dm_block_t origin_blocks;
				1812	struct dm_cache_metadata *cmd;
				1813	bool may_format = ca->features.mode == CM_WRITE;
				1814
				1815	cache = kzalloc(sizeof(*cache), GFP_KERNEL);
				1816	if (!cache)
				1817	return -ENOMEM;
				1818
				1819	cache->ti = ca->ti;
				1820	ti->private = cache;
				1821	ti->per_bio_data_size = sizeof(struct per_bio_data);
				1822	ti->num_flush_bios = 2;
				1823	ti->flush_supported = true;
				1824
				1825	ti->num_discard_bios = 1;
				1826	ti->discards_supported = true;
				1827	ti->discard_zeroes_data_unsupported = true;
				1828
				1829	memcpy(&cache->features, &ca->features, sizeof(cache->features));
				1830
				1831	if (cache->features.write_through)
				1832	ti->num_write_bios = cache_num_write_bios;
				1833
				1834	cache->callbacks.congested_fn = cache_is_congested;
				1835	dm_table_add_target_callbacks(ti->table, &cache->callbacks);
				1836
				1837	cache->metadata_dev = ca->metadata_dev;
				1838	cache->origin_dev = ca->origin_dev;
				1839	cache->cache_dev = ca->cache_dev;
				1840
				1841	ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
				1842
				1843	/* FIXME: factor out this whole section */
				1844	origin_blocks = cache->origin_sectors = ca->origin_sectors;
Joe Thornber	414dd67	2013-03-20 17:21:25 +0000	[diff] [blame^]	1845	origin_blocks = block_div(origin_blocks, ca->block_size);
Joe Thornber	c6b4fcb	2013-03-01 22:45:51 +0000	[diff] [blame]	1846	cache->origin_blocks = to_oblock(origin_blocks);
				1847
				1848	cache->sectors_per_block = ca->block_size;
				1849	if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
				1850	r = -EINVAL;
				1851	goto bad;
				1852	}
				1853
				1854	if (ca->block_size & (ca->block_size - 1)) {
				1855	dm_block_t cache_size = ca->cache_sectors;
				1856
				1857	cache->sectors_per_block_shift = -1;
Joe Thornber	414dd67	2013-03-20 17:21:25 +0000	[diff] [blame^]	1858	cache_size = block_div(cache_size, ca->block_size);
Joe Thornber	c6b4fcb	2013-03-01 22:45:51 +0000	[diff] [blame]	1859	cache->cache_size = to_cblock(cache_size);
				1860	} else {
				1861	cache->sectors_per_block_shift = __ffs(ca->block_size);
				1862	cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift);
				1863	}
				1864
				1865	r = create_cache_policy(cache, ca, error);
				1866	if (r)
				1867	goto bad;
				1868	cache->policy_nr_args = ca->policy_argc;
				1869
				1870	cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
				1871	ca->block_size, may_format,
				1872	dm_cache_policy_get_hint_size(cache->policy));
				1873	if (IS_ERR(cmd)) {
				1874	*error = "Error creating metadata object";
				1875	r = PTR_ERR(cmd);
				1876	goto bad;
				1877	}
				1878	cache->cmd = cmd;
				1879
				1880	spin_lock_init(&cache->lock);
				1881	bio_list_init(&cache->deferred_bios);
				1882	bio_list_init(&cache->deferred_flush_bios);
				1883	INIT_LIST_HEAD(&cache->quiesced_migrations);
				1884	INIT_LIST_HEAD(&cache->completed_migrations);
				1885	INIT_LIST_HEAD(&cache->need_commit_migrations);
				1886	cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
				1887	atomic_set(&cache->nr_migrations, 0);
				1888	init_waitqueue_head(&cache->migration_wait);
				1889
				1890	cache->nr_dirty = 0;
				1891	cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
				1892	if (!cache->dirty_bitset) {
				1893	*error = "could not allocate dirty bitset";
				1894	goto bad;
				1895	}
				1896	clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
				1897
				1898	cache->discard_block_size =
				1899	calculate_discard_block_size(cache->sectors_per_block,
				1900	cache->origin_sectors);
				1901	cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks);
				1902	cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
				1903	if (!cache->discard_bitset) {
				1904	*error = "could not allocate discard bitset";
				1905	goto bad;
				1906	}
				1907	clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
				1908
				1909	cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
				1910	if (IS_ERR(cache->copier)) {
				1911	*error = "could not create kcopyd client";
				1912	r = PTR_ERR(cache->copier);
				1913	goto bad;
				1914	}
				1915
				1916	cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
				1917	if (!cache->wq) {
				1918	*error = "could not create workqueue for metadata object";
				1919	goto bad;
				1920	}
				1921	INIT_WORK(&cache->worker, do_worker);
				1922	INIT_DELAYED_WORK(&cache->waker, do_waker);
				1923	cache->last_commit_jiffies = jiffies;
				1924
				1925	cache->prison = dm_bio_prison_create(PRISON_CELLS);
				1926	if (!cache->prison) {
				1927	*error = "could not create bio prison";
				1928	goto bad;
				1929	}
				1930
				1931	cache->all_io_ds = dm_deferred_set_create();
				1932	if (!cache->all_io_ds) {
				1933	*error = "could not create all_io deferred set";
				1934	goto bad;
				1935	}
				1936
				1937	cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
				1938	migration_cache);
				1939	if (!cache->migration_pool) {
				1940	*error = "Error creating cache's migration mempool";
				1941	goto bad;
				1942	}
				1943
				1944	cache->next_migration = NULL;
				1945
				1946	cache->need_tick_bio = true;
				1947	cache->sized = false;
				1948	cache->quiescing = false;
				1949	cache->commit_requested = false;
				1950	cache->loaded_mappings = false;
				1951	cache->loaded_discards = false;
				1952
				1953	load_stats(cache);
				1954
				1955	atomic_set(&cache->stats.demotion, 0);
				1956	atomic_set(&cache->stats.promotion, 0);
				1957	atomic_set(&cache->stats.copies_avoided, 0);
				1958	atomic_set(&cache->stats.cache_cell_clash, 0);
				1959	atomic_set(&cache->stats.commit_count, 0);
				1960	atomic_set(&cache->stats.discard_count, 0);
				1961
				1962	*result = cache;
				1963	return 0;
				1964
				1965	bad:
				1966	destroy(cache);
				1967	return r;
				1968	}
				1969
				1970	static int copy_ctr_args(struct cache cache, int argc, const char *argv)
				1971	{
				1972	unsigned i;
				1973	const char **copy;
				1974
				1975	copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
				1976	if (!copy)
				1977	return -ENOMEM;
				1978	for (i = 0; i < argc; i++) {
				1979	copy[i] = kstrdup(argv[i], GFP_KERNEL);
				1980	if (!copy[i]) {
				1981	while (i--)
				1982	kfree(copy[i]);
				1983	kfree(copy);
				1984	return -ENOMEM;
				1985	}
				1986	}
				1987
				1988	cache->nr_ctr_args = argc;
				1989	cache->ctr_args = copy;
				1990
				1991	return 0;
				1992	}
				1993
				1994	static int cache_ctr(struct dm_target ti, unsigned argc, char *argv)
				1995	{
				1996	int r = -EINVAL;
				1997	struct cache_args *ca;
				1998	struct cache *cache = NULL;
				1999
				2000	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
				2001	if (!ca) {
				2002	ti->error = "Error allocating memory for cache";
				2003	return -ENOMEM;
				2004	}
				2005	ca->ti = ti;
				2006
				2007	r = parse_cache_args(ca, argc, argv, &ti->error);
				2008	if (r)
				2009	goto out;
				2010
				2011	r = cache_create(ca, &cache);
				2012
				2013	r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
				2014	if (r) {
				2015	destroy(cache);
				2016	goto out;
				2017	}
				2018
				2019	ti->private = cache;
				2020
				2021	out:
				2022	destroy_cache_args(ca);
				2023	return r;
				2024	}
				2025
				2026	static unsigned cache_num_write_bios(struct dm_target ti, struct bio bio)
				2027	{
				2028	int r;
				2029	struct cache *cache = ti->private;
				2030	dm_oblock_t block = get_bio_block(cache, bio);
				2031	dm_cblock_t cblock;
				2032
				2033	r = policy_lookup(cache->policy, block, &cblock);
				2034	if (r < 0)
				2035	return 2; /* assume the worst */
				2036
				2037	return (!r && !is_dirty(cache, cblock)) ? 2 : 1;
				2038	}
				2039
				2040	static int cache_map(struct dm_target ti, struct bio bio)
				2041	{
				2042	struct cache *cache = ti->private;
				2043
				2044	int r;
				2045	dm_oblock_t block = get_bio_block(cache, bio);
				2046	bool can_migrate = false;
				2047	bool discarded_block;
				2048	struct dm_bio_prison_cell *cell;
				2049	struct policy_result lookup_result;
				2050	struct per_bio_data *pb;
				2051
				2052	if (from_oblock(block) > from_oblock(cache->origin_blocks)) {
				2053	/*
				2054	* This can only occur if the io goes to a partial block at
				2055	* the end of the origin device. We don't cache these.
				2056	* Just remap to the origin and carry on.
				2057	*/
				2058	remap_to_origin_clear_discard(cache, bio, block);
				2059	return DM_MAPIO_REMAPPED;
				2060	}
				2061
				2062	pb = init_per_bio_data(bio);
				2063
				2064	if (bio->bi_rw & (REQ_FLUSH \| REQ_FUA \| REQ_DISCARD)) {
				2065	defer_bio(cache, bio);
				2066	return DM_MAPIO_SUBMITTED;
				2067	}
				2068
				2069	/*
				2070	* Check to see if that block is currently migrating.
				2071	*/
				2072	cell = alloc_prison_cell(cache);
				2073	if (!cell) {
				2074	defer_bio(cache, bio);
				2075	return DM_MAPIO_SUBMITTED;
				2076	}
				2077
				2078	r = bio_detain(cache, block, bio, cell,
				2079	(cell_free_fn) free_prison_cell,
				2080	cache, &cell);
				2081	if (r) {
				2082	if (r < 0)
				2083	defer_bio(cache, bio);
				2084
				2085	return DM_MAPIO_SUBMITTED;
				2086	}
				2087
				2088	discarded_block = is_discarded_oblock(cache, block);
				2089
				2090	r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
				2091	bio, &lookup_result);
				2092	if (r == -EWOULDBLOCK) {
				2093	cell_defer(cache, cell, true);
				2094	return DM_MAPIO_SUBMITTED;
				2095
				2096	} else if (r) {
				2097	DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
				2098	bio_io_error(bio);
				2099	return DM_MAPIO_SUBMITTED;
				2100	}
				2101
				2102	switch (lookup_result.op) {
				2103	case POLICY_HIT:
				2104	inc_hit_counter(cache, bio);
				2105	pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
				2106
				2107	if (is_writethrough_io(cache, bio, lookup_result.cblock)) {
				2108	/*
				2109	* No need to mark anything dirty in write through mode.
				2110	*/
				2111	pb->req_nr == 0 ?
				2112	remap_to_cache(cache, bio, lookup_result.cblock) :
				2113	remap_to_origin_clear_discard(cache, bio, block);
				2114	cell_defer(cache, cell, false);
				2115	} else {
				2116	remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
				2117	cell_defer(cache, cell, false);
				2118	}
				2119	break;
				2120
				2121	case POLICY_MISS:
				2122	inc_miss_counter(cache, bio);
				2123	pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
				2124
				2125	if (pb->req_nr != 0) {
				2126	/*
				2127	* This is a duplicate writethrough io that is no
				2128	* longer needed because the block has been demoted.
				2129	*/
				2130	bio_endio(bio, 0);
				2131	cell_defer(cache, cell, false);
				2132	return DM_MAPIO_SUBMITTED;
				2133	} else {
				2134	remap_to_origin_clear_discard(cache, bio, block);
				2135	cell_defer(cache, cell, false);
				2136	}
				2137	break;
				2138
				2139	default:
				2140	DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
				2141	(unsigned) lookup_result.op);
				2142	bio_io_error(bio);
				2143	return DM_MAPIO_SUBMITTED;
				2144	}
				2145
				2146	return DM_MAPIO_REMAPPED;
				2147	}
				2148
				2149	static int cache_end_io(struct dm_target ti, struct bio bio, int error)
				2150	{
				2151	struct cache *cache = ti->private;
				2152	unsigned long flags;
				2153	struct per_bio_data *pb = get_per_bio_data(bio);
				2154
				2155	if (pb->tick) {
				2156	policy_tick(cache->policy);
				2157
				2158	spin_lock_irqsave(&cache->lock, flags);
				2159	cache->need_tick_bio = true;
				2160	spin_unlock_irqrestore(&cache->lock, flags);
				2161	}
				2162
				2163	check_for_quiesced_migrations(cache, pb);
				2164
				2165	return 0;
				2166	}
				2167
				2168	static int write_dirty_bitset(struct cache *cache)
				2169	{
				2170	unsigned i, r;
				2171
				2172	for (i = 0; i < from_cblock(cache->cache_size); i++) {
				2173	r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
				2174	is_dirty(cache, to_cblock(i)));
				2175	if (r)
				2176	return r;
				2177	}
				2178
				2179	return 0;
				2180	}
				2181
				2182	static int write_discard_bitset(struct cache *cache)
				2183	{
				2184	unsigned i, r;
				2185
				2186	r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
				2187	cache->discard_nr_blocks);
				2188	if (r) {
				2189	DMERR("could not resize on-disk discard bitset");
				2190	return r;
				2191	}
				2192
				2193	for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
				2194	r = dm_cache_set_discard(cache->cmd, to_dblock(i),
				2195	is_discarded(cache, to_dblock(i)));
				2196	if (r)
				2197	return r;
				2198	}
				2199
				2200	return 0;
				2201	}
				2202
				2203	static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock,
				2204	uint32_t hint)
				2205	{
				2206	struct cache *cache = context;
				2207	return dm_cache_save_hint(cache->cmd, cblock, hint);
				2208	}
				2209
				2210	static int write_hints(struct cache *cache)
				2211	{
				2212	int r;
				2213
				2214	r = dm_cache_begin_hints(cache->cmd, cache->policy);
				2215	if (r) {
				2216	DMERR("dm_cache_begin_hints failed");
				2217	return r;
				2218	}
				2219
				2220	r = policy_walk_mappings(cache->policy, save_hint, cache);
				2221	if (r)
				2222	DMERR("policy_walk_mappings failed");
				2223
				2224	return r;
				2225	}
				2226
				2227	/*
				2228	* returns true on success
				2229	*/
				2230	static bool sync_metadata(struct cache *cache)
				2231	{
				2232	int r1, r2, r3, r4;
				2233
				2234	r1 = write_dirty_bitset(cache);
				2235	if (r1)
				2236	DMERR("could not write dirty bitset");
				2237
				2238	r2 = write_discard_bitset(cache);
				2239	if (r2)
				2240	DMERR("could not write discard bitset");
				2241
				2242	save_stats(cache);
				2243
				2244	r3 = write_hints(cache);
				2245	if (r3)
				2246	DMERR("could not write hints");
				2247
				2248	/*
				2249	* If writing the above metadata failed, we still commit, but don't
				2250	* set the clean shutdown flag. This will effectively force every
				2251	* dirty bit to be set on reload.
				2252	*/
				2253	r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
				2254	if (r4)
				2255	DMERR("could not write cache metadata. Data loss may occur.");
				2256
				2257	return !r1 && !r2 && !r3 && !r4;
				2258	}
				2259
				2260	static void cache_postsuspend(struct dm_target *ti)
				2261	{
				2262	struct cache *cache = ti->private;
				2263
				2264	start_quiescing(cache);
				2265	wait_for_migrations(cache);
				2266	stop_worker(cache);
				2267	requeue_deferred_io(cache);
				2268	stop_quiescing(cache);
				2269
				2270	(void) sync_metadata(cache);
				2271	}
				2272
				2273	static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
				2274	bool dirty, uint32_t hint, bool hint_valid)
				2275	{
				2276	int r;
				2277	struct cache *cache = context;
				2278
				2279	r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
				2280	if (r)
				2281	return r;
				2282
				2283	if (dirty)
				2284	set_dirty(cache, oblock, cblock);
				2285	else
				2286	clear_dirty(cache, oblock, cblock);
				2287
				2288	return 0;
				2289	}
				2290
				2291	static int load_discard(void *context, sector_t discard_block_size,
				2292	dm_dblock_t dblock, bool discard)
				2293	{
				2294	struct cache *cache = context;
				2295
				2296	/* FIXME: handle mis-matched block size */
				2297
				2298	if (discard)
				2299	set_discard(cache, dblock);
				2300	else
				2301	clear_discard(cache, dblock);
				2302
				2303	return 0;
				2304	}
				2305
				2306	static int cache_preresume(struct dm_target *ti)
				2307	{
				2308	int r = 0;
				2309	struct cache *cache = ti->private;
				2310	sector_t actual_cache_size = get_dev_size(cache->cache_dev);
				2311	(void) sector_div(actual_cache_size, cache->sectors_per_block);
				2312
				2313	/*
				2314	* Check to see if the cache has resized.
				2315	*/
				2316	if (from_cblock(cache->cache_size) != actual_cache_size \|\| !cache->sized) {
				2317	cache->cache_size = to_cblock(actual_cache_size);
				2318
				2319	r = dm_cache_resize(cache->cmd, cache->cache_size);
				2320	if (r) {
				2321	DMERR("could not resize cache metadata");
				2322	return r;
				2323	}
				2324
				2325	cache->sized = true;
				2326	}
				2327
				2328	if (!cache->loaded_mappings) {
				2329	r = dm_cache_load_mappings(cache->cmd,
				2330	dm_cache_policy_get_name(cache->policy),
				2331	load_mapping, cache);
				2332	if (r) {
				2333	DMERR("could not load cache mappings");
				2334	return r;
				2335	}
				2336
				2337	cache->loaded_mappings = true;
				2338	}
				2339
				2340	if (!cache->loaded_discards) {
				2341	r = dm_cache_load_discards(cache->cmd, load_discard, cache);
				2342	if (r) {
				2343	DMERR("could not load origin discards");
				2344	return r;
				2345	}
				2346
				2347	cache->loaded_discards = true;
				2348	}
				2349
				2350	return r;
				2351	}
				2352
				2353	static void cache_resume(struct dm_target *ti)
				2354	{
				2355	struct cache *cache = ti->private;
				2356
				2357	cache->need_tick_bio = true;
				2358	do_waker(&cache->waker.work);
				2359	}
				2360
				2361	/*
				2362	* Status format:
				2363	*
				2364	* <#used metadata blocks>/<#total metadata blocks>
				2365	* <#read hits> <#read misses> <#write hits> <#write misses>
				2366	* <#demotions> <#promotions> <#blocks in cache> <#dirty>
				2367	* <#features> <features>*
				2368	* <#core args> <core args>
				2369	* <#policy args> <policy args>*
				2370	*/
				2371	static void cache_status(struct dm_target *ti, status_type_t type,
				2372	unsigned status_flags, char *result, unsigned maxlen)
				2373	{
				2374	int r = 0;
				2375	unsigned i;
				2376	ssize_t sz = 0;
				2377	dm_block_t nr_free_blocks_metadata = 0;
				2378	dm_block_t nr_blocks_metadata = 0;
				2379	char buf[BDEVNAME_SIZE];
				2380	struct cache *cache = ti->private;
				2381	dm_cblock_t residency;
				2382
				2383	switch (type) {
				2384	case STATUSTYPE_INFO:
				2385	/* Commit to ensure statistics aren't out-of-date */
				2386	if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
				2387	r = dm_cache_commit(cache->cmd, false);
				2388	if (r)
				2389	DMERR("could not commit metadata for accurate status");
				2390	}
				2391
				2392	r = dm_cache_get_free_metadata_block_count(cache->cmd,
				2393	&nr_free_blocks_metadata);
				2394	if (r) {
				2395	DMERR("could not get metadata free block count");
				2396	goto err;
				2397	}
				2398
				2399	r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
				2400	if (r) {
				2401	DMERR("could not get metadata device size");
				2402	goto err;
				2403	}
				2404
				2405	residency = policy_residency(cache->policy);
				2406
				2407	DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ",
				2408	(unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
				2409	(unsigned long long)nr_blocks_metadata,
				2410	(unsigned) atomic_read(&cache->stats.read_hit),
				2411	(unsigned) atomic_read(&cache->stats.read_miss),
				2412	(unsigned) atomic_read(&cache->stats.write_hit),
				2413	(unsigned) atomic_read(&cache->stats.write_miss),
				2414	(unsigned) atomic_read(&cache->stats.demotion),
				2415	(unsigned) atomic_read(&cache->stats.promotion),
				2416	(unsigned long long) from_cblock(residency),
				2417	cache->nr_dirty);
				2418
				2419	if (cache->features.write_through)
				2420	DMEMIT("1 writethrough ");
				2421	else
				2422	DMEMIT("0 ");
				2423
				2424	DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
				2425	if (sz < maxlen) {
				2426	r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
				2427	if (r)
				2428	DMERR("policy_emit_config_values returned %d", r);
				2429	}
				2430
				2431	break;
				2432
				2433	case STATUSTYPE_TABLE:
				2434	format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
				2435	DMEMIT("%s ", buf);
				2436	format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
				2437	DMEMIT("%s ", buf);
				2438	format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
				2439	DMEMIT("%s", buf);
				2440
				2441	for (i = 0; i < cache->nr_ctr_args - 1; i++)
				2442	DMEMIT(" %s", cache->ctr_args[i]);
				2443	if (cache->nr_ctr_args)
				2444	DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
				2445	}
				2446
				2447	return;
				2448
				2449	err:
				2450	DMEMIT("Error");
				2451	}
				2452
				2453	#define NOT_CORE_OPTION 1
				2454
				2455	static int process_config_option(struct cache cache, char *argv)
				2456	{
				2457	unsigned long tmp;
				2458
				2459	if (!strcasecmp(argv[0], "migration_threshold")) {
				2460	if (kstrtoul(argv[1], 10, &tmp))
				2461	return -EINVAL;
				2462
				2463	cache->migration_threshold = tmp;
				2464	return 0;
				2465	}
				2466
				2467	return NOT_CORE_OPTION;
				2468	}
				2469
				2470	/*
				2471	* Supports <key> <value>.
				2472	*
				2473	* The key migration_threshold is supported by the cache target core.
				2474	*/
				2475	static int cache_message(struct dm_target ti, unsigned argc, char *argv)
				2476	{
				2477	int r;
				2478	struct cache *cache = ti->private;
				2479
				2480	if (argc != 2)
				2481	return -EINVAL;
				2482
				2483	r = process_config_option(cache, argv);
				2484	if (r == NOT_CORE_OPTION)
				2485	return policy_set_config_value(cache->policy, argv[0], argv[1]);
				2486
				2487	return r;
				2488	}
				2489
				2490	static int cache_iterate_devices(struct dm_target *ti,
				2491	iterate_devices_callout_fn fn, void *data)
				2492	{
				2493	int r = 0;
				2494	struct cache *cache = ti->private;
				2495
				2496	r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
				2497	if (!r)
				2498	r = fn(ti, cache->origin_dev, 0, ti->len, data);
				2499
				2500	return r;
				2501	}
				2502
				2503	/*
				2504	* We assume I/O is going to the origin (which is the volume
				2505	* more likely to have restrictions e.g. by being striped).
				2506	* (Looking up the exact location of the data would be expensive
				2507	* and could always be out of date by the time the bio is submitted.)
				2508	*/
				2509	static int cache_bvec_merge(struct dm_target *ti,
				2510	struct bvec_merge_data *bvm,
				2511	struct bio_vec *biovec, int max_size)
				2512	{
				2513	struct cache *cache = ti->private;
				2514	struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);
				2515
				2516	if (!q->merge_bvec_fn)
				2517	return max_size;
				2518
				2519	bvm->bi_bdev = cache->origin_dev->bdev;
				2520	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
				2521	}
				2522
				2523	static void set_discard_limits(struct cache cache, struct queue_limits limits)
				2524	{
				2525	/*
				2526	* FIXME: these limits may be incompatible with the cache device
				2527	*/
				2528	limits->max_discard_sectors = cache->discard_block_size * 1024;
				2529	limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
				2530	}
				2531
				2532	static void cache_io_hints(struct dm_target ti, struct queue_limits limits)
				2533	{
				2534	struct cache *cache = ti->private;
				2535
				2536	blk_limits_io_min(limits, 0);
				2537	blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
				2538	set_discard_limits(cache, limits);
				2539	}
				2540
				2541	/*----------------------------------------------------------------*/
				2542
				2543	static struct target_type cache_target = {
				2544	.name = "cache",
				2545	.version = {1, 0, 0},
				2546	.module = THIS_MODULE,
				2547	.ctr = cache_ctr,
				2548	.dtr = cache_dtr,
				2549	.map = cache_map,
				2550	.end_io = cache_end_io,
				2551	.postsuspend = cache_postsuspend,
				2552	.preresume = cache_preresume,
				2553	.resume = cache_resume,
				2554	.status = cache_status,
				2555	.message = cache_message,
				2556	.iterate_devices = cache_iterate_devices,
				2557	.merge = cache_bvec_merge,
				2558	.io_hints = cache_io_hints,
				2559	};
				2560
				2561	static int __init dm_cache_init(void)
				2562	{
				2563	int r;
				2564
				2565	r = dm_register_target(&cache_target);
				2566	if (r) {
				2567	DMERR("cache target registration failed: %d", r);
				2568	return r;
				2569	}
				2570
				2571	migration_cache = KMEM_CACHE(dm_cache_migration, 0);
				2572	if (!migration_cache) {
				2573	dm_unregister_target(&cache_target);
				2574	return -ENOMEM;
				2575	}
				2576
				2577	return 0;
				2578	}
				2579
				2580	static void __exit dm_cache_exit(void)
				2581	{
				2582	dm_unregister_target(&cache_target);
				2583	kmem_cache_destroy(migration_cache);
				2584	}
				2585
				2586	module_init(dm_cache_init);
				2587	module_exit(dm_cache_exit);
				2588
				2589	MODULE_DESCRIPTION(DM_NAME " cache target");
				2590	MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
				2591	MODULE_LICENSE("GPL");