Blame - drivers/md/dm-raid1.c - kernel/msm-4.9

blob: 8632825137538bbb050eadf6cf0c13b101cdc05d [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* Copyright (C) 2003 Sistina Software Limited.
				3	*
				4	* This file is released under the GPL.
				5	*/
				6
				7	#include "dm.h"
				8	#include "dm-bio-list.h"
				9	#include "dm-io.h"
				10	#include "dm-log.h"
				11	#include "kcopyd.h"
				12
				13	#include <linux/ctype.h>
				14	#include <linux/init.h>
				15	#include <linux/mempool.h>
				16	#include <linux/module.h>
				17	#include <linux/pagemap.h>
				18	#include <linux/slab.h>
				19	#include <linux/time.h>
				20	#include <linux/vmalloc.h>
				21	#include <linux/workqueue.h>
				22
				23	static struct workqueue_struct *_kmirrord_wq;
				24	static struct work_struct _kmirrord_work;
				25
				26	static inline void wake(void)
				27	{
				28	queue_work(_kmirrord_wq, &_kmirrord_work);
				29	}
				30
/*-----------------------------------------------------------------
 * Region hash
 *
 * The mirror splits itself up into discrete regions.  Each
 * region can be in one of three states: clean, dirty,
 * nosync.  There is no need to put clean regions in the hash.
 *
 * In addition to being present in the hash table a region _may_
 * be present on one of three lists.
 *
 *   clean_regions: Regions on this list have no io pending to
 *   them, they are in sync, we are no longer interested in them,
 *   they are dull.  rh_update_states() will remove them from the
 *   hash table.
 *
 *   quiesced_regions: These regions have been spun down, ready
 *   for recovery.  rh_recovery_start() will remove regions from
 *   this list and hand them to kmirrord, which will schedule the
 *   recovery io with kcopyd.
 *
 *   recovered_regions: Regions that kcopyd has successfully
 *   recovered.  rh_update_states() will now schedule any delayed
 *   io, up the recovery_count, and remove the region from the
 *   hash.
 *
 * There are 2 locks:
 *   A rw spin lock 'hash_lock' protects just the hash table,
 *   this is never held in write mode from interrupt context,
 *   which I believe means that we only have to disable irqs when
 *   doing a write lock.
 *
 *   An ordinary spin lock 'region_lock' that protects the three
 *   lists in the region_hash, with the 'state', 'list' and
 *   'delayed_bios' fields of the regions.  This is used from irq
 *   context, so all other uses will have to suspend local irqs.
 *---------------------------------------------------------------*/
				67	struct mirror_set;
				68	struct region_hash {
				69	struct mirror_set *ms;
				70	uint32_t region_size;
				71	unsigned region_shift;
				72
				73	/* holds persistent region state */
				74	struct dirty_log *log;
				75
				76	/* hash table */
				77	rwlock_t hash_lock;
				78	mempool_t *region_pool;
				79	unsigned int mask;
				80	unsigned int nr_buckets;
				81	struct list_head *buckets;
				82
				83	spinlock_t region_lock;
				84	struct semaphore recovery_count;
				85	struct list_head clean_regions;
				86	struct list_head quiesced_regions;
				87	struct list_head recovered_regions;
				88	};
				89
				90	enum {
				91	RH_CLEAN,
				92	RH_DIRTY,
				93	RH_NOSYNC,
				94	RH_RECOVERING
				95	};
				96
				97	struct region {
				98	struct region_hash rh; / FIXME: can we get rid of this ? */
				99	region_t key;
				100	int state;
				101
				102	struct list_head hash_list;
				103	struct list_head list;
				104
				105	atomic_t pending;
				106	struct bio_list delayed_bios;
				107	};
				108
				109	/*
				110	* Conversion fns
				111	*/
				112	static inline region_t bio_to_region(struct region_hash rh, struct bio bio)
				113	{
				114	return bio->bi_sector >> rh->region_shift;
				115	}
				116
				117	static inline sector_t region_to_sector(struct region_hash *rh, region_t region)
				118	{
				119	return region << rh->region_shift;
				120	}
				121
				122	/* FIXME move this */
				123	static void queue_bio(struct mirror_set ms, struct bio bio, int rw);
				124
				125	static void region_alloc(unsigned int __nocast gfp_mask, void pool_data)
				126	{
				127	return kmalloc(sizeof(struct region), gfp_mask);
				128	}
				129
				130	static void region_free(void element, void pool_data)
				131	{
				132	kfree(element);
				133	}
				134
				135	#define MIN_REGIONS 64
				136	#define MAX_RECOVERY 1
				137	static int rh_init(struct region_hash rh, struct mirror_set ms,
				138	struct dirty_log *log, uint32_t region_size,
				139	region_t nr_regions)
				140	{
				141	unsigned int nr_buckets, max_buckets;
				142	size_t i;
				143
				144	/*
				145	* Calculate a suitable number of buckets for our hash
				146	* table.
				147	*/
				148	max_buckets = nr_regions >> 6;
				149	for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
				150	;
				151	nr_buckets >>= 1;
				152
				153	rh->ms = ms;
				154	rh->log = log;
				155	rh->region_size = region_size;
				156	rh->region_shift = ffs(region_size) - 1;
				157	rwlock_init(&rh->hash_lock);
				158	rh->mask = nr_buckets - 1;
				159	rh->nr_buckets = nr_buckets;
				160
				161	rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
				162	if (!rh->buckets) {
				163	DMERR("unable to allocate region hash memory");
				164	return -ENOMEM;
				165	}
				166
				167	for (i = 0; i < nr_buckets; i++)
				168	INIT_LIST_HEAD(rh->buckets + i);
				169
				170	spin_lock_init(&rh->region_lock);
				171	sema_init(&rh->recovery_count, 0);
				172	INIT_LIST_HEAD(&rh->clean_regions);
				173	INIT_LIST_HEAD(&rh->quiesced_regions);
				174	INIT_LIST_HEAD(&rh->recovered_regions);
				175
				176	rh->region_pool = mempool_create(MIN_REGIONS, region_alloc,
				177	region_free, NULL);
				178	if (!rh->region_pool) {
				179	vfree(rh->buckets);
				180	rh->buckets = NULL;
				181	return -ENOMEM;
				182	}
				183
				184	return 0;
				185	}
				186
				187	static void rh_exit(struct region_hash *rh)
				188	{
				189	unsigned int h;
				190	struct region reg, nreg;
				191
				192	BUG_ON(!list_empty(&rh->quiesced_regions));
				193	for (h = 0; h < rh->nr_buckets; h++) {
				194	list_for_each_entry_safe(reg, nreg, rh->buckets + h, hash_list) {
				195	BUG_ON(atomic_read(&reg->pending));
				196	mempool_free(reg, rh->region_pool);
				197	}
				198	}
				199
				200	if (rh->log)
				201	dm_destroy_dirty_log(rh->log);
				202	if (rh->region_pool)
				203	mempool_destroy(rh->region_pool);
				204	vfree(rh->buckets);
				205	}
				206
				207	#define RH_HASH_MULT 2654435387U
				208
				209	static inline unsigned int rh_hash(struct region_hash *rh, region_t region)
				210	{
				211	return (unsigned int) ((region * RH_HASH_MULT) >> 12) & rh->mask;
				212	}
				213
				214	static struct region __rh_lookup(struct region_hash rh, region_t region)
				215	{
				216	struct region *reg;
				217
				218	list_for_each_entry (reg, rh->buckets + rh_hash(rh, region), hash_list)
				219	if (reg->key == region)
				220	return reg;
				221
				222	return NULL;
				223	}
				224
				225	static void __rh_insert(struct region_hash rh, struct region reg)
				226	{
				227	unsigned int h = rh_hash(rh, reg->key);
				228	list_add(&reg->hash_list, rh->buckets + h);
				229	}
				230
				231	static struct region __rh_alloc(struct region_hash rh, region_t region)
				232	{
				233	struct region reg, nreg;
				234
				235	read_unlock(&rh->hash_lock);
				236	nreg = mempool_alloc(rh->region_pool, GFP_NOIO);
				237	nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
				238	RH_CLEAN : RH_NOSYNC;
				239	nreg->rh = rh;
				240	nreg->key = region;
				241
				242	INIT_LIST_HEAD(&nreg->list);
				243
				244	atomic_set(&nreg->pending, 0);
				245	bio_list_init(&nreg->delayed_bios);
				246	write_lock_irq(&rh->hash_lock);
				247
				248	reg = __rh_lookup(rh, region);
				249	if (reg)
				250	/* we lost the race */
				251	mempool_free(nreg, rh->region_pool);
				252
				253	else {
				254	__rh_insert(rh, nreg);
				255	if (nreg->state == RH_CLEAN) {
				256	spin_lock(&rh->region_lock);
				257	list_add(&nreg->list, &rh->clean_regions);
				258	spin_unlock(&rh->region_lock);
				259	}
				260	reg = nreg;
				261	}
				262	write_unlock_irq(&rh->hash_lock);
				263	read_lock(&rh->hash_lock);
				264
				265	return reg;
				266	}
				267
				268	static inline struct region __rh_find(struct region_hash rh, region_t region)
				269	{
				270	struct region *reg;
				271
				272	reg = __rh_lookup(rh, region);
				273	if (!reg)
				274	reg = __rh_alloc(rh, region);
				275
				276	return reg;
				277	}
				278
				279	static int rh_state(struct region_hash *rh, region_t region, int may_block)
				280	{
				281	int r;
				282	struct region *reg;
				283
				284	read_lock(&rh->hash_lock);
				285	reg = __rh_lookup(rh, region);
				286	read_unlock(&rh->hash_lock);
				287
				288	if (reg)
				289	return reg->state;
				290
				291	/*
				292	* The region wasn't in the hash, so we fall back to the
				293	* dirty log.
				294	*/
				295	r = rh->log->type->in_sync(rh->log, region, may_block);
				296
				297	/*
				298	* Any error from the dirty log (eg. -EWOULDBLOCK) gets
				299	* taken as a RH_NOSYNC
				300	*/
				301	return r == 1 ? RH_CLEAN : RH_NOSYNC;
				302	}
				303
				304	static inline int rh_in_sync(struct region_hash *rh,
				305	region_t region, int may_block)
				306	{
				307	int state = rh_state(rh, region, may_block);
				308	return state == RH_CLEAN \|\| state == RH_DIRTY;
				309	}
				310
				311	static void dispatch_bios(struct mirror_set ms, struct bio_list bio_list)
				312	{
				313	struct bio *bio;
				314
				315	while ((bio = bio_list_pop(bio_list))) {
				316	queue_bio(ms, bio, WRITE);
				317	}
				318	}
				319
				320	static void rh_update_states(struct region_hash *rh)
				321	{
				322	struct region reg, next;
				323
				324	LIST_HEAD(clean);
				325	LIST_HEAD(recovered);
				326
				327	/*
				328	* Quickly grab the lists.
				329	*/
				330	write_lock_irq(&rh->hash_lock);
				331	spin_lock(&rh->region_lock);
				332	if (!list_empty(&rh->clean_regions)) {
				333	list_splice(&rh->clean_regions, &clean);
				334	INIT_LIST_HEAD(&rh->clean_regions);
				335
				336	list_for_each_entry (reg, &clean, list) {
				337	rh->log->type->clear_region(rh->log, reg->key);
				338	list_del(&reg->hash_list);
				339	}
				340	}
				341
				342	if (!list_empty(&rh->recovered_regions)) {
				343	list_splice(&rh->recovered_regions, &recovered);
				344	INIT_LIST_HEAD(&rh->recovered_regions);
				345
				346	list_for_each_entry (reg, &recovered, list)
				347	list_del(&reg->hash_list);
				348	}
				349	spin_unlock(&rh->region_lock);
				350	write_unlock_irq(&rh->hash_lock);
				351
				352	/*
				353	* All the regions on the recovered and clean lists have
				354	* now been pulled out of the system, so no need to do
				355	* any more locking.
				356	*/
				357	list_for_each_entry_safe (reg, next, &recovered, list) {
				358	rh->log->type->clear_region(rh->log, reg->key);
				359	rh->log->type->complete_resync_work(rh->log, reg->key, 1);
				360	dispatch_bios(rh->ms, &reg->delayed_bios);
				361	up(&rh->recovery_count);
				362	mempool_free(reg, rh->region_pool);
				363	}
				364
				365	if (!list_empty(&recovered))
				366	rh->log->type->flush(rh->log);
				367
				368	list_for_each_entry_safe (reg, next, &clean, list)
				369	mempool_free(reg, rh->region_pool);
				370	}
				371
				372	static void rh_inc(struct region_hash *rh, region_t region)
				373	{
				374	struct region *reg;
				375
				376	read_lock(&rh->hash_lock);
				377	reg = __rh_find(rh, region);
Jun'ichi Nomura	844e8d9	2005-09-09 16:23:42 -0700	[diff] [blame^]	378
				379	atomic_inc(&reg->pending);
				380
				381	spin_lock_irq(&rh->region_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	382	if (reg->state == RH_CLEAN) {
				383	rh->log->type->mark_region(rh->log, reg->key);
				384
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	385	reg->state = RH_DIRTY;
				386	list_del_init(&reg->list); /* take off the clean list */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	387	}
Jun'ichi Nomura	844e8d9	2005-09-09 16:23:42 -0700	[diff] [blame^]	388	spin_unlock_irq(&rh->region_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	389
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	390	read_unlock(&rh->hash_lock);
				391	}
				392
				393	static void rh_inc_pending(struct region_hash rh, struct bio_list bios)
				394	{
				395	struct bio *bio;
				396
				397	for (bio = bios->head; bio; bio = bio->bi_next)
				398	rh_inc(rh, bio_to_region(rh, bio));
				399	}
				400
				401	static void rh_dec(struct region_hash *rh, region_t region)
				402	{
				403	unsigned long flags;
				404	struct region *reg;
				405	int should_wake = 0;
				406
				407	read_lock(&rh->hash_lock);
				408	reg = __rh_lookup(rh, region);
				409	read_unlock(&rh->hash_lock);
				410
				411	if (atomic_dec_and_test(&reg->pending)) {
				412	spin_lock_irqsave(&rh->region_lock, flags);
Jun'ichi Nomura	844e8d9	2005-09-09 16:23:42 -0700	[diff] [blame^]	413	if (atomic_read(&reg->pending)) { /* check race */
				414	spin_unlock_irqrestore(&rh->region_lock, flags);
				415	return;
				416	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	417	if (reg->state == RH_RECOVERING) {
				418	list_add_tail(&reg->list, &rh->quiesced_regions);
				419	} else {
				420	reg->state = RH_CLEAN;
				421	list_add(&reg->list, &rh->clean_regions);
				422	}
				423	spin_unlock_irqrestore(&rh->region_lock, flags);
				424	should_wake = 1;
				425	}
				426
				427	if (should_wake)
				428	wake();
				429	}
				430
				431	/*
				432	* Starts quiescing a region in preparation for recovery.
				433	*/
				434	static int __rh_recovery_prepare(struct region_hash *rh)
				435	{
				436	int r;
				437	struct region *reg;
				438	region_t region;
				439
				440	/*
				441	* Ask the dirty log what's next.
				442	*/
				443	r = rh->log->type->get_resync_work(rh->log, &region);
				444	if (r <= 0)
				445	return r;
				446
				447	/*
				448	* Get this region, and start it quiescing by setting the
				449	* recovering flag.
				450	*/
				451	read_lock(&rh->hash_lock);
				452	reg = __rh_find(rh, region);
				453	read_unlock(&rh->hash_lock);
				454
				455	spin_lock_irq(&rh->region_lock);
				456	reg->state = RH_RECOVERING;
				457
				458	/* Already quiesced ? */
				459	if (atomic_read(&reg->pending))
				460	list_del_init(&reg->list);
				461
				462	else {
				463	list_del_init(&reg->list);
				464	list_add(&reg->list, &rh->quiesced_regions);
				465	}
				466	spin_unlock_irq(&rh->region_lock);
				467
				468	return 1;
				469	}
				470
				471	static void rh_recovery_prepare(struct region_hash *rh)
				472	{
				473	while (!down_trylock(&rh->recovery_count))
				474	if (__rh_recovery_prepare(rh) <= 0) {
				475	up(&rh->recovery_count);
				476	break;
				477	}
				478	}
				479
				480	/*
				481	* Returns any quiesced regions.
				482	*/
				483	static struct region rh_recovery_start(struct region_hash rh)
				484	{
				485	struct region *reg = NULL;
				486
				487	spin_lock_irq(&rh->region_lock);
				488	if (!list_empty(&rh->quiesced_regions)) {
				489	reg = list_entry(rh->quiesced_regions.next,
				490	struct region, list);
				491	list_del_init(&reg->list); /* remove from the quiesced list */
				492	}
				493	spin_unlock_irq(&rh->region_lock);
				494
				495	return reg;
				496	}
				497
				498	/* FIXME: success ignored for now */
				499	static void rh_recovery_end(struct region *reg, int success)
				500	{
				501	struct region_hash *rh = reg->rh;
				502
				503	spin_lock_irq(&rh->region_lock);
				504	list_add(&reg->list, &reg->rh->recovered_regions);
				505	spin_unlock_irq(&rh->region_lock);
				506
				507	wake();
				508	}
				509
				510	static void rh_flush(struct region_hash *rh)
				511	{
				512	rh->log->type->flush(rh->log);
				513	}
				514
				515	static void rh_delay(struct region_hash rh, struct bio bio)
				516	{
				517	struct region *reg;
				518
				519	read_lock(&rh->hash_lock);
				520	reg = __rh_find(rh, bio_to_region(rh, bio));
				521	bio_list_add(&reg->delayed_bios, bio);
				522	read_unlock(&rh->hash_lock);
				523	}
				524
				525	static void rh_stop_recovery(struct region_hash *rh)
				526	{
				527	int i;
				528
				529	/* wait for any recovering regions */
				530	for (i = 0; i < MAX_RECOVERY; i++)
				531	down(&rh->recovery_count);
				532	}
				533
				534	static void rh_start_recovery(struct region_hash *rh)
				535	{
				536	int i;
				537
				538	for (i = 0; i < MAX_RECOVERY; i++)
				539	up(&rh->recovery_count);
				540
				541	wake();
				542	}
				543
/*-----------------------------------------------------------------
 * Mirror set structures.
 *---------------------------------------------------------------*/
				547	struct mirror {
				548	atomic_t error_count;
				549	struct dm_dev *dev;
				550	sector_t offset;
				551	};
				552
				553	struct mirror_set {
				554	struct dm_target *ti;
				555	struct list_head list;
				556	struct region_hash rh;
				557	struct kcopyd_client *kcopyd_client;
				558
				559	spinlock_t lock; /* protects the next two lists */
				560	struct bio_list reads;
				561	struct bio_list writes;
				562
				563	/* recovery */
				564	region_t nr_regions;
				565	int in_sync;
				566
				567	unsigned int nr_mirrors;
				568	struct mirror mirror[0];
				569	};
				570
				571	/*
				572	* Every mirror should look like this one.
				573	*/
				574	#define DEFAULT_MIRROR 0
				575
				576	/*
				577	* This is yucky. We squirrel the mirror_set struct away inside
				578	* bi_next for write buffers. This is safe since the bh
				579	* doesn't get submitted to the lower levels of block layer.
				580	*/
				581	static struct mirror_set bio_get_ms(struct bio bio)
				582	{
				583	return (struct mirror_set *) bio->bi_next;
				584	}
				585
				586	static void bio_set_ms(struct bio bio, struct mirror_set ms)
				587	{
				588	bio->bi_next = (struct bio *) ms;
				589	}
				590
/*-----------------------------------------------------------------
 * Recovery.
 *
 * When a mirror is first activated we may find that some regions
 * are in the no-sync state.  We have to recover these by
 * recopying from the default mirror to all the others.
 *---------------------------------------------------------------*/
				598	static void recovery_complete(int read_err, unsigned int write_err,
				599	void *context)
				600	{
				601	struct region reg = (struct region ) context;
				602
				603	/* FIXME: better error handling */
				604	rh_recovery_end(reg, read_err \|\| write_err);
				605	}
				606
				607	static int recover(struct mirror_set ms, struct region reg)
				608	{
				609	int r;
				610	unsigned int i;
				611	struct io_region from, to[KCOPYD_MAX_REGIONS], *dest;
				612	struct mirror *m;
				613	unsigned long flags = 0;
				614
				615	/* fill in the source */
				616	m = ms->mirror + DEFAULT_MIRROR;
				617	from.bdev = m->dev->bdev;
				618	from.sector = m->offset + region_to_sector(reg->rh, reg->key);
				619	if (reg->key == (ms->nr_regions - 1)) {
				620	/*
				621	* The final region may be smaller than
				622	* region_size.
				623	*/
				624	from.count = ms->ti->len & (reg->rh->region_size - 1);
				625	if (!from.count)
				626	from.count = reg->rh->region_size;
				627	} else
				628	from.count = reg->rh->region_size;
				629
				630	/* fill in the destinations */
				631	for (i = 0, dest = to; i < ms->nr_mirrors; i++) {
				632	if (i == DEFAULT_MIRROR)
				633	continue;
				634
				635	m = ms->mirror + i;
				636	dest->bdev = m->dev->bdev;
				637	dest->sector = m->offset + region_to_sector(reg->rh, reg->key);
				638	dest->count = from.count;
				639	dest++;
				640	}
				641
				642	/* hand to kcopyd */
				643	set_bit(KCOPYD_IGNORE_ERROR, &flags);
				644	r = kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, flags,
				645	recovery_complete, reg);
				646
				647	return r;
				648	}
				649
				650	static void do_recovery(struct mirror_set *ms)
				651	{
				652	int r;
				653	struct region *reg;
				654	struct dirty_log *log = ms->rh.log;
				655
				656	/*
				657	* Start quiescing some regions.
				658	*/
				659	rh_recovery_prepare(&ms->rh);
				660
				661	/*
				662	* Copy any already quiesced regions.
				663	*/
				664	while ((reg = rh_recovery_start(&ms->rh))) {
				665	r = recover(ms, reg);
				666	if (r)
				667	rh_recovery_end(reg, 0);
				668	}
				669
				670	/*
				671	* Update the in sync flag.
				672	*/
				673	if (!ms->in_sync &&
				674	(log->type->get_sync_count(log) == ms->nr_regions)) {
				675	/* the sync is complete */
				676	dm_table_event(ms->ti->table);
				677	ms->in_sync = 1;
				678	}
				679	}
				680
/*-----------------------------------------------------------------
 * Reads
 *---------------------------------------------------------------*/
				684	static struct mirror choose_mirror(struct mirror_set ms, sector_t sector)
				685	{
				686	/* FIXME: add read balancing */
				687	return ms->mirror + DEFAULT_MIRROR;
				688	}
				689
				690	/*
				691	* remap a buffer to a particular mirror.
				692	*/
				693	static void map_bio(struct mirror_set ms, struct mirror m, struct bio *bio)
				694	{
				695	bio->bi_bdev = m->dev->bdev;
				696	bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin);
				697	}
				698
				699	static void do_reads(struct mirror_set ms, struct bio_list reads)
				700	{
				701	region_t region;
				702	struct bio *bio;
				703	struct mirror *m;
				704
				705	while ((bio = bio_list_pop(reads))) {
				706	region = bio_to_region(&ms->rh, bio);
				707
				708	/*
				709	* We can only read balance if the region is in sync.
				710	*/
				711	if (rh_in_sync(&ms->rh, region, 0))
				712	m = choose_mirror(ms, bio->bi_sector);
				713	else
				714	m = ms->mirror + DEFAULT_MIRROR;
				715
				716	map_bio(ms, m, bio);
				717	generic_make_request(bio);
				718	}
				719	}
				720
/*-----------------------------------------------------------------
 * Writes.
 *
 * We do different things with the write io depending on the
 * state of the region that it's in:
 *
 * SYNC:	increment pending, use kcopyd to write to *all* mirrors
 * RECOVERING:	delay the io until recovery completes
 * NOSYNC:	increment pending, just write to the default mirror
 *---------------------------------------------------------------*/
				731	static void write_callback(unsigned long error, void *context)
				732	{
				733	unsigned int i;
				734	int uptodate = 1;
				735	struct bio bio = (struct bio ) context;
				736	struct mirror_set *ms;
				737
				738	ms = bio_get_ms(bio);
				739	bio_set_ms(bio, NULL);
				740
				741	/*
				742	* NOTE: We don't decrement the pending count here,
				743	* instead it is done by the targets endio function.
				744	* This way we handle both writes to SYNC and NOSYNC
				745	* regions with the same code.
				746	*/
				747
				748	if (error) {
				749	/*
				750	* only error the io if all mirrors failed.
				751	* FIXME: bogus
				752	*/
				753	uptodate = 0;
				754	for (i = 0; i < ms->nr_mirrors; i++)
				755	if (!test_bit(i, &error)) {
				756	uptodate = 1;
				757	break;
				758	}
				759	}
				760	bio_endio(bio, bio->bi_size, 0);
				761	}
				762
				763	static void do_write(struct mirror_set ms, struct bio bio)
				764	{
				765	unsigned int i;
				766	struct io_region io[KCOPYD_MAX_REGIONS+1];
				767	struct mirror *m;
				768
				769	for (i = 0; i < ms->nr_mirrors; i++) {
				770	m = ms->mirror + i;
				771
				772	io[i].bdev = m->dev->bdev;
				773	io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin);
				774	io[i].count = bio->bi_size >> 9;
				775	}
				776
				777	bio_set_ms(bio, ms);
				778	dm_io_async_bvec(ms->nr_mirrors, io, WRITE,
				779	bio->bi_io_vec + bio->bi_idx,
				780	write_callback, bio);
				781	}
				782
				783	static void do_writes(struct mirror_set ms, struct bio_list writes)
				784	{
				785	int state;
				786	struct bio *bio;
				787	struct bio_list sync, nosync, recover, *this_list = NULL;
				788
				789	if (!writes->head)
				790	return;
				791
				792	/*
				793	* Classify each write.
				794	*/
				795	bio_list_init(&sync);
				796	bio_list_init(&nosync);
				797	bio_list_init(&recover);
				798
				799	while ((bio = bio_list_pop(writes))) {
				800	state = rh_state(&ms->rh, bio_to_region(&ms->rh, bio), 1);
				801	switch (state) {
				802	case RH_CLEAN:
				803	case RH_DIRTY:
				804	this_list = &sync;
				805	break;
				806
				807	case RH_NOSYNC:
				808	this_list = &nosync;
				809	break;
				810
				811	case RH_RECOVERING:
				812	this_list = &recover;
				813	break;
				814	}
				815
				816	bio_list_add(this_list, bio);
				817	}
				818
				819	/*
				820	* Increment the pending counts for any regions that will
				821	* be written to (writes to recover regions are going to
				822	* be delayed).
				823	*/
				824	rh_inc_pending(&ms->rh, &sync);
				825	rh_inc_pending(&ms->rh, &nosync);
				826	rh_flush(&ms->rh);
				827
				828	/*
				829	* Dispatch io.
				830	*/
				831	while ((bio = bio_list_pop(&sync)))
				832	do_write(ms, bio);
				833
				834	while ((bio = bio_list_pop(&recover)))
				835	rh_delay(&ms->rh, bio);
				836
				837	while ((bio = bio_list_pop(&nosync))) {
				838	map_bio(ms, ms->mirror + DEFAULT_MIRROR, bio);
				839	generic_make_request(bio);
				840	}
				841	}
				842
/*-----------------------------------------------------------------
 * kmirrord
 *---------------------------------------------------------------*/
				846	static LIST_HEAD(_mirror_sets);
				847	static DECLARE_RWSEM(_mirror_sets_lock);
				848
				849	static void do_mirror(struct mirror_set *ms)
				850	{
				851	struct bio_list reads, writes;
				852
				853	spin_lock(&ms->lock);
				854	reads = ms->reads;
				855	writes = ms->writes;
				856	bio_list_init(&ms->reads);
				857	bio_list_init(&ms->writes);
				858	spin_unlock(&ms->lock);
				859
				860	rh_update_states(&ms->rh);
				861	do_recovery(ms);
				862	do_reads(ms, &reads);
				863	do_writes(ms, &writes);
				864	}
				865
				866	static void do_work(void *ignored)
				867	{
				868	struct mirror_set *ms;
				869
				870	down_read(&_mirror_sets_lock);
				871	list_for_each_entry (ms, &_mirror_sets, list)
				872	do_mirror(ms);
				873	up_read(&_mirror_sets_lock);
				874	}
				875
/*-----------------------------------------------------------------
 * Target functions
 *---------------------------------------------------------------*/
				879	static struct mirror_set *alloc_context(unsigned int nr_mirrors,
				880	uint32_t region_size,
				881	struct dm_target *ti,
				882	struct dirty_log *dl)
				883	{
				884	size_t len;
				885	struct mirror_set *ms = NULL;
				886
				887	if (array_too_big(sizeof(*ms), sizeof(ms->mirror[0]), nr_mirrors))
				888	return NULL;
				889
				890	len = sizeof(ms) + (sizeof(ms->mirror[0]) nr_mirrors);
				891
				892	ms = kmalloc(len, GFP_KERNEL);
				893	if (!ms) {
				894	ti->error = "dm-mirror: Cannot allocate mirror context";
				895	return NULL;
				896	}
				897
				898	memset(ms, 0, len);
				899	spin_lock_init(&ms->lock);
				900
				901	ms->ti = ti;
				902	ms->nr_mirrors = nr_mirrors;
				903	ms->nr_regions = dm_sector_div_up(ti->len, region_size);
				904	ms->in_sync = 0;
				905
				906	if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
				907	ti->error = "dm-mirror: Error creating dirty region hash";
				908	kfree(ms);
				909	return NULL;
				910	}
				911
				912	return ms;
				913	}
				914
				915	static void free_context(struct mirror_set ms, struct dm_target ti,
				916	unsigned int m)
				917	{
				918	while (m--)
				919	dm_put_device(ti, ms->mirror[m].dev);
				920
				921	rh_exit(&ms->rh);
				922	kfree(ms);
				923	}
				924
				925	static inline int _check_region_size(struct dm_target *ti, uint32_t size)
				926	{
				927	return !(size % (PAGE_SIZE >> 9) \|\| (size & (size - 1)) \|\|
				928	size > ti->len);
				929	}
				930
				931	static int get_mirror(struct mirror_set ms, struct dm_target ti,
				932	unsigned int mirror, char **argv)
				933	{
				934	sector_t offset;
				935
				936	if (sscanf(argv[1], SECTOR_FORMAT, &offset) != 1) {
				937	ti->error = "dm-mirror: Invalid offset";
				938	return -EINVAL;
				939	}
				940
				941	if (dm_get_device(ti, argv[0], offset, ti->len,
				942	dm_table_get_mode(ti->table),
				943	&ms->mirror[mirror].dev)) {
				944	ti->error = "dm-mirror: Device lookup failure";
				945	return -ENXIO;
				946	}
				947
				948	ms->mirror[mirror].offset = offset;
				949
				950	return 0;
				951	}
				952
				953	static int add_mirror_set(struct mirror_set *ms)
				954	{
				955	down_write(&_mirror_sets_lock);
				956	list_add_tail(&ms->list, &_mirror_sets);
				957	up_write(&_mirror_sets_lock);
				958	wake();
				959
				960	return 0;
				961	}
				962
				963	static void del_mirror_set(struct mirror_set *ms)
				964	{
				965	down_write(&_mirror_sets_lock);
				966	list_del(&ms->list);
				967	up_write(&_mirror_sets_lock);
				968	}
				969
				970	/*
				971	* Create dirty log: log_type #log_params <log_params>
				972	*/
				973	static struct dirty_log create_dirty_log(struct dm_target ti,
				974	unsigned int argc, char **argv,
				975	unsigned int *args_used)
				976	{
				977	unsigned int param_count;
				978	struct dirty_log *dl;
				979
				980	if (argc < 2) {
				981	ti->error = "dm-mirror: Insufficient mirror log arguments";
				982	return NULL;
				983	}
				984
				985	if (sscanf(argv[1], "%u", &param_count) != 1) {
				986	ti->error = "dm-mirror: Invalid mirror log argument count";
				987	return NULL;
				988	}
				989
				990	*args_used = 2 + param_count;
				991
				992	if (argc < *args_used) {
				993	ti->error = "dm-mirror: Insufficient mirror log arguments";
				994	return NULL;
				995	}
				996
				997	dl = dm_create_dirty_log(argv[0], ti, param_count, argv + 2);
				998	if (!dl) {
				999	ti->error = "dm-mirror: Error creating mirror dirty log";
				1000	return NULL;
				1001	}
				1002
				1003	if (!_check_region_size(ti, dl->type->get_region_size(dl))) {
				1004	ti->error = "dm-mirror: Invalid region size";
				1005	dm_destroy_dirty_log(dl);
				1006	return NULL;
				1007	}
				1008
				1009	return dl;
				1010	}
				1011
				1012	/*
				1013	* Construct a mirror mapping:
				1014	*
				1015	* log_type #log_params <log_params>
				1016	* #mirrors [mirror_path offset]{2,}
				1017	*
				1018	* log_type is "core" or "disk"
				1019	* #log_params is between 1 and 3
				1020	*/
				1021	#define DM_IO_PAGES 64
				1022	static int mirror_ctr(struct dm_target ti, unsigned int argc, char *argv)
				1023	{
				1024	int r;
				1025	unsigned int nr_mirrors, m, args_used;
				1026	struct mirror_set *ms;
				1027	struct dirty_log *dl;
				1028
				1029	dl = create_dirty_log(ti, argc, argv, &args_used);
				1030	if (!dl)
				1031	return -EINVAL;
				1032
				1033	argv += args_used;
				1034	argc -= args_used;
				1035
				1036	if (!argc \|\| sscanf(argv[0], "%u", &nr_mirrors) != 1 \|\|
				1037	nr_mirrors < 2 \|\| nr_mirrors > KCOPYD_MAX_REGIONS + 1) {
				1038	ti->error = "dm-mirror: Invalid number of mirrors";
				1039	dm_destroy_dirty_log(dl);
				1040	return -EINVAL;
				1041	}
				1042
				1043	argv++, argc--;
				1044
				1045	if (argc != nr_mirrors * 2) {
				1046	ti->error = "dm-mirror: Wrong number of mirror arguments";
				1047	dm_destroy_dirty_log(dl);
				1048	return -EINVAL;
				1049	}
				1050
				1051	ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl);
				1052	if (!ms) {
				1053	dm_destroy_dirty_log(dl);
				1054	return -ENOMEM;
				1055	}
				1056
				1057	/* Get the mirror parameter sets */
				1058	for (m = 0; m < nr_mirrors; m++) {
				1059	r = get_mirror(ms, ti, m, argv);
				1060	if (r) {
				1061	free_context(ms, ti, m);
				1062	return r;
				1063	}
				1064	argv += 2;
				1065	argc -= 2;
				1066	}
				1067
				1068	ti->private = ms;
Alasdair G Kergon	d88854f	2005-07-07 17:59:34 -0700	[diff] [blame]	1069	ti->split_io = ms->rh.region_size;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1070
				1071	r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client);
				1072	if (r) {
				1073	free_context(ms, ti, ms->nr_mirrors);
				1074	return r;
				1075	}
				1076
				1077	add_mirror_set(ms);
				1078	return 0;
				1079	}
				1080
				1081	static void mirror_dtr(struct dm_target *ti)
				1082	{
				1083	struct mirror_set ms = (struct mirror_set ) ti->private;
				1084
				1085	del_mirror_set(ms);
				1086	kcopyd_client_destroy(ms->kcopyd_client);
				1087	free_context(ms, ti, ms->nr_mirrors);
				1088	}
				1089
				1090	static void queue_bio(struct mirror_set ms, struct bio bio, int rw)
				1091	{
				1092	int should_wake = 0;
				1093	struct bio_list *bl;
				1094
				1095	bl = (rw == WRITE) ? &ms->writes : &ms->reads;
				1096	spin_lock(&ms->lock);
				1097	should_wake = !(bl->head);
				1098	bio_list_add(bl, bio);
				1099	spin_unlock(&ms->lock);
				1100
				1101	if (should_wake)
				1102	wake();
				1103	}
				1104
				1105	/*
				1106	* Mirror mapping function
				1107	*/
				1108	static int mirror_map(struct dm_target ti, struct bio bio,
				1109	union map_info *map_context)
				1110	{
				1111	int r, rw = bio_rw(bio);
				1112	struct mirror *m;
				1113	struct mirror_set *ms = ti->private;
				1114
				1115	map_context->ll = bio->bi_sector >> ms->rh.region_shift;
				1116
				1117	if (rw == WRITE) {
				1118	queue_bio(ms, bio, rw);
				1119	return 0;
				1120	}
				1121
				1122	r = ms->rh.log->type->in_sync(ms->rh.log,
				1123	bio_to_region(&ms->rh, bio), 0);
				1124	if (r < 0 && r != -EWOULDBLOCK)
				1125	return r;
				1126
				1127	if (r == -EWOULDBLOCK) /* FIXME: ugly */
				1128	r = 0;
				1129
				1130	/*
				1131	* We don't want to fast track a recovery just for a read
				1132	* ahead. So we just let it silently fail.
				1133	* FIXME: get rid of this.
				1134	*/
				1135	if (!r && rw == READA)
				1136	return -EIO;
				1137
				1138	if (!r) {
				1139	/* Pass this io over to the daemon */
				1140	queue_bio(ms, bio, rw);
				1141	return 0;
				1142	}
				1143
				1144	m = choose_mirror(ms, bio->bi_sector);
				1145	if (!m)
				1146	return -EIO;
				1147
				1148	map_bio(ms, m, bio);
				1149	return 1;
				1150	}
				1151
				1152	static int mirror_end_io(struct dm_target ti, struct bio bio,
				1153	int error, union map_info *map_context)
				1154	{
				1155	int rw = bio_rw(bio);
				1156	struct mirror_set ms = (struct mirror_set ) ti->private;
				1157	region_t region = map_context->ll;
				1158
				1159	/*
				1160	* We need to dec pending if this was a write.
				1161	*/
				1162	if (rw == WRITE)
				1163	rh_dec(&ms->rh, region);
				1164
				1165	return 0;
				1166	}
				1167
				1168	static void mirror_postsuspend(struct dm_target *ti)
				1169	{
				1170	struct mirror_set ms = (struct mirror_set ) ti->private;
				1171	struct dirty_log *log = ms->rh.log;
				1172
				1173	rh_stop_recovery(&ms->rh);
				1174	if (log->type->suspend && log->type->suspend(log))
				1175	/* FIXME: need better error handling */
				1176	DMWARN("log suspend failed");
				1177	}
				1178
				1179	static void mirror_resume(struct dm_target *ti)
				1180	{
				1181	struct mirror_set ms = (struct mirror_set ) ti->private;
				1182	struct dirty_log *log = ms->rh.log;
				1183	if (log->type->resume && log->type->resume(log))
				1184	/* FIXME: need better error handling */
				1185	DMWARN("log resume failed");
				1186	rh_start_recovery(&ms->rh);
				1187	}
				1188
				1189	static int mirror_status(struct dm_target *ti, status_type_t type,
				1190	char *result, unsigned int maxlen)
				1191	{
				1192	unsigned int m, sz;
				1193	struct mirror_set ms = (struct mirror_set ) ti->private;
				1194
				1195	sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen);
				1196
				1197	switch (type) {
				1198	case STATUSTYPE_INFO:
				1199	DMEMIT("%d ", ms->nr_mirrors);
				1200	for (m = 0; m < ms->nr_mirrors; m++)
				1201	DMEMIT("%s ", ms->mirror[m].dev->name);
				1202
				1203	DMEMIT(SECTOR_FORMAT "/" SECTOR_FORMAT,
				1204	ms->rh.log->type->get_sync_count(ms->rh.log),
				1205	ms->nr_regions);
				1206	break;
				1207
				1208	case STATUSTYPE_TABLE:
				1209	DMEMIT("%d ", ms->nr_mirrors);
				1210	for (m = 0; m < ms->nr_mirrors; m++)
				1211	DMEMIT("%s " SECTOR_FORMAT " ",
				1212	ms->mirror[m].dev->name, ms->mirror[m].offset);
				1213	}
				1214
				1215	return 0;
				1216	}
				1217
				1218	static struct target_type mirror_target = {
				1219	.name = "mirror",
				1220	.version = {1, 0, 1},
				1221	.module = THIS_MODULE,
				1222	.ctr = mirror_ctr,
				1223	.dtr = mirror_dtr,
				1224	.map = mirror_map,
				1225	.end_io = mirror_end_io,
				1226	.postsuspend = mirror_postsuspend,
				1227	.resume = mirror_resume,
				1228	.status = mirror_status,
				1229	};
				1230
				1231	static int __init dm_mirror_init(void)
				1232	{
				1233	int r;
				1234
				1235	r = dm_dirty_log_init();
				1236	if (r)
				1237	return r;
				1238
Alasdair G Kergon	48f1f53	2005-08-04 12:53:37 -0700	[diff] [blame]	1239	_kmirrord_wq = create_singlethread_workqueue("kmirrord");
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1240	if (!_kmirrord_wq) {
				1241	DMERR("couldn't start kmirrord");
				1242	dm_dirty_log_exit();
				1243	return r;
				1244	}
				1245	INIT_WORK(&_kmirrord_work, do_work, NULL);
				1246
				1247	r = dm_register_target(&mirror_target);
				1248	if (r < 0) {
				1249	DMERR("%s: Failed to register mirror target",
				1250	mirror_target.name);
				1251	dm_dirty_log_exit();
				1252	destroy_workqueue(_kmirrord_wq);
				1253	}
				1254
				1255	return r;
				1256	}
				1257
				1258	static void __exit dm_mirror_exit(void)
				1259	{
				1260	int r;
				1261
				1262	r = dm_unregister_target(&mirror_target);
				1263	if (r < 0)
				1264	DMERR("%s: unregister failed %d", mirror_target.name, r);
				1265
				1266	destroy_workqueue(_kmirrord_wq);
				1267	dm_dirty_log_exit();
				1268	}
				1269
				1270	/* Module hooks */
				1271	module_init(dm_mirror_init);
				1272	module_exit(dm_mirror_exit);
				1273
				1274	MODULE_DESCRIPTION(DM_NAME " mirror target");
				1275	MODULE_AUTHOR("Joe Thornber");
				1276	MODULE_LICENSE("GPL");