Blame - drivers/md/dm-raid1.c - fp2-dev/kernel/msm

blob: 12031c9d3f1e1394379ee211ae49bf61ff093949 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* Copyright (C) 2003 Sistina Software Limited.
				3	*
				4	* This file is released under the GPL.
				5	*/
				6
				7	#include "dm.h"
				8	#include "dm-bio-list.h"
				9	#include "dm-io.h"
				10	#include "dm-log.h"
				11	#include "kcopyd.h"
				12
				13	#include <linux/ctype.h>
				14	#include <linux/init.h>
				15	#include <linux/mempool.h>
				16	#include <linux/module.h>
				17	#include <linux/pagemap.h>
				18	#include <linux/slab.h>
				19	#include <linux/time.h>
				20	#include <linux/vmalloc.h>
				21	#include <linux/workqueue.h>
				22
				23	static struct workqueue_struct *_kmirrord_wq;
				24	static struct work_struct _kmirrord_work;
				25
				26	static inline void wake(void)
				27	{
				28	queue_work(_kmirrord_wq, &_kmirrord_work);
				29	}
				30
/*-----------------------------------------------------------------
 * Region hash
 *
 * The mirror splits itself up into discrete regions.  Each
 * region can be in one of three states: clean, dirty,
 * nosync.  There is no need to put clean regions in the hash.
 *
 * In addition to being present in the hash table a region _may_
 * be present on one of three lists.
 *
 *   clean_regions: Regions on this list have no io pending to
 *   them, they are in sync, we are no longer interested in them,
 *   they are dull.  rh_update_states() will remove them from the
 *   hash table.
 *
 *   quiesced_regions: These regions have been spun down, ready
 *   for recovery.  rh_recovery_start() will remove regions from
 *   this list and hand them to kmirrord, which will schedule the
 *   recovery io with kcopyd.
 *
 *   recovered_regions: Regions that kcopyd has successfully
 *   recovered.  rh_update_states() will now schedule any delayed
 *   io, up the recovery_count, and remove the region from the
 *   hash.
 *
 * There are 2 locks:
 *   A rw spin lock 'hash_lock' protects just the hash table,
 *   this is never held in write mode from interrupt context,
 *   which I believe means that we only have to disable irqs when
 *   doing a write lock.
 *
 *   An ordinary spin lock 'region_lock' that protects the three
 *   lists in the region_hash, with the 'state', 'list' and
 *   'delayed_bios' fields of the regions.  This is used from irq
 *   context, so all other uses will have to suspend local irqs.
 *---------------------------------------------------------------*/
				67	struct mirror_set;
				68	struct region_hash {
				69	struct mirror_set *ms;
				70	uint32_t region_size;
				71	unsigned region_shift;
				72
				73	/* holds persistent region state */
				74	struct dirty_log *log;
				75
				76	/* hash table */
				77	rwlock_t hash_lock;
				78	mempool_t *region_pool;
				79	unsigned int mask;
				80	unsigned int nr_buckets;
				81	struct list_head *buckets;
				82
				83	spinlock_t region_lock;
				84	struct semaphore recovery_count;
				85	struct list_head clean_regions;
				86	struct list_head quiesced_regions;
				87	struct list_head recovered_regions;
				88	};
				89
				90	enum {
				91	RH_CLEAN,
				92	RH_DIRTY,
				93	RH_NOSYNC,
				94	RH_RECOVERING
				95	};
				96
				97	struct region {
				98	struct region_hash rh; / FIXME: can we get rid of this ? */
				99	region_t key;
				100	int state;
				101
				102	struct list_head hash_list;
				103	struct list_head list;
				104
				105	atomic_t pending;
				106	struct bio_list delayed_bios;
				107	};
				108
				109	/*
				110	* Conversion fns
				111	*/
				112	static inline region_t bio_to_region(struct region_hash rh, struct bio bio)
				113	{
				114	return bio->bi_sector >> rh->region_shift;
				115	}
				116
				117	static inline sector_t region_to_sector(struct region_hash *rh, region_t region)
				118	{
				119	return region << rh->region_shift;
				120	}
				121
				122	/* FIXME move this */
				123	static void queue_bio(struct mirror_set ms, struct bio bio, int rw);
				124
				125	static void region_alloc(unsigned int __nocast gfp_mask, void pool_data)
				126	{
				127	return kmalloc(sizeof(struct region), gfp_mask);
				128	}
				129
				130	static void region_free(void element, void pool_data)
				131	{
				132	kfree(element);
				133	}
				134
				135	#define MIN_REGIONS 64
				136	#define MAX_RECOVERY 1
				137	static int rh_init(struct region_hash rh, struct mirror_set ms,
				138	struct dirty_log *log, uint32_t region_size,
				139	region_t nr_regions)
				140	{
				141	unsigned int nr_buckets, max_buckets;
				142	size_t i;
				143
				144	/*
				145	* Calculate a suitable number of buckets for our hash
				146	* table.
				147	*/
				148	max_buckets = nr_regions >> 6;
				149	for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
				150	;
				151	nr_buckets >>= 1;
				152
				153	rh->ms = ms;
				154	rh->log = log;
				155	rh->region_size = region_size;
				156	rh->region_shift = ffs(region_size) - 1;
				157	rwlock_init(&rh->hash_lock);
				158	rh->mask = nr_buckets - 1;
				159	rh->nr_buckets = nr_buckets;
				160
				161	rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
				162	if (!rh->buckets) {
				163	DMERR("unable to allocate region hash memory");
				164	return -ENOMEM;
				165	}
				166
				167	for (i = 0; i < nr_buckets; i++)
				168	INIT_LIST_HEAD(rh->buckets + i);
				169
				170	spin_lock_init(&rh->region_lock);
				171	sema_init(&rh->recovery_count, 0);
				172	INIT_LIST_HEAD(&rh->clean_regions);
				173	INIT_LIST_HEAD(&rh->quiesced_regions);
				174	INIT_LIST_HEAD(&rh->recovered_regions);
				175
				176	rh->region_pool = mempool_create(MIN_REGIONS, region_alloc,
				177	region_free, NULL);
				178	if (!rh->region_pool) {
				179	vfree(rh->buckets);
				180	rh->buckets = NULL;
				181	return -ENOMEM;
				182	}
				183
				184	return 0;
				185	}
				186
				187	static void rh_exit(struct region_hash *rh)
				188	{
				189	unsigned int h;
				190	struct region reg, nreg;
				191
				192	BUG_ON(!list_empty(&rh->quiesced_regions));
				193	for (h = 0; h < rh->nr_buckets; h++) {
				194	list_for_each_entry_safe(reg, nreg, rh->buckets + h, hash_list) {
				195	BUG_ON(atomic_read(&reg->pending));
				196	mempool_free(reg, rh->region_pool);
				197	}
				198	}
				199
				200	if (rh->log)
				201	dm_destroy_dirty_log(rh->log);
				202	if (rh->region_pool)
				203	mempool_destroy(rh->region_pool);
				204	vfree(rh->buckets);
				205	}
				206
				207	#define RH_HASH_MULT 2654435387U
				208
				209	static inline unsigned int rh_hash(struct region_hash *rh, region_t region)
				210	{
				211	return (unsigned int) ((region * RH_HASH_MULT) >> 12) & rh->mask;
				212	}
				213
				214	static struct region __rh_lookup(struct region_hash rh, region_t region)
				215	{
				216	struct region *reg;
				217
				218	list_for_each_entry (reg, rh->buckets + rh_hash(rh, region), hash_list)
				219	if (reg->key == region)
				220	return reg;
				221
				222	return NULL;
				223	}
				224
				225	static void __rh_insert(struct region_hash rh, struct region reg)
				226	{
				227	unsigned int h = rh_hash(rh, reg->key);
				228	list_add(&reg->hash_list, rh->buckets + h);
				229	}
				230
				231	static struct region __rh_alloc(struct region_hash rh, region_t region)
				232	{
				233	struct region reg, nreg;
				234
				235	read_unlock(&rh->hash_lock);
				236	nreg = mempool_alloc(rh->region_pool, GFP_NOIO);
				237	nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
				238	RH_CLEAN : RH_NOSYNC;
				239	nreg->rh = rh;
				240	nreg->key = region;
				241
				242	INIT_LIST_HEAD(&nreg->list);
				243
				244	atomic_set(&nreg->pending, 0);
				245	bio_list_init(&nreg->delayed_bios);
				246	write_lock_irq(&rh->hash_lock);
				247
				248	reg = __rh_lookup(rh, region);
				249	if (reg)
				250	/* we lost the race */
				251	mempool_free(nreg, rh->region_pool);
				252
				253	else {
				254	__rh_insert(rh, nreg);
				255	if (nreg->state == RH_CLEAN) {
				256	spin_lock(&rh->region_lock);
				257	list_add(&nreg->list, &rh->clean_regions);
				258	spin_unlock(&rh->region_lock);
				259	}
				260	reg = nreg;
				261	}
				262	write_unlock_irq(&rh->hash_lock);
				263	read_lock(&rh->hash_lock);
				264
				265	return reg;
				266	}
				267
				268	static inline struct region __rh_find(struct region_hash rh, region_t region)
				269	{
				270	struct region *reg;
				271
				272	reg = __rh_lookup(rh, region);
				273	if (!reg)
				274	reg = __rh_alloc(rh, region);
				275
				276	return reg;
				277	}
				278
				279	static int rh_state(struct region_hash *rh, region_t region, int may_block)
				280	{
				281	int r;
				282	struct region *reg;
				283
				284	read_lock(&rh->hash_lock);
				285	reg = __rh_lookup(rh, region);
				286	read_unlock(&rh->hash_lock);
				287
				288	if (reg)
				289	return reg->state;
				290
				291	/*
				292	* The region wasn't in the hash, so we fall back to the
				293	* dirty log.
				294	*/
				295	r = rh->log->type->in_sync(rh->log, region, may_block);
				296
				297	/*
				298	* Any error from the dirty log (eg. -EWOULDBLOCK) gets
				299	* taken as a RH_NOSYNC
				300	*/
				301	return r == 1 ? RH_CLEAN : RH_NOSYNC;
				302	}
				303
				304	static inline int rh_in_sync(struct region_hash *rh,
				305	region_t region, int may_block)
				306	{
				307	int state = rh_state(rh, region, may_block);
				308	return state == RH_CLEAN \|\| state == RH_DIRTY;
				309	}
				310
				311	static void dispatch_bios(struct mirror_set ms, struct bio_list bio_list)
				312	{
				313	struct bio *bio;
				314
				315	while ((bio = bio_list_pop(bio_list))) {
				316	queue_bio(ms, bio, WRITE);
				317	}
				318	}
				319
				320	static void rh_update_states(struct region_hash *rh)
				321	{
				322	struct region reg, next;
				323
				324	LIST_HEAD(clean);
				325	LIST_HEAD(recovered);
				326
				327	/*
				328	* Quickly grab the lists.
				329	*/
				330	write_lock_irq(&rh->hash_lock);
				331	spin_lock(&rh->region_lock);
				332	if (!list_empty(&rh->clean_regions)) {
				333	list_splice(&rh->clean_regions, &clean);
				334	INIT_LIST_HEAD(&rh->clean_regions);
				335
				336	list_for_each_entry (reg, &clean, list) {
				337	rh->log->type->clear_region(rh->log, reg->key);
				338	list_del(&reg->hash_list);
				339	}
				340	}
				341
				342	if (!list_empty(&rh->recovered_regions)) {
				343	list_splice(&rh->recovered_regions, &recovered);
				344	INIT_LIST_HEAD(&rh->recovered_regions);
				345
				346	list_for_each_entry (reg, &recovered, list)
				347	list_del(&reg->hash_list);
				348	}
				349	spin_unlock(&rh->region_lock);
				350	write_unlock_irq(&rh->hash_lock);
				351
				352	/*
				353	* All the regions on the recovered and clean lists have
				354	* now been pulled out of the system, so no need to do
				355	* any more locking.
				356	*/
				357	list_for_each_entry_safe (reg, next, &recovered, list) {
				358	rh->log->type->clear_region(rh->log, reg->key);
				359	rh->log->type->complete_resync_work(rh->log, reg->key, 1);
				360	dispatch_bios(rh->ms, &reg->delayed_bios);
				361	up(&rh->recovery_count);
				362	mempool_free(reg, rh->region_pool);
				363	}
				364
				365	if (!list_empty(&recovered))
				366	rh->log->type->flush(rh->log);
				367
				368	list_for_each_entry_safe (reg, next, &clean, list)
				369	mempool_free(reg, rh->region_pool);
				370	}
				371
				372	static void rh_inc(struct region_hash *rh, region_t region)
				373	{
				374	struct region *reg;
				375
				376	read_lock(&rh->hash_lock);
				377	reg = __rh_find(rh, region);
				378	if (reg->state == RH_CLEAN) {
				379	rh->log->type->mark_region(rh->log, reg->key);
				380
				381	spin_lock_irq(&rh->region_lock);
				382	reg->state = RH_DIRTY;
				383	list_del_init(&reg->list); /* take off the clean list */
				384	spin_unlock_irq(&rh->region_lock);
				385	}
				386
				387	atomic_inc(&reg->pending);
				388	read_unlock(&rh->hash_lock);
				389	}
				390
				391	static void rh_inc_pending(struct region_hash rh, struct bio_list bios)
				392	{
				393	struct bio *bio;
				394
				395	for (bio = bios->head; bio; bio = bio->bi_next)
				396	rh_inc(rh, bio_to_region(rh, bio));
				397	}
				398
				399	static void rh_dec(struct region_hash *rh, region_t region)
				400	{
				401	unsigned long flags;
				402	struct region *reg;
				403	int should_wake = 0;
				404
				405	read_lock(&rh->hash_lock);
				406	reg = __rh_lookup(rh, region);
				407	read_unlock(&rh->hash_lock);
				408
				409	if (atomic_dec_and_test(&reg->pending)) {
				410	spin_lock_irqsave(&rh->region_lock, flags);
				411	if (reg->state == RH_RECOVERING) {
				412	list_add_tail(&reg->list, &rh->quiesced_regions);
				413	} else {
				414	reg->state = RH_CLEAN;
				415	list_add(&reg->list, &rh->clean_regions);
				416	}
				417	spin_unlock_irqrestore(&rh->region_lock, flags);
				418	should_wake = 1;
				419	}
				420
				421	if (should_wake)
				422	wake();
				423	}
				424
				425	/*
				426	* Starts quiescing a region in preparation for recovery.
				427	*/
				428	static int __rh_recovery_prepare(struct region_hash *rh)
				429	{
				430	int r;
				431	struct region *reg;
				432	region_t region;
				433
				434	/*
				435	* Ask the dirty log what's next.
				436	*/
				437	r = rh->log->type->get_resync_work(rh->log, &region);
				438	if (r <= 0)
				439	return r;
				440
				441	/*
				442	* Get this region, and start it quiescing by setting the
				443	* recovering flag.
				444	*/
				445	read_lock(&rh->hash_lock);
				446	reg = __rh_find(rh, region);
				447	read_unlock(&rh->hash_lock);
				448
				449	spin_lock_irq(&rh->region_lock);
				450	reg->state = RH_RECOVERING;
				451
				452	/* Already quiesced ? */
				453	if (atomic_read(&reg->pending))
				454	list_del_init(&reg->list);
				455
				456	else {
				457	list_del_init(&reg->list);
				458	list_add(&reg->list, &rh->quiesced_regions);
				459	}
				460	spin_unlock_irq(&rh->region_lock);
				461
				462	return 1;
				463	}
				464
				465	static void rh_recovery_prepare(struct region_hash *rh)
				466	{
				467	while (!down_trylock(&rh->recovery_count))
				468	if (__rh_recovery_prepare(rh) <= 0) {
				469	up(&rh->recovery_count);
				470	break;
				471	}
				472	}
				473
				474	/*
				475	* Returns any quiesced regions.
				476	*/
				477	static struct region rh_recovery_start(struct region_hash rh)
				478	{
				479	struct region *reg = NULL;
				480
				481	spin_lock_irq(&rh->region_lock);
				482	if (!list_empty(&rh->quiesced_regions)) {
				483	reg = list_entry(rh->quiesced_regions.next,
				484	struct region, list);
				485	list_del_init(&reg->list); /* remove from the quiesced list */
				486	}
				487	spin_unlock_irq(&rh->region_lock);
				488
				489	return reg;
				490	}
				491
				492	/* FIXME: success ignored for now */
				493	static void rh_recovery_end(struct region *reg, int success)
				494	{
				495	struct region_hash *rh = reg->rh;
				496
				497	spin_lock_irq(&rh->region_lock);
				498	list_add(&reg->list, &reg->rh->recovered_regions);
				499	spin_unlock_irq(&rh->region_lock);
				500
				501	wake();
				502	}
				503
				504	static void rh_flush(struct region_hash *rh)
				505	{
				506	rh->log->type->flush(rh->log);
				507	}
				508
				509	static void rh_delay(struct region_hash rh, struct bio bio)
				510	{
				511	struct region *reg;
				512
				513	read_lock(&rh->hash_lock);
				514	reg = __rh_find(rh, bio_to_region(rh, bio));
				515	bio_list_add(&reg->delayed_bios, bio);
				516	read_unlock(&rh->hash_lock);
				517	}
				518
				519	static void rh_stop_recovery(struct region_hash *rh)
				520	{
				521	int i;
				522
				523	/* wait for any recovering regions */
				524	for (i = 0; i < MAX_RECOVERY; i++)
				525	down(&rh->recovery_count);
				526	}
				527
				528	static void rh_start_recovery(struct region_hash *rh)
				529	{
				530	int i;
				531
				532	for (i = 0; i < MAX_RECOVERY; i++)
				533	up(&rh->recovery_count);
				534
				535	wake();
				536	}
				537
/*-----------------------------------------------------------------
 * Mirror set structures.
 *---------------------------------------------------------------*/
				541	struct mirror {
				542	atomic_t error_count;
				543	struct dm_dev *dev;
				544	sector_t offset;
				545	};
				546
				547	struct mirror_set {
				548	struct dm_target *ti;
				549	struct list_head list;
				550	struct region_hash rh;
				551	struct kcopyd_client *kcopyd_client;
				552
				553	spinlock_t lock; /* protects the next two lists */
				554	struct bio_list reads;
				555	struct bio_list writes;
				556
				557	/* recovery */
				558	region_t nr_regions;
				559	int in_sync;
				560
				561	unsigned int nr_mirrors;
				562	struct mirror mirror[0];
				563	};
				564
				565	/*
				566	* Every mirror should look like this one.
				567	*/
				568	#define DEFAULT_MIRROR 0
				569
				570	/*
				571	* This is yucky. We squirrel the mirror_set struct away inside
				572	* bi_next for write buffers. This is safe since the bh
				573	* doesn't get submitted to the lower levels of block layer.
				574	*/
				575	static struct mirror_set bio_get_ms(struct bio bio)
				576	{
				577	return (struct mirror_set *) bio->bi_next;
				578	}
				579
				580	static void bio_set_ms(struct bio bio, struct mirror_set ms)
				581	{
				582	bio->bi_next = (struct bio *) ms;
				583	}
				584
/*-----------------------------------------------------------------
 * Recovery.
 *
 * When a mirror is first activated we may find that some regions
 * are in the no-sync state.  We have to recover these by
 * recopying from the default mirror to all the others.
 *---------------------------------------------------------------*/
				592	static void recovery_complete(int read_err, unsigned int write_err,
				593	void *context)
				594	{
				595	struct region reg = (struct region ) context;
				596
				597	/* FIXME: better error handling */
				598	rh_recovery_end(reg, read_err \|\| write_err);
				599	}
				600
				601	static int recover(struct mirror_set ms, struct region reg)
				602	{
				603	int r;
				604	unsigned int i;
				605	struct io_region from, to[KCOPYD_MAX_REGIONS], *dest;
				606	struct mirror *m;
				607	unsigned long flags = 0;
				608
				609	/* fill in the source */
				610	m = ms->mirror + DEFAULT_MIRROR;
				611	from.bdev = m->dev->bdev;
				612	from.sector = m->offset + region_to_sector(reg->rh, reg->key);
				613	if (reg->key == (ms->nr_regions - 1)) {
				614	/*
				615	* The final region may be smaller than
				616	* region_size.
				617	*/
				618	from.count = ms->ti->len & (reg->rh->region_size - 1);
				619	if (!from.count)
				620	from.count = reg->rh->region_size;
				621	} else
				622	from.count = reg->rh->region_size;
				623
				624	/* fill in the destinations */
				625	for (i = 0, dest = to; i < ms->nr_mirrors; i++) {
				626	if (i == DEFAULT_MIRROR)
				627	continue;
				628
				629	m = ms->mirror + i;
				630	dest->bdev = m->dev->bdev;
				631	dest->sector = m->offset + region_to_sector(reg->rh, reg->key);
				632	dest->count = from.count;
				633	dest++;
				634	}
				635
				636	/* hand to kcopyd */
				637	set_bit(KCOPYD_IGNORE_ERROR, &flags);
				638	r = kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, flags,
				639	recovery_complete, reg);
				640
				641	return r;
				642	}
				643
				644	static void do_recovery(struct mirror_set *ms)
				645	{
				646	int r;
				647	struct region *reg;
				648	struct dirty_log *log = ms->rh.log;
				649
				650	/*
				651	* Start quiescing some regions.
				652	*/
				653	rh_recovery_prepare(&ms->rh);
				654
				655	/*
				656	* Copy any already quiesced regions.
				657	*/
				658	while ((reg = rh_recovery_start(&ms->rh))) {
				659	r = recover(ms, reg);
				660	if (r)
				661	rh_recovery_end(reg, 0);
				662	}
				663
				664	/*
				665	* Update the in sync flag.
				666	*/
				667	if (!ms->in_sync &&
				668	(log->type->get_sync_count(log) == ms->nr_regions)) {
				669	/* the sync is complete */
				670	dm_table_event(ms->ti->table);
				671	ms->in_sync = 1;
				672	}
				673	}
				674
/*-----------------------------------------------------------------
 * Reads
 *---------------------------------------------------------------*/
				678	static struct mirror choose_mirror(struct mirror_set ms, sector_t sector)
				679	{
				680	/* FIXME: add read balancing */
				681	return ms->mirror + DEFAULT_MIRROR;
				682	}
				683
				684	/*
				685	* remap a buffer to a particular mirror.
				686	*/
				687	static void map_bio(struct mirror_set ms, struct mirror m, struct bio *bio)
				688	{
				689	bio->bi_bdev = m->dev->bdev;
				690	bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin);
				691	}
				692
				693	static void do_reads(struct mirror_set ms, struct bio_list reads)
				694	{
				695	region_t region;
				696	struct bio *bio;
				697	struct mirror *m;
				698
				699	while ((bio = bio_list_pop(reads))) {
				700	region = bio_to_region(&ms->rh, bio);
				701
				702	/*
				703	* We can only read balance if the region is in sync.
				704	*/
				705	if (rh_in_sync(&ms->rh, region, 0))
				706	m = choose_mirror(ms, bio->bi_sector);
				707	else
				708	m = ms->mirror + DEFAULT_MIRROR;
				709
				710	map_bio(ms, m, bio);
				711	generic_make_request(bio);
				712	}
				713	}
				714
/*-----------------------------------------------------------------
 * Writes.
 *
 * We do different things with the write io depending on the
 * state of the region that it's in:
 *
 * SYNC:	increment pending, write to all mirrors (do_write()
 *		submits the io with dm_io_async_bvec())
 * RECOVERING:	delay the io until recovery completes
 * NOSYNC:	increment pending, just write to the default mirror
 *---------------------------------------------------------------*/
				725	static void write_callback(unsigned long error, void *context)
				726	{
				727	unsigned int i;
				728	int uptodate = 1;
				729	struct bio bio = (struct bio ) context;
				730	struct mirror_set *ms;
				731
				732	ms = bio_get_ms(bio);
				733	bio_set_ms(bio, NULL);
				734
				735	/*
				736	* NOTE: We don't decrement the pending count here,
				737	* instead it is done by the targets endio function.
				738	* This way we handle both writes to SYNC and NOSYNC
				739	* regions with the same code.
				740	*/
				741
				742	if (error) {
				743	/*
				744	* only error the io if all mirrors failed.
				745	* FIXME: bogus
				746	*/
				747	uptodate = 0;
				748	for (i = 0; i < ms->nr_mirrors; i++)
				749	if (!test_bit(i, &error)) {
				750	uptodate = 1;
				751	break;
				752	}
				753	}
				754	bio_endio(bio, bio->bi_size, 0);
				755	}
				756
				757	static void do_write(struct mirror_set ms, struct bio bio)
				758	{
				759	unsigned int i;
				760	struct io_region io[KCOPYD_MAX_REGIONS+1];
				761	struct mirror *m;
				762
				763	for (i = 0; i < ms->nr_mirrors; i++) {
				764	m = ms->mirror + i;
				765
				766	io[i].bdev = m->dev->bdev;
				767	io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin);
				768	io[i].count = bio->bi_size >> 9;
				769	}
				770
				771	bio_set_ms(bio, ms);
				772	dm_io_async_bvec(ms->nr_mirrors, io, WRITE,
				773	bio->bi_io_vec + bio->bi_idx,
				774	write_callback, bio);
				775	}
				776
				777	static void do_writes(struct mirror_set ms, struct bio_list writes)
				778	{
				779	int state;
				780	struct bio *bio;
				781	struct bio_list sync, nosync, recover, *this_list = NULL;
				782
				783	if (!writes->head)
				784	return;
				785
				786	/*
				787	* Classify each write.
				788	*/
				789	bio_list_init(&sync);
				790	bio_list_init(&nosync);
				791	bio_list_init(&recover);
				792
				793	while ((bio = bio_list_pop(writes))) {
				794	state = rh_state(&ms->rh, bio_to_region(&ms->rh, bio), 1);
				795	switch (state) {
				796	case RH_CLEAN:
				797	case RH_DIRTY:
				798	this_list = &sync;
				799	break;
				800
				801	case RH_NOSYNC:
				802	this_list = &nosync;
				803	break;
				804
				805	case RH_RECOVERING:
				806	this_list = &recover;
				807	break;
				808	}
				809
				810	bio_list_add(this_list, bio);
				811	}
				812
				813	/*
				814	* Increment the pending counts for any regions that will
				815	* be written to (writes to recover regions are going to
				816	* be delayed).
				817	*/
				818	rh_inc_pending(&ms->rh, &sync);
				819	rh_inc_pending(&ms->rh, &nosync);
				820	rh_flush(&ms->rh);
				821
				822	/*
				823	* Dispatch io.
				824	*/
				825	while ((bio = bio_list_pop(&sync)))
				826	do_write(ms, bio);
				827
				828	while ((bio = bio_list_pop(&recover)))
				829	rh_delay(&ms->rh, bio);
				830
				831	while ((bio = bio_list_pop(&nosync))) {
				832	map_bio(ms, ms->mirror + DEFAULT_MIRROR, bio);
				833	generic_make_request(bio);
				834	}
				835	}
				836
/*-----------------------------------------------------------------
 * kmirrord
 *---------------------------------------------------------------*/
				840	static LIST_HEAD(_mirror_sets);
				841	static DECLARE_RWSEM(_mirror_sets_lock);
				842
				843	static void do_mirror(struct mirror_set *ms)
				844	{
				845	struct bio_list reads, writes;
				846
				847	spin_lock(&ms->lock);
				848	reads = ms->reads;
				849	writes = ms->writes;
				850	bio_list_init(&ms->reads);
				851	bio_list_init(&ms->writes);
				852	spin_unlock(&ms->lock);
				853
				854	rh_update_states(&ms->rh);
				855	do_recovery(ms);
				856	do_reads(ms, &reads);
				857	do_writes(ms, &writes);
				858	}
				859
				860	static void do_work(void *ignored)
				861	{
				862	struct mirror_set *ms;
				863
				864	down_read(&_mirror_sets_lock);
				865	list_for_each_entry (ms, &_mirror_sets, list)
				866	do_mirror(ms);
				867	up_read(&_mirror_sets_lock);
				868	}
				869
/*-----------------------------------------------------------------
 * Target functions
 *---------------------------------------------------------------*/
				873	static struct mirror_set *alloc_context(unsigned int nr_mirrors,
				874	uint32_t region_size,
				875	struct dm_target *ti,
				876	struct dirty_log *dl)
				877	{
				878	size_t len;
				879	struct mirror_set *ms = NULL;
				880
				881	if (array_too_big(sizeof(*ms), sizeof(ms->mirror[0]), nr_mirrors))
				882	return NULL;
				883
				884	len = sizeof(ms) + (sizeof(ms->mirror[0]) nr_mirrors);
				885
				886	ms = kmalloc(len, GFP_KERNEL);
				887	if (!ms) {
				888	ti->error = "dm-mirror: Cannot allocate mirror context";
				889	return NULL;
				890	}
				891
				892	memset(ms, 0, len);
				893	spin_lock_init(&ms->lock);
				894
				895	ms->ti = ti;
				896	ms->nr_mirrors = nr_mirrors;
				897	ms->nr_regions = dm_sector_div_up(ti->len, region_size);
				898	ms->in_sync = 0;
				899
				900	if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
				901	ti->error = "dm-mirror: Error creating dirty region hash";
				902	kfree(ms);
				903	return NULL;
				904	}
				905
				906	return ms;
				907	}
				908
				909	static void free_context(struct mirror_set ms, struct dm_target ti,
				910	unsigned int m)
				911	{
				912	while (m--)
				913	dm_put_device(ti, ms->mirror[m].dev);
				914
				915	rh_exit(&ms->rh);
				916	kfree(ms);
				917	}
				918
				919	static inline int _check_region_size(struct dm_target *ti, uint32_t size)
				920	{
				921	return !(size % (PAGE_SIZE >> 9) \|\| (size & (size - 1)) \|\|
				922	size > ti->len);
				923	}
				924
				925	static int get_mirror(struct mirror_set ms, struct dm_target ti,
				926	unsigned int mirror, char **argv)
				927	{
				928	sector_t offset;
				929
				930	if (sscanf(argv[1], SECTOR_FORMAT, &offset) != 1) {
				931	ti->error = "dm-mirror: Invalid offset";
				932	return -EINVAL;
				933	}
				934
				935	if (dm_get_device(ti, argv[0], offset, ti->len,
				936	dm_table_get_mode(ti->table),
				937	&ms->mirror[mirror].dev)) {
				938	ti->error = "dm-mirror: Device lookup failure";
				939	return -ENXIO;
				940	}
				941
				942	ms->mirror[mirror].offset = offset;
				943
				944	return 0;
				945	}
				946
				947	static int add_mirror_set(struct mirror_set *ms)
				948	{
				949	down_write(&_mirror_sets_lock);
				950	list_add_tail(&ms->list, &_mirror_sets);
				951	up_write(&_mirror_sets_lock);
				952	wake();
				953
				954	return 0;
				955	}
				956
				957	static void del_mirror_set(struct mirror_set *ms)
				958	{
				959	down_write(&_mirror_sets_lock);
				960	list_del(&ms->list);
				961	up_write(&_mirror_sets_lock);
				962	}
				963
				964	/*
				965	* Create dirty log: log_type #log_params <log_params>
				966	*/
				967	static struct dirty_log create_dirty_log(struct dm_target ti,
				968	unsigned int argc, char **argv,
				969	unsigned int *args_used)
				970	{
				971	unsigned int param_count;
				972	struct dirty_log *dl;
				973
				974	if (argc < 2) {
				975	ti->error = "dm-mirror: Insufficient mirror log arguments";
				976	return NULL;
				977	}
				978
				979	if (sscanf(argv[1], "%u", &param_count) != 1) {
				980	ti->error = "dm-mirror: Invalid mirror log argument count";
				981	return NULL;
				982	}
				983
				984	*args_used = 2 + param_count;
				985
				986	if (argc < *args_used) {
				987	ti->error = "dm-mirror: Insufficient mirror log arguments";
				988	return NULL;
				989	}
				990
				991	dl = dm_create_dirty_log(argv[0], ti, param_count, argv + 2);
				992	if (!dl) {
				993	ti->error = "dm-mirror: Error creating mirror dirty log";
				994	return NULL;
				995	}
				996
				997	if (!_check_region_size(ti, dl->type->get_region_size(dl))) {
				998	ti->error = "dm-mirror: Invalid region size";
				999	dm_destroy_dirty_log(dl);
				1000	return NULL;
				1001	}
				1002
				1003	return dl;
				1004	}
				1005
				1006	/*
				1007	* Construct a mirror mapping:
				1008	*
				1009	* log_type #log_params <log_params>
				1010	* #mirrors [mirror_path offset]{2,}
				1011	*
				1012	* log_type is "core" or "disk"
				1013	* #log_params is between 1 and 3
				1014	*/
				1015	#define DM_IO_PAGES 64
				1016	static int mirror_ctr(struct dm_target ti, unsigned int argc, char *argv)
				1017	{
				1018	int r;
				1019	unsigned int nr_mirrors, m, args_used;
				1020	struct mirror_set *ms;
				1021	struct dirty_log *dl;
				1022
				1023	dl = create_dirty_log(ti, argc, argv, &args_used);
				1024	if (!dl)
				1025	return -EINVAL;
				1026
				1027	argv += args_used;
				1028	argc -= args_used;
				1029
				1030	if (!argc \|\| sscanf(argv[0], "%u", &nr_mirrors) != 1 \|\|
				1031	nr_mirrors < 2 \|\| nr_mirrors > KCOPYD_MAX_REGIONS + 1) {
				1032	ti->error = "dm-mirror: Invalid number of mirrors";
				1033	dm_destroy_dirty_log(dl);
				1034	return -EINVAL;
				1035	}
				1036
				1037	argv++, argc--;
				1038
				1039	if (argc != nr_mirrors * 2) {
				1040	ti->error = "dm-mirror: Wrong number of mirror arguments";
				1041	dm_destroy_dirty_log(dl);
				1042	return -EINVAL;
				1043	}
				1044
				1045	ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl);
				1046	if (!ms) {
				1047	dm_destroy_dirty_log(dl);
				1048	return -ENOMEM;
				1049	}
				1050
				1051	/* Get the mirror parameter sets */
				1052	for (m = 0; m < nr_mirrors; m++) {
				1053	r = get_mirror(ms, ti, m, argv);
				1054	if (r) {
				1055	free_context(ms, ti, m);
				1056	return r;
				1057	}
				1058	argv += 2;
				1059	argc -= 2;
				1060	}
				1061
				1062	ti->private = ms;
Alasdair G Kergon	d88854f	2005-07-07 17:59:34 -0700	[diff] [blame^]	1063	ti->split_io = ms->rh.region_size;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1064
				1065	r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client);
				1066	if (r) {
				1067	free_context(ms, ti, ms->nr_mirrors);
				1068	return r;
				1069	}
				1070
				1071	add_mirror_set(ms);
				1072	return 0;
				1073	}
				1074
				1075	static void mirror_dtr(struct dm_target *ti)
				1076	{
				1077	struct mirror_set ms = (struct mirror_set ) ti->private;
				1078
				1079	del_mirror_set(ms);
				1080	kcopyd_client_destroy(ms->kcopyd_client);
				1081	free_context(ms, ti, ms->nr_mirrors);
				1082	}
				1083
				1084	static void queue_bio(struct mirror_set ms, struct bio bio, int rw)
				1085	{
				1086	int should_wake = 0;
				1087	struct bio_list *bl;
				1088
				1089	bl = (rw == WRITE) ? &ms->writes : &ms->reads;
				1090	spin_lock(&ms->lock);
				1091	should_wake = !(bl->head);
				1092	bio_list_add(bl, bio);
				1093	spin_unlock(&ms->lock);
				1094
				1095	if (should_wake)
				1096	wake();
				1097	}
				1098
				1099	/*
				1100	* Mirror mapping function
				1101	*/
				1102	static int mirror_map(struct dm_target ti, struct bio bio,
				1103	union map_info *map_context)
				1104	{
				1105	int r, rw = bio_rw(bio);
				1106	struct mirror *m;
				1107	struct mirror_set *ms = ti->private;
				1108
				1109	map_context->ll = bio->bi_sector >> ms->rh.region_shift;
				1110
				1111	if (rw == WRITE) {
				1112	queue_bio(ms, bio, rw);
				1113	return 0;
				1114	}
				1115
				1116	r = ms->rh.log->type->in_sync(ms->rh.log,
				1117	bio_to_region(&ms->rh, bio), 0);
				1118	if (r < 0 && r != -EWOULDBLOCK)
				1119	return r;
				1120
				1121	if (r == -EWOULDBLOCK) /* FIXME: ugly */
				1122	r = 0;
				1123
				1124	/*
				1125	* We don't want to fast track a recovery just for a read
				1126	* ahead. So we just let it silently fail.
				1127	* FIXME: get rid of this.
				1128	*/
				1129	if (!r && rw == READA)
				1130	return -EIO;
				1131
				1132	if (!r) {
				1133	/* Pass this io over to the daemon */
				1134	queue_bio(ms, bio, rw);
				1135	return 0;
				1136	}
				1137
				1138	m = choose_mirror(ms, bio->bi_sector);
				1139	if (!m)
				1140	return -EIO;
				1141
				1142	map_bio(ms, m, bio);
				1143	return 1;
				1144	}
				1145
				1146	static int mirror_end_io(struct dm_target ti, struct bio bio,
				1147	int error, union map_info *map_context)
				1148	{
				1149	int rw = bio_rw(bio);
				1150	struct mirror_set ms = (struct mirror_set ) ti->private;
				1151	region_t region = map_context->ll;
				1152
				1153	/*
				1154	* We need to dec pending if this was a write.
				1155	*/
				1156	if (rw == WRITE)
				1157	rh_dec(&ms->rh, region);
				1158
				1159	return 0;
				1160	}
				1161
				1162	static void mirror_postsuspend(struct dm_target *ti)
				1163	{
				1164	struct mirror_set ms = (struct mirror_set ) ti->private;
				1165	struct dirty_log *log = ms->rh.log;
				1166
				1167	rh_stop_recovery(&ms->rh);
				1168	if (log->type->suspend && log->type->suspend(log))
				1169	/* FIXME: need better error handling */
				1170	DMWARN("log suspend failed");
				1171	}
				1172
				1173	static void mirror_resume(struct dm_target *ti)
				1174	{
				1175	struct mirror_set ms = (struct mirror_set ) ti->private;
				1176	struct dirty_log *log = ms->rh.log;
				1177	if (log->type->resume && log->type->resume(log))
				1178	/* FIXME: need better error handling */
				1179	DMWARN("log resume failed");
				1180	rh_start_recovery(&ms->rh);
				1181	}
				1182
				1183	static int mirror_status(struct dm_target *ti, status_type_t type,
				1184	char *result, unsigned int maxlen)
				1185	{
				1186	unsigned int m, sz;
				1187	struct mirror_set ms = (struct mirror_set ) ti->private;
				1188
				1189	sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen);
				1190
				1191	switch (type) {
				1192	case STATUSTYPE_INFO:
				1193	DMEMIT("%d ", ms->nr_mirrors);
				1194	for (m = 0; m < ms->nr_mirrors; m++)
				1195	DMEMIT("%s ", ms->mirror[m].dev->name);
				1196
				1197	DMEMIT(SECTOR_FORMAT "/" SECTOR_FORMAT,
				1198	ms->rh.log->type->get_sync_count(ms->rh.log),
				1199	ms->nr_regions);
				1200	break;
				1201
				1202	case STATUSTYPE_TABLE:
				1203	DMEMIT("%d ", ms->nr_mirrors);
				1204	for (m = 0; m < ms->nr_mirrors; m++)
				1205	DMEMIT("%s " SECTOR_FORMAT " ",
				1206	ms->mirror[m].dev->name, ms->mirror[m].offset);
				1207	}
				1208
				1209	return 0;
				1210	}
				1211
				1212	static struct target_type mirror_target = {
				1213	.name = "mirror",
				1214	.version = {1, 0, 1},
				1215	.module = THIS_MODULE,
				1216	.ctr = mirror_ctr,
				1217	.dtr = mirror_dtr,
				1218	.map = mirror_map,
				1219	.end_io = mirror_end_io,
				1220	.postsuspend = mirror_postsuspend,
				1221	.resume = mirror_resume,
				1222	.status = mirror_status,
				1223	};
				1224
				1225	static int __init dm_mirror_init(void)
				1226	{
				1227	int r;
				1228
				1229	r = dm_dirty_log_init();
				1230	if (r)
				1231	return r;
				1232
				1233	_kmirrord_wq = create_workqueue("kmirrord");
				1234	if (!_kmirrord_wq) {
				1235	DMERR("couldn't start kmirrord");
				1236	dm_dirty_log_exit();
				1237	return r;
				1238	}
				1239	INIT_WORK(&_kmirrord_work, do_work, NULL);
				1240
				1241	r = dm_register_target(&mirror_target);
				1242	if (r < 0) {
				1243	DMERR("%s: Failed to register mirror target",
				1244	mirror_target.name);
				1245	dm_dirty_log_exit();
				1246	destroy_workqueue(_kmirrord_wq);
				1247	}
				1248
				1249	return r;
				1250	}
				1251
				1252	static void __exit dm_mirror_exit(void)
				1253	{
				1254	int r;
				1255
				1256	r = dm_unregister_target(&mirror_target);
				1257	if (r < 0)
				1258	DMERR("%s: unregister failed %d", mirror_target.name, r);
				1259
				1260	destroy_workqueue(_kmirrord_wq);
				1261	dm_dirty_log_exit();
				1262	}
				1263
				1264	/* Module hooks */
				1265	module_init(dm_mirror_init);
				1266	module_exit(dm_mirror_exit);
				1267
				1268	MODULE_DESCRIPTION(DM_NAME " mirror target");
				1269	MODULE_AUTHOR("Joe Thornber");
				1270	MODULE_LICENSE("GPL");