Blame - drivers/md/raid6main.c - kernel/msm-4.19

blob: f62ea1a73d0d9d1b87f5f10d7b91a6f6d7f48b0d [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* raid6main.c : Multiple Devices driver for Linux
				3	* Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
				4	* Copyright (C) 1999, 2000 Ingo Molnar
				5	* Copyright (C) 2002, 2003 H. Peter Anvin
				6	*
				7	* RAID-6 management functions. This code is derived from raid5.c.
				8	* Last merge from raid5.c bkcvs version 1.79 (kernel 2.6.1).
				9	*
				10	* Thanks to Penguin Computing for making the RAID-6 development possible
				11	* by donating a test server!
				12	*
				13	* This program is free software; you can redistribute it and/or modify
				14	* it under the terms of the GNU General Public License as published by
				15	* the Free Software Foundation; either version 2, or (at your option)
				16	* any later version.
				17	*
				18	* You should have received a copy of the GNU General Public License
				19	* (for example /usr/src/linux/COPYING); if not, write to the Free
				20	* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
				21	*/
				22
				23
				24	#include <linux/config.h>
				25	#include <linux/module.h>
				26	#include <linux/slab.h>
				27	#include <linux/highmem.h>
				28	#include <linux/bitops.h>
				29	#include <asm/atomic.h>
				30	#include "raid6.h"
				31
				32	/*
				33	* Stripe cache
				34	*/
				35
				36	#define NR_STRIPES 256
				37	#define STRIPE_SIZE PAGE_SIZE
				38	#define STRIPE_SHIFT (PAGE_SHIFT - 9)
				39	#define STRIPE_SECTORS (STRIPE_SIZE>>9)
				40	#define IO_THRESHOLD 1
				41	#define HASH_PAGES 1
				42	#define HASH_PAGES_ORDER 0
				43	#define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
				44	#define HASH_MASK (NR_HASH - 1)
				45
				46	#define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK])
				47
				48	/* bio's attached to a stripe+device for I/O are linked together in bi_sector
				49	* order without overlap. There may be several bio's per stripe+device, and
				50	* a bio could span several devices.
				51	* When walking this list for a particular stripe+device, we must never proceed
				52	* beyond a bio that extends past this device, as the next bio might no longer
				53	* be valid.
				54	* This macro is used to determine the 'next' bio in the list, given the sector
				55	* of the current stripe+device
				56	*/
				57	#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)
				58	/*
				59	* The following can be used to debug the driver
				60	*/
				61	#define RAID6_DEBUG 0 /* Extremely verbose printk */
				62	#define RAID6_PARANOIA 1 /* Check spinlocks */
				63	#define RAID6_DUMPSTATE 0 /* Include stripe cache state in /proc/mdstat */
				64	#if RAID6_PARANOIA && defined(CONFIG_SMP)
				65	# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
				66	#else
				67	# define CHECK_DEVLOCK()
				68	#endif
				69
				70	#define PRINTK(x...) ((void)(RAID6_DEBUG && printk(KERN_DEBUG x)))
				71	#if RAID6_DEBUG
				72	#undef inline
				73	#undef __inline__
				74	#define inline
				75	#define __inline__
				76	#endif
				77
				78	#if !RAID6_USE_EMPTY_ZERO_PAGE
				79	/* In .bss so it's zeroed */
				80	const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
				81	#endif
				82
				83	static inline int raid6_next_disk(int disk, int raid_disks)
				84	{
				85	disk++;
				86	return (disk < raid_disks) ? disk : 0;
				87	}
				88
				89	static void print_raid6_conf (raid6_conf_t *conf);
				90
				91	static inline void __release_stripe(raid6_conf_t conf, struct stripe_head sh)
				92	{
				93	if (atomic_dec_and_test(&sh->count)) {
				94	if (!list_empty(&sh->lru))
				95	BUG();
				96	if (atomic_read(&conf->active_stripes)==0)
				97	BUG();
				98	if (test_bit(STRIPE_HANDLE, &sh->state)) {
				99	if (test_bit(STRIPE_DELAYED, &sh->state))
				100	list_add_tail(&sh->lru, &conf->delayed_list);
				101	else
				102	list_add_tail(&sh->lru, &conf->handle_list);
				103	md_wakeup_thread(conf->mddev->thread);
				104	} else {
				105	if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
				106	atomic_dec(&conf->preread_active_stripes);
				107	if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
				108	md_wakeup_thread(conf->mddev->thread);
				109	}
				110	list_add_tail(&sh->lru, &conf->inactive_list);
				111	atomic_dec(&conf->active_stripes);
				112	if (!conf->inactive_blocked \|\|
				113	atomic_read(&conf->active_stripes) < (NR_STRIPES*3/4))
				114	wake_up(&conf->wait_for_stripe);
				115	}
				116	}
				117	}
				118	static void release_stripe(struct stripe_head *sh)
				119	{
				120	raid6_conf_t *conf = sh->raid_conf;
				121	unsigned long flags;
				122
				123	spin_lock_irqsave(&conf->device_lock, flags);
				124	__release_stripe(conf, sh);
				125	spin_unlock_irqrestore(&conf->device_lock, flags);
				126	}
				127
				128	static void remove_hash(struct stripe_head *sh)
				129	{
				130	PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector);
				131
				132	if (sh->hash_pprev) {
				133	if (sh->hash_next)
				134	sh->hash_next->hash_pprev = sh->hash_pprev;
				135	*sh->hash_pprev = sh->hash_next;
				136	sh->hash_pprev = NULL;
				137	}
				138	}
				139
				140	static __inline__ void insert_hash(raid6_conf_t conf, struct stripe_head sh)
				141	{
				142	struct stripe_head **shp = &stripe_hash(conf, sh->sector);
				143
				144	PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector);
				145
				146	CHECK_DEVLOCK();
				147	if ((sh->hash_next = *shp) != NULL)
				148	(*shp)->hash_pprev = &sh->hash_next;
				149	*shp = sh;
				150	sh->hash_pprev = shp;
				151	}
				152
				153
				154	/* find an idle stripe, make sure it is unhashed, and return it. */
				155	static struct stripe_head get_free_stripe(raid6_conf_t conf)
				156	{
				157	struct stripe_head *sh = NULL;
				158	struct list_head *first;
				159
				160	CHECK_DEVLOCK();
				161	if (list_empty(&conf->inactive_list))
				162	goto out;
				163	first = conf->inactive_list.next;
				164	sh = list_entry(first, struct stripe_head, lru);
				165	list_del_init(first);
				166	remove_hash(sh);
				167	atomic_inc(&conf->active_stripes);
				168	out:
				169	return sh;
				170	}
				171
				172	static void shrink_buffers(struct stripe_head *sh, int num)
				173	{
				174	struct page *p;
				175	int i;
				176
				177	for (i=0; i<num ; i++) {
				178	p = sh->dev[i].page;
				179	if (!p)
				180	continue;
				181	sh->dev[i].page = NULL;
				182	page_cache_release(p);
				183	}
				184	}
				185
				186	static int grow_buffers(struct stripe_head *sh, int num)
				187	{
				188	int i;
				189
				190	for (i=0; i<num; i++) {
				191	struct page *page;
				192
				193	if (!(page = alloc_page(GFP_KERNEL))) {
				194	return 1;
				195	}
				196	sh->dev[i].page = page;
				197	}
				198	return 0;
				199	}
				200
				201	static void raid6_build_block (struct stripe_head *sh, int i);
				202
				203	static inline void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx)
				204	{
				205	raid6_conf_t *conf = sh->raid_conf;
				206	int disks = conf->raid_disks, i;
				207
				208	if (atomic_read(&sh->count) != 0)
				209	BUG();
				210	if (test_bit(STRIPE_HANDLE, &sh->state))
				211	BUG();
				212
				213	CHECK_DEVLOCK();
				214	PRINTK("init_stripe called, stripe %llu\n",
				215	(unsigned long long)sh->sector);
				216
				217	remove_hash(sh);
				218
				219	sh->sector = sector;
				220	sh->pd_idx = pd_idx;
				221	sh->state = 0;
				222
				223	for (i=disks; i--; ) {
				224	struct r5dev *dev = &sh->dev[i];
				225
				226	if (dev->toread \|\| dev->towrite \|\| dev->written \|\|
				227	test_bit(R5_LOCKED, &dev->flags)) {
				228	PRINTK("sector=%llx i=%d %p %p %p %d\n",
				229	(unsigned long long)sh->sector, i, dev->toread,
				230	dev->towrite, dev->written,
				231	test_bit(R5_LOCKED, &dev->flags));
				232	BUG();
				233	}
				234	dev->flags = 0;
				235	raid6_build_block(sh, i);
				236	}
				237	insert_hash(conf, sh);
				238	}
				239
				240	static struct stripe_head __find_stripe(raid6_conf_t conf, sector_t sector)
				241	{
				242	struct stripe_head *sh;
				243
				244	CHECK_DEVLOCK();
				245	PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector);
				246	for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next)
				247	if (sh->sector == sector)
				248	return sh;
				249	PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector);
				250	return NULL;
				251	}
				252
				253	static void unplug_slaves(mddev_t *mddev);
				254
				255	static struct stripe_head get_active_stripe(raid6_conf_t conf, sector_t sector,
				256	int pd_idx, int noblock)
				257	{
				258	struct stripe_head *sh;
				259
				260	PRINTK("get_stripe, sector %llu\n", (unsigned long long)sector);
				261
				262	spin_lock_irq(&conf->device_lock);
				263
				264	do {
				265	sh = __find_stripe(conf, sector);
				266	if (!sh) {
				267	if (!conf->inactive_blocked)
				268	sh = get_free_stripe(conf);
				269	if (noblock && sh == NULL)
				270	break;
				271	if (!sh) {
				272	conf->inactive_blocked = 1;
				273	wait_event_lock_irq(conf->wait_for_stripe,
				274	!list_empty(&conf->inactive_list) &&
				275	(atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
				276	\|\| !conf->inactive_blocked),
				277	conf->device_lock,
				278	unplug_slaves(conf->mddev);
				279	);
				280	conf->inactive_blocked = 0;
				281	} else
				282	init_stripe(sh, sector, pd_idx);
				283	} else {
				284	if (atomic_read(&sh->count)) {
				285	if (!list_empty(&sh->lru))
				286	BUG();
				287	} else {
				288	if (!test_bit(STRIPE_HANDLE, &sh->state))
				289	atomic_inc(&conf->active_stripes);
				290	if (list_empty(&sh->lru))
				291	BUG();
				292	list_del_init(&sh->lru);
				293	}
				294	}
				295	} while (sh == NULL);
				296
				297	if (sh)
				298	atomic_inc(&sh->count);
				299
				300	spin_unlock_irq(&conf->device_lock);
				301	return sh;
				302	}
				303
				304	static int grow_stripes(raid6_conf_t *conf, int num)
				305	{
				306	struct stripe_head *sh;
				307	kmem_cache_t *sc;
				308	int devs = conf->raid_disks;
				309
				310	sprintf(conf->cache_name, "raid6/%s", mdname(conf->mddev));
				311
				312	sc = kmem_cache_create(conf->cache_name,
				313	sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
				314	0, 0, NULL, NULL);
				315	if (!sc)
				316	return 1;
				317	conf->slab_cache = sc;
				318	while (num--) {
				319	sh = kmem_cache_alloc(sc, GFP_KERNEL);
				320	if (!sh)
				321	return 1;
				322	memset(sh, 0, sizeof(sh) + (devs-1)sizeof(struct r5dev));
				323	sh->raid_conf = conf;
				324	spin_lock_init(&sh->lock);
				325
				326	if (grow_buffers(sh, conf->raid_disks)) {
				327	shrink_buffers(sh, conf->raid_disks);
				328	kmem_cache_free(sc, sh);
				329	return 1;
				330	}
				331	/* we just created an active stripe so... */
				332	atomic_set(&sh->count, 1);
				333	atomic_inc(&conf->active_stripes);
				334	INIT_LIST_HEAD(&sh->lru);
				335	release_stripe(sh);
				336	}
				337	return 0;
				338	}
				339
				340	static void shrink_stripes(raid6_conf_t *conf)
				341	{
				342	struct stripe_head *sh;
				343
				344	while (1) {
				345	spin_lock_irq(&conf->device_lock);
				346	sh = get_free_stripe(conf);
				347	spin_unlock_irq(&conf->device_lock);
				348	if (!sh)
				349	break;
				350	if (atomic_read(&sh->count))
				351	BUG();
				352	shrink_buffers(sh, conf->raid_disks);
				353	kmem_cache_free(conf->slab_cache, sh);
				354	atomic_dec(&conf->active_stripes);
				355	}
				356	kmem_cache_destroy(conf->slab_cache);
				357	conf->slab_cache = NULL;
				358	}
				359
				360	static int raid6_end_read_request (struct bio * bi, unsigned int bytes_done,
				361	int error)
				362	{
				363	struct stripe_head *sh = bi->bi_private;
				364	raid6_conf_t *conf = sh->raid_conf;
				365	int disks = conf->raid_disks, i;
				366	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
				367
				368	if (bi->bi_size)
				369	return 1;
				370
				371	for (i=0 ; i<disks; i++)
				372	if (bi == &sh->dev[i].req)
				373	break;
				374
				375	PRINTK("end_read_request %llu/%d, count: %d, uptodate %d.\n",
				376	(unsigned long long)sh->sector, i, atomic_read(&sh->count),
				377	uptodate);
				378	if (i == disks) {
				379	BUG();
				380	return 0;
				381	}
				382
				383	if (uptodate) {
				384	#if 0
				385	struct bio *bio;
				386	unsigned long flags;
				387	spin_lock_irqsave(&conf->device_lock, flags);
				388	/* we can return a buffer if we bypassed the cache or
				389	* if the top buffer is not in highmem. If there are
				390	* multiple buffers, leave the extra work to
				391	* handle_stripe
				392	*/
				393	buffer = sh->bh_read[i];
				394	if (buffer &&
				395	(!PageHighMem(buffer->b_page)
				396	\|\| buffer->b_page == bh->b_page )
				397	) {
				398	sh->bh_read[i] = buffer->b_reqnext;
				399	buffer->b_reqnext = NULL;
				400	} else
				401	buffer = NULL;
				402	spin_unlock_irqrestore(&conf->device_lock, flags);
				403	if (sh->bh_page[i]==bh->b_page)
				404	set_buffer_uptodate(bh);
				405	if (buffer) {
				406	if (buffer->b_page != bh->b_page)
				407	memcpy(buffer->b_data, bh->b_data, bh->b_size);
				408	buffer->b_end_io(buffer, 1);
				409	}
				410	#else
				411	set_bit(R5_UPTODATE, &sh->dev[i].flags);
				412	#endif
				413	} else {
				414	md_error(conf->mddev, conf->disks[i].rdev);
				415	clear_bit(R5_UPTODATE, &sh->dev[i].flags);
				416	}
				417	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
				418	#if 0
				419	/* must restore b_page before unlocking buffer... */
				420	if (sh->bh_page[i] != bh->b_page) {
				421	bh->b_page = sh->bh_page[i];
				422	bh->b_data = page_address(bh->b_page);
				423	clear_buffer_uptodate(bh);
				424	}
				425	#endif
				426	clear_bit(R5_LOCKED, &sh->dev[i].flags);
				427	set_bit(STRIPE_HANDLE, &sh->state);
				428	release_stripe(sh);
				429	return 0;
				430	}
				431
				432	static int raid6_end_write_request (struct bio *bi, unsigned int bytes_done,
				433	int error)
				434	{
				435	struct stripe_head *sh = bi->bi_private;
				436	raid6_conf_t *conf = sh->raid_conf;
				437	int disks = conf->raid_disks, i;
				438	unsigned long flags;
				439	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
				440
				441	if (bi->bi_size)
				442	return 1;
				443
				444	for (i=0 ; i<disks; i++)
				445	if (bi == &sh->dev[i].req)
				446	break;
				447
				448	PRINTK("end_write_request %llu/%d, count %d, uptodate: %d.\n",
				449	(unsigned long long)sh->sector, i, atomic_read(&sh->count),
				450	uptodate);
				451	if (i == disks) {
				452	BUG();
				453	return 0;
				454	}
				455
				456	spin_lock_irqsave(&conf->device_lock, flags);
				457	if (!uptodate)
				458	md_error(conf->mddev, conf->disks[i].rdev);
				459
				460	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
				461
				462	clear_bit(R5_LOCKED, &sh->dev[i].flags);
				463	set_bit(STRIPE_HANDLE, &sh->state);
				464	__release_stripe(conf, sh);
				465	spin_unlock_irqrestore(&conf->device_lock, flags);
				466	return 0;
				467	}
				468
				469
				470	static sector_t compute_blocknr(struct stripe_head *sh, int i);
				471
				472	static void raid6_build_block (struct stripe_head *sh, int i)
				473	{
				474	struct r5dev *dev = &sh->dev[i];
				475	int pd_idx = sh->pd_idx;
				476	int qd_idx = raid6_next_disk(pd_idx, sh->raid_conf->raid_disks);
				477
				478	bio_init(&dev->req);
				479	dev->req.bi_io_vec = &dev->vec;
				480	dev->req.bi_vcnt++;
				481	dev->req.bi_max_vecs++;
				482	dev->vec.bv_page = dev->page;
				483	dev->vec.bv_len = STRIPE_SIZE;
				484	dev->vec.bv_offset = 0;
				485
				486	dev->req.bi_sector = sh->sector;
				487	dev->req.bi_private = sh;
				488
				489	dev->flags = 0;
				490	if (i != pd_idx && i != qd_idx)
				491	dev->sector = compute_blocknr(sh, i);
				492	}
				493
				494	static void error(mddev_t mddev, mdk_rdev_t rdev)
				495	{
				496	char b[BDEVNAME_SIZE];
				497	raid6_conf_t conf = (raid6_conf_t ) mddev->private;
				498	PRINTK("raid6: error called\n");
				499
				500	if (!rdev->faulty) {
				501	mddev->sb_dirty = 1;
				502	if (rdev->in_sync) {
				503	conf->working_disks--;
				504	mddev->degraded++;
				505	conf->failed_disks++;
				506	rdev->in_sync = 0;
				507	/*
				508	* if recovery was running, make sure it aborts.
				509	*/
				510	set_bit(MD_RECOVERY_ERR, &mddev->recovery);
				511	}
				512	rdev->faulty = 1;
				513	printk (KERN_ALERT
				514	"raid6: Disk failure on %s, disabling device."
				515	" Operation continuing on %d devices\n",
				516	bdevname(rdev->bdev,b), conf->working_disks);
				517	}
				518	}
				519
				520	/*
				521	* Input: a 'big' sector number,
				522	* Output: index of the data and parity disk, and the sector # in them.
				523	*/
				524	static sector_t raid6_compute_sector(sector_t r_sector, unsigned int raid_disks,
				525	unsigned int data_disks, unsigned int * dd_idx,
				526	unsigned int * pd_idx, raid6_conf_t *conf)
				527	{
				528	long stripe;
				529	unsigned long chunk_number;
				530	unsigned int chunk_offset;
				531	sector_t new_sector;
				532	int sectors_per_chunk = conf->chunk_size >> 9;
				533
				534	/* First compute the information on this sector */
				535
				536	/*
				537	* Compute the chunk number and the sector offset inside the chunk
				538	*/
				539	chunk_offset = sector_div(r_sector, sectors_per_chunk);
				540	chunk_number = r_sector;
				541	if ( r_sector != chunk_number ) {
				542	printk(KERN_CRIT "raid6: ERROR: r_sector = %llu, chunk_number = %lu\n",
				543	(unsigned long long)r_sector, (unsigned long)chunk_number);
				544	BUG();
				545	}
				546
				547	/*
				548	* Compute the stripe number
				549	*/
				550	stripe = chunk_number / data_disks;
				551
				552	/*
				553	* Compute the data disk and parity disk indexes inside the stripe
				554	*/
				555	*dd_idx = chunk_number % data_disks;
				556
				557	/*
				558	* Select the parity disk based on the user selected algorithm.
				559	*/
				560
				561	/** FIX THIS **/
				562	switch (conf->algorithm) {
				563	case ALGORITHM_LEFT_ASYMMETRIC:
				564	*pd_idx = raid_disks - 1 - (stripe % raid_disks);
				565	if (*pd_idx == raid_disks-1)
				566	(dd_idx)++; / Q D D D P */
				567	else if (dd_idx >= pd_idx)
				568	(dd_idx) += 2; / D D P Q D */
				569	break;
				570	case ALGORITHM_RIGHT_ASYMMETRIC:
				571	*pd_idx = stripe % raid_disks;
				572	if (*pd_idx == raid_disks-1)
				573	(dd_idx)++; / Q D D D P */
				574	else if (dd_idx >= pd_idx)
				575	(dd_idx) += 2; / D D P Q D */
				576	break;
				577	case ALGORITHM_LEFT_SYMMETRIC:
				578	*pd_idx = raid_disks - 1 - (stripe % raid_disks);
				579	dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
				580	break;
				581	case ALGORITHM_RIGHT_SYMMETRIC:
				582	*pd_idx = stripe % raid_disks;
				583	dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
				584	break;
				585	default:
				586	printk (KERN_CRIT "raid6: unsupported algorithm %d\n",
				587	conf->algorithm);
				588	}
				589
				590	PRINTK("raid6: chunk_number = %lu, pd_idx = %u, dd_idx = %u\n",
				591	chunk_number, pd_idx, dd_idx);
				592
				593	/*
				594	* Finally, compute the new sector number
				595	*/
				596	new_sector = (sector_t) stripe * sectors_per_chunk + chunk_offset;
				597	return new_sector;
				598	}
				599
				600
				601	static sector_t compute_blocknr(struct stripe_head *sh, int i)
				602	{
				603	raid6_conf_t *conf = sh->raid_conf;
				604	int raid_disks = conf->raid_disks, data_disks = raid_disks - 2;
				605	sector_t new_sector = sh->sector, check;
				606	int sectors_per_chunk = conf->chunk_size >> 9;
				607	sector_t stripe;
				608	int chunk_offset;
				609	int chunk_number, dummy1, dummy2, dd_idx = i;
				610	sector_t r_sector;
				611	int i0 = i;
				612
				613	chunk_offset = sector_div(new_sector, sectors_per_chunk);
				614	stripe = new_sector;
				615	if ( new_sector != stripe ) {
				616	printk(KERN_CRIT "raid6: ERROR: new_sector = %llu, stripe = %lu\n",
				617	(unsigned long long)new_sector, (unsigned long)stripe);
				618	BUG();
				619	}
				620
				621	switch (conf->algorithm) {
				622	case ALGORITHM_LEFT_ASYMMETRIC:
				623	case ALGORITHM_RIGHT_ASYMMETRIC:
				624	if (sh->pd_idx == raid_disks-1)
				625	i--; /* Q D D D P */
				626	else if (i > sh->pd_idx)
				627	i -= 2; /* D D P Q D */
				628	break;
				629	case ALGORITHM_LEFT_SYMMETRIC:
				630	case ALGORITHM_RIGHT_SYMMETRIC:
				631	if (sh->pd_idx == raid_disks-1)
				632	i--; /* Q D D D P */
				633	else {
				634	/* D D P Q D */
				635	if (i < sh->pd_idx)
				636	i += raid_disks;
				637	i -= (sh->pd_idx + 2);
				638	}
				639	break;
				640	default:
				641	printk (KERN_CRIT "raid6: unsupported algorithm %d\n",
				642	conf->algorithm);
				643	}
				644
				645	PRINTK("raid6: compute_blocknr: pd_idx = %u, i0 = %u, i = %u\n", sh->pd_idx, i0, i);
				646
				647	chunk_number = stripe * data_disks + i;
				648	r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset;
				649
				650	check = raid6_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
				651	if (check != sh->sector \|\| dummy1 != dd_idx \|\| dummy2 != sh->pd_idx) {
				652	printk(KERN_CRIT "raid6: compute_blocknr: map not correct\n");
				653	return 0;
				654	}
				655	return r_sector;
				656	}
				657
				658
				659
				660	/*
				661	* Copy data between a page in the stripe cache, and one or more bion
				662	* The page could align with the middle of the bio, or there could be
				663	* several bion, each with several bio_vecs, which cover part of the page
				664	* Multiple bion are linked together on bi_next. There may be extras
				665	* at the end of this list. We ignore them.
				666	*/
				667	static void copy_data(int frombio, struct bio *bio,
				668	struct page *page,
				669	sector_t sector)
				670	{
				671	char *pa = page_address(page);
				672	struct bio_vec *bvl;
				673	int i;
				674	int page_offset;
				675
				676	if (bio->bi_sector >= sector)
				677	page_offset = (signed)(bio->bi_sector - sector) * 512;
				678	else
				679	page_offset = (signed)(sector - bio->bi_sector) * -512;
				680	bio_for_each_segment(bvl, bio, i) {
				681	int len = bio_iovec_idx(bio,i)->bv_len;
				682	int clen;
				683	int b_offset = 0;
				684
				685	if (page_offset < 0) {
				686	b_offset = -page_offset;
				687	page_offset += b_offset;
				688	len -= b_offset;
				689	}
				690
				691	if (len > 0 && page_offset + len > STRIPE_SIZE)
				692	clen = STRIPE_SIZE - page_offset;
				693	else clen = len;
				694
				695	if (clen > 0) {
				696	char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
				697	if (frombio)
				698	memcpy(pa+page_offset, ba+b_offset, clen);
				699	else
				700	memcpy(ba+b_offset, pa+page_offset, clen);
				701	__bio_kunmap_atomic(ba, KM_USER0);
				702	}
				703	if (clen < len) /* hit end of page */
				704	break;
				705	page_offset += len;
				706	}
				707	}
				708
				709	#define check_xor() do { \
				710	if (count == MAX_XOR_BLOCKS) { \
				711	xor_block(count, STRIPE_SIZE, ptr); \
				712	count = 1; \
				713	} \
				714	} while(0)
				715
				716	/* Compute P and Q syndromes */
				717	static void compute_parity(struct stripe_head *sh, int method)
				718	{
				719	raid6_conf_t *conf = sh->raid_conf;
				720	int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
				721	struct bio *chosen;
				722	/** FIX THIS: This could be very bad if disks is close to 256 **/
				723	void *ptrs[disks];
				724
				725	qd_idx = raid6_next_disk(pd_idx, disks);
				726	d0_idx = raid6_next_disk(qd_idx, disks);
				727
				728	PRINTK("compute_parity, stripe %llu, method %d\n",
				729	(unsigned long long)sh->sector, method);
				730
				731	switch(method) {
				732	case READ_MODIFY_WRITE:
				733	BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */
				734	case RECONSTRUCT_WRITE:
				735	for (i= disks; i-- ;)
				736	if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
				737	chosen = sh->dev[i].towrite;
				738	sh->dev[i].towrite = NULL;
				739
				740	if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
				741	wake_up(&conf->wait_for_overlap);
				742
				743	if (sh->dev[i].written) BUG();
				744	sh->dev[i].written = chosen;
				745	}
				746	break;
				747	case CHECK_PARITY:
				748	BUG(); /* Not implemented yet */
				749	}
				750
				751	for (i = disks; i--;)
				752	if (sh->dev[i].written) {
				753	sector_t sector = sh->dev[i].sector;
				754	struct bio *wbi = sh->dev[i].written;
				755	while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
				756	copy_data(1, wbi, sh->dev[i].page, sector);
				757	wbi = r5_next_bio(wbi, sector);
				758	}
				759
				760	set_bit(R5_LOCKED, &sh->dev[i].flags);
				761	set_bit(R5_UPTODATE, &sh->dev[i].flags);
				762	}
				763
				764	// switch(method) {
				765	// case RECONSTRUCT_WRITE:
				766	// case CHECK_PARITY:
				767	// case UPDATE_PARITY:
				768	/* Note that unlike RAID-5, the ordering of the disks matters greatly. */
				769	/* FIX: Is this ordering of drives even remotely optimal? */
				770	count = 0;
				771	i = d0_idx;
				772	do {
				773	ptrs[count++] = page_address(sh->dev[i].page);
				774	if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
				775	printk("block %d/%d not uptodate on parity calc\n", i,count);
				776	i = raid6_next_disk(i, disks);
				777	} while ( i != d0_idx );
				778	// break;
				779	// }
				780
				781	raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs);
				782
				783	switch(method) {
				784	case RECONSTRUCT_WRITE:
				785	set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
				786	set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
				787	set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
				788	set_bit(R5_LOCKED, &sh->dev[qd_idx].flags);
				789	break;
				790	case UPDATE_PARITY:
				791	set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
				792	set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
				793	break;
				794	}
				795	}
				796
				797	/* Compute one missing block */
				798	static void compute_block_1(struct stripe_head *sh, int dd_idx)
				799	{
				800	raid6_conf_t *conf = sh->raid_conf;
				801	int i, count, disks = conf->raid_disks;
				802	void ptr[MAX_XOR_BLOCKS], p;
				803	int pd_idx = sh->pd_idx;
				804	int qd_idx = raid6_next_disk(pd_idx, disks);
				805
				806	PRINTK("compute_block_1, stripe %llu, idx %d\n",
				807	(unsigned long long)sh->sector, dd_idx);
				808
				809	if ( dd_idx == qd_idx ) {
				810	/* We're actually computing the Q drive */
				811	compute_parity(sh, UPDATE_PARITY);
				812	} else {
				813	ptr[0] = page_address(sh->dev[dd_idx].page);
				814	memset(ptr[0], 0, STRIPE_SIZE);
				815	count = 1;
				816	for (i = disks ; i--; ) {
				817	if (i == dd_idx \|\| i == qd_idx)
				818	continue;
				819	p = page_address(sh->dev[i].page);
				820	if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
				821	ptr[count++] = p;
				822	else
				823	printk("compute_block() %d, stripe %llu, %d"
				824	" not present\n", dd_idx,
				825	(unsigned long long)sh->sector, i);
				826
				827	check_xor();
				828	}
				829	if (count != 1)
				830	xor_block(count, STRIPE_SIZE, ptr);
				831	set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
				832	}
				833	}
				834
				835	/* Compute two missing blocks */
				836	static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
				837	{
				838	raid6_conf_t *conf = sh->raid_conf;
				839	int i, count, disks = conf->raid_disks;
				840	int pd_idx = sh->pd_idx;
				841	int qd_idx = raid6_next_disk(pd_idx, disks);
				842	int d0_idx = raid6_next_disk(qd_idx, disks);
				843	int faila, failb;
				844
				845	/* faila and failb are disk numbers relative to d0_idx */
				846	/* pd_idx become disks-2 and qd_idx become disks-1 */
				847	faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx;
				848	failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx;
				849
				850	BUG_ON(faila == failb);
				851	if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
				852
				853	PRINTK("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
				854	(unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb);
				855
				856	if ( failb == disks-1 ) {
				857	/* Q disk is one of the missing disks */
				858	if ( faila == disks-2 ) {
				859	/* Missing P+Q, just recompute */
				860	compute_parity(sh, UPDATE_PARITY);
				861	return;
				862	} else {
				863	/* We're missing D+Q; recompute D from P */
				864	compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1);
				865	compute_parity(sh, UPDATE_PARITY); /* Is this necessary? */
				866	return;
				867	}
				868	}
				869
				870	/* We're missing D+P or D+D; build pointer table */
				871	{
				872	/** FIX THIS: This could be very bad if disks is close to 256 **/
				873	void *ptrs[disks];
				874
				875	count = 0;
				876	i = d0_idx;
				877	do {
				878	ptrs[count++] = page_address(sh->dev[i].page);
				879	i = raid6_next_disk(i, disks);
				880	if (i != dd_idx1 && i != dd_idx2 &&
				881	!test_bit(R5_UPTODATE, &sh->dev[i].flags))
				882	printk("compute_2 with missing block %d/%d\n", count, i);
				883	} while ( i != d0_idx );
				884
				885	if ( failb == disks-2 ) {
				886	/* We're missing D+P. */
				887	raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs);
				888	} else {
				889	/* We're missing D+D. */
				890	raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs);
				891	}
				892
				893	/* Both the above update both missing blocks */
				894	set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
				895	set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
				896	}
				897	}
				898
				899
				900	/*
				901	* Each stripe/dev can have one or more bion attached.
				902	* toread/towrite point to the first in a chain.
				903	* The bi_next chain must be in order.
				904	*/
				905	static int add_stripe_bio(struct stripe_head sh, struct bio bi, int dd_idx, int forwrite)
				906	{
				907	struct bio **bip;
				908	raid6_conf_t *conf = sh->raid_conf;
				909
				910	PRINTK("adding bh b#%llu to stripe s#%llu\n",
				911	(unsigned long long)bi->bi_sector,
				912	(unsigned long long)sh->sector);
				913
				914
				915	spin_lock(&sh->lock);
				916	spin_lock_irq(&conf->device_lock);
				917	if (forwrite)
				918	bip = &sh->dev[dd_idx].towrite;
				919	else
				920	bip = &sh->dev[dd_idx].toread;
				921	while (bip && (bip)->bi_sector < bi->bi_sector) {
				922	if ((bip)->bi_sector + ((bip)->bi_size >> 9) > bi->bi_sector)
				923	goto overlap;
				924	bip = &(*bip)->bi_next;
				925	}
				926	if (bip && (bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
				927	goto overlap;
				928
				929	if (bip && bi->bi_next && (bip) != bi->bi_next)
				930	BUG();
				931	if (*bip)
				932	bi->bi_next = *bip;
				933	*bip = bi;
				934	bi->bi_phys_segments ++;
				935	spin_unlock_irq(&conf->device_lock);
				936	spin_unlock(&sh->lock);
				937
				938	PRINTK("added bi b#%llu to stripe s#%llu, disk %d.\n",
				939	(unsigned long long)bi->bi_sector,
				940	(unsigned long long)sh->sector, dd_idx);
				941
				942	if (forwrite) {
				943	/* check if page is covered */
				944	sector_t sector = sh->dev[dd_idx].sector;
				945	for (bi=sh->dev[dd_idx].towrite;
				946	sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
				947	bi && bi->bi_sector <= sector;
				948	bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
				949	if (bi->bi_sector + (bi->bi_size>>9) >= sector)
				950	sector = bi->bi_sector + (bi->bi_size>>9);
				951	}
				952	if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
				953	set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
				954	}
				955	return 1;
				956
				957	overlap:
				958	set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
				959	spin_unlock_irq(&conf->device_lock);
				960	spin_unlock(&sh->lock);
				961	return 0;
				962	}
				963
				964
				965	/*
				966	* handle_stripe - do things to a stripe.
				967	*
				968	* We lock the stripe and then examine the state of various bits
				969	* to see what needs to be done.
				970	* Possible results:
				971	* return some read request which now have data
				972	* return some write requests which are safely on disc
				973	* schedule a read on some buffers
				974	* schedule a write of some buffers
				975	* return confirmation of parity correctness
				976	*
				977	* Parity calculations are done inside the stripe lock
				978	* buffers are taken off read_list or write_list, and bh_cache buffers
				979	* get BH_Lock set before the stripe lock is released.
				980	*
				981	*/
				982
				983	static void handle_stripe(struct stripe_head *sh)
				984	{
				985	raid6_conf_t *conf = sh->raid_conf;
				986	int disks = conf->raid_disks;
				987	struct bio *return_bi= NULL;
				988	struct bio *bi;
				989	int i;
				990	int syncing;
				991	int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
				992	int non_overwrite = 0;
				993	int failed_num[2] = {0, 0};
				994	struct r5dev dev, pdev, *qdev;
				995	int pd_idx = sh->pd_idx;
				996	int qd_idx = raid6_next_disk(pd_idx, disks);
				997	int p_failed, q_failed;
				998
				999	PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d, qd_idx=%d\n",
				1000	(unsigned long long)sh->sector, sh->state, atomic_read(&sh->count),
				1001	pd_idx, qd_idx);
				1002
				1003	spin_lock(&sh->lock);
				1004	clear_bit(STRIPE_HANDLE, &sh->state);
				1005	clear_bit(STRIPE_DELAYED, &sh->state);
				1006
				1007	syncing = test_bit(STRIPE_SYNCING, &sh->state);
				1008	/* Now to look around and see what can be done */
				1009
				1010	for (i=disks; i--; ) {
				1011	mdk_rdev_t *rdev;
				1012	dev = &sh->dev[i];
				1013	clear_bit(R5_Insync, &dev->flags);
				1014	clear_bit(R5_Syncio, &dev->flags);
				1015
				1016	PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
				1017	i, dev->flags, dev->toread, dev->towrite, dev->written);
				1018	/* maybe we can reply to a read */
				1019	if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
				1020	struct bio rbi, rbi2;
				1021	PRINTK("Return read for disc %d\n", i);
				1022	spin_lock_irq(&conf->device_lock);
				1023	rbi = dev->toread;
				1024	dev->toread = NULL;
				1025	if (test_and_clear_bit(R5_Overlap, &dev->flags))
				1026	wake_up(&conf->wait_for_overlap);
				1027	spin_unlock_irq(&conf->device_lock);
				1028	while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
				1029	copy_data(0, rbi, dev->page, dev->sector);
				1030	rbi2 = r5_next_bio(rbi, dev->sector);
				1031	spin_lock_irq(&conf->device_lock);
				1032	if (--rbi->bi_phys_segments == 0) {
				1033	rbi->bi_next = return_bi;
				1034	return_bi = rbi;
				1035	}
				1036	spin_unlock_irq(&conf->device_lock);
				1037	rbi = rbi2;
				1038	}
				1039	}
				1040
				1041	/* now count some things */
				1042	if (test_bit(R5_LOCKED, &dev->flags)) locked++;
				1043	if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
				1044
				1045
				1046	if (dev->toread) to_read++;
				1047	if (dev->towrite) {
				1048	to_write++;
				1049	if (!test_bit(R5_OVERWRITE, &dev->flags))
				1050	non_overwrite++;
				1051	}
				1052	if (dev->written) written++;
				1053	rdev = conf->disks[i].rdev; /* FIXME, should I be looking rdev */
				1054	if (!rdev \|\| !rdev->in_sync) {
				1055	if ( failed < 2 )
				1056	failed_num[failed] = i;
				1057	failed++;
				1058	} else
				1059	set_bit(R5_Insync, &dev->flags);
				1060	}
				1061	PRINTK("locked=%d uptodate=%d to_read=%d"
				1062	" to_write=%d failed=%d failed_num=%d,%d\n",
				1063	locked, uptodate, to_read, to_write, failed,
				1064	failed_num[0], failed_num[1]);
				1065	/* check if the array has lost >2 devices and, if so, some requests might
				1066	* need to be failed
				1067	*/
				1068	if (failed > 2 && to_read+to_write+written) {
				1069	spin_lock_irq(&conf->device_lock);
				1070	for (i=disks; i--; ) {
				1071	/* fail all writes first */
				1072	bi = sh->dev[i].towrite;
				1073	sh->dev[i].towrite = NULL;
				1074	if (bi) to_write--;
				1075
				1076	if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
				1077	wake_up(&conf->wait_for_overlap);
				1078
				1079	while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
				1080	struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
				1081	clear_bit(BIO_UPTODATE, &bi->bi_flags);
				1082	if (--bi->bi_phys_segments == 0) {
				1083	md_write_end(conf->mddev);
				1084	bi->bi_next = return_bi;
				1085	return_bi = bi;
				1086	}
				1087	bi = nextbi;
				1088	}
				1089	/* and fail all 'written' */
				1090	bi = sh->dev[i].written;
				1091	sh->dev[i].written = NULL;
				1092	while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
				1093	struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
				1094	clear_bit(BIO_UPTODATE, &bi->bi_flags);
				1095	if (--bi->bi_phys_segments == 0) {
				1096	md_write_end(conf->mddev);
				1097	bi->bi_next = return_bi;
				1098	return_bi = bi;
				1099	}
				1100	bi = bi2;
				1101	}
				1102
				1103	/* fail any reads if this device is non-operational */
				1104	if (!test_bit(R5_Insync, &sh->dev[i].flags)) {
				1105	bi = sh->dev[i].toread;
				1106	sh->dev[i].toread = NULL;
				1107	if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
				1108	wake_up(&conf->wait_for_overlap);
				1109	if (bi) to_read--;
				1110	while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
				1111	struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
				1112	clear_bit(BIO_UPTODATE, &bi->bi_flags);
				1113	if (--bi->bi_phys_segments == 0) {
				1114	bi->bi_next = return_bi;
				1115	return_bi = bi;
				1116	}
				1117	bi = nextbi;
				1118	}
				1119	}
				1120	}
				1121	spin_unlock_irq(&conf->device_lock);
				1122	}
				1123	if (failed > 2 && syncing) {
				1124	md_done_sync(conf->mddev, STRIPE_SECTORS,0);
				1125	clear_bit(STRIPE_SYNCING, &sh->state);
				1126	syncing = 0;
				1127	}
				1128
				1129	/*
				1130	* might be able to return some write requests if the parity blocks
				1131	* are safe, or on a failed drive
				1132	*/
				1133	pdev = &sh->dev[pd_idx];
				1134	p_failed = (failed >= 1 && failed_num[0] == pd_idx)
				1135	\|\| (failed >= 2 && failed_num[1] == pd_idx);
				1136	qdev = &sh->dev[qd_idx];
				1137	q_failed = (failed >= 1 && failed_num[0] == qd_idx)
				1138	\|\| (failed >= 2 && failed_num[1] == qd_idx);
				1139
				1140	if ( written &&
				1141	( p_failed \|\| ((test_bit(R5_Insync, &pdev->flags)
				1142	&& !test_bit(R5_LOCKED, &pdev->flags)
				1143	&& test_bit(R5_UPTODATE, &pdev->flags))) ) &&
				1144	( q_failed \|\| ((test_bit(R5_Insync, &qdev->flags)
				1145	&& !test_bit(R5_LOCKED, &qdev->flags)
				1146	&& test_bit(R5_UPTODATE, &qdev->flags))) ) ) {
				1147	/* any written block on an uptodate or failed drive can be
				1148	* returned. Note that if we 'wrote' to a failed drive,
				1149	* it will be UPTODATE, but never LOCKED, so we don't need
				1150	* to test 'failed' directly.
				1151	*/
				1152	for (i=disks; i--; )
				1153	if (sh->dev[i].written) {
				1154	dev = &sh->dev[i];
				1155	if (!test_bit(R5_LOCKED, &dev->flags) &&
				1156	test_bit(R5_UPTODATE, &dev->flags) ) {
				1157	/* We can return any write requests */
				1158	struct bio wbi, wbi2;
				1159	PRINTK("Return write for stripe %llu disc %d\n",
				1160	(unsigned long long)sh->sector, i);
				1161	spin_lock_irq(&conf->device_lock);
				1162	wbi = dev->written;
				1163	dev->written = NULL;
				1164	while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
				1165	wbi2 = r5_next_bio(wbi, dev->sector);
				1166	if (--wbi->bi_phys_segments == 0) {
				1167	md_write_end(conf->mddev);
				1168	wbi->bi_next = return_bi;
				1169	return_bi = wbi;
				1170	}
				1171	wbi = wbi2;
				1172	}
				1173	spin_unlock_irq(&conf->device_lock);
				1174	}
				1175	}
				1176	}
				1177
				1178	/* Now we might consider reading some blocks, either to check/generate
				1179	* parity, or to satisfy requests
				1180	* or to load a block that is being partially written.
				1181	*/
				1182	if (to_read \|\| non_overwrite \|\| (to_write && failed) \|\| (syncing && (uptodate < disks))) {
				1183	for (i=disks; i--;) {
				1184	dev = &sh->dev[i];
				1185	if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
				1186	(dev->toread \|\|
				1187	(dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) \|\|
				1188	syncing \|\|
				1189	(failed >= 1 && (sh->dev[failed_num[0]].toread \|\| to_write)) \|\|
				1190	(failed >= 2 && (sh->dev[failed_num[1]].toread \|\| to_write))
				1191	)
				1192	) {
				1193	/* we would like to get this block, possibly
				1194	* by computing it, but we might not be able to
				1195	*/
				1196	if (uptodate == disks-1) {
				1197	PRINTK("Computing stripe %llu block %d\n",
				1198	(unsigned long long)sh->sector, i);
				1199	compute_block_1(sh, i);
				1200	uptodate++;
				1201	} else if ( uptodate == disks-2 && failed >= 2 ) {
				1202	/* Computing 2-failure is very expensive; only do it if failed >= 2 */
				1203	int other;
				1204	for (other=disks; other--;) {
				1205	if ( other == i )
				1206	continue;
				1207	if ( !test_bit(R5_UPTODATE, &sh->dev[other].flags) )
				1208	break;
				1209	}
				1210	BUG_ON(other < 0);
				1211	PRINTK("Computing stripe %llu blocks %d,%d\n",
				1212	(unsigned long long)sh->sector, i, other);
				1213	compute_block_2(sh, i, other);
				1214	uptodate += 2;
				1215	} else if (test_bit(R5_Insync, &dev->flags)) {
				1216	set_bit(R5_LOCKED, &dev->flags);
				1217	set_bit(R5_Wantread, &dev->flags);
				1218	#if 0
				1219	/* if I am just reading this block and we don't have
				1220	a failed drive, or any pending writes then sidestep the cache */
				1221	if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
				1222	! syncing && !failed && !to_write) {
				1223	sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
				1224	sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
				1225	}
				1226	#endif
				1227	locked++;
				1228	PRINTK("Reading block %d (sync=%d)\n",
				1229	i, syncing);
				1230	if (syncing)
				1231	md_sync_acct(conf->disks[i].rdev->bdev,
				1232	STRIPE_SECTORS);
				1233	}
				1234	}
				1235	}
				1236	set_bit(STRIPE_HANDLE, &sh->state);
				1237	}
				1238
				1239	/* now to consider writing and what else, if anything should be read */
				1240	if (to_write) {
				1241	int rcw=0, must_compute=0;
				1242	for (i=disks ; i--;) {
				1243	dev = &sh->dev[i];
				1244	/* Would I have to read this buffer for reconstruct_write */
				1245	if (!test_bit(R5_OVERWRITE, &dev->flags)
				1246	&& i != pd_idx && i != qd_idx
				1247	&& (!test_bit(R5_LOCKED, &dev->flags)
				1248	#if 0
				1249	\|\| sh->bh_page[i] != bh->b_page
				1250	#endif
				1251	) &&
				1252	!test_bit(R5_UPTODATE, &dev->flags)) {
				1253	if (test_bit(R5_Insync, &dev->flags)) rcw++;
				1254	else {
				1255	PRINTK("raid6: must_compute: disk %d flags=%#lx\n", i, dev->flags);
				1256	must_compute++;
				1257	}
				1258	}
				1259	}
				1260	PRINTK("for sector %llu, rcw=%d, must_compute=%d\n",
				1261	(unsigned long long)sh->sector, rcw, must_compute);
				1262	set_bit(STRIPE_HANDLE, &sh->state);
				1263
				1264	if (rcw > 0)
				1265	/* want reconstruct write, but need to get some data */
				1266	for (i=disks; i--;) {
				1267	dev = &sh->dev[i];
				1268	if (!test_bit(R5_OVERWRITE, &dev->flags)
				1269	&& !(failed == 0 && (i == pd_idx \|\| i == qd_idx))
				1270	&& !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
				1271	test_bit(R5_Insync, &dev->flags)) {
				1272	if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
				1273	{
				1274	PRINTK("Read_old stripe %llu block %d for Reconstruct\n",
				1275	(unsigned long long)sh->sector, i);
				1276	set_bit(R5_LOCKED, &dev->flags);
				1277	set_bit(R5_Wantread, &dev->flags);
				1278	locked++;
				1279	} else {
				1280	PRINTK("Request delayed stripe %llu block %d for Reconstruct\n",
				1281	(unsigned long long)sh->sector, i);
				1282	set_bit(STRIPE_DELAYED, &sh->state);
				1283	set_bit(STRIPE_HANDLE, &sh->state);
				1284	}
				1285	}
				1286	}
				1287	/* now if nothing is locked, and if we have enough data, we can start a write request */
				1288	if (locked == 0 && rcw == 0) {
				1289	if ( must_compute > 0 ) {
				1290	/* We have failed blocks and need to compute them */
				1291	switch ( failed ) {
				1292	case 0: BUG();
				1293	case 1: compute_block_1(sh, failed_num[0]); break;
				1294	case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break;
				1295	default: BUG(); /* This request should have been failed? */
				1296	}
				1297	}
				1298
				1299	PRINTK("Computing parity for stripe %llu\n", (unsigned long long)sh->sector);
				1300	compute_parity(sh, RECONSTRUCT_WRITE);
				1301	/* now every locked buffer is ready to be written */
				1302	for (i=disks; i--;)
				1303	if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
				1304	PRINTK("Writing stripe %llu block %d\n",
				1305	(unsigned long long)sh->sector, i);
				1306	locked++;
				1307	set_bit(R5_Wantwrite, &sh->dev[i].flags);
				1308	#if 0 /** FIX: I don't understand the logic here... **/
				1309	if (!test_bit(R5_Insync, &sh->dev[i].flags)
				1310	\|\| ((i==pd_idx \|\| i==qd_idx) && failed == 0)) /* FIX? */
				1311	set_bit(STRIPE_INSYNC, &sh->state);
				1312	#endif
				1313	}
				1314	if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
				1315	atomic_dec(&conf->preread_active_stripes);
				1316	if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
				1317	md_wakeup_thread(conf->mddev->thread);
				1318	}
				1319	}
				1320	}
				1321
				1322	/* maybe we need to check and possibly fix the parity for this stripe
				1323	* Any reads will already have been scheduled, so we just see if enough data
				1324	* is available
				1325	*/
				1326	if (syncing && locked == 0 &&
				1327	!test_bit(STRIPE_INSYNC, &sh->state) && failed <= 2) {
				1328	set_bit(STRIPE_HANDLE, &sh->state);
				1329	#if 0 /* RAID-6: Don't support CHECK PARITY yet */
				1330	if (failed == 0) {
				1331	char *pagea;
				1332	if (uptodate != disks)
				1333	BUG();
				1334	compute_parity(sh, CHECK_PARITY);
				1335	uptodate--;
				1336	pagea = page_address(sh->dev[pd_idx].page);
				1337	if (((u32)pagea) == 0 &&
				1338	!memcmp(pagea, pagea+4, STRIPE_SIZE-4)) {
				1339	/* parity is correct (on disc, not in buffer any more) */
				1340	set_bit(STRIPE_INSYNC, &sh->state);
				1341	}
				1342	}
				1343	#endif
				1344	if (!test_bit(STRIPE_INSYNC, &sh->state)) {
				1345	int failed_needupdate[2];
				1346	struct r5dev adev, bdev;
				1347
				1348	if ( failed < 1 )
				1349	failed_num[0] = pd_idx;
				1350	if ( failed < 2 )
				1351	failed_num[1] = (failed_num[0] == qd_idx) ? pd_idx : qd_idx;
				1352
				1353	failed_needupdate[0] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[0]].flags);
				1354	failed_needupdate[1] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[1]].flags);
				1355
				1356	PRINTK("sync: failed=%d num=%d,%d fnu=%u%u\n",
				1357	failed, failed_num[0], failed_num[1], failed_needupdate[0], failed_needupdate[1]);
				1358
				1359	#if 0 /* RAID-6: This code seems to require that CHECK_PARITY destroys the uptodateness of the parity */
				1360	/* should be able to compute the missing block(s) and write to spare */
				1361	if ( failed_needupdate[0] ^ failed_needupdate[1] ) {
				1362	if (uptodate+1 != disks)
				1363	BUG();
				1364	compute_block_1(sh, failed_needupdate[0] ? failed_num[0] : failed_num[1]);
				1365	uptodate++;
				1366	} else if ( failed_needupdate[0] & failed_needupdate[1] ) {
				1367	if (uptodate+2 != disks)
				1368	BUG();
				1369	compute_block_2(sh, failed_num[0], failed_num[1]);
				1370	uptodate += 2;
				1371	}
				1372	#else
				1373	compute_block_2(sh, failed_num[0], failed_num[1]);
				1374	uptodate += failed_needupdate[0] + failed_needupdate[1];
				1375	#endif
				1376
				1377	if (uptodate != disks)
				1378	BUG();
				1379
				1380	PRINTK("Marking for sync stripe %llu blocks %d,%d\n",
				1381	(unsigned long long)sh->sector, failed_num[0], failed_num[1]);
				1382
				1383	/** FIX: Should we really do both of these unconditionally? **/
				1384	adev = &sh->dev[failed_num[0]];
				1385	locked += !test_bit(R5_LOCKED, &adev->flags);
				1386	set_bit(R5_LOCKED, &adev->flags);
				1387	set_bit(R5_Wantwrite, &adev->flags);
				1388	bdev = &sh->dev[failed_num[1]];
				1389	locked += !test_bit(R5_LOCKED, &bdev->flags);
				1390	set_bit(R5_LOCKED, &bdev->flags);
				1391	set_bit(R5_Wantwrite, &bdev->flags);
				1392
				1393	set_bit(STRIPE_INSYNC, &sh->state);
				1394	set_bit(R5_Syncio, &adev->flags);
				1395	set_bit(R5_Syncio, &bdev->flags);
				1396	}
				1397	}
				1398	if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
				1399	md_done_sync(conf->mddev, STRIPE_SECTORS,1);
				1400	clear_bit(STRIPE_SYNCING, &sh->state);
				1401	}
				1402
				1403	spin_unlock(&sh->lock);
				1404
				1405	while ((bi=return_bi)) {
				1406	int bytes = bi->bi_size;
				1407
				1408	return_bi = bi->bi_next;
				1409	bi->bi_next = NULL;
				1410	bi->bi_size = 0;
				1411	bi->bi_end_io(bi, bytes, 0);
				1412	}
				1413	for (i=disks; i-- ;) {
				1414	int rw;
				1415	struct bio *bi;
				1416	mdk_rdev_t *rdev;
				1417	if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
				1418	rw = 1;
				1419	else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
				1420	rw = 0;
				1421	else
				1422	continue;
				1423
				1424	bi = &sh->dev[i].req;
				1425
				1426	bi->bi_rw = rw;
				1427	if (rw)
				1428	bi->bi_end_io = raid6_end_write_request;
				1429	else
				1430	bi->bi_end_io = raid6_end_read_request;
				1431
				1432	rcu_read_lock();
				1433	rdev = conf->disks[i].rdev;
				1434	if (rdev && rdev->faulty)
				1435	rdev = NULL;
				1436	if (rdev)
				1437	atomic_inc(&rdev->nr_pending);
				1438	rcu_read_unlock();
				1439
				1440	if (rdev) {
				1441	if (test_bit(R5_Syncio, &sh->dev[i].flags))
				1442	md_sync_acct(rdev->bdev, STRIPE_SECTORS);
				1443
				1444	bi->bi_bdev = rdev->bdev;
				1445	PRINTK("for %llu schedule op %ld on disc %d\n",
				1446	(unsigned long long)sh->sector, bi->bi_rw, i);
				1447	atomic_inc(&sh->count);
				1448	bi->bi_sector = sh->sector + rdev->data_offset;
				1449	bi->bi_flags = 1 << BIO_UPTODATE;
				1450	bi->bi_vcnt = 1;
				1451	bi->bi_max_vecs = 1;
				1452	bi->bi_idx = 0;
				1453	bi->bi_io_vec = &sh->dev[i].vec;
				1454	bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
				1455	bi->bi_io_vec[0].bv_offset = 0;
				1456	bi->bi_size = STRIPE_SIZE;
				1457	bi->bi_next = NULL;
				1458	generic_make_request(bi);
				1459	} else {
				1460	PRINTK("skip op %ld on disc %d for sector %llu\n",
				1461	bi->bi_rw, i, (unsigned long long)sh->sector);
				1462	clear_bit(R5_LOCKED, &sh->dev[i].flags);
				1463	set_bit(STRIPE_HANDLE, &sh->state);
				1464	}
				1465	}
				1466	}
				1467
				1468	static inline void raid6_activate_delayed(raid6_conf_t *conf)
				1469	{
				1470	if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
				1471	while (!list_empty(&conf->delayed_list)) {
				1472	struct list_head *l = conf->delayed_list.next;
				1473	struct stripe_head *sh;
				1474	sh = list_entry(l, struct stripe_head, lru);
				1475	list_del_init(l);
				1476	clear_bit(STRIPE_DELAYED, &sh->state);
				1477	if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
				1478	atomic_inc(&conf->preread_active_stripes);
				1479	list_add_tail(&sh->lru, &conf->handle_list);
				1480	}
				1481	}
				1482	}
				1483
				1484	static void unplug_slaves(mddev_t *mddev)
				1485	{
				1486	raid6_conf_t *conf = mddev_to_conf(mddev);
				1487	int i;
				1488
				1489	rcu_read_lock();
				1490	for (i=0; i<mddev->raid_disks; i++) {
				1491	mdk_rdev_t *rdev = conf->disks[i].rdev;
				1492	if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) {
				1493	request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
				1494
				1495	atomic_inc(&rdev->nr_pending);
				1496	rcu_read_unlock();
				1497
				1498	if (r_queue->unplug_fn)
				1499	r_queue->unplug_fn(r_queue);
				1500
				1501	rdev_dec_pending(rdev, mddev);
				1502	rcu_read_lock();
				1503	}
				1504	}
				1505	rcu_read_unlock();
				1506	}
				1507
				1508	static void raid6_unplug_device(request_queue_t *q)
				1509	{
				1510	mddev_t *mddev = q->queuedata;
				1511	raid6_conf_t *conf = mddev_to_conf(mddev);
				1512	unsigned long flags;
				1513
				1514	spin_lock_irqsave(&conf->device_lock, flags);
				1515
				1516	if (blk_remove_plug(q))
				1517	raid6_activate_delayed(conf);
				1518	md_wakeup_thread(mddev->thread);
				1519
				1520	spin_unlock_irqrestore(&conf->device_lock, flags);
				1521
				1522	unplug_slaves(mddev);
				1523	}
				1524
				1525	static int raid6_issue_flush(request_queue_t q, struct gendisk disk,
				1526	sector_t *error_sector)
				1527	{
				1528	mddev_t *mddev = q->queuedata;
				1529	raid6_conf_t *conf = mddev_to_conf(mddev);
				1530	int i, ret = 0;
				1531
				1532	rcu_read_lock();
				1533	for (i=0; i<mddev->raid_disks && ret == 0; i++) {
				1534	mdk_rdev_t *rdev = conf->disks[i].rdev;
				1535	if (rdev && !rdev->faulty) {
				1536	struct block_device *bdev = rdev->bdev;
				1537	request_queue_t *r_queue = bdev_get_queue(bdev);
				1538
				1539	if (!r_queue->issue_flush_fn)
				1540	ret = -EOPNOTSUPP;
				1541	else {
				1542	atomic_inc(&rdev->nr_pending);
				1543	rcu_read_unlock();
				1544	ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
				1545	error_sector);
				1546	rdev_dec_pending(rdev, mddev);
				1547	rcu_read_lock();
				1548	}
				1549	}
				1550	}
				1551	rcu_read_unlock();
				1552	return ret;
				1553	}
				1554
				1555	static inline void raid6_plug_device(raid6_conf_t *conf)
				1556	{
				1557	spin_lock_irq(&conf->device_lock);
				1558	blk_plug_device(conf->mddev->queue);
				1559	spin_unlock_irq(&conf->device_lock);
				1560	}
				1561
				1562	static int make_request (request_queue_t q, struct bio bi)
				1563	{
				1564	mddev_t *mddev = q->queuedata;
				1565	raid6_conf_t *conf = mddev_to_conf(mddev);
				1566	const unsigned int raid_disks = conf->raid_disks;
				1567	const unsigned int data_disks = raid_disks - 2;
				1568	unsigned int dd_idx, pd_idx;
				1569	sector_t new_sector;
				1570	sector_t logical_sector, last_sector;
				1571	struct stripe_head *sh;
				1572
NeilBrown	3d310eb	2005-06-21 17:17:26 -0700	[diff] [blame]	1573	md_write_start(mddev, bi);
NeilBrown	06d91a5	2005-06-21 17:17:12 -0700	[diff] [blame]	1574
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1575	if (bio_data_dir(bi)==WRITE) {
				1576	disk_stat_inc(mddev->gendisk, writes);
				1577	disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi));
				1578	} else {
				1579	disk_stat_inc(mddev->gendisk, reads);
				1580	disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bi));
				1581	}
				1582
				1583	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
				1584	last_sector = bi->bi_sector + (bi->bi_size>>9);
				1585
				1586	bi->bi_next = NULL;
				1587	bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
NeilBrown	06d91a5	2005-06-21 17:17:12 -0700	[diff] [blame]	1588
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1589	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
				1590	DEFINE_WAIT(w);
				1591
				1592	new_sector = raid6_compute_sector(logical_sector,
				1593	raid_disks, data_disks, &dd_idx, &pd_idx, conf);
				1594
				1595	PRINTK("raid6: make_request, sector %llu logical %llu\n",
				1596	(unsigned long long)new_sector,
				1597	(unsigned long long)logical_sector);
				1598
				1599	retry:
				1600	prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
				1601	sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK));
				1602	if (sh) {
				1603	if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
				1604	/* Add failed due to overlap. Flush everything
				1605	* and wait a while
				1606	*/
				1607	raid6_unplug_device(mddev->queue);
				1608	release_stripe(sh);
				1609	schedule();
				1610	goto retry;
				1611	}
				1612	finish_wait(&conf->wait_for_overlap, &w);
				1613	raid6_plug_device(conf);
				1614	handle_stripe(sh);
				1615	release_stripe(sh);
				1616	} else {
				1617	/* cannot get stripe for read-ahead, just give-up */
				1618	clear_bit(BIO_UPTODATE, &bi->bi_flags);
				1619	finish_wait(&conf->wait_for_overlap, &w);
				1620	break;
				1621	}
				1622
				1623	}
				1624	spin_lock_irq(&conf->device_lock);
				1625	if (--bi->bi_phys_segments == 0) {
				1626	int bytes = bi->bi_size;
				1627
				1628	if ( bio_data_dir(bi) == WRITE )
				1629	md_write_end(mddev);
				1630	bi->bi_size = 0;
				1631	bi->bi_end_io(bi, bytes, 0);
				1632	}
				1633	spin_unlock_irq(&conf->device_lock);
				1634	return 0;
				1635	}
				1636
				1637	/* FIXME go_faster isn't used */
NeilBrown	57afd89	2005-06-21 17:17:13 -0700	[diff] [blame]	1638	static sector_t sync_request(mddev_t mddev, sector_t sector_nr, int skipped, int go_faster)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1639	{
				1640	raid6_conf_t conf = (raid6_conf_t ) mddev->private;
				1641	struct stripe_head *sh;
				1642	int sectors_per_chunk = conf->chunk_size >> 9;
				1643	sector_t x;
				1644	unsigned long stripe;
				1645	int chunk_offset;
				1646	int dd_idx, pd_idx;
				1647	sector_t first_sector;
				1648	int raid_disks = conf->raid_disks;
				1649	int data_disks = raid_disks - 2;
				1650
				1651	if (sector_nr >= mddev->size <<1) {
				1652	/* just being told to finish up .. nothing much to do */
				1653	unplug_slaves(mddev);
				1654	return 0;
				1655	}
				1656	/* if there are 2 or more failed drives and we are trying
				1657	* to resync, then assert that we are finished, because there is
				1658	* nothing we can do.
				1659	*/
				1660	if (mddev->degraded >= 2 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
NeilBrown	57afd89	2005-06-21 17:17:13 -0700	[diff] [blame]	1661	sector_t rv = (mddev->size << 1) - sector_nr;
				1662	*skipped = 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1663	return rv;
				1664	}
				1665
				1666	x = sector_nr;
				1667	chunk_offset = sector_div(x, sectors_per_chunk);
				1668	stripe = x;
				1669	BUG_ON(x != stripe);
				1670
				1671	first_sector = raid6_compute_sector((sector_t)stripedata_diskssectors_per_chunk
				1672	+ chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
				1673	sh = get_active_stripe(conf, sector_nr, pd_idx, 1);
				1674	if (sh == NULL) {
				1675	sh = get_active_stripe(conf, sector_nr, pd_idx, 0);
				1676	/* make sure we don't swamp the stripe cache if someone else
				1677	* is trying to get access
				1678	*/
				1679	set_current_state(TASK_UNINTERRUPTIBLE);
				1680	schedule_timeout(1);
				1681	}
				1682	spin_lock(&sh->lock);
				1683	set_bit(STRIPE_SYNCING, &sh->state);
				1684	clear_bit(STRIPE_INSYNC, &sh->state);
				1685	spin_unlock(&sh->lock);
				1686
				1687	handle_stripe(sh);
				1688	release_stripe(sh);
				1689
				1690	return STRIPE_SECTORS;
				1691	}
				1692
				1693	/*
				1694	* This is our raid6 kernel thread.
				1695	*
				1696	* We scan the hash table for stripes which can be handled now.
				1697	* During the scan, completed stripes are saved for us by the interrupt
				1698	* handler, so that they will not have to wait for our next wakeup.
				1699	*/
				1700	static void raid6d (mddev_t *mddev)
				1701	{
				1702	struct stripe_head *sh;
				1703	raid6_conf_t *conf = mddev_to_conf(mddev);
				1704	int handled;
				1705
				1706	PRINTK("+++ raid6d active\n");
				1707
				1708	md_check_recovery(mddev);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1709
				1710	handled = 0;
				1711	spin_lock_irq(&conf->device_lock);
				1712	while (1) {
				1713	struct list_head *first;
				1714
				1715	if (list_empty(&conf->handle_list) &&
				1716	atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
				1717	!blk_queue_plugged(mddev->queue) &&
				1718	!list_empty(&conf->delayed_list))
				1719	raid6_activate_delayed(conf);
				1720
				1721	if (list_empty(&conf->handle_list))
				1722	break;
				1723
				1724	first = conf->handle_list.next;
				1725	sh = list_entry(first, struct stripe_head, lru);
				1726
				1727	list_del_init(first);
				1728	atomic_inc(&sh->count);
				1729	if (atomic_read(&sh->count)!= 1)
				1730	BUG();
				1731	spin_unlock_irq(&conf->device_lock);
				1732
				1733	handled++;
				1734	handle_stripe(sh);
				1735	release_stripe(sh);
				1736
				1737	spin_lock_irq(&conf->device_lock);
				1738	}
				1739	PRINTK("%d stripes handled\n", handled);
				1740
				1741	spin_unlock_irq(&conf->device_lock);
				1742
				1743	unplug_slaves(mddev);
				1744
				1745	PRINTK("--- raid6d inactive\n");
				1746	}
				1747
				1748	static int run (mddev_t *mddev)
				1749	{
				1750	raid6_conf_t *conf;
				1751	int raid_disk, memory;
				1752	mdk_rdev_t *rdev;
				1753	struct disk_info *disk;
				1754	struct list_head *tmp;
				1755
				1756	if (mddev->level != 6) {
				1757	PRINTK("raid6: %s: raid level not set to 6 (%d)\n", mdname(mddev), mddev->level);
				1758	return -EIO;
				1759	}
				1760
				1761	mddev->private = kmalloc (sizeof (raid6_conf_t)
				1762	+ mddev->raid_disks * sizeof(struct disk_info),
				1763	GFP_KERNEL);
				1764	if ((conf = mddev->private) == NULL)
				1765	goto abort;
				1766	memset (conf, 0, sizeof (conf) + mddev->raid_disks sizeof(struct disk_info) );
				1767	conf->mddev = mddev;
				1768
				1769	if ((conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
				1770	goto abort;
				1771	memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
				1772
				1773	spin_lock_init(&conf->device_lock);
				1774	init_waitqueue_head(&conf->wait_for_stripe);
				1775	init_waitqueue_head(&conf->wait_for_overlap);
				1776	INIT_LIST_HEAD(&conf->handle_list);
				1777	INIT_LIST_HEAD(&conf->delayed_list);
				1778	INIT_LIST_HEAD(&conf->inactive_list);
				1779	atomic_set(&conf->active_stripes, 0);
				1780	atomic_set(&conf->preread_active_stripes, 0);
				1781
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1782	PRINTK("raid6: run(%s) called.\n", mdname(mddev));
				1783
				1784	ITERATE_RDEV(mddev,rdev,tmp) {
				1785	raid_disk = rdev->raid_disk;
				1786	if (raid_disk >= mddev->raid_disks
				1787	\|\| raid_disk < 0)
				1788	continue;
				1789	disk = conf->disks + raid_disk;
				1790
				1791	disk->rdev = rdev;
				1792
				1793	if (rdev->in_sync) {
				1794	char b[BDEVNAME_SIZE];
				1795	printk(KERN_INFO "raid6: device %s operational as raid"
				1796	" disk %d\n", bdevname(rdev->bdev,b),
				1797	raid_disk);
				1798	conf->working_disks++;
				1799	}
				1800	}
				1801
				1802	conf->raid_disks = mddev->raid_disks;
				1803
				1804	/*
				1805	* 0 for a fully functional array, 1 or 2 for a degraded array.
				1806	*/
				1807	mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks;
				1808	conf->mddev = mddev;
				1809	conf->chunk_size = mddev->chunk_size;
				1810	conf->level = mddev->level;
				1811	conf->algorithm = mddev->layout;
				1812	conf->max_nr_stripes = NR_STRIPES;
				1813
				1814	/* device size must be a multiple of chunk size */
				1815	mddev->size &= ~(mddev->chunk_size/1024 -1);
				1816
				1817	if (conf->raid_disks < 4) {
				1818	printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n",
				1819	mdname(mddev), conf->raid_disks);
				1820	goto abort;
				1821	}
				1822	if (!conf->chunk_size \|\| conf->chunk_size % 4) {
				1823	printk(KERN_ERR "raid6: invalid chunk size %d for %s\n",
				1824	conf->chunk_size, mdname(mddev));
				1825	goto abort;
				1826	}
				1827	if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
				1828	printk(KERN_ERR
				1829	"raid6: unsupported parity algorithm %d for %s\n",
				1830	conf->algorithm, mdname(mddev));
				1831	goto abort;
				1832	}
				1833	if (mddev->degraded > 2) {
				1834	printk(KERN_ERR "raid6: not enough operational devices for %s"
				1835	" (%d/%d failed)\n",
				1836	mdname(mddev), conf->failed_disks, conf->raid_disks);
				1837	goto abort;
				1838	}
				1839
				1840	#if 0 /* FIX: For now */
				1841	if (mddev->degraded > 0 &&
				1842	mddev->recovery_cp != MaxSector) {
				1843	printk(KERN_ERR "raid6: cannot start dirty degraded array for %s\n", mdname(mddev));
				1844	goto abort;
				1845	}
				1846	#endif
				1847
				1848	{
				1849	mddev->thread = md_register_thread(raid6d, mddev, "%s_raid6");
				1850	if (!mddev->thread) {
				1851	printk(KERN_ERR
				1852	"raid6: couldn't allocate thread for %s\n",
				1853	mdname(mddev));
				1854	goto abort;
				1855	}
				1856	}
				1857
				1858	memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
				1859	conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
				1860	if (grow_stripes(conf, conf->max_nr_stripes)) {
				1861	printk(KERN_ERR
				1862	"raid6: couldn't allocate %dkB for buffers\n", memory);
				1863	shrink_stripes(conf);
				1864	md_unregister_thread(mddev->thread);
				1865	goto abort;
				1866	} else
				1867	printk(KERN_INFO "raid6: allocated %dkB for %s\n",
				1868	memory, mdname(mddev));
				1869
				1870	if (mddev->degraded == 0)
				1871	printk(KERN_INFO "raid6: raid level %d set %s active with %d out of %d"
				1872	" devices, algorithm %d\n", conf->level, mdname(mddev),
				1873	mddev->raid_disks-mddev->degraded, mddev->raid_disks,
				1874	conf->algorithm);
				1875	else
				1876	printk(KERN_ALERT "raid6: raid level %d set %s active with %d"
				1877	" out of %d devices, algorithm %d\n", conf->level,
				1878	mdname(mddev), mddev->raid_disks - mddev->degraded,
				1879	mddev->raid_disks, conf->algorithm);
				1880
				1881	print_raid6_conf(conf);
				1882
				1883	/* read-ahead size must cover two whole stripes, which is
				1884	* 2 * (n-2) * chunksize where 'n' is the number of raid devices
				1885	*/
				1886	{
				1887	int stripe = (mddev->raid_disks-2) * mddev->chunk_size
				1888	/ PAGE_CACHE_SIZE;
				1889	if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
				1890	mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
				1891	}
				1892
				1893	/* Ok, everything is just fine now */
				1894	mddev->array_size = mddev->size * (mddev->raid_disks - 2);
NeilBrown	7a5febe	2005-05-16 21:53:16 -0700	[diff] [blame]	1895
				1896	mddev->queue->unplug_fn = raid6_unplug_device;
				1897	mddev->queue->issue_flush_fn = raid6_issue_flush;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1898	return 0;
				1899	abort:
				1900	if (conf) {
				1901	print_raid6_conf(conf);
				1902	if (conf->stripe_hashtbl)
				1903	free_pages((unsigned long) conf->stripe_hashtbl,
				1904	HASH_PAGES_ORDER);
				1905	kfree(conf);
				1906	}
				1907	mddev->private = NULL;
				1908	printk(KERN_ALERT "raid6: failed to run raid set %s\n", mdname(mddev));
				1909	return -EIO;
				1910	}
				1911
				1912
				1913
				1914	static int stop (mddev_t *mddev)
				1915	{
				1916	raid6_conf_t conf = (raid6_conf_t ) mddev->private;
				1917
				1918	md_unregister_thread(mddev->thread);
				1919	mddev->thread = NULL;
				1920	shrink_stripes(conf);
				1921	free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
				1922	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
				1923	kfree(conf);
				1924	mddev->private = NULL;
				1925	return 0;
				1926	}
				1927
				1928	#if RAID6_DUMPSTATE
				1929	static void print_sh (struct seq_file seq, struct stripe_head sh)
				1930	{
				1931	int i;
				1932
				1933	seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n",
				1934	(unsigned long long)sh->sector, sh->pd_idx, sh->state);
				1935	seq_printf(seq, "sh %llu, count %d.\n",
				1936	(unsigned long long)sh->sector, atomic_read(&sh->count));
				1937	seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector);
				1938	for (i = 0; i < sh->raid_conf->raid_disks; i++) {
				1939	seq_printf(seq, "(cache%d: %p %ld) ",
				1940	i, sh->dev[i].page, sh->dev[i].flags);
				1941	}
				1942	seq_printf(seq, "\n");
				1943	}
				1944
				1945	static void printall (struct seq_file seq, raid6_conf_t conf)
				1946	{
				1947	struct stripe_head *sh;
				1948	int i;
				1949
				1950	spin_lock_irq(&conf->device_lock);
				1951	for (i = 0; i < NR_HASH; i++) {
				1952	sh = conf->stripe_hashtbl[i];
				1953	for (; sh; sh = sh->hash_next) {
				1954	if (sh->raid_conf != conf)
				1955	continue;
				1956	print_sh(seq, sh);
				1957	}
				1958	}
				1959	spin_unlock_irq(&conf->device_lock);
				1960	}
				1961	#endif
				1962
				1963	static void status (struct seq_file seq, mddev_t mddev)
				1964	{
				1965	raid6_conf_t conf = (raid6_conf_t ) mddev->private;
				1966	int i;
				1967
				1968	seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout);
				1969	seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks);
				1970	for (i = 0; i < conf->raid_disks; i++)
				1971	seq_printf (seq, "%s",
				1972	conf->disks[i].rdev &&
				1973	conf->disks[i].rdev->in_sync ? "U" : "_");
				1974	seq_printf (seq, "]");
				1975	#if RAID6_DUMPSTATE
				1976	seq_printf (seq, "\n");
				1977	printall(seq, conf);
				1978	#endif
				1979	}
				1980
				1981	static void print_raid6_conf (raid6_conf_t *conf)
				1982	{
				1983	int i;
				1984	struct disk_info *tmp;
				1985
				1986	printk("RAID6 conf printout:\n");
				1987	if (!conf) {
				1988	printk("(conf==NULL)\n");
				1989	return;
				1990	}
				1991	printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
				1992	conf->working_disks, conf->failed_disks);
				1993
				1994	for (i = 0; i < conf->raid_disks; i++) {
				1995	char b[BDEVNAME_SIZE];
				1996	tmp = conf->disks + i;
				1997	if (tmp->rdev)
				1998	printk(" disk %d, o:%d, dev:%s\n",
				1999	i, !tmp->rdev->faulty,
				2000	bdevname(tmp->rdev->bdev,b));
				2001	}
				2002	}
				2003
				2004	static int raid6_spare_active(mddev_t *mddev)
				2005	{
				2006	int i;
				2007	raid6_conf_t *conf = mddev->private;
				2008	struct disk_info *tmp;
				2009
				2010	for (i = 0; i < conf->raid_disks; i++) {
				2011	tmp = conf->disks + i;
				2012	if (tmp->rdev
				2013	&& !tmp->rdev->faulty
				2014	&& !tmp->rdev->in_sync) {
				2015	mddev->degraded--;
				2016	conf->failed_disks--;
				2017	conf->working_disks++;
				2018	tmp->rdev->in_sync = 1;
				2019	}
				2020	}
				2021	print_raid6_conf(conf);
				2022	return 0;
				2023	}
				2024
				2025	static int raid6_remove_disk(mddev_t *mddev, int number)
				2026	{
				2027	raid6_conf_t *conf = mddev->private;
				2028	int err = 0;
				2029	mdk_rdev_t *rdev;
				2030	struct disk_info *p = conf->disks + number;
				2031
				2032	print_raid6_conf(conf);
				2033	rdev = p->rdev;
				2034	if (rdev) {
				2035	if (rdev->in_sync \|\|
				2036	atomic_read(&rdev->nr_pending)) {
				2037	err = -EBUSY;
				2038	goto abort;
				2039	}
				2040	p->rdev = NULL;
Paul E. McKenney	fbd568a3e	2005-05-01 08:59:04 -0700	[diff] [blame]	2041	synchronize_rcu();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2042	if (atomic_read(&rdev->nr_pending)) {
				2043	/* lost the race, try later */
				2044	err = -EBUSY;
				2045	p->rdev = rdev;
				2046	}
				2047	}
				2048
				2049	abort:
				2050
				2051	print_raid6_conf(conf);
				2052	return err;
				2053	}
				2054
				2055	static int raid6_add_disk(mddev_t mddev, mdk_rdev_t rdev)
				2056	{
				2057	raid6_conf_t *conf = mddev->private;
				2058	int found = 0;
				2059	int disk;
				2060	struct disk_info *p;
				2061
				2062	if (mddev->degraded > 2)
				2063	/* no point adding a device */
				2064	return 0;
				2065	/*
				2066	* find the disk ...
				2067	*/
				2068	for (disk=0; disk < mddev->raid_disks; disk++)
				2069	if ((p=conf->disks + disk)->rdev == NULL) {
				2070	rdev->in_sync = 0;
				2071	rdev->raid_disk = disk;
				2072	found = 1;
				2073	p->rdev = rdev;
				2074	break;
				2075	}
				2076	print_raid6_conf(conf);
				2077	return found;
				2078	}
				2079
				2080	static int raid6_resize(mddev_t *mddev, sector_t sectors)
				2081	{
				2082	/* no resync is happening, and there is enough space
				2083	* on all devices, so we can resize.
				2084	* We need to make sure resync covers any new space.
				2085	* If the array is shrinking we should possibly wait until
				2086	* any io in the removed space completes, but it hardly seems
				2087	* worth it.
				2088	*/
				2089	sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
				2090	mddev->array_size = (sectors * (mddev->raid_disks-2))>>1;
				2091	set_capacity(mddev->gendisk, mddev->array_size << 1);
				2092	mddev->changed = 1;
				2093	if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) {
				2094	mddev->recovery_cp = mddev->size << 1;
				2095	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
				2096	}
				2097	mddev->size = sectors /2;
				2098	return 0;
				2099	}
				2100
				2101	static mdk_personality_t raid6_personality=
				2102	{
				2103	.name = "raid6",
				2104	.owner = THIS_MODULE,
				2105	.make_request = make_request,
				2106	.run = run,
				2107	.stop = stop,
				2108	.status = status,
				2109	.error_handler = error,
				2110	.hot_add_disk = raid6_add_disk,
				2111	.hot_remove_disk= raid6_remove_disk,
				2112	.spare_active = raid6_spare_active,
				2113	.sync_request = sync_request,
				2114	.resize = raid6_resize,
				2115	};
				2116
				2117	static int __init raid6_init (void)
				2118	{
				2119	int e;
				2120
				2121	e = raid6_select_algo();
				2122	if ( e )
				2123	return e;
				2124
				2125	return register_md_personality (RAID6, &raid6_personality);
				2126	}
				2127
				2128	static void raid6_exit (void)
				2129	{
				2130	unregister_md_personality (RAID6);
				2131	}
				2132
				2133	module_init(raid6_init);
				2134	module_exit(raid6_exit);
				2135	MODULE_LICENSE("GPL");
				2136	MODULE_ALIAS("md-personality-8"); /* RAID6 */