[PATCH] md: write intent bitmap support for raid10
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/drivers/md/md.c b/drivers/md/md.c
index ee199d4..64e7da3 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -714,9 +714,10 @@
if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
mddev->bitmap_file == NULL) {
- if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6) {
+ if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
+ && mddev->level != 10) {
/* FIXME use a better test */
- printk(KERN_WARNING "md: bitmaps only support for raid1\n");
+ printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
return -EINVAL;
}
mddev->bitmap_offset = mddev->default_bitmap_offset;
@@ -1037,8 +1038,9 @@
if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
mddev->bitmap_file == NULL ) {
- if (mddev->level != 1) {
- printk(KERN_WARNING "md: bitmaps only supported for raid1\n");
+ if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
+ && mddev->level != 10) {
+ printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
return -EINVAL;
}
mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 50bd7b1..8f58a44 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -18,7 +18,9 @@
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
+#include "dm-bio-list.h"
#include <linux/raid/raid10.h>
+#include <linux/raid/bitmap.h>
/*
* RAID10 provides a combination of RAID0 and RAID1 functionality.
@@ -306,9 +308,11 @@
/*
* this branch is our 'one mirror IO has finished' event handler:
*/
- if (!uptodate)
+ if (!uptodate) {
md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
- else
+ /* an I/O failed, we can't clear the bitmap */
+ set_bit(R10BIO_Degraded, &r10_bio->state);
+ } else
/*
* Set R10BIO_Uptodate in our master bio, so that
* we will return a good error code for to the higher
@@ -328,6 +332,11 @@
* already.
*/
if (atomic_dec_and_test(&r10_bio->remaining)) {
+ /* clear the bitmap if all writes complete successfully */
+ bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
+ r10_bio->sectors,
+ !test_bit(R10BIO_Degraded, &r10_bio->state),
+ 0);
md_write_end(r10_bio->mddev);
raid_end_bio_io(r10_bio);
}
@@ -486,8 +495,9 @@
rcu_read_lock();
/*
* Check if we can balance. We can balance on the whole
- * device if no resync is going on, or below the resync window.
- * We take the first readable disk when above the resync window.
+ * device if no resync is going on (recovery is ok), or below
+ * the resync window. We take the first readable disk when
+ * above the resync window.
*/
if (conf->mddev->recovery_cp < MaxSector
&& (this_sector + sectors >= conf->next_resync)) {
@@ -591,7 +601,10 @@
static void raid10_unplug(request_queue_t *q)
{
+ mddev_t *mddev = q->queuedata;
+
unplug_slaves(q->queuedata);
+ md_wakeup_thread(mddev->thread);
}
static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
@@ -647,12 +660,13 @@
*/
#define RESYNC_DEPTH 32
-static void raise_barrier(conf_t *conf)
+static void raise_barrier(conf_t *conf, int force)
{
+ BUG_ON(force && !conf->barrier);
spin_lock_irq(&conf->resync_lock);
- /* Wait until no block IO is waiting */
- wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
+ /* Wait until no block IO is waiting (unless 'force') */
+ wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
conf->resync_lock,
raid10_unplug(conf->mddev->queue));
@@ -710,6 +724,8 @@
int i;
int chunk_sects = conf->chunk_mask + 1;
const int rw = bio_data_dir(bio);
+ struct bio_list bl;
+ unsigned long flags;
if (unlikely(bio_barrier(bio))) {
bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
@@ -767,6 +783,7 @@
r10_bio->mddev = mddev;
r10_bio->sector = bio->bi_sector;
+ r10_bio->state = 0;
if (rw == READ) {
/*
@@ -811,13 +828,16 @@
!test_bit(Faulty, &rdev->flags)) {
atomic_inc(&rdev->nr_pending);
r10_bio->devs[i].bio = bio;
- } else
+ } else {
r10_bio->devs[i].bio = NULL;
+ set_bit(R10BIO_Degraded, &r10_bio->state);
+ }
}
rcu_read_unlock();
- atomic_set(&r10_bio->remaining, 1);
+ atomic_set(&r10_bio->remaining, 0);
+ bio_list_init(&bl);
for (i = 0; i < conf->copies; i++) {
struct bio *mbio;
int d = r10_bio->devs[i].devnum;
@@ -835,13 +855,14 @@
mbio->bi_private = r10_bio;
atomic_inc(&r10_bio->remaining);
- generic_make_request(mbio);
+ bio_list_add(&bl, mbio);
}
- if (atomic_dec_and_test(&r10_bio->remaining)) {
- md_write_end(mddev);
- raid_end_bio_io(r10_bio);
- }
+ bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
+ spin_lock_irqsave(&conf->device_lock, flags);
+ bio_list_merge(&conf->pending_bio_list, &bl);
+ blk_plug_device(mddev->queue);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
return 0;
}
@@ -999,7 +1020,12 @@
if (!enough(conf))
return 0;
- for (mirror=0; mirror < mddev->raid_disks; mirror++)
+ if (rdev->saved_raid_disk >= 0 &&
+ conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
+ mirror = rdev->saved_raid_disk;
+ else
+ mirror = 0;
+ for ( ; mirror < mddev->raid_disks; mirror++)
if ( !(p=conf->mirrors+mirror)->rdev) {
blk_queue_stack_limits(mddev->queue,
@@ -1015,6 +1041,8 @@
p->head_position = 0;
rdev->raid_disk = mirror;
found = 1;
+ if (rdev->saved_raid_disk != mirror)
+ conf->fullsync = 1;
rcu_assign_pointer(p->rdev, rdev);
break;
}
@@ -1282,6 +1310,26 @@
for (;;) {
char b[BDEVNAME_SIZE];
spin_lock_irqsave(&conf->device_lock, flags);
+
+ if (conf->pending_bio_list.head) {
+ bio = bio_list_get(&conf->pending_bio_list);
+ blk_remove_plug(mddev->queue);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ /* flush any pending bitmap writes to disk before proceeding w/ I/O */
+ if (bitmap_unplug(mddev->bitmap) != 0)
+ printk("%s: bitmap file write failed!\n", mdname(mddev));
+
+ while (bio) { /* submit pending writes */
+ struct bio *next = bio->bi_next;
+ bio->bi_next = NULL;
+ generic_make_request(bio);
+ bio = next;
+ }
+ unplug = 1;
+
+ continue;
+ }
+
if (list_empty(head))
break;
r10_bio = list_entry(head->prev, r10bio_t, retry_list);
@@ -1388,6 +1436,8 @@
sector_t max_sector, nr_sectors;
int disk;
int i;
+ int max_sync;
+ int sync_blocks;
sector_t sectors_skipped = 0;
int chunks_skipped = 0;
@@ -1401,6 +1451,29 @@
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
max_sector = mddev->resync_max_sectors;
if (sector_nr >= max_sector) {
+ /* If we aborted, we need to abort the
+ * sync on the 'current' bitmap chucks (there can
+ * be several when recovering multiple devices).
+ * as we may have started syncing it but not finished.
+ * We can find the current address in
+ * mddev->curr_resync, but for recovery,
+ * we need to convert that to several
+ * virtual addresses.
+ */
+ if (mddev->curr_resync < max_sector) { /* aborted */
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+ bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
+ &sync_blocks, 1);
+ else for (i=0; i<conf->raid_disks; i++) {
+ sector_t sect =
+ raid10_find_virt(conf, mddev->curr_resync, i);
+ bitmap_end_sync(mddev->bitmap, sect,
+ &sync_blocks, 1);
+ }
+ } else /* completed sync */
+ conf->fullsync = 0;
+
+ bitmap_close_sync(mddev->bitmap);
close_sync(conf);
*skipped = 1;
return sectors_skipped;
@@ -1425,8 +1498,6 @@
*/
if (!go_faster && conf->nr_waiting)
msleep_interruptible(1000);
- raise_barrier(conf);
- conf->next_resync = sector_nr;
/* Again, very different code for resync and recovery.
* Both must result in an r10bio with a list of bios that
@@ -1443,6 +1514,7 @@
* end_sync_write if we will want to write.
*/
+ max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
/* recovery... the complicated one */
int i, j, k;
@@ -1451,13 +1523,29 @@
for (i=0 ; i<conf->raid_disks; i++)
if (conf->mirrors[i].rdev &&
!test_bit(In_sync, &conf->mirrors[i].rdev->flags)) {
+ int still_degraded = 0;
/* want to reconstruct this device */
r10bio_t *rb2 = r10_bio;
+ sector_t sect = raid10_find_virt(conf, sector_nr, i);
+ int must_sync;
+ /* Unless we are doing a full sync, we only need
+ * to recover the block if it is set in the bitmap
+ */
+ must_sync = bitmap_start_sync(mddev->bitmap, sect,
+ &sync_blocks, 1);
+ if (sync_blocks < max_sync)
+ max_sync = sync_blocks;
+ if (!must_sync &&
+ !conf->fullsync) {
+ /* yep, skip the sync_blocks here, but don't assume
+ * that there will never be anything to do here
+ */
+ chunks_skipped = -1;
+ continue;
+ }
r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
- spin_lock_irq(&conf->resync_lock);
- if (rb2) conf->barrier++;
- spin_unlock_irq(&conf->resync_lock);
+ raise_barrier(conf, rb2 != NULL);
atomic_set(&r10_bio->remaining, 0);
r10_bio->master_bio = (struct bio*)rb2;
@@ -1465,8 +1553,21 @@
atomic_inc(&rb2->remaining);
r10_bio->mddev = mddev;
set_bit(R10BIO_IsRecover, &r10_bio->state);
- r10_bio->sector = raid10_find_virt(conf, sector_nr, i);
+ r10_bio->sector = sect;
+
raid10_find_phys(conf, r10_bio);
+ /* Need to check if this section will still be
+ * degraded
+ */
+ for (j=0; j<conf->copies;j++) {
+ int d = r10_bio->devs[j].devnum;
+ if (conf->mirrors[d].rdev == NULL ||
+ test_bit(Faulty, &conf->mirrors[d].rdev->flags))
+ still_degraded = 1;
+ }
+ must_sync = bitmap_start_sync(mddev->bitmap, sect,
+ &sync_blocks, still_degraded);
+
for (j=0; j<conf->copies;j++) {
int d = r10_bio->devs[j].devnum;
if (conf->mirrors[d].rdev &&
@@ -1526,10 +1627,22 @@
} else {
/* resync. Schedule a read for every block at this virt offset */
int count = 0;
+
+ if (!bitmap_start_sync(mddev->bitmap, sector_nr,
+ &sync_blocks, mddev->degraded) &&
+ !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+ /* We can skip this block */
+ *skipped = 1;
+ return sync_blocks + sectors_skipped;
+ }
+ if (sync_blocks < max_sync)
+ max_sync = sync_blocks;
r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
r10_bio->mddev = mddev;
atomic_set(&r10_bio->remaining, 0);
+ raise_barrier(conf, 0);
+ conf->next_resync = sector_nr;
r10_bio->master_bio = NULL;
r10_bio->sector = sector_nr;
@@ -1582,6 +1695,8 @@
}
nr_sectors = 0;
+ if (sector_nr + max_sync < max_sector)
+ max_sector = sector_nr + max_sync;
do {
struct page *page;
int len = PAGE_SIZE;
@@ -1821,6 +1936,26 @@
return 0;
}
+static void raid10_quiesce(mddev_t *mddev, int state)
+{
+ conf_t *conf = mddev_to_conf(mddev);
+
+ switch(state) {
+ case 1:
+ raise_barrier(conf, 0);
+ break;
+ case 0:
+ lower_barrier(conf);
+ break;
+ }
+ if (mddev->thread) {
+ if (mddev->bitmap)
+ mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
+ else
+ mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
+ md_wakeup_thread(mddev->thread);
+ }
+}
static mdk_personality_t raid10_personality =
{
@@ -1835,6 +1970,7 @@
.hot_remove_disk= raid10_remove_disk,
.spare_active = raid10_spare_active,
.sync_request = sync_request,
+ .quiesce = raid10_quiesce,
};
static int __init raid_init(void)
diff --git a/include/linux/raid/raid10.h b/include/linux/raid/raid10.h
index 08317b7..b660cbf 100644
--- a/include/linux/raid/raid10.h
+++ b/include/linux/raid/raid10.h
@@ -35,13 +35,19 @@
sector_t chunk_mask;
struct list_head retry_list;
- /* for use when syncing mirrors: */
+ /* queue pending writes and submit them on unplug */
+ struct bio_list pending_bio_list;
+
spinlock_t resync_lock;
int nr_pending;
int nr_waiting;
int barrier;
sector_t next_resync;
+ int fullsync; /* set to 1 if a full sync is needed,
+ * (fresh device added).
+ * Cleared when a sync completes.
+ */
wait_queue_head_t wait_barrier;
@@ -100,4 +106,5 @@
#define R10BIO_Uptodate 0
#define R10BIO_IsSync 1
#define R10BIO_IsRecover 2
+#define R10BIO_Degraded 3
#endif