md/raid10: locking changes for 'enough()'. As 'enough' accesses conf->prev and conf->geo, which can change spontanously, it should guard against changes. This can be done with device_lock as start_reshape holds device_lock while updating 'geo' and end_reshape holds it while updating 'prev'. So 'error' needs to hold 'device_lock'. On the other hand, raid10_end_read_request knows which of the two it really wants to access, and as it is an active request on that one, the value cannot change underneath it. So change _enough to take flag rather than a pointer, pass the appropriate flag from raid10_end_read_request(), and remove the locking. All other calls to 'enough' are made with reconfig_mutex held, so neither 'prev' nor 'geo' can change. Signed-off-by: NeilBrown <neilb@suse.de>

commit: 635f6416a2e983adac8ccf90bffbeed0c1a76454 [log] [tgz]
author: NeilBrown <neilb@suse.de> Tue Jun 11 14:57:09 2013 +1000
committer: NeilBrown <neilb@suse.de> Fri Jun 14 08:10:27 2013 +1000
tree: 36389c7bd9b6dd43ec6055354e9e718d2147240d
parent: b29bebd66dbd492105668ec3515a5ffb0b25e4c1 [diff] [blame]
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3c6b193..5169ed2 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c

@@ -97,7 +97,7 @@
 
 static void allow_barrier(struct r10conf *conf);
 static void lower_barrier(struct r10conf *conf);
-static int enough(struct r10conf *conf, int ignore);
+static int _enough(struct r10conf *conf, int previous, int ignore);
 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 				int *skipped);
 static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
@@ -392,11 +392,9 @@
 		 * than fail the last device.  Here we redefine
 		 * "uptodate" to mean "Don't want to retry"
 		 */
-		unsigned long flags;
-		spin_lock_irqsave(&conf->device_lock, flags);
-		if (!enough(conf, rdev->raid_disk))
+		if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
+			     rdev->raid_disk))
 			uptodate = 1;
-		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
 	if (uptodate) {
 		raid_end_bio_io(r10_bio);
@@ -1632,9 +1630,17 @@
  * Don't consider the device numbered 'ignore'
  * as we might be about to remove it.
  */
-static int _enough(struct r10conf *conf, struct geom *geo, int ignore)
+static int _enough(struct r10conf *conf, int previous, int ignore)
 {
 	int first = 0;
+	int disks, ncopies;
+	if (previous) {
+		disks = conf->prev.raid_disks;
+		ncopies = conf->prev.near_copies;
+	} else {
+		disks = conf->geo.raid_disks;
+		ncopies = conf->geo.near_copies;
+	}
 
 	do {
 		int n = conf->copies;
@@ -1644,25 +1650,31 @@
 			if (conf->mirrors[this].rdev &&
 			    this != ignore)
 				cnt++;
-			this = (this+1) % geo->raid_disks;
+			this = (this+1) % disks;
 		}
 		if (cnt == 0)
 			return 0;
-		first = (first + geo->near_copies) % geo->raid_disks;
+		first = (first + ncopies) % disks;
 	} while (first != 0);
 	return 1;
 }
 
 static int enough(struct r10conf *conf, int ignore)
 {
-	return _enough(conf, &conf->geo, ignore) &&
-		_enough(conf, &conf->prev, ignore);
+	/* when calling 'enough', both 'prev' and 'geo' must
+	 * be stable.
+	 * This is ensured if ->reconfig_mutex or ->device_lock
+	 * is held.
+	 */
+	return _enough(conf, 0, ignore) &&
+		_enough(conf, 1, ignore);
 }
 
 static void error(struct mddev *mddev, struct md_rdev *rdev)
 {
 	char b[BDEVNAME_SIZE];
 	struct r10conf *conf = mddev->private;
+	unsigned long flags;
 
 	/*
 	 * If it is not operational, then we have already marked it as dead
@@ -1670,18 +1682,18 @@
 	 * next level up know.
 	 * else mark the drive as failed
 	 */
+	spin_lock_irqsave(&conf->device_lock, flags);
 	if (test_bit(In_sync, &rdev->flags)
-	    && !enough(conf, rdev->raid_disk))
+	    && !enough(conf, rdev->raid_disk)) {
 		/*
 		 * Don't fail the drive, just return an IO error.
 		 */
-		return;
-	if (test_and_clear_bit(In_sync, &rdev->flags)) {
-		unsigned long flags;
-		spin_lock_irqsave(&conf->device_lock, flags);
-		mddev->degraded++;
 		spin_unlock_irqrestore(&conf->device_lock, flags);
-		/*
+		return;
+	}
+	if (test_and_clear_bit(In_sync, &rdev->flags)) {
+		mddev->degraded++;
+			/*
 		 * if recovery is running, make sure it aborts.
 		 */
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
@@ -1689,6 +1701,7 @@
 	set_bit(Blocked, &rdev->flags);
 	set_bit(Faulty, &rdev->flags);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	spin_unlock_irqrestore(&conf->device_lock, flags);
 	printk(KERN_ALERT
 	       "md/raid10:%s: Disk failure on %s, disabling device.\n"
 	       "md/raid10:%s: Operation continuing on %d devices.\n",
@@ -1791,7 +1804,7 @@
 		 * very different from resync
 		 */
 		return -EBUSY;
-	if (rdev->saved_raid_disk < 0 && !_enough(conf, &conf->prev, -1))
+	if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
 		return -EINVAL;
 
 	if (rdev->raid_disk >= 0)
commit	635f6416a2e983adac8ccf90bffbeed0c1a76454	[log] [tgz]
author	NeilBrown <neilb@suse.de>	Tue Jun 11 14:57:09 2013 +1000
committer	NeilBrown <neilb@suse.de>	Fri Jun 14 08:10:27 2013 +1000
tree	36389c7bd9b6dd43ec6055354e9e718d2147240d
parent	b29bebd66dbd492105668ec3515a5ffb0b25e4c1 [diff] [blame]