md/raid10: Allow replacement device to be replace old drive.

When recovery finish and spare_active is called, check for a
replace that might have just become fully synced and mark it
as such, marking the original as failed.

Then when the original is removed, move the replacement into
its position.

This means that 'replacement' and spontaneously become NULL in some
situations.  Make sure we check for those.
It also means that 'rdev' and 'replacement' could appear to be
identical - check for that too.

Signed-off-by: NeilBrown <neilb@suse.de>
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 403f05a..90e95173 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -397,14 +397,17 @@
 	int dec_rdev = 1;
 	struct r10conf *conf = r10_bio->mddev->private;
 	int slot, repl;
-	struct md_rdev *rdev;
+	struct md_rdev *rdev = NULL;
 
 	dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
 
 	if (repl)
 		rdev = conf->mirrors[dev].replacement;
-	else
+	if (!rdev) {
+		smp_rmb();
+		repl = 0;
 		rdev = conf->mirrors[dev].rdev;
+	}
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
@@ -1089,6 +1092,8 @@
 		struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
 		struct md_rdev *rrdev = rcu_dereference(
 			conf->mirrors[d].replacement);
+		if (rdev == rrdev)
+			rrdev = NULL;
 		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
 			atomic_inc(&rdev->nr_pending);
 			blocked_rdev = rdev;
@@ -1170,9 +1175,15 @@
 				rdev_dec_pending(conf->mirrors[d].rdev, mddev);
 			}
 			if (r10_bio->devs[j].repl_bio) {
+				struct md_rdev *rdev;
 				d = r10_bio->devs[j].devnum;
-				rdev_dec_pending(
-					conf->mirrors[d].replacement, mddev);
+				rdev = conf->mirrors[d].replacement;
+				if (!rdev) {
+					/* Race with remove_disk */
+					smp_mb();
+					rdev = conf->mirrors[d].rdev;
+				}
+				rdev_dec_pending(rdev, mddev);
 			}
 		}
 		allow_barrier(conf);
@@ -1230,6 +1241,10 @@
 			    max_sectors);
 		r10_bio->devs[i].repl_bio = mbio;
 
+		/* We are actively writing to the original device
+		 * so it cannot disappear, so the replacement cannot
+		 * become NULL here
+		 */
 		mbio->bi_sector	= (r10_bio->devs[i].addr+
 				   conf->mirrors[d].replacement->data_offset);
 		mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
@@ -1404,9 +1419,27 @@
 	 */
 	for (i = 0; i < conf->raid_disks; i++) {
 		tmp = conf->mirrors + i;
-		if (tmp->rdev
-		    && !test_bit(Faulty, &tmp->rdev->flags)
-		    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
+		if (tmp->replacement
+		    && tmp->replacement->recovery_offset == MaxSector
+		    && !test_bit(Faulty, &tmp->replacement->flags)
+		    && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
+			/* Replacement has just become active */
+			if (!tmp->rdev
+			    || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
+				count++;
+			if (tmp->rdev) {
+				/* Replaced device not technically faulty,
+				 * but we need to be sure it gets removed
+				 * and never re-added.
+				 */
+				set_bit(Faulty, &tmp->rdev->flags);
+				sysfs_notify_dirent_safe(
+					tmp->rdev->sysfs_state);
+			}
+			sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
+		} else if (tmp->rdev
+			   && !test_bit(Faulty, &tmp->rdev->flags)
+			   && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
 			count++;
 			sysfs_notify_dirent(tmp->rdev->sysfs_state);
 		}
@@ -1506,6 +1539,7 @@
 	 */
 	if (!test_bit(Faulty, &rdev->flags) &&
 	    mddev->recovery_disabled != p->recovery_disabled &&
+	    (!p->replacement || p->replacement == rdev) &&
 	    enough(conf, -1)) {
 		err = -EBUSY;
 		goto abort;
@@ -1517,7 +1551,21 @@
 		err = -EBUSY;
 		*rdevp = rdev;
 		goto abort;
-	}
+	} else if (p->replacement) {
+		/* We must have just cleared 'rdev' */
+		p->rdev = p->replacement;
+		clear_bit(Replacement, &p->replacement->flags);
+		smp_mb(); /* Make sure other CPUs may see both as identical
+			   * but will never see neither -- if they are careful.
+			   */
+		p->replacement = NULL;
+		clear_bit(WantReplacement, &rdev->flags);
+	} else
+		/* We might have just remove the Replacement as faulty
+		 * Clear the flag just in case
+		 */
+		clear_bit(WantReplacement, &rdev->flags);
+
 	err = md_integrity_register(mddev);
 
 abort:
@@ -1595,13 +1643,15 @@
 	int bad_sectors;
 	int slot;
 	int repl;
-	struct md_rdev *rdev;
+	struct md_rdev *rdev = NULL;
 
 	d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
 	if (repl)
 		rdev = conf->mirrors[d].replacement;
-	else
+	if (!rdev) {
+		smp_mb();
 		rdev = conf->mirrors[d].rdev;
+	}
 
 	if (!uptodate) {
 		if (repl)
@@ -2368,7 +2418,7 @@
 			}
 			bio = r10_bio->devs[m].repl_bio;
 			rdev = conf->mirrors[dev].replacement;
-			if (bio == IO_MADE_GOOD) {
+			if (rdev && bio == IO_MADE_GOOD) {
 				rdev_clear_badblocks(
 					rdev,
 					r10_bio->devs[m].addr,