Merge git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm

* git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm: (34 commits)
  dm table: set flush capability based on underlying devices
  dm crypt: optionally support discard requests
  dm raid: add md raid1 support
  dm raid: support metadata devices
  dm raid: add write_mostly parameter
  dm raid: add region_size parameter
  dm raid: improve table parameters documentation
  dm ioctl: forbid multiple device specifiers
  dm ioctl: introduce __get_dev_cell
  dm ioctl: fill in device parameters in more ioctls
  dm flakey: add corrupt_bio_byte feature
  dm flakey: add drop_writes
  dm flakey: support feature args
  dm flakey: use dm_target_offset and support discards
  dm table: share target argument parsing functions
  dm snapshot: skip reading origin when overwriting complete chunk
  dm: ignore merge_bvec for snapshots when safe
  dm table: clean dm_get_device and move exports
  dm raid: tidy includes
  dm ioctl: prevent empty message
  ...
diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt
index 6b5c42d..2c656ae 100644
--- a/Documentation/device-mapper/dm-crypt.txt
+++ b/Documentation/device-mapper/dm-crypt.txt
@@ -4,7 +4,8 @@
 Device-Mapper's "crypt" target provides transparent encryption of block devices
 using the kernel crypto API.
 
-Parameters: <cipher> <key> <iv_offset> <device path> <offset>
+Parameters: <cipher> <key> <iv_offset> <device path> \
+	      <offset> [<#opt_params> <opt_params>]
 
 <cipher>
     Encryption cipher and an optional IV generation mode.
@@ -37,6 +38,24 @@
 <offset>
     Starting sector within the device where the encrypted data begins.
 
+<#opt_params>
+    Number of optional parameters. If there are no optional parameters,
+    the optional parameters section can be skipped or #opt_params can be zero.
+    Otherwise #opt_params is the number of following arguments.
+
+    Example of optional parameters section:
+        1 allow_discards
+
+allow_discards
+    Block discard requests (a.k.a. TRIM) are passed through the crypt device.
+    The default is to ignore discard requests.
+
+    WARNING: Assess the specific security risks carefully before enabling this
+    option.  For example, allowing discards on encrypted devices may lead to
+    the leak of information about the ciphertext device (filesystem type,
+    used space etc.) if the discarded blocks can be located easily on the
+    device later.
+
 Example scripts
 ===============
 LUKS (Linux Unified Key Setup) is now the preferred way to set up disk
diff --git a/Documentation/device-mapper/dm-flakey.txt b/Documentation/device-mapper/dm-flakey.txt
index c8efdfd..6ff5c23 100644
--- a/Documentation/device-mapper/dm-flakey.txt
+++ b/Documentation/device-mapper/dm-flakey.txt
@@ -1,17 +1,53 @@
 dm-flakey
 =========
 
-This target is the same as the linear target except that it returns I/O
-errors periodically.  It's been found useful in simulating failing
-devices for testing purposes.
+This target is the same as the linear target except that it exhibits
+unreliable behaviour periodically.  It's been found useful in simulating
+failing devices for testing purposes.
 
 Starting from the time the table is loaded, the device is available for
-<up interval> seconds, then returns errors for <down interval> seconds,
-and then this cycle repeats.
+<up interval> seconds, then exhibits unreliable behaviour for <down
+interval> seconds, and then this cycle repeats.
 
-Parameters: <dev path> <offset> <up interval> <down interval>
+Also, consider using this in combination with the dm-delay target,
+which can delay reads and writes and/or send them to different
+underlying devices.
+
+Table parameters
+----------------
+  <dev path> <offset> <up interval> <down interval> \
+    [<num_features> [<feature arguments>]]
+
+Mandatory parameters:
     <dev path>: Full pathname to the underlying block-device, or a
                 "major:minor" device-number.
     <offset>: Starting sector within the device.
     <up interval>: Number of seconds device is available.
     <down interval>: Number of seconds device returns errors.
+
+Optional feature parameters:
+  If no feature parameters are present, all I/O returns errors
+  during the periods of unreliability.
+
+  drop_writes:
+	All write I/O is silently ignored.
+	Read I/O is handled correctly.
+
+  corrupt_bio_byte <Nth_byte> <direction> <value> <flags>:
+	During <down interval>, replace <Nth_byte> of the data of
+	each matching bio with <value>.
+
+    <Nth_byte>: The offset of the byte to replace.
+		Counting starts at 1, to replace the first byte.
+    <direction>: Either 'r' to corrupt reads or 'w' to corrupt writes.
+		 'w' is incompatible with drop_writes.
+    <value>: The value (from 0-255) to write.
+    <flags>: Perform the replacement only if bio->bi_rw has all the
+	     selected flags set.
+
+Examples:
+  corrupt_bio_byte 32 r 1 0
+	- replaces the 32nd byte of READ bios with the value 1
+
+  corrupt_bio_byte 224 w 0 32
+	- replaces the 224th byte of REQ_META (=32) bios with the value 0
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index 33b6b70..2a8c113 100644
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -1,70 +1,108 @@
-Device-mapper RAID (dm-raid) is a bridge from DM to MD.  It
-provides a way to use device-mapper interfaces to access the MD RAID
-drivers.
+dm-raid
+-------
 
-As with all device-mapper targets, the nominal public interfaces are the
-constructor (CTR) tables and the status outputs (both STATUSTYPE_INFO
-and STATUSTYPE_TABLE).  The CTR table looks like the following:
+The device-mapper RAID (dm-raid) target provides a bridge from DM to MD.
+It allows the MD RAID drivers to be accessed using a device-mapper
+interface.
 
-1: <s> <l> raid \
-2:      <raid_type> <#raid_params> <raid_params> \
-3:      <#raid_devs> <meta_dev1> <dev1> .. <meta_devN> <devN>
+The target is named "raid" and it accepts the following parameters:
 
-Line 1 contains the standard first three arguments to any device-mapper
-target - the start, length, and target type fields.  The target type in
-this case is "raid".
+  <raid_type> <#raid_params> <raid_params> \
+    <#raid_devs> <metadata_dev0> <dev0> [.. <metadata_devN> <devN>]
 
-Line 2 contains the arguments that define the particular raid
-type/personality/level, the required arguments for that raid type, and
-any optional arguments.  Possible raid types include: raid4, raid5_la,
-raid5_ls, raid5_rs, raid6_zr, raid6_nr, and raid6_nc.  (raid1 is
-planned for the future.)  The list of required and optional parameters
-is the same for all the current raid types.  The required parameters are
-positional, while the optional parameters are given as key/value pairs.
-The possible parameters are as follows:
- <chunk_size>           Chunk size in sectors.
- [[no]sync]             Force/Prevent RAID initialization
- [rebuild <idx>]        Rebuild the drive indicated by the index
- [daemon_sleep <ms>]    Time between bitmap daemon work to clear bits
- [min_recovery_rate <kB/sec/disk>]      Throttle RAID initialization
- [max_recovery_rate <kB/sec/disk>]      Throttle RAID initialization
- [max_write_behind <sectors>]           See '-write-behind=' (man mdadm)
- [stripe_cache <sectors>]               Stripe cache size for higher RAIDs
+<raid_type>:
+  raid1		RAID1 mirroring
+  raid4		RAID4 dedicated parity disk
+  raid5_la	RAID5 left asymmetric
+		- rotating parity 0 with data continuation
+  raid5_ra	RAID5 right asymmetric
+		- rotating parity N with data continuation
+  raid5_ls	RAID5 left symmetric
+		- rotating parity 0 with data restart
+  raid5_rs 	RAID5 right symmetric
+		- rotating parity N with data restart
+  raid6_zr	RAID6 zero restart
+		- rotating parity zero (left-to-right) with data restart
+  raid6_nr	RAID6 N restart
+		- rotating parity N (right-to-left) with data restart
+  raid6_nc	RAID6 N continue
+		- rotating parity N (right-to-left) with data continuation
 
-Line 3 contains the list of devices that compose the array in
-metadata/data device pairs.  If the metadata is stored separately, a '-'
-is given for the metadata device position.  If a drive has failed or is
-missing at creation time, a '-' can be given for both the metadata and
-data drives for a given position.
+  Reference: Chapter 4 of
+  http://www.snia.org/sites/default/files/SNIA_DDF_Technical_Position_v2.0.pdf
 
-NB. Currently all metadata devices must be specified as '-'.
+<#raid_params>: The number of parameters that follow.
 
-Examples:
-# RAID4 - 4 data drives, 1 parity
+<raid_params> consists of
+    Mandatory parameters:
+        <chunk_size>: Chunk size in sectors.  This parameter is often known as
+		      "stripe size".  It is the only mandatory parameter and
+		      is placed first.
+
+    followed by optional parameters (in any order):
+	[sync|nosync]   Force or prevent RAID initialization.
+
+	[rebuild <idx>]	Rebuild drive number idx (first drive is 0).
+
+	[daemon_sleep <ms>]
+		Interval between runs of the bitmap daemon that
+		clear bits.  A longer interval means less bitmap I/O but
+		resyncing after a failure is likely to take longer.
+
+	[min_recovery_rate <kB/sec/disk>]  Throttle RAID initialization
+	[max_recovery_rate <kB/sec/disk>]  Throttle RAID initialization
+	[write_mostly <idx>]		   Drive index is write-mostly
+	[max_write_behind <sectors>]       See '-write-behind=' (man mdadm)
+	[stripe_cache <sectors>]           Stripe cache size (higher RAIDs only)
+	[region_size <sectors>]
+		The region_size multiplied by the number of regions is the
+		logical size of the array.  The bitmap records the device
+		synchronisation state for each region.
+
+<#raid_devs>: The number of devices composing the array.
+	Each device is specified by two entries.  The first is the device
+	containing the metadata (if any); the second is the one containing the
+	data.
+
+	If a drive has failed or is missing at creation time, a '-' can be
+	given for both the metadata and data drives for a given position.
+
+
+Example tables
+--------------
+# RAID4 - 4 data drives, 1 parity (no metadata devices)
 # No metadata devices specified to hold superblock/bitmap info
 # Chunk size of 1MiB
 # (Lines separated for easy reading)
+
 0 1960893648 raid \
         raid4 1 2048 \
         5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81
 
-# RAID4 - 4 data drives, 1 parity (no metadata devices)
+# RAID4 - 4 data drives, 1 parity (with metadata devices)
 # Chunk size of 1MiB, force RAID initialization,
 #       min recovery rate at 20 kiB/sec/disk
+
 0 1960893648 raid \
-        raid4 4 2048 min_recovery_rate 20 sync\
-        5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81
+        raid4 4 2048 sync min_recovery_rate 20 \
+        5 8:17 8:18 8:33 8:34 8:49 8:50 8:65 8:66 8:81 8:82
 
-Performing a 'dmsetup table' should display the CTR table used to
-construct the mapping (with possible reordering of optional
-parameters).
+'dmsetup table' displays the table used to construct the mapping.
+The optional parameters are always printed in the order listed
+above with "sync" or "nosync" always output ahead of the other
+arguments, regardless of the order used when originally loading the table.
+Arguments that can be repeated are ordered by value.
 
-Performing a 'dmsetup status' will yield information on the state and
-health of the array.  The output is as follows:
+'dmsetup status' yields information on the state and health of the
+array.
+The output is as follows:
 1: <s> <l> raid \
 2:      <raid_type> <#devices> <1 health char for each dev> <resync_ratio>
 
-Line 1 is standard DM output.  Line 2 is best shown by example:
+Line 1 is the standard output produced by device-mapper.
+Line 2 is produced by the raid target, and best explained by example:
         0 1960893648 raid raid4 5 AAAAA 2/490221568
 Here we can see the RAID type is raid4, there are 5 devices - all of
 which are 'A'live, and the array is 2/490221568 complete with recovery.
+Faulty or missing devices are marked 'D'.  Devices that are out-of-sync
+are marked 'a'.
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 8420129..f75a66e 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -241,12 +241,13 @@
          needed for live data migration tools such as 'pvmove'.
 
 config DM_RAID
-       tristate "RAID 4/5/6 target (EXPERIMENTAL)"
+       tristate "RAID 1/4/5/6 target (EXPERIMENTAL)"
        depends on BLK_DEV_DM && EXPERIMENTAL
+       select MD_RAID1
        select MD_RAID456
        select BLK_DEV_MD
        ---help---
-	 A dm target that supports RAID4, RAID5 and RAID6 mappings
+	 A dm target that supports RAID1, RAID4, RAID5 and RAID6 mappings
 
 	 A RAID-5 set of N drives with a capacity of C MB per drive provides
 	 the capacity of C * (N - 1) MB, and protects against a failure
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index bae6c4e..49da55c 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -30,7 +30,6 @@
 #include <linux/device-mapper.h>
 
 #define DM_MSG_PREFIX "crypt"
-#define MESG_STR(x) x, sizeof(x)
 
 /*
  * context holding the current state of a multi-part conversion
@@ -239,7 +238,7 @@
 			      struct dm_crypt_request *dmreq)
 {
 	memset(iv, 0, cc->iv_size);
-	*(u32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff);
+	*(__le32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff);
 
 	return 0;
 }
@@ -248,7 +247,7 @@
 				struct dm_crypt_request *dmreq)
 {
 	memset(iv, 0, cc->iv_size);
-	*(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
+	*(__le64 *)iv = cpu_to_le64(dmreq->iv_sector);
 
 	return 0;
 }
@@ -415,7 +414,7 @@
 	struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private;
 
 	memset(iv, 0, cc->iv_size);
-	*(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
+	*(__le64 *)iv = cpu_to_le64(dmreq->iv_sector);
 	crypto_cipher_encrypt_one(essiv_tfm, iv, iv);
 
 	return 0;
@@ -1575,11 +1574,17 @@
 static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
 	struct crypt_config *cc;
-	unsigned int key_size;
+	unsigned int key_size, opt_params;
 	unsigned long long tmpll;
 	int ret;
+	struct dm_arg_set as;
+	const char *opt_string;
 
-	if (argc != 5) {
+	static struct dm_arg _args[] = {
+		{0, 1, "Invalid number of feature args"},
+	};
+
+	if (argc < 5) {
 		ti->error = "Not enough arguments";
 		return -EINVAL;
 	}
@@ -1648,6 +1653,30 @@
 	}
 	cc->start = tmpll;
 
+	argv += 5;
+	argc -= 5;
+
+	/* Optional parameters */
+	if (argc) {
+		as.argc = argc;
+		as.argv = argv;
+
+		ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
+		if (ret)
+			goto bad;
+
+		opt_string = dm_shift_arg(&as);
+
+		if (opt_params == 1 && opt_string &&
+		    !strcasecmp(opt_string, "allow_discards"))
+			ti->num_discard_requests = 1;
+		else if (opt_params) {
+			ret = -EINVAL;
+			ti->error = "Invalid feature arguments";
+			goto bad;
+		}
+	}
+
 	ret = -ENOMEM;
 	cc->io_queue = alloc_workqueue("kcryptd_io",
 				       WQ_NON_REENTRANT|
@@ -1682,9 +1711,16 @@
 	struct dm_crypt_io *io;
 	struct crypt_config *cc;
 
-	if (bio->bi_rw & REQ_FLUSH) {
+	/*
+	 * If bio is REQ_FLUSH or REQ_DISCARD, just bypass crypt queues.
+	 * - for REQ_FLUSH device-mapper core ensures that no IO is in-flight
+	 * - for REQ_DISCARD caller must use flush if IO ordering matters
+	 */
+	if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) {
 		cc = ti->private;
 		bio->bi_bdev = cc->dev->bdev;
+		if (bio_sectors(bio))
+			bio->bi_sector = cc->start + dm_target_offset(ti, bio->bi_sector);
 		return DM_MAPIO_REMAPPED;
 	}
 
@@ -1727,6 +1763,10 @@
 
 		DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset,
 				cc->dev->name, (unsigned long long)cc->start);
+
+		if (ti->num_discard_requests)
+			DMEMIT(" 1 allow_discards");
+
 		break;
 	}
 	return 0;
@@ -1770,12 +1810,12 @@
 	if (argc < 2)
 		goto error;
 
-	if (!strnicmp(argv[0], MESG_STR("key"))) {
+	if (!strcasecmp(argv[0], "key")) {
 		if (!test_bit(DM_CRYPT_SUSPENDED, &cc->flags)) {
 			DMWARN("not suspended during key manipulation.");
 			return -EINVAL;
 		}
-		if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) {
+		if (argc == 3 && !strcasecmp(argv[1], "set")) {
 			ret = crypt_set_key(cc, argv[2]);
 			if (ret)
 				return ret;
@@ -1783,7 +1823,7 @@
 				ret = cc->iv_gen_ops->init(cc);
 			return ret;
 		}
-		if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) {
+		if (argc == 2 && !strcasecmp(argv[1], "wipe")) {
 			if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) {
 				ret = cc->iv_gen_ops->wipe(cc);
 				if (ret)
@@ -1823,7 +1863,7 @@
 
 static struct target_type crypt_target = {
 	.name   = "crypt",
-	.version = {1, 10, 0},
+	.version = {1, 11, 0},
 	.module = THIS_MODULE,
 	.ctr    = crypt_ctr,
 	.dtr    = crypt_dtr,
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index ea79062..89f73ca 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2003 Sistina Software (UK) Limited.
- * Copyright (C) 2004, 2010 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004, 2010-2011 Red Hat, Inc. All rights reserved.
  *
  * This file is released under the GPL.
  */
@@ -15,6 +15,9 @@
 
 #define DM_MSG_PREFIX "flakey"
 
+#define all_corrupt_bio_flags_match(bio, fc)	\
+	(((bio)->bi_rw & (fc)->corrupt_bio_flags) == (fc)->corrupt_bio_flags)
+
 /*
  * Flakey: Used for testing only, simulates intermittent,
  * catastrophic device failure.
@@ -25,60 +28,189 @@
 	sector_t start;
 	unsigned up_interval;
 	unsigned down_interval;
+	unsigned long flags;
+	unsigned corrupt_bio_byte;
+	unsigned corrupt_bio_rw;
+	unsigned corrupt_bio_value;
+	unsigned corrupt_bio_flags;
 };
 
-/*
- * Construct a flakey mapping: <dev_path> <offset> <up interval> <down interval>
- */
-static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
-{
-	struct flakey_c *fc;
-	unsigned long long tmp;
+enum feature_flag_bits {
+	DROP_WRITES
+};
 
-	if (argc != 4) {
-		ti->error = "dm-flakey: Invalid argument count";
+static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
+			  struct dm_target *ti)
+{
+	int r;
+	unsigned argc;
+	const char *arg_name;
+
+	static struct dm_arg _args[] = {
+		{0, 6, "Invalid number of feature args"},
+		{1, UINT_MAX, "Invalid corrupt bio byte"},
+		{0, 255, "Invalid corrupt value to write into bio byte (0-255)"},
+		{0, UINT_MAX, "Invalid corrupt bio flags mask"},
+	};
+
+	/* No feature arguments supplied. */
+	if (!as->argc)
+		return 0;
+
+	r = dm_read_arg_group(_args, as, &argc, &ti->error);
+	if (r)
+		return r;
+
+	while (argc) {
+		arg_name = dm_shift_arg(as);
+		argc--;
+
+		/*
+		 * drop_writes
+		 */
+		if (!strcasecmp(arg_name, "drop_writes")) {
+			if (test_and_set_bit(DROP_WRITES, &fc->flags)) {
+				ti->error = "Feature drop_writes duplicated";
+				return -EINVAL;
+			}
+
+			continue;
+		}
+
+		/*
+		 * corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags>
+		 */
+		if (!strcasecmp(arg_name, "corrupt_bio_byte")) {
+			if (!argc) {
+				ti->error = "Feature corrupt_bio_byte requires parameters";
+				return -EINVAL;
+			}
+
+			r = dm_read_arg(_args + 1, as, &fc->corrupt_bio_byte, &ti->error);
+			if (r)
+				return r;
+			argc--;
+
+			/*
+			 * Direction r or w?
+			 */
+			arg_name = dm_shift_arg(as);
+			if (!strcasecmp(arg_name, "w"))
+				fc->corrupt_bio_rw = WRITE;
+			else if (!strcasecmp(arg_name, "r"))
+				fc->corrupt_bio_rw = READ;
+			else {
+				ti->error = "Invalid corrupt bio direction (r or w)";
+				return -EINVAL;
+			}
+			argc--;
+
+			/*
+			 * Value of byte (0-255) to write in place of correct one.
+			 */
+			r = dm_read_arg(_args + 2, as, &fc->corrupt_bio_value, &ti->error);
+			if (r)
+				return r;
+			argc--;
+
+			/*
+			 * Only corrupt bios with these flags set.
+			 */
+			r = dm_read_arg(_args + 3, as, &fc->corrupt_bio_flags, &ti->error);
+			if (r)
+				return r;
+			argc--;
+
+			continue;
+		}
+
+		ti->error = "Unrecognised flakey feature requested";
 		return -EINVAL;
 	}
 
-	fc = kmalloc(sizeof(*fc), GFP_KERNEL);
+	if (test_bit(DROP_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) {
+		ti->error = "drop_writes is incompatible with corrupt_bio_byte with the WRITE flag set";
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Construct a flakey mapping:
+ * <dev_path> <offset> <up interval> <down interval> [<#feature args> [<arg>]*]
+ *
+ *   Feature args:
+ *     [drop_writes]
+ *     [corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags>]
+ *
+ *   Nth_byte starts from 1 for the first byte.
+ *   Direction is r for READ or w for WRITE.
+ *   bio_flags is ignored if 0.
+ */
+static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	static struct dm_arg _args[] = {
+		{0, UINT_MAX, "Invalid up interval"},
+		{0, UINT_MAX, "Invalid down interval"},
+	};
+
+	int r;
+	struct flakey_c *fc;
+	unsigned long long tmpll;
+	struct dm_arg_set as;
+	const char *devname;
+
+	as.argc = argc;
+	as.argv = argv;
+
+	if (argc < 4) {
+		ti->error = "Invalid argument count";
+		return -EINVAL;
+	}
+
+	fc = kzalloc(sizeof(*fc), GFP_KERNEL);
 	if (!fc) {
-		ti->error = "dm-flakey: Cannot allocate linear context";
+		ti->error = "Cannot allocate linear context";
 		return -ENOMEM;
 	}
 	fc->start_time = jiffies;
 
-	if (sscanf(argv[1], "%llu", &tmp) != 1) {
-		ti->error = "dm-flakey: Invalid device sector";
-		goto bad;
-	}
-	fc->start = tmp;
+	devname = dm_shift_arg(&as);
 
-	if (sscanf(argv[2], "%u", &fc->up_interval) != 1) {
-		ti->error = "dm-flakey: Invalid up interval";
+	if (sscanf(dm_shift_arg(&as), "%llu", &tmpll) != 1) {
+		ti->error = "Invalid device sector";
 		goto bad;
 	}
+	fc->start = tmpll;
 
-	if (sscanf(argv[3], "%u", &fc->down_interval) != 1) {
-		ti->error = "dm-flakey: Invalid down interval";
+	r = dm_read_arg(_args, &as, &fc->up_interval, &ti->error);
+	if (r)
 		goto bad;
-	}
+
+	r = dm_read_arg(_args, &as, &fc->down_interval, &ti->error);
+	if (r)
+		goto bad;
 
 	if (!(fc->up_interval + fc->down_interval)) {
-		ti->error = "dm-flakey: Total (up + down) interval is zero";
+		ti->error = "Total (up + down) interval is zero";
 		goto bad;
 	}
 
 	if (fc->up_interval + fc->down_interval < fc->up_interval) {
-		ti->error = "dm-flakey: Interval overflow";
+		ti->error = "Interval overflow";
 		goto bad;
 	}
 
-	if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &fc->dev)) {
-		ti->error = "dm-flakey: Device lookup failed";
+	r = parse_features(&as, fc, ti);
+	if (r)
+		goto bad;
+
+	if (dm_get_device(ti, devname, dm_table_get_mode(ti->table), &fc->dev)) {
+		ti->error = "Device lookup failed";
 		goto bad;
 	}
 
 	ti->num_flush_requests = 1;
+	ti->num_discard_requests = 1;
 	ti->private = fc;
 	return 0;
 
@@ -99,7 +231,7 @@
 {
 	struct flakey_c *fc = ti->private;
 
-	return fc->start + (bi_sector - ti->begin);
+	return fc->start + dm_target_offset(ti, bi_sector);
 }
 
 static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
@@ -111,6 +243,25 @@
 		bio->bi_sector = flakey_map_sector(ti, bio->bi_sector);
 }
 
+static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc)
+{
+	unsigned bio_bytes = bio_cur_bytes(bio);
+	char *data = bio_data(bio);
+
+	/*
+	 * Overwrite the Nth byte of the data returned.
+	 */
+	if (data && bio_bytes >= fc->corrupt_bio_byte) {
+		data[fc->corrupt_bio_byte - 1] = fc->corrupt_bio_value;
+
+		DMDEBUG("Corrupting data bio=%p by writing %u to byte %u "
+			"(rw=%c bi_rw=%lu bi_sector=%llu cur_bytes=%u)\n",
+			bio, fc->corrupt_bio_value, fc->corrupt_bio_byte,
+			(bio_data_dir(bio) == WRITE) ? 'w' : 'r',
+			bio->bi_rw, (unsigned long long)bio->bi_sector, bio_bytes);
+	}
+}
+
 static int flakey_map(struct dm_target *ti, struct bio *bio,
 		      union map_info *map_context)
 {
@@ -119,18 +270,71 @@
 
 	/* Are we alive ? */
 	elapsed = (jiffies - fc->start_time) / HZ;
-	if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval)
-		return -EIO;
+	if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) {
+		/*
+		 * Flag this bio as submitted while down.
+		 */
+		map_context->ll = 1;
 
+		/*
+		 * Map reads as normal.
+		 */
+		if (bio_data_dir(bio) == READ)
+			goto map_bio;
+
+		/*
+		 * Drop writes?
+		 */
+		if (test_bit(DROP_WRITES, &fc->flags)) {
+			bio_endio(bio, 0);
+			return DM_MAPIO_SUBMITTED;
+		}
+
+		/*
+		 * Corrupt matching writes.
+		 */
+		if (fc->corrupt_bio_byte && (fc->corrupt_bio_rw == WRITE)) {
+			if (all_corrupt_bio_flags_match(bio, fc))
+				corrupt_bio_data(bio, fc);
+			goto map_bio;
+		}
+
+		/*
+		 * By default, error all I/O.
+		 */
+		return -EIO;
+	}
+
+map_bio:
 	flakey_map_bio(ti, bio);
 
 	return DM_MAPIO_REMAPPED;
 }
 
+static int flakey_end_io(struct dm_target *ti, struct bio *bio,
+			 int error, union map_info *map_context)
+{
+	struct flakey_c *fc = ti->private;
+	unsigned bio_submitted_while_down = map_context->ll;
+
+	/*
+	 * Corrupt successful READs while in down state.
+	 * If flags were specified, only corrupt those that match.
+	 */
+	if (!error && bio_submitted_while_down &&
+	    (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) &&
+	    all_corrupt_bio_flags_match(bio, fc))
+		corrupt_bio_data(bio, fc);
+
+	return error;
+}
+
 static int flakey_status(struct dm_target *ti, status_type_t type,
 			 char *result, unsigned int maxlen)
 {
+	unsigned sz = 0;
 	struct flakey_c *fc = ti->private;
+	unsigned drop_writes;
 
 	switch (type) {
 	case STATUSTYPE_INFO:
@@ -138,9 +342,22 @@
 		break;
 
 	case STATUSTYPE_TABLE:
-		snprintf(result, maxlen, "%s %llu %u %u", fc->dev->name,
-			 (unsigned long long)fc->start, fc->up_interval,
-			 fc->down_interval);
+		DMEMIT("%s %llu %u %u ", fc->dev->name,
+		       (unsigned long long)fc->start, fc->up_interval,
+		       fc->down_interval);
+
+		drop_writes = test_bit(DROP_WRITES, &fc->flags);
+		DMEMIT("%u ", drop_writes + (fc->corrupt_bio_byte > 0) * 5);
+
+		if (drop_writes)
+			DMEMIT("drop_writes ");
+
+		if (fc->corrupt_bio_byte)
+			DMEMIT("corrupt_bio_byte %u %c %u %u ",
+			       fc->corrupt_bio_byte,
+			       (fc->corrupt_bio_rw == WRITE) ? 'w' : 'r',
+			       fc->corrupt_bio_value, fc->corrupt_bio_flags);
+
 		break;
 	}
 	return 0;
@@ -177,11 +394,12 @@
 
 static struct target_type flakey_target = {
 	.name   = "flakey",
-	.version = {1, 1, 0},
+	.version = {1, 2, 0},
 	.module = THIS_MODULE,
 	.ctr    = flakey_ctr,
 	.dtr    = flakey_dtr,
 	.map    = flakey_map,
+	.end_io = flakey_end_io,
 	.status = flakey_status,
 	.ioctl	= flakey_ioctl,
 	.merge	= flakey_merge,
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 2067288..ad2eba4 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -38,6 +38,8 @@
 	struct dm_io_client *client;
 	io_notify_fn callback;
 	void *context;
+	void *vma_invalidate_address;
+	unsigned long vma_invalidate_size;
 } __attribute__((aligned(DM_IO_MAX_REGIONS)));
 
 static struct kmem_cache *_dm_io_cache;
@@ -116,6 +118,10 @@
 		set_bit(region, &io->error_bits);
 
 	if (atomic_dec_and_test(&io->count)) {
+		if (io->vma_invalidate_size)
+			invalidate_kernel_vmap_range(io->vma_invalidate_address,
+						     io->vma_invalidate_size);
+
 		if (io->sleeper)
 			wake_up_process(io->sleeper);
 
@@ -159,6 +165,9 @@
 
 	unsigned context_u;
 	void *context_ptr;
+
+	void *vma_invalidate_address;
+	unsigned long vma_invalidate_size;
 };
 
 /*
@@ -377,6 +386,9 @@
 	io->sleeper = current;
 	io->client = client;
 
+	io->vma_invalidate_address = dp->vma_invalidate_address;
+	io->vma_invalidate_size = dp->vma_invalidate_size;
+
 	dispatch_io(rw, num_regions, where, dp, io, 1);
 
 	while (1) {
@@ -415,13 +427,21 @@
 	io->callback = fn;
 	io->context = context;
 
+	io->vma_invalidate_address = dp->vma_invalidate_address;
+	io->vma_invalidate_size = dp->vma_invalidate_size;
+
 	dispatch_io(rw, num_regions, where, dp, io, 0);
 	return 0;
 }
 
-static int dp_init(struct dm_io_request *io_req, struct dpages *dp)
+static int dp_init(struct dm_io_request *io_req, struct dpages *dp,
+		   unsigned long size)
 {
 	/* Set up dpages based on memory type */
+
+	dp->vma_invalidate_address = NULL;
+	dp->vma_invalidate_size = 0;
+
 	switch (io_req->mem.type) {
 	case DM_IO_PAGE_LIST:
 		list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset);
@@ -432,6 +452,11 @@
 		break;
 
 	case DM_IO_VMA:
+		flush_kernel_vmap_range(io_req->mem.ptr.vma, size);
+		if ((io_req->bi_rw & RW_MASK) == READ) {
+			dp->vma_invalidate_address = io_req->mem.ptr.vma;
+			dp->vma_invalidate_size = size;
+		}
 		vm_dp_init(dp, io_req->mem.ptr.vma);
 		break;
 
@@ -460,7 +485,7 @@
 	int r;
 	struct dpages dp;
 
-	r = dp_init(io_req, &dp);
+	r = dp_init(io_req, &dp, (unsigned long)where->count << SECTOR_SHIFT);
 	if (r)
 		return r;
 
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 4cacdad..2e9a3ca 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -128,6 +128,24 @@
 	return NULL;
 }
 
+static struct hash_cell *__get_dev_cell(uint64_t dev)
+{
+	struct mapped_device *md;
+	struct hash_cell *hc;
+
+	md = dm_get_md(huge_decode_dev(dev));
+	if (!md)
+		return NULL;
+
+	hc = dm_get_mdptr(md);
+	if (!hc) {
+		dm_put(md);
+		return NULL;
+	}
+
+	return hc;
+}
+
 /*-----------------------------------------------------------------
  * Inserting, removing and renaming a device.
  *---------------------------------------------------------------*/
@@ -718,25 +736,45 @@
  */
 static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param)
 {
-	struct mapped_device *md;
-	void *mdptr = NULL;
+	struct hash_cell *hc = NULL;
 
-	if (*param->uuid)
-		return __get_uuid_cell(param->uuid);
+	if (*param->uuid) {
+		if (*param->name || param->dev)
+			return NULL;
 
-	if (*param->name)
-		return __get_name_cell(param->name);
+		hc = __get_uuid_cell(param->uuid);
+		if (!hc)
+			return NULL;
+	} else if (*param->name) {
+		if (param->dev)
+			return NULL;
 
-	md = dm_get_md(huge_decode_dev(param->dev));
-	if (!md)
-		goto out;
+		hc = __get_name_cell(param->name);
+		if (!hc)
+			return NULL;
+	} else if (param->dev) {
+		hc = __get_dev_cell(param->dev);
+		if (!hc)
+			return NULL;
+	} else
+		return NULL;
 
-	mdptr = dm_get_mdptr(md);
-	if (!mdptr)
-		dm_put(md);
+	/*
+	 * Sneakily write in both the name and the uuid
+	 * while we have the cell.
+	 */
+	strlcpy(param->name, hc->name, sizeof(param->name));
+	if (hc->uuid)
+		strlcpy(param->uuid, hc->uuid, sizeof(param->uuid));
+	else
+		param->uuid[0] = '\0';
 
-out:
-	return mdptr;
+	if (hc->new_map)
+		param->flags |= DM_INACTIVE_PRESENT_FLAG;
+	else
+		param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
+
+	return hc;
 }
 
 static struct mapped_device *find_device(struct dm_ioctl *param)
@@ -746,24 +784,8 @@
 
 	down_read(&_hash_lock);
 	hc = __find_device_hash_cell(param);
-	if (hc) {
+	if (hc)
 		md = hc->md;
-
-		/*
-		 * Sneakily write in both the name and the uuid
-		 * while we have the cell.
-		 */
-		strlcpy(param->name, hc->name, sizeof(param->name));
-		if (hc->uuid)
-			strlcpy(param->uuid, hc->uuid, sizeof(param->uuid));
-		else
-			param->uuid[0] = '\0';
-
-		if (hc->new_map)
-			param->flags |= DM_INACTIVE_PRESENT_FLAG;
-		else
-			param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
-	}
 	up_read(&_hash_lock);
 
 	return md;
@@ -1402,6 +1424,11 @@
 		goto out;
 	}
 
+	if (!argc) {
+		DMWARN("Empty message received.");
+		goto out;
+	}
+
 	table = dm_get_live_table(md);
 	if (!table)
 		goto out_argv;
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 320401d..f821470 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -224,8 +224,6 @@
 	unsigned int num_dests;
 	struct dm_io_region dests[DM_KCOPYD_MAX_REGIONS];
 
-	sector_t offset;
-	unsigned int nr_pages;
 	struct page_list *pages;
 
 	/*
@@ -380,7 +378,7 @@
 		.bi_rw = job->rw,
 		.mem.type = DM_IO_PAGE_LIST,
 		.mem.ptr.pl = job->pages,
-		.mem.offset = job->offset,
+		.mem.offset = 0,
 		.notify.fn = complete_io,
 		.notify.context = job,
 		.client = job->kc->io_client,
@@ -397,10 +395,9 @@
 static int run_pages_job(struct kcopyd_job *job)
 {
 	int r;
+	unsigned nr_pages = dm_div_up(job->dests[0].count, PAGE_SIZE >> 9);
 
-	job->nr_pages = dm_div_up(job->dests[0].count + job->offset,
-				  PAGE_SIZE >> 9);
-	r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages);
+	r = kcopyd_get_pages(job->kc, nr_pages, &job->pages);
 	if (!r) {
 		/* this job is ready for io */
 		push(&job->kc->io_jobs, job);
@@ -602,8 +599,6 @@
 	job->num_dests = num_dests;
 	memcpy(&job->dests, dests, sizeof(*dests) * num_dests);
 
-	job->offset = 0;
-	job->nr_pages = 0;
 	job->pages = NULL;
 
 	job->fn = fn;
@@ -622,6 +617,37 @@
 }
 EXPORT_SYMBOL(dm_kcopyd_copy);
 
+void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc,
+				 dm_kcopyd_notify_fn fn, void *context)
+{
+	struct kcopyd_job *job;
+
+	job = mempool_alloc(kc->job_pool, GFP_NOIO);
+
+	memset(job, 0, sizeof(struct kcopyd_job));
+	job->kc = kc;
+	job->fn = fn;
+	job->context = context;
+
+	atomic_inc(&kc->nr_jobs);
+
+	return job;
+}
+EXPORT_SYMBOL(dm_kcopyd_prepare_callback);
+
+void dm_kcopyd_do_callback(void *j, int read_err, unsigned long write_err)
+{
+	struct kcopyd_job *job = j;
+	struct dm_kcopyd_client *kc = job->kc;
+
+	job->read_err = read_err;
+	job->write_err = write_err;
+
+	push(&kc->complete_jobs, job);
+	wake(kc);
+}
+EXPORT_SYMBOL(dm_kcopyd_do_callback);
+
 /*
  * Cancels a kcopyd job, eg. someone might be deactivating a
  * mirror.
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index aa2e0c3..1021c89 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -394,8 +394,7 @@
 			group[count] = fe->region;
 			count++;
 
-			list_del(&fe->list);
-			list_add(&fe->list, &tmp_list);
+			list_move(&fe->list, &tmp_list);
 
 			type = fe->type;
 			if (count >= MAX_FLUSH_GROUP_COUNT)
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 948e3f4..3b52bb7 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -197,15 +197,21 @@
 #define MIRROR_DISK_VERSION 2
 #define LOG_OFFSET 2
 
-struct log_header {
-	uint32_t magic;
+struct log_header_disk {
+	__le32 magic;
 
 	/*
 	 * Simple, incrementing version. no backward
 	 * compatibility.
 	 */
+	__le32 version;
+	__le64 nr_regions;
+} __packed;
+
+struct log_header_core {
+	uint32_t magic;
 	uint32_t version;
-	sector_t nr_regions;
+	uint64_t nr_regions;
 };
 
 struct log_c {
@@ -239,10 +245,10 @@
 	int log_dev_failed;
 	int log_dev_flush_failed;
 	struct dm_dev *log_dev;
-	struct log_header header;
+	struct log_header_core header;
 
 	struct dm_io_region header_location;
-	struct log_header *disk_header;
+	struct log_header_disk *disk_header;
 };
 
 /*
@@ -251,34 +257,34 @@
  */
 static inline int log_test_bit(uint32_t *bs, unsigned bit)
 {
-	return test_bit_le(bit, (unsigned long *) bs) ? 1 : 0;
+	return test_bit_le(bit, bs) ? 1 : 0;
 }
 
 static inline void log_set_bit(struct log_c *l,
 			       uint32_t *bs, unsigned bit)
 {
-	__test_and_set_bit_le(bit, (unsigned long *) bs);
+	__set_bit_le(bit, bs);
 	l->touched_cleaned = 1;
 }
 
 static inline void log_clear_bit(struct log_c *l,
 				 uint32_t *bs, unsigned bit)
 {
-	__test_and_clear_bit_le(bit, (unsigned long *) bs);
+	__clear_bit_le(bit, bs);
 	l->touched_dirtied = 1;
 }
 
 /*----------------------------------------------------------------
  * Header IO
  *--------------------------------------------------------------*/
-static void header_to_disk(struct log_header *core, struct log_header *disk)
+static void header_to_disk(struct log_header_core *core, struct log_header_disk *disk)
 {
 	disk->magic = cpu_to_le32(core->magic);
 	disk->version = cpu_to_le32(core->version);
 	disk->nr_regions = cpu_to_le64(core->nr_regions);
 }
 
-static void header_from_disk(struct log_header *core, struct log_header *disk)
+static void header_from_disk(struct log_header_core *core, struct log_header_disk *disk)
 {
 	core->magic = le32_to_cpu(disk->magic);
 	core->version = le32_to_cpu(disk->version);
@@ -486,7 +492,7 @@
 	memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size);
 	lc->sync_count = (sync == NOSYNC) ? region_count : 0;
 
-	lc->recovering_bits = vmalloc(bitset_size);
+	lc->recovering_bits = vzalloc(bitset_size);
 	if (!lc->recovering_bits) {
 		DMWARN("couldn't allocate sync bitset");
 		vfree(lc->sync_bits);
@@ -498,7 +504,6 @@
 		kfree(lc);
 		return -ENOMEM;
 	}
-	memset(lc->recovering_bits, 0, bitset_size);
 	lc->sync_search = 0;
 	log->context = lc;
 
@@ -739,8 +744,7 @@
 		return 0;
 
 	do {
-		*region = find_next_zero_bit_le(
-					     (unsigned long *) lc->sync_bits,
+		*region = find_next_zero_bit_le(lc->sync_bits,
 					     lc->region_count,
 					     lc->sync_search);
 		lc->sync_search = *region + 1;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index c354701..5e0090e 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -22,7 +22,6 @@
 #include <linux/atomic.h>
 
 #define DM_MSG_PREFIX "multipath"
-#define MESG_STR(x) x, sizeof(x)
 #define DM_PG_INIT_DELAY_MSECS 2000
 #define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
 
@@ -505,80 +504,29 @@
  *      <#paths> <#per-path selector args>
  *         [<path> [<arg>]* ]+ ]+
  *---------------------------------------------------------------*/
-struct param {
-	unsigned min;
-	unsigned max;
-	char *error;
-};
-
-static int read_param(struct param *param, char *str, unsigned *v, char **error)
-{
-	if (!str ||
-	    (sscanf(str, "%u", v) != 1) ||
-	    (*v < param->min) ||
-	    (*v > param->max)) {
-		*error = param->error;
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-struct arg_set {
-	unsigned argc;
-	char **argv;
-};
-
-static char *shift(struct arg_set *as)
-{
-	char *r;
-
-	if (as->argc) {
-		as->argc--;
-		r = *as->argv;
-		as->argv++;
-		return r;
-	}
-
-	return NULL;
-}
-
-static void consume(struct arg_set *as, unsigned n)
-{
-	BUG_ON (as->argc < n);
-	as->argc -= n;
-	as->argv += n;
-}
-
-static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
+static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg,
 			       struct dm_target *ti)
 {
 	int r;
 	struct path_selector_type *pst;
 	unsigned ps_argc;
 
-	static struct param _params[] = {
+	static struct dm_arg _args[] = {
 		{0, 1024, "invalid number of path selector args"},
 	};
 
-	pst = dm_get_path_selector(shift(as));
+	pst = dm_get_path_selector(dm_shift_arg(as));
 	if (!pst) {
 		ti->error = "unknown path selector type";
 		return -EINVAL;
 	}
 
-	r = read_param(_params, shift(as), &ps_argc, &ti->error);
+	r = dm_read_arg_group(_args, as, &ps_argc, &ti->error);
 	if (r) {
 		dm_put_path_selector(pst);
 		return -EINVAL;
 	}
 
-	if (ps_argc > as->argc) {
-		dm_put_path_selector(pst);
-		ti->error = "not enough arguments for path selector";
-		return -EINVAL;
-	}
-
 	r = pst->create(&pg->ps, ps_argc, as->argv);
 	if (r) {
 		dm_put_path_selector(pst);
@@ -587,12 +535,12 @@
 	}
 
 	pg->ps.type = pst;
-	consume(as, ps_argc);
+	dm_consume_args(as, ps_argc);
 
 	return 0;
 }
 
-static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
+static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps,
 			       struct dm_target *ti)
 {
 	int r;
@@ -609,7 +557,7 @@
 	if (!p)
 		return ERR_PTR(-ENOMEM);
 
-	r = dm_get_device(ti, shift(as), dm_table_get_mode(ti->table),
+	r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
 			  &p->path.dev);
 	if (r) {
 		ti->error = "error getting device";
@@ -660,16 +608,16 @@
 	return ERR_PTR(r);
 }
 
-static struct priority_group *parse_priority_group(struct arg_set *as,
+static struct priority_group *parse_priority_group(struct dm_arg_set *as,
 						   struct multipath *m)
 {
-	static struct param _params[] = {
+	static struct dm_arg _args[] = {
 		{1, 1024, "invalid number of paths"},
 		{0, 1024, "invalid number of selector args"}
 	};
 
 	int r;
-	unsigned i, nr_selector_args, nr_params;
+	unsigned i, nr_selector_args, nr_args;
 	struct priority_group *pg;
 	struct dm_target *ti = m->ti;
 
@@ -693,26 +641,26 @@
 	/*
 	 * read the paths
 	 */
-	r = read_param(_params, shift(as), &pg->nr_pgpaths, &ti->error);
+	r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error);
 	if (r)
 		goto bad;
 
-	r = read_param(_params + 1, shift(as), &nr_selector_args, &ti->error);
+	r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error);
 	if (r)
 		goto bad;
 
-	nr_params = 1 + nr_selector_args;
+	nr_args = 1 + nr_selector_args;
 	for (i = 0; i < pg->nr_pgpaths; i++) {
 		struct pgpath *pgpath;
-		struct arg_set path_args;
+		struct dm_arg_set path_args;
 
-		if (as->argc < nr_params) {
+		if (as->argc < nr_args) {
 			ti->error = "not enough path parameters";
 			r = -EINVAL;
 			goto bad;
 		}
 
-		path_args.argc = nr_params;
+		path_args.argc = nr_args;
 		path_args.argv = as->argv;
 
 		pgpath = parse_path(&path_args, &pg->ps, ti);
@@ -723,7 +671,7 @@
 
 		pgpath->pg = pg;
 		list_add_tail(&pgpath->list, &pg->pgpaths);
-		consume(as, nr_params);
+		dm_consume_args(as, nr_args);
 	}
 
 	return pg;
@@ -733,28 +681,23 @@
 	return ERR_PTR(r);
 }
 
-static int parse_hw_handler(struct arg_set *as, struct multipath *m)
+static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
 {
 	unsigned hw_argc;
 	int ret;
 	struct dm_target *ti = m->ti;
 
-	static struct param _params[] = {
+	static struct dm_arg _args[] = {
 		{0, 1024, "invalid number of hardware handler args"},
 	};
 
-	if (read_param(_params, shift(as), &hw_argc, &ti->error))
+	if (dm_read_arg_group(_args, as, &hw_argc, &ti->error))
 		return -EINVAL;
 
 	if (!hw_argc)
 		return 0;
 
-	if (hw_argc > as->argc) {
-		ti->error = "not enough arguments for hardware handler";
-		return -EINVAL;
-	}
-
-	m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL);
+	m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);
 	request_module("scsi_dh_%s", m->hw_handler_name);
 	if (scsi_dh_handler_exist(m->hw_handler_name) == 0) {
 		ti->error = "unknown hardware handler type";
@@ -778,7 +721,7 @@
 		for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1)
 			j = sprintf(p, "%s", as->argv[i]);
 	}
-	consume(as, hw_argc - 1);
+	dm_consume_args(as, hw_argc - 1);
 
 	return 0;
 fail:
@@ -787,20 +730,20 @@
 	return ret;
 }
 
-static int parse_features(struct arg_set *as, struct multipath *m)
+static int parse_features(struct dm_arg_set *as, struct multipath *m)
 {
 	int r;
 	unsigned argc;
 	struct dm_target *ti = m->ti;
-	const char *param_name;
+	const char *arg_name;
 
-	static struct param _params[] = {
+	static struct dm_arg _args[] = {
 		{0, 5, "invalid number of feature args"},
 		{1, 50, "pg_init_retries must be between 1 and 50"},
 		{0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
 	};
 
-	r = read_param(_params, shift(as), &argc, &ti->error);
+	r = dm_read_arg_group(_args, as, &argc, &ti->error);
 	if (r)
 		return -EINVAL;
 
@@ -808,26 +751,24 @@
 		return 0;
 
 	do {
-		param_name = shift(as);
+		arg_name = dm_shift_arg(as);
 		argc--;
 
-		if (!strnicmp(param_name, MESG_STR("queue_if_no_path"))) {
+		if (!strcasecmp(arg_name, "queue_if_no_path")) {
 			r = queue_if_no_path(m, 1, 0);
 			continue;
 		}
 
-		if (!strnicmp(param_name, MESG_STR("pg_init_retries")) &&
+		if (!strcasecmp(arg_name, "pg_init_retries") &&
 		    (argc >= 1)) {
-			r = read_param(_params + 1, shift(as),
-				       &m->pg_init_retries, &ti->error);
+			r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error);
 			argc--;
 			continue;
 		}
 
-		if (!strnicmp(param_name, MESG_STR("pg_init_delay_msecs")) &&
+		if (!strcasecmp(arg_name, "pg_init_delay_msecs") &&
 		    (argc >= 1)) {
-			r = read_param(_params + 2, shift(as),
-				       &m->pg_init_delay_msecs, &ti->error);
+			r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error);
 			argc--;
 			continue;
 		}
@@ -842,15 +783,15 @@
 static int multipath_ctr(struct dm_target *ti, unsigned int argc,
 			 char **argv)
 {
-	/* target parameters */
-	static struct param _params[] = {
+	/* target arguments */
+	static struct dm_arg _args[] = {
 		{0, 1024, "invalid number of priority groups"},
 		{0, 1024, "invalid initial priority group number"},
 	};
 
 	int r;
 	struct multipath *m;
-	struct arg_set as;
+	struct dm_arg_set as;
 	unsigned pg_count = 0;
 	unsigned next_pg_num;
 
@@ -871,11 +812,11 @@
 	if (r)
 		goto bad;
 
-	r = read_param(_params, shift(&as), &m->nr_priority_groups, &ti->error);
+	r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error);
 	if (r)
 		goto bad;
 
-	r = read_param(_params + 1, shift(&as), &next_pg_num, &ti->error);
+	r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error);
 	if (r)
 		goto bad;
 
@@ -1505,10 +1446,10 @@
 	}
 
 	if (argc == 1) {
-		if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) {
+		if (!strcasecmp(argv[0], "queue_if_no_path")) {
 			r = queue_if_no_path(m, 1, 0);
 			goto out;
-		} else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) {
+		} else if (!strcasecmp(argv[0], "fail_if_no_path")) {
 			r = queue_if_no_path(m, 0, 0);
 			goto out;
 		}
@@ -1519,18 +1460,18 @@
 		goto out;
 	}
 
-	if (!strnicmp(argv[0], MESG_STR("disable_group"))) {
+	if (!strcasecmp(argv[0], "disable_group")) {
 		r = bypass_pg_num(m, argv[1], 1);
 		goto out;
-	} else if (!strnicmp(argv[0], MESG_STR("enable_group"))) {
+	} else if (!strcasecmp(argv[0], "enable_group")) {
 		r = bypass_pg_num(m, argv[1], 0);
 		goto out;
-	} else if (!strnicmp(argv[0], MESG_STR("switch_group"))) {
+	} else if (!strcasecmp(argv[0], "switch_group")) {
 		r = switch_pg_num(m, argv[1]);
 		goto out;
-	} else if (!strnicmp(argv[0], MESG_STR("reinstate_path")))
+	} else if (!strcasecmp(argv[0], "reinstate_path"))
 		action = reinstate_path;
-	else if (!strnicmp(argv[0], MESG_STR("fail_path")))
+	else if (!strcasecmp(argv[0], "fail_path"))
 		action = fail_path;
 	else {
 		DMWARN("Unrecognised multipath message received.");
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index e5d8904..a002dd8 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -8,19 +8,19 @@
 #include <linux/slab.h>
 
 #include "md.h"
+#include "raid1.h"
 #include "raid5.h"
-#include "dm.h"
 #include "bitmap.h"
 
+#include <linux/device-mapper.h>
+
 #define DM_MSG_PREFIX "raid"
 
 /*
- * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then
- * make it so the flag doesn't set anything.
+ * The following flags are used by dm-raid.c to set up the array state.
+ * They must be cleared before md_run is called.
  */
-#ifndef MD_SYNC_STATE_FORCED
-#define MD_SYNC_STATE_FORCED 0
-#endif
+#define FirstUse 10             /* rdev flag */
 
 struct raid_dev {
 	/*
@@ -43,14 +43,15 @@
 /*
  * Flags for rs->print_flags field.
  */
-#define DMPF_DAEMON_SLEEP      0x1
-#define DMPF_MAX_WRITE_BEHIND  0x2
-#define DMPF_SYNC              0x4
-#define DMPF_NOSYNC            0x8
-#define DMPF_STRIPE_CACHE      0x10
-#define DMPF_MIN_RECOVERY_RATE 0x20
-#define DMPF_MAX_RECOVERY_RATE 0x40
-
+#define DMPF_SYNC              0x1
+#define DMPF_NOSYNC            0x2
+#define DMPF_REBUILD           0x4
+#define DMPF_DAEMON_SLEEP      0x8
+#define DMPF_MIN_RECOVERY_RATE 0x10
+#define DMPF_MAX_RECOVERY_RATE 0x20
+#define DMPF_MAX_WRITE_BEHIND  0x40
+#define DMPF_STRIPE_CACHE      0x80
+#define DMPF_REGION_SIZE       0x100
 struct raid_set {
 	struct dm_target *ti;
 
@@ -72,6 +73,7 @@
 	const unsigned level;		/* RAID level. */
 	const unsigned algorithm;	/* RAID algorithm. */
 } raid_types[] = {
+	{"raid1",    "RAID1 (mirroring)",               0, 2, 1, 0 /* NONE */},
 	{"raid4",    "RAID4 (dedicated parity disk)",	1, 2, 5, ALGORITHM_PARITY_0},
 	{"raid5_la", "RAID5 (left asymmetric)",		1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
 	{"raid5_ra", "RAID5 (right asymmetric)",	1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
@@ -105,7 +107,8 @@
 	}
 
 	sectors_per_dev = ti->len;
-	if (sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
+	if ((raid_type->level > 1) &&
+	    sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
 		ti->error = "Target length not divisible by number of data devices";
 		return ERR_PTR(-EINVAL);
 	}
@@ -147,9 +150,16 @@
 {
 	int i;
 
-	for (i = 0; i < rs->md.raid_disks; i++)
+	for (i = 0; i < rs->md.raid_disks; i++) {
+		if (rs->dev[i].meta_dev)
+			dm_put_device(rs->ti, rs->dev[i].meta_dev);
+		if (rs->dev[i].rdev.sb_page)
+			put_page(rs->dev[i].rdev.sb_page);
+		rs->dev[i].rdev.sb_page = NULL;
+		rs->dev[i].rdev.sb_loaded = 0;
 		if (rs->dev[i].data_dev)
 			dm_put_device(rs->ti, rs->dev[i].data_dev);
+	}
 
 	kfree(rs);
 }
@@ -159,7 +169,16 @@
  *  <meta_dev>: meta device name or '-' if missing
  *  <data_dev>: data device name or '-' if missing
  *
- * This code parses those words.
+ * The following are permitted:
+ *    - -
+ *    - <data_dev>
+ *    <meta_dev> <data_dev>
+ *
+ * The following is not allowed:
+ *    <meta_dev> -
+ *
+ * This code parses those words.  If there is a failure,
+ * the caller must use context_free to unwind the operations.
  */
 static int dev_parms(struct raid_set *rs, char **argv)
 {
@@ -182,8 +201,16 @@
 		rs->dev[i].rdev.mddev = &rs->md;
 
 		if (strcmp(argv[0], "-")) {
-			rs->ti->error = "Metadata devices not supported";
-			return -EINVAL;
+			ret = dm_get_device(rs->ti, argv[0],
+					    dm_table_get_mode(rs->ti->table),
+					    &rs->dev[i].meta_dev);
+			rs->ti->error = "RAID metadata device lookup failure";
+			if (ret)
+				return ret;
+
+			rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL);
+			if (!rs->dev[i].rdev.sb_page)
+				return -ENOMEM;
 		}
 
 		if (!strcmp(argv[1], "-")) {
@@ -193,6 +220,10 @@
 				return -EINVAL;
 			}
 
+			rs->ti->error = "No data device supplied with metadata device";
+			if (rs->dev[i].meta_dev)
+				return -EINVAL;
+
 			continue;
 		}
 
@@ -204,6 +235,10 @@
 			return ret;
 		}
 
+		if (rs->dev[i].meta_dev) {
+			metadata_available = 1;
+			rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev;
+		}
 		rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
 		list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
 		if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
@@ -235,33 +270,109 @@
 }
 
 /*
+ * validate_region_size
+ * @rs
+ * @region_size:  region size in sectors.  If 0, pick a size (4MiB default).
+ *
+ * Set rs->md.bitmap_info.chunksize (which really refers to 'region size').
+ * Ensure that (ti->len/region_size < 2^21) - required by MD bitmap.
+ *
+ * Returns: 0 on success, -EINVAL on failure.
+ */
+static int validate_region_size(struct raid_set *rs, unsigned long region_size)
+{
+	unsigned long min_region_size = rs->ti->len / (1 << 21);
+
+	if (!region_size) {
+		/*
+		 * Choose a reasonable default.  All figures in sectors.
+		 */
+		if (min_region_size > (1 << 13)) {
+			region_size = min_region_size;
+			DMINFO("Choosing default region size of %lu sectors",
+			       region_size);
+		} else {
+			DMINFO("Choosing default region size of 4MiB");
+			region_size = 1 << 13; /* sectors */
+		}
+	} else {
+		/*
+		 * Validate user-supplied value.
+		 */
+		if (region_size > rs->ti->len) {
+			rs->ti->error = "Supplied region size is too large";
+			return -EINVAL;
+		}
+
+		if (region_size < min_region_size) {
+			DMERR("Supplied region_size (%lu sectors) below minimum (%lu)",
+			      region_size, min_region_size);
+			rs->ti->error = "Supplied region size is too small";
+			return -EINVAL;
+		}
+
+		if (!is_power_of_2(region_size)) {
+			rs->ti->error = "Region size is not a power of 2";
+			return -EINVAL;
+		}
+
+		if (region_size < rs->md.chunk_sectors) {
+			rs->ti->error = "Region size is smaller than the chunk size";
+			return -EINVAL;
+		}
+	}
+
+	/*
+	 * Convert sectors to bytes.
+	 */
+	rs->md.bitmap_info.chunksize = (region_size << 9);
+
+	return 0;
+}
+
+/*
  * Possible arguments are...
- * RAID456:
  *	<chunk_size> [optional_args]
  *
- * Optional args:
- *    [[no]sync]			Force or prevent recovery of the entire array
+ * Argument definitions
+ *    <chunk_size>			The number of sectors per disk that
+ *                                      will form the "stripe"
+ *    [[no]sync]			Force or prevent recovery of the
+ *                                      entire array
  *    [rebuild <idx>]			Rebuild the drive indicated by the index
- *    [daemon_sleep <ms>]		Time between bitmap daemon work to clear bits
+ *    [daemon_sleep <ms>]		Time between bitmap daemon work to
+ *                                      clear bits
  *    [min_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
  *    [max_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
+ *    [write_mostly <idx>]		Indicate a write mostly drive via index
  *    [max_write_behind <sectors>]	See '-write-behind=' (man mdadm)
  *    [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
+ *    [region_size <sectors>]           Defines granularity of bitmap
  */
 static int parse_raid_params(struct raid_set *rs, char **argv,
 			     unsigned num_raid_params)
 {
 	unsigned i, rebuild_cnt = 0;
-	unsigned long value;
+	unsigned long value, region_size = 0;
 	char *key;
 
 	/*
 	 * First, parse the in-order required arguments
+	 * "chunk_size" is the only argument of this type.
 	 */
-	if ((strict_strtoul(argv[0], 10, &value) < 0) ||
-	    !is_power_of_2(value) || (value < 8)) {
+	if ((strict_strtoul(argv[0], 10, &value) < 0)) {
 		rs->ti->error = "Bad chunk size";
 		return -EINVAL;
+	} else if (rs->raid_type->level == 1) {
+		if (value)
+			DMERR("Ignoring chunk size parameter for RAID 1");
+		value = 0;
+	} else if (!is_power_of_2(value)) {
+		rs->ti->error = "Chunk size must be a power of 2";
+		return -EINVAL;
+	} else if (value < 8) {
+		rs->ti->error = "Chunk size value is too small";
+		return -EINVAL;
 	}
 
 	rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
@@ -269,22 +380,39 @@
 	num_raid_params--;
 
 	/*
+	 * We set each individual device as In_sync with a completed
+	 * 'recovery_offset'.  If there has been a device failure or
+	 * replacement then one of the following cases applies:
+	 *
+	 *   1) User specifies 'rebuild'.
+	 *      - Device is reset when param is read.
+	 *   2) A new device is supplied.
+	 *      - No matching superblock found, resets device.
+	 *   3) Device failure was transient and returns on reload.
+	 *      - Failure noticed, resets device for bitmap replay.
+	 *   4) Device hadn't completed recovery after previous failure.
+	 *      - Superblock is read and overrides recovery_offset.
+	 *
+	 * What is found in the superblocks of the devices is always
+	 * authoritative, unless 'rebuild' or '[no]sync' was specified.
+	 */
+	for (i = 0; i < rs->md.raid_disks; i++) {
+		set_bit(In_sync, &rs->dev[i].rdev.flags);
+		rs->dev[i].rdev.recovery_offset = MaxSector;
+	}
+
+	/*
 	 * Second, parse the unordered optional arguments
 	 */
-	for (i = 0; i < rs->md.raid_disks; i++)
-		set_bit(In_sync, &rs->dev[i].rdev.flags);
-
 	for (i = 0; i < num_raid_params; i++) {
-		if (!strcmp(argv[i], "nosync")) {
+		if (!strcasecmp(argv[i], "nosync")) {
 			rs->md.recovery_cp = MaxSector;
 			rs->print_flags |= DMPF_NOSYNC;
-			rs->md.flags |= MD_SYNC_STATE_FORCED;
 			continue;
 		}
-		if (!strcmp(argv[i], "sync")) {
+		if (!strcasecmp(argv[i], "sync")) {
 			rs->md.recovery_cp = 0;
 			rs->print_flags |= DMPF_SYNC;
-			rs->md.flags |= MD_SYNC_STATE_FORCED;
 			continue;
 		}
 
@@ -300,9 +428,13 @@
 			return -EINVAL;
 		}
 
-		if (!strcmp(key, "rebuild")) {
-			if (++rebuild_cnt > rs->raid_type->parity_devs) {
-				rs->ti->error = "Too many rebuild drives given";
+		if (!strcasecmp(key, "rebuild")) {
+			rebuild_cnt++;
+			if (((rs->raid_type->level != 1) &&
+			     (rebuild_cnt > rs->raid_type->parity_devs)) ||
+			    ((rs->raid_type->level == 1) &&
+			     (rebuild_cnt > (rs->md.raid_disks - 1)))) {
+				rs->ti->error = "Too many rebuild devices specified for given RAID type";
 				return -EINVAL;
 			}
 			if (value > rs->md.raid_disks) {
@@ -311,7 +443,22 @@
 			}
 			clear_bit(In_sync, &rs->dev[value].rdev.flags);
 			rs->dev[value].rdev.recovery_offset = 0;
-		} else if (!strcmp(key, "max_write_behind")) {
+			rs->print_flags |= DMPF_REBUILD;
+		} else if (!strcasecmp(key, "write_mostly")) {
+			if (rs->raid_type->level != 1) {
+				rs->ti->error = "write_mostly option is only valid for RAID1";
+				return -EINVAL;
+			}
+			if (value > rs->md.raid_disks) {
+				rs->ti->error = "Invalid write_mostly drive index given";
+				return -EINVAL;
+			}
+			set_bit(WriteMostly, &rs->dev[value].rdev.flags);
+		} else if (!strcasecmp(key, "max_write_behind")) {
+			if (rs->raid_type->level != 1) {
+				rs->ti->error = "max_write_behind option is only valid for RAID1";
+				return -EINVAL;
+			}
 			rs->print_flags |= DMPF_MAX_WRITE_BEHIND;
 
 			/*
@@ -324,14 +471,14 @@
 				return -EINVAL;
 			}
 			rs->md.bitmap_info.max_write_behind = value;
-		} else if (!strcmp(key, "daemon_sleep")) {
+		} else if (!strcasecmp(key, "daemon_sleep")) {
 			rs->print_flags |= DMPF_DAEMON_SLEEP;
 			if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
 				rs->ti->error = "daemon sleep period out of range";
 				return -EINVAL;
 			}
 			rs->md.bitmap_info.daemon_sleep = value;
-		} else if (!strcmp(key, "stripe_cache")) {
+		} else if (!strcasecmp(key, "stripe_cache")) {
 			rs->print_flags |= DMPF_STRIPE_CACHE;
 
 			/*
@@ -348,20 +495,23 @@
 				rs->ti->error = "Bad stripe_cache size";
 				return -EINVAL;
 			}
-		} else if (!strcmp(key, "min_recovery_rate")) {
+		} else if (!strcasecmp(key, "min_recovery_rate")) {
 			rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
 			if (value > INT_MAX) {
 				rs->ti->error = "min_recovery_rate out of range";
 				return -EINVAL;
 			}
 			rs->md.sync_speed_min = (int)value;
-		} else if (!strcmp(key, "max_recovery_rate")) {
+		} else if (!strcasecmp(key, "max_recovery_rate")) {
 			rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
 			if (value > INT_MAX) {
 				rs->ti->error = "max_recovery_rate out of range";
 				return -EINVAL;
 			}
 			rs->md.sync_speed_max = (int)value;
+		} else if (!strcasecmp(key, "region_size")) {
+			rs->print_flags |= DMPF_REGION_SIZE;
+			region_size = value;
 		} else {
 			DMERR("Unable to parse RAID parameter: %s", key);
 			rs->ti->error = "Unable to parse RAID parameters";
@@ -369,6 +519,14 @@
 		}
 	}
 
+	if (validate_region_size(rs, region_size))
+		return -EINVAL;
+
+	if (rs->md.chunk_sectors)
+		rs->ti->split_io = rs->md.chunk_sectors;
+	else
+		rs->ti->split_io = region_size;
+
 	/* Assume there are no metadata devices until the drives are parsed */
 	rs->md.persistent = 0;
 	rs->md.external = 1;
@@ -387,17 +550,351 @@
 {
 	struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
 
+	if (rs->raid_type->level == 1)
+		return md_raid1_congested(&rs->md, bits);
+
 	return md_raid5_congested(&rs->md, bits);
 }
 
 /*
+ * This structure is never routinely used by userspace, unlike md superblocks.
+ * Devices with this superblock should only ever be accessed via device-mapper.
+ */
+#define DM_RAID_MAGIC 0x64526D44
+struct dm_raid_superblock {
+	__le32 magic;		/* "DmRd" */
+	__le32 features;	/* Used to indicate possible future changes */
+
+	__le32 num_devices;	/* Number of devices in this array. (Max 64) */
+	__le32 array_position;	/* The position of this drive in the array */
+
+	__le64 events;		/* Incremented by md when superblock updated */
+	__le64 failed_devices;	/* Bit field of devices to indicate failures */
+
+	/*
+	 * This offset tracks the progress of the repair or replacement of
+	 * an individual drive.
+	 */
+	__le64 disk_recovery_offset;
+
+	/*
+	 * This offset tracks the progress of the initial array
+	 * synchronisation/parity calculation.
+	 */
+	__le64 array_resync_offset;
+
+	/*
+	 * RAID characteristics
+	 */
+	__le32 level;
+	__le32 layout;
+	__le32 stripe_sectors;
+
+	__u8 pad[452];		/* Round struct to 512 bytes. */
+				/* Always set to 0 when writing. */
+} __packed;
+
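The fixed fields above add up to 60 bytes (16 bytes of __le32 identification fields, 16 bytes of event and failure counters, 16 bytes of recovery offsets and 12 bytes of RAID characteristics), so the 452-byte pad rounds the __packed structure to exactly one 512-byte sector. A compile-time guard along these lines (purely illustrative, not part of the patch) would catch an accidental layout change:

    /* Hypothetical guard, e.g. at the top of super_sync():
     * the on-disk superblock must stay exactly one sector. */
    BUILD_BUG_ON(sizeof(struct dm_raid_superblock) != 512);
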
+static int read_disk_sb(mdk_rdev_t *rdev, int size)
+{
+	BUG_ON(!rdev->sb_page);
+
+	if (rdev->sb_loaded)
+		return 0;
+
+	if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) {
+		DMERR("Failed to read device superblock");
+		return -EINVAL;
+	}
+
+	rdev->sb_loaded = 1;
+
+	return 0;
+}
+
+static void super_sync(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	mdk_rdev_t *r, *t;
+	uint64_t failed_devices;
+	struct dm_raid_superblock *sb;
+
+	sb = page_address(rdev->sb_page);
+	failed_devices = le64_to_cpu(sb->failed_devices);
+
+	rdev_for_each(r, t, mddev)
+		if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags))
+			failed_devices |= (1ULL << r->raid_disk);
+
+	memset(sb, 0, sizeof(*sb));
+
+	sb->magic = cpu_to_le32(DM_RAID_MAGIC);
+	sb->features = cpu_to_le32(0);	/* No features yet */
+
+	sb->num_devices = cpu_to_le32(mddev->raid_disks);
+	sb->array_position = cpu_to_le32(rdev->raid_disk);
+
+	sb->events = cpu_to_le64(mddev->events);
+	sb->failed_devices = cpu_to_le64(failed_devices);
+
+	sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset);
+	sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp);
+
+	sb->level = cpu_to_le32(mddev->level);
+	sb->layout = cpu_to_le32(mddev->layout);
+	sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors);
+}
+
+/*
+ * super_load
+ *
+ * This function creates a superblock if one is not found on the device
+ * and will decide which superblock to use if there's a choice.
+ *
+ * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise
+ */
+static int super_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev)
+{
+	int ret;
+	struct dm_raid_superblock *sb;
+	struct dm_raid_superblock *refsb;
+	uint64_t events_sb, events_refsb;
+
+	rdev->sb_start = 0;
+	rdev->sb_size = sizeof(*sb);
+
+	ret = read_disk_sb(rdev, rdev->sb_size);
+	if (ret)
+		return ret;
+
+	sb = page_address(rdev->sb_page);
+	if (sb->magic != cpu_to_le32(DM_RAID_MAGIC)) {
+		super_sync(rdev->mddev, rdev);
+
+		set_bit(FirstUse, &rdev->flags);
+
+		/* Force writing of superblocks to disk */
+		set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);
+
+		/* Any superblock is better than none, choose that if given */
+		return refdev ? 0 : 1;
+	}
+
+	if (!refdev)
+		return 1;
+
+	events_sb = le64_to_cpu(sb->events);
+
+	refsb = page_address(refdev->sb_page);
+	events_refsb = le64_to_cpu(refsb->events);
+
+	return (events_sb > events_refsb) ? 1 : 0;
+}
+
+static int super_init_validation(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	int role;
+	struct raid_set *rs = container_of(mddev, struct raid_set, md);
+	uint64_t events_sb;
+	uint64_t failed_devices;
+	struct dm_raid_superblock *sb;
+	uint32_t new_devs = 0;
+	uint32_t rebuilds = 0;
+	mdk_rdev_t *r, *t;
+	struct dm_raid_superblock *sb2;
+
+	sb = page_address(rdev->sb_page);
+	events_sb = le64_to_cpu(sb->events);
+	failed_devices = le64_to_cpu(sb->failed_devices);
+
+	/*
+	 * Initialise to 1 if this is a new superblock.
+	 */
+	mddev->events = events_sb ? : 1;
+
+	/*
+	 * Reshaping is not currently allowed
+	 */
+	if ((le32_to_cpu(sb->level) != mddev->level) ||
+	    (le32_to_cpu(sb->layout) != mddev->layout) ||
+	    (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) {
+		DMERR("Reshaping arrays not yet supported.");
+		return -EINVAL;
+	}
+
+	/* We can only change the number of devices in RAID1 right now */
+	if ((rs->raid_type->level != 1) &&
+	    (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
+		DMERR("Reshaping arrays not yet supported.");
+		return -EINVAL;
+	}
+
+	if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)))
+		mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
+
+	/*
+	 * During load, we set FirstUse if a new superblock was written.
+	 * There are two reasons we might not have a superblock:
+	 * 1) The array is brand new - in which case, all of the
+	 *    devices must have their In_sync bit set.  Also,
+	 *    recovery_cp must be 0, unless forced.
+	 * 2) This is a new device being added to an old array
+	 *    and the new device needs to be rebuilt - in which
+	 *    case the In_sync bit will /not/ be set and
+	 *    recovery_cp must be MaxSector.
+	 */
+	rdev_for_each(r, t, mddev) {
+		if (!test_bit(In_sync, &r->flags)) {
+			if (!test_bit(FirstUse, &r->flags))
+				DMERR("Superblock area of "
+				      "rebuild device %d should have been "
+				      "cleared.", r->raid_disk);
+			set_bit(FirstUse, &r->flags);
+			rebuilds++;
+		} else if (test_bit(FirstUse, &r->flags))
+			new_devs++;
+	}
+
+	if (!rebuilds) {
+		if (new_devs == mddev->raid_disks) {
+			DMINFO("Superblocks created for new array");
+			set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
+		} else if (new_devs) {
+			DMERR("New device injected "
+			      "into existing array without 'rebuild' "
+			      "parameter specified");
+			return -EINVAL;
+		}
+	} else if (new_devs) {
+		DMERR("'rebuild' devices cannot be "
+		      "injected into an array with other first-time devices");
+		return -EINVAL;
+	} else if (mddev->recovery_cp != MaxSector) {
+		DMERR("'rebuild' specified while array is not in-sync");
+		return -EINVAL;
+	}
+
+	/*
+	 * Now we set the Faulty bit for those devices that are
+	 * recorded in the superblock as failed.
+	 */
+	rdev_for_each(r, t, mddev) {
+		if (!r->sb_page)
+			continue;
+		sb2 = page_address(r->sb_page);
+		sb2->failed_devices = 0;
+
+		/*
+		 * Check for any device re-ordering.
+		 */
+		if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) {
+			role = le32_to_cpu(sb2->array_position);
+			if (role != r->raid_disk) {
+				if (rs->raid_type->level != 1) {
+					rs->ti->error = "Cannot change device "
+						"positions in RAID array";
+					return -EINVAL;
+				}
+				DMINFO("RAID1 device #%d now at position #%d",
+				       role, r->raid_disk);
+			}
+
+			/*
+			 * Partial recovery is performed on
+			 * returning failed devices.
+			 */
+			if (failed_devices & (1 << role))
+				set_bit(Faulty, &r->flags);
+		}
+	}
+
+	return 0;
+}
+
+static int super_validate(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	struct dm_raid_superblock *sb = page_address(rdev->sb_page);
+
+	/*
+	 * If mddev->events is not set, we know we have not yet initialized
+	 * the array.
+	 */
+	if (!mddev->events && super_init_validation(mddev, rdev))
+		return -EINVAL;
+
+	mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */
+	rdev->mddev->bitmap_info.default_offset = 4096 >> 9;
+	if (!test_bit(FirstUse, &rdev->flags)) {
+		rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
+		if (rdev->recovery_offset != MaxSector)
+			clear_bit(In_sync, &rdev->flags);
+	}
+
+	/*
+	 * If a device comes back, set it as not In_sync and no longer faulty.
+	 */
+	if (test_bit(Faulty, &rdev->flags)) {
+		clear_bit(Faulty, &rdev->flags);
+		clear_bit(In_sync, &rdev->flags);
+		rdev->saved_raid_disk = rdev->raid_disk;
+		rdev->recovery_offset = 0;
+	}
+
+	clear_bit(FirstUse, &rdev->flags);
+
+	return 0;
+}
+
+/*
+ * Analyse superblocks and select the freshest.
+ */
+static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
+{
+	int ret;
+	mdk_rdev_t *rdev, *freshest, *tmp;
+	mddev_t *mddev = &rs->md;
+
+	freshest = NULL;
+	rdev_for_each(rdev, tmp, mddev) {
+		if (!rdev->meta_bdev)
+			continue;
+
+		ret = super_load(rdev, freshest);
+
+		switch (ret) {
+		case 1:
+			freshest = rdev;
+			break;
+		case 0:
+			break;
+		default:
+			ti->error = "Failed to load superblock";
+			return ret;
+		}
+	}
+
+	if (!freshest)
+		return 0;
+
+	/*
+	 * Validation of the freshest device provides the source of
+	 * validation for the remaining devices.
+	 */
+	ti->error = "Unable to assemble array: Invalid superblocks";
+	if (super_validate(mddev, freshest))
+		return -EINVAL;
+
+	rdev_for_each(rdev, tmp, mddev)
+		if ((rdev != freshest) && super_validate(mddev, rdev))
+			return -EINVAL;
+
+	return 0;
+}
+
+/*
  * Construct a RAID4/5/6 mapping:
  * Args:
  *	<raid_type> <#raid_params> <raid_params>		\
  *	<#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> }
  *
- * ** metadata devices are not supported yet, use '-' instead **
- *
  * <raid_params> varies by <raid_type>.  See 'parse_raid_params' for
  * details on possible <raid_params>.
  */
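For illustration only (sizes and device names below are placeholders, not taken from this series), a table line this constructor accepts could look like:

    0 3905536 raid raid5_ls 3 64 region_size 8192 \
        3 /dev/meta0 /dev/data0 /dev/meta1 /dev/data1 /dev/meta2 /dev/data2

Each data device is preceded by its metadata device, which holds the dm_raid superblock and the write-intent bitmap.
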
@@ -465,8 +962,12 @@
 	if (ret)
 		goto bad;
 
+	rs->md.sync_super = super_sync;
+	ret = analyse_superblocks(ti, rs);
+	if (ret)
+		goto bad;
+
 	INIT_WORK(&rs->md.event_work, do_table_event);
-	ti->split_io = rs->md.chunk_sectors;
 	ti->private = rs;
 
 	mutex_lock(&rs->md.reconfig_mutex);
@@ -482,6 +983,7 @@
 	rs->callbacks.congested_fn = raid_is_congested;
 	dm_table_add_target_callbacks(ti->table, &rs->callbacks);
 
+	mddev_suspend(&rs->md);
 	return 0;
 
 bad:
@@ -546,12 +1048,17 @@
 		break;
 	case STATUSTYPE_TABLE:
 		/* The string you would use to construct this array */
-		for (i = 0; i < rs->md.raid_disks; i++)
-			if (rs->dev[i].data_dev &&
+		for (i = 0; i < rs->md.raid_disks; i++) {
+			if ((rs->print_flags & DMPF_REBUILD) &&
+			    rs->dev[i].data_dev &&
 			    !test_bit(In_sync, &rs->dev[i].rdev.flags))
-				raid_param_cnt++; /* for rebuilds */
+				raid_param_cnt += 2; /* for rebuilds */
+			if (rs->dev[i].data_dev &&
+			    test_bit(WriteMostly, &rs->dev[i].rdev.flags))
+				raid_param_cnt += 2;
+		}
 
-		raid_param_cnt += (hweight64(rs->print_flags) * 2);
+		raid_param_cnt += (hweight64(rs->print_flags & ~DMPF_REBUILD) * 2);
 		if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
 			raid_param_cnt--;
 
@@ -565,7 +1072,8 @@
 			DMEMIT(" nosync");
 
 		for (i = 0; i < rs->md.raid_disks; i++)
-			if (rs->dev[i].data_dev &&
+			if ((rs->print_flags & DMPF_REBUILD) &&
+			    rs->dev[i].data_dev &&
 			    !test_bit(In_sync, &rs->dev[i].rdev.flags))
 				DMEMIT(" rebuild %u", i);
 
@@ -579,6 +1087,11 @@
 		if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
 			DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);
 
+		for (i = 0; i < rs->md.raid_disks; i++)
+			if (rs->dev[i].data_dev &&
+			    test_bit(WriteMostly, &rs->dev[i].rdev.flags))
+				DMEMIT(" write_mostly %u", i);
+
 		if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
 			DMEMIT(" max_write_behind %lu",
 			       rs->md.bitmap_info.max_write_behind);
@@ -591,9 +1104,16 @@
 			       conf ? conf->max_nr_stripes * 2 : 0);
 		}
 
+		if (rs->print_flags & DMPF_REGION_SIZE)
+			DMEMIT(" region_size %lu",
+			       rs->md.bitmap_info.chunksize >> 9);
+
 		DMEMIT(" %d", rs->md.raid_disks);
 		for (i = 0; i < rs->md.raid_disks; i++) {
-			DMEMIT(" -"); /* metadata device */
+			if (rs->dev[i].meta_dev)
+				DMEMIT(" %s", rs->dev[i].meta_dev->name);
+			else
+				DMEMIT(" -");
 
 			if (rs->dev[i].data_dev)
 				DMEMIT(" %s", rs->dev[i].data_dev->name);
@@ -650,12 +1170,13 @@
 {
 	struct raid_set *rs = ti->private;
 
+	bitmap_load(&rs->md);
 	mddev_resume(&rs->md);
 }
 
 static struct target_type raid_target = {
 	.name = "raid",
-	.version = {1, 0, 0},
+	.version = {1, 1, 0},
 	.module = THIS_MODULE,
 	.ctr = raid_ctr,
 	.dtr = raid_dtr,
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 135c2f1..d1f1d70 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -58,25 +58,30 @@
 #define NUM_SNAPSHOT_HDR_CHUNKS 1
 
 struct disk_header {
-	uint32_t magic;
+	__le32 magic;
 
 	/*
 	 * Is this snapshot valid.  There is no way of recovering
 	 * an invalid snapshot.
 	 */
-	uint32_t valid;
+	__le32 valid;
 
 	/*
 	 * Simple, incrementing version. no backward
 	 * compatibility.
 	 */
-	uint32_t version;
+	__le32 version;
 
 	/* In sectors */
-	uint32_t chunk_size;
-};
+	__le32 chunk_size;
+} __packed;
 
 struct disk_exception {
+	__le64 old_chunk;
+	__le64 new_chunk;
+} __packed;
+
+struct core_exception {
 	uint64_t old_chunk;
 	uint64_t new_chunk;
 };
@@ -169,10 +174,9 @@
 	if (!ps->area)
 		goto err_area;
 
-	ps->zero_area = vmalloc(len);
+	ps->zero_area = vzalloc(len);
 	if (!ps->zero_area)
 		goto err_zero_area;
-	memset(ps->zero_area, 0, len);
 
 	ps->header_area = vmalloc(len);
 	if (!ps->header_area)
@@ -396,32 +400,32 @@
 }
 
 static void read_exception(struct pstore *ps,
-			   uint32_t index, struct disk_exception *result)
+			   uint32_t index, struct core_exception *result)
 {
-	struct disk_exception *e = get_exception(ps, index);
+	struct disk_exception *de = get_exception(ps, index);
 
 	/* copy it */
-	result->old_chunk = le64_to_cpu(e->old_chunk);
-	result->new_chunk = le64_to_cpu(e->new_chunk);
+	result->old_chunk = le64_to_cpu(de->old_chunk);
+	result->new_chunk = le64_to_cpu(de->new_chunk);
 }
 
 static void write_exception(struct pstore *ps,
-			    uint32_t index, struct disk_exception *de)
+			    uint32_t index, struct core_exception *e)
 {
-	struct disk_exception *e = get_exception(ps, index);
+	struct disk_exception *de = get_exception(ps, index);
 
 	/* copy it */
-	e->old_chunk = cpu_to_le64(de->old_chunk);
-	e->new_chunk = cpu_to_le64(de->new_chunk);
+	de->old_chunk = cpu_to_le64(e->old_chunk);
+	de->new_chunk = cpu_to_le64(e->new_chunk);
 }
 
 static void clear_exception(struct pstore *ps, uint32_t index)
 {
-	struct disk_exception *e = get_exception(ps, index);
+	struct disk_exception *de = get_exception(ps, index);
 
 	/* clear it */
-	e->old_chunk = 0;
-	e->new_chunk = 0;
+	de->old_chunk = 0;
+	de->new_chunk = 0;
 }
 
 /*
@@ -437,13 +441,13 @@
 {
 	int r;
 	unsigned int i;
-	struct disk_exception de;
+	struct core_exception e;
 
 	/* presume the area is full */
 	*full = 1;
 
 	for (i = 0; i < ps->exceptions_per_area; i++) {
-		read_exception(ps, i, &de);
+		read_exception(ps, i, &e);
 
 		/*
 		 * If the new_chunk is pointing at the start of
@@ -451,7 +455,7 @@
 		 * is we know that we've hit the end of the
 		 * exceptions.  Therefore the area is not full.
 		 */
-		if (de.new_chunk == 0LL) {
+		if (e.new_chunk == 0LL) {
 			ps->current_committed = i;
 			*full = 0;
 			break;
@@ -460,13 +464,13 @@
 		/*
 		 * Keep track of the start of the free chunks.
 		 */
-		if (ps->next_free <= de.new_chunk)
-			ps->next_free = de.new_chunk + 1;
+		if (ps->next_free <= e.new_chunk)
+			ps->next_free = e.new_chunk + 1;
 
 		/*
 		 * Otherwise we add the exception to the snapshot.
 		 */
-		r = callback(callback_context, de.old_chunk, de.new_chunk);
+		r = callback(callback_context, e.old_chunk, e.new_chunk);
 		if (r)
 			return r;
 	}
@@ -563,7 +567,7 @@
 	ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) /
 				  sizeof(struct disk_exception);
 	ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
-			sizeof(*ps->callbacks));
+				   sizeof(*ps->callbacks));
 	if (!ps->callbacks)
 		return -ENOMEM;
 
@@ -641,12 +645,12 @@
 {
 	unsigned int i;
 	struct pstore *ps = get_info(store);
-	struct disk_exception de;
+	struct core_exception ce;
 	struct commit_callback *cb;
 
-	de.old_chunk = e->old_chunk;
-	de.new_chunk = e->new_chunk;
-	write_exception(ps, ps->current_committed++, &de);
+	ce.old_chunk = e->old_chunk;
+	ce.new_chunk = e->new_chunk;
+	write_exception(ps, ps->current_committed++, &ce);
 
 	/*
 	 * Add the callback to the back of the array.  This code
@@ -670,7 +674,7 @@
 	 * If we completely filled the current area, then wipe the next one.
 	 */
 	if ((ps->current_committed == ps->exceptions_per_area) &&
-	     zero_disk_area(ps, ps->current_area + 1))
+	    zero_disk_area(ps, ps->current_area + 1))
 		ps->valid = 0;
 
 	/*
@@ -701,7 +705,7 @@
 				    chunk_t *last_new_chunk)
 {
 	struct pstore *ps = get_info(store);
-	struct disk_exception de;
+	struct core_exception ce;
 	int nr_consecutive;
 	int r;
 
@@ -722,9 +726,9 @@
 		ps->current_committed = ps->exceptions_per_area;
 	}
 
-	read_exception(ps, ps->current_committed - 1, &de);
-	*last_old_chunk = de.old_chunk;
-	*last_new_chunk = de.new_chunk;
+	read_exception(ps, ps->current_committed - 1, &ce);
+	*last_old_chunk = ce.old_chunk;
+	*last_new_chunk = ce.new_chunk;
 
 	/*
 	 * Find number of consecutive chunks within the current area,
@@ -733,9 +737,9 @@
 	for (nr_consecutive = 1; nr_consecutive < ps->current_committed;
 	     nr_consecutive++) {
 		read_exception(ps, ps->current_committed - 1 - nr_consecutive,
-			       &de);
-		if (de.old_chunk != *last_old_chunk - nr_consecutive ||
-		    de.new_chunk != *last_new_chunk - nr_consecutive)
+			       &ce);
+		if (ce.old_chunk != *last_old_chunk - nr_consecutive ||
+		    ce.new_chunk != *last_new_chunk - nr_consecutive)
 			break;
 	}
 
@@ -753,7 +757,7 @@
 	for (i = 0; i < nr_merged; i++)
 		clear_exception(ps, ps->current_committed - 1 - i);
 
-	r = area_io(ps, WRITE);
+	r = area_io(ps, WRITE_FLUSH_FUA);
 	if (r < 0)
 		return r;
 
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 9ecff5f..6f75887 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -30,16 +30,6 @@
 	((ti)->type->name == dm_snapshot_merge_target_name)
 
 /*
- * The percentage increment we will wake up users at
- */
-#define WAKE_UP_PERCENT 5
-
-/*
- * kcopyd priority of snapshot operations
- */
-#define SNAPSHOT_COPY_PRIORITY 2
-
-/*
  * The size of the mempool used to track chunks in use.
  */
 #define MIN_IOS 256
@@ -180,6 +170,13 @@
 	 * kcopyd.
 	 */
 	int started;
+
+	/*
+	 * For writing a complete chunk, bypassing the copy.
+	 */
+	struct bio *full_bio;
+	bio_end_io_t *full_bio_end_io;
+	void *full_bio_private;
 };
 
 /*
@@ -1055,8 +1052,7 @@
 
 	s = kmalloc(sizeof(*s), GFP_KERNEL);
 	if (!s) {
-		ti->error = "Cannot allocate snapshot context private "
-		    "structure";
+		ti->error = "Cannot allocate private snapshot structure";
 		r = -ENOMEM;
 		goto bad;
 	}
@@ -1380,6 +1376,7 @@
 	struct dm_snapshot *s = pe->snap;
 	struct bio *origin_bios = NULL;
 	struct bio *snapshot_bios = NULL;
+	struct bio *full_bio = NULL;
 	int error = 0;
 
 	if (!success) {
@@ -1415,10 +1412,15 @@
 	 */
 	dm_insert_exception(&s->complete, e);
 
- out:
+out:
 	dm_remove_exception(&pe->e);
 	snapshot_bios = bio_list_get(&pe->snapshot_bios);
 	origin_bios = bio_list_get(&pe->origin_bios);
+	full_bio = pe->full_bio;
+	if (full_bio) {
+		full_bio->bi_end_io = pe->full_bio_end_io;
+		full_bio->bi_private = pe->full_bio_private;
+	}
 	free_pending_exception(pe);
 
 	increment_pending_exceptions_done_count();
@@ -1426,10 +1428,15 @@
 	up_write(&s->lock);
 
 	/* Submit any pending write bios */
-	if (error)
+	if (error) {
+		if (full_bio)
+			bio_io_error(full_bio);
 		error_bios(snapshot_bios);
-	else
+	} else {
+		if (full_bio)
+			bio_endio(full_bio, 0);
 		flush_bios(snapshot_bios);
+	}
 
 	retry_origin_bios(s, origin_bios);
 }
@@ -1480,8 +1487,33 @@
 	dest.count = src.count;
 
 	/* Hand over to kcopyd */
-	dm_kcopyd_copy(s->kcopyd_client,
-		    &src, 1, &dest, 0, copy_callback, pe);
+	dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe);
+}
+
+static void full_bio_end_io(struct bio *bio, int error)
+{
+	void *callback_data = bio->bi_private;
+
+	dm_kcopyd_do_callback(callback_data, 0, error ? 1 : 0);
+}
+
+static void start_full_bio(struct dm_snap_pending_exception *pe,
+			   struct bio *bio)
+{
+	struct dm_snapshot *s = pe->snap;
+	void *callback_data;
+
+	pe->full_bio = bio;
+	pe->full_bio_end_io = bio->bi_end_io;
+	pe->full_bio_private = bio->bi_private;
+
+	callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client,
+						   copy_callback, pe);
+
+	bio->bi_end_io = full_bio_end_io;
+	bio->bi_private = callback_data;
+
+	generic_make_request(bio);
 }
 
 static struct dm_snap_pending_exception *
@@ -1519,6 +1551,7 @@
 	bio_list_init(&pe->origin_bios);
 	bio_list_init(&pe->snapshot_bios);
 	pe->started = 0;
+	pe->full_bio = NULL;
 
 	if (s->store->type->prepare_exception(s->store, &pe->e)) {
 		free_pending_exception(pe);
@@ -1612,10 +1645,19 @@
 		}
 
 		remap_exception(s, &pe->e, bio, chunk);
-		bio_list_add(&pe->snapshot_bios, bio);
 
 		r = DM_MAPIO_SUBMITTED;
 
+		if (!pe->started &&
+		    bio->bi_size == (s->store->chunk_size << SECTOR_SHIFT)) {
+			pe->started = 1;
+			up_write(&s->lock);
+			start_full_bio(pe, bio);
+			goto out;
+		}
+
+		bio_list_add(&pe->snapshot_bios, bio);
+
 		if (!pe->started) {
 			/* this is protected by snap->lock */
 			pe->started = 1;
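A worked example of the new fast path (illustrative helper, not part of the patch): the snapshot target sets ti->split_io to the chunk size, so a bio can never span chunks, and a bio whose byte count equals the chunk size therefore overwrites the whole chunk and may bypass the kcopyd copy:

    /* chunk_size is in 512-byte sectors, e.g. 8 sectors -> 4096 bytes;
     * bi_size is in bytes.  Mirrors the size test added above. */
    static bool overwrites_whole_chunk(struct bio *bio, unsigned chunk_size)
    {
    	return bio->bi_size == (chunk_size << SECTOR_SHIFT);
    }
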
@@ -1628,9 +1670,9 @@
 		map_context->ptr = track_chunk(s, chunk);
 	}
 
- out_unlock:
+out_unlock:
 	up_write(&s->lock);
- out:
+out:
 	return r;
 }
 
@@ -1974,7 +2016,7 @@
 			pe_to_start_now = pe;
 		}
 
- next_snapshot:
+next_snapshot:
 		up_write(&snap->lock);
 
 		if (pe_to_start_now) {
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index bfe9c23..986b875 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -54,7 +54,6 @@
 	sector_t *highs;
 	struct dm_target *targets;
 
-	unsigned discards_supported:1;
 	unsigned integrity_supported:1;
 
 	/*
@@ -154,12 +153,11 @@
 		return NULL;
 
 	size = nmemb * elem_size;
-	addr = vmalloc(size);
-	if (addr)
-		memset(addr, 0, size);
+	addr = vzalloc(size);
 
 	return addr;
 }
+EXPORT_SYMBOL(dm_vcalloc);
 
 /*
  * highs, and targets are managed as dynamic arrays during a
@@ -209,7 +207,6 @@
 	INIT_LIST_HEAD(&t->devices);
 	INIT_LIST_HEAD(&t->target_callbacks);
 	atomic_set(&t->holders, 0);
-	t->discards_supported = 1;
 
 	if (!num_targets)
 		num_targets = KEYS_PER_NODE;
@@ -281,6 +278,7 @@
 {
 	atomic_inc(&t->holders);
 }
+EXPORT_SYMBOL(dm_table_get);
 
 void dm_table_put(struct dm_table *t)
 {
@@ -290,6 +288,7 @@
 	smp_mb__before_atomic_dec();
 	atomic_dec(&t->holders);
 }
+EXPORT_SYMBOL(dm_table_put);
 
 /*
  * Checks to see if we need to extend highs or targets.
@@ -455,13 +454,14 @@
  * Add a device to the list, or just increment the usage count if
  * it's already present.
  */
-static int __table_get_device(struct dm_table *t, struct dm_target *ti,
-		      const char *path, fmode_t mode, struct dm_dev **result)
+int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
+		  struct dm_dev **result)
 {
 	int r;
 	dev_t uninitialized_var(dev);
 	struct dm_dev_internal *dd;
 	unsigned int major, minor;
+	struct dm_table *t = ti->table;
 
 	BUG_ON(!t);
 
@@ -509,6 +509,7 @@
 	*result = &dd->dm_dev;
 	return 0;
 }
+EXPORT_SYMBOL(dm_get_device);
 
 int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
 			 sector_t start, sector_t len, void *data)
@@ -539,23 +540,15 @@
 	 * If not we'll force DM to use PAGE_SIZE or
 	 * smaller I/O, just to be safe.
 	 */
-
-	if (q->merge_bvec_fn && !ti->type->merge)
+	if (dm_queue_merge_is_compulsory(q) && !ti->type->merge)
 		blk_limits_max_hw_sectors(limits,
 					  (unsigned int) (PAGE_SIZE >> 9));
 	return 0;
 }
 EXPORT_SYMBOL_GPL(dm_set_device_limits);
 
-int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
-		  struct dm_dev **result)
-{
-	return __table_get_device(ti->table, ti, path, mode, result);
-}
-
-
 /*
- * Decrement a devices use count and remove it if necessary.
+ * Decrement a device's use count and remove it if necessary.
  */
 void dm_put_device(struct dm_target *ti, struct dm_dev *d)
 {
@@ -568,6 +561,7 @@
 		kfree(dd);
 	}
 }
+EXPORT_SYMBOL(dm_put_device);
 
 /*
  * Checks to see if the target joins onto the end of the table.
@@ -791,8 +785,9 @@
 
 	t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
 
-	if (!tgt->num_discard_requests)
-		t->discards_supported = 0;
+	if (!tgt->num_discard_requests && tgt->discards_supported)
+		DMWARN("%s: %s: ignoring discards_supported because num_discard_requests is zero.",
+		       dm_device_name(t->md), type);
 
 	return 0;
 
@@ -802,6 +797,63 @@
 	return r;
 }
 
+/*
+ * Target argument parsing helpers.
+ */
+static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
+			     unsigned *value, char **error, unsigned grouped)
+{
+	const char *arg_str = dm_shift_arg(arg_set);
+
+	if (!arg_str ||
+	    (sscanf(arg_str, "%u", value) != 1) ||
+	    (*value < arg->min) ||
+	    (*value > arg->max) ||
+	    (grouped && arg_set->argc < *value)) {
+		*error = arg->error;
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+int dm_read_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
+		unsigned *value, char **error)
+{
+	return validate_next_arg(arg, arg_set, value, error, 0);
+}
+EXPORT_SYMBOL(dm_read_arg);
+
+int dm_read_arg_group(struct dm_arg *arg, struct dm_arg_set *arg_set,
+		      unsigned *value, char **error)
+{
+	return validate_next_arg(arg, arg_set, value, error, 1);
+}
+EXPORT_SYMBOL(dm_read_arg_group);
+
+const char *dm_shift_arg(struct dm_arg_set *as)
+{
+	char *r;
+
+	if (as->argc) {
+		as->argc--;
+		r = *as->argv;
+		as->argv++;
+		return r;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(dm_shift_arg);
+
+void dm_consume_args(struct dm_arg_set *as, unsigned num_args)
+{
+	BUG_ON(as->argc < num_args);
+	as->argc -= num_args;
+	as->argv += num_args;
+}
+EXPORT_SYMBOL(dm_consume_args);
+
 static int dm_table_set_type(struct dm_table *t)
 {
 	unsigned i;
@@ -1077,11 +1129,13 @@
 		t->event_fn(t->event_context);
 	mutex_unlock(&_event_lock);
 }
+EXPORT_SYMBOL(dm_table_event);
 
 sector_t dm_table_get_size(struct dm_table *t)
 {
 	return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0;
 }
+EXPORT_SYMBOL(dm_table_get_size);
 
 struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index)
 {
@@ -1194,9 +1248,45 @@
 			       blk_get_integrity(template_disk));
 }
 
+static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev,
+				sector_t start, sector_t len, void *data)
+{
+	unsigned flush = (*(unsigned *)data);
+	struct request_queue *q = bdev_get_queue(dev->bdev);
+
+	return q && (q->flush_flags & flush);
+}
+
+static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
+{
+	struct dm_target *ti;
+	unsigned i = 0;
+
+	/*
+	 * Require at least one underlying device to support flushes.
+	 * t->devices includes internal dm devices such as mirror logs
+	 * so we need to use iterate_devices here, which targets
+	 * supporting flushes must provide.
+	 */
+	while (i < dm_table_get_num_targets(t)) {
+		ti = dm_table_get_target(t, i++);
+
+		if (!ti->num_flush_requests)
+			continue;
+
+		if (ti->type->iterate_devices &&
+		    ti->type->iterate_devices(ti, device_flush_capable, &flush))
+			return 1;
+	}
+
+	return 0;
+}
+
 void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 			       struct queue_limits *limits)
 {
+	unsigned flush = 0;
+
 	/*
 	 * Copy table's limits to the DM device's request_queue
 	 */
@@ -1207,6 +1297,13 @@
 	else
 		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
 
+	if (dm_table_supports_flush(t, REQ_FLUSH)) {
+		flush |= REQ_FLUSH;
+		if (dm_table_supports_flush(t, REQ_FUA))
+			flush |= REQ_FUA;
+	}
+	blk_queue_flush(q, flush);
+
 	dm_table_set_integrity(t);
 
 	/*
@@ -1237,6 +1334,7 @@
 {
 	return t->mode;
 }
+EXPORT_SYMBOL(dm_table_get_mode);
 
 static void suspend_targets(struct dm_table *t, unsigned postsuspend)
 {
@@ -1345,6 +1443,7 @@
 {
 	return t->md;
 }
+EXPORT_SYMBOL(dm_table_get_md);
 
 static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
 				  sector_t start, sector_t len, void *data)
@@ -1359,19 +1458,19 @@
 	struct dm_target *ti;
 	unsigned i = 0;
 
-	if (!t->discards_supported)
-		return 0;
-
 	/*
 	 * Unless any target used by the table set discards_supported,
 	 * require at least one underlying device to support discards.
 	 * t->devices includes internal dm devices such as mirror logs
 	 * so we need to use iterate_devices here, which targets
-	 * supporting discard must provide.
+	 * supporting discard selectively must provide.
 	 */
 	while (i < dm_table_get_num_targets(t)) {
 		ti = dm_table_get_target(t, i++);
 
+		if (!ti->num_discard_requests)
+			continue;
+
 		if (ti->discards_supported)
 			return 1;
 
@@ -1382,13 +1481,3 @@
 
 	return 0;
 }
-
-EXPORT_SYMBOL(dm_vcalloc);
-EXPORT_SYMBOL(dm_get_device);
-EXPORT_SYMBOL(dm_put_device);
-EXPORT_SYMBOL(dm_table_event);
-EXPORT_SYMBOL(dm_table_get_size);
-EXPORT_SYMBOL(dm_table_get_mode);
-EXPORT_SYMBOL(dm_table_get_md);
-EXPORT_SYMBOL(dm_table_put);
-EXPORT_SYMBOL(dm_table_get);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 0cf68b4..52b39f3 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -37,6 +37,8 @@
 static unsigned int major = 0;
 static unsigned int _major = 0;
 
+static DEFINE_IDR(_minor_idr);
+
 static DEFINE_SPINLOCK(_minor_lock);
 /*
  * For bio-based dm.
@@ -109,6 +111,7 @@
 #define DMF_FREEING 3
 #define DMF_DELETING 4
 #define DMF_NOFLUSH_SUSPENDING 5
+#define DMF_MERGE_IS_OPTIONAL 6
 
 /*
  * Work processed by per-device workqueue.
@@ -313,6 +316,12 @@
 
 	while (i--)
 		_exits[i]();
+
+	/*
+	 * Should be empty by this point.
+	 */
+	idr_remove_all(&_minor_idr);
+	idr_destroy(&_minor_idr);
 }
 
 /*
@@ -1171,7 +1180,8 @@
 
 		/*
 		 * Even though the device advertised discard support,
-		 * reconfiguration might have changed that since the
+		 * that does not mean every target supports it, and
+		 * reconfiguration might also have changed that since the
 		 * check was performed.
 		 */
 		if (!ti->num_discard_requests)
@@ -1705,8 +1715,6 @@
 /*-----------------------------------------------------------------
  * An IDR is used to keep track of allocated minor numbers.
  *---------------------------------------------------------------*/
-static DEFINE_IDR(_minor_idr);
-
 static void free_minor(int minor)
 {
 	spin_lock(&_minor_lock);
@@ -1800,7 +1808,6 @@
 	blk_queue_make_request(md->queue, dm_request);
 	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
 	blk_queue_merge_bvec(md->queue, dm_merge_bvec);
-	blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
 }
 
 /*
@@ -1986,6 +1993,59 @@
 }
 
 /*
+ * Return 1 if the queue has a compulsory merge_bvec_fn function.
+ *
+ * If this function returns 0, then the device is either a non-dm
+ * device without a merge_bvec_fn, or it is a dm device that is
+ * able to split any bios it receives that are too big.
+ */
+int dm_queue_merge_is_compulsory(struct request_queue *q)
+{
+	struct mapped_device *dev_md;
+
+	if (!q->merge_bvec_fn)
+		return 0;
+
+	if (q->make_request_fn == dm_request) {
+		dev_md = q->queuedata;
+		if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
+			return 0;
+	}
+
+	return 1;
+}
+
+static int dm_device_merge_is_compulsory(struct dm_target *ti,
+					 struct dm_dev *dev, sector_t start,
+					 sector_t len, void *data)
+{
+	struct block_device *bdev = dev->bdev;
+	struct request_queue *q = bdev_get_queue(bdev);
+
+	return dm_queue_merge_is_compulsory(q);
+}
+
+/*
+ * Return 1 if it is acceptable to ignore merge_bvec_fn based
+ * on the properties of the underlying devices.
+ */
+static int dm_table_merge_is_optional(struct dm_table *table)
+{
+	unsigned i = 0;
+	struct dm_target *ti;
+
+	while (i < dm_table_get_num_targets(table)) {
+		ti = dm_table_get_target(table, i++);
+
+		if (ti->type->iterate_devices &&
+		    ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL))
+			return 0;
+	}
+
+	return 1;
+}
+
+/*
  * Returns old map, which caller must destroy.
  */
 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
@@ -1995,6 +2055,7 @@
 	struct request_queue *q = md->queue;
 	sector_t size;
 	unsigned long flags;
+	int merge_is_optional;
 
 	size = dm_table_get_size(t);
 
@@ -2020,10 +2081,16 @@
 
 	__bind_mempools(md, t);
 
+	merge_is_optional = dm_table_merge_is_optional(t);
+
 	write_lock_irqsave(&md->map_lock, flags);
 	old_map = md->map;
 	md->map = t;
 	dm_table_set_restrictions(t, q, limits);
+	if (merge_is_optional)
+		set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
+	else
+		clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
 	write_unlock_irqrestore(&md->map_lock, flags);
 
 	return old_map;
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 1aaf167..6745dbd 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -66,6 +66,8 @@
 void dm_table_free_md_mempools(struct dm_table *t);
 struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
 
+int dm_queue_merge_is_compulsory(struct request_queue *q);
+
 void dm_lock_md_type(struct mapped_device *md);
 void dm_unlock_md_type(struct mapped_device *md);
 void dm_set_md_type(struct mapped_device *md, unsigned type);
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 4427e04..3fa1f3d 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -208,6 +208,49 @@
 int dm_register_target(struct target_type *t);
 void dm_unregister_target(struct target_type *t);
 
+/*
+ * Target argument parsing.
+ */
+struct dm_arg_set {
+	unsigned argc;
+	char **argv;
+};
+
+/*
+ * The minimum and maximum value of a numeric argument, together with
+ * the error message to use if the number is found to be outside that range.
+ */
+struct dm_arg {
+	unsigned min;
+	unsigned max;
+	char *error;
+};
+
+/*
+ * Validate the next argument, either returning it as *value or, if invalid,
+ * returning -EINVAL and setting *error.
+ */
+int dm_read_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
+		unsigned *value, char **error);
+
+/*
+ * Process the next argument as the start of a group containing between
+ * arg->min and arg->max further arguments. Either return the size as
+ * *num_args or, if invalid, return -EINVAL and set *error.
+ */
+int dm_read_arg_group(struct dm_arg *arg, struct dm_arg_set *arg_set,
+		      unsigned *num_args, char **error);
+
+/*
+ * Return the current argument and shift to the next.
+ */
+const char *dm_shift_arg(struct dm_arg_set *as);
+
+/*
+ * Move through num_args arguments.
+ */
+void dm_consume_args(struct dm_arg_set *as, unsigned num_args);
+
 /*-----------------------------------------------------------------
  * Functions for creating and manipulating mapped devices.
  * Drop the reference with dm_put when you finish with the object.
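As a usage sketch (a hypothetical feature parser for an optional-argument group, not code from this series), the helpers above combine like this:

    static int example_parse_features(struct dm_arg_set *as, struct dm_target *ti)
    {
    	int r;
    	unsigned argc;
    	const char *arg_name;
    	static struct dm_arg _args[] = {
    		{0, 2, "Invalid number of feature arguments"},
    	};

    	/* "<#features> <feature>..." is consumed as one group. */
    	r = dm_read_arg_group(_args, as, &argc, &ti->error);
    	if (r)
    		return r;

    	while (argc--) {
    		arg_name = dm_shift_arg(as);

    		if (!strcasecmp(arg_name, "example_feature"))
    			continue;	/* act on the recognised feature */

    		ti->error = "Unrecognised feature requested";
    		return -EINVAL;
    	}

    	return 0;
    }
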
diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h
index 3708455..0cb8eff 100644
--- a/include/linux/dm-ioctl.h
+++ b/include/linux/dm-ioctl.h
@@ -267,9 +267,9 @@
 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	20
+#define DM_VERSION_MINOR	21
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2011-02-02)"
+#define DM_VERSION_EXTRA	"-ioctl (2011-07-06)"
 
 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */
diff --git a/include/linux/dm-kcopyd.h b/include/linux/dm-kcopyd.h
index 298d587..5e54458 100644
--- a/include/linux/dm-kcopyd.h
+++ b/include/linux/dm-kcopyd.h
@@ -42,5 +42,20 @@
 		   unsigned num_dests, struct dm_io_region *dests,
 		   unsigned flags, dm_kcopyd_notify_fn fn, void *context);
 
+/*
+ * Prepare a callback and submit it via the kcopyd thread.
+ *
+ * dm_kcopyd_prepare_callback allocates a callback structure and returns it.
+ * It must not be called from interrupt context.
+ * The returned value should be passed into dm_kcopyd_do_callback.
+ *
+ * dm_kcopyd_do_callback submits the callback.
+ * It may be called from interrupt context.
+ * The callback is issued from the kcopyd thread.
+ */
+void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc,
+				 dm_kcopyd_notify_fn fn, void *context);
+void dm_kcopyd_do_callback(void *job, int read_err, unsigned long write_err);
+
 #endif	/* __KERNEL__ */
 #endif	/* _LINUX_DM_KCOPYD_H */
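
A condensed usage sketch (all names below are placeholders; the in-tree caller added by this series is start_full_bio() in dm-snap.c): the callback is prepared in process context, the returned cookie travels with the asynchronous I/O, and completion may be signalled later, even from interrupt context:

    /* Process context: allocate and remember the deferred callback. */
    void *cookie = dm_kcopyd_prepare_callback(kc, notify_fn, context);

    /* ...submit the I/O that replaces the kcopyd copy... */

    /* Completion path (interrupt context is fine): queue the callback,
     * reporting a write error if the I/O failed. */
    dm_kcopyd_do_callback(cookie, 0, error ? 1 : 0);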