Add support for setting RAID stride and stripe-width via mke2fs and tune2fs

This is useful for mballoc to align block allocation on the RAID
stripe boundaries.

Signed-off-by: Rupesh Thakare <rupesh@clusterfs.com>
Signed-off-by: Andreas Dilger <adilger@clusterfs.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
diff --git a/misc/mke2fs.8.in b/misc/mke2fs.8.in
index 9afd7f4..65c42bf 100644
--- a/misc/mke2fs.8.in
+++ b/misc/mke2fs.8.in
@@ -179,10 +179,23 @@
 following extended options are supported:
 .RS 1.2i
 .TP
-.BI stride= stripe-size
+.BI stride= stride-size
 Configure the filesystem for a RAID array with
-.I stripe-size
-filesystem blocks per stripe.
+.I stride-size
+filesystem blocks. This is the number of blocks read or written to disk
+before moving to the next disk. This mostly affects placement of filesystem
+metadata like bitmaps at
+.BR mke2fs (8)
+time to avoid placing them on a single disk, which can hurt performance.
+It may also be used by the block allocator.
+.TP
+.BI stripe-width= stripe-width
+Configure the filesystem for a RAID array with
+.I stripe-width
+filesystem blocks per stripe. This is typically stride-size * N, where
+N is the number of data disks in the RAID (e.g. RAID 5 N+1, RAID 6 N+2).
+This allows the block allocator to prevent read-modify-write of the
+parity in a RAID stripe if possible when the data is written.
 .TP
 .BI resize= max-online-resize
 Reserve enough space so that the block group descriptor table can grow
diff --git a/misc/mke2fs.c b/misc/mke2fs.c
index ede6722..cfc8116 100644
--- a/misc/mke2fs.c
+++ b/misc/mke2fs.c
@@ -756,7 +756,7 @@
 static void parse_extended_opts(struct ext2_super_block *param, 
 				const char *opts)
 {
-	char	*buf, *token, *next, *p, *arg;
+	char	*buf, *token, *next, *p, *arg, *badopt = "";
 	int	len;
 	int	r_usage = 0;
 
@@ -783,16 +783,32 @@
 		if (strcmp(token, "stride") == 0) {
 			if (!arg) {
 				r_usage++;
+				badopt = token;
 				continue;
 			}
-			fs_stride = strtoul(arg, &p, 0);
-			if (*p || (fs_stride == 0)) {
+			param->s_raid_stride = strtoul(arg, &p, 0);
+			if (*p || (param->s_raid_stride == 0)) {
 				fprintf(stderr,
 					_("Invalid stride parameter: %s\n"),
 					arg);
 				r_usage++;
 				continue;
 			}
+		} else if (strcmp(token, "stripe-width") == 0 ||
+			   strcmp(token, "stripe_width") == 0) {
+			if (!arg) {
+				r_usage++;
+				badopt = token;
+				continue;
+			}
+			param->s_raid_stripe_width = strtoul(arg, &p, 0);
+			if (*p || (param->s_raid_stripe_width == 0)) {
+				fprintf(stderr,
+					_("Invalid stripe-width parameter: %s\n"),
+					arg);
+				r_usage++;
+				continue;
+			}
 		} else if (!strcmp(token, "resize")) {
 			unsigned long resize, bpg, rsv_groups;
 			unsigned long group_desc_count, desc_blocks;
@@ -801,6 +817,7 @@
 
 			if (!arg) {
 				r_usage++;
+				badopt = token;
 				continue;
 			}
 
@@ -851,21 +868,31 @@
 			}
 		} else if (!strcmp(token, "test_fs")) {
 			param->s_flags |= EXT2_FLAGS_TEST_FILESYS;
-		} else
+		} else {
 			r_usage++;
+			badopt = token;
+		}
 	}
 	if (r_usage) {
-		fprintf(stderr, _("\nBad options specified.\n\n"
+		fprintf(stderr, _("\nBad option(s) specified: %s\n\n"
 			"Extended options are separated by commas, "
 			"and may take an argument which\n"
 			"\tis set off by an equals ('=') sign.\n\n"
 			"Valid extended options are:\n"
-			"\tstride=<stride length in blocks>\n"
-			"\tresize=<resize maximum size in blocks>\n"
-			"\ttest_fs\n"));
+			"\tstride=<RAID per-disk data chunk in blocks>\n"
+			"\tstripe-width=<RAID stride * data disks in blocks>\n"
+			"\tresize=<resize maximum size in blocks>\n\n"
+			"\ttest_fs\n"),
+			badopt);
 		free(buf);
 		exit(1);
 	}
+	if (param->s_raid_stride &&
+	    (param->s_raid_stripe_width % param->s_raid_stride) != 0)
+		fprintf(stderr, _("\nWarning: RAID stripe-width %u not an even "
+				  "multiple of stride %u.\n\n"),
+			param->s_raid_stripe_width, param->s_raid_stride);
+
 	free(buf);
 }	
 
@@ -1643,7 +1670,7 @@
 		test_disk(fs, &bb_list);
 
 	handle_bad_blocks(fs, bb_list);
-	fs->stride = fs->super->s_raid_stride = fs_stride;
+	fs->stride = fs_stride = fs->super->s_raid_stride;
 	retval = ext2fs_allocate_tables(fs);
 	if (retval) {
 		com_err(program_name, retval,
diff --git a/misc/tune2fs.8.in b/misc/tune2fs.8.in
index d1edbd9..435556f 100644
--- a/misc/tune2fs.8.in
+++ b/misc/tune2fs.8.in
@@ -150,10 +150,28 @@
 .TP
 .BI \-E " extended-options"
 Set extended options for the filesystem.  Extended options are comma
-separated, and may take an argument using the equals ('=') sign.  The 
-following extended options are supported:
+separated, and may take an argument using the equals ('=') sign.
+The following extended options are supported:
 .RS 1.2i
 .TP
+.BI stride= stride-size
+Configure the filesystem for a RAID array with
+.I stride-size
+filesystem blocks. This is the number of blocks read or written to disk
+before moving to the next disk. This mostly affects placement of filesystem
+metadata like bitmaps at
+.BR mke2fs (8)
+time to avoid placing them on a single disk, which can hurt performance.
+It may also be used by the block allocator.
+.TP
+.BI stripe-width= stripe-width
+Configure the filesystem for a RAID array with
+.I stripe-width
+filesystem blocks per stripe. This is typically stride-size * N, where
+N is the number of data disks in the RAID (e.g. RAID 5 N+1, RAID 6 N+2).
+This allows the block allocator to prevent read-modify-write of the
+parity in a RAID stripe if possible when the data is written.
+.TP
 .B test_fs
 Set a flag in the filesystem superblock indicating that it may be
 mounted using experimental kernel code, such as the ext4dev filesystem.
diff --git a/misc/tune2fs.c b/misc/tune2fs.c
index d6999e5..083d6f5 100644
--- a/misc/tune2fs.c
+++ b/misc/tune2fs.c
@@ -81,6 +81,8 @@
 static int open_flag;
 static char *features_cmd;
 static char *mntopts_cmd;
+static int stride, stripe_width;
+static int stride_set, stripe_width_set;
 static char *extended_cmd;
 
 int journal_size, journal_flags;
@@ -797,7 +799,36 @@
 			fs->super->s_flags &= ~EXT2_FLAGS_TEST_FILESYS;
 			printf("Clearing test filesystem flag\n");
 			ext2fs_mark_super_dirty(fs);
-		} else
+		} else if (strcmp(token, "stride") == 0) {
+			if (!arg) {
+				r_usage++;
+				continue;
+			}
+			stride = strtoul(arg, &p, 0);
+			if (*p || (stride == 0)) {
+				fprintf(stderr,
+				       _("Invalid RAID stride: %s\n"),
+					arg);
+				r_usage++;
+				continue;
+			}
+			stride_set = 1;
+		} else if (strcmp(token, "stripe-width") == 0 ||
+			   strcmp(token, "stripe_width") == 0) {
+			if (!arg) {
+				r_usage++;
+				continue;
+			}
+			stripe_width = strtoul(arg, &p, 0);
+			if (*p || (stripe_width == 0)) {
+				fprintf(stderr,
+					_("Invalid RAID stripe-width: %s\n"),
+					arg);
+				r_usage++;
+				continue;
+			}
+			stripe_width_set = 1;
+		} else 
 			r_usage++;
 	}
 	if (r_usage) {
@@ -806,6 +837,8 @@
 			"and may take an argument which\n"
 			"\tis set off by an equals ('=') sign.\n\n"
 			"Valid extended options are:\n"
+			"\tstride=<RAID per-disk chunk size in blocks>\n"
+			"\tstripe-width=<RAID stride*data disks in blocks>\n"
 			"\ttest_fs\n"
 			"\t^test_fs\n"));
 		free(buf);
@@ -1006,6 +1039,16 @@
 
 	if (l_flag)
 		list_super (sb);
+	if (stride_set) {
+		sb->s_raid_stride = stride;
+		ext2fs_mark_super_dirty(fs);
+		printf(_("Setting stride size to %d\n"), stride);
+	}
+	if (stripe_width_set) {
+		sb->s_raid_stripe_width = stripe_width;
+		ext2fs_mark_super_dirty(fs);
+		printf(_("Setting stripe width to %d\n"), stripe_width);
+	}
 	remove_error_table(&et_ext2_error_table);
 	return (ext2fs_close (fs) ? 1 : 0);
 }