[XFS] Introduce two new mount options (nolargeio/largeio) to allow
filesystems to expose the filesystem stripe width in stat(2) rather than
the page cache size. This allows applications requiring high bandwidth to
easily determine the optimum I/O size for the underlying filesystem. The
default is to report the page cache size (i.e. "nolargeio").

SGI-PV: 942818
SGI-Modid: xfs-linux:xfs-kern:23830a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Nathan Scott <nathans@sgi.com>
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index fa87279..f6f6b67 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -189,7 +189,7 @@
 		break;
 	}
 
-	inode->i_blksize = PAGE_CACHE_SIZE;
+	inode->i_blksize = xfs_preferred_iosize(mp);
 	inode->i_generation = ip->i_d.di_gen;
 	i_size_write(inode, ip->i_d.di_size);
 	inode->i_blocks =
diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c
index 268f45b..6199964 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.c
+++ b/fs/xfs/linux-2.6/xfs_vnode.c
@@ -124,6 +124,7 @@
 	inode->i_mtime	    = vap->va_mtime;
 	inode->i_ctime	    = vap->va_ctime;
 	inode->i_atime	    = vap->va_atime;
+	inode->i_blksize    = vap->va_blocksize;
 	if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
 		inode->i_flags |= S_IMMUTABLE;
 	else
diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h
index c93cb28..90d9d56 100644
--- a/fs/xfs/xfs_clnt.h
+++ b/fs/xfs/xfs_clnt.h
@@ -106,5 +106,7 @@
 #define XFSMNT_IHASHSIZE	0x20000000	/* inode hash table size */
 #define XFSMNT_DIRSYNC		0x40000000	/* sync creat,link,unlink,rename
 						 * symlink,mkdir,rmdir,mknod */
+#define XFSMNT_COMPAT_IOSIZE	0x80000000	/* don't report large preferred
+						 * I/O size in stat() */
 
 #endif	/* __XFS_CLNT_H__ */
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 0653bee..b71af18 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -421,6 +421,9 @@
 						 * allocation */
 #define XFS_MOUNT_IHASHSIZE	0x00100000	/* inode hash table size */
 #define XFS_MOUNT_DIRSYNC	0x00200000	/* synchronous directory ops */
+#define XFS_MOUNT_COMPAT_IOSIZE	0x00400000	/* don't report large preferred
+						 * I/O size in stat() */
+
 
 /*
  * Default minimum read and write sizes.
@@ -442,6 +445,30 @@
 #define	XFS_WSYNC_READIO_LOG	15	/* 32K */
 #define	XFS_WSYNC_WRITEIO_LOG	14	/* 16K */
 
+/*
+ * Allow large block sizes to be reported to userspace programs if the
+ * "largeio" mount option is used. 
+ *
+ * If compatibility mode is specified, simply return the basic unit of caching
+ * so that we don't get inefficient read/modify/write I/O from user apps.
+ * Otherwise....
+ *
+ * If the underlying volume is a stripe, then return the stripe width in bytes
+ * as the recommended I/O size. It is not a stripe and we've set a default
+ * buffered I/O size, return that, otherwise return the compat default.
+ */
+static inline unsigned long
+xfs_preferred_iosize(xfs_mount_t *mp)
+{
+	if (mp->m_flags & XFS_MOUNT_COMPAT_IOSIZE)
+		return PAGE_CACHE_SIZE;
+	return (mp->m_swidth ?
+		(mp->m_swidth << mp->m_sb.sb_blocklog) :
+		((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ?
+			(1 << (int)MAX(mp->m_readio_log, mp->m_writeio_log)) :
+			PAGE_CACHE_SIZE));
+}
+
 #define XFS_MAXIOFFSET(mp)	((mp)->m_maxioffset)
 
 #define XFS_FORCED_SHUTDOWN(mp)	((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN)
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 9142351..7227bae 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -307,6 +307,9 @@
 	if (ap->flags & XFSMNT_DIRSYNC)
 		mp->m_flags |= XFS_MOUNT_DIRSYNC;
 
+	if (ap->flags & XFSMNT_COMPAT_IOSIZE)
+		mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
+
 	/*
 	 * no recovery flag requires a read-only mount
 	 */
@@ -1645,6 +1648,9 @@
 #define MNTOPT_64BITINODE   "inode64"	/* inodes can be allocated anywhere */
 #define MNTOPT_IKEEP	"ikeep"		/* do not free empty inode clusters */
 #define MNTOPT_NOIKEEP	"noikeep"	/* free empty inode clusters */
+#define MNTOPT_LARGEIO	   "largeio"	/* report large I/O sizes in stat() */
+#define MNTOPT_NOLARGEIO   "nolargeio"	/* do not report large I/O sizes
+					 * in stat(). */
 
 STATIC unsigned long
 suffix_strtoul(const char *cp, char **endp, unsigned int base)
@@ -1681,6 +1687,7 @@
 	int			dsunit, dswidth, vol_dsunit, vol_dswidth;
 	int			iosize;
 
+	args->flags |= XFSMNT_COMPAT_IOSIZE;
 #if 0	/* XXX: off by default, until some remaining issues ironed out */
 	args->flags |= XFSMNT_IDELETE; /* default to on */
 #endif
@@ -1809,6 +1816,10 @@
 			args->flags &= ~XFSMNT_IDELETE;
 		} else if (!strcmp(this_char, MNTOPT_NOIKEEP)) {
 			args->flags |= XFSMNT_IDELETE;
+		} else if (!strcmp(this_char, MNTOPT_LARGEIO)) {
+			args->flags &= ~XFSMNT_COMPAT_IOSIZE;
+		} else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) {
+			args->flags |= XFSMNT_COMPAT_IOSIZE;
 		} else if (!strcmp(this_char, "osyncisdsync")) {
 			/* no-op, this is now the default */
 printk("XFS: osyncisdsync is now the default, option is deprecated.\n");
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index e2bf2ef..8221b11 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -181,40 +181,7 @@
 		vap->va_rdev = 0;
 
 		if (!(ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
-
-#if 0
-			/* Large block sizes confuse various
-			 * user space programs, so letting the
-			 * stripe size through is not a good
-			 * idea for now.
-			 */
-			vap->va_blocksize = mp->m_swidth ?
-				/*
-				 * If the underlying volume is a stripe, then
-				 * return the stripe width in bytes as the
-				 * recommended I/O size.
-				 */
-				(mp->m_swidth << mp->m_sb.sb_blocklog) :
-				/*
-				 * Return the largest of the preferred buffer
-				 * sizes since doing small I/Os into larger
-				 * buffers causes buffers to be decommissioned.
-				 * The value returned is in bytes.
-				 */
-				(1 << (int)MAX(mp->m_readio_log,
-					       mp->m_writeio_log));
-
-#else
-			vap->va_blocksize =
-				/*
-				 * Return the largest of the preferred buffer
-				 * sizes since doing small I/Os into larger
-				 * buffers causes buffers to be decommissioned.
-				 * The value returned is in bytes.
-				 */
-				1 << (int)MAX(mp->m_readio_log,
-					       mp->m_writeio_log);
-#endif
+			vap->va_blocksize = xfs_preferred_iosize(mp);
 		} else {
 
 			/*