[PATCH] Add tmpfs options for memory placement policies

Anything that writes into a tmpfs filesystem is liable to disproportionately
decrease the available memory on a particular node.  Since there's no telling
what sort of application (e.g.  dd/cp/cat) might be dropping large files
there, this lets the admin choose the appropriate default behavior for their
site's situation.

Introduce a tmpfs mount option which allows specifying a memory policy and
a second option to specify the nodelist for that policy.  With the default
policy, tmpfs will behave as it does today.  This patch adds support for
preferred, bind, and interleave policies.

The default policy will cause pages to be added to tmpfs files on the node
which is doing the writing.  Some jobs expect a single process to create
and manage the tmpfs files.  This results in a node which has a
significantly reduced number of free pages.

With this patch, the administrator can specify the policy and nodes for
that policy where they would prefer allocations.

This patch was originally written by Brent Casavant and Hugh Dickins.  I
added support for the bind and preferred policies and the mpol_nodelist
mount option.

Signed-off-by: Brent Casavant <bcasavan@sgi.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Robin Holt <holt@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/mm/shmem.c b/mm/shmem.c
index 343b3c0..ce501bc 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1316,7 +1316,8 @@
 		case S_IFREG:
 			inode->i_op = &shmem_inode_operations;
 			inode->i_fop = &shmem_file_operations;
-			mpol_shared_policy_init(&info->policy);
+			mpol_shared_policy_init(&info->policy, sbinfo->policy,
+							&sbinfo->policy_nodes);
 			break;
 		case S_IFDIR:
 			inode->i_nlink++;
@@ -1330,7 +1331,8 @@
 			 * Must not load anything in the rbtree,
 			 * mpol_free_shared_policy will not be called.
 			 */
-			mpol_shared_policy_init(&info->policy);
+			mpol_shared_policy_init(&info->policy, MPOL_DEFAULT,
+						NULL);
 			break;
 		}
 	} else if (sbinfo->max_inodes) {
@@ -1843,7 +1845,9 @@
 	.put_link	= shmem_put_link,
 };
 
-static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes)
+static int shmem_parse_options(char *options, int *mode, uid_t *uid,
+	gid_t *gid, unsigned long *blocks, unsigned long *inodes,
+	int *policy, nodemask_t *policy_nodes)
 {
 	char *this_char, *value, *rest;
 
@@ -1897,6 +1901,19 @@
 			*gid = simple_strtoul(value,&rest,0);
 			if (*rest)
 				goto bad_val;
+		} else if (!strcmp(this_char,"mpol")) {
+			if (!strcmp(value,"default"))
+				*policy = MPOL_DEFAULT;
+			else if (!strcmp(value,"preferred"))
+				*policy = MPOL_PREFERRED;
+			else if (!strcmp(value,"bind"))
+				*policy = MPOL_BIND;
+			else if (!strcmp(value,"interleave"))
+				*policy = MPOL_INTERLEAVE;
+			else
+				goto bad_val;
+		} else if (!strcmp(this_char,"mpol_nodelist")) {
+			nodelist_parse(value, *policy_nodes);
 		} else {
 			printk(KERN_ERR "tmpfs: Bad mount option %s\n",
 			       this_char);
@@ -1917,12 +1934,14 @@
 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 	unsigned long max_blocks = sbinfo->max_blocks;
 	unsigned long max_inodes = sbinfo->max_inodes;
+	int policy = sbinfo->policy;
+	nodemask_t policy_nodes = sbinfo->policy_nodes;
 	unsigned long blocks;
 	unsigned long inodes;
 	int error = -EINVAL;
 
-	if (shmem_parse_options(data, NULL, NULL, NULL,
-				&max_blocks, &max_inodes))
+	if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks,
+				&max_inodes, &policy, &policy_nodes))
 		return error;
 
 	spin_lock(&sbinfo->stat_lock);
@@ -1948,6 +1967,8 @@
 	sbinfo->free_blocks = max_blocks - blocks;
 	sbinfo->max_inodes  = max_inodes;
 	sbinfo->free_inodes = max_inodes - inodes;
+	sbinfo->policy = policy;
+	sbinfo->policy_nodes = policy_nodes;
 out:
 	spin_unlock(&sbinfo->stat_lock);
 	return error;
@@ -1972,6 +1993,8 @@
 	struct shmem_sb_info *sbinfo;
 	unsigned long blocks = 0;
 	unsigned long inodes = 0;
+	int policy = MPOL_DEFAULT;
+	nodemask_t policy_nodes = node_online_map;
 
 #ifdef CONFIG_TMPFS
 	/*
@@ -1984,8 +2007,8 @@
 		inodes = totalram_pages - totalhigh_pages;
 		if (inodes > blocks)
 			inodes = blocks;
-		if (shmem_parse_options(data, &mode, &uid, &gid,
-					&blocks, &inodes))
+		if (shmem_parse_options(data, &mode, &uid, &gid, &blocks,
+					&inodes, &policy, &policy_nodes))
 			return -EINVAL;
 	}
 #else
@@ -2003,6 +2026,8 @@
 	sbinfo->free_blocks = blocks;
 	sbinfo->max_inodes = inodes;
 	sbinfo->free_inodes = inodes;
+	sbinfo->policy = policy;
+	sbinfo->policy_nodes = policy_nodes;
 
 	sb->s_fs_info = sbinfo;
 	sb->s_maxbytes = SHMEM_MAX_BYTES;