mempolicy: use MPOL_PREFERRED for system-wide default policy

Currently, when one specifies MPOL_DEFAULT via a NUMA memory policy API
[set_mempolicy(), mbind() and internal versions], the kernel simply installs a
NULL struct mempolicy pointer in the appropriate context: task policy, vma
policy, or shared policy.  This causes any use of that policy to "fall back"
to the next most specific policy scope.

The only use of MPOL_DEFAULT to mean "local allocation" is in the system
default policy.  This requires extra checks/cases for MPOL_DEFAULT in many
mempolicy.c functions.

There is another, "preferred" way to specify local allocation via the APIs.
That is using the MPOL_PREFERRED policy mode with an empty nodemask.
Internally, the empty nodemask gets converted to a preferred_node id of '-1'.
All internal usage of MPOL_PREFERRED will convert the '-1' to the id of the
node local to the cpu where the allocation occurs.

System default policy, except during boot, is hard-coded to "local
allocation".  By using the MPOL_PREFERRED mode with a negative value of
preferred node for system default policy, MPOL_DEFAULT will never occur in the
'policy' member of a struct mempolicy.  Thus, we can remove all checks for
MPOL_DEFAULT when converting policy to a node id/zonelist in the allocation
paths.

In slab_node() return local node id when policy pointer is NULL.  No need to
set a pol value to take the switch default.  Replace switch default with
BUG()--i.e., shouldn't happen.

With this patch MPOL_DEFAULT is only used in the APIs, including internal
calls to do_set_mempolicy() and in the display of policy in
/proc/<pid>/numa_maps.  It always means "fall back" to the the next most
specific policy scope.  This simplifies the description of memory policies
quite a bit, with no visible change in behavior.

get_mempolicy() continues to return MPOL_DEFAULT and an empty nodemask when
the requested policy [task or vma/shared] is NULL.  These are the values one
would supply via set_mempolicy() or mbind() to achieve that condition--default
behavior.

This patch updates Documentation to reflect this change.

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a237295..fea4a5d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -104,9 +104,13 @@
    policied. */
 enum zone_type policy_zone = 0;
 
+/*
+ * run-time system-wide default policy => local allocation
+ */
 struct mempolicy default_policy = {
 	.refcnt = ATOMIC_INIT(1), /* never free it */
-	.mode   = MPOL_DEFAULT,
+	.mode = MPOL_PREFERRED,
+	.v =  { .preferred_node =  -1 },
 };
 
 static const struct mempolicy_operations {
@@ -189,7 +193,7 @@
 	if (mode == MPOL_DEFAULT) {
 		if (nodes && !nodes_empty(*nodes))
 			return ERR_PTR(-EINVAL);
-		return NULL;
+		return NULL;	/* simply delete any existing policy */
 	}
 	VM_BUG_ON(!nodes);
 
@@ -246,7 +250,6 @@
 {
 	if (!atomic_dec_and_test(&p->refcnt))
 		return;
-	p->mode = MPOL_DEFAULT;
 	kmem_cache_free(policy_cache, p);
 }
 
@@ -626,13 +629,16 @@
 	return 0;
 }
 
-/* Fill a zone bitmap for a policy */
-static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
+/*
+ * Return nodemask for policy for get_mempolicy() query
+ */
+static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 {
 	nodes_clear(*nodes);
+	if (p == &default_policy)
+		return;
+
 	switch (p->mode) {
-	case MPOL_DEFAULT:
-		break;
 	case MPOL_BIND:
 		/* Fall through */
 	case MPOL_INTERLEAVE:
@@ -686,6 +692,11 @@
 	}
 
 	if (flags & MPOL_F_ADDR) {
+		/*
+		 * Do NOT fall back to task policy if the
+		 * vma/shared policy at addr is NULL.  We
+		 * want to return MPOL_DEFAULT in this case.
+		 */
 		down_read(&mm->mmap_sem);
 		vma = find_vma_intersection(mm, addr, addr+1);
 		if (!vma) {
@@ -700,7 +711,7 @@
 		return -EINVAL;
 
 	if (!pol)
-		pol = &default_policy;
+		pol = &default_policy;	/* indicates default behavior */
 
 	if (flags & MPOL_F_NODE) {
 		if (flags & MPOL_F_ADDR) {
@@ -715,8 +726,11 @@
 			err = -EINVAL;
 			goto out;
 		}
-	} else
-		*policy = pol->mode | pol->flags;
+	} else {
+		*policy = pol == &default_policy ? MPOL_DEFAULT :
+						pol->mode;
+		*policy |= pol->flags;
+	}
 
 	if (vma) {
 		up_read(&current->mm->mmap_sem);
@@ -725,7 +739,7 @@
 
 	err = 0;
 	if (nmask)
-		get_zonemask(pol, nmask);
+		get_policy_nodemask(pol, nmask);
 
  out:
 	mpol_cond_put(pol);
@@ -1286,8 +1300,7 @@
 									addr);
 			if (vpol)
 				pol = vpol;
-		} else if (vma->vm_policy &&
-				vma->vm_policy->mode != MPOL_DEFAULT)
+		} else if (vma->vm_policy)
 			pol = vma->vm_policy;
 	}
 	if (!pol)
@@ -1334,7 +1347,6 @@
 			nd = first_node(policy->v.nodes);
 		break;
 	case MPOL_INTERLEAVE: /* should not happen */
-	case MPOL_DEFAULT:
 		nd = numa_node_id();
 		break;
 	default:
@@ -1369,9 +1381,15 @@
  */
 unsigned slab_node(struct mempolicy *policy)
 {
-	unsigned short pol = policy ? policy->mode : MPOL_DEFAULT;
+	if (!policy)
+		return numa_node_id();
 
-	switch (pol) {
+	switch (policy->mode) {
+	case MPOL_PREFERRED:
+		if (unlikely(policy->v.preferred_node >= 0))
+			return policy->v.preferred_node;
+		return numa_node_id();
+
 	case MPOL_INTERLEAVE:
 		return interleave_nodes(policy);
 
@@ -1390,13 +1408,8 @@
 		return zone->node;
 	}
 
-	case MPOL_PREFERRED:
-		if (policy->v.preferred_node >= 0)
-			return policy->v.preferred_node;
-		/* Fall through */
-
 	default:
-		return numa_node_id();
+		BUG();
 	}
 }
 
@@ -1650,8 +1663,6 @@
 	if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
 		return 0;
 	switch (a->mode) {
-	case MPOL_DEFAULT:
-		return 1;
 	case MPOL_BIND:
 		/* Fall through */
 	case MPOL_INTERLEAVE:
@@ -1828,7 +1839,7 @@
 	if (policy != MPOL_DEFAULT) {
 		struct mempolicy *newpol;
 
-		/* Falls back to MPOL_DEFAULT on any error */
+		/* Falls back to NULL policy [MPOL_DEFAULT] on any error */
 		newpol = mpol_new(policy, flags, policy_nodes);
 		if (!IS_ERR(newpol)) {
 			/* Create pseudo-vma that contains just the policy */
@@ -1952,9 +1963,14 @@
 	char *p = buffer;
 	int l;
 	nodemask_t nodes;
-	unsigned short mode = pol ? pol->mode : MPOL_DEFAULT;
+	unsigned short mode;
 	unsigned short flags = pol ? pol->flags : 0;
 
+	if (!pol || pol == &default_policy)
+		mode = MPOL_DEFAULT;
+	else
+		mode = pol->mode;
+
 	switch (mode) {
 	case MPOL_DEFAULT:
 		nodes_clear(nodes);