/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints about which node(s) memory
 * should be allocated on.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave	Allocate memory interleaved over a set of nodes,
 *		with normal fallback if it fails.
 *		For VMA based allocations this interleaves based on the
 *		offset into the backing object or offset into the mapping
 *		for anonymous memory. For process policy a process counter
 *		is used.
 *
 * bind		Only allocate memory on a specific set of nodes,
 *		no fallback.
 *		FIXME: memory is allocated starting with the first node
 *		to the last. It would be better if bind would truly restrict
 *		the allocation to memory nodes instead.
 *
 * preferred	Try a specific node first before normal fallback.
 *		As a special case NUMA_NO_NODE here means do the allocation
 *		on the local CPU. This is normally identical to default,
 *		but useful to set in a VMA when you have a non-default
 *		process policy.
 *
 * default	Allocate on the local node first, or when on a VMA
 *		use the process policy. This is what Linux always did
 *		in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/
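
/*
 * Illustrative only: a minimal userspace sketch of how these policies are
 * selected through the set_mempolicy(2) and mbind(2) system calls (the
 * MPOL_* constants come from the userspace <numaif.h> header; the node
 * numbers and lengths below are hypothetical):
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);	// nodes 0 and 1
 *
 *	// interleave all future allocations of this task across nodes 0-1
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask));
 *
 *	// bind an existing mapping to node 0 only, failing with EIO
 *	// (MPOL_MF_STRICT) if pages already sit elsewhere
 *	unsigned long node0 = 1UL << 0;
 *	mbind(addr, length, MPOL_BIND, &node0, 8 * sizeof(node0),
 *	      MPOL_MF_STRICT);
 */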

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#include <linux/random.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_PREFERRED,
	.flags = MPOL_F_LOCAL,
};

static struct mempolicy preferred_node_policy[MAX_NUMNODES];

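/*
 * Return the task's own memory policy, if any.  Otherwise fall back to the
 * preferred_node_policy entry for the local node; as the comment inside
 * notes, that table is not initialised early in boot, which is what the
 * !pol->mode check guards against.
 */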
static struct mempolicy *get_task_policy(struct task_struct *p)
{
	struct mempolicy *pol = p->mempolicy;

	if (!pol) {
		int node = numa_node_id();

		if (node != NUMA_NO_NODE) {
			pol = &preferred_node_policy[node];
			/*
			 * preferred_node_policy is not initialised early in
			 * boot
			 */
			if (!pol->mode)
				pol = NULL;
		}
	}

	return pol;
}

static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	/*
	 * If the read-side task has no lock to protect task->mempolicy, the
	 * write-side task will rebind task->mempolicy in two steps.  The
	 * first step sets all the newly allowed nodes, and the second step
	 * clears all the disallowed nodes.  This way we avoid ending up
	 * with no node left to allocate pages from.
	 * If we have a lock to protect task->mempolicy on the read side, we
	 * rebind directly.
	 *
	 * step:
	 *	MPOL_REBIND_ONCE  - do rebind work at once
	 *	MPOL_REBIND_STEP1 - set all the newly allowed nodes
	 *	MPOL_REBIND_STEP2 - clean all the disallowed nodes
	 */
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
			enum mpol_rebind_step step);
} mpol_ops[MPOL_MAX];

/* Check that the nodemask contains at least one populated zone */
static int is_valid_nodemask(const nodemask_t *nodemask)
{
	return nodes_intersects(*nodemask, node_states[N_MEMORY]);
}

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & MPOL_MODE_FLAGS;
}

static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}
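
/*
 * Illustrative example of the MPOL_F_RELATIVE_NODES remap above: a user
 * mask of {0,2} taken relative to an allowed set {4,5,6,7} is first folded
 * modulo the weight of the allowed set (still {0,2}) and then mapped onto
 * its set bits, yielding {4,6}.
 */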

static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!nodes)
		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
	else if (nodes_empty(*nodes))
		return -EINVAL;			/* no allowed nodes */
	else
		pol->v.preferred_node = first_node(*nodes);
	return 0;
}

static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!is_valid_nodemask(nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy.  mpol_new() has already validated the nodes
 * parameter with respect to the policy mode and flags.  But, we need to
 * handle an empty nodemask with MPOL_PREFERRED here.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy.  May also be called holding the mmap_semaphore for write.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
	int ret;

	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
	if (pol == NULL)
		return 0;
	/* Check N_MEMORY */
	nodes_and(nsc->mask1,
		  cpuset_current_mems_allowed, node_states[N_MEMORY]);

	VM_BUG_ON(!nodes);
	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
		nodes = NULL;	/* explicit local allocation */
	else {
		if (pol->flags & MPOL_F_RELATIVE_NODES)
			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
		else
			nodes_and(nsc->mask2, *nodes, nsc->mask1);

		if (mpol_store_user_nodemask(pol))
			pol->w.user_nodemask = *nodes;
		else
			pol->w.cpuset_mems_allowed =
						cpuset_current_mems_allowed;
	}

	if (nodes)
		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
	else
		ret = mpol_ops[pol->mode].create(pol, NULL);
	return ret;
}

/*
 * This function just creates a new policy, does some checks and simple
 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	struct mempolicy *policy;

	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);

	if (mode == MPOL_DEFAULT) {
		if (nodes && !nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		return NULL;
	}
	VM_BUG_ON(!nodes);

	/*
	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
	 * All other modes require a valid pointer to a non-empty nodemask.
	 */
	if (mode == MPOL_PREFERRED) {
		if (nodes_empty(*nodes)) {
			if (((flags & MPOL_F_STATIC_NODES) ||
			     (flags & MPOL_F_RELATIVE_NODES)))
				return ERR_PTR(-EINVAL);
		}
	} else if (mode == MPOL_LOCAL) {
		if (!nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		mode = MPOL_PREFERRED;
	} else if (nodes_empty(*nodes))
		return ERR_PTR(-EINVAL);
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	policy->mode = mode;
	policy->flags = flags;

	return policy;
}

/* Slow path of a mpol destructor. */
void __mpol_put(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	kmem_cache_free(policy_cache, p);
}

static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
				enum mpol_rebind_step step)
{
}

/*
 * step:
 *	MPOL_REBIND_ONCE  - do rebind work at once
 *	MPOL_REBIND_STEP1 - set all the newly allowed nodes
 *	MPOL_REBIND_STEP2 - clean all the disallowed nodes
 */
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
				 enum mpol_rebind_step step)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES)
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		/*
		 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
		 * result
		 */
		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
			nodes_remap(tmp, pol->v.nodes,
					pol->w.cpuset_mems_allowed, *nodes);
			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
		} else if (step == MPOL_REBIND_STEP2) {
			tmp = pol->w.cpuset_mems_allowed;
			pol->w.cpuset_mems_allowed = *nodes;
		} else
			BUG();
	}

	if (nodes_empty(tmp))
		tmp = *nodes;

	if (step == MPOL_REBIND_STEP1)
		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
		pol->v.nodes = tmp;
	else
		BUG();

	if (!node_isset(current->il_next, tmp)) {
		current->il_next = next_node(current->il_next, tmp);
		if (current->il_next >= MAX_NUMNODES)
			current->il_next = first_node(tmp);
		if (current->il_next >= MAX_NUMNODES)
			current->il_next = numa_node_id();
	}
}

static void mpol_rebind_preferred(struct mempolicy *pol,
				  const nodemask_t *nodes,
				  enum mpol_rebind_step step)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES) {
		int node = first_node(pol->w.user_nodemask);

		if (node_isset(node, *nodes)) {
			pol->v.preferred_node = node;
			pol->flags &= ~MPOL_F_LOCAL;
		} else
			pol->flags |= MPOL_F_LOCAL;
	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
		pol->v.preferred_node = first_node(tmp);
	} else if (!(pol->flags & MPOL_F_LOCAL)) {
		pol->v.preferred_node = node_remap(pol->v.preferred_node,
						   pol->w.cpuset_mems_allowed,
						   *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}
}

/*
 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 *
 * If the read-side task has no lock to protect task->mempolicy, the
 * write-side task will rebind task->mempolicy in two steps.  The first
 * step sets all the newly allowed nodes, and the second step clears all
 * the disallowed nodes.  This way we avoid ending up with no node left
 * to allocate pages from.
 * If we have a lock to protect task->mempolicy on the read side, we
 * rebind directly.
 *
 * step:
 *	MPOL_REBIND_ONCE  - do rebind work at once
 *	MPOL_REBIND_STEP1 - set all the newly allowed nodes
 *	MPOL_REBIND_STEP2 - clean all the disallowed nodes
 */
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
				enum mpol_rebind_step step)
{
	if (!pol)
		return;
	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
		return;

	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
		return;

	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
		BUG();

	if (step == MPOL_REBIND_STEP1)
		pol->flags |= MPOL_F_REBINDING;
	else if (step == MPOL_REBIND_STEP2)
		pol->flags &= ~MPOL_F_REBINDING;
	else if (step >= MPOL_REBIND_NSTEP)
		BUG();

	mpol_ops[pol->mode].rebind(pol, newmask, step);
}

/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 *
 * Called with task's alloc_lock held.
 */

void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
			enum mpol_rebind_step step)
{
	mpol_rebind_policy(tsk->mempolicy, new, step);
}

/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 */

void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;

	down_write(&mm->mmap_sem);
	for (vma = mm->mmap; vma; vma = vma->vm_next)
		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
	up_write(&mm->mmap_sem);
}

static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_interleave,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_bind,
		.rebind = mpol_rebind_nodemask,
	},
};
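
/*
 * Note that MPOL_LOCAL needs no entry here: mpol_new() rewrites it to
 * MPOL_PREFERRED with an empty nodemask, i.e. local allocation.
 */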

static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags);

/*
 * Scan through the pages in a PTE range, checking whether each page matches
 * the given conditions, and move it to the pagelist if it does.
 */
static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pte_t *orig_pte;
	pte_t *pte;
	spinlock_t *ptl;

	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		struct page *page;
		int nid;

		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		/*
		 * vm_normal_page() filters out zero pages, but there might
		 * still be PageReserved pages to skip, perhaps in a VDSO.
		 */
		if (PageReserved(page))
			continue;
		nid = page_to_nid(page);
		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
			continue;

		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
			migrate_page_add(page, private, flags);
		else
			break;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(orig_pte, ptl);
	return addr != end;
}
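
/*
 * A non-zero return from the scan above (addr != end) means the walk hit a
 * page violating the nodemask while migration was not requested; the page
 * table walkers below turn that into -EIO.
 */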

static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
		pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
				    void *private)
{
#ifdef CONFIG_HUGETLB_PAGE
	int nid;
	struct page *page;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
	entry = huge_ptep_get((pte_t *)pmd);
	if (!pte_present(entry))
		goto unlock;
	page = pte_page(entry);
	nid = page_to_nid(page);
	if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
		goto unlock;
	/* With MPOL_MF_MOVE, we migrate only unshared hugepages. */
	if (flags & (MPOL_MF_MOVE_ALL) ||
	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
		isolate_huge_page(page, private);
unlock:
	spin_unlock(ptl);
#else
	BUG();
#endif
}

static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (!pmd_present(*pmd))
			continue;
		if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
			queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
						flags, private);
			continue;
		}
		split_huge_page_pmd(vma, addr, pmd);
		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
			continue;
		if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
			continue;
		if (pud_none_or_clear_bad(pud))
			continue;
		if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pgd_t *pgd;
	unsigned long next;

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pgd++, addr = next, addr != end);
	return 0;
}

#ifdef CONFIG_NUMA_BALANCING
/*
 * This is used to mark a range of virtual addresses as inaccessible.
 * These are later cleared by a NUMA hinting fault. Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This assumes that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	int nr_updated;

	nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
	if (nr_updated)
		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);

	return nr_updated;
}
#else
static unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	return 0;
}
#endif /* CONFIG_NUMA_BALANCING */
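
/*
 * change_prot_numa() is driven from queue_pages_range() below when the
 * caller passes MPOL_MF_LAZY: instead of migrating pages immediately, the
 * range is made PROT_NONE so that subsequent NUMA hinting faults place the
 * pages lazily.
 */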

/*
 * Walk through page tables and collect pages to be migrated.
 *
 * If pages found in a given range are on a set of nodes (determined by
 * @nodes and @flags), they are isolated and queued to the pagelist
 * passed via @private.
 */
static struct vm_area_struct *
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		const nodemask_t *nodes, unsigned long flags, void *private)
{
	int err;
	struct vm_area_struct *first, *vma, *prev;


	first = find_vma(mm, start);
	if (!first)
		return ERR_PTR(-EFAULT);
	prev = NULL;
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		unsigned long endvma = vma->vm_end;

		if (endvma > end)
			endvma = end;
		if (vma->vm_start > start)
			start = vma->vm_start;

		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
			if (!vma->vm_next && vma->vm_end < end)
				return ERR_PTR(-EFAULT);
			if (prev && prev->vm_end < vma->vm_start)
				return ERR_PTR(-EFAULT);
		}

		if (flags & MPOL_MF_LAZY) {
			change_prot_numa(vma, start, endvma);
			goto next;
		}

		if ((flags & MPOL_MF_STRICT) ||
		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
		      vma_migratable(vma))) {

			err = queue_pages_pgd_range(vma, start, endvma, nodes,
						flags, private);
			if (err) {
				first = ERR_PTR(err);
				break;
			}
		}
next:
		prev = vma;
	}
	return first;
}

/*
 * Apply policy to a single VMA
 * This must be called with the mmap_sem held for writing.
 */
static int vma_replace_policy(struct vm_area_struct *vma,
						struct mempolicy *pol)
{
	int err;
	struct mempolicy *old;
	struct mempolicy *new;

	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	new = mpol_dup(pol);
	if (IS_ERR(new))
		return PTR_ERR(new);

	if (vma->vm_ops && vma->vm_ops->set_policy) {
		err = vma->vm_ops->set_policy(vma, new);
		if (err)
			goto err_out;
	}

	old = vma->vm_policy;
	vma->vm_policy = new; /* protected by mmap_sem */
	mpol_put(old);

	return 0;
 err_out:
	mpol_put(new);
	return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct mm_struct *mm, unsigned long start,
		       unsigned long end, struct mempolicy *new_pol)
{
	struct vm_area_struct *next;
	struct vm_area_struct *prev;
	struct vm_area_struct *vma;
	int err = 0;
	pgoff_t pgoff;
	unsigned long vmstart;
	unsigned long vmend;

	vma = find_vma(mm, start);
	if (!vma || vma->vm_start > start)
		return -EFAULT;

	prev = vma->vm_prev;
	if (start > vma->vm_start)
		prev = vma;

	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
		next = vma->vm_next;
		vmstart = max(start, vma->vm_start);
		vmend   = min(end, vma->vm_end);

		if (mpol_equal(vma_policy(vma), new_pol))
			continue;

		pgoff = vma->vm_pgoff +
			((vmstart - vma->vm_start) >> PAGE_SHIFT);
		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
				  vma->anon_vma, vma->vm_file, pgoff,
				  new_pol);
		if (prev) {
			vma = prev;
			next = vma->vm_next;
			if (mpol_equal(vma_policy(vma), new_pol))
				continue;
			/* vma_merge() joined vma && vma->next, case 8 */
			goto replace;
		}
		if (vma->vm_start != vmstart) {
			err = split_vma(vma->vm_mm, vma, vmstart, 1);
			if (err)
				goto out;
		}
		if (vma->vm_end != vmend) {
			err = split_vma(vma->vm_mm, vma, vmend, 0);
			if (err)
				goto out;
		}
 replace:
		err = vma_replace_policy(vma, new_pol);
		if (err)
			goto out;
	}

 out:
	return err;
}

/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new, *old;
	struct mm_struct *mm = current->mm;
	NODEMASK_SCRATCH(scratch);
	int ret;

	if (!scratch)
		return -ENOMEM;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new)) {
		ret = PTR_ERR(new);
		goto out;
	}
	/*
	 * prevent changing our mempolicy while show_numa_maps()
	 * is using it.
	 * Note:  do_set_mempolicy() can be called at init time
	 * with no 'mm'.
	 */
	if (mm)
		down_write(&mm->mmap_sem);
	task_lock(current);
	ret = mpol_set_nodemask(new, nodes, scratch);
	if (ret) {
		task_unlock(current);
		if (mm)
			up_write(&mm->mmap_sem);
		mpol_put(new);
		goto out;
	}
	old = current->mempolicy;
	current->mempolicy = new;
	if (new && new->mode == MPOL_INTERLEAVE &&
	    nodes_weight(new->v.nodes))
		current->il_next = first_node(new->v.nodes);
	task_unlock(current);
	if (mm)
		up_write(&mm->mmap_sem);

	mpol_put(old);
	ret = 0;
out:
	NODEMASK_SCRATCH_FREE(scratch);
	return ret;
}

/*
 * Return nodemask for policy for get_mempolicy() query
 *
 * Called with task's alloc_lock held
 */
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
	nodes_clear(*nodes);
	if (p == &default_policy)
		return;

	switch (p->mode) {
	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		*nodes = p->v.nodes;
		break;
	case MPOL_PREFERRED:
		if (!(p->flags & MPOL_F_LOCAL))
			node_set(p->v.preferred_node, *nodes);
		/* else return empty node mask for local allocation */
		break;
	default:
		BUG();
	}
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}

/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		task_lock(current);
		*nmask  = cpuset_current_mems_allowed;
		task_unlock(current);
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL.  We
		 * want to return MPOL_DEFAULT in this case.
		 */
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			*policy = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	err = 0;
	if (nmask) {
		if (mpol_store_user_nodemask(pol)) {
			*nmask = pol->w.user_nodemask;
		} else {
			task_lock(current);
			get_policy_nodemask(pol, nmask);
			task_unlock(current);
		}
	}

 out:
	mpol_cond_put(pol);
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}
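
/*
 * Illustrative only: the flag combinations handled above correspond to the
 * get_mempolicy(2) system call, e.g. (userspace sketch, not kernel code):
 *
 *	int mode;
 *	// which policy governs the mapping containing 'addr'?
 *	get_mempolicy(&mode, NULL, 0, addr, MPOL_F_ADDR);
 *
 *	int node;
 *	// on which node does the page at 'addr' currently reside?
 *	get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR);
 */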

#ifdef CONFIG_MIGRATION
/*
 * page migration
 */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
	/*
	 * Avoid migrating a page that is shared with others.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
		if (!isolate_lru_page(page)) {
			list_add_tail(&page->lru, pagelist);
			inc_zone_page_state(page, NR_ISOLATED_ANON +
					    page_is_file_cache(page));
		}
	}
}

static struct page *new_node_page(struct page *page, unsigned long node, int **x)
{
	if (PageHuge(page))
		return alloc_huge_page_node(page_hstate(compound_head(page)),
					node);
	else
		return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
}

/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
static int migrate_to_node(struct mm_struct *mm, int source, int dest,
			   int flags)
{
	nodemask_t nmask;
	LIST_HEAD(pagelist);
	int err = 0;

	nodes_clear(nmask);
	node_set(source, nmask);

	/*
	 * This does not "check" the range but isolates all pages that
	 * need migration.  Between passing in the full user address
	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
	 */
	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
			flags | MPOL_MF_DISCONTIG_OK, &pagelist);

	if (!list_empty(&pagelist)) {
		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
					MIGRATE_SYNC, MR_SYSCALL);
		if (err)
			putback_movable_pages(&pagelist);
	}

	return err;
}

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	int busy = 0;
	int err;
	nodemask_t tmp;

	err = migrate_prep();
	if (err)
		return err;

	down_read(&mm->mmap_sem);

	err = migrate_vmas(mm, from, to, flags);
	if (err)
		goto out;

	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
	 * bit in 'tmp', and return that <source, dest> pair for migration.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fall back to picking some
	 * pair of 'source' and 'dest' bits that are not the same.  If the
	 * 'source' and 'dest' bits are the same, this represents a node
	 * that will be migrating to itself, so no pages need move.
	 *
	 * If no bits are left in 'tmp', or if all remaining bits left
	 * in 'tmp' correspond to the same bit in 'to', return false
	 * (nothing left to migrate).
	 *
	 * This lets us pick a pair of nodes to migrate between, such that
	 * if possible the dest node is not already occupied by some other
	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating outgoing memory from that same node.
	 *
	 * A single scan of tmp is sufficient.  As we go, we remember the
	 * most recent <s, d> pair that moved (s != d).  If we find a pair
	 * that not only moved, but what's better, moved to an empty slot
	 * (d is not set in tmp), then we break out then, with that pair.
	 * Otherwise when we finish scanning tmp, we at least have the
	 * most recent <s, d> pair that moved.  If we get all the way through
	 * the scan of tmp without finding any node that moved, much less
	 * moved to an empty node, then there is nothing left worth migrating.
	 */

	tmp = *from;
	while (!nodes_empty(tmp)) {
		int s, d;
		int source = NUMA_NO_NODE;
		int dest = 0;

		for_each_node_mask(s, tmp) {

			/*
			 * do_migrate_pages() tries to maintain the relative
			 * node relationship of the pages established between
			 * threads and memory areas.
			 *
			 * However if the number of source nodes is not equal to
			 * the number of destination nodes we can not preserve
			 * this node relative relationship.  In that case, skip
			 * copying memory from a node that is in the destination
			 * mask.
			 *
			 * Example: [2,3,4] -> [3,4,5] moves everything.
			 *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
			 */

			if ((nodes_weight(*from) != nodes_weight(*to)) &&
						(node_isset(s, *to)))
				continue;

			d = node_remap(s, *from, *to);
			if (s == d)
				continue;

			source = s;	/* Node moved. Memorize */
			dest = d;

			/* dest not in remaining from nodes? */
			if (!node_isset(dest, tmp))
				break;
		}
		if (source == NUMA_NO_NODE)
			break;

		node_clear(source, tmp);
		err = migrate_to_node(mm, source, dest, flags);
		if (err > 0)
			busy += err;
		if (err < 0)
			break;
	}
out:
	up_read(&mm->mmap_sem);
	if (err < 0)
		return err;
	return busy;

}

/*
 * Allocate a new page for page migration based on vma policy.
 * Start assuming that page is mapped by vma pointed to by @private.
 * Search forward from there, if not.  N.B., this assumes that the
 * list of pages handed to migrate_pages()--which is how we get here--
 * is in virtual address order.
 */
static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
	struct vm_area_struct *vma = (struct vm_area_struct *)private;
	unsigned long uninitialized_var(address);

	while (vma) {
		address = page_address_in_vma(page, vma);
		if (address != -EFAULT)
			break;
		vma = vma->vm_next;
	}

	if (PageHuge(page)) {
		BUG_ON(!vma);
		return alloc_huge_page_noerr(vma, address, 1);
	}
	/*
	 * if !vma, alloc_page_vma() will use task or system default policy
	 */
	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
}
#else

static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
}

int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	return -ENOSYS;
}

static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
	return NULL;
}
#endif
1203
Adrian Bunkdbcb0f12007-10-16 01:26:26 -07001204static long do_mbind(unsigned long start, unsigned long len,
David Rientjes028fec42008-04-28 02:12:25 -07001205 unsigned short mode, unsigned short mode_flags,
1206 nodemask_t *nmask, unsigned long flags)
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -08001207{
1208 struct vm_area_struct *vma;
1209 struct mm_struct *mm = current->mm;
1210 struct mempolicy *new;
1211 unsigned long end;
1212 int err;
1213 LIST_HEAD(pagelist);
1214
Lee Schermerhornb24f53a2012-10-25 14:16:32 +02001215 if (flags & ~(unsigned long)MPOL_MF_VALID)
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -08001216 return -EINVAL;
Christoph Lameter74c00242006-03-14 19:50:21 -08001217 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -08001218 return -EPERM;
1219
1220 if (start & ~PAGE_MASK)
1221 return -EINVAL;
1222
1223 if (mode == MPOL_DEFAULT)
1224 flags &= ~MPOL_MF_STRICT;
1225
1226 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1227 end = start + len;
1228
1229 if (end < start)
1230 return -EINVAL;
1231 if (end == start)
1232 return 0;
1233
David Rientjes028fec42008-04-28 02:12:25 -07001234 new = mpol_new(mode, mode_flags, nmask);
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -08001235 if (IS_ERR(new))
1236 return PTR_ERR(new);
1237
Lee Schermerhornb24f53a2012-10-25 14:16:32 +02001238 if (flags & MPOL_MF_LAZY)
1239 new->flags |= MPOL_F_MOF;
1240
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -08001241 /*
1242 * If we are using the default policy then operation
1243 * on discontinuous address spaces is okay after all
1244 */
1245 if (!new)
1246 flags |= MPOL_MF_DISCONTIG_OK;
1247
David Rientjes028fec42008-04-28 02:12:25 -07001248 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1249 start, start + len, mode, mode_flags,
David Rientjes00ef2d22013-02-22 16:35:36 -08001250 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -08001251
Christoph Lameter0aedadf2008-11-06 12:53:30 -08001252 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1253
1254 err = migrate_prep();
1255 if (err)
KOSAKI Motohirob05ca732009-10-26 16:49:59 -07001256 goto mpol_out;
Christoph Lameter0aedadf2008-11-06 12:53:30 -08001257 }
KAMEZAWA Hiroyuki4bfc4492009-08-06 15:07:33 -07001258 {
1259 NODEMASK_SCRATCH(scratch);
1260 if (scratch) {
1261 down_write(&mm->mmap_sem);
1262 task_lock(current);
1263 err = mpol_set_nodemask(new, nmask, scratch);
1264 task_unlock(current);
1265 if (err)
1266 up_write(&mm->mmap_sem);
1267 } else
1268 err = -ENOMEM;
1269 NODEMASK_SCRATCH_FREE(scratch);
1270 }
KOSAKI Motohirob05ca732009-10-26 16:49:59 -07001271 if (err)
1272 goto mpol_out;
1273
Naoya Horiguchi98094942013-09-11 14:22:14 -07001274 vma = queue_pages_range(mm, start, end, nmask,
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -08001275 flags | MPOL_MF_INVERT, &pagelist);
1276
Lee Schermerhornb24f53a2012-10-25 14:16:32 +02001277 err = PTR_ERR(vma); /* maybe ... */
Mel Gormana7200942012-11-16 09:37:58 +00001278 if (!IS_ERR(vma))
KOSAKI Motohiro9d8cebd2010-03-05 13:41:57 -08001279 err = mbind_range(mm, start, end, new);
Christoph Lameter7e2ab152006-02-01 03:05:40 -08001280
Lee Schermerhornb24f53a2012-10-25 14:16:32 +02001281 if (!err) {
1282 int nr_failed = 0;
1283
Minchan Kimcf608ac2010-10-26 14:21:29 -07001284 if (!list_empty(&pagelist)) {
Lee Schermerhornb24f53a2012-10-25 14:16:32 +02001285 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
Christoph Lameter95a402c2006-06-23 02:03:53 -07001286 nr_failed = migrate_pages(&pagelist, new_vma_page,
David Rientjes68711a72014-06-04 16:08:25 -07001287 NULL, (unsigned long)vma,
Hugh Dickins9c620e22013-02-22 16:35:14 -08001288 MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
Minchan Kimcf608ac2010-10-26 14:21:29 -07001289 if (nr_failed)
Naoya Horiguchi74060e42013-09-11 14:22:06 -07001290 putback_movable_pages(&pagelist);
Minchan Kimcf608ac2010-10-26 14:21:29 -07001291 }
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -08001292
Lee Schermerhornb24f53a2012-10-25 14:16:32 +02001293 if (nr_failed && (flags & MPOL_MF_STRICT))
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -08001294 err = -EIO;
KOSAKI Motohiroab8a3e12009-10-26 16:49:58 -07001295 } else
Joonsoo Kimb0e5fd72013-12-18 17:08:51 -08001296 putback_movable_pages(&pagelist);
Christoph Lameterb20a3502006-03-22 00:09:12 -08001297
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -08001298 up_write(&mm->mmap_sem);
KOSAKI Motohirob05ca732009-10-26 16:49:59 -07001299 mpol_out:
Lee Schermerhornf0be3d32008-04-28 02:13:08 -07001300 mpol_put(new);
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -08001301 return err;
1302}
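For reference, a hedged user-space sketch of the path above (assuming libnuma's <numaif.h> for the mbind() wrapper and the MPOL_* constants; link with -lnuma): binding an anonymous mapping to node 0 with MPOL_MF_MOVE | MPOL_MF_STRICT exercises the migrate_prep()/queue_pages_range()/migrate_pages() sequence walked through in do_mbind().

#define _GNU_SOURCE
#include <numaif.h>		/* mbind(), MPOL_* -- assumed available */
#include <sys/mman.h>
#include <stdio.h>

int main(void)
{
	size_t len = 4UL << 20;			/* 4 MiB */
	unsigned long nodemask = 1UL << 0;	/* node 0 only */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	/*
	 * maxnode is one past the highest bit the kernel will look at
	 * (get_nodes() decrements it before use).  MPOL_MF_MOVE migrates
	 * pages already faulted in; MPOL_MF_STRICT turns residual
	 * misplacement into -EIO, matching the nr_failed check above.
	 */
	if (mbind(p, len, MPOL_BIND, &nodemask,
		  sizeof(nodemask) * 8 + 1,
		  MPOL_MF_MOVE | MPOL_MF_STRICT))
		perror("mbind");
	return 0;
}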
1303
Christoph Lameter39743882006-01-08 01:00:51 -08001304/*
Christoph Lameter8bccd852005-10-29 18:16:59 -07001305 * User space interface with variable sized bitmaps for nodelists.
1306 */
1307
1308/* Copy a node mask from user space. */
Christoph Lameter39743882006-01-08 01:00:51 -08001309static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
Christoph Lameter8bccd852005-10-29 18:16:59 -07001310 unsigned long maxnode)
1311{
1312 unsigned long k;
1313 unsigned long nlongs;
1314 unsigned long endmask;
1315
1316 --maxnode;
1317 nodes_clear(*nodes);
1318 if (maxnode == 0 || !nmask)
1319 return 0;
Andi Kleena9c930b2006-02-20 18:27:59 -08001320 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
Chris Wright636f13c2006-02-17 13:59:36 -08001321 return -EINVAL;
Christoph Lameter8bccd852005-10-29 18:16:59 -07001322
1323 nlongs = BITS_TO_LONGS(maxnode);
1324 if ((maxnode % BITS_PER_LONG) == 0)
1325 endmask = ~0UL;
1326 else
1327 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1328
1329 /* When the user specifies more nodes than supported, just check
1330 that the unsupported part is all zero. */
1331 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1332 if (nlongs > PAGE_SIZE/sizeof(long))
1333 return -EINVAL;
1334 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1335 unsigned long t;
1336 if (get_user(t, nmask + k))
1337 return -EFAULT;
1338 if (k == nlongs - 1) {
1339 if (t & endmask)
1340 return -EINVAL;
1341 } else if (t)
1342 return -EINVAL;
1343 }
1344 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1345 endmask = ~0UL;
1346 }
1347
1348 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1349 return -EFAULT;
1350 nodes_addr(*nodes)[nlongs-1] &= endmask;
1351 return 0;
1352}
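The layout get_nodes() expects is a plain array of unsigned long words: node n sits at bit (n % BITS_PER_LONG) of word (n / BITS_PER_LONG), and maxnode is one past the highest bit of interest. A small user-space sketch of how such a mask might be built (an illustrative helper, not part of any library):

#include <limits.h>	/* CHAR_BIT */

#define ULONG_BITS (sizeof(unsigned long) * CHAR_BIT)

/* Set the bit for @node in a mask of @nwords unsigned longs. */
static void nodemask_set_bit(unsigned long *mask, unsigned long nwords, int node)
{
	if (node >= 0 && (unsigned long)node < nwords * ULONG_BITS)
		mask[node / ULONG_BITS] |= 1UL << (node % ULONG_BITS);
}

/*
 * Usage sketch:
 *	unsigned long mask[2] = { 0, 0 };
 *	nodemask_set_bit(mask, 2, 0);
 *	nodemask_set_bit(mask, 2, 65);
 * then pass (mask, 2 * ULONG_BITS + 1) as (nmask, maxnode) to
 * mbind()/set_mempolicy().
 */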
1353
1354/* Copy a kernel node mask to user space */
1355static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1356 nodemask_t *nodes)
1357{
1358 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1359 const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1360
1361 if (copy > nbytes) {
1362 if (copy > PAGE_SIZE)
1363 return -EINVAL;
1364 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1365 return -EFAULT;
1366 copy = nbytes;
1367 }
1368 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1369}
1370
Heiko Carstens938bb9f2009-01-14 14:14:30 +01001371SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
Rasmus Villemoesf7f28ca2014-06-04 16:07:57 -07001372 unsigned long, mode, const unsigned long __user *, nmask,
Heiko Carstens938bb9f2009-01-14 14:14:30 +01001373 unsigned long, maxnode, unsigned, flags)
Christoph Lameter8bccd852005-10-29 18:16:59 -07001374{
1375 nodemask_t nodes;
1376 int err;
David Rientjes028fec42008-04-28 02:12:25 -07001377 unsigned short mode_flags;
Christoph Lameter8bccd852005-10-29 18:16:59 -07001378
David Rientjes028fec42008-04-28 02:12:25 -07001379 mode_flags = mode & MPOL_MODE_FLAGS;
1380 mode &= ~MPOL_MODE_FLAGS;
David Rientjesa3b51e02008-04-28 02:12:23 -07001381 if (mode >= MPOL_MAX)
1382 return -EINVAL;
David Rientjes4c50bc02008-04-28 02:12:30 -07001383 if ((mode_flags & MPOL_F_STATIC_NODES) &&
1384 (mode_flags & MPOL_F_RELATIVE_NODES))
1385 return -EINVAL;
Christoph Lameter8bccd852005-10-29 18:16:59 -07001386 err = get_nodes(&nodes, nmask, maxnode);
1387 if (err)
1388 return err;
David Rientjes028fec42008-04-28 02:12:25 -07001389 return do_mbind(start, len, mode, mode_flags, &nodes, flags);
Christoph Lameter8bccd852005-10-29 18:16:59 -07001390}
1391
1392/* Set the process memory policy */
Rasmus Villemoes23c89022014-06-04 16:07:58 -07001393SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
Heiko Carstens938bb9f2009-01-14 14:14:30 +01001394 unsigned long, maxnode)
Christoph Lameter8bccd852005-10-29 18:16:59 -07001395{
1396 int err;
1397 nodemask_t nodes;
David Rientjes028fec42008-04-28 02:12:25 -07001398 unsigned short flags;
Christoph Lameter8bccd852005-10-29 18:16:59 -07001399
David Rientjes028fec42008-04-28 02:12:25 -07001400 flags = mode & MPOL_MODE_FLAGS;
1401 mode &= ~MPOL_MODE_FLAGS;
1402 if ((unsigned int)mode >= MPOL_MAX)
Christoph Lameter8bccd852005-10-29 18:16:59 -07001403 return -EINVAL;
David Rientjes4c50bc02008-04-28 02:12:30 -07001404 if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1405 return -EINVAL;
Christoph Lameter8bccd852005-10-29 18:16:59 -07001406 err = get_nodes(&nodes, nmask, maxnode);
1407 if (err)
1408 return err;
David Rientjes028fec42008-04-28 02:12:25 -07001409 return do_set_mempolicy(mode, flags, &nodes);
Christoph Lameter8bccd852005-10-29 18:16:59 -07001410}
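The mode argument above carries the optional mode flags in its high bits (the mode & MPOL_MODE_FLAGS split). A hedged user-space sketch, assuming libnuma's <numaif.h>; MPOL_F_STATIC_NODES comes from the uapi <linux/mempolicy.h>, so it is defined by hand here in case the wrapper header does not expose it:

#include <numaif.h>
#include <stdio.h>

#ifndef MPOL_F_STATIC_NODES
#define MPOL_F_STATIC_NODES (1 << 15)	/* from uapi linux/mempolicy.h */
#endif

/* Interleave this task's future allocations over nodes 0 and 1. */
static int set_interleave_01(void)
{
	unsigned long nodemask = (1UL << 0) | (1UL << 1);

	if (set_mempolicy(MPOL_INTERLEAVE | MPOL_F_STATIC_NODES,
			  &nodemask, sizeof(nodemask) * 8 + 1)) {
		perror("set_mempolicy");
		return -1;
	}
	return 0;
}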
1411
Heiko Carstens938bb9f2009-01-14 14:14:30 +01001412SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1413 const unsigned long __user *, old_nodes,
1414 const unsigned long __user *, new_nodes)
Christoph Lameter39743882006-01-08 01:00:51 -08001415{
David Howellsc69e8d92008-11-14 10:39:19 +11001416 const struct cred *cred = current_cred(), *tcred;
KOSAKI Motohiro596d7cf2010-08-09 17:19:01 -07001417 struct mm_struct *mm = NULL;
Christoph Lameter39743882006-01-08 01:00:51 -08001418 struct task_struct *task;
Christoph Lameter39743882006-01-08 01:00:51 -08001419 nodemask_t task_nodes;
1420 int err;
KOSAKI Motohiro596d7cf2010-08-09 17:19:01 -07001421 nodemask_t *old;
1422 nodemask_t *new;
1423 NODEMASK_SCRATCH(scratch);
Christoph Lameter39743882006-01-08 01:00:51 -08001424
KOSAKI Motohiro596d7cf2010-08-09 17:19:01 -07001425 if (!scratch)
1426 return -ENOMEM;
Christoph Lameter39743882006-01-08 01:00:51 -08001427
KOSAKI Motohiro596d7cf2010-08-09 17:19:01 -07001428 old = &scratch->mask1;
1429 new = &scratch->mask2;
1430
1431 err = get_nodes(old, old_nodes, maxnode);
Christoph Lameter39743882006-01-08 01:00:51 -08001432 if (err)
KOSAKI Motohiro596d7cf2010-08-09 17:19:01 -07001433 goto out;
1434
1435 err = get_nodes(new, new_nodes, maxnode);
1436 if (err)
1437 goto out;
Christoph Lameter39743882006-01-08 01:00:51 -08001438
1439 /* Find the mm_struct */
Zeng Zhaoming55cfaa32010-12-02 14:31:13 -08001440 rcu_read_lock();
Pavel Emelyanov228ebcb2007-10-18 23:40:16 -07001441 task = pid ? find_task_by_vpid(pid) : current;
Christoph Lameter39743882006-01-08 01:00:51 -08001442 if (!task) {
Zeng Zhaoming55cfaa32010-12-02 14:31:13 -08001443 rcu_read_unlock();
KOSAKI Motohiro596d7cf2010-08-09 17:19:01 -07001444 err = -ESRCH;
1445 goto out;
Christoph Lameter39743882006-01-08 01:00:51 -08001446 }
Christoph Lameter3268c632012-03-21 16:34:06 -07001447 get_task_struct(task);
Christoph Lameter39743882006-01-08 01:00:51 -08001448
KOSAKI Motohiro596d7cf2010-08-09 17:19:01 -07001449 err = -EINVAL;
Christoph Lameter39743882006-01-08 01:00:51 -08001450
1451 /*
1452 * Check if this process has the right to modify the specified
1453 * process. The right exists if the process has administrative
Alexey Dobriyan7f927fc2006-03-28 01:56:53 -08001454 * capabilities, superuser privileges or the same
Christoph Lameter39743882006-01-08 01:00:51 -08001455 * userid as the target process.
1456 */
David Howellsc69e8d92008-11-14 10:39:19 +11001457 tcred = __task_cred(task);
Eric W. Biedermanb38a86e2012-03-12 15:48:24 -07001458 if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1459 !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
Christoph Lameter74c00242006-03-14 19:50:21 -08001460 !capable(CAP_SYS_NICE)) {
David Howellsc69e8d92008-11-14 10:39:19 +11001461 rcu_read_unlock();
Christoph Lameter39743882006-01-08 01:00:51 -08001462 err = -EPERM;
Christoph Lameter3268c632012-03-21 16:34:06 -07001463 goto out_put;
Christoph Lameter39743882006-01-08 01:00:51 -08001464 }
David Howellsc69e8d92008-11-14 10:39:19 +11001465 rcu_read_unlock();
Christoph Lameter39743882006-01-08 01:00:51 -08001466
1467 task_nodes = cpuset_mems_allowed(task);
1468 /* Is the user allowed to access the target nodes? */
KOSAKI Motohiro596d7cf2010-08-09 17:19:01 -07001469 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
Christoph Lameter39743882006-01-08 01:00:51 -08001470 err = -EPERM;
Christoph Lameter3268c632012-03-21 16:34:06 -07001471 goto out_put;
Christoph Lameter39743882006-01-08 01:00:51 -08001472 }
1473
Lai Jiangshan01f13bd2012-12-12 13:51:33 -08001474 if (!nodes_subset(*new, node_states[N_MEMORY])) {
Christoph Lameter3b42d282007-08-31 00:12:08 -07001475 err = -EINVAL;
Christoph Lameter3268c632012-03-21 16:34:06 -07001476 goto out_put;
Christoph Lameter3b42d282007-08-31 00:12:08 -07001477 }
1478
David Quigley86c3a762006-06-23 02:04:02 -07001479 err = security_task_movememory(task);
1480 if (err)
Christoph Lameter3268c632012-03-21 16:34:06 -07001481 goto out_put;
David Quigley86c3a762006-06-23 02:04:02 -07001482
Christoph Lameter3268c632012-03-21 16:34:06 -07001483 mm = get_task_mm(task);
1484 put_task_struct(task);
Sasha Levinf2a9ef82012-04-25 16:01:52 -07001485
1486 if (!mm) {
Christoph Lameter3268c632012-03-21 16:34:06 -07001487 err = -EINVAL;
Sasha Levinf2a9ef82012-04-25 16:01:52 -07001488 goto out;
1489 }
1490
1491 err = do_migrate_pages(mm, old, new,
1492 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
Christoph Lameter3268c632012-03-21 16:34:06 -07001493
1494 mmput(mm);
1495out:
KOSAKI Motohiro596d7cf2010-08-09 17:19:01 -07001496 NODEMASK_SCRATCH_FREE(scratch);
1497
Christoph Lameter39743882006-01-08 01:00:51 -08001498 return err;
Christoph Lameter3268c632012-03-21 16:34:06 -07001499
1500out_put:
1501 put_task_struct(task);
1502 goto out;
1503
Christoph Lameter39743882006-01-08 01:00:51 -08001504}
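A hedged user-space sketch of the interface above (assuming libnuma's <numaif.h> wrapper for migrate_pages(2)): move whatever the target pid has on node 0 over to node 1, subject to the credential and cpuset checks performed in the syscall.

#include <numaif.h>
#include <stdio.h>

/* Returns the number of pages that could not be moved, or -1 on error. */
static long move_node0_to_node1(int pid)
{
	unsigned long from = 1UL << 0;
	unsigned long to = 1UL << 1;
	long ret;

	ret = migrate_pages(pid, sizeof(from) * 8 + 1, &from, &to);
	if (ret < 0)
		perror("migrate_pages");
	return ret;
}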
1505
1506
Christoph Lameter8bccd852005-10-29 18:16:59 -07001507/* Retrieve NUMA policy */
Heiko Carstens938bb9f2009-01-14 14:14:30 +01001508SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1509 unsigned long __user *, nmask, unsigned long, maxnode,
1510 unsigned long, addr, unsigned long, flags)
Christoph Lameter8bccd852005-10-29 18:16:59 -07001511{
Adrian Bunkdbcb0f12007-10-16 01:26:26 -07001512 int err;
1513 int uninitialized_var(pval);
Christoph Lameter8bccd852005-10-29 18:16:59 -07001514 nodemask_t nodes;
1515
1516 if (nmask != NULL && maxnode < MAX_NUMNODES)
1517 return -EINVAL;
1518
1519 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1520
1521 if (err)
1522 return err;
1523
1524 if (policy && put_user(pval, policy))
1525 return -EFAULT;
1526
1527 if (nmask)
1528 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1529
1530 return err;
1531}
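A hedged user-space sketch of the query side (assuming libnuma's <numaif.h>): MPOL_F_ADDR asks for the policy of the VMA covering addr rather than the task policy. Note the maxnode check above requires the buffer to cover at least MAX_NUMNODES bits, so the mask here is sized for kernels with up to 1024 nodes.

#include <numaif.h>
#include <stdio.h>

static void show_policy_of(void *addr)
{
	int mode;
	unsigned long nodemask[16] = { 0 };	/* 1024 bits */

	if (get_mempolicy(&mode, nodemask, sizeof(nodemask) * 8,
			  addr, MPOL_F_ADDR) == 0)
		printf("mode=%d first nodemask word=%#lx\n",
		       mode, nodemask[0]);
	else
		perror("get_mempolicy");
}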
1532
Linus Torvalds1da177e2005-04-16 15:20:36 -07001533#ifdef CONFIG_COMPAT
1534
Heiko Carstensc93e0f62014-03-03 16:32:26 +01001535COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1536 compat_ulong_t __user *, nmask,
1537 compat_ulong_t, maxnode,
1538 compat_ulong_t, addr, compat_ulong_t, flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001539{
1540 long err;
1541 unsigned long __user *nm = NULL;
1542 unsigned long nr_bits, alloc_size;
1543 DECLARE_BITMAP(bm, MAX_NUMNODES);
1544
1545 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1546 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1547
1548 if (nmask)
1549 nm = compat_alloc_user_space(alloc_size);
1550
1551 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1552
1553 if (!err && nmask) {
KAMEZAWA Hiroyuki2bbff6c2011-09-14 16:21:02 -07001554 unsigned long copy_size;
1555 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1556 err = copy_from_user(bm, nm, copy_size);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001557 /* ensure entire bitmap is zeroed */
1558 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1559 err |= compat_put_bitmap(nmask, bm, nr_bits);
1560 }
1561
1562 return err;
1563}
1564
Heiko Carstensc93e0f62014-03-03 16:32:26 +01001565COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1566 compat_ulong_t, maxnode)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001567{
1568 long err = 0;
1569 unsigned long __user *nm = NULL;
1570 unsigned long nr_bits, alloc_size;
1571 DECLARE_BITMAP(bm, MAX_NUMNODES);
1572
1573 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1574 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1575
1576 if (nmask) {
1577 err = compat_get_bitmap(bm, nmask, nr_bits);
1578 nm = compat_alloc_user_space(alloc_size);
1579 err |= copy_to_user(nm, bm, alloc_size);
1580 }
1581
1582 if (err)
1583 return -EFAULT;
1584
1585 return sys_set_mempolicy(mode, nm, nr_bits+1);
1586}
1587
Heiko Carstensc93e0f62014-03-03 16:32:26 +01001588COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1589 compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1590 compat_ulong_t, maxnode, compat_ulong_t, flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001591{
1592 long err = 0;
1593 unsigned long __user *nm = NULL;
1594 unsigned long nr_bits, alloc_size;
Andi Kleendfcd3c02005-10-29 18:15:48 -07001595 nodemask_t bm;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001596
1597 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1598 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1599
1600 if (nmask) {
Andi Kleendfcd3c02005-10-29 18:15:48 -07001601 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001602 nm = compat_alloc_user_space(alloc_size);
Andi Kleendfcd3c02005-10-29 18:15:48 -07001603 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001604 }
1605
1606 if (err)
1607 return -EFAULT;
1608
1609 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1610}
1611
1612#endif
1613
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001614/*
1615 * get_vma_policy(@task, @vma, @addr)
Fabian Frederickb46e14a2014-06-04 16:08:18 -07001616 * @task: task for fallback if vma policy == default
1617 * @vma: virtual memory area whose policy is sought
1618 * @addr: address in @vma for shared policy lookup
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001619 *
1620 * Returns effective policy for a VMA at specified address.
1621 * Falls back to @task or system default policy, as necessary.
David Rientjes32f85162012-10-16 17:31:23 -07001622 * Current or other task's task mempolicy and non-shared vma policies must be
1623 * protected by task_lock(task) by the caller.
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001624 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1625 * count--added by the get_policy() vm_op, as appropriate--to protect against
1626 * freeing by another task. It is the caller's responsibility to free the
1627 * extra reference for shared policies.
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001628 */
Stephen Wilsond98f6cb2011-05-24 17:12:41 -07001629struct mempolicy *get_vma_policy(struct task_struct *task,
Christoph Lameter48fce342006-01-08 01:01:03 -08001630 struct vm_area_struct *vma, unsigned long addr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001631{
Mel Gorman5606e382012-11-02 18:19:13 +00001632 struct mempolicy *pol = get_task_policy(task);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001633
1634 if (vma) {
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001635 if (vma->vm_ops && vma->vm_ops->get_policy) {
Lee Schermerhornae4d8c12008-04-28 02:13:11 -07001636 struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1637 addr);
1638 if (vpol)
1639 pol = vpol;
Mel Gorman00442ad2012-10-08 16:29:20 -07001640 } else if (vma->vm_policy) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001641 pol = vma->vm_policy;
Mel Gorman00442ad2012-10-08 16:29:20 -07001642
1643 /*
1644 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1645 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1646 * count on these policies which will be dropped by
1647 * mpol_cond_put() later
1648 */
1649 if (mpol_needs_cond_ref(pol))
1650 mpol_get(pol);
1651 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001652 }
1653 if (!pol)
1654 pol = &default_policy;
1655 return pol;
1656}
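A minimal kernel-context sketch of the reference-counting contract described in the comment above (not buildable on its own; the helper name is made up for illustration): look the policy up, use it, then drop the extra reference only when it was a shared policy.

static int example_policy_mode(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);
	int mode = pol->mode;

	mpol_cond_put(pol);	/* no-op unless pol is MPOL_F_SHARED */
	return mode;
}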
1657
Mel Gormanfc3147242013-10-07 11:29:09 +01001658bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
1659{
1660 struct mempolicy *pol = get_task_policy(task);
1661 if (vma) {
1662 if (vma->vm_ops && vma->vm_ops->get_policy) {
1663 bool ret = false;
1664
1665 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1666 if (pol && (pol->flags & MPOL_F_MOF))
1667 ret = true;
1668 mpol_cond_put(pol);
1669
1670 return ret;
1671 } else if (vma->vm_policy) {
1672 pol = vma->vm_policy;
1673 }
1674 }
1675
1676 if (!pol)
1677 return default_policy.flags & MPOL_F_MOF;
1678
1679 return pol->flags & MPOL_F_MOF;
1680}
1681
Lai Jiangshand3eb1572013-02-22 16:33:22 -08001682static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1683{
1684 enum zone_type dynamic_policy_zone = policy_zone;
1685
1686 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1687
1688 /*
1689 * If policy->v.nodes has only movable memory, we apply the policy
1690 * only when gfp_zone(gfp) == ZONE_MOVABLE.
1691 *
1692 * policy->v.nodes is intersected with node_states[N_MEMORY], so if
1693 * the following test fails, it implies that policy->v.nodes contains
1694 * movable memory only.
1695 */
1696 if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1697 dynamic_policy_zone = ZONE_MOVABLE;
1698
1699 return zone >= dynamic_policy_zone;
1700}
1701
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001702/*
1703 * Return a nodemask representing a mempolicy for filtering nodes for
1704 * page allocation
1705 */
1706static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
Mel Gorman19770b32008-04-28 02:12:18 -07001707{
1708 /* Lower zones don't get a nodemask applied for MPOL_BIND */
Lee Schermerhorn45c47452008-04-28 02:13:12 -07001709 if (unlikely(policy->mode == MPOL_BIND) &&
Lai Jiangshand3eb1572013-02-22 16:33:22 -08001710 apply_policy_zone(policy, gfp_zone(gfp)) &&
Mel Gorman19770b32008-04-28 02:12:18 -07001711 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1712 return &policy->v.nodes;
1713
1714 return NULL;
1715}
1716
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001717/* Return a zonelist indicated by gfp for node representing a mempolicy */
Andi Kleen2f5f9482011-03-04 17:36:29 -08001718static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1719 int nd)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001720{
Lee Schermerhorn45c47452008-04-28 02:13:12 -07001721 switch (policy->mode) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001722 case MPOL_PREFERRED:
Lee Schermerhornfc36b8d2008-04-28 02:13:21 -07001723 if (!(policy->flags & MPOL_F_LOCAL))
1724 nd = policy->v.preferred_node;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001725 break;
1726 case MPOL_BIND:
Mel Gorman19770b32008-04-28 02:12:18 -07001727 /*
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001728 * Normally, MPOL_BIND allocations are node-local within the
1729 * allowed nodemask. However, if __GFP_THISNODE is set and the
Bob Liu6eb27e12010-05-24 14:32:00 -07001730 * current node isn't part of the mask, we use the zonelist for
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001731 * the first node in the mask instead.
Mel Gorman19770b32008-04-28 02:12:18 -07001732 */
Mel Gorman19770b32008-04-28 02:12:18 -07001733 if (unlikely(gfp & __GFP_THISNODE) &&
1734 unlikely(!node_isset(nd, policy->v.nodes)))
1735 nd = first_node(policy->v.nodes);
1736 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001737 default:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001738 BUG();
1739 }
Mel Gorman0e884602008-04-28 02:12:14 -07001740 return node_zonelist(nd, gfp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001741}
1742
1743/* Do dynamic interleaving for a process */
1744static unsigned interleave_nodes(struct mempolicy *policy)
1745{
1746 unsigned nid, next;
1747 struct task_struct *me = current;
1748
1749 nid = me->il_next;
Andi Kleendfcd3c02005-10-29 18:15:48 -07001750 next = next_node(nid, policy->v.nodes);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001751 if (next >= MAX_NUMNODES)
Andi Kleendfcd3c02005-10-29 18:15:48 -07001752 next = first_node(policy->v.nodes);
David Rientjesf5b087b2008-04-28 02:12:27 -07001753 if (next < MAX_NUMNODES)
1754 me->il_next = next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001755 return nid;
1756}
1757
Christoph Lameterdc85da12006-01-18 17:42:36 -08001758/*
1759 * Depending on the memory policy provide a node from which to allocate the
1760 * next slab entry.
1761 */
David Rientjes2a389612014-04-07 15:37:29 -07001762unsigned int mempolicy_slab_node(void)
Christoph Lameterdc85da12006-01-18 17:42:36 -08001763{
Andi Kleene7b691b2012-06-09 02:40:03 -07001764 struct mempolicy *policy;
David Rientjes2a389612014-04-07 15:37:29 -07001765 int node = numa_mem_id();
Andi Kleene7b691b2012-06-09 02:40:03 -07001766
1767 if (in_interrupt())
David Rientjes2a389612014-04-07 15:37:29 -07001768 return node;
Andi Kleene7b691b2012-06-09 02:40:03 -07001769
1770 policy = current->mempolicy;
Lee Schermerhornfc36b8d2008-04-28 02:13:21 -07001771 if (!policy || policy->flags & MPOL_F_LOCAL)
David Rientjes2a389612014-04-07 15:37:29 -07001772 return node;
Christoph Lameter765c4502006-09-27 01:50:08 -07001773
Lee Schermerhornbea904d2008-04-28 02:13:18 -07001774 switch (policy->mode) {
1775 case MPOL_PREFERRED:
Lee Schermerhornfc36b8d2008-04-28 02:13:21 -07001776 /*
1777 * handled MPOL_F_LOCAL above
1778 */
1779 return policy->v.preferred_node;
Lee Schermerhornbea904d2008-04-28 02:13:18 -07001780
Christoph Lameterdc85da12006-01-18 17:42:36 -08001781 case MPOL_INTERLEAVE:
1782 return interleave_nodes(policy);
1783
Mel Gormandd1a2392008-04-28 02:12:17 -07001784 case MPOL_BIND: {
Christoph Lameterdc85da12006-01-18 17:42:36 -08001785 /*
1786 * Follow bind policy behavior and start allocation at the
1787 * first node.
1788 */
Mel Gorman19770b32008-04-28 02:12:18 -07001789 struct zonelist *zonelist;
1790 struct zone *zone;
1791 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
David Rientjes2a389612014-04-07 15:37:29 -07001792 zonelist = &NODE_DATA(node)->node_zonelists[0];
Mel Gorman19770b32008-04-28 02:12:18 -07001793 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1794 &policy->v.nodes,
1795 &zone);
David Rientjes2a389612014-04-07 15:37:29 -07001796 return zone ? zone->node : node;
Mel Gormandd1a2392008-04-28 02:12:17 -07001797 }
Christoph Lameterdc85da12006-01-18 17:42:36 -08001798
Christoph Lameterdc85da12006-01-18 17:42:36 -08001799 default:
Lee Schermerhornbea904d2008-04-28 02:13:18 -07001800 BUG();
Christoph Lameterdc85da12006-01-18 17:42:36 -08001801 }
1802}
1803
Linus Torvalds1da177e2005-04-16 15:20:36 -07001804/* Do static interleaving for a VMA with known offset. */
1805static unsigned offset_il_node(struct mempolicy *pol,
1806 struct vm_area_struct *vma, unsigned long off)
1807{
Andi Kleendfcd3c02005-10-29 18:15:48 -07001808 unsigned nnodes = nodes_weight(pol->v.nodes);
David Rientjesf5b087b2008-04-28 02:12:27 -07001809 unsigned target;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001810 int c;
Jianguo Wub76ac7e2013-11-12 15:07:39 -08001811 int nid = NUMA_NO_NODE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001812
David Rientjesf5b087b2008-04-28 02:12:27 -07001813 if (!nnodes)
1814 return numa_node_id();
1815 target = (unsigned int)off % nnodes;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001816 c = 0;
1817 do {
Andi Kleendfcd3c02005-10-29 18:15:48 -07001818 nid = next_node(nid, pol->v.nodes);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001819 c++;
1820 } while (c <= target);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001821 return nid;
1822}
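The computation above reduces to "take off modulo the number of set nodes, then walk to that set node". An illustrative stand-alone helper over a plain array of node ids (not kernel code): with nodes {0, 2, 5} and off = 7, 7 % 3 == 1 selects node 2.

/* Illustration only: the n-th node of the mask, n = off % nnodes. */
static int il_node_for_offset(const int *nodes, int nnodes, unsigned long off)
{
	if (nnodes == 0)
		return -1;	/* caller falls back to the local node */
	return nodes[off % nnodes];
}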
1823
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001824/* Determine a node number for interleave */
1825static inline unsigned interleave_nid(struct mempolicy *pol,
1826 struct vm_area_struct *vma, unsigned long addr, int shift)
1827{
1828 if (vma) {
1829 unsigned long off;
1830
Nishanth Aravamudan3b98b082006-08-31 21:27:53 -07001831 /*
1832 * for small pages, there is no difference between
1833 * shift and PAGE_SHIFT, so the bit-shift is safe.
1834 * for huge pages, since vm_pgoff is in units of small
1835 * pages, we need to shift off the always 0 bits to get
1836 * a useful offset.
1837 */
1838 BUG_ON(shift < PAGE_SHIFT);
1839 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001840 off += (addr - vma->vm_start) >> shift;
1841 return offset_il_node(pol, vma, off);
1842 } else
1843 return interleave_nodes(pol);
1844}
1845
Michal Hocko778d3b02011-07-26 16:08:30 -07001846/*
1847 * Return the bit number of a random bit set in the nodemask.
Jianguo Wub76ac7e2013-11-12 15:07:39 -08001848 * (returns NUMA_NO_NODE if nodemask is empty)
Michal Hocko778d3b02011-07-26 16:08:30 -07001849 */
1850int node_random(const nodemask_t *maskp)
1851{
Jianguo Wub76ac7e2013-11-12 15:07:39 -08001852 int w, bit = NUMA_NO_NODE;
Michal Hocko778d3b02011-07-26 16:08:30 -07001853
1854 w = nodes_weight(*maskp);
1855 if (w)
1856 bit = bitmap_ord_to_pos(maskp->bits,
1857 get_random_int() % w, MAX_NUMNODES);
1858 return bit;
1859}
1860
Chen, Kenneth W00ac59a2006-02-03 21:51:14 +01001861#ifdef CONFIG_HUGETLBFS
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001862/*
1863 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol, @nodemask)
Fabian Frederickb46e14a2014-06-04 16:08:18 -07001864 * @vma: virtual memory area whose policy is sought
1865 * @addr: address in @vma for shared policy lookup and interleave policy
1866 * @gfp_flags: for requested zone
1867 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1868 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001869 *
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001870 * Returns a zonelist suitable for a huge page allocation and a pointer
1871 * to the struct mempolicy for conditional unref after allocation.
1872 * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1873 * @nodemask for filtering the zonelist.
Miao Xiec0ff7452010-05-24 14:32:08 -07001874 *
Mel Gormand26914d2014-04-03 14:47:24 -07001875 * Must be protected by read_mems_allowed_begin()
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001876 */
Mel Gorman396faf02007-07-17 04:03:13 -07001877struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
Mel Gorman19770b32008-04-28 02:12:18 -07001878 gfp_t gfp_flags, struct mempolicy **mpol,
1879 nodemask_t **nodemask)
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001880{
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001881 struct zonelist *zl;
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001882
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001883 *mpol = get_vma_policy(current, vma, addr);
Mel Gorman19770b32008-04-28 02:12:18 -07001884 *nodemask = NULL; /* assume !MPOL_BIND */
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001885
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001886 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1887 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
Andi Kleena5516432008-07-23 21:27:41 -07001888 huge_page_shift(hstate_vma(vma))), gfp_flags);
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001889 } else {
Andi Kleen2f5f9482011-03-04 17:36:29 -08001890 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001891 if ((*mpol)->mode == MPOL_BIND)
1892 *nodemask = &(*mpol)->v.nodes;
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001893 }
1894 return zl;
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001895}
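A schematic kernel-context caller following the read_mems_allowed_begin() note above; the gfp mask and the order-0 allocation are placeholders standing in for the real hugetlb pool logic, which lives outside this file.

static struct page *example_huge_alloc(struct vm_area_struct *vma,
				       unsigned long addr, gfp_t gfp)
{
	struct mempolicy *mpol;
	nodemask_t *nodemask;
	struct zonelist *zl;
	struct page *page;
	unsigned int cookie;

retry:
	cookie = read_mems_allowed_begin();
	zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask);
	page = __alloc_pages_nodemask(gfp, 0, zl, nodemask);
	mpol_cond_put(mpol);		/* drop ref taken for shared policies */
	if (!page && read_mems_allowed_retry(cookie))
		goto retry;
	return page;
}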
Lee Schermerhorn06808b02009-12-14 17:58:21 -08001896
1897/*
1898 * init_nodemask_of_mempolicy
1899 *
1900 * If the current task's mempolicy is "default" [NULL], return 'false'
1901 * to indicate default policy. Otherwise, extract the policy nodemask
1902 * for 'bind' or 'interleave' policy into the argument nodemask, or
1903 * initialize the argument nodemask to contain the single node for
1904 * 'preferred' or 'local' policy and return 'true' to indicate presence
1905 * of non-default mempolicy.
1906 *
1907 * We don't bother with reference counting the mempolicy [mpol_get/put]
1908 * because the current task is examining its own mempolicy and a task's
1909 * mempolicy is only ever changed by the task itself.
1910 *
1911 * N.B., it is the caller's responsibility to free a returned nodemask.
1912 */
1913bool init_nodemask_of_mempolicy(nodemask_t *mask)
1914{
1915 struct mempolicy *mempolicy;
1916 int nid;
1917
1918 if (!(mask && current->mempolicy))
1919 return false;
1920
Miao Xiec0ff7452010-05-24 14:32:08 -07001921 task_lock(current);
Lee Schermerhorn06808b02009-12-14 17:58:21 -08001922 mempolicy = current->mempolicy;
1923 switch (mempolicy->mode) {
1924 case MPOL_PREFERRED:
1925 if (mempolicy->flags & MPOL_F_LOCAL)
1926 nid = numa_node_id();
1927 else
1928 nid = mempolicy->v.preferred_node;
1929 init_nodemask_of_node(mask, nid);
1930 break;
1931
1932 case MPOL_BIND:
1933 /* Fall through */
1934 case MPOL_INTERLEAVE:
1935 *mask = mempolicy->v.nodes;
1936 break;
1937
1938 default:
1939 BUG();
1940 }
Miao Xiec0ff7452010-05-24 14:32:08 -07001941 task_unlock(current);
Lee Schermerhorn06808b02009-12-14 17:58:21 -08001942
1943 return true;
1944}
Chen, Kenneth W00ac59a2006-02-03 21:51:14 +01001945#endif
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001946
David Rientjes6f48d0eb2010-08-09 17:18:52 -07001947/*
1948 * mempolicy_nodemask_intersects
1949 *
1950 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1951 * policy. Otherwise, check for intersection between mask and the policy
1952 * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
1953 * policy, always return true since it may allocate elsewhere on fallback.
1954 *
1955 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1956 */
1957bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1958 const nodemask_t *mask)
1959{
1960 struct mempolicy *mempolicy;
1961 bool ret = true;
1962
1963 if (!mask)
1964 return ret;
1965 task_lock(tsk);
1966 mempolicy = tsk->mempolicy;
1967 if (!mempolicy)
1968 goto out;
1969
1970 switch (mempolicy->mode) {
1971 case MPOL_PREFERRED:
1972 /*
1973 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1974 * allocate from, they may fallback to other nodes when oom.
1975 * Thus, it's possible for tsk to have allocated memory from
1976 * nodes in mask.
1977 */
1978 break;
1979 case MPOL_BIND:
1980 case MPOL_INTERLEAVE:
1981 ret = nodes_intersects(mempolicy->v.nodes, *mask);
1982 break;
1983 default:
1984 BUG();
1985 }
1986out:
1987 task_unlock(tsk);
1988 return ret;
1989}
1990
Linus Torvalds1da177e2005-04-16 15:20:36 -07001991/* Allocate a page in interleaved policy.
1992 * Own path because it needs to do special accounting. */
Andi Kleen662f3a02005-10-29 18:15:49 -07001993static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1994 unsigned nid)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001995{
1996 struct zonelist *zl;
1997 struct page *page;
1998
Mel Gorman0e884602008-04-28 02:12:14 -07001999 zl = node_zonelist(nid, gfp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002000 page = __alloc_pages(gfp, order, zl);
Mel Gormandd1a2392008-04-28 02:12:17 -07002001 if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
Christoph Lameterca889e62006-06-30 01:55:44 -07002002 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002003 return page;
2004}
2005
2006/**
Andrea Arcangeli0bbbc0b2011-01-13 15:47:05 -08002007 * alloc_pages_vma - Allocate a page for a VMA.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002008 *
2009 * @gfp:
2010 * %GFP_USER user allocation.
2011 * %GFP_KERNEL kernel allocations,
2012 * %GFP_HIGHMEM highmem/user allocations,
2013 * %GFP_FS allocation should not call back into a file system.
2014 * %GFP_ATOMIC don't sleep.
2015 *
Andrea Arcangeli0bbbc0b2011-01-13 15:47:05 -08002016 * @order:Order of the GFP allocation.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002017 * @vma: Pointer to VMA or NULL if not available.
2018 * @addr: Virtual Address of the allocation. Must be inside the VMA.
2019 *
2020 * This function allocates a page from the kernel page pool and applies
2021 * a NUMA policy associated with the VMA or the current process.
2022 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
2023 * mm_struct of the VMA to prevent it from going away. Should be used for
2024 * all allocations for pages that will be mapped into
2025 * user space. Returns NULL when no page can be allocated.
2026 *
2027 * Should be called with the mmap_sem of the vma held.
2028 */
2029struct page *
Andrea Arcangeli0bbbc0b2011-01-13 15:47:05 -08002030alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
Andi Kleen2f5f9482011-03-04 17:36:29 -08002031 unsigned long addr, int node)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002032{
Mel Gormancc9a6c82012-03-21 16:34:11 -07002033 struct mempolicy *pol;
Miao Xiec0ff7452010-05-24 14:32:08 -07002034 struct page *page;
Mel Gormancc9a6c82012-03-21 16:34:11 -07002035 unsigned int cpuset_mems_cookie;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002036
Mel Gormancc9a6c82012-03-21 16:34:11 -07002037retry_cpuset:
2038 pol = get_vma_policy(current, vma, addr);
Mel Gormand26914d2014-04-03 14:47:24 -07002039 cpuset_mems_cookie = read_mems_allowed_begin();
Mel Gormancc9a6c82012-03-21 16:34:11 -07002040
Lee Schermerhorn45c47452008-04-28 02:13:12 -07002041 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002042 unsigned nid;
Christoph Lameter5da7ca82006-01-06 00:10:46 -08002043
Andi Kleen8eac5632011-02-25 14:44:28 -08002044 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07002045 mpol_cond_put(pol);
Andrea Arcangeli0bbbc0b2011-01-13 15:47:05 -08002046 page = alloc_page_interleave(gfp, order, nid);
Mel Gormand26914d2014-04-03 14:47:24 -07002047 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
Mel Gormancc9a6c82012-03-21 16:34:11 -07002048 goto retry_cpuset;
2049
Miao Xiec0ff7452010-05-24 14:32:08 -07002050 return page;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002051 }
David Rientjes212a0a62012-12-11 16:02:51 -08002052 page = __alloc_pages_nodemask(gfp, order,
2053 policy_zonelist(gfp, pol, node),
Andrea Arcangeli0bbbc0b2011-01-13 15:47:05 -08002054 policy_nodemask(gfp, pol));
David Rientjes212a0a62012-12-11 16:02:51 -08002055 if (unlikely(mpol_needs_cond_ref(pol)))
2056 __mpol_put(pol);
Mel Gormand26914d2014-04-03 14:47:24 -07002057 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
Mel Gormancc9a6c82012-03-21 16:34:11 -07002058 goto retry_cpuset;
Miao Xiec0ff7452010-05-24 14:32:08 -07002059 return page;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002060}
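Most callers go through the order-0 alloc_page_vma() wrapper, as new_vma_page() earlier in this file does. A minimal kernel-context sketch of that caller side (fault handling omitted):

static struct page *example_fault_alloc(struct vm_area_struct *vma,
					unsigned long address)
{
	/* caller holds down_read(&vma->vm_mm->mmap_sem) */
	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
}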
2061
2062/**
2063 * alloc_pages_current - Allocate pages.
2064 *
2065 * @gfp:
2066 * %GFP_USER user allocation,
2067 * %GFP_KERNEL kernel allocation,
2068 * %GFP_HIGHMEM highmem allocation,
2069 * %GFP_FS don't call back into a file system.
2070 * %GFP_ATOMIC don't sleep.
2071 * @order: Power of two of allocation size in pages. 0 is a single page.
2072 *
2073 * Allocate a page from the kernel page pool and, when not in
2074 * interrupt context, apply the current process' NUMA policy.
2075 * Returns NULL when no page can be allocated.
2076 *
Paul Jacksoncf2a473c2006-01-08 01:01:54 -08002077 * Don't call cpuset_update_task_memory_state() unless
Linus Torvalds1da177e2005-04-16 15:20:36 -07002078 * 1) it's ok to take cpuset_sem (can WAIT), and
2079 * 2) allocating for current task (not interrupt).
2080 */
Al Virodd0fc662005-10-07 07:46:04 +01002081struct page *alloc_pages_current(gfp_t gfp, unsigned order)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002082{
Mel Gorman5606e382012-11-02 18:19:13 +00002083 struct mempolicy *pol = get_task_policy(current);
Miao Xiec0ff7452010-05-24 14:32:08 -07002084 struct page *page;
Mel Gormancc9a6c82012-03-21 16:34:11 -07002085 unsigned int cpuset_mems_cookie;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002086
Christoph Lameter9b819d22006-09-25 23:31:40 -07002087 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002088 pol = &default_policy;
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07002089
Mel Gormancc9a6c82012-03-21 16:34:11 -07002090retry_cpuset:
Mel Gormand26914d2014-04-03 14:47:24 -07002091 cpuset_mems_cookie = read_mems_allowed_begin();
Mel Gormancc9a6c82012-03-21 16:34:11 -07002092
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07002093 /*
2094 * No reference counting needed for current->mempolicy
2095 * nor system default_policy
2096 */
Lee Schermerhorn45c47452008-04-28 02:13:12 -07002097 if (pol->mode == MPOL_INTERLEAVE)
Miao Xiec0ff7452010-05-24 14:32:08 -07002098 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2099 else
2100 page = __alloc_pages_nodemask(gfp, order,
Andi Kleen5c4b4be2011-03-04 17:36:32 -08002101 policy_zonelist(gfp, pol, numa_node_id()),
2102 policy_nodemask(gfp, pol));
Mel Gormancc9a6c82012-03-21 16:34:11 -07002103
Mel Gormand26914d2014-04-03 14:47:24 -07002104 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
Mel Gormancc9a6c82012-03-21 16:34:11 -07002105 goto retry_cpuset;
2106
Miao Xiec0ff7452010-05-24 14:32:08 -07002107 return page;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002108}
2109EXPORT_SYMBOL(alloc_pages_current);
2110
Oleg Nesterovef0855d2013-09-11 14:20:14 -07002111int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2112{
2113 struct mempolicy *pol = mpol_dup(vma_policy(src));
2114
2115 if (IS_ERR(pol))
2116 return PTR_ERR(pol);
2117 dst->vm_policy = pol;
2118 return 0;
2119}
2120
Paul Jackson42253992006-01-08 01:01:59 -08002121/*
Lee Schermerhorn846a16b2008-04-28 02:13:09 -07002122 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
Paul Jackson42253992006-01-08 01:01:59 -08002123 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2124 * with the mems_allowed returned by cpuset_mems_allowed(). This
2125 * keeps mempolicies cpuset relative after its cpuset moves. See
2126 * further kernel/cpuset.c update_nodemask().
Miao Xie708c1bb2010-05-24 14:32:07 -07002127 *
2128 * current's mempolicy may be rebinded by the other task(the task that changes
2129 * cpuset's mems), so we needn't do rebind work for current task.
Paul Jackson42253992006-01-08 01:01:59 -08002130 */
Paul Jackson42253992006-01-08 01:01:59 -08002131
Lee Schermerhorn846a16b2008-04-28 02:13:09 -07002132/* Slow path of a mempolicy duplicate */
2133struct mempolicy *__mpol_dup(struct mempolicy *old)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002134{
2135 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2136
2137 if (!new)
2138 return ERR_PTR(-ENOMEM);
Miao Xie708c1bb2010-05-24 14:32:07 -07002139
2140 /* task's mempolicy is protected by alloc_lock */
2141 if (old == current->mempolicy) {
2142 task_lock(current);
2143 *new = *old;
2144 task_unlock(current);
2145 } else
2146 *new = *old;
2147
Paul E. McKenney99ee4ca2010-03-03 17:50:17 -08002148 rcu_read_lock();
Paul Jackson42253992006-01-08 01:01:59 -08002149 if (current_cpuset_is_being_rebound()) {
2150 nodemask_t mems = cpuset_mems_allowed(current);
Miao Xie708c1bb2010-05-24 14:32:07 -07002151 if (new->flags & MPOL_F_REBINDING)
2152 mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2153 else
2154 mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
Paul Jackson42253992006-01-08 01:01:59 -08002155 }
Paul E. McKenney99ee4ca2010-03-03 17:50:17 -08002156 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002157 atomic_set(&new->refcnt, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002158 return new;
2159}
2160
2161/* Slow path of a mempolicy comparison */
KOSAKI Motohirofcfb4dc2012-01-10 15:08:21 -08002162bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002163{
2164 if (!a || !b)
KOSAKI Motohirofcfb4dc2012-01-10 15:08:21 -08002165 return false;
Lee Schermerhorn45c47452008-04-28 02:13:12 -07002166 if (a->mode != b->mode)
KOSAKI Motohirofcfb4dc2012-01-10 15:08:21 -08002167 return false;
Bob Liu19800502010-05-24 14:32:01 -07002168 if (a->flags != b->flags)
KOSAKI Motohirofcfb4dc2012-01-10 15:08:21 -08002169 return false;
Bob Liu19800502010-05-24 14:32:01 -07002170 if (mpol_store_user_nodemask(a))
2171 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
KOSAKI Motohirofcfb4dc2012-01-10 15:08:21 -08002172 return false;
Bob Liu19800502010-05-24 14:32:01 -07002173
Lee Schermerhorn45c47452008-04-28 02:13:12 -07002174 switch (a->mode) {
Mel Gorman19770b32008-04-28 02:12:18 -07002175 case MPOL_BIND:
2176 /* Fall through */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002177 case MPOL_INTERLEAVE:
KOSAKI Motohirofcfb4dc2012-01-10 15:08:21 -08002178 return !!nodes_equal(a->v.nodes, b->v.nodes);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002179 case MPOL_PREFERRED:
Namhyung Kim75719662011-03-22 16:33:02 -07002180 return a->v.preferred_node == b->v.preferred_node;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002181 default:
2182 BUG();
KOSAKI Motohirofcfb4dc2012-01-10 15:08:21 -08002183 return false;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002184 }
2185}
2186
Linus Torvalds1da177e2005-04-16 15:20:36 -07002187/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07002188 * Shared memory backing store policy support.
2189 *
2190 * Remember policies even when nobody has shared memory mapped.
2191 * The policies are kept in Red-Black tree linked from the inode.
2192 * They are protected by the sp->lock spinlock, which should be held
2193 * for any accesses to the tree.
2194 */
2195
2196/* lookup first element intersecting start-end */
Mel Gorman42288fe2012-12-21 23:10:25 +00002197/* Caller holds sp->lock */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002198static struct sp_node *
2199sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2200{
2201 struct rb_node *n = sp->root.rb_node;
2202
2203 while (n) {
2204 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2205
2206 if (start >= p->end)
2207 n = n->rb_right;
2208 else if (end <= p->start)
2209 n = n->rb_left;
2210 else
2211 break;
2212 }
2213 if (!n)
2214 return NULL;
2215 for (;;) {
2216 struct sp_node *w = NULL;
2217 struct rb_node *prev = rb_prev(n);
2218 if (!prev)
2219 break;
2220 w = rb_entry(prev, struct sp_node, nd);
2221 if (w->end <= start)
2222 break;
2223 n = prev;
2224 }
2225 return rb_entry(n, struct sp_node, nd);
2226}
2227
2228/* Insert a new shared policy into the list. */
2229/* Caller holds sp->lock */
2230static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2231{
2232 struct rb_node **p = &sp->root.rb_node;
2233 struct rb_node *parent = NULL;
2234 struct sp_node *nd;
2235
2236 while (*p) {
2237 parent = *p;
2238 nd = rb_entry(parent, struct sp_node, nd);
2239 if (new->start < nd->start)
2240 p = &(*p)->rb_left;
2241 else if (new->end > nd->end)
2242 p = &(*p)->rb_right;
2243 else
2244 BUG();
2245 }
2246 rb_link_node(&new->nd, parent, p);
2247 rb_insert_color(&new->nd, &sp->root);
Paul Mundt140d5a42007-07-15 23:38:16 -07002248 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
Lee Schermerhorn45c47452008-04-28 02:13:12 -07002249 new->policy ? new->policy->mode : 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002250}
2251
2252/* Find shared policy intersecting idx */
2253struct mempolicy *
2254mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2255{
2256 struct mempolicy *pol = NULL;
2257 struct sp_node *sn;
2258
2259 if (!sp->root.rb_node)
2260 return NULL;
Mel Gorman42288fe2012-12-21 23:10:25 +00002261 spin_lock(&sp->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002262 sn = sp_lookup(sp, idx, idx+1);
2263 if (sn) {
2264 mpol_get(sn->policy);
2265 pol = sn->policy;
2266 }
Mel Gorman42288fe2012-12-21 23:10:25 +00002267 spin_unlock(&sp->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002268 return pol;
2269}
2270
KOSAKI Motohiro63f74ca2012-10-08 16:29:19 -07002271static void sp_free(struct sp_node *n)
2272{
2273 mpol_put(n->policy);
2274 kmem_cache_free(sn_cache, n);
2275}
2276
Lee Schermerhorn771fb4d2012-10-25 14:16:30 +02002277/**
2278 * mpol_misplaced - check whether current page node is valid in policy
2279 *
Fabian Frederickb46e14a2014-06-04 16:08:18 -07002280 * @page: page to be checked
2281 * @vma: vm area where page mapped
2282 * @addr: virtual address where page mapped
Lee Schermerhorn771fb4d2012-10-25 14:16:30 +02002283 *
2284 * Look up the current policy node id for vma,addr and compare it to
2285 * the page's node id.
2286 *
2287 * Returns:
2288 * -1 - not misplaced, page is in the right node
2289 * node - node id where the page should be
2290 *
2291 * Policy determination "mimics" alloc_page_vma().
2292 * Called from fault path where we know the vma and faulting address.
2293 */
2294int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2295{
2296 struct mempolicy *pol;
2297 struct zone *zone;
2298 int curnid = page_to_nid(page);
2299 unsigned long pgoff;
Peter Zijlstra90572892013-10-07 11:29:20 +01002300 int thiscpu = raw_smp_processor_id();
2301 int thisnid = cpu_to_node(thiscpu);
Lee Schermerhorn771fb4d2012-10-25 14:16:30 +02002302 int polnid = -1;
2303 int ret = -1;
2304
2305 BUG_ON(!vma);
2306
2307 pol = get_vma_policy(current, vma, addr);
2308 if (!(pol->flags & MPOL_F_MOF))
2309 goto out;
2310
2311 switch (pol->mode) {
2312 case MPOL_INTERLEAVE:
2313 BUG_ON(addr >= vma->vm_end);
2314 BUG_ON(addr < vma->vm_start);
2315
2316 pgoff = vma->vm_pgoff;
2317 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2318 polnid = offset_il_node(pol, vma, pgoff);
2319 break;
2320
2321 case MPOL_PREFERRED:
2322 if (pol->flags & MPOL_F_LOCAL)
2323 polnid = numa_node_id();
2324 else
2325 polnid = pol->v.preferred_node;
2326 break;
2327
2328 case MPOL_BIND:
2329 /*
2330 * MPOL_BIND allows binding to multiple nodes.
2331 * Use the current page's node if it is in the policy nodemask,
2332 * else select the nearest allowed node, if any.
2333 * If there are no allowed nodes, use the current node [!misplaced].
2334 */
2335 if (node_isset(curnid, pol->v.nodes))
2336 goto out;
2337 (void)first_zones_zonelist(
2338 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2339 gfp_zone(GFP_HIGHUSER),
2340 &pol->v.nodes, &zone);
2341 polnid = zone->node;
2342 break;
2343
2344 default:
2345 BUG();
2346 }
Mel Gorman5606e382012-11-02 18:19:13 +00002347
2348 /* Migrate the page towards the node whose CPU is referencing it */
Mel Gormane42c8ff2012-11-12 09:17:07 +00002349 if (pol->flags & MPOL_F_MORON) {
Peter Zijlstra90572892013-10-07 11:29:20 +01002350 polnid = thisnid;
Mel Gorman5606e382012-11-02 18:19:13 +00002351
Rik van Riel10f39042014-01-27 17:03:44 -05002352 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
Rik van Rielde1c9ce2013-10-07 11:29:39 +01002353 goto out;
Mel Gormane42c8ff2012-11-12 09:17:07 +00002354 }
2355
Lee Schermerhorn771fb4d2012-10-25 14:16:30 +02002356 if (curnid != polnid)
2357 ret = polnid;
2358out:
2359 mpol_cond_put(pol);
2360
2361 return ret;
2362}
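A hedged sketch of a consumer in the NUMA hinting fault path: ask where the page should live and, if misplaced, hand it to the migration code. The migration call itself is elided because it lives outside this file.

static void example_numa_hint(struct page *page, struct vm_area_struct *vma,
			      unsigned long addr)
{
	int target = mpol_misplaced(page, vma, addr);

	if (target != -1) {
		/* queue/migrate @page towards node @target (see mm/migrate.c) */
	}
}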
2363
Linus Torvalds1da177e2005-04-16 15:20:36 -07002364static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2365{
Paul Mundt140d5a42007-07-15 23:38:16 -07002366	pr_debug("deleting %lx-%lx\n", n->start, n->end);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002367 rb_erase(&n->nd, &sp->root);
KOSAKI Motohiro63f74ca2012-10-08 16:29:19 -07002368 sp_free(n);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002369}
2370
Mel Gorman42288fe2012-12-21 23:10:25 +00002371static void sp_node_init(struct sp_node *node, unsigned long start,
2372 unsigned long end, struct mempolicy *pol)
2373{
2374 node->start = start;
2375 node->end = end;
2376 node->policy = pol;
2377}
2378
Adrian Bunkdbcb0f12007-10-16 01:26:26 -07002379static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2380 struct mempolicy *pol)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002381{
KOSAKI Motohiro869833f2012-10-08 16:29:16 -07002382 struct sp_node *n;
2383 struct mempolicy *newpol;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002384
KOSAKI Motohiro869833f2012-10-08 16:29:16 -07002385 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002386 if (!n)
2387 return NULL;
KOSAKI Motohiro869833f2012-10-08 16:29:16 -07002388
2389 newpol = mpol_dup(pol);
2390 if (IS_ERR(newpol)) {
2391 kmem_cache_free(sn_cache, n);
2392 return NULL;
2393 }
2394 newpol->flags |= MPOL_F_SHARED;
Mel Gorman42288fe2012-12-21 23:10:25 +00002395 sp_node_init(n, start, end, newpol);
KOSAKI Motohiro869833f2012-10-08 16:29:16 -07002396
Linus Torvalds1da177e2005-04-16 15:20:36 -07002397 return n;
2398}
2399
2400/* Replace a policy range. */
2401static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2402 unsigned long end, struct sp_node *new)
2403{
Mel Gormanb22d1272012-10-08 16:29:17 -07002404 struct sp_node *n;
Mel Gorman42288fe2012-12-21 23:10:25 +00002405 struct sp_node *n_new = NULL;
2406 struct mempolicy *mpol_new = NULL;
Mel Gormanb22d1272012-10-08 16:29:17 -07002407 int ret = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002408
Mel Gorman42288fe2012-12-21 23:10:25 +00002409restart:
2410 spin_lock(&sp->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002411 n = sp_lookup(sp, start, end);
2412 /* Take care of old policies in the same range. */
2413 while (n && n->start < end) {
2414 struct rb_node *next = rb_next(&n->nd);
2415 if (n->start >= start) {
2416 if (n->end <= end)
2417 sp_delete(sp, n);
2418 else
2419 n->start = end;
2420 } else {
2421 /* Old policy spanning whole new range. */
2422 if (n->end > end) {
Mel Gorman42288fe2012-12-21 23:10:25 +00002423 if (!n_new)
2424 goto alloc_new;
2425
2426 *mpol_new = *n->policy;
2427 atomic_set(&mpol_new->refcnt, 1);
KOSAKI Motohiro78806392013-03-08 12:43:29 -08002428 sp_node_init(n_new, end, n->end, mpol_new);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002429 n->end = start;
Hillf Danton5ca39572013-03-08 12:43:28 -08002430 sp_insert(sp, n_new);
Mel Gorman42288fe2012-12-21 23:10:25 +00002431 n_new = NULL;
2432 mpol_new = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002433 break;
2434 } else
2435 n->end = start;
2436 }
2437 if (!next)
2438 break;
2439 n = rb_entry(next, struct sp_node, nd);
2440 }
2441 if (new)
2442 sp_insert(sp, new);
Mel Gorman42288fe2012-12-21 23:10:25 +00002443 spin_unlock(&sp->lock);
2444 ret = 0;
2445
2446err_out:
2447 if (mpol_new)
2448 mpol_put(mpol_new);
2449 if (n_new)
2450 kmem_cache_free(sn_cache, n_new);
2451
Mel Gormanb22d1272012-10-08 16:29:17 -07002452 return ret;
Mel Gorman42288fe2012-12-21 23:10:25 +00002453
2454alloc_new:
2455 spin_unlock(&sp->lock);
2456 ret = -ENOMEM;
2457 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2458 if (!n_new)
2459 goto err_out;
2460 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2461 if (!mpol_new)
2462 goto err_out;
2463 goto restart;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002464}
2465
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002466/**
2467 * mpol_shared_policy_init - initialize shared policy for inode
2468 * @sp: pointer to inode shared policy
2469 * @mpol: struct mempolicy to install
2470 *
2471 * Install non-NULL @mpol in inode's shared policy rb-tree.
2472 * On entry, the current task has a reference on a non-NULL @mpol.
2473 * This must be released on exit.
KAMEZAWA Hiroyuki4bfc4492009-08-06 15:07:33 -07002474 * This is called during get_inode() calls, so we can use GFP_KERNEL.
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002475 */
2476void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
Robin Holt7339ff82006-01-14 13:20:48 -08002477{
Miao Xie58568d22009-06-16 15:31:49 -07002478 int ret;
2479
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002480 sp->root = RB_ROOT; /* empty tree == default mempolicy */
Mel Gorman42288fe2012-12-21 23:10:25 +00002481 spin_lock_init(&sp->lock);
Robin Holt7339ff82006-01-14 13:20:48 -08002482
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002483 if (mpol) {
2484 struct vm_area_struct pvma;
2485 struct mempolicy *new;
KAMEZAWA Hiroyuki4bfc4492009-08-06 15:07:33 -07002486 NODEMASK_SCRATCH(scratch);
Robin Holt7339ff82006-01-14 13:20:48 -08002487
KAMEZAWA Hiroyuki4bfc4492009-08-06 15:07:33 -07002488 if (!scratch)
Lee Schermerhorn5c0c1652010-06-29 15:05:30 -07002489 goto put_mpol;
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002490 /* contextualize the tmpfs mount point mempolicy */
2491 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
Lee Schermerhorn15d77832010-05-24 14:32:04 -07002492 if (IS_ERR(new))
Dan Carpenter0cae3452010-05-25 23:42:58 -07002493 goto free_scratch; /* no valid nodemask intersection */
Miao Xie58568d22009-06-16 15:31:49 -07002494
2495 task_lock(current);
KAMEZAWA Hiroyuki4bfc4492009-08-06 15:07:33 -07002496 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
Miao Xie58568d22009-06-16 15:31:49 -07002497 task_unlock(current);
Lee Schermerhorn15d77832010-05-24 14:32:04 -07002498 if (ret)
Lee Schermerhorn5c0c1652010-06-29 15:05:30 -07002499 goto put_new;
Robin Holt7339ff82006-01-14 13:20:48 -08002500
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002501 /* Create pseudo-vma that contains just the policy */
2502 memset(&pvma, 0, sizeof(struct vm_area_struct));
2503 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
2504 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
Lee Schermerhorn15d77832010-05-24 14:32:04 -07002505
Lee Schermerhorn5c0c1652010-06-29 15:05:30 -07002506put_new:
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002507 mpol_put(new); /* drop initial ref */
Dan Carpenter0cae3452010-05-25 23:42:58 -07002508free_scratch:
KAMEZAWA Hiroyuki4bfc4492009-08-06 15:07:33 -07002509 NODEMASK_SCRATCH_FREE(scratch);
Lee Schermerhorn5c0c1652010-06-29 15:05:30 -07002510put_mpol:
2511 mpol_put(mpol); /* drop our incoming ref on sb mpol */
Robin Holt7339ff82006-01-14 13:20:48 -08002512 }
2513}

int mpol_set_shared_policy(struct shared_policy *info,
			struct vm_area_struct *vma, struct mempolicy *npol)
{
	int err;
	struct sp_node *new = NULL;
	unsigned long sz = vma_pages(vma);

	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
		 vma->vm_pgoff,
		 sz, npol ? npol->mode : -1,
		 npol ? npol->flags : -1,
		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);

	if (npol) {
		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
		if (!new)
			return -ENOMEM;
	}
	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff + sz, new);
	if (err && new)
		sp_free(new);
	return err;
}

/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
	struct sp_node *n;
	struct rb_node *next;

	if (!p->root.rb_node)
		return;
	spin_lock(&p->lock);
	next = rb_first(&p->root);
	while (next) {
		n = rb_entry(next, struct sp_node, nd);
		next = rb_next(&n->nd);
		sp_delete(p, n);
	}
	spin_unlock(&p->lock);
}

#ifdef CONFIG_NUMA_BALANCING
static int __initdata numabalancing_override;

static void __init check_numabalancing_enable(void)
{
	bool numabalancing_default = false;

	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
		numabalancing_default = true;

	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
	if (numabalancing_override)
		set_numabalancing_state(numabalancing_override == 1);

	if (nr_node_ids > 1 && !numabalancing_override) {
		pr_info("%s automatic NUMA balancing. "
			"Configure with numa_balancing= or the "
			"kernel.numa_balancing sysctl\n",
			numabalancing_default ? "Enabling" : "Disabling");
		set_numabalancing_state(numabalancing_default);
	}
}

static int __init setup_numabalancing(char *str)
{
	int ret = 0;

	if (!str)
		goto out;

	if (!strcmp(str, "enable")) {
		numabalancing_override = 1;
		ret = 1;
	} else if (!strcmp(str, "disable")) {
		numabalancing_override = -1;
		ret = 1;
	}
out:
	if (!ret)
		pr_warn("Unable to parse numa_balancing=\n");

	return ret;
}
__setup("numa_balancing=", setup_numabalancing);
#else
static inline void __init check_numabalancing_enable(void)
{
}
#endif /* CONFIG_NUMA_BALANCING */
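
/*
 * Usage example (illustrative): booting with "numa_balancing=disable" on
 * the kernel command line sets numabalancing_override to -1, so
 * check_numabalancing_enable() switches the feature off regardless of
 * CONFIG_NUMA_BALANCING_DEFAULT_ENABLED; "numa_balancing=enable" forces it
 * on.  Without the parameter, a multi-node system gets the Kconfig
 * default, and the kernel.numa_balancing sysctl can still change the
 * state at runtime.
 */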

/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
	nodemask_t interleave_nodes;
	unsigned long largest = 0;
	int nid, prefer = 0;

	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL);

	for_each_node(nid) {
		preferred_node_policy[nid] = (struct mempolicy) {
			.refcnt = ATOMIC_INIT(1),
			.mode = MPOL_PREFERRED,
			.flags = MPOL_F_MOF | MPOL_F_MORON,
			.v = { .preferred_node = nid, },
		};
	}

	/*
	 * Set interleaving policy for system init.  Interleaving is only
	 * enabled across nodes of a suitable size (>= 16MB by default);
	 * if every node is smaller than that, fall back to the largest node.
	 */
	nodes_clear(interleave_nodes);
	for_each_node_state(nid, N_MEMORY) {
		unsigned long total_pages = node_present_pages(nid);

		/* Preserve the largest node */
		if (largest < total_pages) {
			largest = total_pages;
			prefer = nid;
		}

		/* Interleave this node? */
		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
			node_set(nid, interleave_nodes);
	}

	/* All too small, use the largest */
	if (unlikely(nodes_empty(interleave_nodes)))
		node_set(prefer, interleave_nodes);

	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
		pr_err("%s: interleaving failed\n", __func__);

	check_numabalancing_enable();
}
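
/*
 * Worked example (illustrative, assuming 4KB pages, i.e. PAGE_SHIFT == 12):
 * a node needs at least 16MB / 4KB = 4096 present pages for
 * (total_pages << PAGE_SHIFT) to reach 16 << 20 and be added to the
 * boot-time interleave set above; if every node is smaller than that, the
 * set degenerates to just the largest node.
 */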

/* Reset policy of current process to default */
void numa_default_policy(void)
{
	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}

/*
 * Parse and format mempolicy from/to strings
 */

/*
 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
 */
static const char * const policy_modes[] =
{
	[MPOL_DEFAULT]    = "default",
	[MPOL_PREFERRED]  = "prefer",
	[MPOL_BIND]       = "bind",
	[MPOL_INTERLEAVE] = "interleave",
	[MPOL_LOCAL]      = "local",
};

#ifdef CONFIG_TMPFS
/**
 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
 * @str: string containing mempolicy to parse
 * @mpol: pointer to struct mempolicy pointer, returned on success.
 *
 * Format of input:
 *	<mode>[=<flags>][:<nodelist>]
 *
 * On success, returns 0; on failure, returns 1.
 */
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
	struct mempolicy *new = NULL;
	unsigned short mode;
	unsigned short mode_flags;
	nodemask_t nodes;
	char *nodelist = strchr(str, ':');
	char *flags = strchr(str, '=');
	int err = 1;

	if (nodelist) {
		/* NUL-terminate mode or flags string */
		*nodelist++ = '\0';
		if (nodelist_parse(nodelist, nodes))
			goto out;
		if (!nodes_subset(nodes, node_states[N_MEMORY]))
			goto out;
	} else
		nodes_clear(nodes);

	if (flags)
		*flags++ = '\0';	/* terminate mode string */

	for (mode = 0; mode < MPOL_MAX; mode++) {
		if (!strcmp(str, policy_modes[mode]))
			break;
	}
	if (mode >= MPOL_MAX)
		goto out;

	switch (mode) {
	case MPOL_PREFERRED:
		/*
		 * Insist on a nodelist of one node only
		 */
		if (nodelist) {
			char *rest = nodelist;
			while (isdigit(*rest))
				rest++;
			if (*rest)
				goto out;
		}
		break;
	case MPOL_INTERLEAVE:
		/*
		 * Default to online nodes with memory if no nodelist
		 */
		if (!nodelist)
			nodes = node_states[N_MEMORY];
		break;
	case MPOL_LOCAL:
		/*
		 * Don't allow a nodelist; mpol_new() checks flags
		 */
		if (nodelist)
			goto out;
		mode = MPOL_PREFERRED;
		break;
	case MPOL_DEFAULT:
		/*
		 * Insist on an empty nodelist
		 */
		if (!nodelist)
			err = 0;
		goto out;
	case MPOL_BIND:
		/*
		 * Insist on a nodelist
		 */
		if (!nodelist)
			goto out;
	}

	mode_flags = 0;
	if (flags) {
		/*
		 * Currently, we only support two mutually exclusive
		 * mode flags.
		 */
		if (!strcmp(flags, "static"))
			mode_flags |= MPOL_F_STATIC_NODES;
		else if (!strcmp(flags, "relative"))
			mode_flags |= MPOL_F_RELATIVE_NODES;
		else
			goto out;
	}

	new = mpol_new(mode, mode_flags, &nodes);
	if (IS_ERR(new))
		goto out;

	/*
	 * Save nodes for mpol_to_str() to show the tmpfs mount options
	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
	 */
	if (mode != MPOL_PREFERRED)
		new->v.nodes = nodes;
	else if (nodelist)
		new->v.preferred_node = first_node(nodes);
	else
		new->flags |= MPOL_F_LOCAL;

	/*
	 * Save nodes for contextualization: this will be used to "clone"
	 * the mempolicy in a specific context [cpuset] at a later time.
	 */
	new->w.user_nodemask = nodes;

	err = 0;

out:
	/* Restore string for error message */
	if (nodelist)
		*--nodelist = ':';
	if (flags)
		*--flags = '=';
	if (!err)
		*mpol = new;
	return err;
}
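
/*
 * Illustrative examples of strings accepted by mpol_parse_str(), as they
 * would appear in a tmpfs "mpol=" mount option (node numbers are made up):
 *
 *	"interleave:0-3"	MPOL_INTERLEAVE over nodes 0-3
 *	"prefer=static:1"	MPOL_PREFERRED, node 1, MPOL_F_STATIC_NODES
 *	"bind:0,2"		MPOL_BIND to nodes 0 and 2
 *	"local"			MPOL_PREFERRED with MPOL_F_LOCAL
 *	"default"		parses successfully but leaves *mpol NULL
 */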
#endif /* CONFIG_TMPFS */

/**
 * mpol_to_str - format a mempolicy structure for printing
 * @buffer: buffer to receive the formatted mempolicy string
 * @maxlen: length of @buffer
 * @pol: pointer to mempolicy to be formatted
 *
 * Convert @pol into a string.  If @buffer is too short, truncate the string.
 * A @maxlen of at least 32 is recommended: enough for the longest mode,
 * "interleave", the longest flag, "relative", and a few node ids.
 */
void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
	char *p = buffer;
	nodemask_t nodes = NODE_MASK_NONE;
	unsigned short mode = MPOL_DEFAULT;
	unsigned short flags = 0;

	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
		mode = pol->mode;
		flags = pol->flags;
	}

	switch (mode) {
	case MPOL_DEFAULT:
		break;
	case MPOL_PREFERRED:
		if (flags & MPOL_F_LOCAL)
			mode = MPOL_LOCAL;
		else
			node_set(pol->v.preferred_node, nodes);
		break;
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
		nodes = pol->v.nodes;
		break;
	default:
		WARN_ON_ONCE(1);
		snprintf(p, maxlen, "unknown");
		return;
	}

	p += snprintf(p, maxlen, "%s", policy_modes[mode]);

	if (flags & MPOL_MODE_FLAGS) {
		p += snprintf(p, buffer + maxlen - p, "=");

		/*
		 * Currently, the only defined flags are mutually exclusive
		 */
		if (flags & MPOL_F_STATIC_NODES)
			p += snprintf(p, buffer + maxlen - p, "static");
		else if (flags & MPOL_F_RELATIVE_NODES)
			p += snprintf(p, buffer + maxlen - p, "relative");
	}

	if (!nodes_empty(nodes)) {
		p += snprintf(p, buffer + maxlen - p, ":");
		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
	}
}
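
/*
 * Illustrative outputs of mpol_to_str() (node numbers are made up): an
 * interleave policy over nodes 0-3 is rendered as "interleave:0-3", a
 * preferred policy for node 1 with MPOL_F_RELATIVE_NODES as
 * "prefer=relative:1", and a preferred policy with MPOL_F_LOCAL simply as
 * "local".
 */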