/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case node -1 here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non-default
 *                process policy.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/
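
/*
 * Illustrative sketch (not part of this file): the policies described above
 * are selected from userspace via set_mempolicy(2)/mbind(2), e.g. through
 * libnuma or a raw syscall.  Assuming an MPOL_INTERLEAVE request over
 * nodes 0 and 1:
 *
 *	unsigned long nodemask = (1UL << 0) | (1UL << 1);
 *	syscall(SYS_set_mempolicy, MPOL_INTERLEAVE, &nodemask,
 *		sizeof(nodemask) * 8 + 1);
 *
 * Subsequent anonymous page faults in that task are then spread across the
 * two nodes as described for "interleave" above.
 */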

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_PREFERRED,
	.v = { .preferred_node = -1 },
};

static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];

/* Check that the nodemask contains at least one populated zone */
static int is_valid_nodemask(const nodemask_t *nodemask)
{
	int nd, k;

	/* Check that there is something useful in this mask */
	k = policy_zone;

	for_each_node_mask(nd, *nodemask) {
		struct zone *z;

		for (k = 0; k <= policy_zone; k++) {
			z = &NODE_DATA(nd)->node_zones[k];
			if (z->present_pages > 0)
				return 1;
		}
	}

	return 0;
}

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
}

static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}
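
/*
 * Worked example (illustrative only): with a user-supplied relative mask
 * of {0,1} and a cpuset mems_allowed of {4,5,6}, nodes_fold() reduces the
 * user mask modulo the three allowed nodes (still {0,1}) and nodes_onto()
 * maps those positions onto the allowed set, yielding {4,5}.
 */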

static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!nodes)
		pol->v.preferred_node = -1;	/* local allocation */
	else if (nodes_empty(*nodes))
		return -EINVAL;			/* no allowed nodes */
	else
		pol->v.preferred_node = first_node(*nodes);
	return 0;
}

static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!is_valid_nodemask(nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

/* Create a new policy */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	struct mempolicy *policy;
	nodemask_t cpuset_context_nmask;
	int ret;

	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);

	if (mode == MPOL_DEFAULT) {
		if (nodes && !nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		return NULL;	/* simply delete any existing policy */
	}
	VM_BUG_ON(!nodes);

	/*
	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
	 * All other modes require a valid pointer to a non-empty nodemask.
	 */
	if (mode == MPOL_PREFERRED) {
		if (nodes_empty(*nodes)) {
			if (((flags & MPOL_F_STATIC_NODES) ||
			     (flags & MPOL_F_RELATIVE_NODES)))
				return ERR_PTR(-EINVAL);
			nodes = NULL;	/* flag local alloc */
		}
	} else if (nodes_empty(*nodes))
		return ERR_PTR(-EINVAL);
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	policy->mode = mode;
	policy->flags = flags;

	if (nodes) {
		/*
		 * cpuset related setup doesn't apply to local allocation
		 */
		cpuset_update_task_memory_state();
		if (flags & MPOL_F_RELATIVE_NODES)
			mpol_relative_nodemask(&cpuset_context_nmask, nodes,
					       &cpuset_current_mems_allowed);
		else
			nodes_and(cpuset_context_nmask, *nodes,
				  cpuset_current_mems_allowed);
		if (mpol_store_user_nodemask(policy))
			policy->w.user_nodemask = *nodes;
		else
			policy->w.cpuset_mems_allowed =
						cpuset_mems_allowed(current);
	}

	ret = mpol_ops[mode].create(policy,
				nodes ? &cpuset_context_nmask : NULL);
	if (ret < 0) {
		kmem_cache_free(policy_cache, policy);
		return ERR_PTR(ret);
	}
	return policy;
}
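
/*
 * Callers split the user-visible "mode" argument into the base mode and
 * the optional MPOL_F_STATIC_NODES/MPOL_F_RELATIVE_NODES flags before
 * calling mpol_new(), e.g. (sketch of what sys_set_mempolicy() below does):
 *
 *	flags = mode & MPOL_MODE_FLAGS;
 *	mode &= ~MPOL_MODE_FLAGS;
 *	new = mpol_new(mode, flags, &nodes);
 */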

/* Slow path of a mpol destructor. */
void __mpol_put(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	kmem_cache_free(policy_cache, p);
}

static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}

static void mpol_rebind_nodemask(struct mempolicy *pol,
				 const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES)
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
			    *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}

	pol->v.nodes = tmp;
	if (!node_isset(current->il_next, tmp)) {
		current->il_next = next_node(current->il_next, tmp);
		if (current->il_next >= MAX_NUMNODES)
			current->il_next = first_node(tmp);
		if (current->il_next >= MAX_NUMNODES)
			current->il_next = numa_node_id();
	}
}
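
/*
 * Worked example (illustrative only): an MPOL_INTERLEAVE policy created
 * without MPOL_F_STATIC_NODES/MPOL_F_RELATIVE_NODES over nodes {0,1},
 * while the cpuset allowed {0,1}, is remapped by nodes_remap() to {2,3}
 * when the cpuset's mems_allowed changes to {2,3}; il_next is then pulled
 * back into the new mask if it no longer points at an allowed node.
 */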

static void mpol_rebind_preferred(struct mempolicy *pol,
				  const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES) {
		int node = first_node(pol->w.user_nodemask);

		if (node_isset(node, *nodes))
			pol->v.preferred_node = node;
		else
			pol->v.preferred_node = -1;
	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
		pol->v.preferred_node = first_node(tmp);
	} else if (pol->v.preferred_node != -1) {
		pol->v.preferred_node = node_remap(pol->v.preferred_node,
						   pol->w.cpuset_mems_allowed,
						   *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}
}

/* Migrate a policy to a different set of nodes */
static void mpol_rebind_policy(struct mempolicy *pol,
			       const nodemask_t *newmask)
{
	if (!pol)
		return;
	if (!mpol_store_user_nodemask(pol) &&
	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
		return;
	mpol_ops[pol->mode].rebind(pol, newmask);
}

/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 */

void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
	mpol_rebind_policy(tsk->mempolicy, new);
}

/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 */

void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;

	down_write(&mm->mmap_sem);
	for (vma = mm->mmap; vma; vma = vma->vm_next)
		mpol_rebind_policy(vma->vm_policy, new);
	up_write(&mm->mmap_sem);
}

static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_interleave,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_bind,
		.rebind = mpol_rebind_nodemask,
	},
};

static void gather_stats(struct page *, void *, int pte_dirty);
static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags);

/* Scan through pages checking if pages follow certain conditions. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pte_t *orig_pte;
	pte_t *pte;
	spinlock_t *ptl;

	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		struct page *page;
		int nid;

		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		/*
		 * The check for PageReserved here is important to avoid
		 * handling zero pages and other pages that may have been
		 * marked special by the system.
		 *
		 * If PageReserved were not checked here then, e.g., the
		 * location of the zero page could have an influence
		 * on MPOL_MF_STRICT, zero pages would be counted for
		 * the per node stats, and there would be useless attempts
		 * to put zero pages on the migration list.
		 */
		if (PageReserved(page))
			continue;
		nid = page_to_nid(page);
		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
			continue;

		if (flags & MPOL_MF_STATS)
			gather_stats(page, private, pte_dirty(*pte));
		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
			migrate_page_add(page, private, flags);
		else
			break;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(orig_pte, ptl);
	return addr != end;
}

static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		if (check_pte_range(vma, pmd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		if (check_pmd_range(vma, pud, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int check_pgd_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pgd_t *pgd;
	unsigned long next;

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		if (check_pud_range(vma, pgd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pgd++, addr = next, addr != end);
	return 0;
}

/*
 * Check if all pages in a range are on a set of nodes.
 * If pagelist != NULL then isolate pages from the LRU and
 * put them on the pagelist.
 */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		const nodemask_t *nodes, unsigned long flags, void *private)
{
	int err;
	struct vm_area_struct *first, *vma, *prev;

	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {

		err = migrate_prep();
		if (err)
			return ERR_PTR(err);
	}

	first = find_vma(mm, start);
	if (!first)
		return ERR_PTR(-EFAULT);
	prev = NULL;
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
			if (!vma->vm_next && vma->vm_end < end)
				return ERR_PTR(-EFAULT);
			if (prev && prev->vm_end < vma->vm_start)
				return ERR_PTR(-EFAULT);
		}
		if (!is_vm_hugetlb_page(vma) &&
		    ((flags & MPOL_MF_STRICT) ||
		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
		      vma_migratable(vma)))) {
			unsigned long endvma = vma->vm_end;

			if (endvma > end)
				endvma = end;
			if (vma->vm_start > start)
				start = vma->vm_start;
			err = check_pgd_range(vma, start, endvma, nodes,
					      flags, private);
			if (err) {
				first = ERR_PTR(err);
				break;
			}
		}
		prev = vma;
	}
	return first;
}

/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
	int err = 0;
	struct mempolicy *old = vma->vm_policy;

	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	if (vma->vm_ops && vma->vm_ops->set_policy)
		err = vma->vm_ops->set_policy(vma, new);
	if (!err) {
		mpol_get(new);
		vma->vm_policy = new;
		mpol_put(old);
	}
	return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, struct mempolicy *new)
{
	struct vm_area_struct *next;
	int err;

	err = 0;
	for (; vma && vma->vm_start < end; vma = next) {
		next = vma->vm_next;
		if (vma->vm_start < start)
			err = split_vma(vma->vm_mm, vma, start, 1);
		if (!err && vma->vm_end > end)
			err = split_vma(vma->vm_mm, vma, end, 0);
		if (!err)
			err = policy_vma(vma, new);
		if (err)
			break;
	}
	return err;
}

/*
 * Update task->flags PF_MEMPOLICY bit: set iff non-default
 * mempolicy.  Allows more rapid checking of this (combined perhaps
 * with other PF_* flag bits) on memory allocation hot code paths.
 *
 * If called from outside this file, the task 'p' should -only- be
 * a newly forked child not yet visible on the task list, because
 * manipulating the task flags of a visible task is not safe.
 *
 * The above limitation is why this routine has the funny name
 * mpol_fix_fork_child_flag().
 *
 * It is also safe to call this with a task pointer of current,
 * which the static wrapper mpol_set_task_struct_flag() does,
 * for use within this file.
 */

void mpol_fix_fork_child_flag(struct task_struct *p)
{
	if (p->mempolicy)
		p->flags |= PF_MEMPOLICY;
	else
		p->flags &= ~PF_MEMPOLICY;
}

static void mpol_set_task_struct_flag(void)
{
	mpol_fix_fork_child_flag(current);
}

/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new;
	struct mm_struct *mm = current->mm;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/*
	 * prevent changing our mempolicy while show_numa_maps()
	 * is using it.
	 * Note:  do_set_mempolicy() can be called at init time
	 * with no 'mm'.
	 */
	if (mm)
		down_write(&mm->mmap_sem);
	mpol_put(current->mempolicy);
	current->mempolicy = new;
	mpol_set_task_struct_flag();
	if (new && new->mode == MPOL_INTERLEAVE &&
	    nodes_weight(new->v.nodes))
		current->il_next = first_node(new->v.nodes);
	if (mm)
		up_write(&mm->mmap_sem);

	return 0;
}

/*
 * Return nodemask for policy for get_mempolicy() query
 */
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
	nodes_clear(*nodes);
	if (p == &default_policy)
		return;

	switch (p->mode) {
	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		*nodes = p->v.nodes;
		break;
	case MPOL_PREFERRED:
		/* or use current node instead of memory_map? */
		if (p->v.preferred_node < 0)
			*nodes = node_states[N_HIGH_MEMORY];
		else
			node_set(p->v.preferred_node, *nodes);
		break;
	default:
		BUG();
	}
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}

/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	cpuset_update_task_memory_state();
	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		*nmask  = cpuset_current_mems_allowed;
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL.  We
		 * want to return MPOL_DEFAULT in this case.
		 */
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			*policy = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						pol->mode;
		*policy |= pol->flags;
	}

	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	err = 0;
	if (nmask)
		get_policy_nodemask(pol, nmask);

 out:
	mpol_cond_put(pol);
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}
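
/*
 * Usage note (illustrative, not part of this file): a process can ask
 * which node backs a given address by combining MPOL_F_NODE with
 * MPOL_F_ADDR, e.g. from userspace:
 *
 *	int node;
 *	get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR);
 *
 * which reaches lookup_node() above and returns the page's node id in
 * *policy rather than a policy mode.
 */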

#ifdef CONFIG_MIGRATION
/*
 * page migration
 */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
	/*
	 * Avoid migrating a page that is shared with others.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
		isolate_lru_page(page, pagelist);
}

static struct page *new_node_page(struct page *page, unsigned long node, int **x)
{
	return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
}

/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
static int migrate_to_node(struct mm_struct *mm, int source, int dest,
			   int flags)
{
	nodemask_t nmask;
	LIST_HEAD(pagelist);
	int err = 0;

	nodes_clear(nmask);
	node_set(source, nmask);

	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
			flags | MPOL_MF_DISCONTIG_OK, &pagelist);

	if (!list_empty(&pagelist))
		err = migrate_pages(&pagelist, new_node_page, dest);

	return err;
}

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm,
	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
	LIST_HEAD(pagelist);
	int busy = 0;
	int err = 0;
	nodemask_t tmp;

	down_read(&mm->mmap_sem);

	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
	if (err)
		goto out;

/*
 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 * bit in 'tmp', and return that <source, dest> pair for migration.
 * The pair of nodemasks 'to' and 'from' define the map.
 *
 * If no pair of bits is found that way, fallback to picking some
 * pair of 'source' and 'dest' bits that are not the same.  If the
 * 'source' and 'dest' bits are the same, this represents a node
 * that will be migrating to itself, so no pages need move.
 *
 * If no bits are left in 'tmp', or if all remaining bits left
 * in 'tmp' correspond to the same bit in 'to', return false
 * (nothing left to migrate).
 *
 * This lets us pick a pair of nodes to migrate between, such that
 * if possible the dest node is not already occupied by some other
 * source node, minimizing the risk of overloading the memory on a
 * node that would happen if we migrated incoming memory to a node
 * before migrating outgoing memory from that same node.
 *
 * A single scan of tmp is sufficient.  As we go, we remember the
 * most recent <s, d> pair that moved (s != d).  If we find a pair
 * that not only moved, but what's better, moved to an empty slot
 * (d is not set in tmp), then we break out with that pair.
 * Otherwise when we finish scanning tmp, we at least have the
 * most recent <s, d> pair that moved.  If we get all the way through
 * the scan of tmp without finding any node that moved, much less
 * moved to an empty node, then there is nothing left worth migrating.
 */
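/*
 * Worked example (illustrative only): migrating from {2,3} to {3,4}.
 * The first scan sees 2->3 (3 is still a source in tmp, so remember the
 * pair and keep looking), then 3->4 (4 is not in tmp, so break).  Pages
 * are moved from node 3 to node 4 first, node 3 is cleared from tmp, and
 * a second pass then moves node 2's pages onto the now-vacated node 3.
 */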

	tmp = *from_nodes;
	while (!nodes_empty(tmp)) {
		int s,d;
		int source = -1;
		int dest = 0;

		for_each_node_mask(s, tmp) {
			d = node_remap(s, *from_nodes, *to_nodes);
			if (s == d)
				continue;

			source = s;	/* Node moved. Memorize */
			dest = d;

			/* dest not in remaining from nodes? */
			if (!node_isset(dest, tmp))
				break;
		}
		if (source == -1)
			break;

		node_clear(source, tmp);
		err = migrate_to_node(mm, source, dest, flags);
		if (err > 0)
			busy += err;
		if (err < 0)
			break;
	}
out:
	up_read(&mm->mmap_sem);
	if (err < 0)
		return err;
	return busy;

}

/*
 * Allocate a new page for page migration based on vma policy.
 * Start assuming that page is mapped by vma pointed to by @private.
 * Search forward from there, if not.  N.B., this assumes that the
 * list of pages handed to migrate_pages()--which is how we get here--
 * is in virtual address order.
 */
static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
	struct vm_area_struct *vma = (struct vm_area_struct *)private;
	unsigned long uninitialized_var(address);

	while (vma) {
		address = page_address_in_vma(page, vma);
		if (address != -EFAULT)
			break;
		vma = vma->vm_next;
	}

	/*
	 * if !vma, alloc_page_vma() will use task or system default policy
	 */
	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
}
#else

static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
}

int do_migrate_pages(struct mm_struct *mm,
	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
	return -ENOSYS;
}

static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
	return NULL;
}
#endif

static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int err;
	LIST_HEAD(pagelist);

	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	new = mpol_new(mode, mode_flags, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
		 start, start + len, mode, mode_flags,
		 nmask ? nodes_addr(*nmask)[0] : -1);

	down_write(&mm->mmap_sem);
	vma = check_range(mm, start, end, nmask,
			  flags | MPOL_MF_INVERT, &pagelist);

	err = PTR_ERR(vma);
	if (!IS_ERR(vma)) {
		int nr_failed = 0;

		err = mbind_range(vma, start, end, new);

		if (!list_empty(&pagelist))
			nr_failed = migrate_pages(&pagelist, new_vma_page,
						  (unsigned long)vma);

		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
			err = -EIO;
	}

	up_write(&mm->mmap_sem);
	mpol_put(new);
	return err;
}

/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	unsigned long k;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;
	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
		return -EINVAL;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/* If the user specified more nodes than supported, just check
	   that the unsupported part is all zero. */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		if (nlongs > PAGE_SIZE/sizeof(long))
			return -EINVAL;
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			unsigned long t;
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes_addr(*nodes)[nlongs-1] &= endmask;
	return 0;
}
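
/*
 * Worked example (illustrative only): a caller passing maxnode == 17 on a
 * 64-bit kernel asks for node bits 0..15 (after the --maxnode); nlongs is
 * 1 and endmask is (1UL << 16) - 1, so any user bits at position 16 or
 * above in the copied word are masked off.
 */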

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}

asmlinkage long sys_mbind(unsigned long start, unsigned long len,
			  unsigned long mode,
			  unsigned long __user *nmask, unsigned long maxnode,
			  unsigned flags)
{
	nodemask_t nodes;
	int err;
	unsigned short mode_flags;

	mode_flags = mode & MPOL_MODE_FLAGS;
	mode &= ~MPOL_MODE_FLAGS;
	if (mode >= MPOL_MAX)
		return -EINVAL;
	if ((mode_flags & MPOL_F_STATIC_NODES) &&
	    (mode_flags & MPOL_F_RELATIVE_NODES))
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
}
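
/*
 * Illustrative userspace sketch (not part of this file): binding an
 * anonymous mapping to node 0 with strict checking, via libnuma's
 * mbind() wrapper or a raw syscall:
 *
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	unsigned long mask = 1UL << 0;
 *	mbind(p, len, MPOL_BIND, &mask, sizeof(mask) * 8 + 1, MPOL_MF_STRICT);
 *
 * The mode word may also carry MPOL_F_STATIC_NODES or
 * MPOL_F_RELATIVE_NODES, which sys_mbind() above strips into mode_flags.
 */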

/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
				  unsigned long maxnode)
{
	int err;
	nodemask_t nodes;
	unsigned short flags;

	flags = mode & MPOL_MODE_FLAGS;
	mode &= ~MPOL_MODE_FLAGS;
	if ((unsigned int)mode >= MPOL_MAX)
		return -EINVAL;
	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_set_mempolicy(mode, flags, &nodes);
}

asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
		const unsigned long __user *old_nodes,
		const unsigned long __user *new_nodes)
{
	struct mm_struct *mm;
	struct task_struct *task;
	nodemask_t old;
	nodemask_t new;
	nodemask_t task_nodes;
	int err;

	err = get_nodes(&old, old_nodes, maxnode);
	if (err)
		return err;

	err = get_nodes(&new, new_nodes, maxnode);
	if (err)
		return err;

	/* Find the mm_struct */
	read_lock(&tasklist_lock);
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		read_unlock(&tasklist_lock);
		return -ESRCH;
	}
	mm = get_task_mm(task);
	read_unlock(&tasklist_lock);

	if (!mm)
		return -EINVAL;

	/*
	 * Check if this process has the right to modify the specified
	 * process. The right exists if the process has administrative
	 * capabilities, superuser privileges or the same
	 * userid as the target process.
	 */
	if ((current->euid != task->suid) && (current->euid != task->uid) &&
	    (current->uid != task->suid) && (current->uid != task->uid) &&
	    !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out;
	}

	task_nodes = cpuset_mems_allowed(task);
	/* Is the user allowed to access the target nodes? */
	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out;
	}

	if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
		err = -EINVAL;
		goto out;
	}

	err = security_task_movememory(task);
	if (err)
		goto out;

	err = do_migrate_pages(mm, &old, &new,
		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
out:
	mmput(mm);
	return err;
}


/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
				  unsigned long __user *nmask,
				  unsigned long maxnode,
				  unsigned long addr, unsigned long flags)
{
	int err;
	int uninitialized_var(pval);
	nodemask_t nodes;

	if (nmask != NULL && maxnode < MAX_NUMNODES)
		return -EINVAL;

	err = do_get_mempolicy(&pval, &nodes, addr, flags);

	if (err)
		return err;

	if (policy && put_user(pval, policy))
		return -EFAULT;

	if (nmask)
		err = copy_nodes_to_user(nmask, maxnode, &nodes);

	return err;
}

#ifdef CONFIG_COMPAT

asmlinkage long compat_sys_get_mempolicy(int __user *policy,
				     compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode,
				     compat_ulong_t addr, compat_ulong_t flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		err = copy_from_user(bm, nm, alloc_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}

asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_set_mempolicy(mode, nm, nr_bits+1);
}

asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
			     compat_ulong_t mode, compat_ulong_t __user *nmask,
			     compat_ulong_t maxnode, compat_ulong_t flags)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	nodemask_t bm;

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif

/*
 * get_vma_policy(@task, @vma, @addr)
 * @task - task for fallback if vma policy == default
 * @vma  - virtual memory area whose policy is sought
 * @addr - address in @vma for shared policy lookup
 *
 * Returns effective policy for a VMA at specified address.
 * Falls back to @task or system default policy, as necessary.
 * Current or other task's task mempolicy and non-shared vma policies
 * are protected by the task's mmap_sem, which must be held for read by
 * the caller.
 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
 * count--added by the get_policy() vm_op, as appropriate--to protect against
 * freeing by another task.  It is the caller's responsibility to free the
 * extra reference for shared policies.
 */
static struct mempolicy *get_vma_policy(struct task_struct *task,
		struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = task->mempolicy;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy) {
			struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
									addr);
			if (vpol)
				pol = vpol;
		} else if (vma->vm_policy)
			pol = vma->vm_policy;
	}
	if (!pol)
		pol = &default_policy;
	return pol;
}

/*
 * Return a nodemask representing a mempolicy for filtering nodes for
 * page allocation
 */
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
	/* Lower zones don't get a nodemask applied for MPOL_BIND */
	if (unlikely(policy->mode == MPOL_BIND) &&
	    gfp_zone(gfp) >= policy_zone &&
	    cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
		return &policy->v.nodes;

	return NULL;
}
1325
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001326/* Return a zonelist indicated by gfp for node representing a mempolicy */
1327static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001328{
1329 int nd;
1330
Lee Schermerhorn45c47452008-04-28 02:13:12 -07001331 switch (policy->mode) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001332 case MPOL_PREFERRED:
1333 nd = policy->v.preferred_node;
1334 if (nd < 0)
1335 nd = numa_node_id();
1336 break;
1337 case MPOL_BIND:
Mel Gorman19770b32008-04-28 02:12:18 -07001338 /*
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001339 * Normally, MPOL_BIND allocations are node-local within the
1340 * allowed nodemask. However, if __GFP_THISNODE is set and the
1341 * current node is part of the mask, we use the zonelist for
1342 * the first node in the mask instead.
Mel Gorman19770b32008-04-28 02:12:18 -07001343 */
1344 nd = numa_node_id();
1345 if (unlikely(gfp & __GFP_THISNODE) &&
1346 unlikely(!node_isset(nd, policy->v.nodes)))
1347 nd = first_node(policy->v.nodes);
1348 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001349 case MPOL_INTERLEAVE: /* should not happen */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001350 nd = numa_node_id();
1351 break;
1352 default:
1353 nd = 0;
1354 BUG();
1355 }
Mel Gorman0e884602008-04-28 02:12:14 -07001356 return node_zonelist(nd, gfp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001357}
1358
1359/* Do dynamic interleaving for a process */
1360static unsigned interleave_nodes(struct mempolicy *policy)
1361{
1362 unsigned nid, next;
1363 struct task_struct *me = current;
1364
1365 nid = me->il_next;
Andi Kleendfcd3c02005-10-29 18:15:48 -07001366 next = next_node(nid, policy->v.nodes);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001367 if (next >= MAX_NUMNODES)
Andi Kleendfcd3c02005-10-29 18:15:48 -07001368 next = first_node(policy->v.nodes);
David Rientjesf5b087b2008-04-28 02:12:27 -07001369 if (next < MAX_NUMNODES)
1370 me->il_next = next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001371 return nid;
1372}

/*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
 * @policy must be protected from freeing by the caller.  If @policy is
 * the current task's mempolicy, this protection is implicit, as only the
 * task can change its policy.  The system default policy requires no
 * such protection.
 */
unsigned slab_node(struct mempolicy *policy)
{
	if (!policy)
		return numa_node_id();

	switch (policy->mode) {
	case MPOL_PREFERRED:
		if (unlikely(policy->v.preferred_node >= 0))
			return policy->v.preferred_node;
		return numa_node_id();

	case MPOL_INTERLEAVE:
		return interleave_nodes(policy);

	case MPOL_BIND: {
		/*
		 * Follow bind policy behavior and start allocation at the
		 * first node.
		 */
		struct zonelist *zonelist;
		struct zone *zone;
		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
		(void)first_zones_zonelist(zonelist, highest_zoneidx,
							&policy->v.nodes,
							&zone);
		return zone->node;
	}

	default:
		BUG();
	}
}

/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
		struct vm_area_struct *vma, unsigned long off)
{
	unsigned nnodes = nodes_weight(pol->v.nodes);
	unsigned target;
	int c;
	int nid = -1;

	if (!nnodes)
		return numa_node_id();
	target = (unsigned int)off % nnodes;
	c = 0;
	do {
		nid = next_node(nid, pol->v.nodes);
		c++;
	} while (c <= target);
	return nid;
}
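
/*
 * Illustration (editor's sketch): with pol->v.nodes = {0,2,5} (nnodes == 3)
 * and off == 7, target = 7 % 3 = 1; the do/while walks next_node() from -1
 * to node 0 (c == 1), then to node 2 (c == 2), and stops, returning nid 2.
 */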

/* Determine a node number for interleave */
static inline unsigned interleave_nid(struct mempolicy *pol,
		 struct vm_area_struct *vma, unsigned long addr, int shift)
{
	if (vma) {
		unsigned long off;

		/*
		 * for small pages, there is no difference between
		 * shift and PAGE_SHIFT, so the bit-shift is safe.
		 * for huge pages, since vm_pgoff is in units of small
		 * pages, we need to shift off the always 0 bits to get
		 * a useful offset.
		 */
		BUG_ON(shift < PAGE_SHIFT);
		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
		off += (addr - vma->vm_start) >> shift;
		return offset_il_node(pol, vma, off);
	} else
		return interleave_nodes(pol);
}
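
/*
 * Illustration (editor's sketch assuming 2MB huge pages, i.e.
 * HPAGE_SHIFT == 21 and PAGE_SHIFT == 12): for a hugetlb VMA with
 * vm_pgoff == 1024, the base offset becomes 1024 >> 9 == 2 huge pages,
 * plus (addr - vm_start) >> 21 for the faulting address, before the
 * result is handed to offset_il_node().
 */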

#ifdef CONFIG_HUGETLBFS
/*
 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
 * @vma = virtual memory area whose policy is sought
 * @addr = address in @vma for shared policy lookup and interleave policy
 * @gfp_flags = for requested zone
 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
 *
 * Returns a zonelist suitable for a huge page allocation and a pointer
 * to the struct mempolicy for conditional unref after allocation.
 * If the effective policy is MPOL_BIND, returns a pointer to the
 * mempolicy's @nodemask for filtering the zonelist.
 */
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
				gfp_t gfp_flags, struct mempolicy **mpol,
				nodemask_t **nodemask)
{
	struct zonelist *zl;

	*mpol = get_vma_policy(current, vma, addr);
	*nodemask = NULL;	/* assume !MPOL_BIND */

	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
						HPAGE_SHIFT), gfp_flags);
	} else {
		zl = policy_zonelist(gfp_flags, *mpol);
		if ((*mpol)->mode == MPOL_BIND)
			*nodemask = &(*mpol)->v.nodes;
	}
	return zl;
}
#endif
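
/*
 * Caller sketch (editor's illustration patterned on the hugetlb allocation
 * path; the gfp mask and surrounding code are assumptions, not a quote of
 * mm/hugetlb.c):
 *
 *	struct mempolicy *mpol;
 *	nodemask_t *nodemask;
 *	struct zonelist *zl = huge_zonelist(vma, addr, GFP_HIGHUSER,
 *					    &mpol, &nodemask);
 *	... allocate from zl, filtering with nodemask when non-NULL ...
 *	mpol_cond_put(mpol);	(drop the conditional reference)
 */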

/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
					unsigned nid)
{
	struct zonelist *zl;
	struct page *page;

	zl = node_zonelist(nid, gfp);
	page = __alloc_pages(gfp, order, zl);
	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
	return page;
}

/**
 * alloc_page_vma - Allocate a page for a VMA.
 *
 * @gfp:
 *      %GFP_USER     user allocation,
 *      %GFP_KERNEL   kernel allocation,
 *      %GFP_HIGHMEM  highmem/user allocation,
 *      %GFP_FS       allocation should not call back into a file system,
 *      %GFP_ATOMIC   don't sleep.
 *
 * @vma:  Pointer to VMA or NULL if not available.
 * @addr: Virtual address of the allocation.  Must be inside the VMA.
 *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
 * When VMA is not NULL the caller must hold down_read on the mmap_sem of
 * the mm_struct of the VMA to prevent it from going away.  Should be used
 * for all allocations for pages that will be mapped into user space.
 * Returns NULL when no page can be allocated.
 *
 * Should be called with the mm_sem of the vma held.
 */
struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);
	struct zonelist *zl;

	cpuset_update_task_memory_state();

	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
		unsigned nid;

		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
		mpol_cond_put(pol);
		return alloc_page_interleave(gfp, 0, nid);
	}
	zl = policy_zonelist(gfp, pol);
	if (unlikely(mpol_needs_cond_ref(pol))) {
		/*
		 * slow path: ref counted shared policy
		 */
		struct page *page = __alloc_pages_nodemask(gfp, 0,
						zl, policy_nodemask(gfp, pol));
		__mpol_put(pol);
		return page;
	}
	/*
	 * fast path: default or task policy
	 */
	return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
}
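
/*
 * Usage sketch (editor's illustration, not kernel code): a fault handler
 * that already holds down_read(&mm->mmap_sem) could allocate like this:
 *
 *	struct page *page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 *	if (!page)
 *		return VM_FAULT_OOM;
 *
 * The policy lookup and the conditional mempolicy reference drop are
 * handled internally.
 */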

/**
 * alloc_pages_current - Allocate pages.
 *
 * @gfp:
 *      %GFP_USER     user allocation,
 *      %GFP_KERNEL   kernel allocation,
 *      %GFP_HIGHMEM  highmem allocation,
 *      %GFP_FS       don't call back into a file system,
 *      %GFP_ATOMIC   don't sleep.
 * @order: Power of two of allocation size in pages. 0 is a single page.
 *
 * Allocate a page from the kernel page pool.  When not in interrupt
 * context, apply the current process' NUMA policy.
 * Returns NULL when no page can be allocated.
 *
 * Don't call cpuset_update_task_memory_state() unless
 * 1) it's ok to take cpuset_sem (can WAIT), and
 * 2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol = current->mempolicy;

	if ((gfp & __GFP_WAIT) && !in_interrupt())
		cpuset_update_task_memory_state();
	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
		pol = &default_policy;

	/*
	 * No reference counting needed for current->mempolicy
	 * nor system default_policy
	 */
	if (pol->mode == MPOL_INTERLEAVE)
		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
	return __alloc_pages_nodemask(gfp, order,
			policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);
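
/*
 * Editor's note (sketch; an assumption about <linux/gfp.h>, not verified
 * here): on CONFIG_NUMA kernels the generic alloc_pages() helper is
 * expected to resolve to this function, e.g.
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 0);
 *
 * so ordinary task-context allocations pick up current->mempolicy
 * automatically.
 */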

/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after their cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 */

/* Slow path of a mempolicy duplicate */
struct mempolicy *__mpol_dup(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);
	if (current_cpuset_is_being_rebound()) {
		nodemask_t mems = cpuset_mems_allowed(current);
		mpol_rebind_policy(old, &mems);
	}
	*new = *old;
	atomic_set(&new->refcnt, 1);
	return new;
}

/*
 * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
 * eliminate the MPOL_F_* flags that require conditional ref and
 * [NOTE!!!] drop the extra ref.  It is not safe to reference *frompol
 * directly after return; use the returned value instead.
 *
 * Allows use of a mempolicy for, e.g., multiple allocations with a single
 * policy lookup, even if the policy needs/has an extra ref on lookup.
 * shmem_readahead needs this.
 */
struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
						struct mempolicy *frompol)
{
	if (!mpol_needs_cond_ref(frompol))
		return frompol;

	*tompol = *frompol;
	tompol->flags &= ~MPOL_F_SHARED;	/* copy doesn't need unref */
	__mpol_put(frompol);
	return tompol;
}
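
/*
 * Usage sketch (editor's illustration; the surrounding code is an
 * assumption, not a quote of shmem): a caller wanting one policy lookup to
 * cover several allocations can flatten a shared policy onto the stack:
 *
 *	struct mempolicy mpol, *spol;
 *	spol = __mpol_cond_copy(&mpol,
 *				mpol_shared_policy_lookup(sp, idx));
 *	... use spol for multiple allocations, no further ref counting;
 *	    spol may be NULL if no policy was stored for idx ...
 */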

static int mpol_match_intent(const struct mempolicy *a,
			     const struct mempolicy *b)
{
	if (a->flags != b->flags)
		return 0;
	if (!mpol_store_user_nodemask(a))
		return 1;
	return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
}

/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	if (!a || !b)
		return 0;
	if (a->mode != b->mode)
		return 0;
	if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
		return 0;
	switch (a->mode) {
	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		return nodes_equal(a->v.nodes, b->v.nodes);
	case MPOL_PREFERRED:
		return a->v.preferred_node == b->v.preferred_node;
	default:
		BUG();
		return 0;
	}
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a red-black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */

/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
	struct rb_node *n = sp->root.rb_node;

	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);

		if (start >= p->end)
			n = n->rb_right;
		else if (end <= p->start)
			n = n->rb_left;
		else
			break;
	}
	if (!n)
		return NULL;
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}
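
/*
 * Illustration (editor's sketch): with stored ranges [0,4) and [6,10), a
 * lookup for start == 3, end == 7 first breaks out of the downward walk on
 * an intersecting node, and the backward rb_prev() pass then returns the
 * leftmost intersecting range, here [0,4).
 */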

/* Insert a new shared policy into the list. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;
	struct sp_node *nd;

	while (*p) {
		parent = *p;
		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
			p = &(*p)->rb_left;
		else if (new->end > nd->end)
			p = &(*p)->rb_right;
		else
			BUG();
	}
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
		 new->policy ? new->policy->mode : 0);
}

/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
	struct mempolicy *pol = NULL;
	struct sp_node *sn;

	if (!sp->root.rb_node)
		return NULL;
	spin_lock(&sp->lock);
	sn = sp_lookup(sp, idx, idx+1);
	if (sn) {
		mpol_get(sn->policy);
		pol = sn->policy;
	}
	spin_unlock(&sp->lock);
	return pol;
}

static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	pr_debug("deleting %lx-%lx\n", n->start, n->end);
	rb_erase(&n->nd, &sp->root);
	mpol_put(n->policy);
	kmem_cache_free(sn_cache, n);
}

static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
				struct mempolicy *pol)
{
	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

	if (!n)
		return NULL;
	n->start = start;
	n->end = end;
	mpol_get(pol);
	pol->flags |= MPOL_F_SHARED;	/* for unref */
	n->policy = pol;
	return n;
}

/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				 unsigned long end, struct sp_node *new)
{
	struct sp_node *n, *new2 = NULL;

restart:
	spin_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				if (!new2) {
					spin_unlock(&sp->lock);
					new2 = sp_alloc(end, n->end, n->policy);
					if (!new2)
						return -ENOMEM;
					goto restart;
				}
				n->end = start;
				sp_insert(sp, new2);
				new2 = NULL;
				break;
			} else
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	spin_unlock(&sp->lock);
	if (new2) {
		mpol_put(new2->policy);
		kmem_cache_free(sn_cache, new2);
	}
	return 0;
}
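
/*
 * Illustration (editor's sketch): inserting a new policy for [2,4) into a
 * store holding a single node that covers [0,8) trims the old node to
 * [0,2), inserts a copy of the old policy for [4,8) via new2, and finally
 * inserts the new node covering [2,4).
 */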

void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
			unsigned short flags, nodemask_t *policy_nodes)
{
	info->root = RB_ROOT;
	spin_lock_init(&info->lock);

	if (policy != MPOL_DEFAULT) {
		struct mempolicy *newpol;

		/* Falls back to NULL policy [MPOL_DEFAULT] on any error */
		newpol = mpol_new(policy, flags, policy_nodes);
		if (!IS_ERR(newpol)) {
			/* Create pseudo-vma that contains just the policy */
			struct vm_area_struct pvma;

			memset(&pvma, 0, sizeof(struct vm_area_struct));
			/* Policy covers entire file */
			pvma.vm_end = TASK_SIZE;
			mpol_set_shared_policy(info, &pvma, newpol);
			mpol_put(newpol);
		}
	}
}

int mpol_set_shared_policy(struct shared_policy *info,
			struct vm_area_struct *vma, struct mempolicy *npol)
{
	int err;
	struct sp_node *new = NULL;
	unsigned long sz = vma_pages(vma);

	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
		 vma->vm_pgoff,
		 sz, npol ? npol->mode : -1,
		 npol ? npol->flags : -1,
		 npol ? nodes_addr(npol->v.nodes)[0] : -1);

	if (npol) {
		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
		if (!new)
			return -ENOMEM;
	}
	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
	if (err && new)
		kmem_cache_free(sn_cache, new);
	return err;
}

/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
	struct sp_node *n;
	struct rb_node *next;

	if (!p->root.rb_node)
		return;
	spin_lock(&p->lock);
	next = rb_first(&p->root);
	while (next) {
		n = rb_entry(next, struct sp_node, nd);
		next = rb_next(&n->nd);
		rb_erase(&n->nd, &p->root);
		mpol_put(n->policy);
		kmem_cache_free(sn_cache, n);
	}
	spin_unlock(&p->lock);
}

/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
	nodemask_t interleave_nodes;
	unsigned long largest = 0;
	int nid, prefer = 0;

	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL);

	/*
	 * Set interleaving policy for system init. Interleaving is only
	 * enabled across suitably sized nodes (default is >= 16MB), or
	 * fall back to the largest node if they're all smaller.
	 */
	nodes_clear(interleave_nodes);
	for_each_node_state(nid, N_HIGH_MEMORY) {
		unsigned long total_pages = node_present_pages(nid);

		/* Preserve the largest node */
		if (largest < total_pages) {
			largest = total_pages;
			prefer = nid;
		}

		/* Interleave this node? */
		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
			node_set(nid, interleave_nodes);
	}

	/* All too small, use the largest */
	if (unlikely(nodes_empty(interleave_nodes)))
		node_set(prefer, interleave_nodes);

	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
		printk("numa_policy_init: interleaving failed\n");
}
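
/*
 * Illustration (editor's sketch with hypothetical node sizes, assuming
 * PAGE_SHIFT == 12): a node needs at least (16 << 20) >> 12 == 4096 present
 * pages to join the boot-time interleave set, so a machine with a 2GB node 0
 * and an 8MB node 1 would end up interleaving over node 0 alone.
 */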

/* Reset policy of current process to default */
void numa_default_policy(void)
{
	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}

/*
 * Display pages allocated per node and memory policy via /proc.
 */
static const char * const policy_types[] =
	{ "default", "prefer", "bind", "interleave" };

/*
 * Convert a mempolicy into a string.
 * Returns the number of characters in buffer (if positive)
 * or an error (negative)
 */
static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
	char *p = buffer;
	int l;
	nodemask_t nodes;
	unsigned short mode;
	unsigned short flags = pol ? pol->flags : 0;

	if (!pol || pol == &default_policy)
		mode = MPOL_DEFAULT;
	else
		mode = pol->mode;

	switch (mode) {
	case MPOL_DEFAULT:
		nodes_clear(nodes);
		break;

	case MPOL_PREFERRED:
		nodes_clear(nodes);
		node_set(pol->v.preferred_node, nodes);
		break;

	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		nodes = pol->v.nodes;
		break;

	default:
		BUG();
		return -EFAULT;
	}

	l = strlen(policy_types[mode]);
	if (buffer + maxlen < p + l + 1)
		return -ENOSPC;

	strcpy(p, policy_types[mode]);
	p += l;

	if (flags) {
		int need_bar = 0;

		if (buffer + maxlen < p + 2)
			return -ENOSPC;
		*p++ = '=';

		if (flags & MPOL_F_STATIC_NODES)
			p += sprintf(p, "%sstatic", need_bar++ ? "|" : "");
		if (flags & MPOL_F_RELATIVE_NODES)
			p += sprintf(p, "%srelative", need_bar++ ? "|" : "");
	}

	if (!nodes_empty(nodes)) {
		if (buffer + maxlen < p + 2)
			return -ENOSPC;
		*p++ = '=';
		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
	}
	return p - buffer;
}
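
/*
 * Illustration (editor's sketch of the resulting strings): an interleave
 * policy over nodes 0-3 renders as "interleave=0-3", a preferred policy for
 * node 2 as "prefer=2", a bind policy created with MPOL_F_STATIC_NODES over
 * nodes 1 and 3 as "bind=static=1,3", and the system default as "default".
 */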

struct numa_maps {
	unsigned long pages;
	unsigned long anon;
	unsigned long active;
	unsigned long writeback;
	unsigned long mapcount_max;
	unsigned long dirty;
	unsigned long swapcache;
	unsigned long node[MAX_NUMNODES];
};

static void gather_stats(struct page *page, void *private, int pte_dirty)
{
	struct numa_maps *md = private;
	int count = page_mapcount(page);

	md->pages++;
	if (pte_dirty || PageDirty(page))
		md->dirty++;

	if (PageSwapCache(page))
		md->swapcache++;

	if (PageActive(page))
		md->active++;

	if (PageWriteback(page))
		md->writeback++;

	if (PageAnon(page))
		md->anon++;

	if (count > md->mapcount_max)
		md->mapcount_max = count;

	md->node[page_to_nid(page)]++;
}

#ifdef CONFIG_HUGETLB_PAGE
static void check_huge_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct numa_maps *md)
{
	unsigned long addr;
	struct page *page;

	for (addr = start; addr < end; addr += HPAGE_SIZE) {
		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
		pte_t pte;

		if (!ptep)
			continue;

		pte = *ptep;
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		if (!page)
			continue;

		gather_stats(page, md, pte_dirty(*ptep));
	}
}
#else
static inline void check_huge_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct numa_maps *md)
{
}
#endif

int show_numa_map(struct seq_file *m, void *v)
{
	struct proc_maps_private *priv = m->private;
	struct vm_area_struct *vma = v;
	struct numa_maps *md;
	struct file *file = vma->vm_file;
	struct mm_struct *mm = vma->vm_mm;
	struct mempolicy *pol;
	int n;
	char buffer[50];

	if (!mm)
		return 0;

	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
	if (!md)
		return 0;

	pol = get_vma_policy(priv->task, vma, vma->vm_start);
	mpol_to_str(buffer, sizeof(buffer), pol);
	mpol_cond_put(pol);

	seq_printf(m, "%08lx %s", vma->vm_start, buffer);

	if (file) {
		seq_printf(m, " file=");
		seq_path(m, &file->f_path, "\n\t= ");
	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
		seq_printf(m, " heap");
	} else if (vma->vm_start <= mm->start_stack &&
			vma->vm_end >= mm->start_stack) {
		seq_printf(m, " stack");
	}

	if (is_vm_hugetlb_page(vma)) {
		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
		seq_printf(m, " huge");
	} else {
		check_pgd_range(vma, vma->vm_start, vma->vm_end,
			&node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
	}

	if (!md->pages)
		goto out;

	if (md->anon)
		seq_printf(m, " anon=%lu", md->anon);

	if (md->dirty)
		seq_printf(m, " dirty=%lu", md->dirty);

	if (md->pages != md->anon && md->pages != md->dirty)
		seq_printf(m, " mapped=%lu", md->pages);

	if (md->mapcount_max > 1)
		seq_printf(m, " mapmax=%lu", md->mapcount_max);

	if (md->swapcache)
		seq_printf(m, " swapcache=%lu", md->swapcache);

	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
		seq_printf(m, " active=%lu", md->active);

	if (md->writeback)
		seq_printf(m, " writeback=%lu", md->writeback);

	for_each_node_state(n, N_HIGH_MEMORY)
		if (md->node[n])
			seq_printf(m, " N%d=%lu", n, md->node[n]);
out:
	seq_putc(m, '\n');
	kfree(md);

	if (m->count < m->size)
		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
	return 0;
}
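
/*
 * Illustration (editor's sketch; addresses and counts are hypothetical, and
 * the real output is a single line, wrapped here for the comment): a line
 * emitted by the code above into /proc/<pid>/numa_maps might look like
 *
 *	7f2c00000000 interleave=0-3 anon=2048 dirty=2048 active=1024
 *		N0=512 N1=512 N2=512 N3=512
 *
 * i.e. the VMA start address, the policy string from mpol_to_str(), and
 * then only the counters that are non-zero, ending with per-node totals.
 */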