/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave	Allocate memory interleaved over a set of nodes,
 *		with normal fallback if it fails.
 *		For VMA based allocations this interleaves based on the
 *		offset into the backing object or offset into the mapping
 *		for anonymous memory. For process policy a process counter
 *		is used.
 * bind		Only allocate memory on a specific set of nodes,
 *		no fallback.
 * preferred	Try a specific node first before normal fallback.
 *		As a special case node -1 here means do the allocation
 *		on the local CPU. This is normally identical to default,
 *		but useful to set in a VMA when you have a non-default
 *		process policy.
 * default	Allocate on the local node first, or when on a VMA
 *		use the process policy. This is what Linux always did
 *		in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
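
/*
 * Illustrative sketch (editor's comment, not part of the kernel build):
 * from user space the policies above are selected through the mbind() and
 * set_mempolicy() system calls implemented further down in this file,
 * roughly like
 *
 *	// select nodes 0 and 1
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask));
 *	mbind(addr, len, MPOL_BIND, &mask, 8 * sizeof(mask), MPOL_MF_STRICT);
 *
 * The first call sets the process policy, the second the policy of the VMAs
 * covering [addr, addr+len). This is a sketch of intended usage, not a
 * tested program; see the mbind(2)/set_mempolicy(2) man pages for the
 * exact ABI.
 */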

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
   could replace all the switch()es with a mempolicy_ops structure.
*/

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>

static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;

#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
static int policy_zone;

struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.policy = MPOL_DEFAULT,
};

/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, nodemask_t *nodes)
{
	int empty = nodes_empty(*nodes);

	switch (mode) {
	case MPOL_DEFAULT:
		if (!empty)
			return -EINVAL;
		break;
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
		/* Preferred will only use the first bit, but allow
		   more for now. */
		if (empty)
			return -EINVAL;
		break;
	}
	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
		     unsigned long maxnode, int mode)
{
	unsigned long k;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/* When the user specified more nodes than supported, just check
	   that the unsupported part is all zero. */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		if (nlongs > PAGE_SIZE/sizeof(long))
			return -EINVAL;
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			unsigned long t;
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes_addr(*nodes)[nlongs-1] &= endmask;
	/* Update current mems_allowed */
	cpuset_update_current_mems_allowed();
	/* Ignore nodes not set in current->mems_allowed */
	/* AK: shouldn't this error out instead? */
	cpuset_restrict_to_mems_allowed(nodes_addr(*nodes));
	return mpol_check_policy(mode, nodes);
}
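/*
 * Worked example (editor's illustrative comment): on a 64-bit kernel a
 * caller-supplied maxnode of 17 becomes 16 after the decrement above, so
 * nlongs == 1 and endmask == 0xffff; only node bits 0-15 survive the copy
 * and any higher bits the caller set in the same word are silently masked
 * off, whereas non-zero bits in whole words beyond MAX_NUMNODES are
 * rejected with -EINVAL by the loop above.
 */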

/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
	struct zonelist *zl;
	int num, max, nd;

	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
	if (!zl)
		return NULL;
	num = 0;
	for_each_node_mask(nd, *nodes) {
		int k;
		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
			struct zone *z = &NODE_DATA(nd)->node_zones[k];
			if (!z->present_pages)
				continue;
			zl->zones[num++] = z;
			if (k > policy_zone)
				policy_zone = k;
		}
	}
	BUG_ON(num >= max);
	zl->zones[num] = NULL;
	return zl;
}

/* Create a new policy */
static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
{
	struct mempolicy *policy;

	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
	if (mode == MPOL_DEFAULT)
		return NULL;
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	switch (mode) {
	case MPOL_INTERLEAVE:
		policy->v.nodes = *nodes;
		break;
	case MPOL_PREFERRED:
		policy->v.preferred_node = first_node(*nodes);
		if (policy->v.preferred_node >= MAX_NUMNODES)
			policy->v.preferred_node = -1;
		break;
	case MPOL_BIND:
		policy->v.zonelist = bind_zonelist(nodes);
		if (policy->v.zonelist == NULL) {
			kmem_cache_free(policy_cache, policy);
			return ERR_PTR(-ENOMEM);
		}
		break;
	}
	policy->policy = mode;
	return policy;
}

/* Ensure all existing pages follow the policy. */
static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
{
	pte_t *orig_pte;
	pte_t *pte;

	spin_lock(&mm->page_table_lock);
	orig_pte = pte = pte_offset_map(pmd, addr);
	do {
		unsigned long pfn;
		unsigned int nid;

		if (!pte_present(*pte))
			continue;
		pfn = pte_pfn(*pte);
		if (!pfn_valid(pfn))
			continue;
		nid = pfn_to_nid(pfn);
		if (!node_isset(nid, *nodes))
			break;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap(orig_pte);
	spin_unlock(&mm->page_table_lock);
	return addr != end;
}

static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		if (check_pte_range(mm, pmd, addr, next, nodes))
			return -EIO;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		if (check_pmd_range(mm, pud, addr, next, nodes))
			return -EIO;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int check_pgd_range(struct mm_struct *mm,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
{
	pgd_t *pgd;
	unsigned long next;

	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		if (check_pud_range(mm, pgd, addr, next, nodes))
			return -EIO;
	} while (pgd++, addr = next, addr != end);
	return 0;
}

/* Step 1: check the range */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
	    nodemask_t *nodes, unsigned long flags)
{
	int err;
	struct vm_area_struct *first, *vma, *prev;

	first = find_vma(mm, start);
	if (!first)
		return ERR_PTR(-EFAULT);
	prev = NULL;
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		if (!vma->vm_next && vma->vm_end < end)
			return ERR_PTR(-EFAULT);
		if (prev && prev->vm_end < vma->vm_start)
			return ERR_PTR(-EFAULT);
		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
			unsigned long endvma = vma->vm_end;
			if (endvma > end)
				endvma = end;
			if (vma->vm_start > start)
				start = vma->vm_start;
			err = check_pgd_range(vma->vm_mm,
					      start, endvma, nodes);
			if (err) {
				first = ERR_PTR(err);
				break;
			}
		}
		prev = vma;
	}
	return first;
}

/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
	int err = 0;
	struct mempolicy *old = vma->vm_policy;

	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	if (vma->vm_ops && vma->vm_ops->set_policy)
		err = vma->vm_ops->set_policy(vma, new);
	if (!err) {
		mpol_get(new);
		vma->vm_policy = new;
		mpol_free(old);
	}
	return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, struct mempolicy *new)
{
	struct vm_area_struct *next;
	int err;

	err = 0;
	for (; vma && vma->vm_start < end; vma = next) {
		next = vma->vm_next;
		if (vma->vm_start < start)
			err = split_vma(vma->vm_mm, vma, start, 1);
		if (!err && vma->vm_end > end)
			err = split_vma(vma->vm_mm, vma, end, 0);
		if (!err)
			err = policy_vma(vma, new);
		if (err)
			break;
	}
	return err;
}

/* Change policy for a memory range */
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
			  unsigned long mode,
			  unsigned long __user *nmask, unsigned long maxnode,
			  unsigned flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	nodemask_t nodes;
	int err;

	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
		return -EINVAL;
	if (start & ~PAGE_MASK)
		return -EINVAL;
	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;
	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;
	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	err = get_nodes(&nodes, nmask, maxnode, mode);
	if (err)
		return err;

	new = mpol_new(mode, &nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);

	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
		 mode, nodes_addr(nodes)[0]);

	down_write(&mm->mmap_sem);
	vma = check_range(mm, start, end, &nodes, flags);
	err = PTR_ERR(vma);
	if (!IS_ERR(vma))
		err = mbind_range(vma, start, end, new);
	up_write(&mm->mmap_sem);
	mpol_free(new);
	return err;
}

/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
				  unsigned long maxnode)
{
	int err;
	struct mempolicy *new;
	nodemask_t nodes;

	if (mode < 0 || mode > MPOL_MAX)
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode, mode);
	if (err)
		return err;
	new = mpol_new(mode, &nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);
	mpol_free(current->mempolicy);
	current->mempolicy = new;
	if (new && new->policy == MPOL_INTERLEAVE)
		current->il_next = first_node(new->v.nodes);
	return 0;
}

/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
{
	int i;

	nodes_clear(*nodes);
	switch (p->policy) {
	case MPOL_BIND:
		for (i = 0; p->v.zonelist->zones[i]; i++)
			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, *nodes);
		break;
	case MPOL_DEFAULT:
		break;
	case MPOL_INTERLEAVE:
		*nodes = p->v.nodes;
		break;
	case MPOL_PREFERRED:
		/* or use current node instead of online map? */
		if (p->v.preferred_node < 0)
			*nodes = node_online_map;
		else
			node_set(p->v.preferred_node, *nodes);
		break;
	default:
		BUG();
	}
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
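/*
 * Worked example (editor's illustrative comment): with maxnode == 1024 from
 * the caller and MAX_NUMNODES == 64 on a 64-bit kernel, copy starts out as
 * ALIGN(1023, 64)/8 == 128 bytes while nbytes == 8, so bytes 8..127 of the
 * user buffer are cleared and only the first 8 bytes receive node bits.
 */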

/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
				  unsigned long __user *nmask,
				  unsigned long maxnode,
				  unsigned long addr, unsigned long flags)
{
	int err, pval;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
		return -EINVAL;
	if (nmask != NULL && maxnode < MAX_NUMNODES)
		return -EINVAL;
	if (flags & MPOL_F_ADDR) {
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			pval = err;
		} else if (pol == current->mempolicy &&
				pol->policy == MPOL_INTERLEAVE) {
			pval = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else
		pval = pol->policy;

	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	if (policy && put_user(pval, policy))
		return -EFAULT;

	err = 0;
	if (nmask) {
		nodemask_t nodes;
		get_zonemask(pol, &nodes);
		err = copy_nodes_to_user(nmask, maxnode, &nodes);
	}

 out:
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}

#ifdef CONFIG_COMPAT

asmlinkage long compat_sys_get_mempolicy(int __user *policy,
				     compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode,
				     compat_ulong_t addr, compat_ulong_t flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		err = copy_from_user(bm, nm, alloc_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}

asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_set_mempolicy(mode, nm, nr_bits+1);
}

asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
			     compat_ulong_t mode, compat_ulong_t __user *nmask,
			     compat_ulong_t maxnode, compat_ulong_t flags)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	nodemask_t bm;

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif

/* Return effective policy for a VMA */
struct mempolicy *
get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = task->mempolicy;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else if (vma->vm_policy &&
				vma->vm_policy->policy != MPOL_DEFAULT)
			pol = vma->vm_policy;
	}
	if (!pol)
		pol = &default_policy;
	return pol;
}

/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
{
	int nd;

	switch (policy->policy) {
	case MPOL_PREFERRED:
		nd = policy->v.preferred_node;
		if (nd < 0)
			nd = numa_node_id();
		break;
	case MPOL_BIND:
		/* Lower zones don't get a policy applied */
		/* Careful: current->mems_allowed might have moved */
		if (gfp_zone(gfp) >= policy_zone)
			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
				return policy->v.zonelist;
		/*FALL THROUGH*/
	case MPOL_INTERLEAVE: /* should not happen */
	case MPOL_DEFAULT:
		nd = numa_node_id();
		break;
	default:
		nd = 0;
		BUG();
	}
	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
}

/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
	unsigned nid, next;
	struct task_struct *me = current;

	nid = me->il_next;
	BUG_ON(nid >= MAX_NUMNODES);
	next = next_node(nid, policy->v.nodes);
	if (next >= MAX_NUMNODES)
		next = first_node(policy->v.nodes);
	me->il_next = next;
	return nid;
}

/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
		struct vm_area_struct *vma, unsigned long off)
{
	unsigned nnodes = nodes_weight(pol->v.nodes);
	unsigned target = (unsigned)off % nnodes;
	int c;
	int nid = -1;

	c = 0;
	do {
		nid = next_node(nid, pol->v.nodes);
		c++;
	} while (c <= target);
	BUG_ON(nid >= MAX_NUMNODES);
	return nid;
}
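/*
 * Worked example (editor's illustrative comment): with pol->v.nodes == {0,2,3}
 * and off == 5, nnodes == 3 and target == 5 % 3 == 2, so the loop walks the
 * set bits 0 -> 2 -> 3 and returns node 3. The same offset always maps to the
 * same node, which keeps file-backed interleaving stable across faults.
 */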

/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, unsigned nid)
{
	struct zonelist *zl;
	struct page *page;

	BUG_ON(!node_online(nid));
	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
	page = __alloc_pages(gfp, order, zl);
	if (page && page_zone(page) == zl->zones[0]) {
		zone_pcp(zl->zones[0], get_cpu())->interleave_hit++;
		put_cpu();
	}
	return page;
}

/**
 * alloc_page_vma - Allocate a page for a VMA.
 *
 * @gfp:
 *      %GFP_USER    user allocation.
 *      %GFP_KERNEL  kernel allocations,
 *      %GFP_HIGHMEM highmem/user allocations,
 *      %GFP_FS      allocation should not call back into a file system.
 *      %GFP_ATOMIC  don't sleep.
 *
 * @vma:  Pointer to VMA or NULL if not available.
 * @addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
 * When VMA is not NULL the caller must hold down_read on the mmap_sem of the
 * mm_struct of the VMA to prevent it from going away. Should be used for
 * all allocations for pages that will be mapped into
 * user space. Returns NULL when no page can be allocated.
 *
 * Should be called with the mmap_sem of the vma held.
 */
struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	cpuset_update_current_mems_allowed();

	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
		unsigned nid;
		if (vma) {
			unsigned long off;
			BUG_ON(addr >= vma->vm_end);
			BUG_ON(addr < vma->vm_start);
			off = vma->vm_pgoff;
			off += (addr - vma->vm_start) >> PAGE_SHIFT;
			nid = offset_il_node(pol, vma, off);
		} else {
			/* fall back to process interleaving */
			nid = interleave_nodes(pol);
		}
		return alloc_page_interleave(gfp, 0, nid);
	}
	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}

/**
 * alloc_pages_current - Allocate pages.
 *
 * @gfp:
 *      %GFP_USER    user allocation,
 *      %GFP_KERNEL  kernel allocation,
 *      %GFP_HIGHMEM highmem allocation,
 *      %GFP_FS      don't call back into a file system.
 *      %GFP_ATOMIC  don't sleep.
 * @order: Power of two of allocation size in pages. 0 is a single page.
 *
 * Allocate a page from the kernel page pool, applying the current process'
 * NUMA policy when not in interrupt context.
 * Returns NULL when no page can be allocated.
 *
 * Don't call cpuset_update_current_mems_allowed() unless
 * 1) it's ok to take cpuset_sem (can WAIT), and
 * 2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol = current->mempolicy;

	if ((gfp & __GFP_WAIT) && !in_interrupt())
		cpuset_update_current_mems_allowed();
	if (!pol || in_interrupt())
		pol = &default_policy;
	if (pol->policy == MPOL_INTERLEAVE)
		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);

/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);
	*new = *old;
	atomic_set(&new->refcnt, 1);
	if (new->policy == MPOL_BIND) {
		int sz = ksize(old->v.zonelist);
		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
		if (!new->v.zonelist) {
			kmem_cache_free(policy_cache, new);
			return ERR_PTR(-ENOMEM);
		}
		memcpy(new->v.zonelist, old->v.zonelist, sz);
	}
	return new;
}

/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	if (!a || !b)
		return 0;
	if (a->policy != b->policy)
		return 0;
	switch (a->policy) {
	case MPOL_DEFAULT:
		return 1;
	case MPOL_INTERLEAVE:
		return nodes_equal(a->v.nodes, b->v.nodes);
	case MPOL_PREFERRED:
		return a->v.preferred_node == b->v.preferred_node;
	case MPOL_BIND: {
		int i;
		for (i = 0; a->v.zonelist->zones[i]; i++)
			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
				return 0;
		return b->v.zonelist->zones[i] == NULL;
	}
	default:
		BUG();
		return 0;
	}
}

/* Slow path of a mpol destructor. */
void __mpol_free(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	if (p->policy == MPOL_BIND)
		kfree(p->v.zonelist);
	p->policy = MPOL_DEFAULT;
	kmem_cache_free(policy_cache, p);
}

/*
 * Hugetlb policy. Same as above, just works with node numbers instead of
 * zonelists.
 */

/* Find first node suitable for an allocation */
int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	switch (pol->policy) {
	case MPOL_DEFAULT:
		return numa_node_id();
	case MPOL_BIND:
		return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
	case MPOL_INTERLEAVE:
		return interleave_nodes(pol);
	case MPOL_PREFERRED:
		return pol->v.preferred_node >= 0 ?
				pol->v.preferred_node : numa_node_id();
	}
	BUG();
	return 0;
}

/* Find secondary valid nodes for an allocation */
int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	switch (pol->policy) {
	case MPOL_PREFERRED:
	case MPOL_DEFAULT:
	case MPOL_INTERLEAVE:
		return 1;
	case MPOL_BIND: {
		struct zone **z;
		for (z = pol->v.zonelist->zones; *z; z++)
			if ((*z)->zone_pgdat->node_id == nid)
				return 1;
		return 0;
	}
	default:
		BUG();
		return 0;
	}
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */

/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
	struct rb_node *n = sp->root.rb_node;

	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);

		if (start >= p->end)
			n = n->rb_right;
		else if (end <= p->start)
			n = n->rb_left;
		else
			break;
	}
	if (!n)
		return NULL;
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}

/* Insert a new shared policy into the list. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;
	struct sp_node *nd;

	while (*p) {
		parent = *p;
		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
			p = &(*p)->rb_left;
		else if (new->end > nd->end)
			p = &(*p)->rb_right;
		else
			BUG();
	}
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
		 new->policy ? new->policy->policy : 0);
}

/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
	struct mempolicy *pol = NULL;
	struct sp_node *sn;

	if (!sp->root.rb_node)
		return NULL;
	spin_lock(&sp->lock);
	sn = sp_lookup(sp, idx, idx+1);
	if (sn) {
		mpol_get(sn->policy);
		pol = sn->policy;
	}
	spin_unlock(&sp->lock);
	return pol;
}

static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	PDprintk("deleting %lx-%lx\n", n->start, n->end);
	rb_erase(&n->nd, &sp->root);
	mpol_free(n->policy);
	kmem_cache_free(sn_cache, n);
}

struct sp_node *
sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
{
	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

	if (!n)
		return NULL;
	n->start = start;
	n->end = end;
	mpol_get(pol);
	n->policy = pol;
	return n;
}

/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				 unsigned long end, struct sp_node *new)
{
	struct sp_node *n, *new2 = NULL;

restart:
	spin_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				if (!new2) {
					spin_unlock(&sp->lock);
					new2 = sp_alloc(end, n->end, n->policy);
					if (!new2)
						return -ENOMEM;
					goto restart;
				}
				n->end = start;
				sp_insert(sp, new2);
				new2 = NULL;
				break;
			} else
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	spin_unlock(&sp->lock);
	if (new2) {
		mpol_free(new2->policy);
		kmem_cache_free(sn_cache, new2);
	}
	return 0;
}
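/*
 * Worked example (editor's illustrative comment): if one sp_node covers page
 * offsets [0,10) with policy A and the range [3,7) is replaced with policy B,
 * the old node is trimmed to [0,3), a copy of it is inserted for [7,10), and
 * the new node for [3,7) is inserted last, so the tree ends up as
 * [0,3)=A, [3,7)=B, [7,10)=A.
 */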
1095
1096int mpol_set_shared_policy(struct shared_policy *info,
1097 struct vm_area_struct *vma, struct mempolicy *npol)
1098{
1099 int err;
1100 struct sp_node *new = NULL;
1101 unsigned long sz = vma_pages(vma);
1102
1103 PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1104 vma->vm_pgoff,
1105 sz, npol? npol->policy : -1,
Andi Kleendfcd3c02005-10-29 18:15:48 -07001106 npol ? nodes_addr(npol->v.nodes)[0] : -1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001107
1108 if (npol) {
1109 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1110 if (!new)
1111 return -ENOMEM;
1112 }
1113 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1114 if (err && new)
1115 kmem_cache_free(sn_cache, new);
1116 return err;
1117}
1118
1119/* Free a backing policy store on inode delete. */
1120void mpol_free_shared_policy(struct shared_policy *p)
1121{
1122 struct sp_node *n;
1123 struct rb_node *next;
1124
1125 if (!p->root.rb_node)
1126 return;
1127 spin_lock(&p->lock);
1128 next = rb_first(&p->root);
1129 while (next) {
1130 n = rb_entry(next, struct sp_node, nd);
1131 next = rb_next(&n->nd);
Andi Kleen90c50292005-07-27 11:43:50 -07001132 rb_erase(&n->nd, &p->root);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001133 mpol_free(n->policy);
1134 kmem_cache_free(sn_cache, n);
1135 }
1136 spin_unlock(&p->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001137}
1138
1139/* assumes fs == KERNEL_DS */
1140void __init numa_policy_init(void)
1141{
1142 policy_cache = kmem_cache_create("numa_policy",
1143 sizeof(struct mempolicy),
1144 0, SLAB_PANIC, NULL, NULL);
1145
1146 sn_cache = kmem_cache_create("shared_policy_node",
1147 sizeof(struct sp_node),
1148 0, SLAB_PANIC, NULL, NULL);
1149
1150 /* Set interleaving policy for system init. This way not all
1151 the data structures allocated at system boot end up in node zero. */
1152
1153 if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
1154 MAX_NUMNODES) < 0)
1155 printk("numa_policy_init: interleaving failed\n");
1156}
1157
1158/* Reset policy of current process to default.
1159 * Assumes fs == KERNEL_DS */
1160void numa_default_policy(void)
1161{
1162 sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
1163}