/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a per-process
 *                counter is used.
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 * preferred      Try a specific node first before normal fallback.
 *                As a special case node -1 here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
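
/*
 * Illustrative user-space sketch of the two interfaces described above
 * (not part of this file; the direct syscall() invocations and constants
 * are an assumption -- applications normally go through the libnuma
 * wrappers instead): interleave the process' allocations over nodes 0
 * and 1, then bind one anonymous mapping to node 1 only.
 *
 *	unsigned long both  = 0x3;	// nodes 0 and 1
 *	unsigned long node1 = 0x2;	// node 1 only
 *
 *	// process policy: used for most later allocations in this task
 *	syscall(__NR_set_mempolicy, MPOL_INTERLEAVE, &both,
 *		sizeof(both) * 8);
 *
 *	void *p = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *	// VMA policy: overrides the process policy for this range only
 *	syscall(__NR_mbind, p, 1 << 20, MPOL_BIND, &node1,
 *		sizeof(node1) * 8, MPOL_MF_STRICT);
 */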

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
   could replace all the switch()es with a mempolicy_ops structure.
*/

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/mempolicy.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>

static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;

#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
static int policy_zone;

struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.policy = MPOL_DEFAULT,
};

/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, nodemask_t *nodes)
{
	int empty = nodes_empty(*nodes);

	switch (mode) {
	case MPOL_DEFAULT:
		if (!empty)
			return -EINVAL;
		break;
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
		/* Preferred will only use the first bit, but allow
		   more for now. */
		if (empty)
			return -EINVAL;
		break;
	}
	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
		     unsigned long maxnode, int mode)
{
	unsigned long k;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/* When the user specified more nodes than supported just check
	   if the non supported part is all zero. */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		if (nlongs > PAGE_SIZE/sizeof(long))
			return -EINVAL;
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			unsigned long t;
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes_addr(*nodes)[nlongs-1] &= endmask;
	/* Update current mems_allowed */
	cpuset_update_current_mems_allowed();
	/* Ignore nodes not set in current->mems_allowed */
	/* AK: shouldn't this error out instead? */
	cpuset_restrict_to_mems_allowed(nodes_addr(*nodes));
	return mpol_check_policy(mode, nodes);
}
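
/*
 * Worked example for the oversized-mask check above (numbers are
 * illustrative, assuming MAX_NUMNODES == 64 and 64-bit longs): a caller
 * passing maxnode == 129 supplies two longs, but the kernel can only use
 * the first one.  get_nodes() therefore reads the second long and fails
 * with -EINVAL unless it is entirely zero, then copies just the first
 * long and masks it with endmask so bits beyond maxnode-1 are ignored.
 */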

/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
	struct zonelist *zl;
	int num, max, nd;

	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
	if (!zl)
		return NULL;
	num = 0;
	for_each_node_mask(nd, *nodes) {
		int k;
		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
			struct zone *z = &NODE_DATA(nd)->node_zones[k];
			if (!z->present_pages)
				continue;
			zl->zones[num++] = z;
			if (k > policy_zone)
				policy_zone = k;
		}
	}
	zl->zones[num] = NULL;
	return zl;
}
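
/*
 * Worked example (illustrative): for nodes = {0, 2} on a kernel with
 * DMA/NORMAL/HIGHMEM zones, the resulting zonelist is all of node 0's
 * present zones from highest to lowest, then node 2's, then the NULL
 * terminator.  Because no other node ever appears in the list, a BIND
 * allocation can never fall back outside the bound nodes.
 */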

/* Create a new policy */
static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
{
	struct mempolicy *policy;

	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
	if (mode == MPOL_DEFAULT)
		return NULL;
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	switch (mode) {
	case MPOL_INTERLEAVE:
		policy->v.nodes = *nodes;
		break;
	case MPOL_PREFERRED:
		policy->v.preferred_node = first_node(*nodes);
		if (policy->v.preferred_node >= MAX_NUMNODES)
			policy->v.preferred_node = -1;
		break;
	case MPOL_BIND:
		policy->v.zonelist = bind_zonelist(nodes);
		if (policy->v.zonelist == NULL) {
			kmem_cache_free(policy_cache, policy);
			return ERR_PTR(-ENOMEM);
		}
		break;
	}
	policy->policy = mode;
	return policy;
}

/* Ensure all existing pages follow the policy. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
{
	pte_t *orig_pte;
	pte_t *pte;

	spin_lock(&vma->vm_mm->page_table_lock);
	orig_pte = pte = pte_offset_map(pmd, addr);
	do {
		unsigned long pfn;
		unsigned int nid;

		if (!pte_present(*pte))
			continue;
		pfn = pte_pfn(*pte);
		if (!pfn_valid(pfn)) {
			print_bad_pte(vma, *pte, addr);
			continue;
		}
		nid = pfn_to_nid(pfn);
		if (!node_isset(nid, *nodes))
			break;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap(orig_pte);
	spin_unlock(&vma->vm_mm->page_table_lock);
	return addr != end;
}

static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		if (check_pte_range(vma, pmd, addr, next, nodes))
			return -EIO;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		if (check_pmd_range(vma, pud, addr, next, nodes))
			return -EIO;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int check_pgd_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
{
	pgd_t *pgd;
	unsigned long next;

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		if (check_pud_range(vma, pgd, addr, next, nodes))
			return -EIO;
	} while (pgd++, addr = next, addr != end);
	return 0;
}

/* Step 1: check the range */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
	    nodemask_t *nodes, unsigned long flags)
{
	int err;
	struct vm_area_struct *first, *vma, *prev;

	first = find_vma(mm, start);
	if (!first)
		return ERR_PTR(-EFAULT);
	if (first->vm_flags & VM_RESERVED)
		return ERR_PTR(-EACCES);
	prev = NULL;
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		if (!vma->vm_next && vma->vm_end < end)
			return ERR_PTR(-EFAULT);
		if (prev && prev->vm_end < vma->vm_start)
			return ERR_PTR(-EFAULT);
		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
			unsigned long endvma = vma->vm_end;
			if (endvma > end)
				endvma = end;
			if (vma->vm_start > start)
				start = vma->vm_start;
			err = check_pgd_range(vma, start, endvma, nodes);
			if (err) {
				first = ERR_PTR(err);
				break;
			}
		}
		prev = vma;
	}
	return first;
}

/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
	int err = 0;
	struct mempolicy *old = vma->vm_policy;

	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	if (vma->vm_ops && vma->vm_ops->set_policy)
		err = vma->vm_ops->set_policy(vma, new);
	if (!err) {
		mpol_get(new);
		vma->vm_policy = new;
		mpol_free(old);
	}
	return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, struct mempolicy *new)
{
	struct vm_area_struct *next;
	int err;

	err = 0;
	for (; vma && vma->vm_start < end; vma = next) {
		next = vma->vm_next;
		if (vma->vm_start < start)
			err = split_vma(vma->vm_mm, vma, start, 1);
		if (!err && vma->vm_end > end)
			err = split_vma(vma->vm_mm, vma, end, 0);
		if (!err)
			err = policy_vma(vma, new);
		if (err)
			break;
	}
	return err;
}

/* Change policy for a memory range */
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
			  unsigned long mode,
			  unsigned long __user *nmask, unsigned long maxnode,
			  unsigned flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	nodemask_t nodes;
	int err;

	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
		return -EINVAL;
	if (start & ~PAGE_MASK)
		return -EINVAL;
	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;
	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;
	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	err = get_nodes(&nodes, nmask, maxnode, mode);
	if (err)
		return err;

	new = mpol_new(mode, &nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);

	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
		 mode, nodes_addr(nodes)[0]);

	down_write(&mm->mmap_sem);
	vma = check_range(mm, start, end, &nodes, flags);
	err = PTR_ERR(vma);
	if (!IS_ERR(vma))
		err = mbind_range(vma, start, end, new);
	up_write(&mm->mmap_sem);
	mpol_free(new);
	return err;
}

/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
				  unsigned long maxnode)
{
	int err;
	struct mempolicy *new;
	nodemask_t nodes;

	if (mode < 0 || mode > MPOL_MAX)
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode, mode);
	if (err)
		return err;
	new = mpol_new(mode, &nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);
	mpol_free(current->mempolicy);
	current->mempolicy = new;
	if (new && new->policy == MPOL_INTERLEAVE)
		current->il_next = first_node(new->v.nodes);
	return 0;
}

/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
{
	int i;

	nodes_clear(*nodes);
	switch (p->policy) {
	case MPOL_BIND:
		for (i = 0; p->v.zonelist->zones[i]; i++)
			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, *nodes);
		break;
	case MPOL_DEFAULT:
		break;
	case MPOL_INTERLEAVE:
		*nodes = p->v.nodes;
		break;
	case MPOL_PREFERRED:
		/* or use current node instead of online map? */
		if (p->v.preferred_node < 0)
			*nodes = node_online_map;
		else
			node_set(p->v.preferred_node, *nodes);
		break;
	default:
		BUG();
	}
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}

/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
				  unsigned long __user *nmask,
				  unsigned long maxnode,
				  unsigned long addr, unsigned long flags)
{
	int err, pval;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
		return -EINVAL;
	if (nmask != NULL && maxnode < MAX_NUMNODES)
		return -EINVAL;
	if (flags & MPOL_F_ADDR) {
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			pval = err;
		} else if (pol == current->mempolicy &&
				pol->policy == MPOL_INTERLEAVE) {
			pval = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else
		pval = pol->policy;

	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	if (policy && put_user(pval, policy))
		return -EFAULT;

	err = 0;
	if (nmask) {
		nodemask_t nodes;
		get_zonemask(pol, &nodes);
		err = copy_nodes_to_user(nmask, maxnode, &nodes);
	}

 out:
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}

#ifdef CONFIG_COMPAT

asmlinkage long compat_sys_get_mempolicy(int __user *policy,
				     compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode,
				     compat_ulong_t addr, compat_ulong_t flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		err = copy_from_user(bm, nm, alloc_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}

asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_set_mempolicy(mode, nm, nr_bits+1);
}

asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
			     compat_ulong_t mode, compat_ulong_t __user *nmask,
			     compat_ulong_t maxnode, compat_ulong_t flags)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	nodemask_t bm;

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif

/* Return effective policy for a VMA */
struct mempolicy *
get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = task->mempolicy;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else if (vma->vm_policy &&
				vma->vm_policy->policy != MPOL_DEFAULT)
			pol = vma->vm_policy;
	}
	if (!pol)
		pol = &default_policy;
	return pol;
}

/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
{
	int nd;

	switch (policy->policy) {
	case MPOL_PREFERRED:
		nd = policy->v.preferred_node;
		if (nd < 0)
			nd = numa_node_id();
		break;
	case MPOL_BIND:
		/* Lower zones don't get a policy applied */
		/* Careful: current->mems_allowed might have moved */
		if (gfp_zone(gfp) >= policy_zone)
			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
				return policy->v.zonelist;
		/*FALL THROUGH*/
	case MPOL_INTERLEAVE: /* should not happen */
	case MPOL_DEFAULT:
		nd = numa_node_id();
		break;
	default:
		nd = 0;
		BUG();
	}
	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
}
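
/*
 * Note on the policy_zone check above (illustrative): on a highmem i386
 * box a BIND policy over nodes with highmem typically leaves policy_zone
 * at ZONE_HIGHMEM, so a GFP_KERNEL or GFP_DMA request has gfp_zone(gfp)
 * below policy_zone and falls through to the local node's default
 * zonelist, matching the "lowmem allocations don't get policied" rule in
 * the header comment.
 */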

/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
	unsigned nid, next;
	struct task_struct *me = current;

	nid = me->il_next;
	next = next_node(nid, policy->v.nodes);
	if (next >= MAX_NUMNODES)
		next = first_node(policy->v.nodes);
	me->il_next = next;
	return nid;
}

/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
		struct vm_area_struct *vma, unsigned long off)
{
	unsigned nnodes = nodes_weight(pol->v.nodes);
	unsigned target = (unsigned)off % nnodes;
	int c;
	int nid = -1;

	c = 0;
	do {
		nid = next_node(nid, pol->v.nodes);
		c++;
	} while (c <= target);
	return nid;
}
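
/*
 * Worked example (illustrative): with pol->v.nodes = {0, 2, 3} we have
 * nnodes == 3.  A page at offset off == 7 gives target == 7 % 3 == 1,
 * and the loop stops at the second set node, so the page goes to node 2.
 * The same offset always maps to the same node, which keeps VMA-based
 * interleaving deterministic across repeated faults.
 */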

/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
					  unsigned nid)
{
	struct zonelist *zl;
	struct page *page;

	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
	page = __alloc_pages(gfp, order, zl);
	if (page && page_zone(page) == zl->zones[0]) {
		zone_pcp(zl->zones[0], get_cpu())->interleave_hit++;
		put_cpu();
	}
	return page;
}

/**
 * alloc_page_vma - Allocate a page for a VMA.
 *
 * @gfp:
 *	%GFP_USER    user allocation.
 *	%GFP_KERNEL  kernel allocations,
 *	%GFP_HIGHMEM highmem/user allocations,
 *	%GFP_FS      allocation should not call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 *
 * @vma:  Pointer to VMA or NULL if not available.
 * @addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
 * When VMA is not NULL the caller must hold down_read on the mmap_sem of
 * the mm_struct of the VMA to prevent it from going away. Should be used
 * for all allocations for pages that will be mapped into user space.
 * Returns NULL when no page can be allocated.
 *
 * Should be called with the mmap_sem of the vma held.
 */
struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	cpuset_update_current_mems_allowed();

	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
		unsigned nid;
		if (vma) {
			unsigned long off;
			off = vma->vm_pgoff;
			off += (addr - vma->vm_start) >> PAGE_SHIFT;
			nid = offset_il_node(pol, vma, off);
		} else {
			/* fall back to process interleaving */
			nid = interleave_nodes(pol);
		}
		return alloc_page_interleave(gfp, 0, nid);
	}
	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}
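
/*
 * Minimal sketch of a caller, assuming the usual fault-path conventions
 * (the helper name is hypothetical and not taken from this file): the
 * anonymous-fault code holds mmap_sem for read and asks for a page that
 * will be mapped into user space, so the VMA policy decides placement.
 *
 *	static struct page *demo_anon_page(struct vm_area_struct *vma,
 *					   unsigned long addr)
 *	{
 *		struct page *page;
 *
 *		page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
 *		if (page)
 *			clear_user_highpage(page, addr);
 *		return page;
 *	}
 */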

/**
 * alloc_pages_current - Allocate pages.
 *
 * @gfp:
 *	%GFP_USER    user allocation,
 *	%GFP_KERNEL  kernel allocation,
 *	%GFP_HIGHMEM highmem allocation,
 *	%GFP_FS      don't call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 * @order: Power of two of allocation size in pages. 0 is a single page.
 *
 * Allocate a page from the kernel page pool. When not in interrupt
 * context, the current process' NUMA policy is applied.
 * Returns NULL when no page can be allocated.
 *
 * Don't call cpuset_update_current_mems_allowed() unless
 * 1) it's ok to take cpuset_sem (can WAIT), and
 * 2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol = current->mempolicy;

	if ((gfp & __GFP_WAIT) && !in_interrupt())
		cpuset_update_current_mems_allowed();
	if (!pol || in_interrupt())
		pol = &default_policy;
	if (pol->policy == MPOL_INTERLEAVE)
		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);
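
/*
 * On CONFIG_NUMA kernels the generic alloc_pages()/alloc_page() helpers
 * are expected to resolve to this function, so ordinary callers pick up
 * the process policy without doing anything special (illustrative only):
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 0); // placed per the
 *							// process policy
 *	if (page)
 *		__free_pages(page, 0);
 */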

/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);
	*new = *old;
	atomic_set(&new->refcnt, 1);
	if (new->policy == MPOL_BIND) {
		int sz = ksize(old->v.zonelist);
		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
		if (!new->v.zonelist) {
			kmem_cache_free(policy_cache, new);
			return ERR_PTR(-ENOMEM);
		}
		memcpy(new->v.zonelist, old->v.zonelist, sz);
	}
	return new;
}

/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	if (!a || !b)
		return 0;
	if (a->policy != b->policy)
		return 0;
	switch (a->policy) {
	case MPOL_DEFAULT:
		return 1;
	case MPOL_INTERLEAVE:
		return nodes_equal(a->v.nodes, b->v.nodes);
	case MPOL_PREFERRED:
		return a->v.preferred_node == b->v.preferred_node;
	case MPOL_BIND: {
		int i;
		for (i = 0; a->v.zonelist->zones[i]; i++)
			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
				return 0;
		return b->v.zonelist->zones[i] == NULL;
	}
	default:
		BUG();
		return 0;
	}
}

/* Slow path of a mpol destructor. */
void __mpol_free(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	if (p->policy == MPOL_BIND)
		kfree(p->v.zonelist);
	p->policy = MPOL_DEFAULT;
	kmem_cache_free(policy_cache, p);
}

/*
 * Hugetlb policy. Same as above, just works with node numbers instead of
 * zonelists.
 */

/* Find first node suitable for an allocation */
int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	switch (pol->policy) {
	case MPOL_DEFAULT:
		return numa_node_id();
	case MPOL_BIND:
		return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
	case MPOL_INTERLEAVE:
		return interleave_nodes(pol);
	case MPOL_PREFERRED:
		return pol->v.preferred_node >= 0 ?
				pol->v.preferred_node : numa_node_id();
	}
	BUG();
	return 0;
}

/* Find secondary valid nodes for an allocation */
int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	switch (pol->policy) {
	case MPOL_PREFERRED:
	case MPOL_DEFAULT:
	case MPOL_INTERLEAVE:
		return 1;
	case MPOL_BIND: {
		struct zone **z;
		for (z = pol->v.zonelist->zones; *z; z++)
			if ((*z)->zone_pgdat->node_id == nid)
				return 1;
		return 0;
	}
	default:
		BUG();
		return 0;
	}
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */
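
/*
 * Illustrative sketch of how a shared-memory filesystem is expected to
 * wire its vm_operations into this machinery (the demo_* names below are
 * hypothetical; tmpfs provides its own callbacks):
 *
 *	static int demo_set_policy(struct vm_area_struct *vma,
 *				   struct mempolicy *new)
 *	{
 *		struct shared_policy *sp = demo_inode_policy(vma);
 *		return mpol_set_shared_policy(sp, vma, new);
 *	}
 *
 *	static struct mempolicy *demo_get_policy(struct vm_area_struct *vma,
 *						 unsigned long addr)
 *	{
 *		struct shared_policy *sp = demo_inode_policy(vma);
 *		unsigned long idx = vma->vm_pgoff +
 *			((addr - vma->vm_start) >> PAGE_SHIFT);
 *		return mpol_shared_policy_lookup(sp, idx);
 *	}
 */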

/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
	struct rb_node *n = sp->root.rb_node;

	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);

		if (start >= p->end)
			n = n->rb_right;
		else if (end <= p->start)
			n = n->rb_left;
		else
			break;
	}
	if (!n)
		return NULL;
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}

/* Insert a new shared policy into the list. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;
	struct sp_node *nd;

	while (*p) {
		parent = *p;
		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
			p = &(*p)->rb_left;
		else if (new->end > nd->end)
			p = &(*p)->rb_right;
		else
			BUG();
	}
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
		 new->policy ? new->policy->policy : 0);
}

/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
	struct mempolicy *pol = NULL;
	struct sp_node *sn;

	if (!sp->root.rb_node)
		return NULL;
	spin_lock(&sp->lock);
	sn = sp_lookup(sp, idx, idx+1);
	if (sn) {
		mpol_get(sn->policy);
		pol = sn->policy;
	}
	spin_unlock(&sp->lock);
	return pol;
}

static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	PDprintk("deleting %lx-%lx\n", n->start, n->end);
	rb_erase(&n->nd, &sp->root);
	mpol_free(n->policy);
	kmem_cache_free(sn_cache, n);
}

struct sp_node *
sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
{
	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

	if (!n)
		return NULL;
	n->start = start;
	n->end = end;
	mpol_get(pol);
	n->policy = pol;
	return n;
}

/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				 unsigned long end, struct sp_node *new)
{
	struct sp_node *n, *new2 = NULL;

restart:
	spin_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				if (!new2) {
					spin_unlock(&sp->lock);
					new2 = sp_alloc(end, n->end, n->policy);
					if (!new2)
						return -ENOMEM;
					goto restart;
				}
				n->end = start;
				sp_insert(sp, new2);
				new2 = NULL;
				break;
			} else
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	spin_unlock(&sp->lock);
	if (new2) {
		mpol_free(new2->policy);
		kmem_cache_free(sn_cache, new2);
	}
	return 0;
}

int mpol_set_shared_policy(struct shared_policy *info,
			struct vm_area_struct *vma, struct mempolicy *npol)
{
	int err;
	struct sp_node *new = NULL;
	unsigned long sz = vma_pages(vma);

	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
		 vma->vm_pgoff,
		 sz, npol ? npol->policy : -1,
		 npol ? nodes_addr(npol->v.nodes)[0] : -1);

	if (npol) {
		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
		if (!new)
			return -ENOMEM;
	}
	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
	if (err && new)
		kmem_cache_free(sn_cache, new);
	return err;
}

/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
	struct sp_node *n;
	struct rb_node *next;

	if (!p->root.rb_node)
		return;
	spin_lock(&p->lock);
	next = rb_first(&p->root);
	while (next) {
		n = rb_entry(next, struct sp_node, nd);
		next = rb_next(&n->nd);
		rb_erase(&n->nd, &p->root);
		mpol_free(n->policy);
		kmem_cache_free(sn_cache, n);
	}
	spin_unlock(&p->lock);
}

/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL, NULL);

	/* Set interleaving policy for system init. This way not all
	   the data structures allocated at system boot end up in node zero. */

	if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
			      MAX_NUMNODES) < 0)
		printk("numa_policy_init: interleaving failed\n");
}

/* Reset policy of current process to default.
 * Assumes fs == KERNEL_DS */
void numa_default_policy(void)
{
	sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
}