/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints about which node(s) memory
 * should be allocated on.
 *
 * Supports four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave	Allocate memory interleaved over a set of nodes,
 *		with normal fallback if it fails.
 *		For VMA based allocations this interleaves based on the
 *		offset into the backing object or offset into the mapping
 *		for anonymous memory. For process policy a per-process
 *		counter is used.
 * bind		Only allocate memory on a specific set of nodes,
 *		no fallback.
 * preferred	Try a specific node first before normal fallback.
 *		As a special case node -1 here means do the allocation
 *		on the local CPU. This is normally identical to default,
 *		but useful to set in a VMA when you have a non-default
 *		process policy.
 * default	Allocate on the local node first, or when on a VMA
 *		use the process policy. This is what Linux always did
 *		in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
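
/*
 * Illustrative userspace sketch (not built here): one way the policies
 * described above can be exercised through the set_mempolicy() and mbind()
 * system calls.  The MPOL_* constants and the syscall wrappers are assumed
 * to come from libnuma's <numaif.h>; the node numbers are made up for the
 * example.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	int main(void)
 *	{
 *		unsigned long mask = (1UL << 0) | (1UL << 1);	// nodes 0 and 1
 *		size_t len = 1 << 20;
 *		void *buf;
 *
 *		// Interleave all further allocations of this process
 *		// across nodes 0 and 1.
 *		if (set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask) + 1))
 *			return 1;
 *
 *		// Bind one particular (page aligned) mapping to node 0 only.
 *		buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *		mask = 1UL << 0;
 *		if (buf == MAP_FAILED ||
 *		    mbind(buf, len, MPOL_BIND, &mask, 8 * sizeof(mask) + 1, 0))
 *			return 1;
 *		return 0;
 *	}
 */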

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
   could replace all the switch()es with a mempolicy_ops structure.
*/

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>

static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;

#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
static int policy_zone;

struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.policy = MPOL_DEFAULT,
};

/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, nodemask_t *nodes)
{
	int empty = nodes_empty(*nodes);

	switch (mode) {
	case MPOL_DEFAULT:
		if (!empty)
			return -EINVAL;
		break;
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
		/* Preferred will only use the first bit, but allow
		   more for now. */
		if (empty)
			return -EINVAL;
		break;
	}
	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
		     unsigned long maxnode, int mode)
{
	unsigned long k;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
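	/* Example: a user maxnode of 17 leaves 16 usable bits after the
	   decrement above (nodes 0-15), so nlongs == 1 and endmask keeps
	   only the low 16 bits of the last word copied in below. */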

	/* When the user specified more nodes than supported just check
	   if the non supported part is all zero. */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		if (nlongs > PAGE_SIZE/sizeof(long))
			return -EINVAL;
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			unsigned long t;
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes_addr(*nodes)[nlongs-1] &= endmask;
	/* Update current mems_allowed */
	cpuset_update_current_mems_allowed();
	/* Ignore nodes not set in current->mems_allowed */
	/* AK: shouldn't this error out instead? */
	cpuset_restrict_to_mems_allowed(nodes_addr(*nodes));
	return mpol_check_policy(mode, nodes);
}

/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
	struct zonelist *zl;
	int num, max, nd;

	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
	if (!zl)
		return NULL;
	num = 0;
	for_each_node_mask(nd, *nodes) {
		int k;
		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
			struct zone *z = &NODE_DATA(nd)->node_zones[k];
			if (!z->present_pages)
				continue;
			zl->zones[num++] = z;
			if (k > policy_zone)
				policy_zone = k;
		}
	}
	zl->zones[num] = NULL;
	return zl;
}

/* Create a new policy */
static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
{
	struct mempolicy *policy;

	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
	if (mode == MPOL_DEFAULT)
		return NULL;
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	switch (mode) {
	case MPOL_INTERLEAVE:
		policy->v.nodes = *nodes;
		break;
	case MPOL_PREFERRED:
		policy->v.preferred_node = first_node(*nodes);
		if (policy->v.preferred_node >= MAX_NUMNODES)
			policy->v.preferred_node = -1;
		break;
	case MPOL_BIND:
		policy->v.zonelist = bind_zonelist(nodes);
		if (policy->v.zonelist == NULL) {
			kmem_cache_free(policy_cache, policy);
			return ERR_PTR(-ENOMEM);
		}
		break;
	}
	policy->policy = mode;
	return policy;
}

/* Ensure all existing pages follow the policy. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
{
	pte_t *orig_pte;
	pte_t *pte;
	spinlock_t *ptl;

	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		unsigned long pfn;
		unsigned int nid;

		if (!pte_present(*pte))
			continue;
		pfn = pte_pfn(*pte);
		if (!pfn_valid(pfn)) {
			print_bad_pte(vma, *pte, addr);
			continue;
		}
		nid = pfn_to_nid(pfn);
		if (!node_isset(nid, *nodes))
			break;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(orig_pte, ptl);
	return addr != end;
}

static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		if (check_pte_range(vma, pmd, addr, next, nodes))
			return -EIO;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		if (check_pmd_range(vma, pud, addr, next, nodes))
			return -EIO;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int check_pgd_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
{
	pgd_t *pgd;
	unsigned long next;

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		if (check_pud_range(vma, pgd, addr, next, nodes))
			return -EIO;
	} while (pgd++, addr = next, addr != end);
	return 0;
}

/* Step 1: check the range */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
	    nodemask_t *nodes, unsigned long flags)
{
	int err;
	struct vm_area_struct *first, *vma, *prev;

	first = find_vma(mm, start);
	if (!first)
		return ERR_PTR(-EFAULT);
	if (first->vm_flags & VM_RESERVED)
		return ERR_PTR(-EACCES);
	prev = NULL;
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		if (!vma->vm_next && vma->vm_end < end)
			return ERR_PTR(-EFAULT);
		if (prev && prev->vm_end < vma->vm_start)
			return ERR_PTR(-EFAULT);
		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
			unsigned long endvma = vma->vm_end;
			if (endvma > end)
				endvma = end;
			if (vma->vm_start > start)
				start = vma->vm_start;
			err = check_pgd_range(vma, start, endvma, nodes);
			if (err) {
				first = ERR_PTR(err);
				break;
			}
		}
		prev = vma;
	}
	return first;
}

/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
	int err = 0;
	struct mempolicy *old = vma->vm_policy;

	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	if (vma->vm_ops && vma->vm_ops->set_policy)
		err = vma->vm_ops->set_policy(vma, new);
	if (!err) {
		mpol_get(new);
		vma->vm_policy = new;
		mpol_free(old);
	}
	return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, struct mempolicy *new)
{
	struct vm_area_struct *next;
	int err;

	err = 0;
	for (; vma && vma->vm_start < end; vma = next) {
		next = vma->vm_next;
		if (vma->vm_start < start)
			err = split_vma(vma->vm_mm, vma, start, 1);
		if (!err && vma->vm_end > end)
			err = split_vma(vma->vm_mm, vma, end, 0);
		if (!err)
			err = policy_vma(vma, new);
		if (err)
			break;
	}
	return err;
}

/* Change policy for a memory range */
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
			  unsigned long mode,
			  unsigned long __user *nmask, unsigned long maxnode,
			  unsigned flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	nodemask_t nodes;
	int err;

	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
		return -EINVAL;
	if (start & ~PAGE_MASK)
		return -EINVAL;
	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;
	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;
	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	err = get_nodes(&nodes, nmask, maxnode, mode);
	if (err)
		return err;

	new = mpol_new(mode, &nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);

	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
		 mode, nodes_addr(nodes)[0]);

	down_write(&mm->mmap_sem);
	vma = check_range(mm, start, end, &nodes, flags);
	err = PTR_ERR(vma);
	if (!IS_ERR(vma))
		err = mbind_range(vma, start, end, new);
	up_write(&mm->mmap_sem);
	mpol_free(new);
	return err;
}

/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
				  unsigned long maxnode)
{
	int err;
	struct mempolicy *new;
	nodemask_t nodes;

	if (mode < 0 || mode > MPOL_MAX)
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode, mode);
	if (err)
		return err;
	new = mpol_new(mode, &nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);
	mpol_free(current->mempolicy);
	current->mempolicy = new;
	if (new && new->policy == MPOL_INTERLEAVE)
		current->il_next = first_node(new->v.nodes);
	return 0;
}

/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
{
	int i;

	nodes_clear(*nodes);
	switch (p->policy) {
	case MPOL_BIND:
		for (i = 0; p->v.zonelist->zones[i]; i++)
			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, *nodes);
		break;
	case MPOL_DEFAULT:
		break;
	case MPOL_INTERLEAVE:
		*nodes = p->v.nodes;
		break;
	case MPOL_PREFERRED:
		/* or use current node instead of online map? */
		if (p->v.preferred_node < 0)
			*nodes = node_online_map;
		else
			node_set(p->v.preferred_node, *nodes);
		break;
	default:
		BUG();
	}
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}

/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
				  unsigned long __user *nmask,
				  unsigned long maxnode,
				  unsigned long addr, unsigned long flags)
{
	int err, pval;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
		return -EINVAL;
	if (nmask != NULL && maxnode < MAX_NUMNODES)
		return -EINVAL;
	if (flags & MPOL_F_ADDR) {
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			pval = err;
		} else if (pol == current->mempolicy &&
				pol->policy == MPOL_INTERLEAVE) {
			pval = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else
		pval = pol->policy;

	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	if (policy && put_user(pval, policy))
		return -EFAULT;

	err = 0;
	if (nmask) {
		nodemask_t nodes;
		get_zonemask(pol, &nodes);
		err = copy_nodes_to_user(nmask, maxnode, &nodes);
	}

 out:
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}

#ifdef CONFIG_COMPAT

asmlinkage long compat_sys_get_mempolicy(int __user *policy,
				     compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode,
				     compat_ulong_t addr, compat_ulong_t flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		err = copy_from_user(bm, nm, alloc_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}

asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_set_mempolicy(mode, nm, nr_bits+1);
}

asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
			     compat_ulong_t mode, compat_ulong_t __user *nmask,
			     compat_ulong_t maxnode, compat_ulong_t flags)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	nodemask_t bm;

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif

/* Return effective policy for a VMA */
struct mempolicy *
get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = task->mempolicy;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else if (vma->vm_policy &&
				vma->vm_policy->policy != MPOL_DEFAULT)
			pol = vma->vm_policy;
	}
	if (!pol)
		pol = &default_policy;
	return pol;
}

/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
{
	int nd;

	switch (policy->policy) {
	case MPOL_PREFERRED:
		nd = policy->v.preferred_node;
		if (nd < 0)
			nd = numa_node_id();
		break;
	case MPOL_BIND:
		/* Lower zones don't get a policy applied */
		/* Careful: current->mems_allowed might have moved */
		if (gfp_zone(gfp) >= policy_zone)
			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
				return policy->v.zonelist;
		/*FALL THROUGH*/
	case MPOL_INTERLEAVE: /* should not happen */
	case MPOL_DEFAULT:
		nd = numa_node_id();
		break;
	default:
		nd = 0;
		BUG();
	}
	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
}

/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
	unsigned nid, next;
	struct task_struct *me = current;

	nid = me->il_next;
	next = next_node(nid, policy->v.nodes);
	if (next >= MAX_NUMNODES)
		next = first_node(policy->v.nodes);
	me->il_next = next;
	return nid;
}

/* Do static interleaving for a VMA with known offset. */
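/* Example: with an interleave mask of nodes {0,2,3} and offset 7,
   nnodes = 3 and target = 7 % 3 = 1, so the loop below stops at the
   second set node and returns node 2. */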
static unsigned offset_il_node(struct mempolicy *pol,
		struct vm_area_struct *vma, unsigned long off)
{
	unsigned nnodes = nodes_weight(pol->v.nodes);
	unsigned target = (unsigned)off % nnodes;
	int c;
	int nid = -1;

	c = 0;
	do {
		nid = next_node(nid, pol->v.nodes);
		c++;
	} while (c <= target);
	return nid;
}

/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
					  unsigned nid)
{
	struct zonelist *zl;
	struct page *page;

	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
	page = __alloc_pages(gfp, order, zl);
	if (page && page_zone(page) == zl->zones[0]) {
		zone_pcp(zl->zones[0], get_cpu())->interleave_hit++;
		put_cpu();
	}
	return page;
}

/**
 * 	alloc_page_vma	- Allocate a page for a VMA.
 *
 * 	@gfp:
 *      %GFP_USER    user allocation.
 *      %GFP_KERNEL  kernel allocations,
 *      %GFP_HIGHMEM highmem/user allocations,
 *      %GFP_FS      allocation should not call back into a file system.
 *      %GFP_ATOMIC  don't sleep.
 *
 * 	@vma:  Pointer to VMA or NULL if not available.
 *	@addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 * 	This function allocates a page from the kernel page pool and applies
 *	a NUMA policy associated with the VMA or the current process.
 *	When VMA is not NULL, the caller must hold down_read on the mmap_sem
 *	of the mm_struct of the VMA to prevent it from going away. Should be
 *	used for all allocations of pages that will be mapped into user space.
 *	Returns NULL when no page can be allocated.
 *
 *	Should be called with the mmap_sem of the VMA held.
 */
struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	cpuset_update_current_mems_allowed();

	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
		unsigned nid;
		if (vma) {
			unsigned long off;
			off = vma->vm_pgoff;
			off += (addr - vma->vm_start) >> PAGE_SHIFT;
			nid = offset_il_node(pol, vma, off);
		} else {
			/* fall back to process interleaving */
			nid = interleave_nodes(pol);
		}
		return alloc_page_interleave(gfp, 0, nid);
	}
	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}

/**
 * 	alloc_pages_current - Allocate pages.
 *
 *	@gfp:
 *		%GFP_USER    user allocation,
 *		%GFP_KERNEL  kernel allocation,
 *		%GFP_HIGHMEM highmem allocation,
 *		%GFP_FS      don't call back into a file system.
 *		%GFP_ATOMIC  don't sleep.
 *	@order: Power of two of allocation size in pages. 0 is a single page.
 *
 *	Allocate a page from the kernel page pool.  When not in
 *	interrupt context, apply the current process' NUMA policy.
 *	Returns NULL when no page can be allocated.
 *
 *	Don't call cpuset_update_current_mems_allowed() unless
 *	1) it's ok to take cpuset_sem (can WAIT), and
 *	2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol = current->mempolicy;

	if ((gfp & __GFP_WAIT) && !in_interrupt())
		cpuset_update_current_mems_allowed();
	if (!pol || in_interrupt())
		pol = &default_policy;
	if (pol->policy == MPOL_INTERLEAVE)
		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);

/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);
	*new = *old;
	atomic_set(&new->refcnt, 1);
	if (new->policy == MPOL_BIND) {
		int sz = ksize(old->v.zonelist);
		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
		if (!new->v.zonelist) {
			kmem_cache_free(policy_cache, new);
			return ERR_PTR(-ENOMEM);
		}
		memcpy(new->v.zonelist, old->v.zonelist, sz);
	}
	return new;
}

/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	if (!a || !b)
		return 0;
	if (a->policy != b->policy)
		return 0;
	switch (a->policy) {
	case MPOL_DEFAULT:
		return 1;
	case MPOL_INTERLEAVE:
		return nodes_equal(a->v.nodes, b->v.nodes);
	case MPOL_PREFERRED:
		return a->v.preferred_node == b->v.preferred_node;
	case MPOL_BIND: {
		int i;
		for (i = 0; a->v.zonelist->zones[i]; i++)
			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
				return 0;
		return b->v.zonelist->zones[i] == NULL;
	}
	default:
		BUG();
		return 0;
	}
}

/* Slow path of a mpol destructor. */
void __mpol_free(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	if (p->policy == MPOL_BIND)
		kfree(p->v.zonelist);
	p->policy = MPOL_DEFAULT;
	kmem_cache_free(policy_cache, p);
}

/*
 * Hugetlb policy. Same as above, just works with node numbers instead of
 * zonelists.
 */

/* Find first node suitable for an allocation */
int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	switch (pol->policy) {
	case MPOL_DEFAULT:
		return numa_node_id();
	case MPOL_BIND:
		return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
	case MPOL_INTERLEAVE:
		return interleave_nodes(pol);
	case MPOL_PREFERRED:
		return pol->v.preferred_node >= 0 ?
				pol->v.preferred_node : numa_node_id();
	}
	BUG();
	return 0;
}

/* Find secondary valid nodes for an allocation */
int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	switch (pol->policy) {
	case MPOL_PREFERRED:
	case MPOL_DEFAULT:
	case MPOL_INTERLEAVE:
		return 1;
	case MPOL_BIND: {
		struct zone **z;
		for (z = pol->v.zonelist->zones; *z; z++)
			if ((*z)->zone_pgdat->node_id == nid)
				return 1;
		return 0;
	}
	default:
		BUG();
		return 0;
	}
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */

/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
	struct rb_node *n = sp->root.rb_node;

	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);

		if (start >= p->end)
			n = n->rb_right;
		else if (end <= p->start)
			n = n->rb_left;
		else
			break;
	}
	if (!n)
		return NULL;
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}

/* Insert a new shared policy into the list. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;
	struct sp_node *nd;

	while (*p) {
		parent = *p;
		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
			p = &(*p)->rb_left;
		else if (new->end > nd->end)
			p = &(*p)->rb_right;
		else
			BUG();
	}
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
		 new->policy ? new->policy->policy : 0);
}

/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
	struct mempolicy *pol = NULL;
	struct sp_node *sn;

	if (!sp->root.rb_node)
		return NULL;
	spin_lock(&sp->lock);
	sn = sp_lookup(sp, idx, idx+1);
	if (sn) {
		mpol_get(sn->policy);
		pol = sn->policy;
	}
	spin_unlock(&sp->lock);
	return pol;
}

static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	PDprintk("deleting %lx-%lx\n", n->start, n->end);
	rb_erase(&n->nd, &sp->root);
	mpol_free(n->policy);
	kmem_cache_free(sn_cache, n);
}

struct sp_node *
sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
{
	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

	if (!n)
		return NULL;
	n->start = start;
	n->end = end;
	mpol_get(pol);
	n->policy = pol;
	return n;
}

/* Replace a policy range. */
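/* For example, installing a new policy for offsets [2,5) over an existing
   node covering [0,8) leaves three nodes behind: the old policy on [0,2)
   and [5,8), and the new policy on [2,5). */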
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				 unsigned long end, struct sp_node *new)
{
	struct sp_node *n, *new2 = NULL;

restart:
	spin_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				if (!new2) {
					spin_unlock(&sp->lock);
					new2 = sp_alloc(end, n->end, n->policy);
					if (!new2)
						return -ENOMEM;
					goto restart;
				}
				n->end = start;
				sp_insert(sp, new2);
				new2 = NULL;
				break;
			} else
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	spin_unlock(&sp->lock);
	if (new2) {
		mpol_free(new2->policy);
		kmem_cache_free(sn_cache, new2);
	}
	return 0;
}

int mpol_set_shared_policy(struct shared_policy *info,
			struct vm_area_struct *vma, struct mempolicy *npol)
{
	int err;
	struct sp_node *new = NULL;
	unsigned long sz = vma_pages(vma);

	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
		 vma->vm_pgoff,
		 sz, npol ? npol->policy : -1,
		 npol ? nodes_addr(npol->v.nodes)[0] : -1);

	if (npol) {
		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
		if (!new)
			return -ENOMEM;
	}
	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
	if (err && new)
		kmem_cache_free(sn_cache, new);
	return err;
}

/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
	struct sp_node *n;
	struct rb_node *next;

	if (!p->root.rb_node)
		return;
	spin_lock(&p->lock);
	next = rb_first(&p->root);
	while (next) {
		n = rb_entry(next, struct sp_node, nd);
		next = rb_next(&n->nd);
		rb_erase(&n->nd, &p->root);
		mpol_free(n->policy);
		kmem_cache_free(sn_cache, n);
	}
	spin_unlock(&p->lock);
}

/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL, NULL);

	/* Set interleaving policy for system init. This way not all
	   the data structures allocated at system boot end up in node zero. */

	if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
			      MAX_NUMNODES) < 0)
		printk("numa_policy_init: interleaving failed\n");
}

/* Reset policy of current process to default.
 * Assumes fs == KERNEL_DS */
void numa_default_policy(void)
{
	sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
}