/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Four policies are supported per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave	Allocate memory interleaved over a set of nodes,
 *		with normal fallback if it fails.
 *		For VMA based allocations this interleaves based on the
 *		offset into the backing object or offset into the mapping
 *		for anonymous memory. For process policy a process counter
 *		is used.
 * bind		Only allocate memory on a specific set of nodes,
 *		no fallback.
 * preferred	Try a specific node first before normal fallback.
 *		As a special case node -1 here means do the allocation
 *		on the local CPU. This is normally identical to default,
 *		but useful to set in a VMA when you have a non default
 *		process policy.
 * default	Allocate on the local node first, or when on a VMA
 *		use the process policy. This is what Linux always did
 *		in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
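
/*
 * Illustrative user-space sketch (not kernel code, not compiled here) of how
 * these policies are typically requested through the syscalls implemented
 * below. The MPOL_* constants come from <linux/mempolicy.h>; note that only
 * maxnode-1 bits of the mask are used (see get_nodes()):
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *
 *	interleave all future allocations of this process over nodes 0 and 1:
 *	syscall(__NR_set_mempolicy, MPOL_INTERLEAVE, &mask, sizeof(mask) * 8);
 *
 *	restrict an existing mapping at addr/len to node 0 only:
 *	unsigned long node0 = 1UL << 0;
 *	syscall(__NR_mbind, addr, len, MPOL_BIND, &node0, sizeof(node0) * 8, 0);
 */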

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
   could replace all the switch()es with a mempolicy_ops structure.
*/

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>

static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;

#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
static int policy_zone;

struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.policy = MPOL_DEFAULT,
};

/* Check if all specified nodes are online */
static int nodes_online(unsigned long *nodes)
{
	DECLARE_BITMAP(online2, MAX_NUMNODES);

	bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES);
	if (bitmap_empty(online2, MAX_NUMNODES))
		set_bit(0, online2);
	if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
		return -EINVAL;
	return 0;
}

/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, unsigned long *nodes)
{
	int empty = bitmap_empty(nodes, MAX_NUMNODES);

	switch (mode) {
	case MPOL_DEFAULT:
		if (!empty)
			return -EINVAL;
		break;
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
		/* Preferred will only use the first bit, but allow
		   more for now. */
		if (empty)
			return -EINVAL;
		break;
	}
	return nodes_online(nodes);
}

/* Copy a node mask from user space. */
static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
		     unsigned long maxnode, int mode)
{
	unsigned long k;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	bitmap_zero(nodes, MAX_NUMNODES);
	if (maxnode == 0 || !nmask)
		return 0;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

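	/*
	 * Example of the arithmetic above: with BITS_PER_LONG == 64 a user
	 * maxnode of 71 has been decremented to 70, so nlongs == 2 and
	 * endmask == (1UL << 6) - 1, i.e. only bits 0-5 of the last long
	 * are significant.
	 */
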
	/* When the user specified more nodes than supported just check
	   if the non supported part is all zero. */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		if (nlongs > PAGE_SIZE/sizeof(long))
			return -EINVAL;
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			unsigned long t;
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes[nlongs-1] &= endmask;
	/* Update current mems_allowed */
	cpuset_update_current_mems_allowed();
	/* Ignore nodes not set in current->mems_allowed */
	cpuset_restrict_to_mems_allowed(nodes);
	return mpol_check_policy(mode, nodes);
}

/* Generate a custom zonelist for the BIND policy. */
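/*
 * The resulting list is ordered node by node and, within each node, from the
 * highest present zone downwards. For example, for nodes {0,2} on a machine
 * without highmem where both nodes have ZONE_NORMAL and ZONE_DMA populated,
 * the list would be:
 *
 *	node0/NORMAL, node0/DMA, node2/NORMAL, node2/DMA, NULL
 *
 * Zones without present pages are skipped.
 */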
static struct zonelist *bind_zonelist(unsigned long *nodes)
{
	struct zonelist *zl;
	int num, max, nd;

	max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
	if (!zl)
		return NULL;
	num = 0;
	for (nd = find_first_bit(nodes, MAX_NUMNODES);
	     nd < MAX_NUMNODES;
	     nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
		int k;
		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
			struct zone *z = &NODE_DATA(nd)->node_zones[k];
			if (!z->present_pages)
				continue;
			zl->zones[num++] = z;
			if (k > policy_zone)
				policy_zone = k;
		}
	}
	BUG_ON(num >= max);
	zl->zones[num] = NULL;
	return zl;
}

/* Create a new policy */
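/*
 * Only the union member matching the mode is used: v.nodes for
 * MPOL_INTERLEAVE, v.preferred_node for MPOL_PREFERRED (-1 meaning local
 * allocation) and v.zonelist for MPOL_BIND. MPOL_DEFAULT is represented by
 * a NULL mempolicy pointer rather than by an allocated object.
 */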
static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
{
	struct mempolicy *policy;

	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
	if (mode == MPOL_DEFAULT)
		return NULL;
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	switch (mode) {
	case MPOL_INTERLEAVE:
		bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
		break;
	case MPOL_PREFERRED:
		policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
		if (policy->v.preferred_node >= MAX_NUMNODES)
			policy->v.preferred_node = -1;
		break;
	case MPOL_BIND:
		policy->v.zonelist = bind_zonelist(nodes);
		if (policy->v.zonelist == NULL) {
			kmem_cache_free(policy_cache, policy);
			return ERR_PTR(-ENOMEM);
		}
		break;
	}
	policy->policy = mode;
	return policy;
}

/* Ensure all existing pages follow the policy. */
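/*
 * The check_*_range() helpers below walk the page tables in the usual
 * pgd -> pud -> pmd -> pte order. check_pte_range() stops at the first
 * present page whose node is not in the requested node mask; the callers
 * turn that into -EIO, which check_range() reports for MPOL_MF_STRICT.
 */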
static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
		unsigned long addr, unsigned long end, unsigned long *nodes)
{
	pte_t *orig_pte;
	pte_t *pte;

	spin_lock(&mm->page_table_lock);
	orig_pte = pte = pte_offset_map(pmd, addr);
	do {
		unsigned long pfn;
		unsigned int nid;

		if (!pte_present(*pte))
			continue;
		pfn = pte_pfn(*pte);
		if (!pfn_valid(pfn))
			continue;
		nid = pfn_to_nid(pfn);
		if (!test_bit(nid, nodes))
			break;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap(orig_pte);
	spin_unlock(&mm->page_table_lock);
	return addr != end;
}

static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
		unsigned long addr, unsigned long end, unsigned long *nodes)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		if (check_pte_range(mm, pmd, addr, next, nodes))
			return -EIO;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
		unsigned long addr, unsigned long end, unsigned long *nodes)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		if (check_pmd_range(mm, pud, addr, next, nodes))
			return -EIO;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int check_pgd_range(struct mm_struct *mm,
		unsigned long addr, unsigned long end, unsigned long *nodes)
{
	pgd_t *pgd;
	unsigned long next;

	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		if (check_pud_range(mm, pgd, addr, next, nodes))
			return -EIO;
	} while (pgd++, addr = next, addr != end);
	return 0;
}

/* Step 1: check the range */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
	    unsigned long *nodes, unsigned long flags)
{
	int err;
	struct vm_area_struct *first, *vma, *prev;

	first = find_vma(mm, start);
	if (!first)
		return ERR_PTR(-EFAULT);
	prev = NULL;
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		if (!vma->vm_next && vma->vm_end < end)
			return ERR_PTR(-EFAULT);
		if (prev && prev->vm_end < vma->vm_start)
			return ERR_PTR(-EFAULT);
		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
			unsigned long endvma = vma->vm_end;
			if (endvma > end)
				endvma = end;
			if (vma->vm_start > start)
				start = vma->vm_start;
			err = check_pgd_range(vma->vm_mm,
					      start, endvma, nodes);
			if (err) {
				first = ERR_PTR(err);
				break;
			}
		}
		prev = vma;
	}
	return first;
}

/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
	int err = 0;
	struct mempolicy *old = vma->vm_policy;

	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	if (vma->vm_ops && vma->vm_ops->set_policy)
		err = vma->vm_ops->set_policy(vma, new);
	if (!err) {
		mpol_get(new);
		vma->vm_policy = new;
		mpol_free(old);
	}
	return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, struct mempolicy *new)
{
	struct vm_area_struct *next;
	int err;

	err = 0;
	for (; vma && vma->vm_start < end; vma = next) {
		next = vma->vm_next;
		if (vma->vm_start < start)
			err = split_vma(vma->vm_mm, vma, start, 1);
		if (!err && vma->vm_end > end)
			err = split_vma(vma->vm_mm, vma, end, 0);
		if (!err)
			err = policy_vma(vma, new);
		if (err)
			break;
	}
	return err;
}

/* Change policy for a memory range */
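/*
 * User-space sketch (illustrative only; addr/len refer to an existing
 * anonymous mapping created with mmap()):
 *
 *	unsigned long node1 = 1UL << 1;
 *	keep every page of the mapping on node 1 and fail with EIO if a
 *	page that is already present lives on some other node:
 *	syscall(__NR_mbind, addr, len, MPOL_BIND, &node1, sizeof(node1) * 8,
 *		MPOL_MF_STRICT);
 */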
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
			  unsigned long mode,
			  unsigned long __user *nmask, unsigned long maxnode,
			  unsigned flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	DECLARE_BITMAP(nodes, MAX_NUMNODES);
	int err;

	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
		return -EINVAL;
	if (start & ~PAGE_MASK)
		return -EINVAL;
	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;
	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;
	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	err = get_nodes(nodes, nmask, maxnode, mode);
	if (err)
		return err;

	new = mpol_new(mode, nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);

	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
		 mode, nodes[0]);

	down_write(&mm->mmap_sem);
	vma = check_range(mm, start, end, nodes, flags);
	err = PTR_ERR(vma);
	if (!IS_ERR(vma))
		err = mbind_range(vma, start, end, new);
	up_write(&mm->mmap_sem);
	mpol_free(new);
	return err;
}

/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
				  unsigned long maxnode)
{
	int err;
	struct mempolicy *new;
	DECLARE_BITMAP(nodes, MAX_NUMNODES);

	if (mode < 0 || mode > MPOL_MAX)
		return -EINVAL;
	err = get_nodes(nodes, nmask, maxnode, mode);
	if (err)
		return err;
	new = mpol_new(mode, nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);
	mpol_free(current->mempolicy);
	current->mempolicy = new;
	if (new && new->policy == MPOL_INTERLEAVE)
		current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
	return 0;
}

/* Fill a node mask for a policy */
static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
{
	int i;

	bitmap_zero(nodes, MAX_NUMNODES);
	switch (p->policy) {
	case MPOL_BIND:
		for (i = 0; p->v.zonelist->zones[i]; i++)
			__set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
		break;
	case MPOL_DEFAULT:
		break;
	case MPOL_INTERLEAVE:
		bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
		break;
	case MPOL_PREFERRED:
		/* or use current node instead of online map? */
		if (p->v.preferred_node < 0)
			bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES);
		else
			__set_bit(p->v.preferred_node, nodes);
		break;
	default:
		BUG();
	}
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}

/* Copy a kernel node mask to user space */
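/*
 * The copied length is rounded up to a multiple of 64 bits of the user's
 * mask: e.g. maxnode == 65 copies 8 bytes while maxnode == 66 copies 16
 * bytes. Whatever the kernel mask does not cover is cleared in user space
 * first.
 */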
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      void *nodes, unsigned nbytes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
}

/* Retrieve NUMA policy */
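/*
 * For example, from user space (sketch only), to ask which node currently
 * backs the page at addr:
 *
 *	int node;
 *	syscall(__NR_get_mempolicy, &node, NULL, 0, addr,
 *		MPOL_F_NODE | MPOL_F_ADDR);
 *
 * With flags == 0 (and addr == 0) the same call reports the calling
 * process' policy mode instead.
 */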
asmlinkage long sys_get_mempolicy(int __user *policy,
				  unsigned long __user *nmask,
				  unsigned long maxnode,
				  unsigned long addr, unsigned long flags)
{
	int err, pval;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
		return -EINVAL;
	if (nmask != NULL && maxnode < MAX_NUMNODES)
		return -EINVAL;
	if (flags & MPOL_F_ADDR) {
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			pval = err;
		} else if (pol == current->mempolicy &&
				pol->policy == MPOL_INTERLEAVE) {
			pval = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else
		pval = pol->policy;

	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	if (policy && put_user(pval, policy))
		return -EFAULT;

	err = 0;
	if (nmask) {
		DECLARE_BITMAP(nodes, MAX_NUMNODES);
		get_zonemask(pol, nodes);
		err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
	}

 out:
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}

#ifdef CONFIG_COMPAT

asmlinkage long compat_sys_get_mempolicy(int __user *policy,
				  compat_ulong_t __user *nmask,
				  compat_ulong_t maxnode,
				  compat_ulong_t addr, compat_ulong_t flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		err = copy_from_user(bm, nm, alloc_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}

asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
				  compat_ulong_t maxnode)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_set_mempolicy(mode, nm, nr_bits+1);
}

asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
			  compat_ulong_t mode, compat_ulong_t __user *nmask,
			  compat_ulong_t maxnode, compat_ulong_t flags)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif

/* Return effective policy for a VMA */
struct mempolicy *
get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = task->mempolicy;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else if (vma->vm_policy &&
				vma->vm_policy->policy != MPOL_DEFAULT)
			pol = vma->vm_policy;
	}
	if (!pol)
		pol = &default_policy;
	return pol;
}

/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
{
	int nd;

	switch (policy->policy) {
	case MPOL_PREFERRED:
		nd = policy->v.preferred_node;
		if (nd < 0)
			nd = numa_node_id();
		break;
	case MPOL_BIND:
		/* Lower zones don't get a policy applied */
		/* Careful: current->mems_allowed might have moved */
		if ((gfp & GFP_ZONEMASK) >= policy_zone)
			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
				return policy->v.zonelist;
		/*FALL THROUGH*/
	case MPOL_INTERLEAVE: /* should not happen */
	case MPOL_DEFAULT:
		nd = numa_node_id();
		break;
	default:
		nd = 0;
		BUG();
	}
	return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK);
}

/* Do dynamic interleaving for a process */
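/*
 * e.g. with v.nodes = {0,2,3} and il_next == 2 this returns 2 and advances
 * il_next to 3; the next call returns 3 and wraps il_next back to 0.
 */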
static unsigned interleave_nodes(struct mempolicy *policy)
{
	unsigned nid, next;
	struct task_struct *me = current;

	nid = me->il_next;
	BUG_ON(nid >= MAX_NUMNODES);
	next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
	if (next >= MAX_NUMNODES)
		next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
	me->il_next = next;
	return nid;
}

/* Do static interleaving for a VMA with known offset. */
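/*
 * e.g. with pol->v.nodes = {0,2,3} (nnodes == 3) an offset of 7 gives
 * target == 1 and therefore the second set bit, node 2. Equal offsets
 * always map to the same node, independent of il_next.
 */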
static unsigned offset_il_node(struct mempolicy *pol,
		struct vm_area_struct *vma, unsigned long off)
{
	unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
	unsigned target = (unsigned)off % nnodes;
	int c;
	int nid = -1;

	c = 0;
	do {
		nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
		c++;
	} while (c <= target);
	BUG_ON(nid >= MAX_NUMNODES);
	BUG_ON(!test_bit(nid, pol->v.nodes));
	return nid;
}

/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, unsigned nid)
{
	struct zonelist *zl;
	struct page *page;

	BUG_ON(!node_online(nid));
	zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
	page = __alloc_pages(gfp, order, zl);
	if (page && page_zone(page) == zl->zones[0]) {
		zone_pcp(zl->zones[0], get_cpu())->interleave_hit++;
		put_cpu();
	}
	return page;
}

/**
 * alloc_page_vma - Allocate a page for a VMA.
 *
 * @gfp:
 *	%GFP_USER    user allocation.
 *	%GFP_KERNEL  kernel allocations,
 *	%GFP_HIGHMEM highmem/user allocations,
 *	%GFP_FS      allocation should not call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 *
 * @vma:  Pointer to VMA or NULL if not available.
 * @addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
 * mm_struct of the VMA to prevent it from going away. Should be used for
 * all allocations for pages that will be mapped into
 * user space. Returns NULL when no page can be allocated.
 *
 * Should be called with the mmap_sem of the vma held.
 */
struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	cpuset_update_current_mems_allowed();

	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
		unsigned nid;
		if (vma) {
			unsigned long off;
			BUG_ON(addr >= vma->vm_end);
			BUG_ON(addr < vma->vm_start);
			off = vma->vm_pgoff;
			off += (addr - vma->vm_start) >> PAGE_SHIFT;
			nid = offset_il_node(pol, vma, off);
		} else {
			/* fall back to process interleaving */
			nid = interleave_nodes(pol);
		}
		return alloc_page_interleave(gfp, 0, nid);
	}
	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}

/**
 * alloc_pages_current - Allocate pages.
 *
 * @gfp:
 *	%GFP_USER    user allocation,
 *	%GFP_KERNEL  kernel allocation,
 *	%GFP_HIGHMEM highmem allocation,
 *	%GFP_FS      don't call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 * @order: Power of two of allocation size in pages. 0 is a single page.
 *
 * Allocate a page from the kernel page pool. When not in
 * interrupt context, apply the current process' NUMA policy.
 * Returns NULL when no page can be allocated.
 *
 * Don't call cpuset_update_current_mems_allowed() unless
 * 1) it's ok to take cpuset_sem (can WAIT), and
 * 2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol = current->mempolicy;

	if ((gfp & __GFP_WAIT) && !in_interrupt())
		cpuset_update_current_mems_allowed();
	if (!pol || in_interrupt())
		pol = &default_policy;
	if (pol->policy == MPOL_INTERLEAVE)
		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);

/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);
	*new = *old;
	atomic_set(&new->refcnt, 1);
	if (new->policy == MPOL_BIND) {
		int sz = ksize(old->v.zonelist);
		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
		if (!new->v.zonelist) {
			kmem_cache_free(policy_cache, new);
			return ERR_PTR(-ENOMEM);
		}
		memcpy(new->v.zonelist, old->v.zonelist, sz);
	}
	return new;
}

/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	if (!a || !b)
		return 0;
	if (a->policy != b->policy)
		return 0;
	switch (a->policy) {
	case MPOL_DEFAULT:
		return 1;
	case MPOL_INTERLEAVE:
		return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
	case MPOL_PREFERRED:
		return a->v.preferred_node == b->v.preferred_node;
	case MPOL_BIND: {
		int i;
		for (i = 0; a->v.zonelist->zones[i]; i++)
			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
				return 0;
		return b->v.zonelist->zones[i] == NULL;
	}
	default:
		BUG();
		return 0;
	}
}

/* Slow path of a mpol destructor. */
void __mpol_free(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	if (p->policy == MPOL_BIND)
		kfree(p->v.zonelist);
	p->policy = MPOL_DEFAULT;
	kmem_cache_free(policy_cache, p);
}

/*
 * Hugetlb policy. Same as above, just works with node numbers instead of
 * zonelists.
 */

/* Find first node suitable for an allocation */
int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	switch (pol->policy) {
	case MPOL_DEFAULT:
		return numa_node_id();
	case MPOL_BIND:
		return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
	case MPOL_INTERLEAVE:
		return interleave_nodes(pol);
	case MPOL_PREFERRED:
		return pol->v.preferred_node >= 0 ?
				pol->v.preferred_node : numa_node_id();
	}
	BUG();
	return 0;
}

/* Find secondary valid nodes for an allocation */
int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	switch (pol->policy) {
	case MPOL_PREFERRED:
	case MPOL_DEFAULT:
	case MPOL_INTERLEAVE:
		return 1;
	case MPOL_BIND: {
		struct zone **z;
		for (z = pol->v.zonelist->zones; *z; z++)
			if ((*z)->zone_pgdat->node_id == nid)
				return 1;
		return 0;
	}
	default:
		BUG();
		return 0;
	}
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */

/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
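/*
 * The descent stops at any node overlapping [start,end); the rb_prev()
 * loop then backs up to the leftmost node that still overlaps, so callers
 * can iterate forward over the whole range with rb_next().
 */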
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
	struct rb_node *n = sp->root.rb_node;

	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);

		if (start >= p->end)
			n = n->rb_right;
		else if (end <= p->start)
			n = n->rb_left;
		else
			break;
	}
	if (!n)
		return NULL;
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}

/* Insert a new shared policy into the list. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;
	struct sp_node *nd;

	while (*p) {
		parent = *p;
		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
			p = &(*p)->rb_left;
		else if (new->end > nd->end)
			p = &(*p)->rb_right;
		else
			BUG();
	}
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
		 new->policy ? new->policy->policy : 0);
}

/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
	struct mempolicy *pol = NULL;
	struct sp_node *sn;

	if (!sp->root.rb_node)
		return NULL;
	spin_lock(&sp->lock);
	sn = sp_lookup(sp, idx, idx+1);
	if (sn) {
		mpol_get(sn->policy);
		pol = sn->policy;
	}
	spin_unlock(&sp->lock);
	return pol;
}

static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	PDprintk("deleting %lx-%lx\n", n->start, n->end);
	rb_erase(&n->nd, &sp->root);
	mpol_free(n->policy);
	kmem_cache_free(sn_cache, n);
}

struct sp_node *
sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
{
	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

	if (!n)
		return NULL;
	n->start = start;
	n->end = end;
	mpol_get(pol);
	n->policy = pol;
	return n;
}

/* Replace a policy range. */
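/*
 * e.g. installing a new policy for [3,6) on top of an existing node covering
 * [0,10) splits the old node: [0,3) keeps the old policy, a freshly
 * allocated node carries it for [6,10) and the new node covers [3,6).
 * That allocation happens with sp->lock dropped, hence the restart label.
 */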
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				 unsigned long end, struct sp_node *new)
{
	struct sp_node *n, *new2 = NULL;

restart:
	spin_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				if (!new2) {
					spin_unlock(&sp->lock);
					new2 = sp_alloc(end, n->end, n->policy);
					if (!new2)
						return -ENOMEM;
					goto restart;
				}
				n->end = start;
				sp_insert(sp, new2);
				new2 = NULL;
				break;
			} else
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	spin_unlock(&sp->lock);
	if (new2) {
		mpol_free(new2->policy);
		kmem_cache_free(sn_cache, new2);
	}
	return 0;
}

int mpol_set_shared_policy(struct shared_policy *info,
			   struct vm_area_struct *vma, struct mempolicy *npol)
{
	int err;
	struct sp_node *new = NULL;
	unsigned long sz = vma_pages(vma);

	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
		 vma->vm_pgoff,
		 sz, npol ? npol->policy : -1,
		 npol ? npol->v.nodes[0] : -1);

	if (npol) {
		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
		if (!new)
			return -ENOMEM;
	}
	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
	if (err && new)
		kmem_cache_free(sn_cache, new);
	return err;
}

/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
	struct sp_node *n;
	struct rb_node *next;

	if (!p->root.rb_node)
		return;
	spin_lock(&p->lock);
	next = rb_first(&p->root);
	while (next) {
		n = rb_entry(next, struct sp_node, nd);
		next = rb_next(&n->nd);
		rb_erase(&n->nd, &p->root);
		mpol_free(n->policy);
		kmem_cache_free(sn_cache, n);
	}
	spin_unlock(&p->lock);
}

/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL, NULL);

	/* Set interleaving policy for system init. This way not all
	   the data structures allocated at system boot end up in node zero. */

	if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
			      MAX_NUMNODES) < 0)
		printk("numa_policy_init: interleaving failed\n");
}

/* Reset policy of current process to default.
 * Assumes fs == KERNEL_DS */
void numa_default_policy(void)
{
	sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
}