blob: 13492d66b7c809ff603f6c996f5ea1aafbe51a34 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * Simple NUMA memory policy for the Linux kernel.
3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * Subject to the GNU Public License, version 2.
6 *
7 * NUMA policy allows the user to give hints in which node(s) memory should
8 * be allocated.
9 *
10 * Support four policies per VMA and per process:
11 *
12 * The VMA policy has priority over the process policy for a page fault.
13 *
14 * interleave Allocate memory interleaved over a set of nodes,
15 * with normal fallback if it fails.
16 * For VMA based allocations this interleaves based on the
17 * offset into the backing object or offset into the mapping
18 * for anonymous memory. For process policy an process counter
19 * is used.
20 * bind Only allocate memory on a specific set of nodes,
21 * no fallback.
22 * preferred Try a specific node first before normal fallback.
23 * As a special case node -1 here means do the allocation
24 * on the local CPU. This is normally identical to default,
25 * but useful to set in a VMA when you have a non default
26 * process policy.
27 * default Allocate on the local node first, or when on a VMA
28 * use the process policy. This is what Linux always did
29 * in a NUMA aware kernel and still does by, ahem, default.
30 *
31 * The process policy is applied for most non interrupt memory allocations
32 * in that process' context. Interrupts ignore the policies and always
33 * try to allocate on the local CPU. The VMA policy is only applied for memory
34 * allocations for a VMA in the VM.
35 *
36 * Currently there are a few corner cases in swapping where the policy
37 * is not applied, but the majority should be handled. When process policy
38 * is used it is not remembered over swap outs/swap ins.
39 *
40 * Only the highest zone in the zone hierarchy gets policied. Allocations
41 * requesting a lower zone just use default policy. This implies that
42 * on systems with highmem kernel lowmem allocation don't get policied.
43 * Same with GFP_DMA allocations.
44 *
45 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
46 * all users and remembered even when nobody has memory mapped.
47 */
48
49/* Notebook:
50 fix mmap readahead to honour policy and enable policy for any page cache
51 object
52 statistics for bigpages
53 global policy for page cache? currently it uses process policy. Requires
54 first item above.
55 handle mremap for shared memory (currently ignored for the policy)
56 grows down?
57 make bind policy root only? It can trigger oom much faster and the
58 kernel is not always grateful with that.
59 could replace all the switch()es with a mempolicy_ops structure.
60*/
61
62#include <linux/mempolicy.h>
63#include <linux/mm.h>
64#include <linux/highmem.h>
65#include <linux/hugetlb.h>
66#include <linux/kernel.h>
67#include <linux/sched.h>
68#include <linux/mm.h>
69#include <linux/nodemask.h>
70#include <linux/cpuset.h>
71#include <linux/gfp.h>
72#include <linux/slab.h>
73#include <linux/string.h>
74#include <linux/module.h>
75#include <linux/interrupt.h>
76#include <linux/init.h>
77#include <linux/compat.h>
78#include <linux/mempolicy.h>
79#include <asm/tlbflush.h>
80#include <asm/uaccess.h>
81
82static kmem_cache_t *policy_cache;
83static kmem_cache_t *sn_cache;
84
85#define PDprintk(fmt...)
86
87/* Highest zone. An specific allocation for a zone below that is not
88 policied. */
89static int policy_zone;
90
91static struct mempolicy default_policy = {
92 .refcnt = ATOMIC_INIT(1), /* never free it */
93 .policy = MPOL_DEFAULT,
94};
95
96/* Check if all specified nodes are online */
97static int nodes_online(unsigned long *nodes)
98{
99 DECLARE_BITMAP(online2, MAX_NUMNODES);
100
101 bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES);
102 if (bitmap_empty(online2, MAX_NUMNODES))
103 set_bit(0, online2);
104 if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
105 return -EINVAL;
106 return 0;
107}
108
109/* Do sanity checking on a policy */
110static int mpol_check_policy(int mode, unsigned long *nodes)
111{
112 int empty = bitmap_empty(nodes, MAX_NUMNODES);
113
114 switch (mode) {
115 case MPOL_DEFAULT:
116 if (!empty)
117 return -EINVAL;
118 break;
119 case MPOL_BIND:
120 case MPOL_INTERLEAVE:
121 /* Preferred will only use the first bit, but allow
122 more for now. */
123 if (empty)
124 return -EINVAL;
125 break;
126 }
127 return nodes_online(nodes);
128}
129
130/* Copy a node mask from user space. */
131static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
132 unsigned long maxnode, int mode)
133{
134 unsigned long k;
135 unsigned long nlongs;
136 unsigned long endmask;
137
138 --maxnode;
139 bitmap_zero(nodes, MAX_NUMNODES);
140 if (maxnode == 0 || !nmask)
141 return 0;
142
143 nlongs = BITS_TO_LONGS(maxnode);
144 if ((maxnode % BITS_PER_LONG) == 0)
145 endmask = ~0UL;
146 else
147 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
148
149 /* When the user specified more nodes than supported just check
150 if the non supported part is all zero. */
151 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
152 if (nlongs > PAGE_SIZE/sizeof(long))
153 return -EINVAL;
154 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
155 unsigned long t;
156 if (get_user(t, nmask + k))
157 return -EFAULT;
158 if (k == nlongs - 1) {
159 if (t & endmask)
160 return -EINVAL;
161 } else if (t)
162 return -EINVAL;
163 }
164 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
165 endmask = ~0UL;
166 }
167
168 if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
169 return -EFAULT;
170 nodes[nlongs-1] &= endmask;
171 /* Update current mems_allowed */
172 cpuset_update_current_mems_allowed();
173 /* Ignore nodes not set in current->mems_allowed */
174 cpuset_restrict_to_mems_allowed(nodes);
175 return mpol_check_policy(mode, nodes);
176}
177
178/* Generate a custom zonelist for the BIND policy. */
179static struct zonelist *bind_zonelist(unsigned long *nodes)
180{
181 struct zonelist *zl;
182 int num, max, nd;
183
184 max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
185 zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
186 if (!zl)
187 return NULL;
188 num = 0;
189 for (nd = find_first_bit(nodes, MAX_NUMNODES);
190 nd < MAX_NUMNODES;
191 nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
192 int k;
193 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
194 struct zone *z = &NODE_DATA(nd)->node_zones[k];
195 if (!z->present_pages)
196 continue;
197 zl->zones[num++] = z;
198 if (k > policy_zone)
199 policy_zone = k;
200 }
201 }
202 BUG_ON(num >= max);
203 zl->zones[num] = NULL;
204 return zl;
205}
206
207/* Create a new policy */
208static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
209{
210 struct mempolicy *policy;
211
212 PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
213 if (mode == MPOL_DEFAULT)
214 return NULL;
215 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
216 if (!policy)
217 return ERR_PTR(-ENOMEM);
218 atomic_set(&policy->refcnt, 1);
219 switch (mode) {
220 case MPOL_INTERLEAVE:
221 bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
222 break;
223 case MPOL_PREFERRED:
224 policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
225 if (policy->v.preferred_node >= MAX_NUMNODES)
226 policy->v.preferred_node = -1;
227 break;
228 case MPOL_BIND:
229 policy->v.zonelist = bind_zonelist(nodes);
230 if (policy->v.zonelist == NULL) {
231 kmem_cache_free(policy_cache, policy);
232 return ERR_PTR(-ENOMEM);
233 }
234 break;
235 }
236 policy->policy = mode;
237 return policy;
238}
239
240/* Ensure all existing pages follow the policy. */
Hugh Dickins91612e02005-06-21 17:15:07 -0700241static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
242 unsigned long addr, unsigned long end, unsigned long *nodes)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700243{
Hugh Dickins91612e02005-06-21 17:15:07 -0700244 pte_t *orig_pte;
245 pte_t *pte;
Hugh Dickins941150a2005-06-21 17:15:06 -0700246
247 spin_lock(&mm->page_table_lock);
Hugh Dickins91612e02005-06-21 17:15:07 -0700248 orig_pte = pte = pte_offset_map(pmd, addr);
249 do {
250 unsigned long pfn;
251 unsigned int nid;
252
253 if (!pte_present(*pte))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700254 continue;
Hugh Dickins91612e02005-06-21 17:15:07 -0700255 pfn = pte_pfn(*pte);
256 if (!pfn_valid(pfn))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700257 continue;
Hugh Dickins91612e02005-06-21 17:15:07 -0700258 nid = pfn_to_nid(pfn);
259 if (!test_bit(nid, nodes))
260 break;
261 } while (pte++, addr += PAGE_SIZE, addr != end);
262 pte_unmap(orig_pte);
Hugh Dickins941150a2005-06-21 17:15:06 -0700263 spin_unlock(&mm->page_table_lock);
Hugh Dickins91612e02005-06-21 17:15:07 -0700264 return addr != end;
265}
266
267static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
268 unsigned long addr, unsigned long end, unsigned long *nodes)
269{
270 pmd_t *pmd;
271 unsigned long next;
272
273 pmd = pmd_offset(pud, addr);
274 do {
275 next = pmd_addr_end(addr, end);
276 if (pmd_none_or_clear_bad(pmd))
277 continue;
278 if (check_pte_range(mm, pmd, addr, next, nodes))
279 return -EIO;
280 } while (pmd++, addr = next, addr != end);
281 return 0;
282}
283
284static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
285 unsigned long addr, unsigned long end, unsigned long *nodes)
286{
287 pud_t *pud;
288 unsigned long next;
289
290 pud = pud_offset(pgd, addr);
291 do {
292 next = pud_addr_end(addr, end);
293 if (pud_none_or_clear_bad(pud))
294 continue;
295 if (check_pmd_range(mm, pud, addr, next, nodes))
296 return -EIO;
297 } while (pud++, addr = next, addr != end);
298 return 0;
299}
300
301static inline int check_pgd_range(struct mm_struct *mm,
302 unsigned long addr, unsigned long end, unsigned long *nodes)
303{
304 pgd_t *pgd;
305 unsigned long next;
306
307 pgd = pgd_offset(mm, addr);
308 do {
309 next = pgd_addr_end(addr, end);
310 if (pgd_none_or_clear_bad(pgd))
311 continue;
312 if (check_pud_range(mm, pgd, addr, next, nodes))
313 return -EIO;
314 } while (pgd++, addr = next, addr != end);
315 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700316}
317
318/* Step 1: check the range */
319static struct vm_area_struct *
320check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
321 unsigned long *nodes, unsigned long flags)
322{
323 int err;
324 struct vm_area_struct *first, *vma, *prev;
325
326 first = find_vma(mm, start);
327 if (!first)
328 return ERR_PTR(-EFAULT);
329 prev = NULL;
330 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
331 if (!vma->vm_next && vma->vm_end < end)
332 return ERR_PTR(-EFAULT);
333 if (prev && prev->vm_end < vma->vm_start)
334 return ERR_PTR(-EFAULT);
335 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
Hugh Dickins91612e02005-06-21 17:15:07 -0700336 err = check_pgd_range(vma->vm_mm,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700337 vma->vm_start, vma->vm_end, nodes);
338 if (err) {
339 first = ERR_PTR(err);
340 break;
341 }
342 }
343 prev = vma;
344 }
345 return first;
346}
347
348/* Apply policy to a single VMA */
349static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
350{
351 int err = 0;
352 struct mempolicy *old = vma->vm_policy;
353
354 PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
355 vma->vm_start, vma->vm_end, vma->vm_pgoff,
356 vma->vm_ops, vma->vm_file,
357 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
358
359 if (vma->vm_ops && vma->vm_ops->set_policy)
360 err = vma->vm_ops->set_policy(vma, new);
361 if (!err) {
362 mpol_get(new);
363 vma->vm_policy = new;
364 mpol_free(old);
365 }
366 return err;
367}
368
369/* Step 2: apply policy to a range and do splits. */
370static int mbind_range(struct vm_area_struct *vma, unsigned long start,
371 unsigned long end, struct mempolicy *new)
372{
373 struct vm_area_struct *next;
374 int err;
375
376 err = 0;
377 for (; vma && vma->vm_start < end; vma = next) {
378 next = vma->vm_next;
379 if (vma->vm_start < start)
380 err = split_vma(vma->vm_mm, vma, start, 1);
381 if (!err && vma->vm_end > end)
382 err = split_vma(vma->vm_mm, vma, end, 0);
383 if (!err)
384 err = policy_vma(vma, new);
385 if (err)
386 break;
387 }
388 return err;
389}
390
391/* Change policy for a memory range */
392asmlinkage long sys_mbind(unsigned long start, unsigned long len,
393 unsigned long mode,
394 unsigned long __user *nmask, unsigned long maxnode,
395 unsigned flags)
396{
397 struct vm_area_struct *vma;
398 struct mm_struct *mm = current->mm;
399 struct mempolicy *new;
400 unsigned long end;
401 DECLARE_BITMAP(nodes, MAX_NUMNODES);
402 int err;
403
404 if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
405 return -EINVAL;
406 if (start & ~PAGE_MASK)
407 return -EINVAL;
408 if (mode == MPOL_DEFAULT)
409 flags &= ~MPOL_MF_STRICT;
410 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
411 end = start + len;
412 if (end < start)
413 return -EINVAL;
414 if (end == start)
415 return 0;
416
417 err = get_nodes(nodes, nmask, maxnode, mode);
418 if (err)
419 return err;
420
421 new = mpol_new(mode, nodes);
422 if (IS_ERR(new))
423 return PTR_ERR(new);
424
425 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
426 mode,nodes[0]);
427
428 down_write(&mm->mmap_sem);
429 vma = check_range(mm, start, end, nodes, flags);
430 err = PTR_ERR(vma);
431 if (!IS_ERR(vma))
432 err = mbind_range(vma, start, end, new);
433 up_write(&mm->mmap_sem);
434 mpol_free(new);
435 return err;
436}
437
438/* Set the process memory policy */
439asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
440 unsigned long maxnode)
441{
442 int err;
443 struct mempolicy *new;
444 DECLARE_BITMAP(nodes, MAX_NUMNODES);
445
Eric Dumazetba171012005-08-01 21:11:43 -0700446 if (mode < 0 || mode > MPOL_MAX)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700447 return -EINVAL;
448 err = get_nodes(nodes, nmask, maxnode, mode);
449 if (err)
450 return err;
451 new = mpol_new(mode, nodes);
452 if (IS_ERR(new))
453 return PTR_ERR(new);
454 mpol_free(current->mempolicy);
455 current->mempolicy = new;
456 if (new && new->policy == MPOL_INTERLEAVE)
457 current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
458 return 0;
459}
460
461/* Fill a zone bitmap for a policy */
462static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
463{
464 int i;
465
466 bitmap_zero(nodes, MAX_NUMNODES);
467 switch (p->policy) {
468 case MPOL_BIND:
469 for (i = 0; p->v.zonelist->zones[i]; i++)
470 __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
471 break;
472 case MPOL_DEFAULT:
473 break;
474 case MPOL_INTERLEAVE:
475 bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
476 break;
477 case MPOL_PREFERRED:
478 /* or use current node instead of online map? */
479 if (p->v.preferred_node < 0)
480 bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES);
481 else
482 __set_bit(p->v.preferred_node, nodes);
483 break;
484 default:
485 BUG();
486 }
487}
488
489static int lookup_node(struct mm_struct *mm, unsigned long addr)
490{
491 struct page *p;
492 int err;
493
494 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
495 if (err >= 0) {
496 err = page_to_nid(p);
497 put_page(p);
498 }
499 return err;
500}
501
502/* Copy a kernel node mask to user space */
503static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
504 void *nodes, unsigned nbytes)
505{
506 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
507
508 if (copy > nbytes) {
509 if (copy > PAGE_SIZE)
510 return -EINVAL;
511 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
512 return -EFAULT;
513 copy = nbytes;
514 }
515 return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
516}
517
518/* Retrieve NUMA policy */
519asmlinkage long sys_get_mempolicy(int __user *policy,
520 unsigned long __user *nmask,
521 unsigned long maxnode,
522 unsigned long addr, unsigned long flags)
523{
524 int err, pval;
525 struct mm_struct *mm = current->mm;
526 struct vm_area_struct *vma = NULL;
527 struct mempolicy *pol = current->mempolicy;
528
529 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
530 return -EINVAL;
531 if (nmask != NULL && maxnode < MAX_NUMNODES)
532 return -EINVAL;
533 if (flags & MPOL_F_ADDR) {
534 down_read(&mm->mmap_sem);
535 vma = find_vma_intersection(mm, addr, addr+1);
536 if (!vma) {
537 up_read(&mm->mmap_sem);
538 return -EFAULT;
539 }
540 if (vma->vm_ops && vma->vm_ops->get_policy)
541 pol = vma->vm_ops->get_policy(vma, addr);
542 else
543 pol = vma->vm_policy;
544 } else if (addr)
545 return -EINVAL;
546
547 if (!pol)
548 pol = &default_policy;
549
550 if (flags & MPOL_F_NODE) {
551 if (flags & MPOL_F_ADDR) {
552 err = lookup_node(mm, addr);
553 if (err < 0)
554 goto out;
555 pval = err;
556 } else if (pol == current->mempolicy &&
557 pol->policy == MPOL_INTERLEAVE) {
558 pval = current->il_next;
559 } else {
560 err = -EINVAL;
561 goto out;
562 }
563 } else
564 pval = pol->policy;
565
566 if (vma) {
567 up_read(&current->mm->mmap_sem);
568 vma = NULL;
569 }
570
571 if (policy && put_user(pval, policy))
572 return -EFAULT;
573
574 err = 0;
575 if (nmask) {
576 DECLARE_BITMAP(nodes, MAX_NUMNODES);
577 get_zonemask(pol, nodes);
578 err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
579 }
580
581 out:
582 if (vma)
583 up_read(&current->mm->mmap_sem);
584 return err;
585}
586
587#ifdef CONFIG_COMPAT
588
589asmlinkage long compat_sys_get_mempolicy(int __user *policy,
590 compat_ulong_t __user *nmask,
591 compat_ulong_t maxnode,
592 compat_ulong_t addr, compat_ulong_t flags)
593{
594 long err;
595 unsigned long __user *nm = NULL;
596 unsigned long nr_bits, alloc_size;
597 DECLARE_BITMAP(bm, MAX_NUMNODES);
598
599 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
600 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
601
602 if (nmask)
603 nm = compat_alloc_user_space(alloc_size);
604
605 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
606
607 if (!err && nmask) {
608 err = copy_from_user(bm, nm, alloc_size);
609 /* ensure entire bitmap is zeroed */
610 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
611 err |= compat_put_bitmap(nmask, bm, nr_bits);
612 }
613
614 return err;
615}
616
617asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
618 compat_ulong_t maxnode)
619{
620 long err = 0;
621 unsigned long __user *nm = NULL;
622 unsigned long nr_bits, alloc_size;
623 DECLARE_BITMAP(bm, MAX_NUMNODES);
624
625 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
626 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
627
628 if (nmask) {
629 err = compat_get_bitmap(bm, nmask, nr_bits);
630 nm = compat_alloc_user_space(alloc_size);
631 err |= copy_to_user(nm, bm, alloc_size);
632 }
633
634 if (err)
635 return -EFAULT;
636
637 return sys_set_mempolicy(mode, nm, nr_bits+1);
638}
639
640asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
641 compat_ulong_t mode, compat_ulong_t __user *nmask,
642 compat_ulong_t maxnode, compat_ulong_t flags)
643{
644 long err = 0;
645 unsigned long __user *nm = NULL;
646 unsigned long nr_bits, alloc_size;
647 DECLARE_BITMAP(bm, MAX_NUMNODES);
648
649 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
650 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
651
652 if (nmask) {
653 err = compat_get_bitmap(bm, nmask, nr_bits);
654 nm = compat_alloc_user_space(alloc_size);
655 err |= copy_to_user(nm, bm, alloc_size);
656 }
657
658 if (err)
659 return -EFAULT;
660
661 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
662}
663
664#endif
665
666/* Return effective policy for a VMA */
Christoph Lameter6e21c8f2005-09-03 15:54:45 -0700667struct mempolicy *
668get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700669{
Christoph Lameter6e21c8f2005-09-03 15:54:45 -0700670 struct mempolicy *pol = task->mempolicy;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700671
672 if (vma) {
673 if (vma->vm_ops && vma->vm_ops->get_policy)
674 pol = vma->vm_ops->get_policy(vma, addr);
675 else if (vma->vm_policy &&
676 vma->vm_policy->policy != MPOL_DEFAULT)
677 pol = vma->vm_policy;
678 }
679 if (!pol)
680 pol = &default_policy;
681 return pol;
682}
683
684/* Return a zonelist representing a mempolicy */
685static struct zonelist *zonelist_policy(unsigned int __nocast gfp, struct mempolicy *policy)
686{
687 int nd;
688
689 switch (policy->policy) {
690 case MPOL_PREFERRED:
691 nd = policy->v.preferred_node;
692 if (nd < 0)
693 nd = numa_node_id();
694 break;
695 case MPOL_BIND:
696 /* Lower zones don't get a policy applied */
697 /* Careful: current->mems_allowed might have moved */
Al Viro01424962005-04-24 12:28:34 -0700698 if ((gfp & GFP_ZONEMASK) >= policy_zone)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700699 if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
700 return policy->v.zonelist;
701 /*FALL THROUGH*/
702 case MPOL_INTERLEAVE: /* should not happen */
703 case MPOL_DEFAULT:
704 nd = numa_node_id();
705 break;
706 default:
707 nd = 0;
708 BUG();
709 }
710 return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK);
711}
712
713/* Do dynamic interleaving for a process */
714static unsigned interleave_nodes(struct mempolicy *policy)
715{
716 unsigned nid, next;
717 struct task_struct *me = current;
718
719 nid = me->il_next;
720 BUG_ON(nid >= MAX_NUMNODES);
721 next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
722 if (next >= MAX_NUMNODES)
723 next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
724 me->il_next = next;
725 return nid;
726}
727
728/* Do static interleaving for a VMA with known offset. */
729static unsigned offset_il_node(struct mempolicy *pol,
730 struct vm_area_struct *vma, unsigned long off)
731{
732 unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
733 unsigned target = (unsigned)off % nnodes;
734 int c;
735 int nid = -1;
736
737 c = 0;
738 do {
739 nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
740 c++;
741 } while (c <= target);
742 BUG_ON(nid >= MAX_NUMNODES);
743 BUG_ON(!test_bit(nid, pol->v.nodes));
744 return nid;
745}
746
747/* Allocate a page in interleaved policy.
748 Own path because it needs to do special accounting. */
749static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned order, unsigned nid)
750{
751 struct zonelist *zl;
752 struct page *page;
753
754 BUG_ON(!node_online(nid));
755 zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
756 page = __alloc_pages(gfp, order, zl);
757 if (page && page_zone(page) == zl->zones[0]) {
Christoph Lametere7c8d5c2005-06-21 17:14:47 -0700758 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700759 put_cpu();
760 }
761 return page;
762}
763
764/**
765 * alloc_page_vma - Allocate a page for a VMA.
766 *
767 * @gfp:
768 * %GFP_USER user allocation.
769 * %GFP_KERNEL kernel allocations,
770 * %GFP_HIGHMEM highmem/user allocations,
771 * %GFP_FS allocation should not call back into a file system.
772 * %GFP_ATOMIC don't sleep.
773 *
774 * @vma: Pointer to VMA or NULL if not available.
775 * @addr: Virtual Address of the allocation. Must be inside the VMA.
776 *
777 * This function allocates a page from the kernel page pool and applies
778 * a NUMA policy associated with the VMA or the current process.
779 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
780 * mm_struct of the VMA to prevent it from going away. Should be used for
781 * all allocations for pages that will be mapped into
782 * user space. Returns NULL when no page can be allocated.
783 *
784 * Should be called with the mm_sem of the vma hold.
785 */
786struct page *
787alloc_page_vma(unsigned int __nocast gfp, struct vm_area_struct *vma, unsigned long addr)
788{
Christoph Lameter6e21c8f2005-09-03 15:54:45 -0700789 struct mempolicy *pol = get_vma_policy(current, vma, addr);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700790
791 cpuset_update_current_mems_allowed();
792
793 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
794 unsigned nid;
795 if (vma) {
796 unsigned long off;
797 BUG_ON(addr >= vma->vm_end);
798 BUG_ON(addr < vma->vm_start);
799 off = vma->vm_pgoff;
800 off += (addr - vma->vm_start) >> PAGE_SHIFT;
801 nid = offset_il_node(pol, vma, off);
802 } else {
803 /* fall back to process interleaving */
804 nid = interleave_nodes(pol);
805 }
806 return alloc_page_interleave(gfp, 0, nid);
807 }
808 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
809}
810
811/**
812 * alloc_pages_current - Allocate pages.
813 *
814 * @gfp:
815 * %GFP_USER user allocation,
816 * %GFP_KERNEL kernel allocation,
817 * %GFP_HIGHMEM highmem allocation,
818 * %GFP_FS don't call back into a file system.
819 * %GFP_ATOMIC don't sleep.
820 * @order: Power of two of allocation size in pages. 0 is a single page.
821 *
822 * Allocate a page from the kernel page pool. When not in
823 * interrupt context and apply the current process NUMA policy.
824 * Returns NULL when no page can be allocated.
825 *
826 * Don't call cpuset_update_current_mems_allowed() unless
827 * 1) it's ok to take cpuset_sem (can WAIT), and
828 * 2) allocating for current task (not interrupt).
829 */
830struct page *alloc_pages_current(unsigned int __nocast gfp, unsigned order)
831{
832 struct mempolicy *pol = current->mempolicy;
833
834 if ((gfp & __GFP_WAIT) && !in_interrupt())
835 cpuset_update_current_mems_allowed();
836 if (!pol || in_interrupt())
837 pol = &default_policy;
838 if (pol->policy == MPOL_INTERLEAVE)
839 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
840 return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
841}
842EXPORT_SYMBOL(alloc_pages_current);
843
844/* Slow path of a mempolicy copy */
845struct mempolicy *__mpol_copy(struct mempolicy *old)
846{
847 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
848
849 if (!new)
850 return ERR_PTR(-ENOMEM);
851 *new = *old;
852 atomic_set(&new->refcnt, 1);
853 if (new->policy == MPOL_BIND) {
854 int sz = ksize(old->v.zonelist);
855 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
856 if (!new->v.zonelist) {
857 kmem_cache_free(policy_cache, new);
858 return ERR_PTR(-ENOMEM);
859 }
860 memcpy(new->v.zonelist, old->v.zonelist, sz);
861 }
862 return new;
863}
864
865/* Slow path of a mempolicy comparison */
866int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
867{
868 if (!a || !b)
869 return 0;
870 if (a->policy != b->policy)
871 return 0;
872 switch (a->policy) {
873 case MPOL_DEFAULT:
874 return 1;
875 case MPOL_INTERLEAVE:
876 return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
877 case MPOL_PREFERRED:
878 return a->v.preferred_node == b->v.preferred_node;
879 case MPOL_BIND: {
880 int i;
881 for (i = 0; a->v.zonelist->zones[i]; i++)
882 if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
883 return 0;
884 return b->v.zonelist->zones[i] == NULL;
885 }
886 default:
887 BUG();
888 return 0;
889 }
890}
891
892/* Slow path of a mpol destructor. */
893void __mpol_free(struct mempolicy *p)
894{
895 if (!atomic_dec_and_test(&p->refcnt))
896 return;
897 if (p->policy == MPOL_BIND)
898 kfree(p->v.zonelist);
899 p->policy = MPOL_DEFAULT;
900 kmem_cache_free(policy_cache, p);
901}
902
903/*
904 * Hugetlb policy. Same as above, just works with node numbers instead of
905 * zonelists.
906 */
907
908/* Find first node suitable for an allocation */
909int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
910{
Christoph Lameter6e21c8f2005-09-03 15:54:45 -0700911 struct mempolicy *pol = get_vma_policy(current, vma, addr);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700912
913 switch (pol->policy) {
914 case MPOL_DEFAULT:
915 return numa_node_id();
916 case MPOL_BIND:
917 return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
918 case MPOL_INTERLEAVE:
919 return interleave_nodes(pol);
920 case MPOL_PREFERRED:
921 return pol->v.preferred_node >= 0 ?
922 pol->v.preferred_node : numa_node_id();
923 }
924 BUG();
925 return 0;
926}
927
928/* Find secondary valid nodes for an allocation */
929int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
930{
Christoph Lameter6e21c8f2005-09-03 15:54:45 -0700931 struct mempolicy *pol = get_vma_policy(current, vma, addr);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700932
933 switch (pol->policy) {
934 case MPOL_PREFERRED:
935 case MPOL_DEFAULT:
936 case MPOL_INTERLEAVE:
937 return 1;
938 case MPOL_BIND: {
939 struct zone **z;
940 for (z = pol->v.zonelist->zones; *z; z++)
941 if ((*z)->zone_pgdat->node_id == nid)
942 return 1;
943 return 0;
944 }
945 default:
946 BUG();
947 return 0;
948 }
949}
950
951/*
952 * Shared memory backing store policy support.
953 *
954 * Remember policies even when nobody has shared memory mapped.
955 * The policies are kept in Red-Black tree linked from the inode.
956 * They are protected by the sp->lock spinlock, which should be held
957 * for any accesses to the tree.
958 */
959
960/* lookup first element intersecting start-end */
961/* Caller holds sp->lock */
962static struct sp_node *
963sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
964{
965 struct rb_node *n = sp->root.rb_node;
966
967 while (n) {
968 struct sp_node *p = rb_entry(n, struct sp_node, nd);
969
970 if (start >= p->end)
971 n = n->rb_right;
972 else if (end <= p->start)
973 n = n->rb_left;
974 else
975 break;
976 }
977 if (!n)
978 return NULL;
979 for (;;) {
980 struct sp_node *w = NULL;
981 struct rb_node *prev = rb_prev(n);
982 if (!prev)
983 break;
984 w = rb_entry(prev, struct sp_node, nd);
985 if (w->end <= start)
986 break;
987 n = prev;
988 }
989 return rb_entry(n, struct sp_node, nd);
990}
991
992/* Insert a new shared policy into the list. */
993/* Caller holds sp->lock */
994static void sp_insert(struct shared_policy *sp, struct sp_node *new)
995{
996 struct rb_node **p = &sp->root.rb_node;
997 struct rb_node *parent = NULL;
998 struct sp_node *nd;
999
1000 while (*p) {
1001 parent = *p;
1002 nd = rb_entry(parent, struct sp_node, nd);
1003 if (new->start < nd->start)
1004 p = &(*p)->rb_left;
1005 else if (new->end > nd->end)
1006 p = &(*p)->rb_right;
1007 else
1008 BUG();
1009 }
1010 rb_link_node(&new->nd, parent, p);
1011 rb_insert_color(&new->nd, &sp->root);
1012 PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1013 new->policy ? new->policy->policy : 0);
1014}
1015
1016/* Find shared policy intersecting idx */
1017struct mempolicy *
1018mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1019{
1020 struct mempolicy *pol = NULL;
1021 struct sp_node *sn;
1022
1023 if (!sp->root.rb_node)
1024 return NULL;
1025 spin_lock(&sp->lock);
1026 sn = sp_lookup(sp, idx, idx+1);
1027 if (sn) {
1028 mpol_get(sn->policy);
1029 pol = sn->policy;
1030 }
1031 spin_unlock(&sp->lock);
1032 return pol;
1033}
1034
1035static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1036{
1037 PDprintk("deleting %lx-l%x\n", n->start, n->end);
1038 rb_erase(&n->nd, &sp->root);
1039 mpol_free(n->policy);
1040 kmem_cache_free(sn_cache, n);
1041}
1042
1043struct sp_node *
1044sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1045{
1046 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1047
1048 if (!n)
1049 return NULL;
1050 n->start = start;
1051 n->end = end;
1052 mpol_get(pol);
1053 n->policy = pol;
1054 return n;
1055}
1056
1057/* Replace a policy range. */
1058static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1059 unsigned long end, struct sp_node *new)
1060{
1061 struct sp_node *n, *new2 = NULL;
1062
1063restart:
1064 spin_lock(&sp->lock);
1065 n = sp_lookup(sp, start, end);
1066 /* Take care of old policies in the same range. */
1067 while (n && n->start < end) {
1068 struct rb_node *next = rb_next(&n->nd);
1069 if (n->start >= start) {
1070 if (n->end <= end)
1071 sp_delete(sp, n);
1072 else
1073 n->start = end;
1074 } else {
1075 /* Old policy spanning whole new range. */
1076 if (n->end > end) {
1077 if (!new2) {
1078 spin_unlock(&sp->lock);
1079 new2 = sp_alloc(end, n->end, n->policy);
1080 if (!new2)
1081 return -ENOMEM;
1082 goto restart;
1083 }
1084 n->end = start;
1085 sp_insert(sp, new2);
1086 new2 = NULL;
1087 break;
1088 } else
1089 n->end = start;
1090 }
1091 if (!next)
1092 break;
1093 n = rb_entry(next, struct sp_node, nd);
1094 }
1095 if (new)
1096 sp_insert(sp, new);
1097 spin_unlock(&sp->lock);
1098 if (new2) {
1099 mpol_free(new2->policy);
1100 kmem_cache_free(sn_cache, new2);
1101 }
1102 return 0;
1103}
1104
1105int mpol_set_shared_policy(struct shared_policy *info,
1106 struct vm_area_struct *vma, struct mempolicy *npol)
1107{
1108 int err;
1109 struct sp_node *new = NULL;
1110 unsigned long sz = vma_pages(vma);
1111
1112 PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1113 vma->vm_pgoff,
1114 sz, npol? npol->policy : -1,
1115 npol ? npol->v.nodes[0] : -1);
1116
1117 if (npol) {
1118 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1119 if (!new)
1120 return -ENOMEM;
1121 }
1122 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1123 if (err && new)
1124 kmem_cache_free(sn_cache, new);
1125 return err;
1126}
1127
1128/* Free a backing policy store on inode delete. */
1129void mpol_free_shared_policy(struct shared_policy *p)
1130{
1131 struct sp_node *n;
1132 struct rb_node *next;
1133
1134 if (!p->root.rb_node)
1135 return;
1136 spin_lock(&p->lock);
1137 next = rb_first(&p->root);
1138 while (next) {
1139 n = rb_entry(next, struct sp_node, nd);
1140 next = rb_next(&n->nd);
Andi Kleen90c50292005-07-27 11:43:50 -07001141 rb_erase(&n->nd, &p->root);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001142 mpol_free(n->policy);
1143 kmem_cache_free(sn_cache, n);
1144 }
1145 spin_unlock(&p->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001146}
1147
1148/* assumes fs == KERNEL_DS */
1149void __init numa_policy_init(void)
1150{
1151 policy_cache = kmem_cache_create("numa_policy",
1152 sizeof(struct mempolicy),
1153 0, SLAB_PANIC, NULL, NULL);
1154
1155 sn_cache = kmem_cache_create("shared_policy_node",
1156 sizeof(struct sp_node),
1157 0, SLAB_PANIC, NULL, NULL);
1158
1159 /* Set interleaving policy for system init. This way not all
1160 the data structures allocated at system boot end up in node zero. */
1161
1162 if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
1163 MAX_NUMNODES) < 0)
1164 printk("numa_policy_init: interleaving failed\n");
1165}
1166
1167/* Reset policy of current process to default.
1168 * Assumes fs == KERNEL_DS */
1169void numa_default_policy(void)
1170{
1171 sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
1172}