blob: 08c41da429cf015e9c22443ae5d811d4f5fa3569 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * Simple NUMA memory policy for the Linux kernel.
3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * Subject to the GNU Public License, version 2.
6 *
7 * NUMA policy allows the user to give hints in which node(s) memory should
8 * be allocated.
9 *
10 * Support four policies per VMA and per process:
11 *
12 * The VMA policy has priority over the process policy for a page fault.
13 *
14 * interleave Allocate memory interleaved over a set of nodes,
15 * with normal fallback if it fails.
16 * For VMA based allocations this interleaves based on the
17 * offset into the backing object or offset into the mapping
18 * for anonymous memory. For process policy an process counter
19 * is used.
20 * bind Only allocate memory on a specific set of nodes,
21 * no fallback.
22 * preferred Try a specific node first before normal fallback.
23 * As a special case node -1 here means do the allocation
24 * on the local CPU. This is normally identical to default,
25 * but useful to set in a VMA when you have a non default
26 * process policy.
27 * default Allocate on the local node first, or when on a VMA
28 * use the process policy. This is what Linux always did
29 * in a NUMA aware kernel and still does by, ahem, default.
30 *
31 * The process policy is applied for most non interrupt memory allocations
32 * in that process' context. Interrupts ignore the policies and always
33 * try to allocate on the local CPU. The VMA policy is only applied for memory
34 * allocations for a VMA in the VM.
35 *
36 * Currently there are a few corner cases in swapping where the policy
37 * is not applied, but the majority should be handled. When process policy
38 * is used it is not remembered over swap outs/swap ins.
39 *
40 * Only the highest zone in the zone hierarchy gets policied. Allocations
41 * requesting a lower zone just use default policy. This implies that
42 * on systems with highmem kernel lowmem allocation don't get policied.
43 * Same with GFP_DMA allocations.
44 *
45 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
46 * all users and remembered even when nobody has memory mapped.
47 */
48
49/* Notebook:
50 fix mmap readahead to honour policy and enable policy for any page cache
51 object
52 statistics for bigpages
53 global policy for page cache? currently it uses process policy. Requires
54 first item above.
55 handle mremap for shared memory (currently ignored for the policy)
56 grows down?
57 make bind policy root only? It can trigger oom much faster and the
58 kernel is not always grateful with that.
59 could replace all the switch()es with a mempolicy_ops structure.
60*/
61
62#include <linux/mempolicy.h>
63#include <linux/mm.h>
64#include <linux/highmem.h>
65#include <linux/hugetlb.h>
66#include <linux/kernel.h>
67#include <linux/sched.h>
68#include <linux/mm.h>
69#include <linux/nodemask.h>
70#include <linux/cpuset.h>
71#include <linux/gfp.h>
72#include <linux/slab.h>
73#include <linux/string.h>
74#include <linux/module.h>
75#include <linux/interrupt.h>
76#include <linux/init.h>
77#include <linux/compat.h>
78#include <linux/mempolicy.h>
79#include <asm/tlbflush.h>
80#include <asm/uaccess.h>
81
82static kmem_cache_t *policy_cache;
83static kmem_cache_t *sn_cache;
84
85#define PDprintk(fmt...)
86
87/* Highest zone. An specific allocation for a zone below that is not
88 policied. */
89static int policy_zone;
90
91static struct mempolicy default_policy = {
92 .refcnt = ATOMIC_INIT(1), /* never free it */
93 .policy = MPOL_DEFAULT,
94};
95
96/* Check if all specified nodes are online */
97static int nodes_online(unsigned long *nodes)
98{
99 DECLARE_BITMAP(online2, MAX_NUMNODES);
100
101 bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES);
102 if (bitmap_empty(online2, MAX_NUMNODES))
103 set_bit(0, online2);
104 if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
105 return -EINVAL;
106 return 0;
107}
108
109/* Do sanity checking on a policy */
110static int mpol_check_policy(int mode, unsigned long *nodes)
111{
112 int empty = bitmap_empty(nodes, MAX_NUMNODES);
113
114 switch (mode) {
115 case MPOL_DEFAULT:
116 if (!empty)
117 return -EINVAL;
118 break;
119 case MPOL_BIND:
120 case MPOL_INTERLEAVE:
121 /* Preferred will only use the first bit, but allow
122 more for now. */
123 if (empty)
124 return -EINVAL;
125 break;
126 }
127 return nodes_online(nodes);
128}
129
130/* Copy a node mask from user space. */
131static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
132 unsigned long maxnode, int mode)
133{
134 unsigned long k;
135 unsigned long nlongs;
136 unsigned long endmask;
137
138 --maxnode;
139 bitmap_zero(nodes, MAX_NUMNODES);
140 if (maxnode == 0 || !nmask)
141 return 0;
142
143 nlongs = BITS_TO_LONGS(maxnode);
144 if ((maxnode % BITS_PER_LONG) == 0)
145 endmask = ~0UL;
146 else
147 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
148
149 /* When the user specified more nodes than supported just check
150 if the non supported part is all zero. */
151 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
152 if (nlongs > PAGE_SIZE/sizeof(long))
153 return -EINVAL;
154 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
155 unsigned long t;
156 if (get_user(t, nmask + k))
157 return -EFAULT;
158 if (k == nlongs - 1) {
159 if (t & endmask)
160 return -EINVAL;
161 } else if (t)
162 return -EINVAL;
163 }
164 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
165 endmask = ~0UL;
166 }
167
168 if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
169 return -EFAULT;
170 nodes[nlongs-1] &= endmask;
171 /* Update current mems_allowed */
172 cpuset_update_current_mems_allowed();
173 /* Ignore nodes not set in current->mems_allowed */
174 cpuset_restrict_to_mems_allowed(nodes);
175 return mpol_check_policy(mode, nodes);
176}
177
178/* Generate a custom zonelist for the BIND policy. */
179static struct zonelist *bind_zonelist(unsigned long *nodes)
180{
181 struct zonelist *zl;
182 int num, max, nd;
183
184 max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
185 zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
186 if (!zl)
187 return NULL;
188 num = 0;
189 for (nd = find_first_bit(nodes, MAX_NUMNODES);
190 nd < MAX_NUMNODES;
191 nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
192 int k;
193 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
194 struct zone *z = &NODE_DATA(nd)->node_zones[k];
195 if (!z->present_pages)
196 continue;
197 zl->zones[num++] = z;
198 if (k > policy_zone)
199 policy_zone = k;
200 }
201 }
202 BUG_ON(num >= max);
203 zl->zones[num] = NULL;
204 return zl;
205}
206
207/* Create a new policy */
208static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
209{
210 struct mempolicy *policy;
211
212 PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
213 if (mode == MPOL_DEFAULT)
214 return NULL;
215 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
216 if (!policy)
217 return ERR_PTR(-ENOMEM);
218 atomic_set(&policy->refcnt, 1);
219 switch (mode) {
220 case MPOL_INTERLEAVE:
221 bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
222 break;
223 case MPOL_PREFERRED:
224 policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
225 if (policy->v.preferred_node >= MAX_NUMNODES)
226 policy->v.preferred_node = -1;
227 break;
228 case MPOL_BIND:
229 policy->v.zonelist = bind_zonelist(nodes);
230 if (policy->v.zonelist == NULL) {
231 kmem_cache_free(policy_cache, policy);
232 return ERR_PTR(-ENOMEM);
233 }
234 break;
235 }
236 policy->policy = mode;
237 return policy;
238}
239
240/* Ensure all existing pages follow the policy. */
241static int
242verify_pages(struct mm_struct *mm,
243 unsigned long addr, unsigned long end, unsigned long *nodes)
244{
245 while (addr < end) {
246 struct page *p;
247 pte_t *pte;
248 pmd_t *pmd;
249 pud_t *pud;
250 pgd_t *pgd;
251 pgd = pgd_offset(mm, addr);
252 if (pgd_none(*pgd)) {
253 unsigned long next = (addr + PGDIR_SIZE) & PGDIR_MASK;
254 if (next > addr)
255 break;
256 addr = next;
257 continue;
258 }
259 pud = pud_offset(pgd, addr);
260 if (pud_none(*pud)) {
261 addr = (addr + PUD_SIZE) & PUD_MASK;
262 continue;
263 }
264 pmd = pmd_offset(pud, addr);
265 if (pmd_none(*pmd)) {
266 addr = (addr + PMD_SIZE) & PMD_MASK;
267 continue;
268 }
269 p = NULL;
270 pte = pte_offset_map(pmd, addr);
271 if (pte_present(*pte))
272 p = pte_page(*pte);
273 pte_unmap(pte);
274 if (p) {
275 unsigned nid = page_to_nid(p);
276 if (!test_bit(nid, nodes))
277 return -EIO;
278 }
279 addr += PAGE_SIZE;
280 }
281 return 0;
282}
283
284/* Step 1: check the range */
285static struct vm_area_struct *
286check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
287 unsigned long *nodes, unsigned long flags)
288{
289 int err;
290 struct vm_area_struct *first, *vma, *prev;
291
292 first = find_vma(mm, start);
293 if (!first)
294 return ERR_PTR(-EFAULT);
295 prev = NULL;
296 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
297 if (!vma->vm_next && vma->vm_end < end)
298 return ERR_PTR(-EFAULT);
299 if (prev && prev->vm_end < vma->vm_start)
300 return ERR_PTR(-EFAULT);
301 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
302 err = verify_pages(vma->vm_mm,
303 vma->vm_start, vma->vm_end, nodes);
304 if (err) {
305 first = ERR_PTR(err);
306 break;
307 }
308 }
309 prev = vma;
310 }
311 return first;
312}
313
314/* Apply policy to a single VMA */
315static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
316{
317 int err = 0;
318 struct mempolicy *old = vma->vm_policy;
319
320 PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
321 vma->vm_start, vma->vm_end, vma->vm_pgoff,
322 vma->vm_ops, vma->vm_file,
323 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
324
325 if (vma->vm_ops && vma->vm_ops->set_policy)
326 err = vma->vm_ops->set_policy(vma, new);
327 if (!err) {
328 mpol_get(new);
329 vma->vm_policy = new;
330 mpol_free(old);
331 }
332 return err;
333}
334
335/* Step 2: apply policy to a range and do splits. */
336static int mbind_range(struct vm_area_struct *vma, unsigned long start,
337 unsigned long end, struct mempolicy *new)
338{
339 struct vm_area_struct *next;
340 int err;
341
342 err = 0;
343 for (; vma && vma->vm_start < end; vma = next) {
344 next = vma->vm_next;
345 if (vma->vm_start < start)
346 err = split_vma(vma->vm_mm, vma, start, 1);
347 if (!err && vma->vm_end > end)
348 err = split_vma(vma->vm_mm, vma, end, 0);
349 if (!err)
350 err = policy_vma(vma, new);
351 if (err)
352 break;
353 }
354 return err;
355}
356
357/* Change policy for a memory range */
358asmlinkage long sys_mbind(unsigned long start, unsigned long len,
359 unsigned long mode,
360 unsigned long __user *nmask, unsigned long maxnode,
361 unsigned flags)
362{
363 struct vm_area_struct *vma;
364 struct mm_struct *mm = current->mm;
365 struct mempolicy *new;
366 unsigned long end;
367 DECLARE_BITMAP(nodes, MAX_NUMNODES);
368 int err;
369
370 if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
371 return -EINVAL;
372 if (start & ~PAGE_MASK)
373 return -EINVAL;
374 if (mode == MPOL_DEFAULT)
375 flags &= ~MPOL_MF_STRICT;
376 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
377 end = start + len;
378 if (end < start)
379 return -EINVAL;
380 if (end == start)
381 return 0;
382
383 err = get_nodes(nodes, nmask, maxnode, mode);
384 if (err)
385 return err;
386
387 new = mpol_new(mode, nodes);
388 if (IS_ERR(new))
389 return PTR_ERR(new);
390
391 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
392 mode,nodes[0]);
393
394 down_write(&mm->mmap_sem);
395 vma = check_range(mm, start, end, nodes, flags);
396 err = PTR_ERR(vma);
397 if (!IS_ERR(vma))
398 err = mbind_range(vma, start, end, new);
399 up_write(&mm->mmap_sem);
400 mpol_free(new);
401 return err;
402}
403
404/* Set the process memory policy */
405asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
406 unsigned long maxnode)
407{
408 int err;
409 struct mempolicy *new;
410 DECLARE_BITMAP(nodes, MAX_NUMNODES);
411
412 if (mode > MPOL_MAX)
413 return -EINVAL;
414 err = get_nodes(nodes, nmask, maxnode, mode);
415 if (err)
416 return err;
417 new = mpol_new(mode, nodes);
418 if (IS_ERR(new))
419 return PTR_ERR(new);
420 mpol_free(current->mempolicy);
421 current->mempolicy = new;
422 if (new && new->policy == MPOL_INTERLEAVE)
423 current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
424 return 0;
425}
426
427/* Fill a zone bitmap for a policy */
428static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
429{
430 int i;
431
432 bitmap_zero(nodes, MAX_NUMNODES);
433 switch (p->policy) {
434 case MPOL_BIND:
435 for (i = 0; p->v.zonelist->zones[i]; i++)
436 __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
437 break;
438 case MPOL_DEFAULT:
439 break;
440 case MPOL_INTERLEAVE:
441 bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
442 break;
443 case MPOL_PREFERRED:
444 /* or use current node instead of online map? */
445 if (p->v.preferred_node < 0)
446 bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES);
447 else
448 __set_bit(p->v.preferred_node, nodes);
449 break;
450 default:
451 BUG();
452 }
453}
454
455static int lookup_node(struct mm_struct *mm, unsigned long addr)
456{
457 struct page *p;
458 int err;
459
460 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
461 if (err >= 0) {
462 err = page_to_nid(p);
463 put_page(p);
464 }
465 return err;
466}
467
468/* Copy a kernel node mask to user space */
469static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
470 void *nodes, unsigned nbytes)
471{
472 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
473
474 if (copy > nbytes) {
475 if (copy > PAGE_SIZE)
476 return -EINVAL;
477 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
478 return -EFAULT;
479 copy = nbytes;
480 }
481 return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
482}
483
484/* Retrieve NUMA policy */
485asmlinkage long sys_get_mempolicy(int __user *policy,
486 unsigned long __user *nmask,
487 unsigned long maxnode,
488 unsigned long addr, unsigned long flags)
489{
490 int err, pval;
491 struct mm_struct *mm = current->mm;
492 struct vm_area_struct *vma = NULL;
493 struct mempolicy *pol = current->mempolicy;
494
495 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
496 return -EINVAL;
497 if (nmask != NULL && maxnode < MAX_NUMNODES)
498 return -EINVAL;
499 if (flags & MPOL_F_ADDR) {
500 down_read(&mm->mmap_sem);
501 vma = find_vma_intersection(mm, addr, addr+1);
502 if (!vma) {
503 up_read(&mm->mmap_sem);
504 return -EFAULT;
505 }
506 if (vma->vm_ops && vma->vm_ops->get_policy)
507 pol = vma->vm_ops->get_policy(vma, addr);
508 else
509 pol = vma->vm_policy;
510 } else if (addr)
511 return -EINVAL;
512
513 if (!pol)
514 pol = &default_policy;
515
516 if (flags & MPOL_F_NODE) {
517 if (flags & MPOL_F_ADDR) {
518 err = lookup_node(mm, addr);
519 if (err < 0)
520 goto out;
521 pval = err;
522 } else if (pol == current->mempolicy &&
523 pol->policy == MPOL_INTERLEAVE) {
524 pval = current->il_next;
525 } else {
526 err = -EINVAL;
527 goto out;
528 }
529 } else
530 pval = pol->policy;
531
532 if (vma) {
533 up_read(&current->mm->mmap_sem);
534 vma = NULL;
535 }
536
537 if (policy && put_user(pval, policy))
538 return -EFAULT;
539
540 err = 0;
541 if (nmask) {
542 DECLARE_BITMAP(nodes, MAX_NUMNODES);
543 get_zonemask(pol, nodes);
544 err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
545 }
546
547 out:
548 if (vma)
549 up_read(&current->mm->mmap_sem);
550 return err;
551}
552
553#ifdef CONFIG_COMPAT
554
555asmlinkage long compat_sys_get_mempolicy(int __user *policy,
556 compat_ulong_t __user *nmask,
557 compat_ulong_t maxnode,
558 compat_ulong_t addr, compat_ulong_t flags)
559{
560 long err;
561 unsigned long __user *nm = NULL;
562 unsigned long nr_bits, alloc_size;
563 DECLARE_BITMAP(bm, MAX_NUMNODES);
564
565 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
566 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
567
568 if (nmask)
569 nm = compat_alloc_user_space(alloc_size);
570
571 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
572
573 if (!err && nmask) {
574 err = copy_from_user(bm, nm, alloc_size);
575 /* ensure entire bitmap is zeroed */
576 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
577 err |= compat_put_bitmap(nmask, bm, nr_bits);
578 }
579
580 return err;
581}
582
583asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
584 compat_ulong_t maxnode)
585{
586 long err = 0;
587 unsigned long __user *nm = NULL;
588 unsigned long nr_bits, alloc_size;
589 DECLARE_BITMAP(bm, MAX_NUMNODES);
590
591 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
592 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
593
594 if (nmask) {
595 err = compat_get_bitmap(bm, nmask, nr_bits);
596 nm = compat_alloc_user_space(alloc_size);
597 err |= copy_to_user(nm, bm, alloc_size);
598 }
599
600 if (err)
601 return -EFAULT;
602
603 return sys_set_mempolicy(mode, nm, nr_bits+1);
604}
605
606asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
607 compat_ulong_t mode, compat_ulong_t __user *nmask,
608 compat_ulong_t maxnode, compat_ulong_t flags)
609{
610 long err = 0;
611 unsigned long __user *nm = NULL;
612 unsigned long nr_bits, alloc_size;
613 DECLARE_BITMAP(bm, MAX_NUMNODES);
614
615 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
616 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
617
618 if (nmask) {
619 err = compat_get_bitmap(bm, nmask, nr_bits);
620 nm = compat_alloc_user_space(alloc_size);
621 err |= copy_to_user(nm, bm, alloc_size);
622 }
623
624 if (err)
625 return -EFAULT;
626
627 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
628}
629
630#endif
631
632/* Return effective policy for a VMA */
633static struct mempolicy *
634get_vma_policy(struct vm_area_struct *vma, unsigned long addr)
635{
636 struct mempolicy *pol = current->mempolicy;
637
638 if (vma) {
639 if (vma->vm_ops && vma->vm_ops->get_policy)
640 pol = vma->vm_ops->get_policy(vma, addr);
641 else if (vma->vm_policy &&
642 vma->vm_policy->policy != MPOL_DEFAULT)
643 pol = vma->vm_policy;
644 }
645 if (!pol)
646 pol = &default_policy;
647 return pol;
648}
649
650/* Return a zonelist representing a mempolicy */
651static struct zonelist *zonelist_policy(unsigned int __nocast gfp, struct mempolicy *policy)
652{
653 int nd;
654
655 switch (policy->policy) {
656 case MPOL_PREFERRED:
657 nd = policy->v.preferred_node;
658 if (nd < 0)
659 nd = numa_node_id();
660 break;
661 case MPOL_BIND:
662 /* Lower zones don't get a policy applied */
663 /* Careful: current->mems_allowed might have moved */
Al Viro01424962005-04-24 12:28:34 -0700664 if ((gfp & GFP_ZONEMASK) >= policy_zone)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700665 if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
666 return policy->v.zonelist;
667 /*FALL THROUGH*/
668 case MPOL_INTERLEAVE: /* should not happen */
669 case MPOL_DEFAULT:
670 nd = numa_node_id();
671 break;
672 default:
673 nd = 0;
674 BUG();
675 }
676 return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK);
677}
678
679/* Do dynamic interleaving for a process */
680static unsigned interleave_nodes(struct mempolicy *policy)
681{
682 unsigned nid, next;
683 struct task_struct *me = current;
684
685 nid = me->il_next;
686 BUG_ON(nid >= MAX_NUMNODES);
687 next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
688 if (next >= MAX_NUMNODES)
689 next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
690 me->il_next = next;
691 return nid;
692}
693
694/* Do static interleaving for a VMA with known offset. */
695static unsigned offset_il_node(struct mempolicy *pol,
696 struct vm_area_struct *vma, unsigned long off)
697{
698 unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
699 unsigned target = (unsigned)off % nnodes;
700 int c;
701 int nid = -1;
702
703 c = 0;
704 do {
705 nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
706 c++;
707 } while (c <= target);
708 BUG_ON(nid >= MAX_NUMNODES);
709 BUG_ON(!test_bit(nid, pol->v.nodes));
710 return nid;
711}
712
713/* Allocate a page in interleaved policy.
714 Own path because it needs to do special accounting. */
715static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned order, unsigned nid)
716{
717 struct zonelist *zl;
718 struct page *page;
719
720 BUG_ON(!node_online(nid));
721 zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
722 page = __alloc_pages(gfp, order, zl);
723 if (page && page_zone(page) == zl->zones[0]) {
724 zl->zones[0]->pageset[get_cpu()].interleave_hit++;
725 put_cpu();
726 }
727 return page;
728}
729
730/**
731 * alloc_page_vma - Allocate a page for a VMA.
732 *
733 * @gfp:
734 * %GFP_USER user allocation.
735 * %GFP_KERNEL kernel allocations,
736 * %GFP_HIGHMEM highmem/user allocations,
737 * %GFP_FS allocation should not call back into a file system.
738 * %GFP_ATOMIC don't sleep.
739 *
740 * @vma: Pointer to VMA or NULL if not available.
741 * @addr: Virtual Address of the allocation. Must be inside the VMA.
742 *
743 * This function allocates a page from the kernel page pool and applies
744 * a NUMA policy associated with the VMA or the current process.
745 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
746 * mm_struct of the VMA to prevent it from going away. Should be used for
747 * all allocations for pages that will be mapped into
748 * user space. Returns NULL when no page can be allocated.
749 *
750 * Should be called with the mm_sem of the vma hold.
751 */
752struct page *
753alloc_page_vma(unsigned int __nocast gfp, struct vm_area_struct *vma, unsigned long addr)
754{
755 struct mempolicy *pol = get_vma_policy(vma, addr);
756
757 cpuset_update_current_mems_allowed();
758
759 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
760 unsigned nid;
761 if (vma) {
762 unsigned long off;
763 BUG_ON(addr >= vma->vm_end);
764 BUG_ON(addr < vma->vm_start);
765 off = vma->vm_pgoff;
766 off += (addr - vma->vm_start) >> PAGE_SHIFT;
767 nid = offset_il_node(pol, vma, off);
768 } else {
769 /* fall back to process interleaving */
770 nid = interleave_nodes(pol);
771 }
772 return alloc_page_interleave(gfp, 0, nid);
773 }
774 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
775}
776
777/**
778 * alloc_pages_current - Allocate pages.
779 *
780 * @gfp:
781 * %GFP_USER user allocation,
782 * %GFP_KERNEL kernel allocation,
783 * %GFP_HIGHMEM highmem allocation,
784 * %GFP_FS don't call back into a file system.
785 * %GFP_ATOMIC don't sleep.
786 * @order: Power of two of allocation size in pages. 0 is a single page.
787 *
788 * Allocate a page from the kernel page pool. When not in
789 * interrupt context and apply the current process NUMA policy.
790 * Returns NULL when no page can be allocated.
791 *
792 * Don't call cpuset_update_current_mems_allowed() unless
793 * 1) it's ok to take cpuset_sem (can WAIT), and
794 * 2) allocating for current task (not interrupt).
795 */
796struct page *alloc_pages_current(unsigned int __nocast gfp, unsigned order)
797{
798 struct mempolicy *pol = current->mempolicy;
799
800 if ((gfp & __GFP_WAIT) && !in_interrupt())
801 cpuset_update_current_mems_allowed();
802 if (!pol || in_interrupt())
803 pol = &default_policy;
804 if (pol->policy == MPOL_INTERLEAVE)
805 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
806 return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
807}
808EXPORT_SYMBOL(alloc_pages_current);
809
810/* Slow path of a mempolicy copy */
811struct mempolicy *__mpol_copy(struct mempolicy *old)
812{
813 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
814
815 if (!new)
816 return ERR_PTR(-ENOMEM);
817 *new = *old;
818 atomic_set(&new->refcnt, 1);
819 if (new->policy == MPOL_BIND) {
820 int sz = ksize(old->v.zonelist);
821 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
822 if (!new->v.zonelist) {
823 kmem_cache_free(policy_cache, new);
824 return ERR_PTR(-ENOMEM);
825 }
826 memcpy(new->v.zonelist, old->v.zonelist, sz);
827 }
828 return new;
829}
830
831/* Slow path of a mempolicy comparison */
832int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
833{
834 if (!a || !b)
835 return 0;
836 if (a->policy != b->policy)
837 return 0;
838 switch (a->policy) {
839 case MPOL_DEFAULT:
840 return 1;
841 case MPOL_INTERLEAVE:
842 return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
843 case MPOL_PREFERRED:
844 return a->v.preferred_node == b->v.preferred_node;
845 case MPOL_BIND: {
846 int i;
847 for (i = 0; a->v.zonelist->zones[i]; i++)
848 if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
849 return 0;
850 return b->v.zonelist->zones[i] == NULL;
851 }
852 default:
853 BUG();
854 return 0;
855 }
856}
857
858/* Slow path of a mpol destructor. */
859void __mpol_free(struct mempolicy *p)
860{
861 if (!atomic_dec_and_test(&p->refcnt))
862 return;
863 if (p->policy == MPOL_BIND)
864 kfree(p->v.zonelist);
865 p->policy = MPOL_DEFAULT;
866 kmem_cache_free(policy_cache, p);
867}
868
869/*
870 * Hugetlb policy. Same as above, just works with node numbers instead of
871 * zonelists.
872 */
873
874/* Find first node suitable for an allocation */
875int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
876{
877 struct mempolicy *pol = get_vma_policy(vma, addr);
878
879 switch (pol->policy) {
880 case MPOL_DEFAULT:
881 return numa_node_id();
882 case MPOL_BIND:
883 return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
884 case MPOL_INTERLEAVE:
885 return interleave_nodes(pol);
886 case MPOL_PREFERRED:
887 return pol->v.preferred_node >= 0 ?
888 pol->v.preferred_node : numa_node_id();
889 }
890 BUG();
891 return 0;
892}
893
894/* Find secondary valid nodes for an allocation */
895int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
896{
897 struct mempolicy *pol = get_vma_policy(vma, addr);
898
899 switch (pol->policy) {
900 case MPOL_PREFERRED:
901 case MPOL_DEFAULT:
902 case MPOL_INTERLEAVE:
903 return 1;
904 case MPOL_BIND: {
905 struct zone **z;
906 for (z = pol->v.zonelist->zones; *z; z++)
907 if ((*z)->zone_pgdat->node_id == nid)
908 return 1;
909 return 0;
910 }
911 default:
912 BUG();
913 return 0;
914 }
915}
916
917/*
918 * Shared memory backing store policy support.
919 *
920 * Remember policies even when nobody has shared memory mapped.
921 * The policies are kept in Red-Black tree linked from the inode.
922 * They are protected by the sp->lock spinlock, which should be held
923 * for any accesses to the tree.
924 */
925
926/* lookup first element intersecting start-end */
927/* Caller holds sp->lock */
928static struct sp_node *
929sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
930{
931 struct rb_node *n = sp->root.rb_node;
932
933 while (n) {
934 struct sp_node *p = rb_entry(n, struct sp_node, nd);
935
936 if (start >= p->end)
937 n = n->rb_right;
938 else if (end <= p->start)
939 n = n->rb_left;
940 else
941 break;
942 }
943 if (!n)
944 return NULL;
945 for (;;) {
946 struct sp_node *w = NULL;
947 struct rb_node *prev = rb_prev(n);
948 if (!prev)
949 break;
950 w = rb_entry(prev, struct sp_node, nd);
951 if (w->end <= start)
952 break;
953 n = prev;
954 }
955 return rb_entry(n, struct sp_node, nd);
956}
957
958/* Insert a new shared policy into the list. */
959/* Caller holds sp->lock */
960static void sp_insert(struct shared_policy *sp, struct sp_node *new)
961{
962 struct rb_node **p = &sp->root.rb_node;
963 struct rb_node *parent = NULL;
964 struct sp_node *nd;
965
966 while (*p) {
967 parent = *p;
968 nd = rb_entry(parent, struct sp_node, nd);
969 if (new->start < nd->start)
970 p = &(*p)->rb_left;
971 else if (new->end > nd->end)
972 p = &(*p)->rb_right;
973 else
974 BUG();
975 }
976 rb_link_node(&new->nd, parent, p);
977 rb_insert_color(&new->nd, &sp->root);
978 PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
979 new->policy ? new->policy->policy : 0);
980}
981
982/* Find shared policy intersecting idx */
983struct mempolicy *
984mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
985{
986 struct mempolicy *pol = NULL;
987 struct sp_node *sn;
988
989 if (!sp->root.rb_node)
990 return NULL;
991 spin_lock(&sp->lock);
992 sn = sp_lookup(sp, idx, idx+1);
993 if (sn) {
994 mpol_get(sn->policy);
995 pol = sn->policy;
996 }
997 spin_unlock(&sp->lock);
998 return pol;
999}
1000
1001static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1002{
1003 PDprintk("deleting %lx-l%x\n", n->start, n->end);
1004 rb_erase(&n->nd, &sp->root);
1005 mpol_free(n->policy);
1006 kmem_cache_free(sn_cache, n);
1007}
1008
1009struct sp_node *
1010sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1011{
1012 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1013
1014 if (!n)
1015 return NULL;
1016 n->start = start;
1017 n->end = end;
1018 mpol_get(pol);
1019 n->policy = pol;
1020 return n;
1021}
1022
1023/* Replace a policy range. */
1024static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1025 unsigned long end, struct sp_node *new)
1026{
1027 struct sp_node *n, *new2 = NULL;
1028
1029restart:
1030 spin_lock(&sp->lock);
1031 n = sp_lookup(sp, start, end);
1032 /* Take care of old policies in the same range. */
1033 while (n && n->start < end) {
1034 struct rb_node *next = rb_next(&n->nd);
1035 if (n->start >= start) {
1036 if (n->end <= end)
1037 sp_delete(sp, n);
1038 else
1039 n->start = end;
1040 } else {
1041 /* Old policy spanning whole new range. */
1042 if (n->end > end) {
1043 if (!new2) {
1044 spin_unlock(&sp->lock);
1045 new2 = sp_alloc(end, n->end, n->policy);
1046 if (!new2)
1047 return -ENOMEM;
1048 goto restart;
1049 }
1050 n->end = start;
1051 sp_insert(sp, new2);
1052 new2 = NULL;
1053 break;
1054 } else
1055 n->end = start;
1056 }
1057 if (!next)
1058 break;
1059 n = rb_entry(next, struct sp_node, nd);
1060 }
1061 if (new)
1062 sp_insert(sp, new);
1063 spin_unlock(&sp->lock);
1064 if (new2) {
1065 mpol_free(new2->policy);
1066 kmem_cache_free(sn_cache, new2);
1067 }
1068 return 0;
1069}
1070
1071int mpol_set_shared_policy(struct shared_policy *info,
1072 struct vm_area_struct *vma, struct mempolicy *npol)
1073{
1074 int err;
1075 struct sp_node *new = NULL;
1076 unsigned long sz = vma_pages(vma);
1077
1078 PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1079 vma->vm_pgoff,
1080 sz, npol? npol->policy : -1,
1081 npol ? npol->v.nodes[0] : -1);
1082
1083 if (npol) {
1084 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1085 if (!new)
1086 return -ENOMEM;
1087 }
1088 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1089 if (err && new)
1090 kmem_cache_free(sn_cache, new);
1091 return err;
1092}
1093
1094/* Free a backing policy store on inode delete. */
1095void mpol_free_shared_policy(struct shared_policy *p)
1096{
1097 struct sp_node *n;
1098 struct rb_node *next;
1099
1100 if (!p->root.rb_node)
1101 return;
1102 spin_lock(&p->lock);
1103 next = rb_first(&p->root);
1104 while (next) {
1105 n = rb_entry(next, struct sp_node, nd);
1106 next = rb_next(&n->nd);
1107 mpol_free(n->policy);
1108 kmem_cache_free(sn_cache, n);
1109 }
1110 spin_unlock(&p->lock);
1111 p->root = RB_ROOT;
1112}
1113
1114/* assumes fs == KERNEL_DS */
1115void __init numa_policy_init(void)
1116{
1117 policy_cache = kmem_cache_create("numa_policy",
1118 sizeof(struct mempolicy),
1119 0, SLAB_PANIC, NULL, NULL);
1120
1121 sn_cache = kmem_cache_create("shared_policy_node",
1122 sizeof(struct sp_node),
1123 0, SLAB_PANIC, NULL, NULL);
1124
1125 /* Set interleaving policy for system init. This way not all
1126 the data structures allocated at system boot end up in node zero. */
1127
1128 if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
1129 MAX_NUMNODES) < 0)
1130 printk("numa_policy_init: interleaving failed\n");
1131}
1132
1133/* Reset policy of current process to default.
1134 * Assumes fs == KERNEL_DS */
1135void numa_default_policy(void)
1136{
1137 sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
1138}