/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave   Allocate memory interleaved over a set of nodes,
 *              with normal fallback if it fails.
 *              For VMA based allocations this interleaves based on the
 *              offset into the backing object or offset into the mapping
 *              for anonymous memory. For process policy a process counter
 *              is used.
 *
 * bind         Only allocate memory on a specific set of nodes,
 *              no fallback.
 *              FIXME: memory is allocated starting with the first node
 *              to the last. It would be better if bind would truly restrict
 *              the allocation to memory nodes instead.
 *
 * preferred    Try a specific node first before normal fallback.
 *              As a special case node -1 here means do the allocation
 *              on the local CPU. This is normally identical to default,
 *              but useful to set in a VMA when you have a non default
 *              process policy.
 *
 * default      Allocate on the local node first, or when on a VMA
 *              use the process policy. This is what Linux always did
 *              in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
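
/*
 * Example (an illustrative sketch, not code from this file): from user
 * space these policies are normally requested through the mbind() and
 * set_mempolicy() wrappers in libnuma's <numaif.h>.  Assuming a two node
 * system, interleaving a process over nodes 0 and 1 might look like:
 *
 *	#include <numaif.h>
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *	if (set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask)) < 0)
 *		perror("set_mempolicy");
 *
 * The maxnode argument is a bit count; get_nodes() below shows how the
 * kernel parses the user supplied mask.
 */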

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
   could replace all the switch()es with a mempolicy_ops structure.
*/

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>

static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;

#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
static int policy_zone;

struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.policy = MPOL_DEFAULT,
};

/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, nodemask_t *nodes)
{
	int empty = nodes_empty(*nodes);

	switch (mode) {
	case MPOL_DEFAULT:
		if (!empty)
			return -EINVAL;
		break;
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
		/* Preferred will only use the first bit, but allow
		   more for now. */
		if (empty)
			return -EINVAL;
		break;
	}
	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}
/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
	struct zonelist *zl;
	int num, max, nd;

	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
	if (!zl)
		return NULL;
	num = 0;
	for_each_node_mask(nd, *nodes) {
		int k;
		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
			struct zone *z = &NODE_DATA(nd)->node_zones[k];
			if (!z->present_pages)
				continue;
			zl->zones[num++] = z;
			if (k > policy_zone)
				policy_zone = k;
		}
	}
	zl->zones[num] = NULL;
	return zl;
}

/* Create a new policy */
static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
{
	struct mempolicy *policy;

	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
	if (mode == MPOL_DEFAULT)
		return NULL;
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	switch (mode) {
	case MPOL_INTERLEAVE:
		policy->v.nodes = *nodes;
		break;
	case MPOL_PREFERRED:
		policy->v.preferred_node = first_node(*nodes);
		if (policy->v.preferred_node >= MAX_NUMNODES)
			policy->v.preferred_node = -1;
		break;
	case MPOL_BIND:
		policy->v.zonelist = bind_zonelist(nodes);
		if (policy->v.zonelist == NULL) {
			kmem_cache_free(policy_cache, policy);
			return ERR_PTR(-ENOMEM);
		}
		break;
	}
	policy->policy = mode;
	return policy;
}

/* Ensure all existing pages follow the policy. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
{
	pte_t *orig_pte;
	pte_t *pte;
	spinlock_t *ptl;

	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		struct page *page;
		unsigned int nid;

		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		nid = page_to_nid(page);
		if (!node_isset(nid, *nodes))
			break;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(orig_pte, ptl);
	return addr != end;
}

static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		if (check_pte_range(vma, pmd, addr, next, nodes))
			return -EIO;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		if (check_pmd_range(vma, pud, addr, next, nodes))
			return -EIO;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int check_pgd_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
{
	pgd_t *pgd;
	unsigned long next;

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		if (check_pud_range(vma, pgd, addr, next, nodes))
			return -EIO;
	} while (pgd++, addr = next, addr != end);
	return 0;
}

/* Step 1: check the range */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
	    nodemask_t *nodes, unsigned long flags)
{
	int err;
	struct vm_area_struct *first, *vma, *prev;

	first = find_vma(mm, start);
	if (!first)
		return ERR_PTR(-EFAULT);
	prev = NULL;
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		if (!vma->vm_next && vma->vm_end < end)
			return ERR_PTR(-EFAULT);
		if (prev && prev->vm_end < vma->vm_start)
			return ERR_PTR(-EFAULT);
		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
			unsigned long endvma = vma->vm_end;
			if (endvma > end)
				endvma = end;
			if (vma->vm_start > start)
				start = vma->vm_start;
			err = check_pgd_range(vma, start, endvma, nodes);
			if (err) {
				first = ERR_PTR(err);
				break;
			}
		}
		prev = vma;
	}
	return first;
}

/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
	int err = 0;
	struct mempolicy *old = vma->vm_policy;

	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	if (vma->vm_ops && vma->vm_ops->set_policy)
		err = vma->vm_ops->set_policy(vma, new);
	if (!err) {
		mpol_get(new);
		vma->vm_policy = new;
		mpol_free(old);
	}
	return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, struct mempolicy *new)
{
	struct vm_area_struct *next;
	int err;

	err = 0;
	for (; vma && vma->vm_start < end; vma = next) {
		next = vma->vm_next;
		if (vma->vm_start < start)
			err = split_vma(vma->vm_mm, vma, start, 1);
		if (!err && vma->vm_end > end)
			err = split_vma(vma->vm_mm, vma, end, 0);
		if (!err)
			err = policy_vma(vma, new);
		if (err)
			break;
	}
	return err;
}

static int contextualize_policy(int mode, nodemask_t *nodes)
{
	if (!nodes)
		return 0;

	/* Update current mems_allowed */
	cpuset_update_current_mems_allowed();
	/* Ignore nodes not set in current->mems_allowed */
	cpuset_restrict_to_mems_allowed(nodes->bits);
	return mpol_check_policy(mode, nodes);
}

long do_mbind(unsigned long start, unsigned long len,
	      unsigned long mode, nodemask_t *nmask, unsigned long flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int err;

	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
		return -EINVAL;
	if (start & ~PAGE_MASK)
		return -EINVAL;
	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;
	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;
	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	if (mpol_check_policy(mode, nmask))
		return -EINVAL;
	new = mpol_new(mode, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
		 mode, nodes_addr(*nmask)[0]);

	down_write(&mm->mmap_sem);
	vma = check_range(mm, start, end, nmask, flags);
	err = PTR_ERR(vma);
	if (!IS_ERR(vma))
		err = mbind_range(vma, start, end, new);
	up_write(&mm->mmap_sem);
	mpol_free(new);
	return err;
}

/* Set the process memory policy */
long do_set_mempolicy(int mode, nodemask_t *nodes)
{
	struct mempolicy *new;

	if (contextualize_policy(mode, nodes))
		return -EINVAL;
	new = mpol_new(mode, nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);
	mpol_free(current->mempolicy);
	current->mempolicy = new;
	if (new && new->policy == MPOL_INTERLEAVE)
		current->il_next = first_node(new->v.nodes);
	return 0;
}

/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
{
	int i;

	nodes_clear(*nodes);
	switch (p->policy) {
	case MPOL_BIND:
		for (i = 0; p->v.zonelist->zones[i]; i++)
			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
				 *nodes);
		break;
	case MPOL_DEFAULT:
		break;
	case MPOL_INTERLEAVE:
		*nodes = p->v.nodes;
		break;
	case MPOL_PREFERRED:
		/* or use current node instead of online map? */
		if (p->v.preferred_node < 0)
			*nodes = node_online_map;
		else
			node_set(p->v.preferred_node, *nodes);
		break;
	default:
		BUG();
	}
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}

/* Retrieve NUMA policy */
long do_get_mempolicy(int *policy, nodemask_t *nmask,
		      unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	cpuset_update_current_mems_allowed();
	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
		return -EINVAL;
	if (flags & MPOL_F_ADDR) {
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->policy == MPOL_INTERLEAVE) {
			*policy = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else
		*policy = pol->policy;

	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	err = 0;
	if (nmask)
		get_zonemask(pol, nmask);

 out:
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}

/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
		     unsigned long maxnode)
{
	unsigned long k;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/* When the user specified more nodes than supported just check
	   if the non supported part is all zero. */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		if (nlongs > PAGE_SIZE/sizeof(long))
			return -EINVAL;
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			unsigned long t;
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes_addr(*nodes)[nlongs-1] &= endmask;
	return 0;
}

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}

asmlinkage long sys_mbind(unsigned long start, unsigned long len,
			unsigned long mode,
			unsigned long __user *nmask, unsigned long maxnode,
			unsigned flags)
{
	nodemask_t nodes;
	int err;

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_mbind(start, len, mode, &nodes, flags);
}

/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
		unsigned long maxnode)
{
	int err;
	nodemask_t nodes;

	if (mode < 0 || mode > MPOL_MAX)
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_set_mempolicy(mode, &nodes);
}

/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
				unsigned long __user *nmask,
				unsigned long maxnode,
				unsigned long addr, unsigned long flags)
{
	int err, pval;
	nodemask_t nodes;

	if (nmask != NULL && maxnode < MAX_NUMNODES)
		return -EINVAL;

	err = do_get_mempolicy(&pval, &nodes, addr, flags);

	if (err)
		return err;

	if (policy && put_user(pval, policy))
		return -EFAULT;

	if (nmask)
		err = copy_nodes_to_user(nmask, maxnode, &nodes);

	return err;
}

#ifdef CONFIG_COMPAT

asmlinkage long compat_sys_get_mempolicy(int __user *policy,
				     compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode,
				     compat_ulong_t addr, compat_ulong_t flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		err = copy_from_user(bm, nm, alloc_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}

asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_set_mempolicy(mode, nm, nr_bits+1);
}

asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
			     compat_ulong_t mode, compat_ulong_t __user *nmask,
			     compat_ulong_t maxnode, compat_ulong_t flags)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	nodemask_t bm;

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif

/* Return effective policy for a VMA */
struct mempolicy *
get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = task->mempolicy;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else if (vma->vm_policy &&
				vma->vm_policy->policy != MPOL_DEFAULT)
			pol = vma->vm_policy;
	}
	if (!pol)
		pol = &default_policy;
	return pol;
}

/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
{
	int nd;

	switch (policy->policy) {
	case MPOL_PREFERRED:
		nd = policy->v.preferred_node;
		if (nd < 0)
			nd = numa_node_id();
		break;
	case MPOL_BIND:
		/* Lower zones don't get a policy applied */
		/* Careful: current->mems_allowed might have moved */
		if (gfp_zone(gfp) >= policy_zone)
			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
				return policy->v.zonelist;
		/*FALL THROUGH*/
	case MPOL_INTERLEAVE: /* should not happen */
	case MPOL_DEFAULT:
		nd = numa_node_id();
		break;
	default:
		nd = 0;
		BUG();
	}
	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
}

/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
	unsigned nid, next;
	struct task_struct *me = current;

	nid = me->il_next;
	next = next_node(nid, policy->v.nodes);
	if (next >= MAX_NUMNODES)
		next = first_node(policy->v.nodes);
	me->il_next = next;
	return nid;
}

/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
		struct vm_area_struct *vma, unsigned long off)
{
	unsigned nnodes = nodes_weight(pol->v.nodes);
	unsigned target = (unsigned)off % nnodes;
	int c;
	int nid = -1;

	c = 0;
	do {
		nid = next_node(nid, pol->v.nodes);
		c++;
	} while (c <= target);
	return nid;
}

/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
					  unsigned nid)
{
	struct zonelist *zl;
	struct page *page;

	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
	page = __alloc_pages(gfp, order, zl);
	if (page && page_zone(page) == zl->zones[0]) {
		zone_pcp(zl->zones[0], get_cpu())->interleave_hit++;
		put_cpu();
	}
	return page;
}

/**
 * alloc_page_vma - Allocate a page for a VMA.
 *
 * @gfp:
 *	%GFP_USER    user allocation.
 *	%GFP_KERNEL  kernel allocations,
 *	%GFP_HIGHMEM highmem/user allocations,
 *	%GFP_FS      allocation should not call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 *
 * @vma:  Pointer to VMA or NULL if not available.
 * @addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
 * mm_struct of the VMA to prevent it from going away. Should be used for
 * all allocations for pages that will be mapped into
 * user space. Returns NULL when no page can be allocated.
 *
 * Should be called with the mm_sem of the vma held.
 */
struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	cpuset_update_current_mems_allowed();

	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
		unsigned nid;
		if (vma) {
			unsigned long off;
			off = vma->vm_pgoff;
			off += (addr - vma->vm_start) >> PAGE_SHIFT;
			nid = offset_il_node(pol, vma, off);
		} else {
			/* fall back to process interleaving */
			nid = interleave_nodes(pol);
		}
		return alloc_page_interleave(gfp, 0, nid);
	}
	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}

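/*
 * Illustrative use from an anonymous fault path (a sketch, assuming the
 * caller already holds the mmap_sem for read as required above):
 *
 *	struct page *page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 *	if (!page)
 *		return VM_FAULT_OOM;
 */
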
/**
 * alloc_pages_current - Allocate pages.
 *
 * @gfp:
 *	%GFP_USER    user allocation,
 *	%GFP_KERNEL  kernel allocation,
 *	%GFP_HIGHMEM highmem allocation,
 *	%GFP_FS      don't call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 * @order: Power of two of allocation size in pages. 0 is a single page.
 *
 * Allocate a page from the kernel page pool. When not in interrupt
 * context, apply the current process NUMA policy.
 * Returns NULL when no page can be allocated.
 *
 * Don't call cpuset_update_current_mems_allowed() unless
 * 1) it's ok to take cpuset_sem (can WAIT), and
 * 2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol = current->mempolicy;

	if ((gfp & __GFP_WAIT) && !in_interrupt())
		cpuset_update_current_mems_allowed();
	if (!pol || in_interrupt())
		pol = &default_policy;
	if (pol->policy == MPOL_INTERLEAVE)
		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);

/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);
	*new = *old;
	atomic_set(&new->refcnt, 1);
	if (new->policy == MPOL_BIND) {
		int sz = ksize(old->v.zonelist);
		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
		if (!new->v.zonelist) {
			kmem_cache_free(policy_cache, new);
			return ERR_PTR(-ENOMEM);
		}
		memcpy(new->v.zonelist, old->v.zonelist, sz);
	}
	return new;
}

/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	if (!a || !b)
		return 0;
	if (a->policy != b->policy)
		return 0;
	switch (a->policy) {
	case MPOL_DEFAULT:
		return 1;
	case MPOL_INTERLEAVE:
		return nodes_equal(a->v.nodes, b->v.nodes);
	case MPOL_PREFERRED:
		return a->v.preferred_node == b->v.preferred_node;
	case MPOL_BIND: {
		int i;
		for (i = 0; a->v.zonelist->zones[i]; i++)
			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
				return 0;
		return b->v.zonelist->zones[i] == NULL;
	}
	default:
		BUG();
		return 0;
	}
}

/* Slow path of a mpol destructor. */
void __mpol_free(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	if (p->policy == MPOL_BIND)
		kfree(p->v.zonelist);
	p->policy = MPOL_DEFAULT;
	kmem_cache_free(policy_cache, p);
}

/*
 * Hugetlb policy. Same as above, just works with node numbers instead of
 * zonelists.
 */

/* Find first node suitable for an allocation */
int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	switch (pol->policy) {
	case MPOL_DEFAULT:
		return numa_node_id();
	case MPOL_BIND:
		return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
	case MPOL_INTERLEAVE:
		return interleave_nodes(pol);
	case MPOL_PREFERRED:
		return pol->v.preferred_node >= 0 ?
				pol->v.preferred_node : numa_node_id();
	}
	BUG();
	return 0;
}

/* Find secondary valid nodes for an allocation */
int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	switch (pol->policy) {
	case MPOL_PREFERRED:
	case MPOL_DEFAULT:
	case MPOL_INTERLEAVE:
		return 1;
	case MPOL_BIND: {
		struct zone **z;
		for (z = pol->v.zonelist->zones; *z; z++)
			if ((*z)->zone_pgdat->node_id == nid)
				return 1;
		return 0;
	}
	default:
		BUG();
		return 0;
	}
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */

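/*
 * A filesystem that wants per-file policies hooks in here through its
 * vm_operations.  Sketch modelled on tmpfs; the foo_* name and FOO_I()
 * helper are illustrative, not defined in this file:
 *
 *	static int foo_set_policy(struct vm_area_struct *vma,
 *				  struct mempolicy *new)
 *	{
 *		struct inode *inode = vma->vm_file->f_dentry->d_inode;
 *		return mpol_set_shared_policy(&FOO_I(inode)->policy, vma, new);
 *	}
 */
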
/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
	struct rb_node *n = sp->root.rb_node;

	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);

		if (start >= p->end)
			n = n->rb_right;
		else if (end <= p->start)
			n = n->rb_left;
		else
			break;
	}
	if (!n)
		return NULL;
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}

/* Insert a new shared policy into the list. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;
	struct sp_node *nd;

	while (*p) {
		parent = *p;
		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
			p = &(*p)->rb_left;
		else if (new->end > nd->end)
			p = &(*p)->rb_right;
		else
			BUG();
	}
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
		 new->policy ? new->policy->policy : 0);
}

/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
	struct mempolicy *pol = NULL;
	struct sp_node *sn;

	if (!sp->root.rb_node)
		return NULL;
	spin_lock(&sp->lock);
	sn = sp_lookup(sp, idx, idx+1);
	if (sn) {
		mpol_get(sn->policy);
		pol = sn->policy;
	}
	spin_unlock(&sp->lock);
	return pol;
}

static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	PDprintk("deleting %lx-%lx\n", n->start, n->end);
	rb_erase(&n->nd, &sp->root);
	mpol_free(n->policy);
	kmem_cache_free(sn_cache, n);
}

struct sp_node *
sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
{
	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

	if (!n)
		return NULL;
	n->start = start;
	n->end = end;
	mpol_get(pol);
	n->policy = pol;
	return n;
}

/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				 unsigned long end, struct sp_node *new)
{
	struct sp_node *n, *new2 = NULL;

restart:
	spin_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				if (!new2) {
					spin_unlock(&sp->lock);
					new2 = sp_alloc(end, n->end, n->policy);
					if (!new2)
						return -ENOMEM;
					goto restart;
				}
				n->end = start;
				sp_insert(sp, new2);
				new2 = NULL;
				break;
			} else
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	spin_unlock(&sp->lock);
	if (new2) {
		mpol_free(new2->policy);
		kmem_cache_free(sn_cache, new2);
	}
	return 0;
}

int mpol_set_shared_policy(struct shared_policy *info,
			struct vm_area_struct *vma, struct mempolicy *npol)
{
	int err;
	struct sp_node *new = NULL;
	unsigned long sz = vma_pages(vma);

	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
		 vma->vm_pgoff,
		 sz, npol ? npol->policy : -1,
		 npol ? nodes_addr(npol->v.nodes)[0] : -1);

	if (npol) {
		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
		if (!new)
			return -ENOMEM;
	}
	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
	if (err && new)
		kmem_cache_free(sn_cache, new);
	return err;
}

/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
	struct sp_node *n;
	struct rb_node *next;

	if (!p->root.rb_node)
		return;
	spin_lock(&p->lock);
	next = rb_first(&p->root);
	while (next) {
		n = rb_entry(next, struct sp_node, nd);
		next = rb_next(&n->nd);
		rb_erase(&n->nd, &p->root);
		mpol_free(n->policy);
		kmem_cache_free(sn_cache, n);
	}
	spin_unlock(&p->lock);
}

/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL, NULL);

	/* Set interleaving policy for system init. This way not all
	   the data structures allocated at system boot end up in node zero. */

	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
		printk("numa_policy_init: interleaving failed\n");
}

/* Reset policy of current process to default */
void numa_default_policy(void)
{
	do_set_mempolicy(MPOL_DEFAULT, NULL);
}

/* Migrate a policy to a different set of nodes */
static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
			  const nodemask_t *new)
{
	nodemask_t tmp;

	if (!pol)
		return;

	switch (pol->policy) {
	case MPOL_DEFAULT:
		break;
	case MPOL_INTERLEAVE:
		nodes_remap(tmp, pol->v.nodes, *old, *new);
		pol->v.nodes = tmp;
		current->il_next = node_remap(current->il_next, *old, *new);
		break;
	case MPOL_PREFERRED:
		pol->v.preferred_node = node_remap(pol->v.preferred_node,
						   *old, *new);
		break;
	case MPOL_BIND: {
		nodemask_t nodes;
		struct zone **z;
		struct zonelist *zonelist;

		nodes_clear(nodes);
		for (z = pol->v.zonelist->zones; *z; z++)
			node_set((*z)->zone_pgdat->node_id, nodes);
		nodes_remap(tmp, nodes, *old, *new);
		nodes = tmp;

		zonelist = bind_zonelist(&nodes);

		/* If no mem, then zonelist is NULL and we keep old zonelist.
		 * If that old zonelist has no remaining mems_allowed nodes,
		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
		 */

		if (zonelist) {
			/* Good - got mem - substitute new zonelist */
			kfree(pol->v.zonelist);
			pol->v.zonelist = zonelist;
		}
		break;
	}
	default:
		BUG();
		break;
	}
}

/*
 * Someone moved this task to different nodes. Fixup mempolicies.
 *
 * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
 * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
 */
void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
{
	rebind_policy(current->mempolicy, old, new);
}