/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Four policies are supported, per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a per-process
 *                counter is used.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind truly restricted
 *                the allocation to the given memory nodes instead.
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case node -1 here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non-default
 *                process policy.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA-aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
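
/*
 * Userspace usage sketch for the policies described above. This is an
 * illustrative example only (not part of this file); it assumes libnuma's
 * <numaif.h> wrappers for the set_mempolicy()/mbind() system calls, a
 * machine with at least two online nodes, and mostly omits error handling.
 * Build with -lnuma.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		// Nodemask as an array of unsigned longs; bit n == node n.
 *		unsigned long nodes = (1UL << 0) | (1UL << 1);
 *		size_t len = 1UL << 20;
 *		void *p;
 *
 *		// Process policy: interleave new allocations over nodes 0,1.
 *		if (set_mempolicy(MPOL_INTERLEAVE, &nodes, sizeof(nodes) * 8))
 *			perror("set_mempolicy");
 *
 *		// VMA policy: bind this mapping to node 0 only, no fallback.
 *		p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *		nodes = 1UL << 0;
 *		if (mbind(p, len, MPOL_BIND, &nodes, sizeof(nodes) * 8, 0))
 *			perror("mbind");
 *
 *		// Back to the default local-node policy for the process.
 *		set_mempolicy(MPOL_DEFAULT, NULL, 0);
 *		return 0;
 *	}
 */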

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
   could replace all the switch()es with a mempolicy_ops structure.
*/

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/swap.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */

static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;

#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
int policy_zone = ZONE_DMA;

struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.policy = MPOL_DEFAULT,
};

/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, nodemask_t *nodes)
{
	int empty = nodes_empty(*nodes);

	switch (mode) {
	case MPOL_DEFAULT:
		if (!empty)
			return -EINVAL;
		break;
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
		/* Preferred will only use the first bit, but allow
		   more for now. */
		if (empty)
			return -EINVAL;
		break;
	}
	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}
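
/*
 * Userspace view of the checks above (illustrative sketch, not part of
 * this file; assumes libnuma's <numaif.h> syscall wrappers):
 *
 *	#include <numaif.h>
 *	#include <errno.h>
 *
 *	// MPOL_DEFAULT must be passed an empty/omitted nodemask ...
 *	set_mempolicy(MPOL_DEFAULT, NULL, 0);		// succeeds
 *	// ... while MPOL_BIND/MPOL_INTERLEAVE need a non-empty one.
 *	set_mempolicy(MPOL_INTERLEAVE, NULL, 0);	// fails, errno == EINVAL
 *
 * A mask naming an offline node is rejected the same way by the
 * nodes_subset(..., node_online_map) test.
 */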

/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
	struct zonelist *zl;
	int num, max, nd;

	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
	if (!zl)
		return NULL;
	num = 0;
	for_each_node_mask(nd, *nodes)
		zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
	zl->zones[num] = NULL;
	return zl;
}

/* Create a new policy */
static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
{
	struct mempolicy *policy;

	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
	if (mode == MPOL_DEFAULT)
		return NULL;
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	switch (mode) {
	case MPOL_INTERLEAVE:
		policy->v.nodes = *nodes;
		if (nodes_weight(*nodes) == 0) {
			kmem_cache_free(policy_cache, policy);
			return ERR_PTR(-EINVAL);
		}
		break;
	case MPOL_PREFERRED:
		policy->v.preferred_node = first_node(*nodes);
		if (policy->v.preferred_node >= MAX_NUMNODES)
			policy->v.preferred_node = -1;
		break;
	case MPOL_BIND:
		policy->v.zonelist = bind_zonelist(nodes);
		if (policy->v.zonelist == NULL) {
			kmem_cache_free(policy_cache, policy);
			return ERR_PTR(-ENOMEM);
		}
		break;
	}
	policy->policy = mode;
	return policy;
}

Christoph Lameterdc9aa5b2006-01-08 01:00:50 -0800183/* Check if we are the only process mapping the page in question */
184static inline int single_mm_mapping(struct mm_struct *mm,
185 struct address_space *mapping)
186{
187 struct vm_area_struct *vma;
188 struct prio_tree_iter iter;
189 int rc = 1;
190
191 spin_lock(&mapping->i_mmap_lock);
192 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
193 if (mm != vma->vm_mm) {
194 rc = 0;
195 goto out;
196 }
197 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
198 if (mm != vma->vm_mm) {
199 rc = 0;
200 goto out;
201 }
202out:
203 spin_unlock(&mapping->i_mmap_lock);
204 return rc;
205}
206
207/*
208 * Add a page to be migrated to the pagelist
209 */
210static void migrate_page_add(struct vm_area_struct *vma,
211 struct page *page, struct list_head *pagelist, unsigned long flags)
212{
213 /*
214 * Avoid migrating a page that is shared by others and not writable.
215 */
216 if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
217 mapping_writably_mapped(page->mapping) ||
218 single_mm_mapping(vma->vm_mm, page->mapping)) {
219 int rc = isolate_lru_page(page);
220
221 if (rc == 1)
222 list_add(&page->lru, pagelist);
223 /*
224 * If the isolate attempt was not successful then we just
225 * encountered an unswappable page. Something must be wrong.
226 */
227 WARN_ON(rc == 0);
228 }
229}
230
Christoph Lameter38e35862006-01-08 01:01:01 -0800231/* Scan through pages checking if pages follow certain conditions. */
Nick Pigginb5810032005-10-29 18:16:12 -0700232static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
Christoph Lameterdc9aa5b2006-01-08 01:00:50 -0800233 unsigned long addr, unsigned long end,
234 const nodemask_t *nodes, unsigned long flags,
Christoph Lameter38e35862006-01-08 01:01:01 -0800235 void *private)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700236{
Hugh Dickins91612e02005-06-21 17:15:07 -0700237 pte_t *orig_pte;
238 pte_t *pte;
Hugh Dickins705e87c2005-10-29 18:16:27 -0700239 spinlock_t *ptl;
Hugh Dickins941150a2005-06-21 17:15:06 -0700240
Hugh Dickins705e87c2005-10-29 18:16:27 -0700241 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
Hugh Dickins91612e02005-06-21 17:15:07 -0700242 do {
Linus Torvalds6aab3412005-11-28 14:34:23 -0800243 struct page *page;
Hugh Dickins91612e02005-06-21 17:15:07 -0700244 unsigned int nid;
245
246 if (!pte_present(*pte))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700247 continue;
Linus Torvalds6aab3412005-11-28 14:34:23 -0800248 page = vm_normal_page(vma, addr, *pte);
249 if (!page)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700250 continue;
Linus Torvalds6aab3412005-11-28 14:34:23 -0800251 nid = page_to_nid(page);
Christoph Lameter38e35862006-01-08 01:01:01 -0800252 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
253 continue;
254
255 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
256 migrate_page_add(vma, page, private, flags);
257 else
258 break;
Hugh Dickins91612e02005-06-21 17:15:07 -0700259 } while (pte++, addr += PAGE_SIZE, addr != end);
Hugh Dickins705e87c2005-10-29 18:16:27 -0700260 pte_unmap_unlock(orig_pte, ptl);
Hugh Dickins91612e02005-06-21 17:15:07 -0700261 return addr != end;
262}
263
Nick Pigginb5810032005-10-29 18:16:12 -0700264static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
Christoph Lameterdc9aa5b2006-01-08 01:00:50 -0800265 unsigned long addr, unsigned long end,
266 const nodemask_t *nodes, unsigned long flags,
Christoph Lameter38e35862006-01-08 01:01:01 -0800267 void *private)
Hugh Dickins91612e02005-06-21 17:15:07 -0700268{
269 pmd_t *pmd;
270 unsigned long next;
271
272 pmd = pmd_offset(pud, addr);
273 do {
274 next = pmd_addr_end(addr, end);
275 if (pmd_none_or_clear_bad(pmd))
276 continue;
Christoph Lameterdc9aa5b2006-01-08 01:00:50 -0800277 if (check_pte_range(vma, pmd, addr, next, nodes,
Christoph Lameter38e35862006-01-08 01:01:01 -0800278 flags, private))
Hugh Dickins91612e02005-06-21 17:15:07 -0700279 return -EIO;
280 } while (pmd++, addr = next, addr != end);
281 return 0;
282}
283
Nick Pigginb5810032005-10-29 18:16:12 -0700284static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
Christoph Lameterdc9aa5b2006-01-08 01:00:50 -0800285 unsigned long addr, unsigned long end,
286 const nodemask_t *nodes, unsigned long flags,
Christoph Lameter38e35862006-01-08 01:01:01 -0800287 void *private)
Hugh Dickins91612e02005-06-21 17:15:07 -0700288{
289 pud_t *pud;
290 unsigned long next;
291
292 pud = pud_offset(pgd, addr);
293 do {
294 next = pud_addr_end(addr, end);
295 if (pud_none_or_clear_bad(pud))
296 continue;
Christoph Lameterdc9aa5b2006-01-08 01:00:50 -0800297 if (check_pmd_range(vma, pud, addr, next, nodes,
Christoph Lameter38e35862006-01-08 01:01:01 -0800298 flags, private))
Hugh Dickins91612e02005-06-21 17:15:07 -0700299 return -EIO;
300 } while (pud++, addr = next, addr != end);
301 return 0;
302}
303
Nick Pigginb5810032005-10-29 18:16:12 -0700304static inline int check_pgd_range(struct vm_area_struct *vma,
Christoph Lameterdc9aa5b2006-01-08 01:00:50 -0800305 unsigned long addr, unsigned long end,
306 const nodemask_t *nodes, unsigned long flags,
Christoph Lameter38e35862006-01-08 01:01:01 -0800307 void *private)
Hugh Dickins91612e02005-06-21 17:15:07 -0700308{
309 pgd_t *pgd;
310 unsigned long next;
311
Nick Pigginb5810032005-10-29 18:16:12 -0700312 pgd = pgd_offset(vma->vm_mm, addr);
Hugh Dickins91612e02005-06-21 17:15:07 -0700313 do {
314 next = pgd_addr_end(addr, end);
315 if (pgd_none_or_clear_bad(pgd))
316 continue;
Christoph Lameterdc9aa5b2006-01-08 01:00:50 -0800317 if (check_pud_range(vma, pgd, addr, next, nodes,
Christoph Lameter38e35862006-01-08 01:01:01 -0800318 flags, private))
Hugh Dickins91612e02005-06-21 17:15:07 -0700319 return -EIO;
320 } while (pgd++, addr = next, addr != end);
321 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700322}
323
Christoph Lameterdc9aa5b2006-01-08 01:00:50 -0800324/* Check if a vma is migratable */
325static inline int vma_migratable(struct vm_area_struct *vma)
326{
327 if (vma->vm_flags & (
328 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP))
329 return 0;
330 return 1;
331}
332
333/*
334 * Check if all pages in a range are on a set of nodes.
335 * If pagelist != NULL then isolate pages from the LRU and
336 * put them on the pagelist.
337 */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700338static struct vm_area_struct *
339check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
Christoph Lameter38e35862006-01-08 01:01:01 -0800340 const nodemask_t *nodes, unsigned long flags, void *private)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700341{
342 int err;
343 struct vm_area_struct *first, *vma, *prev;
344
345 first = find_vma(mm, start);
346 if (!first)
347 return ERR_PTR(-EFAULT);
348 prev = NULL;
349 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
Christoph Lameterdc9aa5b2006-01-08 01:00:50 -0800350 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
351 if (!vma->vm_next && vma->vm_end < end)
352 return ERR_PTR(-EFAULT);
353 if (prev && prev->vm_end < vma->vm_start)
354 return ERR_PTR(-EFAULT);
355 }
356 if (!is_vm_hugetlb_page(vma) &&
357 ((flags & MPOL_MF_STRICT) ||
358 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
359 vma_migratable(vma)))) {
Andi Kleen5b952b32005-09-13 01:25:08 -0700360 unsigned long endvma = vma->vm_end;
Christoph Lameterdc9aa5b2006-01-08 01:00:50 -0800361
Andi Kleen5b952b32005-09-13 01:25:08 -0700362 if (endvma > end)
363 endvma = end;
364 if (vma->vm_start > start)
365 start = vma->vm_start;
Christoph Lameterdc9aa5b2006-01-08 01:00:50 -0800366 err = check_pgd_range(vma, start, endvma, nodes,
Christoph Lameter38e35862006-01-08 01:01:01 -0800367 flags, private);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700368 if (err) {
369 first = ERR_PTR(err);
370 break;
371 }
372 }
373 prev = vma;
374 }
375 return first;
376}
377
/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
	int err = 0;
	struct mempolicy *old = vma->vm_policy;

	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	if (vma->vm_ops && vma->vm_ops->set_policy)
		err = vma->vm_ops->set_policy(vma, new);
	if (!err) {
		mpol_get(new);
		vma->vm_policy = new;
		mpol_free(old);
	}
	return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, struct mempolicy *new)
{
	struct vm_area_struct *next;
	int err;

	err = 0;
	for (; vma && vma->vm_start < end; vma = next) {
		next = vma->vm_next;
		if (vma->vm_start < start)
			err = split_vma(vma->vm_mm, vma, start, 1);
		if (!err && vma->vm_end > end)
			err = split_vma(vma->vm_mm, vma, end, 0);
		if (!err)
			err = policy_vma(vma, new);
		if (err)
			break;
	}
	return err;
}
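
/*
 * How the splitting above plays out, as a standalone sketch (illustrative
 * only, not kernel code): a VMA covering [vm_start, vm_end) that straddles
 * the policy range [start, end) is cut at the range boundaries, so the new
 * policy is only ever attached to whole VMAs.
 *
 *	#include <stdio.h>
 *
 *	static void show_splits(unsigned long vm_start, unsigned long vm_end,
 *				unsigned long start, unsigned long end)
 *	{
 *		if (vm_start < start)
 *			printf("keep old policy: [%lx, %lx)\n", vm_start, start);
 *		printf("new policy:      [%lx, %lx)\n",
 *		       vm_start > start ? vm_start : start,
 *		       vm_end < end ? vm_end : end);
 *		if (vm_end > end)
 *			printf("keep old policy: [%lx, %lx)\n", end, vm_end);
 *	}
 *
 *	int main(void)
 *	{
 *		// One VMA [0x1000, 0x9000), policy applied to [0x3000, 0x6000).
 *		show_splits(0x1000, 0x9000, 0x3000, 0x6000);
 *		return 0;
 *	}
 */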
420
static int contextualize_policy(int mode, nodemask_t *nodes)
{
	if (!nodes)
		return 0;

	/* Update current mems_allowed */
	cpuset_update_current_mems_allowed();
	/* Ignore nodes not set in current->mems_allowed */
	cpuset_restrict_to_mems_allowed(nodes->bits);
	return mpol_check_policy(mode, nodes);
}

static int swap_pages(struct list_head *pagelist)
{
	LIST_HEAD(moved);
	LIST_HEAD(failed);
	int n;

	n = migrate_pages(pagelist, NULL, &moved, &failed);
	putback_lru_pages(&failed);
	putback_lru_pages(&moved);

	return n;
}
445
long do_mbind(unsigned long start, unsigned long len,
	      unsigned long mode, nodemask_t *nmask, unsigned long flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int err;
	LIST_HEAD(pagelist);

	if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
				      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
	    || mode > MPOL_MAX)
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	if (mpol_check_policy(mode, nmask))
		return -EINVAL;

	new = mpol_new(mode, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
		 mode, nodes_addr(*nmask)[0]);

	down_write(&mm->mmap_sem);
	vma = check_range(mm, start, end, nmask,
			  flags | MPOL_MF_INVERT, &pagelist);

	err = PTR_ERR(vma);
	if (!IS_ERR(vma)) {
		int nr_failed = 0;

		err = mbind_range(vma, start, end, new);
		if (!list_empty(&pagelist))
			nr_failed = swap_pages(&pagelist);

		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
			err = -EIO;
	}
	if (!list_empty(&pagelist))
		putback_lru_pages(&pagelist);

	up_write(&mm->mmap_sem);
	mpol_free(new);
	return err;
}
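
/*
 * Userspace sketch of the call path above (illustrative only; assumes
 * libnuma's <numaif.h> wrapper for mbind() and a kernel that accepts the
 * MPOL_MF_MOVE / MPOL_MF_STRICT flags; error handling trimmed; link
 * with -lnuma):
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *	#include <string.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		size_t len = 4UL << 20;
 *		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *		unsigned long node1 = 1UL << 1;
 *
 *		memset(p, 0, len);	// fault pages in, possibly on node 0
 *
 *		// Rebind the range to node 1 and ask the kernel to move the
 *		// pages that are already there; MPOL_MF_STRICT turns pages
 *		// that could not be moved into an EIO error.
 *		if (mbind(p, len, MPOL_BIND, &node1, sizeof(node1) * 8,
 *			  MPOL_MF_MOVE | MPOL_MF_STRICT))
 *			perror("mbind");
 *		return 0;
 *	}
 */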
516
/* Set the process memory policy */
long do_set_mempolicy(int mode, nodemask_t *nodes)
{
	struct mempolicy *new;

	if (contextualize_policy(mode, nodes))
		return -EINVAL;
	new = mpol_new(mode, nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);
	mpol_free(current->mempolicy);
	current->mempolicy = new;
	if (new && new->policy == MPOL_INTERLEAVE)
		current->il_next = first_node(new->v.nodes);
	return 0;
}
533
534/* Fill a zone bitmap for a policy */
Andi Kleendfcd3c02005-10-29 18:15:48 -0700535static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700536{
537 int i;
538
Andi Kleendfcd3c02005-10-29 18:15:48 -0700539 nodes_clear(*nodes);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700540 switch (p->policy) {
541 case MPOL_BIND:
542 for (i = 0; p->v.zonelist->zones[i]; i++)
Christoph Lameter8bccd852005-10-29 18:16:59 -0700543 node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
544 *nodes);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700545 break;
546 case MPOL_DEFAULT:
547 break;
548 case MPOL_INTERLEAVE:
Andi Kleendfcd3c02005-10-29 18:15:48 -0700549 *nodes = p->v.nodes;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700550 break;
551 case MPOL_PREFERRED:
552 /* or use current node instead of online map? */
553 if (p->v.preferred_node < 0)
Andi Kleendfcd3c02005-10-29 18:15:48 -0700554 *nodes = node_online_map;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700555 else
Andi Kleendfcd3c02005-10-29 18:15:48 -0700556 node_set(p->v.preferred_node, *nodes);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700557 break;
558 default:
559 BUG();
560 }
561}
562
563static int lookup_node(struct mm_struct *mm, unsigned long addr)
564{
565 struct page *p;
566 int err;
567
568 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
569 if (err >= 0) {
570 err = page_to_nid(p);
571 put_page(p);
572 }
573 return err;
574}
575
/* Retrieve NUMA policy */
long do_get_mempolicy(int *policy, nodemask_t *nmask,
		      unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	cpuset_update_current_mems_allowed();
	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
		return -EINVAL;
	if (flags & MPOL_F_ADDR) {
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->policy == MPOL_INTERLEAVE) {
			*policy = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else
		*policy = pol->policy;

	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	err = 0;
	if (nmask)
		get_zonemask(pol, nmask);

 out:
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}
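
/*
 * Userspace sketch for the lookup above (illustrative only; assumes
 * libnuma's <numaif.h> wrapper for get_mempolicy(); link with -lnuma).
 * MPOL_F_ADDR | MPOL_F_NODE asks for the node a given page currently
 * lives on rather than the policy mode itself:
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *
 *	int main(void)
 *	{
 *		int node = -1;
 *		char *p = malloc(4096);
 *
 *		p[0] = 1;	// make sure the page is actually allocated
 *		if (get_mempolicy(&node, NULL, 0, p,
 *				  MPOL_F_NODE | MPOL_F_ADDR))
 *			perror("get_mempolicy");
 *		else
 *			printf("page at %p is on node %d\n", (void *)p, node);
 *		return 0;
 *	}
 */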
635
/*
 * For now migrate_pages simply swaps out the pages from nodes that are in
 * the source set but not in the target set. In the future, we would
 * want a function that moves pages between the two nodesets in such
 * a way as to preserve the physical layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm,
	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
	LIST_HEAD(pagelist);
	int count = 0;
	nodemask_t nodes;

	nodes_andnot(nodes, *from_nodes, *to_nodes);

	down_read(&mm->mmap_sem);
	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
			flags | MPOL_MF_DISCONTIG_OK, &pagelist);

	if (!list_empty(&pagelist)) {
		count = swap_pages(&pagelist);
		putback_lru_pages(&pagelist);
	}

	up_read(&mm->mmap_sem);
	return count;
}
665
/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	unsigned long k;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/* If the user specified more nodes than supported, just check
	   that the unsupported part is all zero. */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		if (nlongs > PAGE_SIZE/sizeof(long))
			return -EINVAL;
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			unsigned long t;
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes_addr(*nodes)[nlongs-1] &= endmask;
	return 0;
}
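
/*
 * How userspace encodes the bitmap consumed above (standalone sketch,
 * illustrative only): the mask is an array of unsigned longs with bit N
 * meaning node N, and maxnode tells the kernel how many bits of the
 * buffer to look at, so passing the full buffer size in bits is safest.
 *
 *	#include <limits.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *
 *	#define BITS_PER_ULONG	(CHAR_BIT * sizeof(unsigned long))
 *
 *	int main(void)
 *	{
 *		unsigned long mask[2];			// room for 128 nodes
 *		unsigned long maxnode = sizeof(mask) * CHAR_BIT;
 *
 *		memset(mask, 0, sizeof(mask));
 *		mask[0 / BITS_PER_ULONG] |= 1UL << (0 % BITS_PER_ULONG);  // node 0
 *		mask[2 / BITS_PER_ULONG] |= 1UL << (2 % BITS_PER_ULONG);  // node 2
 *
 *		// (mask, maxnode) is what sys_mbind()/sys_set_mempolicy()
 *		// hand to get_nodes() above.
 *		printf("mask[0] = %#lx, maxnode = %lu\n", mask[0], maxnode);
 *		return 0;
 *	}
 */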
713
714/* Copy a kernel node mask to user space */
715static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
716 nodemask_t *nodes)
717{
718 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
719 const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
720
721 if (copy > nbytes) {
722 if (copy > PAGE_SIZE)
723 return -EINVAL;
724 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
725 return -EFAULT;
726 copy = nbytes;
727 }
728 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
729}
730
731asmlinkage long sys_mbind(unsigned long start, unsigned long len,
732 unsigned long mode,
733 unsigned long __user *nmask, unsigned long maxnode,
734 unsigned flags)
735{
736 nodemask_t nodes;
737 int err;
738
739 err = get_nodes(&nodes, nmask, maxnode);
740 if (err)
741 return err;
742 return do_mbind(start, len, mode, &nodes, flags);
743}
744
745/* Set the process memory policy */
746asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
747 unsigned long maxnode)
748{
749 int err;
750 nodemask_t nodes;
751
752 if (mode < 0 || mode > MPOL_MAX)
753 return -EINVAL;
754 err = get_nodes(&nodes, nmask, maxnode);
755 if (err)
756 return err;
757 return do_set_mempolicy(mode, &nodes);
758}
759
Christoph Lameter39743882006-01-08 01:00:51 -0800760/* Macro needed until Paul implements this function in kernel/cpusets.c */
761#define cpuset_mems_allowed(task) node_online_map
762
763asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
764 const unsigned long __user *old_nodes,
765 const unsigned long __user *new_nodes)
766{
767 struct mm_struct *mm;
768 struct task_struct *task;
769 nodemask_t old;
770 nodemask_t new;
771 nodemask_t task_nodes;
772 int err;
773
774 err = get_nodes(&old, old_nodes, maxnode);
775 if (err)
776 return err;
777
778 err = get_nodes(&new, new_nodes, maxnode);
779 if (err)
780 return err;
781
782 /* Find the mm_struct */
783 read_lock(&tasklist_lock);
784 task = pid ? find_task_by_pid(pid) : current;
785 if (!task) {
786 read_unlock(&tasklist_lock);
787 return -ESRCH;
788 }
789 mm = get_task_mm(task);
790 read_unlock(&tasklist_lock);
791
792 if (!mm)
793 return -EINVAL;
794
	/*
	 * Check if this process has the right to modify the specified
	 * process. The right exists if the process has administrative
	 * capabilities, superuser privileges or the same
	 * userid as the target process.
	 */
801 if ((current->euid != task->suid) && (current->euid != task->uid) &&
802 (current->uid != task->suid) && (current->uid != task->uid) &&
803 !capable(CAP_SYS_ADMIN)) {
804 err = -EPERM;
805 goto out;
806 }
807
808 task_nodes = cpuset_mems_allowed(task);
809 /* Is the user allowed to access the target nodes? */
810 if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
811 err = -EPERM;
812 goto out;
813 }
814
815 err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
816out:
817 mmput(mm);
818 return err;
819}
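
/*
 * Userspace sketch for the syscall above (illustrative only; assumes
 * libnuma's <numaif.h> wrapper for migrate_pages(); link with -lnuma).
 * Moves (by swapping out, in this implementation) the pages of the
 * calling process that sit on node 0 so they get refaulted on node 1:
 *
 *	#include <numaif.h>
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		unsigned long old_nodes = 1UL << 0;	// migrate from node 0
 *		unsigned long new_nodes = 1UL << 1;	// ... towards node 1
 *
 *		if (migrate_pages(getpid(), sizeof(old_nodes) * 8,
 *				  &old_nodes, &new_nodes) < 0)
 *			perror("migrate_pages");
 *		return 0;
 *	}
 */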
820
821
Christoph Lameter8bccd852005-10-29 18:16:59 -0700822/* Retrieve NUMA policy */
823asmlinkage long sys_get_mempolicy(int __user *policy,
824 unsigned long __user *nmask,
825 unsigned long maxnode,
826 unsigned long addr, unsigned long flags)
827{
828 int err, pval;
829 nodemask_t nodes;
830
831 if (nmask != NULL && maxnode < MAX_NUMNODES)
832 return -EINVAL;
833
834 err = do_get_mempolicy(&pval, &nodes, addr, flags);
835
836 if (err)
837 return err;
838
839 if (policy && put_user(pval, policy))
840 return -EFAULT;
841
842 if (nmask)
843 err = copy_nodes_to_user(nmask, maxnode, &nodes);
844
845 return err;
846}
847
Linus Torvalds1da177e2005-04-16 15:20:36 -0700848#ifdef CONFIG_COMPAT
849
850asmlinkage long compat_sys_get_mempolicy(int __user *policy,
851 compat_ulong_t __user *nmask,
852 compat_ulong_t maxnode,
853 compat_ulong_t addr, compat_ulong_t flags)
854{
855 long err;
856 unsigned long __user *nm = NULL;
857 unsigned long nr_bits, alloc_size;
858 DECLARE_BITMAP(bm, MAX_NUMNODES);
859
860 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
861 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
862
863 if (nmask)
864 nm = compat_alloc_user_space(alloc_size);
865
866 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
867
868 if (!err && nmask) {
869 err = copy_from_user(bm, nm, alloc_size);
870 /* ensure entire bitmap is zeroed */
871 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
872 err |= compat_put_bitmap(nmask, bm, nr_bits);
873 }
874
875 return err;
876}
877
878asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
879 compat_ulong_t maxnode)
880{
881 long err = 0;
882 unsigned long __user *nm = NULL;
883 unsigned long nr_bits, alloc_size;
884 DECLARE_BITMAP(bm, MAX_NUMNODES);
885
886 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
887 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
888
889 if (nmask) {
890 err = compat_get_bitmap(bm, nmask, nr_bits);
891 nm = compat_alloc_user_space(alloc_size);
892 err |= copy_to_user(nm, bm, alloc_size);
893 }
894
895 if (err)
896 return -EFAULT;
897
898 return sys_set_mempolicy(mode, nm, nr_bits+1);
899}
900
901asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
902 compat_ulong_t mode, compat_ulong_t __user *nmask,
903 compat_ulong_t maxnode, compat_ulong_t flags)
904{
905 long err = 0;
906 unsigned long __user *nm = NULL;
907 unsigned long nr_bits, alloc_size;
Andi Kleendfcd3c02005-10-29 18:15:48 -0700908 nodemask_t bm;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700909
910 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
911 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
912
913 if (nmask) {
Andi Kleendfcd3c02005-10-29 18:15:48 -0700914 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700915 nm = compat_alloc_user_space(alloc_size);
Andi Kleendfcd3c02005-10-29 18:15:48 -0700916 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700917 }
918
919 if (err)
920 return -EFAULT;
921
922 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
923}
924
925#endif
926
927/* Return effective policy for a VMA */
Christoph Lameter6e21c8f2005-09-03 15:54:45 -0700928struct mempolicy *
929get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700930{
Christoph Lameter6e21c8f2005-09-03 15:54:45 -0700931 struct mempolicy *pol = task->mempolicy;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700932
933 if (vma) {
934 if (vma->vm_ops && vma->vm_ops->get_policy)
Christoph Lameter8bccd852005-10-29 18:16:59 -0700935 pol = vma->vm_ops->get_policy(vma, addr);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700936 else if (vma->vm_policy &&
937 vma->vm_policy->policy != MPOL_DEFAULT)
938 pol = vma->vm_policy;
939 }
940 if (!pol)
941 pol = &default_policy;
942 return pol;
943}
944
945/* Return a zonelist representing a mempolicy */
Al Virodd0fc662005-10-07 07:46:04 +0100946static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700947{
948 int nd;
949
950 switch (policy->policy) {
951 case MPOL_PREFERRED:
952 nd = policy->v.preferred_node;
953 if (nd < 0)
954 nd = numa_node_id();
955 break;
956 case MPOL_BIND:
957 /* Lower zones don't get a policy applied */
958 /* Careful: current->mems_allowed might have moved */
Al Viroaf4ca452005-10-21 02:55:38 -0400959 if (gfp_zone(gfp) >= policy_zone)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700960 if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
961 return policy->v.zonelist;
962 /*FALL THROUGH*/
963 case MPOL_INTERLEAVE: /* should not happen */
964 case MPOL_DEFAULT:
965 nd = numa_node_id();
966 break;
967 default:
968 nd = 0;
969 BUG();
970 }
Al Viroaf4ca452005-10-21 02:55:38 -0400971 return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700972}
973
974/* Do dynamic interleaving for a process */
975static unsigned interleave_nodes(struct mempolicy *policy)
976{
977 unsigned nid, next;
978 struct task_struct *me = current;
979
980 nid = me->il_next;
Andi Kleendfcd3c02005-10-29 18:15:48 -0700981 next = next_node(nid, policy->v.nodes);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700982 if (next >= MAX_NUMNODES)
Andi Kleendfcd3c02005-10-29 18:15:48 -0700983 next = first_node(policy->v.nodes);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700984 me->il_next = next;
985 return nid;
986}
987
988/* Do static interleaving for a VMA with known offset. */
989static unsigned offset_il_node(struct mempolicy *pol,
990 struct vm_area_struct *vma, unsigned long off)
991{
Andi Kleendfcd3c02005-10-29 18:15:48 -0700992 unsigned nnodes = nodes_weight(pol->v.nodes);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700993 unsigned target = (unsigned)off % nnodes;
994 int c;
995 int nid = -1;
996
997 c = 0;
998 do {
Andi Kleendfcd3c02005-10-29 18:15:48 -0700999 nid = next_node(nid, pol->v.nodes);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001000 c++;
1001 } while (c <= target);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001002 return nid;
1003}
1004
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001005/* Determine a node number for interleave */
1006static inline unsigned interleave_nid(struct mempolicy *pol,
1007 struct vm_area_struct *vma, unsigned long addr, int shift)
1008{
1009 if (vma) {
1010 unsigned long off;
1011
1012 off = vma->vm_pgoff;
1013 off += (addr - vma->vm_start) >> shift;
1014 return offset_il_node(pol, vma, off);
1015 } else
1016 return interleave_nodes(pol);
1017}
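
/*
 * The static interleave arithmetic above, as a standalone sketch
 * (illustrative only, using the GCC __builtin_popcountl() helper): the
 * page offset is reduced modulo the number of nodes in the mask, and that
 * many set bits are walked to pick the node, so a given offset always
 * maps to the same node.
 *
 *	#include <stdio.h>
 *
 *	// Pick the target node for a page offset from a 64-node mask.
 *	static int offset_to_node(unsigned long mask, unsigned long off)
 *	{
 *		int nnodes = __builtin_popcountl(mask);
 *		int target = off % nnodes;
 *		int nid = -1;
 *		int c = 0;
 *
 *		do {
 *			do
 *				nid++;
 *			while (!(mask & (1UL << nid)));
 *			c++;
 *		} while (c <= target);
 *		return nid;
 *	}
 *
 *	int main(void)
 *	{
 *		unsigned long mask = 0x0b;	// nodes 0, 1 and 3
 *		unsigned long off;
 *
 *		for (off = 0; off < 6; off++)
 *			printf("offset %lu -> node %d\n", off,
 *			       offset_to_node(mask, off));
 *		return 0;
 *	}
 */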
1018
1019/* Return a zonelist suitable for a huge page allocation. */
1020struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1021{
1022 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1023
1024 if (pol->policy == MPOL_INTERLEAVE) {
1025 unsigned nid;
1026
1027 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1028 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1029 }
1030 return zonelist_policy(GFP_HIGHUSER, pol);
1031}
1032
Linus Torvalds1da177e2005-04-16 15:20:36 -07001033/* Allocate a page in interleaved policy.
1034 Own path because it needs to do special accounting. */
Andi Kleen662f3a02005-10-29 18:15:49 -07001035static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1036 unsigned nid)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001037{
1038 struct zonelist *zl;
1039 struct page *page;
1040
Al Viroaf4ca452005-10-21 02:55:38 -04001041 zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001042 page = __alloc_pages(gfp, order, zl);
1043 if (page && page_zone(page) == zl->zones[0]) {
Christoph Lametere7c8d5c2005-06-21 17:14:47 -07001044 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001045 put_cpu();
1046 }
1047 return page;
1048}
1049
/**
 * 	alloc_page_vma	- Allocate a page for a VMA.
 *
 * 	@gfp:
 *		%GFP_USER	user allocation.
 *		%GFP_KERNEL	kernel allocations,
 *		%GFP_HIGHMEM	highmem/user allocations,
 *		%GFP_FS		allocation should not call back into a file system.
 *		%GFP_ATOMIC	don't sleep.
 *
 * 	@vma:  Pointer to VMA or NULL if not available.
 *	@addr: Virtual address of the allocation. Must be inside the VMA.
 *
 * 	This function allocates a page from the kernel page pool and applies
 *	the NUMA policy associated with the VMA or, failing that, the current
 *	process. When @vma is not NULL the caller must hold down_read on the
 *	mmap_sem of the VMA's mm_struct to prevent it from going away. Should
 *	be used for all allocations for pages that will be mapped into user
 *	space. Returns NULL when no page can be allocated.
 *
 *	Should be called with the mmap_sem of the vma held.
 */
1072struct page *
Al Virodd0fc662005-10-07 07:46:04 +01001073alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001074{
Christoph Lameter6e21c8f2005-09-03 15:54:45 -07001075 struct mempolicy *pol = get_vma_policy(current, vma, addr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001076
1077 cpuset_update_current_mems_allowed();
1078
1079 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1080 unsigned nid;
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001081
1082 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001083 return alloc_page_interleave(gfp, 0, nid);
1084 }
1085 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1086}
1087
/**
 * 	alloc_pages_current - Allocate pages.
 *
 *	@gfp:
 *		%GFP_USER	user allocation,
 *      	%GFP_KERNEL	kernel allocation,
 *      	%GFP_HIGHMEM	highmem allocation,
 *      	%GFP_FS		don't call back into a file system.
 *      	%GFP_ATOMIC	don't sleep.
 *	@order: Power of two of allocation size in pages. 0 is a single page.
 *
 *	Allocate a page from the kernel page pool and, when not in
 *	interrupt context, apply the current process' NUMA policy.
 *	Returns NULL when no page can be allocated.
 *
 *	Don't call cpuset_update_current_mems_allowed() unless
 *	1) it's ok to take cpuset_sem (can WAIT), and
 *	2) allocating for current task (not interrupt).
 */
Al Virodd0fc662005-10-07 07:46:04 +01001107struct page *alloc_pages_current(gfp_t gfp, unsigned order)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001108{
1109 struct mempolicy *pol = current->mempolicy;
1110
1111 if ((gfp & __GFP_WAIT) && !in_interrupt())
1112 cpuset_update_current_mems_allowed();
1113 if (!pol || in_interrupt())
1114 pol = &default_policy;
1115 if (pol->policy == MPOL_INTERLEAVE)
1116 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1117 return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1118}
1119EXPORT_SYMBOL(alloc_pages_current);
1120
1121/* Slow path of a mempolicy copy */
1122struct mempolicy *__mpol_copy(struct mempolicy *old)
1123{
1124 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1125
1126 if (!new)
1127 return ERR_PTR(-ENOMEM);
1128 *new = *old;
1129 atomic_set(&new->refcnt, 1);
1130 if (new->policy == MPOL_BIND) {
1131 int sz = ksize(old->v.zonelist);
1132 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
1133 if (!new->v.zonelist) {
1134 kmem_cache_free(policy_cache, new);
1135 return ERR_PTR(-ENOMEM);
1136 }
1137 memcpy(new->v.zonelist, old->v.zonelist, sz);
1138 }
1139 return new;
1140}
1141
1142/* Slow path of a mempolicy comparison */
1143int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1144{
1145 if (!a || !b)
1146 return 0;
1147 if (a->policy != b->policy)
1148 return 0;
1149 switch (a->policy) {
1150 case MPOL_DEFAULT:
1151 return 1;
1152 case MPOL_INTERLEAVE:
Andi Kleendfcd3c02005-10-29 18:15:48 -07001153 return nodes_equal(a->v.nodes, b->v.nodes);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001154 case MPOL_PREFERRED:
1155 return a->v.preferred_node == b->v.preferred_node;
1156 case MPOL_BIND: {
1157 int i;
1158 for (i = 0; a->v.zonelist->zones[i]; i++)
1159 if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1160 return 0;
1161 return b->v.zonelist->zones[i] == NULL;
1162 }
1163 default:
1164 BUG();
1165 return 0;
1166 }
1167}
1168
1169/* Slow path of a mpol destructor. */
1170void __mpol_free(struct mempolicy *p)
1171{
1172 if (!atomic_dec_and_test(&p->refcnt))
1173 return;
1174 if (p->policy == MPOL_BIND)
1175 kfree(p->v.zonelist);
1176 p->policy = MPOL_DEFAULT;
1177 kmem_cache_free(policy_cache, p);
1178}
1179
1180/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001181 * Shared memory backing store policy support.
1182 *
1183 * Remember policies even when nobody has shared memory mapped.
1184 * The policies are kept in Red-Black tree linked from the inode.
1185 * They are protected by the sp->lock spinlock, which should be held
1186 * for any accesses to the tree.
1187 */
1188
1189/* lookup first element intersecting start-end */
1190/* Caller holds sp->lock */
1191static struct sp_node *
1192sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1193{
1194 struct rb_node *n = sp->root.rb_node;
1195
1196 while (n) {
1197 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1198
1199 if (start >= p->end)
1200 n = n->rb_right;
1201 else if (end <= p->start)
1202 n = n->rb_left;
1203 else
1204 break;
1205 }
1206 if (!n)
1207 return NULL;
1208 for (;;) {
1209 struct sp_node *w = NULL;
1210 struct rb_node *prev = rb_prev(n);
1211 if (!prev)
1212 break;
1213 w = rb_entry(prev, struct sp_node, nd);
1214 if (w->end <= start)
1215 break;
1216 n = prev;
1217 }
1218 return rb_entry(n, struct sp_node, nd);
1219}
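
/*
 * What the lookup above computes, shown on a plain sorted array instead
 * of an rb-tree (standalone sketch, illustrative only): shared policy
 * ranges are non-overlapping [start, end) intervals, and the lookup
 * returns the first one that intersects the query range.
 *
 *	#include <stdio.h>
 *
 *	struct range { unsigned long start, end; };
 *
 *	// First interval intersecting [start, end), or -1 if none.
 *	static int first_intersection(const struct range *r, int n,
 *				      unsigned long start, unsigned long end)
 *	{
 *		int i;
 *
 *		for (i = 0; i < n; i++)
 *			if (r[i].end > start && r[i].start < end)
 *				return i;
 *		return -1;
 *	}
 *
 *	int main(void)
 *	{
 *		struct range sp[] = { { 0, 4 }, { 8, 12 }, { 20, 32 } };
 *
 *		printf("%d\n", first_intersection(sp, 3, 10, 24));  // -> 1
 *		printf("%d\n", first_intersection(sp, 3, 4, 8));    // -> -1
 *		return 0;
 *	}
 */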
1220
1221/* Insert a new shared policy into the list. */
1222/* Caller holds sp->lock */
1223static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1224{
1225 struct rb_node **p = &sp->root.rb_node;
1226 struct rb_node *parent = NULL;
1227 struct sp_node *nd;
1228
1229 while (*p) {
1230 parent = *p;
1231 nd = rb_entry(parent, struct sp_node, nd);
1232 if (new->start < nd->start)
1233 p = &(*p)->rb_left;
1234 else if (new->end > nd->end)
1235 p = &(*p)->rb_right;
1236 else
1237 BUG();
1238 }
1239 rb_link_node(&new->nd, parent, p);
1240 rb_insert_color(&new->nd, &sp->root);
1241 PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1242 new->policy ? new->policy->policy : 0);
1243}
1244
1245/* Find shared policy intersecting idx */
1246struct mempolicy *
1247mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1248{
1249 struct mempolicy *pol = NULL;
1250 struct sp_node *sn;
1251
1252 if (!sp->root.rb_node)
1253 return NULL;
1254 spin_lock(&sp->lock);
1255 sn = sp_lookup(sp, idx, idx+1);
1256 if (sn) {
1257 mpol_get(sn->policy);
1258 pol = sn->policy;
1259 }
1260 spin_unlock(&sp->lock);
1261 return pol;
1262}
1263
1264static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1265{
1266 PDprintk("deleting %lx-l%x\n", n->start, n->end);
1267 rb_erase(&n->nd, &sp->root);
1268 mpol_free(n->policy);
1269 kmem_cache_free(sn_cache, n);
1270}
1271
1272struct sp_node *
1273sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1274{
1275 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1276
1277 if (!n)
1278 return NULL;
1279 n->start = start;
1280 n->end = end;
1281 mpol_get(pol);
1282 n->policy = pol;
1283 return n;
1284}
1285
1286/* Replace a policy range. */
1287static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1288 unsigned long end, struct sp_node *new)
1289{
1290 struct sp_node *n, *new2 = NULL;
1291
1292restart:
1293 spin_lock(&sp->lock);
1294 n = sp_lookup(sp, start, end);
1295 /* Take care of old policies in the same range. */
1296 while (n && n->start < end) {
1297 struct rb_node *next = rb_next(&n->nd);
1298 if (n->start >= start) {
1299 if (n->end <= end)
1300 sp_delete(sp, n);
1301 else
1302 n->start = end;
1303 } else {
1304 /* Old policy spanning whole new range. */
1305 if (n->end > end) {
1306 if (!new2) {
1307 spin_unlock(&sp->lock);
1308 new2 = sp_alloc(end, n->end, n->policy);
1309 if (!new2)
1310 return -ENOMEM;
1311 goto restart;
1312 }
1313 n->end = start;
1314 sp_insert(sp, new2);
1315 new2 = NULL;
1316 break;
1317 } else
1318 n->end = start;
1319 }
1320 if (!next)
1321 break;
1322 n = rb_entry(next, struct sp_node, nd);
1323 }
1324 if (new)
1325 sp_insert(sp, new);
1326 spin_unlock(&sp->lock);
1327 if (new2) {
1328 mpol_free(new2->policy);
1329 kmem_cache_free(sn_cache, new2);
1330 }
1331 return 0;
1332}
1333
1334int mpol_set_shared_policy(struct shared_policy *info,
1335 struct vm_area_struct *vma, struct mempolicy *npol)
1336{
1337 int err;
1338 struct sp_node *new = NULL;
1339 unsigned long sz = vma_pages(vma);
1340
1341 PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1342 vma->vm_pgoff,
1343 sz, npol? npol->policy : -1,
Andi Kleendfcd3c02005-10-29 18:15:48 -07001344 npol ? nodes_addr(npol->v.nodes)[0] : -1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001345
1346 if (npol) {
1347 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1348 if (!new)
1349 return -ENOMEM;
1350 }
1351 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1352 if (err && new)
1353 kmem_cache_free(sn_cache, new);
1354 return err;
1355}
1356
1357/* Free a backing policy store on inode delete. */
1358void mpol_free_shared_policy(struct shared_policy *p)
1359{
1360 struct sp_node *n;
1361 struct rb_node *next;
1362
1363 if (!p->root.rb_node)
1364 return;
1365 spin_lock(&p->lock);
1366 next = rb_first(&p->root);
1367 while (next) {
1368 n = rb_entry(next, struct sp_node, nd);
1369 next = rb_next(&n->nd);
Andi Kleen90c50292005-07-27 11:43:50 -07001370 rb_erase(&n->nd, &p->root);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001371 mpol_free(n->policy);
1372 kmem_cache_free(sn_cache, n);
1373 }
1374 spin_unlock(&p->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001375}
1376
1377/* assumes fs == KERNEL_DS */
1378void __init numa_policy_init(void)
1379{
1380 policy_cache = kmem_cache_create("numa_policy",
1381 sizeof(struct mempolicy),
1382 0, SLAB_PANIC, NULL, NULL);
1383
1384 sn_cache = kmem_cache_create("shared_policy_node",
1385 sizeof(struct sp_node),
1386 0, SLAB_PANIC, NULL, NULL);
1387
1388 /* Set interleaving policy for system init. This way not all
1389 the data structures allocated at system boot end up in node zero. */
1390
Christoph Lameter8bccd852005-10-29 18:16:59 -07001391 if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001392 printk("numa_policy_init: interleaving failed\n");
1393}
1394
Christoph Lameter8bccd852005-10-29 18:16:59 -07001395/* Reset policy of current process to default */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001396void numa_default_policy(void)
1397{
Christoph Lameter8bccd852005-10-29 18:16:59 -07001398 do_set_mempolicy(MPOL_DEFAULT, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001399}
Paul Jackson68860ec2005-10-30 15:02:36 -08001400
1401/* Migrate a policy to a different set of nodes */
1402static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1403 const nodemask_t *new)
1404{
1405 nodemask_t tmp;
1406
1407 if (!pol)
1408 return;
1409
1410 switch (pol->policy) {
1411 case MPOL_DEFAULT:
1412 break;
1413 case MPOL_INTERLEAVE:
1414 nodes_remap(tmp, pol->v.nodes, *old, *new);
1415 pol->v.nodes = tmp;
1416 current->il_next = node_remap(current->il_next, *old, *new);
1417 break;
1418 case MPOL_PREFERRED:
1419 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1420 *old, *new);
1421 break;
1422 case MPOL_BIND: {
1423 nodemask_t nodes;
1424 struct zone **z;
1425 struct zonelist *zonelist;
1426
1427 nodes_clear(nodes);
1428 for (z = pol->v.zonelist->zones; *z; z++)
1429 node_set((*z)->zone_pgdat->node_id, nodes);
1430 nodes_remap(tmp, nodes, *old, *new);
1431 nodes = tmp;
1432
1433 zonelist = bind_zonelist(&nodes);
1434
1435 /* If no mem, then zonelist is NULL and we keep old zonelist.
1436 * If that old zonelist has no remaining mems_allowed nodes,
1437 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1438 */
1439
1440 if (zonelist) {
1441 /* Good - got mem - substitute new zonelist */
1442 kfree(pol->v.zonelist);
1443 pol->v.zonelist = zonelist;
1444 }
1445 break;
1446 }
1447 default:
1448 BUG();
1449 break;
1450 }
1451}
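
/*
 * The remap step above, as a standalone sketch (illustrative only; GCC
 * builtins used for bit counting): a node that is the k-th set bit of the
 * old mask becomes the k-th set bit of the new mask (wrapping if the new
 * mask has fewer nodes), which is roughly what node_remap()/nodes_remap()
 * compute.
 *
 *	#include <stdio.h>
 *
 *	// Position of 'node' among the set bits of 'mask', or -1.
 *	static int bit_index(unsigned long mask, int node)
 *	{
 *		int i, k = 0;
 *
 *		for (i = 0; i < 64; i++) {
 *			if (!(mask & (1UL << i)))
 *				continue;
 *			if (i == node)
 *				return k;
 *			k++;
 *		}
 *		return -1;
 *	}
 *
 *	// k-th set bit of 'mask' (k taken modulo the popcount).
 *	static int nth_bit(unsigned long mask, int k)
 *	{
 *		int i, seen = 0;
 *
 *		k %= __builtin_popcountl(mask);
 *		for (i = 0; i < 64; i++)
 *			if ((mask & (1UL << i)) && seen++ == k)
 *				return i;
 *		return -1;
 *	}
 *
 *	int main(void)
 *	{
 *		unsigned long old = 0x03;	// nodes 0,1
 *		unsigned long new = 0x0c;	// nodes 2,3
 *
 *		printf("node 0 -> %d\n", nth_bit(new, bit_index(old, 0))); // 2
 *		printf("node 1 -> %d\n", nth_bit(new, bit_index(old, 1))); // 3
 *		return 0;
 *	}
 */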
1452
1453/*
1454 * Someone moved this task to different nodes. Fixup mempolicies.
1455 *
1456 * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
1457 * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
1458 */
1459void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
1460{
1461 rebind_policy(current->mempolicy, old, new);
1462}