/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case node -1 here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
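
/*
 * Illustrative userspace sketch (not part of this file): how the policies
 * described above are typically requested through the syscalls implemented
 * below. The set_mempolicy()/mbind() wrappers and MPOL_* constants are the
 * libnuma <numaif.h> ones; the node numbers and sizes are assumptions made
 * only for this example.
 *
 *      unsigned long both = 0x3;               // nodes 0 and 1
 *      unsigned long node0 = 0x1;              // node 0 only
 *      size_t len = 1UL << 20;
 *
 *      // interleave all future allocations of this process over nodes 0-1
 *      set_mempolicy(MPOL_INTERLEAVE, &both, sizeof(both) * 8 + 1);
 *
 *      // bind one anonymous mapping to node 0
 *      void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *      mbind(p, len, MPOL_BIND, &node0, sizeof(node0) * 8 + 1,
 *            MPOL_MF_STRICT);
 */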

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
   could replace all the switch()es with a mempolicy_ops structure.
*/

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/mempolicy.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */

static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;

#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
int policy_zone = ZONE_DMA;

struct mempolicy default_policy = {
        .refcnt = ATOMIC_INIT(1), /* never free it */
        .policy = MPOL_DEFAULT,
};

/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, nodemask_t *nodes)
{
        int empty = nodes_empty(*nodes);

        switch (mode) {
        case MPOL_DEFAULT:
                if (!empty)
                        return -EINVAL;
                break;
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
                /* Preferred will only use the first bit, but allow
                   more for now. */
                if (empty)
                        return -EINVAL;
                break;
        }
        return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}
/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
        struct zonelist *zl;
        int num, max, nd;

        max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
        zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
        if (!zl)
                return NULL;
        num = 0;
        for_each_node_mask(nd, *nodes)
                zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
        zl->zones[num] = NULL;
        return zl;
}

/* Create a new policy */
static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
{
        struct mempolicy *policy;

        PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
        if (mode == MPOL_DEFAULT)
                return NULL;
        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
        if (!policy)
                return ERR_PTR(-ENOMEM);
        atomic_set(&policy->refcnt, 1);
        switch (mode) {
        case MPOL_INTERLEAVE:
                policy->v.nodes = *nodes;
                if (nodes_weight(*nodes) == 0) {
                        kmem_cache_free(policy_cache, policy);
                        return ERR_PTR(-EINVAL);
                }
                break;
        case MPOL_PREFERRED:
                policy->v.preferred_node = first_node(*nodes);
                if (policy->v.preferred_node >= MAX_NUMNODES)
                        policy->v.preferred_node = -1;
                break;
        case MPOL_BIND:
                policy->v.zonelist = bind_zonelist(nodes);
                if (policy->v.zonelist == NULL) {
                        kmem_cache_free(policy_cache, policy);
                        return ERR_PTR(-ENOMEM);
                }
                break;
        }
        policy->policy = mode;
        return policy;
}

/* Check if we are the only process mapping the page in question */
static inline int single_mm_mapping(struct mm_struct *mm,
                struct address_space *mapping)
{
        struct vm_area_struct *vma;
        struct prio_tree_iter iter;
        int rc = 1;

        spin_lock(&mapping->i_mmap_lock);
        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
                if (mm != vma->vm_mm) {
                        rc = 0;
                        goto out;
                }
        list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
                if (mm != vma->vm_mm) {
                        rc = 0;
                        goto out;
                }
out:
        spin_unlock(&mapping->i_mmap_lock);
        return rc;
}

/*
 * Add a page to be migrated to the pagelist
 */
static void migrate_page_add(struct vm_area_struct *vma,
        struct page *page, struct list_head *pagelist, unsigned long flags)
{
        /*
         * Avoid migrating a page that is shared by others and not writable.
         */
        if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
            mapping_writably_mapped(page->mapping) ||
            single_mm_mapping(vma->vm_mm, page->mapping)) {
                int rc = isolate_lru_page(page);

                if (rc == 1)
                        list_add(&page->lru, pagelist);
                /*
                 * If the isolate attempt was not successful then we just
                 * encountered an unswappable page. Something must be wrong.
                 */
                WARN_ON(rc == 0);
        }
}

static void gather_stats(struct page *, void *);

/* Scan through pages checking if pages follow certain conditions. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, unsigned long end,
                const nodemask_t *nodes, unsigned long flags,
                void *private)
{
        pte_t *orig_pte;
        pte_t *pte;
        spinlock_t *ptl;

        orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        do {
                struct page *page;
                unsigned int nid;

                if (!pte_present(*pte))
                        continue;
                page = vm_normal_page(vma, addr, *pte);
                if (!page)
                        continue;
                nid = page_to_nid(page);
                if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
                        continue;

                if (flags & MPOL_MF_STATS)
                        gather_stats(page, private);
                else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
                        spin_unlock(ptl);
                        migrate_page_add(vma, page, private, flags);
                        spin_lock(ptl);
                }
                else
                        break;
        } while (pte++, addr += PAGE_SIZE, addr != end);
        pte_unmap_unlock(orig_pte, ptl);
        return addr != end;
}

static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
                unsigned long addr, unsigned long end,
                const nodemask_t *nodes, unsigned long flags,
                void *private)
{
        pmd_t *pmd;
        unsigned long next;

        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                if (pmd_none_or_clear_bad(pmd))
                        continue;
                if (check_pte_range(vma, pmd, addr, next, nodes,
                                    flags, private))
                        return -EIO;
        } while (pmd++, addr = next, addr != end);
        return 0;
}

static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
                unsigned long addr, unsigned long end,
                const nodemask_t *nodes, unsigned long flags,
                void *private)
{
        pud_t *pud;
        unsigned long next;

        pud = pud_offset(pgd, addr);
        do {
                next = pud_addr_end(addr, end);
                if (pud_none_or_clear_bad(pud))
                        continue;
                if (check_pmd_range(vma, pud, addr, next, nodes,
                                    flags, private))
                        return -EIO;
        } while (pud++, addr = next, addr != end);
        return 0;
}

static inline int check_pgd_range(struct vm_area_struct *vma,
                unsigned long addr, unsigned long end,
                const nodemask_t *nodes, unsigned long flags,
                void *private)
{
        pgd_t *pgd;
        unsigned long next;

        pgd = pgd_offset(vma->vm_mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                if (check_pud_range(vma, pgd, addr, next, nodes,
                                    flags, private))
                        return -EIO;
        } while (pgd++, addr = next, addr != end);
        return 0;
}

/* Check if a vma is migratable */
static inline int vma_migratable(struct vm_area_struct *vma)
{
        if (vma->vm_flags & (
                VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP))
                return 0;
        return 1;
}

/*
 * Check if all pages in a range are on a set of nodes.
 * If pagelist != NULL then isolate pages from the LRU and
 * put them on the pagelist.
 */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
                const nodemask_t *nodes, unsigned long flags, void *private)
{
        int err;
        struct vm_area_struct *first, *vma, *prev;

        first = find_vma(mm, start);
        if (!first)
                return ERR_PTR(-EFAULT);
        prev = NULL;
        for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
                if (!(flags & MPOL_MF_DISCONTIG_OK)) {
                        if (!vma->vm_next && vma->vm_end < end)
                                return ERR_PTR(-EFAULT);
                        if (prev && prev->vm_end < vma->vm_start)
                                return ERR_PTR(-EFAULT);
                }
                if (!is_vm_hugetlb_page(vma) &&
                    ((flags & MPOL_MF_STRICT) ||
                     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
                                vma_migratable(vma)))) {
                        unsigned long endvma = vma->vm_end;

                        if (endvma > end)
                                endvma = end;
                        if (vma->vm_start > start)
                                start = vma->vm_start;
                        err = check_pgd_range(vma, start, endvma, nodes,
                                                flags, private);
                        if (err) {
                                first = ERR_PTR(err);
                                break;
                        }
                }
                prev = vma;
        }
        return first;
}

/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
        int err = 0;
        struct mempolicy *old = vma->vm_policy;

        PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
                 vma->vm_ops, vma->vm_file,
                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

        if (vma->vm_ops && vma->vm_ops->set_policy)
                err = vma->vm_ops->set_policy(vma, new);
        if (!err) {
                mpol_get(new);
                vma->vm_policy = new;
                mpol_free(old);
        }
        return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
                       unsigned long end, struct mempolicy *new)
{
        struct vm_area_struct *next;
        int err;

        err = 0;
        for (; vma && vma->vm_start < end; vma = next) {
                next = vma->vm_next;
                if (vma->vm_start < start)
                        err = split_vma(vma->vm_mm, vma, start, 1);
                if (!err && vma->vm_end > end)
                        err = split_vma(vma->vm_mm, vma, end, 0);
                if (!err)
                        err = policy_vma(vma, new);
                if (err)
                        break;
        }
        return err;
}

static int contextualize_policy(int mode, nodemask_t *nodes)
{
        if (!nodes)
                return 0;

        /* Update current mems_allowed */
        cpuset_update_current_mems_allowed();
        /* Ignore nodes not set in current->mems_allowed */
        cpuset_restrict_to_mems_allowed(nodes->bits);
        return mpol_check_policy(mode, nodes);
}

static int swap_pages(struct list_head *pagelist)
{
        LIST_HEAD(moved);
        LIST_HEAD(failed);
        int n;

        n = migrate_pages(pagelist, NULL, &moved, &failed);
        putback_lru_pages(&failed);
        putback_lru_pages(&moved);

        return n;
}

long do_mbind(unsigned long start, unsigned long len,
                unsigned long mode, nodemask_t *nmask, unsigned long flags)
{
        struct vm_area_struct *vma;
        struct mm_struct *mm = current->mm;
        struct mempolicy *new;
        unsigned long end;
        int err;
        LIST_HEAD(pagelist);

        if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
                                      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
            || mode > MPOL_MAX)
                return -EINVAL;
        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
                return -EPERM;

        if (start & ~PAGE_MASK)
                return -EINVAL;

        if (mode == MPOL_DEFAULT)
                flags &= ~MPOL_MF_STRICT;

        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
        end = start + len;

        if (end < start)
                return -EINVAL;
        if (end == start)
                return 0;

        if (mpol_check_policy(mode, nmask))
                return -EINVAL;

        new = mpol_new(mode, nmask);
        if (IS_ERR(new))
                return PTR_ERR(new);

        /*
         * If we are using the default policy then operation
         * on discontinuous address spaces is okay after all
         */
        if (!new)
                flags |= MPOL_MF_DISCONTIG_OK;

        PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
                        mode,nodes_addr(nodes)[0]);

        down_write(&mm->mmap_sem);
        vma = check_range(mm, start, end, nmask,
                          flags | MPOL_MF_INVERT, &pagelist);

        err = PTR_ERR(vma);
        if (!IS_ERR(vma)) {
                int nr_failed = 0;

                err = mbind_range(vma, start, end, new);
                if (!list_empty(&pagelist))
                        nr_failed = swap_pages(&pagelist);

                if (!err && nr_failed && (flags & MPOL_MF_STRICT))
                        err = -EIO;
        }
        if (!list_empty(&pagelist))
                putback_lru_pages(&pagelist);

        up_write(&mm->mmap_sem);
        mpol_free(new);
        return err;
}

/* Set the process memory policy */
long do_set_mempolicy(int mode, nodemask_t *nodes)
{
        struct mempolicy *new;

        if (contextualize_policy(mode, nodes))
                return -EINVAL;
        new = mpol_new(mode, nodes);
        if (IS_ERR(new))
                return PTR_ERR(new);
        mpol_free(current->mempolicy);
        current->mempolicy = new;
        if (new && new->policy == MPOL_INTERLEAVE)
                current->il_next = first_node(new->v.nodes);
        return 0;
}

/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
{
        int i;

        nodes_clear(*nodes);
        switch (p->policy) {
        case MPOL_BIND:
                for (i = 0; p->v.zonelist->zones[i]; i++)
                        node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
                                *nodes);
                break;
        case MPOL_DEFAULT:
                break;
        case MPOL_INTERLEAVE:
                *nodes = p->v.nodes;
                break;
        case MPOL_PREFERRED:
                /* or use current node instead of online map? */
                if (p->v.preferred_node < 0)
                        *nodes = node_online_map;
                else
                        node_set(p->v.preferred_node, *nodes);
                break;
        default:
                BUG();
        }
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
        struct page *p;
        int err;

        err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
        if (err >= 0) {
                err = page_to_nid(p);
                put_page(p);
        }
        return err;
}

/* Retrieve NUMA policy */
long do_get_mempolicy(int *policy, nodemask_t *nmask,
                        unsigned long addr, unsigned long flags)
{
        int err;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = NULL;
        struct mempolicy *pol = current->mempolicy;

        cpuset_update_current_mems_allowed();
        if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
                return -EINVAL;
        if (flags & MPOL_F_ADDR) {
                down_read(&mm->mmap_sem);
                vma = find_vma_intersection(mm, addr, addr+1);
                if (!vma) {
                        up_read(&mm->mmap_sem);
                        return -EFAULT;
                }
                if (vma->vm_ops && vma->vm_ops->get_policy)
                        pol = vma->vm_ops->get_policy(vma, addr);
                else
                        pol = vma->vm_policy;
        } else if (addr)
                return -EINVAL;

        if (!pol)
                pol = &default_policy;

        if (flags & MPOL_F_NODE) {
                if (flags & MPOL_F_ADDR) {
                        err = lookup_node(mm, addr);
                        if (err < 0)
                                goto out;
                        *policy = err;
                } else if (pol == current->mempolicy &&
                                pol->policy == MPOL_INTERLEAVE) {
                        *policy = current->il_next;
                } else {
                        err = -EINVAL;
                        goto out;
                }
        } else
                *policy = pol->policy;

        if (vma) {
                up_read(&current->mm->mmap_sem);
                vma = NULL;
        }

        err = 0;
        if (nmask)
                get_zonemask(pol, nmask);

 out:
        if (vma)
                up_read(&current->mm->mmap_sem);
        return err;
}

/*
 * For now migrate_pages simply swaps out the pages from nodes that are in
 * the source set but not in the target set. In the future, we would
 * want a function that moves pages between the two nodesets in such
 * a way as to preserve the physical layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm,
        const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
        LIST_HEAD(pagelist);
        int count = 0;
        nodemask_t nodes;

        nodes_andnot(nodes, *from_nodes, *to_nodes);

        down_read(&mm->mmap_sem);
        check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);

        if (!list_empty(&pagelist)) {
                count = swap_pages(&pagelist);
                putback_lru_pages(&pagelist);
        }

        up_read(&mm->mmap_sem);
        return count;
}

/*
 * User space interface with variable sized bitmaps for nodelists.
 */
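
/*
 * Worked example of the bitmap contract enforced by get_nodes() below
 * (illustrative only; the numbers are assumptions): a caller passing
 * maxnode == 4 supplies at least one unsigned long. After the --maxnode,
 * three bits are significant and endmask == 0x7, so bits 0-2 select nodes;
 * higher bits in the last supplied word are masked off, and any extra words
 * beyond what MAX_NUMNODES supports must be all zero or the call fails
 * with -EINVAL.
 */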

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
                     unsigned long maxnode)
{
        unsigned long k;
        unsigned long nlongs;
        unsigned long endmask;

        --maxnode;
        nodes_clear(*nodes);
        if (maxnode == 0 || !nmask)
                return 0;

        nlongs = BITS_TO_LONGS(maxnode);
        if ((maxnode % BITS_PER_LONG) == 0)
                endmask = ~0UL;
        else
                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

        /* When the user specified more nodes than supported just check
           if the non supported part is all zero. */
        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
                if (nlongs > PAGE_SIZE/sizeof(long))
                        return -EINVAL;
                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
                        unsigned long t;
                        if (get_user(t, nmask + k))
                                return -EFAULT;
                        if (k == nlongs - 1) {
                                if (t & endmask)
                                        return -EINVAL;
                        } else if (t)
                                return -EINVAL;
                }
                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
                endmask = ~0UL;
        }

        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
                return -EFAULT;
        nodes_addr(*nodes)[nlongs-1] &= endmask;
        return 0;
}

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
                              nodemask_t *nodes)
{
        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
        const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

        if (copy > nbytes) {
                if (copy > PAGE_SIZE)
                        return -EINVAL;
                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
                        return -EFAULT;
                copy = nbytes;
        }
        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}

asmlinkage long sys_mbind(unsigned long start, unsigned long len,
                        unsigned long mode,
                        unsigned long __user *nmask, unsigned long maxnode,
                        unsigned flags)
{
        nodemask_t nodes;
        int err;

        err = get_nodes(&nodes, nmask, maxnode);
        if (err)
                return err;
        return do_mbind(start, len, mode, &nodes, flags);
}

/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
                unsigned long maxnode)
{
        int err;
        nodemask_t nodes;

        if (mode < 0 || mode > MPOL_MAX)
                return -EINVAL;
        err = get_nodes(&nodes, nmask, maxnode);
        if (err)
                return err;
        return do_set_mempolicy(mode, &nodes);
}

/* Macro needed until Paul implements this function in kernel/cpusets.c */
#define cpuset_mems_allowed(task) node_online_map

asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
                const unsigned long __user *old_nodes,
                const unsigned long __user *new_nodes)
{
        struct mm_struct *mm;
        struct task_struct *task;
        nodemask_t old;
        nodemask_t new;
        nodemask_t task_nodes;
        int err;

        err = get_nodes(&old, old_nodes, maxnode);
        if (err)
                return err;

        err = get_nodes(&new, new_nodes, maxnode);
        if (err)
                return err;

        /* Find the mm_struct */
        read_lock(&tasklist_lock);
        task = pid ? find_task_by_pid(pid) : current;
        if (!task) {
                read_unlock(&tasklist_lock);
                return -ESRCH;
        }
        mm = get_task_mm(task);
        read_unlock(&tasklist_lock);

        if (!mm)
                return -EINVAL;

        /*
         * Check if this process has the right to modify the specified
         * process. The right exists if the process has administrative
         * capabilities, superuser privileges or the same
         * userid as the target process.
         */
        if ((current->euid != task->suid) && (current->euid != task->uid) &&
            (current->uid != task->suid) && (current->uid != task->uid) &&
            !capable(CAP_SYS_ADMIN)) {
                err = -EPERM;
                goto out;
        }

        task_nodes = cpuset_mems_allowed(task);
        /* Is the user allowed to access the target nodes? */
        if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
                err = -EPERM;
                goto out;
        }

        err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
out:
        mmput(mm);
        return err;
}


/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
                                unsigned long __user *nmask,
                                unsigned long maxnode,
                                unsigned long addr, unsigned long flags)
{
        int err, pval;
        nodemask_t nodes;

        if (nmask != NULL && maxnode < MAX_NUMNODES)
                return -EINVAL;

        err = do_get_mempolicy(&pval, &nodes, addr, flags);

        if (err)
                return err;

        if (policy && put_user(pval, policy))
                return -EFAULT;

        if (nmask)
                err = copy_nodes_to_user(nmask, maxnode, &nodes);

        return err;
}

#ifdef CONFIG_COMPAT

asmlinkage long compat_sys_get_mempolicy(int __user *policy,
                                     compat_ulong_t __user *nmask,
                                     compat_ulong_t maxnode,
                                     compat_ulong_t addr, compat_ulong_t flags)
{
        long err;
        unsigned long __user *nm = NULL;
        unsigned long nr_bits, alloc_size;
        DECLARE_BITMAP(bm, MAX_NUMNODES);

        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

        if (nmask)
                nm = compat_alloc_user_space(alloc_size);

        err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

        if (!err && nmask) {
                err = copy_from_user(bm, nm, alloc_size);
                /* ensure entire bitmap is zeroed */
                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
                err |= compat_put_bitmap(nmask, bm, nr_bits);
        }

        return err;
}

asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
                                     compat_ulong_t maxnode)
{
        long err = 0;
        unsigned long __user *nm = NULL;
        unsigned long nr_bits, alloc_size;
        DECLARE_BITMAP(bm, MAX_NUMNODES);

        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

        if (nmask) {
                err = compat_get_bitmap(bm, nmask, nr_bits);
                nm = compat_alloc_user_space(alloc_size);
                err |= copy_to_user(nm, bm, alloc_size);
        }

        if (err)
                return -EFAULT;

        return sys_set_mempolicy(mode, nm, nr_bits+1);
}

asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
                             compat_ulong_t mode, compat_ulong_t __user *nmask,
                             compat_ulong_t maxnode, compat_ulong_t flags)
{
        long err = 0;
        unsigned long __user *nm = NULL;
        unsigned long nr_bits, alloc_size;
        nodemask_t bm;

        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

        if (nmask) {
                err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
                nm = compat_alloc_user_space(alloc_size);
                err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
        }

        if (err)
                return -EFAULT;

        return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif

/* Return effective policy for a VMA */
static struct mempolicy * get_vma_policy(struct task_struct *task,
                struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = task->mempolicy;

        if (vma) {
                if (vma->vm_ops && vma->vm_ops->get_policy)
                        pol = vma->vm_ops->get_policy(vma, addr);
                else if (vma->vm_policy &&
                                vma->vm_policy->policy != MPOL_DEFAULT)
                        pol = vma->vm_policy;
        }
        if (!pol)
                pol = &default_policy;
        return pol;
}

/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
{
        int nd;

        switch (policy->policy) {
        case MPOL_PREFERRED:
                nd = policy->v.preferred_node;
                if (nd < 0)
                        nd = numa_node_id();
                break;
        case MPOL_BIND:
                /* Lower zones don't get a policy applied */
                /* Careful: current->mems_allowed might have moved */
                if (gfp_zone(gfp) >= policy_zone)
                        if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
                                return policy->v.zonelist;
                /*FALL THROUGH*/
        case MPOL_INTERLEAVE: /* should not happen */
        case MPOL_DEFAULT:
                nd = numa_node_id();
                break;
        default:
                nd = 0;
                BUG();
        }
        return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
}

/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
        unsigned nid, next;
        struct task_struct *me = current;

        nid = me->il_next;
        next = next_node(nid, policy->v.nodes);
        if (next >= MAX_NUMNODES)
                next = first_node(policy->v.nodes);
        me->il_next = next;
        return nid;
}

/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
                struct vm_area_struct *vma, unsigned long off)
{
        unsigned nnodes = nodes_weight(pol->v.nodes);
        unsigned target = (unsigned)off % nnodes;
        int c;
        int nid = -1;

        c = 0;
        do {
                nid = next_node(nid, pol->v.nodes);
                c++;
        } while (c <= target);
        return nid;
}

/* Determine a node number for interleave */
static inline unsigned interleave_nid(struct mempolicy *pol,
                 struct vm_area_struct *vma, unsigned long addr, int shift)
{
        if (vma) {
                unsigned long off;

                off = vma->vm_pgoff;
                off += (addr - vma->vm_start) >> shift;
                return offset_il_node(pol, vma, off);
        } else
                return interleave_nodes(pol);
}

/* Return a zonelist suitable for a huge page allocation. */
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = get_vma_policy(current, vma, addr);

        if (pol->policy == MPOL_INTERLEAVE) {
                unsigned nid;

                nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
                return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
        }
        return zonelist_policy(GFP_HIGHUSER, pol);
}

/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
                                        unsigned nid)
{
        struct zonelist *zl;
        struct page *page;

        zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
        page = __alloc_pages(gfp, order, zl);
        if (page && page_zone(page) == zl->zones[0]) {
                zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
                put_cpu();
        }
        return page;
}

/**
 *      alloc_page_vma - Allocate a page for a VMA.
 *
 *      @gfp:
 *      %GFP_USER    user allocation.
 *      %GFP_KERNEL  kernel allocations,
 *      %GFP_HIGHMEM highmem/user allocations,
 *      %GFP_FS      allocation should not call back into a file system.
 *      %GFP_ATOMIC  don't sleep.
 *
 *      @vma:  Pointer to VMA or NULL if not available.
 *      @addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 *      This function allocates a page from the kernel page pool and applies
 *      a NUMA policy associated with the VMA or the current process.
 *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
 *      mm_struct of the VMA to prevent it from going away. Should be used for
 *      all allocations for pages that will be mapped into
 *      user space. Returns NULL when no page can be allocated.
 *
 *      Should be called with the mm_sem of the vma held.
 */
struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = get_vma_policy(current, vma, addr);

        cpuset_update_current_mems_allowed();

        if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
                unsigned nid;

                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
                return alloc_page_interleave(gfp, 0, nid);
        }
        return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}
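
/*
 * Illustrative in-kernel caller sketch for alloc_page_vma() (an assumption
 * modeled on the anonymous fault path, not a verbatim excerpt): the fault
 * handler already holds mmap_sem for read, so the VMA and its policy are
 * stable for the duration of the allocation.
 *
 *      page = alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, address);
 *      if (!page)
 *              return VM_FAULT_OOM;
 */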

/**
 *      alloc_pages_current - Allocate pages.
 *
 *      @gfp:
 *      %GFP_USER   user allocation,
 *      %GFP_KERNEL kernel allocation,
 *      %GFP_HIGHMEM highmem allocation,
 *      %GFP_FS     don't call back into a file system.
 *      %GFP_ATOMIC don't sleep.
 *      @order: Power of two of allocation size in pages. 0 is a single page.
 *
 *      Allocate a page from the kernel page pool. When not in
 *      interrupt context, apply the current process' NUMA policy.
 *      Returns NULL when no page can be allocated.
 *
 *      Don't call cpuset_update_current_mems_allowed() unless
 *      1) it's ok to take cpuset_sem (can WAIT), and
 *      2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
        struct mempolicy *pol = current->mempolicy;

        if ((gfp & __GFP_WAIT) && !in_interrupt())
                cpuset_update_current_mems_allowed();
        if (!pol || in_interrupt())
                pol = &default_policy;
        if (pol->policy == MPOL_INTERLEAVE)
                return alloc_page_interleave(gfp, order, interleave_nodes(pol));
        return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);

/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

        if (!new)
                return ERR_PTR(-ENOMEM);
        *new = *old;
        atomic_set(&new->refcnt, 1);
        if (new->policy == MPOL_BIND) {
                int sz = ksize(old->v.zonelist);
                new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
                if (!new->v.zonelist) {
                        kmem_cache_free(policy_cache, new);
                        return ERR_PTR(-ENOMEM);
                }
                memcpy(new->v.zonelist, old->v.zonelist, sz);
        }
        return new;
}

/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
        if (!a || !b)
                return 0;
        if (a->policy != b->policy)
                return 0;
        switch (a->policy) {
        case MPOL_DEFAULT:
                return 1;
        case MPOL_INTERLEAVE:
                return nodes_equal(a->v.nodes, b->v.nodes);
        case MPOL_PREFERRED:
                return a->v.preferred_node == b->v.preferred_node;
        case MPOL_BIND: {
                int i;
                for (i = 0; a->v.zonelist->zones[i]; i++)
                        if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
                                return 0;
                return b->v.zonelist->zones[i] == NULL;
        }
        default:
                BUG();
                return 0;
        }
}

/* Slow path of a mpol destructor. */
void __mpol_free(struct mempolicy *p)
{
        if (!atomic_dec_and_test(&p->refcnt))
                return;
        if (p->policy == MPOL_BIND)
                kfree(p->v.zonelist);
        p->policy = MPOL_DEFAULT;
        kmem_cache_free(policy_cache, p);
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */

/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
        struct rb_node *n = sp->root.rb_node;

        while (n) {
                struct sp_node *p = rb_entry(n, struct sp_node, nd);

                if (start >= p->end)
                        n = n->rb_right;
                else if (end <= p->start)
                        n = n->rb_left;
                else
                        break;
        }
        if (!n)
                return NULL;
        for (;;) {
                struct sp_node *w = NULL;
                struct rb_node *prev = rb_prev(n);
                if (!prev)
                        break;
                w = rb_entry(prev, struct sp_node, nd);
                if (w->end <= start)
                        break;
                n = prev;
        }
        return rb_entry(n, struct sp_node, nd);
}

/* Insert a new shared policy into the list. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
        struct rb_node **p = &sp->root.rb_node;
        struct rb_node *parent = NULL;
        struct sp_node *nd;

        while (*p) {
                parent = *p;
                nd = rb_entry(parent, struct sp_node, nd);
                if (new->start < nd->start)
                        p = &(*p)->rb_left;
                else if (new->end > nd->end)
                        p = &(*p)->rb_right;
                else
                        BUG();
        }
        rb_link_node(&new->nd, parent, p);
        rb_insert_color(&new->nd, &sp->root);
        PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
                 new->policy ? new->policy->policy : 0);
}

/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
        struct mempolicy *pol = NULL;
        struct sp_node *sn;

        if (!sp->root.rb_node)
                return NULL;
        spin_lock(&sp->lock);
        sn = sp_lookup(sp, idx, idx+1);
        if (sn) {
                mpol_get(sn->policy);
                pol = sn->policy;
        }
        spin_unlock(&sp->lock);
        return pol;
}

static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
        PDprintk("deleting %lx-%lx\n", n->start, n->end);
        rb_erase(&n->nd, &sp->root);
        mpol_free(n->policy);
        kmem_cache_free(sn_cache, n);
}

struct sp_node *
sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
{
        struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

        if (!n)
                return NULL;
        n->start = start;
        n->end = end;
        mpol_get(pol);
        n->policy = pol;
        return n;
}

/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
                                 unsigned long end, struct sp_node *new)
{
        struct sp_node *n, *new2 = NULL;

restart:
        spin_lock(&sp->lock);
        n = sp_lookup(sp, start, end);
        /* Take care of old policies in the same range. */
        while (n && n->start < end) {
                struct rb_node *next = rb_next(&n->nd);
                if (n->start >= start) {
                        if (n->end <= end)
                                sp_delete(sp, n);
                        else
                                n->start = end;
                } else {
                        /* Old policy spanning whole new range. */
                        if (n->end > end) {
                                if (!new2) {
                                        spin_unlock(&sp->lock);
                                        new2 = sp_alloc(end, n->end, n->policy);
                                        if (!new2)
                                                return -ENOMEM;
                                        goto restart;
                                }
                                n->end = start;
                                sp_insert(sp, new2);
                                new2 = NULL;
                                break;
                        } else
                                n->end = start;
                }
                if (!next)
                        break;
                n = rb_entry(next, struct sp_node, nd);
        }
        if (new)
                sp_insert(sp, new);
        spin_unlock(&sp->lock);
        if (new2) {
                mpol_free(new2->policy);
                kmem_cache_free(sn_cache, new2);
        }
        return 0;
}

int mpol_set_shared_policy(struct shared_policy *info,
                        struct vm_area_struct *vma, struct mempolicy *npol)
{
        int err;
        struct sp_node *new = NULL;
        unsigned long sz = vma_pages(vma);

        PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
                 vma->vm_pgoff,
                 sz, npol? npol->policy : -1,
                 npol ? nodes_addr(npol->v.nodes)[0] : -1);

        if (npol) {
                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
                if (!new)
                        return -ENOMEM;
        }
        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
        if (err && new)
                kmem_cache_free(sn_cache, new);
        return err;
}

/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
        struct sp_node *n;
        struct rb_node *next;

        if (!p->root.rb_node)
                return;
        spin_lock(&p->lock);
        next = rb_first(&p->root);
        while (next) {
                n = rb_entry(next, struct sp_node, nd);
                next = rb_next(&n->nd);
                rb_erase(&n->nd, &p->root);
                mpol_free(n->policy);
                kmem_cache_free(sn_cache, n);
        }
        spin_unlock(&p->lock);
}

/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
        policy_cache = kmem_cache_create("numa_policy",
                                         sizeof(struct mempolicy),
                                         0, SLAB_PANIC, NULL, NULL);

        sn_cache = kmem_cache_create("shared_policy_node",
                                     sizeof(struct sp_node),
                                     0, SLAB_PANIC, NULL, NULL);

        /* Set interleaving policy for system init. This way not all
           the data structures allocated at system boot end up in node zero. */

        if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
                printk("numa_policy_init: interleaving failed\n");
}

/* Reset policy of current process to default */
void numa_default_policy(void)
{
        do_set_mempolicy(MPOL_DEFAULT, NULL);
}

/* Migrate a policy to a different set of nodes */
static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
                          const nodemask_t *new)
{
        nodemask_t tmp;

        if (!pol)
                return;

        switch (pol->policy) {
        case MPOL_DEFAULT:
                break;
        case MPOL_INTERLEAVE:
                nodes_remap(tmp, pol->v.nodes, *old, *new);
                pol->v.nodes = tmp;
                current->il_next = node_remap(current->il_next, *old, *new);
                break;
        case MPOL_PREFERRED:
                pol->v.preferred_node = node_remap(pol->v.preferred_node,
                                                *old, *new);
                break;
        case MPOL_BIND: {
                nodemask_t nodes;
                struct zone **z;
                struct zonelist *zonelist;

                nodes_clear(nodes);
                for (z = pol->v.zonelist->zones; *z; z++)
                        node_set((*z)->zone_pgdat->node_id, nodes);
                nodes_remap(tmp, nodes, *old, *new);
                nodes = tmp;

                zonelist = bind_zonelist(&nodes);

                /* If no mem, then zonelist is NULL and we keep old zonelist.
                 * If that old zonelist has no remaining mems_allowed nodes,
                 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
                 */

                if (zonelist) {
                        /* Good - got mem - substitute new zonelist */
                        kfree(pol->v.zonelist);
                        pol->v.zonelist = zonelist;
                }
                break;
        }
        default:
                BUG();
                break;
        }
}

/*
 * Someone moved this task to different nodes. Fixup mempolicies.
 *
 * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
 * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
 */
void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
{
        rebind_policy(current->mempolicy, old, new);
}

/*
 * Display pages allocated per node and memory policy via /proc.
 */

static const char *policy_types[] = { "default", "prefer", "bind",
                                      "interleave" };

/*
 * Convert a mempolicy into a string.
 * Returns the number of characters in buffer (if positive)
 * or an error (negative)
 */
static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
        char *p = buffer;
        int l;
        nodemask_t nodes;
        int mode = pol ? pol->policy : MPOL_DEFAULT;

        switch (mode) {
        case MPOL_DEFAULT:
                nodes_clear(nodes);
                break;

        case MPOL_PREFERRED:
                nodes_clear(nodes);
                node_set(pol->v.preferred_node, nodes);
                break;

        case MPOL_BIND:
                get_zonemask(pol, &nodes);
                break;

        case MPOL_INTERLEAVE:
                nodes = pol->v.nodes;
                break;

        default:
                BUG();
                return -EFAULT;
        }

        l = strlen(policy_types[mode]);
        if (buffer + maxlen < p + l + 1)
                return -ENOSPC;

        strcpy(p, policy_types[mode]);
        p += l;

        if (!nodes_empty(nodes)) {
                if (buffer + maxlen < p + 2)
                        return -ENOSPC;
                *p++ = '=';
                p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
        }
        return p - buffer;
}

struct numa_maps {
        unsigned long pages;
        unsigned long anon;
        unsigned long mapped;
        unsigned long mapcount_max;
        unsigned long node[MAX_NUMNODES];
};

static void gather_stats(struct page *page, void *private)
{
        struct numa_maps *md = private;
        int count = page_mapcount(page);

        if (count)
                md->mapped++;

        if (count > md->mapcount_max)
                md->mapcount_max = count;

        md->pages++;

        if (PageAnon(page))
                md->anon++;

        md->node[page_to_nid(page)]++;
        cond_resched();
}

int show_numa_map(struct seq_file *m, void *v)
{
        struct task_struct *task = m->private;
        struct vm_area_struct *vma = v;
        struct numa_maps *md;
        int n;
        char buffer[50];

        if (!vma->vm_mm)
                return 0;

        md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
        if (!md)
                return 0;

        check_pgd_range(vma, vma->vm_start, vma->vm_end,
                    &node_online_map, MPOL_MF_STATS, md);

        if (md->pages) {
                mpol_to_str(buffer, sizeof(buffer),
                            get_vma_policy(task, vma, vma->vm_start));

                seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
                        vma->vm_start, buffer, md->pages,
                        md->mapped, md->mapcount_max);

                if (md->anon)
                        seq_printf(m," anon=%lu",md->anon);

                for_each_online_node(n)
                        if (md->node[n])
                                seq_printf(m, " N%d=%lu", n, md->node[n]);

                seq_putc(m, '\n');
        }
        kfree(md);

        if (m->count < m->size)
                m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
        return 0;
}