/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case node -1 here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

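/*
 * Illustrative userspace sketch (assumption, not part of this file): a task
 * can request interleaving over nodes 0 and 1 with the set_mempolicy()
 * syscall, while mbind() applies a policy to just an address range:
 *
 *	unsigned long mask = 0x3;	(selects nodes 0 and 1)
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8);
 *	mbind(addr, len, MPOL_BIND, &mask, sizeof(mask) * 8, MPOL_MF_STRICT);
 */
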
/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
   could replace all the switch()es with a mempolicy_ops structure.
*/

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */

static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;

#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
int policy_zone = ZONE_DMA;

struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.policy = MPOL_DEFAULT,
};

/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, nodemask_t *nodes)
{
	int empty = nodes_empty(*nodes);

	switch (mode) {
	case MPOL_DEFAULT:
		if (!empty)
			return -EINVAL;
		break;
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
		/* Preferred will only use the first bit, but allow
		   more for now. */
		if (empty)
			return -EINVAL;
		break;
	}
	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}
/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
	struct zonelist *zl;
	int num, max, nd;

	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
	if (!zl)
		return NULL;
	num = 0;
	for_each_node_mask(nd, *nodes)
		zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
	zl->zones[num] = NULL;
	return zl;
}

/* Create a new policy */
static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
{
	struct mempolicy *policy;

	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
	if (mode == MPOL_DEFAULT)
		return NULL;
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	switch (mode) {
	case MPOL_INTERLEAVE:
		policy->v.nodes = *nodes;
		if (nodes_weight(*nodes) == 0) {
			kmem_cache_free(policy_cache, policy);
			return ERR_PTR(-EINVAL);
		}
		break;
	case MPOL_PREFERRED:
		policy->v.preferred_node = first_node(*nodes);
		if (policy->v.preferred_node >= MAX_NUMNODES)
			policy->v.preferred_node = -1;
		break;
	case MPOL_BIND:
		policy->v.zonelist = bind_zonelist(nodes);
		if (policy->v.zonelist == NULL) {
			kmem_cache_free(policy_cache, policy);
			return ERR_PTR(-ENOMEM);
		}
		break;
	}
	policy->policy = mode;
	return policy;
}

static void gather_stats(struct page *, void *);
static void migrate_page_add(struct vm_area_struct *vma,
	struct page *page, struct list_head *pagelist, unsigned long flags);

/* Scan through pages checking if pages follow certain conditions. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pte_t *orig_pte;
	pte_t *pte;
	spinlock_t *ptl;

	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		struct page *page;
		unsigned int nid;

		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		nid = page_to_nid(page);
		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
			continue;

		if (flags & MPOL_MF_STATS)
			gather_stats(page, private);
		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
			spin_unlock(ptl);
			migrate_page_add(vma, page, private, flags);
			spin_lock(ptl);
		}
		else
			break;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(orig_pte, ptl);
	return addr != end;
}

static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		if (check_pte_range(vma, pmd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		if (check_pmd_range(vma, pud, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int check_pgd_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pgd_t *pgd;
	unsigned long next;

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		if (check_pud_range(vma, pgd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pgd++, addr = next, addr != end);
	return 0;
}

/* Check if a vma is migratable */
static inline int vma_migratable(struct vm_area_struct *vma)
{
	if (vma->vm_flags & (
		VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP))
		return 0;
	return 1;
}

/*
 * Check if all pages in a range are on a set of nodes.
 * If pagelist != NULL then isolate pages from the LRU and
 * put them on the pagelist.
 */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		const nodemask_t *nodes, unsigned long flags, void *private)
{
	int err;
	struct vm_area_struct *first, *vma, *prev;

	first = find_vma(mm, start);
	if (!first)
		return ERR_PTR(-EFAULT);
	prev = NULL;
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
			if (!vma->vm_next && vma->vm_end < end)
				return ERR_PTR(-EFAULT);
			if (prev && prev->vm_end < vma->vm_start)
				return ERR_PTR(-EFAULT);
		}
		if (!is_vm_hugetlb_page(vma) &&
		    ((flags & MPOL_MF_STRICT) ||
		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
				vma_migratable(vma)))) {
			unsigned long endvma = vma->vm_end;

			if (endvma > end)
				endvma = end;
			if (vma->vm_start > start)
				start = vma->vm_start;
			err = check_pgd_range(vma, start, endvma, nodes,
						flags, private);
			if (err) {
				first = ERR_PTR(err);
				break;
			}
		}
		prev = vma;
	}
	return first;
}

/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
	int err = 0;
	struct mempolicy *old = vma->vm_policy;

	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	if (vma->vm_ops && vma->vm_ops->set_policy)
		err = vma->vm_ops->set_policy(vma, new);
	if (!err) {
		mpol_get(new);
		vma->vm_policy = new;
		mpol_free(old);
	}
	return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, struct mempolicy *new)
{
	struct vm_area_struct *next;
	int err;

	err = 0;
	for (; vma && vma->vm_start < end; vma = next) {
		next = vma->vm_next;
		if (vma->vm_start < start)
			err = split_vma(vma->vm_mm, vma, start, 1);
		if (!err && vma->vm_end > end)
			err = split_vma(vma->vm_mm, vma, end, 0);
		if (!err)
			err = policy_vma(vma, new);
		if (err)
			break;
	}
	return err;
}

static int contextualize_policy(int mode, nodemask_t *nodes)
{
	if (!nodes)
		return 0;

	/* Update current mems_allowed */
	cpuset_update_current_mems_allowed();
	/* Ignore nodes not set in current->mems_allowed */
	cpuset_restrict_to_mems_allowed(nodes->bits);
	return mpol_check_policy(mode, nodes);
}

/* Set the process memory policy */
long do_set_mempolicy(int mode, nodemask_t *nodes)
{
	struct mempolicy *new;

	if (contextualize_policy(mode, nodes))
		return -EINVAL;
	new = mpol_new(mode, nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);
	mpol_free(current->mempolicy);
	current->mempolicy = new;
	if (new && new->policy == MPOL_INTERLEAVE)
		current->il_next = first_node(new->v.nodes);
	return 0;
}

/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
{
	int i;

	nodes_clear(*nodes);
	switch (p->policy) {
	case MPOL_BIND:
		for (i = 0; p->v.zonelist->zones[i]; i++)
			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
				*nodes);
		break;
	case MPOL_DEFAULT:
		break;
	case MPOL_INTERLEAVE:
		*nodes = p->v.nodes;
		break;
	case MPOL_PREFERRED:
		/* or use current node instead of online map? */
		if (p->v.preferred_node < 0)
			*nodes = node_online_map;
		else
			node_set(p->v.preferred_node, *nodes);
		break;
	default:
		BUG();
	}
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}

/* Retrieve NUMA policy */
long do_get_mempolicy(int *policy, nodemask_t *nmask,
			unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	cpuset_update_current_mems_allowed();
	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
		return -EINVAL;
	if (flags & MPOL_F_ADDR) {
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->policy == MPOL_INTERLEAVE) {
			*policy = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else
		*policy = pol->policy;

	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	err = 0;
	if (nmask)
		get_zonemask(pol, nmask);

 out:
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}

/*
 * page migration
 */

/* Check if we are the only process mapping the page in question */
static inline int single_mm_mapping(struct mm_struct *mm,
			struct address_space *mapping)
{
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	int rc = 1;

	spin_lock(&mapping->i_mmap_lock);
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
		if (mm != vma->vm_mm) {
			rc = 0;
			goto out;
		}
	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
		if (mm != vma->vm_mm) {
			rc = 0;
			goto out;
		}
out:
	spin_unlock(&mapping->i_mmap_lock);
	return rc;
}

/*
 * Add a page to be migrated to the pagelist
 */
static void migrate_page_add(struct vm_area_struct *vma,
	struct page *page, struct list_head *pagelist, unsigned long flags)
{
	/*
	 * Avoid migrating a page that is shared by others and not writable.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
	    mapping_writably_mapped(page->mapping) ||
	    single_mm_mapping(vma->vm_mm, page->mapping)) {
		int rc = isolate_lru_page(page);

		if (rc == 1)
			list_add(&page->lru, pagelist);
		/*
		 * If the isolate attempt was not successful then we just
		 * encountered an unswappable page. Something must be wrong.
		 */
		WARN_ON(rc == 0);
	}
}

static int swap_pages(struct list_head *pagelist)
{
	LIST_HEAD(moved);
	LIST_HEAD(failed);
	int n;

	n = migrate_pages(pagelist, NULL, &moved, &failed);
	putback_lru_pages(&failed);
	putback_lru_pages(&moved);

	return n;
}

/*
 * For now migrate_pages simply swaps out the pages from nodes that are in
 * the source set but not in the target set. In the future, we would
 * want a function that moves pages between the two nodesets in such
 * a way as to preserve the physical layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm,
	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
	LIST_HEAD(pagelist);
	int count = 0;
	nodemask_t nodes;

	nodes_andnot(nodes, *from_nodes, *to_nodes);

	down_read(&mm->mmap_sem);
	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
			flags | MPOL_MF_DISCONTIG_OK, &pagelist);

	if (!list_empty(&pagelist)) {
		count = swap_pages(&pagelist);
		putback_lru_pages(&pagelist);
	}

	up_read(&mm->mmap_sem);
	return count;
}

long do_mbind(unsigned long start, unsigned long len,
		unsigned long mode, nodemask_t *nmask, unsigned long flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int err;
	LIST_HEAD(pagelist);

	if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
				      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
	    || mode > MPOL_MAX)
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	if (mpol_check_policy(mode, nmask))
		return -EINVAL;

	new = mpol_new(mode, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
		 mode, nodes_addr(*nmask)[0]);

	down_write(&mm->mmap_sem);
	vma = check_range(mm, start, end, nmask,
			  flags | MPOL_MF_INVERT, &pagelist);

	err = PTR_ERR(vma);
	if (!IS_ERR(vma)) {
		int nr_failed = 0;

		err = mbind_range(vma, start, end, new);
		if (!list_empty(&pagelist))
			nr_failed = swap_pages(&pagelist);

		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
			err = -EIO;
	}
	if (!list_empty(&pagelist))
		putback_lru_pages(&pagelist);

	up_write(&mm->mmap_sem);
	mpol_free(new);
	return err;
}

/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	unsigned long k;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/* When the user specified more nodes than supported just check
	   if the non supported part is all zero. */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		if (nlongs > PAGE_SIZE/sizeof(long))
			return -EINVAL;
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			unsigned long t;
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes_addr(*nodes)[nlongs-1] &= endmask;
	return 0;
}

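/*
 * Worked example (illustrative, assuming a 64-bit kernel): a call with
 * maxnode == 33 describes 32 mask bits after get_nodes() decrements maxnode,
 * so nlongs == 1 and endmask keeps only the low 32 bits of the word copied
 * from user space.
 */
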
/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}

asmlinkage long sys_mbind(unsigned long start, unsigned long len,
			unsigned long mode,
			unsigned long __user *nmask, unsigned long maxnode,
			unsigned flags)
{
	nodemask_t nodes;
	int err;

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_mbind(start, len, mode, &nodes, flags);
}

/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
		unsigned long maxnode)
{
	int err;
	nodemask_t nodes;

	if (mode < 0 || mode > MPOL_MAX)
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_set_mempolicy(mode, &nodes);
}

/* Macro needed until Paul implements this function in kernel/cpusets.c */
#define cpuset_mems_allowed(task) node_online_map

asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
		const unsigned long __user *old_nodes,
		const unsigned long __user *new_nodes)
{
	struct mm_struct *mm;
	struct task_struct *task;
	nodemask_t old;
	nodemask_t new;
	nodemask_t task_nodes;
	int err;

	err = get_nodes(&old, old_nodes, maxnode);
	if (err)
		return err;

	err = get_nodes(&new, new_nodes, maxnode);
	if (err)
		return err;

	/* Find the mm_struct */
	read_lock(&tasklist_lock);
	task = pid ? find_task_by_pid(pid) : current;
	if (!task) {
		read_unlock(&tasklist_lock);
		return -ESRCH;
	}
	mm = get_task_mm(task);
	read_unlock(&tasklist_lock);

	if (!mm)
		return -EINVAL;

	/*
	 * Check if this process has the right to modify the specified
	 * process. The right exists if the process has administrative
	 * capabilities, superuser privileges or the same
	 * userid as the target process.
	 */
	if ((current->euid != task->suid) && (current->euid != task->uid) &&
	    (current->uid != task->suid) && (current->uid != task->uid) &&
	    !capable(CAP_SYS_ADMIN)) {
		err = -EPERM;
		goto out;
	}

	task_nodes = cpuset_mems_allowed(task);
	/* Is the user allowed to access the target nodes? */
	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
		err = -EPERM;
		goto out;
	}

	err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
out:
	mmput(mm);
	return err;
}


/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
				unsigned long __user *nmask,
				unsigned long maxnode,
				unsigned long addr, unsigned long flags)
{
	int err, pval;
	nodemask_t nodes;

	if (nmask != NULL && maxnode < MAX_NUMNODES)
		return -EINVAL;

	err = do_get_mempolicy(&pval, &nodes, addr, flags);

	if (err)
		return err;

	if (policy && put_user(pval, policy))
		return -EFAULT;

	if (nmask)
		err = copy_nodes_to_user(nmask, maxnode, &nodes);

	return err;
}

#ifdef CONFIG_COMPAT

asmlinkage long compat_sys_get_mempolicy(int __user *policy,
				     compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode,
				     compat_ulong_t addr, compat_ulong_t flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		err = copy_from_user(bm, nm, alloc_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}

asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_set_mempolicy(mode, nm, nr_bits+1);
}

asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
			     compat_ulong_t mode, compat_ulong_t __user *nmask,
			     compat_ulong_t maxnode, compat_ulong_t flags)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	nodemask_t bm;

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif

/* Return effective policy for a VMA */
static struct mempolicy * get_vma_policy(struct task_struct *task,
		struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = task->mempolicy;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else if (vma->vm_policy &&
				vma->vm_policy->policy != MPOL_DEFAULT)
			pol = vma->vm_policy;
	}
	if (!pol)
		pol = &default_policy;
	return pol;
}

/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
{
	int nd;

	switch (policy->policy) {
	case MPOL_PREFERRED:
		nd = policy->v.preferred_node;
		if (nd < 0)
			nd = numa_node_id();
		break;
	case MPOL_BIND:
		/* Lower zones don't get a policy applied */
		/* Careful: current->mems_allowed might have moved */
		if (gfp_zone(gfp) >= policy_zone)
			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
				return policy->v.zonelist;
		/*FALL THROUGH*/
	case MPOL_INTERLEAVE: /* should not happen */
	case MPOL_DEFAULT:
		nd = numa_node_id();
		break;
	default:
		nd = 0;
		BUG();
	}
	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
}

/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
	unsigned nid, next;
	struct task_struct *me = current;

	nid = me->il_next;
	next = next_node(nid, policy->v.nodes);
	if (next >= MAX_NUMNODES)
		next = first_node(policy->v.nodes);
	me->il_next = next;
	return nid;
}

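/*
 * Worked example (illustrative): for an interleave policy over nodes {0,1,2}
 * with il_next == 2, interleave_nodes() above returns node 2 and wraps
 * il_next back to node 0 for the next allocation.
 */
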
/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
		struct vm_area_struct *vma, unsigned long off)
{
	unsigned nnodes = nodes_weight(pol->v.nodes);
	unsigned target = (unsigned)off % nnodes;
	int c;
	int nid = -1;

	c = 0;
	do {
		nid = next_node(nid, pol->v.nodes);
		c++;
	} while (c <= target);
	return nid;
}

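/*
 * Worked example (illustrative): for an interleave mask of nodes {0,2,3}
 * (weight 3), an offset of 7 gives target = 7 % 3 = 1, so the loop in
 * offset_il_node() stops at the second node of the mask and returns node 2.
 */
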
/* Determine a node number for interleave */
static inline unsigned interleave_nid(struct mempolicy *pol,
		 struct vm_area_struct *vma, unsigned long addr, int shift)
{
	if (vma) {
		unsigned long off;

		off = vma->vm_pgoff;
		off += (addr - vma->vm_start) >> shift;
		return offset_il_node(pol, vma, off);
	} else
		return interleave_nodes(pol);
}

/* Return a zonelist suitable for a huge page allocation. */
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	if (pol->policy == MPOL_INTERLEAVE) {
		unsigned nid;

		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
		return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
	}
	return zonelist_policy(GFP_HIGHUSER, pol);
}

/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
					unsigned nid)
{
	struct zonelist *zl;
	struct page *page;

	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
	page = __alloc_pages(gfp, order, zl);
	if (page && page_zone(page) == zl->zones[0]) {
		zone_pcp(zl->zones[0], get_cpu())->interleave_hit++;
		put_cpu();
	}
	return page;
}

/**
 * alloc_page_vma	- Allocate a page for a VMA.
 *
 * @gfp:
 *	%GFP_USER     user allocation.
 *	%GFP_KERNEL   kernel allocations,
 *	%GFP_HIGHMEM  highmem/user allocations,
 *	%GFP_FS       allocation should not call back into a file system.
 *	%GFP_ATOMIC   don't sleep.
 *
 * @vma:  Pointer to VMA or NULL if not available.
 * @addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
 * mm_struct of the VMA to prevent it from going away. Should be used for
 * all allocations for pages that will be mapped into
 * user space. Returns NULL when no page can be allocated.
 *
 * Should be called with the mmap_sem of the vma held.
 */
struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	cpuset_update_current_mems_allowed();

	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
		unsigned nid;

		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
		return alloc_page_interleave(gfp, 0, nid);
	}
	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}

/**
 * alloc_pages_current - Allocate pages.
 *
 * @gfp:
 *	%GFP_USER     user allocation,
 *	%GFP_KERNEL   kernel allocation,
 *	%GFP_HIGHMEM  highmem allocation,
 *	%GFP_FS       don't call back into a file system.
 *	%GFP_ATOMIC   don't sleep.
 * @order: Power of two of allocation size in pages. 0 is a single page.
 *
 * Allocate a page from the kernel page pool. When not in
 * interrupt context, apply the current process NUMA policy.
 * Returns NULL when no page can be allocated.
 *
 * Don't call cpuset_update_current_mems_allowed() unless
 * 1) it's ok to take cpuset_sem (can WAIT), and
 * 2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol = current->mempolicy;

	if ((gfp & __GFP_WAIT) && !in_interrupt())
		cpuset_update_current_mems_allowed();
	if (!pol || in_interrupt())
		pol = &default_policy;
	if (pol->policy == MPOL_INTERLEAVE)
		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);

/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);
	*new = *old;
	atomic_set(&new->refcnt, 1);
	if (new->policy == MPOL_BIND) {
		int sz = ksize(old->v.zonelist);
		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
		if (!new->v.zonelist) {
			kmem_cache_free(policy_cache, new);
			return ERR_PTR(-ENOMEM);
		}
		memcpy(new->v.zonelist, old->v.zonelist, sz);
	}
	return new;
}

/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	if (!a || !b)
		return 0;
	if (a->policy != b->policy)
		return 0;
	switch (a->policy) {
	case MPOL_DEFAULT:
		return 1;
	case MPOL_INTERLEAVE:
		return nodes_equal(a->v.nodes, b->v.nodes);
	case MPOL_PREFERRED:
		return a->v.preferred_node == b->v.preferred_node;
	case MPOL_BIND: {
		int i;
		for (i = 0; a->v.zonelist->zones[i]; i++)
			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
				return 0;
		return b->v.zonelist->zones[i] == NULL;
	}
	default:
		BUG();
		return 0;
	}
}

/* Slow path of a mpol destructor. */
void __mpol_free(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	if (p->policy == MPOL_BIND)
		kfree(p->v.zonelist);
	p->policy = MPOL_DEFAULT;
	kmem_cache_free(policy_cache, p);
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */

/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
	struct rb_node *n = sp->root.rb_node;

	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);

		if (start >= p->end)
			n = n->rb_right;
		else if (end <= p->start)
			n = n->rb_left;
		else
			break;
	}
	if (!n)
		return NULL;
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}

/* Insert a new shared policy into the list. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;
	struct sp_node *nd;

	while (*p) {
		parent = *p;
		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
			p = &(*p)->rb_left;
		else if (new->end > nd->end)
			p = &(*p)->rb_right;
		else
			BUG();
	}
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
		 new->policy ? new->policy->policy : 0);
}

/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
	struct mempolicy *pol = NULL;
	struct sp_node *sn;

	if (!sp->root.rb_node)
		return NULL;
	spin_lock(&sp->lock);
	sn = sp_lookup(sp, idx, idx+1);
	if (sn) {
		mpol_get(sn->policy);
		pol = sn->policy;
	}
	spin_unlock(&sp->lock);
	return pol;
}

static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	PDprintk("deleting %lx-%lx\n", n->start, n->end);
	rb_erase(&n->nd, &sp->root);
	mpol_free(n->policy);
	kmem_cache_free(sn_cache, n);
}

struct sp_node *
sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
{
	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

	if (!n)
		return NULL;
	n->start = start;
	n->end = end;
	mpol_get(pol);
	n->policy = pol;
	return n;
}

/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				 unsigned long end, struct sp_node *new)
{
	struct sp_node *n, *new2 = NULL;

restart:
	spin_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				if (!new2) {
					spin_unlock(&sp->lock);
					new2 = sp_alloc(end, n->end, n->policy);
					if (!new2)
						return -ENOMEM;
					goto restart;
				}
				n->end = start;
				sp_insert(sp, new2);
				new2 = NULL;
				break;
			} else
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	spin_unlock(&sp->lock);
	if (new2) {
		mpol_free(new2->policy);
		kmem_cache_free(sn_cache, new2);
	}
	return 0;
}

int mpol_set_shared_policy(struct shared_policy *info,
			struct vm_area_struct *vma, struct mempolicy *npol)
{
	int err;
	struct sp_node *new = NULL;
	unsigned long sz = vma_pages(vma);

	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
		 vma->vm_pgoff,
		 sz, npol ? npol->policy : -1,
		 npol ? nodes_addr(npol->v.nodes)[0] : -1);

	if (npol) {
		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
		if (!new)
			return -ENOMEM;
	}
	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
	if (err && new)
		kmem_cache_free(sn_cache, new);
	return err;
}

/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
	struct sp_node *n;
	struct rb_node *next;

	if (!p->root.rb_node)
		return;
	spin_lock(&p->lock);
	next = rb_first(&p->root);
	while (next) {
		n = rb_entry(next, struct sp_node, nd);
		next = rb_next(&n->nd);
		rb_erase(&n->nd, &p->root);
		mpol_free(n->policy);
		kmem_cache_free(sn_cache, n);
	}
	spin_unlock(&p->lock);
}

/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL, NULL);

	/* Set interleaving policy for system init. This way not all
	   the data structures allocated at system boot end up in node zero. */

	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
		printk("numa_policy_init: interleaving failed\n");
}

/* Reset policy of current process to default */
void numa_default_policy(void)
{
	do_set_mempolicy(MPOL_DEFAULT, NULL);
}

/* Migrate a policy to a different set of nodes */
static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
			  const nodemask_t *new)
{
	nodemask_t tmp;

	if (!pol)
		return;

	switch (pol->policy) {
	case MPOL_DEFAULT:
		break;
	case MPOL_INTERLEAVE:
		nodes_remap(tmp, pol->v.nodes, *old, *new);
		pol->v.nodes = tmp;
		current->il_next = node_remap(current->il_next, *old, *new);
		break;
	case MPOL_PREFERRED:
		pol->v.preferred_node = node_remap(pol->v.preferred_node,
						   *old, *new);
		break;
	case MPOL_BIND: {
		nodemask_t nodes;
		struct zone **z;
		struct zonelist *zonelist;

		nodes_clear(nodes);
		for (z = pol->v.zonelist->zones; *z; z++)
			node_set((*z)->zone_pgdat->node_id, nodes);
		nodes_remap(tmp, nodes, *old, *new);
		nodes = tmp;

		zonelist = bind_zonelist(&nodes);

		/* If no mem, then zonelist is NULL and we keep old zonelist.
		 * If that old zonelist has no remaining mems_allowed nodes,
		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
		 */

		if (zonelist) {
			/* Good - got mem - substitute new zonelist */
			kfree(pol->v.zonelist);
			pol->v.zonelist = zonelist;
		}
		break;
	}
	default:
		BUG();
		break;
	}
}

/*
 * Someone moved this task to different nodes. Fixup mempolicies.
 *
 * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
 * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
 */
void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
{
	rebind_policy(current->mempolicy, old, new);
}

/*
 * Display pages allocated per node and memory policy via /proc.
 */

static const char *policy_types[] = { "default", "prefer", "bind",
				      "interleave" };

/*
 * Convert a mempolicy into a string.
 * Returns the number of characters in buffer (if positive)
 * or an error (negative)
 */
static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
	char *p = buffer;
	int l;
	nodemask_t nodes;
	int mode = pol ? pol->policy : MPOL_DEFAULT;

	switch (mode) {
	case MPOL_DEFAULT:
		nodes_clear(nodes);
		break;

	case MPOL_PREFERRED:
		nodes_clear(nodes);
		node_set(pol->v.preferred_node, nodes);
		break;

	case MPOL_BIND:
		get_zonemask(pol, &nodes);
		break;

	case MPOL_INTERLEAVE:
		nodes = pol->v.nodes;
		break;

	default:
		BUG();
		return -EFAULT;
	}

	l = strlen(policy_types[mode]);
	if (buffer + maxlen < p + l + 1)
		return -ENOSPC;

	strcpy(p, policy_types[mode]);
	p += l;

	if (!nodes_empty(nodes)) {
		if (buffer + maxlen < p + 2)
			return -ENOSPC;
		*p++ = '=';
		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
	}
	return p - buffer;
}

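/*
 * Example output (illustrative): an interleave policy over nodes 0-3 is
 * rendered as "interleave=0-3", a preferred policy for node 1 as "prefer=1",
 * and the default policy as just "default".
 */
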
struct numa_maps {
	unsigned long pages;
	unsigned long anon;
	unsigned long mapped;
	unsigned long mapcount_max;
	unsigned long node[MAX_NUMNODES];
};

static void gather_stats(struct page *page, void *private)
{
	struct numa_maps *md = private;
	int count = page_mapcount(page);

	if (count)
		md->mapped++;

	if (count > md->mapcount_max)
		md->mapcount_max = count;

	md->pages++;

	if (PageAnon(page))
		md->anon++;

	md->node[page_to_nid(page)]++;
	cond_resched();
}

int show_numa_map(struct seq_file *m, void *v)
{
	struct task_struct *task = m->private;
	struct vm_area_struct *vma = v;
	struct numa_maps *md;
	int n;
	char buffer[50];

	if (!vma->vm_mm)
		return 0;

	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
	if (!md)
		return 0;

	check_pgd_range(vma, vma->vm_start, vma->vm_end,
			&node_online_map, MPOL_MF_STATS, md);

	if (md->pages) {
		mpol_to_str(buffer, sizeof(buffer),
			    get_vma_policy(task, vma, vma->vm_start));

		seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
			   vma->vm_start, buffer, md->pages,
			   md->mapped, md->mapcount_max);

		if (md->anon)
			seq_printf(m, " anon=%lu", md->anon);

		for_each_online_node(n)
			if (md->node[n])
				seq_printf(m, " N%d=%lu", n, md->node[n]);

		seq_putc(m, '\n');
	}
	kfree(md);

	if (m->count < m->size)
		m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
	return 0;
}