/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints about which node(s) memory
 * should be allocated on.
 *
 * Four policies are supported, per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave	Allocate memory interleaved over a set of nodes,
 *		with normal fallback if it fails.
 *		For VMA based allocations this interleaves based on the
 *		offset into the backing object or offset into the mapping
 *		for anonymous memory. For process policy a per-process
 *		counter is used.
 *
 * bind		Only allocate memory on a specific set of nodes,
 *		no fallback.
 *		FIXME: memory is allocated starting with the first node
 *		and proceeding to the last. It would be better if bind
 *		truly restricted the allocation to the given memory nodes.
 *
 * preferred	Try a specific node first before normal fallback.
 *		As a special case, node -1 here means do the allocation
 *		on the node local to the current CPU. This is normally
 *		identical to default, but useful to set in a VMA when you
 *		have a non-default process policy.
 *
 * default	Allocate on the local node first, or when on a VMA
 *		use the process policy. This is what Linux always did
 *		in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process's context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for
 * memory allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/
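
/*
 * Minimal illustrative userspace sketch (not part of this file or its
 * build): the policies described above are requested through the
 * set_mempolicy(2) and mbind(2) system calls.  This assumes libnuma's
 * <numaif.h> wrappers are available; error handling is omitted.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	static void mempolicy_example(void)
 *	{
 *		unsigned long interleave_nodes = (1UL << 0) | (1UL << 1);
 *		unsigned long bind_node = 1UL << 0;
 *		size_t len = 1UL << 20;
 *		void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *		// Process policy: interleave future allocations over nodes 0,1.
 *		set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes,
 *			      8 * sizeof(interleave_nodes));
 *
 *		// VMA policy: bind this mapping to node 0 only, no fallback.
 *		mbind(buf, len, MPOL_BIND, &bind_node, 8 * sizeof(bind_node),
 *		      MPOL_MF_STRICT);
 *
 *		// Restore the default (local) process policy.
 *		set_mempolicy(MPOL_DEFAULT, NULL, 0);
 *	}
 */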

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.policy = MPOL_DEFAULT,
};

static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];

/* Check that the nodemask contains at least one populated zone */
static int is_valid_nodemask(const nodemask_t *nodemask)
{
	int nd, k;

	/* Check that there is something useful in this mask */
	k = policy_zone;

	for_each_node_mask(nd, *nodemask) {
		struct zone *z;

		for (k = 0; k <= policy_zone; k++) {
			z = &NODE_DATA(nd)->node_zones[k];
			if (z->present_pages > 0)
				return 1;
		}
	}

	return 0;
}

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
}

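/*
 * Map a user-supplied "relative" nodemask onto the nodes the task is
 * actually allowed to use.  Worked example (illustrative only): with
 * allowed nodes {4,5,6}, a relative mask of {0,2} is folded modulo the
 * allowed weight and laid onto the allowed set, yielding {4,6}.
 */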
static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}

static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!nodes)
		pol->v.preferred_node = -1;	/* local allocation */
	else if (nodes_empty(*nodes))
		return -EINVAL;			/* no allowed nodes */
	else
		pol->v.preferred_node = first_node(*nodes);
	return 0;
}

static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!is_valid_nodemask(nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

/* Create a new policy */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	struct mempolicy *policy;
	nodemask_t cpuset_context_nmask;
	int localalloc = 0;
	int ret;

	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);

	if (mode == MPOL_DEFAULT)
		return NULL;
	if (!nodes || nodes_empty(*nodes)) {
		if (mode != MPOL_PREFERRED)
			return ERR_PTR(-EINVAL);
		localalloc = 1;	/* special case: no mode flags */
	}
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	policy->policy = mode;

	if (!localalloc) {
		policy->flags = flags;
		cpuset_update_task_memory_state();
		if (flags & MPOL_F_RELATIVE_NODES)
			mpol_relative_nodemask(&cpuset_context_nmask, nodes,
					       &cpuset_current_mems_allowed);
		else
			nodes_and(cpuset_context_nmask, *nodes,
				  cpuset_current_mems_allowed);
		if (mpol_store_user_nodemask(policy))
			policy->w.user_nodemask = *nodes;
		else
			policy->w.cpuset_mems_allowed =
						cpuset_mems_allowed(current);
	}

	ret = mpol_ops[mode].create(policy,
				    localalloc ? NULL : &cpuset_context_nmask);
	if (ret < 0) {
		kmem_cache_free(policy_cache, policy);
		return ERR_PTR(ret);
	}
	return policy;
}

static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}

static void mpol_rebind_nodemask(struct mempolicy *pol,
				 const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES)
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
			    *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}

	pol->v.nodes = tmp;
	if (!node_isset(current->il_next, tmp)) {
		current->il_next = next_node(current->il_next, tmp);
		if (current->il_next >= MAX_NUMNODES)
			current->il_next = first_node(tmp);
		if (current->il_next >= MAX_NUMNODES)
			current->il_next = numa_node_id();
	}
}

static void mpol_rebind_preferred(struct mempolicy *pol,
				  const nodemask_t *nodes)
{
	nodemask_t tmp;

	/*
	 * Check MPOL_F_STATIC_NODES first, as preferred_node == -1 may be
	 * a temporary, "fallback" state for this policy.
	 */
	if (pol->flags & MPOL_F_STATIC_NODES) {
		int node = first_node(pol->w.user_nodemask);

		if (node_isset(node, *nodes))
			pol->v.preferred_node = node;
		else
			pol->v.preferred_node = -1;
	} else if (pol->v.preferred_node == -1) {
		return;	/* no remap required for explicit local alloc */
	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
		pol->v.preferred_node = first_node(tmp);
	} else {
		pol->v.preferred_node = node_remap(pol->v.preferred_node,
						   pol->w.cpuset_mems_allowed,
						   *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}
}

/* Migrate a policy to a different set of nodes */
static void mpol_rebind_policy(struct mempolicy *pol,
			       const nodemask_t *newmask)
{
	if (!pol)
		return;
	if (!mpol_store_user_nodemask(pol) &&
	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
		return;
	mpol_ops[pol->policy].rebind(pol, newmask);
}

/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 */

void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
	mpol_rebind_policy(tsk->mempolicy, new);
}

/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 */

void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;

	down_write(&mm->mmap_sem);
	for (vma = mm->mmap; vma; vma = vma->vm_next)
		mpol_rebind_policy(vma->vm_policy, new);
	up_write(&mm->mmap_sem);
}

static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_interleave,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_bind,
		.rebind = mpol_rebind_nodemask,
	},
};

static void gather_stats(struct page *, void *, int pte_dirty);
static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags);

/* Scan through pages checking if pages follow certain conditions. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pte_t *orig_pte;
	pte_t *pte;
	spinlock_t *ptl;

	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		struct page *page;
		int nid;

		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		/*
		 * The check for PageReserved here is important to avoid
		 * handling zero pages and other pages that may have been
		 * marked special by the system.
		 *
		 * If PageReserved were not checked here then, e.g., the
		 * location of the zero page could have an influence
		 * on MPOL_MF_STRICT, zero pages would be counted for
		 * the per node stats, and there would be useless attempts
		 * to put zero pages on the migration list.
		 */
		if (PageReserved(page))
			continue;
		nid = page_to_nid(page);
		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
			continue;

		if (flags & MPOL_MF_STATS)
			gather_stats(page, private, pte_dirty(*pte));
		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
			migrate_page_add(page, private, flags);
		else
			break;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(orig_pte, ptl);
	return addr != end;
}

static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		if (check_pte_range(vma, pmd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		if (check_pmd_range(vma, pud, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int check_pgd_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pgd_t *pgd;
	unsigned long next;

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		if (check_pud_range(vma, pgd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pgd++, addr = next, addr != end);
	return 0;
}

/*
 * Check if all pages in a range are on a set of nodes.
 * If pagelist != NULL then isolate pages from the LRU and
 * put them on the pagelist.
 */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		const nodemask_t *nodes, unsigned long flags, void *private)
{
	int err;
	struct vm_area_struct *first, *vma, *prev;

	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {

		err = migrate_prep();
		if (err)
			return ERR_PTR(err);
	}

	first = find_vma(mm, start);
	if (!first)
		return ERR_PTR(-EFAULT);
	prev = NULL;
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
			if (!vma->vm_next && vma->vm_end < end)
				return ERR_PTR(-EFAULT);
			if (prev && prev->vm_end < vma->vm_start)
				return ERR_PTR(-EFAULT);
		}
		if (!is_vm_hugetlb_page(vma) &&
		    ((flags & MPOL_MF_STRICT) ||
		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
		      vma_migratable(vma)))) {
			unsigned long endvma = vma->vm_end;

			if (endvma > end)
				endvma = end;
			if (vma->vm_start > start)
				start = vma->vm_start;
			err = check_pgd_range(vma, start, endvma, nodes,
					      flags, private);
			if (err) {
				first = ERR_PTR(err);
				break;
			}
		}
		prev = vma;
	}
	return first;
}

/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
	int err = 0;
	struct mempolicy *old = vma->vm_policy;

	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	if (vma->vm_ops && vma->vm_ops->set_policy)
		err = vma->vm_ops->set_policy(vma, new);
	if (!err) {
		mpol_get(new);
		vma->vm_policy = new;
		mpol_free(old);
	}
	return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, struct mempolicy *new)
{
	struct vm_area_struct *next;
	int err;

	err = 0;
	for (; vma && vma->vm_start < end; vma = next) {
		next = vma->vm_next;
		if (vma->vm_start < start)
			err = split_vma(vma->vm_mm, vma, start, 1);
		if (!err && vma->vm_end > end)
			err = split_vma(vma->vm_mm, vma, end, 0);
		if (!err)
			err = policy_vma(vma, new);
		if (err)
			break;
	}
	return err;
}

/*
 * Update task->flags PF_MEMPOLICY bit: set iff non-default
 * mempolicy.  Allows more rapid checking of this (combined perhaps
 * with other PF_* flag bits) on memory allocation hot code paths.
 *
 * If called from outside this file, the task 'p' should -only- be
 * a newly forked child not yet visible on the task list, because
 * manipulating the task flags of a visible task is not safe.
 *
 * The above limitation is why this routine has the funny name
 * mpol_fix_fork_child_flag().
 *
 * It is also safe to call this with a task pointer of current,
 * which the static wrapper mpol_set_task_struct_flag() does,
 * for use within this file.
 */

void mpol_fix_fork_child_flag(struct task_struct *p)
{
	if (p->mempolicy)
		p->flags |= PF_MEMPOLICY;
	else
		p->flags &= ~PF_MEMPOLICY;
}

static void mpol_set_task_struct_flag(void)
{
	mpol_fix_fork_child_flag(current);
}

/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);
	mpol_free(current->mempolicy);
	current->mempolicy = new;
	mpol_set_task_struct_flag();
	if (new && new->policy == MPOL_INTERLEAVE &&
	    nodes_weight(new->v.nodes))
		current->il_next = first_node(new->v.nodes);
	return 0;
}

/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
{
	nodes_clear(*nodes);
	switch (p->policy) {
	case MPOL_DEFAULT:
		break;
	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		*nodes = p->v.nodes;
		break;
	case MPOL_PREFERRED:
		/* or use current node instead of memory_map? */
		if (p->v.preferred_node < 0)
			*nodes = node_states[N_HIGH_MEMORY];
		else
			node_set(p->v.preferred_node, *nodes);
		break;
	default:
		BUG();
	}
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}

/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	cpuset_update_task_memory_state();
	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		*nmask = cpuset_current_mems_allowed;
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->policy == MPOL_INTERLEAVE) {
			*policy = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else
		*policy = pol->policy | pol->flags;

	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	err = 0;
	if (nmask)
		get_zonemask(pol, nmask);

 out:
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}

#ifdef CONFIG_MIGRATION
/*
 * page migration
 */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
	/*
	 * Avoid migrating a page that is shared with others.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
		isolate_lru_page(page, pagelist);
}

static struct page *new_node_page(struct page *page, unsigned long node, int **x)
{
	return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
}

/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
static int migrate_to_node(struct mm_struct *mm, int source, int dest,
			   int flags)
{
	nodemask_t nmask;
	LIST_HEAD(pagelist);
	int err = 0;

	nodes_clear(nmask);
	node_set(source, nmask);

	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
			flags | MPOL_MF_DISCONTIG_OK, &pagelist);

	if (!list_empty(&pagelist))
		err = migrate_pages(&pagelist, new_node_page, dest);

	return err;
}

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm,
	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
	LIST_HEAD(pagelist);
	int busy = 0;
	int err = 0;
	nodemask_t tmp;

	down_read(&mm->mmap_sem);

	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
	if (err)
		goto out;

/*
 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 * bit in 'tmp', and return that <source, dest> pair for migration.
 * The pair of nodemasks 'to' and 'from' define the map.
 *
 * If no pair of bits is found that way, fall back to picking some
 * pair of 'source' and 'dest' bits that are not the same.  If the
 * 'source' and 'dest' bits are the same, this represents a node
 * that will be migrating to itself, so no pages need move.
 *
 * If no bits are left in 'tmp', or if all remaining bits left
 * in 'tmp' correspond to the same bit in 'to', return false
 * (nothing left to migrate).
 *
 * This lets us pick a pair of nodes to migrate between, such that
 * if possible the dest node is not already occupied by some other
 * source node, minimizing the risk of overloading the memory on a
 * node that would happen if we migrated incoming memory to a node
 * before migrating outgoing memory sourced from that same node.
 *
 * A single scan of tmp is sufficient.  As we go, we remember the
 * most recent <s, d> pair that moved (s != d).  If we find a pair
 * that not only moved, but what's better, moved to an empty slot
 * (d is not set in tmp), then we break out then, with that pair.
 * Otherwise when we finish scanning from_tmp, we at least have the
 * most recent <s, d> pair that moved.  If we get all the way through
 * the scan of tmp without finding any node that moved, much less
 * moved to an empty node, then there is nothing left worth migrating.
 */

	tmp = *from_nodes;
	while (!nodes_empty(tmp)) {
		int s,d;
		int source = -1;
		int dest = 0;

		for_each_node_mask(s, tmp) {
			d = node_remap(s, *from_nodes, *to_nodes);
			if (s == d)
				continue;

			source = s;	/* Node moved. Memorize */
			dest = d;

			/* dest not in remaining from nodes? */
			if (!node_isset(dest, tmp))
				break;
		}
		if (source == -1)
			break;

		node_clear(source, tmp);
		err = migrate_to_node(mm, source, dest, flags);
		if (err > 0)
			busy += err;
		if (err < 0)
			break;
	}
out:
	up_read(&mm->mmap_sem);
	if (err < 0)
		return err;
	return busy;

}

/*
 * Allocate a new page for page migration based on vma policy.
 * Start assuming that page is mapped by vma pointed to by @private.
 * Search forward from there, if not.  N.B., this assumes that the
 * list of pages handed to migrate_pages()--which is how we get here--
 * is in virtual address order.
 */
static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
	struct vm_area_struct *vma = (struct vm_area_struct *)private;
	unsigned long uninitialized_var(address);

	while (vma) {
		address = page_address_in_vma(page, vma);
		if (address != -EFAULT)
			break;
		vma = vma->vm_next;
	}

	/*
	 * if !vma, alloc_page_vma() will use task or system default policy
	 */
	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
}
#else

static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
}

int do_migrate_pages(struct mm_struct *mm,
	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
	return -ENOSYS;
}

static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
	return NULL;
}
#endif

static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int err;
	LIST_HEAD(pagelist);

	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	new = mpol_new(mode, mode_flags, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
		 start, start + len, mode, mode_flags,
		 nmask ? nodes_addr(*nmask)[0] : -1);

	down_write(&mm->mmap_sem);
	vma = check_range(mm, start, end, nmask,
			  flags | MPOL_MF_INVERT, &pagelist);

	err = PTR_ERR(vma);
	if (!IS_ERR(vma)) {
		int nr_failed = 0;

		err = mbind_range(vma, start, end, new);

		if (!list_empty(&pagelist))
			nr_failed = migrate_pages(&pagelist, new_vma_page,
						  (unsigned long)vma);

		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
			err = -EIO;
	}

	up_write(&mm->mmap_sem);
	mpol_free(new);
	return err;
}

/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	unsigned long k;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;
	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
		return -EINVAL;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/* When the user specified more nodes than supported just check
	   if the non supported part is all zero. */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		if (nlongs > PAGE_SIZE/sizeof(long))
			return -EINVAL;
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			unsigned long t;
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes_addr(*nodes)[nlongs-1] &= endmask;
	return 0;
}

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}

asmlinkage long sys_mbind(unsigned long start, unsigned long len,
			unsigned long mode,
			unsigned long __user *nmask, unsigned long maxnode,
			unsigned flags)
{
	nodemask_t nodes;
	int err;
	unsigned short mode_flags;

	mode_flags = mode & MPOL_MODE_FLAGS;
	mode &= ~MPOL_MODE_FLAGS;
	if (mode >= MPOL_MAX)
		return -EINVAL;
	if ((mode_flags & MPOL_F_STATIC_NODES) &&
	    (mode_flags & MPOL_F_RELATIVE_NODES))
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
}

/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
		unsigned long maxnode)
{
	int err;
	nodemask_t nodes;
	unsigned short flags;

	flags = mode & MPOL_MODE_FLAGS;
	mode &= ~MPOL_MODE_FLAGS;
	if ((unsigned int)mode >= MPOL_MAX)
		return -EINVAL;
	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_set_mempolicy(mode, flags, &nodes);
}

asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
		const unsigned long __user *old_nodes,
		const unsigned long __user *new_nodes)
{
	struct mm_struct *mm;
	struct task_struct *task;
	nodemask_t old;
	nodemask_t new;
	nodemask_t task_nodes;
	int err;

	err = get_nodes(&old, old_nodes, maxnode);
	if (err)
		return err;

	err = get_nodes(&new, new_nodes, maxnode);
	if (err)
		return err;

	/* Find the mm_struct */
	read_lock(&tasklist_lock);
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		read_unlock(&tasklist_lock);
		return -ESRCH;
	}
	mm = get_task_mm(task);
	read_unlock(&tasklist_lock);

	if (!mm)
		return -EINVAL;

	/*
	 * Check if this process has the right to modify the specified
	 * process. The right exists if the process has administrative
	 * capabilities, superuser privileges or the same
	 * userid as the target process.
	 */
	if ((current->euid != task->suid) && (current->euid != task->uid) &&
	    (current->uid != task->suid) && (current->uid != task->uid) &&
	    !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out;
	}

	task_nodes = cpuset_mems_allowed(task);
	/* Is the user allowed to access the target nodes? */
	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out;
	}

	if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
		err = -EINVAL;
		goto out;
	}

	err = security_task_movememory(task);
	if (err)
		goto out;

	err = do_migrate_pages(mm, &old, &new,
		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
out:
	mmput(mm);
	return err;
}


/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
				unsigned long __user *nmask,
				unsigned long maxnode,
				unsigned long addr, unsigned long flags)
{
	int err;
	int uninitialized_var(pval);
	nodemask_t nodes;

	if (nmask != NULL && maxnode < MAX_NUMNODES)
		return -EINVAL;

	err = do_get_mempolicy(&pval, &nodes, addr, flags);

	if (err)
		return err;

	if (policy && put_user(pval, policy))
		return -EFAULT;

	if (nmask)
		err = copy_nodes_to_user(nmask, maxnode, &nodes);

	return err;
}

#ifdef CONFIG_COMPAT

asmlinkage long compat_sys_get_mempolicy(int __user *policy,
				     compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode,
				     compat_ulong_t addr, compat_ulong_t flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		err = copy_from_user(bm, nm, alloc_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}

asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_set_mempolicy(mode, nm, nr_bits+1);
}

asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
			     compat_ulong_t mode, compat_ulong_t __user *nmask,
			     compat_ulong_t maxnode, compat_ulong_t flags)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	nodemask_t bm;

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif

/*
 * get_vma_policy(@task, @vma, @addr)
 * @task - task for fallback if vma policy == default
 * @vma  - virtual memory area whose policy is sought
 * @addr - address in @vma for shared policy lookup
 *
 * Returns effective policy for a VMA at specified address.
 * Falls back to @task or system default policy, as necessary.
 * Returned policy has extra reference count if shared, vma,
 * or some other task's policy [show_numa_maps() can pass
 * @task != current].  It is the caller's responsibility to
 * free the reference in these cases.
 */
static struct mempolicy *get_vma_policy(struct task_struct *task,
		struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = task->mempolicy;
	int shared_pol = 0;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy) {
			pol = vma->vm_ops->get_policy(vma, addr);
			shared_pol = 1;	/* if pol non-NULL, add ref below */
		} else if (vma->vm_policy &&
				vma->vm_policy->policy != MPOL_DEFAULT)
			pol = vma->vm_policy;
	}
	if (!pol)
		pol = &default_policy;
	else if (!shared_pol && pol != current->mempolicy)
		mpol_get(pol);	/* vma or other task's policy */
	return pol;
}

/* Return a nodemask representing a mempolicy */
static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
{
	/* Lower zones don't get a nodemask applied for MPOL_BIND */
	if (unlikely(policy->policy == MPOL_BIND) &&
			gfp_zone(gfp) >= policy_zone &&
			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
		return &policy->v.nodes;

	return NULL;
}

/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
{
	int nd;

	switch (policy->policy) {
	case MPOL_PREFERRED:
		nd = policy->v.preferred_node;
		if (nd < 0)
			nd = numa_node_id();
		break;
	case MPOL_BIND:
		/*
		 * Normally, MPOL_BIND allocations are node-local within the
		 * allowed nodemask.  However, if __GFP_THISNODE is set and
		 * the current node isn't part of the mask, we use the
		 * zonelist for the first node in the mask instead.
		 */
		nd = numa_node_id();
		if (unlikely(gfp & __GFP_THISNODE) &&
				unlikely(!node_isset(nd, policy->v.nodes)))
			nd = first_node(policy->v.nodes);
		break;
	case MPOL_INTERLEAVE: /* should not happen */
	case MPOL_DEFAULT:
		nd = numa_node_id();
		break;
	default:
		nd = 0;
		BUG();
	}
	return node_zonelist(nd, gfp);
}

/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
	unsigned nid, next;
	struct task_struct *me = current;

	nid = me->il_next;
	next = next_node(nid, policy->v.nodes);
	if (next >= MAX_NUMNODES)
		next = first_node(policy->v.nodes);
	if (next < MAX_NUMNODES)
		me->il_next = next;
	return nid;
}

/*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
 */
unsigned slab_node(struct mempolicy *policy)
{
	unsigned short pol = policy ? policy->policy : MPOL_DEFAULT;

	switch (pol) {
	case MPOL_INTERLEAVE:
		return interleave_nodes(policy);

	case MPOL_BIND: {
		/*
		 * Follow bind policy behavior and start allocation at the
		 * first node.
		 */
		struct zonelist *zonelist;
		struct zone *zone;
		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
		(void)first_zones_zonelist(zonelist, highest_zoneidx,
							&policy->v.nodes,
							&zone);
		return zone->node;
	}

	case MPOL_PREFERRED:
		if (policy->v.preferred_node >= 0)
			return policy->v.preferred_node;
		/* Fall through */

	default:
		return numa_node_id();
	}
}

/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
		struct vm_area_struct *vma, unsigned long off)
{
	unsigned nnodes = nodes_weight(pol->v.nodes);
	unsigned target;
	int c;
	int nid = -1;

	if (!nnodes)
		return numa_node_id();
	target = (unsigned int)off % nnodes;
	c = 0;
	do {
		nid = next_node(nid, pol->v.nodes);
		c++;
	} while (c <= target);
	return nid;
}

/* Determine a node number for interleave */
static inline unsigned interleave_nid(struct mempolicy *pol,
		 struct vm_area_struct *vma, unsigned long addr, int shift)
{
	if (vma) {
		unsigned long off;

		/*
		 * for small pages, there is no difference between
		 * shift and PAGE_SHIFT, so the bit-shift is safe.
		 * for huge pages, since vm_pgoff is in units of small
		 * pages, we need to shift off the always 0 bits to get
		 * a useful offset.
		 */
		BUG_ON(shift < PAGE_SHIFT);
		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
		off += (addr - vma->vm_start) >> shift;
		return offset_il_node(pol, vma, off);
	} else
		return interleave_nodes(pol);
}

Chen, Kenneth W00ac59a2006-02-03 21:51:14 +01001403#ifdef CONFIG_HUGETLBFS
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001404/*
1405 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1406 * @vma = virtual memory area whose policy is sought
1407 * @addr = address in @vma for shared policy lookup and interleave policy
1408 * @gfp_flags = for requested zone
Mel Gorman19770b32008-04-28 02:12:18 -07001409 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1410 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001411 *
1412 * Returns a zonelist suitable for a huge page allocation.
Mel Gorman19770b32008-04-28 02:12:18 -07001413 * If the effective policy is MPOL_BIND, returns a pointer to the local node's zonelist,
1414 * and a pointer to the mempolicy's @nodemask for filtering the zonelist.
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001415 * If it is also a policy for which get_vma_policy() returns an extra
Mel Gorman19770b32008-04-28 02:12:18 -07001416 * reference, we must hold that reference until after the allocation.
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001417 * In that case, return policy via @mpol so hugetlb allocation can drop
Mel Gorman19770b32008-04-28 02:12:18 -07001418 * the reference. For referenced policies other than MPOL_BIND, we can and do drop the
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001419 * reference here, so the caller doesn't need to know about the special case
1420 * for default and current task policy.
1421 */
Mel Gorman396faf02007-07-17 04:03:13 -07001422struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
Mel Gorman19770b32008-04-28 02:12:18 -07001423 gfp_t gfp_flags, struct mempolicy **mpol,
1424 nodemask_t **nodemask)
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001425{
1426 struct mempolicy *pol = get_vma_policy(current, vma, addr);
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001427 struct zonelist *zl;
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001428
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001429 *mpol = NULL; /* probably no unref needed */
Mel Gorman19770b32008-04-28 02:12:18 -07001430 *nodemask = NULL; /* assume !MPOL_BIND */
1431 if (pol->policy == MPOL_BIND) {
1432 *nodemask = &pol->v.nodes;
1433 } else if (pol->policy == MPOL_INTERLEAVE) {
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001434 unsigned nid;
1435
1436 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
Lee Schermerhorn69682d82008-03-10 11:43:45 -07001437 if (unlikely(pol != &default_policy &&
1438 pol != current->mempolicy))
1439 __mpol_free(pol); /* finished with pol */
Mel Gorman0e884602008-04-28 02:12:14 -07001440 return node_zonelist(nid, gfp_flags);
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001441 }
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001442
1443 zl = zonelist_policy(GFP_HIGHUSER, pol);
1444 if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
1445 if (pol->policy != MPOL_BIND)
1446 __mpol_free(pol); /* finished with pol */
1447 else
1448 *mpol = pol; /* unref needed after allocation */
1449 }
1450 return zl;
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001451}
Chen, Kenneth W00ac59a2006-02-03 21:51:14 +01001452#endif
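/*
 * Hedged usage sketch (how the hugetlb allocator is expected to call
 * huge_zonelist(); details of the real call site may differ):
 *
 *	struct mempolicy *mpol;
 *	nodemask_t *nodemask;
 *	struct zonelist *zl;
 *
 *	zl = huge_zonelist(vma, addr, gfp_mask, &mpol, &nodemask);
 *	... pick a huge page from zl, honouring nodemask when set ...
 *	mpol_free(mpol);	(mpol is only non-NULL when an unref is owed)
 */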
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001453
Linus Torvalds1da177e2005-04-16 15:20:36 -07001454/* Allocate a page using the interleave policy.
 1455 Needs its own path because it must do special accounting. */
Andi Kleen662f3a02005-10-29 18:15:49 -07001456static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1457 unsigned nid)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001458{
1459 struct zonelist *zl;
1460 struct page *page;
1461
Mel Gorman0e884602008-04-28 02:12:14 -07001462 zl = node_zonelist(nid, gfp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001463 page = __alloc_pages(gfp, order, zl);
Mel Gormandd1a2392008-04-28 02:12:17 -07001464 if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
Christoph Lameterca889e62006-06-30 01:55:44 -07001465 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001466 return page;
1467}
1468
1469/**
1470 * alloc_page_vma - Allocate a page for a VMA.
1471 *
1472 * @gfp:
1473 * %GFP_USER user allocation.
1474 * %GFP_KERNEL kernel allocations,
1475 * %GFP_HIGHMEM highmem/user allocations,
1476 * %GFP_FS allocation should not call back into a file system.
1477 * %GFP_ATOMIC don't sleep.
1478 *
1479 * @vma: Pointer to VMA or NULL if not available.
1480 * @addr: Virtual Address of the allocation. Must be inside the VMA.
1481 *
1482 * This function allocates a page from the kernel page pool and applies
1483 * a NUMA policy associated with the VMA or the current process.
1484 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
1485 * mm_struct of the VMA to prevent it from going away. Should be used for
1486 * all allocations for pages that will be mapped into
1487 * user space. Returns NULL when no page can be allocated.
1488 *
 1489 * Should be called with the mmap_sem of the vma's mm held.
1490 */
1491struct page *
Al Virodd0fc662005-10-07 07:46:04 +01001492alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001493{
Christoph Lameter6e21c8f2005-09-03 15:54:45 -07001494 struct mempolicy *pol = get_vma_policy(current, vma, addr);
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001495 struct zonelist *zl;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001496
Paul Jacksoncf2a473c2006-01-08 01:01:54 -08001497 cpuset_update_task_memory_state();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001498
1499 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1500 unsigned nid;
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001501
1502 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
Lee Schermerhorn69682d82008-03-10 11:43:45 -07001503 if (unlikely(pol != &default_policy &&
1504 pol != current->mempolicy))
1505 __mpol_free(pol); /* finished with pol */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001506 return alloc_page_interleave(gfp, 0, nid);
1507 }
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001508 zl = zonelist_policy(gfp, pol);
1509 if (pol != &default_policy && pol != current->mempolicy) {
1510 /*
1511 * slow path: ref counted policy -- shared or vma
1512 */
Mel Gorman19770b32008-04-28 02:12:18 -07001513 struct page *page = __alloc_pages_nodemask(gfp, 0,
1514 zl, nodemask_policy(gfp, pol));
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001515 __mpol_free(pol);
1516 return page;
1517 }
1518 /*
1519 * fast path: default or task policy
1520 */
Mel Gorman19770b32008-04-28 02:12:18 -07001521 return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001522}
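/*
 * Illustrative caller (not from this file): a fault handler allocating
 * an anonymous user page under the VMA's policy might do
 *
 *	page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 *	if (!page)
 *		return VM_FAULT_OOM;
 *
 * with the mmap_sem held for read, as required above.
 */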
1523
1524/**
1525 * alloc_pages_current - Allocate pages.
1526 *
1527 * @gfp:
1528 * %GFP_USER user allocation,
1529 * %GFP_KERNEL kernel allocation,
1530 * %GFP_HIGHMEM highmem allocation,
1531 * %GFP_FS don't call back into a file system.
1532 * %GFP_ATOMIC don't sleep.
1533 * @order: Power of two of allocation size in pages. 0 is a single page.
1534 *
 1535 * Allocate a page from the kernel page pool and, when not in
 1536 * interrupt context, apply the current process' NUMA policy.
1537 * Returns NULL when no page can be allocated.
1538 *
Paul Jacksoncf2a473c2006-01-08 01:01:54 -08001539 * Don't call cpuset_update_task_memory_state() unless
Linus Torvalds1da177e2005-04-16 15:20:36 -07001540 * 1) it's ok to take cpuset_sem (can WAIT), and
1541 * 2) allocating for current task (not interrupt).
1542 */
Al Virodd0fc662005-10-07 07:46:04 +01001543struct page *alloc_pages_current(gfp_t gfp, unsigned order)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001544{
1545 struct mempolicy *pol = current->mempolicy;
1546
1547 if ((gfp & __GFP_WAIT) && !in_interrupt())
Paul Jacksoncf2a473c2006-01-08 01:01:54 -08001548 cpuset_update_task_memory_state();
Christoph Lameter9b819d22006-09-25 23:31:40 -07001549 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001550 pol = &default_policy;
1551 if (pol->policy == MPOL_INTERLEAVE)
1552 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
Mel Gorman19770b32008-04-28 02:12:18 -07001553 return __alloc_pages_nodemask(gfp, order,
1554 zonelist_policy(gfp, pol), nodemask_policy(gfp, pol));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001555}
1556EXPORT_SYMBOL(alloc_pages_current);
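/*
 * Note (sketch): on CONFIG_NUMA kernels the generic alloc_pages(gfp,
 * order) wrapper in the gfp headers is expected to resolve to
 * alloc_pages_current(), so a plain
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 0);
 *
 * already goes through the task policy handling above.
 */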
1557
Paul Jackson42253992006-01-08 01:01:59 -08001558/*
1559 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
 1560 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1561 * with the mems_allowed returned by cpuset_mems_allowed(). This
 1562 * keeps mempolicies cpuset relative after the task's cpuset moves. See
1563 * further kernel/cpuset.c update_nodemask().
1564 */
Paul Jackson42253992006-01-08 01:01:59 -08001565
Linus Torvalds1da177e2005-04-16 15:20:36 -07001566/* Slow path of a mempolicy copy */
1567struct mempolicy *__mpol_copy(struct mempolicy *old)
1568{
1569 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1570
1571 if (!new)
1572 return ERR_PTR(-ENOMEM);
Paul Jackson42253992006-01-08 01:01:59 -08001573 if (current_cpuset_is_being_rebound()) {
1574 nodemask_t mems = cpuset_mems_allowed(current);
1575 mpol_rebind_policy(old, &mems);
1576 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001577 *new = *old;
1578 atomic_set(&new->refcnt, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001579 return new;
1580}
1581
David Rientjesf5b087b2008-04-28 02:12:27 -07001582static int mpol_match_intent(const struct mempolicy *a,
1583 const struct mempolicy *b)
1584{
1585 if (a->flags != b->flags)
1586 return 0;
1587 if (!mpol_store_user_nodemask(a))
1588 return 1;
1589 return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1590}
1591
Linus Torvalds1da177e2005-04-16 15:20:36 -07001592/* Slow path of a mempolicy comparison */
1593int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1594{
1595 if (!a || !b)
1596 return 0;
1597 if (a->policy != b->policy)
1598 return 0;
David Rientjesf5b087b2008-04-28 02:12:27 -07001599 if (a->policy != MPOL_DEFAULT && !mpol_match_intent(a, b))
1600 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001601 switch (a->policy) {
1602 case MPOL_DEFAULT:
1603 return 1;
Mel Gorman19770b32008-04-28 02:12:18 -07001604 case MPOL_BIND:
1605 /* Fall through */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001606 case MPOL_INTERLEAVE:
Andi Kleendfcd3c02005-10-29 18:15:48 -07001607 return nodes_equal(a->v.nodes, b->v.nodes);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001608 case MPOL_PREFERRED:
1609 return a->v.preferred_node == b->v.preferred_node;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001610 default:
1611 BUG();
1612 return 0;
1613 }
1614}
1615
1616/* Slow path of a mpol destructor. */
1617void __mpol_free(struct mempolicy *p)
1618{
1619 if (!atomic_dec_and_test(&p->refcnt))
1620 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001621 p->policy = MPOL_DEFAULT;
1622 kmem_cache_free(policy_cache, p);
1623}
1624
1625/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001626 * Shared memory backing store policy support.
1627 *
1628 * Remember policies even when nobody has shared memory mapped.
1629 * The policies are kept in Red-Black tree linked from the inode.
1630 * They are protected by the sp->lock spinlock, which should be held
1631 * for any accesses to the tree.
1632 */
1633
1634/* lookup first element intersecting start-end */
1635/* Caller holds sp->lock */
1636static struct sp_node *
1637sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1638{
1639 struct rb_node *n = sp->root.rb_node;
1640
1641 while (n) {
1642 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1643
1644 if (start >= p->end)
1645 n = n->rb_right;
1646 else if (end <= p->start)
1647 n = n->rb_left;
1648 else
1649 break;
1650 }
1651 if (!n)
1652 return NULL;
1653 for (;;) {
1654 struct sp_node *w = NULL;
1655 struct rb_node *prev = rb_prev(n);
1656 if (!prev)
1657 break;
1658 w = rb_entry(prev, struct sp_node, nd);
1659 if (w->end <= start)
1660 break;
1661 n = prev;
1662 }
1663 return rb_entry(n, struct sp_node, nd);
1664}
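/*
 * Example (illustrative): with stored ranges [0,4) and [6,10), a lookup
 * for [3,7) may first land on [6,10) and then walks back via rb_prev()
 * to return [0,4), the lowest stored range that still intersects the
 * requested interval.
 */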
1665
1666/* Insert a new shared policy into the list. */
1667/* Caller holds sp->lock */
1668static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1669{
1670 struct rb_node **p = &sp->root.rb_node;
1671 struct rb_node *parent = NULL;
1672 struct sp_node *nd;
1673
1674 while (*p) {
1675 parent = *p;
1676 nd = rb_entry(parent, struct sp_node, nd);
1677 if (new->start < nd->start)
1678 p = &(*p)->rb_left;
1679 else if (new->end > nd->end)
1680 p = &(*p)->rb_right;
1681 else
1682 BUG();
1683 }
1684 rb_link_node(&new->nd, parent, p);
1685 rb_insert_color(&new->nd, &sp->root);
Paul Mundt140d5a42007-07-15 23:38:16 -07001686 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001687 new->policy ? new->policy->policy : 0);
1688}
1689
1690/* Find shared policy intersecting idx */
1691struct mempolicy *
1692mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1693{
1694 struct mempolicy *pol = NULL;
1695 struct sp_node *sn;
1696
1697 if (!sp->root.rb_node)
1698 return NULL;
1699 spin_lock(&sp->lock);
1700 sn = sp_lookup(sp, idx, idx+1);
1701 if (sn) {
1702 mpol_get(sn->policy);
1703 pol = sn->policy;
1704 }
1705 spin_unlock(&sp->lock);
1706 return pol;
1707}
1708
1709static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1710{
Paul Mundt140d5a42007-07-15 23:38:16 -07001711 pr_debug("deleting %lx-%lx\n", n->start, n->end);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001712 rb_erase(&n->nd, &sp->root);
1713 mpol_free(n->policy);
1714 kmem_cache_free(sn_cache, n);
1715}
1716
Adrian Bunkdbcb0f12007-10-16 01:26:26 -07001717static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1718 struct mempolicy *pol)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001719{
1720 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1721
1722 if (!n)
1723 return NULL;
1724 n->start = start;
1725 n->end = end;
1726 mpol_get(pol);
1727 n->policy = pol;
1728 return n;
1729}
1730
1731/* Replace a policy range. */
1732static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1733 unsigned long end, struct sp_node *new)
1734{
1735 struct sp_node *n, *new2 = NULL;
1736
1737restart:
1738 spin_lock(&sp->lock);
1739 n = sp_lookup(sp, start, end);
1740 /* Take care of old policies in the same range. */
1741 while (n && n->start < end) {
1742 struct rb_node *next = rb_next(&n->nd);
1743 if (n->start >= start) {
1744 if (n->end <= end)
1745 sp_delete(sp, n);
1746 else
1747 n->start = end;
1748 } else {
1749 /* Old policy spanning whole new range. */
1750 if (n->end > end) {
1751 if (!new2) {
1752 spin_unlock(&sp->lock);
1753 new2 = sp_alloc(end, n->end, n->policy);
1754 if (!new2)
1755 return -ENOMEM;
1756 goto restart;
1757 }
1758 n->end = start;
1759 sp_insert(sp, new2);
1760 new2 = NULL;
1761 break;
1762 } else
1763 n->end = start;
1764 }
1765 if (!next)
1766 break;
1767 n = rb_entry(next, struct sp_node, nd);
1768 }
1769 if (new)
1770 sp_insert(sp, new);
1771 spin_unlock(&sp->lock);
1772 if (new2) {
1773 mpol_free(new2->policy);
1774 kmem_cache_free(sn_cache, new2);
1775 }
1776 return 0;
1777}
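/*
 * Worked example (illustrative): if [0,10) is stored with policy A and
 * a new node covering [3,6) with policy B is installed, the old entry
 * is trimmed to [0,3), a second node carrying A is allocated for
 * [6,10) (the new2 case above, which may drop the lock and restart),
 * and B ends up covering exactly [3,6).
 */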
1778
David Rientjesa3b51e02008-04-28 02:12:23 -07001779void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
David Rientjes028fec42008-04-28 02:12:25 -07001780 unsigned short flags, nodemask_t *policy_nodes)
Robin Holt7339ff82006-01-14 13:20:48 -08001781{
1782 info->root = RB_ROOT;
1783 spin_lock_init(&info->lock);
1784
1785 if (policy != MPOL_DEFAULT) {
1786 struct mempolicy *newpol;
1787
1788 /* Falls back to MPOL_DEFAULT on any error */
David Rientjes028fec42008-04-28 02:12:25 -07001789 newpol = mpol_new(policy, flags, policy_nodes);
Robin Holt7339ff82006-01-14 13:20:48 -08001790 if (!IS_ERR(newpol)) {
1791 /* Create pseudo-vma that contains just the policy */
1792 struct vm_area_struct pvma;
1793
1794 memset(&pvma, 0, sizeof(struct vm_area_struct));
1795 /* Policy covers entire file */
1796 pvma.vm_end = TASK_SIZE;
1797 mpol_set_shared_policy(info, &pvma, newpol);
1798 mpol_free(newpol);
1799 }
1800 }
1801}
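/*
 * Hedged usage note: tmpfs/shmem is the expected caller, e.g. when an
 * inode is created on a mount carrying a mempolicy option it does
 * something like
 *
 *	mpol_shared_policy_init(&info->policy, sbinfo->policy,
 *				sbinfo->flags, &sbinfo->policy_nodes);
 *
 * (field names are illustrative; see mm/shmem.c for the real call site).
 */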
1802
Linus Torvalds1da177e2005-04-16 15:20:36 -07001803int mpol_set_shared_policy(struct shared_policy *info,
1804 struct vm_area_struct *vma, struct mempolicy *npol)
1805{
1806 int err;
1807 struct sp_node *new = NULL;
1808 unsigned long sz = vma_pages(vma);
1809
David Rientjes028fec42008-04-28 02:12:25 -07001810 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07001811 vma->vm_pgoff,
David Rientjes028fec42008-04-28 02:12:25 -07001812 sz, npol ? npol->policy : -1,
1813 npol ? npol->flags : -1,
Paul Mundt140d5a42007-07-15 23:38:16 -07001814 npol ? nodes_addr(npol->v.nodes)[0] : -1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001815
1816 if (npol) {
1817 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1818 if (!new)
1819 return -ENOMEM;
1820 }
1821 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1822 if (err && new)
1823 kmem_cache_free(sn_cache, new);
1824 return err;
1825}
1826
1827/* Free a backing policy store on inode delete. */
1828void mpol_free_shared_policy(struct shared_policy *p)
1829{
1830 struct sp_node *n;
1831 struct rb_node *next;
1832
1833 if (!p->root.rb_node)
1834 return;
1835 spin_lock(&p->lock);
1836 next = rb_first(&p->root);
1837 while (next) {
1838 n = rb_entry(next, struct sp_node, nd);
1839 next = rb_next(&n->nd);
Andi Kleen90c50292005-07-27 11:43:50 -07001840 rb_erase(&n->nd, &p->root);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001841 mpol_free(n->policy);
1842 kmem_cache_free(sn_cache, n);
1843 }
1844 spin_unlock(&p->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001845}
1846
1847/* assumes fs == KERNEL_DS */
1848void __init numa_policy_init(void)
1849{
Paul Mundtb71636e2007-07-15 23:38:15 -07001850 nodemask_t interleave_nodes;
1851 unsigned long largest = 0;
1852 int nid, prefer = 0;
1853
Linus Torvalds1da177e2005-04-16 15:20:36 -07001854 policy_cache = kmem_cache_create("numa_policy",
1855 sizeof(struct mempolicy),
Paul Mundt20c2df82007-07-20 10:11:58 +09001856 0, SLAB_PANIC, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001857
1858 sn_cache = kmem_cache_create("shared_policy_node",
1859 sizeof(struct sp_node),
Paul Mundt20c2df82007-07-20 10:11:58 +09001860 0, SLAB_PANIC, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001861
Paul Mundtb71636e2007-07-15 23:38:15 -07001862 /*
1863 * Set interleaving policy for system init. Interleaving is only
1864 * enabled across suitably sized nodes (default is >= 16MB), or
1865 * fall back to the largest node if they're all smaller.
1866 */
1867 nodes_clear(interleave_nodes);
Christoph Lameter56bbd652007-10-16 01:25:35 -07001868 for_each_node_state(nid, N_HIGH_MEMORY) {
Paul Mundtb71636e2007-07-15 23:38:15 -07001869 unsigned long total_pages = node_present_pages(nid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001870
Paul Mundtb71636e2007-07-15 23:38:15 -07001871 /* Preserve the largest node */
1872 if (largest < total_pages) {
1873 largest = total_pages;
1874 prefer = nid;
1875 }
1876
1877 /* Interleave this node? */
1878 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1879 node_set(nid, interleave_nodes);
1880 }
1881
1882 /* All too small, use the largest */
1883 if (unlikely(nodes_empty(interleave_nodes)))
1884 node_set(prefer, interleave_nodes);
1885
David Rientjes028fec42008-04-28 02:12:25 -07001886 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001887 printk(KERN_WARNING "numa_policy_init: interleaving failed\n");
1888}
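/*
 * Example (illustrative): on a two-node machine where node 1 presents
 * only 8MB, the 16MB threshold above leaves interleave_nodes = { 0 },
 * so early boot interleaves over node 0 alone; if every node were that
 * small, only the single largest node would be used.
 */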
1889
Christoph Lameter8bccd852005-10-29 18:16:59 -07001890/* Reset policy of current process to default */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001891void numa_default_policy(void)
1892{
David Rientjes028fec42008-04-28 02:12:25 -07001893 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001894}
Paul Jackson68860ec2005-10-30 15:02:36 -08001895
Paul Jackson42253992006-01-08 01:01:59 -08001896/*
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08001897 * Display pages allocated per node and memory policy via /proc.
1898 */
Helge Deller15ad7cd2006-12-06 20:40:36 -08001899static const char * const policy_types[] =
1900 { "default", "prefer", "bind", "interleave" };
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08001901
1902/*
1903 * Convert a mempolicy into a string.
1904 * Returns the number of characters in buffer (if positive)
1905 * or an error (negative)
1906 */
1907static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1908{
1909 char *p = buffer;
1910 int l;
1911 nodemask_t nodes;
David Rientjesa3b51e02008-04-28 02:12:23 -07001912 unsigned short mode = pol ? pol->policy : MPOL_DEFAULT;
David Rientjesf5b087b2008-04-28 02:12:27 -07001913 unsigned short flags = pol ? pol->flags : 0;
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08001914
1915 switch (mode) {
1916 case MPOL_DEFAULT:
1917 nodes_clear(nodes);
1918 break;
1919
1920 case MPOL_PREFERRED:
1921 nodes_clear(nodes);
1922 node_set(pol->v.preferred_node, nodes);
1923 break;
1924
1925 case MPOL_BIND:
Mel Gorman19770b32008-04-28 02:12:18 -07001926 /* Fall through */
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08001927 case MPOL_INTERLEAVE:
1928 nodes = pol->v.nodes;
1929 break;
1930
1931 default:
1932 BUG();
1933 return -EFAULT;
1934 }
1935
1936 l = strlen(policy_types[mode]);
1937 if (buffer + maxlen < p + l + 1)
1938 return -ENOSPC;
1939
1940 strcpy(p, policy_types[mode]);
1941 p += l;
1942
David Rientjesf5b087b2008-04-28 02:12:27 -07001943 if (flags) {
1944 int need_bar = 0;
1945
1946 if (buffer + maxlen < p + 2)
1947 return -ENOSPC;
1948 *p++ = '=';
1949
1950 if (flags & MPOL_F_STATIC_NODES)
1951 p += sprintf(p, "%sstatic", need_bar++ ? "|" : "");
David Rientjes4c50bc02008-04-28 02:12:30 -07001952 if (flags & MPOL_F_RELATIVE_NODES)
1953 p += sprintf(p, "%srelative", need_bar++ ? "|" : "");
David Rientjesf5b087b2008-04-28 02:12:27 -07001954 }
1955
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08001956 if (!nodes_empty(nodes)) {
1957 if (buffer + maxlen < p + 2)
1958 return -ENOSPC;
1959 *p++ = '=';
1960 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1961 }
1962 return p - buffer;
1963}
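/*
 * Example outputs (illustrative): an interleave policy over nodes 0-3
 * with MPOL_F_STATIC_NODES is rendered as "interleave=static=0-3",
 * a preferred policy for node 2 as "prefer=2", and the default policy
 * as plain "default".
 */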
1964
1965struct numa_maps {
1966 unsigned long pages;
1967 unsigned long anon;
Christoph Lameter397874d2006-03-06 15:42:53 -08001968 unsigned long active;
1969 unsigned long writeback;
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08001970 unsigned long mapcount_max;
Christoph Lameter397874d2006-03-06 15:42:53 -08001971 unsigned long dirty;
1972 unsigned long swapcache;
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08001973 unsigned long node[MAX_NUMNODES];
1974};
1975
Christoph Lameter397874d2006-03-06 15:42:53 -08001976static void gather_stats(struct page *page, void *private, int pte_dirty)
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08001977{
1978 struct numa_maps *md = private;
1979 int count = page_mapcount(page);
1980
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08001981 md->pages++;
Christoph Lameter397874d2006-03-06 15:42:53 -08001982 if (pte_dirty || PageDirty(page))
1983 md->dirty++;
1984
1985 if (PageSwapCache(page))
1986 md->swapcache++;
1987
1988 if (PageActive(page))
1989 md->active++;
1990
1991 if (PageWriteback(page))
1992 md->writeback++;
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08001993
1994 if (PageAnon(page))
1995 md->anon++;
1996
Christoph Lameter397874d2006-03-06 15:42:53 -08001997 if (count > md->mapcount_max)
1998 md->mapcount_max = count;
1999
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002000 md->node[page_to_nid(page)]++;
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002001}
2002
Andrew Morton7f709ed2006-03-07 21:55:22 -08002003#ifdef CONFIG_HUGETLB_PAGE
Christoph Lameter397874d2006-03-06 15:42:53 -08002004static void check_huge_range(struct vm_area_struct *vma,
2005 unsigned long start, unsigned long end,
2006 struct numa_maps *md)
2007{
2008 unsigned long addr;
2009 struct page *page;
2010
2011 for (addr = start; addr < end; addr += HPAGE_SIZE) {
2012 pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
2013 pte_t pte;
2014
2015 if (!ptep)
2016 continue;
2017
2018 pte = *ptep;
2019 if (pte_none(pte))
2020 continue;
2021
2022 page = pte_page(pte);
2023 if (!page)
2024 continue;
2025
2026 gather_stats(page, md, pte_dirty(*ptep));
2027 }
2028}
Andrew Morton7f709ed2006-03-07 21:55:22 -08002029#else
2030static inline void check_huge_range(struct vm_area_struct *vma,
2031 unsigned long start, unsigned long end,
2032 struct numa_maps *md)
2033{
2034}
2035#endif
Christoph Lameter397874d2006-03-06 15:42:53 -08002036
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002037int show_numa_map(struct seq_file *m, void *v)
2038{
Eric W. Biederman99f89552006-06-26 00:25:55 -07002039 struct proc_maps_private *priv = m->private;
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002040 struct vm_area_struct *vma = v;
2041 struct numa_maps *md;
Christoph Lameter397874d2006-03-06 15:42:53 -08002042 struct file *file = vma->vm_file;
2043 struct mm_struct *mm = vma->vm_mm;
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07002044 struct mempolicy *pol;
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002045 int n;
2046 char buffer[50];
2047
Christoph Lameter397874d2006-03-06 15:42:53 -08002048 if (!mm)
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002049 return 0;
2050
2051 md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2052 if (!md)
2053 return 0;
2054
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07002055 pol = get_vma_policy(priv->task, vma, vma->vm_start);
2056 mpol_to_str(buffer, sizeof(buffer), pol);
2057 /*
2058 * unref shared or other task's mempolicy
2059 */
2060 if (pol != &default_policy && pol != current->mempolicy)
2061 __mpol_free(pol);
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002062
Christoph Lameter397874d2006-03-06 15:42:53 -08002063 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002064
Christoph Lameter397874d2006-03-06 15:42:53 -08002065 if (file) {
2066 seq_printf(m, " file=");
Jan Blunckc32c2f62008-02-14 19:38:43 -08002067 seq_path(m, &file->f_path, "\n\t= ");
Christoph Lameter397874d2006-03-06 15:42:53 -08002068 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2069 seq_printf(m, " heap");
2070 } else if (vma->vm_start <= mm->start_stack &&
2071 vma->vm_end >= mm->start_stack) {
2072 seq_printf(m, " stack");
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002073 }
Christoph Lameter397874d2006-03-06 15:42:53 -08002074
2075 if (is_vm_hugetlb_page(vma)) {
2076 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2077 seq_printf(m, " huge");
2078 } else {
2079 check_pgd_range(vma, vma->vm_start, vma->vm_end,
Christoph Lameter56bbd652007-10-16 01:25:35 -07002080 &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
Christoph Lameter397874d2006-03-06 15:42:53 -08002081 }
2082
2083 if (!md->pages)
2084 goto out;
2085
2086 if (md->anon)
2087 seq_printf(m," anon=%lu",md->anon);
2088
2089 if (md->dirty)
2090 seq_printf(m," dirty=%lu",md->dirty);
2091
2092 if (md->pages != md->anon && md->pages != md->dirty)
2093 seq_printf(m, " mapped=%lu", md->pages);
2094
2095 if (md->mapcount_max > 1)
2096 seq_printf(m, " mapmax=%lu", md->mapcount_max);
2097
2098 if (md->swapcache)
2099 seq_printf(m," swapcache=%lu", md->swapcache);
2100
2101 if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2102 seq_printf(m," active=%lu", md->active);
2103
2104 if (md->writeback)
2105 seq_printf(m," writeback=%lu", md->writeback);
2106
Christoph Lameter56bbd652007-10-16 01:25:35 -07002107 for_each_node_state(n, N_HIGH_MEMORY)
Christoph Lameter397874d2006-03-06 15:42:53 -08002108 if (md->node[n])
2109 seq_printf(m, " N%d=%lu", n, md->node[n]);
2110out:
2111 seq_putc(m, '\n');
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002112 kfree(md);
2113
2114 if (m->count < m->size)
Eric W. Biederman99f89552006-06-26 00:25:55 -07002115 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002116 return 0;
2117}
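/*
 * Example /proc/<pid>/numa_maps line produced above (illustrative):
 *
 *	2aaaaac000 interleave=0-1 file=/lib/libfoo.so mapped=12 mapmax=2 N0=6 N1=6
 *
 * i.e. the vma start, the policy string, the backing object, and then
 * the per-state and per-node page counts collected by gather_stats().
 */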