mm: rework virtual memory accounting
When inspecting a vague code inside prctl(PR_SET_MM_MEM) call (which
testing the RLIMIT_DATA value to figure out if we're allowed to assign
new @start_brk, @brk, @start_data, @end_data from mm_struct) it's been
commited that RLIMIT_DATA in a form it's implemented now doesn't do
anything useful because most of user-space libraries use mmap() syscall
for dynamic memory allocations.
Linus suggested to convert RLIMIT_DATA rlimit into something suitable
for anonymous memory accounting. But in this patch we go further, and
the changes are bundled together as:
* keep vma counting if CONFIG_PROC_FS=n, will be used for limits
* replace mm->shared_vm with better defined mm->data_vm
* account anonymous executable areas as executable
* account file-backed growsdown/up areas as stack
* drop struct file* argument from vm_stat_account
* enforce RLIMIT_DATA for size of data areas
This way code looks cleaner: now code/stack/data classification depends
only on vm_flags state:
VM_EXEC & ~VM_WRITE -> code (VmExe + VmLib in proc)
VM_GROWSUP | VM_GROWSDOWN -> stack (VmStk)
VM_WRITE & ~VM_SHARED & !stack -> data (VmData)
The rest (VmSize - VmData - VmStk - VmExe - VmLib) could be called
"shared", but that might be strange beast like readonly-private or VM_IO
area.
- RLIMIT_AS limits whole address space "VmSize"
- RLIMIT_STACK limits stack "VmStk" (but each vma individually)
- RLIMIT_DATA now limits "VmData"
Signed-off-by: Konstantin Khlebnikov <koct9i@gmail.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Vegard Nossum <vegard.nossum@oracle.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Willy Tarreau <w@1wt.eu>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Kees Cook <keescook@google.com>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Pavel Emelyanov <xemul@virtuozzo.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/mm/mmap.c b/mm/mmap.c
index f32b84a..b3f00b6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1220,24 +1220,6 @@
return NULL;
}
-#ifdef CONFIG_PROC_FS
-void vm_stat_account(struct mm_struct *mm, unsigned long flags,
- struct file *file, long pages)
-{
- const unsigned long stack_flags
- = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
-
- mm->total_vm += pages;
-
- if (file) {
- mm->shared_vm += pages;
- if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
- mm->exec_vm += pages;
- } else if (flags & stack_flags)
- mm->stack_vm += pages;
-}
-#endif /* CONFIG_PROC_FS */
-
/*
* If a hint addr is less than mmap_min_addr change hint to be as
* low as possible but still greater than mmap_min_addr
@@ -1556,7 +1538,7 @@
unsigned long charged = 0;
/* Check against address space limit. */
- if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
+ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
unsigned long nr_pages;
/*
@@ -1565,7 +1547,8 @@
*/
nr_pages = count_vma_pages_range(mm, addr, addr + len);
- if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages))
+ if (!may_expand_vm(mm, vm_flags,
+ (len >> PAGE_SHIFT) - nr_pages))
return -ENOMEM;
}
@@ -1664,7 +1647,7 @@
out:
perf_event_mmap(vma);
- vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
+ vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current->mm)))
@@ -2111,7 +2094,7 @@
unsigned long new_start, actual_size;
/* address space limit tests */
- if (!may_expand_vm(mm, grow))
+ if (!may_expand_vm(mm, vma->vm_flags, grow))
return -ENOMEM;
/* Stack limit test */
@@ -2208,8 +2191,7 @@
spin_lock(&mm->page_table_lock);
if (vma->vm_flags & VM_LOCKED)
mm->locked_vm += grow;
- vm_stat_account(mm, vma->vm_flags,
- vma->vm_file, grow);
+ vm_stat_account(mm, vma->vm_flags, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_end = address;
anon_vma_interval_tree_post_update_vma(vma);
@@ -2284,8 +2266,7 @@
spin_lock(&mm->page_table_lock);
if (vma->vm_flags & VM_LOCKED)
mm->locked_vm += grow;
- vm_stat_account(mm, vma->vm_flags,
- vma->vm_file, grow);
+ vm_stat_account(mm, vma->vm_flags, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_start = address;
vma->vm_pgoff -= grow;
@@ -2399,7 +2380,7 @@
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += nrpages;
- vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
+ vm_stat_account(mm, vma->vm_flags, -nrpages);
vma = remove_vma(vma);
} while (vma);
vm_unacct_memory(nr_accounted);
@@ -2769,7 +2750,7 @@
}
/* Check against address space limits *after* clearing old maps... */
- if (!may_expand_vm(mm, len >> PAGE_SHIFT))
+ if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
return -ENOMEM;
if (mm->map_count > sysctl_max_map_count)
@@ -2804,6 +2785,7 @@
out:
perf_event_mmap(vma);
mm->total_vm += len >> PAGE_SHIFT;
+ mm->data_vm += len >> PAGE_SHIFT;
if (flags & VM_LOCKED)
mm->locked_vm += (len >> PAGE_SHIFT);
vma->vm_flags |= VM_SOFTDIRTY;
@@ -2995,9 +2977,28 @@
* Return true if the calling process may expand its vm space by the passed
* number of pages
*/
-int may_expand_vm(struct mm_struct *mm, unsigned long npages)
+bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
{
- return mm->total_vm + npages <= rlimit(RLIMIT_AS) >> PAGE_SHIFT;
+ if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT)
+ return false;
+
+ if ((flags & (VM_WRITE | VM_SHARED | (VM_STACK_FLAGS &
+ (VM_GROWSUP | VM_GROWSDOWN)))) == VM_WRITE)
+ return mm->data_vm + npages <= rlimit(RLIMIT_DATA);
+
+ return true;
+}
+
+void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
+{
+ mm->total_vm += npages;
+
+ if ((flags & (VM_EXEC | VM_WRITE)) == VM_EXEC)
+ mm->exec_vm += npages;
+ else if (flags & (VM_STACK_FLAGS & (VM_GROWSUP | VM_GROWSDOWN)))
+ mm->stack_vm += npages;
+ else if ((flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
+ mm->data_vm += npages;
}
static int special_mapping_fault(struct vm_area_struct *vma,
@@ -3079,7 +3080,7 @@
if (ret)
goto out;
- mm->total_vm += len >> PAGE_SHIFT;
+ vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);
perf_event_mmap(vma);