Merge branch 'akpm' (patches from Andrew)
Merge misc fixes from Andrew Morton:
"15 fixes"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
mm, docs: update memory.stat description with workingset* entries
mm: vmscan: scan until it finds eligible pages
mm, thp: copying user pages must schedule on collapse
dax: fix PMD data corruption when fault races with write
dax: fix data corruption when fault races with write
ext4: return to starting transaction in ext4_dax_huge_fault()
mm: fix data corruption due to stale mmap reads
dax: prevent invalidation of mapped DAX entries
Tigran has moved
mm, vmalloc: fix vmalloc users tracking properly
mm/khugepaged: add missed tracepoint for collapse_huge_page_swapin
gcov: support GCC 7.1
mm, vmstat: Remove spurious WARN() during zoneinfo print
time: delete current_fs_time()
hwpoison, memcg: forcibly uncharge LRU pages
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index e50b95c..dc5e2dc 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -918,6 +918,18 @@
Number of major page faults incurred
+ workingset_refault
+
+ Number of refaults of previously evicted pages
+
+ workingset_activate
+
+ Number of refaulted pages that were immediately activated
+
+ workingset_nodereclaim
+
+ Number of times a shadow node has been reclaimed
+
memory.swap.current
A read-only single value file which exists on non-root
diff --git a/Documentation/filesystems/bfs.txt b/Documentation/filesystems/bfs.txt
index 78043d5..843ce91 100644
--- a/Documentation/filesystems/bfs.txt
+++ b/Documentation/filesystems/bfs.txt
@@ -54,4 +54,4 @@
If you have any patches, questions or suggestions regarding this BFS
implementation please contact the author:
-Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+Tigran Aivazian <aivazian.tigran@gmail.com>
diff --git a/MAINTAINERS b/MAINTAINERS
index 6b36037..f7d568b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2483,7 +2483,7 @@
F: drivers/net/ethernet/ec_bhf.c
BFS FILE SYSTEM
-M: "Tigran A. Aivazian" <tigran@aivazian.fsnet.co.uk>
+M: "Tigran A. Aivazian" <aivazian.tigran@gmail.com>
S: Maintained
F: Documentation/filesystems/bfs.txt
F: fs/bfs/
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 1d38e53..45db4d2 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -10,7 +10,7 @@
* Author: Peter Oruba <peter.oruba@amd.com>
*
* Based on work by:
- * Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ * Tigran Aivazian <aivazian.tigran@gmail.com>
*
* early loader:
* Copyright (C) 2013 Advanced Micro Devices, Inc.
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index b4a4cd3..e53d3c9 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -1,7 +1,7 @@
/*
* CPU Microcode Update Driver for Linux
*
- * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ * Copyright (C) 2000-2006 Tigran Aivazian <aivazian.tigran@gmail.com>
* 2006 Shaohua Li <shaohua.li@intel.com>
* 2013-2016 Borislav Petkov <bp@alien8.de>
*
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 8325d8a..afdfd23 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -1,7 +1,7 @@
/*
* Intel CPU Microcode Update Driver for Linux
*
- * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ * Copyright (C) 2000-2006 Tigran Aivazian <aivazian.tigran@gmail.com>
* 2006 Shaohua Li <shaohua.li@intel.com>
*
* Intel CPU microcode early update for Linux
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index f2deec0..25e312c 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -1,7 +1,7 @@
/*
* fs/bfs/inode.c
* BFS superblock and inode operations.
- * Copyright (C) 1999-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ * Copyright (C) 1999-2006 Tigran Aivazian <aivazian.tigran@gmail.com>
* From fs/minix, Copyright (C) 1991, 1992 Linus Torvalds.
*
* Made endianness-clean by Andrew Stribblehill <ads@wompom.org>, 2005.
@@ -19,7 +19,7 @@
#include <linux/uaccess.h>
#include "bfs.h"
-MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
+MODULE_AUTHOR("Tigran Aivazian <aivazian.tigran@gmail.com>");
MODULE_DESCRIPTION("SCO UnixWare BFS filesystem for Linux");
MODULE_LICENSE("GPL");
diff --git a/fs/dax.c b/fs/dax.c
index 18fe9bb..c22eaf1 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -461,35 +461,6 @@
}
/*
- * Invalidate exceptional DAX entry if easily possible. This handles DAX
- * entries for invalidate_inode_pages() so we evict the entry only if we can
- * do so without blocking.
- */
-int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index)
-{
- int ret = 0;
- void *entry, **slot;
- struct radix_tree_root *page_tree = &mapping->page_tree;
-
- spin_lock_irq(&mapping->tree_lock);
- entry = __radix_tree_lookup(page_tree, index, NULL, &slot);
- if (!entry || !radix_tree_exceptional_entry(entry) ||
- slot_locked(mapping, slot))
- goto out;
- if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
- radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
- goto out;
- radix_tree_delete(page_tree, index);
- mapping->nrexceptional--;
- ret = 1;
-out:
- spin_unlock_irq(&mapping->tree_lock);
- if (ret)
- dax_wake_mapping_entry_waiter(mapping, index, entry, true);
- return ret;
-}
-
-/*
* Invalidate exceptional DAX entry if it is clean.
*/
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
@@ -1044,7 +1015,7 @@
* into page tables. We have to tear down these mappings so that data
* written by write(2) is visible in mmap.
*/
- if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) {
+ if (iomap->flags & IOMAP_F_NEW) {
invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_SHIFT,
(end - 1) >> PAGE_SHIFT);
@@ -1177,6 +1148,12 @@
if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
flags |= IOMAP_WRITE;
+ entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
+ if (IS_ERR(entry)) {
+ vmf_ret = dax_fault_return(PTR_ERR(entry));
+ goto out;
+ }
+
/*
* Note that we don't bother to use iomap_apply here: DAX required
* the file system block size to be equal the page size, which means
@@ -1185,17 +1162,11 @@
error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
if (error) {
vmf_ret = dax_fault_return(error);
- goto out;
+ goto unlock_entry;
}
if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
- vmf_ret = dax_fault_return(-EIO); /* fs corruption? */
- goto finish_iomap;
- }
-
- entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
- if (IS_ERR(entry)) {
- vmf_ret = dax_fault_return(PTR_ERR(entry));
- goto finish_iomap;
+ error = -EIO; /* fs corruption? */
+ goto error_finish_iomap;
}
sector = dax_iomap_sector(&iomap, pos);
@@ -1217,13 +1188,13 @@
}
if (error)
- goto error_unlock_entry;
+ goto error_finish_iomap;
__SetPageUptodate(vmf->cow_page);
vmf_ret = finish_fault(vmf);
if (!vmf_ret)
vmf_ret = VM_FAULT_DONE_COW;
- goto unlock_entry;
+ goto finish_iomap;
}
switch (iomap.type) {
@@ -1243,7 +1214,7 @@
case IOMAP_HOLE:
if (!(vmf->flags & FAULT_FLAG_WRITE)) {
vmf_ret = dax_load_hole(mapping, &entry, vmf);
- goto unlock_entry;
+ goto finish_iomap;
}
/*FALLTHRU*/
default:
@@ -1252,10 +1223,8 @@
break;
}
- error_unlock_entry:
+ error_finish_iomap:
vmf_ret = dax_fault_return(error) | major;
- unlock_entry:
- put_locked_mapping_entry(mapping, vmf->pgoff, entry);
finish_iomap:
if (ops->iomap_end) {
int copied = PAGE_SIZE;
@@ -1270,7 +1239,9 @@
*/
ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
}
-out:
+ unlock_entry:
+ put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+ out:
trace_dax_pte_fault_done(inode, vmf, vmf_ret);
return vmf_ret;
}
@@ -1417,19 +1388,6 @@
goto fallback;
/*
- * Note that we don't use iomap_apply here. We aren't doing I/O, only
- * setting up a mapping, so really we're using iomap_begin() as a way
- * to look up our filesystem block.
- */
- pos = (loff_t)pgoff << PAGE_SHIFT;
- error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
- if (error)
- goto fallback;
-
- if (iomap.offset + iomap.length < pos + PMD_SIZE)
- goto finish_iomap;
-
- /*
* grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
* PMD or a HZP entry. If it can't (because a 4k page is already in
* the tree, for instance), it will return -EEXIST and we just fall
@@ -1437,6 +1395,19 @@
*/
entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
if (IS_ERR(entry))
+ goto fallback;
+
+ /*
+ * Note that we don't use iomap_apply here. We aren't doing I/O, only
+ * setting up a mapping, so really we're using iomap_begin() as a way
+ * to look up our filesystem block.
+ */
+ pos = (loff_t)pgoff << PAGE_SHIFT;
+ error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
+ if (error)
+ goto unlock_entry;
+
+ if (iomap.offset + iomap.length < pos + PMD_SIZE)
goto finish_iomap;
switch (iomap.type) {
@@ -1446,7 +1417,7 @@
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
if (WARN_ON_ONCE(write))
- goto unlock_entry;
+ break;
result = dax_pmd_load_hole(vmf, &iomap, &entry);
break;
default:
@@ -1454,8 +1425,6 @@
break;
}
- unlock_entry:
- put_locked_mapping_entry(mapping, pgoff, entry);
finish_iomap:
if (ops->iomap_end) {
int copied = PMD_SIZE;
@@ -1471,6 +1440,8 @@
ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
&iomap);
}
+ unlock_entry:
+ put_locked_mapping_entry(mapping, pgoff, entry);
fallback:
if (result == VM_FAULT_FALLBACK) {
split_huge_pmd(vma, vmf->pmd, vmf->address);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index cefa983..831fd6b 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -257,6 +257,7 @@
enum page_entry_size pe_size)
{
int result;
+ handle_t *handle = NULL;
struct inode *inode = file_inode(vmf->vma->vm_file);
struct super_block *sb = inode->i_sb;
bool write = vmf->flags & FAULT_FLAG_WRITE;
@@ -264,12 +265,24 @@
if (write) {
sb_start_pagefault(sb);
file_update_time(vmf->vma->vm_file);
+ down_read(&EXT4_I(inode)->i_mmap_sem);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
+ EXT4_DATA_TRANS_BLOCKS(sb));
+ } else {
+ down_read(&EXT4_I(inode)->i_mmap_sem);
}
- down_read(&EXT4_I(inode)->i_mmap_sem);
- result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops);
- up_read(&EXT4_I(inode)->i_mmap_sem);
- if (write)
+ if (!IS_ERR(handle))
+ result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops);
+ else
+ result = VM_FAULT_SIGBUS;
+ if (write) {
+ if (!IS_ERR(handle))
+ ext4_journal_stop(handle);
+ up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(sb);
+ } else {
+ up_read(&EXT4_I(inode)->i_mmap_sem);
+ }
return result;
}
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 7fdf1d7..00ebac8 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -89,7 +89,6 @@
int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
const struct iomap_ops *ops);
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
-int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index);
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
pgoff_t index);
void dax_wake_mapping_entry_waiter(struct address_space *mapping,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0ad325e..803e5a9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1431,7 +1431,6 @@
inode->i_gid = make_kgid(inode->i_sb->s_user_ns, gid);
}
-extern struct timespec current_fs_time(struct super_block *sb);
extern struct timespec current_time(struct inode *inode);
/*
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 0328ce0..2d92dd0 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -6,7 +6,6 @@
#include <linux/list.h>
#include <linux/llist.h>
#include <asm/page.h> /* pgprot_t */
-#include <asm/pgtable.h> /* PAGE_KERNEL */
#include <linux/rbtree.h>
struct vm_area_struct; /* vma defining user mapping in mm_types.h */
@@ -83,22 +82,14 @@
const void *caller);
#ifndef CONFIG_MMU
extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags);
-#else
-extern void *__vmalloc_node(unsigned long size, unsigned long align,
- gfp_t gfp_mask, pgprot_t prot,
- int node, const void *caller);
-
-/*
- * We really want to have this inlined due to caller tracking. This
- * function is used by the highlevel vmalloc apis and so we want to track
- * their callers and inlining will achieve that.
- */
-static inline void *__vmalloc_node_flags(unsigned long size,
- int node, gfp_t flags)
+static inline void *__vmalloc_node_flags_caller(unsigned long size, int node,
+ gfp_t flags, void *caller)
{
- return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
- node, __builtin_return_address(0));
+ return __vmalloc_node_flags(size, node, flags);
}
+#else
+extern void *__vmalloc_node_flags_caller(unsigned long size,
+ int node, gfp_t flags, void *caller);
#endif
extern void vfree(const void *addr);
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index 2f9df37..c51a49c 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -98,6 +98,12 @@
}
EXPORT_SYMBOL(__gcov_merge_icall_topn);
+void __gcov_exit(void)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_exit);
+
/**
* gcov_enable_events - enable event reporting through gcov_event()
*
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index 6a5c239..46a18e7 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -18,7 +18,9 @@
#include <linux/vmalloc.h>
#include "gcov.h"
-#if (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1)
+#if (__GNUC__ >= 7)
+#define GCOV_COUNTERS 9
+#elif (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1)
#define GCOV_COUNTERS 10
#elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9
#define GCOV_COUNTERS 9
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 6574bba..49c73c6 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -230,20 +230,6 @@
return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret;
}
-/**
- * current_fs_time - Return FS time
- * @sb: Superblock.
- *
- * Return the current time truncated to the time granularity supported by
- * the fs.
- */
-struct timespec current_fs_time(struct super_block *sb)
-{
- struct timespec now = current_kernel_time();
- return timespec_trunc(now, sb->s_time_gran);
-}
-EXPORT_SYMBOL(current_fs_time);
-
/*
* Convert jiffies to milliseconds and back.
*
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 7cb9c88..945fd1c 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -612,7 +612,8 @@
spinlock_t *ptl)
{
pte_t *_pte;
- for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
+ for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
+ _pte++, page++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
struct page *src_page;
@@ -651,9 +652,7 @@
spin_unlock(ptl);
free_page_and_swap_cache(src_page);
}
-
- address += PAGE_SIZE;
- page++;
+ cond_resched();
}
}
@@ -907,8 +906,10 @@
return false;
}
/* check if the pmd is still valid */
- if (mm_find_pmd(mm, address) != pmd)
+ if (mm_find_pmd(mm, address) != pmd) {
+ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
return false;
+ }
}
if (ret & VM_FAULT_ERROR) {
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ff73899..9417208 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5528,7 +5528,7 @@
next = page->lru.next;
VM_BUG_ON_PAGE(PageLRU(page), page);
- VM_BUG_ON_PAGE(page_count(page), page);
+ VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page);
if (!page->mem_cgroup)
continue;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 73066b8..2527dfe 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -539,6 +539,13 @@
*/
ClearPageActive(p);
ClearPageUnevictable(p);
+
+ /*
+ * Poisoned page might never drop its ref count to 0 so we have
+ * to uncharge it manually from its memcg.
+ */
+ mem_cgroup_uncharge(p);
+
/*
* drop the page count elevated by isolate_lru_page()
*/
diff --git a/mm/truncate.c b/mm/truncate.c
index 83a059e..6479ed2 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -67,17 +67,14 @@
/*
* Invalidate exceptional entry if easily possible. This handles exceptional
- * entries for invalidate_inode_pages() so for DAX it evicts only unlocked and
- * clean entries.
+ * entries for invalidate_inode_pages().
*/
static int invalidate_exceptional_entry(struct address_space *mapping,
pgoff_t index, void *entry)
{
- /* Handled by shmem itself */
- if (shmem_mapping(mapping))
+ /* Handled by shmem itself, or for DAX we do nothing. */
+ if (shmem_mapping(mapping) || dax_mapping(mapping))
return 1;
- if (dax_mapping(mapping))
- return dax_invalidate_mapping_entry(mapping, index);
clear_shadow_entry(mapping, index, entry);
return 1;
}
@@ -689,7 +686,17 @@
cond_resched();
index++;
}
-
+ /*
+ * For DAX we invalidate page tables after invalidating the radix tree.
+ * We could invalidate page tables while invalidating each entry, but
+ * that would be expensive. Unmapping the range beforehand doesn't work
+ * either, as we have no cheap way to find out whether a radix tree
+ * entry got remapped later.
+ */
+ if (dax_mapping(mapping)) {
+ unmap_mapping_range(mapping, (loff_t)start << PAGE_SHIFT,
+ (loff_t)(end - start + 1) << PAGE_SHIFT, 0);
+ }
out:
cleancache_invalidate_inode(mapping);
return ret;
diff --git a/mm/util.c b/mm/util.c
index 718154d..464df34 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -382,7 +382,8 @@
if (ret || size <= PAGE_SIZE)
return ret;
- return __vmalloc_node_flags(size, node, flags);
+ return __vmalloc_node_flags_caller(size, node, flags,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(kvmalloc_node);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 194c22e..34a1c3e 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1649,6 +1649,9 @@
}
EXPORT_SYMBOL(vmap);
+static void *__vmalloc_node(unsigned long size, unsigned long align,
+ gfp_t gfp_mask, pgprot_t prot,
+ int node, const void *caller);
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, int node)
{
@@ -1791,7 +1794,7 @@
* with mm people.
*
*/
-void *__vmalloc_node(unsigned long size, unsigned long align,
+static void *__vmalloc_node(unsigned long size, unsigned long align,
gfp_t gfp_mask, pgprot_t prot,
int node, const void *caller)
{
@@ -1806,6 +1809,20 @@
}
EXPORT_SYMBOL(__vmalloc);
+static inline void *__vmalloc_node_flags(unsigned long size,
+ int node, gfp_t flags)
+{
+ return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
+ node, __builtin_return_address(0));
+}
+
+
+void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags,
+ void *caller)
+{
+ return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller);
+}
+
/**
* vmalloc - allocate virtually contiguous memory
* @size: allocation size
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2f45c05..8ad39bb 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1449,7 +1449,7 @@
*
* Appropriate locks must be held before calling this function.
*
- * @nr_to_scan: The number of pages to look through on the list.
+ * @nr_to_scan: The number of eligible pages to look through on the list.
* @lruvec: The LRU vector to pull pages from.
* @dst: The temp list to put pages on to.
* @nr_scanned: The number of pages that were scanned.
@@ -1469,11 +1469,13 @@
unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
unsigned long skipped = 0;
- unsigned long scan, nr_pages;
+ unsigned long scan, total_scan, nr_pages;
LIST_HEAD(pages_skipped);
- for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
- !list_empty(src); scan++) {
+ scan = 0;
+ for (total_scan = 0;
+ scan < nr_to_scan && nr_taken < nr_to_scan && !list_empty(src);
+ total_scan++) {
struct page *page;
page = lru_to_page(src);
@@ -1487,6 +1489,13 @@
continue;
}
+ /*
+ * Do not count skipped pages: counting them would make the function
+ * return with no isolated pages if the LRU mostly contains
+ * ineligible pages. The VM would then not reclaim any pages,
+ * triggering a premature OOM.
+ */
+ scan++;
switch (__isolate_lru_page(page, mode)) {
case 0:
nr_pages = hpage_nr_pages(page);
@@ -1524,9 +1533,9 @@
skipped += nr_skipped[zid];
}
}
- *nr_scanned = scan;
+ *nr_scanned = total_scan;
trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
- scan, skipped, nr_taken, mode, lru);
+ total_scan, skipped, nr_taken, mode, lru);
update_lru_sizes(lruvec, lru, nr_zone_taken);
return nr_taken;
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f5fa1bd..76f7367 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1359,8 +1359,6 @@
return zone == compare;
}
- /* The zone must be somewhere! */
- WARN_ON_ONCE(1);
return false;
}