Blame - mm/memory-failure.c - kernel/msm-5.4

blob: 22d2b2028e54e60db4fdb0c1a808a7463a1c03b0 [file] [log] [blame]

Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	1	/*
				2	* Copyright (C) 2008, 2009 Intel Corporation
				3	* Authors: Andi Kleen, Fengguang Wu
				4	*
				5	* This software may be redistributed and/or modified under the terms of
				6	* the GNU General Public License ("GPL") version 2 only as published by the
				7	* Free Software Foundation.
				8	*
				9	* High level machine check handler. Handles pages reported by the
				10	* hardware as being corrupted usually due to a 2bit ECC memory or cache
				11	* failure.
				12	*
				13	* Handles page cache pages in various states. The tricky part
				14	* here is that we can access any page asynchronous to other VM
				15	* users, because memory failures could happen anytime and anywhere,
				16	* possibly violating some of their assumptions. This is why this code
				17	* has to be extremely careful. Generally it tries to use normal locking
				18	* rules, as in get the standard locks, even if that means the
				19	* error handling takes potentially a long time.
				20	*
				21	* The operation to map back from RMAP chains to processes has to walk
				22	* the complete process list and has non linear complexity with the number
				23	* mappings. In short it can be quite slow. But since memory corruptions
				24	* are rare we hope to get away with this.
				25	*/
				26
				27	/*
				28	* Notebook:
				29	* - hugetlb needs more code
				30	* - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
				31	* - pass bad pages to kdump next kernel
				32	*/
				33	#define DEBUG 1 /* remove me in 2.6.34 */
				34	#include <linux/kernel.h>
				35	#include <linux/mm.h>
				36	#include <linux/page-flags.h>
Wu Fengguang	478c5ff	2009-12-16 12:19:59 +0100	[diff] [blame^]	37	#include <linux/kernel-page-flags.h>
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	38	#include <linux/sched.h>
Hugh Dickins	01e00f8	2009-10-13 15:02:11 +0100	[diff] [blame]	39	#include <linux/ksm.h>
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	40	#include <linux/rmap.h>
				41	#include <linux/pagemap.h>
				42	#include <linux/swap.h>
				43	#include <linux/backing-dev.h>
				44	#include "internal.h"
				45
				46	int sysctl_memory_failure_early_kill __read_mostly = 0;
				47
				48	int sysctl_memory_failure_recovery __read_mostly = 1;
				49
				50	atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
				51
Wu Fengguang	7c116f2	2009-12-16 12:19:59 +0100	[diff] [blame]	52	u32 hwpoison_filter_dev_major = ~0U;
				53	u32 hwpoison_filter_dev_minor = ~0U;
Wu Fengguang	478c5ff	2009-12-16 12:19:59 +0100	[diff] [blame^]	54	u64 hwpoison_filter_flags_mask;
				55	u64 hwpoison_filter_flags_value;
Wu Fengguang	7c116f2	2009-12-16 12:19:59 +0100	[diff] [blame]	56	EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
				57	EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
Wu Fengguang	478c5ff	2009-12-16 12:19:59 +0100	[diff] [blame^]	58	EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
				59	EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
Wu Fengguang	7c116f2	2009-12-16 12:19:59 +0100	[diff] [blame]	60
				61	static int hwpoison_filter_dev(struct page *p)
				62	{
				63	struct address_space *mapping;
				64	dev_t dev;
				65
				66	if (hwpoison_filter_dev_major == ~0U &&
				67	hwpoison_filter_dev_minor == ~0U)
				68	return 0;
				69
				70	/*
				71	* page_mapping() does not accept slab page
				72	*/
				73	if (PageSlab(p))
				74	return -EINVAL;
				75
				76	mapping = page_mapping(p);
				77	if (mapping == NULL \|\| mapping->host == NULL)
				78	return -EINVAL;
				79
				80	dev = mapping->host->i_sb->s_dev;
				81	if (hwpoison_filter_dev_major != ~0U &&
				82	hwpoison_filter_dev_major != MAJOR(dev))
				83	return -EINVAL;
				84	if (hwpoison_filter_dev_minor != ~0U &&
				85	hwpoison_filter_dev_minor != MINOR(dev))
				86	return -EINVAL;
				87
				88	return 0;
				89	}
				90
Wu Fengguang	478c5ff	2009-12-16 12:19:59 +0100	[diff] [blame^]	91	static int hwpoison_filter_flags(struct page *p)
				92	{
				93	if (!hwpoison_filter_flags_mask)
				94	return 0;
				95
				96	if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
				97	hwpoison_filter_flags_value)
				98	return 0;
				99	else
				100	return -EINVAL;
				101	}
				102
Wu Fengguang	7c116f2	2009-12-16 12:19:59 +0100	[diff] [blame]	103	int hwpoison_filter(struct page *p)
				104	{
				105	if (hwpoison_filter_dev(p))
				106	return -EINVAL;
				107
Wu Fengguang	478c5ff	2009-12-16 12:19:59 +0100	[diff] [blame^]	108	if (hwpoison_filter_flags(p))
				109	return -EINVAL;
				110
Wu Fengguang	7c116f2	2009-12-16 12:19:59 +0100	[diff] [blame]	111	return 0;
				112	}
				113	EXPORT_SYMBOL_GPL(hwpoison_filter);
				114
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	115	/*
				116	* Send all the processes who have the page mapped an ``action optional''
				117	* signal.
				118	*/
				119	static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
				120	unsigned long pfn)
				121	{
				122	struct siginfo si;
				123	int ret;
				124
				125	printk(KERN_ERR
				126	"MCE %#lx: Killing %s:%d early due to hardware memory corruption\n",
				127	pfn, t->comm, t->pid);
				128	si.si_signo = SIGBUS;
				129	si.si_errno = 0;
				130	si.si_code = BUS_MCEERR_AO;
				131	si.si_addr = (void *)addr;
				132	#ifdef __ARCH_SI_TRAPNO
				133	si.si_trapno = trapno;
				134	#endif
				135	si.si_addr_lsb = PAGE_SHIFT;
				136	/*
				137	* Don't use force here, it's convenient if the signal
				138	* can be temporarily blocked.
				139	* This could cause a loop when the user sets SIGBUS
				140	* to SIG_IGN, but hopefully noone will do that?
				141	*/
				142	ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
				143	if (ret < 0)
				144	printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
				145	t->comm, t->pid, ret);
				146	return ret;
				147	}
				148
				149	/*
Andi Kleen	588f9ce	2009-12-16 12:19:57 +0100	[diff] [blame]	150	* When a unknown page type is encountered drain as many buffers as possible
				151	* in the hope to turn the page into a LRU or free page, which we can handle.
				152	*/
				153	void shake_page(struct page *p)
				154	{
				155	if (!PageSlab(p)) {
				156	lru_add_drain_all();
				157	if (PageLRU(p))
				158	return;
				159	drain_all_pages();
				160	if (PageLRU(p) \|\| is_free_buddy_page(p))
				161	return;
				162	}
				163	/*
				164	* Could call shrink_slab here (which would also
				165	* shrink other caches). Unfortunately that might
				166	* also access the corrupted page, which could be fatal.
				167	*/
				168	}
				169	EXPORT_SYMBOL_GPL(shake_page);
				170
				171	/*
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	172	* Kill all processes that have a poisoned page mapped and then isolate
				173	* the page.
				174	*
				175	* General strategy:
				176	* Find all processes having the page mapped and kill them.
				177	* But we keep a page reference around so that the page is not
				178	* actually freed yet.
				179	* Then stash the page away
				180	*
				181	* There's no convenient way to get back to mapped processes
				182	* from the VMAs. So do a brute-force search over all
				183	* running processes.
				184	*
				185	* Remember that machine checks are not common (or rather
				186	* if they are common you have other problems), so this shouldn't
				187	* be a performance issue.
				188	*
				189	* Also there are some races possible while we get from the
				190	* error detection to actually handle it.
				191	*/
				192
				193	struct to_kill {
				194	struct list_head nd;
				195	struct task_struct *tsk;
				196	unsigned long addr;
				197	unsigned addr_valid:1;
				198	};
				199
				200	/*
				201	* Failure handling: if we can't find or can't kill a process there's
				202	* not much we can do. We just print a message and ignore otherwise.
				203	*/
				204
				205	/*
				206	* Schedule a process for later kill.
				207	* Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
				208	* TBD would GFP_NOIO be enough?
				209	*/
				210	static void add_to_kill(struct task_struct tsk, struct page p,
				211	struct vm_area_struct *vma,
				212	struct list_head *to_kill,
				213	struct to_kill **tkc)
				214	{
				215	struct to_kill *tk;
				216
				217	if (*tkc) {
				218	tk = *tkc;
				219	*tkc = NULL;
				220	} else {
				221	tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
				222	if (!tk) {
				223	printk(KERN_ERR
				224	"MCE: Out of memory while machine check handling\n");
				225	return;
				226	}
				227	}
				228	tk->addr = page_address_in_vma(p, vma);
				229	tk->addr_valid = 1;
				230
				231	/*
				232	* In theory we don't have to kill when the page was
				233	* munmaped. But it could be also a mremap. Since that's
				234	* likely very rare kill anyways just out of paranoia, but use
				235	* a SIGKILL because the error is not contained anymore.
				236	*/
				237	if (tk->addr == -EFAULT) {
				238	pr_debug("MCE: Unable to find user space address %lx in %s\n",
				239	page_to_pfn(p), tsk->comm);
				240	tk->addr_valid = 0;
				241	}
				242	get_task_struct(tsk);
				243	tk->tsk = tsk;
				244	list_add_tail(&tk->nd, to_kill);
				245	}
				246
				247	/*
				248	* Kill the processes that have been collected earlier.
				249	*
				250	* Only do anything when DOIT is set, otherwise just free the list
				251	* (this is used for clean pages which do not need killing)
				252	* Also when FAIL is set do a force kill because something went
				253	* wrong earlier.
				254	*/
				255	static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
				256	int fail, unsigned long pfn)
				257	{
				258	struct to_kill tk, next;
				259
				260	list_for_each_entry_safe (tk, next, to_kill, nd) {
				261	if (doit) {
				262	/*
André Goddard Rosa	af901ca	2009-11-14 13:09:05 -0200	[diff] [blame]	263	* In case something went wrong with munmapping
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	264	* make sure the process doesn't catch the
				265	* signal and then access the memory. Just kill it.
				266	* the signal handlers
				267	*/
				268	if (fail \|\| tk->addr_valid == 0) {
				269	printk(KERN_ERR
				270	"MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
				271	pfn, tk->tsk->comm, tk->tsk->pid);
				272	force_sig(SIGKILL, tk->tsk);
				273	}
				274
				275	/*
				276	* In theory the process could have mapped
				277	* something else on the address in-between. We could
				278	* check for that, but we need to tell the
				279	* process anyways.
				280	*/
				281	else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
				282	pfn) < 0)
				283	printk(KERN_ERR
				284	"MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
				285	pfn, tk->tsk->comm, tk->tsk->pid);
				286	}
				287	put_task_struct(tk->tsk);
				288	kfree(tk);
				289	}
				290	}
				291
				292	static int task_early_kill(struct task_struct *tsk)
				293	{
				294	if (!tsk->mm)
				295	return 0;
				296	if (tsk->flags & PF_MCE_PROCESS)
				297	return !!(tsk->flags & PF_MCE_EARLY);
				298	return sysctl_memory_failure_early_kill;
				299	}
				300
				301	/*
				302	* Collect processes when the error hit an anonymous page.
				303	*/
				304	static void collect_procs_anon(struct page page, struct list_head to_kill,
				305	struct to_kill **tkc)
				306	{
				307	struct vm_area_struct *vma;
				308	struct task_struct *tsk;
				309	struct anon_vma *av;
				310
				311	read_lock(&tasklist_lock);
				312	av = page_lock_anon_vma(page);
				313	if (av == NULL) /* Not actually mapped anymore */
				314	goto out;
				315	for_each_process (tsk) {
				316	if (!task_early_kill(tsk))
				317	continue;
				318	list_for_each_entry (vma, &av->head, anon_vma_node) {
				319	if (!page_mapped_in_vma(page, vma))
				320	continue;
				321	if (vma->vm_mm == tsk->mm)
				322	add_to_kill(tsk, page, vma, to_kill, tkc);
				323	}
				324	}
				325	page_unlock_anon_vma(av);
				326	out:
				327	read_unlock(&tasklist_lock);
				328	}
				329
				330	/*
				331	* Collect processes when the error hit a file mapped page.
				332	*/
				333	static void collect_procs_file(struct page page, struct list_head to_kill,
				334	struct to_kill **tkc)
				335	{
				336	struct vm_area_struct *vma;
				337	struct task_struct *tsk;
				338	struct prio_tree_iter iter;
				339	struct address_space *mapping = page->mapping;
				340
				341	/*
				342	* A note on the locking order between the two locks.
				343	* We don't rely on this particular order.
				344	* If you have some other code that needs a different order
				345	* feel free to switch them around. Or add a reverse link
				346	* from mm_struct to task_struct, then this could be all
				347	* done without taking tasklist_lock and looping over all tasks.
				348	*/
				349
				350	read_lock(&tasklist_lock);
				351	spin_lock(&mapping->i_mmap_lock);
				352	for_each_process(tsk) {
				353	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
				354
				355	if (!task_early_kill(tsk))
				356	continue;
				357
				358	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
				359	pgoff) {
				360	/*
				361	* Send early kill signal to tasks where a vma covers
				362	* the page but the corrupted page is not necessarily
				363	* mapped it in its pte.
				364	* Assume applications who requested early kill want
				365	* to be informed of all such data corruptions.
				366	*/
				367	if (vma->vm_mm == tsk->mm)
				368	add_to_kill(tsk, page, vma, to_kill, tkc);
				369	}
				370	}
				371	spin_unlock(&mapping->i_mmap_lock);
				372	read_unlock(&tasklist_lock);
				373	}
				374
				375	/*
				376	* Collect the processes who have the corrupted page mapped to kill.
				377	* This is done in two steps for locking reasons.
				378	* First preallocate one tokill structure outside the spin locks,
				379	* so that we can kill at least one process reasonably reliable.
				380	*/
				381	static void collect_procs(struct page page, struct list_head tokill)
				382	{
				383	struct to_kill *tk;
				384
				385	if (!page->mapping)
				386	return;
				387
				388	tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
				389	if (!tk)
				390	return;
				391	if (PageAnon(page))
				392	collect_procs_anon(page, tokill, &tk);
				393	else
				394	collect_procs_file(page, tokill, &tk);
				395	kfree(tk);
				396	}
				397
				398	/*
				399	* Error handlers for various types of pages.
				400	*/
				401
				402	enum outcome {
Wu Fengguang	d95ea51	2009-12-16 12:19:58 +0100	[diff] [blame]	403	IGNORED, /* Error: cannot be handled */
				404	FAILED, /* Error: handling failed */
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	405	DELAYED, /* Will be handled later */
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	406	RECOVERED, /* Successfully recovered */
				407	};
				408
				409	static const char *action_name[] = {
Wu Fengguang	d95ea51	2009-12-16 12:19:58 +0100	[diff] [blame]	410	[IGNORED] = "Ignored",
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	411	[FAILED] = "Failed",
				412	[DELAYED] = "Delayed",
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	413	[RECOVERED] = "Recovered",
				414	};
				415
				416	/*
Wu Fengguang	dc2a1cb	2009-12-16 12:19:58 +0100	[diff] [blame]	417	* XXX: It is possible that a page is isolated from LRU cache,
				418	* and then kept in swap cache or failed to remove from page cache.
				419	* The page count will stop it from being freed by unpoison.
				420	* Stress tests should be aware of this memory leak problem.
				421	*/
				422	static int delete_from_lru_cache(struct page *p)
				423	{
				424	if (!isolate_lru_page(p)) {
				425	/*
				426	* Clear sensible page flags, so that the buddy system won't
				427	* complain when the page is unpoison-and-freed.
				428	*/
				429	ClearPageActive(p);
				430	ClearPageUnevictable(p);
				431	/*
				432	* drop the page count elevated by isolate_lru_page()
				433	*/
				434	page_cache_release(p);
				435	return 0;
				436	}
				437	return -EIO;
				438	}
				439
				440	/*
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	441	* Error hit kernel page.
				442	* Do nothing, try to be lucky and not touch this instead. For a few cases we
				443	* could be more sophisticated.
				444	*/
				445	static int me_kernel(struct page *p, unsigned long pfn)
				446	{
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	447	return IGNORED;
				448	}
				449
				450	/*
				451	* Page in unknown state. Do nothing.
				452	*/
				453	static int me_unknown(struct page *p, unsigned long pfn)
				454	{
				455	printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
				456	return FAILED;
				457	}
				458
				459	/*
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	460	* Clean (or cleaned) page cache page.
				461	*/
				462	static int me_pagecache_clean(struct page *p, unsigned long pfn)
				463	{
				464	int err;
				465	int ret = FAILED;
				466	struct address_space *mapping;
				467
Wu Fengguang	dc2a1cb	2009-12-16 12:19:58 +0100	[diff] [blame]	468	delete_from_lru_cache(p);
				469
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	470	/*
				471	* For anonymous pages we're done the only reference left
				472	* should be the one m_f() holds.
				473	*/
				474	if (PageAnon(p))
				475	return RECOVERED;
				476
				477	/*
				478	* Now truncate the page in the page cache. This is really
				479	* more like a "temporary hole punch"
				480	* Don't do this for block devices when someone else
				481	* has a reference, because it could be file system metadata
				482	* and that's not safe to truncate.
				483	*/
				484	mapping = page_mapping(p);
				485	if (!mapping) {
				486	/*
				487	* Page has been teared down in the meanwhile
				488	*/
				489	return FAILED;
				490	}
				491
				492	/*
				493	* Truncation is a bit tricky. Enable it per file system for now.
				494	*
				495	* Open: to take i_mutex or not for this? Right now we don't.
				496	*/
				497	if (mapping->a_ops->error_remove_page) {
				498	err = mapping->a_ops->error_remove_page(mapping, p);
				499	if (err != 0) {
				500	printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
				501	pfn, err);
				502	} else if (page_has_private(p) &&
				503	!try_to_release_page(p, GFP_NOIO)) {
				504	pr_debug("MCE %#lx: failed to release buffers\n", pfn);
				505	} else {
				506	ret = RECOVERED;
				507	}
				508	} else {
				509	/*
				510	* If the file system doesn't support it just invalidate
				511	* This fails on dirty or anything with private pages
				512	*/
				513	if (invalidate_inode_page(p))
				514	ret = RECOVERED;
				515	else
				516	printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
				517	pfn);
				518	}
				519	return ret;
				520	}
				521
				522	/*
				523	* Dirty cache page page
				524	* Issues: when the error hit a hole page the error is not properly
				525	* propagated.
				526	*/
				527	static int me_pagecache_dirty(struct page *p, unsigned long pfn)
				528	{
				529	struct address_space *mapping = page_mapping(p);
				530
				531	SetPageError(p);
				532	/* TBD: print more information about the file. */
				533	if (mapping) {
				534	/*
				535	* IO error will be reported by write(), fsync(), etc.
				536	* who check the mapping.
				537	* This way the application knows that something went
				538	* wrong with its dirty file data.
				539	*
				540	* There's one open issue:
				541	*
				542	* The EIO will be only reported on the next IO
				543	* operation and then cleared through the IO map.
				544	* Normally Linux has two mechanisms to pass IO error
				545	* first through the AS_EIO flag in the address space
				546	* and then through the PageError flag in the page.
				547	* Since we drop pages on memory failure handling the
				548	* only mechanism open to use is through AS_AIO.
				549	*
				550	* This has the disadvantage that it gets cleared on
				551	* the first operation that returns an error, while
				552	* the PageError bit is more sticky and only cleared
				553	* when the page is reread or dropped. If an
				554	* application assumes it will always get error on
				555	* fsync, but does other operations on the fd before
				556	* and the page is dropped inbetween then the error
				557	* will not be properly reported.
				558	*
				559	* This can already happen even without hwpoisoned
				560	* pages: first on metadata IO errors (which only
				561	* report through AS_EIO) or when the page is dropped
				562	* at the wrong time.
				563	*
				564	* So right now we assume that the application DTRT on
				565	* the first EIO, but we're not worse than other parts
				566	* of the kernel.
				567	*/
				568	mapping_set_error(mapping, EIO);
				569	}
				570
				571	return me_pagecache_clean(p, pfn);
				572	}
				573
				574	/*
				575	* Clean and dirty swap cache.
				576	*
				577	* Dirty swap cache page is tricky to handle. The page could live both in page
				578	* cache and swap cache(ie. page is freshly swapped in). So it could be
				579	* referenced concurrently by 2 types of PTEs:
				580	* normal PTEs and swap PTEs. We try to handle them consistently by calling
				581	* try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
				582	* and then
				583	* - clear dirty bit to prevent IO
				584	* - remove from LRU
				585	* - but keep in the swap cache, so that when we return to it on
				586	* a later page fault, we know the application is accessing
				587	* corrupted data and shall be killed (we installed simple
				588	* interception code in do_swap_page to catch it).
				589	*
				590	* Clean swap cache pages can be directly isolated. A later page fault will
				591	* bring in the known good data from disk.
				592	*/
				593	static int me_swapcache_dirty(struct page *p, unsigned long pfn)
				594	{
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	595	ClearPageDirty(p);
				596	/* Trigger EIO in shmem: */
				597	ClearPageUptodate(p);
				598
Wu Fengguang	dc2a1cb	2009-12-16 12:19:58 +0100	[diff] [blame]	599	if (!delete_from_lru_cache(p))
				600	return DELAYED;
				601	else
				602	return FAILED;
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	603	}
				604
				605	static int me_swapcache_clean(struct page *p, unsigned long pfn)
				606	{
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	607	delete_from_swap_cache(p);
Wu Fengguang	e43c3af	2009-09-29 13:16:20 +0800	[diff] [blame]	608
Wu Fengguang	dc2a1cb	2009-12-16 12:19:58 +0100	[diff] [blame]	609	if (!delete_from_lru_cache(p))
				610	return RECOVERED;
				611	else
				612	return FAILED;
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	613	}
				614
				615	/*
				616	* Huge pages. Needs work.
				617	* Issues:
				618	* No rmap support so we cannot find the original mapper. In theory could walk
				619	* all MMs and look for the mappings, but that would be non atomic and racy.
				620	* Need rmap for hugepages for this. Alternatively we could employ a heuristic,
				621	* like just walking the current process and hoping it has it mapped (that
				622	* should be usually true for the common "shared database cache" case)
				623	* Should handle free huge pages and dequeue them too, but this needs to
				624	* handle huge page accounting correctly.
				625	*/
				626	static int me_huge_page(struct page *p, unsigned long pfn)
				627	{
				628	return FAILED;
				629	}
				630
				631	/*
				632	* Various page states we can handle.
				633	*
				634	* A page state is defined by its current page->flags bits.
				635	* The table matches them in order and calls the right handler.
				636	*
				637	* This is quite tricky because we can access page at any time
				638	* in its live cycle, so all accesses have to be extremly careful.
				639	*
				640	* This is not complete. More states could be added.
				641	* For any missing state don't attempt recovery.
				642	*/
				643
				644	#define dirty (1UL << PG_dirty)
				645	#define sc (1UL << PG_swapcache)
				646	#define unevict (1UL << PG_unevictable)
				647	#define mlock (1UL << PG_mlocked)
				648	#define writeback (1UL << PG_writeback)
				649	#define lru (1UL << PG_lru)
				650	#define swapbacked (1UL << PG_swapbacked)
				651	#define head (1UL << PG_head)
				652	#define tail (1UL << PG_tail)
				653	#define compound (1UL << PG_compound)
				654	#define slab (1UL << PG_slab)
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	655	#define reserved (1UL << PG_reserved)
				656
				657	static struct page_state {
				658	unsigned long mask;
				659	unsigned long res;
				660	char *msg;
				661	int (action)(struct page p, unsigned long pfn);
				662	} error_states[] = {
Wu Fengguang	d95ea51	2009-12-16 12:19:58 +0100	[diff] [blame]	663	{ reserved, reserved, "reserved kernel", me_kernel },
Wu Fengguang	95d01fc	2009-12-16 12:19:58 +0100	[diff] [blame]	664	/*
				665	* free pages are specially detected outside this table:
				666	* PG_buddy pages only make a small fraction of all free pages.
				667	*/
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	668
				669	/*
				670	* Could in theory check if slab page is free or if we can drop
				671	* currently unused objects without touching them. But just
				672	* treat it as standard kernel for now.
				673	*/
				674	{ slab, slab, "kernel slab", me_kernel },
				675
				676	#ifdef CONFIG_PAGEFLAGS_EXTENDED
				677	{ head, head, "huge", me_huge_page },
				678	{ tail, tail, "huge", me_huge_page },
				679	#else
				680	{ compound, compound, "huge", me_huge_page },
				681	#endif
				682
				683	{ sc\|dirty, sc\|dirty, "swapcache", me_swapcache_dirty },
				684	{ sc\|dirty, sc, "swapcache", me_swapcache_clean },
				685
				686	{ unevict\|dirty, unevict\|dirty, "unevictable LRU", me_pagecache_dirty},
				687	{ unevict, unevict, "unevictable LRU", me_pagecache_clean},
				688
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	689	{ mlock\|dirty, mlock\|dirty, "mlocked LRU", me_pagecache_dirty },
				690	{ mlock, mlock, "mlocked LRU", me_pagecache_clean },
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	691
				692	{ lru\|dirty, lru\|dirty, "LRU", me_pagecache_dirty },
				693	{ lru\|dirty, lru, "clean LRU", me_pagecache_clean },
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	694
				695	/*
				696	* Catchall entry: must be at end.
				697	*/
				698	{ 0, 0, "unknown page state", me_unknown },
				699	};
				700
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	701	static void action_result(unsigned long pfn, char *msg, int result)
				702	{
Wu Fengguang	a7560fc	2009-12-16 12:19:57 +0100	[diff] [blame]	703	struct page *page = pfn_to_page(pfn);
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	704
				705	printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
				706	pfn,
Wu Fengguang	a7560fc	2009-12-16 12:19:57 +0100	[diff] [blame]	707	PageDirty(page) ? "dirty " : "",
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	708	msg, action_name[result]);
				709	}
				710
				711	static int page_action(struct page_state ps, struct page p,
Wu Fengguang	bd1ce5f	2009-12-16 12:19:57 +0100	[diff] [blame]	712	unsigned long pfn)
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	713	{
				714	int result;
Wu Fengguang	7456b04	2009-10-19 08:15:01 +0200	[diff] [blame]	715	int count;
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	716
				717	result = ps->action(p, pfn);
				718	action_result(pfn, ps->msg, result);
Wu Fengguang	7456b04	2009-10-19 08:15:01 +0200	[diff] [blame]	719
Wu Fengguang	bd1ce5f	2009-12-16 12:19:57 +0100	[diff] [blame]	720	count = page_count(p) - 1;
Wu Fengguang	138ce28	2009-12-16 12:19:58 +0100	[diff] [blame]	721	if (ps->action == me_swapcache_dirty && result == DELAYED)
				722	count--;
				723	if (count != 0) {
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	724	printk(KERN_ERR
				725	"MCE %#lx: %s page still referenced by %d users\n",
Wu Fengguang	7456b04	2009-10-19 08:15:01 +0200	[diff] [blame]	726	pfn, ps->msg, count);
Wu Fengguang	138ce28	2009-12-16 12:19:58 +0100	[diff] [blame]	727	result = FAILED;
				728	}
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	729
				730	/* Could do more checks here if page looks ok */
				731	/*
				732	* Could adjust zone counters here to correct for the missing page.
				733	*/
				734
Wu Fengguang	138ce28	2009-12-16 12:19:58 +0100	[diff] [blame]	735	return (result == RECOVERED \|\| result == DELAYED) ? 0 : -EBUSY;
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	736	}
				737
				738	#define N_UNMAP_TRIES 5
				739
				740	/*
				741	* Do all that is necessary to remove user space mappings. Unmap
				742	* the pages and send SIGBUS to the processes if the data was dirty.
				743	*/
Wu Fengguang	1668bfd	2009-12-16 12:19:58 +0100	[diff] [blame]	744	static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	745	int trapno)
				746	{
				747	enum ttu_flags ttu = TTU_UNMAP \| TTU_IGNORE_MLOCK \| TTU_IGNORE_ACCESS;
				748	struct address_space *mapping;
				749	LIST_HEAD(tokill);
				750	int ret;
				751	int i;
				752	int kill = 1;
				753
Wu Fengguang	1668bfd	2009-12-16 12:19:58 +0100	[diff] [blame]	754	if (PageReserved(p) \|\| PageSlab(p))
				755	return SWAP_SUCCESS;
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	756
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	757	/*
				758	* This check implies we don't kill processes if their pages
				759	* are in the swap cache early. Those are always late kills.
				760	*/
				761	if (!page_mapped(p))
Wu Fengguang	1668bfd	2009-12-16 12:19:58 +0100	[diff] [blame]	762	return SWAP_SUCCESS;
				763
				764	if (PageCompound(p) \|\| PageKsm(p))
				765	return SWAP_FAIL;
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	766
				767	if (PageSwapCache(p)) {
				768	printk(KERN_ERR
				769	"MCE %#lx: keeping poisoned page in swap cache\n", pfn);
				770	ttu \|= TTU_IGNORE_HWPOISON;
				771	}
				772
				773	/*
				774	* Propagate the dirty bit from PTEs to struct page first, because we
				775	* need this to decide if we should kill or just drop the page.
Wu Fengguang	db0480b	2009-12-16 12:19:58 +0100	[diff] [blame]	776	* XXX: the dirty test could be racy: set_page_dirty() may not always
				777	* be called inside page lock (it's recommended but not enforced).
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	778	*/
				779	mapping = page_mapping(p);
				780	if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
				781	if (page_mkclean(p)) {
				782	SetPageDirty(p);
				783	} else {
				784	kill = 0;
				785	ttu \|= TTU_IGNORE_HWPOISON;
				786	printk(KERN_INFO
				787	"MCE %#lx: corrupted page was clean: dropped without side effects\n",
				788	pfn);
				789	}
				790	}
				791
				792	/*
				793	* First collect all the processes that have the page
				794	* mapped in dirty form. This has to be done before try_to_unmap,
				795	* because ttu takes the rmap data structures down.
				796	*
				797	* Error handling: We ignore errors here because
				798	* there's nothing that can be done.
				799	*/
				800	if (kill)
				801	collect_procs(p, &tokill);
				802
				803	/*
				804	* try_to_unmap can fail temporarily due to races.
				805	* Try a few times (RED-PEN better strategy?)
				806	*/
				807	for (i = 0; i < N_UNMAP_TRIES; i++) {
				808	ret = try_to_unmap(p, ttu);
				809	if (ret == SWAP_SUCCESS)
				810	break;
				811	pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
				812	}
				813
				814	if (ret != SWAP_SUCCESS)
				815	printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
				816	pfn, page_mapcount(p));
				817
				818	/*
				819	* Now that the dirty bit has been propagated to the
				820	* struct page and all unmaps done we can decide if
				821	* killing is needed or not. Only kill when the page
				822	* was dirty, otherwise the tokill list is merely
				823	* freed. When there was a problem unmapping earlier
				824	* use a more force-full uncatchable kill to prevent
				825	* any accesses to the poisoned memory.
				826	*/
				827	kill_procs_ao(&tokill, !!PageDirty(p), trapno,
				828	ret != SWAP_SUCCESS, pfn);
Wu Fengguang	1668bfd	2009-12-16 12:19:58 +0100	[diff] [blame]	829
				830	return ret;
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	831	}
				832
Andi Kleen	82ba011	2009-12-16 12:19:57 +0100	[diff] [blame]	833	int __memory_failure(unsigned long pfn, int trapno, int flags)
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	834	{
				835	struct page_state *ps;
				836	struct page *p;
				837	int res;
				838
				839	if (!sysctl_memory_failure_recovery)
				840	panic("Memory failure from trap %d on page %lx", trapno, pfn);
				841
				842	if (!pfn_valid(pfn)) {
Wu Fengguang	a7560fc	2009-12-16 12:19:57 +0100	[diff] [blame]	843	printk(KERN_ERR
				844	"MCE %#lx: memory outside kernel control\n",
				845	pfn);
				846	return -ENXIO;
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	847	}
				848
				849	p = pfn_to_page(pfn);
				850	if (TestSetPageHWPoison(p)) {
Wu Fengguang	d95ea51	2009-12-16 12:19:58 +0100	[diff] [blame]	851	printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	852	return 0;
				853	}
				854
				855	atomic_long_add(1, &mce_bad_pages);
				856
				857	/*
				858	* We need/can do nothing about count=0 pages.
				859	* 1) it's a free page, and therefore in safe hand:
				860	* prep_new_page() will be the gate keeper.
				861	* 2) it's part of a non-compound high order page.
				862	* Implies some kernel user: cannot stop them from
				863	* R/W the page; let's pray that the page has been
				864	* used and will be freed some time later.
				865	* In fact it's dangerous to directly bump up page count from 0,
				866	* that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
				867	*/
Andi Kleen	82ba011	2009-12-16 12:19:57 +0100	[diff] [blame]	868	if (!(flags & MF_COUNT_INCREASED) &&
				869	!get_page_unless_zero(compound_head(p))) {
Wu Fengguang	8d22ba1	2009-12-16 12:19:58 +0100	[diff] [blame]	870	if (is_free_buddy_page(p)) {
				871	action_result(pfn, "free buddy", DELAYED);
				872	return 0;
				873	} else {
				874	action_result(pfn, "high order kernel", IGNORED);
				875	return -EBUSY;
				876	}
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	877	}
				878
				879	/*
Wu Fengguang	e43c3af	2009-09-29 13:16:20 +0800	[diff] [blame]	880	* We ignore non-LRU pages for good reasons.
				881	* - PG_locked is only well defined for LRU pages and a few others
				882	* - to avoid races with __set_page_locked()
				883	* - to avoid races with __SetPageSlab*() (and more non-atomic ops)
				884	* The check (unnecessarily) ignores LRU pages being isolated and
				885	* walked by the page reclaim code, however that's not a big loss.
				886	*/
				887	if (!PageLRU(p))
				888	lru_add_drain_all();
Wu Fengguang	dc2a1cb	2009-12-16 12:19:58 +0100	[diff] [blame]	889	if (!PageLRU(p)) {
Wu Fengguang	e43c3af	2009-09-29 13:16:20 +0800	[diff] [blame]	890	action_result(pfn, "non LRU", IGNORED);
				891	put_page(p);
				892	return -EBUSY;
				893	}
Wu Fengguang	e43c3af	2009-09-29 13:16:20 +0800	[diff] [blame]	894
				895	/*
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	896	* Lock the page and wait for writeback to finish.
				897	* It's very difficult to mess with pages currently under IO
				898	* and in many cases impossible, so we just avoid it here.
				899	*/
				900	lock_page_nosync(p);
Wu Fengguang	847ce40	2009-12-16 12:19:58 +0100	[diff] [blame]	901
				902	/*
				903	* unpoison always clear PG_hwpoison inside page lock
				904	*/
				905	if (!PageHWPoison(p)) {
Wu Fengguang	d95ea51	2009-12-16 12:19:58 +0100	[diff] [blame]	906	printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
Wu Fengguang	847ce40	2009-12-16 12:19:58 +0100	[diff] [blame]	907	res = 0;
				908	goto out;
				909	}
Wu Fengguang	7c116f2	2009-12-16 12:19:59 +0100	[diff] [blame]	910	if (hwpoison_filter(p)) {
				911	if (TestClearPageHWPoison(p))
				912	atomic_long_dec(&mce_bad_pages);
				913	unlock_page(p);
				914	put_page(p);
				915	return 0;
				916	}
Wu Fengguang	847ce40	2009-12-16 12:19:58 +0100	[diff] [blame]	917
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	918	wait_on_page_writeback(p);
				919
				920	/*
				921	* Now take care of user space mappings.
Wu Fengguang	1668bfd	2009-12-16 12:19:58 +0100	[diff] [blame]	922	* Abort on fail: __remove_from_page_cache() assumes unmapped page.
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	923	*/
Wu Fengguang	1668bfd	2009-12-16 12:19:58 +0100	[diff] [blame]	924	if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
				925	printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
				926	res = -EBUSY;
				927	goto out;
				928	}
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	929
				930	/*
				931	* Torn down by someone else?
				932	*/
Wu Fengguang	dc2a1cb	2009-12-16 12:19:58 +0100	[diff] [blame]	933	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	934	action_result(pfn, "already truncated LRU", IGNORED);
Wu Fengguang	d95ea51	2009-12-16 12:19:58 +0100	[diff] [blame]	935	res = -EBUSY;
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	936	goto out;
				937	}
				938
				939	res = -EBUSY;
				940	for (ps = error_states;; ps++) {
Wu Fengguang	dc2a1cb	2009-12-16 12:19:58 +0100	[diff] [blame]	941	if ((p->flags & ps->mask) == ps->res) {
Wu Fengguang	bd1ce5f	2009-12-16 12:19:57 +0100	[diff] [blame]	942	res = page_action(ps, p, pfn);
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	943	break;
				944	}
				945	}
				946	out:
				947	unlock_page(p);
				948	return res;
				949	}
				950	EXPORT_SYMBOL_GPL(__memory_failure);
				951
				952	/**
				953	* memory_failure - Handle memory failure of a page.
				954	* @pfn: Page Number of the corrupted page
				955	* @trapno: Trap number reported in the signal to user space.
				956	*
				957	* This function is called by the low level machine check code
				958	* of an architecture when it detects hardware memory corruption
				959	* of a page. It tries its best to recover, which includes
				960	* dropping pages, killing processes etc.
				961	*
				962	* The function is primarily of use for corruptions that
				963	* happen outside the current execution context (e.g. when
				964	* detected by a background scrubber)
				965	*
				966	* Must run in process context (e.g. a work queue) with interrupts
				967	* enabled and no spinlocks hold.
				968	*/
				969	void memory_failure(unsigned long pfn, int trapno)
				970	{
				971	__memory_failure(pfn, trapno, 0);
				972	}
Wu Fengguang	847ce40	2009-12-16 12:19:58 +0100	[diff] [blame]	973
				974	/**
				975	* unpoison_memory - Unpoison a previously poisoned page
				976	* @pfn: Page number of the to be unpoisoned page
				977	*
				978	* Software-unpoison a page that has been poisoned by
				979	* memory_failure() earlier.
				980	*
				981	* This is only done on the software-level, so it only works
				982	* for linux injected failures, not real hardware failures
				983	*
				984	* Returns 0 for success, otherwise -errno.
				985	*/
				986	int unpoison_memory(unsigned long pfn)
				987	{
				988	struct page *page;
				989	struct page *p;
				990	int freeit = 0;
				991
				992	if (!pfn_valid(pfn))
				993	return -ENXIO;
				994
				995	p = pfn_to_page(pfn);
				996	page = compound_head(p);
				997
				998	if (!PageHWPoison(p)) {
				999	pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn);
				1000	return 0;
				1001	}
				1002
				1003	if (!get_page_unless_zero(page)) {
				1004	if (TestClearPageHWPoison(p))
				1005	atomic_long_dec(&mce_bad_pages);
				1006	pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
				1007	return 0;
				1008	}
				1009
				1010	lock_page_nosync(page);
				1011	/*
				1012	* This test is racy because PG_hwpoison is set outside of page lock.
				1013	* That's acceptable because that won't trigger kernel panic. Instead,
				1014	* the PG_hwpoison page will be caught and isolated on the entrance to
				1015	* the free buddy page pool.
				1016	*/
				1017	if (TestClearPageHWPoison(p)) {
				1018	pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
				1019	atomic_long_dec(&mce_bad_pages);
				1020	freeit = 1;
				1021	}
				1022	unlock_page(page);
				1023
				1024	put_page(page);
				1025	if (freeit)
				1026	put_page(page);
				1027
				1028	return 0;
				1029	}
				1030	EXPORT_SYMBOL(unpoison_memory);