Blame - mm/memory-failure.c - kernel/msm-4.9

blob: fd1ac1537f06eee25681fdf3610d8221436253d8 [file] [log] [blame]

Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	1	/*
				2	* Copyright (C) 2008, 2009 Intel Corporation
				3	* Authors: Andi Kleen, Fengguang Wu
				4	*
				5	* This software may be redistributed and/or modified under the terms of
				6	* the GNU General Public License ("GPL") version 2 only as published by the
				7	* Free Software Foundation.
				8	*
				9	* High level machine check handler. Handles pages reported by the
				10	* hardware as being corrupted usually due to a 2bit ECC memory or cache
				11	* failure.
				12	*
				13	* Handles page cache pages in various states. The tricky part
				14	* here is that we can access any page asynchronous to other VM
				15	* users, because memory failures could happen anytime and anywhere,
				16	* possibly violating some of their assumptions. This is why this code
				17	* has to be extremely careful. Generally it tries to use normal locking
				18	* rules, as in get the standard locks, even if that means the
				19	* error handling takes potentially a long time.
				20	*
				21	* The operation to map back from RMAP chains to processes has to walk
				22	* the complete process list and has non linear complexity with the number
				23	* mappings. In short it can be quite slow. But since memory corruptions
				24	* are rare we hope to get away with this.
				25	*/
				26
				27	/*
				28	* Notebook:
				29	* - hugetlb needs more code
				30	* - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
				31	* - pass bad pages to kdump next kernel
				32	*/
				33	#define DEBUG 1 /* remove me in 2.6.34 */
				34	#include <linux/kernel.h>
				35	#include <linux/mm.h>
				36	#include <linux/page-flags.h>
				37	#include <linux/sched.h>
Hugh Dickins	01e00f8	2009-10-13 15:02:11 +0100	[diff] [blame]	38	#include <linux/ksm.h>
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	39	#include <linux/rmap.h>
				40	#include <linux/pagemap.h>
				41	#include <linux/swap.h>
				42	#include <linux/backing-dev.h>
				43	#include "internal.h"
				44
				45	int sysctl_memory_failure_early_kill __read_mostly = 0;
				46
				47	int sysctl_memory_failure_recovery __read_mostly = 1;
				48
				49	atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
				50
				51	/*
				52	* Send all the processes who have the page mapped an ``action optional''
				53	* signal.
				54	*/
				55	static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
				56	unsigned long pfn)
				57	{
				58	struct siginfo si;
				59	int ret;
				60
				61	printk(KERN_ERR
				62	"MCE %#lx: Killing %s:%d early due to hardware memory corruption\n",
				63	pfn, t->comm, t->pid);
				64	si.si_signo = SIGBUS;
				65	si.si_errno = 0;
				66	si.si_code = BUS_MCEERR_AO;
				67	si.si_addr = (void *)addr;
				68	#ifdef __ARCH_SI_TRAPNO
				69	si.si_trapno = trapno;
				70	#endif
				71	si.si_addr_lsb = PAGE_SHIFT;
				72	/*
				73	* Don't use force here, it's convenient if the signal
				74	* can be temporarily blocked.
				75	* This could cause a loop when the user sets SIGBUS
				76	* to SIG_IGN, but hopefully noone will do that?
				77	*/
				78	ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
				79	if (ret < 0)
				80	printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
				81	t->comm, t->pid, ret);
				82	return ret;
				83	}
				84
				/*
				 * When an unknown page type is encountered drain as many buffers as
				 * possible in the hope to turn the page into a LRU or free page, which we
				 * can handle.
				 *
				 * Slab pages are left alone: nothing is drained for them.
				 */
				void shake_page(struct page *p)
				{
					if (!PageSlab(p)) {
						/* First try the per-cpu LRU add caches ... */
						lru_add_drain_all();
						if (PageLRU(p))
							return;
						/* ... then the per-cpu free page lists. */
						drain_all_pages();
						if (PageLRU(p) || is_free_buddy_page(p))
							return;
					}
					/*
					 * Could call shrink_slab here (which would also
					 * shrink other caches). Unfortunately that might
					 * also access the corrupted page, which could be fatal.
					 */
				}
				EXPORT_SYMBOL_GPL(shake_page);
				106
				107	/*
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	108	* Kill all processes that have a poisoned page mapped and then isolate
				109	* the page.
				110	*
				111	* General strategy:
				112	* Find all processes having the page mapped and kill them.
				113	* But we keep a page reference around so that the page is not
				114	* actually freed yet.
				115	* Then stash the page away
				116	*
				117	* There's no convenient way to get back to mapped processes
				118	* from the VMAs. So do a brute-force search over all
				119	* running processes.
				120	*
				121	* Remember that machine checks are not common (or rather
				122	* if they are common you have other problems), so this shouldn't
				123	* be a performance issue.
				124	*
				125	* Also there are some races possible while we get from the
				126	* error detection to actually handle it.
				127	*/
				128
				129	struct to_kill {
				130	struct list_head nd;
				131	struct task_struct *tsk;
				132	unsigned long addr;
				133	unsigned addr_valid:1;
				134	};
				135
				136	/*
				137	* Failure handling: if we can't find or can't kill a process there's
				138	* not much we can do. We just print a message and ignore otherwise.
				139	*/
				140
				141	/*
				142	* Schedule a process for later kill.
				143	* Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
				144	* TBD would GFP_NOIO be enough?
				145	*/
				146	static void add_to_kill(struct task_struct tsk, struct page p,
				147	struct vm_area_struct *vma,
				148	struct list_head *to_kill,
				149	struct to_kill **tkc)
				150	{
				151	struct to_kill *tk;
				152
				153	if (*tkc) {
				154	tk = *tkc;
				155	*tkc = NULL;
				156	} else {
				157	tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
				158	if (!tk) {
				159	printk(KERN_ERR
				160	"MCE: Out of memory while machine check handling\n");
				161	return;
				162	}
				163	}
				164	tk->addr = page_address_in_vma(p, vma);
				165	tk->addr_valid = 1;
				166
				167	/*
				168	* In theory we don't have to kill when the page was
				169	* munmaped. But it could be also a mremap. Since that's
				170	* likely very rare kill anyways just out of paranoia, but use
				171	* a SIGKILL because the error is not contained anymore.
				172	*/
				173	if (tk->addr == -EFAULT) {
				174	pr_debug("MCE: Unable to find user space address %lx in %s\n",
				175	page_to_pfn(p), tsk->comm);
				176	tk->addr_valid = 0;
				177	}
				178	get_task_struct(tsk);
				179	tk->tsk = tsk;
				180	list_add_tail(&tk->nd, to_kill);
				181	}
				182
				183	/*
				184	* Kill the processes that have been collected earlier.
				185	*
				186	* Only do anything when DOIT is set, otherwise just free the list
				187	* (this is used for clean pages which do not need killing)
				188	* Also when FAIL is set do a force kill because something went
				189	* wrong earlier.
				190	*/
				191	static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
				192	int fail, unsigned long pfn)
				193	{
				194	struct to_kill tk, next;
				195
				196	list_for_each_entry_safe (tk, next, to_kill, nd) {
				197	if (doit) {
				198	/*
André Goddard Rosa	af901ca	2009-11-14 13:09:05 -0200	[diff] [blame]	199	* In case something went wrong with munmapping
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	200	* make sure the process doesn't catch the
				201	* signal and then access the memory. Just kill it.
				202	* the signal handlers
				203	*/
				204	if (fail \|\| tk->addr_valid == 0) {
				205	printk(KERN_ERR
				206	"MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
				207	pfn, tk->tsk->comm, tk->tsk->pid);
				208	force_sig(SIGKILL, tk->tsk);
				209	}
				210
				211	/*
				212	* In theory the process could have mapped
				213	* something else on the address in-between. We could
				214	* check for that, but we need to tell the
				215	* process anyways.
				216	*/
				217	else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
				218	pfn) < 0)
				219	printk(KERN_ERR
				220	"MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
				221	pfn, tk->tsk->comm, tk->tsk->pid);
				222	}
				223	put_task_struct(tk->tsk);
				224	kfree(tk);
				225	}
				226	}
				227
				228	static int task_early_kill(struct task_struct *tsk)
				229	{
				230	if (!tsk->mm)
				231	return 0;
				232	if (tsk->flags & PF_MCE_PROCESS)
				233	return !!(tsk->flags & PF_MCE_EARLY);
				234	return sysctl_memory_failure_early_kill;
				235	}
				236
				237	/*
				238	* Collect processes when the error hit an anonymous page.
				239	*/
				240	static void collect_procs_anon(struct page page, struct list_head to_kill,
				241	struct to_kill **tkc)
				242	{
				243	struct vm_area_struct *vma;
				244	struct task_struct *tsk;
				245	struct anon_vma *av;
				246
				247	read_lock(&tasklist_lock);
				248	av = page_lock_anon_vma(page);
				249	if (av == NULL) /* Not actually mapped anymore */
				250	goto out;
				251	for_each_process (tsk) {
				252	if (!task_early_kill(tsk))
				253	continue;
				254	list_for_each_entry (vma, &av->head, anon_vma_node) {
				255	if (!page_mapped_in_vma(page, vma))
				256	continue;
				257	if (vma->vm_mm == tsk->mm)
				258	add_to_kill(tsk, page, vma, to_kill, tkc);
				259	}
				260	}
				261	page_unlock_anon_vma(av);
				262	out:
				263	read_unlock(&tasklist_lock);
				264	}
				265
				266	/*
				267	* Collect processes when the error hit a file mapped page.
				268	*/
				269	static void collect_procs_file(struct page page, struct list_head to_kill,
				270	struct to_kill **tkc)
				271	{
				272	struct vm_area_struct *vma;
				273	struct task_struct *tsk;
				274	struct prio_tree_iter iter;
				275	struct address_space *mapping = page->mapping;
				276
				277	/*
				278	* A note on the locking order between the two locks.
				279	* We don't rely on this particular order.
				280	* If you have some other code that needs a different order
				281	* feel free to switch them around. Or add a reverse link
				282	* from mm_struct to task_struct, then this could be all
				283	* done without taking tasklist_lock and looping over all tasks.
				284	*/
				285
				286	read_lock(&tasklist_lock);
				287	spin_lock(&mapping->i_mmap_lock);
				288	for_each_process(tsk) {
				289	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
				290
				291	if (!task_early_kill(tsk))
				292	continue;
				293
				294	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
				295	pgoff) {
				296	/*
				297	* Send early kill signal to tasks where a vma covers
				298	* the page but the corrupted page is not necessarily
				299	* mapped it in its pte.
				300	* Assume applications who requested early kill want
				301	* to be informed of all such data corruptions.
				302	*/
				303	if (vma->vm_mm == tsk->mm)
				304	add_to_kill(tsk, page, vma, to_kill, tkc);
				305	}
				306	}
				307	spin_unlock(&mapping->i_mmap_lock);
				308	read_unlock(&tasklist_lock);
				309	}
				310
				311	/*
				312	* Collect the processes who have the corrupted page mapped to kill.
				313	* This is done in two steps for locking reasons.
				314	* First preallocate one tokill structure outside the spin locks,
				315	* so that we can kill at least one process reasonably reliable.
				316	*/
				317	static void collect_procs(struct page page, struct list_head tokill)
				318	{
				319	struct to_kill *tk;
				320
				321	if (!page->mapping)
				322	return;
				323
				324	tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
				325	if (!tk)
				326	return;
				327	if (PageAnon(page))
				328	collect_procs_anon(page, tokill, &tk);
				329	else
				330	collect_procs_file(page, tokill, &tk);
				331	kfree(tk);
				332	}
				333
				334	/*
				335	* Error handlers for various types of pages.
				336	*/
				337
				/* Result codes returned by the me_*() page handlers. */
				enum outcome {
					/* Error: cannot be handled */
					IGNORED,
					/* Error: handling failed */
					FAILED,
					/* Will be handled later */
					DELAYED,
					/* Successfully recovered */
					RECOVERED,
				};
				344
				345	static const char *action_name[] = {
Wu Fengguang	d95ea51	2009-12-16 12:19:58 +0100	[diff] [blame^]	346	[IGNORED] = "Ignored",
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	347	[FAILED] = "Failed",
				348	[DELAYED] = "Delayed",
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	349	[RECOVERED] = "Recovered",
				350	};
				351
				352	/*
Wu Fengguang	dc2a1cb	2009-12-16 12:19:58 +0100	[diff] [blame]	353	* XXX: It is possible that a page is isolated from LRU cache,
				354	* and then kept in swap cache or failed to remove from page cache.
				355	* The page count will stop it from being freed by unpoison.
				356	* Stress tests should be aware of this memory leak problem.
				357	*/
				358	static int delete_from_lru_cache(struct page *p)
				359	{
				360	if (!isolate_lru_page(p)) {
				361	/*
				362	* Clear sensible page flags, so that the buddy system won't
				363	* complain when the page is unpoison-and-freed.
				364	*/
				365	ClearPageActive(p);
				366	ClearPageUnevictable(p);
				367	/*
				368	* drop the page count elevated by isolate_lru_page()
				369	*/
				370	page_cache_release(p);
				371	return 0;
				372	}
				373	return -EIO;
				374	}
				375
				376	/*
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	377	* Error hit kernel page.
				378	* Do nothing, try to be lucky and not touch this instead. For a few cases we
				379	* could be more sophisticated.
				380	*/
				381	static int me_kernel(struct page *p, unsigned long pfn)
				382	{
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	383	return IGNORED;
				384	}
				385
				386	/*
				387	* Page in unknown state. Do nothing.
				388	*/
				389	static int me_unknown(struct page *p, unsigned long pfn)
				390	{
				391	printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
				392	return FAILED;
				393	}
				394
				395	/*
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	396	* Clean (or cleaned) page cache page.
				397	*/
				398	static int me_pagecache_clean(struct page *p, unsigned long pfn)
				399	{
				400	int err;
				401	int ret = FAILED;
				402	struct address_space *mapping;
				403
Wu Fengguang	dc2a1cb	2009-12-16 12:19:58 +0100	[diff] [blame]	404	delete_from_lru_cache(p);
				405
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	406	/*
				407	* For anonymous pages we're done the only reference left
				408	* should be the one m_f() holds.
				409	*/
				410	if (PageAnon(p))
				411	return RECOVERED;
				412
				413	/*
				414	* Now truncate the page in the page cache. This is really
				415	* more like a "temporary hole punch"
				416	* Don't do this for block devices when someone else
				417	* has a reference, because it could be file system metadata
				418	* and that's not safe to truncate.
				419	*/
				420	mapping = page_mapping(p);
				421	if (!mapping) {
				422	/*
				423	* Page has been teared down in the meanwhile
				424	*/
				425	return FAILED;
				426	}
				427
				428	/*
				429	* Truncation is a bit tricky. Enable it per file system for now.
				430	*
				431	* Open: to take i_mutex or not for this? Right now we don't.
				432	*/
				433	if (mapping->a_ops->error_remove_page) {
				434	err = mapping->a_ops->error_remove_page(mapping, p);
				435	if (err != 0) {
				436	printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
				437	pfn, err);
				438	} else if (page_has_private(p) &&
				439	!try_to_release_page(p, GFP_NOIO)) {
				440	pr_debug("MCE %#lx: failed to release buffers\n", pfn);
				441	} else {
				442	ret = RECOVERED;
				443	}
				444	} else {
				445	/*
				446	* If the file system doesn't support it just invalidate
				447	* This fails on dirty or anything with private pages
				448	*/
				449	if (invalidate_inode_page(p))
				450	ret = RECOVERED;
				451	else
				452	printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
				453	pfn);
				454	}
				455	return ret;
				456	}
				457
				458	/*
				459	* Dirty cache page page
				460	* Issues: when the error hit a hole page the error is not properly
				461	* propagated.
				462	*/
				463	static int me_pagecache_dirty(struct page *p, unsigned long pfn)
				464	{
				465	struct address_space *mapping = page_mapping(p);
				466
				467	SetPageError(p);
				468	/* TBD: print more information about the file. */
				469	if (mapping) {
				470	/*
				471	* IO error will be reported by write(), fsync(), etc.
				472	* who check the mapping.
				473	* This way the application knows that something went
				474	* wrong with its dirty file data.
				475	*
				476	* There's one open issue:
				477	*
				478	* The EIO will be only reported on the next IO
				479	* operation and then cleared through the IO map.
				480	* Normally Linux has two mechanisms to pass IO error
				481	* first through the AS_EIO flag in the address space
				482	* and then through the PageError flag in the page.
				483	* Since we drop pages on memory failure handling the
				484	* only mechanism open to use is through AS_AIO.
				485	*
				486	* This has the disadvantage that it gets cleared on
				487	* the first operation that returns an error, while
				488	* the PageError bit is more sticky and only cleared
				489	* when the page is reread or dropped. If an
				490	* application assumes it will always get error on
				491	* fsync, but does other operations on the fd before
				492	* and the page is dropped inbetween then the error
				493	* will not be properly reported.
				494	*
				495	* This can already happen even without hwpoisoned
				496	* pages: first on metadata IO errors (which only
				497	* report through AS_EIO) or when the page is dropped
				498	* at the wrong time.
				499	*
				500	* So right now we assume that the application DTRT on
				501	* the first EIO, but we're not worse than other parts
				502	* of the kernel.
				503	*/
				504	mapping_set_error(mapping, EIO);
				505	}
				506
				507	return me_pagecache_clean(p, pfn);
				508	}
				509
				510	/*
				511	* Clean and dirty swap cache.
				512	*
				513	* Dirty swap cache page is tricky to handle. The page could live both in page
				514	* cache and swap cache(ie. page is freshly swapped in). So it could be
				515	* referenced concurrently by 2 types of PTEs:
				516	* normal PTEs and swap PTEs. We try to handle them consistently by calling
				517	* try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
				518	* and then
				519	* - clear dirty bit to prevent IO
				520	* - remove from LRU
				521	* - but keep in the swap cache, so that when we return to it on
				522	* a later page fault, we know the application is accessing
				523	* corrupted data and shall be killed (we installed simple
				524	* interception code in do_swap_page to catch it).
				525	*
				526	* Clean swap cache pages can be directly isolated. A later page fault will
				527	* bring in the known good data from disk.
				528	*/
				529	static int me_swapcache_dirty(struct page *p, unsigned long pfn)
				530	{
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	531	ClearPageDirty(p);
				532	/* Trigger EIO in shmem: */
				533	ClearPageUptodate(p);
				534
Wu Fengguang	dc2a1cb	2009-12-16 12:19:58 +0100	[diff] [blame]	535	if (!delete_from_lru_cache(p))
				536	return DELAYED;
				537	else
				538	return FAILED;
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	539	}
				540
				541	static int me_swapcache_clean(struct page *p, unsigned long pfn)
				542	{
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	543	delete_from_swap_cache(p);
Wu Fengguang	e43c3af	2009-09-29 13:16:20 +0800	[diff] [blame]	544
Wu Fengguang	dc2a1cb	2009-12-16 12:19:58 +0100	[diff] [blame]	545	if (!delete_from_lru_cache(p))
				546	return RECOVERED;
				547	else
				548	return FAILED;
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	549	}
				550
				551	/*
				552	* Huge pages. Needs work.
				553	* Issues:
				554	* No rmap support so we cannot find the original mapper. In theory could walk
				555	* all MMs and look for the mappings, but that would be non atomic and racy.
				556	* Need rmap for hugepages for this. Alternatively we could employ a heuristic,
				557	* like just walking the current process and hoping it has it mapped (that
				558	* should be usually true for the common "shared database cache" case)
				559	* Should handle free huge pages and dequeue them too, but this needs to
				560	* handle huge page accounting correctly.
				561	*/
				562	static int me_huge_page(struct page *p, unsigned long pfn)
				563	{
				564	return FAILED;
				565	}
				566
				567	/*
				568	* Various page states we can handle.
				569	*
				570	* A page state is defined by its current page->flags bits.
				571	* The table matches them in order and calls the right handler.
				572	*
				573	* This is quite tricky because we can access page at any time
				574	* in its live cycle, so all accesses have to be extremly careful.
				575	*
				576	* This is not complete. More states could be added.
				577	* For any missing state don't attempt recovery.
				578	*/
				579
				580	#define dirty (1UL << PG_dirty)
				581	#define sc (1UL << PG_swapcache)
				582	#define unevict (1UL << PG_unevictable)
				583	#define mlock (1UL << PG_mlocked)
				584	#define writeback (1UL << PG_writeback)
				585	#define lru (1UL << PG_lru)
				586	#define swapbacked (1UL << PG_swapbacked)
				587	#define head (1UL << PG_head)
				588	#define tail (1UL << PG_tail)
				589	#define compound (1UL << PG_compound)
				590	#define slab (1UL << PG_slab)
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	591	#define reserved (1UL << PG_reserved)
				592
				593	static struct page_state {
				594	unsigned long mask;
				595	unsigned long res;
				596	char *msg;
				597	int (action)(struct page p, unsigned long pfn);
				598	} error_states[] = {
Wu Fengguang	d95ea51	2009-12-16 12:19:58 +0100	[diff] [blame^]	599	{ reserved, reserved, "reserved kernel", me_kernel },
Wu Fengguang	95d01fc	2009-12-16 12:19:58 +0100	[diff] [blame]	600	/*
				601	* free pages are specially detected outside this table:
				602	* PG_buddy pages only make a small fraction of all free pages.
				603	*/
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	604
				605	/*
				606	* Could in theory check if slab page is free or if we can drop
				607	* currently unused objects without touching them. But just
				608	* treat it as standard kernel for now.
				609	*/
				610	{ slab, slab, "kernel slab", me_kernel },
				611
				612	#ifdef CONFIG_PAGEFLAGS_EXTENDED
				613	{ head, head, "huge", me_huge_page },
				614	{ tail, tail, "huge", me_huge_page },
				615	#else
				616	{ compound, compound, "huge", me_huge_page },
				617	#endif
				618
				619	{ sc\|dirty, sc\|dirty, "swapcache", me_swapcache_dirty },
				620	{ sc\|dirty, sc, "swapcache", me_swapcache_clean },
				621
				622	{ unevict\|dirty, unevict\|dirty, "unevictable LRU", me_pagecache_dirty},
				623	{ unevict, unevict, "unevictable LRU", me_pagecache_clean},
				624
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	625	{ mlock\|dirty, mlock\|dirty, "mlocked LRU", me_pagecache_dirty },
				626	{ mlock, mlock, "mlocked LRU", me_pagecache_clean },
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	627
				628	{ lru\|dirty, lru\|dirty, "LRU", me_pagecache_dirty },
				629	{ lru\|dirty, lru, "clean LRU", me_pagecache_clean },
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	630
				631	/*
				632	* Catchall entry: must be at end.
				633	*/
				634	{ 0, 0, "unknown page state", me_unknown },
				635	};
				636
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	637	static void action_result(unsigned long pfn, char *msg, int result)
				638	{
Wu Fengguang	a7560fc	2009-12-16 12:19:57 +0100	[diff] [blame]	639	struct page *page = pfn_to_page(pfn);
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	640
				641	printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
				642	pfn,
Wu Fengguang	a7560fc	2009-12-16 12:19:57 +0100	[diff] [blame]	643	PageDirty(page) ? "dirty " : "",
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	644	msg, action_name[result]);
				645	}
				646
				647	static int page_action(struct page_state ps, struct page p,
Wu Fengguang	bd1ce5f	2009-12-16 12:19:57 +0100	[diff] [blame]	648	unsigned long pfn)
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	649	{
				650	int result;
Wu Fengguang	7456b04	2009-10-19 08:15:01 +0200	[diff] [blame]	651	int count;
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	652
				653	result = ps->action(p, pfn);
				654	action_result(pfn, ps->msg, result);
Wu Fengguang	7456b04	2009-10-19 08:15:01 +0200	[diff] [blame]	655
Wu Fengguang	bd1ce5f	2009-12-16 12:19:57 +0100	[diff] [blame]	656	count = page_count(p) - 1;
Wu Fengguang	7456b04	2009-10-19 08:15:01 +0200	[diff] [blame]	657	if (count != 0)
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	658	printk(KERN_ERR
				659	"MCE %#lx: %s page still referenced by %d users\n",
Wu Fengguang	7456b04	2009-10-19 08:15:01 +0200	[diff] [blame]	660	pfn, ps->msg, count);
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	661
				662	/* Could do more checks here if page looks ok */
				663	/*
				664	* Could adjust zone counters here to correct for the missing page.
				665	*/
				666
				667	return result == RECOVERED ? 0 : -EBUSY;
				668	}
				669
				670	#define N_UNMAP_TRIES 5
				671
				672	/*
				673	* Do all that is necessary to remove user space mappings. Unmap
				674	* the pages and send SIGBUS to the processes if the data was dirty.
				675	*/
Wu Fengguang	1668bfd	2009-12-16 12:19:58 +0100	[diff] [blame]	676	static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	677	int trapno)
				678	{
				679	enum ttu_flags ttu = TTU_UNMAP \| TTU_IGNORE_MLOCK \| TTU_IGNORE_ACCESS;
				680	struct address_space *mapping;
				681	LIST_HEAD(tokill);
				682	int ret;
				683	int i;
				684	int kill = 1;
				685
Wu Fengguang	1668bfd	2009-12-16 12:19:58 +0100	[diff] [blame]	686	if (PageReserved(p) \|\| PageSlab(p))
				687	return SWAP_SUCCESS;
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	688
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	689	/*
				690	* This check implies we don't kill processes if their pages
				691	* are in the swap cache early. Those are always late kills.
				692	*/
				693	if (!page_mapped(p))
Wu Fengguang	1668bfd	2009-12-16 12:19:58 +0100	[diff] [blame]	694	return SWAP_SUCCESS;
				695
				696	if (PageCompound(p) \|\| PageKsm(p))
				697	return SWAP_FAIL;
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	698
				699	if (PageSwapCache(p)) {
				700	printk(KERN_ERR
				701	"MCE %#lx: keeping poisoned page in swap cache\n", pfn);
				702	ttu \|= TTU_IGNORE_HWPOISON;
				703	}
				704
				705	/*
				706	* Propagate the dirty bit from PTEs to struct page first, because we
				707	* need this to decide if we should kill or just drop the page.
Wu Fengguang	db0480b	2009-12-16 12:19:58 +0100	[diff] [blame]	708	* XXX: the dirty test could be racy: set_page_dirty() may not always
				709	* be called inside page lock (it's recommended but not enforced).
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	710	*/
				711	mapping = page_mapping(p);
				712	if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
				713	if (page_mkclean(p)) {
				714	SetPageDirty(p);
				715	} else {
				716	kill = 0;
				717	ttu \|= TTU_IGNORE_HWPOISON;
				718	printk(KERN_INFO
				719	"MCE %#lx: corrupted page was clean: dropped without side effects\n",
				720	pfn);
				721	}
				722	}
				723
				724	/*
				725	* First collect all the processes that have the page
				726	* mapped in dirty form. This has to be done before try_to_unmap,
				727	* because ttu takes the rmap data structures down.
				728	*
				729	* Error handling: We ignore errors here because
				730	* there's nothing that can be done.
				731	*/
				732	if (kill)
				733	collect_procs(p, &tokill);
				734
				735	/*
				736	* try_to_unmap can fail temporarily due to races.
				737	* Try a few times (RED-PEN better strategy?)
				738	*/
				739	for (i = 0; i < N_UNMAP_TRIES; i++) {
				740	ret = try_to_unmap(p, ttu);
				741	if (ret == SWAP_SUCCESS)
				742	break;
				743	pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
				744	}
				745
				746	if (ret != SWAP_SUCCESS)
				747	printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
				748	pfn, page_mapcount(p));
				749
				750	/*
				751	* Now that the dirty bit has been propagated to the
				752	* struct page and all unmaps done we can decide if
				753	* killing is needed or not. Only kill when the page
				754	* was dirty, otherwise the tokill list is merely
				755	* freed. When there was a problem unmapping earlier
				756	* use a more force-full uncatchable kill to prevent
				757	* any accesses to the poisoned memory.
				758	*/
				759	kill_procs_ao(&tokill, !!PageDirty(p), trapno,
				760	ret != SWAP_SUCCESS, pfn);
Wu Fengguang	1668bfd	2009-12-16 12:19:58 +0100	[diff] [blame]	761
				762	return ret;
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	763	}
				764
Andi Kleen	82ba011	2009-12-16 12:19:57 +0100	[diff] [blame]	765	int __memory_failure(unsigned long pfn, int trapno, int flags)
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	766	{
				767	struct page_state *ps;
				768	struct page *p;
				769	int res;
				770
				771	if (!sysctl_memory_failure_recovery)
				772	panic("Memory failure from trap %d on page %lx", trapno, pfn);
				773
				774	if (!pfn_valid(pfn)) {
Wu Fengguang	a7560fc	2009-12-16 12:19:57 +0100	[diff] [blame]	775	printk(KERN_ERR
				776	"MCE %#lx: memory outside kernel control\n",
				777	pfn);
				778	return -ENXIO;
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	779	}
				780
				781	p = pfn_to_page(pfn);
				782	if (TestSetPageHWPoison(p)) {
Wu Fengguang	d95ea51	2009-12-16 12:19:58 +0100	[diff] [blame^]	783	printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	784	return 0;
				785	}
				786
				787	atomic_long_add(1, &mce_bad_pages);
				788
				789	/*
				790	* We need/can do nothing about count=0 pages.
				791	* 1) it's a free page, and therefore in safe hand:
				792	* prep_new_page() will be the gate keeper.
				793	* 2) it's part of a non-compound high order page.
				794	* Implies some kernel user: cannot stop them from
				795	* R/W the page; let's pray that the page has been
				796	* used and will be freed some time later.
				797	* In fact it's dangerous to directly bump up page count from 0,
				798	* that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
				799	*/
Andi Kleen	82ba011	2009-12-16 12:19:57 +0100	[diff] [blame]	800	if (!(flags & MF_COUNT_INCREASED) &&
				801	!get_page_unless_zero(compound_head(p))) {
Wu Fengguang	8d22ba1	2009-12-16 12:19:58 +0100	[diff] [blame]	802	if (is_free_buddy_page(p)) {
				803	action_result(pfn, "free buddy", DELAYED);
				804	return 0;
				805	} else {
				806	action_result(pfn, "high order kernel", IGNORED);
				807	return -EBUSY;
				808	}
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	809	}
				810
				811	/*
Wu Fengguang	e43c3af	2009-09-29 13:16:20 +0800	[diff] [blame]	812	* We ignore non-LRU pages for good reasons.
				813	* - PG_locked is only well defined for LRU pages and a few others
				814	* - to avoid races with __set_page_locked()
				815	* - to avoid races with __SetPageSlab*() (and more non-atomic ops)
				816	* The check (unnecessarily) ignores LRU pages being isolated and
				817	* walked by the page reclaim code, however that's not a big loss.
				818	*/
				819	if (!PageLRU(p))
				820	lru_add_drain_all();
Wu Fengguang	dc2a1cb	2009-12-16 12:19:58 +0100	[diff] [blame]	821	if (!PageLRU(p)) {
Wu Fengguang	e43c3af	2009-09-29 13:16:20 +0800	[diff] [blame]	822	action_result(pfn, "non LRU", IGNORED);
				823	put_page(p);
				824	return -EBUSY;
				825	}
Wu Fengguang	e43c3af	2009-09-29 13:16:20 +0800	[diff] [blame]	826
				827	/*
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	828	* Lock the page and wait for writeback to finish.
				829	* It's very difficult to mess with pages currently under IO
				830	* and in many cases impossible, so we just avoid it here.
				831	*/
				832	lock_page_nosync(p);
Wu Fengguang	847ce40	2009-12-16 12:19:58 +0100	[diff] [blame]	833
				834	/*
				835	* unpoison always clears PG_hwpoison inside the page lock
				836	*/
				837	if (!PageHWPoison(p)) {
Wu Fengguang	d95ea51	2009-12-16 12:19:58 +0100	[diff] [blame^]	838	printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
Wu Fengguang	847ce40	2009-12-16 12:19:58 +0100	[diff] [blame]	839	res = 0;
				840	goto out;
				841	}
				842
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	843	wait_on_page_writeback(p);
				844
				845	/*
				846	* Now take care of user space mappings.
Wu Fengguang	1668bfd	2009-12-16 12:19:58 +0100	[diff] [blame]	847	* Abort on fail: __remove_from_page_cache() assumes unmapped page.
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	848	*/
Wu Fengguang	1668bfd	2009-12-16 12:19:58 +0100	[diff] [blame]	849	if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
				850	printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
				851	res = -EBUSY;
				852	goto out;
				853	}
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	854
				855	/*
				856	* Torn down by someone else?
				857	*/
Wu Fengguang	dc2a1cb	2009-12-16 12:19:58 +0100	[diff] [blame]	858	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	859	action_result(pfn, "already truncated LRU", IGNORED);
Wu Fengguang	d95ea51	2009-12-16 12:19:58 +0100	[diff] [blame^]	860	res = -EBUSY;
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	861	goto out;
				862	}
				863
				864	res = -EBUSY;
				865	for (ps = error_states;; ps++) {
Wu Fengguang	dc2a1cb	2009-12-16 12:19:58 +0100	[diff] [blame]	866	if ((p->flags & ps->mask) == ps->res) {
Wu Fengguang	bd1ce5f	2009-12-16 12:19:57 +0100	[diff] [blame]	867	res = page_action(ps, p, pfn);
Andi Kleen	6a46079	2009-09-16 11:50:15 +0200	[diff] [blame]	868	break;
				869	}
				870	}
				871	out:
				872	unlock_page(p);
				873	return res;
				874	}
				875	EXPORT_SYMBOL_GPL(__memory_failure);
				876
				877	/**
				878	* memory_failure - Handle memory failure of a page.
				879	* @pfn: Page Number of the corrupted page
				880	* @trapno: Trap number reported in the signal to user space.
				881	*
				882	* This function is called by the low level machine check code
				883	* of an architecture when it detects hardware memory corruption
				884	* of a page. It tries its best to recover, which includes
				885	* dropping pages, killing processes etc.
				886	*
				887	* The function is primarily of use for corruptions that
				888	* happen outside the current execution context (e.g. when
				889	* detected by a background scrubber)
				890	*
				891	* Must run in process context (e.g. a work queue) with interrupts
				892	* enabled and no spinlocks hold.
				893	*/
				894	void memory_failure(unsigned long pfn, int trapno)
				895	{
				896	__memory_failure(pfn, trapno, 0);
				897	}
Wu Fengguang	847ce40	2009-12-16 12:19:58 +0100	[diff] [blame]	898
				899	/**
				900	* unpoison_memory - Unpoison a previously poisoned page
				901	* @pfn: Page number of the to be unpoisoned page
				902	*
				903	* Software-unpoison a page that has been poisoned by
				904	* memory_failure() earlier.
				905	*
				906	* This is only done on the software-level, so it only works
				907	* for linux injected failures, not real hardware failures
				908	*
				909	* Returns 0 for success, otherwise -errno.
				910	*/
				911	int unpoison_memory(unsigned long pfn)
				912	{
				913	struct page *page;
				914	struct page *p;
				915	int freeit = 0;
				916
				917	if (!pfn_valid(pfn))
				918	return -ENXIO;
				919
				920	p = pfn_to_page(pfn);
				921	page = compound_head(p);
				922
				923	if (!PageHWPoison(p)) {
				924	pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn);
				925	return 0;
				926	}
				927
				928	if (!get_page_unless_zero(page)) {
				929	if (TestClearPageHWPoison(p))
				930	atomic_long_dec(&mce_bad_pages);
				931	pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
				932	return 0;
				933	}
				934
				935	lock_page_nosync(page);
				936	/*
				937	* This test is racy because PG_hwpoison is set outside of page lock.
				938	* That's acceptable because that won't trigger kernel panic. Instead,
				939	* the PG_hwpoison page will be caught and isolated on the entrance to
				940	* the free buddy page pool.
				941	*/
				942	if (TestClearPageHWPoison(p)) {
				943	pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
				944	atomic_long_dec(&mce_bad_pages);
				945	freeit = 1;
				946	}
				947	unlock_page(page);
				948
				949	put_page(page);
				950	if (freeit)
				951	put_page(page);
				952
				953	return 0;
				954	}
				955	EXPORT_SYMBOL(unpoison_memory);