Blame - kernel/kexec.c - kernel/msm-4.9

blob: 25db14b89e82c529fcb217e311fbebf66c440c60 [file] [log] [blame]

Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	1	/*
				2	* kexec.c - kexec system call
				3	* Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
				4	*
				5	* This source code is licensed under the GNU General Public License,
				6	* Version 2. See the file COPYING for more details.
				7	*/
				8
Randy.Dunlap	c59ede7	2006-01-11 12:17:46 -0800	[diff] [blame]	9	#include <linux/capability.h>
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	10	#include <linux/mm.h>
				11	#include <linux/file.h>
				12	#include <linux/slab.h>
				13	#include <linux/fs.h>
				14	#include <linux/kexec.h>
				15	#include <linux/spinlock.h>
				16	#include <linux/list.h>
				17	#include <linux/highmem.h>
				18	#include <linux/syscalls.h>
				19	#include <linux/reboot.h>
				20	#include <linux/syscalls.h>
				21	#include <linux/ioport.h>
Alexander Nyberg	6e274d1	2005-06-25 14:58:26 -0700	[diff] [blame]	22	#include <linux/hardirq.h>
Magnus Damm	85916f8	2006-12-06 20:40:41 -0800	[diff] [blame]	23	#include <linux/elf.h>
				24	#include <linux/elfcore.h>
Alexander Nyberg	6e274d1	2005-06-25 14:58:26 -0700	[diff] [blame]	25
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	26	#include <asm/page.h>
				27	#include <asm/uaccess.h>
				28	#include <asm/io.h>
				29	#include <asm/system.h>
				30	#include <asm/semaphore.h>
				31
Vivek Goyal	cc57165	2006-01-09 20:51:41 -0800	[diff] [blame]	32	/* Per cpu memory for storing cpu states in case of system crash. */
				33	note_buf_t* crash_notes;
				34
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	35	/* Location of the reserved area for the crash kernel */
				36	struct resource crashk_res = {
				37	.name = "Crash kernel",
				38	.start = 0,
				39	.end = 0,
				40	.flags = IORESOURCE_BUSY \| IORESOURCE_MEM
				41	};
				42
Alexander Nyberg	6e274d1	2005-06-25 14:58:26 -0700	[diff] [blame]	43	int kexec_should_crash(struct task_struct *p)
				44	{
Sukadev Bhattiprolu	f400e19	2006-09-29 02:00:07 -0700	[diff] [blame]	45	if (in_interrupt() \|\| !p->pid \|\| is_init(p) \|\| panic_on_oops)
Alexander Nyberg	6e274d1	2005-06-25 14:58:26 -0700	[diff] [blame]	46	return 1;
				47	return 0;
				48	}
				49
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	50	/*
				51	* When kexec transitions to the new kernel there is a one-to-one
				52	* mapping between physical and virtual addresses. On processors
				53	* where you can disable the MMU this is trivial, and easy. For
				54	* others it is still a simple predictable page table to setup.
				55	*
				56	* In that environment kexec copies the new kernel to its final
				57	* resting place. This means I can only support memory whose
				58	* physical address can fit in an unsigned long. In particular
				59	* addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
				60	* If the assembly stub has more restrictive requirements
				61	* KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
				62	* defined more restrictively in <asm/kexec.h>.
				63	*
				64	* The code for the transition from the current kernel to the
				65	* new kernel is placed in the control_code_buffer, whose size
				66	* is given by KEXEC_CONTROL_CODE_SIZE. In the best case only a single
				67	* page of memory is necessary, but some architectures require more.
				68	* Because this memory must be identity mapped in the transition from
				69	* virtual to physical addresses it must live in the range
				70	* 0 - TASK_SIZE, as only the user space mappings are arbitrarily
				71	* modifiable.
				72	*
				73	* The assembly stub in the control code buffer is passed a linked list
				74	* of descriptor pages detailing the source pages of the new kernel,
				75	* and the destination addresses of those source pages. As this data
				76	* structure is not used in the context of the current OS, it must
				77	* be self-contained.
				78	*
				79	* The code has been made to work with highmem pages and will use a
				80	* destination page in its final resting place (if it happens
				81	* to allocate it). The end product of this is that most of the
				82	* physical address space, and most of RAM can be used.
				83	*
				84	* Future directions include:
				85	* - allocating a page table with the control code buffer identity
				86	* mapped, to simplify machine_kexec and make kexec_on_panic more
				87	* reliable.
				88	*/
				89
				90	/*
				91	* KIMAGE_NO_DEST is an impossible destination address..., for
				92	* allocating pages whose destination address we do not care about.
				93	*/
				94	#define KIMAGE_NO_DEST (-1UL)
				95
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	96	static int kimage_is_destination_range(struct kimage *image,
				97	unsigned long start, unsigned long end);
				98	static struct page kimage_alloc_page(struct kimage image,
Al Viro	9796fdd	2005-10-21 03:22:03 -0400	[diff] [blame]	99	gfp_t gfp_mask,
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	100	unsigned long dest);
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	101
				102	static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	103	unsigned long nr_segments,
				104	struct kexec_segment __user *segments)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	105	{
				106	size_t segment_bytes;
				107	struct kimage *image;
				108	unsigned long i;
				109	int result;
				110
				111	/* Allocate a controlling structure */
				112	result = -ENOMEM;
Burman Yan	4668edc	2006-12-06 20:38:51 -0800	[diff] [blame]	113	image = kzalloc(sizeof(*image), GFP_KERNEL);
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	114	if (!image)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	115	goto out;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	116
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	117	image->head = 0;
				118	image->entry = &image->head;
				119	image->last_entry = &image->head;
				120	image->control_page = ~0; /* By default this does not apply */
				121	image->start = entry;
				122	image->type = KEXEC_TYPE_DEFAULT;
				123
				124	/* Initialize the list of control pages */
				125	INIT_LIST_HEAD(&image->control_pages);
				126
				127	/* Initialize the list of destination pages */
				128	INIT_LIST_HEAD(&image->dest_pages);
				129
				130	/* Initialize the list of unuseable pages */
				131	INIT_LIST_HEAD(&image->unuseable_pages);
				132
				133	/* Read in the segments */
				134	image->nr_segments = nr_segments;
				135	segment_bytes = nr_segments * sizeof(*segments);
				136	result = copy_from_user(image->segment, segments, segment_bytes);
				137	if (result)
				138	goto out;
				139
				140	/*
				141	* Verify we have good destination addresses. The caller is
				142	* responsible for making certain we don't attempt to load
				143	* the new image into invalid or reserved areas of RAM. This
				144	* just verifies it is an address we can use.
				145	*
				146	* Since the kernel does everything in page size chunks ensure
				147	* the destination addreses are page aligned. Too many
				148	* special cases crop of when we don't do this. The most
				149	* insidious is getting overlapping destination addresses
				150	* simply because addresses are changed to page size
				151	* granularity.
				152	*/
				153	result = -EADDRNOTAVAIL;
				154	for (i = 0; i < nr_segments; i++) {
				155	unsigned long mstart, mend;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	156
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	157	mstart = image->segment[i].mem;
				158	mend = mstart + image->segment[i].memsz;
				159	if ((mstart & ~PAGE_MASK) \|\| (mend & ~PAGE_MASK))
				160	goto out;
				161	if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
				162	goto out;
				163	}
				164
				165	/* Verify our destination addresses do not overlap.
				166	* If we alloed overlapping destination addresses
				167	* through very weird things can happen with no
				168	* easy explanation as one segment stops on another.
				169	*/
				170	result = -EINVAL;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	171	for (i = 0; i < nr_segments; i++) {
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	172	unsigned long mstart, mend;
				173	unsigned long j;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	174
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	175	mstart = image->segment[i].mem;
				176	mend = mstart + image->segment[i].memsz;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	177	for (j = 0; j < i; j++) {
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	178	unsigned long pstart, pend;
				179	pstart = image->segment[j].mem;
				180	pend = pstart + image->segment[j].memsz;
				181	/* Do the segments overlap ? */
				182	if ((mend > pstart) && (mstart < pend))
				183	goto out;
				184	}
				185	}
				186
				187	/* Ensure our buffer sizes are strictly less than
				188	* our memory sizes. This should always be the case,
				189	* and it is easier to check up front than to be surprised
				190	* later on.
				191	*/
				192	result = -EINVAL;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	193	for (i = 0; i < nr_segments; i++) {
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	194	if (image->segment[i].bufsz > image->segment[i].memsz)
				195	goto out;
				196	}
				197
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	198	result = 0;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	199	out:
				200	if (result == 0)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	201	*rimage = image;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	202	else
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	203	kfree(image);
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	204
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	205	return result;
				206
				207	}
				208
				209	static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	210	unsigned long nr_segments,
				211	struct kexec_segment __user *segments)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	212	{
				213	int result;
				214	struct kimage *image;
				215
				216	/* Allocate and initialize a controlling structure */
				217	image = NULL;
				218	result = do_kimage_alloc(&image, entry, nr_segments, segments);
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	219	if (result)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	220	goto out;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	221
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	222	*rimage = image;
				223
				224	/*
				225	* Find a location for the control code buffer, and add it
				226	* the vector of segments so that it's pages will also be
				227	* counted as destination pages.
				228	*/
				229	result = -ENOMEM;
				230	image->control_code_page = kimage_alloc_control_pages(image,
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	231	get_order(KEXEC_CONTROL_CODE_SIZE));
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	232	if (!image->control_code_page) {
				233	printk(KERN_ERR "Could not allocate control_code_buffer\n");
				234	goto out;
				235	}
				236
				237	result = 0;
				238	out:
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	239	if (result == 0)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	240	*rimage = image;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	241	else
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	242	kfree(image);
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	243
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	244	return result;
				245	}
				246
				247	static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	248	unsigned long nr_segments,
Alexey Dobriyan	314b6a4	2005-06-27 22:29:33 -0700	[diff] [blame]	249	struct kexec_segment __user *segments)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	250	{
				251	int result;
				252	struct kimage *image;
				253	unsigned long i;
				254
				255	image = NULL;
				256	/* Verify we have a valid entry point */
				257	if ((entry < crashk_res.start) \|\| (entry > crashk_res.end)) {
				258	result = -EADDRNOTAVAIL;
				259	goto out;
				260	}
				261
				262	/* Allocate and initialize a controlling structure */
				263	result = do_kimage_alloc(&image, entry, nr_segments, segments);
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	264	if (result)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	265	goto out;
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	266
				267	/* Enable the special crash kernel control page
				268	* allocation policy.
				269	*/
				270	image->control_page = crashk_res.start;
				271	image->type = KEXEC_TYPE_CRASH;
				272
				273	/*
				274	* Verify we have good destination addresses. Normally
				275	* the caller is responsible for making certain we don't
				276	* attempt to load the new image into invalid or reserved
				277	* areas of RAM. But crash kernels are preloaded into a
				278	* reserved area of ram. We must ensure the addresses
				279	* are in the reserved area otherwise preloading the
				280	* kernel could corrupt things.
				281	*/
				282	result = -EADDRNOTAVAIL;
				283	for (i = 0; i < nr_segments; i++) {
				284	unsigned long mstart, mend;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	285
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	286	mstart = image->segment[i].mem;
Vivek Goyal	50cccc6	2005-06-25 14:57:55 -0700	[diff] [blame]	287	mend = mstart + image->segment[i].memsz - 1;
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	288	/* Ensure we are within the crash kernel limits */
				289	if ((mstart < crashk_res.start) \|\| (mend > crashk_res.end))
				290	goto out;
				291	}
				292
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	293	/*
				294	* Find a location for the control code buffer, and add
				295	* the vector of segments so that it's pages will also be
				296	* counted as destination pages.
				297	*/
				298	result = -ENOMEM;
				299	image->control_code_page = kimage_alloc_control_pages(image,
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	300	get_order(KEXEC_CONTROL_CODE_SIZE));
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	301	if (!image->control_code_page) {
				302	printk(KERN_ERR "Could not allocate control_code_buffer\n");
				303	goto out;
				304	}
				305
				306	result = 0;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	307	out:
				308	if (result == 0)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	309	*rimage = image;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	310	else
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	311	kfree(image);
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	312
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	313	return result;
				314	}
				315
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	316	static int kimage_is_destination_range(struct kimage *image,
				317	unsigned long start,
				318	unsigned long end)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	319	{
				320	unsigned long i;
				321
				322	for (i = 0; i < image->nr_segments; i++) {
				323	unsigned long mstart, mend;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	324
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	325	mstart = image->segment[i].mem;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	326	mend = mstart + image->segment[i].memsz;
				327	if ((end > mstart) && (start < mend))
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	328	return 1;
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	329	}
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	330
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	331	return 0;
				332	}
				333
Al Viro	9796fdd	2005-10-21 03:22:03 -0400	[diff] [blame]	334	static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	335	{
				336	struct page *pages;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	337
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	338	pages = alloc_pages(gfp_mask, order);
				339	if (pages) {
				340	unsigned int count, i;
				341	pages->mapping = NULL;
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	342	set_page_private(pages, order);
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	343	count = 1 << order;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	344	for (i = 0; i < count; i++)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	345	SetPageReserved(pages + i);
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	346	}
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	347
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	348	return pages;
				349	}
				350
				351	static void kimage_free_pages(struct page *page)
				352	{
				353	unsigned int order, count, i;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	354
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	355	order = page_private(page);
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	356	count = 1 << order;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	357	for (i = 0; i < count; i++)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	358	ClearPageReserved(page + i);
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	359	__free_pages(page, order);
				360	}
				361
				362	static void kimage_free_page_list(struct list_head *list)
				363	{
				364	struct list_head pos, next;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	365
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	366	list_for_each_safe(pos, next, list) {
				367	struct page *page;
				368
				369	page = list_entry(pos, struct page, lru);
				370	list_del(&page->lru);
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	371	kimage_free_pages(page);
				372	}
				373	}
				374
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	375	static struct page kimage_alloc_normal_control_pages(struct kimage image,
				376	unsigned int order)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	377	{
				378	/* Control pages are special, they are the intermediaries
				379	* that are needed while we copy the rest of the pages
				380	* to their final resting place. As such they must
				381	* not conflict with either the destination addresses
				382	* or memory the kernel is already using.
				383	*
				384	* The only case where we really need more than one of
				385	* these are for architectures where we cannot disable
				386	* the MMU and must instead generate an identity mapped
				387	* page table for all of the memory.
				388	*
				389	* At worst this runs in O(N) of the image size.
				390	*/
				391	struct list_head extra_pages;
				392	struct page *pages;
				393	unsigned int count;
				394
				395	count = 1 << order;
				396	INIT_LIST_HEAD(&extra_pages);
				397
				398	/* Loop while I can allocate a page and the page allocated
				399	* is a destination page.
				400	*/
				401	do {
				402	unsigned long pfn, epfn, addr, eaddr;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	403
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	404	pages = kimage_alloc_pages(GFP_KERNEL, order);
				405	if (!pages)
				406	break;
				407	pfn = page_to_pfn(pages);
				408	epfn = pfn + count;
				409	addr = pfn << PAGE_SHIFT;
				410	eaddr = epfn << PAGE_SHIFT;
				411	if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) \|\|
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	412	kimage_is_destination_range(image, addr, eaddr)) {
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	413	list_add(&pages->lru, &extra_pages);
				414	pages = NULL;
				415	}
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	416	} while (!pages);
				417
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	418	if (pages) {
				419	/* Remember the allocated page... */
				420	list_add(&pages->lru, &image->control_pages);
				421
				422	/* Because the page is already in it's destination
				423	* location we will never allocate another page at
				424	* that address. Therefore kimage_alloc_pages
				425	* will not return it (again) and we don't need
				426	* to give it an entry in image->segment[].
				427	*/
				428	}
				429	/* Deal with the destination pages I have inadvertently allocated.
				430	*
				431	* Ideally I would convert multi-page allocations into single
				432	* page allocations, and add everyting to image->dest_pages.
				433	*
				434	* For now it is simpler to just free the pages.
				435	*/
				436	kimage_free_page_list(&extra_pages);
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	437
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	438	return pages;
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	439	}
				440
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	441	static struct page kimage_alloc_crash_control_pages(struct kimage image,
				442	unsigned int order)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	443	{
				444	/* Control pages are special, they are the intermediaries
				445	* that are needed while we copy the rest of the pages
				446	* to their final resting place. As such they must
				447	* not conflict with either the destination addresses
				448	* or memory the kernel is already using.
				449	*
				450	* Control pages are also the only pags we must allocate
				451	* when loading a crash kernel. All of the other pages
				452	* are specified by the segments and we just memcpy
				453	* into them directly.
				454	*
				455	* The only case where we really need more than one of
				456	* these are for architectures where we cannot disable
				457	* the MMU and must instead generate an identity mapped
				458	* page table for all of the memory.
				459	*
				460	* Given the low demand this implements a very simple
				461	* allocator that finds the first hole of the appropriate
				462	* size in the reserved memory region, and allocates all
				463	* of the memory up to and including the hole.
				464	*/
				465	unsigned long hole_start, hole_end, size;
				466	struct page *pages;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	467
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	468	pages = NULL;
				469	size = (1 << order) << PAGE_SHIFT;
				470	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
				471	hole_end = hole_start + size - 1;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	472	while (hole_end <= crashk_res.end) {
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	473	unsigned long i;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	474
				475	if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	476	break;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	477	if (hole_end > crashk_res.end)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	478	break;
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	479	/* See if I overlap any of the segments */
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	480	for (i = 0; i < image->nr_segments; i++) {
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	481	unsigned long mstart, mend;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	482
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	483	mstart = image->segment[i].mem;
				484	mend = mstart + image->segment[i].memsz - 1;
				485	if ((hole_end >= mstart) && (hole_start <= mend)) {
				486	/* Advance the hole to the end of the segment */
				487	hole_start = (mend + (size - 1)) & ~(size - 1);
				488	hole_end = hole_start + size - 1;
				489	break;
				490	}
				491	}
				492	/* If I don't overlap any segments I have found my hole! */
				493	if (i == image->nr_segments) {
				494	pages = pfn_to_page(hole_start >> PAGE_SHIFT);
				495	break;
				496	}
				497	}
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	498	if (pages)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	499	image->control_page = hole_end;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	500
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	501	return pages;
				502	}
				503
				504
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	505	struct page kimage_alloc_control_pages(struct kimage image,
				506	unsigned int order)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	507	{
				508	struct page *pages = NULL;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	509
				510	switch (image->type) {
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	511	case KEXEC_TYPE_DEFAULT:
				512	pages = kimage_alloc_normal_control_pages(image, order);
				513	break;
				514	case KEXEC_TYPE_CRASH:
				515	pages = kimage_alloc_crash_control_pages(image, order);
				516	break;
				517	}
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	518
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	519	return pages;
				520	}
				521
				522	static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
				523	{
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	524	if (*image->entry != 0)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	525	image->entry++;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	526
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	527	if (image->entry == image->last_entry) {
				528	kimage_entry_t *ind_page;
				529	struct page *page;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	530
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	531	page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	532	if (!page)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	533	return -ENOMEM;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	534
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	535	ind_page = page_address(page);
				536	*image->entry = virt_to_phys(ind_page) \| IND_INDIRECTION;
				537	image->entry = ind_page;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	538	image->last_entry = ind_page +
				539	((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	540	}
				541	*image->entry = entry;
				542	image->entry++;
				543	*image->entry = 0;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	544
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	545	return 0;
				546	}
				547
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	548	static int kimage_set_destination(struct kimage *image,
				549	unsigned long destination)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	550	{
				551	int result;
				552
				553	destination &= PAGE_MASK;
				554	result = kimage_add_entry(image, destination \| IND_DESTINATION);
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	555	if (result == 0)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	556	image->destination = destination;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	557
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	558	return result;
				559	}
				560
				561
				562	static int kimage_add_page(struct kimage *image, unsigned long page)
				563	{
				564	int result;
				565
				566	page &= PAGE_MASK;
				567	result = kimage_add_entry(image, page \| IND_SOURCE);
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	568	if (result == 0)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	569	image->destination += PAGE_SIZE;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	570
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	571	return result;
				572	}
				573
				574
				575	static void kimage_free_extra_pages(struct kimage *image)
				576	{
				577	/* Walk through and free any extra destination pages I may have */
				578	kimage_free_page_list(&image->dest_pages);
				579
				580	/* Walk through and free any unuseable pages I have cached */
				581	kimage_free_page_list(&image->unuseable_pages);
				582
				583	}
				584	static int kimage_terminate(struct kimage *image)
				585	{
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	586	if (*image->entry != 0)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	587	image->entry++;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	588
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	589	*image->entry = IND_DONE;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	590
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	591	return 0;
				592	}
				593
				594	#define for_each_kimage_entry(image, ptr, entry) \
				595	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
				596	ptr = (entry & IND_INDIRECTION)? \
				597	phys_to_virt((entry & PAGE_MASK)): ptr +1)
				598
				599	static void kimage_free_entry(kimage_entry_t entry)
				600	{
				601	struct page *page;
				602
				603	page = pfn_to_page(entry >> PAGE_SHIFT);
				604	kimage_free_pages(page);
				605	}
				606
				607	static void kimage_free(struct kimage *image)
				608	{
				609	kimage_entry_t *ptr, entry;
				610	kimage_entry_t ind = 0;
				611
				612	if (!image)
				613	return;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	614
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	615	kimage_free_extra_pages(image);
				616	for_each_kimage_entry(image, ptr, entry) {
				617	if (entry & IND_INDIRECTION) {
				618	/* Free the previous indirection page */
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	619	if (ind & IND_INDIRECTION)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	620	kimage_free_entry(ind);
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	621	/* Save this indirection page until we are
				622	* done with it.
				623	*/
				624	ind = entry;
				625	}
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	626	else if (entry & IND_SOURCE)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	627	kimage_free_entry(entry);
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	628	}
				629	/* Free the final indirection page */
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	630	if (ind & IND_INDIRECTION)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	631	kimage_free_entry(ind);
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	632
				633	/* Handle any machine specific cleanup */
				634	machine_kexec_cleanup(image);
				635
				636	/* Free the kexec control pages... */
				637	kimage_free_page_list(&image->control_pages);
				638	kfree(image);
				639	}
				640
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	641	static kimage_entry_t kimage_dst_used(struct kimage image,
				642	unsigned long page)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	643	{
				644	kimage_entry_t *ptr, entry;
				645	unsigned long destination = 0;
				646
				647	for_each_kimage_entry(image, ptr, entry) {
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	648	if (entry & IND_DESTINATION)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	649	destination = entry & PAGE_MASK;
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	650	else if (entry & IND_SOURCE) {
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	651	if (page == destination)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	652	return ptr;
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	653	destination += PAGE_SIZE;
				654	}
				655	}
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	656
Alexey Dobriyan	314b6a4	2005-06-27 22:29:33 -0700	[diff] [blame]	657	return NULL;
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	658	}
				659
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	660	static struct page kimage_alloc_page(struct kimage image,
Al Viro	9796fdd	2005-10-21 03:22:03 -0400	[diff] [blame]	661	gfp_t gfp_mask,
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	662	unsigned long destination)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	663	{
				664	/*
				665	* Here we implement safeguards to ensure that a source page
				666	* is not copied to its destination page before the data on
				667	* the destination page is no longer useful.
				668	*
				669	* To do this we maintain the invariant that a source page is
				670	* either its own destination page, or it is not a
				671	* destination page at all.
				672	*
				673	* That is slightly stronger than required, but the proof
				674	* that no problems will not occur is trivial, and the
				675	* implementation is simply to verify.
				676	*
				677	* When allocating all pages normally this algorithm will run
				678	* in O(N) time, but in the worst case it will run in O(N^2)
				679	* time. If the runtime is a problem the data structures can
				680	* be fixed.
				681	*/
				682	struct page *page;
				683	unsigned long addr;
				684
				685	/*
				686	* Walk through the list of destination pages, and see if I
				687	* have a match.
				688	*/
				689	list_for_each_entry(page, &image->dest_pages, lru) {
				690	addr = page_to_pfn(page) << PAGE_SHIFT;
				691	if (addr == destination) {
				692	list_del(&page->lru);
				693	return page;
				694	}
				695	}
				696	page = NULL;
				697	while (1) {
				698	kimage_entry_t *old;
				699
				700	/* Allocate a page, if we run out of memory give up */
				701	page = kimage_alloc_pages(gfp_mask, 0);
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	702	if (!page)
Alexey Dobriyan	314b6a4	2005-06-27 22:29:33 -0700	[diff] [blame]	703	return NULL;
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	704	/* If the page cannot be used file it away */
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	705	if (page_to_pfn(page) >
				706	(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	707	list_add(&page->lru, &image->unuseable_pages);
				708	continue;
				709	}
				710	addr = page_to_pfn(page) << PAGE_SHIFT;
				711
				712	/* If it is the destination page we want use it */
				713	if (addr == destination)
				714	break;
				715
				716	/* If the page is not a destination page use it */
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	717	if (!kimage_is_destination_range(image, addr,
				718	addr + PAGE_SIZE))
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	719	break;
				720
				721	/*
				722	* I know that the page is someones destination page.
				723	* See if there is already a source page for this
				724	* destination page. And if so swap the source pages.
				725	*/
				726	old = kimage_dst_used(image, addr);
				727	if (old) {
				728	/* If so move it */
				729	unsigned long old_addr;
				730	struct page *old_page;
				731
				732	old_addr = *old & PAGE_MASK;
				733	old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
				734	copy_highpage(page, old_page);
				735	old = addr \| (old & ~PAGE_MASK);
				736
				737	/* The old page I have found cannot be a
				738	* destination page, so return it.
				739	*/
				740	addr = old_addr;
				741	page = old_page;
				742	break;
				743	}
				744	else {
				745	/* Place the page on the destination list I
				746	* will use it later.
				747	*/
				748	list_add(&page->lru, &image->dest_pages);
				749	}
				750	}
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	751
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	752	return page;
				753	}
				754
				755	static int kimage_load_normal_segment(struct kimage *image,
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	756	struct kexec_segment *segment)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	757	{
				758	unsigned long maddr;
				759	unsigned long ubytes, mbytes;
				760	int result;
Alexey Dobriyan	314b6a4	2005-06-27 22:29:33 -0700	[diff] [blame]	761	unsigned char __user *buf;
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	762
				763	result = 0;
				764	buf = segment->buf;
				765	ubytes = segment->bufsz;
				766	mbytes = segment->memsz;
				767	maddr = segment->mem;
				768
				769	result = kimage_set_destination(image, maddr);
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	770	if (result < 0)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	771	goto out;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	772
				773	while (mbytes) {
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	774	struct page *page;
				775	char *ptr;
				776	size_t uchunk, mchunk;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	777
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	778	page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
				779	if (page == 0) {
				780	result = -ENOMEM;
				781	goto out;
				782	}
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	783	result = kimage_add_page(image, page_to_pfn(page)
				784	<< PAGE_SHIFT);
				785	if (result < 0)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	786	goto out;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	787
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	788	ptr = kmap(page);
				789	/* Start with a clear page */
				790	memset(ptr, 0, PAGE_SIZE);
				791	ptr += maddr & ~PAGE_MASK;
				792	mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	793	if (mchunk > mbytes)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	794	mchunk = mbytes;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	795
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	796	uchunk = mchunk;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	797	if (uchunk > ubytes)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	798	uchunk = ubytes;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	799
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	800	result = copy_from_user(ptr, buf, uchunk);
				801	kunmap(page);
				802	if (result) {
				803	result = (result < 0) ? result : -EIO;
				804	goto out;
				805	}
				806	ubytes -= uchunk;
				807	maddr += mchunk;
				808	buf += mchunk;
				809	mbytes -= mchunk;
				810	}
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	811	out:
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	812	return result;
				813	}
				814
				815	static int kimage_load_crash_segment(struct kimage *image,
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	816	struct kexec_segment *segment)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	817	{
				818	/* For crash dumps kernels we simply copy the data from
				819	* user space to it's destination.
				820	* We do things a page at a time for the sake of kmap.
				821	*/
				822	unsigned long maddr;
				823	unsigned long ubytes, mbytes;
				824	int result;
Alexey Dobriyan	314b6a4	2005-06-27 22:29:33 -0700	[diff] [blame]	825	unsigned char __user *buf;
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	826
				827	result = 0;
				828	buf = segment->buf;
				829	ubytes = segment->bufsz;
				830	mbytes = segment->memsz;
				831	maddr = segment->mem;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	832	while (mbytes) {
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	833	struct page *page;
				834	char *ptr;
				835	size_t uchunk, mchunk;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	836
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	837	page = pfn_to_page(maddr >> PAGE_SHIFT);
				838	if (page == 0) {
				839	result = -ENOMEM;
				840	goto out;
				841	}
				842	ptr = kmap(page);
				843	ptr += maddr & ~PAGE_MASK;
				844	mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	845	if (mchunk > mbytes)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	846	mchunk = mbytes;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	847
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	848	uchunk = mchunk;
				849	if (uchunk > ubytes) {
				850	uchunk = ubytes;
				851	/* Zero the trailing part of the page */
				852	memset(ptr + uchunk, 0, mchunk - uchunk);
				853	}
				854	result = copy_from_user(ptr, buf, uchunk);
Zou Nan hai	a7956113	2006-12-07 09:51:35 -0800	[diff] [blame]	855	kexec_flush_icache_page(page);
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	856	kunmap(page);
				857	if (result) {
				858	result = (result < 0) ? result : -EIO;
				859	goto out;
				860	}
				861	ubytes -= uchunk;
				862	maddr += mchunk;
				863	buf += mchunk;
				864	mbytes -= mchunk;
				865	}
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	866	out:
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	867	return result;
				868	}
				869
				870	static int kimage_load_segment(struct kimage *image,
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	871	struct kexec_segment *segment)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	872	{
				873	int result = -ENOMEM;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	874
				875	switch (image->type) {
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	876	case KEXEC_TYPE_DEFAULT:
				877	result = kimage_load_normal_segment(image, segment);
				878	break;
				879	case KEXEC_TYPE_CRASH:
				880	result = kimage_load_crash_segment(image, segment);
				881	break;
				882	}
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	883
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	884	return result;
				885	}
				886
				887	/*
				888	* Exec Kernel system call: for obvious reasons only root may call it.
				889	*
				890	* This call breaks up into three pieces.
				891	* - A generic part which loads the new kernel from the current
				892	* address space, and very carefully places the data in the
				893	* allocated pages.
				894	*
				895	* - A generic part that interacts with the kernel and tells all of
				896	* the devices to shut down. Preventing on-going dmas, and placing
				897	* the devices in a consistent state so a later kernel can
				898	* reinitialize them.
				899	*
				900	* - A machine specific part that includes the syscall number
				901	* and then copies the image to its final destination, and
				902	* jumps into the image at entry.
				903	*
				904	* kexec does not sync, or unmount filesystems so if you need
				905	* that to happen you need to do that yourself.
				906	*/
/* Image installed by a normal kexec load; NULL when none is loaded. */
struct kimage *kexec_image;
/* Image to switch to if this kernel crashes; NULL when none is loaded. */
struct kimage *kexec_crash_image;
/*
 * A home grown binary mutex.
 * It is taken with xchg(&kexec_lock, 1): a non-zero previous value means
 * somebody else already holds it and the caller must back off.
 * Nothing can wait so this mutex is safe to use
 * in interrupt context :)
 */
static int kexec_lock;
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	915
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	916	asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
				917	struct kexec_segment __user *segments,
				918	unsigned long flags)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	919	{
				920	struct kimage *dest_image, image;
				921	int locked;
				922	int result;
				923
				924	/* We only trust the superuser with rebooting the system. */
				925	if (!capable(CAP_SYS_BOOT))
				926	return -EPERM;
				927
				928	/*
				929	* Verify we have a legal set of flags
				930	* This leaves us room for future extensions.
				931	*/
				932	if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
				933	return -EINVAL;
				934
				935	/* Verify we are on the appropriate architecture */
				936	if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
				937	((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	938	return -EINVAL;
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	939
				940	/* Put an artificial cap on the number
				941	* of segments passed to kexec_load.
				942	*/
				943	if (nr_segments > KEXEC_SEGMENT_MAX)
				944	return -EINVAL;
				945
				946	image = NULL;
				947	result = 0;
				948
				949	/* Because we write directly to the reserved memory
				950	* region when loading crash kernels we need a mutex here to
				951	* prevent multiple crash kernels from attempting to load
				952	* simultaneously, and to prevent a crash kernel from loading
				953	* over the top of a in use crash kernel.
				954	*
				955	* KISS: always take the mutex.
				956	*/
				957	locked = xchg(&kexec_lock, 1);
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	958	if (locked)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	959	return -EBUSY;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	960
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	961	dest_image = &kexec_image;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	962	if (flags & KEXEC_ON_CRASH)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	963	dest_image = &kexec_crash_image;
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	964	if (nr_segments > 0) {
				965	unsigned long i;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	966
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	967	/* Loading another kernel to reboot into */
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	968	if ((flags & KEXEC_ON_CRASH) == 0)
				969	result = kimage_normal_alloc(&image, entry,
				970	nr_segments, segments);
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	971	/* Loading another kernel to switch to if this one crashes */
				972	else if (flags & KEXEC_ON_CRASH) {
				973	/* Free any current crash dump kernel before
				974	* we corrupt it.
				975	*/
				976	kimage_free(xchg(&kexec_crash_image, NULL));
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	977	result = kimage_crash_alloc(&image, entry,
				978	nr_segments, segments);
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	979	}
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	980	if (result)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	981	goto out;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	982
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	983	result = machine_kexec_prepare(image);
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	984	if (result)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	985	goto out;
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	986
				987	for (i = 0; i < nr_segments; i++) {
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	988	result = kimage_load_segment(image, &image->segment[i]);
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	989	if (result)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	990	goto out;
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	991	}
				992	result = kimage_terminate(image);
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	993	if (result)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	994	goto out;
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	995	}
				996	/* Install the new kernel, and Uninstall the old */
				997	image = xchg(dest_image, image);
				998
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	999	out:
Roland McGrath	0b4a8a7	2006-09-29 02:00:39 -0700	[diff] [blame]	1000	locked = xchg(&kexec_lock, 0); /* Release the mutex */
				1001	BUG_ON(!locked);
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	1002	kimage_free(image);
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	1003
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	1004	return result;
				1005	}
				1006
				1007	#ifdef CONFIG_COMPAT
				1008	asmlinkage long compat_sys_kexec_load(unsigned long entry,
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	1009	unsigned long nr_segments,
				1010	struct compat_kexec_segment __user *segments,
				1011	unsigned long flags)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	1012	{
				1013	struct compat_kexec_segment in;
				1014	struct kexec_segment out, __user *ksegments;
				1015	unsigned long i, result;
				1016
				1017	/* Don't allow clients that don't understand the native
				1018	* architecture to do anything.
				1019	*/
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	1020	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	1021	return -EINVAL;
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	1022
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	1023	if (nr_segments > KEXEC_SEGMENT_MAX)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	1024	return -EINVAL;
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	1025
				1026	ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
				1027	for (i=0; i < nr_segments; i++) {
				1028	result = copy_from_user(&in, &segments[i], sizeof(in));
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	1029	if (result)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	1030	return -EFAULT;
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	1031
				1032	out.buf = compat_ptr(in.buf);
				1033	out.bufsz = in.bufsz;
				1034	out.mem = in.mem;
				1035	out.memsz = in.memsz;
				1036
				1037	result = copy_to_user(&ksegments[i], &out, sizeof(out));
Maneesh Soni	72414d3	2005-06-25 14:58:28 -0700	[diff] [blame]	1038	if (result)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	1039	return -EFAULT;
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	1040	}
				1041
				1042	return sys_kexec_load(entry, nr_segments, ksegments, flags);
				1043	}
				1044	#endif
				1045
Alexander Nyberg	6e274d1	2005-06-25 14:58:26 -0700	[diff] [blame]	1046	void crash_kexec(struct pt_regs *regs)
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	1047	{
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	1048	int locked;
				1049
				1050
				1051	/* Take the kexec_lock here to prevent sys_kexec_load
				1052	* running on one cpu from replacing the crash kernel
				1053	* we are using after a panic on a different cpu.
				1054	*
				1055	* If the crash kernel was not located in a fixed area
				1056	* of memory the xchg(&kexec_crash_image) would be
				1057	* sufficient. But since I reuse the memory...
				1058	*/
				1059	locked = xchg(&kexec_lock, 1);
				1060	if (!locked) {
David Wilder	c0ce7d0	2006-06-23 15:29:34 -0700	[diff] [blame]	1061	if (kexec_crash_image) {
Vivek Goyal	e996e58	2006-01-09 20:51:44 -0800	[diff] [blame]	1062	struct pt_regs fixed_regs;
				1063	crash_setup_regs(&fixed_regs, regs);
				1064	machine_crash_shutdown(&fixed_regs);
David Wilder	c0ce7d0	2006-06-23 15:29:34 -0700	[diff] [blame]	1065	machine_kexec(kexec_crash_image);
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	1066	}
Roland McGrath	0b4a8a7	2006-09-29 02:00:39 -0700	[diff] [blame]	1067	locked = xchg(&kexec_lock, 0);
				1068	BUG_ON(!locked);
Eric W. Biederman	dc009d9	2005-06-25 14:57:52 -0700	[diff] [blame]	1069	}
				1070	}
Vivek Goyal	cc57165	2006-01-09 20:51:41 -0800	[diff] [blame]	1071
Magnus Damm	85916f8	2006-12-06 20:40:41 -0800	[diff] [blame]	1072	static u32 append_elf_note(u32 buf, char name, unsigned type, void data,
				1073	size_t data_len)
				1074	{
				1075	struct elf_note note;
				1076
				1077	note.n_namesz = strlen(name) + 1;
				1078	note.n_descsz = data_len;
				1079	note.n_type = type;
				1080	memcpy(buf, &note, sizeof(note));
				1081	buf += (sizeof(note) + 3)/4;
				1082	memcpy(buf, name, note.n_namesz);
				1083	buf += (note.n_namesz + 3)/4;
				1084	memcpy(buf, data, note.n_descsz);
				1085	buf += (note.n_descsz + 3)/4;
				1086
				1087	return buf;
				1088	}
				1089
				1090	static void final_note(u32 *buf)
				1091	{
				1092	struct elf_note note;
				1093
				1094	note.n_namesz = 0;
				1095	note.n_descsz = 0;
				1096	note.n_type = 0;
				1097	memcpy(buf, &note, sizeof(note));
				1098	}
				1099
				1100	void crash_save_cpu(struct pt_regs *regs, int cpu)
				1101	{
				1102	struct elf_prstatus prstatus;
				1103	u32 *buf;
				1104
				1105	if ((cpu < 0) \|\| (cpu >= NR_CPUS))
				1106	return;
				1107
				1108	/* Using ELF notes here is opportunistic.
				1109	* I need a well defined structure format
				1110	* for the data I pass, and I need tags
				1111	* on the data to indicate what information I have
				1112	* squirrelled away. ELF notes happen to provide
				1113	* all of that, so there is no need to invent something new.
				1114	*/
				1115	buf = (u32*)per_cpu_ptr(crash_notes, cpu);
				1116	if (!buf)
				1117	return;
				1118	memset(&prstatus, 0, sizeof(prstatus));
				1119	prstatus.pr_pid = current->pid;
				1120	elf_core_copy_regs(&prstatus.pr_reg, regs);
Simon Horman	6672f76	2007-05-08 00:28:22 -0700	[diff] [blame^]	1121	buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
				1122	&prstatus, sizeof(prstatus));
Magnus Damm	85916f8	2006-12-06 20:40:41 -0800	[diff] [blame]	1123	final_note(buf);
				1124	}
				1125
Vivek Goyal	cc57165	2006-01-09 20:51:41 -0800	[diff] [blame]	1126	static int __init crash_notes_memory_init(void)
				1127	{
				1128	/* Allocate memory for saving cpu registers. */
				1129	crash_notes = alloc_percpu(note_buf_t);
				1130	if (!crash_notes) {
				1131	printk("Kexec: Memory allocation for saving cpu register"
				1132	" states failed\n");
				1133	return -ENOMEM;
				1134	}
				1135	return 0;
				1136	}
				1137	module_init(crash_notes_memory_init)